Yuxiao319 committed
Commit 812f9be
1 Parent(s): 9537b33

wonder3d_plus_ckpt

Files changed (40)
  1. ckpts/unet/diffusion_pytorch_model.bin +2 -2
  2. configs/mvdiffusion-joint-plus.yaml +50 -0
  3. example_images/000-chair.jpg +0 -0
  4. example_images/001-bananaman.jpg +0 -0
  5. example_images/25-251458_powder-puff-girls-the-powerpuff-girls-cartoon-characters.png +0 -0
  6. example_images/blue_dragon2.png +0 -0
  7. example_images/cartoon_dinosaur.png +0 -0
  8. example_images/cat.png +0 -0
  9. example_images/chair_wood.jpg +0 -0
  10. example_images/chili.png +0 -0
  11. example_images/dragon2.png +0 -0
  12. example_images/fox3.png +0 -0
  13. example_images/generated_1715763329_frame0.png +0 -0
  14. example_images/jelly.png +0 -0
  15. example_images/man-head.jpeg +0 -0
  16. example_images/milk.png +0 -0
  17. example_images/mushroom_teapot.jpg +0 -0
  18. example_images/red_dragon3.png +0 -0
  19. example_images/turtle_ortho.png +0 -0
  20. gradio_app.py +26 -12
  21. mv_diffusion_30/data/depth_utils.py +126 -0
  22. mv_diffusion_30/data/fixed_poses/nine_views.zip +3 -0
  23. mv_diffusion_30/data/fixed_poses/nine_views/000_back_RT.txt +3 -0
  24. mv_diffusion_30/data/fixed_poses/nine_views/000_back_left_RT.txt +3 -0
  25. mv_diffusion_30/data/fixed_poses/nine_views/000_back_right_RT.txt +3 -0
  26. mv_diffusion_30/data/fixed_poses/nine_views/000_front_RT.txt +3 -0
  27. mv_diffusion_30/data/fixed_poses/nine_views/000_front_left_RT.txt +3 -0
  28. mv_diffusion_30/data/fixed_poses/nine_views/000_front_right_RT.txt +3 -0
  29. mv_diffusion_30/data/fixed_poses/nine_views/000_left_RT.txt +3 -0
  30. mv_diffusion_30/data/fixed_poses/nine_views/000_right_RT.txt +3 -0
  31. mv_diffusion_30/data/fixed_poses/nine_views/000_top_RT.txt +3 -0
  32. mv_diffusion_30/data/multiview_image_dataset.py +308 -0
  33. mv_diffusion_30/data/normal_utils.py +45 -0
  34. mv_diffusion_30/data/objaverse_dataset.py +1359 -0
  35. mv_diffusion_30/data/single_image_dataset.py +337 -0
  36. mv_diffusion_30/models/transformer_mv2d.py +1093 -0
  37. mv_diffusion_30/models/unet_mv2d_blocks.py +922 -0
  38. mv_diffusion_30/models/unet_mv2d_condition.py +1498 -0
  39. mv_diffusion_30/pipelines/pipeline_mvdiffusion_image.py +555 -0
  40. requirements.txt +16 -7
ckpts/unet/diffusion_pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:f29b1dbdda72c7187e012f2c86de212ca0f6dd60d27840559b674ef507080ecf
- size 3445026909
+ oid sha256:25a338df9e3e913ac9de83fb6d1585ea01031dba03434e2adc32237127fddbab
+ size 3643509774
configs/mvdiffusion-joint-plus.yaml ADDED
@@ -0,0 +1,50 @@
+ pretrained_model_name_or_path: 'lambdalabs/sd-image-variations-diffusers' # or './ckpts'
+ pretrained_unet_path: '/mvfs/workspace/code/mv_proj/outputs-v6/stage3_w_pretrain_single_crop/unet-20000/unet'
+ revision: null
+ validation_dataset:
+   root_dir: "example_images" # the folder path stores testing images
+   num_views: 6
+   bg_color: 'white'
+   img_wh: [256, 256]
+   num_validation_samples: 1000
+   crop_size: 192
+   filepaths: ['owl.png']
+   cam_types: ['ortho']
+   load_cam_type: true
+
+ save_dir: 'outputs-inference/'
+
+ pred_type: 'joint_color_normal'
+ seed: 33
+ validation_batch_size: 1
+ dataloader_num_workers: 64
+
+ local_rank: -1
+
+ pipe_kwargs:
+   camera_embedding_type: 'e_de_da_sincos'
+   num_views: 6
+   pred_type: 'joint_color_normal'
+
+ validation_guidance_scales: [2.0]
+ pipe_validation_kwargs:
+   eta: 1.0
+ validation_grid_nrow: 6
+
+ unet_from_pretrained_kwargs:
+   camera_embedding_type: 'e_de_da_sincos'
+   projection_class_embeddings_input_dim: 14
+   num_views: 6
+   sample_size: 32
+   cd_attention_mid: true
+   zero_init_conv_in: false
+   zero_init_camera_projection: false
+   multiview_attention: true
+   sparse_mv_attention: false
+   mvcd_attention: false
+
+ num_views: 6
+ camera_embedding_type: 'e_de_da_sincos'
+ load_task: true
+
+ enable_xformers_memory_efficient_attention: False
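Editor's note: a minimal sketch of how this config is consumed, following the `load_config` pattern that `gradio_app.py` switches to in this commit (run from the repo root; `utils.misc.load_config` is the repo's own helper, and the structured-schema merge shown in the comments lives in `gradio_app.py`):

```python
# Illustrative only: mirrors the config-loading pattern used by gradio_app.py.
from utils.misc import load_config   # repo helper referenced in gradio_app.py

cfg = load_config("./configs/mvdiffusion-joint-plus.yaml")
# gradio_app.py then validates the raw YAML against its structured TestConfig:
#   schema = OmegaConf.structured(TestConfig)
#   cfg = OmegaConf.merge(schema, cfg)
print(cfg.pretrained_model_name_or_path)          # 'lambdalabs/sd-image-variations-diffusers'
print(cfg.num_views, cfg.pipe_kwargs.pred_type)   # 6 'joint_color_normal'
```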
example_images/000-chair.jpg DELETED
Binary file (65.9 kB)
example_images/001-bananaman.jpg DELETED
Binary file (83.4 kB)
example_images/25-251458_powder-puff-girls-the-powerpuff-girls-cartoon-characters.png DELETED
Binary file (277 kB)
example_images/blue_dragon2.png ADDED
example_images/cartoon_dinosaur.png ADDED
example_images/cat.png DELETED
Binary file (66.2 kB)
example_images/chair_wood.jpg ADDED
example_images/chili.png DELETED
Binary file (18.2 kB)
example_images/dragon2.png ADDED
example_images/fox3.png DELETED
Binary file (358 kB)
example_images/generated_1715763329_frame0.png ADDED
example_images/jelly.png ADDED
example_images/man-head.jpeg DELETED
Binary file (5.7 kB)
example_images/milk.png DELETED
Binary file (28.3 kB)
example_images/mushroom_teapot.jpg ADDED
example_images/red_dragon3.png ADDED
example_images/turtle_ortho.png ADDED
gradio_app.py CHANGED
@@ -23,9 +23,9 @@ from typing import Dict, Optional, Tuple, List
  from dataclasses import dataclass
  import huggingface_hub
  from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection
- from mvdiffusion.models.unet_mv2d_condition import UNetMV2DConditionModel
- from mvdiffusion.data.single_image_dataset import SingleImageDataset as MVDiffusionDataset
- from mvdiffusion.pipelines.pipeline_mvdiffusion_image import MVDiffusionImagePipeline
+ from mv_diffusion_30.models.unet_mv2d_condition import UNetMV2DConditionModel
+ from mv_diffusion_30.data.single_image_dataset import SingleImageDataset as MVDiffusionDataset
+ from mv_diffusion_30.pipelines.pipeline_mvdiffusion_image import MVDiffusionImagePipeline
  from diffusers import AutoencoderKL, DDPMScheduler, DDIMScheduler
  from einops import rearrange
  import numpy as np
@@ -51,6 +51,7 @@ Generate consistent multi-view normals maps and color images.
  <div>
  The demo does not include the mesh reconstruction part, please visit <a href="https://github.com/xxlong0/Wonder3D/">our github repo</a> to get a textured mesh.
  </div>
+ <span style="font-weight: bold; color: #d9534f;">- 2024.11.5 We shift our ckpt to a more powerful model [Wonder3D_Plus] that supports both orthogonal and perspective camera settings and further improves generalizability.</span>
  '''
  _GPU_ID = 0
 
@@ -147,7 +148,7 @@ def load_wonder3d_pipeline(cfg):
  feature_extractor = CLIPImageProcessor.from_pretrained(cfg.pretrained_model_name_or_path, subfolder="feature_extractor", revision=cfg.revision)
  vae = AutoencoderKL.from_pretrained(cfg.pretrained_model_name_or_path, subfolder="vae", revision=cfg.revision)
  unet = UNetMV2DConditionModel.from_pretrained_2d(cfg.pretrained_unet_path, subfolder="unet", revision=cfg.revision, **cfg.unet_from_pretrained_kwargs)
- unet.enable_xformers_memory_efficient_attention()
+ # unet.enable_xformers_memory_efficient_attention()
 
  # Move text_encode and vae to gpu and cast to weight_dtype
  image_encoder.to(dtype=weight_dtype)
@@ -165,24 +166,28 @@
  # sys.main_lock = threading.Lock()
  return pipeline
 
- from mvdiffusion.data.single_image_dataset import SingleImageDataset
- def prepare_data(single_image, crop_size):
+ from mv_diffusion_30.data.single_image_dataset import SingleImageDataset
+ def prepare_data(single_image, crop_size, input_camera_type):
  dataset = SingleImageDataset(
  root_dir = None,
  num_views = 6,
  img_wh=[256, 256],
  bg_color='white',
  crop_size=crop_size,
- single_image=single_image
+ single_image=single_image,
+ load_cam_type=True,
+ cam_types=[input_camera_type]
  )
  return dataset[0]
 
 
- def run_pipeline(pipeline, cfg, single_image, guidance_scale, steps, seed, crop_size):
+ def run_pipeline(pipeline, cfg, single_image, guidance_scale, steps, seed, crop_size, input_camera_type):
  import pdb
  # pdb.set_trace()
 
- batch = prepare_data(single_image, crop_size)
+ batch = prepare_data(single_image, crop_size, input_camera_type)
 
  pipeline.set_progress_bar_config(disable=True)
  seed = int(seed)
@@ -249,13 +254,14 @@ class TestConfig:
 
  cond_on_normals: bool
  cond_on_colors: bool
+ load_task: bool
 
 
  def run_demo():
  from utils.misc import load_config
  from omegaconf import OmegaConf
  # parse YAML config to OmegaConf
- cfg = load_config("./configs/mvdiffusion-joint-ortho-6views.yaml")
+ cfg = load_config("./configs/mvdiffusion-joint-plus.yaml")
  # print(cfg)
  schema = OmegaConf.structured(TestConfig)
  cfg = OmegaConf.merge(schema, cfg)
@@ -307,7 +313,7 @@ def run_demo():
  output_processing = gr.CheckboxGroup(['Background Removal'], label='Output Image Postprocessing', value=[])
  with gr.Row():
  with gr.Column():
- scale_slider = gr.Slider(1, 5, value=3, step=1,
+ scale_slider = gr.Slider(1, 5, value=2, step=1,
  label='Classifier Free Guidance Scale')
  with gr.Column():
  steps_slider = gr.Slider(15, 100, value=50, step=1,
@@ -317,6 +323,14 @@
  seed = gr.Number(42, label='Seed')
  with gr.Column():
  crop_size = gr.Number(192, label='Crop size')
+ with gr.Row():
+ camera_type = gr.Radio(
+ choices=[("Orthogonal Camera", "ortho"), ("Perspective Camera", "persp")],
+ value="ortho",
+ label="Camera Type"
+ )
+
+
  # crop_size = 192
  run_btn = gr.Button('Generate', variant='primary', interactive=True)
  with gr.Row():
@@ -343,7 +357,7 @@
  inputs=[input_image, input_processing],
  outputs=[processed_image_highres, processed_image], queue=True
  ).success(fn=partial(run_pipeline, pipeline, cfg),
- inputs=[processed_image_highres, scale_slider, steps_slider, seed, crop_size],
+ inputs=[processed_image_highres, scale_slider, steps_slider, seed, crop_size, camera_type],
  outputs=[view_1, view_2, view_3, view_4, view_5, view_6,
  normal_1, normal_2, normal_3, normal_4, normal_5, normal_6,
  view_gallery, normal_gallery]
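Editor's note: a minimal sketch of what the updated `prepare_data` path builds once the new camera-type radio value is threaded through (illustrative only; the example image path is an assumption, and `SingleImageDataset` must accept the `load_cam_type`/`cam_types` arguments introduced in this commit):

```python
# Illustrative sketch of the updated prepare_data() call in gradio_app.py.
from PIL import Image
from mv_diffusion_30.data.single_image_dataset import SingleImageDataset

single_image = Image.open("example_images/turtle_ortho.png")   # assumed RGBA example image

dataset = SingleImageDataset(
    root_dir=None,
    num_views=6,
    img_wh=[256, 256],
    bg_color='white',
    crop_size=192,
    single_image=single_image,
    load_cam_type=True,
    cam_types=['ortho'],   # 'persp' when the "Perspective Camera" radio button is selected
)
batch = dataset[0]         # dict of view tensors and camera/task embeddings fed to run_pipeline
```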
mv_diffusion_30/data/depth_utils.py ADDED
@@ -0,0 +1,126 @@
+ import matplotlib
+ import numpy as np
+ import torch
+
+ def colorize_depth_maps(
+     depth_map, min_depth, max_depth, cmap="Spectral", valid_mask=None
+ ):
+     """
+     Colorize depth maps.
+     """
+     assert len(depth_map.shape) >= 2, "Invalid dimension"
+
+     if isinstance(depth_map, torch.Tensor):
+         depth = depth_map.detach().squeeze().numpy()
+     elif isinstance(depth_map, np.ndarray):
+         depth = depth_map.copy().squeeze()
+     # reshape to [ (B,) H, W ]
+     if depth.ndim < 3:
+         depth = depth[np.newaxis, :, :]
+
+     # colorize
+     cm = matplotlib.colormaps[cmap]
+     depth = ((depth - min_depth) / (max_depth - min_depth)).clip(0, 1)
+     img_colored_np = cm(depth, bytes=False)[:, :, :, 0:3]  # value from 0 to 1
+     img_colored_np = np.rollaxis(img_colored_np, 3, 1)
+
+     if valid_mask is not None:
+         if isinstance(depth_map, torch.Tensor):
+             valid_mask = valid_mask.detach().numpy()
+         valid_mask = valid_mask.squeeze()  # [H, W] or [B, H, W]
+         if valid_mask.ndim < 3:
+             valid_mask = valid_mask[np.newaxis, np.newaxis, :, :]
+         else:
+             valid_mask = valid_mask[:, np.newaxis, :, :]
+         valid_mask = np.repeat(valid_mask, 3, axis=1)
+         img_colored_np[~valid_mask] = 0
+
+     if isinstance(depth_map, torch.Tensor):
+         img_colored = torch.from_numpy(img_colored_np).float()
+     elif isinstance(depth_map, np.ndarray):
+         img_colored = img_colored_np
+
+     return img_colored
+
+
+ def scale_depth_to_model(depth, camera_type='ortho'):
+     """
+     Scale depth from the original range.
+     """
+     assert camera_type == 'ortho' or camera_type == 'persp'
+     w, h = depth.shape
+
+     if camera_type == 'ortho':
+         original_min = 9000
+         original_max = 17000
+         target_min = 2000
+         target_max = 62000
+
+         mask = depth != 0
+         # Scale depth to [0, 1]
+         depth_normalized = np.zeros([w, h])
+         depth_normalized[mask] = (depth[mask] - original_min) / (original_max - original_min)
+
+         # Scale depth to [2000, 62000]
+         scaled_depth = np.zeros([w, h])
+         scaled_depth[mask] = depth_normalized[mask] * (target_max - target_min) + target_min
+
+     else:
+         original_min = 4000
+         original_max = 13000
+         target_min = 2000
+         target_max = 62000
+
+         mask = depth != 0
+         # Scale depth to [0, 1]
+         depth_normalized = np.zeros([w, h])
+         depth_normalized[mask] = (depth[mask] - original_min) / (original_max - original_min)
+
+         # Scale depth to [2000, 62000]
+         scaled_depth = np.zeros([w, h])
+         scaled_depth[mask] = depth_normalized[mask] * (target_max - target_min) + target_min
+
+     scaled_depth[scaled_depth > 62000] = 0
+     scaled_depth = scaled_depth / 65535.  # [0, 1]
+
+     return scaled_depth
+
+ def rescale_depth_to_world(scaled_depth, camera_type='ortho'):
+     """
+     Rescale depth from the scaled range back to the original range.
+     """
+     assert camera_type == 'ortho' or camera_type == 'persp'
+     scaled_depth = scaled_depth * 65535.
+     w, h = scaled_depth.shape
+
+     if camera_type == 'ortho':
+         original_min = 9000
+         original_max = 17000
+         target_min = 2000
+         target_max = 62000
+
+         mask = scaled_depth != 0
+         rescaled_depth_norm = np.zeros([w, h])
+         # Rescale depth to [0, 1]
+         rescaled_depth_norm[mask] = (scaled_depth[mask] - target_min) / (target_max - target_min)
+
+         # Rescale depth to [9000, 17000]
+         rescaled_depth = np.zeros([w, h])
+         rescaled_depth[mask] = rescaled_depth_norm[mask] * (original_max - original_min) + original_min
+
+     else:
+         original_min = 4000
+         original_max = 13000
+         target_min = 2000
+         target_max = 62000
+
+         mask = scaled_depth != 0
+         rescaled_depth_norm = np.zeros([w, h])
+         # Rescale depth to [0, 1]
+         rescaled_depth_norm[mask] = (scaled_depth[mask] - target_min) / (target_max - target_min)
+
+         # Rescale depth to [4000, 13000]
+         rescaled_depth = np.zeros([w, h])
+         rescaled_depth[mask] = rescaled_depth_norm[mask] * (original_max - original_min) + original_min
+
+     return rescaled_depth
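Editor's note: a quick round-trip sanity check on the ranges above, using a toy array (illustrative only; the import path assumes the repo layout added in this commit):

```python
# Illustrative round trip: raw depth -> model range [0, 1] -> back to raw units.
import numpy as np
from mv_diffusion_30.data.depth_utils import scale_depth_to_model, rescale_depth_to_world

depth = np.zeros((256, 256), dtype=np.float32)
depth[64:192, 64:192] = 12000.0   # toy foreground depth inside the ortho range [9000, 17000]

scaled = scale_depth_to_model(depth, camera_type='ortho')        # foreground ~0.374, background 0
restored = rescale_depth_to_world(scaled, camera_type='ortho')   # foreground back to ~12000
assert np.allclose(restored[depth != 0], 12000.0, atol=1.0)
```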
mv_diffusion_30/data/fixed_poses/nine_views.zip ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6a30afc8a8c757429716f3be7ee58e7a9a5e0fb5ec5cb4d106bc04e43550ac2b
3
+ size 7385
mv_diffusion_30/data/fixed_poses/nine_views/000_back_RT.txt ADDED
@@ -0,0 +1,3 @@
1
+ -5.266582965850830078e-01 7.410295009613037109e-01 -4.165407419204711914e-01 -5.960464477539062500e-08
2
+ 5.865638996738198330e-08 4.900035560131072998e-01 8.717204332351684570e-01 -9.462351613365171943e-08
3
+ 8.500770330429077148e-01 4.590988159179687500e-01 -2.580644786357879639e-01 -1.300000071525573730e+00
mv_diffusion_30/data/fixed_poses/nine_views/000_back_left_RT.txt ADDED
@@ -0,0 +1,3 @@
1
+ -9.734988808631896973e-01 1.993551850318908691e-01 -1.120596975088119507e-01 -1.713633537292480469e-07
2
+ 3.790224578636980368e-09 4.900034964084625244e-01 8.717204928398132324e-01 1.772203575001185527e-07
3
+ 2.286916375160217285e-01 8.486189246177673340e-01 -4.770178496837615967e-01 -1.838477611541748047e+00
mv_diffusion_30/data/fixed_poses/nine_views/000_back_right_RT.txt ADDED
@@ -0,0 +1,3 @@
1
+ 2.286914736032485962e-01 8.486190438270568848e-01 -4.770178198814392090e-01 1.564621925354003906e-07
2
+ -3.417914484771245043e-08 4.900034070014953613e-01 8.717205524444580078e-01 -7.293811421504869941e-08
3
+ 9.734990000724792480e-01 -1.993550658226013184e-01 1.120596155524253845e-01 -1.838477969169616699e+00
mv_diffusion_30/data/fixed_poses/nine_views/000_front_RT.txt ADDED
@@ -0,0 +1,3 @@
1
+ 5.266583561897277832e-01 -7.410295009613037109e-01 4.165407419204711914e-01 0.000000000000000000e+00
2
+ 5.865638996738198330e-08 4.900035560131072998e-01 8.717204332351684570e-01 9.462351613365171943e-08
3
+ -8.500770330429077148e-01 -4.590988159179687500e-01 2.580645382404327393e-01 -1.300000071525573730e+00
mv_diffusion_30/data/fixed_poses/nine_views/000_front_left_RT.txt ADDED
@@ -0,0 +1,3 @@
1
+ -2.286916971206665039e-01 -8.486189842224121094e-01 4.770179092884063721e-01 -2.458691596984863281e-07
2
+ 9.085837859856837895e-09 4.900034666061401367e-01 8.717205524444580078e-01 1.205695667749751010e-07
3
+ -9.734990000724792480e-01 1.993551701307296753e-01 -1.120597645640373230e-01 -1.838477969169616699e+00
mv_diffusion_30/data/fixed_poses/nine_views/000_front_right_RT.txt ADDED
@@ -0,0 +1,3 @@
1
+ 9.734989404678344727e-01 -1.993551850318908691e-01 1.120596975088119507e-01 -1.415610313415527344e-07
2
+ 3.790224578636980368e-09 4.900034964084625244e-01 8.717204928398132324e-01 -1.772203575001185527e-07
3
+ -2.286916375160217285e-01 -8.486189246177673340e-01 4.770178794860839844e-01 -1.838477611541748047e+00
mv_diffusion_30/data/fixed_poses/nine_views/000_left_RT.txt ADDED
@@ -0,0 +1,3 @@
1
+ -8.500771522521972656e-01 -4.590989053249359131e-01 2.580644488334655762e-01 0.000000000000000000e+00
2
+ -4.257411134744870651e-08 4.900034964084625244e-01 8.717204928398132324e-01 9.006067358541258727e-08
3
+ -5.266583561897277832e-01 7.410295605659484863e-01 -4.165408313274383545e-01 -1.300000071525573730e+00
mv_diffusion_30/data/fixed_poses/nine_views/000_right_RT.txt ADDED
@@ -0,0 +1,3 @@
1
+ 8.500770330429077148e-01 4.590989053249359131e-01 -2.580644488334655762e-01 5.960464477539062500e-08
2
+ -4.257411134744870651e-08 4.900034964084625244e-01 8.717204928398132324e-01 -9.006067358541258727e-08
3
+ 5.266583561897277832e-01 -7.410295605659484863e-01 4.165407419204711914e-01 -1.300000071525573730e+00
mv_diffusion_30/data/fixed_poses/nine_views/000_top_RT.txt ADDED
@@ -0,0 +1,3 @@
1
+ 9.958608150482177734e-01 7.923202216625213623e-02 -4.453715682029724121e-02 -3.098167056236889039e-09
2
+ -9.089154005050659180e-02 8.681122064590454102e-01 -4.879753291606903076e-01 5.784738377201392723e-08
3
+ -2.028124157504862524e-08 4.900035560131072998e-01 8.717204332351684570e-01 -1.300000071525573730e+00
mv_diffusion_30/data/multiview_image_dataset.py ADDED
@@ -0,0 +1,308 @@
1
+ from typing import Dict
2
+ import numpy as np
3
+ from omegaconf import DictConfig, ListConfig
4
+ import torch
5
+ from torch.utils.data import Dataset
6
+ from pathlib import Path
7
+ import json
8
+ from PIL import Image
9
+ from torchvision import transforms
10
+ from einops import rearrange
11
+ from typing import Literal, Tuple, Optional, Any
12
+ import cv2
13
+ import random
14
+
15
+ import json
16
+ import os, sys
17
+ import math
18
+
19
+ from glob import glob
20
+
21
+ import PIL.Image
22
+ from .normal_utils import trans_normal, normal2img, img2normal
23
+ import pdb
24
+
25
+
26
+ import cv2
27
+ import numpy as np
28
+
29
+ def add_margin(pil_img, color=0, size=256):
30
+ width, height = pil_img.size
31
+ result = Image.new(pil_img.mode, (size, size), color)
32
+ result.paste(pil_img, ((size - width) // 2, (size - height) // 2))
33
+ return result
34
+
35
+ def scale_and_place_object(image, scale_factor):
36
+ assert np.shape(image)[-1]==4 # RGBA
37
+
38
+ # Extract the alpha channel (transparency) and the object (RGB channels)
39
+ alpha_channel = image[:, :, 3]
40
+
41
+ # Find the bounding box coordinates of the object
42
+ coords = cv2.findNonZero(alpha_channel)
43
+ x, y, width, height = cv2.boundingRect(coords)
44
+
45
+ # Calculate the scale factor for resizing
46
+ original_height, original_width = image.shape[:2]
47
+
48
+ if width > height:
49
+ size = width
50
+ original_size = original_width
51
+ else:
52
+ size = height
53
+ original_size = original_height
54
+
55
+ scale_factor = min(scale_factor, size / (original_size+0.0))
56
+
57
+ new_size = scale_factor * original_size
58
+ scale_factor = new_size / size
59
+
60
+ # Calculate the new size based on the scale factor
61
+ new_width = int(width * scale_factor)
62
+ new_height = int(height * scale_factor)
63
+
64
+ center_x = original_width // 2
65
+ center_y = original_height // 2
66
+
67
+ paste_x = center_x - (new_width // 2)
68
+ paste_y = center_y - (new_height // 2)
69
+
70
+ # Resize the object (RGB channels) to the new size
71
+ rescaled_object = cv2.resize(image[y:y+height, x:x+width], (new_width, new_height))
72
+
73
+ # Create a new RGBA image with the resized image
74
+ new_image = np.zeros((original_height, original_width, 4), dtype=np.uint8)
75
+
76
+ new_image[paste_y:paste_y + new_height, paste_x:paste_x + new_width] = rescaled_object
77
+
78
+ return new_image
79
+
80
+ class InferenceImageDataset(Dataset):
81
+ def __init__(self,
82
+ root_dir: str,
83
+ num_views: int,
84
+ img_wh: Tuple[int, int],
85
+ bg_color: str,
86
+ crop_size: int = 224,
87
+ single_image: Optional[PIL.Image.Image] = None,
88
+ num_validation_samples: Optional[int] = None,
89
+ filepaths: Optional[list] = None,
90
+ cam_types: Optional[list] = None,
91
+ cond_type: Optional[str] = None,
92
+ load_cam_type: Optional[bool] = True
93
+ ) -> None:
94
+ """Create a dataset from a folder of images.
95
+ If you pass in a root directory it will be searched for images
96
+ ending in ext (ext can be a list)
97
+ """
98
+ self.root_dir = root_dir
99
+ self.num_views = num_views
100
+ self.img_wh = img_wh
101
+ self.crop_size = crop_size
102
+ self.bg_color = bg_color
103
+ self.cond_type = cond_type
104
+ self.load_cam_type = load_cam_type
105
+ self.cam_types = cam_types
106
+
107
+ if self.num_views == 4:
108
+ self.view_types = ['front', 'right', 'back', 'left']
109
+ elif self.num_views == 5:
110
+ self.view_types = ['front', 'front_right', 'right', 'back', 'left']
111
+ elif self.num_views == 6:
112
+ self.view_types = ['front', 'front_right', 'right', 'back', 'left', 'front_left']
113
+
114
+ self.fix_cam_pose_dir = "./mvdiffusion/data/fixed_poses/nine_views"
115
+
116
+ self.fix_cam_poses = self.load_fixed_poses() # world2cam matrix
117
+
118
+
119
+
120
+ if filepaths is None:
121
+ # Get a list of all files in the directory
122
+ file_list = os.listdir(self.root_dir)
123
+ self.cam_types = ['ortho'] * len(file_list) + ['persp']* len(file_list)
124
+ file_list = file_list * 2
125
+ else:
126
+ file_list = filepaths
127
+ print(filepaths, root_dir)
128
+ # Filter the files that end with .png or .jpg
129
+ self.file_list = [file for file in file_list]
130
+
131
+ self.bg_color = self.get_bg_color()
132
+
133
+
134
+
135
+
136
+ def __len__(self):
137
+ return len(self.file_list)
138
+
139
+ def load_fixed_poses(self):
140
+ poses = {}
141
+ for face in self.view_types:
142
+ RT = np.loadtxt(os.path.join(self.fix_cam_pose_dir,'%03d_%s_RT.txt'%(0, face)))
143
+ poses[face] = RT
144
+
145
+ return poses
146
+
147
+ def cartesian_to_spherical(self, xyz):
148
+ ptsnew = np.hstack((xyz, np.zeros(xyz.shape)))
149
+ xy = xyz[:,0]**2 + xyz[:,1]**2
150
+ z = np.sqrt(xy + xyz[:,2]**2)
151
+ theta = np.arctan2(np.sqrt(xy), xyz[:,2]) # for elevation angle defined from Z-axis down
152
+ #ptsnew[:,4] = np.arctan2(xyz[:,2], np.sqrt(xy)) # for elevation angle defined from XY-plane up
153
+ azimuth = np.arctan2(xyz[:,1], xyz[:,0])
154
+ return np.array([theta, azimuth, z])
155
+
156
+ def get_T(self, target_RT, cond_RT):
157
+ R, T = target_RT[:3, :3], target_RT[:, -1]
158
+ T_target = -R.T @ T # change to cam2world
159
+
160
+ R, T = cond_RT[:3, :3], cond_RT[:, -1]
161
+ T_cond = -R.T @ T
162
+
163
+ theta_cond, azimuth_cond, z_cond = self.cartesian_to_spherical(T_cond[None, :])
164
+ theta_target, azimuth_target, z_target = self.cartesian_to_spherical(T_target[None, :])
165
+
166
+ d_theta = theta_target - theta_cond
167
+ d_azimuth = (azimuth_target - azimuth_cond) % (2 * math.pi)
168
+ d_z = z_target - z_cond
169
+
170
+ # d_T = torch.tensor([d_theta.item(), math.sin(d_azimuth.item()), math.cos(d_azimuth.item()), d_z.item()])
171
+ return d_theta, d_azimuth
172
+
173
+ def get_bg_color(self):
174
+ if self.bg_color == 'white':
175
+ bg_color = np.array([1., 1., 1.], dtype=np.float32)
176
+ elif self.bg_color == 'black':
177
+ bg_color = np.array([0., 0., 0.], dtype=np.float32)
178
+ elif self.bg_color == 'gray':
179
+ bg_color = np.array([0.5, 0.5, 0.5], dtype=np.float32)
180
+ elif self.bg_color == 'random':
181
+ bg_color = np.random.rand(3)
182
+ elif isinstance(self.bg_color, float):
183
+ bg_color = np.array([self.bg_color] * 3, dtype=np.float32)
184
+ else:
185
+ raise NotImplementedError
186
+ return bg_color
187
+
188
+
189
+ def load_image(self, img_path, bg_color, return_type='pt', Imagefile=None):
190
+ # pil always returns uint8
191
+ if Imagefile is None:
192
+ image_input = Image.open(img_path)
193
+ else:
194
+ image_input = Imagefile
195
+ image_size = self.img_wh[0]
196
+
197
+ # if self.crop_size!=-1:
198
+ # alpha_np = np.asarray(image_input)[:, :, 3]
199
+ # coords = np.stack(np.nonzero(alpha_np), 1)[:, (1, 0)]
200
+ # min_x, min_y = np.min(coords, 0)
201
+ # max_x, max_y = np.max(coords, 0)
202
+ # ref_img_ = image_input.crop((min_x, min_y, max_x, max_y))
203
+ # h, w = ref_img_.height, ref_img_.width
204
+ # scale = self.crop_size / max(h, w)
205
+ # h_, w_ = int(scale * h), int(scale * w)
206
+ # ref_img_ = ref_img_.resize((w_, h_))
207
+ # image_input = add_margin(ref_img_, size=image_size)
208
+ # else:
209
+ # image_input = add_margin(image_input, size=max(image_input.height, image_input.width))
210
+ # image_input = image_input.resize((image_size, image_size))
211
+
212
+ # img = scale_and_place_object(img, self.scale_ratio)
213
+ img = np.array(image_input)
214
+ img = img.astype(np.float32) / 255. # [0, 1]
215
+ assert img.shape[-1] == 4 # RGBA
216
+
217
+ alpha = img[...,3:4]
218
+ img = img[...,:3] * alpha + bg_color * (1 - alpha)
219
+
220
+ if return_type == "np":
221
+ pass
222
+ elif return_type == "pt":
223
+ img = torch.from_numpy(img)
224
+ alpha = torch.from_numpy(alpha)
225
+ else:
226
+ raise NotImplementedError
227
+
228
+ return img, alpha
229
+
230
+
231
+ def __len__(self):
232
+ return len(self.file_list)
233
+
234
+ def __getitem__(self, index):
235
+
236
+ # image = self.all_images[index%len(self.all_images)]
237
+ # alpha = self.all_alphas[index%len(self.all_images)]
238
+ cam_type = self.cam_types[index%len(self.file_list)]
239
+ if self.file_list is not None:
240
+ filename = self.file_list[index%len(self.file_list)].replace(".png", "")
241
+ else:
242
+ filename = 'null'
243
+
244
+ cond_w2c = self.fix_cam_poses['front']
245
+
246
+ tgt_w2cs = [self.fix_cam_poses[view] for view in self.view_types]
247
+
248
+ elevations = []
249
+ azimuths = []
250
+
251
+ img_tensors_in = []
252
+ for view in self.view_types:
253
+ img_path = os.path.join(self.root_dir, filename, cam_type,"color_000_%s.png" % (view))
254
+ img_tensor, alpha = self.load_image(img_path, self.bg_color, return_type="pt")
255
+ img_tensor = img_tensor.permute(2, 0, 1)
256
+ img_tensors_in.append(img_tensor)
257
+
258
+ alpha_tensors_in = [
259
+ alpha.permute(2, 0, 1)
260
+ ] * self.num_views
261
+
262
+ for view, tgt_w2c in zip(self.view_types, tgt_w2cs):
263
+ # evelations, azimuths
264
+ elevation, azimuth = self.get_T(tgt_w2c, cond_w2c)
265
+ elevations.append(elevation)
266
+ azimuths.append(azimuth)
267
+
268
+ img_tensors_in = torch.stack(img_tensors_in, dim=0).float() # (Nv, 3, H, W)
269
+ # alpha_tensors_in = torch.stack(alpha_tensors_in, dim=0).float() # (Nv, 3, H, W)
270
+
271
+ elevations = torch.as_tensor(elevations).float().squeeze(1)
272
+ azimuths = torch.as_tensor(azimuths).float().squeeze(1)
273
+ elevations_cond = torch.as_tensor([0] * self.num_views).float()
274
+
275
+ normal_class = torch.tensor([1, 0]).float()
276
+ normal_task_embeddings = torch.stack([normal_class] * self.num_views, dim=0) # (Nv, 2)
277
+ color_class = torch.tensor([0, 1]).float()
278
+ depth_task_embeddings = torch.stack([color_class] * self.num_views, dim=0) # (Nv, 2)
279
+
280
+ camera_embeddings = torch.stack([elevations_cond, elevations, azimuths], dim=-1) # (Nv, 3)
281
+
282
+ if cam_type == 'ortho':
283
+ cam_type_emb = torch.tensor([0, 1]).expand(self.num_views, -1)
284
+ else:
285
+ cam_type_emb = torch.tensor([1, 0]).expand(self.num_views, -1)
286
+
287
+ if self.load_cam_type:
288
+ camera_embeddings = torch.cat((camera_embeddings, cam_type_emb), dim=-1) # (Nv, 5)
289
+
290
+ out = {
291
+ 'elevations_cond': elevations_cond,
292
+ 'elevations_cond_deg': torch.rad2deg(elevations_cond),
293
+ 'elevations': elevations,
294
+ 'azimuths': azimuths,
295
+ 'elevations_deg': torch.rad2deg(elevations),
296
+ 'azimuths_deg': torch.rad2deg(azimuths),
297
+ 'imgs_in': img_tensors_in,
298
+ 'alphas': alpha_tensors_in,
299
+ 'camera_embeddings': camera_embeddings,
300
+ 'normal_task_embeddings': normal_task_embeddings,
301
+ 'depth_task_embeddings': depth_task_embeddings,
302
+ 'filename': filename,
303
+ 'cam_type': cam_type
304
+ }
305
+
306
+ return out
307
+
308
+
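Editor's note: a hedged usage sketch for the new InferenceImageDataset (illustrative only; the root directory is an assumption, the per-object layout `<root>/<name>/<cam_type>/color_000_<view>.png` is taken from `__getitem__` above, and the fixed-pose files must be present at the path the class expects):

```python
# Illustrative sketch of iterating the new InferenceImageDataset.
from torch.utils.data import DataLoader
from mv_diffusion_30.data.multiview_image_dataset import InferenceImageDataset

dataset = InferenceImageDataset(
    root_dir="outputs-inference",        # assumed folder of per-object renderings
    num_views=6,
    img_wh=(256, 256),
    bg_color='white',
    filepaths=['owl.png'],               # matches the filepaths entry in mvdiffusion-joint-plus.yaml
    cam_types=['ortho'],
    load_cam_type=True,
)
loader = DataLoader(dataset, batch_size=1, num_workers=0)
batch = next(iter(loader))
print(batch['imgs_in'].shape)            # torch.Size([1, 6, 3, 256, 256])
print(batch['camera_embeddings'].shape)  # torch.Size([1, 6, 5]) when load_cam_type is True
```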
mv_diffusion_30/data/normal_utils.py ADDED
@@ -0,0 +1,45 @@
+ import numpy as np
+
+ def camNormal2worldNormal(rot_c2w, camNormal):
+     H, W, _ = camNormal.shape
+     normal_img = np.matmul(rot_c2w[None, :, :], camNormal.reshape(-1, 3)[:, :, None]).reshape([H, W, 3])
+
+     return normal_img
+
+ def worldNormal2camNormal(rot_w2c, normal_map_world):
+     H, W, _ = normal_map_world.shape
+     # normal_img = np.matmul(rot_w2c[None, :, :], worldNormal.reshape(-1,3)[:, :, None]).reshape([H, W, 3])
+
+     # faster version
+     # Reshape the normal map into a 2D array where each row represents a normal vector
+     normal_map_flat = normal_map_world.reshape(-1, 3)
+
+     # Transform the normal vectors using the transformation matrix
+     normal_map_camera_flat = np.dot(normal_map_flat, rot_w2c.T)
+
+     # Reshape the transformed normal map back to its original shape
+     normal_map_camera = normal_map_camera_flat.reshape(normal_map_world.shape)
+
+     return normal_map_camera
+
+ def trans_normal(normal, RT_w2c, RT_w2c_target):
+
+     # normal_world = camNormal2worldNormal(np.linalg.inv(RT_w2c[:3,:3]), normal)
+     # normal_target_cam = worldNormal2camNormal(RT_w2c_target[:3,:3], normal_world)
+
+     relative_RT = np.matmul(RT_w2c_target[:3,:3], np.linalg.inv(RT_w2c[:3,:3]))
+     normal_target_cam = worldNormal2camNormal(relative_RT[:3,:3], normal)
+
+     return normal_target_cam
+
+ def img2normal(img):
+     return (img/255.)*2-1
+
+ def normal2img(normal):
+     return np.uint8((normal*0.5+0.5)*255)
+
+ def norm_normalize(normal, dim=-1):
+
+     normal = normal/(np.linalg.norm(normal, axis=dim, keepdims=True)+1e-6)
+
+     return normal
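Editor's note: a hedged sketch of how these helpers compose with the fixed poses added in this commit. The toy normal image and the pose file paths are assumptions; the rotation direction follows the definition of `trans_normal(normal, RT_w2c, RT_w2c_target)` above (from the frame of `RT_w2c` into the frame of `RT_w2c_target`).

```python
# Illustrative sketch: rotate a camera-space normal map from the 'right' view
# into the 'front' view's camera frame.
import numpy as np
from mv_diffusion_30.data.normal_utils import trans_normal, img2normal, normal2img, norm_normalize

RT_front = np.loadtxt("mv_diffusion_30/data/fixed_poses/nine_views/000_front_RT.txt")  # (3, 4) world2cam
RT_right = np.loadtxt("mv_diffusion_30/data/fixed_poses/nine_views/000_right_RT.txt")

normal_img = np.full((256, 256, 3), 200, dtype=np.uint8)     # toy normal image encoded in [0, 255]
normal_cam = norm_normalize(img2normal(normal_img))          # decode to unit vectors in [-1, 1]
normal_front = trans_normal(normal_cam, RT_right, RT_front)  # re-express in the 'front' camera frame
print(normal2img(normal_front).shape)                        # (256, 256, 3)
```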
mv_diffusion_30/data/objaverse_dataset.py ADDED
@@ -0,0 +1,1359 @@
1
+ from typing import Dict
2
+ import numpy as np
3
+ from omegaconf import DictConfig, ListConfig
4
+ import torch
5
+ from torch.utils.data import Dataset
6
+ from pathlib import Path
7
+ import json
8
+ from PIL import Image
9
+ from torchvision import transforms
10
+ from einops import rearrange
11
+ from typing import Literal, Tuple, Optional, Any
12
+ import cv2
13
+ import random
14
+
15
+ import json
16
+ import os, sys
17
+ import math
18
+
19
+ import PIL.Image
20
+ from .normal_utils import trans_normal, normal2img, img2normal
21
+ import pdb
22
+ from .depth_utils import scale_depth_to_model
23
+ import traceback
24
+
25
+
26
+ class ObjaverseDataset(Dataset):
27
+ def __init__(self,
28
+ root_dir_ortho: str,
29
+ root_dir_persp: str,
30
+ pred_ortho: bool,
31
+ pred_persp: bool,
32
+ num_views: int,
33
+ bg_color: Any,
34
+ img_wh: Tuple[int, int],
35
+ object_list: str,
36
+ groups_num: int=1,
37
+ validation: bool = False,
38
+ data_view_num: int = 6,
39
+ num_validation_samples: int = 64,
40
+ num_samples: Optional[int] = None,
41
+ invalid_list: Optional[str] = None,
42
+ trans_norm_system: bool = True, # if True, transform all normals map into the cam system of front view
43
+ augment_data: bool = False,
44
+ read_normal: bool = True,
45
+ read_color: bool = False,
46
+ read_depth: bool = False,
47
+ read_mask: bool = False,
48
+ pred_type: str = 'color',
49
+ suffix: str = 'png',
50
+ subscene_tag: int = 2,
51
+ load_cam_type: bool = False,
52
+ backup_scene: str = "0306b42594fb447ca574f597352d4b56",
53
+ ortho_crop_size: int = 360,
54
+ persp_crop_size: int = 440,
55
+ load_switcher: bool = True
56
+ ) -> None:
57
+ """Create a dataset from a folder of images.
58
+ If you pass in a root directory it will be searched for images
59
+ ending in ext (ext can be a list)
60
+ """
61
+ self.load_cam_type = load_cam_type
62
+ self.root_dir_ortho = Path(root_dir_ortho)
63
+ self.root_dir_persp = Path(root_dir_persp)
64
+ self.pred_ortho = pred_ortho
65
+ self.pred_persp = pred_persp
66
+ self.num_views = num_views
67
+ self.bg_color = bg_color
68
+ self.validation = validation
69
+ self.num_samples = num_samples
70
+ self.trans_norm_system = trans_norm_system
71
+ self.augment_data = augment_data
72
+ self.invalid_list = invalid_list
73
+ self.groups_num = groups_num
74
+ print("augment data: ", self.augment_data)
75
+ self.img_wh = img_wh
76
+ self.read_normal = read_normal
77
+ self.read_color = read_color
78
+ self.read_depth = read_depth
79
+ self.read_mask = read_mask
80
+ self.pred_type = pred_type # load type
81
+ self.suffix = suffix
82
+ self.subscene_tag = subscene_tag
83
+
84
+ self.view_types = ['front', 'front_right', 'right', 'back', 'left', 'front_left']
85
+ self.fix_cam_pose_dir = "./mvdiffusion/data/fixed_poses/nine_views"
86
+
87
+ self.fix_cam_poses = self.load_fixed_poses() # world2cam matrix
88
+ self.ortho_crop_size = ortho_crop_size
89
+ self.persp_crop_size = persp_crop_size
90
+ self.load_switcher = load_switcher
91
+
92
+ if object_list is not None:
93
+ with open(object_list) as f:
94
+ self.objects = json.load(f)
95
+ self.objects = [os.path.basename(o).replace(".glb", "") for o in self.objects]
96
+ else:
97
+ self.objects = os.listdir(self.root_dir)
98
+ self.objects = sorted(self.objects)
99
+
100
+ if self.invalid_list is not None:
101
+ with open(self.invalid_list) as f:
102
+ self.invalid_objects = json.load(f)
103
+ self.invalid_objects = [os.path.basename(o).replace(".glb", "") for o in self.invalid_objects]
104
+ else:
105
+ self.invalid_objects = []
106
+
107
+
108
+ self.all_objects = set(self.objects) - (set(self.invalid_objects) & set(self.objects))
109
+ self.all_objects = list(self.all_objects)
110
+
111
+ if not validation:
112
+ self.all_objects = self.all_objects[:-num_validation_samples]
113
+ else:
114
+ self.all_objects = self.all_objects[-num_validation_samples:]
115
+ if num_samples is not None:
116
+ self.all_objects = self.all_objects[:num_samples]
117
+
118
+ print("loading ", len(self.all_objects), " objects in the dataset")
119
+
120
+ if self.pred_type == 'color':
121
+ self.backup_data = self.__getitem_color__(0, backup_scene)
122
+ elif self.pred_type == 'normal_depth':
123
+ self.backup_data = self.__getitem_normal_depth__(0, backup_scene)
124
+ elif self.pred_type == 'mixed_rgb_normal_depth':
125
+ self.backup_data = self.__getitem_mixed__(0, backup_scene)
126
+ elif self.pred_type == 'mixed_color_normal':
127
+ self.backup_data = self.__getitem_image_normal_mixed__(0, backup_scene)
128
+ elif self.pred_type == 'mixed_rgb_noraml_mask':
129
+ self.backup_data = self.__getitem_mixed_rgb_noraml_mask__(0, backup_scene)
130
+ elif self.pred_type == 'joint_color_normal':
131
+ self.backup_data = self.__getitem_joint_rgb_noraml__(0, backup_scene)
132
+
133
+
134
+ def __len__(self):
135
+ return len(self.objects)*self.total_view
136
+
137
+ def load_fixed_poses(self):
138
+ poses = {}
139
+ for face in self.view_types:
140
+ RT = np.loadtxt(os.path.join(self.fix_cam_pose_dir,'%03d_%s_RT.txt'%(0, face)))
141
+ poses[face] = RT
142
+
143
+ return poses
144
+
145
+ def cartesian_to_spherical(self, xyz):
146
+ ptsnew = np.hstack((xyz, np.zeros(xyz.shape)))
147
+ xy = xyz[:,0]**2 + xyz[:,1]**2
148
+ z = np.sqrt(xy + xyz[:,2]**2)
149
+ theta = np.arctan2(np.sqrt(xy), xyz[:,2]) # for elevation angle defined from Z-axis down
150
+ #ptsnew[:,4] = np.arctan2(xyz[:,2], np.sqrt(xy)) # for elevation angle defined from XY-plane up
151
+ azimuth = np.arctan2(xyz[:,1], xyz[:,0])
152
+ return np.array([theta, azimuth, z])
153
+
154
+ def get_T(self, target_RT, cond_RT):
155
+ R, T = target_RT[:3, :3], target_RT[:, -1]
156
+ T_target = -R.T @ T # change to cam2world
157
+
158
+ R, T = cond_RT[:3, :3], cond_RT[:, -1]
159
+ T_cond = -R.T @ T
160
+
161
+ theta_cond, azimuth_cond, z_cond = self.cartesian_to_spherical(T_cond[None, :])
162
+ theta_target, azimuth_target, z_target = self.cartesian_to_spherical(T_target[None, :])
163
+
164
+ d_theta = theta_target - theta_cond
165
+ d_azimuth = (azimuth_target - azimuth_cond) % (2 * math.pi)
166
+ d_z = z_target - z_cond
167
+
168
+ # d_T = torch.tensor([d_theta.item(), math.sin(d_azimuth.item()), math.cos(d_azimuth.item()), d_z.item()])
169
+ return d_theta, d_azimuth
170
+
171
+ def get_bg_color(self):
172
+ if self.bg_color == 'white':
173
+ bg_color = np.array([1., 1., 1.], dtype=np.float32)
174
+ elif self.bg_color == 'black':
175
+ bg_color = np.array([0., 0., 0.], dtype=np.float32)
176
+ elif self.bg_color == 'gray':
177
+ bg_color = np.array([0.5, 0.5, 0.5], dtype=np.float32)
178
+ elif self.bg_color == 'random':
179
+ bg_color = np.random.rand(3)
180
+ elif self.bg_color == 'three_choices':
181
+ white = np.array([1., 1., 1.], dtype=np.float32)
182
+ black = np.array([0., 0., 0.], dtype=np.float32)
183
+ gray = np.array([0.5, 0.5, 0.5], dtype=np.float32)
184
+ bg_color = random.choice([white, black, gray])
185
+ elif isinstance(self.bg_color, float):
186
+ bg_color = np.array([self.bg_color] * 3, dtype=np.float32)
187
+ else:
188
+ raise NotImplementedError
189
+ return bg_color
190
+
191
+
192
+
193
+ def load_mask(self, img_path, return_type='np'):
194
+ # not using cv2 as may load in uint16 format
195
+ # img = cv2.imread(img_path, cv2.IMREAD_UNCHANGED) # [0, 255]
196
+ # img = cv2.resize(img, self.img_wh, interpolation=cv2.INTER_CUBIC)
197
+ # pil always returns uint8
198
+ img = np.array(Image.open(img_path).resize(self.img_wh))
199
+ img = np.float32(img > 0)
200
+
201
+ assert len(np.shape(img)) == 2
202
+
203
+ if return_type == "np":
204
+ pass
205
+ elif return_type == "pt":
206
+ img = torch.from_numpy(img)
207
+ else:
208
+ raise NotImplementedError
209
+
210
+ return img
211
+
212
+ def load_mask_from_rgba(self, img_path, camera_type):
213
+ img = Image.open(img_path)
214
+
215
+ if camera_type == 'ortho':
216
+ left = (img.width - self.ortho_crop_size) // 2
217
+ right = (img.width + self.ortho_crop_size) // 2
218
+ top = (img.height - self.ortho_crop_size) // 2
219
+ bottom = (img.height + self.ortho_crop_size) // 2
220
+ img = img.crop((left, top, right, bottom))
221
+ if camera_type == 'persp':
222
+ left = (img.width - self.persp_crop_size) // 2
223
+ right = (img.width + self.persp_crop_size) // 2
224
+ top = (img.height - self.persp_crop_size) // 2
225
+ bottom = (img.height + self.persp_crop_size) // 2
226
+ img = img.crop((left, top, right, bottom))
227
+
228
+ img = img.resize(self.img_wh)
229
+ img = np.array(img).astype(np.float32) / 255. # [0, 1]
230
+ assert img.shape[-1] == 4 # must RGBA
231
+
232
+ alpha = img[:, :, 3:]
233
+
234
+ if alpha.shape[-1] != 1:
235
+ alpha = alpha[:, :, None]
236
+
237
+ return alpha
238
+
239
+ def load_image(self, img_path, bg_color, alpha, return_type='np', camera_type=None, read_depth=False, center_crop_size=None):
240
+ # not using cv2 as may load in uint16 format
241
+ # img = cv2.imread(img_path, cv2.IMREAD_UNCHANGED) # [0, 255]
242
+ # img = cv2.resize(img, self.img_wh, interpolation=cv2.INTER_CUBIC)
243
+ # pil always returns uint8
244
+ img = Image.open(img_path)
245
+ if center_crop_size == None:
246
+ if camera_type == 'ortho':
247
+ left = (img.width - self.ortho_crop_size) // 2
248
+ right = (img.width + self.ortho_crop_size) // 2
249
+ top = (img.height - self.ortho_crop_size) // 2
250
+ bottom = (img.height + self.ortho_crop_size) // 2
251
+ img = img.crop((left, top, right, bottom))
252
+ if camera_type == 'persp':
253
+ left = (img.width - self.persp_crop_size) // 2
254
+ right = (img.width + self.persp_crop_size) // 2
255
+ top = (img.height - self.persp_crop_size) // 2
256
+ bottom = (img.height + self.persp_crop_size) // 2
257
+ img = img.crop((left, top, right, bottom))
258
+ else:
259
+ center_crop_size = min(center_crop_size, 512)
260
+ left = (img.width - center_crop_size) // 2
261
+ right = (img.width + center_crop_size) // 2
262
+ top = (img.height - center_crop_size) // 2
263
+ bottom = (img.height + center_crop_size) // 2
264
+ img = img.crop((left, top, right, bottom))
265
+
266
+ img = img.resize(self.img_wh)
267
+ img = np.array(img).astype(np.float32) / 255. # [0, 1]
268
+ assert img.shape[-1] == 3 or img.shape[-1] == 4 # RGB or RGBA
269
+
270
+ if alpha is None and img.shape[-1] == 4:
271
+ alpha = img[:, :, 3:]
272
+ img = img[:, :, :3]
273
+
274
+ if alpha.shape[-1] != 1:
275
+ alpha = alpha[:, :, None]
276
+
277
+ if read_depth:
278
+ bg_color = np.array([1., 1., 1.], dtype=np.float32)
279
+ img = img[...,:3] * alpha + bg_color * (1 - alpha)
280
+
281
+ if return_type == "np":
282
+ pass
283
+ elif return_type == "pt":
284
+ img = torch.from_numpy(img)
285
+ else:
286
+ raise NotImplementedError
287
+
288
+ return img
289
+
290
+ def load_depth(self, img_path, bg_color, alpha, return_type='np', camera_type=None):
291
+ # not using cv2 as may load in uint16 format
292
+ # img = cv2.imread(img_path, cv2.IMREAD_UNCHANGED) # [0, 255]
293
+ # img = cv2.resize(img, self.img_wh, interpolation=cv2.INTER_CUBIC)
294
+ # pil always returns uint8
295
+ depth_bg_color = np.array([1., 1., 1.], dtype=np.float32) # white for depth
296
+ depth_map = Image.open(img_path)
297
+
298
+ if camera_type == 'ortho':
299
+ left = (depth_map.width - self.ortho_crop_size) // 2
300
+ right = (depth_map.width + self.ortho_crop_size) // 2
301
+ top = (depth_map.height - self.ortho_crop_size) // 2
302
+ bottom = (depth_map.height + self.ortho_crop_size) // 2
303
+ depth_map = depth_map.crop((left, top, right, bottom))
304
+ if camera_type == 'persp':
305
+ left = (depth_map.width - self.persp_crop_size) // 2
306
+ right = (depth_map.width + self.persp_crop_size) // 2
307
+ top = (depth_map.height - self.persp_crop_size) // 2
308
+ bottom = (depth_map.height + self.persp_crop_size) // 2
309
+ depth_map = depth_map.crop((left, top, right, bottom))
310
+
311
+ depth_map = depth_map.resize(self.img_wh)
312
+ depth_map = np.array(depth_map)
313
+
314
+ # scale the depth map:
315
+ depth_map = scale_depth_to_model(depth_map.astype(np.float32))
316
+ # depth_map = depth_map / 65535. # [0, 1]
317
+ # depth_map[depth_map > 0.4] = 0
318
+ # depth_map = depth_map / 0.4
319
+
320
+ assert depth_map.ndim == 2 # depth
321
+ img = np.stack([depth_map]*3, axis=-1)
322
+
323
+ if alpha.shape[-1] != 1:
324
+ alpha = alpha[:, :, None]
325
+
326
+
327
+ # print(np.max(img[:, :, 0]))
328
+ # print(np.min(img[...,:3]), np.max(img[...,:3]))
329
+ img = img[...,:3] * alpha + depth_bg_color * (1 - alpha)
330
+
331
+ if return_type == "np":
332
+ pass
333
+ elif return_type == "pt":
334
+ img = torch.from_numpy(img)
335
+ else:
336
+ raise NotImplementedError
337
+
338
+ return img
339
+
340
+ def transform_mask_as_input(self, mask, return_type='np'):
341
+
342
+ # mask = mask * 255
343
+ # print(np.max(mask))
344
+
345
+ # mask = mask.resize(self.img_wh)
346
+ mask = np.squeeze(mask, axis=-1)
347
+ assert mask.ndim == 2 #
348
+ mask = np.stack([mask]*3, axis=-1)
349
+ if return_type == "np":
350
+ pass
351
+ elif return_type == "pt":
352
+ mask = torch.from_numpy(mask)
353
+ else:
354
+ raise NotImplementedError
355
+ return mask
356
+
357
+
358
+
359
+ def load_normal(self, img_path, bg_color, alpha, RT_w2c=None, RT_w2c_cond=None, return_type='np', camera_type=None, center_crop_size=None):
360
+ # not using cv2 as may load in uint16 format
361
+ # img = cv2.imread(img_path, cv2.IMREAD_UNCHANGED) # [0, 255]
362
+ # img = cv2.resize(img, self.img_wh, interpolation=cv2.INTER_CUBIC)
363
+ # pil always returns uint8
364
+ # normal = Image.open(img_path)
365
+
366
+ img = Image.open(img_path)
367
+ if center_crop_size == None:
368
+ if camera_type == 'ortho':
369
+ left = (img.width - self.ortho_crop_size) // 2
370
+ right = (img.width + self.ortho_crop_size) // 2
371
+ top = (img.height - self.ortho_crop_size) // 2
372
+ bottom = (img.height + self.ortho_crop_size) // 2
373
+ img = img.crop((left, top, right, bottom))
374
+ if camera_type == 'persp':
375
+ left = (img.width - self.persp_crop_size) // 2
376
+ right = (img.width + self.persp_crop_size) // 2
377
+ top = (img.height - self.persp_crop_size) // 2
378
+ bottom = (img.height + self.persp_crop_size) // 2
379
+ img = img.crop((left, top, right, bottom))
380
+ else:
381
+ center_crop_size = min(center_crop_size, 512)
382
+ left = (img.width - center_crop_size) // 2
383
+ right = (img.width + center_crop_size) // 2
384
+ top = (img.height - center_crop_size) // 2
385
+ bottom = (img.height + center_crop_size) // 2
386
+ img = img.crop((left, top, right, bottom))
387
+
388
+ normal = np.array(img.resize(self.img_wh))
389
+
390
+ assert normal.shape[-1] == 3 or normal.shape[-1] == 4 # RGB or RGBA
391
+
392
+ if alpha is None and normal.shape[-1] == 4:
393
+ alpha = normal[:, :, 3:] / 255.
394
+ normal = normal[:, :, :3]
395
+
396
+ normal = trans_normal(img2normal(normal), RT_w2c, RT_w2c_cond)
397
+
398
+ img = (normal*0.5 + 0.5).astype(np.float32) # [0, 1]
399
+
400
+ if alpha.shape[-1] != 1:
401
+ alpha = alpha[:, :, None]
402
+
403
+ img = img[...,:3] * alpha + bg_color * (1 - alpha)
404
+
405
+ if return_type == "np":
406
+ pass
407
+ elif return_type == "pt":
408
+ img = torch.from_numpy(img)
409
+ else:
410
+ raise NotImplementedError
411
+
412
+ return img
413
+
414
+ def __len__(self):
415
+ return len(self.all_objects)
416
+
417
+ def __getitem_color__(self, index, debug_object=None):
418
+ if debug_object is not None:
419
+ object_name = debug_object #
420
+ set_idx = random.sample(range(0, self.groups_num), 1)[0] # without replacement
421
+ else:
422
+ object_name = self.all_objects[index % len(self.all_objects)]
423
+ set_idx = 0
424
+
425
+ if self.augment_data:
426
+ cond_view = random.sample(self.view_types, k=1)[0]
427
+ else:
428
+ cond_view = 'front'
429
+
430
+ assert self.pred_ortho or self.pred_persp
431
+ if self.pred_ortho and self.pred_persp:
432
+ if random.random() < 0.5:
433
+ load_dir = self.root_dir_ortho
434
+ load_cam_type = 'ortho'
435
+ else:
436
+ load_dir = self.root_dir_persp
437
+ load_cam_type = 'persp'
438
+ elif self.pred_ortho and not self.pred_persp:
439
+ load_dir = self.root_dir_ortho
440
+ load_cam_type = 'ortho'
441
+ elif self.pred_persp and not self.pred_ortho:
442
+ load_dir = self.root_dir_persp
443
+ load_cam_type = 'persp'
444
+
445
+ # ! if you would like predict depth; modify here
446
+
447
+ read_color, read_normal, read_depth = True, False, False
448
+
449
+
450
+ assert (read_color and (read_normal or read_depth)) is False
451
+
452
+ view_types = self.view_types
453
+
454
+ cond_w2c = self.fix_cam_poses[cond_view]
455
+
456
+ tgt_w2cs = [self.fix_cam_poses[view] for view in view_types]
457
+
458
+ elevations = []
459
+ azimuths = []
460
+
461
+ # get the bg color
462
+ bg_color = self.get_bg_color()
463
+
464
+ if self.read_mask:
465
+ cond_alpha = self.load_mask(os.path.join(load_dir, object_name[:self.subscene_tag], object_name,
466
+ "mask_%03d_%s.%s" % (set_idx, cond_view, self.suffix)),
467
+ return_type='np')
468
+ else:
469
+ cond_alpha = None
470
+ img_tensors_in = [
471
+ self.load_image(os.path.join(load_dir, object_name[:self.subscene_tag], object_name,
472
+ "rgb_%03d_%s.%s" % (set_idx, cond_view, self.suffix)),
473
+ bg_color, cond_alpha, return_type='pt', camera_type=load_cam_type).permute(2, 0, 1)
474
+ ] * self.num_views
475
+ img_tensors_out = []
476
+
477
+ for view, tgt_w2c in zip(view_types, tgt_w2cs):
478
+ img_path = os.path.join(load_dir, object_name[:self.subscene_tag], object_name,
479
+ "rgb_%03d_%s.%s" % (set_idx, view, self.suffix))
480
+ mask_path = os.path.join(load_dir, object_name[:self.subscene_tag], object_name,
481
+ "mask_%03d_%s.%s" % (set_idx, view, self.suffix))
482
+ normal_path = os.path.join(load_dir, object_name[:self.subscene_tag], object_name,
483
+ "normals_%03d_%s.%s" % (set_idx, view, self.suffix))
484
+ depth_path = os.path.join(load_dir, object_name[:self.subscene_tag], object_name,
485
+ "depth_%03d_%s.%s" % (set_idx, view, self.suffix))
486
+ if self.read_mask:
487
+ alpha = self.load_mask(mask_path, return_type='np')
488
+ else:
489
+ alpha = None
490
+
491
+ if read_color:
492
+ img_tensor = self.load_image(img_path, bg_color, alpha, return_type="pt", camera_type=load_cam_type)
493
+ img_tensor = img_tensor.permute(2, 0, 1)
494
+ img_tensors_out.append(img_tensor)
495
+
496
+ if read_normal:
497
+ normal_tensor = self.load_normal(normal_path, bg_color, alpha, RT_w2c=tgt_w2c, RT_w2c_cond=cond_w2c,
498
+ return_type="pt", camera_type=load_cam_type).permute(2, 0, 1)
499
+ img_tensors_out.append(normal_tensor)
500
+ if read_depth:
501
+ depth_tensor = self.load_depth(depth_path, bg_color, alpha, return_type="pt", camera_type=load_cam_type).permute(2, 0, 1)
502
+ img_tensors_out.append(depth_tensor)
503
+
504
+ # evelations, azimuths
505
+ elevation, azimuth = self.get_T(tgt_w2c, cond_w2c)
506
+ elevations.append(elevation)
507
+ azimuths.append(azimuth)
508
+
509
+ img_tensors_in = torch.stack(img_tensors_in, dim=0).float() # (Nv, 3, H, W)
510
+ img_tensors_out = torch.stack(img_tensors_out, dim=0).float() # (Nv, 3, H, W)
511
+
512
+ elevations = torch.as_tensor(elevations).float().squeeze(1)
513
+ azimuths = torch.as_tensor(azimuths).float().squeeze(1)
514
+ elevations_cond = torch.as_tensor([0] * self.num_views).float() # fixed only use 4 views to train
515
+
516
+ if load_cam_type == 'ortho':
517
+ cam_type_emb = torch.tensor([0, 1]).expand(self.num_views, -1)
518
+ else:
519
+ cam_type_emb = torch.tensor([1, 0]).expand(self.num_views, -1)
520
+ camera_embeddings = torch.stack([elevations_cond, elevations, azimuths], dim=-1)
521
+ # if self.pred_ortho and self.pred_persp:
522
+ if self.load_cam_type:
523
+ camera_embeddings = torch.cat((camera_embeddings, cam_type_emb), dim=-1) # (Nv, 5)
524
+
525
+ normal_class = torch.tensor([1, 0]).float()
526
+ normal_task_embeddings = torch.stack([normal_class] * self.num_views, dim=0) # (Nv, 2)
527
+ color_class = torch.tensor([0, 1]).float()
528
+ color_task_embeddings = torch.stack([color_class] * self.num_views, dim=0) # (Nv, 2)
529
+ if read_normal or read_depth:
530
+ task_embeddings = normal_task_embeddings
531
+ if read_color:
532
+ task_embeddings = color_task_embeddings
533
+ # print(elevations)
534
+ # print(azimuths)
535
+ return {
536
+ 'elevations_cond': elevations_cond,
537
+ 'elevations_cond_deg': torch.rad2deg(elevations_cond),
538
+ 'elevations': elevations,
539
+ 'azimuths': azimuths,
540
+ 'elevations_deg': torch.rad2deg(elevations),
541
+ 'azimuths_deg': torch.rad2deg(azimuths),
542
+ 'imgs_in': img_tensors_in,
543
+ 'imgs_out': img_tensors_out,
544
+ 'camera_embeddings': camera_embeddings,
545
+ 'task_embeddings': task_embeddings
546
+ }
547
+
548
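# Illustrative sketch (not part of the diff): how the per-view camera embedding above is laid
# out. The values below are placeholder assumptions; only the shapes and the ortho/persp
# one-hot follow the dataset code.
import torch
num_views = 6
elevations_cond = torch.zeros(num_views)                     # condition elevation, fixed to 0
elevations = torch.zeros(num_views)                          # relative elevation per target view
azimuths = torch.zeros(num_views)                            # relative azimuth per target view
cam_type_emb = torch.tensor([0., 1.]).expand(num_views, -1)  # ortho -> [0, 1], persp -> [1, 0]
camera_embeddings = torch.stack([elevations_cond, elevations, azimuths], dim=-1)  # (Nv, 3)
camera_embeddings = torch.cat((camera_embeddings, cam_type_emb), dim=-1)          # (Nv, 5)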
+ def __getitem_normal_depth__(self, index, debug_object=None):
549
+ if debug_object is not None:
550
+ object_name = debug_object #
551
+ set_idx = random.sample(range(0, self.groups_num), 1)[0] # without replacement
552
+ else:
553
+ object_name = self.all_objects[index%len(self.all_objects)]
554
+ set_idx = 0
555
+
556
+ if self.augment_data:
557
+ cond_view = random.sample(self.view_types, k=1)[0]
558
+ else:
559
+ cond_view = 'front'
560
+
561
+ assert self.pred_ortho or self.pred_persp
562
+ if self.pred_ortho and self.pred_persp:
563
+ if random.random() < 0.5:
564
+ load_dir = self.root_dir_ortho
565
+ load_cam_type = 'ortho'
566
+ else:
567
+ load_dir = self.root_dir_persp
568
+ load_cam_type = 'persp'
569
+ elif self.pred_ortho and not self.pred_persp:
570
+ load_dir = self.root_dir_ortho
571
+ load_cam_type = 'ortho'
572
+ elif self.pred_persp and not self.pred_ortho:
573
+ load_dir = self.root_dir_persp
574
+ load_cam_type = 'persp'
575
+
576
+ view_types = self.view_types
577
+
578
+ cond_w2c = self.fix_cam_poses[cond_view]
579
+
580
+ tgt_w2cs = [self.fix_cam_poses[view] for view in view_types]
581
+
582
+ elevations = []
583
+ azimuths = []
584
+
585
+ # get the bg color
586
+ bg_color = self.get_bg_color()
587
+
588
+ if self.read_mask:
589
+ cond_alpha = self.load_mask(os.path.join(load_dir, object_name[:self.subscene_tag], object_name, "mask_%03d_%s.%s" % (set_idx, cond_view, self.suffix)), return_type='np')
590
+ else:
591
+ cond_alpha = None
592
+ # img_tensors_in = [
593
+ # self.load_image(os.path.join(self.root_dir, object_name[:self.subscene_tag], object_name, "rgb_%03d_%s.%s" % (set_idx, cond_view, self.suffix)), bg_color, cond_alpha, return_type='pt').permute(2, 0, 1)
594
+ # ] * self.num_views
595
+ img_tensors_out = []
596
+ normal_tensors_out = []
597
+ depth_tensors_out = []
598
+ for view, tgt_w2c in zip(view_types, tgt_w2cs):
599
+ img_path = os.path.join(load_dir, object_name[:self.subscene_tag], object_name, "rgb_%03d_%s.%s" % (set_idx, view, self.suffix))
600
+ mask_path = os.path.join(load_dir, object_name[:self.subscene_tag], object_name, "mask_%03d_%s.%s" % (set_idx, view, self.suffix))
601
+ depth_path = os.path.join(load_dir, object_name[:self.subscene_tag], object_name, "depth_%03d_%s.%s" % (set_idx, view, self.suffix))
602
+
603
+ if self.read_mask:
604
+ alpha = self.load_mask(mask_path, return_type='np')
605
+ else:
606
+ alpha = None
607
+
608
+ if self.read_color:
609
+ img_tensor = self.load_image(img_path, bg_color, alpha, return_type="pt", camera_type=load_cam_type)
610
+ img_tensor = img_tensor.permute(2, 0, 1)
611
+ img_tensors_out.append(img_tensor)
612
+
613
+ if self.read_normal:
614
+ normal_path = os.path.join(load_dir, object_name[:self.subscene_tag], object_name, "normals_%03d_%s.%s" % (set_idx, view, self.suffix))
615
+ normal_tensor = self.load_normal(normal_path, bg_color, alpha, RT_w2c=tgt_w2c, RT_w2c_cond=cond_w2c, return_type="pt", camera_type=load_cam_type).permute(2, 0, 1)
616
+ normal_tensors_out.append(normal_tensor)
617
+
618
+ if self.read_depth:
619
+ if alpha is None:
620
+ alpha = self.load_mask_from_rgba(img_path, camera_type=load_cam_type)
621
+ depth_tensor = self.load_depth(depth_path, bg_color, alpha, return_type="pt", camera_type=load_cam_type).permute(2, 0, 1)
622
+ depth_tensors_out.append(depth_tensor)
623
+
624
+
625
+ # elevations, azimuths
626
+ elevation, azimuth = self.get_T(tgt_w2c, cond_w2c)
627
+ elevations.append(elevation)
628
+ azimuths.append(azimuth)
629
+
630
+ img_tensors_in = img_tensors_out
631
+ img_tensors_in = torch.stack(img_tensors_in, dim=0).float() # (Nv, 3, H, W)
632
+ if self.read_color:
633
+ img_tensors_out = torch.stack(img_tensors_out, dim=0).float() # (Nv, 3, H, W)
634
+ if self.read_normal:
635
+ normal_tensors_out = torch.stack(normal_tensors_out, dim=0).float() # (Nv, 3, H, W)
636
+ if self.read_depth:
637
+ depth_tensors_out = torch.stack(depth_tensors_out, dim=0).float() # (Nv, 3, H, W)
638
+
639
+ elevations = torch.as_tensor(elevations).float().squeeze(1)
640
+ azimuths = torch.as_tensor(azimuths).float().squeeze(1)
641
+ elevations_cond = torch.as_tensor([0] * self.num_views).float() # the condition view's elevation is fixed to 0
642
+
643
+ if load_cam_type == 'ortho':
644
+ cam_type_emb = torch.tensor([0, 1]).expand(self.num_views, -1)
645
+ else:
646
+ cam_type_emb = torch.tensor([1, 0]).expand(self.num_views, -1)
647
+ camera_embeddings = torch.stack([elevations_cond, elevations, azimuths], dim=-1)
648
+ # if self.pred_ortho and self.pred_persp:
649
+ if self.load_cam_type:
650
+ camera_embeddings = torch.cat((camera_embeddings, cam_type_emb), dim=-1) # (Nv, 5)
651
+
652
+ normal_class = torch.tensor([1, 0]).float()
653
+ normal_task_embeddings = torch.stack([normal_class]*self.num_views, dim=0) # (Nv, 2)
654
+ color_class = torch.tensor([0, 1]).float()
655
+ depth_task_embeddings = torch.stack([color_class]*self.num_views, dim=0) # (Nv, 2)
656
+
657
+ return {
658
+ 'elevations_cond': elevations_cond,
659
+ 'elevations_cond_deg': torch.rad2deg(elevations_cond),
660
+ 'elevations': elevations,
661
+ 'azimuths': azimuths,
662
+ 'elevations_deg': torch.rad2deg(elevations),
663
+ 'azimuths_deg': torch.rad2deg(azimuths),
664
+ 'imgs_in': img_tensors_in,
665
+ 'imgs_out': img_tensors_out,
666
+ 'normals_out': normal_tensors_out,
667
+ 'depth_out': depth_tensors_out,
668
+ 'camera_embeddings': camera_embeddings,
669
+ 'normal_task_embeddings': normal_task_embeddings,
670
+ 'depth_task_embeddings': depth_task_embeddings
671
+ }
672
+
673
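# Illustrative sketch (not part of the diff): the 2-D "task"/domain-switcher codes used by
# these getters, read off the code above. The helper name is a hypothetical convenience.
import torch

def make_task_embeddings(task: str, num_views: int) -> torch.Tensor:
    codes = {
        'normal': [1., 0.],
        'color':  [0., 1.],
        'depth':  [1., 1.],   # the mixed mask variant reuses [1, 1] for masks
        'empty':  [0., 0.],   # stage-1 training: input and output share a domain
    }
    return torch.tensor(codes[task]).repeat(num_views, 1)   # (Nv, 2)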
+ def __getitem_mixed_rgb_noraml_mask__(self, index, debug_object=None):
674
+ if debug_object is not None:
675
+ object_name = debug_object #
676
+ set_idx = random.sample(range(0, self.groups_num), 1)[0] # without replacement
677
+ else:
678
+ object_name = self.all_objects[index%len(self.all_objects)]
679
+ set_idx = 0
680
+
681
+ if self.augment_data:
682
+ cond_view = random.sample(self.view_types, k=1)[0]
683
+ else:
684
+ cond_view = 'front'
685
+
686
+ assert self.pred_ortho or self.pred_persp
687
+ if self.pred_ortho and self.pred_persp:
688
+ if random.random() < 0.5:
689
+ load_dir = self.root_dir_ortho
690
+ load_cam_type = 'ortho'
691
+ else:
692
+ load_dir = self.root_dir_persp
693
+ load_cam_type = 'persp'
694
+ elif self.pred_ortho and not self.pred_persp:
695
+ load_dir = self.root_dir_ortho
696
+ load_cam_type = 'ortho'
697
+ elif self.pred_persp and not self.pred_ortho:
698
+ load_dir = self.root_dir_persp
699
+ load_cam_type = 'persp'
700
+
701
+ view_types = self.view_types
702
+
703
+ cond_w2c = self.fix_cam_poses[cond_view]
704
+
705
+ tgt_w2cs = [self.fix_cam_poses[view] for view in view_types]
706
+
707
+ elevations = []
708
+ azimuths = []
709
+
710
+ # get the bg color
711
+ bg_color = self.get_bg_color()
712
+
713
+ if self.read_mask:
714
+ cond_alpha = self.load_mask(os.path.join(load_dir, object_name[:self.subscene_tag], object_name, "mask_%03d_%s.%s" % (set_idx, cond_view, self.suffix)), return_type='np')
715
+ else:
716
+ cond_alpha = None
717
+
718
+ img_tensors_out = []
719
+ normal_tensors_out = []
720
+ depth_tensors_out = []
721
+
722
+ random_select = random.random()
723
+ read_color, read_normal, read_mask = [random_select < 1 / 3, 1 / 3 <= random_select <= 2 / 3,
724
+ random_select > 2 / 3]
725
+ # print(read_color, read_normal, read_depth)
726
+
727
+ assert sum([read_color, read_normal, read_mask]) == 1, "Only one variable should be True"
728
+
729
+ for view, tgt_w2c in zip(view_types, tgt_w2cs):
730
+ img_path = os.path.join(load_dir, object_name[:self.subscene_tag], object_name, "rgb_%03d_%s.%s" % (set_idx, view, self.suffix))
731
+ mask_path = os.path.join(load_dir, object_name[:self.subscene_tag], object_name, "mask_%03d_%s.%s" % (set_idx, view, self.suffix))
732
+ depth_path = os.path.join(load_dir, object_name[:self.subscene_tag], object_name, "depth_%03d_%s.%s" % (set_idx, view, self.suffix))
733
+
734
+ if self.read_mask:
735
+ alpha = self.load_mask(mask_path, return_type='np')
736
+ else:
737
+ alpha = None
738
+
739
+ if read_color:
740
+ img_tensor = self.load_image(img_path, bg_color, alpha, return_type="pt", camera_type=load_cam_type, read_depth=False)
741
+ img_tensor = img_tensor.permute(2, 0, 1)
742
+ img_tensors_out.append(img_tensor)
743
+
744
+ if read_normal:
745
+ normal_path = os.path.join(load_dir, object_name[:self.subscene_tag], object_name, "normals_%03d_%s.%s" % (set_idx, view, self.suffix))
746
+ normal_tensor = self.load_normal(normal_path, bg_color, alpha, RT_w2c=tgt_w2c, RT_w2c_cond=cond_w2c, return_type="pt", camera_type=load_cam_type).permute(2, 0, 1)
747
+ img_tensors_out.append(normal_tensor)
748
+
749
+ if read_mask:
750
+ if alpha is None:
751
+ alpha = self.load_mask_from_rgba(img_path, camera_type=load_cam_type)
752
+ mask_tensor = self.transform_mask_as_input(alpha, return_type='pt').permute(2, 0, 1)
753
+ img_tensors_out.append(mask_tensor)
754
+
755
+ # elevations, azimuths
756
+ elevation, azimuth = self.get_T(tgt_w2c, cond_w2c)
757
+ elevations.append(elevation)
758
+ azimuths.append(azimuth)
759
+
760
+ if self.load_switcher: # rgb input, use domain switcher to control the output type
761
+ img_tensors_in = [
762
+ self.load_image(os.path.join(load_dir, object_name[:self.subscene_tag], object_name,
763
+ "normals_%03d_%s.%s" % (set_idx, cond_view, self.suffix)),
764
+ bg_color, cond_alpha, RT_w2c=cond_w2c, RT_w2c_cond=cond_w2c, return_type='pt', camera_type=load_cam_type).permute(
765
+ 2, 0, 1)
766
+ ] * self.num_views
767
+ color_class = torch.tensor([0, 1]).float()
768
+ color_task_embeddings = torch.stack([color_class] * self.num_views, dim=0) # (Nv, 2)
769
+
770
+ normal_class = torch.tensor([1, 0]).float()
771
+ normal_task_embeddings = torch.stack([normal_class] * self.num_views, dim=0) # (Nv, 2)
772
+
773
+ mask_class = torch.tensor([1, 1]).float()
774
+ mask_task_embeddings = torch.stack([mask_class] * self.num_views, dim=0)
775
+
776
+ if read_color:
777
+ task_embeddings = color_task_embeddings
778
+ # img_tensors_out = depth_tensors_out
779
+ elif read_normal:
780
+ task_embeddings = normal_task_embeddings
781
+ # img_tensors_out = normal_tensors_out
782
+ elif read_mask:
783
+ task_embeddings = mask_task_embeddings
784
+ # img_tensors_out = depth_tensors_out
785
+
786
+ else: # for stage 1 training, the input and the output are in the same domain
787
+ img_tensors_in = [img_tensors_out[0]] * self.num_views
788
+
789
+ empty_class = torch.tensor([0, 0]).float() # empty task
790
+ empty_task_embeddings = torch.stack([empty_class] * self.num_views, dim=0)
791
+ task_embeddings = empty_task_embeddings
792
+
793
+ img_tensors_in = torch.stack(img_tensors_in, dim=0).float() # (Nv, 3, H, W)
794
+
795
+ img_tensors_out = torch.stack(img_tensors_out, dim=0).float() # (Nv, 3, H, W)
796
+
797
+ elevations = torch.as_tensor(elevations).float().squeeze(1)
798
+ azimuths = torch.as_tensor(azimuths).float().squeeze(1)
799
+ elevations_cond = torch.as_tensor([0] * self.num_views).float() # the condition view's elevation is fixed to 0
800
+
801
+ if load_cam_type == 'ortho':
802
+ cam_type_emb = torch.tensor([0, 1]).expand(self.num_views, -1)
803
+ else:
804
+ cam_type_emb = torch.tensor([1, 0]).expand(self.num_views, -1)
805
+
806
+ camera_embeddings = torch.stack([elevations_cond, elevations, azimuths], dim=-1)
807
+
808
+ if self.load_cam_type:
809
+ camera_embeddings = torch.cat((camera_embeddings, cam_type_emb), dim=-1) # (Nv, 5)
810
+
811
+ return {
812
+ 'elevations_cond': elevations_cond,
813
+ 'elevations_cond_deg': torch.rad2deg(elevations_cond),
814
+ 'elevations': elevations,
815
+ 'azimuths': azimuths,
816
+ 'elevations_deg': torch.rad2deg(elevations),
817
+ 'azimuths_deg': torch.rad2deg(azimuths),
818
+ 'imgs_in': img_tensors_in,
819
+ 'imgs_out': img_tensors_out,
820
+ 'normals_out': normal_tensors_out,
821
+ 'depth_out': depth_tensors_out,
822
+ 'camera_embeddings': camera_embeddings,
823
+ 'task_embeddings': task_embeddings,
824
+ }
825
+
826
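# Illustrative sketch (not part of the diff): the mixed getters draw exactly one output domain
# per sample by splitting [0, 1) into thirds, as in the code above.
import random
r = random.random()
read_color, read_normal, read_depth = r < 1 / 3, 1 / 3 <= r <= 2 / 3, r > 2 / 3
assert sum([read_color, read_normal, read_depth]) == 1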
+ def __getitem_mixed__(self, index, debug_object=None):
827
+ if debug_object is not None:
828
+ object_name = debug_object #
829
+ set_idx = random.sample(range(0, self.groups_num), 1)[0] # without replacement
830
+ else:
831
+ object_name = self.all_objects[index%len(self.all_objects)]
832
+ set_idx = 0
833
+
834
+ if self.augment_data:
835
+ cond_view = random.sample(self.view_types, k=1)[0]
836
+ else:
837
+ cond_view = 'front'
838
+
839
+ assert self.pred_ortho or self.pred_persp
840
+ if self.pred_ortho and self.pred_persp:
841
+ if random.random() < 0.5:
842
+ load_dir = self.root_dir_ortho
843
+ load_cam_type = 'ortho'
844
+ else:
845
+ load_dir = self.root_dir_persp
846
+ load_cam_type = 'persp'
847
+ elif self.pred_ortho and not self.pred_persp:
848
+ load_dir = self.root_dir_ortho
849
+ load_cam_type = 'ortho'
850
+ elif self.pred_persp and not self.pred_ortho:
851
+ load_dir = self.root_dir_persp
852
+ load_cam_type = 'persp'
853
+
854
+ view_types = self.view_types
855
+
856
+ cond_w2c = self.fix_cam_poses[cond_view]
857
+
858
+ tgt_w2cs = [self.fix_cam_poses[view] for view in view_types]
859
+
860
+ elevations = []
861
+ azimuths = []
862
+
863
+ # get the bg color
864
+ bg_color = self.get_bg_color()
865
+
866
+ if self.read_mask:
867
+ cond_alpha = self.load_mask(os.path.join(load_dir, object_name[:self.subscene_tag], object_name, "mask_%03d_%s.%s" % (set_idx, cond_view, self.suffix)), return_type='np')
868
+ else:
869
+ cond_alpha = None
870
+ # img_tensors_in = [
871
+ # self.load_image(os.path.join(self.root_dir, object_name[:self.subscene_tag], object_name, "rgb_%03d_%s.%s" % (set_idx, cond_view, self.suffix)), bg_color, cond_alpha, return_type='pt').permute(2, 0, 1)
872
+ # ] * self.num_views
873
+ img_tensors_out = []
874
+ normal_tensors_out = []
875
+ depth_tensors_out = []
876
+
877
+ random_select = random.random()
878
+ read_color, read_normal, read_depth = [random_select < 1 / 3, 1 / 3 <= random_select <= 2 / 3,
879
+ random_select > 2 / 3]
880
+ # print(read_color, read_normal, read_depth)
881
+
882
+ assert sum([read_color, read_normal, read_depth]) == 1, "Only one variable should be True"
883
+
884
+ for view, tgt_w2c in zip(view_types, tgt_w2cs):
885
+ img_path = os.path.join(load_dir, object_name[:self.subscene_tag], object_name, "rgb_%03d_%s.%s" % (set_idx, view, self.suffix))
886
+ mask_path = os.path.join(load_dir, object_name[:self.subscene_tag], object_name, "mask_%03d_%s.%s" % (set_idx, view, self.suffix))
887
+ depth_path = os.path.join(load_dir, object_name[:self.subscene_tag], object_name, "depth_%03d_%s.%s" % (set_idx, view, self.suffix))
888
+
889
+ if self.read_mask:
890
+ alpha = self.load_mask(mask_path, return_type='np')
891
+ else:
892
+ alpha = None
893
+
894
+ if read_color:
895
+ img_tensor = self.load_image(img_path, bg_color, alpha, return_type="pt", camera_type=load_cam_type, read_depth=read_depth)
896
+ img_tensor = img_tensor.permute(2, 0, 1)
897
+ img_tensors_out.append(img_tensor)
898
+
899
+ if read_normal:
900
+ normal_path = os.path.join(load_dir, object_name[:self.subscene_tag], object_name, "normals_%03d_%s.%s" % (set_idx, view, self.suffix))
901
+ normal_tensor = self.load_normal(normal_path, bg_color, alpha, RT_w2c=tgt_w2c, RT_w2c_cond=cond_w2c, return_type="pt", camera_type=load_cam_type).permute(2, 0, 1)
902
+ img_tensors_out.append(normal_tensor)
903
+
904
+ if read_depth:
905
+ if alpha is None:
906
+ alpha = self.load_mask_from_rgba(img_path, camera_type=load_cam_type)
907
+ depth_tensor = self.load_depth(depth_path, bg_color, alpha, return_type="pt", camera_type=load_cam_type).permute(2, 0, 1)
908
+ img_tensors_out.append(depth_tensor)
909
+
910
+
911
+ # elevations, azimuths
912
+ elevation, azimuth = self.get_T(tgt_w2c, cond_w2c)
913
+ elevations.append(elevation)
914
+ azimuths.append(azimuth)
915
+
916
+ img_tensors_in = [
917
+ self.load_image(os.path.join(load_dir, object_name[:self.subscene_tag], object_name,
918
+ "rgb_%03d_%s.%s" % (set_idx, cond_view, self.suffix)),
919
+ bg_color, cond_alpha, return_type='pt', camera_type=load_cam_type, read_depth=read_depth).permute(
920
+ 2, 0, 1)
921
+ ] * self.num_views
922
+ img_tensors_in = torch.stack(img_tensors_in, dim=0).float() # (Nv, 3, H, W)
923
+ # if self.read_color:
924
+ # img_tensors_out = torch.stack(img_tensors_out, dim=0).float() # (Nv, 3, H, W)
925
+ # if self.read_normal:
926
+ # normal_tensors_out = torch.stack(normal_tensors_out, dim=0).float() # (Nv, 3, H, W)
927
+ # if self.read_depth:
928
+ # depth_tensors_out = torch.stack(depth_tensors_out, dim=0).float() # (Nv, 3, H, W)
929
+ img_tensors_out = torch.stack(img_tensors_out, dim=0).float() # (Nv, 3, H, W)
930
+ elevations = torch.as_tensor(elevations).float().squeeze(1)
931
+ azimuths = torch.as_tensor(azimuths).float().squeeze(1)
932
+ elevations_cond = torch.as_tensor([0] * self.num_views).float() # the condition view's elevation is fixed to 0
933
+
934
+ if load_cam_type == 'ortho':
935
+ cam_type_emb = torch.tensor([0, 1]).expand(self.num_views, -1)
936
+ else:
937
+ cam_type_emb = torch.tensor([1, 0]).expand(self.num_views, -1)
938
+ camera_embeddings = torch.stack([elevations_cond, elevations, azimuths], dim=-1)
939
+ # if self.pred_ortho and self.pred_persp:
940
+ if self.load_cam_type:
941
+ camera_embeddings = torch.cat((camera_embeddings, cam_type_emb), dim=-1) # (Nv, 5)
942
+
943
+ color_class = torch.tensor([0, 1]).float()
944
+ color_task_embeddings = torch.stack([color_class]*self.num_views, dim=0) # (Nv, 2)
945
+
946
+ normal_class = torch.tensor([1, 0]).float()
947
+ normal_task_embeddings = torch.stack([normal_class]*self.num_views, dim=0) # (Nv, 2)
948
+
949
+ depth_class = torch.tensor([1, 1]).float()
950
+ depth_task_embeddings = torch.stack([depth_class]*self.num_views, dim=0)
951
+
952
+ if read_color:
953
+ task_embeddings = color_task_embeddings
954
+ # img_tensors_out = depth_tensors_out
955
+ elif read_normal:
956
+ task_embeddings = normal_task_embeddings
957
+ # img_tensors_out = normal_tensors_out
958
+ elif read_depth:
959
+ task_embeddings = depth_task_embeddings
960
+ # img_tensors_out = depth_tensors_out
961
+
962
+ return {
963
+ 'elevations_cond': elevations_cond,
964
+ 'elevations_cond_deg': torch.rad2deg(elevations_cond),
965
+ 'elevations': elevations,
966
+ 'azimuths': azimuths,
967
+ 'elevations_deg': torch.rad2deg(elevations),
968
+ 'azimuths_deg': torch.rad2deg(azimuths),
969
+ 'imgs_in': img_tensors_in,
970
+ 'imgs_out': img_tensors_out,
971
+ 'normals_out': normal_tensors_out,
972
+ 'depth_out': depth_tensors_out,
973
+ 'camera_embeddings': camera_embeddings,
974
+ 'task_embeddings': task_embeddings,
975
+ }
976
+
977
+ def __getitem_image_normal_mixed__(self, index, debug_object=None):
978
+ if debug_object is not None:
979
+ object_name = debug_object #
980
+ set_idx = random.sample(range(0, self.groups_num), 1)[0] # without replacement
981
+ else:
982
+ object_name = self.all_objects[index%len(self.all_objects)]
983
+ set_idx = 0
984
+
985
+ if self.augment_data:
986
+ cond_view = random.sample(self.view_types, k=1)[0]
987
+ else:
988
+ cond_view = 'front'
989
+
990
+ assert self.pred_ortho or self.pred_persp
991
+ if self.pred_ortho and self.pred_persp:
992
+ if random.random() < 0.5:
993
+ load_dir = self.root_dir_ortho
994
+ load_cam_type = 'ortho'
995
+ else:
996
+ load_dir = self.root_dir_persp
997
+ load_cam_type = 'persp'
998
+ elif self.pred_ortho and not self.pred_persp:
999
+ load_dir = self.root_dir_ortho
1000
+ load_cam_type = 'ortho'
1001
+ elif self.pred_persp and not self.pred_ortho:
1002
+ load_dir = self.root_dir_persp
1003
+ load_cam_type = 'persp'
1004
+
1005
+ view_types = self.view_types
1006
+
1007
+ cond_w2c = self.fix_cam_poses[cond_view]
1008
+
1009
+ tgt_w2cs = [self.fix_cam_poses[view] for view in view_types]
1010
+
1011
+ elevations = []
1012
+ azimuths = []
1013
+
1014
+ # get the bg color
1015
+ bg_color = self.get_bg_color()
1016
+
1017
+ # get crop size for each mv instance:
1018
+ center_crop_size = 0
1019
+ for view in view_types:
1020
+ img_path = os.path.join(load_dir, object_name[:self.subscene_tag], object_name, "rgb_%03d_%s.%s" % (set_idx, view, self.suffix))
1021
+
1022
+ img = Image.open(img_path)
1023
+ img = img.resize([512,512])
1024
+ img = np.array(img).astype(np.float32) / 255. # [0, 1]
1025
+
1026
+ max_w_h = self.cal_single_view_crop(img)
1027
+ center_crop_size = max(center_crop_size, max_w_h)
1028
+
1029
+ center_crop_size = center_crop_size * 4. / 3.
1030
+ center_crop_size = center_crop_size + (random.random()-0.5) * 10.
1031
+
1032
+ if self.read_mask:
1033
+ cond_alpha = self.load_mask(os.path.join(load_dir, object_name[:self.subscene_tag], object_name, "mask_%03d_%s.%s" % (set_idx, cond_view, self.suffix)), return_type='np')
1034
+ else:
1035
+ cond_alpha = None
1036
+ # img_tensors_in = [
1037
+ # self.load_image(os.path.join(self.root_dir, object_name[:self.subscene_tag], object_name, "rgb_%03d_%s.%s" % (set_idx, cond_view, self.suffix)), bg_color, cond_alpha, return_type='pt').permute(2, 0, 1)
1038
+ # ] * self.num_views
1039
+ img_tensors_out = []
1040
+ normal_tensors_out = []
1041
+ depth_tensors_out = []
1042
+
1043
+ random_select = random.random()
1044
+ read_color, read_normal = [random_select < 1 / 2, 1 / 2 <= random_select <= 1]
1045
+ # print(read_color, read_normal, read_depth)
1046
+
1047
+ assert sum([read_color, read_normal]) == 1, "Only one variable should be True"
1048
+
1049
+ for view, tgt_w2c in zip(view_types, tgt_w2cs):
1050
+ img_path = os.path.join(load_dir, object_name[:self.subscene_tag], object_name, "rgb_%03d_%s.%s" % (set_idx, view, self.suffix))
1051
+ mask_path = os.path.join(load_dir, object_name[:self.subscene_tag], object_name, "mask_%03d_%s.%s" % (set_idx, view, self.suffix))
1052
+ depth_path = os.path.join(load_dir, object_name[:self.subscene_tag], object_name, "depth_%03d_%s.%s" % (set_idx, view, self.suffix))
1053
+
1054
+ if self.read_mask:
1055
+ alpha = self.load_mask(mask_path, return_type='np')
1056
+ else:
1057
+ alpha = None
1058
+
1059
+ if read_color:
1060
+ img_tensor = self.load_image(img_path, bg_color, alpha, return_type="pt", camera_type=load_cam_type, read_depth=False, center_crop_size=center_crop_size)
1061
+ img_tensor = img_tensor.permute(2, 0, 1)
1062
+ img_tensors_out.append(img_tensor)
1063
+
1064
+ if read_normal:
1065
+ normal_path = os.path.join(load_dir, object_name[:self.subscene_tag], object_name, "normals_%03d_%s.%s" % (set_idx, view, self.suffix))
1066
+ normal_tensor = self.load_normal(normal_path, bg_color, alpha, RT_w2c=tgt_w2c, RT_w2c_cond=cond_w2c, return_type="pt", camera_type=load_cam_type, center_crop_size=center_crop_size).permute(2, 0, 1)
1067
+ img_tensors_out.append(normal_tensor)
1068
+
1069
+ # if read_depth:
1070
+ # if alpha is None:
1071
+ # alpha = self.load_mask_from_rgba(img_path, camera_type=load_cam_type)
1072
+ # depth_tensor = self.load_depth(depth_path, bg_color, alpha, return_type="pt", camera_type=load_cam_type).permute(2, 0, 1)
1073
+ # img_tensors_out.append(depth_tensor)
1074
+
1075
+
1076
+ # elevations, azimuths
1077
+ elevation, azimuth = self.get_T(tgt_w2c, cond_w2c)
1078
+ elevations.append(elevation)
1079
+ azimuths.append(azimuth)
1080
+
1081
+ img_tensors_in = [
1082
+ self.load_image(os.path.join(load_dir, object_name[:self.subscene_tag], object_name,
1083
+ "rgb_%03d_%s.%s" % (set_idx, cond_view, self.suffix)),
1084
+ bg_color, cond_alpha, return_type='pt', camera_type=load_cam_type, read_depth=False, center_crop_size=center_crop_size).permute(
1085
+ 2, 0, 1)
1086
+ ] * self.num_views
1087
+ img_tensors_in = torch.stack(img_tensors_in, dim=0).float() # (Nv, 3, H, W)
1088
+ # if self.read_color:
1089
+ # img_tensors_out = torch.stack(img_tensors_out, dim=0).float() # (Nv, 3, H, W)
1090
+ # if self.read_normal:
1091
+ # normal_tensors_out = torch.stack(normal_tensors_out, dim=0).float() # (Nv, 3, H, W)
1092
+ # if self.read_depth:
1093
+ # depth_tensors_out = torch.stack(depth_tensors_out, dim=0).float() # (Nv, 3, H, W)
1094
+ img_tensors_out = torch.stack(img_tensors_out, dim=0).float() # (Nv, 3, H, W)
1095
+ elevations = torch.as_tensor(elevations).float().squeeze(1)
1096
+ azimuths = torch.as_tensor(azimuths).float().squeeze(1)
1097
+ elevations_cond = torch.as_tensor([0] * self.num_views).float() # the condition view's elevation is fixed to 0
1098
+
1099
+ if load_cam_type == 'ortho':
1100
+ cam_type_emb = torch.tensor([0, 1]).expand(self.num_views, -1)
1101
+ else:
1102
+ cam_type_emb = torch.tensor([1, 0]).expand(self.num_views, -1)
1103
+ camera_embeddings = torch.stack([elevations_cond, elevations, azimuths], dim=-1)
1104
+ # if self.pred_ortho and self.pred_persp:
1105
+ if self.load_cam_type:
1106
+ camera_embeddings = torch.cat((camera_embeddings, cam_type_emb), dim=-1) # (Nv, 5)
1107
+
1108
+ color_class = torch.tensor([0, 1]).float()
1109
+ color_task_embeddings = torch.stack([color_class]*self.num_views, dim=0) # (Nv, 2)
1110
+
1111
+ normal_class = torch.tensor([1, 0]).float()
1112
+ normal_task_embeddings = torch.stack([normal_class]*self.num_views, dim=0) # (Nv, 2)
1113
+
1114
+ # depth_class = torch.tensor([1, 1]).float()
1115
+ # depth_task_embeddings = torch.stack([depth_class]*self.num_views, dim=0)
1116
+
1117
+ if read_color:
1118
+ task_embeddings = color_task_embeddings
1119
+ # img_tensors_out = depth_tensors_out
1120
+ elif read_normal:
1121
+ task_embeddings = normal_task_embeddings
1122
+ # img_tensors_out = normal_tensors_out
1123
+ # elif read_depth:
1124
+ # task_embeddings = depth_task_embeddings
1125
+ # img_tensors_out = depth_tensors_out
1126
+
1127
+ return {
1128
+ 'elevations_cond': elevations_cond,
1129
+ 'elevations_cond_deg': torch.rad2deg(elevations_cond),
1130
+ 'elevations': elevations,
1131
+ 'azimuths': azimuths,
1132
+ 'elevations_deg': torch.rad2deg(elevations),
1133
+ 'azimuths_deg': torch.rad2deg(azimuths),
1134
+ 'imgs_in': img_tensors_in,
1135
+ 'imgs_out': img_tensors_out,
1136
+ 'normals_out': normal_tensors_out,
1137
+ 'depth_out': depth_tensors_out,
1138
+ 'camera_embeddings': camera_embeddings,
1139
+ 'task_embeddings': task_embeddings,
1140
+ }
1141
+
1142
+ def cal_single_view_crop(self, image):
1143
+ assert np.shape(image)[-1] == 4 # RGBA
1144
+
1145
+ # Extract the alpha channel (transparency) and the object (RGB channels)
1146
+ alpha_channel = image[:, :, 3]
1147
+
1148
+ # Find the bounding box coordinates of the object
1149
+ coords = cv2.findNonZero(alpha_channel)
1150
+ x, y, width, height = cv2.boundingRect(coords)
1151
+
1152
+ return max(width, height)
1153
+
1154
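# Illustrative sketch (not part of the diff): how the shared crop size above is derived. The
# largest alpha bounding-box side over all views is scaled by 4/3 and jittered by up to
# +/- 5 pixels so every view of one object is cropped consistently. The input is synthetic.
import random
import cv2
import numpy as np

alphas = [np.zeros((512, 512), dtype=np.uint8) for _ in range(6)]
for a in alphas:
    a[200:300, 220:260] = 255                           # a fake opaque region per view

center_crop_size = 0
for a in alphas:
    x, y, w, h = cv2.boundingRect(cv2.findNonZero(a))   # tight box around the object
    center_crop_size = max(center_crop_size, max(w, h))

center_crop_size = center_crop_size * 4. / 3.           # leave a margin around the object
center_crop_size += (random.random() - 0.5) * 10.       # small random jitter, as above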
+ def __getitem_joint_rgb_noraml__(self, index, debug_object=None):
1155
+ if debug_object is not None:
1156
+ object_name = debug_object #
1157
+ set_idx = random.sample(range(0, self.groups_num), 1)[0] # without replacement
1158
+ else:
1159
+ object_name = self.all_objects[index%len(self.all_objects)]
1160
+ set_idx = 0
1161
+
1162
+ if self.augment_data:
1163
+ cond_view = random.sample(self.view_types, k=1)[0]
1164
+ else:
1165
+ cond_view = 'front'
1166
+
1167
+ assert self.pred_ortho or self.pred_persp
1168
+ if self.pred_ortho and self.pred_persp:
1169
+ if random.random() < 0.5:
1170
+ load_dir = self.root_dir_ortho
1171
+ load_cam_type = 'ortho'
1172
+ else:
1173
+ load_dir = self.root_dir_persp
1174
+ load_cam_type = 'persp'
1175
+ elif self.pred_ortho and not self.pred_persp:
1176
+ load_dir = self.root_dir_ortho
1177
+ load_cam_type = 'ortho'
1178
+ elif self.pred_persp and not self.pred_ortho:
1179
+ load_dir = self.root_dir_persp
1180
+ load_cam_type = 'persp'
1181
+
1182
+ view_types = self.view_types
1183
+
1184
+ cond_w2c = self.fix_cam_poses[cond_view]
1185
+
1186
+ tgt_w2cs = [self.fix_cam_poses[view] for view in view_types]
1187
+
1188
+ elevations = []
1189
+ azimuths = []
1190
+
1191
+ # get the bg color
1192
+ bg_color = self.get_bg_color()
1193
+
1194
+ if self.read_mask:
1195
+ cond_alpha = self.load_mask(os.path.join(load_dir, object_name[:self.subscene_tag], object_name, "mask_%03d_%s.%s" % (set_idx, cond_view, self.suffix)), return_type='np')
1196
+ else:
1197
+ cond_alpha = None
1198
+
1199
+ img_tensors_out = []
1200
+ normal_tensors_out = []
1201
+
1202
+
1203
+ read_color, read_normal = True, True
1204
+ # print(read_color, read_normal, read_depth)
1205
+
1206
+ # get crop size for each mv instance:
1207
+ center_crop_size = 0
1208
+ for view in view_types:
1209
+ img_path = os.path.join(load_dir, object_name[:self.subscene_tag], object_name, "rgb_%03d_%s.%s" % (set_idx, view, self.suffix))
1210
+
1211
+ img = Image.open(img_path)
1212
+ img = img.resize([512,512])
1213
+ img = np.array(img).astype(np.float32) / 255. # [0, 1]
1214
+
1215
+ max_w_h = self.cal_single_view_crop(img)
1216
+ center_crop_size = max(center_crop_size, max_w_h)
1217
+
1218
+ center_crop_size = center_crop_size * 4. / 3.
1219
+ center_crop_size = center_crop_size + (random.random()-0.5) * 10.
1220
+
1221
+
1222
+
1223
+ for view, tgt_w2c in zip(view_types, tgt_w2cs):
1224
+ img_path = os.path.join(load_dir, object_name[:self.subscene_tag], object_name, "rgb_%03d_%s.%s" % (set_idx, view, self.suffix))
1225
+ mask_path = os.path.join(load_dir, object_name[:self.subscene_tag], object_name, "mask_%03d_%s.%s" % (set_idx, view, self.suffix))
1226
+ depth_path = os.path.join(load_dir, object_name[:self.subscene_tag], object_name, "depth_%03d_%s.%s" % (set_idx, view, self.suffix))
1227
+
1228
+ if self.read_mask:
1229
+ alpha = self.load_mask(mask_path, return_type='np')
1230
+ else:
1231
+ alpha = None
1232
+
1233
+ if read_color:
1234
+ img_tensor = self.load_image(img_path, bg_color, alpha, return_type="pt", camera_type=load_cam_type, read_depth=False, center_crop_size=center_crop_size)
1235
+ img_tensor = img_tensor.permute(2, 0, 1)
1236
+ img_tensors_out.append(img_tensor)
1237
+
1238
+ if read_normal:
1239
+ normal_path = os.path.join(load_dir, object_name[:self.subscene_tag], object_name, "normals_%03d_%s.%s" % (set_idx, view, self.suffix))
1240
+ normal_tensor = self.load_normal(normal_path, bg_color, alpha, RT_w2c=tgt_w2c, RT_w2c_cond=cond_w2c, return_type="pt", camera_type=load_cam_type, center_crop_size=center_crop_size).permute(2, 0, 1)
1241
+ normal_tensors_out.append(normal_tensor)
1242
+
1243
+ # elevations, azimuths
1244
+ elevation, azimuth = self.get_T(tgt_w2c, cond_w2c)
1245
+ elevations.append(elevation)
1246
+ azimuths.append(azimuth)
1247
+
1248
+ if self.load_switcher: # rgb input, use domain switcher to control the output type
1249
+ img_tensors_in = [
1250
+ self.load_image(os.path.join(load_dir, object_name[:self.subscene_tag], object_name,
1251
+ "rgb_%03d_%s.%s" % (set_idx, cond_view, self.suffix)),
1252
+ bg_color, cond_alpha, return_type='pt', camera_type=load_cam_type,
1253
+ read_depth=False, center_crop_size=center_crop_size).permute(
1254
+ 2, 0, 1)
1255
+ ] * self.num_views
1256
+
1257
+ color_class = torch.tensor([0, 1]).float()
1258
+ color_task_embeddings = torch.stack([color_class] * self.num_views, dim=0) # (Nv, 2)
1259
+
1260
+ normal_class = torch.tensor([1, 0]).float()
1261
+ normal_task_embeddings = torch.stack([normal_class] * self.num_views, dim=0) # (Nv, 2)
1262
+
1263
+
1264
+ if read_color:
1265
+ task_embeddings = color_task_embeddings
1266
+ # img_tensors_out = depth_tensors_out
1267
+ elif read_normal:
1268
+ task_embeddings = normal_task_embeddings
1269
+ # img_tensors_out = normal_tensors_out
1270
+
1271
+ else: # for stage 1 training, the input and the output are in the same domain
1272
+ img_tensors_in = [img_tensors_out[0]] * self.num_views
1273
+
1274
+ empty_class = torch.tensor([0, 0]).float() # empty task
1275
+ empty_task_embeddings = torch.stack([empty_class] * self.num_views, dim=0)
1276
+ task_embeddings = empty_task_embeddings
1277
+
1278
+ img_tensors_in = torch.stack(img_tensors_in, dim=0).float() # (Nv, 3, H, W)
1279
+
1280
+ img_tensors_out = torch.stack(img_tensors_out, dim=0).float() # (Nv, 3, H, W)
1281
+ normal_tensors_out = torch.stack(normal_tensors_out, dim=0).float() # (Nv, 3, H, W)
1282
+ elevations = torch.as_tensor(elevations).float().squeeze(1)
1283
+ azimuths = torch.as_tensor(azimuths).float().squeeze(1)
1284
+ elevations_cond = torch.as_tensor([0] * self.num_views).float() # the condition view's elevation is fixed to 0
1285
+
1286
+ if load_cam_type == 'ortho':
1287
+ cam_type_emb = torch.tensor([0, 1]).expand(self.num_views, -1)
1288
+ else:
1289
+ cam_type_emb = torch.tensor([1, 0]).expand(self.num_views, -1)
1290
+
1291
+ camera_embeddings = torch.stack([elevations_cond, elevations, azimuths], dim=-1)
1292
+
1293
+ if self.load_cam_type:
1294
+ camera_embeddings = torch.cat((camera_embeddings, cam_type_emb), dim=-1) # (Nv, 5)
1295
+
1296
+ return {
1297
+ 'elevations_cond': elevations_cond,
1298
+ 'elevations_cond_deg': torch.rad2deg(elevations_cond),
1299
+ 'elevations': elevations,
1300
+ 'azimuths': azimuths,
1301
+ 'elevations_deg': torch.rad2deg(elevations),
1302
+ 'azimuths_deg': torch.rad2deg(azimuths),
1303
+ 'imgs_in': img_tensors_in,
1304
+ 'imgs_out': img_tensors_out,
1305
+ 'normals_out': normal_tensors_out,
1306
+ 'camera_embeddings': camera_embeddings,
1307
+ 'color_task_embeddings': color_task_embeddings,
1308
+ 'normal_task_embeddings': normal_task_embeddings
1309
+ }
1310
+
1311
+ def __getitem__(self, index):
1312
+ try:
1313
+ if self.pred_type == 'color':
1314
+ data = self.backup_data = self.__getitem_color__(index)
1315
+ elif self.pred_type == 'normal_depth':
1316
+ data = self.backup_data = self.__getitem_normal_depth__(index)
1317
+ elif self.pred_type == 'mixed_rgb_normal_depth':
1318
+ data = self.backup_data = self.__getitem_mixed__(index)
1319
+ elif self.pred_type == 'mixed_color_normal':
1320
+ data = self.backup_data = self.__getitem_image_normal_mixed__(index)
1321
+ elif self.pred_type == 'mixed_rgb_noraml_mask':
1322
+ data = self.backup_data = self.__getitem_mixed_rgb_noraml_mask__(index)
1323
+ elif self.pred_type == 'joint_color_normal':
1324
+ data = self.backup_data = self.__getitem_joint_rgb_noraml__(index)
1325
+ return data
1326
+
1327
+ except:
1328
+ print("load error ", self.all_objects[index%len(self.all_objects)])
1329
+ return self.backup_data
1330
+
1331
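# Illustrative note (not part of the diff): __getitem__ above dispatches on self.pred_type and
# keeps the last successfully loaded sample in self.backup_data, so a corrupted object on disk
# falls back to the previous item instead of aborting a long training run. The mapping is:
PRED_TYPE_TO_GETTER = {
    'color': '__getitem_color__',
    'normal_depth': '__getitem_normal_depth__',
    'mixed_rgb_normal_depth': '__getitem_mixed__',
    'mixed_color_normal': '__getitem_image_normal_mixed__',
    'mixed_rgb_noraml_mask': '__getitem_mixed_rgb_noraml_mask__',   # spelling kept as in the source
    'joint_color_normal': '__getitem_joint_rgb_noraml__',
}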
+ class ConcatDataset(torch.utils.data.Dataset):
1332
+ def __init__(self, datasets, weights):
1333
+ self.datasets = datasets
1334
+ self.weights = weights
1335
+ self.num_datasets = len(datasets)
1336
+
1337
+ def __getitem__(self, i):
1338
+
1339
+ chosen = random.choices(self.datasets, self.weights, k=1)[0]
1340
+ return chosen[i]
1341
+
1342
+ def __len__(self):
1343
+ return max(len(d) for d in self.datasets)
1344
+
1345
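# Illustrative usage sketch (not part of the diff): ConcatDataset draws each sample from one of
# several datasets with the given probabilities. The member datasets below are placeholders.
import torch
from torch.utils.data import TensorDataset

ds_a = TensorDataset(torch.zeros(100, 3))
ds_b = TensorDataset(torch.ones(10, 3))
mixed = ConcatDataset([ds_a, ds_b], weights=[0.8, 0.2])
sample = mixed[5]       # index 5 of whichever member was drawn
print(len(mixed))       # 100, the length of the largest member
# note: an index past a smaller member's length can raise IndexError if that member is drawn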
+ if __name__ == "__main__":
1346
+ train_dataset = ObjaverseDataset(
1347
+ root_dir="/ghome/l5/xxlong/.objaverse/hf-objaverse-v1/renderings",
1348
+ size=(128, 128),
1349
+ ext="hdf5",
1350
+ default_trans=torch.zeros(3),
1351
+ return_paths=False,
1352
+ total_view=8,
1353
+ validation=False,
1354
+ object_list=None,
1355
+ views_mode='fourviews'
1356
+ )
1357
+ data0 = train_dataset[0]
1358
+ data1 = train_dataset[50]
1359
+ # print(data)
mv_diffusion_30/data/single_image_dataset.py ADDED
@@ -0,0 +1,337 @@
1
+ from typing import Dict
2
+ import numpy as np
3
+ from omegaconf import DictConfig, ListConfig
4
+ import torch
5
+ from torch.utils.data import Dataset
6
+ from pathlib import Path
7
+ import json
8
+ from PIL import Image
9
+ from torchvision import transforms
10
+ from einops import rearrange
11
+ from typing import Literal, Tuple, Optional, Any
12
+ import cv2
13
+ import random
14
+
15
+ import json
16
+ import os, sys
17
+ import math
18
+
19
+ from glob import glob
20
+
21
+ import PIL.Image
22
+ from .normal_utils import trans_normal, normal2img, img2normal
23
+ import pdb
24
+ from rembg import remove
25
+
26
+
27
+ import cv2
28
+ import numpy as np
29
+
30
+ def add_margin(pil_img, color=0, size=256):
31
+ width, height = pil_img.size
32
+ result = Image.new(pil_img.mode, (size, size), color)
33
+ result.paste(pil_img, ((size - width) // 2, (size - height) // 2))
34
+ return result
35
+
36
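# Illustrative sketch (not part of the diff): add_margin pastes an image, centred, onto a
# square canvas of the requested size, filling the border with `color`.
from PIL import Image
thumb = Image.new('RGBA', (120, 80), (255, 0, 0, 255))   # placeholder image
padded = add_margin(thumb, color=0, size=256)
print(padded.size)                                       # (256, 256)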
+ def scale_and_place_object(image, scale_factor):
37
+ assert np.shape(image)[-1]==4 # RGBA
38
+
39
+ # Extract the alpha channel (transparency) and the object (RGB channels)
40
+ alpha_channel = image[:, :, 3]
41
+
42
+ # Find the bounding box coordinates of the object
43
+ coords = cv2.findNonZero(alpha_channel)
44
+ x, y, width, height = cv2.boundingRect(coords)
45
+
46
+ # Calculate the scale factor for resizing
47
+ original_height, original_width = image.shape[:2]
48
+
49
+ if width > height:
50
+ size = width
51
+ original_size = original_width
52
+ else:
53
+ size = height
54
+ original_size = original_height
55
+
56
+ scale_factor = min(scale_factor, size / (original_size+0.0))
57
+
58
+ new_size = scale_factor * original_size
59
+ scale_factor = new_size / size
60
+
61
+ # Calculate the new size based on the scale factor
62
+ new_width = int(width * scale_factor)
63
+ new_height = int(height * scale_factor)
64
+
65
+ center_x = original_width // 2
66
+ center_y = original_height // 2
67
+
68
+ paste_x = center_x - (new_width // 2)
69
+ paste_y = center_y - (new_height // 2)
70
+
71
+ # Resize the object (RGB channels) to the new size
72
+ rescaled_object = cv2.resize(image[y:y+height, x:x+width], (new_width, new_height))
73
+
74
+ # Create a new RGBA image with the resized image
75
+ new_image = np.zeros((original_height, original_width, 4), dtype=np.uint8)
76
+
77
+ new_image[paste_y:paste_y + new_height, paste_x:paste_x + new_width] = rescaled_object
78
+
79
+ return new_image
80
+
81
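# Illustrative sketch (not part of the diff): scale_and_place_object rescales the opaque region
# of an RGBA image and re-centres it on a canvas of the original size. The array below is a
# synthetic placeholder. (load_image below keeps its call to this helper commented out and
# uses bounding-box cropping instead.)
import numpy as np
rgba = np.zeros((256, 256, 4), dtype=np.uint8)
rgba[96:160, 112:144, :] = 255                           # a small opaque rectangle
out = scale_and_place_object(rgba, scale_factor=0.9)
print(out.shape)                                         # (256, 256, 4)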
+ class SingleImageDataset(Dataset):
82
+ def __init__(self,
83
+ root_dir: str,
84
+ num_views: int,
85
+ img_wh: Tuple[int, int],
86
+ bg_color: str,
87
+ crop_size: int = 224,
88
+ single_image: Optional[PIL.Image.Image] = None,
89
+ num_validation_samples: Optional[int] = None,
90
+ filepaths: Optional[list] = None,
91
+ cam_types: Optional[list] = None,
92
+ cond_type: Optional[str] = None,
93
+ load_cam_type: Optional[bool] = True
94
+ ) -> None:
95
+ """Create a dataset from a folder of images.
96
+ If you pass in a root directory, it will be searched for images
97
+ ending in .png or .jpg.
98
+ """
99
+ self.root_dir = root_dir
100
+ self.num_views = num_views
101
+ self.img_wh = img_wh
102
+ self.crop_size = crop_size
103
+ self.bg_color = bg_color
104
+ self.cond_type = cond_type
105
+ self.load_cam_type = load_cam_type
106
+ self.cam_types = cam_types
107
+
108
+ if self.num_views == 4:
109
+ self.view_types = ['front', 'right', 'back', 'left']
110
+ elif self.num_views == 5:
111
+ self.view_types = ['front', 'front_right', 'right', 'back', 'left']
112
+ elif self.num_views == 6:
113
+ self.view_types = ['front', 'front_right', 'right', 'back', 'left', 'front_left']
114
+
115
+ self.fix_cam_pose_dir = "./mvdiffusion/data/fixed_poses/nine_views"
116
+
117
+ self.fix_cam_poses = self.load_fixed_poses() # world2cam matrix
118
+
119
+ if single_image is None:
120
+ if filepaths is None:
121
+ # Get a list of all files in the directory
122
+ file_list = os.listdir(self.root_dir)
123
+ self.cam_types = ['ortho'] * len(file_list) + ['persp']* len(file_list)
124
+ file_list = file_list * 2
125
+ else:
126
+ file_list = filepaths
127
+
128
+ # Filter the files that end with .png or .jpg
129
+ self.file_list = [file for file in file_list if file.endswith(('.png', '.jpg'))]
130
+ else:
131
+ self.file_list = None
132
+
133
+ # load all images
134
+ self.all_images = []
135
+ self.all_alphas = []
136
+ bg_color = self.get_bg_color()
137
+
138
+ if single_image is not None:
139
+ image, alpha = self.load_image(None, bg_color, return_type='pt', Imagefile=single_image)
140
+ self.all_images.append(image)
141
+ self.all_alphas.append(alpha)
142
+ else:
143
+ for file in self.file_list:
144
+ print(os.path.join(self.root_dir, file))
145
+ image, alpha = self.load_image(os.path.join(self.root_dir, file), bg_color, return_type='pt')
146
+ self.all_images.append(image)
147
+ self.all_alphas.append(alpha)
148
+ #
149
+ # assert len(self.file_list) == len(self.cam_types)
150
+ self.all_images = self.all_images[:num_validation_samples]
151
+ self.all_alphas = self.all_alphas[:num_validation_samples]
152
+
153
+ def __len__(self):
154
+ return len(self.all_images)
155
+
156
+ def load_fixed_poses(self):
157
+ poses = {}
158
+ for face in self.view_types:
159
+ RT = np.loadtxt(os.path.join(self.fix_cam_pose_dir,'%03d_%s_RT.txt'%(0, face)))
160
+ poses[face] = RT
161
+
162
+ return poses
163
+
164
+ def cartesian_to_spherical(self, xyz):
165
+ ptsnew = np.hstack((xyz, np.zeros(xyz.shape)))
166
+ xy = xyz[:,0]**2 + xyz[:,1]**2
167
+ z = np.sqrt(xy + xyz[:,2]**2)
168
+ theta = np.arctan2(np.sqrt(xy), xyz[:,2]) # for elevation angle defined from Z-axis down
169
+ #ptsnew[:,4] = np.arctan2(xyz[:,2], np.sqrt(xy)) # for elevation angle defined from XY-plane up
170
+ azimuth = np.arctan2(xyz[:,1], xyz[:,0])
171
+ return np.array([theta, azimuth, z])
172
+
173
+ def get_T(self, target_RT, cond_RT):
174
+ R, T = target_RT[:3, :3], target_RT[:, -1]
175
+ T_target = -R.T @ T # change to cam2world
176
+
177
+ R, T = cond_RT[:3, :3], cond_RT[:, -1]
178
+ T_cond = -R.T @ T
179
+
180
+ theta_cond, azimuth_cond, z_cond = self.cartesian_to_spherical(T_cond[None, :])
181
+ theta_target, azimuth_target, z_target = self.cartesian_to_spherical(T_target[None, :])
182
+
183
+ d_theta = theta_target - theta_cond
184
+ d_azimuth = (azimuth_target - azimuth_cond) % (2 * math.pi)
185
+ d_z = z_target - z_cond
186
+
187
+ # d_T = torch.tensor([d_theta.item(), math.sin(d_azimuth.item()), math.cos(d_azimuth.item()), d_z.item()])
188
+ return d_theta, d_azimuth
189
+
190
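# Illustrative sketch (not part of the diff): get_T converts both world-to-camera matrices to
# camera centres (c = -R^T t), maps them to spherical coordinates, and returns the relative
# elevation and azimuth (azimuth wrapped into [0, 2*pi)). A standalone equivalent:
import numpy as np

def relative_angles(target_RT, cond_RT):
    def centre(RT):
        R, t = RT[:3, :3], RT[:, -1]
        return -R.T @ t                                  # camera centre in world coordinates
    def spherical(xyz):
        x, y, z = xyz
        theta = np.arctan2(np.hypot(x, y), z)            # elevation measured from the z-axis
        azimuth = np.arctan2(y, x)
        return theta, azimuth
    t_theta, t_azi = spherical(centre(target_RT))
    c_theta, c_azi = spherical(centre(cond_RT))
    return t_theta - c_theta, (t_azi - c_azi) % (2 * np.pi)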
+ def get_bg_color(self):
191
+ if self.bg_color == 'white':
192
+ bg_color = np.array([1., 1., 1.], dtype=np.float32)
193
+ elif self.bg_color == 'black':
194
+ bg_color = np.array([0., 0., 0.], dtype=np.float32)
195
+ elif self.bg_color == 'gray':
196
+ bg_color = np.array([0.5, 0.5, 0.5], dtype=np.float32)
197
+ elif self.bg_color == 'random':
198
+ bg_color = np.random.rand(3)
199
+ elif isinstance(self.bg_color, float):
200
+ bg_color = np.array([self.bg_color] * 3, dtype=np.float32)
201
+ else:
202
+ raise NotImplementedError
203
+ return bg_color
204
+
205
+
206
+ def load_image(self, img_path, bg_color, return_type='np', Imagefile=None):
207
+ # pil always returns uint8
208
+ if Imagefile is None:
209
+ image_input = Image.open(img_path)
210
+ else:
211
+ image_input = Imagefile
212
+ image_size = self.img_wh[0]
213
+
214
+
215
+ if np.asarray(image_input).shape[-1] != 4:
216
+ print('removing background for:', image_input)
217
+ image_input = remove(image_input)
218
+
219
+ if self.crop_size!=-1:
220
+ alpha_np = np.asarray(image_input)[:, :, 3]
221
+ coords = np.stack(np.nonzero(alpha_np), 1)[:, (1, 0)]
222
+ min_x, min_y = np.min(coords, 0)
223
+ max_x, max_y = np.max(coords, 0)
224
+ ref_img_ = image_input.crop((min_x, min_y, max_x, max_y))
225
+ h, w = ref_img_.height, ref_img_.width
226
+ scale = self.crop_size / max(h, w)
227
+ h_, w_ = int(scale * h), int(scale * w)
228
+ ref_img_ = ref_img_.resize((w_, h_))
229
+ image_input = add_margin(ref_img_, size=image_size)
230
+ else:
231
+ image_input = add_margin(image_input, size=max(image_input.height, image_input.width))
232
+ image_input = image_input.resize((image_size, image_size))
233
+
234
+ # img = scale_and_place_object(img, self.scale_ratio)
235
+ img = np.array(image_input)
236
+ img = img.astype(np.float32) / 255. # [0, 1]
237
+ assert img.shape[-1] == 4 # RGBA
238
+
239
+ alpha = img[...,3:4]
240
+ img = img[...,:3] * alpha + bg_color * (1 - alpha)
241
+
242
+ if return_type == "np":
243
+ pass
244
+ elif return_type == "pt":
245
+ img = torch.from_numpy(img)
246
+ alpha = torch.from_numpy(alpha)
247
+ else:
248
+ raise NotImplementedError
249
+
250
+ return img, alpha
251
+
252
+
253
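# Illustrative sketch (not part of the diff): the preprocessing performed by load_image above,
# shown step by step on a placeholder RGBA image. The sizes are assumptions.
import numpy as np
from PIL import Image

crop_size, image_size = 192, 256
rgba = Image.new('RGBA', (400, 300), (0, 0, 0, 0))
rgba.paste(Image.new('RGBA', (120, 180), (200, 50, 50, 255)), (140, 60))   # fake object

alpha_np = np.asarray(rgba)[:, :, 3]
ys, xs = np.nonzero(alpha_np)
obj = rgba.crop((xs.min(), ys.min(), xs.max(), ys.max()))     # 1. crop to the alpha bbox
scale = crop_size / max(obj.size)                             # 2. longest side -> crop_size
obj = obj.resize((int(scale * obj.width), int(scale * obj.height)))
canvas = add_margin(obj, size=image_size)                     # 3. centre on a square canvas
arr = np.asarray(canvas).astype(np.float32) / 255.
rgb = arr[..., :3] * arr[..., 3:] + 1.0 * (1 - arr[..., 3:])  # 4. composite over a white bg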
+ def __len__(self):
254
+ return len(self.all_images)
255
+
256
+ def __getitem__(self, index):
257
+
258
+ image = self.all_images[index%len(self.all_images)]
259
+ alpha = self.all_alphas[index%len(self.all_images)]
260
+ if self.load_cam_type:
261
+ cam_type = self.cam_types[index%len(self.all_images)]
262
+ else:
263
+ cam_type = 'ortho'
264
+ if self.file_list is not None:
265
+ filename = self.file_list[index%len(self.all_images)].replace(".png", "")
266
+ else:
267
+ filename = 'null'
268
+
269
+ print(self.cam_types, self.file_list)
270
+ print('self camera type:', self.cam_types, cam_type)
271
+
272
+ cond_w2c = self.fix_cam_poses['front']
273
+
274
+ tgt_w2cs = [self.fix_cam_poses[view] for view in self.view_types]
275
+
276
+ elevations = []
277
+ azimuths = []
278
+
279
+ img_tensors_in = [
280
+ image.permute(2, 0, 1)
281
+ ] * self.num_views
282
+
283
+ alpha_tensors_in = [
284
+ alpha.permute(2, 0, 1)
285
+ ] * self.num_views
286
+
287
+ for view, tgt_w2c in zip(self.view_types, tgt_w2cs):
288
+ # elevations, azimuths
289
+ elevation, azimuth = self.get_T(tgt_w2c, cond_w2c)
290
+ elevations.append(elevation)
291
+ azimuths.append(azimuth)
292
+
293
+ img_tensors_in = torch.stack(img_tensors_in, dim=0).float() # (Nv, 3, H, W)
294
+ alpha_tensors_in = torch.stack(alpha_tensors_in, dim=0).float() # (Nv, 3, H, W)
295
+
296
+ elevations = torch.as_tensor(elevations).float().squeeze(1)
297
+ azimuths = torch.as_tensor(azimuths).float().squeeze(1)
298
+ elevations_cond = torch.as_tensor([0] * self.num_views).float()
299
+
300
+ normal_class = torch.tensor([1, 0]).float()
301
+ normal_task_embeddings = torch.stack([normal_class]*self.num_views, dim=0) # (Nv, 2)
302
+ color_class = torch.tensor([0, 1]).float()
303
+ color_task_embeddings = torch.stack([color_class]*self.num_views, dim=0) # (Nv, 2)
304
+ depth_class = torch.tensor([1, 1]).float()
305
+ depth_task_embeddings = torch.stack([depth_class]*self.num_views, dim=0)
306
+
307
+ camera_embeddings = torch.stack([elevations_cond, elevations, azimuths], dim=-1) # (Nv, 3)
308
+
309
+ print("camera type:", cam_type)
310
+ if cam_type == 'ortho':
311
+ cam_type_emb = torch.tensor([0, 1]).expand(self.num_views, -1)
312
+ else:
313
+ cam_type_emb = torch.tensor([1, 0]).expand(self.num_views, -1)
314
+
315
+ if self.load_cam_type:
316
+ camera_embeddings = torch.cat((camera_embeddings, cam_type_emb), dim=-1) # (Nv, 5)
317
+
318
+ out = {
319
+ 'elevations_cond': elevations_cond,
320
+ 'elevations_cond_deg': torch.rad2deg(elevations_cond),
321
+ 'elevations': elevations,
322
+ 'azimuths': azimuths,
323
+ 'elevations_deg': torch.rad2deg(elevations),
324
+ 'azimuths_deg': torch.rad2deg(azimuths),
325
+ 'imgs_in': img_tensors_in,
326
+ 'alphas': alpha_tensors_in,
327
+ 'camera_embeddings': camera_embeddings,
328
+ 'normal_task_embeddings': normal_task_embeddings,
329
+ 'color_task_embeddings': color_task_embeddings,
330
+ 'depth_task_embeddings': depth_task_embeddings,
331
+ 'filename': filename,
332
+ 'cam_type': cam_type
333
+ }
334
+
335
+ return out
336
+
337
+
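# Illustrative usage sketch (not part of the diff): iterating SingleImageDataset with a
# DataLoader. The directory and sizes below are placeholder assumptions.
import torch
from torch.utils.data import DataLoader

dataset = SingleImageDataset(root_dir='path/to/rgba_images', num_views=6,
                             img_wh=(256, 256), bg_color='white', crop_size=192)
loader = DataLoader(dataset, batch_size=1, shuffle=False)
batch = next(iter(loader))
print(batch['imgs_in'].shape)             # (1, Nv, 3, H, W) -- the same view repeated Nv times
print(batch['camera_embeddings'].shape)   # (1, Nv, 5) when load_cam_type is True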
mv_diffusion_30/models/transformer_mv2d.py ADDED
@@ -0,0 +1,1093 @@
1
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ from dataclasses import dataclass
15
+ from typing import Any, Dict, Optional
16
+
17
+ import torch
18
+ import torch.nn.functional as F
19
+ from torch import nn
20
+ # from torch.nn.attention import SDPBackend, sdpa_kernel
21
+
22
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
23
+ from diffusers.models.embeddings import ImagePositionalEmbeddings
24
+ from diffusers.utils import BaseOutput, deprecate
25
+ from diffusers.utils.torch_utils import maybe_allow_in_graph
26
+ from diffusers.models.attention import FeedForward, AdaLayerNorm, AdaLayerNormZero, Attention
27
+ from diffusers.models.embeddings import PatchEmbed
28
+ from diffusers.models.lora import LoRACompatibleConv, LoRACompatibleLinear
29
+ from diffusers.models.modeling_utils import ModelMixin
30
+ from diffusers.utils.import_utils import is_xformers_available
31
+
32
+ from einops import rearrange, repeat
33
+ import pdb
34
+ import random
35
+
36
+
37
+ # if is_xformers_available():
38
+ # import xformers
39
+ # import xformers.ops
40
+ # else:
41
+ # xformers = None
42
+
43
+ def my_repeat(tensor, num_repeats):
44
+ """
45
+ Repeat a tensor along a given dimension
46
+ """
47
+ if len(tensor.shape) == 3:
48
+ return repeat(tensor, "b d c -> (b v) d c", v=num_repeats)
49
+ elif len(tensor.shape) == 4:
50
+ return repeat(tensor, "a b d c -> (a v) b d c", v=num_repeats)
51
+
52
+
53
+ @dataclass
54
+ class TransformerMV2DModelOutput(BaseOutput):
55
+ """
56
+ The output of [`Transformer2DModel`].
57
+
58
+ Args:
59
+ sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` or `(batch size, num_vector_embeds - 1, num_latent_pixels)` if [`Transformer2DModel`] is discrete):
60
+ The hidden states output conditioned on the `encoder_hidden_states` input. If discrete, returns probability
61
+ distributions for the unnoised latent pixels.
62
+ """
63
+
64
+ sample: torch.FloatTensor
65
+
66
+
67
+ class TransformerMV2DModel(ModelMixin, ConfigMixin):
68
+ """
69
+ A 2D Transformer model for image-like data.
70
+
71
+ Parameters:
72
+ num_attention_heads (`int`, *optional*, defaults to 16): The number of heads to use for multi-head attention.
73
+ attention_head_dim (`int`, *optional*, defaults to 88): The number of channels in each head.
74
+ in_channels (`int`, *optional*):
75
+ The number of channels in the input and output (specify if the input is **continuous**).
76
+ num_layers (`int`, *optional*, defaults to 1): The number of layers of Transformer blocks to use.
77
+ dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
78
+ cross_attention_dim (`int`, *optional*): The number of `encoder_hidden_states` dimensions to use.
79
+ sample_size (`int`, *optional*): The width of the latent images (specify if the input is **discrete**).
80
+ This is fixed during training since it is used to learn a number of position embeddings.
81
+ num_vector_embeds (`int`, *optional*):
82
+ The number of classes of the vector embeddings of the latent pixels (specify if the input is **discrete**).
83
+ Includes the class for the masked latent pixel.
84
+ activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to use in feed-forward.
85
+ num_embeds_ada_norm ( `int`, *optional*):
86
+ The number of diffusion steps used during training. Pass if at least one of the norm_layers is
87
+ `AdaLayerNorm`. This is fixed during training since it is used to learn a number of embeddings that are
88
+ added to the hidden states.
89
+
90
+ During inference, you can denoise for up to but not more steps than `num_embeds_ada_norm`.
91
+ attention_bias (`bool`, *optional*):
92
+ Configure if the `TransformerBlocks` attention should contain a bias parameter.
93
+ """
94
+
95
+ @register_to_config
96
+ def __init__(
97
+ self,
98
+ num_attention_heads: int = 16,
99
+ attention_head_dim: int = 88,
100
+ in_channels: Optional[int] = None,
101
+ out_channels: Optional[int] = None,
102
+ num_layers: int = 1,
103
+ dropout: float = 0.0,
104
+ norm_num_groups: int = 32,
105
+ cross_attention_dim: Optional[int] = None,
106
+ attention_bias: bool = False,
107
+ sample_size: Optional[int] = None,
108
+ num_vector_embeds: Optional[int] = None,
109
+ patch_size: Optional[int] = None,
110
+ activation_fn: str = "geglu",
111
+ num_embeds_ada_norm: Optional[int] = None,
112
+ use_linear_projection: bool = False,
113
+ only_cross_attention: bool = False,
114
+ upcast_attention: bool = False,
115
+ norm_type: str = "layer_norm",
116
+ norm_elementwise_affine: bool = True,
117
+ num_views: int = 1,
118
+ cd_attention_last: bool=False,
119
+ cd_attention_mid: bool=False,
120
+ multiview_attention: bool=True,
121
+ sparse_mv_attention: bool = False,
122
+ mvcd_attention: bool=False
123
+ ):
124
+ super().__init__()
125
+ self.use_linear_projection = use_linear_projection
126
+ self.num_attention_heads = num_attention_heads
127
+ self.attention_head_dim = attention_head_dim
128
+ inner_dim = num_attention_heads * attention_head_dim
129
+
130
+ # 1. Transformer2DModel can process both standard continuous images of shape `(batch_size, num_channels, width, height)` as well as quantized image embeddings of shape `(batch_size, num_image_vectors)`
131
+ # Define whether input is continuous or discrete depending on configuration
132
+ self.is_input_continuous = (in_channels is not None) and (patch_size is None)
133
+ self.is_input_vectorized = num_vector_embeds is not None
134
+ self.is_input_patches = in_channels is not None and patch_size is not None
135
+
136
+ if norm_type == "layer_norm" and num_embeds_ada_norm is not None:
137
+ deprecation_message = (
138
+ f"The configuration file of this model: {self.__class__} is outdated. `norm_type` is either not set or"
139
+ " incorrectly set to `'layer_norm'`.Make sure to set `norm_type` to `'ada_norm'` in the config."
140
+ " Please make sure to update the config accordingly as leaving `norm_type` might led to incorrect"
141
+ " results in future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it"
142
+ " would be very nice if you could open a Pull request for the `transformer/config.json` file"
143
+ )
144
+ deprecate("norm_type!=num_embeds_ada_norm", "1.0.0", deprecation_message, standard_warn=False)
145
+ norm_type = "ada_norm"
146
+
147
+ if self.is_input_continuous and self.is_input_vectorized:
148
+ raise ValueError(
149
+ f"Cannot define both `in_channels`: {in_channels} and `num_vector_embeds`: {num_vector_embeds}. Make"
150
+ " sure that either `in_channels` or `num_vector_embeds` is None."
151
+ )
152
+ elif self.is_input_vectorized and self.is_input_patches:
153
+ raise ValueError(
154
+ f"Cannot define both `num_vector_embeds`: {num_vector_embeds} and `patch_size`: {patch_size}. Make"
155
+ " sure that either `num_vector_embeds` or `num_patches` is None."
156
+ )
157
+ elif not self.is_input_continuous and not self.is_input_vectorized and not self.is_input_patches:
158
+ raise ValueError(
159
+ f"Has to define `in_channels`: {in_channels}, `num_vector_embeds`: {num_vector_embeds}, or patch_size:"
160
+ f" {patch_size}. Make sure that `in_channels`, `num_vector_embeds` or `num_patches` is not None."
161
+ )
162
+
163
+ # 2. Define input layers
164
+ if self.is_input_continuous:
165
+ self.in_channels = in_channels
166
+
167
+ self.norm = torch.nn.GroupNorm(num_groups=norm_num_groups, num_channels=in_channels, eps=1e-6, affine=True)
168
+ if use_linear_projection:
169
+ self.proj_in = LoRACompatibleLinear(in_channels, inner_dim)
170
+ else:
171
+ self.proj_in = LoRACompatibleConv(in_channels, inner_dim, kernel_size=1, stride=1, padding=0)
172
+ elif self.is_input_vectorized:
173
+ assert sample_size is not None, "Transformer2DModel over discrete input must provide sample_size"
174
+ assert num_vector_embeds is not None, "Transformer2DModel over discrete input must provide num_embed"
175
+
176
+ self.height = sample_size
177
+ self.width = sample_size
178
+ self.num_vector_embeds = num_vector_embeds
179
+ self.num_latent_pixels = self.height * self.width
180
+
181
+ self.latent_image_embedding = ImagePositionalEmbeddings(
182
+ num_embed=num_vector_embeds, embed_dim=inner_dim, height=self.height, width=self.width
183
+ )
184
+ elif self.is_input_patches:
185
+ assert sample_size is not None, "Transformer2DModel over patched input must provide sample_size"
186
+
187
+ self.height = sample_size
188
+ self.width = sample_size
189
+
190
+ self.patch_size = patch_size
191
+ self.pos_embed = PatchEmbed(
192
+ height=sample_size,
193
+ width=sample_size,
194
+ patch_size=patch_size,
195
+ in_channels=in_channels,
196
+ embed_dim=inner_dim,
197
+ )
198
+
199
+ # 3. Define transformers blocks
200
+ self.transformer_blocks = nn.ModuleList(
201
+ [
202
+ BasicMVTransformerBlock(
203
+ inner_dim,
204
+ num_attention_heads,
205
+ attention_head_dim,
206
+ dropout=dropout,
207
+ cross_attention_dim=cross_attention_dim,
208
+ activation_fn=activation_fn,
209
+ num_embeds_ada_norm=num_embeds_ada_norm,
210
+ attention_bias=attention_bias,
211
+ only_cross_attention=only_cross_attention,
212
+ upcast_attention=upcast_attention,
213
+ norm_type=norm_type,
214
+ norm_elementwise_affine=norm_elementwise_affine,
215
+ num_views=num_views,
216
+ cd_attention_last=cd_attention_last,
217
+ cd_attention_mid=cd_attention_mid,
218
+ multiview_attention=multiview_attention,
219
+ sparse_mv_attention=sparse_mv_attention,
220
+ mvcd_attention=mvcd_attention
221
+ )
222
+ for d in range(num_layers)
223
+ ]
224
+ )
225
+
226
+ # 4. Define output layers
227
+ self.out_channels = in_channels if out_channels is None else out_channels
228
+ if self.is_input_continuous:
229
+ # TODO: should use out_channels for continuous projections
230
+ if use_linear_projection:
231
+ self.proj_out = LoRACompatibleLinear(inner_dim, in_channels)
232
+ else:
233
+ self.proj_out = LoRACompatibleConv(inner_dim, in_channels, kernel_size=1, stride=1, padding=0)
234
+ elif self.is_input_vectorized:
235
+ self.norm_out = nn.LayerNorm(inner_dim)
236
+ self.out = nn.Linear(inner_dim, self.num_vector_embeds - 1)
237
+ elif self.is_input_patches:
238
+ self.norm_out = nn.LayerNorm(inner_dim, elementwise_affine=False, eps=1e-6)
239
+ self.proj_out_1 = nn.Linear(inner_dim, 2 * inner_dim)
240
+ self.proj_out_2 = nn.Linear(inner_dim, patch_size * patch_size * self.out_channels)
241
+
242
+ def forward(
243
+ self,
244
+ hidden_states: torch.Tensor,
245
+ encoder_hidden_states: Optional[torch.Tensor] = None,
246
+ timestep: Optional[torch.LongTensor] = None,
247
+ class_labels: Optional[torch.LongTensor] = None,
248
+ cross_attention_kwargs: Dict[str, Any] = None,
249
+ attention_mask: Optional[torch.Tensor] = None,
250
+ encoder_attention_mask: Optional[torch.Tensor] = None,
251
+ return_dict: bool = True,
252
+ ):
253
+ """
254
+ The [`Transformer2DModel`] forward method.
255
+
256
+ Args:
257
+ hidden_states (`torch.LongTensor` of shape `(batch size, num latent pixels)` if discrete, `torch.FloatTensor` of shape `(batch size, channel, height, width)` if continuous):
258
+ Input `hidden_states`.
259
+ encoder_hidden_states ( `torch.FloatTensor` of shape `(batch size, sequence len, embed dims)`, *optional*):
260
+ Conditional embeddings for cross attention layer. If not given, cross-attention defaults to
261
+ self-attention.
262
+ timestep ( `torch.LongTensor`, *optional*):
263
+ Used to indicate denoising step. Optional timestep to be applied as an embedding in `AdaLayerNorm`.
264
+ class_labels ( `torch.LongTensor` of shape `(batch size, num classes)`, *optional*):
265
+ Used to indicate class labels conditioning. Optional class labels to be applied as an embedding in
266
+ `AdaLayerNormZero`.
267
+ encoder_attention_mask ( `torch.Tensor`, *optional*):
268
+ Cross-attention mask applied to `encoder_hidden_states`. Two formats supported:
269
+
270
+ * Mask `(batch, sequence_length)` True = keep, False = discard.
271
+ * Bias `(batch, 1, sequence_length)` 0 = keep, -10000 = discard.
272
+
273
+ If `ndim == 2`: will be interpreted as a mask, then converted into a bias consistent with the format
274
+ above. This bias will be added to the cross-attention scores.
275
+ return_dict (`bool`, *optional*, defaults to `True`):
276
+ Whether or not to return a [`TransformerMV2DModelOutput`] instead of a plain
277
+ tuple.
278
+
279
+ Returns:
280
+ If `return_dict` is True, a [`TransformerMV2DModelOutput`] is returned, otherwise a
281
+ `tuple` where the first element is the sample tensor.
282
+ """
283
+ # ensure attention_mask is a bias, and give it a singleton query_tokens dimension.
284
+ # we may have done this conversion already, e.g. if we came here via UNet2DConditionModel#forward.
285
+ # we can tell by counting dims; if ndim == 2: it's a mask rather than a bias.
286
+ # expects mask of shape:
287
+ # [batch, key_tokens]
288
+ # adds singleton query_tokens dimension:
289
+ # [batch, 1, key_tokens]
290
+ # this helps to broadcast it as a bias over attention scores, which will be in one of the following shapes:
291
+ # [batch, heads, query_tokens, key_tokens] (e.g. torch sdp attn)
292
+ # [batch * heads, query_tokens, key_tokens] (e.g. xformers or classic attn)
293
+ if attention_mask is not None and attention_mask.ndim == 2:
294
+ # assume that mask is expressed as:
295
+ # (1 = keep, 0 = discard)
296
+ # convert mask into a bias that can be added to attention scores:
297
+ # (keep = +0, discard = -10000.0)
298
+ attention_mask = (1 - attention_mask.to(hidden_states.dtype)) * -10000.0
299
+ attention_mask = attention_mask.unsqueeze(1)
300
+
301
+ # convert encoder_attention_mask to a bias the same way we do for attention_mask
302
+ if encoder_attention_mask is not None and encoder_attention_mask.ndim == 2:
303
+ encoder_attention_mask = (1 - encoder_attention_mask.to(hidden_states.dtype)) * -10000.0
304
+ encoder_attention_mask = encoder_attention_mask.unsqueeze(1)
305
+
306
+ # 1. Input
307
+ if self.is_input_continuous:
308
+ batch, _, height, width = hidden_states.shape
309
+ residual = hidden_states
310
+
311
+ hidden_states = self.norm(hidden_states)
312
+ if not self.use_linear_projection:
313
+ hidden_states = self.proj_in(hidden_states)
314
+ inner_dim = hidden_states.shape[1]
315
+ hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch, height * width, inner_dim)
316
+ else:
317
+ inner_dim = hidden_states.shape[1]
318
+ hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch, height * width, inner_dim)
319
+ hidden_states = self.proj_in(hidden_states)
320
+ elif self.is_input_vectorized:
321
+ hidden_states = self.latent_image_embedding(hidden_states)
322
+ elif self.is_input_patches:
323
+ hidden_states = self.pos_embed(hidden_states)
324
+
325
+ # 2. Blocks
326
+ for block in self.transformer_blocks:
327
+ hidden_states = block(
328
+ hidden_states,
329
+ attention_mask=attention_mask,
330
+ encoder_hidden_states=encoder_hidden_states,
331
+ encoder_attention_mask=encoder_attention_mask,
332
+ timestep=timestep,
333
+ cross_attention_kwargs=cross_attention_kwargs,
334
+ class_labels=class_labels,
335
+ )
336
+
337
+ # 3. Output
338
+ if self.is_input_continuous:
339
+ if not self.use_linear_projection:
340
+ hidden_states = hidden_states.reshape(batch, height, width, inner_dim).permute(0, 3, 1, 2).contiguous()
341
+ hidden_states = self.proj_out(hidden_states)
342
+ else:
343
+ hidden_states = self.proj_out(hidden_states)
344
+ hidden_states = hidden_states.reshape(batch, height, width, inner_dim).permute(0, 3, 1, 2).contiguous()
345
+
346
+ output = hidden_states + residual
347
+ elif self.is_input_vectorized:
348
+ hidden_states = self.norm_out(hidden_states)
349
+ logits = self.out(hidden_states)
350
+ # (batch, self.num_vector_embeds - 1, self.num_latent_pixels)
351
+ logits = logits.permute(0, 2, 1)
352
+
353
+ # log(p(x_0))
354
+ output = F.log_softmax(logits.double(), dim=1).float()
355
+ elif self.is_input_patches:
356
+ # TODO: cleanup!
357
+ conditioning = self.transformer_blocks[0].norm1.emb(
358
+ timestep, class_labels, hidden_dtype=hidden_states.dtype
359
+ )
360
+ shift, scale = self.proj_out_1(F.silu(conditioning)).chunk(2, dim=1)
361
+ hidden_states = self.norm_out(hidden_states) * (1 + scale[:, None]) + shift[:, None]
362
+ hidden_states = self.proj_out_2(hidden_states)
363
+
364
+ # unpatchify
365
+ height = width = int(hidden_states.shape[1] ** 0.5)
366
+ hidden_states = hidden_states.reshape(
367
+ shape=(-1, height, width, self.patch_size, self.patch_size, self.out_channels)
368
+ )
369
+ hidden_states = torch.einsum("nhwpqc->nchpwq", hidden_states)
370
+ output = hidden_states.reshape(
371
+ shape=(-1, self.out_channels, height * self.patch_size, width * self.patch_size)
372
+ )
373
+
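A quick shape check of the unpatchify step above, using illustrative sizes rather than values from any config in this repository:

# (N, h*w, p*p*C) patch tokens -> (N, C, h*p, w*p) image, via the same reshape/einsum as above.
import torch

N, C, p, h, w = 2, 4, 2, 8, 8                  # batch, out_channels, patch_size, grid height/width
tokens = torch.randn(N, h * w, p * p * C)
x = tokens.reshape(-1, h, w, p, p, C)
x = torch.einsum("nhwpqc->nchpwq", x)
image = x.reshape(-1, C, h * p, w * p)
print(image.shape)  # torch.Size([2, 4, 16, 16])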
374
+ if not return_dict:
375
+ return (output,)
376
+
377
+ return TransformerMV2DModelOutput(sample=output)
378
+
379
+
380
+ @maybe_allow_in_graph
381
+ class BasicMVTransformerBlock(nn.Module):
382
+ r"""
383
+ A basic Transformer block.
384
+
385
+ Parameters:
386
+ dim (`int`): The number of channels in the input and output.
387
+ num_attention_heads (`int`): The number of heads to use for multi-head attention.
388
+ attention_head_dim (`int`): The number of channels in each head.
389
+ dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
390
+ cross_attention_dim (`int`, *optional*): The size of the encoder_hidden_states vector for cross attention.
391
+ only_cross_attention (`bool`, *optional*):
392
+ Whether to use only cross-attention layers. In this case two cross attention layers are used.
393
+ double_self_attention (`bool`, *optional*):
394
+ Whether to use two self-attention layers. In this case no cross attention layers are used.
395
+ activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
396
+ num_embeds_ada_norm (`int`, *optional*):
397
+ The number of diffusion steps used during training. See `Transformer2DModel`.
398
+ attention_bias (`bool`, *optional*, defaults to `False`):
399
+ Configure if the attentions should contain a bias parameter.
400
+ """
401
+
402
+ def __init__(
403
+ self,
404
+ dim: int,
405
+ num_attention_heads: int,
406
+ attention_head_dim: int,
407
+ dropout=0.0,
408
+ cross_attention_dim: Optional[int] = None,
409
+ activation_fn: str = "geglu",
410
+ num_embeds_ada_norm: Optional[int] = None,
411
+ attention_bias: bool = False,
412
+ only_cross_attention: bool = False,
413
+ double_self_attention: bool = False,
414
+ upcast_attention: bool = False,
415
+ norm_elementwise_affine: bool = True,
416
+ norm_type: str = "layer_norm",
417
+ final_dropout: bool = False,
418
+ num_views: int = 1,
419
+ cd_attention_last: bool = False,
420
+ cd_attention_mid: bool = False,
421
+ multiview_attention: bool = True,
422
+ sparse_mv_attention: bool = False,
423
+ mvcd_attention: bool = False
424
+ ):
425
+ super().__init__()
426
+ self.only_cross_attention = only_cross_attention
427
+
428
+ self.use_ada_layer_norm_zero = (num_embeds_ada_norm is not None) and norm_type == "ada_norm_zero"
429
+ self.use_ada_layer_norm = (num_embeds_ada_norm is not None) and norm_type == "ada_norm"
430
+
431
+ if norm_type in ("ada_norm", "ada_norm_zero") and num_embeds_ada_norm is None:
432
+ raise ValueError(
433
+ f"`norm_type` is set to {norm_type}, but `num_embeds_ada_norm` is not defined. Please make sure to"
434
+ f" define `num_embeds_ada_norm` if setting `norm_type` to {norm_type}."
435
+ )
436
+
437
+ # Define 3 blocks. Each block has its own normalization layer.
438
+ # 1. Self-Attn
439
+ if self.use_ada_layer_norm:
440
+ self.norm1 = AdaLayerNorm(dim, num_embeds_ada_norm)
441
+ elif self.use_ada_layer_norm_zero:
442
+ self.norm1 = AdaLayerNormZero(dim, num_embeds_ada_norm)
443
+ else:
444
+ self.norm1 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine)
445
+
446
+ self.multiview_attention = multiview_attention
447
+ self.sparse_mv_attention = sparse_mv_attention
448
+ self.mvcd_attention = mvcd_attention
449
+
450
+ self.attn1 = CustomAttention(
451
+ query_dim=dim,
452
+ heads=num_attention_heads,
453
+ dim_head=attention_head_dim,
454
+ dropout=dropout,
455
+ bias=attention_bias,
456
+ cross_attention_dim=cross_attention_dim if only_cross_attention else None,
457
+ upcast_attention=upcast_attention,
458
+ processor=MVAttnProcessor()
459
+ )
460
+
461
+ # 2. Cross-Attn
462
+ if cross_attention_dim is not None or double_self_attention:
463
+ # We currently only use AdaLayerNormZero for self attention where there will only be one attention block.
464
+ # I.e. the number of returned modulation chunks from AdaLayerZero would not make sense if returned during
465
+ # the second cross attention block.
466
+ self.norm2 = (
467
+ AdaLayerNorm(dim, num_embeds_ada_norm)
468
+ if self.use_ada_layer_norm
469
+ else nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine)
470
+ )
471
+ self.attn2 = Attention(
472
+ query_dim=dim,
473
+ cross_attention_dim=cross_attention_dim if not double_self_attention else None,
474
+ heads=num_attention_heads,
475
+ dim_head=attention_head_dim,
476
+ dropout=dropout,
477
+ bias=attention_bias,
478
+ upcast_attention=upcast_attention,
479
+ # processor=CrossAttnProcessor()
480
+ ) # is self-attn if encoder_hidden_states is none
481
+ else:
482
+ self.norm2 = None
483
+ self.attn2 = None
484
+
485
+ # 3. Feed-forward
486
+ self.norm3 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine)
487
+ self.ff = FeedForward(dim, dropout=dropout, activation_fn=activation_fn, final_dropout=final_dropout)
488
+
489
+ # let chunk size default to None
490
+ self._chunk_size = None
491
+ self._chunk_dim = 0
492
+
493
+ self.num_views = num_views
494
+
495
+ self.cd_attention_last = cd_attention_last
496
+
497
+ if self.cd_attention_last:
498
+ # Joint task -Attn
499
+ self.attn_joint_last = Attention(
500
+ query_dim=dim,
501
+ heads=num_attention_heads,
502
+ dim_head=attention_head_dim,
503
+ dropout=dropout,
504
+ bias=attention_bias,
505
+ cross_attention_dim=cross_attention_dim if only_cross_attention else None,
506
+ upcast_attention=upcast_attention,
507
+ processor=JointAttnProcessor()
508
+ )
509
+ nn.init.zeros_(self.attn_joint_last.to_out[0].weight.data)
510
+ self.norm_joint_last = AdaLayerNorm(dim, num_embeds_ada_norm) if self.use_ada_layer_norm else nn.LayerNorm(dim)
511
+
512
+
513
+ self.cd_attention_mid = cd_attention_mid
514
+
515
+ if self.cd_attention_mid:
516
+ # print("cross-domain attn in the middle")
517
+ # Joint task -Attn
518
+ self.attn_joint_mid = Attention(
519
+ query_dim=dim,
520
+ heads=num_attention_heads,
521
+ dim_head=attention_head_dim,
522
+ dropout=dropout,
523
+ bias=attention_bias,
524
+ cross_attention_dim=cross_attention_dim if only_cross_attention else None,
525
+ upcast_attention=upcast_attention,
526
+ processor=JointAttnProcessor()
527
+ )
528
+ nn.init.zeros_(self.attn_joint_mid.to_out[0].weight.data)
529
+ self.norm_joint_mid = AdaLayerNorm(dim, num_embeds_ada_norm) if self.use_ada_layer_norm else nn.LayerNorm(dim)
530
+
531
+ def set_chunk_feed_forward(self, chunk_size: Optional[int], dim: int):
532
+ # Sets chunk feed-forward
533
+ self._chunk_size = chunk_size
534
+ self._chunk_dim = dim
535
+
536
+ def forward(
537
+ self,
538
+ hidden_states: torch.FloatTensor,
539
+ attention_mask: Optional[torch.FloatTensor] = None,
540
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
541
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
542
+ timestep: Optional[torch.LongTensor] = None,
543
+ cross_attention_kwargs: Dict[str, Any] = None,
544
+ class_labels: Optional[torch.LongTensor] = None,
545
+ ):
546
+ """
547
+
548
+ Forward pass of the multi-view transformer block. `attention_mask` is not supported yet and must be `None`.
549
+ """
550
+ assert attention_mask is None # not supported yet
551
+ # Notice that normalization is always applied before the real computation in the following blocks.
552
+ # 1. Self-Attention
553
+ if self.use_ada_layer_norm:
554
+ norm_hidden_states = self.norm1(hidden_states, timestep)
555
+ elif self.use_ada_layer_norm_zero:
556
+ norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(
557
+ hidden_states, timestep, class_labels, hidden_dtype=hidden_states.dtype
558
+ )
559
+ else:
560
+ norm_hidden_states = self.norm1(hidden_states)
561
+
562
+ cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {}
563
+
564
+ attn_output = self.attn1(norm_hidden_states,
565
+ encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None,
566
+ attention_mask=attention_mask,
567
+ num_views=self.num_views,
568
+ multiview_attention=self.multiview_attention,
569
+ sparse_mv_attention=self.sparse_mv_attention,
570
+ mvcd_attention=self.mvcd_attention,
571
+ **cross_attention_kwargs,
572
+ )
573
+
574
+
575
+ if self.use_ada_layer_norm_zero:
576
+ attn_output = gate_msa.unsqueeze(1) * attn_output
577
+ hidden_states = attn_output + hidden_states
578
+
579
+ # joint attention twice
580
+ if self.cd_attention_mid:
581
+ norm_hidden_states = (
582
+ self.norm_joint_mid(hidden_states, timestep) if self.use_ada_layer_norm else self.norm_joint_mid(hidden_states)
583
+ )
584
+ hidden_states = self.attn_joint_mid(norm_hidden_states) + hidden_states
585
+
586
+ # 2. Cross-Attention
587
+ if self.attn2 is not None:
588
+ norm_hidden_states = (
589
+ self.norm2(hidden_states, timestep) if self.use_ada_layer_norm else self.norm2(hidden_states)
590
+ )
591
+
592
+ attn_output = self.attn2(
593
+ norm_hidden_states,
594
+ encoder_hidden_states=encoder_hidden_states,
595
+ attention_mask=encoder_attention_mask,
596
+ **cross_attention_kwargs,
597
+ )
598
+ hidden_states = attn_output + hidden_states
599
+
600
+ # 3. Feed-forward
601
+ norm_hidden_states = self.norm3(hidden_states)
602
+
603
+ if self.use_ada_layer_norm_zero:
604
+ norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
605
+
606
+ if self._chunk_size is not None:
607
+ # "feed_forward_chunk_size" can be used to save memory
608
+ if norm_hidden_states.shape[self._chunk_dim] % self._chunk_size != 0:
609
+ raise ValueError(
610
+ f"`hidden_states` dimension to be chunked: {norm_hidden_states.shape[self._chunk_dim]} has to be divisible by chunk size: {self._chunk_size}. Make sure to set an appropriate `chunk_size` when calling `unet.enable_forward_chunking`."
611
+ )
612
+
613
+ num_chunks = norm_hidden_states.shape[self._chunk_dim] // self._chunk_size
614
+ ff_output = torch.cat(
615
+ [self.ff(hid_slice) for hid_slice in norm_hidden_states.chunk(num_chunks, dim=self._chunk_dim)],
616
+ dim=self._chunk_dim,
617
+ )
618
+ else:
619
+ ff_output = self.ff(norm_hidden_states)
620
+
621
+ if self.use_ada_layer_norm_zero:
622
+ ff_output = gate_mlp.unsqueeze(1) * ff_output
623
+
624
+ hidden_states = ff_output + hidden_states
625
+
626
+ if self.cd_attention_last:
627
+ norm_hidden_states = (
628
+ self.norm_joint_last(hidden_states, timestep) if self.use_ada_layer_norm else self.norm_joint_last(hidden_states)
629
+ )
630
+ hidden_states = self.attn_joint_last(norm_hidden_states) + hidden_states
631
+
632
+ return hidden_states
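Because the feed-forward acts on every token independently, the `_chunk_size` branch above is exact: chunking along the token dimension only trades peak memory for extra passes. A small check, with a plain `nn.Sequential` standing in for diffusers' `FeedForward`:

# Illustrative check of the chunked feed-forward trick used in the forward pass above.
import torch
import torch.nn as nn

ff = nn.Sequential(nn.Linear(8, 32), nn.GELU(), nn.Linear(32, 8))  # stand-in for FeedForward
x = torch.randn(2, 12, 8)      # (batch, tokens, dim)
chunk_size, chunk_dim = 4, 1   # 12 tokens split into 3 chunks of 4

num_chunks = x.shape[chunk_dim] // chunk_size
chunked = torch.cat([ff(part) for part in x.chunk(num_chunks, dim=chunk_dim)], dim=chunk_dim)
print(torch.allclose(chunked, ff(x), atol=1e-6))  # True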
633
+
634
+
635
+ class CustomAttention(Attention):
636
+ def set_use_memory_efficient_attention_xformers(
637
+ self, use_memory_efficient_attention_xformers: bool, *args, **kwargs
638
+ ):
639
+ processor = XFormersMVAttnProcessor()
640
+ self.set_processor(processor)
641
+ # print("using xformers attention processor")
642
+
643
+
644
+ class CustomJointAttention(Attention):
645
+ def set_use_memory_efficient_attention_xformers(
646
+ self, use_memory_efficient_attention_xformers: bool, *args, **kwargs
647
+ ):
648
+ processor = XFormersJointAttnProcessor()
649
+ self.set_processor(processor)
650
+ # print("using xformers attention processor")
651
+
652
+ class MVAttnProcessor:
653
+ r"""
654
+ Default processor for performing attention-related computations.
655
+ """
656
+
657
+ def __call__(
658
+ self,
659
+ attn: Attention,
660
+ hidden_states,
661
+ encoder_hidden_states=None,
662
+ attention_mask=None,
663
+ temb=None,
664
+ num_views=1,
665
+ multiview_attention=True,
666
+ sparse_mv_attention=False,
667
+ mvcd_attention=False,
668
+ ):
669
+ residual = hidden_states
670
+
671
+ if attn.spatial_norm is not None:
672
+ hidden_states = attn.spatial_norm(hidden_states, temb)
673
+
674
+ input_ndim = hidden_states.ndim
675
+
676
+ if input_ndim == 4:
677
+ batch_size, channel, height, width = hidden_states.shape
678
+ hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
679
+
680
+ batch_size, sequence_length, input_dim = (
681
+ hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
682
+ )
683
+ attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
684
+
685
+ if attn.group_norm is not None:
686
+ hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
687
+
688
+ query = attn.to_q(hidden_states)
689
+
690
+ if encoder_hidden_states is None:
691
+ encoder_hidden_states = hidden_states
692
+ elif attn.norm_cross:
693
+ encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
694
+
695
+ key = attn.to_k(encoder_hidden_states)
696
+ value = attn.to_v(encoder_hidden_states)
697
+
698
+ # print('query', query.shape, 'key', key.shape, 'value', value.shape)
699
+ #([bx4, 1024, 320]) key torch.Size([bx4, 1024, 320]) value torch.Size([bx4, 1024, 320])
700
+ # pdb.set_trace()
701
+ # multi-view self-attention
702
+ if multiview_attention:
703
+ key = rearrange(key, "(b t) d c -> b (t d) c", t=num_views).repeat_interleave(num_views, dim=0)
704
+ value = rearrange(value, "(b t) d c -> b (t d) c", t=num_views).repeat_interleave(num_views, dim=0)
705
+
706
+ # batch, n_heads, n_tokens, channel
707
+ query = attn.head_to_batch_dim(query, out_dim=4).contiguous()
708
+ key = attn.head_to_batch_dim(key, out_dim=4).contiguous()
709
+ value = attn.head_to_batch_dim(value, out_dim=4).contiguous()
710
+
711
+ with torch.backends.cuda.sdp_kernel(
712
+ enable_flash=True,
713
+ enable_math=False,
714
+ enable_mem_efficient=True
715
+ ):
716
+ hidden_states = F.scaled_dot_product_attention(query, key, value)
717
+
718
+ hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, sequence_length, input_dim)
719
+
720
+ # linear proj
721
+ hidden_states = attn.to_out[0](hidden_states)
722
+ # dropout
723
+ hidden_states = attn.to_out[1](hidden_states)
724
+
725
+ if input_ndim == 4:
726
+ hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
727
+
728
+ if attn.residual_connection:
729
+ hidden_states = hidden_states + residual
730
+
731
+ hidden_states = hidden_states / attn.rescale_output_factor
732
+
733
+ return hidden_states
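The multi-view self-attention above shares keys and values across all views of the same object. A shape walk-through with illustrative sizes (einops plus torch only):

# Illustrative: keys/values from all num_views views of one object are concatenated along the
# token axis and then repeated so that every view's queries see the same expanded set.
import torch
from einops import rearrange

b, t, d, c = 2, 4, 16, 8                      # objects, views, tokens per view, channels
key = torch.randn(b * t, d, c)                # (b*t, d, c), as produced by attn.to_k
key = rearrange(key, "(b t) d c -> b (t d) c", t=t).repeat_interleave(t, dim=0)
print(key.shape)                              # torch.Size([8, 64, 8]) == (b*t, t*d, c)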
734
+
735
+
736
+ class XFormersMVAttnProcessor:
737
+ r"""
738
+ Default processor for performing attention-related computations.
739
+ """
740
+
741
+ def __call__(
742
+ self,
743
+ attn: Attention,
744
+ hidden_states,
745
+ encoder_hidden_states=None,
746
+ attention_mask=None,
747
+ temb=None,
748
+ num_views=1,
749
+ multiview_attention=True,
750
+ sparse_mv_attention=False,
751
+ mvcd_attention=False,
752
+ ):
753
+ residual = hidden_states
754
+
755
+ if attn.spatial_norm is not None:
756
+ hidden_states = attn.spatial_norm(hidden_states, temb)
757
+
758
+ input_ndim = hidden_states.ndim
759
+
760
+ if input_ndim == 4:
761
+ batch_size, channel, height, width = hidden_states.shape
762
+ hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
763
+
764
+ batch_size, sequence_length, _ = (
765
+ hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
766
+ )
767
+ attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
768
+
769
+ # from yuancheng; here attention_mask is None
770
+ if attention_mask is not None:
771
+ # expand our mask's singleton query_tokens dimension:
772
+ # [batch*heads, 1, key_tokens] ->
773
+ # [batch*heads, query_tokens, key_tokens]
774
+ # so that it can be added as a bias onto the attention scores that xformers computes:
775
+ # [batch*heads, query_tokens, key_tokens]
776
+ # we do this explicitly because xformers doesn't broadcast the singleton dimension for us.
777
+ _, query_tokens, _ = hidden_states.shape
778
+ attention_mask = attention_mask.expand(-1, query_tokens, -1)
779
+
780
+ if attn.group_norm is not None:
781
+ hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
782
+
783
+ query = attn.to_q(hidden_states)
784
+
785
+ if encoder_hidden_states is None:
786
+ encoder_hidden_states = hidden_states
787
+ elif attn.norm_cross:
788
+ encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
789
+
790
+ key_raw = attn.to_k(encoder_hidden_states)
791
+ value_raw = attn.to_v(encoder_hidden_states)
792
+
793
+ # print('query', query.shape, 'key', key.shape, 'value', value.shape)
794
+ #([bx4, 1024, 320]) key torch.Size([bx4, 1024, 320]) value torch.Size([bx4, 1024, 320])
795
+ # pdb.set_trace()
796
+ # multi-view self-attention
797
+ if multiview_attention:
798
+ if not sparse_mv_attention:
799
+ key = my_repeat(rearrange(key_raw, "(b t) d c -> b (t d) c", t=num_views), num_views)
800
+ value = my_repeat(rearrange(value_raw, "(b t) d c -> b (t d) c", t=num_views), num_views)
801
+ else:
802
+ key_front = my_repeat(rearrange(key_raw, "(b t) d c -> b t d c", t=num_views)[:, 0, :, :], num_views) # [(b t), d, c]
803
+ value_front = my_repeat(rearrange(value_raw, "(b t) d c -> b t d c", t=num_views)[:, 0, :, :], num_views)
804
+ key = torch.cat([key_front, key_raw], dim=1) # shape (b t) (2 d) c
805
+ value = torch.cat([value_front, value_raw], dim=1)
806
+
807
+ else:
808
+ # print("don't use multiview attention.")
809
+ key = key_raw
810
+ value = value_raw
811
+
812
+ query = attn.head_to_batch_dim(query)
813
+ key = attn.head_to_batch_dim(key)
814
+ value = attn.head_to_batch_dim(value)
815
+
816
+ hidden_states = xformers.ops.memory_efficient_attention(query, key, value, attn_bias=attention_mask)
817
+ # for flash attention implementation
818
+ # with torch.backends.cuda.sdp_kernel(enable_math=False):
819
+ # hidden_states = F.scaled_dot_product_attention(query, key, value, attn_bias=attention_mask)
820
+ # hidden_states = attn.batch_to_head_dim(hidden_states)
821
+
822
+ # linear proj
823
+ hidden_states = attn.to_out[0](hidden_states)
824
+ # dropout
825
+ hidden_states = attn.to_out[1](hidden_states)
826
+
827
+ if input_ndim == 4:
828
+ hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
829
+
830
+ if attn.residual_connection:
831
+ hidden_states = hidden_states + residual
832
+
833
+ hidden_states = hidden_states / attn.rescale_output_factor
834
+
835
+ return hidden_states
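The `sparse_mv_attention` branch above lets each view attend to its own tokens plus those of view 0 (the front view). In the sketch below a plain `repeat_interleave` stands in for `my_repeat`, which is defined or imported earlier in this file and not shown in this hunk; sizes are illustrative:

# Illustrative sketch of the sparse multi-view key construction.
import torch
from einops import rearrange

b, t, d, c = 2, 4, 16, 8
key_raw = torch.randn(b * t, d, c)
key_front = rearrange(key_raw, "(b t) d c -> b t d c", t=t)[:, 0]   # (b, d, c), view 0 only
key_front = key_front.repeat_interleave(t, dim=0)                   # (b*t, d, c)
key = torch.cat([key_front, key_raw], dim=1)                        # (b*t, 2*d, c)
print(key.shape)  # torch.Size([8, 32, 8])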
836
+
837
+
838
+
839
+ class XFormersJointAttnProcessor:
840
+ r"""
841
+ Default processor for performing attention-related computations.
842
+ """
843
+
844
+ def __call__(
845
+ self,
846
+ attn: Attention,
847
+ hidden_states,
848
+ encoder_hidden_states=None,
849
+ attention_mask=None,
850
+ temb=None,
851
+ num_tasks=2
852
+ ):
853
+
854
+ residual = hidden_states
855
+
856
+ if attn.spatial_norm is not None:
857
+ hidden_states = attn.spatial_norm(hidden_states, temb)
858
+
859
+ input_ndim = hidden_states.ndim
860
+
861
+ if input_ndim == 4:
862
+ batch_size, channel, height, width = hidden_states.shape
863
+ hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
864
+
865
+ batch_size, sequence_length, _ = (
866
+ hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
867
+ )
868
+ attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
869
+
870
+ # from yuancheng; here attention_mask is None
871
+ if attention_mask is not None:
872
+ # expand our mask's singleton query_tokens dimension:
873
+ # [batch*heads, 1, key_tokens] ->
874
+ # [batch*heads, query_tokens, key_tokens]
875
+ # so that it can be added as a bias onto the attention scores that xformers computes:
876
+ # [batch*heads, query_tokens, key_tokens]
877
+ # we do this explicitly because xformers doesn't broadcast the singleton dimension for us.
878
+ _, query_tokens, _ = hidden_states.shape
879
+ attention_mask = attention_mask.expand(-1, query_tokens, -1)
880
+
881
+ if attn.group_norm is not None:
882
+ hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
883
+
884
+ query = attn.to_q(hidden_states)
885
+
886
+ if encoder_hidden_states is None:
887
+ encoder_hidden_states = hidden_states
888
+ elif attn.norm_cross:
889
+ encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
890
+
891
+ key = attn.to_k(encoder_hidden_states)
892
+ value = attn.to_v(encoder_hidden_states)
893
+
894
+ assert num_tasks == 2 # only support two tasks now
895
+
896
+ key_0, key_1 = torch.chunk(key, dim=0, chunks=2) # keys shape (b t) d c
897
+ value_0, value_1 = torch.chunk(value, dim=0, chunks=2)
898
+ key = torch.cat([key_0, key_1], dim=1) # (b t) 2d c
899
+ value = torch.cat([value_0, value_1], dim=1) # (b t) 2d c
900
+ key = torch.cat([key]*2, dim=0) # ( 2 b t) 2d c
901
+ value = torch.cat([value]*2, dim=0) # (2 b t) 2d c
902
+
903
+
904
+ query = attn.head_to_batch_dim(query).contiguous()
905
+ key = attn.head_to_batch_dim(key).contiguous()
906
+ value = attn.head_to_batch_dim(value).contiguous()
907
+
908
+ hidden_states = xformers.ops.memory_efficient_attention(query, key, value, attn_bias=attention_mask)
909
+ # for flash attention implementation
910
+ # with torch.backends.cuda.sdp_kernel(enable_math=False):
911
+ # hidden_states = F.scaled_dot_product_attention(query, key, value, attn_bias=attention_mask)
912
+ # hidden_states = attn.batch_to_head_dim(hidden_states)
913
+
914
+ # linear proj
915
+ hidden_states = attn.to_out[0](hidden_states)
916
+ # dropout
917
+ hidden_states = attn.to_out[1](hidden_states)
918
+
919
+ if input_ndim == 4:
920
+ hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
921
+
922
+ if attn.residual_connection:
923
+ hidden_states = hidden_states + residual
924
+
925
+ hidden_states = hidden_states / attn.rescale_output_factor
926
+
927
+ return hidden_states
928
+
929
+
930
+ # class JointAttnProcessor:
931
+ # r"""
932
+ # Default processor for performing attention-related computations.
933
+ # """
934
+ #
935
+ # def __call__(
936
+ # self,
937
+ # attn: Attention,
938
+ # hidden_states,
939
+ # encoder_hidden_states=None,
940
+ # attention_mask=None,
941
+ # temb=None,
942
+ # num_tasks=2
943
+ # ):
944
+ #
945
+ # residual = hidden_states
946
+ #
947
+ # if attn.spatial_norm is not None:
948
+ # hidden_states = attn.spatial_norm(hidden_states, temb)
949
+ #
950
+ # input_ndim = hidden_states.ndim
951
+ #
952
+ # if input_ndim == 4:
953
+ # batch_size, channel, height, width = hidden_states.shape
954
+ # hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
955
+ #
956
+ # batch_size, sequence_length, input_dim = (
957
+ # hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
958
+ # )
959
+ # attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
960
+ #
961
+ #
962
+ # if attn.group_norm is not None:
963
+ # hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
964
+ #
965
+ # query = attn.to_q(hidden_states)
966
+ #
967
+ # if encoder_hidden_states is None:
968
+ # encoder_hidden_states = hidden_states
969
+ # elif attn.norm_cross:
970
+ # encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
971
+ #
972
+ # key = attn.to_k(encoder_hidden_states)
973
+ # value = attn.to_v(encoder_hidden_states)
974
+ #
975
+ # assert num_tasks == 2 # only support two tasks now
976
+ #
977
+ # key_0, key_1 = torch.chunk(key, dim=0, chunks=2) # keys shape (b t) d c
978
+ # value_0, value_1 = torch.chunk(value, dim=0, chunks=2)
979
+ # key = torch.cat([key_0, key_1], dim=1) # (b t) 2d c
980
+ # value = torch.cat([value_0, value_1], dim=1) # (b t) 2d c
981
+ # key = torch.cat([key]*2, dim=0) # ( 2 b t) 2d c
982
+ # value = torch.cat([value]*2, dim=0) # (2 b t) 2d c
983
+ #
984
+ #
985
+ # # batch, n_heads, n_tokens, channel
986
+ # query = attn.head_to_batch_dim(query, out_dim=4).contiguous()
987
+ # key = attn.head_to_batch_dim(key, out_dim=4).contiguous()
988
+ # value = attn.head_to_batch_dim(value, out_dim=4).contiguous()
989
+ #
990
+ # # attention_probs = attn.get_attention_scores(query, key, attention_mask)
991
+ # # hidden_states = torch.bmm(attention_probs, value)
992
+ # # hidden_states = attn.batch_to_head_dim(hidden_states)
993
+ #
994
+ # # for flash attention implementation
995
+ # with torch.backends.cuda.sdp_kernel(
996
+ # enable_flash=True,
997
+ # enable_math=False,
998
+ # enable_mem_efficient=True
999
+ # ):
1000
+ # hidden_states = F.scaled_dot_product_attention(query, key, value)
1001
+ #
1002
+ # hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, sequence_length, input_dim)
1003
+ #
1004
+ # # linear proj
1005
+ # hidden_states = attn.to_out[0](hidden_states)
1006
+ # # dropout
1007
+ # hidden_states = attn.to_out[1](hidden_states)
1008
+ #
1009
+ # if input_ndim == 4:
1010
+ # hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
1011
+ #
1012
+ # if attn.residual_connection:
1013
+ # hidden_states = hidden_states + residual
1014
+ #
1015
+ # hidden_states = hidden_states / attn.rescale_output_factor
1016
+ #
1017
+ # return hidden_states
1018
+
1019
+ class JointAttnProcessor:
1020
+ r"""
1021
+ Default processor for performing attention-related computations.
1022
+ """
1023
+
1024
+ def __call__(
1025
+ self,
1026
+ attn: Attention,
1027
+ hidden_states,
1028
+ encoder_hidden_states=None,
1029
+ attention_mask=None,
1030
+ temb=None,
1031
+ num_tasks=2
1032
+ ):
1033
+
1034
+ residual = hidden_states
1035
+
1036
+ if attn.spatial_norm is not None:
1037
+ hidden_states = attn.spatial_norm(hidden_states, temb)
1038
+
1039
+ input_ndim = hidden_states.ndim
1040
+
1041
+ if input_ndim == 4:
1042
+ batch_size, channel, height, width = hidden_states.shape
1043
+ hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
1044
+
1045
+ batch_size, sequence_length, _ = (
1046
+ hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
1047
+ )
1048
+ attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
1049
+
1050
+ if attn.group_norm is not None:
1051
+ hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
1052
+
1053
+ query = attn.to_q(hidden_states)
1054
+
1055
+ if encoder_hidden_states is None:
1056
+ encoder_hidden_states = hidden_states
1057
+ elif attn.norm_cross:
1058
+ encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
1059
+
1060
+ key = attn.to_k(encoder_hidden_states)
1061
+ value = attn.to_v(encoder_hidden_states)
1062
+
1063
+ assert num_tasks == 2 # only support two tasks now
1064
+
1065
+ key_0, key_1 = torch.chunk(key, dim=0, chunks=2) # keys shape (b t) d c
1066
+ value_0, value_1 = torch.chunk(value, dim=0, chunks=2)
1067
+ key = torch.cat([key_0, key_1], dim=1) # (b t) 2d c
1068
+ value = torch.cat([value_0, value_1], dim=1) # (b t) 2d c
1069
+ key = torch.cat([key] * 2, dim=0) # ( 2 b t) 2d c
1070
+ value = torch.cat([value] * 2, dim=0) # (2 b t) 2d c
1071
+
1072
+ query = attn.head_to_batch_dim(query).contiguous()
1073
+ key = attn.head_to_batch_dim(key).contiguous()
1074
+ value = attn.head_to_batch_dim(value).contiguous()
1075
+
1076
+ attention_probs = attn.get_attention_scores(query, key, attention_mask)
1077
+ hidden_states = torch.bmm(attention_probs, value)
1078
+ hidden_states = attn.batch_to_head_dim(hidden_states)
1079
+
1080
+ # linear proj
1081
+ hidden_states = attn.to_out[0](hidden_states)
1082
+ # dropout
1083
+ hidden_states = attn.to_out[1](hidden_states)
1084
+
1085
+ if input_ndim == 4:
1086
+ hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
1087
+
1088
+ if attn.residual_connection:
1089
+ hidden_states = hidden_states + residual
1090
+
1091
+ hidden_states = hidden_states / attn.rescale_output_factor
1092
+
1093
+ return hidden_states
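Both joint-attention processors above build cross-domain keys and values the same way: the batch is assumed to stack the two tasks (e.g. color and normal) along dim 0, each half's tokens are concatenated side by side, and the result is reused for both halves' queries. A shape walk-through with illustrative sizes:

# Illustrative: cross-domain (joint) key construction, matching the chunk/cat pattern above.
import torch

bt, d, c = 6, 16, 8                            # (b*t) items per task, tokens, channels
key = torch.randn(2 * bt, d, c)                # task 0 stacked on top of task 1 along dim 0
key_0, key_1 = torch.chunk(key, chunks=2, dim=0)
key = torch.cat([key_0, key_1], dim=1)         # (b*t, 2*d, c): both tasks' tokens side by side
key = torch.cat([key] * 2, dim=0)              # (2*b*t, 2*d, c): reused for both tasks' queries
print(key.shape)  # torch.Size([12, 32, 8])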
mv_diffusion_30/models/unet_mv2d_blocks.py ADDED
@@ -0,0 +1,922 @@
1
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ from typing import Any, Dict, Optional, Tuple
15
+
16
+ import numpy as np
17
+ import torch
18
+ import torch.nn.functional as F
19
+ from torch import nn
20
+
21
+ from diffusers.utils import is_torch_version, logging
22
+ # from diffusers.models.normalization import AdaGroupNorm
23
+ # from diffusers.models.attention_processor import Attention, AttnAddedKVProcessor, AttnAddedKVProcessor2_0
24
+ # from diffusers.models.transformers.dual_transformer_2d import DualTransformer2DModel
25
+ from diffusers.models.resnet import Downsample2D, FirDownsample2D, FirUpsample2D, KDownsample2D, KUpsample2D, ResnetBlock2D, Upsample2D
26
+ from mv_diffusion_30.models.transformer_mv2d import TransformerMV2DModel
27
+
28
+ from diffusers.models.unets.unet_2d_blocks import DownBlock2D, ResnetDownsampleBlock2D, AttnDownBlock2D, CrossAttnDownBlock2D, SimpleCrossAttnDownBlock2D, SkipDownBlock2D, AttnSkipDownBlock2D, DownEncoderBlock2D, AttnDownEncoderBlock2D, KDownBlock2D, KCrossAttnDownBlock2D
29
+ from diffusers.models.unets.unet_2d_blocks import UpBlock2D, ResnetUpsampleBlock2D, CrossAttnUpBlock2D, SimpleCrossAttnUpBlock2D, AttnUpBlock2D, SkipUpBlock2D, AttnSkipUpBlock2D, UpDecoderBlock2D, AttnUpDecoderBlock2D, KUpBlock2D, KCrossAttnUpBlock2D
30
+
31
+
32
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
33
+
34
+
35
+ def get_down_block(
36
+ down_block_type,
37
+ num_layers,
38
+ in_channels,
39
+ out_channels,
40
+ temb_channels,
41
+ add_downsample,
42
+ resnet_eps,
43
+ resnet_act_fn,
44
+ transformer_layers_per_block=1,
45
+ num_attention_heads=None,
46
+ resnet_groups=None,
47
+ cross_attention_dim=None,
48
+ downsample_padding=None,
49
+ dual_cross_attention=False,
50
+ use_linear_projection=False,
51
+ only_cross_attention=False,
52
+ upcast_attention=False,
53
+ resnet_time_scale_shift="default",
54
+ resnet_skip_time_act=False,
55
+ resnet_out_scale_factor=1.0,
56
+ cross_attention_norm=None,
57
+ attention_head_dim=None,
58
+ downsample_type=None,
59
+ num_views=1,
60
+ cd_attention_last: bool = False,
61
+ cd_attention_mid: bool = False,
62
+ multiview_attention: bool = True,
63
+ sparse_mv_attention: bool = False,
64
+ mvcd_attention: bool=False
65
+ ):
66
+ # If attn head dim is not defined, we default it to the number of heads
67
+ if attention_head_dim is None:
68
+ logger.warn(
69
+ f"It is recommended to provide `attention_head_dim` when calling `get_down_block`. Defaulting `attention_head_dim` to {num_attention_heads}."
70
+ )
71
+ attention_head_dim = num_attention_heads
72
+
73
+ down_block_type = down_block_type[7:] if down_block_type.startswith("UNetRes") else down_block_type
74
+ if down_block_type == "DownBlock2D":
75
+ return DownBlock2D(
76
+ num_layers=num_layers,
77
+ in_channels=in_channels,
78
+ out_channels=out_channels,
79
+ temb_channels=temb_channels,
80
+ add_downsample=add_downsample,
81
+ resnet_eps=resnet_eps,
82
+ resnet_act_fn=resnet_act_fn,
83
+ resnet_groups=resnet_groups,
84
+ downsample_padding=downsample_padding,
85
+ resnet_time_scale_shift=resnet_time_scale_shift,
86
+ )
87
+ elif down_block_type == "ResnetDownsampleBlock2D":
88
+ return ResnetDownsampleBlock2D(
89
+ num_layers=num_layers,
90
+ in_channels=in_channels,
91
+ out_channels=out_channels,
92
+ temb_channels=temb_channels,
93
+ add_downsample=add_downsample,
94
+ resnet_eps=resnet_eps,
95
+ resnet_act_fn=resnet_act_fn,
96
+ resnet_groups=resnet_groups,
97
+ resnet_time_scale_shift=resnet_time_scale_shift,
98
+ skip_time_act=resnet_skip_time_act,
99
+ output_scale_factor=resnet_out_scale_factor,
100
+ )
101
+ elif down_block_type == "AttnDownBlock2D":
102
+ if add_downsample is False:
103
+ downsample_type = None
104
+ else:
105
+ downsample_type = downsample_type or "conv" # default to 'conv'
106
+ return AttnDownBlock2D(
107
+ num_layers=num_layers,
108
+ in_channels=in_channels,
109
+ out_channels=out_channels,
110
+ temb_channels=temb_channels,
111
+ resnet_eps=resnet_eps,
112
+ resnet_act_fn=resnet_act_fn,
113
+ resnet_groups=resnet_groups,
114
+ downsample_padding=downsample_padding,
115
+ attention_head_dim=attention_head_dim,
116
+ resnet_time_scale_shift=resnet_time_scale_shift,
117
+ downsample_type=downsample_type,
118
+ )
119
+ elif down_block_type == "CrossAttnDownBlock2D":
120
+ if cross_attention_dim is None:
121
+ raise ValueError("cross_attention_dim must be specified for CrossAttnDownBlock2D")
122
+ return CrossAttnDownBlock2D(
123
+ num_layers=num_layers,
124
+ transformer_layers_per_block=transformer_layers_per_block,
125
+ in_channels=in_channels,
126
+ out_channels=out_channels,
127
+ temb_channels=temb_channels,
128
+ add_downsample=add_downsample,
129
+ resnet_eps=resnet_eps,
130
+ resnet_act_fn=resnet_act_fn,
131
+ resnet_groups=resnet_groups,
132
+ downsample_padding=downsample_padding,
133
+ cross_attention_dim=cross_attention_dim,
134
+ num_attention_heads=num_attention_heads,
135
+ dual_cross_attention=dual_cross_attention,
136
+ use_linear_projection=use_linear_projection,
137
+ only_cross_attention=only_cross_attention,
138
+ upcast_attention=upcast_attention,
139
+ resnet_time_scale_shift=resnet_time_scale_shift,
140
+ )
141
+ # custom MV2D attention block
142
+ elif down_block_type == "CrossAttnDownBlockMV2D":
143
+ if cross_attention_dim is None:
144
+ raise ValueError("cross_attention_dim must be specified for CrossAttnDownBlockMV2D")
145
+ return CrossAttnDownBlockMV2D(
146
+ num_layers=num_layers,
147
+ transformer_layers_per_block=transformer_layers_per_block,
148
+ in_channels=in_channels,
149
+ out_channels=out_channels,
150
+ temb_channels=temb_channels,
151
+ add_downsample=add_downsample,
152
+ resnet_eps=resnet_eps,
153
+ resnet_act_fn=resnet_act_fn,
154
+ resnet_groups=resnet_groups,
155
+ downsample_padding=downsample_padding,
156
+ cross_attention_dim=cross_attention_dim,
157
+ num_attention_heads=num_attention_heads,
158
+ dual_cross_attention=dual_cross_attention,
159
+ use_linear_projection=use_linear_projection,
160
+ only_cross_attention=only_cross_attention,
161
+ upcast_attention=upcast_attention,
162
+ resnet_time_scale_shift=resnet_time_scale_shift,
163
+ num_views=num_views,
164
+ cd_attention_last=cd_attention_last,
165
+ cd_attention_mid=cd_attention_mid,
166
+ multiview_attention=multiview_attention,
167
+ sparse_mv_attention=sparse_mv_attention,
168
+ mvcd_attention=mvcd_attention
169
+ )
170
+ elif down_block_type == "SimpleCrossAttnDownBlock2D":
171
+ if cross_attention_dim is None:
172
+ raise ValueError("cross_attention_dim must be specified for SimpleCrossAttnDownBlock2D")
173
+ return SimpleCrossAttnDownBlock2D(
174
+ num_layers=num_layers,
175
+ in_channels=in_channels,
176
+ out_channels=out_channels,
177
+ temb_channels=temb_channels,
178
+ add_downsample=add_downsample,
179
+ resnet_eps=resnet_eps,
180
+ resnet_act_fn=resnet_act_fn,
181
+ resnet_groups=resnet_groups,
182
+ cross_attention_dim=cross_attention_dim,
183
+ attention_head_dim=attention_head_dim,
184
+ resnet_time_scale_shift=resnet_time_scale_shift,
185
+ skip_time_act=resnet_skip_time_act,
186
+ output_scale_factor=resnet_out_scale_factor,
187
+ only_cross_attention=only_cross_attention,
188
+ cross_attention_norm=cross_attention_norm,
189
+ )
190
+ elif down_block_type == "SkipDownBlock2D":
191
+ return SkipDownBlock2D(
192
+ num_layers=num_layers,
193
+ in_channels=in_channels,
194
+ out_channels=out_channels,
195
+ temb_channels=temb_channels,
196
+ add_downsample=add_downsample,
197
+ resnet_eps=resnet_eps,
198
+ resnet_act_fn=resnet_act_fn,
199
+ downsample_padding=downsample_padding,
200
+ resnet_time_scale_shift=resnet_time_scale_shift,
201
+ )
202
+ elif down_block_type == "AttnSkipDownBlock2D":
203
+ return AttnSkipDownBlock2D(
204
+ num_layers=num_layers,
205
+ in_channels=in_channels,
206
+ out_channels=out_channels,
207
+ temb_channels=temb_channels,
208
+ add_downsample=add_downsample,
209
+ resnet_eps=resnet_eps,
210
+ resnet_act_fn=resnet_act_fn,
211
+ attention_head_dim=attention_head_dim,
212
+ resnet_time_scale_shift=resnet_time_scale_shift,
213
+ )
214
+ elif down_block_type == "DownEncoderBlock2D":
215
+ return DownEncoderBlock2D(
216
+ num_layers=num_layers,
217
+ in_channels=in_channels,
218
+ out_channels=out_channels,
219
+ add_downsample=add_downsample,
220
+ resnet_eps=resnet_eps,
221
+ resnet_act_fn=resnet_act_fn,
222
+ resnet_groups=resnet_groups,
223
+ downsample_padding=downsample_padding,
224
+ resnet_time_scale_shift=resnet_time_scale_shift,
225
+ )
226
+ elif down_block_type == "AttnDownEncoderBlock2D":
227
+ return AttnDownEncoderBlock2D(
228
+ num_layers=num_layers,
229
+ in_channels=in_channels,
230
+ out_channels=out_channels,
231
+ add_downsample=add_downsample,
232
+ resnet_eps=resnet_eps,
233
+ resnet_act_fn=resnet_act_fn,
234
+ resnet_groups=resnet_groups,
235
+ downsample_padding=downsample_padding,
236
+ attention_head_dim=attention_head_dim,
237
+ resnet_time_scale_shift=resnet_time_scale_shift,
238
+ )
239
+ elif down_block_type == "KDownBlock2D":
240
+ return KDownBlock2D(
241
+ num_layers=num_layers,
242
+ in_channels=in_channels,
243
+ out_channels=out_channels,
244
+ temb_channels=temb_channels,
245
+ add_downsample=add_downsample,
246
+ resnet_eps=resnet_eps,
247
+ resnet_act_fn=resnet_act_fn,
248
+ )
249
+ elif down_block_type == "KCrossAttnDownBlock2D":
250
+ return KCrossAttnDownBlock2D(
251
+ num_layers=num_layers,
252
+ in_channels=in_channels,
253
+ out_channels=out_channels,
254
+ temb_channels=temb_channels,
255
+ add_downsample=add_downsample,
256
+ resnet_eps=resnet_eps,
257
+ resnet_act_fn=resnet_act_fn,
258
+ cross_attention_dim=cross_attention_dim,
259
+ attention_head_dim=attention_head_dim,
260
+ add_self_attention=True if not add_downsample else False,
261
+ )
262
+ raise ValueError(f"{down_block_type} does not exist.")
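The factory above is a plain string dispatch; the only subtle part is that an optional `UNetRes` prefix (7 characters) is stripped before matching. A minimal sketch with a hypothetical helper and a reduced set of block names:

# Hypothetical helper, for illustration only; the real factories return block instances.
def pick_block(block_type: str) -> str:
    # Strip the optional "UNetRes" prefix, then match against the known block names.
    block_type = block_type[7:] if block_type.startswith("UNetRes") else block_type
    known = {"DownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlockMV2D"}
    if block_type not in known:
        raise ValueError(f"{block_type} does not exist.")
    return block_type

print(pick_block("UNetResCrossAttnDownBlockMV2D"))  # CrossAttnDownBlockMV2D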
263
+
264
+
265
+ def get_up_block(
266
+ up_block_type,
267
+ num_layers,
268
+ in_channels,
269
+ out_channels,
270
+ prev_output_channel,
271
+ temb_channels,
272
+ add_upsample,
273
+ resnet_eps,
274
+ resnet_act_fn,
275
+ transformer_layers_per_block=1,
276
+ num_attention_heads=None,
277
+ resnet_groups=None,
278
+ cross_attention_dim=None,
279
+ dual_cross_attention=False,
280
+ use_linear_projection=False,
281
+ only_cross_attention=False,
282
+ upcast_attention=False,
283
+ resnet_time_scale_shift="default",
284
+ resnet_skip_time_act=False,
285
+ resnet_out_scale_factor=1.0,
286
+ cross_attention_norm=None,
287
+ attention_head_dim=None,
288
+ upsample_type=None,
289
+ num_views=1,
290
+ cd_attention_last: bool = False,
291
+ cd_attention_mid: bool = False,
292
+ multiview_attention: bool = True,
293
+ sparse_mv_attention: bool = False,
294
+ mvcd_attention: bool=False
295
+ ):
296
+ # If attn head dim is not defined, we default it to the number of heads
297
+ if attention_head_dim is None:
298
+ logger.warn(
299
+ f"It is recommended to provide `attention_head_dim` when calling `get_up_block`. Defaulting `attention_head_dim` to {num_attention_heads}."
300
+ )
301
+ attention_head_dim = num_attention_heads
302
+
303
+ up_block_type = up_block_type[7:] if up_block_type.startswith("UNetRes") else up_block_type
304
+ if up_block_type == "UpBlock2D":
305
+ return UpBlock2D(
306
+ num_layers=num_layers,
307
+ in_channels=in_channels,
308
+ out_channels=out_channels,
309
+ prev_output_channel=prev_output_channel,
310
+ temb_channels=temb_channels,
311
+ add_upsample=add_upsample,
312
+ resnet_eps=resnet_eps,
313
+ resnet_act_fn=resnet_act_fn,
314
+ resnet_groups=resnet_groups,
315
+ resnet_time_scale_shift=resnet_time_scale_shift,
316
+ )
317
+ elif up_block_type == "ResnetUpsampleBlock2D":
318
+ return ResnetUpsampleBlock2D(
319
+ num_layers=num_layers,
320
+ in_channels=in_channels,
321
+ out_channels=out_channels,
322
+ prev_output_channel=prev_output_channel,
323
+ temb_channels=temb_channels,
324
+ add_upsample=add_upsample,
325
+ resnet_eps=resnet_eps,
326
+ resnet_act_fn=resnet_act_fn,
327
+ resnet_groups=resnet_groups,
328
+ resnet_time_scale_shift=resnet_time_scale_shift,
329
+ skip_time_act=resnet_skip_time_act,
330
+ output_scale_factor=resnet_out_scale_factor,
331
+ )
332
+ elif up_block_type == "CrossAttnUpBlock2D":
333
+ if cross_attention_dim is None:
334
+ raise ValueError("cross_attention_dim must be specified for CrossAttnUpBlock2D")
335
+ return CrossAttnUpBlock2D(
336
+ num_layers=num_layers,
337
+ transformer_layers_per_block=transformer_layers_per_block,
338
+ in_channels=in_channels,
339
+ out_channels=out_channels,
340
+ prev_output_channel=prev_output_channel,
341
+ temb_channels=temb_channels,
342
+ add_upsample=add_upsample,
343
+ resnet_eps=resnet_eps,
344
+ resnet_act_fn=resnet_act_fn,
345
+ resnet_groups=resnet_groups,
346
+ cross_attention_dim=cross_attention_dim,
347
+ num_attention_heads=num_attention_heads,
348
+ dual_cross_attention=dual_cross_attention,
349
+ use_linear_projection=use_linear_projection,
350
+ only_cross_attention=only_cross_attention,
351
+ upcast_attention=upcast_attention,
352
+ resnet_time_scale_shift=resnet_time_scale_shift,
353
+ )
354
+ # custom MV2D attention block
355
+ elif up_block_type == "CrossAttnUpBlockMV2D":
356
+ if cross_attention_dim is None:
357
+ raise ValueError("cross_attention_dim must be specified for CrossAttnUpBlockMV2D")
358
+ return CrossAttnUpBlockMV2D(
359
+ num_layers=num_layers,
360
+ transformer_layers_per_block=transformer_layers_per_block,
361
+ in_channels=in_channels,
362
+ out_channels=out_channels,
363
+ prev_output_channel=prev_output_channel,
364
+ temb_channels=temb_channels,
365
+ add_upsample=add_upsample,
366
+ resnet_eps=resnet_eps,
367
+ resnet_act_fn=resnet_act_fn,
368
+ resnet_groups=resnet_groups,
369
+ cross_attention_dim=cross_attention_dim,
370
+ num_attention_heads=num_attention_heads,
371
+ dual_cross_attention=dual_cross_attention,
372
+ use_linear_projection=use_linear_projection,
373
+ only_cross_attention=only_cross_attention,
374
+ upcast_attention=upcast_attention,
375
+ resnet_time_scale_shift=resnet_time_scale_shift,
376
+ num_views=num_views,
377
+ cd_attention_last=cd_attention_last,
378
+ cd_attention_mid=cd_attention_mid,
379
+ multiview_attention=multiview_attention,
380
+ sparse_mv_attention=sparse_mv_attention,
381
+ mvcd_attention=mvcd_attention
382
+ )
383
+ elif up_block_type == "SimpleCrossAttnUpBlock2D":
384
+ if cross_attention_dim is None:
385
+ raise ValueError("cross_attention_dim must be specified for SimpleCrossAttnUpBlock2D")
386
+ return SimpleCrossAttnUpBlock2D(
387
+ num_layers=num_layers,
388
+ in_channels=in_channels,
389
+ out_channels=out_channels,
390
+ prev_output_channel=prev_output_channel,
391
+ temb_channels=temb_channels,
392
+ add_upsample=add_upsample,
393
+ resnet_eps=resnet_eps,
394
+ resnet_act_fn=resnet_act_fn,
395
+ resnet_groups=resnet_groups,
396
+ cross_attention_dim=cross_attention_dim,
397
+ attention_head_dim=attention_head_dim,
398
+ resnet_time_scale_shift=resnet_time_scale_shift,
399
+ skip_time_act=resnet_skip_time_act,
400
+ output_scale_factor=resnet_out_scale_factor,
401
+ only_cross_attention=only_cross_attention,
402
+ cross_attention_norm=cross_attention_norm,
403
+ )
404
+ elif up_block_type == "AttnUpBlock2D":
405
+ if add_upsample is False:
406
+ upsample_type = None
407
+ else:
408
+ upsample_type = upsample_type or "conv" # default to 'conv'
409
+
410
+ return AttnUpBlock2D(
411
+ num_layers=num_layers,
412
+ in_channels=in_channels,
413
+ out_channels=out_channels,
414
+ prev_output_channel=prev_output_channel,
415
+ temb_channels=temb_channels,
416
+ resnet_eps=resnet_eps,
417
+ resnet_act_fn=resnet_act_fn,
418
+ resnet_groups=resnet_groups,
419
+ attention_head_dim=attention_head_dim,
420
+ resnet_time_scale_shift=resnet_time_scale_shift,
421
+ upsample_type=upsample_type,
422
+ )
423
+ elif up_block_type == "SkipUpBlock2D":
424
+ return SkipUpBlock2D(
425
+ num_layers=num_layers,
426
+ in_channels=in_channels,
427
+ out_channels=out_channels,
428
+ prev_output_channel=prev_output_channel,
429
+ temb_channels=temb_channels,
430
+ add_upsample=add_upsample,
431
+ resnet_eps=resnet_eps,
432
+ resnet_act_fn=resnet_act_fn,
433
+ resnet_time_scale_shift=resnet_time_scale_shift,
434
+ )
435
+ elif up_block_type == "AttnSkipUpBlock2D":
436
+ return AttnSkipUpBlock2D(
437
+ num_layers=num_layers,
438
+ in_channels=in_channels,
439
+ out_channels=out_channels,
440
+ prev_output_channel=prev_output_channel,
441
+ temb_channels=temb_channels,
442
+ add_upsample=add_upsample,
443
+ resnet_eps=resnet_eps,
444
+ resnet_act_fn=resnet_act_fn,
445
+ attention_head_dim=attention_head_dim,
446
+ resnet_time_scale_shift=resnet_time_scale_shift,
447
+ )
448
+ elif up_block_type == "UpDecoderBlock2D":
449
+ return UpDecoderBlock2D(
450
+ num_layers=num_layers,
451
+ in_channels=in_channels,
452
+ out_channels=out_channels,
453
+ add_upsample=add_upsample,
454
+ resnet_eps=resnet_eps,
455
+ resnet_act_fn=resnet_act_fn,
456
+ resnet_groups=resnet_groups,
457
+ resnet_time_scale_shift=resnet_time_scale_shift,
458
+ temb_channels=temb_channels,
459
+ )
460
+ elif up_block_type == "AttnUpDecoderBlock2D":
461
+ return AttnUpDecoderBlock2D(
462
+ num_layers=num_layers,
463
+ in_channels=in_channels,
464
+ out_channels=out_channels,
465
+ add_upsample=add_upsample,
466
+ resnet_eps=resnet_eps,
467
+ resnet_act_fn=resnet_act_fn,
468
+ resnet_groups=resnet_groups,
469
+ attention_head_dim=attention_head_dim,
470
+ resnet_time_scale_shift=resnet_time_scale_shift,
471
+ temb_channels=temb_channels,
472
+ )
473
+ elif up_block_type == "KUpBlock2D":
474
+ return KUpBlock2D(
475
+ num_layers=num_layers,
476
+ in_channels=in_channels,
477
+ out_channels=out_channels,
478
+ temb_channels=temb_channels,
479
+ add_upsample=add_upsample,
480
+ resnet_eps=resnet_eps,
481
+ resnet_act_fn=resnet_act_fn,
482
+ )
483
+ elif up_block_type == "KCrossAttnUpBlock2D":
484
+ return KCrossAttnUpBlock2D(
485
+ num_layers=num_layers,
486
+ in_channels=in_channels,
487
+ out_channels=out_channels,
488
+ temb_channels=temb_channels,
489
+ add_upsample=add_upsample,
490
+ resnet_eps=resnet_eps,
491
+ resnet_act_fn=resnet_act_fn,
492
+ cross_attention_dim=cross_attention_dim,
493
+ attention_head_dim=attention_head_dim,
494
+ )
495
+
496
+ raise ValueError(f"{up_block_type} does not exist.")
497
+
498
+
499
+ class UNetMidBlockMV2DCrossAttn(nn.Module):
500
+ def __init__(
501
+ self,
502
+ in_channels: int,
503
+ temb_channels: int,
504
+ dropout: float = 0.0,
505
+ num_layers: int = 1,
506
+ transformer_layers_per_block: int = 1,
507
+ resnet_eps: float = 1e-6,
508
+ resnet_time_scale_shift: str = "default",
509
+ resnet_act_fn: str = "swish",
510
+ resnet_groups: int = 32,
511
+ resnet_pre_norm: bool = True,
512
+ num_attention_heads=1,
513
+ output_scale_factor=1.0,
514
+ cross_attention_dim=1280,
515
+ dual_cross_attention=False,
516
+ use_linear_projection=False,
517
+ upcast_attention=False,
518
+ num_views: int = 1,
519
+ cd_attention_last: bool = False,
520
+ cd_attention_mid: bool = False,
521
+ multiview_attention: bool = True,
522
+ sparse_mv_attention: bool = False,
523
+ mvcd_attention: bool=False
524
+ ):
525
+ super().__init__()
526
+
527
+ self.has_cross_attention = True
528
+ self.num_attention_heads = num_attention_heads
529
+ resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32)
530
+
531
+ # there is always at least one resnet
532
+ resnets = [
533
+ ResnetBlock2D(
534
+ in_channels=in_channels,
535
+ out_channels=in_channels,
536
+ temb_channels=temb_channels,
537
+ eps=resnet_eps,
538
+ groups=resnet_groups,
539
+ dropout=dropout,
540
+ time_embedding_norm=resnet_time_scale_shift,
541
+ non_linearity=resnet_act_fn,
542
+ output_scale_factor=output_scale_factor,
543
+ pre_norm=resnet_pre_norm,
544
+ )
545
+ ]
546
+ attentions = []
547
+
548
+ for _ in range(num_layers):
549
+ if not dual_cross_attention:
550
+ attentions.append(
551
+ TransformerMV2DModel(
552
+ num_attention_heads,
553
+ in_channels // num_attention_heads,
554
+ in_channels=in_channels,
555
+ num_layers=transformer_layers_per_block,
556
+ cross_attention_dim=cross_attention_dim,
557
+ norm_num_groups=resnet_groups,
558
+ use_linear_projection=use_linear_projection,
559
+ upcast_attention=upcast_attention,
560
+ num_views=num_views,
561
+ cd_attention_last=cd_attention_last,
562
+ cd_attention_mid=cd_attention_mid,
563
+ multiview_attention=multiview_attention,
564
+ sparse_mv_attention=sparse_mv_attention,
565
+ mvcd_attention=mvcd_attention
566
+ )
567
+ )
568
+ else:
569
+ raise NotImplementedError
570
+ resnets.append(
571
+ ResnetBlock2D(
572
+ in_channels=in_channels,
573
+ out_channels=in_channels,
574
+ temb_channels=temb_channels,
575
+ eps=resnet_eps,
576
+ groups=resnet_groups,
577
+ dropout=dropout,
578
+ time_embedding_norm=resnet_time_scale_shift,
579
+ non_linearity=resnet_act_fn,
580
+ output_scale_factor=output_scale_factor,
581
+ pre_norm=resnet_pre_norm,
582
+ )
583
+ )
584
+
585
+ self.attentions = nn.ModuleList(attentions)
586
+ self.resnets = nn.ModuleList(resnets)
587
+
588
+ def forward(
589
+ self,
590
+ hidden_states: torch.FloatTensor,
591
+ temb: Optional[torch.FloatTensor] = None,
592
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
593
+ attention_mask: Optional[torch.FloatTensor] = None,
594
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
595
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
596
+ ) -> torch.FloatTensor:
597
+ hidden_states = self.resnets[0](hidden_states, temb)
598
+ for attn, resnet in zip(self.attentions, self.resnets[1:]):
599
+ hidden_states = attn(
600
+ hidden_states,
601
+ encoder_hidden_states=encoder_hidden_states,
602
+ cross_attention_kwargs=cross_attention_kwargs,
603
+ attention_mask=attention_mask,
604
+ encoder_attention_mask=encoder_attention_mask,
605
+ return_dict=False,
606
+ )[0]
607
+ hidden_states = resnet(hidden_states, temb)
608
+
609
+ return hidden_states
610
+
611
+
612
+ class CrossAttnUpBlockMV2D(nn.Module):
613
+ def __init__(
614
+ self,
615
+ in_channels: int,
616
+ out_channels: int,
617
+ prev_output_channel: int,
618
+ temb_channels: int,
619
+ dropout: float = 0.0,
620
+ num_layers: int = 1,
621
+ transformer_layers_per_block: int = 1,
622
+ resnet_eps: float = 1e-6,
623
+ resnet_time_scale_shift: str = "default",
624
+ resnet_act_fn: str = "swish",
625
+ resnet_groups: int = 32,
626
+ resnet_pre_norm: bool = True,
627
+ num_attention_heads=1,
628
+ cross_attention_dim=1280,
629
+ output_scale_factor=1.0,
630
+ add_upsample=True,
631
+ dual_cross_attention=False,
632
+ use_linear_projection=False,
633
+ only_cross_attention=False,
634
+ upcast_attention=False,
635
+ num_views: int = 1,
636
+ cd_attention_last: bool = False,
637
+ cd_attention_mid: bool = False,
638
+ multiview_attention: bool = True,
639
+ sparse_mv_attention: bool = False,
640
+ mvcd_attention: bool=False
641
+ ):
642
+ super().__init__()
643
+ resnets = []
644
+ attentions = []
645
+
646
+ self.has_cross_attention = True
647
+ self.num_attention_heads = num_attention_heads
648
+
649
+ for i in range(num_layers):
650
+ res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
651
+ resnet_in_channels = prev_output_channel if i == 0 else out_channels
652
+
653
+ resnets.append(
654
+ ResnetBlock2D(
655
+ in_channels=resnet_in_channels + res_skip_channels,
656
+ out_channels=out_channels,
657
+ temb_channels=temb_channels,
658
+ eps=resnet_eps,
659
+ groups=resnet_groups,
660
+ dropout=dropout,
661
+ time_embedding_norm=resnet_time_scale_shift,
662
+ non_linearity=resnet_act_fn,
663
+ output_scale_factor=output_scale_factor,
664
+ pre_norm=resnet_pre_norm,
665
+ )
666
+ )
667
+ if not dual_cross_attention:
668
+ attentions.append(
669
+ TransformerMV2DModel(
670
+ num_attention_heads,
671
+ out_channels // num_attention_heads,
672
+ in_channels=out_channels,
673
+ num_layers=transformer_layers_per_block,
674
+ cross_attention_dim=cross_attention_dim,
675
+ norm_num_groups=resnet_groups,
676
+ use_linear_projection=use_linear_projection,
677
+ only_cross_attention=only_cross_attention,
678
+ upcast_attention=upcast_attention,
679
+ num_views=num_views,
680
+ cd_attention_last=cd_attention_last,
681
+ cd_attention_mid=cd_attention_mid,
682
+ multiview_attention=multiview_attention,
683
+ sparse_mv_attention=sparse_mv_attention,
684
+ mvcd_attention=mvcd_attention
685
+ )
686
+ )
687
+ else:
688
+ raise NotImplementedError
689
+ self.attentions = nn.ModuleList(attentions)
690
+ self.resnets = nn.ModuleList(resnets)
691
+
692
+ if add_upsample:
693
+ self.upsamplers = nn.ModuleList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)])
694
+ else:
695
+ self.upsamplers = None
696
+
697
+ self.gradient_checkpointing = False
698
+
699
+ def forward(
700
+ self,
701
+ hidden_states: torch.FloatTensor,
702
+ res_hidden_states_tuple: Tuple[torch.FloatTensor, ...],
703
+ temb: Optional[torch.FloatTensor] = None,
704
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
705
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
706
+ upsample_size: Optional[int] = None,
707
+ attention_mask: Optional[torch.FloatTensor] = None,
708
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
709
+ ):
710
+ for resnet, attn in zip(self.resnets, self.attentions):
711
+ # pop res hidden states
712
+ res_hidden_states = res_hidden_states_tuple[-1]
713
+ res_hidden_states_tuple = res_hidden_states_tuple[:-1]
714
+ hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
715
+
716
+ if self.training and self.gradient_checkpointing:
717
+
718
+ def create_custom_forward(module, return_dict=None):
719
+ def custom_forward(*inputs):
720
+ if return_dict is not None:
721
+ return module(*inputs, return_dict=return_dict)
722
+ else:
723
+ return module(*inputs)
724
+
725
+ return custom_forward
726
+
727
+ ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
728
+ hidden_states = torch.utils.checkpoint.checkpoint(
729
+ create_custom_forward(resnet),
730
+ hidden_states,
731
+ temb,
732
+ **ckpt_kwargs,
733
+ )
734
+ hidden_states = torch.utils.checkpoint.checkpoint(
735
+ create_custom_forward(attn, return_dict=False),
736
+ hidden_states,
737
+ encoder_hidden_states,
738
+ None, # timestep
739
+ None, # class_labels
740
+ cross_attention_kwargs,
741
+ attention_mask,
742
+ encoder_attention_mask,
743
+ **ckpt_kwargs,
744
+ )[0]
745
+ else:
746
+ hidden_states = resnet(hidden_states, temb)
747
+ hidden_states = attn(
748
+ hidden_states,
749
+ encoder_hidden_states=encoder_hidden_states,
750
+ cross_attention_kwargs=cross_attention_kwargs,
751
+ attention_mask=attention_mask,
752
+ encoder_attention_mask=encoder_attention_mask,
753
+ return_dict=False,
754
+ )[0]
755
+
756
+ if self.upsamplers is not None:
757
+ for upsampler in self.upsamplers:
758
+ hidden_states = upsampler(hidden_states, upsample_size)
759
+
760
+ return hidden_states
761
+
762
+
763
+ class CrossAttnDownBlockMV2D(nn.Module):
764
+ def __init__(
765
+ self,
766
+ in_channels: int,
767
+ out_channels: int,
768
+ temb_channels: int,
769
+ dropout: float = 0.0,
770
+ num_layers: int = 1,
771
+ transformer_layers_per_block: int = 1,
772
+ resnet_eps: float = 1e-6,
773
+ resnet_time_scale_shift: str = "default",
774
+ resnet_act_fn: str = "swish",
775
+ resnet_groups: int = 32,
776
+ resnet_pre_norm: bool = True,
777
+ num_attention_heads=1,
778
+ cross_attention_dim=1280,
779
+ output_scale_factor=1.0,
780
+ downsample_padding=1,
781
+ add_downsample=True,
782
+ dual_cross_attention=False,
783
+ use_linear_projection=False,
784
+ only_cross_attention=False,
785
+ upcast_attention=False,
786
+ num_views: int = 1,
787
+ cd_attention_last: bool = False,
788
+ cd_attention_mid: bool = False,
789
+ multiview_attention: bool = True,
790
+ sparse_mv_attention: bool = False,
791
+ mvcd_attention: bool=False
792
+ ):
793
+ super().__init__()
794
+ resnets = []
795
+ attentions = []
796
+
797
+ self.has_cross_attention = True
798
+ self.num_attention_heads = num_attention_heads
799
+
800
+ for i in range(num_layers):
801
+ in_channels = in_channels if i == 0 else out_channels
802
+ resnets.append(
803
+ ResnetBlock2D(
804
+ in_channels=in_channels,
805
+ out_channels=out_channels,
806
+ temb_channels=temb_channels,
807
+ eps=resnet_eps,
808
+ groups=resnet_groups,
809
+ dropout=dropout,
810
+ time_embedding_norm=resnet_time_scale_shift,
811
+ non_linearity=resnet_act_fn,
812
+ output_scale_factor=output_scale_factor,
813
+ pre_norm=resnet_pre_norm,
814
+ )
815
+ )
816
+ if not dual_cross_attention:
817
+ attentions.append(
818
+ TransformerMV2DModel(
819
+ num_attention_heads,
820
+ out_channels // num_attention_heads,
821
+ in_channels=out_channels,
822
+ num_layers=transformer_layers_per_block,
823
+ cross_attention_dim=cross_attention_dim,
824
+ norm_num_groups=resnet_groups,
825
+ use_linear_projection=use_linear_projection,
826
+ only_cross_attention=only_cross_attention,
827
+ upcast_attention=upcast_attention,
828
+ num_views=num_views,
829
+ cd_attention_last=cd_attention_last,
830
+ cd_attention_mid=cd_attention_mid,
831
+ multiview_attention=multiview_attention,
832
+ sparse_mv_attention=sparse_mv_attention,
833
+ mvcd_attention=mvcd_attention
834
+ )
835
+ )
836
+ else:
837
+ raise NotImplementedError
838
+ self.attentions = nn.ModuleList(attentions)
839
+ self.resnets = nn.ModuleList(resnets)
840
+
841
+ if add_downsample:
842
+ self.downsamplers = nn.ModuleList(
843
+ [
844
+ Downsample2D(
845
+ out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op"
846
+ )
847
+ ]
848
+ )
849
+ else:
850
+ self.downsamplers = None
851
+
852
+ self.gradient_checkpointing = False
853
+
854
+ def forward(
855
+ self,
856
+ hidden_states: torch.FloatTensor,
857
+ temb: Optional[torch.FloatTensor] = None,
858
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
859
+ attention_mask: Optional[torch.FloatTensor] = None,
860
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
861
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
862
+ additional_residuals=None,
863
+ ):
864
+ output_states = ()
865
+
866
+ blocks = list(zip(self.resnets, self.attentions))
867
+
868
+ for i, (resnet, attn) in enumerate(blocks):
869
+ if self.training and self.gradient_checkpointing:
870
+
871
+ def create_custom_forward(module, return_dict=None):
872
+ def custom_forward(*inputs):
873
+ if return_dict is not None:
874
+ return module(*inputs, return_dict=return_dict)
875
+ else:
876
+ return module(*inputs)
877
+
878
+ return custom_forward
879
+
880
+ ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
881
+ hidden_states = torch.utils.checkpoint.checkpoint(
882
+ create_custom_forward(resnet),
883
+ hidden_states,
884
+ temb,
885
+ **ckpt_kwargs,
886
+ )
887
+ hidden_states = torch.utils.checkpoint.checkpoint(
888
+ create_custom_forward(attn, return_dict=False),
889
+ hidden_states,
890
+ encoder_hidden_states,
891
+ None, # timestep
892
+ None, # class_labels
893
+ cross_attention_kwargs,
894
+ attention_mask,
895
+ encoder_attention_mask,
896
+ **ckpt_kwargs,
897
+ )[0]
898
+ else:
899
+ hidden_states = resnet(hidden_states, temb)
900
+ hidden_states = attn(
901
+ hidden_states,
902
+ encoder_hidden_states=encoder_hidden_states,
903
+ cross_attention_kwargs=cross_attention_kwargs,
904
+ attention_mask=attention_mask,
905
+ encoder_attention_mask=encoder_attention_mask,
906
+ return_dict=False,
907
+ )[0]
908
+
909
+ # apply additional residuals to the output of the last pair of resnet and attention blocks
910
+ if i == len(blocks) - 1 and additional_residuals is not None:
911
+ hidden_states = hidden_states + additional_residuals
912
+
913
+ output_states = output_states + (hidden_states,)
914
+
915
+ if self.downsamplers is not None:
916
+ for downsampler in self.downsamplers:
917
+ hidden_states = downsampler(hidden_states)
918
+
919
+ output_states = output_states + (hidden_states,)
920
+
921
+ return hidden_states, output_states
922
+
mv_diffusion_30/models/unet_mv2d_condition.py ADDED
@@ -0,0 +1,1498 @@
1
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ from dataclasses import dataclass
15
+ from typing import Any, Dict, List, Optional, Tuple, Union
16
+ import os
17
+
18
+ import torch
19
+ import torch.nn as nn
20
+ import torch.utils.checkpoint
21
+
22
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
23
+ from diffusers.loaders import UNet2DConditionLoadersMixin
24
+ from diffusers.utils import BaseOutput, logging
25
+ from diffusers.models.activations import get_activation
26
+ from diffusers.models.attention_processor import AttentionProcessor, AttnProcessor
27
+ from diffusers.models.embeddings import (
28
+ GaussianFourierProjection,
29
+ ImageHintTimeEmbedding,
30
+ ImageProjection,
31
+ ImageTimeEmbedding,
32
+ TextImageProjection,
33
+ TextImageTimeEmbedding,
34
+ TextTimeEmbedding,
35
+ TimestepEmbedding,
36
+ Timesteps,
37
+ )
38
+ from diffusers.models.modeling_utils import ModelMixin, load_state_dict, _load_state_dict_into_model
39
+ from diffusers.models.unets.unet_2d_blocks import (
40
+ CrossAttnDownBlock2D,
41
+ CrossAttnUpBlock2D,
42
+ DownBlock2D,
43
+ UNetMidBlock2DCrossAttn,
44
+ UNetMidBlock2DSimpleCrossAttn,
45
+ UpBlock2D,
46
+ )
47
+ from diffusers.utils import (
48
+ CONFIG_NAME,
49
+ HF_MODULES_CACHE,
50
+ FLAX_WEIGHTS_NAME,
51
+ SAFETENSORS_WEIGHTS_NAME,
52
+ WEIGHTS_NAME,
53
+ _add_variant,
54
+ _get_model_file,
55
+ deprecate,
56
+ is_accelerate_available,
57
+ is_safetensors_available,
58
+ is_torch_version,
59
+ logging,
60
+ )
61
+ from diffusers import __version__
62
+ from mv_diffusion_30.models.unet_mv2d_blocks import (
63
+ CrossAttnDownBlockMV2D,
64
+ CrossAttnUpBlockMV2D,
65
+ UNetMidBlockMV2DCrossAttn,
66
+ get_down_block,
67
+ get_up_block,
68
+ )
69
+ from huggingface_hub.constants import HF_HUB_OFFLINE
70
+
71
+
72
+ hf_cache_home = os.path.expanduser(
73
+ os.getenv("HF_HOME", os.path.join(os.getenv("XDG_CACHE_HOME", "~/.cache"), "huggingface"))
74
+ )
75
+ DIFFUSERS_CACHE = os.path.join(hf_cache_home, "diffusers")
76
+
77
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
78
+
79
+
80
+ @dataclass
81
+ class UNetMV2DConditionOutput(BaseOutput):
82
+ """
83
+ The output of [`UNetMV2DConditionModel`].
84
+
85
+ Args:
86
+ sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
87
+ The hidden states output conditioned on `encoder_hidden_states` input. Output of last layer of model.
88
+ """
89
+
90
+ sample: torch.FloatTensor = None
91
+
92
+
93
+ class UNetMV2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin):
94
+ r"""
95
+ A conditional 2D UNet model that takes a noisy sample, conditional state, and a timestep and returns a sample
96
+ shaped output.
97
+
98
+ This model inherits from [`ModelMixin`]. Check the superclass documentation for its generic methods implemented
99
+ for all models (such as downloading or saving).
100
+
101
+ Parameters:
102
+ sample_size (`int` or `Tuple[int, int]`, *optional*, defaults to `None`):
103
+ Height and width of input/output sample.
104
+ in_channels (`int`, *optional*, defaults to 4): Number of channels in the input sample.
105
+ out_channels (`int`, *optional*, defaults to 4): Number of channels in the output.
106
+ center_input_sample (`bool`, *optional*, defaults to `False`): Whether to center the input sample.
107
+ flip_sin_to_cos (`bool`, *optional*, defaults to `False`):
108
+ Whether to flip the sin to cos in the time embedding.
109
+ freq_shift (`int`, *optional*, defaults to 0): The frequency shift to apply to the time embedding.
110
+ down_block_types (`Tuple[str]`, *optional*, defaults to `("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D")`):
111
+ The tuple of downsample blocks to use.
112
+ mid_block_type (`str`, *optional*, defaults to `"UNetMidBlock2DCrossAttn"`):
113
+ Block type for middle of UNet, it can be either `UNetMidBlock2DCrossAttn` or
114
+ `UNetMidBlock2DSimpleCrossAttn`. If `None`, the mid block layer is skipped.
115
+ up_block_types (`Tuple[str]`, *optional*, defaults to `("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D")`):
116
+ The tuple of upsample blocks to use.
117
+ only_cross_attention(`bool` or `Tuple[bool]`, *optional*, default to `False`):
118
+ Whether to include self-attention in the basic transformer blocks, see
119
+ [`~models.attention.BasicTransformerBlock`].
120
+ block_out_channels (`Tuple[int]`, *optional*, defaults to `(320, 640, 1280, 1280)`):
121
+ The tuple of output channels for each block.
122
+ layers_per_block (`int`, *optional*, defaults to 2): The number of layers per block.
123
+ downsample_padding (`int`, *optional*, defaults to 1): The padding to use for the downsampling convolution.
124
+ mid_block_scale_factor (`float`, *optional*, defaults to 1.0): The scale factor to use for the mid block.
125
+ act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use.
126
+ norm_num_groups (`int`, *optional*, defaults to 32): The number of groups to use for the normalization.
127
+ If `None`, normalization and activation layers are skipped in post-processing.
128
+ norm_eps (`float`, *optional*, defaults to 1e-5): The epsilon to use for the normalization.
129
+ cross_attention_dim (`int` or `Tuple[int]`, *optional*, defaults to 1280):
130
+ The dimension of the cross attention features.
131
+ transformer_layers_per_block (`int` or `Tuple[int]`, *optional*, defaults to 1):
132
+ The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`]. Only relevant for
133
+ [`~models.unet_2d_blocks.CrossAttnDownBlock2D`], [`~models.unet_2d_blocks.CrossAttnUpBlock2D`],
134
+ [`~models.unet_2d_blocks.UNetMidBlock2DCrossAttn`].
135
+ encoder_hid_dim (`int`, *optional*, defaults to None):
136
+ If `encoder_hid_dim_type` is defined, `encoder_hidden_states` will be projected from `encoder_hid_dim`
137
+ dimension to `cross_attention_dim`.
138
+ encoder_hid_dim_type (`str`, *optional*, defaults to `None`):
139
+ If given, the `encoder_hidden_states` and potentially other embeddings are down-projected to text
140
+ embeddings of dimension `cross_attention_dim` according to `encoder_hid_dim_type`.
141
+ attention_head_dim (`int`, *optional*, defaults to 8): The dimension of the attention heads.
142
+ num_attention_heads (`int`, *optional*):
143
+ The number of attention heads. If not defined, defaults to `attention_head_dim`
144
+ resnet_time_scale_shift (`str`, *optional*, defaults to `"default"`): Time scale shift config
145
+ for ResNet blocks (see [`~models.resnet.ResnetBlock2D`]). Choose from `default` or `scale_shift`.
146
+ class_embed_type (`str`, *optional*, defaults to `None`):
147
+ The type of class embedding to use which is ultimately summed with the time embeddings. Choose from `None`,
148
+ `"timestep"`, `"identity"`, `"projection"`, or `"simple_projection"`.
149
+ addition_embed_type (`str`, *optional*, defaults to `None`):
150
+ Configures an optional embedding which will be summed with the time embeddings. Choose from `None` or
151
+ "text". "text" will use the `TextTimeEmbedding` layer.
152
+ addition_time_embed_dim: (`int`, *optional*, defaults to `None`):
153
+ Dimension for the timestep embeddings.
154
+ num_class_embeds (`int`, *optional*, defaults to `None`):
155
+ Input dimension of the learnable embedding matrix to be projected to `time_embed_dim`, when performing
156
+ class conditioning with `class_embed_type` equal to `None`.
157
+ time_embedding_type (`str`, *optional*, defaults to `positional`):
158
+ The type of position embedding to use for timesteps. Choose from `positional` or `fourier`.
159
+ time_embedding_dim (`int`, *optional*, defaults to `None`):
160
+ An optional override for the dimension of the projected time embedding.
161
+ time_embedding_act_fn (`str`, *optional*, defaults to `None`):
162
+ Optional activation function to use only once on the time embeddings before they are passed to the rest of
163
+ the UNet. Choose from `silu`, `mish`, `gelu`, and `swish`.
164
+ timestep_post_act (`str`, *optional*, defaults to `None`):
165
+ The second activation function to use in timestep embedding. Choose from `silu`, `mish` and `gelu`.
166
+ time_cond_proj_dim (`int`, *optional*, defaults to `None`):
167
+ The dimension of `cond_proj` layer in the timestep embedding.
168
+ conv_in_kernel (`int`, *optional*, default to `3`): The kernel size of `conv_in` layer.
169
+ conv_out_kernel (`int`, *optional*, default to `3`): The kernel size of `conv_out` layer.
170
+ projection_class_embeddings_input_dim (`int`, *optional*): The dimension of the `class_labels` input when
171
+ `class_embed_type="projection"`. Required when `class_embed_type="projection"`.
172
+ class_embeddings_concat (`bool`, *optional*, defaults to `False`): Whether to concatenate the time
173
+ embeddings with the class embeddings.
174
+ mid_block_only_cross_attention (`bool`, *optional*, defaults to `None`):
175
+ Whether to use cross attention with the mid block when using the `UNetMidBlock2DSimpleCrossAttn`. If
176
+ `only_cross_attention` is given as a single boolean and `mid_block_only_cross_attention` is `None`, the
177
+ `only_cross_attention` value is used as the value for `mid_block_only_cross_attention`. Default to `False`
178
+ otherwise.
179
+ """
180
+
181
+ _supports_gradient_checkpointing = True
182
+
183
+ @register_to_config
184
+ def __init__(
185
+ self,
186
+ sample_size: Optional[int] = None,
187
+ in_channels: int = 4,
188
+ out_channels: int = 4,
189
+ center_input_sample: bool = False,
190
+ flip_sin_to_cos: bool = True,
191
+ freq_shift: int = 0,
192
+ down_block_types: Tuple[str] = (
193
+ "CrossAttnDownBlockMV2D",
194
+ "CrossAttnDownBlockMV2D",
195
+ "CrossAttnDownBlockMV2D",
196
+ "DownBlock2D",
197
+ ),
198
+ mid_block_type: Optional[str] = "UNetMidBlockMV2DCrossAttn",
199
+ up_block_types: Tuple[str] = ("UpBlock2D", "CrossAttnUpBlockMV2D", "CrossAttnUpBlockMV2D", "CrossAttnUpBlockMV2D"),
200
+ only_cross_attention: Union[bool, Tuple[bool]] = False,
201
+ block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
202
+ layers_per_block: Union[int, Tuple[int]] = 2,
203
+ downsample_padding: int = 1,
204
+ mid_block_scale_factor: float = 1,
205
+ act_fn: str = "silu",
206
+ norm_num_groups: Optional[int] = 32,
207
+ norm_eps: float = 1e-5,
208
+ cross_attention_dim: Union[int, Tuple[int]] = 1280,
209
+ transformer_layers_per_block: Union[int, Tuple[int]] = 1,
210
+ encoder_hid_dim: Optional[int] = None,
211
+ encoder_hid_dim_type: Optional[str] = None,
212
+ attention_head_dim: Union[int, Tuple[int]] = 8,
213
+ num_attention_heads: Optional[Union[int, Tuple[int]]] = None,
214
+ dual_cross_attention: bool = False,
215
+ use_linear_projection: bool = False,
216
+ class_embed_type: Optional[str] = None,
217
+ addition_embed_type: Optional[str] = None,
218
+ addition_time_embed_dim: Optional[int] = None,
219
+ num_class_embeds: Optional[int] = None,
220
+ upcast_attention: bool = False,
221
+ resnet_time_scale_shift: str = "default",
222
+ resnet_skip_time_act: bool = False,
223
+ resnet_out_scale_factor: int = 1.0,
224
+ time_embedding_type: str = "positional",
225
+ time_embedding_dim: Optional[int] = None,
226
+ time_embedding_act_fn: Optional[str] = None,
227
+ timestep_post_act: Optional[str] = None,
228
+ time_cond_proj_dim: Optional[int] = None,
229
+ conv_in_kernel: int = 3,
230
+ conv_out_kernel: int = 3,
231
+ projection_class_embeddings_input_dim: Optional[int] = None,
232
+ class_embeddings_concat: bool = False,
233
+ mid_block_only_cross_attention: Optional[bool] = None,
234
+ cross_attention_norm: Optional[str] = None,
235
+ addition_embed_type_num_heads=64,
236
+ num_views: int = 1,
237
+ cd_attention_last: bool = False,
238
+ cd_attention_mid: bool = False,
239
+ multiview_attention: bool = True,
240
+ sparse_mv_attention: bool = False,
241
+ mvcd_attention: bool = False
242
+ ):
243
+ super().__init__()
244
+
245
+ self.sample_size = sample_size
246
+
247
+ if num_attention_heads is not None:
248
+ raise ValueError(
249
+ "At the moment it is not possible to define the number of attention heads via `num_attention_heads` because of a naming issue as described in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131. Passing `num_attention_heads` will only be supported in diffusers v0.19."
250
+ )
251
+
252
+ # If `num_attention_heads` is not defined (which is the case for most models)
253
+ # it will default to `attention_head_dim`. This looks weird upon first reading it and it is.
254
+ # The reason for this behavior is to correct for incorrectly named variables that were introduced
255
+ # when this library was created. The incorrect naming was only discovered much later in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131
256
+ # Changing `attention_head_dim` to `num_attention_heads` for 40,000+ configurations is too backwards breaking
257
+ # which is why we correct for the naming here.
258
+ num_attention_heads = num_attention_heads or attention_head_dim
259
+
260
+ # Check inputs
261
+ if len(down_block_types) != len(up_block_types):
262
+ raise ValueError(
263
+ f"Must provide the same number of `down_block_types` as `up_block_types`. `down_block_types`: {down_block_types}. `up_block_types`: {up_block_types}."
264
+ )
265
+
266
+ if len(block_out_channels) != len(down_block_types):
267
+ raise ValueError(
268
+ f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}."
269
+ )
270
+
271
+ if not isinstance(only_cross_attention, bool) and len(only_cross_attention) != len(down_block_types):
272
+ raise ValueError(
273
+ f"Must provide the same number of `only_cross_attention` as `down_block_types`. `only_cross_attention`: {only_cross_attention}. `down_block_types`: {down_block_types}."
274
+ )
275
+
276
+ if not isinstance(num_attention_heads, int) and len(num_attention_heads) != len(down_block_types):
277
+ raise ValueError(
278
+ f"Must provide the same number of `num_attention_heads` as `down_block_types`. `num_attention_heads`: {num_attention_heads}. `down_block_types`: {down_block_types}."
279
+ )
280
+
281
+ if not isinstance(attention_head_dim, int) and len(attention_head_dim) != len(down_block_types):
282
+ raise ValueError(
283
+ f"Must provide the same number of `attention_head_dim` as `down_block_types`. `attention_head_dim`: {attention_head_dim}. `down_block_types`: {down_block_types}."
284
+ )
285
+
286
+ if isinstance(cross_attention_dim, list) and len(cross_attention_dim) != len(down_block_types):
287
+ raise ValueError(
288
+ f"Must provide the same number of `cross_attention_dim` as `down_block_types`. `cross_attention_dim`: {cross_attention_dim}. `down_block_types`: {down_block_types}."
289
+ )
290
+
291
+ if not isinstance(layers_per_block, int) and len(layers_per_block) != len(down_block_types):
292
+ raise ValueError(
293
+ f"Must provide the same number of `layers_per_block` as `down_block_types`. `layers_per_block`: {layers_per_block}. `down_block_types`: {down_block_types}."
294
+ )
295
+
296
+ # input
297
+ conv_in_padding = (conv_in_kernel - 1) // 2
298
+ self.conv_in = nn.Conv2d(
299
+ in_channels, block_out_channels[0], kernel_size=conv_in_kernel, padding=conv_in_padding
300
+ )
301
+
302
+ # time
303
+ if time_embedding_type == "fourier":
304
+ time_embed_dim = time_embedding_dim or block_out_channels[0] * 2
305
+ if time_embed_dim % 2 != 0:
306
+ raise ValueError(f"`time_embed_dim` should be divisible by 2, but is {time_embed_dim}.")
307
+ self.time_proj = GaussianFourierProjection(
308
+ time_embed_dim // 2, set_W_to_weight=False, log=False, flip_sin_to_cos=flip_sin_to_cos
309
+ )
310
+ timestep_input_dim = time_embed_dim
311
+ elif time_embedding_type == "positional":
312
+ time_embed_dim = time_embedding_dim or block_out_channels[0] * 4
313
+
314
+ self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift)
315
+ timestep_input_dim = block_out_channels[0]
316
+ else:
317
+ raise ValueError(
318
+ f"{time_embedding_type} does not exist. Please make sure to use one of `fourier` or `positional`."
319
+ )
320
+
321
+ self.time_embedding = TimestepEmbedding(
322
+ timestep_input_dim,
323
+ time_embed_dim,
324
+ act_fn=act_fn,
325
+ post_act_fn=timestep_post_act,
326
+ cond_proj_dim=time_cond_proj_dim,
327
+ )
328
+
329
+ if encoder_hid_dim_type is None and encoder_hid_dim is not None:
330
+ encoder_hid_dim_type = "text_proj"
331
+ self.register_to_config(encoder_hid_dim_type=encoder_hid_dim_type)
332
+ logger.info("encoder_hid_dim_type defaults to 'text_proj' as `encoder_hid_dim` is defined.")
333
+
334
+ if encoder_hid_dim is None and encoder_hid_dim_type is not None:
335
+ raise ValueError(
336
+ f"`encoder_hid_dim` has to be defined when `encoder_hid_dim_type` is set to {encoder_hid_dim_type}."
337
+ )
338
+
339
+ if encoder_hid_dim_type == "text_proj":
340
+ self.encoder_hid_proj = nn.Linear(encoder_hid_dim, cross_attention_dim)
341
+ elif encoder_hid_dim_type == "text_image_proj":
342
+ # image_embed_dim DOESN'T have to be `cross_attention_dim`. To not clutter the __init__ too much
343
+ # they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use
344
+ # case when `addition_embed_type == "text_image_proj"` (Kandinsky 2.1)
345
+ self.encoder_hid_proj = TextImageProjection(
346
+ text_embed_dim=encoder_hid_dim,
347
+ image_embed_dim=cross_attention_dim,
348
+ cross_attention_dim=cross_attention_dim,
349
+ )
350
+ elif encoder_hid_dim_type == "image_proj":
351
+ # Kandinsky 2.2
352
+ self.encoder_hid_proj = ImageProjection(
353
+ image_embed_dim=encoder_hid_dim,
354
+ cross_attention_dim=cross_attention_dim,
355
+ )
356
+ elif encoder_hid_dim_type is not None:
357
+ raise ValueError(
358
+ f"encoder_hid_dim_type: {encoder_hid_dim_type} must be None, 'text_proj' or 'text_image_proj'."
359
+ )
360
+ else:
361
+ self.encoder_hid_proj = None
362
+
363
+ # class embedding
364
+ if class_embed_type is None and num_class_embeds is not None:
365
+ self.class_embedding = nn.Embedding(num_class_embeds, time_embed_dim)
366
+ elif class_embed_type == "timestep":
367
+ self.class_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim, act_fn=act_fn)
368
+ elif class_embed_type == "identity":
369
+ self.class_embedding = nn.Identity(time_embed_dim, time_embed_dim)
370
+ elif class_embed_type == "projection":
371
+ if projection_class_embeddings_input_dim is None:
372
+ raise ValueError(
373
+ "`class_embed_type`: 'projection' requires `projection_class_embeddings_input_dim` be set"
374
+ )
375
+ # The projection `class_embed_type` is the same as the timestep `class_embed_type` except
376
+ # 1. the `class_labels` inputs are not first converted to sinusoidal embeddings
377
+ # 2. it projects from an arbitrary input dimension.
378
+ #
379
+ # Note that `TimestepEmbedding` is quite general, being mainly linear layers and activations.
380
+ # When used for embedding actual timesteps, the timesteps are first converted to sinusoidal embeddings.
381
+ # As a result, `TimestepEmbedding` can be passed arbitrary vectors.
382
+ self.class_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim)
383
+ elif class_embed_type == "simple_projection":
384
+ if projection_class_embeddings_input_dim is None:
385
+ raise ValueError(
386
+ "`class_embed_type`: 'simple_projection' requires `projection_class_embeddings_input_dim` be set"
387
+ )
388
+ self.class_embedding = nn.Linear(projection_class_embeddings_input_dim, time_embed_dim)
389
+ else:
390
+ self.class_embedding = None
391
+
392
+ if addition_embed_type == "text":
393
+ if encoder_hid_dim is not None:
394
+ text_time_embedding_from_dim = encoder_hid_dim
395
+ else:
396
+ text_time_embedding_from_dim = cross_attention_dim
397
+
398
+ self.add_embedding = TextTimeEmbedding(
399
+ text_time_embedding_from_dim, time_embed_dim, num_heads=addition_embed_type_num_heads
400
+ )
401
+ elif addition_embed_type == "text_image":
402
+ # text_embed_dim and image_embed_dim DON'T have to be `cross_attention_dim`. To not clutter the __init__ too much
403
+ # they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use
404
+ # case when `addition_embed_type == "text_image"` (Kandinsky 2.1)
405
+ self.add_embedding = TextImageTimeEmbedding(
406
+ text_embed_dim=cross_attention_dim, image_embed_dim=cross_attention_dim, time_embed_dim=time_embed_dim
407
+ )
408
+ elif addition_embed_type == "text_time":
409
+ self.add_time_proj = Timesteps(addition_time_embed_dim, flip_sin_to_cos, freq_shift)
410
+ self.add_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim)
411
+ elif addition_embed_type == "image":
412
+ # Kandinsky 2.2
413
+ self.add_embedding = ImageTimeEmbedding(image_embed_dim=encoder_hid_dim, time_embed_dim=time_embed_dim)
414
+ elif addition_embed_type == "image_hint":
415
+ # Kandinsky 2.2 ControlNet
416
+ self.add_embedding = ImageHintTimeEmbedding(image_embed_dim=encoder_hid_dim, time_embed_dim=time_embed_dim)
417
+ elif addition_embed_type is not None:
418
+ raise ValueError(f"addition_embed_type: {addition_embed_type} must be None, 'text' or 'text_image'.")
419
+
420
+ if time_embedding_act_fn is None:
421
+ self.time_embed_act = None
422
+ else:
423
+ self.time_embed_act = get_activation(time_embedding_act_fn)
424
+
425
+ self.down_blocks = nn.ModuleList([])
426
+ self.up_blocks = nn.ModuleList([])
427
+
428
+ if isinstance(only_cross_attention, bool):
429
+ if mid_block_only_cross_attention is None:
430
+ mid_block_only_cross_attention = only_cross_attention
431
+
432
+ only_cross_attention = [only_cross_attention] * len(down_block_types)
433
+
434
+ if mid_block_only_cross_attention is None:
435
+ mid_block_only_cross_attention = False
436
+
437
+ if isinstance(num_attention_heads, int):
438
+ num_attention_heads = (num_attention_heads,) * len(down_block_types)
439
+
440
+ if isinstance(attention_head_dim, int):
441
+ attention_head_dim = (attention_head_dim,) * len(down_block_types)
442
+
443
+ if isinstance(cross_attention_dim, int):
444
+ cross_attention_dim = (cross_attention_dim,) * len(down_block_types)
445
+
446
+ if isinstance(layers_per_block, int):
447
+ layers_per_block = [layers_per_block] * len(down_block_types)
448
+
449
+ if isinstance(transformer_layers_per_block, int):
450
+ transformer_layers_per_block = [transformer_layers_per_block] * len(down_block_types)
451
+
452
+ if class_embeddings_concat:
453
+ # The time embeddings are concatenated with the class embeddings. The dimension of the
454
+ # time embeddings passed to the down, middle, and up blocks is twice the dimension of the
455
+ # regular time embeddings
456
+ blocks_time_embed_dim = time_embed_dim * 2
457
+ else:
458
+ blocks_time_embed_dim = time_embed_dim
459
+
460
+ # down
461
+ output_channel = block_out_channels[0]
462
+ for i, down_block_type in enumerate(down_block_types):
463
+ input_channel = output_channel
464
+ output_channel = block_out_channels[i]
465
+ is_final_block = i == len(block_out_channels) - 1
466
+
467
+ down_block = get_down_block(
468
+ down_block_type,
469
+ num_layers=layers_per_block[i],
470
+ transformer_layers_per_block=transformer_layers_per_block[i],
471
+ in_channels=input_channel,
472
+ out_channels=output_channel,
473
+ temb_channels=blocks_time_embed_dim,
474
+ add_downsample=not is_final_block,
475
+ resnet_eps=norm_eps,
476
+ resnet_act_fn=act_fn,
477
+ resnet_groups=norm_num_groups,
478
+ cross_attention_dim=cross_attention_dim[i],
479
+ num_attention_heads=num_attention_heads[i],
480
+ downsample_padding=downsample_padding,
481
+ dual_cross_attention=dual_cross_attention,
482
+ use_linear_projection=use_linear_projection,
483
+ only_cross_attention=only_cross_attention[i],
484
+ upcast_attention=upcast_attention,
485
+ resnet_time_scale_shift=resnet_time_scale_shift,
486
+ resnet_skip_time_act=resnet_skip_time_act,
487
+ resnet_out_scale_factor=resnet_out_scale_factor,
488
+ cross_attention_norm=cross_attention_norm,
489
+ attention_head_dim=attention_head_dim[i] if attention_head_dim[i] is not None else output_channel,
490
+ num_views=num_views,
491
+ cd_attention_last=cd_attention_last,
492
+ cd_attention_mid=cd_attention_mid,
493
+ multiview_attention=multiview_attention,
494
+ sparse_mv_attention=sparse_mv_attention,
495
+ mvcd_attention=mvcd_attention
496
+ )
497
+ self.down_blocks.append(down_block)
498
+
499
+ # mid
500
+ if mid_block_type == "UNetMidBlock2DCrossAttn":
501
+ self.mid_block = UNetMidBlock2DCrossAttn(
502
+ transformer_layers_per_block=transformer_layers_per_block[-1],
503
+ in_channels=block_out_channels[-1],
504
+ temb_channels=blocks_time_embed_dim,
505
+ resnet_eps=norm_eps,
506
+ resnet_act_fn=act_fn,
507
+ output_scale_factor=mid_block_scale_factor,
508
+ resnet_time_scale_shift=resnet_time_scale_shift,
509
+ cross_attention_dim=cross_attention_dim[-1],
510
+ num_attention_heads=num_attention_heads[-1],
511
+ resnet_groups=norm_num_groups,
512
+ dual_cross_attention=dual_cross_attention,
513
+ use_linear_projection=use_linear_projection,
514
+ upcast_attention=upcast_attention,
515
+ )
516
+ # custom MV2D attention block
517
+ elif mid_block_type == "UNetMidBlockMV2DCrossAttn":
518
+ self.mid_block = UNetMidBlockMV2DCrossAttn(
519
+ transformer_layers_per_block=transformer_layers_per_block[-1],
520
+ in_channels=block_out_channels[-1],
521
+ temb_channels=blocks_time_embed_dim,
522
+ resnet_eps=norm_eps,
523
+ resnet_act_fn=act_fn,
524
+ output_scale_factor=mid_block_scale_factor,
525
+ resnet_time_scale_shift=resnet_time_scale_shift,
526
+ cross_attention_dim=cross_attention_dim[-1],
527
+ num_attention_heads=num_attention_heads[-1],
528
+ resnet_groups=norm_num_groups,
529
+ dual_cross_attention=dual_cross_attention,
530
+ use_linear_projection=use_linear_projection,
531
+ upcast_attention=upcast_attention,
532
+ num_views=num_views,
533
+ cd_attention_last=cd_attention_last,
534
+ cd_attention_mid=cd_attention_mid,
535
+ multiview_attention=multiview_attention,
536
+ sparse_mv_attention=sparse_mv_attention,
537
+ mvcd_attention=mvcd_attention
538
+ )
539
+ elif mid_block_type == "UNetMidBlock2DSimpleCrossAttn":
540
+ self.mid_block = UNetMidBlock2DSimpleCrossAttn(
541
+ in_channels=block_out_channels[-1],
542
+ temb_channels=blocks_time_embed_dim,
543
+ resnet_eps=norm_eps,
544
+ resnet_act_fn=act_fn,
545
+ output_scale_factor=mid_block_scale_factor,
546
+ cross_attention_dim=cross_attention_dim[-1],
547
+ attention_head_dim=attention_head_dim[-1],
548
+ resnet_groups=norm_num_groups,
549
+ resnet_time_scale_shift=resnet_time_scale_shift,
550
+ skip_time_act=resnet_skip_time_act,
551
+ only_cross_attention=mid_block_only_cross_attention,
552
+ cross_attention_norm=cross_attention_norm,
553
+ )
554
+ elif mid_block_type is None:
555
+ self.mid_block = None
556
+ else:
557
+ raise ValueError(f"unknown mid_block_type : {mid_block_type}")
558
+
559
+ # count how many layers upsample the images
560
+ self.num_upsamplers = 0
561
+
562
+ # up
563
+ reversed_block_out_channels = list(reversed(block_out_channels))
564
+ reversed_num_attention_heads = list(reversed(num_attention_heads))
565
+ reversed_layers_per_block = list(reversed(layers_per_block))
566
+ reversed_cross_attention_dim = list(reversed(cross_attention_dim))
567
+ reversed_transformer_layers_per_block = list(reversed(transformer_layers_per_block))
568
+ only_cross_attention = list(reversed(only_cross_attention))
569
+
570
+ output_channel = reversed_block_out_channels[0]
571
+ for i, up_block_type in enumerate(up_block_types):
572
+ is_final_block = i == len(block_out_channels) - 1
573
+
574
+ prev_output_channel = output_channel
575
+ output_channel = reversed_block_out_channels[i]
576
+ input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)]
577
+
578
+ # add upsample block for all BUT final layer
579
+ if not is_final_block:
580
+ add_upsample = True
581
+ self.num_upsamplers += 1
582
+ else:
583
+ add_upsample = False
584
+
585
+ up_block = get_up_block(
586
+ up_block_type,
587
+ num_layers=reversed_layers_per_block[i] + 1,
588
+ transformer_layers_per_block=reversed_transformer_layers_per_block[i],
589
+ in_channels=input_channel,
590
+ out_channels=output_channel,
591
+ prev_output_channel=prev_output_channel,
592
+ temb_channels=blocks_time_embed_dim,
593
+ add_upsample=add_upsample,
594
+ resnet_eps=norm_eps,
595
+ resnet_act_fn=act_fn,
596
+ resnet_groups=norm_num_groups,
597
+ cross_attention_dim=reversed_cross_attention_dim[i],
598
+ num_attention_heads=reversed_num_attention_heads[i],
599
+ dual_cross_attention=dual_cross_attention,
600
+ use_linear_projection=use_linear_projection,
601
+ only_cross_attention=only_cross_attention[i],
602
+ upcast_attention=upcast_attention,
603
+ resnet_time_scale_shift=resnet_time_scale_shift,
604
+ resnet_skip_time_act=resnet_skip_time_act,
605
+ resnet_out_scale_factor=resnet_out_scale_factor,
606
+ cross_attention_norm=cross_attention_norm,
607
+ attention_head_dim=attention_head_dim[i] if attention_head_dim[i] is not None else output_channel,
608
+ num_views=num_views,
609
+ cd_attention_last=cd_attention_last,
610
+ cd_attention_mid=cd_attention_mid,
611
+ multiview_attention=multiview_attention,
612
+ sparse_mv_attention=sparse_mv_attention,
613
+ mvcd_attention=mvcd_attention
614
+ )
615
+ self.up_blocks.append(up_block)
616
+ prev_output_channel = output_channel
617
+
618
+ # out
619
+ if norm_num_groups is not None:
620
+ self.conv_norm_out = nn.GroupNorm(
621
+ num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=norm_eps
622
+ )
623
+
624
+ self.conv_act = get_activation(act_fn)
625
+
626
+ else:
627
+ self.conv_norm_out = None
628
+ self.conv_act = None
629
+
630
+ conv_out_padding = (conv_out_kernel - 1) // 2
631
+ self.conv_out = nn.Conv2d(
632
+ block_out_channels[0], out_channels, kernel_size=conv_out_kernel, padding=conv_out_padding
633
+ )
634
+
635
+ @property
636
+ def attn_processors(self) -> Dict[str, AttentionProcessor]:
637
+ r"""
638
+ Returns:
639
+ `dict` of attention processors: A dictionary containing all attention processors used in the model,
640
+ indexed by its weight name.
641
+ """
642
+ # set recursively
643
+ processors = {}
644
+
645
+ def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
646
+ if hasattr(module, "set_processor"):
647
+ processors[f"{name}.processor"] = module.processor
648
+
649
+ for sub_name, child in module.named_children():
650
+ fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
651
+
652
+ return processors
653
+
654
+ for name, module in self.named_children():
655
+ fn_recursive_add_processors(name, module, processors)
656
+
657
+ return processors
658
+
659
+ def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
660
+ r"""
661
+ Sets the attention processor to use to compute attention.
662
+
663
+ Parameters:
664
+ processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
665
+ The instantiated processor class or a dictionary of processor classes that will be set as the processor
666
+ for **all** `Attention` layers.
667
+
668
+ If `processor` is a dict, the key needs to define the path to the corresponding cross attention
669
+ processor. This is strongly recommended when setting trainable attention processors.
670
+
671
+ """
672
+ count = len(self.attn_processors.keys())
673
+
674
+ if isinstance(processor, dict) and len(processor) != count:
675
+ raise ValueError(
676
+ f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
677
+ f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
678
+ )
679
+
680
+ def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
681
+ if hasattr(module, "set_processor"):
682
+ if not isinstance(processor, dict):
683
+ module.set_processor(processor)
684
+ else:
685
+ module.set_processor(processor.pop(f"{name}.processor"))
686
+
687
+ for sub_name, child in module.named_children():
688
+ fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
689
+
690
+ for name, module in self.named_children():
691
+ fn_recursive_attn_processor(name, module, processor)
692
+
693
+ def set_default_attn_processor(self):
694
+ """
695
+ Disables custom attention processors and sets the default attention implementation.
696
+ """
697
+ self.set_attn_processor(AttnProcessor())
698
+
699
+ def set_attention_slice(self, slice_size):
700
+ r"""
701
+ Enable sliced attention computation.
702
+
703
+ When this option is enabled, the attention module splits the input tensor in slices to compute attention in
704
+ several steps. This is useful for saving some memory in exchange for a small decrease in speed.
705
+
706
+ Args:
707
+ slice_size (`str` or `int` or `list(int)`, *optional*, defaults to `"auto"`):
708
+ When `"auto"`, input to the attention heads is halved, so attention is computed in two steps. If
709
+ `"max"`, maximum amount of memory is saved by running only one slice at a time. If a number is
710
+ provided, uses as many slices as `attention_head_dim // slice_size`. In this case, `attention_head_dim`
711
+ must be a multiple of `slice_size`.
712
+ """
713
+ sliceable_head_dims = []
714
+
715
+ def fn_recursive_retrieve_sliceable_dims(module: torch.nn.Module):
716
+ if hasattr(module, "set_attention_slice"):
717
+ sliceable_head_dims.append(module.sliceable_head_dim)
718
+
719
+ for child in module.children():
720
+ fn_recursive_retrieve_sliceable_dims(child)
721
+
722
+ # retrieve number of attention layers
723
+ for module in self.children():
724
+ fn_recursive_retrieve_sliceable_dims(module)
725
+
726
+ num_sliceable_layers = len(sliceable_head_dims)
727
+
728
+ if slice_size == "auto":
729
+ # half the attention head size is usually a good trade-off between
730
+ # speed and memory
731
+ slice_size = [dim // 2 for dim in sliceable_head_dims]
732
+ elif slice_size == "max":
733
+ # make smallest slice possible
734
+ slice_size = num_sliceable_layers * [1]
735
+
736
+ slice_size = num_sliceable_layers * [slice_size] if not isinstance(slice_size, list) else slice_size
737
+
738
+ if len(slice_size) != len(sliceable_head_dims):
739
+ raise ValueError(
740
+ f"You have provided {len(slice_size)}, but {self.config} has {len(sliceable_head_dims)} different"
741
+ f" attention layers. Make sure to match `len(slice_size)` to be {len(sliceable_head_dims)}."
742
+ )
743
+
744
+ for i in range(len(slice_size)):
745
+ size = slice_size[i]
746
+ dim = sliceable_head_dims[i]
747
+ if size is not None and size > dim:
748
+ raise ValueError(f"size {size} has to be smaller or equal to {dim}.")
749
+
750
+ # Recursively walk through all the children.
751
+ # Any children which exposes the set_attention_slice method
752
+ # gets the message
753
+ def fn_recursive_set_attention_slice(module: torch.nn.Module, slice_size: List[int]):
754
+ if hasattr(module, "set_attention_slice"):
755
+ module.set_attention_slice(slice_size.pop())
756
+
757
+ for child in module.children():
758
+ fn_recursive_set_attention_slice(child, slice_size)
759
+
760
+ reversed_slice_size = list(reversed(slice_size))
761
+ for module in self.children():
762
+ fn_recursive_set_attention_slice(module, reversed_slice_size)
763
+
764
+ def _set_gradient_checkpointing(self, module, value=False):
765
+ if isinstance(module, (CrossAttnDownBlock2D, CrossAttnDownBlockMV2D, DownBlock2D, CrossAttnUpBlock2D, CrossAttnUpBlockMV2D, UpBlock2D)):
766
+ module.gradient_checkpointing = value
767
+
768
+ def forward(
769
+ self,
770
+ sample: torch.FloatTensor,
771
+ timestep: Union[torch.Tensor, float, int],
772
+ encoder_hidden_states: torch.Tensor,
773
+ class_labels: Optional[torch.Tensor] = None,
774
+ timestep_cond: Optional[torch.Tensor] = None,
775
+ attention_mask: Optional[torch.Tensor] = None,
776
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
777
+ added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None,
778
+ down_block_additional_residuals: Optional[Tuple[torch.Tensor]] = None,
779
+ mid_block_additional_residual: Optional[torch.Tensor] = None,
780
+ encoder_attention_mask: Optional[torch.Tensor] = None,
781
+ return_dict: bool = True,
782
+ ) -> Union[UNetMV2DConditionOutput, Tuple]:
783
+ r"""
784
+ The [`UNet2DConditionModel`] forward method.
785
+
786
+ Args:
787
+ sample (`torch.FloatTensor`):
788
+ The noisy input tensor with the following shape `(batch, channel, height, width)`.
789
+ timestep (`torch.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input.
790
+ encoder_hidden_states (`torch.FloatTensor`):
791
+ The encoder hidden states with shape `(batch, sequence_length, feature_dim)`.
792
+ encoder_attention_mask (`torch.Tensor`):
793
+ A cross-attention mask of shape `(batch, sequence_length)` is applied to `encoder_hidden_states`. If
794
+ `True` the mask is kept, otherwise if `False` it is discarded. Mask will be converted into a bias,
795
+ which adds large negative values to the attention scores corresponding to "discard" tokens.
796
+ return_dict (`bool`, *optional*, defaults to `True`):
797
+ Whether or not to return a [`~models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain
798
+ tuple.
799
+ cross_attention_kwargs (`dict`, *optional*):
800
+ A kwargs dictionary that if specified is passed along to the [`AttnProcessor`].
801
+ added_cond_kwargs: (`dict`, *optional*):
802
+ A kwargs dictionary containing additional embeddings that if specified are added to the embeddings that
803
+ are passed along to the UNet blocks.
804
+
805
+ Returns:
806
+ [`~models.unet_2d_condition.UNet2DConditionOutput`] or `tuple`:
807
+ If `return_dict` is True, an [`~models.unet_2d_condition.UNet2DConditionOutput`] is returned, otherwise
808
+ a `tuple` is returned where the first element is the sample tensor.
809
+ """
810
+ # By default samples have to be AT least a multiple of the overall upsampling factor.
811
+ # The overall upsampling factor is equal to 2 ** (# num of upsampling layers).
812
+ # However, the upsampling interpolation output size can be forced to fit any upsampling size
813
+ # on the fly if necessary.
814
+ default_overall_up_factor = 2**self.num_upsamplers
815
+
816
+ # upsample size should be forwarded when sample is not a multiple of `default_overall_up_factor`
817
+ forward_upsample_size = False
818
+ upsample_size = None
819
+
820
+ if any(s % default_overall_up_factor != 0 for s in sample.shape[-2:]):
821
+ logger.info("Forward upsample size to force interpolation output size.")
822
+ forward_upsample_size = True
823
+
824
+ # ensure attention_mask is a bias, and give it a singleton query_tokens dimension
825
+ # expects mask of shape:
826
+ # [batch, key_tokens]
827
+ # adds singleton query_tokens dimension:
828
+ # [batch, 1, key_tokens]
829
+ # this helps to broadcast it as a bias over attention scores, which will be in one of the following shapes:
830
+ # [batch, heads, query_tokens, key_tokens] (e.g. torch sdp attn)
831
+ # [batch * heads, query_tokens, key_tokens] (e.g. xformers or classic attn)
832
+ if attention_mask is not None:
833
+ # assume that mask is expressed as:
834
+ # (1 = keep, 0 = discard)
835
+ # convert mask into a bias that can be added to attention scores:
836
+ # (keep = +0, discard = -10000.0)
837
+ attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0
838
+ attention_mask = attention_mask.unsqueeze(1)
839
+
840
+ # convert encoder_attention_mask to a bias the same way we do for attention_mask
841
+ if encoder_attention_mask is not None:
842
+ encoder_attention_mask = (1 - encoder_attention_mask.to(sample.dtype)) * -10000.0
843
+ encoder_attention_mask = encoder_attention_mask.unsqueeze(1)
844
+
845
+ # 0. center input if necessary
846
+ if self.config.center_input_sample:
847
+ sample = 2 * sample - 1.0
848
+
849
+ # 1. time
850
+ timesteps = timestep
851
+ if not torch.is_tensor(timesteps):
852
+ # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
853
+ # This would be a good case for the `match` statement (Python 3.10+)
854
+ is_mps = sample.device.type == "mps"
855
+ if isinstance(timestep, float):
856
+ dtype = torch.float32 if is_mps else torch.float64
857
+ else:
858
+ dtype = torch.int32 if is_mps else torch.int64
859
+ timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)
860
+ elif len(timesteps.shape) == 0:
861
+ timesteps = timesteps[None].to(sample.device)
862
+
863
+ # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
864
+ timesteps = timesteps.expand(sample.shape[0])
865
+
866
+ t_emb = self.time_proj(timesteps)
867
+
868
+ # `Timesteps` does not contain any weights and will always return f32 tensors
869
+ # but time_embedding might actually be running in fp16. so we need to cast here.
870
+ # there might be better ways to encapsulate this.
871
+ t_emb = t_emb.to(dtype=sample.dtype)
872
+
873
+ # self.time_embedding.to(dtype=t_emb.dtype)
874
+ emb = self.time_embedding(t_emb, timestep_cond)
875
+ aug_emb = None
876
+
877
+ if self.class_embedding is not None:
878
+ if class_labels is None:
879
+ raise ValueError("class_labels should be provided when num_class_embeds > 0")
880
+
881
+ if self.config.class_embed_type == "timestep":
882
+ class_labels = self.time_proj(class_labels)
883
+
884
+ # `Timesteps` does not contain any weights and will always return f32 tensors
885
+ # there might be better ways to encapsulate this.
886
+ class_labels = class_labels.to(dtype=sample.dtype)
887
+
888
+ class_emb = self.class_embedding(class_labels).to(dtype=sample.dtype)
889
+
890
+ if self.config.class_embeddings_concat:
891
+ emb = torch.cat([emb, class_emb], dim=-1)
892
+ else:
893
+ emb = emb + class_emb
894
+
895
+ if self.config.addition_embed_type == "text":
896
+ aug_emb = self.add_embedding(encoder_hidden_states)
897
+ elif self.config.addition_embed_type == "text_image":
898
+ # Kandinsky 2.1 - style
899
+ if "image_embeds" not in added_cond_kwargs:
900
+ raise ValueError(
901
+ f"{self.__class__} has the config param `addition_embed_type` set to 'text_image' which requires the keyword argument `image_embeds` to be passed in `added_cond_kwargs`"
902
+ )
903
+
904
+ image_embs = added_cond_kwargs.get("image_embeds")
905
+ text_embs = added_cond_kwargs.get("text_embeds", encoder_hidden_states)
906
+ aug_emb = self.add_embedding(text_embs, image_embs)
907
+ elif self.config.addition_embed_type == "text_time":
908
+ # SDXL - style
909
+ if "text_embeds" not in added_cond_kwargs:
910
+ raise ValueError(
911
+ f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `text_embeds` to be passed in `added_cond_kwargs`"
912
+ )
913
+ text_embeds = added_cond_kwargs.get("text_embeds")
914
+ if "time_ids" not in added_cond_kwargs:
915
+ raise ValueError(
916
+ f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `time_ids` to be passed in `added_cond_kwargs`"
917
+ )
918
+ time_ids = added_cond_kwargs.get("time_ids")
919
+ time_embeds = self.add_time_proj(time_ids.flatten())
920
+ time_embeds = time_embeds.reshape((text_embeds.shape[0], -1))
921
+
922
+ add_embeds = torch.concat([text_embeds, time_embeds], dim=-1)
923
+ add_embeds = add_embeds.to(emb.dtype)
924
+ aug_emb = self.add_embedding(add_embeds)
925
+ elif self.config.addition_embed_type == "image":
926
+ # Kandinsky 2.2 - style
927
+ if "image_embeds" not in added_cond_kwargs:
928
+ raise ValueError(
929
+ f"{self.__class__} has the config param `addition_embed_type` set to 'image' which requires the keyword argument `image_embeds` to be passed in `added_cond_kwargs`"
930
+ )
931
+ image_embs = added_cond_kwargs.get("image_embeds")
932
+ aug_emb = self.add_embedding(image_embs)
933
+ elif self.config.addition_embed_type == "image_hint":
934
+ # Kandinsky 2.2 - style
935
+ if "image_embeds" not in added_cond_kwargs or "hint" not in added_cond_kwargs:
936
+ raise ValueError(
937
+ f"{self.__class__} has the config param `addition_embed_type` set to 'image_hint' which requires the keyword arguments `image_embeds` and `hint` to be passed in `added_cond_kwargs`"
938
+ )
939
+ image_embs = added_cond_kwargs.get("image_embeds")
940
+ hint = added_cond_kwargs.get("hint")
941
+ aug_emb, hint = self.add_embedding(image_embs, hint)
942
+ sample = torch.cat([sample, hint], dim=1)
943
+
944
+ emb = emb + aug_emb if aug_emb is not None else emb
945
+
946
+ if self.time_embed_act is not None:
947
+ emb = self.time_embed_act(emb)
948
+
949
+ if self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "text_proj":
950
+ encoder_hidden_states = self.encoder_hid_proj(encoder_hidden_states)
951
+ elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "text_image_proj":
952
+ # Kandinsky 2.1 - style
953
+ if "image_embeds" not in added_cond_kwargs:
954
+ raise ValueError(
955
+ f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'text_image_proj' which requires the keyword argument `image_embeds` to be passed in `added_conditions`"
956
+ )
957
+
958
+ image_embeds = added_cond_kwargs.get("image_embeds")
959
+ encoder_hidden_states = self.encoder_hid_proj(encoder_hidden_states, image_embeds)
960
+ elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "image_proj":
961
+ # Kandinsky 2.2 - style
962
+ if "image_embeds" not in added_cond_kwargs:
963
+ raise ValueError(
964
+ f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'image_proj' which requires the keyword argument `image_embeds` to be passed in `added_conditions`"
965
+ )
966
+ image_embeds = added_cond_kwargs.get("image_embeds")
967
+ encoder_hidden_states = self.encoder_hid_proj(image_embeds)
968
+ # 2. pre-process
969
+ sample = self.conv_in(sample)
970
+
971
+ # 3. down
972
+
973
+ is_controlnet = mid_block_additional_residual is not None and down_block_additional_residuals is not None
974
+ is_adapter = mid_block_additional_residual is None and down_block_additional_residuals is not None
975
+
976
+ down_block_res_samples = (sample,)
977
+ for downsample_block in self.down_blocks:
978
+ if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
979
+ # For t2i-adapter CrossAttnDownBlock2D
980
+ additional_residuals = {}
981
+ if is_adapter and len(down_block_additional_residuals) > 0:
982
+ additional_residuals["additional_residuals"] = down_block_additional_residuals.pop(0)
983
+
984
+ sample, res_samples = downsample_block(
985
+ hidden_states=sample,
986
+ temb=emb,
987
+ encoder_hidden_states=encoder_hidden_states,
988
+ attention_mask=attention_mask,
989
+ cross_attention_kwargs=cross_attention_kwargs,
990
+ encoder_attention_mask=encoder_attention_mask,
991
+ **additional_residuals,
992
+ )
993
+ else:
994
+ sample, res_samples = downsample_block(hidden_states=sample, temb=emb)
995
+
996
+ if is_adapter and len(down_block_additional_residuals) > 0:
997
+ sample += down_block_additional_residuals.pop(0)
998
+
999
+ down_block_res_samples += res_samples
1000
+
1001
+ if is_controlnet:
1002
+ new_down_block_res_samples = ()
1003
+
1004
+ for down_block_res_sample, down_block_additional_residual in zip(
1005
+ down_block_res_samples, down_block_additional_residuals
1006
+ ):
1007
+ down_block_res_sample = down_block_res_sample + down_block_additional_residual
1008
+ new_down_block_res_samples = new_down_block_res_samples + (down_block_res_sample,)
1009
+
1010
+ down_block_res_samples = new_down_block_res_samples
1011
+
1012
+ # 4. mid
1013
+ if self.mid_block is not None:
1014
+ sample = self.mid_block(
1015
+ sample,
1016
+ emb,
1017
+ encoder_hidden_states=encoder_hidden_states,
1018
+ attention_mask=attention_mask,
1019
+ cross_attention_kwargs=cross_attention_kwargs,
1020
+ encoder_attention_mask=encoder_attention_mask,
1021
+ )
1022
+
1023
+ if is_controlnet:
1024
+ sample = sample + mid_block_additional_residual
1025
+
1026
+ # 5. up
1027
+ for i, upsample_block in enumerate(self.up_blocks):
1028
+ is_final_block = i == len(self.up_blocks) - 1
1029
+
1030
+ res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
1031
+ down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)]
1032
+
1033
+ # if we have not reached the final block and need to forward the
1034
+ # upsample size, we do it here
1035
+ if not is_final_block and forward_upsample_size:
1036
+ upsample_size = down_block_res_samples[-1].shape[2:]
1037
+
1038
+ if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention:
1039
+ sample = upsample_block(
1040
+ hidden_states=sample,
1041
+ temb=emb,
1042
+ res_hidden_states_tuple=res_samples,
1043
+ encoder_hidden_states=encoder_hidden_states,
1044
+ cross_attention_kwargs=cross_attention_kwargs,
1045
+ upsample_size=upsample_size,
1046
+ attention_mask=attention_mask,
1047
+ encoder_attention_mask=encoder_attention_mask,
1048
+ )
1049
+ else:
1050
+ sample = upsample_block(
1051
+ hidden_states=sample, temb=emb, res_hidden_states_tuple=res_samples, upsample_size=upsample_size
1052
+ )
1053
+
1054
+ # 6. post-process
1055
+ if self.conv_norm_out:
1056
+ sample = self.conv_norm_out(sample)
1057
+ sample = self.conv_act(sample)
1058
+ sample = self.conv_out(sample)
1059
+
1060
+ if not return_dict:
1061
+ return (sample,)
1062
+
1063
+ return UNetMV2DConditionOutput(sample=sample)
1064
+
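A shape-only sketch of how this forward method is fed by the multiview pipeline below: one conditioning image, several views, and two domains (normal and color) stacked along the batch axis. The channel counts and embedding dimensions are assumptions consistent with the defaults used elsewhere in this commit, so the actual UNet call is left commented out.

```py
import torch

B, Nv, C_in, H, W = 1, 6, 8, 32, 32                       # 8 = 4 noisy + 4 conditioning latent channels (assumed)
sample = torch.randn(B * Nv * 2, C_in, H, W)               # both domains stacked along the batch axis
timestep = torch.tensor([999])
encoder_hidden_states = torch.randn(B * Nv * 2, 1, 768)    # one CLIP image token per view (assumed dim)
camera_class_labels = torch.randn(B * Nv * 2, 10)          # sin/cos camera embedding (see the pipeline below)
# noise_pred = unet(sample, timestep, encoder_hidden_states,
#                   class_labels=camera_class_labels).sample   # -> (B*Nv*2, 4, H, W)
```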
1065
+ @classmethod
1066
+ def from_pretrained_2d(
1067
+ cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]],
1068
+ camera_embedding_type: str, num_views: int, sample_size: int,
1069
+ zero_init_conv_in: bool = True, zero_init_camera_projection: bool = False,
1070
+ projection_class_embeddings_input_dim: int=6, cd_attention_last: bool = False,
1071
+ cd_attention_mid: bool = False, multiview_attention: bool = True,
1072
+ sparse_mv_attention: bool = False, mvcd_attention: bool = False,
1073
+ in_channels: int = 8, out_channels: int = 4,
1074
+ **kwargs
1075
+ ):
1076
+ r"""
1077
+ Instantiate a pretrained PyTorch model from a pretrained model configuration.
1078
+
1079
+ The model is set in evaluation mode - `model.eval()` - by default, and dropout modules are deactivated. To
1080
+ train the model, set it back in training mode with `model.train()`.
1081
+
1082
+ Parameters:
1083
+ pretrained_model_name_or_path (`str` or `os.PathLike`, *optional*):
1084
+ Can be either:
1085
+
1086
+ - A string, the *model id* (for example `google/ddpm-celebahq-256`) of a pretrained model hosted on
1087
+ the Hub.
1088
+ - A path to a *directory* (for example `./my_model_directory`) containing the model weights saved
1089
+ with [`~ModelMixin.save_pretrained`].
1090
+
1091
+ cache_dir (`Union[str, os.PathLike]`, *optional*):
1092
+ Path to a directory where a downloaded pretrained model configuration is cached if the standard cache
1093
+ is not used.
1094
+ torch_dtype (`str` or `torch.dtype`, *optional*):
1095
+ Override the default `torch.dtype` and load the model with another dtype. If `"auto"` is passed, the
1096
+ dtype is automatically derived from the model's weights.
1097
+ force_download (`bool`, *optional*, defaults to `False`):
1098
+ Whether or not to force the (re-)download of the model weights and configuration files, overriding the
1099
+ cached versions if they exist.
1100
+ resume_download (`bool`, *optional*, defaults to `False`):
1101
+ Whether or not to resume downloading the model weights and configuration files. If set to `False`, any
1102
+ incompletely downloaded files are deleted.
1103
+ proxies (`Dict[str, str]`, *optional*):
1104
+ A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
1105
+ 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
1106
+ output_loading_info (`bool`, *optional*, defaults to `False`):
1107
+ Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages.
1108
+ local_files_only (`bool`, *optional*, defaults to `False`):
1109
+ Whether to only load local model weights and configuration files or not. If set to `True`, the model
1110
+ won't be downloaded from the Hub.
1111
+ use_auth_token (`str` or *bool*, *optional*):
1112
+ The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from
1113
+ `diffusers-cli login` (stored in `~/.huggingface`) is used.
1114
+ revision (`str`, *optional*, defaults to `"main"`):
1115
+ The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier
1116
+ allowed by Git.
1117
+ from_flax (`bool`, *optional*, defaults to `False`):
1118
+ Load the model weights from a Flax checkpoint save file.
1119
+ subfolder (`str`, *optional*, defaults to `""`):
1120
+ The subfolder location of a model file within a larger model repository on the Hub or locally.
1121
+ mirror (`str`, *optional*):
1122
+ Mirror source to resolve accessibility issues if you're downloading a model in China. We do not
1123
+ guarantee the timeliness or safety of the source, and you should refer to the mirror site for more
1124
+ information.
1125
+ device_map (`str` or `Dict[str, Union[int, str, torch.device]]`, *optional*):
1126
+ A map that specifies where each submodule should go. It doesn't need to be defined for each
1127
+ parameter/buffer name; once a given module name is inside, every submodule of it will be sent to the
1128
+ same device.
1129
+
1130
+ Set `device_map="auto"` to have 🤗 Accelerate automatically compute the most optimized `device_map`. For
1131
+ more information about each option see [designing a device
1132
+ map](https://hf.co/docs/accelerate/main/en/usage_guides/big_modeling#designing-a-device-map).
1133
+ max_memory (`Dict`, *optional*):
1134
+ A dictionary mapping each device identifier to its maximum memory. Will default to the maximum memory available for
1135
+ each GPU and the available CPU RAM if unset.
1136
+ offload_folder (`str` or `os.PathLike`, *optional*):
1137
+ The path to offload weights if `device_map` contains the value `"disk"`.
1138
+ offload_state_dict (`bool`, *optional*):
1139
+ If `True`, temporarily offloads the CPU state dict to the hard drive to avoid running out of CPU RAM if
1140
+ the weight of the CPU state dict + the biggest shard of the checkpoint does not fit. Defaults to `True`
1141
+ when there is some disk offload.
1142
+ low_cpu_mem_usage (`bool`, *optional*, defaults to `True` if torch version >= 1.9.0 else `False`):
1143
+ Speed up model loading by only loading the pretrained weights and not initializing them. This also
1144
+ tries to not use more than 1x model size in CPU memory (including peak memory) while loading the model.
1145
+ Only supported for PyTorch >= 1.9.0. If you are using an older version of PyTorch, setting this
1146
+ argument to `True` will raise an error.
1147
+ variant (`str`, *optional*):
1148
+ Load weights from a specified `variant` filename such as `"fp16"` or `"ema"`. This is ignored when
1149
+ loading `from_flax`.
1150
+ use_safetensors (`bool`, *optional*, defaults to `None`):
1151
+ If set to `None`, the `safetensors` weights are downloaded if they're available **and** if the
1152
+ `safetensors` library is installed. If set to `True`, the model is forcibly loaded from `safetensors`
1153
+ weights. If set to `False`, `safetensors` weights are not loaded.
1154
+
1155
+ <Tip>
1156
+
1157
+ To use private or [gated models](https://huggingface.co/docs/hub/models-gated#gated-models), log-in with
1158
+ `huggingface-cli login`. You can also activate the special
1159
+ ["offline-mode"](https://huggingface.co/diffusers/installation.html#offline-mode) to use this method in a
1160
+ firewalled environment.
1161
+
1162
+ </Tip>
1163
+
1164
+ Example:
1165
+
1166
+ ```py
1167
+ from diffusers import UNet2DConditionModel
1168
+
1169
+ unet = UNet2DConditionModel.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="unet")
1170
+ ```
1171
+
1172
+ If you get the error message below, you need to finetune the weights for your downstream task:
1173
+
1174
+ ```bash
1175
+ Some weights of UNet2DConditionModel were not initialized from the model checkpoint at runwayml/stable-diffusion-v1-5 and are newly initialized because the shapes did not match:
1176
+ - conv_in.weight: found shape torch.Size([320, 4, 3, 3]) in the checkpoint and torch.Size([320, 9, 3, 3]) in the model instantiated
1177
+ You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
1178
+ ```
1179
+ """
1180
+ cache_dir = kwargs.pop("cache_dir", DIFFUSERS_CACHE)
1181
+ ignore_mismatched_sizes = kwargs.pop("ignore_mismatched_sizes", False)
1182
+ force_download = kwargs.pop("force_download", False)
1183
+ from_flax = kwargs.pop("from_flax", False)
1184
+ resume_download = kwargs.pop("resume_download", False)
1185
+ proxies = kwargs.pop("proxies", None)
1186
+ output_loading_info = kwargs.pop("output_loading_info", False)
1187
+ local_files_only = kwargs.pop("local_files_only", HF_HUB_OFFLINE)
1188
+ use_auth_token = kwargs.pop("use_auth_token", None)
1189
+ revision = kwargs.pop("revision", None)
1190
+ torch_dtype = kwargs.pop("torch_dtype", None)
1191
+ subfolder = kwargs.pop("subfolder", None)
1192
+ device_map = kwargs.pop("device_map", None)
1193
+ max_memory = kwargs.pop("max_memory", None)
1194
+ offload_folder = kwargs.pop("offload_folder", None)
1195
+ offload_state_dict = kwargs.pop("offload_state_dict", False)
1196
+ variant = kwargs.pop("variant", None)
1197
+ use_safetensors = kwargs.pop("use_safetensors", None)
1198
+
1199
+ if use_safetensors and not is_safetensors_available():
1200
+ raise ValueError(
1201
+ "`use_safetensors`=True but safetensors is not installed. Please install safetensors with `pip install safetensors"
1202
+ )
1203
+
1204
+ allow_pickle = False
1205
+ if use_safetensors is None:
1206
+ use_safetensors = is_safetensors_available()
1207
+ allow_pickle = True
1208
+
1209
+ if device_map is not None and not is_accelerate_available():
1210
+ raise NotImplementedError(
1211
+ "Loading and dispatching requires `accelerate`. Please make sure to install accelerate or set"
1212
+ " `device_map=None`. You can install accelerate with `pip install accelerate`."
1213
+ )
1214
+
1215
+ # Check if we can handle device_map and dispatching the weights
1216
+ if device_map is not None and not is_torch_version(">=", "1.9.0"):
1217
+ raise NotImplementedError(
1218
+ "Loading and dispatching requires torch >= 1.9.0. Please either update your PyTorch version or set"
1219
+ " `device_map=None`."
1220
+ )
1221
+
1222
+ # Load config if we don't provide a configuration
1223
+ config_path = pretrained_model_name_or_path
1224
+
1225
+ user_agent = {
1226
+ "diffusers": __version__,
1227
+ "file_type": "model",
1228
+ "framework": "pytorch",
1229
+ }
1230
+
1231
+ # load config
1232
+ config, unused_kwargs, commit_hash = cls.load_config(
1233
+ config_path,
1234
+ cache_dir=cache_dir,
1235
+ return_unused_kwargs=True,
1236
+ return_commit_hash=True,
1237
+ force_download=force_download,
1238
+ resume_download=resume_download,
1239
+ proxies=proxies,
1240
+ local_files_only=local_files_only,
1241
+ use_auth_token=use_auth_token,
1242
+ revision=revision,
1243
+ subfolder=subfolder,
1244
+ device_map=device_map,
1245
+ max_memory=max_memory,
1246
+ offload_folder=offload_folder,
1247
+ offload_state_dict=offload_state_dict,
1248
+ user_agent=user_agent,
1249
+ **kwargs,
1250
+ )
1251
+
1252
+ # modify config
1253
+ config["_class_name"] = cls.__name__
1254
+ config['in_channels'] = in_channels
1255
+ config['out_channels'] = out_channels
1256
+ config['sample_size'] = sample_size # training resolution
1257
+ config['num_views'] = num_views
1258
+ config['cd_attention_last'] = cd_attention_last
1259
+ config['cd_attention_mid'] = cd_attention_mid
1260
+ config['multiview_attention'] = multiview_attention
1261
+ config['sparse_mv_attention'] = sparse_mv_attention
1262
+ config['mvcd_attention'] = mvcd_attention
1263
+ config["down_block_types"] = [
1264
+ "CrossAttnDownBlockMV2D",
1265
+ "CrossAttnDownBlockMV2D",
1266
+ "CrossAttnDownBlockMV2D",
1267
+ "DownBlock2D"
1268
+ ]
1269
+ config['mid_block_type'] = "UNetMidBlockMV2DCrossAttn"
1270
+ config["up_block_types"] = [
1271
+ "UpBlock2D",
1272
+ "CrossAttnUpBlockMV2D",
1273
+ "CrossAttnUpBlockMV2D",
1274
+ "CrossAttnUpBlockMV2D"
1275
+ ]
1276
+ config['class_embed_type'] = 'projection'
1277
+ if camera_embedding_type == 'e_de_da_sincos':
1278
+ config['projection_class_embeddings_input_dim'] = projection_class_embeddings_input_dim # default 6
1279
+ else:
1280
+ raise NotImplementedError
1281
+
1282
+ # load model
1283
+ model_file = None
1284
+ if from_flax:
1285
+ raise NotImplementedError
1286
+ else:
1287
+ if use_safetensors:
1288
+ try:
1289
+ model_file = _get_model_file(
1290
+ pretrained_model_name_or_path,
1291
+ weights_name=_add_variant(SAFETENSORS_WEIGHTS_NAME, variant),
1292
+ cache_dir=cache_dir,
1293
+ force_download=force_download,
1294
+ resume_download=resume_download,
1295
+ proxies=proxies,
1296
+ local_files_only=local_files_only,
1297
+ use_auth_token=use_auth_token,
1298
+ revision=revision,
1299
+ subfolder=subfolder,
1300
+ user_agent=user_agent,
1301
+ commit_hash=commit_hash,
1302
+ )
1303
+ except IOError as e:
1304
+ if not allow_pickle:
1305
+ raise e
1306
+ pass
1307
+ if model_file is None:
1308
+ model_file = _get_model_file(
1309
+ pretrained_model_name_or_path,
1310
+ weights_name=_add_variant(WEIGHTS_NAME, variant),
1311
+ cache_dir=cache_dir,
1312
+ force_download=force_download,
1313
+ resume_download=resume_download,
1314
+ proxies=proxies,
1315
+ local_files_only=local_files_only,
1316
+ use_auth_token=use_auth_token,
1317
+ revision=revision,
1318
+ subfolder=subfolder,
1319
+ user_agent=user_agent,
1320
+ commit_hash=commit_hash,
1321
+ )
1322
+
1323
+ model = cls.from_config(config, **unused_kwargs)
1324
+ import copy
1325
+ state_dict_v0 = load_state_dict(model_file, variant=variant)
1326
+ state_dict = copy.deepcopy(state_dict_v0)
1327
+ # attn_joint -> attn_joint_last; norm_joint -> norm_joint_last
1328
+ # attn_joint_twice -> attn_joint_mid; norm_joint_twice -> norm_joint_mid
1329
+ for key in state_dict_v0:
1330
+ if 'attn_joint.' in key:
1331
+ tmp = copy.deepcopy(key)
1332
+ state_dict[key.replace("attn_joint.", "attn_joint_last.")] = state_dict.pop(tmp)
1333
+ if 'norm_joint.' in key:
1334
+ tmp = copy.deepcopy(key)
1335
+ state_dict[key.replace("norm_joint.", "norm_joint_last.")] = state_dict.pop(tmp)
1336
+ if 'attn_joint_twice.' in key:
1337
+ tmp = copy.deepcopy(key)
1338
+ state_dict[key.replace("attn_joint_twice.", "attn_joint_mid.")] = state_dict.pop(tmp)
1339
+ if 'norm_joint_twice.' in key:
1340
+ tmp = copy.deepcopy(key)
1341
+ state_dict[key.replace("norm_joint_twice.", "norm_joint_mid.")] = state_dict.pop(tmp)
1342
+
1343
+ model._convert_deprecated_attention_blocks(state_dict)
1344
+
1345
+ conv_in_weight = state_dict['conv_in.weight']
1346
+ conv_out_weight = state_dict['conv_out.weight']
1347
+ model, missing_keys, unexpected_keys, mismatched_keys, error_msgs = cls._load_pretrained_model_2d(
1348
+ model,
1349
+ state_dict,
1350
+ model_file,
1351
+ pretrained_model_name_or_path,
1352
+ ignore_mismatched_sizes=True,
1353
+ )
1354
+ if any([key == 'conv_in.weight' for key, _, _ in mismatched_keys]):
1355
+ # initialize from the original SD structure
1356
+ model.conv_in.weight.data[:,:4] = conv_in_weight
1357
+
1358
+ # whether to place all zero to new layers?
1359
+ if zero_init_conv_in:
1360
+ model.conv_in.weight.data[:,4:] = 0.
1361
+
1362
+ if any([key == 'conv_out.weight' for key, _, _ in mismatched_keys]):
1363
+ # initialize from the original SD structure
1364
+ model.conv_out.weight.data[:,:4] = conv_out_weight
1365
+ if out_channels == 8: # copy for the last 4 channels
1366
+ model.conv_out.weight.data[:, 4:] = conv_out_weight
1367
+
1368
+ # if zero_init_camera_projection:
1369
+ # for p in model.class_embedding.parameters():
1370
+ # torch.nn.init.zeros_(p)
1371
+
1372
+ loading_info = {
1373
+ "missing_keys": missing_keys,
1374
+ "unexpected_keys": unexpected_keys,
1375
+ "mismatched_keys": mismatched_keys,
1376
+ "error_msgs": error_msgs,
1377
+ }
1378
+
1379
+ if torch_dtype is not None and not isinstance(torch_dtype, torch.dtype):
1380
+ raise ValueError(
1381
+ f"{torch_dtype} needs to be of type `torch.dtype`, e.g. `torch.float16`, but is {type(torch_dtype)}."
1382
+ )
1383
+ elif torch_dtype is not None:
1384
+ model = model.to(torch_dtype)
1385
+
1386
+ model.register_to_config(_name_or_path=pretrained_model_name_or_path)
1387
+
1388
+ # Set model in evaluation mode to deactivate DropOut modules by default
1389
+ model.eval()
1390
+ if output_loading_info:
1391
+ return model, loading_info
1392
+
1393
+ return model
1394
+
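A minimal usage sketch of `from_pretrained_2d`, assuming the class is named `UNetMV2DConditionModel` (as the output dataclass above suggests) and that the base weights come from an SD image-variations checkpoint; the concrete values are illustrative, not verified:

```py
from mv_diffusion_30.models.unet_mv2d_condition import UNetMV2DConditionModel

unet = UNetMV2DConditionModel.from_pretrained_2d(
    "lambdalabs/sd-image-variations-diffusers",
    subfolder="unet",
    camera_embedding_type="e_de_da_sincos",
    num_views=6,
    sample_size=32,                            # assumed latent resolution for 256x256 inputs
    projection_class_embeddings_input_dim=10,  # sin/cos of a 5-dim camera embedding
    in_channels=8,                             # 4 noisy + 4 conditioning latent channels
    out_channels=4,
    # cd_attention_mid=True,                   # cross-domain attention placement is config-dependent
)
```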
1395
+ @classmethod
1396
+ def _load_pretrained_model_2d(
1397
+ cls,
1398
+ model,
1399
+ state_dict,
1400
+ resolved_archive_file,
1401
+ pretrained_model_name_or_path,
1402
+ ignore_mismatched_sizes=False,
1403
+ ):
1404
+ # Retrieve missing & unexpected_keys
1405
+ model_state_dict = model.state_dict()
1406
+ loaded_keys = list(state_dict.keys())
1407
+
1408
+ expected_keys = list(model_state_dict.keys())
1409
+
1410
+ original_loaded_keys = loaded_keys
1411
+
1412
+ missing_keys = list(set(expected_keys) - set(loaded_keys))
1413
+ unexpected_keys = list(set(loaded_keys) - set(expected_keys))
1414
+
1415
+ # Make sure we are able to load base models as well as derived models (with heads)
1416
+ model_to_load = model
1417
+
1418
+ def _find_mismatched_keys(
1419
+ state_dict,
1420
+ model_state_dict,
1421
+ loaded_keys,
1422
+ ignore_mismatched_sizes,
1423
+ ):
1424
+ mismatched_keys = []
1425
+ if ignore_mismatched_sizes:
1426
+ for checkpoint_key in loaded_keys:
1427
+ model_key = checkpoint_key
1428
+
1429
+ if (
1430
+ model_key in model_state_dict
1431
+ and state_dict[checkpoint_key].shape != model_state_dict[model_key].shape
1432
+ ):
1433
+ mismatched_keys.append(
1434
+ (checkpoint_key, state_dict[checkpoint_key].shape, model_state_dict[model_key].shape)
1435
+ )
1436
+ del state_dict[checkpoint_key]
1437
+ return mismatched_keys
1438
+
1439
+ if state_dict is not None:
1440
+ # Whole checkpoint
1441
+ mismatched_keys = _find_mismatched_keys(
1442
+ state_dict,
1443
+ model_state_dict,
1444
+ original_loaded_keys,
1445
+ ignore_mismatched_sizes,
1446
+ )
1447
+ error_msgs = _load_state_dict_into_model(model_to_load, state_dict)
1448
+
1449
+ if len(error_msgs) > 0:
1450
+ error_msg = "\n\t".join(error_msgs)
1451
+ if "size mismatch" in error_msg:
1452
+ error_msg += (
1453
+ "\n\tYou may consider adding `ignore_mismatched_sizes=True` in the model `from_pretrained` method."
1454
+ )
1455
+ raise RuntimeError(f"Error(s) in loading state_dict for {model.__class__.__name__}:\n\t{error_msg}")
1456
+
1457
+ if len(unexpected_keys) > 0:
1458
+ logger.warning(
1459
+ f"Some weights of the model checkpoint at {pretrained_model_name_or_path} were not used when"
1460
+ f" initializing {model.__class__.__name__}: {unexpected_keys}\n- This IS expected if you are"
1461
+ f" initializing {model.__class__.__name__} from the checkpoint of a model trained on another task"
1462
+ " or with another architecture (e.g. initializing a BertForSequenceClassification model from a"
1463
+ " BertForPreTraining model).\n- This IS NOT expected if you are initializing"
1464
+ f" {model.__class__.__name__} from the checkpoint of a model that you expect to be exactly"
1465
+ " identical (initializing a BertForSequenceClassification model from a"
1466
+ " BertForSequenceClassification model)."
1467
+ )
1468
+ else:
1469
+ logger.info(f"All model checkpoint weights were used when initializing {model.__class__.__name__}.\n")
1470
+ if len(missing_keys) > 0:
1471
+ logger.warning(
1472
+ f"Some weights of {model.__class__.__name__} were not initialized from the model checkpoint at"
1473
+ f" {pretrained_model_name_or_path} and are newly initialized: {missing_keys}\nYou should probably"
1474
+ " TRAIN this model on a down-stream task to be able to use it for predictions and inference."
1475
+ )
1476
+ elif len(mismatched_keys) == 0:
1477
+ logger.info(
1478
+ f"All the weights of {model.__class__.__name__} were initialized from the model checkpoint at"
1479
+ f" {pretrained_model_name_or_path}.\nIf your task is similar to the task the model of the"
1480
+ f" checkpoint was trained on, you can already use {model.__class__.__name__} for predictions"
1481
+ " without further training."
1482
+ )
1483
+ if len(mismatched_keys) > 0:
1484
+ mismatched_warning = "\n".join(
1485
+ [
1486
+ f"- {key}: found shape {shape1} in the checkpoint and {shape2} in the model instantiated"
1487
+ for key, shape1, shape2 in mismatched_keys
1488
+ ]
1489
+ )
1490
+ logger.warning(
1491
+ f"Some weights of {model.__class__.__name__} were not initialized from the model checkpoint at"
1492
+ f" {pretrained_model_name_or_path} and are newly initialized because the shapes did not"
1493
+ f" match:\n{mismatched_warning}\nYou should probably TRAIN this model on a down-stream task to be"
1494
+ " able to use it for predictions and inference."
1495
+ )
1496
+
1497
+ return model, missing_keys, unexpected_keys, mismatched_keys, error_msgs
1498
+
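For reference, a standalone illustration of the checkpoint key remapping performed in `from_pretrained_2d` above (older checkpoints use `attn_joint` / `norm_joint` / `*_twice` names, which are rewritten to the `*_last` / `*_mid` layout expected by this model):

```py
renames = {
    "attn_joint.": "attn_joint_last.",
    "norm_joint.": "norm_joint_last.",
    "attn_joint_twice.": "attn_joint_mid.",
    "norm_joint_twice.": "norm_joint_mid.",
}

def remap_keys(state_dict):
    remapped = {}
    for key, value in state_dict.items():
        for old, new in renames.items():
            if old in key:
                key = key.replace(old, new)
        remapped[key] = value
    return remapped

print(remap_keys({"mid_block.attn_joint_twice.to_q.weight": None}))
# {'mid_block.attn_joint_mid.to_q.weight': None}
```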
mv_diffusion_30/pipelines/pipeline_mvdiffusion_image.py ADDED
@@ -0,0 +1,555 @@
1
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import inspect
16
+ import warnings
17
+ from typing import Callable, List, Optional, Union
18
+
19
+ import PIL
20
+ import torch
21
+ import torchvision.transforms.functional as TF
22
+ from packaging import version
23
+ from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection
24
+
25
+ from diffusers.configuration_utils import FrozenDict
26
+ from diffusers.image_processor import VaeImageProcessor
27
+ from diffusers.models import AutoencoderKL, UNet2DConditionModel
28
+ from diffusers.schedulers import KarrasDiffusionSchedulers
29
+ from diffusers.utils.torch_utils import logging, randn_tensor
30
+ from diffusers.utils.deprecation_utils import deprecate
31
+ from diffusers.pipelines.pipeline_utils import DiffusionPipeline
32
+ from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
33
+ from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
34
+ from einops import rearrange, repeat
35
+
36
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
37
+
38
+
39
+ class MVDiffusionImagePipeline(DiffusionPipeline):
40
+ r"""
41
+ Pipeline to generate image variations from an input image using Stable Diffusion.
42
+
43
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
44
+ implemented for all pipelines (downloading, saving, running on a particular device, etc.).
45
+
46
+ Args:
47
+ vae ([`AutoencoderKL`]):
48
+ Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
49
+ image_encoder ([`~transformers.CLIPVisionModelWithProjection`]):
50
+ Frozen CLIP image-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
51
+ text_encoder ([`~transformers.CLIPTextModel`]):
52
+ Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
53
+ tokenizer ([`~transformers.CLIPTokenizer`]):
54
+ A `CLIPTokenizer` to tokenize text.
55
+ unet ([`UNet2DConditionModel`]):
56
+ A `UNet2DConditionModel` to denoise the encoded image latents.
57
+ scheduler ([`SchedulerMixin`]):
58
+ A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
59
+ [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
60
+ safety_checker ([`StableDiffusionSafetyChecker`]):
61
+ Classification module that estimates whether generated images could be considered offensive or harmful.
62
+ Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details
63
+ about a model's potential harms.
64
+ feature_extractor ([`~transformers.CLIPImageProcessor`]):
65
+ A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
66
+ """
67
+ # TODO: feature_extractor is required to encode images (if they are in PIL format),
68
+ # we should give a descriptive message if the pipeline doesn't have one.
69
+ _optional_components = ["safety_checker"]
70
+
71
+ def __init__(
72
+ self,
73
+ vae: AutoencoderKL,
74
+ image_encoder: CLIPVisionModelWithProjection,
75
+ unet: UNet2DConditionModel,
76
+ scheduler: KarrasDiffusionSchedulers,
77
+ safety_checker: StableDiffusionSafetyChecker,
78
+ feature_extractor: CLIPImageProcessor,
79
+ requires_safety_checker: bool = True,
80
+ camera_embedding_type: str = 'e_de_da_sincos',
81
+ num_views: int = 6,
82
+ pred_type: str = 'color',
83
+ ):
84
+ super().__init__()
85
+
86
+ if safety_checker is None and requires_safety_checker:
87
+ logger.warning(
88
+ f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
89
+ " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
90
+ " results in services or applications open to the public. Both the diffusers team and Hugging Face"
91
+ " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
92
+ " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
93
+ " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
94
+ )
95
+
96
+ if safety_checker is not None and feature_extractor is None:
97
+ raise ValueError(
98
+ "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
99
+ " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
100
+ )
101
+
102
+ is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse(
103
+ version.parse(unet.config._diffusers_version).base_version
104
+ ) < version.parse("0.9.0.dev0")
105
+ is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
106
+ if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
107
+ deprecation_message = (
108
+ "The configuration file of the unet has set the default `sample_size` to smaller than"
109
+ " 64 which seems highly unlikely .If you're checkpoint is a fine-tuned version of any of the"
110
+ " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-"
111
+ " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5"
112
+ " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the"
113
+ " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
114
+ " in the config might lead to incorrect results in future versions. If you have downloaded this"
115
+ " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
116
+ " the `unet/config.json` file"
117
+ )
118
+ deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False)
119
+ new_config = dict(unet.config)
120
+ new_config["sample_size"] = 64
121
+ unet._internal_dict = FrozenDict(new_config)
122
+
123
+ self.register_modules(
124
+ vae=vae,
125
+ image_encoder=image_encoder,
126
+ unet=unet,
127
+ scheduler=scheduler,
128
+ safety_checker=safety_checker,
129
+ feature_extractor=feature_extractor,
130
+ )
131
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
132
+ self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
133
+ self.register_to_config(requires_safety_checker=requires_safety_checker)
134
+
135
+ self.camera_embedding_type: str = camera_embedding_type
136
+ self.num_views: int = num_views
137
+ self.pred_type = pred_type
138
+
139
+ self.camera_embedding = torch.tensor(
140
+ [[ 0.0000, 0.0000, 0.0000, 1.0000, 0.0000],
141
+ [ 0.0000, -0.2362, 0.8125, 1.0000, 0.0000],
142
+ [ 0.0000, -0.1686, 1.6934, 1.0000, 0.0000],
143
+ [ 0.0000, 0.5220, 3.1406, 1.0000, 0.0000],
144
+ [ 0.0000, 0.6904, 4.8359, 1.0000, 0.0000],
145
+ [ 0.0000, 0.3733, 5.5859, 1.0000, 0.0000],
146
+ [ 0.0000, 0.0000, 0.0000, 0.0000, 1.0000],
147
+ [ 0.0000, -0.2362, 0.8125, 0.0000, 1.0000],
148
+ [ 0.0000, -0.1686, 1.6934, 0.0000, 1.0000],
149
+ [ 0.0000, 0.5220, 3.1406, 0.0000, 1.0000],
150
+ [ 0.0000, 0.6904, 4.8359, 0.0000, 1.0000],
151
+ [ 0.0000, 0.3733, 5.5859, 0.0000, 1.0000]], dtype=torch.float16)
152
+
153
+ def _encode_image(self, image_pil, device, num_images_per_prompt, do_classifier_free_guidance):
154
+ dtype = next(self.image_encoder.parameters()).dtype
155
+
156
+ image_pt = self.feature_extractor(images=image_pil, return_tensors="pt").pixel_values
157
+ image_pt = image_pt.to(device=device, dtype=dtype)
158
+ image_embeddings = self.image_encoder(image_pt).image_embeds
159
+ image_embeddings = image_embeddings.unsqueeze(1)
160
+
161
+ # duplicate image embeddings for each generation per prompt, using mps friendly method
162
+ # Note: repeat differently from official pipelines
163
+ # B1B2B3B4 -> B1B2B3B4B1B2B3B4
164
+ bs_embed, seq_len, _ = image_embeddings.shape
165
+ image_embeddings = image_embeddings.repeat(num_images_per_prompt, 1, 1)
166
+
167
+ if do_classifier_free_guidance:
168
+ negative_prompt_embeds = torch.zeros_like(image_embeddings)
169
+
170
+ # For classifier free guidance, we need to do two forward passes.
171
+ # Here we concatenate the unconditional and text embeddings into a single batch
172
+ # to avoid doing two forward passes
173
+ image_embeddings = torch.cat([negative_prompt_embeds, image_embeddings])
174
+
175
+ image_pt = torch.stack([TF.to_tensor(img) for img in image_pil], dim=0).to(device).to(dtype)
176
+ image_pt = image_pt * 2.0 - 1.0
177
+ image_latents = self.vae.encode(image_pt).latent_dist.mode() * self.vae.config.scaling_factor
178
+ # Note: repeat differently from official pipelines
179
+ # B1B2B3B4 -> B1B2B3B4B1B2B3B4
180
+ image_latents = image_latents.repeat(num_images_per_prompt, 1, 1, 1)
181
+
182
+ if do_classifier_free_guidance:
183
+ image_latents = torch.cat([torch.zeros_like(image_latents), image_latents])
184
+
185
+ return image_embeddings, image_latents
186
+
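The "B1B2B3B4 -> B1B2B3B4B1B2B3B4" note above means the whole view batch is tiled rather than each element being repeated in place. A tiny standalone comparison:

```py
import torch

x = torch.tensor([1, 2, 3, 4])
print(x.repeat(2))               # tensor([1, 2, 3, 4, 1, 2, 3, 4])  (this pipeline)
print(x.repeat_interleave(2))    # tensor([1, 1, 2, 2, 3, 3, 4, 4])  (per-element repetition)
```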
187
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
188
+ def run_safety_checker(self, image, device, dtype):
189
+ if self.safety_checker is None:
190
+ has_nsfw_concept = None
191
+ else:
192
+ if torch.is_tensor(image):
193
+ feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
194
+ else:
195
+ feature_extractor_input = self.image_processor.numpy_to_pil(image)
196
+ safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
197
+ image, has_nsfw_concept = self.safety_checker(
198
+ images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
199
+ )
200
+ return image, has_nsfw_concept
201
+
202
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
203
+ def decode_latents(self, latents):
204
+ warnings.warn(
205
+ "The decode_latents method is deprecated and will be removed in a future version. Please"
206
+ " use VaeImageProcessor instead",
207
+ FutureWarning,
208
+ )
209
+ latents = 1 / self.vae.config.scaling_factor * latents
210
+ image = self.vae.decode(latents, return_dict=False)[0]
211
+ image = (image / 2 + 0.5).clamp(0, 1)
212
+ # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
213
+ image = image.cpu().permute(0, 2, 3, 1).float().numpy()
214
+ return image
215
+
216
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
217
+ def prepare_extra_step_kwargs(self, generator, eta):
218
+ # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
219
+ # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
220
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
221
+ # and should be between [0, 1]
222
+
223
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
224
+ extra_step_kwargs = {}
225
+ if accepts_eta:
226
+ extra_step_kwargs["eta"] = eta
227
+
228
+ # check if the scheduler accepts generator
229
+ accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
230
+ if accepts_generator:
231
+ extra_step_kwargs["generator"] = generator
232
+ return extra_step_kwargs
233
+
234
+ def check_inputs(self, image, height, width, callback_steps):
235
+ if (
236
+ not isinstance(image, torch.Tensor)
237
+ and not isinstance(image, PIL.Image.Image)
238
+ and not isinstance(image, list)
239
+ ):
240
+ raise ValueError(
241
+ "`image` has to be of type `torch.FloatTensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is"
242
+ f" {type(image)}"
243
+ )
244
+
245
+ if height % 8 != 0 or width % 8 != 0:
246
+ raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
247
+
248
+ if (callback_steps is None) or (
249
+ callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
250
+ ):
251
+ raise ValueError(
252
+ f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
253
+ f" {type(callback_steps)}."
254
+ )
255
+
256
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
257
+ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None, cross_domain_latents=True):
258
+ if cross_domain_latents:
259
+ # generate cross-domain initial latents
260
+ # for the cross-domain task, make sure the two domains start from the same initial latents
261
+ assert batch_size % 2 == 0
262
+ batch_size = batch_size // 2
263
+ shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
264
+ if isinstance(generator, list) and len(generator) != batch_size:
265
+ raise ValueError(
266
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
267
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
268
+ )
269
+
270
+ if latents is None:
271
+ latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
272
+ else:
273
+ latents = latents.to(device)
274
+
275
+ # scale the initial noise by the standard deviation required by the scheduler
276
+ latents = latents * self.scheduler.init_noise_sigma
277
+ if cross_domain_latents:
278
+ latents = torch.cat([latents] * 2)
279
+ return latents
280
+
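A minimal sketch of the cross-domain initialisation above: half of the batch is sampled once and then duplicated, so the normal and color domains start denoising from identical noise (scaling by `scheduler.init_noise_sigma` is omitted here):

```py
import torch

generator = torch.Generator().manual_seed(33)
half = torch.randn((6, 4, 32, 32), generator=generator)   # one latent per view
latents = torch.cat([half] * 2)                           # normal domain + color domain
assert torch.equal(latents[:6], latents[6:])
```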
281
+ def prepare_camera_embedding(self, camera_embedding: Union[float, torch.Tensor], do_classifier_free_guidance, num_images_per_prompt=1):
282
+ # (B, 3)
283
+ camera_embedding = camera_embedding.to(dtype=self.unet.dtype, device=self.unet.device)
284
+
285
+ if self.camera_embedding_type == 'e_de_da_sincos':
286
+ # (B, 6)
287
+ camera_embedding = torch.cat([
288
+ torch.sin(camera_embedding),
289
+ torch.cos(camera_embedding)
290
+ ], dim=-1)
291
+ assert self.unet.config.class_embed_type == 'projection'
292
+ assert self.unet.config.projection_class_embeddings_input_dim == 14 or self.unet.config.projection_class_embeddings_input_dim == 10
293
+ else:
294
+ raise NotImplementedError
295
+
296
+ # Note: repeat differently from official pipelines
297
+ # B1B2B3B4 -> B1B2B3B4B1B2B3B4
298
+ camera_embedding = camera_embedding.repeat(num_images_per_prompt, 1)
299
+
300
+ if do_classifier_free_guidance:
301
+ camera_embedding = torch.cat([
302
+ camera_embedding,
303
+ camera_embedding
304
+ ], dim=0)
305
+
306
+ return camera_embedding
307
+
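For clarity, `'e_de_da_sincos'` just concatenates the element-wise sine and cosine of the raw camera parameters, doubling the feature dimension (5 -> 10 here, matching the assertion above):

```py
import torch

cam = torch.tensor([[0.0, -0.2362, 0.8125, 1.0, 0.0]])    # one row of self.camera_embedding
emb = torch.cat([torch.sin(cam), torch.cos(cam)], dim=-1)
print(emb.shape)                                           # torch.Size([1, 10])
```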
308
+ def reshape_to_cd_input(self, input):
309
+ # reshape input for cross-domain attention
310
+ input_norm_uc, input_rgb_uc, input_norm_cond, input_rgb_cond = torch.chunk(
311
+ input, dim=0, chunks=4)
312
+ input = torch.cat(
313
+ [input_norm_uc, input_norm_cond, input_rgb_uc, input_rgb_cond], dim=0)
314
+ return input
315
+
316
+ def reshape_to_cfg_output(self, output):
317
+ # reshape output back to classifier-free guidance (CFG) ordering
318
+ output_norm_uc, output_norm_cond, output_rgb_uc, output_rgb_cond = torch.chunk(
319
+ output, dim=0, chunks=4)
320
+ output = torch.cat(
321
+ [output_norm_uc, output_rgb_uc, output_norm_cond, output_rgb_cond],
322
+ dim=0)
323
+ return output
324
+
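Both reshape helpers apply the same permutation (they swap the two middle chunks of the batch), so running one after the other restores the original ordering. A standalone sanity check:

```py
import torch

def swap_middle_chunks(x):
    a, b, c, d = torch.chunk(x, chunks=4, dim=0)
    return torch.cat([a, c, b, d], dim=0)

x = torch.arange(8).reshape(4, 2)
assert torch.equal(swap_middle_chunks(swap_middle_chunks(x)), x)
```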
325
+ @torch.no_grad()
326
+ def __call__(
327
+ self,
328
+ image: Union[List[PIL.Image.Image], torch.FloatTensor],
329
+ # elevation_cond: torch.FloatTensor,
330
+ # elevation: torch.FloatTensor,
331
+ # azimuth: torch.FloatTensor,
332
+ camera_embedding: Optional[torch.FloatTensor]=None,
333
+ height: Optional[int] = None,
334
+ width: Optional[int] = None,
335
+ num_inference_steps: int = 50,
336
+ guidance_scale: float = 7.5,
337
+ num_images_per_prompt: Optional[int] = 1,
338
+ eta: float = 0.0,
339
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
340
+ latents: Optional[torch.FloatTensor] = None,
341
+ output_type: Optional[str] = "pil",
342
+ return_dict: bool = True,
343
+ callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
344
+ callback_steps: int = 1,
345
+ normal_cond: Optional[Union[List[PIL.Image.Image], torch.FloatTensor]] = None,
346
+ ):
347
+ r"""
348
+ The call function to the pipeline for generation.
349
+
350
+ Args:
351
+ image (`PIL.Image.Image` or `List[PIL.Image.Image]` or `torch.FloatTensor`):
352
+ Image or images to guide image generation. If you provide a tensor, it needs to be compatible with
353
+ [`CLIPImageProcessor`](https://huggingface.co/lambdalabs/sd-image-variations-diffusers/blob/main/feature_extractor/preprocessor_config.json).
354
+ height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
355
+ The height in pixels of the generated image.
356
+ width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
357
+ The width in pixels of the generated image.
358
+ num_inference_steps (`int`, *optional*, defaults to 50):
359
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
360
+ expense of slower inference.
361
+ guidance_scale (`float`, *optional*, defaults to 7.5):
362
+ A higher guidance scale value encourages the model to generate images closely linked to the conditioning
363
+ `image` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
364
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
365
+ The number of images to generate per prompt.
366
+ eta (`float`, *optional*, defaults to 0.0):
367
+ Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
368
+ to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
369
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
370
+ A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
371
+ generation deterministic.
372
+ latents (`torch.FloatTensor`, *optional*):
373
+ Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
374
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
375
+ tensor is generated by sampling using the supplied random `generator`.
376
+ output_type (`str`, *optional*, defaults to `"pil"`):
377
+ The output format of the generated image. Choose between `PIL.Image` or `np.array`.
378
+ return_dict (`bool`, *optional*, defaults to `True`):
379
+ Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
380
+ plain tuple.
381
+ callback (`Callable`, *optional*):
382
+ A function that is called every `callback_steps` steps during inference. The function is called with the
383
+ following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
384
+ callback_steps (`int`, *optional*, defaults to 1):
385
+ The frequency at which the `callback` function is called. If not specified, the callback is called at
386
+ every step.
387
+
388
+ Returns:
389
+ [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
390
+ If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
391
+ otherwise a `tuple` is returned where the first element is a list with the generated images and the
392
+ second element is a list of `bool`s indicating whether the corresponding generated image contains
393
+ "not-safe-for-work" (nsfw) content.
394
+
395
+ Examples:
396
+
397
+ ```py
398
+ from diffusers import StableDiffusionImageVariationPipeline
399
+ from PIL import Image
400
+ from io import BytesIO
401
+ import requests
402
+
403
+ pipe = StableDiffusionImageVariationPipeline.from_pretrained(
404
+ "lambdalabs/sd-image-variations-diffusers", revision="v2.0"
405
+ )
406
+ pipe = pipe.to("cuda")
407
+
408
+ url = "https://lh3.googleusercontent.com/y-iFOHfLTwkuQSUegpwDdgKmOjRSTvPxat63dQLB25xkTs4lhIbRUFeNBWZzYf370g=s1200"
409
+
410
+ response = requests.get(url)
411
+ image = Image.open(BytesIO(response.content)).convert("RGB")
412
+
413
+ out = pipe(image, num_images_per_prompt=3, guidance_scale=15)
414
+ out["images"][0].save("result.jpg")
415
+ ```
416
+ """
417
+ # 0. Default height and width to unet
418
+ height = height or self.unet.config.sample_size * self.vae_scale_factor
419
+ width = width or self.unet.config.sample_size * self.vae_scale_factor
420
+
421
+ # 1. Check inputs. Raise error if not correct
422
+ self.check_inputs(image, height, width, callback_steps)
423
+
424
+
425
+ # 2. Define call parameters
426
+ if isinstance(image, list):
427
+ batch_size = len(image)
428
+ elif isinstance(image, torch.Tensor):
429
+ batch_size = image.shape[0]
430
+ assert batch_size >= self.num_views and batch_size % self.num_views == 0
431
+ elif isinstance(image, PIL.Image.Image):
432
+ image = [image]*self.num_views*2
433
+ batch_size = self.num_views*2
434
+
435
+ device = self._execution_device
436
+ dtype = self.vae.dtype
437
+ # here `guidance_scale` is defined analogously to the guidance weight `w` of equation (2)
438
+ # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
439
+ # corresponds to doing no classifier-free guidance.
440
+ do_classifier_free_guidance = guidance_scale != 1.0
441
+
442
+ # 3. Encode input image
443
+ if isinstance(image, list):
444
+ image_pil = image
445
+ elif isinstance(image, torch.Tensor):
446
+ image_pil = [TF.to_pil_image(image[i]) for i in range(image.shape[0])]
447
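+ # encode the condition image(s): global image embeddings for cross-attention conditioning, and per-view latents that are concatenated to the noisy latents below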
+ image_embeddings, image_latents = self._encode_image(image_pil, device, num_images_per_prompt, do_classifier_free_guidance)
448
+
449
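+ # if a normal-map condition is given, its latents replace the color-image latents while the image embeddings are kept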
+ if normal_cond is not None:
450
+ if isinstance(normal_cond, list):
451
+ normal_cond_pil = normal_cond
452
+ elif isinstance(normal_cond, torch.Tensor):
453
+ normal_cond_pil = [TF.to_pil_image(normal_cond[i]) for i in range(normal_cond.shape[0])]
454
+ _, image_latents = self._encode_image(normal_cond_pil, device, num_images_per_prompt, do_classifier_free_guidance)
455
+
456
+
457
+ # assert len(elevation_cond) == batch_size and len(elevation) == batch_size and len(azimuth) == batch_size
458
+ # camera_embeddings = self.prepare_camera_condition(elevation_cond, elevation, azimuth, do_classifier_free_guidance=do_classifier_free_guidance, num_images_per_prompt=num_images_per_prompt)
459
+
460
+ if camera_embedding is not None:
461
+ assert len(camera_embedding) == batch_size
462
+ else:
463
+ camera_embedding = self.camera_embedding.to(dtype)
464
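+ # fall back to the pipeline's fixed per-view camera embeddings, tiled across the batch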
+ camera_embedding = repeat(camera_embedding, "Nv Nce -> (B Nv) Nce", B=batch_size//len(camera_embedding))
465
+ camera_embeddings = self.prepare_camera_embedding(camera_embedding, do_classifier_free_guidance=do_classifier_free_guidance, num_images_per_prompt=num_images_per_prompt)
466
+
467
+ # 4. Prepare timesteps
468
+ self.scheduler.set_timesteps(num_inference_steps, device=device)
469
+ timesteps = self.scheduler.timesteps
470
+
471
+ # 5. Prepare latent variables
472
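+ # the latent channel count follows the UNet output config (8 when color and normals are predicted jointly; see the decode step below)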
+ num_channels_latents = self.unet.config.out_channels
473
+ latents = self.prepare_latents(
474
+ batch_size * num_images_per_prompt,
475
+ num_channels_latents,
476
+ height,
477
+ width,
478
+ image_embeddings.dtype,
479
+ device,
480
+ generator,
481
+ latents,
482
+ cross_domain_latnte=True
483
+ )
484
+
485
+ # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
486
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
487
+
488
+ # 7. Denoising loop
489
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
490
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
491
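+ # joint color+normal mode: rearrange the conditioning tensors into the cross-domain layout expected by the UNet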
+ if do_classifier_free_guidance and self.pred_type == 'joint_color_normal':
492
+ print("reshape the input to cross-domain format")
493
+ image_embeddings = self.reshape_to_cd_input(image_embeddings)
494
+ camera_embeddings = self.reshape_to_cd_input(camera_embeddings)
495
+ image_latents = self.reshape_to_cd_input(image_latents)
496
+ for i, t in enumerate(timesteps):
497
+ # expand the latents if we are doing classifier free guidance
498
+ # latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
499
+ if do_classifier_free_guidance and self.pred_type == 'joint_color_normal':
500
+ latent_model_input = torch.cat([latents] * 2)
501
+ latent_model_input = self.reshape_to_cd_input(latent_model_input)
502
+ elif do_classifier_free_guidance and self.pred_type != 'joint_color_normal':
503
+ latent_model_input = torch.cat([latents] * 2)
504
+ else:
505
+ latent_model_input = latents
506
+
507
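+ # concatenate the condition-image latents to the noisy latents along the channel dimension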
+ latent_model_input = torch.cat([
508
+ latent_model_input, image_latents
509
+ ], dim=1)
510
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
511
+
512
+ # predict the noise residual
513
+ noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=image_embeddings,
514
+ class_labels=camera_embeddings).sample
515
+
516
+ # perform guidance
517
+ if do_classifier_free_guidance and self.pred_type != 'joint_color_normal':
518
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
519
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
520
+ elif do_classifier_free_guidance and self.pred_type == 'joint_color_normal':
521
+ noise_pred = self.reshape_to_cfg_output(noise_pred)
522
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
523
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
524
+
525
+ # compute the previous noisy sample x_t -> x_t-1
526
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
527
+
528
+ # call the callback, if provided
529
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
530
+ progress_bar.update()
531
+ if callback is not None and i % callback_steps == 0:
532
+ callback(i, t, latents)
533
+
534
+ if output_type != "latent":
535
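+ # 8-channel latents hold the color and normal domains; split them into two 4-channel halves stacked along the batch dimension before VAE decoding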
+ if num_channels_latents == 8:
536
+ latents = torch.cat([latents[:, :4], latents[:, 4:]], dim=0)
537
+
538
+ image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
539
+ image, has_nsfw_concept = self.run_safety_checker(image, device, image_embeddings.dtype)
540
+ else:
541
+ image = latents
542
+ has_nsfw_concept = None
543
+
544
+ if has_nsfw_concept is None:
545
+ do_denormalize = [True] * image.shape[0]
546
+ else:
547
+ do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
548
+
549
+ image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
550
+
551
+ if not return_dict:
552
+ return (image, has_nsfw_concept)
553
+
554
+ return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
555
+
requirements.txt CHANGED
@@ -1,8 +1,8 @@
1
- --extra-index-url https://download.pytorch.org/whl/cu117
2
- torch==1.13.1
3
- torchvision
4
- diffusers[torch]==0.19.3
5
- xformers==0.0.16
6
  transformers>=4.25.1
7
  bitsandbytes==0.35.4
8
  decord==0.6.0
@@ -12,7 +12,7 @@ nerfacc==0.3.3
12
  trimesh==3.9.8
13
  pyhocon==0.3.57
14
  icecream==2.1.0
15
- #PyMCubes==0.1.2
16
  accelerate
17
  modelcards
18
  einops
@@ -28,4 +28,13 @@ torch_efficient_distloss
28
  tensorboard
29
  rembg
30
  segment_anything
31
- fire
 
 
 
 
 
 
 
 
 
 
1
+ --extra-index-url https://download.pytorch.org/whl/cu121
2
+ torch==2.0.1
3
+ torchvision==0.15.2
4
+ diffusers[torch]==0.29.1
5
+ # xformers==0.0.16
6
  transformers>=4.25.1
7
  bitsandbytes==0.35.4
8
  decord==0.6.0
 
12
  trimesh==3.9.8
13
  pyhocon==0.3.57
14
  icecream==2.1.0
15
+ # PyMCubes==0.1.2
16
  accelerate
17
  modelcards
18
  einops
 
28
  tensorboard
29
  rembg
30
  segment_anything
31
+ gradio==3.50.2
32
+ mosaicml-streaming
33
+ onnxruntime_gpu
34
+
35
+ pyrender
36
+ jaxtyping
37
+ pymeshlab
38
+ cholespy
39
+ torch_scatter
40
+ pytorch3d @ git+https://github.com/facebookresearch/pytorch3d.git@51fd114d8b8eed19226870ee7fd12dba1e25d550