kxhit commited on
Commit
23aae87
1 Parent(s): ad4ee48
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +0 -43
  2. 3drecon/configs/neus_36.yaml +0 -26
  3. 3drecon/raymarching/__init__.py +0 -1
  4. 3drecon/raymarching/backend.py +0 -40
  5. 3drecon/raymarching/raymarching.py +0 -373
  6. 3drecon/raymarching/setup.py +0 -62
  7. 3drecon/raymarching/src/bindings.cpp +0 -19
  8. 3drecon/raymarching/src/raymarching.cu +0 -914
  9. 3drecon/raymarching/src/raymarching.h +0 -18
  10. 3drecon/renderer/agg_net.py +0 -83
  11. 3drecon/renderer/cost_reg_net.py +0 -95
  12. 3drecon/renderer/dummy_dataset.py +0 -40
  13. 3drecon/renderer/feature_net.py +0 -42
  14. 3drecon/renderer/neus_networks.py +0 -503
  15. 3drecon/renderer/ngp_renderer.py +0 -721
  16. 3drecon/renderer/renderer.py +0 -640
  17. 3drecon/run_NeuS.py +0 -32
  18. 3drecon/train_renderer.py +0 -188
  19. 3drecon/util.py +0 -54
  20. 4DoF/CN_encoder.py +0 -36
  21. 4DoF/dataset.py +0 -228
  22. 4DoF/diffusers/__init__.py +0 -281
  23. 4DoF/diffusers/commands/__init__.py +0 -27
  24. 4DoF/diffusers/commands/diffusers_cli.py +0 -41
  25. 4DoF/diffusers/commands/env.py +0 -84
  26. 4DoF/diffusers/configuration_utils.py +0 -664
  27. 4DoF/diffusers/dependency_versions_check.py +0 -47
  28. 4DoF/diffusers/dependency_versions_table.py +0 -44
  29. 4DoF/diffusers/experimental/__init__.py +0 -1
  30. 4DoF/diffusers/experimental/rl/__init__.py +0 -1
  31. 4DoF/diffusers/experimental/rl/value_guided_sampling.py +0 -152
  32. 4DoF/diffusers/image_processor.py +0 -366
  33. 4DoF/diffusers/loaders.py +0 -1492
  34. 4DoF/diffusers/models/__init__.py +0 -35
  35. 4DoF/diffusers/models/activations.py +0 -12
  36. 4DoF/diffusers/models/attention.py +0 -392
  37. 4DoF/diffusers/models/attention_flax.py +0 -446
  38. 4DoF/diffusers/models/attention_processor.py +0 -1714
  39. 4DoF/diffusers/models/autoencoder_kl.py +0 -411
  40. 4DoF/diffusers/models/controlnet.py +0 -705
  41. 4DoF/diffusers/models/controlnet_flax.py +0 -394
  42. 4DoF/diffusers/models/cross_attention.py +0 -94
  43. 4DoF/diffusers/models/dual_transformer_2d.py +0 -151
  44. 4DoF/diffusers/models/embeddings.py +0 -546
  45. 4DoF/diffusers/models/embeddings_flax.py +0 -95
  46. 4DoF/diffusers/models/modeling_flax_pytorch_utils.py +0 -118
  47. 4DoF/diffusers/models/modeling_flax_utils.py +0 -534
  48. 4DoF/diffusers/models/modeling_pytorch_flax_utils.py +0 -161
  49. 4DoF/diffusers/models/modeling_utils.py +0 -980
  50. 4DoF/diffusers/models/prior_transformer.py +0 -364
.gitattributes DELETED
@@ -1,43 +0,0 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
36
- logs/user_object/eschernet/output.gif filter=lfs diff=lfs merge=lfs -text
37
- logs/user_object/scene.glb filter=lfs diff=lfs merge=lfs -text
38
- 3drecon/ours_GSO_T1/NeuS/grandmother/mesh.ply filter=lfs diff=lfs merge=lfs -text
39
- 3drecon/ours_GSO_T1/NeuS/lion/mesh.ply filter=lfs diff=lfs merge=lfs -text
40
- gradio_demo/examples/bike/003.jpg filter=lfs diff=lfs merge=lfs -text
41
- gradio_demo/examples/bike/027.jpg filter=lfs diff=lfs merge=lfs -text
42
- gradio_demo/examples/bike/bike_0.jpg filter=lfs diff=lfs merge=lfs -text
43
- gradio_demo/examples/bike/bike_2.jpg filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3drecon/configs/neus_36.yaml DELETED
@@ -1,26 +0,0 @@
1
- model:
2
- base_lr: 5.0e-4
3
- target: renderer.renderer.RendererTrainer
4
- params:
5
- total_steps: 2000
6
- warm_up_steps: 100
7
- train_batch_num: 2560
8
- train_batch_fg_num: 512
9
- test_batch_num: 4096
10
- use_mask: true
11
- lambda_rgb_loss: 0.5
12
- lambda_mask_loss: 1.0
13
- lambda_eikonal_loss: 0.1
14
- use_warm_up: true
15
-
16
- data:
17
- target: renderer.dummy_dataset.DummyDataset
18
- params: {}
19
-
20
- callbacks:
21
- save_interval: 500
22
-
23
- trainer:
24
- val_check_interval: 500
25
- max_steps: 2000
26
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3drecon/raymarching/__init__.py DELETED
@@ -1 +0,0 @@
1
- from .raymarching import *
 
 
3drecon/raymarching/backend.py DELETED
@@ -1,40 +0,0 @@
1
- import os
2
- from torch.utils.cpp_extension import load
3
-
4
- _src_path = os.path.dirname(os.path.abspath(__file__))
5
-
6
- nvcc_flags = [
7
- '-O3', '-std=c++14',
8
- '-U__CUDA_NO_HALF_OPERATORS__', '-U__CUDA_NO_HALF_CONVERSIONS__', '-U__CUDA_NO_HALF2_OPERATORS__',
9
- ]
10
-
11
- if os.name == "posix":
12
- c_flags = ['-O3', '-std=c++14']
13
- elif os.name == "nt":
14
- c_flags = ['/O2', '/std:c++17']
15
-
16
- # find cl.exe
17
- def find_cl_path():
18
- import glob
19
- for edition in ["Enterprise", "Professional", "BuildTools", "Community"]:
20
- paths = sorted(glob.glob(r"C:\\Program Files (x86)\\Microsoft Visual Studio\\*\\%s\\VC\\Tools\\MSVC\\*\\bin\\Hostx64\\x64" % edition), reverse=True)
21
- if paths:
22
- return paths[0]
23
-
24
- # If cl.exe is not on path, try to find it.
25
- if os.system("where cl.exe >nul 2>nul") != 0:
26
- cl_path = find_cl_path()
27
- if cl_path is None:
28
- raise RuntimeError("Could not locate a supported Microsoft Visual C++ installation")
29
- os.environ["PATH"] += ";" + cl_path
30
-
31
- _backend = load(name='_raymarching',
32
- extra_cflags=c_flags,
33
- extra_cuda_cflags=nvcc_flags,
34
- sources=[os.path.join(_src_path, 'src', f) for f in [
35
- 'raymarching.cu',
36
- 'bindings.cpp',
37
- ]],
38
- )
39
-
40
- __all__ = ['_backend']
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3drecon/raymarching/raymarching.py DELETED
@@ -1,373 +0,0 @@
1
- import numpy as np
2
- import time
3
-
4
- import torch
5
- import torch.nn as nn
6
- from torch.autograd import Function
7
- from torch.cuda.amp import custom_bwd, custom_fwd
8
-
9
- try:
10
- import _raymarching as _backend
11
- except ImportError:
12
- from .backend import _backend
13
-
14
-
15
- # ----------------------------------------
16
- # utils
17
- # ----------------------------------------
18
-
19
- class _near_far_from_aabb(Function):
20
- @staticmethod
21
- @custom_fwd(cast_inputs=torch.float32)
22
- def forward(ctx, rays_o, rays_d, aabb, min_near=0.2):
23
- ''' near_far_from_aabb, CUDA implementation
24
- Calculate rays' intersection time (near and far) with aabb
25
- Args:
26
- rays_o: float, [N, 3]
27
- rays_d: float, [N, 3]
28
- aabb: float, [6], (xmin, ymin, zmin, xmax, ymax, zmax)
29
- min_near: float, scalar
30
- Returns:
31
- nears: float, [N]
32
- fars: float, [N]
33
- '''
34
- if not rays_o.is_cuda: rays_o = rays_o.cuda()
35
- if not rays_d.is_cuda: rays_d = rays_d.cuda()
36
-
37
- rays_o = rays_o.contiguous().view(-1, 3)
38
- rays_d = rays_d.contiguous().view(-1, 3)
39
-
40
- N = rays_o.shape[0] # num rays
41
-
42
- nears = torch.empty(N, dtype=rays_o.dtype, device=rays_o.device)
43
- fars = torch.empty(N, dtype=rays_o.dtype, device=rays_o.device)
44
-
45
- _backend.near_far_from_aabb(rays_o, rays_d, aabb, N, min_near, nears, fars)
46
-
47
- return nears, fars
48
-
49
- near_far_from_aabb = _near_far_from_aabb.apply
50
-
51
-
52
- class _sph_from_ray(Function):
53
- @staticmethod
54
- @custom_fwd(cast_inputs=torch.float32)
55
- def forward(ctx, rays_o, rays_d, radius):
56
- ''' sph_from_ray, CUDA implementation
57
- get spherical coordinate on the background sphere from rays.
58
- Assume rays_o are inside the Sphere(radius).
59
- Args:
60
- rays_o: [N, 3]
61
- rays_d: [N, 3]
62
- radius: scalar, float
63
- Return:
64
- coords: [N, 2], in [-1, 1], theta and phi on a sphere. (further-surface)
65
- '''
66
- if not rays_o.is_cuda: rays_o = rays_o.cuda()
67
- if not rays_d.is_cuda: rays_d = rays_d.cuda()
68
-
69
- rays_o = rays_o.contiguous().view(-1, 3)
70
- rays_d = rays_d.contiguous().view(-1, 3)
71
-
72
- N = rays_o.shape[0] # num rays
73
-
74
- coords = torch.empty(N, 2, dtype=rays_o.dtype, device=rays_o.device)
75
-
76
- _backend.sph_from_ray(rays_o, rays_d, radius, N, coords)
77
-
78
- return coords
79
-
80
- sph_from_ray = _sph_from_ray.apply
81
-
82
-
83
- class _morton3D(Function):
84
- @staticmethod
85
- def forward(ctx, coords):
86
- ''' morton3D, CUDA implementation
87
- Args:
88
- coords: [N, 3], int32, in [0, 128) (for some reason there is no uint32 tensor in torch...)
89
- TODO: check if the coord range is valid! (current 128 is safe)
90
- Returns:
91
- indices: [N], int32, in [0, 128^3)
92
-
93
- '''
94
- if not coords.is_cuda: coords = coords.cuda()
95
-
96
- N = coords.shape[0]
97
-
98
- indices = torch.empty(N, dtype=torch.int32, device=coords.device)
99
-
100
- _backend.morton3D(coords.int(), N, indices)
101
-
102
- return indices
103
-
104
- morton3D = _morton3D.apply
105
-
106
- class _morton3D_invert(Function):
107
- @staticmethod
108
- def forward(ctx, indices):
109
- ''' morton3D_invert, CUDA implementation
110
- Args:
111
- indices: [N], int32, in [0, 128^3)
112
- Returns:
113
- coords: [N, 3], int32, in [0, 128)
114
-
115
- '''
116
- if not indices.is_cuda: indices = indices.cuda()
117
-
118
- N = indices.shape[0]
119
-
120
- coords = torch.empty(N, 3, dtype=torch.int32, device=indices.device)
121
-
122
- _backend.morton3D_invert(indices.int(), N, coords)
123
-
124
- return coords
125
-
126
- morton3D_invert = _morton3D_invert.apply
127
-
128
-
129
- class _packbits(Function):
130
- @staticmethod
131
- @custom_fwd(cast_inputs=torch.float32)
132
- def forward(ctx, grid, thresh, bitfield=None):
133
- ''' packbits, CUDA implementation
134
- Pack up the density grid into a bit field to accelerate ray marching.
135
- Args:
136
- grid: float, [C, H * H * H], assume H % 2 == 0
137
- thresh: float, threshold
138
- Returns:
139
- bitfield: uint8, [C, H * H * H / 8]
140
- '''
141
- if not grid.is_cuda: grid = grid.cuda()
142
- grid = grid.contiguous()
143
-
144
- C = grid.shape[0]
145
- H3 = grid.shape[1]
146
- N = C * H3 // 8
147
-
148
- if bitfield is None:
149
- bitfield = torch.empty(N, dtype=torch.uint8, device=grid.device)
150
-
151
- _backend.packbits(grid, N, thresh, bitfield)
152
-
153
- return bitfield
154
-
155
- packbits = _packbits.apply
156
-
157
- # ----------------------------------------
158
- # train functions
159
- # ----------------------------------------
160
-
161
- class _march_rays_train(Function):
162
- @staticmethod
163
- @custom_fwd(cast_inputs=torch.float32)
164
- def forward(ctx, rays_o, rays_d, bound, density_bitfield, C, H, nears, fars, step_counter=None, mean_count=-1, perturb=False, align=-1, force_all_rays=False, dt_gamma=0, max_steps=1024):
165
- ''' march rays to generate points (forward only)
166
- Args:
167
- rays_o/d: float, [N, 3]
168
- bound: float, scalar
169
- density_bitfield: uint8: [CHHH // 8]
170
- C: int
171
- H: int
172
- nears/fars: float, [N]
173
- step_counter: int32, (2), used to count the actual number of generated points.
174
- mean_count: int32, estimated mean steps to accelerate training. (but will randomly drop rays if the actual point count exceeded this threshold.)
175
- perturb: bool
176
- align: int, pad output so its size is dividable by align, set to -1 to disable.
177
- force_all_rays: bool, ignore step_counter and mean_count, always calculate all rays. Useful if rendering the whole image, instead of some rays.
178
- dt_gamma: float, called cone_angle in instant-ngp, exponentially accelerate ray marching if > 0. (very significant effect, but generally lead to worse performance)
179
- max_steps: int, max number of sampled points along each ray, also affect min_stepsize.
180
- Returns:
181
- xyzs: float, [M, 3], all generated points' coords. (all rays concated, need to use `rays` to extract points belonging to each ray)
182
- dirs: float, [M, 3], all generated points' view dirs.
183
- deltas: float, [M, 2], all generated points' deltas. (first for RGB, second for Depth)
184
- rays: int32, [N, 3], all rays' (index, point_offset, point_count), e.g., xyzs[rays[i, 1]:rays[i, 2]] --> points belonging to rays[i, 0]
185
- '''
186
-
187
- if not rays_o.is_cuda: rays_o = rays_o.cuda()
188
- if not rays_d.is_cuda: rays_d = rays_d.cuda()
189
- if not density_bitfield.is_cuda: density_bitfield = density_bitfield.cuda()
190
-
191
- rays_o = rays_o.contiguous().view(-1, 3)
192
- rays_d = rays_d.contiguous().view(-1, 3)
193
- density_bitfield = density_bitfield.contiguous()
194
-
195
- N = rays_o.shape[0] # num rays
196
- M = N * max_steps # init max points number in total
197
-
198
- # running average based on previous epoch (mimic `measured_batch_size_before_compaction` in instant-ngp)
199
- # It estimate the max points number to enable faster training, but will lead to random ignored rays if underestimated.
200
- if not force_all_rays and mean_count > 0:
201
- if align > 0:
202
- mean_count += align - mean_count % align
203
- M = mean_count
204
-
205
- xyzs = torch.zeros(M, 3, dtype=rays_o.dtype, device=rays_o.device)
206
- dirs = torch.zeros(M, 3, dtype=rays_o.dtype, device=rays_o.device)
207
- deltas = torch.zeros(M, 2, dtype=rays_o.dtype, device=rays_o.device)
208
- rays = torch.empty(N, 3, dtype=torch.int32, device=rays_o.device) # id, offset, num_steps
209
-
210
- if step_counter is None:
211
- step_counter = torch.zeros(2, dtype=torch.int32, device=rays_o.device) # point counter, ray counter
212
-
213
- if perturb:
214
- noises = torch.rand(N, dtype=rays_o.dtype, device=rays_o.device)
215
- else:
216
- noises = torch.zeros(N, dtype=rays_o.dtype, device=rays_o.device)
217
-
218
- _backend.march_rays_train(rays_o, rays_d, density_bitfield, bound, dt_gamma, max_steps, N, C, H, M, nears, fars, xyzs, dirs, deltas, rays, step_counter, noises) # m is the actually used points number
219
-
220
- #print(step_counter, M)
221
-
222
- # only used at the first (few) epochs.
223
- if force_all_rays or mean_count <= 0:
224
- m = step_counter[0].item() # D2H copy
225
- if align > 0:
226
- m += align - m % align
227
- xyzs = xyzs[:m]
228
- dirs = dirs[:m]
229
- deltas = deltas[:m]
230
-
231
- torch.cuda.empty_cache()
232
-
233
- return xyzs, dirs, deltas, rays
234
-
235
- march_rays_train = _march_rays_train.apply
236
-
237
-
238
- class _composite_rays_train(Function):
239
- @staticmethod
240
- @custom_fwd(cast_inputs=torch.float32)
241
- def forward(ctx, sigmas, rgbs, deltas, rays, T_thresh=1e-4):
242
- ''' composite rays' rgbs, according to the ray marching formula.
243
- Args:
244
- rgbs: float, [M, 3]
245
- sigmas: float, [M,]
246
- deltas: float, [M, 2]
247
- rays: int32, [N, 3]
248
- Returns:
249
- weights_sum: float, [N,], the alpha channel
250
- depth: float, [N, ], the Depth
251
- image: float, [N, 3], the RGB channel (after multiplying alpha!)
252
- '''
253
-
254
- sigmas = sigmas.contiguous()
255
- rgbs = rgbs.contiguous()
256
-
257
- M = sigmas.shape[0]
258
- N = rays.shape[0]
259
-
260
- weights_sum = torch.empty(N, dtype=sigmas.dtype, device=sigmas.device)
261
- depth = torch.empty(N, dtype=sigmas.dtype, device=sigmas.device)
262
- image = torch.empty(N, 3, dtype=sigmas.dtype, device=sigmas.device)
263
-
264
- _backend.composite_rays_train_forward(sigmas, rgbs, deltas, rays, M, N, T_thresh, weights_sum, depth, image)
265
-
266
- ctx.save_for_backward(sigmas, rgbs, deltas, rays, weights_sum, depth, image)
267
- ctx.dims = [M, N, T_thresh]
268
-
269
- return weights_sum, depth, image
270
-
271
- @staticmethod
272
- @custom_bwd
273
- def backward(ctx, grad_weights_sum, grad_depth, grad_image):
274
-
275
- # NOTE: grad_depth is not used now! It won't be propagated to sigmas.
276
-
277
- grad_weights_sum = grad_weights_sum.contiguous()
278
- grad_image = grad_image.contiguous()
279
-
280
- sigmas, rgbs, deltas, rays, weights_sum, depth, image = ctx.saved_tensors
281
- M, N, T_thresh = ctx.dims
282
-
283
- grad_sigmas = torch.zeros_like(sigmas)
284
- grad_rgbs = torch.zeros_like(rgbs)
285
-
286
- _backend.composite_rays_train_backward(grad_weights_sum, grad_image, sigmas, rgbs, deltas, rays, weights_sum, image, M, N, T_thresh, grad_sigmas, grad_rgbs)
287
-
288
- return grad_sigmas, grad_rgbs, None, None, None
289
-
290
-
291
- composite_rays_train = _composite_rays_train.apply
292
-
293
- # ----------------------------------------
294
- # infer functions
295
- # ----------------------------------------
296
-
297
- class _march_rays(Function):
298
- @staticmethod
299
- @custom_fwd(cast_inputs=torch.float32)
300
- def forward(ctx, n_alive, n_step, rays_alive, rays_t, rays_o, rays_d, bound, density_bitfield, C, H, near, far, align=-1, perturb=False, dt_gamma=0, max_steps=1024):
301
- ''' march rays to generate points (forward only, for inference)
302
- Args:
303
- n_alive: int, number of alive rays
304
- n_step: int, how many steps we march
305
- rays_alive: int, [N], the alive rays' IDs in N (N >= n_alive, but we only use first n_alive)
306
- rays_t: float, [N], the alive rays' time, we only use the first n_alive.
307
- rays_o/d: float, [N, 3]
308
- bound: float, scalar
309
- density_bitfield: uint8: [CHHH // 8]
310
- C: int
311
- H: int
312
- nears/fars: float, [N]
313
- align: int, pad output so its size is dividable by align, set to -1 to disable.
314
- perturb: bool/int, int > 0 is used as the random seed.
315
- dt_gamma: float, called cone_angle in instant-ngp, exponentially accelerate ray marching if > 0. (very significant effect, but generally lead to worse performance)
316
- max_steps: int, max number of sampled points along each ray, also affect min_stepsize.
317
- Returns:
318
- xyzs: float, [n_alive * n_step, 3], all generated points' coords
319
- dirs: float, [n_alive * n_step, 3], all generated points' view dirs.
320
- deltas: float, [n_alive * n_step, 2], all generated points' deltas (here we record two deltas, the first is for RGB, the second for depth).
321
- '''
322
-
323
- if not rays_o.is_cuda: rays_o = rays_o.cuda()
324
- if not rays_d.is_cuda: rays_d = rays_d.cuda()
325
-
326
- rays_o = rays_o.contiguous().view(-1, 3)
327
- rays_d = rays_d.contiguous().view(-1, 3)
328
-
329
- M = n_alive * n_step
330
-
331
- if align > 0:
332
- M += align - (M % align)
333
-
334
- xyzs = torch.zeros(M, 3, dtype=rays_o.dtype, device=rays_o.device)
335
- dirs = torch.zeros(M, 3, dtype=rays_o.dtype, device=rays_o.device)
336
- deltas = torch.zeros(M, 2, dtype=rays_o.dtype, device=rays_o.device) # 2 vals, one for rgb, one for depth
337
-
338
- if perturb:
339
- # torch.manual_seed(perturb) # test_gui uses spp index as seed
340
- noises = torch.rand(n_alive, dtype=rays_o.dtype, device=rays_o.device)
341
- else:
342
- noises = torch.zeros(n_alive, dtype=rays_o.dtype, device=rays_o.device)
343
-
344
- _backend.march_rays(n_alive, n_step, rays_alive, rays_t, rays_o, rays_d, bound, dt_gamma, max_steps, C, H, density_bitfield, near, far, xyzs, dirs, deltas, noises)
345
-
346
- return xyzs, dirs, deltas
347
-
348
- march_rays = _march_rays.apply
349
-
350
-
351
- class _composite_rays(Function):
352
- @staticmethod
353
- @custom_fwd(cast_inputs=torch.float32) # need to cast sigmas & rgbs to float
354
- def forward(ctx, n_alive, n_step, rays_alive, rays_t, sigmas, rgbs, deltas, weights_sum, depth, image, T_thresh=1e-2):
355
- ''' composite rays' rgbs, according to the ray marching formula. (for inference)
356
- Args:
357
- n_alive: int, number of alive rays
358
- n_step: int, how many steps we march
359
- rays_alive: int, [n_alive], the alive rays' IDs in N (N >= n_alive)
360
- rays_t: float, [N], the alive rays' time
361
- sigmas: float, [n_alive * n_step,]
362
- rgbs: float, [n_alive * n_step, 3]
363
- deltas: float, [n_alive * n_step, 2], all generated points' deltas (here we record two deltas, the first is for RGB, the second for depth).
364
- In-place Outputs:
365
- weights_sum: float, [N,], the alpha channel
366
- depth: float, [N,], the depth value
367
- image: float, [N, 3], the RGB channel (after multiplying alpha!)
368
- '''
369
- _backend.composite_rays(n_alive, n_step, T_thresh, rays_alive, rays_t, sigmas, rgbs, deltas, weights_sum, depth, image)
370
- return tuple()
371
-
372
-
373
- composite_rays = _composite_rays.apply
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3drecon/raymarching/setup.py DELETED
@@ -1,62 +0,0 @@
1
- import os
2
- from setuptools import setup
3
- from torch.utils.cpp_extension import BuildExtension, CUDAExtension
4
-
5
- _src_path = os.path.dirname(os.path.abspath(__file__))
6
-
7
- nvcc_flags = [
8
- '-O3', '-std=c++14',
9
- '-U__CUDA_NO_HALF_OPERATORS__', '-U__CUDA_NO_HALF_CONVERSIONS__', '-U__CUDA_NO_HALF2_OPERATORS__',
10
- ]
11
-
12
- if os.name == "posix":
13
- c_flags = ['-O3', '-std=c++14']
14
- elif os.name == "nt":
15
- c_flags = ['/O2', '/std:c++17']
16
-
17
- # find cl.exe
18
- def find_cl_path():
19
- import glob
20
- for edition in ["Enterprise", "Professional", "BuildTools", "Community"]:
21
- paths = sorted(glob.glob(r"C:\\Program Files (x86)\\Microsoft Visual Studio\\*\\%s\\VC\\Tools\\MSVC\\*\\bin\\Hostx64\\x64" % edition), reverse=True)
22
- if paths:
23
- return paths[0]
24
-
25
- # If cl.exe is not on path, try to find it.
26
- if os.system("where cl.exe >nul 2>nul") != 0:
27
- cl_path = find_cl_path()
28
- if cl_path is None:
29
- raise RuntimeError("Could not locate a supported Microsoft Visual C++ installation")
30
- os.environ["PATH"] += ";" + cl_path
31
-
32
- '''
33
- Usage:
34
-
35
- python setup.py build_ext --inplace # build extensions locally, do not install (only can be used from the parent directory)
36
-
37
- python setup.py install # build extensions and install (copy) to PATH.
38
- pip install . # ditto but better (e.g., dependency & metadata handling)
39
-
40
- python setup.py develop # build extensions and install (symbolic) to PATH.
41
- pip install -e . # ditto but better (e.g., dependency & metadata handling)
42
-
43
- '''
44
- setup(
45
- name='raymarching', # package name, import this to use python API
46
- ext_modules=[
47
- CUDAExtension(
48
- name='_raymarching', # extension name, import this to use CUDA API
49
- sources=[os.path.join(_src_path, 'src', f) for f in [
50
- 'raymarching.cu',
51
- 'bindings.cpp',
52
- ]],
53
- extra_compile_args={
54
- 'cxx': c_flags,
55
- 'nvcc': nvcc_flags,
56
- }
57
- ),
58
- ],
59
- cmdclass={
60
- 'build_ext': BuildExtension,
61
- }
62
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3drecon/raymarching/src/bindings.cpp DELETED
@@ -1,19 +0,0 @@
1
- #include <torch/extension.h>
2
-
3
- #include "raymarching.h"
4
-
5
- PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
6
- // utils
7
- m.def("packbits", &packbits, "packbits (CUDA)");
8
- m.def("near_far_from_aabb", &near_far_from_aabb, "near_far_from_aabb (CUDA)");
9
- m.def("sph_from_ray", &sph_from_ray, "sph_from_ray (CUDA)");
10
- m.def("morton3D", &morton3D, "morton3D (CUDA)");
11
- m.def("morton3D_invert", &morton3D_invert, "morton3D_invert (CUDA)");
12
- // train
13
- m.def("march_rays_train", &march_rays_train, "march_rays_train (CUDA)");
14
- m.def("composite_rays_train_forward", &composite_rays_train_forward, "composite_rays_train_forward (CUDA)");
15
- m.def("composite_rays_train_backward", &composite_rays_train_backward, "composite_rays_train_backward (CUDA)");
16
- // infer
17
- m.def("march_rays", &march_rays, "march rays (CUDA)");
18
- m.def("composite_rays", &composite_rays, "composite rays (CUDA)");
19
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3drecon/raymarching/src/raymarching.cu DELETED
@@ -1,914 +0,0 @@
1
- #include <cuda.h>
2
- #include <cuda_fp16.h>
3
- #include <cuda_runtime.h>
4
-
5
- #include <ATen/cuda/CUDAContext.h>
6
- #include <torch/torch.h>
7
-
8
- #include <cstdio>
9
- #include <stdint.h>
10
- #include <stdexcept>
11
- #include <limits>
12
-
13
- #define CHECK_CUDA(x) TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor")
14
- #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be a contiguous tensor")
15
- #define CHECK_IS_INT(x) TORCH_CHECK(x.scalar_type() == at::ScalarType::Int, #x " must be an int tensor")
16
- #define CHECK_IS_FLOATING(x) TORCH_CHECK(x.scalar_type() == at::ScalarType::Float || x.scalar_type() == at::ScalarType::Half || x.scalar_type() == at::ScalarType::Double, #x " must be a floating tensor")
17
-
18
-
19
- inline constexpr __device__ float SQRT3() { return 1.7320508075688772f; }
20
- inline constexpr __device__ float RSQRT3() { return 0.5773502691896258f; }
21
- inline constexpr __device__ float PI() { return 3.141592653589793f; }
22
- inline constexpr __device__ float RPI() { return 0.3183098861837907f; }
23
-
24
-
25
- template <typename T>
26
- inline __host__ __device__ T div_round_up(T val, T divisor) {
27
- return (val + divisor - 1) / divisor;
28
- }
29
-
30
- inline __host__ __device__ float signf(const float x) {
31
- return copysignf(1.0, x);
32
- }
33
-
34
- inline __host__ __device__ float clamp(const float x, const float min, const float max) {
35
- return fminf(max, fmaxf(min, x));
36
- }
37
-
38
- inline __host__ __device__ void swapf(float& a, float& b) {
39
- float c = a; a = b; b = c;
40
- }
41
-
42
- inline __device__ int mip_from_pos(const float x, const float y, const float z, const float max_cascade) {
43
- const float mx = fmaxf(fabsf(x), fmaxf(fabs(y), fabs(z)));
44
- int exponent;
45
- frexpf(mx, &exponent); // [0, 0.5) --> -1, [0.5, 1) --> 0, [1, 2) --> 1, [2, 4) --> 2, ...
46
- return fminf(max_cascade - 1, fmaxf(0, exponent));
47
- }
48
-
49
- inline __device__ int mip_from_dt(const float dt, const float H, const float max_cascade) {
50
- const float mx = dt * H * 0.5;
51
- int exponent;
52
- frexpf(mx, &exponent);
53
- return fminf(max_cascade - 1, fmaxf(0, exponent));
54
- }
55
-
56
- inline __host__ __device__ uint32_t __expand_bits(uint32_t v)
57
- {
58
- v = (v * 0x00010001u) & 0xFF0000FFu;
59
- v = (v * 0x00000101u) & 0x0F00F00Fu;
60
- v = (v * 0x00000011u) & 0xC30C30C3u;
61
- v = (v * 0x00000005u) & 0x49249249u;
62
- return v;
63
- }
64
-
65
- inline __host__ __device__ uint32_t __morton3D(uint32_t x, uint32_t y, uint32_t z)
66
- {
67
- uint32_t xx = __expand_bits(x);
68
- uint32_t yy = __expand_bits(y);
69
- uint32_t zz = __expand_bits(z);
70
- return xx | (yy << 1) | (zz << 2);
71
- }
72
-
73
- inline __host__ __device__ uint32_t __morton3D_invert(uint32_t x)
74
- {
75
- x = x & 0x49249249;
76
- x = (x | (x >> 2)) & 0xc30c30c3;
77
- x = (x | (x >> 4)) & 0x0f00f00f;
78
- x = (x | (x >> 8)) & 0xff0000ff;
79
- x = (x | (x >> 16)) & 0x0000ffff;
80
- return x;
81
- }
82
-
83
-
84
- ////////////////////////////////////////////////////
85
- ///////////// utils /////////////
86
- ////////////////////////////////////////////////////
87
-
88
- // rays_o/d: [N, 3]
89
- // nears/fars: [N]
90
- // scalar_t should always be float in use.
91
- template <typename scalar_t>
92
- __global__ void kernel_near_far_from_aabb(
93
- const scalar_t * __restrict__ rays_o,
94
- const scalar_t * __restrict__ rays_d,
95
- const scalar_t * __restrict__ aabb,
96
- const uint32_t N,
97
- const float min_near,
98
- scalar_t * nears, scalar_t * fars
99
- ) {
100
- // parallel per ray
101
- const uint32_t n = threadIdx.x + blockIdx.x * blockDim.x;
102
- if (n >= N) return;
103
-
104
- // locate
105
- rays_o += n * 3;
106
- rays_d += n * 3;
107
-
108
- const float ox = rays_o[0], oy = rays_o[1], oz = rays_o[2];
109
- const float dx = rays_d[0], dy = rays_d[1], dz = rays_d[2];
110
- const float rdx = 1 / dx, rdy = 1 / dy, rdz = 1 / dz;
111
-
112
- // get near far (assume cube scene)
113
- float near = (aabb[0] - ox) * rdx;
114
- float far = (aabb[3] - ox) * rdx;
115
- if (near > far) swapf(near, far);
116
-
117
- float near_y = (aabb[1] - oy) * rdy;
118
- float far_y = (aabb[4] - oy) * rdy;
119
- if (near_y > far_y) swapf(near_y, far_y);
120
-
121
- if (near > far_y || near_y > far) {
122
- nears[n] = fars[n] = std::numeric_limits<scalar_t>::max();
123
- return;
124
- }
125
-
126
- if (near_y > near) near = near_y;
127
- if (far_y < far) far = far_y;
128
-
129
- float near_z = (aabb[2] - oz) * rdz;
130
- float far_z = (aabb[5] - oz) * rdz;
131
- if (near_z > far_z) swapf(near_z, far_z);
132
-
133
- if (near > far_z || near_z > far) {
134
- nears[n] = fars[n] = std::numeric_limits<scalar_t>::max();
135
- return;
136
- }
137
-
138
- if (near_z > near) near = near_z;
139
- if (far_z < far) far = far_z;
140
-
141
- if (near < min_near) near = min_near;
142
-
143
- nears[n] = near;
144
- fars[n] = far;
145
- }
146
-
147
-
148
- void near_far_from_aabb(const at::Tensor rays_o, const at::Tensor rays_d, const at::Tensor aabb, const uint32_t N, const float min_near, at::Tensor nears, at::Tensor fars) {
149
-
150
- static constexpr uint32_t N_THREAD = 128;
151
-
152
- AT_DISPATCH_FLOATING_TYPES_AND_HALF(
153
- rays_o.scalar_type(), "near_far_from_aabb", ([&] {
154
- kernel_near_far_from_aabb<<<div_round_up(N, N_THREAD), N_THREAD>>>(rays_o.data_ptr<scalar_t>(), rays_d.data_ptr<scalar_t>(), aabb.data_ptr<scalar_t>(), N, min_near, nears.data_ptr<scalar_t>(), fars.data_ptr<scalar_t>());
155
- }));
156
- }
157
-
158
-
159
- // rays_o/d: [N, 3]
160
- // radius: float
161
- // coords: [N, 2]
162
- template <typename scalar_t>
163
- __global__ void kernel_sph_from_ray(
164
- const scalar_t * __restrict__ rays_o,
165
- const scalar_t * __restrict__ rays_d,
166
- const float radius,
167
- const uint32_t N,
168
- scalar_t * coords
169
- ) {
170
- // parallel per ray
171
- const uint32_t n = threadIdx.x + blockIdx.x * blockDim.x;
172
- if (n >= N) return;
173
-
174
- // locate
175
- rays_o += n * 3;
176
- rays_d += n * 3;
177
- coords += n * 2;
178
-
179
- const float ox = rays_o[0], oy = rays_o[1], oz = rays_o[2];
180
- const float dx = rays_d[0], dy = rays_d[1], dz = rays_d[2];
181
- const float rdx = 1 / dx, rdy = 1 / dy, rdz = 1 / dz;
182
-
183
- // solve t from || o + td || = radius
184
- const float A = dx * dx + dy * dy + dz * dz;
185
- const float B = ox * dx + oy * dy + oz * dz; // in fact B / 2
186
- const float C = ox * ox + oy * oy + oz * oz - radius * radius;
187
-
188
- const float t = (- B + sqrtf(B * B - A * C)) / A; // always use the larger solution (positive)
189
-
190
- // solve theta, phi (assume y is the up axis)
191
- const float x = ox + t * dx, y = oy + t * dy, z = oz + t * dz;
192
- const float theta = atan2(sqrtf(x * x + z * z), y); // [0, PI)
193
- const float phi = atan2(z, x); // [-PI, PI)
194
-
195
- // normalize to [-1, 1]
196
- coords[0] = 2 * theta * RPI() - 1;
197
- coords[1] = phi * RPI();
198
- }
199
-
200
-
201
- void sph_from_ray(const at::Tensor rays_o, const at::Tensor rays_d, const float radius, const uint32_t N, at::Tensor coords) {
202
-
203
- static constexpr uint32_t N_THREAD = 128;
204
-
205
- AT_DISPATCH_FLOATING_TYPES_AND_HALF(
206
- rays_o.scalar_type(), "sph_from_ray", ([&] {
207
- kernel_sph_from_ray<<<div_round_up(N, N_THREAD), N_THREAD>>>(rays_o.data_ptr<scalar_t>(), rays_d.data_ptr<scalar_t>(), radius, N, coords.data_ptr<scalar_t>());
208
- }));
209
- }
210
-
211
-
212
- // coords: int32, [N, 3]
213
- // indices: int32, [N]
214
- __global__ void kernel_morton3D(
215
- const int * __restrict__ coords,
216
- const uint32_t N,
217
- int * indices
218
- ) {
219
- // parallel
220
- const uint32_t n = threadIdx.x + blockIdx.x * blockDim.x;
221
- if (n >= N) return;
222
-
223
- // locate
224
- coords += n * 3;
225
- indices[n] = __morton3D(coords[0], coords[1], coords[2]);
226
- }
227
-
228
-
229
- void morton3D(const at::Tensor coords, const uint32_t N, at::Tensor indices) {
230
- static constexpr uint32_t N_THREAD = 128;
231
- kernel_morton3D<<<div_round_up(N, N_THREAD), N_THREAD>>>(coords.data_ptr<int>(), N, indices.data_ptr<int>());
232
- }
233
-
234
-
235
- // indices: int32, [N]
236
- // coords: int32, [N, 3]
237
- __global__ void kernel_morton3D_invert(
238
- const int * __restrict__ indices,
239
- const uint32_t N,
240
- int * coords
241
- ) {
242
- // parallel
243
- const uint32_t n = threadIdx.x + blockIdx.x * blockDim.x;
244
- if (n >= N) return;
245
-
246
- // locate
247
- coords += n * 3;
248
-
249
- const int ind = indices[n];
250
-
251
- coords[0] = __morton3D_invert(ind >> 0);
252
- coords[1] = __morton3D_invert(ind >> 1);
253
- coords[2] = __morton3D_invert(ind >> 2);
254
- }
255
-
256
-
257
- void morton3D_invert(const at::Tensor indices, const uint32_t N, at::Tensor coords) {
258
- static constexpr uint32_t N_THREAD = 128;
259
- kernel_morton3D_invert<<<div_round_up(N, N_THREAD), N_THREAD>>>(indices.data_ptr<int>(), N, coords.data_ptr<int>());
260
- }
261
-
262
-
263
- // grid: float, [C, H, H, H]
264
- // N: int, C * H * H * H / 8
265
- // density_thresh: float
266
- // bitfield: uint8, [N]
267
- template <typename scalar_t>
268
- __global__ void kernel_packbits(
269
- const scalar_t * __restrict__ grid,
270
- const uint32_t N,
271
- const float density_thresh,
272
- uint8_t * bitfield
273
- ) {
274
- // parallel per byte
275
- const uint32_t n = threadIdx.x + blockIdx.x * blockDim.x;
276
- if (n >= N) return;
277
-
278
- // locate
279
- grid += n * 8;
280
-
281
- uint8_t bits = 0;
282
-
283
- #pragma unroll
284
- for (uint8_t i = 0; i < 8; i++) {
285
- bits |= (grid[i] > density_thresh) ? ((uint8_t)1 << i) : 0;
286
- }
287
-
288
- bitfield[n] = bits;
289
- }
290
-
291
-
292
- void packbits(const at::Tensor grid, const uint32_t N, const float density_thresh, at::Tensor bitfield) {
293
-
294
- static constexpr uint32_t N_THREAD = 128;
295
-
296
- AT_DISPATCH_FLOATING_TYPES_AND_HALF(
297
- grid.scalar_type(), "packbits", ([&] {
298
- kernel_packbits<<<div_round_up(N, N_THREAD), N_THREAD>>>(grid.data_ptr<scalar_t>(), N, density_thresh, bitfield.data_ptr<uint8_t>());
299
- }));
300
- }
301
-
302
- ////////////////////////////////////////////////////
303
- ///////////// training /////////////
304
- ////////////////////////////////////////////////////
305
-
306
- // rays_o/d: [N, 3]
307
- // grid: [CHHH / 8]
308
- // xyzs, dirs, deltas: [M, 3], [M, 3], [M, 2]
309
- // dirs: [M, 3]
310
- // rays: [N, 3], idx, offset, num_steps
311
- template <typename scalar_t>
312
- __global__ void kernel_march_rays_train(
313
- const scalar_t * __restrict__ rays_o,
314
- const scalar_t * __restrict__ rays_d,
315
- const uint8_t * __restrict__ grid,
316
- const float bound,
317
- const float dt_gamma, const uint32_t max_steps,
318
- const uint32_t N, const uint32_t C, const uint32_t H, const uint32_t M,
319
- const scalar_t* __restrict__ nears,
320
- const scalar_t* __restrict__ fars,
321
- scalar_t * xyzs, scalar_t * dirs, scalar_t * deltas,
322
- int * rays,
323
- int * counter,
324
- const scalar_t* __restrict__ noises
325
- ) {
326
- // parallel per ray
327
- const uint32_t n = threadIdx.x + blockIdx.x * blockDim.x;
328
- if (n >= N) return;
329
-
330
- // locate
331
- rays_o += n * 3;
332
- rays_d += n * 3;
333
-
334
- // ray marching
335
- const float ox = rays_o[0], oy = rays_o[1], oz = rays_o[2];
336
- const float dx = rays_d[0], dy = rays_d[1], dz = rays_d[2];
337
- const float rdx = 1 / dx, rdy = 1 / dy, rdz = 1 / dz;
338
- const float rH = 1 / (float)H;
339
- const float H3 = H * H * H;
340
-
341
- const float near = nears[n];
342
- const float far = fars[n];
343
- const float noise = noises[n];
344
-
345
- const float dt_min = 2 * SQRT3() / max_steps;
346
- const float dt_max = 2 * SQRT3() * (1 << (C - 1)) / H;
347
-
348
- float t0 = near;
349
-
350
- // perturb
351
- t0 += clamp(t0 * dt_gamma, dt_min, dt_max) * noise;
352
-
353
- // first pass: estimation of num_steps
354
- float t = t0;
355
- uint32_t num_steps = 0;
356
-
357
- //if (t < far) printf("valid ray %d t=%f near=%f far=%f \n", n, t, near, far);
358
-
359
- while (t < far && num_steps < max_steps) {
360
- // current point
361
- const float x = clamp(ox + t * dx, -bound, bound);
362
- const float y = clamp(oy + t * dy, -bound, bound);
363
- const float z = clamp(oz + t * dz, -bound, bound);
364
-
365
- const float dt = clamp(t * dt_gamma, dt_min, dt_max);
366
-
367
- // get mip level
368
- const int level = max(mip_from_pos(x, y, z, C), mip_from_dt(dt, H, C)); // range in [0, C - 1]
369
-
370
- const float mip_bound = fminf(scalbnf(1.0f, level), bound);
371
- const float mip_rbound = 1 / mip_bound;
372
-
373
- // convert to nearest grid position
374
- const int nx = clamp(0.5 * (x * mip_rbound + 1) * H, 0.0f, (float)(H - 1));
375
- const int ny = clamp(0.5 * (y * mip_rbound + 1) * H, 0.0f, (float)(H - 1));
376
- const int nz = clamp(0.5 * (z * mip_rbound + 1) * H, 0.0f, (float)(H - 1));
377
-
378
- const uint32_t index = level * H3 + __morton3D(nx, ny, nz);
379
- const bool occ = grid[index / 8] & (1 << (index % 8));
380
-
381
- // if occpuied, advance a small step, and write to output
382
- //if (n == 0) printf("t=%f density=%f vs thresh=%f step=%d\n", t, density, density_thresh, num_steps);
383
-
384
- if (occ) {
385
- num_steps++;
386
- t += dt;
387
- // else, skip a large step (basically skip a voxel grid)
388
- } else {
389
- // calc distance to next voxel
390
- const float tx = (((nx + 0.5f + 0.5f * signf(dx)) * rH * 2 - 1) * mip_bound - x) * rdx;
391
- const float ty = (((ny + 0.5f + 0.5f * signf(dy)) * rH * 2 - 1) * mip_bound - y) * rdy;
392
- const float tz = (((nz + 0.5f + 0.5f * signf(dz)) * rH * 2 - 1) * mip_bound - z) * rdz;
393
-
394
- const float tt = t + fmaxf(0.0f, fminf(tx, fminf(ty, tz)));
395
- // step until next voxel
396
- do {
397
- t += clamp(t * dt_gamma, dt_min, dt_max);
398
- } while (t < tt);
399
- }
400
- }
401
-
402
- //printf("[n=%d] num_steps=%d, near=%f, far=%f, dt=%f, max_steps=%f\n", n, num_steps, near, far, dt_min, (far - near) / dt_min);
403
-
404
- // second pass: really locate and write points & dirs
405
- uint32_t point_index = atomicAdd(counter, num_steps);
406
- uint32_t ray_index = atomicAdd(counter + 1, 1);
407
-
408
- //printf("[n=%d] num_steps=%d, point_index=%d, ray_index=%d\n", n, num_steps, point_index, ray_index);
409
-
410
- // write rays
411
- rays[ray_index * 3] = n;
412
- rays[ray_index * 3 + 1] = point_index;
413
- rays[ray_index * 3 + 2] = num_steps;
414
-
415
- if (num_steps == 0) return;
416
- if (point_index + num_steps > M) return;
417
-
418
- xyzs += point_index * 3;
419
- dirs += point_index * 3;
420
- deltas += point_index * 2;
421
-
422
- t = t0;
423
- uint32_t step = 0;
424
-
425
- float last_t = t;
426
-
427
- while (t < far && step < num_steps) {
428
- // current point
429
- const float x = clamp(ox + t * dx, -bound, bound);
430
- const float y = clamp(oy + t * dy, -bound, bound);
431
- const float z = clamp(oz + t * dz, -bound, bound);
432
-
433
- const float dt = clamp(t * dt_gamma, dt_min, dt_max);
434
-
435
- // get mip level
436
- const int level = max(mip_from_pos(x, y, z, C), mip_from_dt(dt, H, C)); // range in [0, C - 1]
437
-
438
- const float mip_bound = fminf(scalbnf(1.0f, level), bound);
439
- const float mip_rbound = 1 / mip_bound;
440
-
441
- // convert to nearest grid position
442
- const int nx = clamp(0.5 * (x * mip_rbound + 1) * H, 0.0f, (float)(H - 1));
443
- const int ny = clamp(0.5 * (y * mip_rbound + 1) * H, 0.0f, (float)(H - 1));
444
- const int nz = clamp(0.5 * (z * mip_rbound + 1) * H, 0.0f, (float)(H - 1));
445
-
446
- // query grid
447
- const uint32_t index = level * H3 + __morton3D(nx, ny, nz);
448
- const bool occ = grid[index / 8] & (1 << (index % 8));
449
-
450
- // if occpuied, advance a small step, and write to output
451
- if (occ) {
452
- // write step
453
- xyzs[0] = x;
454
- xyzs[1] = y;
455
- xyzs[2] = z;
456
- dirs[0] = dx;
457
- dirs[1] = dy;
458
- dirs[2] = dz;
459
- t += dt;
460
- deltas[0] = dt;
461
- deltas[1] = t - last_t; // used to calc depth
462
- last_t = t;
463
- xyzs += 3;
464
- dirs += 3;
465
- deltas += 2;
466
- step++;
467
- // else, skip a large step (basically skip a voxel grid)
468
- } else {
469
- // calc distance to next voxel
470
- const float tx = (((nx + 0.5f + 0.5f * signf(dx)) * rH * 2 - 1) * mip_bound - x) * rdx;
471
- const float ty = (((ny + 0.5f + 0.5f * signf(dy)) * rH * 2 - 1) * mip_bound - y) * rdy;
472
- const float tz = (((nz + 0.5f + 0.5f * signf(dz)) * rH * 2 - 1) * mip_bound - z) * rdz;
473
- const float tt = t + fmaxf(0.0f, fminf(tx, fminf(ty, tz)));
474
- // step until next voxel
475
- do {
476
- t += clamp(t * dt_gamma, dt_min, dt_max);
477
- } while (t < tt);
478
- }
479
- }
480
- }
481
-
482
- void march_rays_train(const at::Tensor rays_o, const at::Tensor rays_d, const at::Tensor grid, const float bound, const float dt_gamma, const uint32_t max_steps, const uint32_t N, const uint32_t C, const uint32_t H, const uint32_t M, const at::Tensor nears, const at::Tensor fars, at::Tensor xyzs, at::Tensor dirs, at::Tensor deltas, at::Tensor rays, at::Tensor counter, at::Tensor noises) {
483
-
484
- static constexpr uint32_t N_THREAD = 128;
485
-
486
- AT_DISPATCH_FLOATING_TYPES_AND_HALF(
487
- rays_o.scalar_type(), "march_rays_train", ([&] {
488
- kernel_march_rays_train<<<div_round_up(N, N_THREAD), N_THREAD>>>(rays_o.data_ptr<scalar_t>(), rays_d.data_ptr<scalar_t>(), grid.data_ptr<uint8_t>(), bound, dt_gamma, max_steps, N, C, H, M, nears.data_ptr<scalar_t>(), fars.data_ptr<scalar_t>(), xyzs.data_ptr<scalar_t>(), dirs.data_ptr<scalar_t>(), deltas.data_ptr<scalar_t>(), rays.data_ptr<int>(), counter.data_ptr<int>(), noises.data_ptr<scalar_t>());
489
- }));
490
- }
491
-
492
-
493
- // sigmas: [M]
494
- // rgbs: [M, 3]
495
- // deltas: [M, 2]
496
- // rays: [N, 3], idx, offset, num_steps
497
- // weights_sum: [N], final pixel alpha
498
- // depth: [N,]
499
- // image: [N, 3]
500
- template <typename scalar_t>
501
- __global__ void kernel_composite_rays_train_forward(
502
- const scalar_t * __restrict__ sigmas,
503
- const scalar_t * __restrict__ rgbs,
504
- const scalar_t * __restrict__ deltas,
505
- const int * __restrict__ rays,
506
- const uint32_t M, const uint32_t N, const float T_thresh,
507
- scalar_t * weights_sum,
508
- scalar_t * depth,
509
- scalar_t * image
510
- ) {
511
- // parallel per ray
512
- const uint32_t n = threadIdx.x + blockIdx.x * blockDim.x;
513
- if (n >= N) return;
514
-
515
- // locate
516
- uint32_t index = rays[n * 3];
517
- uint32_t offset = rays[n * 3 + 1];
518
- uint32_t num_steps = rays[n * 3 + 2];
519
-
520
- // empty ray, or ray that exceed max step count.
521
- if (num_steps == 0 || offset + num_steps > M) {
522
- weights_sum[index] = 0;
523
- depth[index] = 0;
524
- image[index * 3] = 0;
525
- image[index * 3 + 1] = 0;
526
- image[index * 3 + 2] = 0;
527
- return;
528
- }
529
-
530
- sigmas += offset;
531
- rgbs += offset * 3;
532
- deltas += offset * 2;
533
-
534
- // accumulate
535
- uint32_t step = 0;
536
-
537
- scalar_t T = 1.0f;
538
- scalar_t r = 0, g = 0, b = 0, ws = 0, t = 0, d = 0;
539
-
540
- while (step < num_steps) {
541
-
542
- const scalar_t alpha = 1.0f - __expf(- sigmas[0] * deltas[0]);
543
- const scalar_t weight = alpha * T;
544
-
545
- r += weight * rgbs[0];
546
- g += weight * rgbs[1];
547
- b += weight * rgbs[2];
548
-
549
- t += deltas[1]; // real delta
550
- d += weight * t;
551
-
552
- ws += weight;
553
-
554
- T *= 1.0f - alpha;
555
-
556
- // minimal remained transmittence
557
- if (T < T_thresh) break;
558
-
559
- //printf("[n=%d] num_steps=%d, alpha=%f, w=%f, T=%f, sum_dt=%f, d=%f\n", n, step, alpha, weight, T, sum_delta, d);
560
-
561
- // locate
562
- sigmas++;
563
- rgbs += 3;
564
- deltas += 2;
565
-
566
- step++;
567
- }
568
-
569
- //printf("[n=%d] rgb=(%f, %f, %f), d=%f\n", n, r, g, b, d);
570
-
571
- // write
572
- weights_sum[index] = ws; // weights_sum
573
- depth[index] = d;
574
- image[index * 3] = r;
575
- image[index * 3 + 1] = g;
576
- image[index * 3 + 2] = b;
577
- }
578
-
579
-
580
- void composite_rays_train_forward(const at::Tensor sigmas, const at::Tensor rgbs, const at::Tensor deltas, const at::Tensor rays, const uint32_t M, const uint32_t N, const float T_thresh, at::Tensor weights_sum, at::Tensor depth, at::Tensor image) {
581
-
582
- static constexpr uint32_t N_THREAD = 128;
583
-
584
- AT_DISPATCH_FLOATING_TYPES_AND_HALF(
585
- sigmas.scalar_type(), "composite_rays_train_forward", ([&] {
586
- kernel_composite_rays_train_forward<<<div_round_up(N, N_THREAD), N_THREAD>>>(sigmas.data_ptr<scalar_t>(), rgbs.data_ptr<scalar_t>(), deltas.data_ptr<scalar_t>(), rays.data_ptr<int>(), M, N, T_thresh, weights_sum.data_ptr<scalar_t>(), depth.data_ptr<scalar_t>(), image.data_ptr<scalar_t>());
587
- }));
588
- }
589
-
590
-
591
- // grad_weights_sum: [N,]
592
- // grad: [N, 3]
593
- // sigmas: [M]
594
- // rgbs: [M, 3]
595
- // deltas: [M, 2]
596
- // rays: [N, 3], idx, offset, num_steps
597
- // weights_sum: [N,], weights_sum here
598
- // image: [N, 3]
599
- // grad_sigmas: [M]
600
- // grad_rgbs: [M, 3]
601
- template <typename scalar_t>
602
- __global__ void kernel_composite_rays_train_backward(
603
- const scalar_t * __restrict__ grad_weights_sum,
604
- const scalar_t * __restrict__ grad_image,
605
- const scalar_t * __restrict__ sigmas,
606
- const scalar_t * __restrict__ rgbs,
607
- const scalar_t * __restrict__ deltas,
608
- const int * __restrict__ rays,
609
- const scalar_t * __restrict__ weights_sum,
610
- const scalar_t * __restrict__ image,
611
- const uint32_t M, const uint32_t N, const float T_thresh,
612
- scalar_t * grad_sigmas,
613
- scalar_t * grad_rgbs
614
- ) {
615
- // parallel per ray
616
- const uint32_t n = threadIdx.x + blockIdx.x * blockDim.x;
617
- if (n >= N) return;
618
-
619
- // locate
620
- uint32_t index = rays[n * 3];
621
- uint32_t offset = rays[n * 3 + 1];
622
- uint32_t num_steps = rays[n * 3 + 2];
623
-
624
- if (num_steps == 0 || offset + num_steps > M) return;
625
-
626
- grad_weights_sum += index;
627
- grad_image += index * 3;
628
- weights_sum += index;
629
- image += index * 3;
630
- sigmas += offset;
631
- rgbs += offset * 3;
632
- deltas += offset * 2;
633
- grad_sigmas += offset;
634
- grad_rgbs += offset * 3;
635
-
636
- // accumulate
637
- uint32_t step = 0;
638
-
639
- scalar_t T = 1.0f;
640
- const scalar_t r_final = image[0], g_final = image[1], b_final = image[2], ws_final = weights_sum[0];
641
- scalar_t r = 0, g = 0, b = 0, ws = 0;
642
-
643
- while (step < num_steps) {
644
-
645
- const scalar_t alpha = 1.0f - __expf(- sigmas[0] * deltas[0]);
646
- const scalar_t weight = alpha * T;
647
-
648
- r += weight * rgbs[0];
649
- g += weight * rgbs[1];
650
- b += weight * rgbs[2];
651
- ws += weight;
652
-
653
- T *= 1.0f - alpha;
654
-
655
- // check https://note.kiui.moe/others/nerf_gradient/ for the gradient calculation.
656
- // write grad_rgbs
657
- grad_rgbs[0] = grad_image[0] * weight;
658
- grad_rgbs[1] = grad_image[1] * weight;
659
- grad_rgbs[2] = grad_image[2] * weight;
660
-
661
- // write grad_sigmas
662
- grad_sigmas[0] = deltas[0] * (
663
- grad_image[0] * (T * rgbs[0] - (r_final - r)) +
664
- grad_image[1] * (T * rgbs[1] - (g_final - g)) +
665
- grad_image[2] * (T * rgbs[2] - (b_final - b)) +
666
- grad_weights_sum[0] * (1 - ws_final)
667
- );
668
-
669
- //printf("[n=%d] num_steps=%d, T=%f, grad_sigmas=%f, r_final=%f, r=%f\n", n, step, T, grad_sigmas[0], r_final, r);
670
- // minimal remained transmittence
671
- if (T < T_thresh) break;
672
-
673
- // locate
674
- sigmas++;
675
- rgbs += 3;
676
- deltas += 2;
677
- grad_sigmas++;
678
- grad_rgbs += 3;
679
-
680
- step++;
681
- }
682
- }
683
-
684
-
685
- void composite_rays_train_backward(const at::Tensor grad_weights_sum, const at::Tensor grad_image, const at::Tensor sigmas, const at::Tensor rgbs, const at::Tensor deltas, const at::Tensor rays, const at::Tensor weights_sum, const at::Tensor image, const uint32_t M, const uint32_t N, const float T_thresh, at::Tensor grad_sigmas, at::Tensor grad_rgbs) {
686
-
687
- static constexpr uint32_t N_THREAD = 128;
688
-
689
- AT_DISPATCH_FLOATING_TYPES_AND_HALF(
690
- grad_image.scalar_type(), "composite_rays_train_backward", ([&] {
691
- kernel_composite_rays_train_backward<<<div_round_up(N, N_THREAD), N_THREAD>>>(grad_weights_sum.data_ptr<scalar_t>(), grad_image.data_ptr<scalar_t>(), sigmas.data_ptr<scalar_t>(), rgbs.data_ptr<scalar_t>(), deltas.data_ptr<scalar_t>(), rays.data_ptr<int>(), weights_sum.data_ptr<scalar_t>(), image.data_ptr<scalar_t>(), M, N, T_thresh, grad_sigmas.data_ptr<scalar_t>(), grad_rgbs.data_ptr<scalar_t>());
692
- }));
693
- }
694
-
695
-
696
- ////////////////////////////////////////////////////
697
- ///////////// infernce /////////////
698
- ////////////////////////////////////////////////////
699
-
700
- template <typename scalar_t>
701
- __global__ void kernel_march_rays(
702
- const uint32_t n_alive,
703
- const uint32_t n_step,
704
- const int* __restrict__ rays_alive,
705
- const scalar_t* __restrict__ rays_t,
706
- const scalar_t* __restrict__ rays_o,
707
- const scalar_t* __restrict__ rays_d,
708
- const float bound,
709
- const float dt_gamma, const uint32_t max_steps,
710
- const uint32_t C, const uint32_t H,
711
- const uint8_t * __restrict__ grid,
712
- const scalar_t* __restrict__ nears,
713
- const scalar_t* __restrict__ fars,
714
- scalar_t* xyzs, scalar_t* dirs, scalar_t* deltas,
715
- const scalar_t* __restrict__ noises
716
- ) {
717
- const uint32_t n = threadIdx.x + blockIdx.x * blockDim.x;
718
- if (n >= n_alive) return;
719
-
720
- const int index = rays_alive[n]; // ray id
721
- const float noise = noises[n];
722
-
723
- // locate
724
- rays_o += index * 3;
725
- rays_d += index * 3;
726
- xyzs += n * n_step * 3;
727
- dirs += n * n_step * 3;
728
- deltas += n * n_step * 2;
729
-
730
- const float ox = rays_o[0], oy = rays_o[1], oz = rays_o[2];
731
- const float dx = rays_d[0], dy = rays_d[1], dz = rays_d[2];
732
- const float rdx = 1 / dx, rdy = 1 / dy, rdz = 1 / dz;
733
- const float rH = 1 / (float)H;
734
- const float H3 = H * H * H;
735
-
736
- float t = rays_t[index]; // current ray's t
737
- const float near = nears[index], far = fars[index];
738
-
739
- const float dt_min = 2 * SQRT3() / max_steps;
740
- const float dt_max = 2 * SQRT3() * (1 << (C - 1)) / H;
741
-
742
- // march for n_step steps, record points
743
- uint32_t step = 0;
744
-
745
- // introduce some randomness
746
- t += clamp(t * dt_gamma, dt_min, dt_max) * noise;
747
-
748
- float last_t = t;
749
-
750
- while (t < far && step < n_step) {
751
- // current point
752
- const float x = clamp(ox + t * dx, -bound, bound);
753
- const float y = clamp(oy + t * dy, -bound, bound);
754
- const float z = clamp(oz + t * dz, -bound, bound);
755
-
756
- const float dt = clamp(t * dt_gamma, dt_min, dt_max);
757
-
758
- // get mip level
759
- const int level = max(mip_from_pos(x, y, z, C), mip_from_dt(dt, H, C)); // range in [0, C - 1]
760
-
761
- const float mip_bound = fminf(scalbnf(1, level), bound);
762
- const float mip_rbound = 1 / mip_bound;
763
-
764
- // convert to nearest grid position
765
- const int nx = clamp(0.5 * (x * mip_rbound + 1) * H, 0.0f, (float)(H - 1));
766
- const int ny = clamp(0.5 * (y * mip_rbound + 1) * H, 0.0f, (float)(H - 1));
767
- const int nz = clamp(0.5 * (z * mip_rbound + 1) * H, 0.0f, (float)(H - 1));
768
-
769
- const uint32_t index = level * H3 + __morton3D(nx, ny, nz);
770
- const bool occ = grid[index / 8] & (1 << (index % 8));
771
-
772
- // if occpuied, advance a small step, and write to output
773
- if (occ) {
774
- // write step
775
- xyzs[0] = x;
776
- xyzs[1] = y;
777
- xyzs[2] = z;
778
- dirs[0] = dx;
779
- dirs[1] = dy;
780
- dirs[2] = dz;
781
- // calc dt
782
- t += dt;
783
- deltas[0] = dt;
784
- deltas[1] = t - last_t; // used to calc depth
785
- last_t = t;
786
- // step
787
- xyzs += 3;
788
- dirs += 3;
789
- deltas += 2;
790
- step++;
791
-
792
- // else, skip a large step (basically skip a voxel grid)
793
- } else {
794
- // calc distance to next voxel
795
- const float tx = (((nx + 0.5f + 0.5f * signf(dx)) * rH * 2 - 1) * mip_bound - x) * rdx;
796
- const float ty = (((ny + 0.5f + 0.5f * signf(dy)) * rH * 2 - 1) * mip_bound - y) * rdy;
797
- const float tz = (((nz + 0.5f + 0.5f * signf(dz)) * rH * 2 - 1) * mip_bound - z) * rdz;
798
- const float tt = t + fmaxf(0.0f, fminf(tx, fminf(ty, tz)));
799
- // step until next voxel
800
- do {
801
- t += clamp(t * dt_gamma, dt_min, dt_max);
802
- } while (t < tt);
803
- }
804
- }
805
- }
806
-
807
-
808
- void march_rays(const uint32_t n_alive, const uint32_t n_step, const at::Tensor rays_alive, const at::Tensor rays_t, const at::Tensor rays_o, const at::Tensor rays_d, const float bound, const float dt_gamma, const uint32_t max_steps, const uint32_t C, const uint32_t H, const at::Tensor grid, const at::Tensor near, const at::Tensor far, at::Tensor xyzs, at::Tensor dirs, at::Tensor deltas, at::Tensor noises) {
809
- static constexpr uint32_t N_THREAD = 128;
810
-
811
- AT_DISPATCH_FLOATING_TYPES_AND_HALF(
812
- rays_o.scalar_type(), "march_rays", ([&] {
813
- kernel_march_rays<<<div_round_up(n_alive, N_THREAD), N_THREAD>>>(n_alive, n_step, rays_alive.data_ptr<int>(), rays_t.data_ptr<scalar_t>(), rays_o.data_ptr<scalar_t>(), rays_d.data_ptr<scalar_t>(), bound, dt_gamma, max_steps, C, H, grid.data_ptr<uint8_t>(), near.data_ptr<scalar_t>(), far.data_ptr<scalar_t>(), xyzs.data_ptr<scalar_t>(), dirs.data_ptr<scalar_t>(), deltas.data_ptr<scalar_t>(), noises.data_ptr<scalar_t>());
814
- }));
815
- }
816
-
817
-
818
- template <typename scalar_t>
819
- __global__ void kernel_composite_rays(
820
- const uint32_t n_alive,
821
- const uint32_t n_step,
822
- const float T_thresh,
823
- int* rays_alive,
824
- scalar_t* rays_t,
825
- const scalar_t* __restrict__ sigmas,
826
- const scalar_t* __restrict__ rgbs,
827
- const scalar_t* __restrict__ deltas,
828
- scalar_t* weights_sum, scalar_t* depth, scalar_t* image
829
- ) {
830
- const uint32_t n = threadIdx.x + blockIdx.x * blockDim.x;
831
- if (n >= n_alive) return;
832
-
833
- const int index = rays_alive[n]; // ray id
834
-
835
- // locate
836
- sigmas += n * n_step;
837
- rgbs += n * n_step * 3;
838
- deltas += n * n_step * 2;
839
-
840
- rays_t += index;
841
- weights_sum += index;
842
- depth += index;
843
- image += index * 3;
844
-
845
- scalar_t t = rays_t[0]; // current ray's t
846
-
847
- scalar_t weight_sum = weights_sum[0];
848
- scalar_t d = depth[0];
849
- scalar_t r = image[0];
850
- scalar_t g = image[1];
851
- scalar_t b = image[2];
852
-
853
- // accumulate
854
- uint32_t step = 0;
855
- while (step < n_step) {
856
-
857
- // ray is terminated if delta == 0
858
- if (deltas[0] == 0) break;
859
-
860
- const scalar_t alpha = 1.0f - __expf(- sigmas[0] * deltas[0]);
861
-
862
- /*
863
- T_0 = 1; T_i = \prod_{j=0}^{i-1} (1 - alpha_j)
864
- w_i = alpha_i * T_i
865
- -->
866
- T_i = 1 - \sum_{j=0}^{i-1} w_j
867
- */
868
- const scalar_t T = 1 - weight_sum;
869
- const scalar_t weight = alpha * T;
870
- weight_sum += weight;
871
-
872
- t += deltas[1]; // real delta
873
- d += weight * t;
874
- r += weight * rgbs[0];
875
- g += weight * rgbs[1];
876
- b += weight * rgbs[2];
877
-
878
- //printf("[n=%d] num_steps=%d, alpha=%f, w=%f, T=%f, sum_dt=%f, d=%f\n", n, step, alpha, weight, T, sum_delta, d);
879
-
880
- // ray is terminated if T is too small
881
- // use a larger bound to further accelerate inference
882
- if (T < T_thresh) break;
883
-
884
- // locate
885
- sigmas++;
886
- rgbs += 3;
887
- deltas += 2;
888
- step++;
889
- }
890
-
891
- //printf("[n=%d] rgb=(%f, %f, %f), d=%f\n", n, r, g, b, d);
892
-
893
- // rays_alive = -1 means ray is terminated early.
894
- if (step < n_step) {
895
- rays_alive[n] = -1;
896
- } else {
897
- rays_t[0] = t;
898
- }
899
-
900
- weights_sum[0] = weight_sum; // this is the thing I needed!
901
- depth[0] = d;
902
- image[0] = r;
903
- image[1] = g;
904
- image[2] = b;
905
- }
906
-
907
-
908
- void composite_rays(const uint32_t n_alive, const uint32_t n_step, const float T_thresh, at::Tensor rays_alive, at::Tensor rays_t, at::Tensor sigmas, at::Tensor rgbs, at::Tensor deltas, at::Tensor weights, at::Tensor depth, at::Tensor image) {
909
- static constexpr uint32_t N_THREAD = 128;
910
- AT_DISPATCH_FLOATING_TYPES_AND_HALF(
911
- image.scalar_type(), "composite_rays", ([&] {
912
- kernel_composite_rays<<<div_round_up(n_alive, N_THREAD), N_THREAD>>>(n_alive, n_step, T_thresh, rays_alive.data_ptr<int>(), rays_t.data_ptr<scalar_t>(), sigmas.data_ptr<scalar_t>(), rgbs.data_ptr<scalar_t>(), deltas.data_ptr<scalar_t>(), weights.data_ptr<scalar_t>(), depth.data_ptr<scalar_t>(), image.data_ptr<scalar_t>());
913
- }));
914
- }
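Editor's note: the per-ray accumulation performed by kernel_composite_rays above is ordinary front-to-back alpha compositing. The sketch below is an illustrative PyTorch transcription for a single ray, not the compiled kernel's API; the function name, tensor layout, and zero-initialised accumulators are assumptions (the kernel resumes from previously accumulated values when called repeatedly).

import torch

def composite_ray_reference(sigmas, rgbs, deltas, T_thresh=1e-4):
    # sigmas: [S], rgbs: [S, 3], deltas: [S, 2]; deltas[:, 0] is the marching
    # step dt used for opacity, deltas[:, 1] the spacing used to advance depth.
    weight_sum, depth, t = 0.0, 0.0, 0.0
    color = torch.zeros(3)
    for s in range(sigmas.shape[0]):
        if deltas[s, 0] == 0:                      # ray already terminated upstream
            break
        alpha = 1.0 - torch.exp(-sigmas[s] * deltas[s, 0])
        T = 1.0 - weight_sum                       # remaining transmittance
        w = alpha * T
        weight_sum = weight_sum + w
        t = t + deltas[s, 1]
        depth = depth + w * t
        color = color + w * rgbs[s]
        if T < T_thresh:                           # early termination, as in the kernel
            break
    return weight_sum, depth, color

# e.g. composite_ray_reference(torch.rand(8), torch.rand(8, 3), torch.full((8, 2), 0.01))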
 
 
3drecon/raymarching/src/raymarching.h DELETED
@@ -1,18 +0,0 @@
1
- #pragma once
2
-
3
- #include <stdint.h>
4
- #include <torch/torch.h>
5
-
6
-
7
- void near_far_from_aabb(const at::Tensor rays_o, const at::Tensor rays_d, const at::Tensor aabb, const uint32_t N, const float min_near, at::Tensor nears, at::Tensor fars);
8
- void sph_from_ray(const at::Tensor rays_o, const at::Tensor rays_d, const float radius, const uint32_t N, at::Tensor coords);
9
- void morton3D(const at::Tensor coords, const uint32_t N, at::Tensor indices);
10
- void morton3D_invert(const at::Tensor indices, const uint32_t N, at::Tensor coords);
11
- void packbits(const at::Tensor grid, const uint32_t N, const float density_thresh, at::Tensor bitfield);
12
-
13
- void march_rays_train(const at::Tensor rays_o, const at::Tensor rays_d, const at::Tensor grid, const float bound, const float dt_gamma, const uint32_t max_steps, const uint32_t N, const uint32_t C, const uint32_t H, const uint32_t M, const at::Tensor nears, const at::Tensor fars, at::Tensor xyzs, at::Tensor dirs, at::Tensor deltas, at::Tensor rays, at::Tensor counter, at::Tensor noises);
14
- void composite_rays_train_forward(const at::Tensor sigmas, const at::Tensor rgbs, const at::Tensor deltas, const at::Tensor rays, const uint32_t M, const uint32_t N, const float T_thresh, at::Tensor weights_sum, at::Tensor depth, at::Tensor image);
15
- void composite_rays_train_backward(const at::Tensor grad_weights_sum, const at::Tensor grad_image, const at::Tensor sigmas, const at::Tensor rgbs, const at::Tensor deltas, const at::Tensor rays, const at::Tensor weights_sum, const at::Tensor image, const uint32_t M, const uint32_t N, const float T_thresh, at::Tensor grad_sigmas, at::Tensor grad_rgbs);
16
-
17
- void march_rays(const uint32_t n_alive, const uint32_t n_step, const at::Tensor rays_alive, const at::Tensor rays_t, const at::Tensor rays_o, const at::Tensor rays_d, const float bound, const float dt_gamma, const uint32_t max_steps, const uint32_t C, const uint32_t H, const at::Tensor grid, const at::Tensor nears, const at::Tensor fars, at::Tensor xyzs, at::Tensor dirs, at::Tensor deltas, at::Tensor noises);
18
- void composite_rays(const uint32_t n_alive, const uint32_t n_step, const float T_thresh, at::Tensor rays_alive, at::Tensor rays_t, at::Tensor sigmas, at::Tensor rgbs, at::Tensor deltas, at::Tensor weights_sum, at::Tensor depth, at::Tensor image);
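Editor's note: the header above only declares the CUDA entry points. As a rough reference for what near_far_from_aabb computes, here is a small PyTorch sketch of the standard slab test that intersects each ray with an axis-aligned box and clamps the entry distance to min_near. The function name, the handling of rays that miss the box, and the tensor layout are assumptions for illustration, not a description of the extension's exact corner-case behaviour.

import torch

def near_far_from_aabb_reference(rays_o, rays_d, aabb, min_near=0.2):
    # rays_o, rays_d: [N, 3]; aabb: [6] = (xmin, ymin, zmin, xmax, ymax, zmax)
    eps = 1e-15
    d = torch.where(rays_d.abs() < eps, torch.full_like(rays_d, eps), rays_d)
    t0 = (aabb[:3] - rays_o) / d           # distances to the three "min" planes
    t1 = (aabb[3:] - rays_o) / d           # distances to the three "max" planes
    tmin = torch.minimum(t0, t1).amax(-1)  # latest entry across the three slabs
    tmax = torch.maximum(t0, t1).amin(-1)  # earliest exit
    nears = tmin.clamp(min=min_near)
    fars = torch.maximum(tmax, nears)      # rays that miss get an empty interval
    return nears, fars

# e.g. near_far_from_aabb_reference(torch.zeros(4, 3), torch.randn(4, 3),
#                                   torch.tensor([-1., -1., -1., 1., 1., 1.]))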
 
 
3drecon/renderer/agg_net.py DELETED
@@ -1,83 +0,0 @@
1
- import torch.nn.functional as F
2
- import torch.nn as nn
3
- import torch
4
-
5
- def weights_init(m):
6
- if isinstance(m, nn.Linear):
7
- nn.init.kaiming_normal_(m.weight.data)
8
- if m.bias is not None:
9
- nn.init.zeros_(m.bias.data)
10
-
11
- class NeRF(nn.Module):
12
- def __init__(self, vol_n=8+8, feat_ch=8+16+32+3, hid_n=64):
13
- super(NeRF, self).__init__()
14
- self.hid_n = hid_n
15
- self.agg = Agg(feat_ch)
16
- self.lr0 = nn.Sequential(nn.Linear(vol_n+16, hid_n), nn.ReLU())
17
- self.sigma = nn.Sequential(nn.Linear(hid_n, 1), nn.Softplus())
18
- self.color = nn.Sequential(
19
- nn.Linear(16+vol_n+feat_ch+hid_n+4, hid_n), # agg_feats+vox_feat+img_feat+lr0_feats+dir
20
- nn.ReLU(),
21
- nn.Linear(hid_n, 1)
22
- )
23
- self.lr0.apply(weights_init)
24
- self.sigma.apply(weights_init)
25
- self.color.apply(weights_init)
26
-
27
- def forward(self, vox_feat, img_feat_rgb_dir, source_img_mask):
28
- # assert torch.sum(torch.sum(source_img_mask,1)<2)==0
29
- b, d, n, _ = img_feat_rgb_dir.shape # b,d,n,f=8+16+32+3+4
30
- agg_feat = self.agg(img_feat_rgb_dir, source_img_mask) # b,d,f=16
31
- x = self.lr0(torch.cat((vox_feat, agg_feat), dim=-1)) # b,d,f=64
32
- sigma = self.sigma(x) # b,d,1
33
-
34
- x = torch.cat((x, vox_feat, agg_feat), dim=-1) # b,d,f=16+16+64
35
- x = x.view(b, d, 1, x.shape[-1]).repeat(1, 1, n, 1)
36
- x = torch.cat((x, img_feat_rgb_dir), dim=-1)
37
- logits = self.color(x)
38
- source_img_mask_ = source_img_mask.reshape(b, 1, n, 1).repeat(1, logits.shape[1], 1, 1) == 0
39
- logits[source_img_mask_] = -1e7
40
- color_weight = F.softmax(logits, dim=-2)
41
- color = torch.sum((img_feat_rgb_dir[..., -7:-4] * color_weight), dim=-2)
42
- return color, sigma
43
-
44
- class Agg(nn.Module):
45
- def __init__(self, feat_ch):
46
- super(Agg, self).__init__()
47
- self.feat_ch = feat_ch
48
- self.view_fc = nn.Sequential(nn.Linear(4, feat_ch), nn.ReLU())
49
- self.view_fc.apply(weights_init)
50
- self.global_fc = nn.Sequential(nn.Linear(feat_ch*3, 32), nn.ReLU())
51
-
52
- self.agg_w_fc = nn.Linear(32, 1)
53
- self.fc = nn.Linear(32, 16)
54
- self.global_fc.apply(weights_init)
55
- self.agg_w_fc.apply(weights_init)
56
- self.fc.apply(weights_init)
57
-
58
- def masked_mean_var(self, img_feat_rgb, source_img_mask):
59
- # img_feat_rgb: b,d,n,f source_img_mask: b,n
60
- b, n = source_img_mask.shape
61
- source_img_mask = source_img_mask.view(b, 1, n, 1)
62
- mean = torch.sum(source_img_mask * img_feat_rgb, dim=-2)/ (torch.sum(source_img_mask, dim=-2) + 1e-5)
63
- var = torch.sum((img_feat_rgb - mean.unsqueeze(-2)) ** 2 * source_img_mask, dim=-2) / (torch.sum(source_img_mask, dim=-2) + 1e-5)
64
- return mean, var
65
-
66
- def forward(self, img_feat_rgb_dir, source_img_mask):
67
- # img_feat_rgb_dir b,d,n,f
68
- b, d, n, _ = img_feat_rgb_dir.shape
69
- view_feat = self.view_fc(img_feat_rgb_dir[..., -4:]) # b,d,n,f-4
70
- img_feat_rgb = img_feat_rgb_dir[..., :-4] + view_feat
71
-
72
- mean_feat, var_feat = self.masked_mean_var(img_feat_rgb, source_img_mask)
73
- var_feat = var_feat.view(b, -1, 1, self.feat_ch).repeat(1, 1, n, 1)
74
- avg_feat = mean_feat.view(b, -1, 1, self.feat_ch).repeat(1, 1, n, 1)
75
-
76
- feat = torch.cat([img_feat_rgb, var_feat, avg_feat], dim=-1) # b,d,n,f
77
- global_feat = self.global_fc(feat) # b,d,n,f
78
- logits = self.agg_w_fc(global_feat) # b,d,n,1
79
- source_img_mask_ = source_img_mask.reshape(b, 1, n, 1).repeat(1, logits.shape[1], 1, 1) == 0
80
- logits[source_img_mask_] = -1e7
81
- agg_w = F.softmax(logits, dim=-2)
82
- im_feat = (global_feat * agg_w).sum(dim=-2)
83
- return self.fc(im_feat)
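Editor's note: a quick shape check for the aggregation network above, assuming the default channel layout noted in its comments (8+16+32+3 per-view image features plus a 4-d direction encoding) and assuming 3drecon/ is on PYTHONPATH so the module imports as renderer.agg_net.

import torch
from renderer.agg_net import NeRF               # assumed import path within 3drecon/

b, d, n = 2, 64, 4                              # batch, depth samples, source views
net = NeRF(vol_n=16, feat_ch=8 + 16 + 32 + 3)   # matches the defaults above
vox_feat = torch.randn(b, d, 16)
img_feat_rgb_dir = torch.randn(b, d, n, 8 + 16 + 32 + 3 + 4)
source_img_mask = torch.ones(b, n)              # every source view valid
color, sigma = net(vox_feat, img_feat_rgb_dir, source_img_mask)
print(color.shape, sigma.shape)                 # torch.Size([2, 64, 3]) torch.Size([2, 64, 1])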
 
 
3drecon/renderer/cost_reg_net.py DELETED
@@ -1,95 +0,0 @@
1
- import torch.nn as nn
2
-
3
- class ConvBnReLU3D(nn.Module):
4
- def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, pad=1, norm_act=nn.BatchNorm3d):
5
- super(ConvBnReLU3D, self).__init__()
6
- self.conv = nn.Conv3d(in_channels, out_channels, kernel_size, stride=stride, padding=pad, bias=False)
7
- self.bn = norm_act(out_channels)
8
- self.relu = nn.ReLU(inplace=True)
9
-
10
- def forward(self, x):
11
- return self.relu(self.bn(self.conv(x)))
12
-
13
- class CostRegNet(nn.Module):
14
- def __init__(self, in_channels, norm_act=nn.BatchNorm3d):
15
- super(CostRegNet, self).__init__()
16
- self.conv0 = ConvBnReLU3D(in_channels, 8, norm_act=norm_act)
17
-
18
- self.conv1 = ConvBnReLU3D(8, 16, stride=2, norm_act=norm_act)
19
- self.conv2 = ConvBnReLU3D(16, 16, norm_act=norm_act)
20
-
21
- self.conv3 = ConvBnReLU3D(16, 32, stride=2, norm_act=norm_act)
22
- self.conv4 = ConvBnReLU3D(32, 32, norm_act=norm_act)
23
-
24
- self.conv5 = ConvBnReLU3D(32, 64, stride=2, norm_act=norm_act)
25
- self.conv6 = ConvBnReLU3D(64, 64, norm_act=norm_act)
26
-
27
- self.conv7 = nn.Sequential(
28
- nn.ConvTranspose3d(64, 32, 3, padding=1, output_padding=1, stride=2, bias=False),
29
- norm_act(32)
30
- )
31
-
32
- self.conv9 = nn.Sequential(
33
- nn.ConvTranspose3d(32, 16, 3, padding=1, output_padding=1, stride=2, bias=False),
34
- norm_act(16)
35
- )
36
-
37
- self.conv11 = nn.Sequential(
38
- nn.ConvTranspose3d(16, 8, 3, padding=1, output_padding=1,stride=2, bias=False),
39
- norm_act(8)
40
- )
41
- self.depth_conv = nn.Sequential(nn.Conv3d(8, 1, 3, padding=1, bias=False))
42
- self.feat_conv = nn.Sequential(nn.Conv3d(8, 8, 3, padding=1, bias=False))
43
-
44
- def forward(self, x):
45
- conv0 = self.conv0(x)
46
- conv2 = self.conv2(self.conv1(conv0))
47
- conv4 = self.conv4(self.conv3(conv2))
48
- x = self.conv6(self.conv5(conv4))
49
- x = conv4 + self.conv7(x)
50
- del conv4
51
- x = conv2 + self.conv9(x)
52
- del conv2
53
- x = conv0 + self.conv11(x)
54
- del conv0
55
- feat = self.feat_conv(x)
56
- depth = self.depth_conv(x)
57
- return feat, depth
58
-
59
-
60
- class MinCostRegNet(nn.Module):
61
- def __init__(self, in_channels, norm_act=nn.BatchNorm3d):
62
- super(MinCostRegNet, self).__init__()
63
- self.conv0 = ConvBnReLU3D(in_channels, 8, norm_act=norm_act)
64
-
65
- self.conv1 = ConvBnReLU3D(8, 16, stride=2, norm_act=norm_act)
66
- self.conv2 = ConvBnReLU3D(16, 16, norm_act=norm_act)
67
-
68
- self.conv3 = ConvBnReLU3D(16, 32, stride=2, norm_act=norm_act)
69
- self.conv4 = ConvBnReLU3D(32, 32, norm_act=norm_act)
70
-
71
- self.conv9 = nn.Sequential(
72
- nn.ConvTranspose3d(32, 16, 3, padding=1, output_padding=1,
73
- stride=2, bias=False),
74
- norm_act(16))
75
-
76
- self.conv11 = nn.Sequential(
77
- nn.ConvTranspose3d(16, 8, 3, padding=1, output_padding=1,
78
- stride=2, bias=False),
79
- norm_act(8))
80
-
81
- self.depth_conv = nn.Sequential(nn.Conv3d(8, 1, 3, padding=1, bias=False))
82
- self.feat_conv = nn.Sequential(nn.Conv3d(8, 8, 3, padding=1, bias=False))
83
-
84
- def forward(self, x):
85
- conv0 = self.conv0(x)
86
- conv2 = self.conv2(self.conv1(conv0))
87
- conv4 = self.conv4(self.conv3(conv2))
88
- x = conv4
89
- x = conv2 + self.conv9(x)
90
- del conv2
91
- x = conv0 + self.conv11(x)
92
- del conv0
93
- feat = self.feat_conv(x)
94
- depth = self.depth_conv(x)
95
- return feat, depth
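Editor's note: both cost-regularisation networks above are small 3D U-Nets over a feature volume. CostRegNet downsamples three times, so its input needs spatial sizes divisible by 8, while MinCostRegNet only downsamples twice (divisible by 4); both return an 8-channel feature volume and a 1-channel volume at the input resolution. A minimal shape check, with the import path assumed as in the sketch above.

import torch
from renderer.cost_reg_net import CostRegNet, MinCostRegNet  # assumed import path

x = torch.randn(1, 8, 32, 32, 32)        # B, C, D, H, W with D/H/W divisible by 8
feat, depth = CostRegNet(8)(x)           # feat: (1, 8, 32, 32, 32), depth: (1, 1, 32, 32, 32)
feat_m, depth_m = MinCostRegNet(8)(x)    # same output shapes, shallower decoder
print(feat.shape, depth.shape, feat_m.shape, depth_m.shape)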
 
 
3drecon/renderer/dummy_dataset.py DELETED
@@ -1,40 +0,0 @@
1
- import pytorch_lightning as pl
2
- from torch.utils.data import Dataset
3
- import webdataset as wds
4
- from torch.utils.data.distributed import DistributedSampler
5
- class DummyDataset(pl.LightningDataModule):
6
- def __init__(self,seed):
7
- super().__init__()
8
-
9
- def setup(self, stage):
10
- if stage in ['fit']:
11
- self.train_dataset = DummyData(True)
12
- self.val_dataset = DummyData(False)
13
- else:
14
- raise NotImplementedError
15
-
16
- def train_dataloader(self):
17
- return wds.WebLoader(self.train_dataset, batch_size=1, num_workers=0, shuffle=False)
18
-
19
- def val_dataloader(self):
20
- return wds.WebLoader(self.val_dataset, batch_size=1, num_workers=0, shuffle=False)
21
-
22
- def test_dataloader(self):
23
- return wds.WebLoader(DummyData(False))
24
-
25
- class DummyData(Dataset):
26
- def __init__(self,is_train):
27
- self.is_train=is_train
28
-
29
- def __len__(self):
30
- if self.is_train:
31
- return 99999999
32
- else:
33
- return 1
34
-
35
- def __getitem__(self, index):
36
- return {}
37
-
38
-
39
-
40
-
 
 
3drecon/renderer/feature_net.py DELETED
@@ -1,42 +0,0 @@
1
- import torch.nn as nn
2
- import torch.nn.functional as F
3
-
4
- class ConvBnReLU(nn.Module):
5
- def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, pad=1, norm_act=nn.BatchNorm2d):
6
- super(ConvBnReLU, self).__init__()
7
- self.conv = nn.Conv2d(in_channels, out_channels, kernel_size, stride=stride, padding=pad, bias=False)
8
- self.bn = norm_act(out_channels)
9
- self.relu = nn.ReLU(inplace=True)
10
-
11
- def forward(self, x):
12
- return self.relu(self.bn(self.conv(x)))
13
-
14
- class FeatureNet(nn.Module):
15
- def __init__(self, norm_act=nn.BatchNorm2d):
16
- super(FeatureNet, self).__init__()
17
- self.conv0 = nn.Sequential(ConvBnReLU(3, 8, 3, 1, 1, norm_act=norm_act), ConvBnReLU(8, 8, 3, 1, 1, norm_act=norm_act))
18
- self.conv1 = nn.Sequential(ConvBnReLU(8, 16, 5, 2, 2, norm_act=norm_act), ConvBnReLU(16, 16, 3, 1, 1, norm_act=norm_act))
19
- self.conv2 = nn.Sequential(ConvBnReLU(16, 32, 5, 2, 2, norm_act=norm_act), ConvBnReLU(32, 32, 3, 1, 1, norm_act=norm_act))
20
-
21
- self.toplayer = nn.Conv2d(32, 32, 1)
22
- self.lat1 = nn.Conv2d(16, 32, 1)
23
- self.lat0 = nn.Conv2d(8, 32, 1)
24
-
25
- self.smooth1 = nn.Conv2d(32, 16, 3, padding=1)
26
- self.smooth0 = nn.Conv2d(32, 8, 3, padding=1)
27
-
28
- def _upsample_add(self, x, y):
29
- return F.interpolate(x, scale_factor=2, mode='bilinear', align_corners=True) + y
30
-
31
- def forward(self, x):
32
- conv0 = self.conv0(x)
33
- conv1 = self.conv1(conv0)
34
- conv2 = self.conv2(conv1)
35
- feat2 = self.toplayer(conv2)
36
- feat1 = self._upsample_add(feat2, self.lat1(conv1))
37
- feat0 = self._upsample_add(feat1, self.lat0(conv0))
38
- feat1 = self.smooth1(feat1)
39
- feat0 = self.smooth0(feat0)
40
- return feat2, feat1, feat0
41
-
42
-
 
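Editor's note: FeatureNet above is a small feature-pyramid network. It returns three levels with 32, 16 and 8 channels at 1/4, 1/2 and full resolution, so the input height and width should be divisible by 4. A minimal shape check (import path assumed as above).

import torch
from renderer.feature_net import FeatureNet  # assumed import path within 3drecon/

img = torch.randn(1, 3, 64, 64)
feat2, feat1, feat0 = FeatureNet()(img)
print(feat2.shape)   # torch.Size([1, 32, 16, 16])  coarsest level
print(feat1.shape)   # torch.Size([1, 16, 32, 32])
print(feat0.shape)   # torch.Size([1, 8, 64, 64])   finest level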
 
3drecon/renderer/neus_networks.py DELETED
@@ -1,503 +0,0 @@
1
- import math
2
-
3
- import numpy as np
4
- import torch
5
- import torch.nn as nn
6
- import torch.nn.functional as F
7
- import tinycudann as tcnn
8
-
9
- # Positional encoding embedding. Code was taken from https://github.com/bmild/nerf.
10
- class Embedder:
11
- def __init__(self, **kwargs):
12
- self.kwargs = kwargs
13
- self.create_embedding_fn()
14
-
15
- def create_embedding_fn(self):
16
- embed_fns = []
17
- d = self.kwargs['input_dims']
18
- out_dim = 0
19
- if self.kwargs['include_input']:
20
- embed_fns.append(lambda x: x)
21
- out_dim += d
22
-
23
- max_freq = self.kwargs['max_freq_log2']
24
- N_freqs = self.kwargs['num_freqs']
25
-
26
- if self.kwargs['log_sampling']:
27
- freq_bands = 2. ** torch.linspace(0., max_freq, N_freqs)
28
- else:
29
- freq_bands = torch.linspace(2. ** 0., 2. ** max_freq, N_freqs)
30
-
31
- for freq in freq_bands:
32
- for p_fn in self.kwargs['periodic_fns']:
33
- embed_fns.append(lambda x, p_fn=p_fn, freq=freq: p_fn(x * freq))
34
- out_dim += d
35
-
36
- self.embed_fns = embed_fns
37
- self.out_dim = out_dim
38
-
39
- def embed(self, inputs):
40
- return torch.cat([fn(inputs) for fn in self.embed_fns], -1)
41
-
42
-
43
- def get_embedder(multires, input_dims=3):
44
- embed_kwargs = {
45
- 'include_input': True,
46
- 'input_dims': input_dims,
47
- 'max_freq_log2': multires - 1,
48
- 'num_freqs': multires,
49
- 'log_sampling': True,
50
- 'periodic_fns': [torch.sin, torch.cos],
51
- }
52
-
53
- embedder_obj = Embedder(**embed_kwargs)
54
-
55
- def embed(x, eo=embedder_obj): return eo.embed(x)
56
-
57
- return embed, embedder_obj.out_dim
58
-
59
-
60
- class SDFNetwork(nn.Module):
61
- def __init__(self, d_in, d_out, d_hidden, n_layers, skip_in=(4,), multires=0, bias=0.5,
62
- scale=1, geometric_init=True, weight_norm=True, inside_outside=False):
63
- super(SDFNetwork, self).__init__()
64
-
65
- dims = [d_in] + [d_hidden for _ in range(n_layers)] + [d_out]
66
-
67
- self.embed_fn_fine = None
68
-
69
- if multires > 0:
70
- embed_fn, input_ch = get_embedder(multires, input_dims=d_in)
71
- self.embed_fn_fine = embed_fn
72
- dims[0] = input_ch
73
-
74
- self.num_layers = len(dims)
75
- self.skip_in = skip_in
76
- self.scale = scale
77
-
78
- for l in range(0, self.num_layers - 1):
79
- if l + 1 in self.skip_in:
80
- out_dim = dims[l + 1] - dims[0]
81
- else:
82
- out_dim = dims[l + 1]
83
-
84
- lin = nn.Linear(dims[l], out_dim)
85
-
86
- if geometric_init:
87
- if l == self.num_layers - 2:
88
- if not inside_outside:
89
- torch.nn.init.normal_(lin.weight, mean=np.sqrt(np.pi) / np.sqrt(dims[l]), std=0.0001)
90
- torch.nn.init.constant_(lin.bias, -bias)
91
- else:
92
- torch.nn.init.normal_(lin.weight, mean=-np.sqrt(np.pi) / np.sqrt(dims[l]), std=0.0001)
93
- torch.nn.init.constant_(lin.bias, bias)
94
- elif multires > 0 and l == 0:
95
- torch.nn.init.constant_(lin.bias, 0.0)
96
- torch.nn.init.constant_(lin.weight[:, 3:], 0.0)
97
- torch.nn.init.normal_(lin.weight[:, :3], 0.0, np.sqrt(2) / np.sqrt(out_dim))
98
- elif multires > 0 and l in self.skip_in:
99
- torch.nn.init.constant_(lin.bias, 0.0)
100
- torch.nn.init.normal_(lin.weight, 0.0, np.sqrt(2) / np.sqrt(out_dim))
101
- torch.nn.init.constant_(lin.weight[:, -(dims[0] - 3):], 0.0)
102
- else:
103
- torch.nn.init.constant_(lin.bias, 0.0)
104
- torch.nn.init.normal_(lin.weight, 0.0, np.sqrt(2) / np.sqrt(out_dim))
105
-
106
- if weight_norm:
107
- lin = nn.utils.weight_norm(lin)
108
-
109
- setattr(self, "lin" + str(l), lin)
110
-
111
- self.activation = nn.Softplus(beta=100)
112
-
113
- def forward(self, inputs):
114
- inputs = inputs * self.scale
115
- if self.embed_fn_fine is not None:
116
- inputs = self.embed_fn_fine(inputs)
117
-
118
- x = inputs
119
- for l in range(0, self.num_layers - 1):
120
- lin = getattr(self, "lin" + str(l))
121
-
122
- if l in self.skip_in:
123
- x = torch.cat([x, inputs], -1) / np.sqrt(2)
124
-
125
- x = lin(x)
126
-
127
- if l < self.num_layers - 2:
128
- x = self.activation(x)
129
-
130
- return x
131
-
132
- def sdf(self, x):
133
- return self.forward(x)[..., :1]
134
-
135
- def sdf_hidden_appearance(self, x):
136
- return self.forward(x)
137
-
138
- def gradient(self, x):
139
- x.requires_grad_(True)
140
- with torch.enable_grad():
141
- y = self.sdf(x)
142
- d_output = torch.ones_like(y, requires_grad=False, device=y.device)
143
- gradients = torch.autograd.grad(
144
- outputs=y,
145
- inputs=x,
146
- grad_outputs=d_output,
147
- create_graph=True,
148
- retain_graph=True,
149
- only_inputs=True)[0]
150
- return gradients
151
-
152
- def sdf_normal(self, x):
153
- x.requires_grad_(True)
154
- with torch.enable_grad():
155
- y = self.sdf(x)
156
- d_output = torch.ones_like(y, requires_grad=False, device=y.device)
157
- gradients = torch.autograd.grad(
158
- outputs=y,
159
- inputs=x,
160
- grad_outputs=d_output,
161
- create_graph=True,
162
- retain_graph=True,
163
- only_inputs=True)[0]
164
- return y[..., :1].detach(), gradients.detach()
165
-
166
- class SDFNetworkWithFeature(nn.Module):
167
- def __init__(self, cube, dp_in, df_in, d_out, d_hidden, n_layers, skip_in=(4,), multires=0, bias=0.5,
168
- scale=1, geometric_init=True, weight_norm=True, inside_outside=False, cube_length=0.5):
169
- super().__init__()
170
-
171
- self.register_buffer("cube", cube)
172
- self.cube_length = cube_length
173
- dims = [dp_in+df_in] + [d_hidden for _ in range(n_layers)] + [d_out]
174
-
175
- self.embed_fn_fine = None
176
-
177
- if multires > 0:
178
- embed_fn, input_ch = get_embedder(multires, input_dims=dp_in)
179
- self.embed_fn_fine = embed_fn
180
- dims[0] = input_ch + df_in
181
-
182
- self.num_layers = len(dims)
183
- self.skip_in = skip_in
184
- self.scale = scale
185
-
186
- for l in range(0, self.num_layers - 1):
187
- if l + 1 in self.skip_in:
188
- out_dim = dims[l + 1] - dims[0]
189
- else:
190
- out_dim = dims[l + 1]
191
-
192
- lin = nn.Linear(dims[l], out_dim)
193
-
194
- if geometric_init:
195
- if l == self.num_layers - 2:
196
- if not inside_outside:
197
- torch.nn.init.normal_(lin.weight, mean=np.sqrt(np.pi) / np.sqrt(dims[l]), std=0.0001)
198
- torch.nn.init.constant_(lin.bias, -bias)
199
- else:
200
- torch.nn.init.normal_(lin.weight, mean=-np.sqrt(np.pi) / np.sqrt(dims[l]), std=0.0001)
201
- torch.nn.init.constant_(lin.bias, bias)
202
- elif multires > 0 and l == 0:
203
- torch.nn.init.constant_(lin.bias, 0.0)
204
- torch.nn.init.constant_(lin.weight[:, 3:], 0.0)
205
- torch.nn.init.normal_(lin.weight[:, :3], 0.0, np.sqrt(2) / np.sqrt(out_dim))
206
- elif multires > 0 and l in self.skip_in:
207
- torch.nn.init.constant_(lin.bias, 0.0)
208
- torch.nn.init.normal_(lin.weight, 0.0, np.sqrt(2) / np.sqrt(out_dim))
209
- torch.nn.init.constant_(lin.weight[:, -(dims[0] - 3):], 0.0)
210
- else:
211
- torch.nn.init.constant_(lin.bias, 0.0)
212
- torch.nn.init.normal_(lin.weight, 0.0, np.sqrt(2) / np.sqrt(out_dim))
213
-
214
- if weight_norm:
215
- lin = nn.utils.weight_norm(lin)
216
-
217
- setattr(self, "lin" + str(l), lin)
218
-
219
- self.activation = nn.Softplus(beta=100)
220
-
221
- def forward(self, points):
222
- points = points * self.scale
223
-
224
- # note: dividing by cube_length (0.5) scales points by 2, because the cube spans [-0.5, 0.5] while grid_sample expects [-1, 1]
225
- with torch.no_grad():
226
- feats = F.grid_sample(self.cube, points.view(1,-1,1,1,3)/self.cube_length, mode='bilinear', align_corners=True, padding_mode='zeros').detach()
227
- feats = feats.view(self.cube.shape[1], -1).permute(1,0).view(*points.shape[:-1], -1)
228
- if self.embed_fn_fine is not None:
229
- points = self.embed_fn_fine(points)
230
-
231
- x = torch.cat([points, feats], -1)
232
- for l in range(0, self.num_layers - 1):
233
- lin = getattr(self, "lin" + str(l))
234
-
235
- if l in self.skip_in:
236
- x = torch.cat([x, points, feats], -1) / np.sqrt(2)
237
-
238
- x = lin(x)
239
-
240
- if l < self.num_layers - 2:
241
- x = self.activation(x)
242
-
243
- # concat feats
244
- x = torch.cat([x, feats], -1)
245
- return x
246
-
247
- def sdf(self, x):
248
- return self.forward(x)[..., :1]
249
-
250
- def sdf_hidden_appearance(self, x):
251
- return self.forward(x)
252
-
253
- def gradient(self, x):
254
- x.requires_grad_(True)
255
- with torch.enable_grad():
256
- y = self.sdf(x)
257
- d_output = torch.ones_like(y, requires_grad=False, device=y.device)
258
- gradients = torch.autograd.grad(
259
- outputs=y,
260
- inputs=x,
261
- grad_outputs=d_output,
262
- create_graph=True,
263
- retain_graph=True,
264
- only_inputs=True)[0]
265
- return gradients
266
-
267
- def sdf_normal(self, x):
268
- x.requires_grad_(True)
269
- with torch.enable_grad():
270
- y = self.sdf(x)
271
- d_output = torch.ones_like(y, requires_grad=False, device=y.device)
272
- gradients = torch.autograd.grad(
273
- outputs=y,
274
- inputs=x,
275
- grad_outputs=d_output,
276
- create_graph=True,
277
- retain_graph=True,
278
- only_inputs=True)[0]
279
- return y[..., :1].detach(), gradients.detach()
280
-
281
-
282
- class VanillaMLP(nn.Module):
283
- def __init__(self, dim_in, dim_out, n_neurons, n_hidden_layers):
284
- super().__init__()
285
- self.n_neurons, self.n_hidden_layers = n_neurons, n_hidden_layers
286
- self.sphere_init, self.weight_norm = True, True
287
- self.sphere_init_radius = 0.5
288
- self.layers = [self.make_linear(dim_in, self.n_neurons, is_first=True, is_last=False), self.make_activation()]
289
- for i in range(self.n_hidden_layers - 1):
290
- self.layers += [self.make_linear(self.n_neurons, self.n_neurons, is_first=False, is_last=False), self.make_activation()]
291
- self.layers += [self.make_linear(self.n_neurons, dim_out, is_first=False, is_last=True)]
292
- self.layers = nn.Sequential(*self.layers)
293
-
294
- @torch.cuda.amp.autocast(False)
295
- def forward(self, x):
296
- x = self.layers(x.float())
297
- return x
298
-
299
- def make_linear(self, dim_in, dim_out, is_first, is_last):
300
- layer = nn.Linear(dim_in, dim_out, bias=True) # network without bias will degrade quality
301
- if self.sphere_init:
302
- if is_last:
303
- torch.nn.init.constant_(layer.bias, -self.sphere_init_radius)
304
- torch.nn.init.normal_(layer.weight, mean=math.sqrt(math.pi) / math.sqrt(dim_in), std=0.0001)
305
- elif is_first:
306
- torch.nn.init.constant_(layer.bias, 0.0)
307
- torch.nn.init.constant_(layer.weight[:, 3:], 0.0)
308
- torch.nn.init.normal_(layer.weight[:, :3], 0.0, math.sqrt(2) / math.sqrt(dim_out))
309
- else:
310
- torch.nn.init.constant_(layer.bias, 0.0)
311
- torch.nn.init.normal_(layer.weight, 0.0, math.sqrt(2) / math.sqrt(dim_out))
312
- else:
313
- torch.nn.init.constant_(layer.bias, 0.0)
314
- torch.nn.init.kaiming_uniform_(layer.weight, nonlinearity='relu')
315
-
316
- if self.weight_norm:
317
- layer = nn.utils.weight_norm(layer)
318
- return layer
319
-
320
- def make_activation(self):
321
- if self.sphere_init:
322
- return nn.Softplus(beta=100)
323
- else:
324
- return nn.ReLU(inplace=True)
325
-
326
-
327
- class SDFHashGridNetwork(nn.Module):
328
- def __init__(self, bound=0.5, feats_dim=13):
329
- super().__init__()
330
- self.bound = bound
331
- # max_resolution = 32
332
- # base_resolution = 16
333
- # n_levels = 4
334
- # log2_hashmap_size = 16
335
- # n_features_per_level = 8
336
- max_resolution = 2048
337
- base_resolution = 16
338
- n_levels = 16
339
- log2_hashmap_size = 19
340
- n_features_per_level = 2
341
-
342
- # max_res = base_res * t^(k-1)
343
- per_level_scale = (max_resolution / base_resolution)** (1 / (n_levels - 1))
344
-
345
- self.encoder = tcnn.Encoding(
346
- n_input_dims=3,
347
- encoding_config={
348
- "otype": "HashGrid",
349
- "n_levels": n_levels,
350
- "n_features_per_level": n_features_per_level,
351
- "log2_hashmap_size": log2_hashmap_size,
352
- "base_resolution": base_resolution,
353
- "per_level_scale": per_level_scale,
354
- },
355
- )
356
- self.sdf_mlp = VanillaMLP(n_levels*n_features_per_level+3,feats_dim,64,1)
357
-
358
- def forward(self, x):
359
- shape = x.shape[:-1]
360
- x = x.reshape(-1, 3)
361
- x_ = (x + self.bound) / (2 * self.bound)
362
- feats = self.encoder(x_)
363
- feats = torch.cat([x, feats], 1)
364
-
365
- feats = self.sdf_mlp(feats)
366
- feats = feats.reshape(*shape,-1)
367
- return feats
368
-
369
- def sdf(self, x):
370
- return self(x)[...,:1]
371
-
372
- def gradient(self, x):
373
- x.requires_grad_(True)
374
- with torch.enable_grad():
375
- y = self.sdf(x)
376
- d_output = torch.ones_like(y, requires_grad=False, device=y.device)
377
- gradients = torch.autograd.grad(
378
- outputs=y,
379
- inputs=x,
380
- grad_outputs=d_output,
381
- create_graph=True,
382
- retain_graph=True,
383
- only_inputs=True)[0]
384
- return gradients
385
-
386
- def sdf_normal(self, x):
387
- x.requires_grad_(True)
388
- with torch.enable_grad():
389
- y = self.sdf(x)
390
- d_output = torch.ones_like(y, requires_grad=False, device=y.device)
391
- gradients = torch.autograd.grad(
392
- outputs=y,
393
- inputs=x,
394
- grad_outputs=d_output,
395
- create_graph=True,
396
- retain_graph=True,
397
- only_inputs=True)[0]
398
- return y[..., :1].detach(), gradients.detach()
399
-
400
- class RenderingFFNetwork(nn.Module):
401
- def __init__(self, in_feats_dim=12):
402
- super().__init__()
403
- self.dir_encoder = tcnn.Encoding(
404
- n_input_dims=3,
405
- encoding_config={
406
- "otype": "SphericalHarmonics",
407
- "degree": 4,
408
- },
409
- )
410
- self.color_mlp = tcnn.Network(
411
- n_input_dims = in_feats_dim + 3 + self.dir_encoder.n_output_dims,
412
- n_output_dims = 3,
413
- network_config={
414
- "otype": "FullyFusedMLP",
415
- "activation": "ReLU",
416
- "output_activation": "none",
417
- "n_neurons": 64,
418
- "n_hidden_layers": 2,
419
- },
420
- )
421
-
422
- def forward(self, points, normals, view_dirs, feature_vectors):
423
- normals = F.normalize(normals, dim=-1)
424
- view_dirs = F.normalize(view_dirs, dim=-1)
425
- reflective = torch.sum(view_dirs * normals, -1, keepdim=True) * normals * 2 - view_dirs
426
-
427
- x = torch.cat([feature_vectors, normals, self.dir_encoder(reflective)], -1)
428
- colors = self.color_mlp(x).float()
429
- colors = F.sigmoid(colors)
430
- return colors
431
-
432
- # This implementation is borrowed from IDR: https://github.com/lioryariv/idr
433
- class RenderingNetwork(nn.Module):
434
- def __init__(self, d_feature, d_in, d_out, d_hidden,
435
- n_layers, weight_norm=True, multires_view=0, squeeze_out=True, use_view_dir=True):
436
- super().__init__()
437
-
438
- self.squeeze_out = squeeze_out
439
- self.rgb_act=F.sigmoid
440
- self.use_view_dir=use_view_dir
441
-
442
- dims = [d_in + d_feature] + [d_hidden for _ in range(n_layers)] + [d_out]
443
-
444
- self.embedview_fn = None
445
- if multires_view > 0:
446
- embedview_fn, input_ch = get_embedder(multires_view)
447
- self.embedview_fn = embedview_fn
448
- dims[0] += (input_ch - 3)
449
-
450
- self.num_layers = len(dims)
451
-
452
- for l in range(0, self.num_layers - 1):
453
- out_dim = dims[l + 1]
454
- lin = nn.Linear(dims[l], out_dim)
455
-
456
- if weight_norm:
457
- lin = nn.utils.weight_norm(lin)
458
-
459
- setattr(self, "lin" + str(l), lin)
460
-
461
- self.relu = nn.ReLU()
462
-
463
- def forward(self, points, normals, view_dirs, feature_vectors):
464
- if self.use_view_dir:
465
- view_dirs = F.normalize(view_dirs, dim=-1)
466
- normals = F.normalize(normals, dim=-1)
467
- reflective = torch.sum(view_dirs*normals, -1, keepdim=True) * normals * 2 - view_dirs
468
- if self.embedview_fn is not None: reflective = self.embedview_fn(reflective)
469
- rendering_input = torch.cat([points, reflective, normals, feature_vectors], dim=-1)
470
- else:
471
- rendering_input = torch.cat([points, normals, feature_vectors], dim=-1)
472
-
473
- x = rendering_input
474
-
475
- for l in range(0, self.num_layers - 1):
476
- lin = getattr(self, "lin" + str(l))
477
-
478
- x = lin(x)
479
-
480
- if l < self.num_layers - 2:
481
- x = self.relu(x)
482
-
483
- if self.squeeze_out:
484
- x = self.rgb_act(x)
485
- return x
486
-
487
-
488
- class SingleVarianceNetwork(nn.Module):
489
- def __init__(self, init_val, activation='exp'):
490
- super(SingleVarianceNetwork, self).__init__()
491
- self.act = activation
492
- self.register_parameter('variance', nn.Parameter(torch.tensor(init_val)))
493
-
494
- def forward(self, x):
495
- device = x.device
496
- if self.act=='exp':
497
- return torch.ones([*x.shape[:-1], 1], dtype=torch.float32, device=device) * torch.exp(self.variance * 10.0)
498
- else:
499
- raise NotImplementedError
500
-
501
- def warp(self, x, inv_s):
502
- device = x.device
503
- return torch.ones([*x.shape[:-1], 1], dtype=torch.float32, device=device) * inv_s
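Editor's note: two pieces of the module above are easy to sanity-check in isolation: the frequency embedder maps a 3-vector to 3 + 2*3*multires channels, and the SDF networks recover surface normals as the autograd gradient of the SDF value. A small sketch, assuming the same import-path convention as above and noting that importing neus_networks requires tinycudann to be installed (it is imported at the top of the file); the SDFNetwork hyper-parameters here are illustrative, not values taken from a config in this repo.

import torch
from renderer.neus_networks import get_embedder, SDFNetwork  # assumed import path; needs tinycudann

embed_fn, out_dim = get_embedder(multires=6, input_dims=3)
x = 0.3 * torch.randn(4, 3)
print(out_dim, embed_fn(x).shape)    # 39, torch.Size([4, 39])

sdf_net = SDFNetwork(d_in=3, d_out=257, d_hidden=256, n_layers=8, multires=6)
sdf, normal = sdf_net.sdf_normal(x)  # SDF value and its autograd gradient
print(sdf.shape, normal.shape)       # torch.Size([4, 1]) torch.Size([4, 3])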
 
 
3drecon/renderer/ngp_renderer.py DELETED
@@ -1,721 +0,0 @@
1
- import math
2
- import trimesh
3
- import numpy as np
4
-
5
- import torch
6
- import torch.nn as nn
7
- import torch.nn.functional as F
8
- from packaging import version as pver
9
-
10
- import tinycudann as tcnn
11
- from torch.autograd import Function
12
-
13
- from torch.cuda.amp import custom_bwd, custom_fwd
14
-
15
- import raymarching
16
-
17
- def custom_meshgrid(*args):
18
- # ref: https://pytorch.org/docs/stable/generated/torch.meshgrid.html?highlight=meshgrid#torch.meshgrid
19
- if pver.parse(torch.__version__) < pver.parse('1.10'):
20
- return torch.meshgrid(*args)
21
- else:
22
- return torch.meshgrid(*args, indexing='ij')
23
-
24
- def sample_pdf(bins, weights, n_samples, det=False):
25
- # This implementation is from NeRF
26
- # bins: [B, T], old_z_vals
27
- # weights: [B, T - 1], bin weights.
28
- # return: [B, n_samples], new_z_vals
29
-
30
- # Get pdf
31
- weights = weights + 1e-5 # prevent nans
32
- pdf = weights / torch.sum(weights, -1, keepdim=True)
33
- cdf = torch.cumsum(pdf, -1)
34
- cdf = torch.cat([torch.zeros_like(cdf[..., :1]), cdf], -1)
35
- # Take uniform samples
36
- if det:
37
- u = torch.linspace(0. + 0.5 / n_samples, 1. - 0.5 / n_samples, steps=n_samples).to(weights.device)
38
- u = u.expand(list(cdf.shape[:-1]) + [n_samples])
39
- else:
40
- u = torch.rand(list(cdf.shape[:-1]) + [n_samples]).to(weights.device)
41
-
42
- # Invert CDF
43
- u = u.contiguous()
44
- inds = torch.searchsorted(cdf, u, right=True)
45
- below = torch.max(torch.zeros_like(inds - 1), inds - 1)
46
- above = torch.min((cdf.shape[-1] - 1) * torch.ones_like(inds), inds)
47
- inds_g = torch.stack([below, above], -1) # (B, n_samples, 2)
48
-
49
- matched_shape = [inds_g.shape[0], inds_g.shape[1], cdf.shape[-1]]
50
- cdf_g = torch.gather(cdf.unsqueeze(1).expand(matched_shape), 2, inds_g)
51
- bins_g = torch.gather(bins.unsqueeze(1).expand(matched_shape), 2, inds_g)
52
-
53
- denom = (cdf_g[..., 1] - cdf_g[..., 0])
54
- denom = torch.where(denom < 1e-5, torch.ones_like(denom), denom)
55
- t = (u - cdf_g[..., 0]) / denom
56
- samples = bins_g[..., 0] + t * (bins_g[..., 1] - bins_g[..., 0])
57
-
58
- return samples
59
-
60
-
61
- def plot_pointcloud(pc, color=None):
62
- # pc: [N, 3]
63
- # color: [N, 3/4]
64
- print('[visualize points]', pc.shape, pc.dtype, pc.min(0), pc.max(0))
65
- pc = trimesh.PointCloud(pc, color)
66
- # axis
67
- axes = trimesh.creation.axis(axis_length=4)
68
- # sphere
69
- sphere = trimesh.creation.icosphere(radius=1)
70
- trimesh.Scene([pc, axes, sphere]).show()
71
-
72
-
73
- class NGPRenderer(nn.Module):
74
- def __init__(self,
75
- bound=1,
76
- cuda_ray=True,
77
- density_scale=1, # scale up deltas (or sigmas) to make the density grid sharper; a value larger than 1 usually improves performance.
78
- min_near=0.2,
79
- density_thresh=0.01,
80
- bg_radius=-1,
81
- ):
82
- super().__init__()
83
-
84
- self.bound = bound
85
- self.cascade = 1
86
- self.grid_size = 128
87
- self.density_scale = density_scale
88
- self.min_near = min_near
89
- self.density_thresh = density_thresh
90
- self.bg_radius = bg_radius # radius of the background sphere.
91
-
92
- # prepare aabb with a 6D tensor (xmin, ymin, zmin, xmax, ymax, zmax)
93
- # NOTE: aabb (can be rectangular) is only used to generate points, we still rely on bound (always cubic) to calculate density grid and hashing.
94
- aabb_train = torch.FloatTensor([-bound, -bound, -bound, bound, bound, bound])
95
- aabb_infer = aabb_train.clone()
96
- self.register_buffer('aabb_train', aabb_train)
97
- self.register_buffer('aabb_infer', aabb_infer)
98
-
99
- # extra state for cuda raymarching
100
- self.cuda_ray = cuda_ray
101
- if cuda_ray:
102
- # density grid
103
- density_grid = torch.zeros([self.cascade, self.grid_size ** 3]) # [CAS, H * H * H]
104
- density_bitfield = torch.zeros(self.cascade * self.grid_size ** 3 // 8, dtype=torch.uint8) # [CAS * H * H * H // 8]
105
- self.register_buffer('density_grid', density_grid)
106
- self.register_buffer('density_bitfield', density_bitfield)
107
- self.mean_density = 0
108
- self.iter_density = 0
109
- # step counter
110
- step_counter = torch.zeros(16, 2, dtype=torch.int32) # 16 is hardcoded for averaging...
111
- self.register_buffer('step_counter', step_counter)
112
- self.mean_count = 0
113
- self.local_step = 0
114
-
115
- def forward(self, x, d):
116
- raise NotImplementedError()
117
-
118
- # separated density and color query (can accelerate non-cuda-ray mode.)
119
- def density(self, x):
120
- raise NotImplementedError()
121
-
122
- def color(self, x, d, mask=None, **kwargs):
123
- raise NotImplementedError()
124
-
125
- def reset_extra_state(self):
126
- if not self.cuda_ray:
127
- return
128
- # density grid
129
- self.density_grid.zero_()
130
- self.mean_density = 0
131
- self.iter_density = 0
132
- # step counter
133
- self.step_counter.zero_()
134
- self.mean_count = 0
135
- self.local_step = 0
136
-
137
- def run(self, rays_o, rays_d, num_steps=128, upsample_steps=128, bg_color=None, perturb=False, **kwargs):
138
- # rays_o, rays_d: [B, N, 3], assumes B == 1
139
- # bg_color: [3] in range [0, 1]
140
- # return: image: [B, N, 3], depth: [B, N]
141
-
142
- prefix = rays_o.shape[:-1]
143
- rays_o = rays_o.contiguous().view(-1, 3)
144
- rays_d = rays_d.contiguous().view(-1, 3)
145
-
146
- N = rays_o.shape[0] # N = B * N, in fact
147
- device = rays_o.device
148
-
149
- # choose aabb
150
- aabb = self.aabb_train if self.training else self.aabb_infer
151
-
152
- # sample steps
153
- nears, fars = raymarching.near_far_from_aabb(rays_o, rays_d, aabb, self.min_near)
154
- nears.unsqueeze_(-1)
155
- fars.unsqueeze_(-1)
156
-
157
- #print(f'nears = {nears.min().item()} ~ {nears.max().item()}, fars = {fars.min().item()} ~ {fars.max().item()}')
158
-
159
- z_vals = torch.linspace(0.0, 1.0, num_steps, device=device).unsqueeze(0) # [1, T]
160
- z_vals = z_vals.expand((N, num_steps)) # [N, T]
161
- z_vals = nears + (fars - nears) * z_vals # [N, T], in [nears, fars]
162
-
163
- # perturb z_vals
164
- sample_dist = (fars - nears) / num_steps
165
- if perturb:
166
- z_vals = z_vals + (torch.rand(z_vals.shape, device=device) - 0.5) * sample_dist
167
- #z_vals = z_vals.clamp(nears, fars) # avoid out of bounds xyzs.
168
-
169
- # generate xyzs
170
- xyzs = rays_o.unsqueeze(-2) + rays_d.unsqueeze(-2) * z_vals.unsqueeze(-1) # [N, 1, 3] * [N, T, 1] -> [N, T, 3]
171
- xyzs = torch.min(torch.max(xyzs, aabb[:3]), aabb[3:]) # a manual clip.
172
-
173
- #plot_pointcloud(xyzs.reshape(-1, 3).detach().cpu().numpy())
174
-
175
- # query SDF and RGB
176
- density_outputs = self.density(xyzs.reshape(-1, 3))
177
-
178
- #sigmas = density_outputs['sigma'].view(N, num_steps) # [N, T]
179
- for k, v in density_outputs.items():
180
- density_outputs[k] = v.view(N, num_steps, -1)
181
-
182
- # upsample z_vals (nerf-like)
183
- if upsample_steps > 0:
184
- with torch.no_grad():
185
-
186
- deltas = z_vals[..., 1:] - z_vals[..., :-1] # [N, T-1]
187
- deltas = torch.cat([deltas, sample_dist * torch.ones_like(deltas[..., :1])], dim=-1)
188
-
189
- alphas = 1 - torch.exp(-deltas * self.density_scale * density_outputs['sigma'].squeeze(-1)) # [N, T]
190
- alphas_shifted = torch.cat([torch.ones_like(alphas[..., :1]), 1 - alphas + 1e-15], dim=-1) # [N, T+1]
191
- weights = alphas * torch.cumprod(alphas_shifted, dim=-1)[..., :-1] # [N, T]
192
-
193
- # sample new z_vals
194
- z_vals_mid = (z_vals[..., :-1] + 0.5 * deltas[..., :-1]) # [N, T-1]
195
- new_z_vals = sample_pdf(z_vals_mid, weights[:, 1:-1], upsample_steps, det=not self.training).detach() # [N, t]
196
-
197
- new_xyzs = rays_o.unsqueeze(-2) + rays_d.unsqueeze(-2) * new_z_vals.unsqueeze(-1) # [N, 1, 3] * [N, t, 1] -> [N, t, 3]
198
- new_xyzs = torch.min(torch.max(new_xyzs, aabb[:3]), aabb[3:]) # a manual clip.
199
-
200
- # only forward new points to save computation
201
- new_density_outputs = self.density(new_xyzs.reshape(-1, 3))
202
- #new_sigmas = new_density_outputs['sigma'].view(N, upsample_steps) # [N, t]
203
- for k, v in new_density_outputs.items():
204
- new_density_outputs[k] = v.view(N, upsample_steps, -1)
205
-
206
- # re-order
207
- z_vals = torch.cat([z_vals, new_z_vals], dim=1) # [N, T+t]
208
- z_vals, z_index = torch.sort(z_vals, dim=1)
209
-
210
- xyzs = torch.cat([xyzs, new_xyzs], dim=1) # [N, T+t, 3]
211
- xyzs = torch.gather(xyzs, dim=1, index=z_index.unsqueeze(-1).expand_as(xyzs))
212
-
213
- for k in density_outputs:
214
- tmp_output = torch.cat([density_outputs[k], new_density_outputs[k]], dim=1)
215
- density_outputs[k] = torch.gather(tmp_output, dim=1, index=z_index.unsqueeze(-1).expand_as(tmp_output))
216
-
217
- deltas = z_vals[..., 1:] - z_vals[..., :-1] # [N, T+t-1]
218
- deltas = torch.cat([deltas, sample_dist * torch.ones_like(deltas[..., :1])], dim=-1)
219
- alphas = 1 - torch.exp(-deltas * self.density_scale * density_outputs['sigma'].squeeze(-1)) # [N, T+t]
220
- alphas_shifted = torch.cat([torch.ones_like(alphas[..., :1]), 1 - alphas + 1e-15], dim=-1) # [N, T+t+1]
221
- weights = alphas * torch.cumprod(alphas_shifted, dim=-1)[..., :-1] # [N, T+t]
222
-
223
- dirs = rays_d.view(-1, 1, 3).expand_as(xyzs)
224
- for k, v in density_outputs.items():
225
- density_outputs[k] = v.view(-1, v.shape[-1])
226
-
227
- mask = weights > 1e-4 # hard coded
228
- rgbs = self.color(xyzs.reshape(-1, 3), dirs.reshape(-1, 3), mask=mask.reshape(-1), **density_outputs)
229
- rgbs = rgbs.view(N, -1, 3) # [N, T+t, 3]
230
-
231
- #print(xyzs.shape, 'valid_rgb:', mask.sum().item())
232
-
233
- # calculate weight_sum (mask)
234
- weights_sum = weights.sum(dim=-1) # [N]
235
-
236
- # calculate depth
237
- ori_z_vals = ((z_vals - nears) / (fars - nears)).clamp(0, 1)
238
- depth = torch.sum(weights * ori_z_vals, dim=-1)
239
-
240
- # calculate color
241
- image = torch.sum(weights.unsqueeze(-1) * rgbs, dim=-2) # [N, 3], in [0, 1]
242
-
243
- # mix background color
244
- if self.bg_radius > 0:
245
- # use the bg model to calculate bg_color
246
- sph = raymarching.sph_from_ray(rays_o, rays_d, self.bg_radius) # [N, 2] in [-1, 1]
247
- bg_color = self.background(sph, rays_d.reshape(-1, 3)) # [N, 3]
248
- elif bg_color is None:
249
- bg_color = 1
250
-
251
- image = image + (1 - weights_sum).unsqueeze(-1) * bg_color
252
-
253
- image = image.view(*prefix, 3)
254
- depth = depth.view(*prefix)
255
-
256
- # tmp: reg loss in mip-nerf 360
257
- # z_vals_shifted = torch.cat([z_vals[..., 1:], sample_dist * torch.ones_like(z_vals[..., :1])], dim=-1)
258
- # mid_zs = (z_vals + z_vals_shifted) / 2 # [N, T]
259
- # loss_dist = (torch.abs(mid_zs.unsqueeze(1) - mid_zs.unsqueeze(2)) * (weights.unsqueeze(1) * weights.unsqueeze(2))).sum() + 1/3 * ((z_vals_shifted - z_vals_shifted) * (weights ** 2)).sum()
260
-
261
- return {
262
- 'depth': depth,
263
- 'image': image,
264
- 'weights_sum': weights_sum,
265
- }
266
-
267
-
268
- def run_cuda(self, rays_o, rays_d, dt_gamma=0, bg_color=None, perturb=False, force_all_rays=False, max_steps=1024, T_thresh=1e-4, **kwargs):
269
- # rays_o, rays_d: [B, N, 3], assumes B == 1
270
- # return: image: [B, N, 3], depth: [B, N]
271
-
272
- prefix = rays_o.shape[:-1]
273
- rays_o = rays_o.contiguous().view(-1, 3)
274
- rays_d = rays_d.contiguous().view(-1, 3)
275
-
276
- N = rays_o.shape[0] # N = B * N, in fact
277
- device = rays_o.device
278
-
279
- # pre-calculate near far
280
- nears, fars = raymarching.near_far_from_aabb(rays_o, rays_d, self.aabb_train if self.training else self.aabb_infer, self.min_near)
281
-
282
- # mix background color
283
- if self.bg_radius > 0:
284
- # use the bg model to calculate bg_color
285
- sph = raymarching.sph_from_ray(rays_o, rays_d, self.bg_radius) # [N, 2] in [-1, 1]
286
- bg_color = self.background(sph, rays_d) # [N, 3]
287
- elif bg_color is None:
288
- bg_color = 1
289
-
290
- results = {}
291
-
292
- if self.training:
293
- # setup counter
294
- counter = self.step_counter[self.local_step % 16]
295
- counter.zero_() # set to 0
296
- self.local_step += 1
297
-
298
- xyzs, dirs, deltas, rays = raymarching.march_rays_train(rays_o, rays_d, self.bound, self.density_bitfield, self.cascade, self.grid_size, nears, fars, counter, self.mean_count, perturb, 128, force_all_rays, dt_gamma, max_steps)
299
-
300
- #plot_pointcloud(xyzs.reshape(-1, 3).detach().cpu().numpy())
301
-
302
- sigmas, rgbs = self(xyzs, dirs)
303
- sigmas = self.density_scale * sigmas
304
-
305
- weights_sum, depth, image = raymarching.composite_rays_train(sigmas, rgbs, deltas, rays, T_thresh)
306
- image = image + (1 - weights_sum).unsqueeze(-1) * bg_color
307
- depth = torch.clamp(depth - nears, min=0) / (fars - nears)
308
- image = image.view(*prefix, 3)
309
- depth = depth.view(*prefix)
310
-
311
- else:
312
-
313
- # allocate outputs
314
- # if use autocast, must init as half so it won't be autocasted and lose reference.
315
- #dtype = torch.half if torch.is_autocast_enabled() else torch.float32
316
- # output should always be float32! only network inference uses half.
317
- dtype = torch.float32
318
-
319
- weights_sum = torch.zeros(N, dtype=dtype, device=device)
320
- depth = torch.zeros(N, dtype=dtype, device=device)
321
- image = torch.zeros(N, 3, dtype=dtype, device=device)
322
-
323
- n_alive = N
324
- rays_alive = torch.arange(n_alive, dtype=torch.int32, device=device) # [N]
325
- rays_t = nears.clone() # [N]
326
-
327
- step = 0
328
-
329
- while step < max_steps:
330
-
331
- # count alive rays
332
- n_alive = rays_alive.shape[0]
333
-
334
- # exit loop
335
- if n_alive <= 0:
336
- break
337
-
338
- # decide compact_steps
339
- n_step = max(min(N // n_alive, 8), 1)
340
-
341
- xyzs, dirs, deltas = raymarching.march_rays(n_alive, n_step, rays_alive, rays_t, rays_o, rays_d, self.bound, self.density_bitfield, self.cascade, self.grid_size, nears, fars, 128, perturb if step == 0 else False, dt_gamma, max_steps)
342
-
343
- sigmas, rgbs = self(xyzs, dirs)
344
- # density_outputs = self.density(xyzs) # [M,], use a dict since it may include extra things, like geo_feat for rgb.
345
- # sigmas = density_outputs['sigma']
346
- # rgbs = self.color(xyzs, dirs, **density_outputs)
347
- sigmas = self.density_scale * sigmas
348
-
349
- raymarching.composite_rays(n_alive, n_step, rays_alive, rays_t, sigmas, rgbs, deltas, weights_sum, depth, image, T_thresh)
350
-
351
- rays_alive = rays_alive[rays_alive >= 0]
352
-
353
- #print(f'step = {step}, n_step = {n_step}, n_alive = {n_alive}, xyzs: {xyzs.shape}')
354
-
355
- step += n_step
356
-
357
- image = image + (1 - weights_sum).unsqueeze(-1) * bg_color
358
- depth = torch.clamp(depth - nears, min=0) / (fars - nears)
359
- image = image.view(*prefix, 3)
360
- depth = depth.view(*prefix)
361
-
362
- results['weights_sum'] = weights_sum
363
- results['depth'] = depth
364
- results['image'] = image
365
-
366
- return results
367
-
368
- @torch.no_grad()
369
- def mark_untrained_grid(self, poses, intrinsic, S=64):
370
- # poses: [B, 4, 4]
371
- # intrinsic: [3, 3]
372
-
373
- if not self.cuda_ray:
374
- return
375
-
376
- if isinstance(poses, np.ndarray):
377
- poses = torch.from_numpy(poses)
378
-
379
- B = poses.shape[0]
380
-
381
- fx, fy, cx, cy = intrinsic
382
-
383
- X = torch.arange(self.grid_size, dtype=torch.int32, device=self.density_bitfield.device).split(S)
384
- Y = torch.arange(self.grid_size, dtype=torch.int32, device=self.density_bitfield.device).split(S)
385
- Z = torch.arange(self.grid_size, dtype=torch.int32, device=self.density_bitfield.device).split(S)
386
-
387
- count = torch.zeros_like(self.density_grid)
388
- poses = poses.to(count.device)
389
-
390
- # 5-level loop, forgive me...
391
-
392
- for xs in X:
393
- for ys in Y:
394
- for zs in Z:
395
-
396
- # construct points
397
- xx, yy, zz = custom_meshgrid(xs, ys, zs)
398
- coords = torch.cat([xx.reshape(-1, 1), yy.reshape(-1, 1), zz.reshape(-1, 1)], dim=-1) # [N, 3], in [0, 128)
399
- indices = raymarching.morton3D(coords).long() # [N]
400
- world_xyzs = (2 * coords.float() / (self.grid_size - 1) - 1).unsqueeze(0) # [1, N, 3] in [-1, 1]
401
-
402
- # cascading
403
- for cas in range(self.cascade):
404
- bound = min(2 ** cas, self.bound)
405
- half_grid_size = bound / self.grid_size
406
- # scale to current cascade's resolution
407
- cas_world_xyzs = world_xyzs * (bound - half_grid_size)
408
-
409
- # split batch to avoid OOM
410
- head = 0
411
- while head < B:
412
- tail = min(head + S, B)
413
-
414
- # world2cam transform (poses is c2w, so we need to transpose it. Another transpose is needed for batched matmul, so the final form is without transpose.)
415
- cam_xyzs = cas_world_xyzs - poses[head:tail, :3, 3].unsqueeze(1)
416
- cam_xyzs = cam_xyzs @ poses[head:tail, :3, :3] # [S, N, 3]
417
-
418
- # query if point is covered by any camera
419
- mask_z = cam_xyzs[:, :, 2] > 0 # [S, N]
420
- mask_x = torch.abs(cam_xyzs[:, :, 0]) < cx / fx * cam_xyzs[:, :, 2] + half_grid_size * 2
421
- mask_y = torch.abs(cam_xyzs[:, :, 1]) < cy / fy * cam_xyzs[:, :, 2] + half_grid_size * 2
422
- mask = (mask_z & mask_x & mask_y).sum(0).reshape(-1) # [N]
423
-
424
- # update count
425
- count[cas, indices] += mask
426
- head += S
427
-
428
- # mark untrained grid as -1
429
- self.density_grid[count == 0] = -1
430
-
431
- print(f'[mark untrained grid] {(count == 0).sum()} from {self.grid_size ** 3 * self.cascade}')
432
-
433
- @torch.no_grad()
434
- def update_extra_state(self, decay=0.95, S=128):
435
- # call before each epoch to update extra states.
436
-
437
- if not self.cuda_ray:
438
- return
439
-
440
- ### update density grid
441
- tmp_grid = - torch.ones_like(self.density_grid)
442
-
443
- # full update.
444
- if self.iter_density < 16:
445
- #if True:
446
- X = torch.arange(self.grid_size, dtype=torch.int32, device=self.density_bitfield.device).split(S)
447
- Y = torch.arange(self.grid_size, dtype=torch.int32, device=self.density_bitfield.device).split(S)
448
- Z = torch.arange(self.grid_size, dtype=torch.int32, device=self.density_bitfield.device).split(S)
449
-
450
- for xs in X:
451
- for ys in Y:
452
- for zs in Z:
453
-
454
- # construct points
455
- xx, yy, zz = custom_meshgrid(xs, ys, zs)
456
- coords = torch.cat([xx.reshape(-1, 1), yy.reshape(-1, 1), zz.reshape(-1, 1)], dim=-1) # [N, 3], in [0, 128)
457
- indices = raymarching.morton3D(coords).long() # [N]
458
- xyzs = 2 * coords.float() / (self.grid_size - 1) - 1 # [N, 3] in [-1, 1]
459
-
460
- # cascading
461
- for cas in range(self.cascade):
462
- bound = min(2 ** cas, self.bound)
463
- half_grid_size = bound / self.grid_size
464
- # scale to current cascade's resolution
465
- cas_xyzs = xyzs * (bound - half_grid_size)
466
- # add noise in [-hgs, hgs]
467
- cas_xyzs += (torch.rand_like(cas_xyzs) * 2 - 1) * half_grid_size
468
- # query density
469
- sigmas = self.density(cas_xyzs)['sigma'].reshape(-1).detach()
470
- sigmas *= self.density_scale
471
- # assign
472
- tmp_grid[cas, indices] = sigmas
473
-
474
- # partial update (half the computation)
475
- # TODO: why no need of maxpool ?
476
- else:
477
- N = self.grid_size ** 3 // 4 # H * H * H / 4
478
- for cas in range(self.cascade):
479
- # random sample some positions
480
- coords = torch.randint(0, self.grid_size, (N, 3), device=self.density_bitfield.device) # [N, 3], in [0, 128)
481
- indices = raymarching.morton3D(coords).long() # [N]
482
- # random sample occupied positions
483
- occ_indices = torch.nonzero(self.density_grid[cas] > 0).squeeze(-1) # [Nz]
484
- rand_mask = torch.randint(0, occ_indices.shape[0], [N], dtype=torch.long, device=self.density_bitfield.device)
485
- occ_indices = occ_indices[rand_mask] # [Nz] --> [N], allow for duplication
486
- occ_coords = raymarching.morton3D_invert(occ_indices) # [N, 3]
487
- # concat
488
- indices = torch.cat([indices, occ_indices], dim=0)
489
- coords = torch.cat([coords, occ_coords], dim=0)
490
- # same below
491
- xyzs = 2 * coords.float() / (self.grid_size - 1) - 1 # [N, 3] in [-1, 1]
492
- bound = min(2 ** cas, self.bound)
493
- half_grid_size = bound / self.grid_size
494
- # scale to current cascade's resolution
495
- cas_xyzs = xyzs * (bound - half_grid_size)
496
- # add noise in [-hgs, hgs]
497
- cas_xyzs += (torch.rand_like(cas_xyzs) * 2 - 1) * half_grid_size
498
- # query density
499
- sigmas = self.density(cas_xyzs)['sigma'].reshape(-1).detach()
500
- sigmas *= self.density_scale
501
- # assign
502
- tmp_grid[cas, indices] = sigmas
503
-
504
- ## max-pool on tmp_grid for less aggressive culling [No significant improvement...]
505
- # invalid_mask = tmp_grid < 0
506
- # tmp_grid = F.max_pool3d(tmp_grid.view(self.cascade, 1, self.grid_size, self.grid_size, self.grid_size), kernel_size=3, stride=1, padding=1).view(self.cascade, -1)
507
- # tmp_grid[invalid_mask] = -1
508
-
509
- # ema update
510
- valid_mask = (self.density_grid >= 0) & (tmp_grid >= 0)
511
- self.density_grid[valid_mask] = torch.maximum(self.density_grid[valid_mask] * decay, tmp_grid[valid_mask])
512
- self.mean_density = torch.mean(self.density_grid.clamp(min=0)).item() # -1 regions are viewed as 0 density.
513
- #self.mean_density = torch.mean(self.density_grid[self.density_grid > 0]).item() # do not count -1 regions
514
- self.iter_density += 1
515
-
516
- # convert to bitfield
517
- density_thresh = min(self.mean_density, self.density_thresh)
518
- self.density_bitfield = raymarching.packbits(self.density_grid, density_thresh, self.density_bitfield)
519
-
520
- ### update step counter
521
- total_step = min(16, self.local_step)
522
- if total_step > 0:
523
- self.mean_count = int(self.step_counter[:total_step, 0].sum().item() / total_step)
524
- self.local_step = 0
525
-
526
- #print(f'[density grid] min={self.density_grid.min().item():.4f}, max={self.density_grid.max().item():.4f}, mean={self.mean_density:.4f}, occ_rate={(self.density_grid > 0.01).sum() / (128**3 * self.cascade):.3f} | [step counter] mean={self.mean_count}')
527
-
528
-
529
- def render(self, rays_o, rays_d, staged=False, max_ray_batch=4096, **kwargs):
530
- # rays_o, rays_d: [B, N, 3], assumes B == 1
531
- # return: pred_rgb: [B, N, 3]
532
-
533
- if self.cuda_ray:
534
- _run = self.run_cuda
535
- else:
536
- _run = self.run
537
-
538
- results = _run(rays_o, rays_d, **kwargs)
539
- return results
540
-
541
-
542
-
543
- class _trunc_exp(Function):
544
- @staticmethod
545
- @custom_fwd(cast_inputs=torch.float32) # cast to float32
546
- def forward(ctx, x):
547
- ctx.save_for_backward(x)
548
- return torch.exp(x)
549
-
550
- @staticmethod
551
- @custom_bwd
552
- def backward(ctx, g):
553
- x = ctx.saved_tensors[0]
554
- return g * torch.exp(x.clamp(-15, 15))
555
-
556
- trunc_exp = _trunc_exp.apply
557
-
558
- class NGPNetwork(NGPRenderer):
559
- def __init__(self,
560
- num_layers=2,
561
- hidden_dim=64,
562
- geo_feat_dim=15,
563
- num_layers_color=3,
564
- hidden_dim_color=64,
565
- bound=0.5,
566
- max_resolution=128,
567
- base_resolution=16,
568
- n_levels=16,
569
- **kwargs
570
- ):
571
- super().__init__(bound, **kwargs)
572
-
573
- # sigma network
574
- self.num_layers = num_layers
575
- self.hidden_dim = hidden_dim
576
- self.geo_feat_dim = geo_feat_dim
577
- self.bound = bound
578
-
579
- log2_hashmap_size = 19
580
- n_features_per_level = 2
581
-
582
-
583
- per_level_scale = np.exp2(np.log2(max_resolution / base_resolution) / (n_levels - 1))
584
-
585
- self.encoder = tcnn.Encoding(
586
- n_input_dims=3,
587
- encoding_config={
588
- "otype": "HashGrid",
589
- "n_levels": n_levels,
590
- "n_features_per_level": n_features_per_level,
591
- "log2_hashmap_size": log2_hashmap_size,
592
- "base_resolution": base_resolution,
593
- "per_level_scale": per_level_scale,
594
- },
595
- )
596
-
597
- self.sigma_net = tcnn.Network(
598
- n_input_dims = n_levels * 2,
599
- n_output_dims=1 + self.geo_feat_dim,
600
- network_config={
601
- "otype": "FullyFusedMLP",
602
- "activation": "ReLU",
603
- "output_activation": "None",
604
- "n_neurons": hidden_dim,
605
- "n_hidden_layers": num_layers - 1,
606
- },
607
- )
608
-
609
- # color network
610
- self.num_layers_color = num_layers_color
611
- self.hidden_dim_color = hidden_dim_color
612
-
613
- self.encoder_dir = tcnn.Encoding(
614
- n_input_dims=3,
615
- encoding_config={
616
- "otype": "SphericalHarmonics",
617
- "degree": 4,
618
- },
619
- )
620
-
621
- self.in_dim_color = self.encoder_dir.n_output_dims + self.geo_feat_dim
622
-
623
- self.color_net = tcnn.Network(
624
- n_input_dims = self.in_dim_color,
625
- n_output_dims=3,
626
- network_config={
627
- "otype": "FullyFusedMLP",
628
- "activation": "ReLU",
629
- "output_activation": "None",
630
- "n_neurons": hidden_dim_color,
631
- "n_hidden_layers": num_layers_color - 1,
632
- },
633
- )
634
- self.density_scale, self.density_std = 10.0, 0.25
635
-
636
- def forward(self, x, d):
637
- # x: [N, 3], in [-bound, bound]
638
- # d: [N, 3], normalized in [-1, 1]
639
-
640
-
641
- # sigma
642
- x_raw = x
643
- x = (x + self.bound) / (2 * self.bound) # to [0, 1]
644
- x = self.encoder(x)
645
- h = self.sigma_net(x)
646
-
647
- # sigma = F.relu(h[..., 0])
648
- density = h[..., 0]
649
- # add density bias
650
- dist = torch.norm(x_raw, dim=-1)
651
- density_bias = (1 - dist / self.density_std) * self.density_scale
652
- density = density_bias + density
653
- sigma = F.softplus(density)
654
- geo_feat = h[..., 1:]
655
-
656
- # color
657
- d = (d + 1) / 2 # tcnn SH encoding requires inputs to be in [0, 1]
658
- d = self.encoder_dir(d)
659
-
660
- # p = torch.zeros_like(geo_feat[..., :1]) # manual input padding
661
- h = torch.cat([d, geo_feat], dim=-1)
662
- h = self.color_net(h)
663
-
664
- # sigmoid activation for rgb
665
- color = torch.sigmoid(h)
666
-
667
- return sigma, color
668
-
669
- def density(self, x):
670
- # x: [N, 3], in [-bound, bound]
671
- x_raw = x
672
- x = (x + self.bound) / (2 * self.bound) # to [0, 1]
673
- x = self.encoder(x)
674
- h = self.sigma_net(x)
675
-
676
- # sigma = F.relu(h[..., 0])
677
- density = h[..., 0]
678
- # add density bias
679
- dist = torch.norm(x_raw, dim=-1)
680
- density_bias = (1 - dist / self.density_std) * self.density_scale
681
- density = density_bias + density
682
- sigma = F.softplus(density)
683
- geo_feat = h[..., 1:]
684
-
685
- return {
686
- 'sigma': sigma,
687
- 'geo_feat': geo_feat,
688
- }
689
-
690
- # allow masked inference
691
- def color(self, x, d, mask=None, geo_feat=None, **kwargs):
692
- # x: [N, 3] in [-bound, bound]
693
- # mask: [N,], bool, indicates where we actually need to compute rgb.
694
-
695
- x = (x + self.bound) / (2 * self.bound) # to [0, 1]
696
-
697
- if mask is not None:
698
- rgbs = torch.zeros(mask.shape[0], 3, dtype=x.dtype, device=x.device) # [N, 3]
699
- # in case of empty mask
700
- if not mask.any():
701
- return rgbs
702
- x = x[mask]
703
- d = d[mask]
704
- geo_feat = geo_feat[mask]
705
-
706
- # color
707
- d = (d + 1) / 2 # tcnn SH encoding requires inputs to be in [0, 1]
708
- d = self.encoder_dir(d)
709
-
710
- h = torch.cat([d, geo_feat], dim=-1)
711
- h = self.color_net(h)
712
-
713
- # sigmoid activation for rgb
714
- h = torch.sigmoid(h)
715
-
716
- if mask is not None:
717
- rgbs[mask] = h.to(rgbs.dtype) # fp16 --> fp32
718
- else:
719
- rgbs = h
720
-
721
- return rgbs
 
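
A quick sanity check of the _trunc_exp activation defined above (a sketch; the import path is an assumption based on this file's location): the forward pass is a plain exponential, while the backward pass clamps the saved input to [-15, 15] so the gradient stays finite under mixed-precision training.

import torch
from renderer.ngp_renderer import trunc_exp  # assumed import path for this sketch

x = torch.tensor([1.0, 20.0], requires_grad=True)
y = trunc_exp(x)          # forward: exp(x); exp(20) ~ 4.85e8 but still finite
y.sum().backward()
print(x.grad)             # second entry is exp(15) ~ 3.27e6, i.e. the clamped gradient
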
3drecon/renderer/renderer.py DELETED
@@ -1,640 +0,0 @@
1
- import abc
2
- import os
3
- from pathlib import Path
4
-
5
- import cv2
6
- import numpy as np
7
- import pytorch_lightning as pl
8
- import torch
9
- import torch.nn as nn
10
- import torch.nn.functional as F
11
- from omegaconf import OmegaConf
12
-
13
- from skimage.io import imread, imsave
14
- from PIL import Image
15
- from torch.optim.lr_scheduler import LambdaLR
16
-
17
- from renderer.neus_networks import SDFNetwork, RenderingNetwork, SingleVarianceNetwork, SDFHashGridNetwork, RenderingFFNetwork
18
- from renderer.ngp_renderer import NGPNetwork
19
- from util import instantiate_from_config, read_pickle, concat_images_list
20
-
21
- DEFAULT_RADIUS = np.sqrt(3)/2
22
- DEFAULT_SIDE_LENGTH = 0.6
23
-
24
- def sample_pdf(bins, weights, n_samples, det=True):
25
- device = bins.device
26
- dtype = bins.dtype
27
- # This implementation is from NeRF
28
- # Get pdf
29
- weights = weights + 1e-5 # prevent nans
30
- pdf = weights / torch.sum(weights, -1, keepdim=True)
31
- cdf = torch.cumsum(pdf, -1)
32
- cdf = torch.cat([torch.zeros_like(cdf[..., :1]), cdf], -1)
33
- # Take uniform samples
34
- if det:
35
- u = torch.linspace(0. + 0.5 / n_samples, 1. - 0.5 / n_samples, steps=n_samples, dtype=dtype, device=device)
36
- u = u.expand(list(cdf.shape[:-1]) + [n_samples])
37
- else:
38
- u = torch.rand(list(cdf.shape[:-1]) + [n_samples], dtype=dtype, device=device)
39
-
40
- # Invert CDF
41
- u = u.contiguous()
42
- inds = torch.searchsorted(cdf, u, right=True)
43
- below = torch.max(torch.zeros_like(inds - 1), inds - 1)
44
- above = torch.min((cdf.shape[-1] - 1) * torch.ones_like(inds), inds)
45
- inds_g = torch.stack([below, above], -1) # (batch, N_samples, 2)
46
-
47
- matched_shape = [inds_g.shape[0], inds_g.shape[1], cdf.shape[-1]]
48
- cdf_g = torch.gather(cdf.unsqueeze(1).expand(matched_shape), 2, inds_g)
49
- bins_g = torch.gather(bins.unsqueeze(1).expand(matched_shape), 2, inds_g)
50
-
51
- denom = (cdf_g[..., 1] - cdf_g[..., 0])
52
- denom = torch.where(denom < 1e-5, torch.ones_like(denom), denom)
53
- t = (u - cdf_g[..., 0]) / denom
54
- samples = bins_g[..., 0] + t * (bins_g[..., 1] - bins_g[..., 0])
55
-
56
- return samples
57
-
58
- def near_far_from_sphere(rays_o, rays_d, radius=DEFAULT_RADIUS):
59
- a = torch.sum(rays_d ** 2, dim=-1, keepdim=True)
60
- b = torch.sum(rays_o * rays_d, dim=-1, keepdim=True)
61
- mid = -b / a
62
- near = mid - radius
63
- far = mid + radius
64
- return near, far
65
-
66
- class BackgroundRemoval:
67
- def __init__(self, device='cuda'):
68
- from carvekit.api.high import HiInterface
69
- self.interface = HiInterface(
70
- object_type="object", # Can be "object" or "hairs-like".
71
- batch_size_seg=5,
72
- batch_size_matting=1,
73
- device=device,
74
- seg_mask_size=640, # Use 640 for Tracer B7 and 320 for U2Net
75
- matting_mask_size=2048,
76
- trimap_prob_threshold=231,
77
- trimap_dilation=30,
78
- trimap_erosion_iters=5,
79
- fp16=True,
80
- )
81
-
82
- @torch.no_grad()
83
- def __call__(self, image):
84
- # image: [H, W, 3] array in [0, 255].
85
- image = Image.fromarray(image)
86
- image = self.interface([image])[0]
87
- image = np.array(image)
88
- return image
89
-
90
-
91
- class BaseRenderer(nn.Module):
92
- def __init__(self, train_batch_num, test_batch_num):
93
- super().__init__()
94
- self.train_batch_num = train_batch_num
95
- self.test_batch_num = test_batch_num
96
-
97
- @abc.abstractmethod
98
- def render_impl(self, ray_batch, is_train, step):
99
- pass
100
-
101
- @abc.abstractmethod
102
- def render_with_loss(self, ray_batch, is_train, step):
103
- pass
104
-
105
- def render(self, ray_batch, is_train, step):
106
- batch_num = self.train_batch_num if is_train else self.test_batch_num
107
- ray_num = ray_batch['rays_o'].shape[0]
108
- outputs = {}
109
- for ri in range(0, ray_num, batch_num):
110
- cur_ray_batch = {}
111
- for k, v in ray_batch.items():
112
- cur_ray_batch[k] = v[ri:ri + batch_num]
113
- cur_outputs = self.render_impl(cur_ray_batch, is_train, step)
114
- for k, v in cur_outputs.items():
115
- if k not in outputs: outputs[k] = []
116
- outputs[k].append(v)
117
-
118
- for k, v in outputs.items():
119
- outputs[k] = torch.cat(v, 0)
120
- return outputs
121
-
122
-
123
- class NeuSRenderer(BaseRenderer):
124
- def __init__(self, train_batch_num, test_batch_num, lambda_eikonal_loss=0.1, use_mask=True,
125
- lambda_rgb_loss=1.0, lambda_mask_loss=0.0, rgb_loss='soft_l1', coarse_sn=64, fine_sn=64):
126
- super().__init__(train_batch_num, test_batch_num)
127
- self.n_samples = coarse_sn
128
- self.n_importance = fine_sn
129
- self.up_sample_steps = 4
130
- self.anneal_end = 200
131
- self.use_mask = use_mask
132
- self.lambda_eikonal_loss = lambda_eikonal_loss
133
- self.lambda_rgb_loss = lambda_rgb_loss
134
- self.lambda_mask_loss = lambda_mask_loss
135
- self.rgb_loss = rgb_loss
136
-
137
- self.sdf_network = SDFNetwork(d_out=257, d_in=3, d_hidden=256, n_layers=8, skip_in=[4], multires=6, bias=0.5, scale=1.0, geometric_init=True, weight_norm=True)
138
- self.color_network = RenderingNetwork(d_feature=256, d_in=9, d_out=3, d_hidden=256, n_layers=4, weight_norm=True, multires_view=4, squeeze_out=True)
139
- self.default_dtype = torch.float32
140
- self.deviation_network = SingleVarianceNetwork(0.3)
141
-
142
- @torch.no_grad()
143
- def get_vertex_colors(self, vertices):
144
- """
145
- @param vertices: n,3
146
- @return:
147
- """
148
- V = vertices.shape[0]
149
- bn = 20480
150
- verts_colors = []
151
- with torch.no_grad():
152
- for vi in range(0, V, bn):
153
- verts = torch.from_numpy(vertices[vi:vi+bn].astype(np.float32)).cuda()
154
- feats = self.sdf_network(verts)[..., 1:]
155
- gradients = self.sdf_network.gradient(verts) # ...,3
156
- gradients = F.normalize(gradients, dim=-1)
157
- colors = self.color_network(verts, gradients, gradients, feats)
158
- colors = torch.clamp(colors,min=0,max=1).cpu().numpy()
159
- verts_colors.append(colors)
160
-
161
- verts_colors = (np.concatenate(verts_colors, 0)*255).astype(np.uint8)
162
- return verts_colors
163
-
164
- def upsample(self, rays_o, rays_d, z_vals, sdf, n_importance, inv_s):
165
- """
166
- Up sampling given a fixed inv_s
167
- """
168
- device = rays_o.device
169
- batch_size, n_samples = z_vals.shape
170
- pts = rays_o[:, None, :] + rays_d[:, None, :] * z_vals[..., :, None] # n_rays, n_samples, 3
171
- inner_mask = self.get_inner_mask(pts)
172
- # radius = torch.linalg.norm(pts, ord=2, dim=-1, keepdim=False)
173
- inside_sphere = inner_mask[:, :-1] | inner_mask[:, 1:]
174
- sdf = sdf.reshape(batch_size, n_samples)
175
- prev_sdf, next_sdf = sdf[:, :-1], sdf[:, 1:]
176
- prev_z_vals, next_z_vals = z_vals[:, :-1], z_vals[:, 1:]
177
- mid_sdf = (prev_sdf + next_sdf) * 0.5
178
- cos_val = (next_sdf - prev_sdf) / (next_z_vals - prev_z_vals + 1e-5)
179
-
180
- prev_cos_val = torch.cat([torch.zeros([batch_size, 1], dtype=self.default_dtype, device=device), cos_val[:, :-1]], dim=-1)
181
- cos_val = torch.stack([prev_cos_val, cos_val], dim=-1)
182
- cos_val, _ = torch.min(cos_val, dim=-1, keepdim=False)
183
- cos_val = cos_val.clip(-1e3, 0.0) * inside_sphere
184
-
185
- dist = (next_z_vals - prev_z_vals)
186
- prev_esti_sdf = mid_sdf - cos_val * dist * 0.5
187
- next_esti_sdf = mid_sdf + cos_val * dist * 0.5
188
- prev_cdf = torch.sigmoid(prev_esti_sdf * inv_s)
189
- next_cdf = torch.sigmoid(next_esti_sdf * inv_s)
190
- alpha = (prev_cdf - next_cdf + 1e-5) / (prev_cdf + 1e-5)
191
- weights = alpha * torch.cumprod(
192
- torch.cat([torch.ones([batch_size, 1], dtype=self.default_dtype, device=device), 1. - alpha + 1e-7], -1), -1)[:, :-1]
193
-
194
- z_samples = sample_pdf(z_vals, weights, n_importance, det=True).detach()
195
- return z_samples
196
-
197
- def cat_z_vals(self, rays_o, rays_d, z_vals, new_z_vals, sdf, last=False):
198
- batch_size, n_samples = z_vals.shape
199
- _, n_importance = new_z_vals.shape
200
- pts = rays_o[:, None, :] + rays_d[:, None, :] * new_z_vals[..., :, None]
201
- z_vals = torch.cat([z_vals, new_z_vals], dim=-1)
202
- z_vals, index = torch.sort(z_vals, dim=-1)
203
-
204
- if not last:
205
- device = pts.device
206
- new_sdf = self.sdf_network.sdf(pts.reshape(-1, 3)).reshape(batch_size, n_importance)
207
- sdf = torch.cat([sdf, new_sdf], dim=-1)
208
- xx = torch.arange(batch_size)[:, None].expand(batch_size, n_samples + n_importance).reshape(-1).to(device)
209
- index = index.reshape(-1)
210
- sdf = sdf[(xx, index)].reshape(batch_size, n_samples + n_importance)
211
-
212
- return z_vals, sdf
213
-
214
- def sample_depth(self, rays_o, rays_d, near, far, perturb):
215
- n_samples = self.n_samples
216
- n_importance = self.n_importance
217
- up_sample_steps = self.up_sample_steps
218
- device = rays_o.device
219
-
220
- # sample points
221
- batch_size = len(rays_o)
222
- z_vals = torch.linspace(0.0, 1.0, n_samples, dtype=self.default_dtype, device=device) # sn
223
- z_vals = near + (far - near) * z_vals[None, :] # rn,sn
224
-
225
- if perturb > 0:
226
- t_rand = (torch.rand([batch_size, 1]).to(device) - 0.5)
227
- z_vals = z_vals + t_rand * 2.0 / n_samples
228
-
229
- # Up sample
230
- with torch.no_grad():
231
- pts = rays_o[:, None, :] + rays_d[:, None, :] * z_vals[..., :, None]
232
- sdf = self.sdf_network.sdf(pts).reshape(batch_size, n_samples)
233
-
234
- for i in range(up_sample_steps):
235
- rn, sn = z_vals.shape
236
- inv_s = torch.ones(rn, sn - 1, dtype=self.default_dtype, device=device) * 64 * 2 ** i
237
- new_z_vals = self.upsample(rays_o, rays_d, z_vals, sdf, n_importance // up_sample_steps, inv_s)
238
- z_vals, sdf = self.cat_z_vals(rays_o, rays_d, z_vals, new_z_vals, sdf, last=(i + 1 == up_sample_steps))
239
-
240
- return z_vals
241
-
242
- def compute_sdf_alpha(self, points, dists, dirs, cos_anneal_ratio, step):
243
- # points [...,3] dists [...] dirs[...,3]
244
- sdf_nn_output = self.sdf_network(points)
245
- sdf = sdf_nn_output[..., 0]
246
- feature_vector = sdf_nn_output[..., 1:]
247
-
248
- gradients = self.sdf_network.gradient(points) # ...,3
249
- inv_s = self.deviation_network(points).clip(1e-6, 1e6) # ...,1
250
- inv_s = inv_s[..., 0]
251
-
252
- true_cos = (dirs * gradients).sum(-1) # [...]
253
- iter_cos = -(F.relu(-true_cos * 0.5 + 0.5) * (1.0 - cos_anneal_ratio) +
254
- F.relu(-true_cos) * cos_anneal_ratio) # always non-positive
255
-
256
- # Estimate signed distances at section points
257
- estimated_next_sdf = sdf + iter_cos * dists * 0.5
258
- estimated_prev_sdf = sdf - iter_cos * dists * 0.5
259
-
260
- prev_cdf = torch.sigmoid(estimated_prev_sdf * inv_s)
261
- next_cdf = torch.sigmoid(estimated_next_sdf * inv_s)
262
-
263
- p = prev_cdf - next_cdf
264
- c = prev_cdf
265
-
266
- alpha = ((p + 1e-5) / (c + 1e-5)).clip(0.0, 1.0) # [...]
267
- return alpha, gradients, feature_vector, inv_s, sdf
268
-
269
- def get_anneal_val(self, step):
270
- if self.anneal_end < 0:
271
- return 1.0
272
- else:
273
- return np.min([1.0, step / self.anneal_end])
274
-
275
- def get_inner_mask(self, points):
276
- return torch.sum(torch.abs(points)<=DEFAULT_SIDE_LENGTH,-1)==3
277
-
278
- def render_impl(self, ray_batch, is_train, step):
279
- near, far = near_far_from_sphere(ray_batch['rays_o'], ray_batch['rays_d'])
280
- rays_o, rays_d = ray_batch['rays_o'], ray_batch['rays_d']
281
- z_vals = self.sample_depth(rays_o, rays_d, near, far, is_train)
282
-
283
- batch_size, n_samples = z_vals.shape
284
-
285
- # section length in original space
286
- dists = z_vals[..., 1:] - z_vals[..., :-1] # rn,sn-1
287
- dists = torch.cat([dists, dists[..., -1:]], -1) # rn,sn
288
- mid_z_vals = z_vals + dists * 0.5
289
-
290
- points = rays_o.unsqueeze(-2) + rays_d.unsqueeze(-2) * mid_z_vals.unsqueeze(-1) # rn, sn, 3
291
- inner_mask = self.get_inner_mask(points)
292
-
293
- dirs = rays_d.unsqueeze(-2).expand(batch_size, n_samples, 3)
294
- dirs = F.normalize(dirs, dim=-1)
295
- device = rays_o.device
296
- alpha, sampled_color, gradient_error, normal = torch.zeros(batch_size, n_samples, dtype=self.default_dtype, device=device), \
297
- torch.zeros(batch_size, n_samples, 3, dtype=self.default_dtype, device=device), \
298
- torch.zeros([batch_size, n_samples], dtype=self.default_dtype, device=device), \
299
- torch.zeros([batch_size, n_samples, 3], dtype=self.default_dtype, device=device)
300
- if torch.sum(inner_mask) > 0:
301
- cos_anneal_ratio = self.get_anneal_val(step) if is_train else 1.0
302
- alpha[inner_mask], gradients, feature_vector, inv_s, sdf = self.compute_sdf_alpha(points[inner_mask], dists[inner_mask], dirs[inner_mask], cos_anneal_ratio, step)
303
- sampled_color[inner_mask] = self.color_network(points[inner_mask], gradients, -dirs[inner_mask], feature_vector)
304
- # Eikonal loss
305
- gradient_error[inner_mask] = (torch.linalg.norm(gradients, ord=2, dim=-1) - 1.0) ** 2 # rn,sn
306
- normal[inner_mask] = F.normalize(gradients, dim=-1)
307
-
308
- weights = alpha * torch.cumprod(torch.cat([torch.ones([batch_size, 1], dtype=self.default_dtype, device=device), 1. - alpha + 1e-7], -1), -1)[..., :-1] # rn,sn
309
- mask = torch.sum(weights,dim=1).unsqueeze(-1) # rn,1
310
- color = (sampled_color * weights[..., None]).sum(dim=1) + (1 - mask) # add white background
311
- normal = (normal * weights[..., None]).sum(dim=1)
312
-
313
- outputs = {
314
- 'rgb': color, # rn,3
315
- 'gradient_error': gradient_error, # rn,sn
316
- 'inner_mask': inner_mask, # rn,sn
317
- 'normal': normal, # rn,3
318
- 'mask': mask, # rn,1
319
- }
320
- return outputs
321
-
322
- def render_with_loss(self, ray_batch, is_train, step):
323
- render_outputs = self.render(ray_batch, is_train, step)
324
-
325
- rgb_gt = ray_batch['rgb']
326
- rgb_pr = render_outputs['rgb']
327
- if self.rgb_loss == 'soft_l1':
328
- epsilon = 0.001
329
- rgb_loss = torch.sqrt(torch.sum((rgb_gt - rgb_pr) ** 2, dim=-1) + epsilon)
330
- elif self.rgb_loss =='mse':
331
- rgb_loss = F.mse_loss(rgb_pr, rgb_gt, reduction='none')
332
- else:
333
- raise NotImplementedError
334
- rgb_loss = torch.mean(rgb_loss)
335
-
336
- eikonal_loss = torch.sum(render_outputs['gradient_error'] * render_outputs['inner_mask']) / torch.sum(render_outputs['inner_mask'] + 1e-5)
337
- loss = rgb_loss * self.lambda_rgb_loss + eikonal_loss * self.lambda_eikonal_loss
338
- loss_batch = {
339
- 'eikonal': eikonal_loss,
340
- 'rendering': rgb_loss,
341
- # 'mask': mask_loss,
342
- }
343
- if self.lambda_mask_loss>0 and self.use_mask:
344
- mask_loss = F.mse_loss(render_outputs['mask'], ray_batch['mask'], reduction='none').mean()
345
- loss += mask_loss * self.lambda_mask_loss
346
- loss_batch['mask'] = mask_loss
347
- return loss, loss_batch
348
-
349
-
350
- class NeRFRenderer(BaseRenderer):
351
- def __init__(self, train_batch_num, test_batch_num, bound=0.5, use_mask=False, lambda_rgb_loss=1.0, lambda_mask_loss=0.0):
352
- super().__init__(train_batch_num, test_batch_num)
353
- self.train_batch_num = train_batch_num
354
- self.test_batch_num = test_batch_num
355
- self.use_mask = use_mask
356
- self.field = NGPNetwork(bound=bound)
357
-
358
- self.update_interval = 16
359
- self.fp16 = True
360
- self.lambda_rgb_loss = lambda_rgb_loss
361
- self.lambda_mask_loss = lambda_mask_loss
362
-
363
- def render_impl(self, ray_batch, is_train, step):
364
- rays_o, rays_d = ray_batch['rays_o'], ray_batch['rays_d']
365
- with torch.cuda.amp.autocast(enabled=self.fp16):
366
- if step % self.update_interval==0:
367
- self.field.update_extra_state()
368
-
369
- outputs = self.field.render(rays_o, rays_d,)
370
-
371
- renderings={
372
- 'rgb': outputs['image'],
373
- 'depth': outputs['depth'],
374
- 'mask': outputs['weights_sum'].unsqueeze(-1),
375
- }
376
- return renderings
377
-
378
- def render_with_loss(self, ray_batch, is_train, step):
379
- render_outputs = self.render(ray_batch, is_train, step)
380
-
381
- rgb_gt = ray_batch['rgb']
382
- rgb_pr = render_outputs['rgb']
383
- epsilon = 0.001
384
- rgb_loss = torch.sqrt(torch.sum((rgb_gt - rgb_pr) ** 2, dim=-1) + epsilon)
385
- rgb_loss = torch.mean(rgb_loss)
386
- loss = rgb_loss * self.lambda_rgb_loss
387
- loss_batch = {'rendering': rgb_loss}
388
-
389
- if self.use_mask:
390
- mask_loss = F.mse_loss(render_outputs['mask'], ray_batch['mask'], reduction='none')
391
- mask_loss = torch.mean(mask_loss)
392
- loss = loss + mask_loss * self.lambda_mask_loss
393
- loss_batch['mask'] = mask_loss
394
- return loss, loss_batch
395
-
396
- def cartesian_to_spherical(xyz):
397
- ptsnew = np.hstack((xyz, np.zeros(xyz.shape)))
398
- xy = xyz[:, 0] ** 2 + xyz[:, 1] ** 2
399
- z = np.sqrt(xy + xyz[:, 2] ** 2)
400
- theta = np.arctan2(np.sqrt(xy), xyz[:, 2]) # for elevation angle defined from Z-axis down
401
- # ptsnew[:,4] = np.arctan2(xyz[:,2], np.sqrt(xy)) # for elevation angle defined from XY-plane up
402
- azimuth = np.arctan2(xyz[:, 1], xyz[:, 0])
403
- return np.array([theta, azimuth, z])
404
-
405
- def get_pose(target_RT):
406
- R, T = target_RT[:3, :3], target_RT[:, -1]
407
- T_target = -R.T @ T
408
- theta_target, azimuth_target, z_target = cartesian_to_spherical(T_target[None, :])
409
- return theta_target, azimuth_target, z_target
410
-
411
-
412
- class RendererTrainer(pl.LightningModule):
413
- def __init__(self, image_path, data_path, total_steps, warm_up_steps, log_dir, train_batch_fg_num=0,
414
- use_cube_feats=False, cube_ckpt=None, cube_cfg=None, cube_bound=0.5,
415
- train_batch_num=4096, test_batch_num=8192, use_warm_up=True, use_mask=True,
416
- lambda_rgb_loss=1.0, lambda_mask_loss=0.0, renderer='neus',
417
- # used in neus
418
- lambda_eikonal_loss=0.1,
419
- coarse_sn=64, fine_sn=64):
420
- super().__init__()
421
- self.num_images = 36 # todo ours 36, syncdreamer 16
422
- self.image_size = 256
423
- self.log_dir = log_dir
424
- (Path(log_dir)/'images').mkdir(exist_ok=True, parents=True)
425
- self.train_batch_num = train_batch_num
426
- self.train_batch_fg_num = train_batch_fg_num
427
- self.test_batch_num = test_batch_num
428
- self.image_path = image_path
429
- self.data_path = data_path
430
- self.total_steps = total_steps
431
- self.warm_up_steps = warm_up_steps
432
- self.use_mask = use_mask
433
- self.lambda_eikonal_loss = lambda_eikonal_loss
434
- self.lambda_rgb_loss = lambda_rgb_loss
435
- self.lambda_mask_loss = lambda_mask_loss
436
- self.use_warm_up = use_warm_up
437
-
438
- self.use_cube_feats, self.cube_cfg, self.cube_ckpt = use_cube_feats, cube_cfg, cube_ckpt
439
-
440
- self._init_dataset()
441
- if renderer=='neus':
442
- self.renderer = NeuSRenderer(train_batch_num, test_batch_num,
443
- lambda_rgb_loss=lambda_rgb_loss,
444
- lambda_eikonal_loss=lambda_eikonal_loss,
445
- lambda_mask_loss=lambda_mask_loss,
446
- coarse_sn=coarse_sn, fine_sn=fine_sn)
447
- elif renderer=='ngp':
448
- self.renderer = NeRFRenderer(train_batch_num, test_batch_num, bound=cube_bound, use_mask=use_mask, lambda_mask_loss=lambda_mask_loss, lambda_rgb_loss=lambda_rgb_loss,)
449
- else:
450
- raise NotImplementedError
451
- self.validation_index = 0
452
-
453
- def _construct_ray_batch(self, images_info):
454
- image_num = images_info['images'].shape[0]
455
- _, h, w, _ = images_info['images'].shape
456
- coords = torch.stack(torch.meshgrid(torch.arange(h), torch.arange(w)), -1)[:, :, (1, 0)] # h,w,2
457
- coords = coords.float()[None, :, :, :].repeat(image_num, 1, 1, 1) # imn,h,w,2
458
- coords = coords.reshape(image_num, h * w, 2)
459
- coords = torch.cat([coords, torch.ones(image_num, h * w, 1, dtype=torch.float32)], 2) # imn,h*w,3
460
-
461
- # imn,h*w,3 @ imn,3,3 => imn,h*w,3
462
- rays_d = coords @ torch.inverse(images_info['Ks']).permute(0, 2, 1)
463
- poses = images_info['poses'] # imn,3,4
464
- R, t = poses[:, :, :3], poses[:, :, 3:]
465
- rays_d = rays_d @ R
466
- rays_d = F.normalize(rays_d, dim=-1)
467
- rays_o = -R.permute(0,2,1) @ t # imn,3,3 @ imn,3,1
468
- rays_o = rays_o.permute(0, 2, 1).repeat(1, h*w, 1) # imn,h*w,3
469
-
470
- ray_batch = {
471
- 'rgb': images_info['images'].reshape(image_num*h*w,3),
472
- 'mask': images_info['masks'].reshape(image_num*h*w,1),
473
- 'rays_o': rays_o.reshape(image_num*h*w,3).float(),
474
- 'rays_d': rays_d.reshape(image_num*h*w,3).float(),
475
- }
476
- return ray_batch
477
-
478
- @staticmethod
479
- def load_model(cfg, ckpt):
480
- config = OmegaConf.load(cfg)
481
- model = instantiate_from_config(config.model)
482
- print(f'loading model from {ckpt} ...')
483
- ckpt = torch.load(ckpt)
484
- model.load_state_dict(ckpt['state_dict'])
485
- model = model.cuda().eval()
486
- return model
487
-
488
- def _init_dataset(self):
489
- mask_predictor = BackgroundRemoval()
490
- # syncdreamer fixed 16 views
491
- # self.K, self.azs, self.els, self.dists, self.poses = read_pickle(f'meta_info/camera-{self.num_images}.pkl')
492
- # for ours+NeuS, we pre-define 36 fixed views
493
- self.K = np.array([[280.,0.,128.],[0.,280.,128.],[0.,0.,1.]], dtype=np.float32)
494
- data_dir = os.path.join(self.data_path, "mario/render_sync_36_single/model/") # fixed 36 views
495
- # get all files .npy
496
- self.azs = []
497
- self.els = []
498
- self.dists = []
499
- self.poses = []
500
- for index in range(self.num_images):
501
- pose = np.load(os.path.join(data_dir, "%03d.npy"%index))[:3, :] # in blender
502
- self.poses.append(pose)
503
- theta, azimuth, radius = get_pose(pose)
504
- self.azs.append(azimuth)
505
- self.els.append(theta)
506
- self.dists.append(radius)
507
- # stack to numpy along axis 0
508
- self.azs = np.stack(self.azs, axis=0) # [num_images,]
509
- self.els = np.stack(self.els, axis=0) # [num_images,]
510
- self.dists = np.stack(self.dists, axis=0) # [num_images,]
511
- self.poses = np.stack(self.poses, axis=0) # [num_images, 3, 4]
512
-
513
- self.images_info = {'images': [] ,'masks': [], 'Ks': [], 'poses':[]}
514
-
515
- img = imread(self.image_path)
516
-
517
- for index in range(self.num_images):
518
- rgb = np.copy(img[:,index*self.image_size:(index+1)*self.image_size,:])
519
- # predict mask
520
- if self.use_mask:
521
- imsave(f'{self.log_dir}/input-{index}.png', rgb)
522
- masked_image = mask_predictor(rgb)
523
- imsave(f'{self.log_dir}/masked-{index}.png', masked_image)
524
- mask = masked_image[:,:,3].astype(np.float32)/255
525
- else:
526
- h, w, _ = rgb.shape
527
- mask = np.zeros([h,w], np.float32)
528
-
529
- rgb = rgb.astype(np.float32)/255
530
- K, pose = np.copy(self.K), self.poses[index]
531
- self.images_info['images'].append(torch.from_numpy(rgb.astype(np.float32))) # h,w,3
532
- self.images_info['masks'].append(torch.from_numpy(mask.astype(np.float32))) # h,w
533
- self.images_info['Ks'].append(torch.from_numpy(K.astype(np.float32)))
534
- self.images_info['poses'].append(torch.from_numpy(pose.astype(np.float32)))
535
-
536
- for k, v in self.images_info.items(): self.images_info[k] = torch.stack(v, 0) # stack all values
537
-
538
- self.train_batch = self._construct_ray_batch(self.images_info)
539
- self.train_batch_pseudo_fg = {}
540
- pseudo_fg_mask = torch.sum(self.train_batch['rgb']>0.99,1)!=3
541
- for k, v in self.train_batch.items():
542
- self.train_batch_pseudo_fg[k] = v[pseudo_fg_mask]
543
- self.train_ray_fg_num = int(torch.sum(pseudo_fg_mask).cpu().numpy())
544
- self.train_ray_num = self.num_images * self.image_size ** 2
545
- self._shuffle_train_batch()
546
- self._shuffle_train_fg_batch()
547
-
548
- def _shuffle_train_batch(self):
549
- self.train_batch_i = 0
550
- shuffle_idxs = torch.randperm(self.train_ray_num, device='cpu') # shuffle
551
- for k, v in self.train_batch.items():
552
- self.train_batch[k] = v[shuffle_idxs]
553
-
554
- def _shuffle_train_fg_batch(self):
555
- self.train_batch_fg_i = 0
556
- shuffle_idxs = torch.randperm(self.train_ray_fg_num, device='cpu') # shuffle
557
- for k, v in self.train_batch_pseudo_fg.items():
558
- self.train_batch_pseudo_fg[k] = v[shuffle_idxs]
559
-
560
-
561
- def training_step(self, batch, batch_idx):
562
- train_ray_batch = {k: v[self.train_batch_i:self.train_batch_i + self.train_batch_num].cuda() for k, v in self.train_batch.items()}
563
- self.train_batch_i += self.train_batch_num
564
- if self.train_batch_i + self.train_batch_num >= self.train_ray_num: self._shuffle_train_batch()
565
-
566
- if self.train_batch_fg_num>0:
567
- train_ray_batch_fg = {k: v[self.train_batch_fg_i:self.train_batch_fg_i+self.train_batch_fg_num].cuda() for k, v in self.train_batch_pseudo_fg.items()}
568
- self.train_batch_fg_i += self.train_batch_fg_num
569
- if self.train_batch_fg_i + self.train_batch_fg_num >= self.train_ray_fg_num: self._shuffle_train_fg_batch()
570
- for k, v in train_ray_batch_fg.items():
571
- train_ray_batch[k] = torch.cat([train_ray_batch[k], v], 0)
572
-
573
- loss, loss_batch = self.renderer.render_with_loss(train_ray_batch, is_train=True, step=self.global_step)
574
- self.log_dict(loss_batch, prog_bar=True, logger=True, on_step=True, on_epoch=False, rank_zero_only=True)
575
-
576
- self.log('step', self.global_step, prog_bar=True, on_step=True, on_epoch=False, logger=False, rank_zero_only=True)
577
- lr = self.optimizers().param_groups[0]['lr']
578
- self.log('lr', lr, prog_bar=True, logger=True, on_step=True, on_epoch=False, rank_zero_only=True)
579
- return loss
580
-
581
- def _slice_images_info(self, index):
582
- return {k:v[index:index+1] for k, v in self.images_info.items()}
583
-
584
- @torch.no_grad()
585
- def validation_step(self, batch, batch_idx):
586
- with torch.no_grad():
587
- if self.global_rank==0:
588
- # we output a rendered image
589
- images_info = self._slice_images_info(self.validation_index)
590
- self.validation_index += 1
591
- self.validation_index %= self.num_images
592
-
593
- test_ray_batch = self._construct_ray_batch(images_info)
594
- test_ray_batch = {k: v.cuda() for k,v in test_ray_batch.items()}
595
- test_ray_batch['near'], test_ray_batch['far'] = near_far_from_sphere(test_ray_batch['rays_o'], test_ray_batch['rays_d'])
596
- render_outputs = self.renderer.render(test_ray_batch, False, self.global_step)
597
-
598
- process = lambda x: (x.cpu().numpy() * 255).astype(np.uint8)
599
- h, w = self.image_size, self.image_size
600
- rgb = torch.clamp(render_outputs['rgb'].reshape(h, w, 3), max=1.0, min=0.0)
601
- mask = torch.clamp(render_outputs['mask'].reshape(h, w, 1), max=1.0, min=0.0)
602
- mask_ = torch.repeat_interleave(mask, 3, dim=-1)
603
- output_image = concat_images_list(process(rgb), process(mask_))
604
- if 'normal' in render_outputs:
605
- normal = torch.clamp((render_outputs['normal'].reshape(h, w, 3) + 1) / 2, max=1.0, min=0.0)
606
- normal = normal * mask # we only show the foreground normal
607
- output_image = concat_images_list(output_image, process(normal))
608
-
609
- # save images
610
- imsave(f'{self.log_dir}/images/{self.global_step}.jpg', output_image)
611
-
612
- def configure_optimizers(self):
613
- lr = self.learning_rate
614
- opt = torch.optim.AdamW([{"params": self.renderer.parameters(), "lr": lr},], lr=lr)
615
-
616
- def schedule_fn(step):
617
- total_step = self.total_steps
618
- warm_up_step = self.warm_up_steps
619
- warm_up_init = 0.02
620
- warm_up_end = 1.0
621
- final_lr = 0.02
622
- interval = 1000
623
- times = total_step // interval
624
- ratio = np.power(final_lr, 1/times)
625
- if step<warm_up_step:
626
- learning_rate = (step / warm_up_step) * (warm_up_end - warm_up_init) + warm_up_init
627
- else:
628
- learning_rate = ratio ** (step // interval) * warm_up_end
629
- return learning_rate
630
-
631
- if self.use_warm_up:
632
- scheduler = [{
633
- 'scheduler': LambdaLR(opt, lr_lambda=schedule_fn),
634
- 'interval': 'step',
635
- 'frequency': 1
636
- }]
637
- else:
638
- scheduler = []
639
- return [opt], scheduler
640
-
 
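
For reference, a standalone sketch of the LambdaLR multiplier built in configure_optimizers above; total_steps=2000 and warm_up_steps=100 are placeholder values (the real ones come from the NeuS config). The multiplier warms up linearly from 0.02 to 1.0, then decays in steps of 1000 iterations, reaching roughly 0.02 of the base learning rate by the end of training.

import numpy as np

def schedule_fn(step, total_steps=2000, warm_up_steps=100):
    warm_up_init, warm_up_end, final_lr, interval = 0.02, 1.0, 0.02, 1000
    ratio = np.power(final_lr, 1 / (total_steps // interval))  # per-interval decay factor
    if step < warm_up_steps:
        return (step / warm_up_steps) * (warm_up_end - warm_up_init) + warm_up_init
    return ratio ** (step // interval) * warm_up_end

for s in (0, 50, 100, 999, 1000, 1999):
    print(s, round(schedule_fn(s), 4))  # 0.02, 0.51, 1.0, 1.0, 0.1414, 0.1414
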
3drecon/run_NeuS.py DELETED
@@ -1,32 +0,0 @@
1
- import os
2
- import numpy as np
3
- from tqdm import tqdm
4
-
5
- # ours + NeuS
6
- DATA_DIR = "/home/xin/data/EscherNet/Data/GSO30" # GSO
7
- exp_dir = "/home/xin/6DoF/GSO3D/"
8
-
9
- config = "configs/neus_36.yaml"
10
- exps = [1]
11
- # exps = [1, 2, 3, 5, 10]
12
-
13
- for exp in exps:
14
- OUTPUT_DIR = os.path.join(exp_dir, f"logs_GSO_T{exp}M36_99k")
15
- output_NeuS = f"ours_GSO_T{exp}"
16
- os.makedirs(output_NeuS, exist_ok=True)
17
- obj_names = os.listdir(DATA_DIR)
18
- for obj_name in tqdm(obj_names):
19
- if os.path.exists(os.path.join(output_NeuS, "NeuS", obj_name, "mesh.ply")):
20
- print("NeuS already trained for: ", obj_name)
21
- continue
22
- # remove the folder for new training
23
- os.system(f"rm -rf {output_NeuS}/NeuS/{obj_name}")
24
- print("Training NeuS for: ", obj_name)
25
- input_img = os.path.join(OUTPUT_DIR, obj_name, "0.png")
26
- # input_img = os.path.join(OUTPUT_DIR, obj_name, "gt.png") # ground truth image
27
- cmd = f"python train_renderer.py -i {input_img} \
28
- -d {DATA_DIR} \
29
- -n {obj_name} \
30
- -b {config} \
31
- -l {output_NeuS}/NeuS"
32
- os.system(cmd)
 
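
The loop above only assembles and launches a shell command per object. A minimal sketch of what that command looks like for a single hypothetical object (the name "alarm_clock" and the paths are placeholders); the flags map onto the argparse options declared in 3drecon/train_renderer.py below.

import os

obj_name = "alarm_clock"                                   # placeholder object name
input_img = os.path.join("/path/to/logs_GSO_T1M36_99k", obj_name, "0.png")
cmd = (f"python train_renderer.py -i {input_img} "
       f"-d /path/to/GSO30 -n {obj_name} "
       f"-b configs/neus_36.yaml -l ours_GSO_T1/NeuS")
print(cmd)
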
3drecon/train_renderer.py DELETED
@@ -1,188 +0,0 @@
1
- import argparse
2
-
3
- import imageio
4
- import numpy as np
5
- import torch
6
- import torch.nn.functional as F
7
- from pathlib import Path
8
-
9
- import trimesh
10
- from omegaconf import OmegaConf
11
- from pytorch_lightning.callbacks import ModelCheckpoint, LearningRateMonitor, Callback
12
- from pytorch_lightning.loggers import TensorBoardLogger
13
- from pytorch_lightning import Trainer
14
- from skimage.io import imsave
15
- from tqdm import tqdm
16
-
17
- import mcubes
18
-
19
- from renderer.renderer import NeuSRenderer, DEFAULT_SIDE_LENGTH
20
- from util import instantiate_from_config, read_pickle
21
-
22
- class ResumeCallBacks(Callback):
23
- def __init__(self):
24
- pass
25
-
26
- def on_train_start(self, trainer, pl_module):
27
- pl_module.optimizers().param_groups = pl_module.optimizers()._optimizer.param_groups
28
-
29
- def render_images(model, output,):
30
- # render from model
31
- n = 180
32
- azimuths = (np.arange(n) / n * np.pi * 2).astype(np.float32)
33
- elevations = np.deg2rad(np.asarray([30] * n).astype(np.float32))
34
- K, _, _, _, poses = read_pickle(f'meta_info/camera-16.pkl')
36
- h, w = 256, 256
37
- default_size = 256
38
- K = np.diag([w/default_size,h/default_size,1.0]) @ K
39
- imgs = []
40
- for ni in tqdm(range(n)):
41
- # R = euler2mat(azimuths[ni], elevations[ni], 0, 'szyx')
42
- # R = np.asarray([[0,-1,0],[0,0,-1],[1,0,0]]) @ R
43
- e, a = elevations[ni], azimuths[ni]
44
- row1 = np.asarray([np.sin(e)*np.cos(a),np.sin(e)*np.sin(a),-np.cos(e)])
45
- row0 = np.asarray([-np.sin(a),np.cos(a), 0])
46
- row2 = np.cross(row0, row1)
47
- R = np.stack([row0,row1,row2],0)
48
- t = np.asarray([0,0,1.5])
49
- pose = np.concatenate([R,t[:,None]],1)
50
- pose_ = torch.from_numpy(pose.astype(np.float32)).unsqueeze(0)
51
- K_ = torch.from_numpy(K.astype(np.float32)).unsqueeze(0) # [1,3,3]
52
-
53
- coords = torch.stack(torch.meshgrid(torch.arange(h), torch.arange(w)), -1)[:, :, (1, 0)] # h,w,2
54
- coords = coords.float()[None, :, :, :].repeat(1, 1, 1, 1) # imn,h,w,2
55
- coords = coords.reshape(1, h * w, 2)
56
- coords = torch.cat([coords, torch.ones(1, h * w, 1, dtype=torch.float32)], 2) # imn,h*w,3
57
-
58
- # imn,h*w,3 @ imn,3,3 => imn,h*w,3
59
- rays_d = coords @ torch.inverse(K_).permute(0, 2, 1)
60
- R, t = pose_[:, :, :3], pose_[:, :, 3:]
61
- rays_d = rays_d @ R
62
- rays_d = F.normalize(rays_d, dim=-1)
63
- rays_o = -R.permute(0, 2, 1) @ t # imn,3,3 @ imn,3,1
64
- rays_o = rays_o.permute(0, 2, 1).repeat(1, h * w, 1) # imn,h*w,3
65
-
66
- ray_batch = {
67
- 'rays_o': rays_o.reshape(-1,3).cuda(),
68
- 'rays_d': rays_d.reshape(-1,3).cuda(),
69
- }
70
- with torch.no_grad():
71
- image = model.renderer.render(ray_batch,False,5000)['rgb'].reshape(h,w,3)
72
- image = (image.cpu().numpy() * 255).astype(np.uint8)
73
- imgs.append(image)
74
-
75
- imageio.mimsave(f'{output}/rendering.mp4', imgs, fps=30)
76
-
77
- def extract_fields(bound_min, bound_max, resolution, query_func, batch_size=64, outside_val=1.0):
78
- N = batch_size
79
- X = torch.linspace(bound_min[0], bound_max[0], resolution).split(N)
80
- Y = torch.linspace(bound_min[1], bound_max[1], resolution).split(N)
81
- Z = torch.linspace(bound_min[2], bound_max[2], resolution).split(N)
82
-
83
- u = np.zeros([resolution, resolution, resolution], dtype=np.float32)
84
- with torch.no_grad():
85
- for xi, xs in enumerate(X):
86
- for yi, ys in enumerate(Y):
87
- for zi, zs in enumerate(Z):
88
- xx, yy, zz = torch.meshgrid(xs, ys, zs)
89
- pts = torch.cat([xx.reshape(-1, 1), yy.reshape(-1, 1), zz.reshape(-1, 1)], dim=-1).cuda()
90
- val = query_func(pts).detach()
91
- outside_mask = torch.norm(pts,dim=-1)>=1.0
92
- val[outside_mask]=outside_val
93
- val = val.reshape(len(xs), len(ys), len(zs)).cpu().numpy()
94
- u[xi * N: xi * N + len(xs), yi * N: yi * N + len(ys), zi * N: zi * N + len(zs)] = val
95
- return u
96
-
97
- def extract_geometry(bound_min, bound_max, resolution, threshold, query_func, color_func, outside_val=1.0):
98
- u = extract_fields(bound_min, bound_max, resolution, query_func, outside_val=outside_val)
99
- vertices, triangles = mcubes.marching_cubes(u, threshold)
100
- b_max_np = bound_max.detach().cpu().numpy()
101
- b_min_np = bound_min.detach().cpu().numpy()
102
-
103
- vertices = vertices / (resolution - 1.0) * (b_max_np - b_min_np)[None, :] + b_min_np[None, :]
104
- vertex_colors = color_func(vertices)
105
- return vertices, triangles, vertex_colors
106
-
107
- def extract_mesh(model, output, resolution=512):
108
- if not isinstance(model.renderer, NeuSRenderer): return
109
- bbox_min = -torch.ones(3)*DEFAULT_SIDE_LENGTH
110
- bbox_max = torch.ones(3)*DEFAULT_SIDE_LENGTH
111
- with torch.no_grad():
112
- vertices, triangles, vertex_colors = extract_geometry(bbox_min, bbox_max, resolution, 0, lambda x: model.renderer.sdf_network.sdf(x), lambda x: model.renderer.get_vertex_colors(x))
113
-
114
- # output geometry
115
- mesh = trimesh.Trimesh(vertices, triangles, vertex_colors=vertex_colors)
116
- mesh.export(str(f'{output}/mesh.ply'))
117
-
118
- def main():
119
- parser = argparse.ArgumentParser()
120
- parser.add_argument('-i', '--image_path', type=str, required=True)
121
- parser.add_argument('-n', '--name', type=str, required=True)
122
- parser.add_argument('-b', '--base', type=str, default='configs/neus.yaml')
123
- parser.add_argument('-d', '--data_path', type=str, default='/data/GSO/')
124
- parser.add_argument('-l', '--log', type=str, default='output/renderer')
125
- parser.add_argument('-s', '--seed', type=int, default=6033)
126
- parser.add_argument('-g', '--gpus', type=str, default='0,')
127
- parser.add_argument('-r', '--resume', action='store_true', default=False, dest='resume')
128
- parser.add_argument('--fp16', action='store_true', default=False, dest='fp16')
129
- opt = parser.parse_args()
130
- # seed_everything(opt.seed)
131
-
132
- # configs
133
- cfg = OmegaConf.load(opt.base)
134
- name = opt.name
135
- log_dir, ckpt_dir = Path(opt.log) / name, Path(opt.log) / name / 'ckpt'
136
- cfg.model.params['image_path'] = opt.image_path
137
- cfg.model.params['log_dir'] = log_dir
138
- cfg.model.params['data_path'] = opt.data_path
139
-
140
- # setup
141
- log_dir.mkdir(exist_ok=True, parents=True)
142
- ckpt_dir.mkdir(exist_ok=True, parents=True)
143
- trainer_config = cfg.trainer
144
- callback_config = cfg.callbacks
145
- model_config = cfg.model
146
- data_config = cfg.data
147
-
148
- data_config.params.seed = opt.seed
149
- data = instantiate_from_config(data_config)
150
- data.prepare_data()
151
- data.setup('fit')
152
-
153
- model = instantiate_from_config(model_config,)
154
- model.cpu()
155
- model.learning_rate = model_config.base_lr
156
-
157
- # logger
158
- logger = TensorBoardLogger(save_dir=log_dir, name='tensorboard_logs')
159
- callbacks=[]
160
- callbacks.append(LearningRateMonitor(logging_interval='step'))
161
- callbacks.append(ModelCheckpoint(dirpath=ckpt_dir, filename="{epoch:06}", verbose=True, save_last=True, every_n_train_steps=callback_config.save_interval))
162
-
163
- # trainer
164
- trainer_config.update({
165
- "accelerator": "cuda", "check_val_every_n_epoch": None,
166
- "benchmark": True, "num_sanity_val_steps": 0,
167
- "devices": 1, "gpus": opt.gpus,
168
- })
169
- if opt.fp16:
170
- trainer_config['precision']=16
171
-
172
- if opt.resume:
173
- callbacks.append(ResumeCallBacks())
174
- trainer_config['resume_from_checkpoint'] = str(ckpt_dir / 'last.ckpt')
175
- else:
176
- if (ckpt_dir / 'last.ckpt').exists():
177
- raise RuntimeError(f"checkpoint {ckpt_dir / 'last.ckpt'} existing ...")
178
- trainer = Trainer.from_argparse_args(args=argparse.Namespace(), **trainer_config, logger=logger, callbacks=callbacks)
179
-
180
- trainer.fit(model, data)
181
-
182
- model = model.cuda().eval()
183
-
184
- # render_images(model, log_dir)
185
- extract_mesh(model, log_dir)
186
-
187
- if __name__=="__main__":
188
- main()
 
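
To illustrate the mesh-extraction path above without a trained checkpoint, here is a sketch that runs the same marching-cubes step on an analytic sphere SDF; resolution=64 and the 0.4 sphere radius are arbitrary choices for the example.

import numpy as np
import torch
import mcubes

side = 0.6                      # DEFAULT_SIDE_LENGTH used for the bounding box above
resolution = 64
xs = torch.linspace(-side, side, resolution)
grid = torch.stack(torch.meshgrid(xs, xs, xs, indexing='ij'), dim=-1)   # [R, R, R, 3]
sdf = torch.linalg.norm(grid, dim=-1) - 0.4                             # sphere of radius 0.4
vertices, triangles = mcubes.marching_cubes(sdf.numpy(), 0.0)           # zero level set
vertices = vertices / (resolution - 1.0) * (2 * side) - side            # grid indices -> world
print(vertices.shape, triangles.shape)
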
3drecon/util.py DELETED
@@ -1,54 +0,0 @@
1
- import importlib
2
- import pickle
3
- import numpy as np
4
- import cv2
5
-
6
- def instantiate_from_config(config):
7
- if not "target" in config:
8
- if config == '__is_first_stage__':
9
- return None
10
- elif config == "__is_unconditional__":
11
- return None
12
- raise KeyError("Expected key `target` to instantiate.")
13
- return get_obj_from_str(config["target"])(**config.get("params", dict()))
14
-
15
-
16
- def get_obj_from_str(string, reload=False):
17
- module, cls = string.rsplit(".", 1)
18
- if reload:
19
- module_imp = importlib.import_module(module)
20
- importlib.reload(module_imp)
21
- return getattr(importlib.import_module(module, package=None), cls)
22
-
23
- def read_pickle(pkl_path):
24
- with open(pkl_path, 'rb') as f:
25
- return pickle.load(f)
26
-
27
- def output_points(fn,pts,colors=None):
28
- with open(fn, 'w') as f:
29
- for pi, pt in enumerate(pts):
30
- f.write(f'{pt[0]:.6f} {pt[1]:.6f} {pt[2]:.6f} ')
31
- if colors is not None:
32
- f.write(f'{int(colors[pi,0])} {int(colors[pi,1])} {int(colors[pi,2])}')
33
- f.write('\n')
34
-
35
- def concat_images(img0,img1,vert=False):
36
- if not vert:
37
- h0,h1=img0.shape[0],img1.shape[0],
38
- if h0<h1: img0=cv2.copyMakeBorder(img0,0,h1-h0,0,0,borderType=cv2.BORDER_CONSTANT,value=0)
39
- if h1<h0: img1=cv2.copyMakeBorder(img1,0,h0-h1,0,0,borderType=cv2.BORDER_CONSTANT,value=0)
40
- img = np.concatenate([img0, img1], axis=1)
41
- else:
42
- w0,w1=img0.shape[1],img1.shape[1]
43
- if w0<w1: img0=cv2.copyMakeBorder(img0,0,0,0,w1-w0,borderType=cv2.BORDER_CONSTANT,value=0)
44
- if w1<w0: img1=cv2.copyMakeBorder(img1,0,0,0,w0-w1,borderType=cv2.BORDER_CONSTANT,value=0)
45
- img = np.concatenate([img0, img1], axis=0)
46
-
47
- return img
48
-
49
- def concat_images_list(*args,vert=False):
50
- if len(args)==1: return args[0]
51
- img_out=args[0]
52
- for img in args[1:]:
53
- img_out=concat_images(img_out,img,vert)
54
- return img_out
 
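
A short usage sketch for the image-tiling helpers above (the import path matches how the other scripts in this repo import util): two dummy images of different heights are zero-padded so they can be tiled horizontally.

import numpy as np
from util import concat_images_list

img_a = np.full((4, 3, 3), 255, dtype=np.uint8)   # 4x3 white image
img_b = np.full((2, 5, 3), 128, dtype=np.uint8)   # 2x5 grey image
tiled = concat_images_list(img_a, img_b)          # img_b is padded to height 4
print(tiled.shape)                                # (4, 8, 3)
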
4DoF/CN_encoder.py DELETED
@@ -1,36 +0,0 @@
1
- from transformers import ConvNextV2Model
2
- import torch
3
- from typing import Optional
4
- import einops
5
-
6
- class CN_encoder(ConvNextV2Model):
7
- def __init__(self, config):
8
- super().__init__(config)
9
-
10
- def forward(
11
- self,
12
- pixel_values: torch.FloatTensor = None,
13
- output_hidden_states: Optional[bool] = None,
14
- return_dict: Optional[bool] = None,
15
- ):
16
- output_hidden_states = (
17
- output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
18
- )
19
- return_dict = return_dict if return_dict is not None else self.config.use_return_dict
20
-
21
- if pixel_values is None:
22
- raise ValueError("You have to specify pixel_values")
23
-
24
- embedding_output = self.embeddings(pixel_values)
25
-
26
- encoder_outputs = self.encoder(
27
- embedding_output,
28
- output_hidden_states=output_hidden_states,
29
- return_dict=return_dict,
30
- )
31
-
32
- last_hidden_state = encoder_outputs[0]
33
- image_embeddings = einops.rearrange(last_hidden_state, 'b c h w -> b (h w) c')
34
- image_embeddings = self.layernorm(image_embeddings)
35
-
36
- return image_embeddings
 
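
A sketch of how the encoder above can be instantiated (the ConvNeXt-V2 checkpoint name and the import path are assumptions; the training script decides which weights are actually loaded): the forward pass returns per-patch embeddings of shape [B, H*W, C] suitable for cross-attention.

import torch
from CN_encoder import CN_encoder  # assumed import path for this sketch

encoder = CN_encoder.from_pretrained("facebook/convnextv2-tiny-22k-224")  # assumed checkpoint
pixel_values = torch.randn(2, 3, 256, 256)
with torch.no_grad():
    image_embeddings = encoder(pixel_values)
print(image_embeddings.shape)   # e.g. torch.Size([2, 64, 768]) for 32x downsampling
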
4DoF/dataset.py DELETED
@@ -1,228 +0,0 @@
1
- import os
2
- import math
3
- from pathlib import Path
4
- import torch
5
- import torchvision
6
- from torch.utils.data import Dataset, DataLoader
7
- from torchvision import transforms
8
- from PIL import Image
9
- import numpy as np
10
- import webdataset as wds
11
- from torch.utils.data.distributed import DistributedSampler
12
- import matplotlib.pyplot as plt
13
- import sys
14
-
15
- class ObjaverseDataLoader():
16
- def __init__(self, root_dir, batch_size, total_view=12, num_workers=4):
17
- self.root_dir = root_dir
18
- self.batch_size = batch_size
19
- self.num_workers = num_workers
20
- self.total_view = total_view
21
-
22
- image_transforms = [torchvision.transforms.Resize((256, 256)),
23
- transforms.ToTensor(),
24
- transforms.Normalize([0.5], [0.5])]
25
- self.image_transforms = torchvision.transforms.Compose(image_transforms)
26
-
27
- def train_dataloader(self):
28
- dataset = ObjaverseData(root_dir=self.root_dir, total_view=self.total_view, validation=False,
29
- image_transforms=self.image_transforms)
30
- # sampler = DistributedSampler(dataset)
31
- return wds.WebLoader(dataset, batch_size=self.batch_size, num_workers=self.num_workers, shuffle=False)
32
- # sampler=sampler)
33
-
34
- def val_dataloader(self):
35
- dataset = ObjaverseData(root_dir=self.root_dir, total_view=self.total_view, validation=True,
36
- image_transforms=self.image_transforms)
37
- sampler = DistributedSampler(dataset)
38
- return wds.WebLoader(dataset, batch_size=self.batch_size, num_workers=self.num_workers, shuffle=False)
39
-
40
- def cartesian_to_spherical(xyz):
41
- ptsnew = np.hstack((xyz, np.zeros(xyz.shape)))
42
- xy = xyz[:, 0] ** 2 + xyz[:, 1] ** 2
43
- z = np.sqrt(xy + xyz[:, 2] ** 2)
44
- theta = np.arctan2(np.sqrt(xy), xyz[:, 2]) # for elevation angle defined from Z-axis down
45
- # ptsnew[:,4] = np.arctan2(xyz[:,2], np.sqrt(xy)) # for elevation angle defined from XY-plane up
46
- azimuth = np.arctan2(xyz[:, 1], xyz[:, 0])
47
- return np.array([theta, azimuth, z])
48
-
49
- def get_pose(target_RT):
50
- target_RT = target_RT[:3, :]
51
- R, T = target_RT[:3, :3], target_RT[:, -1]
52
- T_target = -R.T @ T
53
- theta_target, azimuth_target, z_target = cartesian_to_spherical(T_target[None, :])
54
- # assert if z_target is out of range
55
- if z_target.item() < 1.5 or z_target.item() > 2.2:
56
- # print('z_target out of range 1.5-2.2', z_target.item())
57
- z_target = np.clip(z_target.item(), 1.5, 2.2)
58
- # with log scale for radius
59
- target_T = torch.tensor([theta_target.item(), azimuth_target.item(), (np.log(z_target.item()) - np.log(1.5))/(np.log(2.2)-np.log(1.5)) * torch.pi, torch.tensor(0)])
60
- assert torch.all(target_T <= torch.pi) and torch.all(target_T >= -torch.pi)
61
- return target_T.numpy()
62
-
63
- class ObjaverseData(Dataset):
64
- def __init__(self,
65
- root_dir='.objaverse/hf-objaverse-v1/views',
66
- image_transforms=None,
67
- total_view=12,
68
- validation=False,
69
- T_in=1,
70
- T_out=1,
71
- fix_sample=False,
72
- ) -> None:
73
- """Create a dataset from a folder of images.
74
- If you pass in a root directory it will be searched for images
75
- ending in ext (ext can be a list)
76
- """
77
- self.root_dir = Path(root_dir)
78
- self.total_view = total_view
79
- self.T_in = T_in
80
- self.T_out = T_out
81
- self.fix_sample = fix_sample
82
-
83
- self.paths = []
84
- # # include all folders
85
- # for folder in os.listdir(self.root_dir):
86
- # if os.path.isdir(os.path.join(self.root_dir, folder)):
87
- # self.paths.append(folder)
88
- # load ids from .npy so we have exactly the same ids/order
89
- self.paths = np.load("../scripts/obj_ids.npy")
90
- # # only use 100K objects for ablation study
91
- # self.paths = self.paths[:100000]
92
- total_objects = len(self.paths)
93
- assert total_objects == 790152, 'total objects %d' % total_objects
94
- if validation:
95
- self.paths = self.paths[math.floor(total_objects / 100. * 99.):] # used last 1% as validation
96
- else:
97
- self.paths = self.paths[:math.floor(total_objects / 100. * 99.)] # used first 99% as training
98
- print('============= length of dataset %d =============' % len(self.paths))
99
- self.tform = image_transforms
100
-
101
- downscale = 512 / 256.
102
- self.fx = 560. / downscale
103
- self.fy = 560. / downscale
104
- self.intrinsic = torch.tensor([[self.fx, 0, 128., 0, self.fy, 128., 0, 0, 1.]], dtype=torch.float64).view(3, 3)
105
-
106
- def __len__(self):
107
- return len(self.paths)
108
-
109
- def cartesian_to_spherical(self, xyz):
110
- ptsnew = np.hstack((xyz, np.zeros(xyz.shape)))
111
- xy = xyz[:, 0] ** 2 + xyz[:, 1] ** 2
112
- z = np.sqrt(xy + xyz[:, 2] ** 2)
113
- theta = np.arctan2(np.sqrt(xy), xyz[:, 2]) # for elevation angle defined from Z-axis down
114
- # ptsnew[:,4] = np.arctan2(xyz[:,2], np.sqrt(xy)) # for elevation angle defined from XY-plane up
115
- azimuth = np.arctan2(xyz[:, 1], xyz[:, 0])
116
- return np.array([theta, azimuth, z])
117
-
118
- def get_T(self, target_RT, cond_RT):
119
- R, T = target_RT[:3, :3], target_RT[:, -1]
120
- T_target = -R.T @ T
121
-
122
- R, T = cond_RT[:3, :3], cond_RT[:, -1]
123
- T_cond = -R.T @ T
124
-
125
- theta_cond, azimuth_cond, z_cond = self.cartesian_to_spherical(T_cond[None, :])
126
- theta_target, azimuth_target, z_target = self.cartesian_to_spherical(T_target[None, :])
127
-
128
- d_theta = theta_target - theta_cond
129
- d_azimuth = (azimuth_target - azimuth_cond) % (2 * math.pi)
130
- d_z = z_target - z_cond
131
-
132
- d_T = torch.tensor([d_theta.item(), math.sin(d_azimuth.item()), math.cos(d_azimuth.item()), d_z.item()])
133
- return d_T
134
-
135
- def get_pose(self, target_RT):
136
- R, T = target_RT[:3, :3], target_RT[:, -1]
137
- T_target = -R.T @ T
138
- theta_target, azimuth_target, z_target = self.cartesian_to_spherical(T_target[None, :])
139
- # assert if z_target is out of range
140
- if z_target.item() < 1.5 or z_target.item() > 2.2:
141
- # print('z_target out of range 1.5-2.2', z_target.item())
142
- z_target = np.clip(z_target.item(), 1.5, 2.2)
143
- # with log scale for radius
144
- target_T = torch.tensor([theta_target.item(), azimuth_target.item(), (np.log(z_target.item()) - np.log(1.5))/(np.log(2.2)-np.log(1.5)) * torch.pi, torch.tensor(0)])
145
- assert torch.all(target_T <= torch.pi) and torch.all(target_T >= -torch.pi)
146
- return target_T
147
-
148
- def load_im(self, path, color):
149
- '''
150
- replace background pixel with random color in rendering
151
- '''
152
- try:
153
- img = plt.imread(path)
154
- except:
155
- print(path)
156
- sys.exit()
157
- img[img[:, :, -1] == 0.] = color
158
- img = Image.fromarray(np.uint8(img[:, :, :3] * 255.))
159
- return img
160
-
161
- def __getitem__(self, index):
162
- data = {}
163
- total_view = 12
164
-
165
- if self.fix_sample:
166
- if self.T_out > 1:
167
- indexes = range(total_view)
168
- index_targets = list(indexes[:2]) + list(indexes[-(self.T_out-2):])
169
- index_inputs = indexes[1:self.T_in+1] # one overlap identity
170
- else:
171
- indexes = range(total_view)
172
- index_targets = indexes[:self.T_out]
173
- index_inputs = indexes[self.T_out-1:self.T_in+self.T_out-1] # one overlap identity
174
- else:
175
- assert self.T_in + self.T_out <= total_view
176
- # training with replace, including identity
177
- indexes = np.random.choice(range(total_view), self.T_in+self.T_out, replace=True)
178
- index_inputs = indexes[:self.T_in]
179
- index_targets = indexes[self.T_in:]
180
- filename = os.path.join(self.root_dir, self.paths[index])
181
-
182
- color = [1., 1., 1., 1.]
183
-
184
- try:
185
- input_ims = []
186
- target_ims = []
187
- target_Ts = []
188
- cond_Ts = []
189
- for i, index_input in enumerate(index_inputs):
190
- input_im = self.process_im(self.load_im(os.path.join(filename, '%03d.png' % index_input), color))
191
- input_ims.append(input_im)
192
- input_RT = np.load(os.path.join(filename, '%03d.npy' % index_input))
193
- cond_Ts.append(self.get_pose(input_RT))
194
- for i, index_target in enumerate(index_targets):
195
- target_im = self.process_im(self.load_im(os.path.join(filename, '%03d.png' % index_target), color))
196
- target_ims.append(target_im)
197
- target_RT = np.load(os.path.join(filename, '%03d.npy' % index_target))
198
- target_Ts.append(self.get_pose(target_RT))
199
- except:
200
- print('error loading data ', filename)
201
- filename = os.path.join(self.root_dir, '0a01f314e2864711aa7e33bace4bd8c8') # this one we know is valid
202
- input_ims = []
203
- target_ims = []
204
- target_Ts = []
205
- cond_Ts = []
206
- # very hacky solution, sorry about this
207
- for i, index_input in enumerate(index_inputs):
208
- input_im = self.process_im(self.load_im(os.path.join(filename, '%03d.png' % index_input), color))
209
- input_ims.append(input_im)
210
- input_RT = np.load(os.path.join(filename, '%03d.npy' % index_input))
211
- cond_Ts.append(self.get_pose(input_RT))
212
- for i, index_target in enumerate(index_targets):
213
- target_im = self.process_im(self.load_im(os.path.join(filename, '%03d.png' % index_target), color))
214
- target_ims.append(target_im)
215
- target_RT = np.load(os.path.join(filename, '%03d.npy' % index_target))
216
- target_Ts.append(self.get_pose(target_RT))
217
-
218
- # stack to batch
219
- data['image_input'] = torch.stack(input_ims, dim=0)
220
- data['image_target'] = torch.stack(target_ims, dim=0)
221
- data['pose_out'] = torch.stack(target_Ts, dim=0)
222
- data['pose_in'] = torch.stack(cond_Ts, dim=0)
223
-
224
- return data
225
-
226
- def process_im(self, im):
227
- im = im.convert("RGB")
228
- return self.tform(im)
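
Note on the deleted `get_pose` above: the camera radius is clamped to the 1.5 to 2.2 range used for rendering and then mapped onto [0, pi] on a log scale. A minimal standalone sketch of just that mapping (the bounds come from the code above; the function name is illustrative):

```python
import numpy as np
import torch

Z_MIN, Z_MAX = 1.5, 2.2  # camera-distance range assumed by the dataset code above

def normalize_radius(z: float) -> torch.Tensor:
    """Clamp z to [Z_MIN, Z_MAX] and map it onto [0, pi] on a log scale."""
    z = float(np.clip(z, Z_MIN, Z_MAX))
    scale = (np.log(z) - np.log(Z_MIN)) / (np.log(Z_MAX) - np.log(Z_MIN))
    return torch.tensor(scale * np.pi)

# normalize_radius(1.5) -> tensor(0.), normalize_radius(2.2) -> tensor(pi)
```
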
4DoF/diffusers/__init__.py DELETED
@@ -1,281 +0,0 @@
1
- __version__ = "0.18.2"
2
-
3
- from .configuration_utils import ConfigMixin
4
- from .utils import (
5
- OptionalDependencyNotAvailable,
6
- is_flax_available,
7
- is_inflect_available,
8
- is_invisible_watermark_available,
9
- is_k_diffusion_available,
10
- is_k_diffusion_version,
11
- is_librosa_available,
12
- is_note_seq_available,
13
- is_onnx_available,
14
- is_scipy_available,
15
- is_torch_available,
16
- is_torchsde_available,
17
- is_transformers_available,
18
- is_transformers_version,
19
- is_unidecode_available,
20
- logging,
21
- )
22
-
23
-
24
- try:
25
- if not is_onnx_available():
26
- raise OptionalDependencyNotAvailable()
27
- except OptionalDependencyNotAvailable:
28
- from .utils.dummy_onnx_objects import * # noqa F403
29
- else:
30
- from .pipelines import OnnxRuntimeModel
31
-
32
- try:
33
- if not is_torch_available():
34
- raise OptionalDependencyNotAvailable()
35
- except OptionalDependencyNotAvailable:
36
- from .utils.dummy_pt_objects import * # noqa F403
37
- else:
38
- from .models import (
39
- AutoencoderKL,
40
- ControlNetModel,
41
- ModelMixin,
42
- PriorTransformer,
43
- T5FilmDecoder,
44
- Transformer2DModel,
45
- UNet1DModel,
46
- UNet2DConditionModel,
47
- UNet2DModel,
48
- UNet3DConditionModel,
49
- VQModel,
50
- )
51
- from .optimization import (
52
- get_constant_schedule,
53
- get_constant_schedule_with_warmup,
54
- get_cosine_schedule_with_warmup,
55
- get_cosine_with_hard_restarts_schedule_with_warmup,
56
- get_linear_schedule_with_warmup,
57
- get_polynomial_decay_schedule_with_warmup,
58
- get_scheduler,
59
- )
60
- from .pipelines import (
61
- AudioPipelineOutput,
62
- ConsistencyModelPipeline,
63
- DanceDiffusionPipeline,
64
- DDIMPipeline,
65
- DDPMPipeline,
66
- DiffusionPipeline,
67
- DiTPipeline,
68
- ImagePipelineOutput,
69
- KarrasVePipeline,
70
- LDMPipeline,
71
- LDMSuperResolutionPipeline,
72
- PNDMPipeline,
73
- RePaintPipeline,
74
- ScoreSdeVePipeline,
75
- )
76
- from .schedulers import (
77
- CMStochasticIterativeScheduler,
78
- DDIMInverseScheduler,
79
- DDIMParallelScheduler,
80
- DDIMScheduler,
81
- DDPMParallelScheduler,
82
- DDPMScheduler,
83
- DEISMultistepScheduler,
84
- DPMSolverMultistepInverseScheduler,
85
- DPMSolverMultistepScheduler,
86
- DPMSolverSinglestepScheduler,
87
- EulerAncestralDiscreteScheduler,
88
- EulerDiscreteScheduler,
89
- HeunDiscreteScheduler,
90
- IPNDMScheduler,
91
- KarrasVeScheduler,
92
- KDPM2AncestralDiscreteScheduler,
93
- KDPM2DiscreteScheduler,
94
- PNDMScheduler,
95
- RePaintScheduler,
96
- SchedulerMixin,
97
- ScoreSdeVeScheduler,
98
- UnCLIPScheduler,
99
- UniPCMultistepScheduler,
100
- VQDiffusionScheduler,
101
- )
102
- from .training_utils import EMAModel
103
-
104
- try:
105
- if not (is_torch_available() and is_scipy_available()):
106
- raise OptionalDependencyNotAvailable()
107
- except OptionalDependencyNotAvailable:
108
- from .utils.dummy_torch_and_scipy_objects import * # noqa F403
109
- else:
110
- from .schedulers import LMSDiscreteScheduler
111
-
112
- try:
113
- if not (is_torch_available() and is_torchsde_available()):
114
- raise OptionalDependencyNotAvailable()
115
- except OptionalDependencyNotAvailable:
116
- from .utils.dummy_torch_and_torchsde_objects import * # noqa F403
117
- else:
118
- from .schedulers import DPMSolverSDEScheduler
119
-
120
- try:
121
- if not (is_torch_available() and is_transformers_available()):
122
- raise OptionalDependencyNotAvailable()
123
- except OptionalDependencyNotAvailable:
124
- from .utils.dummy_torch_and_transformers_objects import * # noqa F403
125
- else:
126
- from .pipelines import (
127
- AltDiffusionImg2ImgPipeline,
128
- AltDiffusionPipeline,
129
- AudioLDMPipeline,
130
- CycleDiffusionPipeline,
131
- IFImg2ImgPipeline,
132
- IFImg2ImgSuperResolutionPipeline,
133
- IFInpaintingPipeline,
134
- IFInpaintingSuperResolutionPipeline,
135
- IFPipeline,
136
- IFSuperResolutionPipeline,
137
- ImageTextPipelineOutput,
138
- KandinskyImg2ImgPipeline,
139
- KandinskyInpaintPipeline,
140
- KandinskyPipeline,
141
- KandinskyPriorPipeline,
142
- KandinskyV22ControlnetImg2ImgPipeline,
143
- KandinskyV22ControlnetPipeline,
144
- KandinskyV22Img2ImgPipeline,
145
- KandinskyV22InpaintPipeline,
146
- KandinskyV22Pipeline,
147
- KandinskyV22PriorEmb2EmbPipeline,
148
- KandinskyV22PriorPipeline,
149
- LDMTextToImagePipeline,
150
- PaintByExamplePipeline,
151
- SemanticStableDiffusionPipeline,
152
- ShapEImg2ImgPipeline,
153
- ShapEPipeline,
154
- StableDiffusionAttendAndExcitePipeline,
155
- StableDiffusionControlNetImg2ImgPipeline,
156
- StableDiffusionControlNetInpaintPipeline,
157
- StableDiffusionControlNetPipeline,
158
- StableDiffusionDepth2ImgPipeline,
159
- StableDiffusionDiffEditPipeline,
160
- StableDiffusionImageVariationPipeline,
161
- StableDiffusionImg2ImgPipeline,
162
- StableDiffusionInpaintPipeline,
163
- StableDiffusionInpaintPipelineLegacy,
164
- StableDiffusionInstructPix2PixPipeline,
165
- StableDiffusionLatentUpscalePipeline,
166
- StableDiffusionLDM3DPipeline,
167
- StableDiffusionModelEditingPipeline,
168
- StableDiffusionPanoramaPipeline,
169
- StableDiffusionParadigmsPipeline,
170
- StableDiffusionPipeline,
171
- StableDiffusionPipelineSafe,
172
- StableDiffusionPix2PixZeroPipeline,
173
- StableDiffusionSAGPipeline,
174
- StableDiffusionUpscalePipeline,
175
- StableUnCLIPImg2ImgPipeline,
176
- StableUnCLIPPipeline,
177
- TextToVideoSDPipeline,
178
- TextToVideoZeroPipeline,
179
- UnCLIPImageVariationPipeline,
180
- UnCLIPPipeline,
181
- UniDiffuserModel,
182
- UniDiffuserPipeline,
183
- UniDiffuserTextDecoder,
184
- VersatileDiffusionDualGuidedPipeline,
185
- VersatileDiffusionImageVariationPipeline,
186
- VersatileDiffusionPipeline,
187
- VersatileDiffusionTextToImagePipeline,
188
- VideoToVideoSDPipeline,
189
- VQDiffusionPipeline,
190
- )
191
-
192
- try:
193
- if not (is_torch_available() and is_transformers_available() and is_invisible_watermark_available()):
194
- raise OptionalDependencyNotAvailable()
195
- except OptionalDependencyNotAvailable:
196
- from .utils.dummy_torch_and_transformers_and_invisible_watermark_objects import * # noqa F403
197
- else:
198
- from .pipelines import StableDiffusionXLImg2ImgPipeline, StableDiffusionXLPipeline
199
-
200
- try:
201
- if not (is_torch_available() and is_transformers_available() and is_k_diffusion_available()):
202
- raise OptionalDependencyNotAvailable()
203
- except OptionalDependencyNotAvailable:
204
- from .utils.dummy_torch_and_transformers_and_k_diffusion_objects import * # noqa F403
205
- else:
206
- from .pipelines import StableDiffusionKDiffusionPipeline
207
-
208
- try:
209
- if not (is_torch_available() and is_transformers_available() and is_onnx_available()):
210
- raise OptionalDependencyNotAvailable()
211
- except OptionalDependencyNotAvailable:
212
- from .utils.dummy_torch_and_transformers_and_onnx_objects import * # noqa F403
213
- else:
214
- from .pipelines import (
215
- OnnxStableDiffusionImg2ImgPipeline,
216
- OnnxStableDiffusionInpaintPipeline,
217
- OnnxStableDiffusionInpaintPipelineLegacy,
218
- OnnxStableDiffusionPipeline,
219
- OnnxStableDiffusionUpscalePipeline,
220
- StableDiffusionOnnxPipeline,
221
- )
222
-
223
- try:
224
- if not (is_torch_available() and is_librosa_available()):
225
- raise OptionalDependencyNotAvailable()
226
- except OptionalDependencyNotAvailable:
227
- from .utils.dummy_torch_and_librosa_objects import * # noqa F403
228
- else:
229
- from .pipelines import AudioDiffusionPipeline, Mel
230
-
231
- try:
232
- if not (is_transformers_available() and is_torch_available() and is_note_seq_available()):
233
- raise OptionalDependencyNotAvailable()
234
- except OptionalDependencyNotAvailable:
235
- from .utils.dummy_transformers_and_torch_and_note_seq_objects import * # noqa F403
236
- else:
237
- from .pipelines import SpectrogramDiffusionPipeline
238
-
239
- try:
240
- if not is_flax_available():
241
- raise OptionalDependencyNotAvailable()
242
- except OptionalDependencyNotAvailable:
243
- from .utils.dummy_flax_objects import * # noqa F403
244
- else:
245
- from .models.controlnet_flax import FlaxControlNetModel
246
- from .models.modeling_flax_utils import FlaxModelMixin
247
- from .models.unet_2d_condition_flax import FlaxUNet2DConditionModel
248
- from .models.vae_flax import FlaxAutoencoderKL
249
- from .pipelines import FlaxDiffusionPipeline
250
- from .schedulers import (
251
- FlaxDDIMScheduler,
252
- FlaxDDPMScheduler,
253
- FlaxDPMSolverMultistepScheduler,
254
- FlaxKarrasVeScheduler,
255
- FlaxLMSDiscreteScheduler,
256
- FlaxPNDMScheduler,
257
- FlaxSchedulerMixin,
258
- FlaxScoreSdeVeScheduler,
259
- )
260
-
261
-
262
- try:
263
- if not (is_flax_available() and is_transformers_available()):
264
- raise OptionalDependencyNotAvailable()
265
- except OptionalDependencyNotAvailable:
266
- from .utils.dummy_flax_and_transformers_objects import * # noqa F403
267
- else:
268
- from .pipelines import (
269
- FlaxStableDiffusionControlNetPipeline,
270
- FlaxStableDiffusionImg2ImgPipeline,
271
- FlaxStableDiffusionInpaintPipeline,
272
- FlaxStableDiffusionPipeline,
273
- )
274
-
275
- try:
276
- if not (is_note_seq_available()):
277
- raise OptionalDependencyNotAvailable()
278
- except OptionalDependencyNotAvailable:
279
- from .utils.dummy_note_seq_objects import * # noqa F403
280
- else:
281
- from .pipelines import MidiProcessor
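
Every import in the deleted `__init__.py` above follows the same guard: probe for an optional backend, fall back to dummy objects if it is missing, and only pull in the real classes otherwise. A self-contained sketch of that pattern against the public `diffusers` package (the scipy-gated `LMSDiscreteScheduler` is just one example of a guarded import; the availability helper here is a stand-in, not the library's own):

```python
import importlib.util

class OptionalDependencyNotAvailable(ImportError):
    """Raised when an optional backend needed by an import is missing."""

def is_scipy_available() -> bool:
    # stand-in for the library's own availability check
    return importlib.util.find_spec("scipy") is not None

try:
    if not is_scipy_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    LMSDiscreteScheduler = None  # the real package imports a dummy object that raises on use
else:
    from diffusers import LMSDiscreteScheduler  # real import only when scipy is installed
```
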
4DoF/diffusers/commands/__init__.py DELETED
@@ -1,27 +0,0 @@
1
- # Copyright 2023 The HuggingFace Team. All rights reserved.
2
- #
3
- # Licensed under the Apache License, Version 2.0 (the "License");
4
- # you may not use this file except in compliance with the License.
5
- # You may obtain a copy of the License at
6
- #
7
- # http://www.apache.org/licenses/LICENSE-2.0
8
- #
9
- # Unless required by applicable law or agreed to in writing, software
10
- # distributed under the License is distributed on an "AS IS" BASIS,
11
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- # See the License for the specific language governing permissions and
13
- # limitations under the License.
14
-
15
- from abc import ABC, abstractmethod
16
- from argparse import ArgumentParser
17
-
18
-
19
- class BaseDiffusersCLICommand(ABC):
20
- @staticmethod
21
- @abstractmethod
22
- def register_subcommand(parser: ArgumentParser):
23
- raise NotImplementedError()
24
-
25
- @abstractmethod
26
- def run(self):
27
- raise NotImplementedError()
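
To show how this base class is meant to be extended, here is a hypothetical command (not part of the library) following the same `register_subcommand` / `run` contract used by `EnvironmentCommand` further below; `parser` is the sub-parsers object created in `diffusers_cli.main()`:

```python
from argparse import ArgumentParser

from diffusers.commands import BaseDiffusersCLICommand

class HelloCommand(BaseDiffusersCLICommand):  # hypothetical example command
    @staticmethod
    def register_subcommand(parser: ArgumentParser):
        hello_parser = parser.add_parser("hello")
        hello_parser.add_argument("--name", default="world")
        # main() later calls args.func(args) and then .run() on the result
        hello_parser.set_defaults(func=lambda args: HelloCommand(args.name))

    def __init__(self, name: str):
        self.name = name

    def run(self):
        print(f"Hello, {self.name}!")
```
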
4DoF/diffusers/commands/diffusers_cli.py DELETED
@@ -1,41 +0,0 @@
1
- #!/usr/bin/env python
2
- # Copyright 2023 The HuggingFace Team. All rights reserved.
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
15
-
16
- from argparse import ArgumentParser
17
-
18
- from .env import EnvironmentCommand
19
-
20
-
21
- def main():
22
- parser = ArgumentParser("Diffusers CLI tool", usage="diffusers-cli <command> [<args>]")
23
- commands_parser = parser.add_subparsers(help="diffusers-cli command helpers")
24
-
25
- # Register commands
26
- EnvironmentCommand.register_subcommand(commands_parser)
27
-
28
- # Let's go
29
- args = parser.parse_args()
30
-
31
- if not hasattr(args, "func"):
32
- parser.print_help()
33
- exit(1)
34
-
35
- # Run
36
- service = args.func(args)
37
- service.run()
38
-
39
-
40
- if __name__ == "__main__":
41
- main()
4DoF/diffusers/commands/env.py DELETED
@@ -1,84 +0,0 @@
1
- # Copyright 2023 The HuggingFace Team. All rights reserved.
2
- #
3
- # Licensed under the Apache License, Version 2.0 (the "License");
4
- # you may not use this file except in compliance with the License.
5
- # You may obtain a copy of the License at
6
- #
7
- # http://www.apache.org/licenses/LICENSE-2.0
8
- #
9
- # Unless required by applicable law or agreed to in writing, software
10
- # distributed under the License is distributed on an "AS IS" BASIS,
11
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- # See the License for the specific language governing permissions and
13
- # limitations under the License.
14
-
15
- import platform
16
- from argparse import ArgumentParser
17
-
18
- import huggingface_hub
19
-
20
- from .. import __version__ as version
21
- from ..utils import is_accelerate_available, is_torch_available, is_transformers_available, is_xformers_available
22
- from . import BaseDiffusersCLICommand
23
-
24
-
25
- def info_command_factory(_):
26
- return EnvironmentCommand()
27
-
28
-
29
- class EnvironmentCommand(BaseDiffusersCLICommand):
30
- @staticmethod
31
- def register_subcommand(parser: ArgumentParser):
32
- download_parser = parser.add_parser("env")
33
- download_parser.set_defaults(func=info_command_factory)
34
-
35
- def run(self):
36
- hub_version = huggingface_hub.__version__
37
-
38
- pt_version = "not installed"
39
- pt_cuda_available = "NA"
40
- if is_torch_available():
41
- import torch
42
-
43
- pt_version = torch.__version__
44
- pt_cuda_available = torch.cuda.is_available()
45
-
46
- transformers_version = "not installed"
47
- if is_transformers_available():
48
- import transformers
49
-
50
- transformers_version = transformers.__version__
51
-
52
- accelerate_version = "not installed"
53
- if is_accelerate_available():
54
- import accelerate
55
-
56
- accelerate_version = accelerate.__version__
57
-
58
- xformers_version = "not installed"
59
- if is_xformers_available():
60
- import xformers
61
-
62
- xformers_version = xformers.__version__
63
-
64
- info = {
65
- "`diffusers` version": version,
66
- "Platform": platform.platform(),
67
- "Python version": platform.python_version(),
68
- "PyTorch version (GPU?)": f"{pt_version} ({pt_cuda_available})",
69
- "Huggingface_hub version": hub_version,
70
- "Transformers version": transformers_version,
71
- "Accelerate version": accelerate_version,
72
- "xFormers version": xformers_version,
73
- "Using GPU in script?": "<fill in>",
74
- "Using distributed or parallel set-up in script?": "<fill in>",
75
- }
76
-
77
- print("\nCopy-and-paste the text below in your GitHub issue and FILL OUT the two last points.\n")
78
- print(self.format_dict(info))
79
-
80
- return info
81
-
82
- @staticmethod
83
- def format_dict(d):
84
- return "\n".join([f"- {prop}: {val}" for prop, val in d.items()]) + "\n"
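
As a usage note, this command is what backs `diffusers-cli env`; the same report can also be produced programmatically (a sketch, assuming the standard `diffusers` package layout rather than this vendored copy):

```python
from diffusers.commands.env import EnvironmentCommand

info = EnvironmentCommand().run()  # prints the formatted report and returns it as a dict
print(sorted(info))                # report keys such as "Platform", "Python version", ...
```
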
4DoF/diffusers/configuration_utils.py DELETED
@@ -1,664 +0,0 @@
1
- # coding=utf-8
2
- # Copyright 2023 The HuggingFace Inc. team.
3
- # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
4
- #
5
- # Licensed under the Apache License, Version 2.0 (the "License");
6
- # you may not use this file except in compliance with the License.
7
- # You may obtain a copy of the License at
8
- #
9
- # http://www.apache.org/licenses/LICENSE-2.0
10
- #
11
- # Unless required by applicable law or agreed to in writing, software
12
- # distributed under the License is distributed on an "AS IS" BASIS,
13
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
- # See the License for the specific language governing permissions and
15
- # limitations under the License.
16
- """ ConfigMixin base class and utilities."""
17
- import dataclasses
18
- import functools
19
- import importlib
20
- import inspect
21
- import json
22
- import os
23
- import re
24
- from collections import OrderedDict
25
- from pathlib import PosixPath
26
- from typing import Any, Dict, Tuple, Union
27
-
28
- import numpy as np
29
- from huggingface_hub import hf_hub_download
30
- from huggingface_hub.utils import EntryNotFoundError, RepositoryNotFoundError, RevisionNotFoundError
31
- from requests import HTTPError
32
-
33
- from . import __version__
34
- from .utils import (
35
- DIFFUSERS_CACHE,
36
- HUGGINGFACE_CO_RESOLVE_ENDPOINT,
37
- DummyObject,
38
- deprecate,
39
- extract_commit_hash,
40
- http_user_agent,
41
- logging,
42
- )
43
-
44
-
45
- logger = logging.get_logger(__name__)
46
-
47
- _re_configuration_file = re.compile(r"config\.(.*)\.json")
48
-
49
-
50
- class FrozenDict(OrderedDict):
51
- def __init__(self, *args, **kwargs):
52
- super().__init__(*args, **kwargs)
53
-
54
- for key, value in self.items():
55
- setattr(self, key, value)
56
-
57
- self.__frozen = True
58
-
59
- def __delitem__(self, *args, **kwargs):
60
- raise Exception(f"You cannot use ``__delitem__`` on a {self.__class__.__name__} instance.")
61
-
62
- def setdefault(self, *args, **kwargs):
63
- raise Exception(f"You cannot use ``setdefault`` on a {self.__class__.__name__} instance.")
64
-
65
- def pop(self, *args, **kwargs):
66
- raise Exception(f"You cannot use ``pop`` on a {self.__class__.__name__} instance.")
67
-
68
- def update(self, *args, **kwargs):
69
- raise Exception(f"You cannot use ``update`` on a {self.__class__.__name__} instance.")
70
-
71
- def __setattr__(self, name, value):
72
- if hasattr(self, "__frozen") and self.__frozen:
73
- raise Exception(f"You cannot use ``__setattr__`` on a {self.__class__.__name__} instance.")
74
- super().__setattr__(name, value)
75
-
76
- def __setitem__(self, name, value):
77
- if hasattr(self, "__frozen") and self.__frozen:
78
- raise Exception(f"You cannot use ``__setattr__`` on a {self.__class__.__name__} instance.")
79
- super().__setitem__(name, value)
80
-
81
-
82
- class ConfigMixin:
83
- r"""
84
- Base class for all configuration classes. All configuration parameters are stored under `self.config`. Also
85
- provides the [`~ConfigMixin.from_config`] and [`~ConfigMixin.save_config`] methods for loading, downloading, and
86
- saving classes that inherit from [`ConfigMixin`].
87
-
88
- Class attributes:
89
- - **config_name** (`str`) -- A filename under which the config should stored when calling
90
- [`~ConfigMixin.save_config`] (should be overridden by parent class).
91
- - **ignore_for_config** (`List[str]`) -- A list of attributes that should not be saved in the config (should be
92
- overridden by subclass).
93
- - **has_compatibles** (`bool`) -- Whether the class has compatible classes (should be overridden by subclass).
94
- - **_deprecated_kwargs** (`List[str]`) -- Keyword arguments that are deprecated. Note that the `init` function
95
- should only have a `kwargs` argument if at least one argument is deprecated (should be overridden by
96
- subclass).
97
- """
98
- config_name = None
99
- ignore_for_config = []
100
- has_compatibles = False
101
-
102
- _deprecated_kwargs = []
103
-
104
- def register_to_config(self, **kwargs):
105
- if self.config_name is None:
106
- raise NotImplementedError(f"Make sure that {self.__class__} has defined a class name `config_name`")
107
- # Special case for `kwargs` used in deprecation warning added to schedulers
108
- # TODO: remove this when we remove the deprecation warning, and the `kwargs` argument,
109
- # or solve in a more general way.
110
- kwargs.pop("kwargs", None)
111
-
112
- if not hasattr(self, "_internal_dict"):
113
- internal_dict = kwargs
114
- else:
115
- previous_dict = dict(self._internal_dict)
116
- internal_dict = {**self._internal_dict, **kwargs}
117
- logger.debug(f"Updating config from {previous_dict} to {internal_dict}")
118
-
119
- self._internal_dict = FrozenDict(internal_dict)
120
-
121
- def __getattr__(self, name: str) -> Any:
122
- """The only reason we overwrite `getattr` here is to gracefully deprecate accessing
123
- config attributes directly. See https://github.com/huggingface/diffusers/pull/3129
124
-
125
-        This function is mostly copied from PyTorch's __getattr__ overwrite:
126
- https://pytorch.org/docs/stable/_modules/torch/nn/modules/module.html#Module
127
- """
128
-
129
- is_in_config = "_internal_dict" in self.__dict__ and hasattr(self.__dict__["_internal_dict"], name)
130
- is_attribute = name in self.__dict__
131
-
132
- if is_in_config and not is_attribute:
133
- deprecation_message = f"Accessing config attribute `{name}` directly via '{type(self).__name__}' object attribute is deprecated. Please access '{name}' over '{type(self).__name__}'s config object instead, e.g. 'scheduler.config.{name}'."
134
- deprecate("direct config name access", "1.0.0", deprecation_message, standard_warn=False)
135
- return self._internal_dict[name]
136
-
137
- raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'")
138
-
139
- def save_config(self, save_directory: Union[str, os.PathLike], push_to_hub: bool = False, **kwargs):
140
- """
141
- Save a configuration object to the directory specified in `save_directory` so that it can be reloaded using the
142
- [`~ConfigMixin.from_config`] class method.
143
-
144
- Args:
145
- save_directory (`str` or `os.PathLike`):
146
- Directory where the configuration JSON file is saved (will be created if it does not exist).
147
- """
148
- if os.path.isfile(save_directory):
149
- raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file")
150
-
151
- os.makedirs(save_directory, exist_ok=True)
152
-
153
- # If we save using the predefined names, we can load using `from_config`
154
- output_config_file = os.path.join(save_directory, self.config_name)
155
-
156
- self.to_json_file(output_config_file)
157
- logger.info(f"Configuration saved in {output_config_file}")
158
-
159
- @classmethod
160
- def from_config(cls, config: Union[FrozenDict, Dict[str, Any]] = None, return_unused_kwargs=False, **kwargs):
161
- r"""
162
- Instantiate a Python class from a config dictionary.
163
-
164
- Parameters:
165
- config (`Dict[str, Any]`):
166
- A config dictionary from which the Python class is instantiated. Make sure to only load configuration
167
- files of compatible classes.
168
- return_unused_kwargs (`bool`, *optional*, defaults to `False`):
169
- Whether kwargs that are not consumed by the Python class should be returned or not.
170
- kwargs (remaining dictionary of keyword arguments, *optional*):
171
- Can be used to update the configuration object (after it is loaded) and initiate the Python class.
172
- `**kwargs` are passed directly to the underlying scheduler/model's `__init__` method and eventually
173
- overwrite the same named arguments in `config`.
174
-
175
- Returns:
176
- [`ModelMixin`] or [`SchedulerMixin`]:
177
- A model or scheduler object instantiated from a config dictionary.
178
-
179
- Examples:
180
-
181
- ```python
182
- >>> from diffusers import DDPMScheduler, DDIMScheduler, PNDMScheduler
183
-
184
- >>> # Download scheduler from huggingface.co and cache.
185
- >>> scheduler = DDPMScheduler.from_pretrained("google/ddpm-cifar10-32")
186
-
187
- >>> # Instantiate DDIM scheduler class with same config as DDPM
188
- >>> scheduler = DDIMScheduler.from_config(scheduler.config)
189
-
190
- >>> # Instantiate PNDM scheduler class with same config as DDPM
191
- >>> scheduler = PNDMScheduler.from_config(scheduler.config)
192
- ```
193
- """
194
- # <===== TO BE REMOVED WITH DEPRECATION
195
- # TODO(Patrick) - make sure to remove the following lines when config=="model_path" is deprecated
196
- if "pretrained_model_name_or_path" in kwargs:
197
- config = kwargs.pop("pretrained_model_name_or_path")
198
-
199
- if config is None:
200
- raise ValueError("Please make sure to provide a config as the first positional argument.")
201
- # ======>
202
-
203
- if not isinstance(config, dict):
204
- deprecation_message = "It is deprecated to pass a pretrained model name or path to `from_config`."
205
- if "Scheduler" in cls.__name__:
206
- deprecation_message += (
207
- f"If you were trying to load a scheduler, please use {cls}.from_pretrained(...) instead."
208
- " Otherwise, please make sure to pass a configuration dictionary instead. This functionality will"
209
- " be removed in v1.0.0."
210
- )
211
- elif "Model" in cls.__name__:
212
- deprecation_message += (
213
- f"If you were trying to load a model, please use {cls}.load_config(...) followed by"
214
- f" {cls}.from_config(...) instead. Otherwise, please make sure to pass a configuration dictionary"
215
- " instead. This functionality will be removed in v1.0.0."
216
- )
217
- deprecate("config-passed-as-path", "1.0.0", deprecation_message, standard_warn=False)
218
- config, kwargs = cls.load_config(pretrained_model_name_or_path=config, return_unused_kwargs=True, **kwargs)
219
-
220
- init_dict, unused_kwargs, hidden_dict = cls.extract_init_dict(config, **kwargs)
221
-
222
- # Allow dtype to be specified on initialization
223
- if "dtype" in unused_kwargs:
224
- init_dict["dtype"] = unused_kwargs.pop("dtype")
225
-
226
- # add possible deprecated kwargs
227
- for deprecated_kwarg in cls._deprecated_kwargs:
228
- if deprecated_kwarg in unused_kwargs:
229
- init_dict[deprecated_kwarg] = unused_kwargs.pop(deprecated_kwarg)
230
-
231
- # Return model and optionally state and/or unused_kwargs
232
- model = cls(**init_dict)
233
-
234
- # make sure to also save config parameters that might be used for compatible classes
235
- model.register_to_config(**hidden_dict)
236
-
237
- # add hidden kwargs of compatible classes to unused_kwargs
238
- unused_kwargs = {**unused_kwargs, **hidden_dict}
239
-
240
- if return_unused_kwargs:
241
- return (model, unused_kwargs)
242
- else:
243
- return model
244
-
245
- @classmethod
246
- def get_config_dict(cls, *args, **kwargs):
247
- deprecation_message = (
248
- f" The function get_config_dict is deprecated. Please use {cls}.load_config instead. This function will be"
249
- " removed in version v1.0.0"
250
- )
251
- deprecate("get_config_dict", "1.0.0", deprecation_message, standard_warn=False)
252
- return cls.load_config(*args, **kwargs)
253
-
254
- @classmethod
255
- def load_config(
256
- cls,
257
- pretrained_model_name_or_path: Union[str, os.PathLike],
258
- return_unused_kwargs=False,
259
- return_commit_hash=False,
260
- **kwargs,
261
- ) -> Tuple[Dict[str, Any], Dict[str, Any]]:
262
- r"""
263
- Load a model or scheduler configuration.
264
-
265
- Parameters:
266
- pretrained_model_name_or_path (`str` or `os.PathLike`, *optional*):
267
- Can be either:
268
-
269
- - A string, the *model id* (for example `google/ddpm-celebahq-256`) of a pretrained model hosted on
270
- the Hub.
271
- - A path to a *directory* (for example `./my_model_directory`) containing model weights saved with
272
- [`~ConfigMixin.save_config`].
273
-
274
- cache_dir (`Union[str, os.PathLike]`, *optional*):
275
- Path to a directory where a downloaded pretrained model configuration is cached if the standard cache
276
- is not used.
277
- force_download (`bool`, *optional*, defaults to `False`):
278
- Whether or not to force the (re-)download of the model weights and configuration files, overriding the
279
- cached versions if they exist.
280
- resume_download (`bool`, *optional*, defaults to `False`):
281
- Whether or not to resume downloading the model weights and configuration files. If set to `False`, any
282
- incompletely downloaded files are deleted.
283
- proxies (`Dict[str, str]`, *optional*):
284
- A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
285
- 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
286
- output_loading_info(`bool`, *optional*, defaults to `False`):
287
- Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages.
288
- local_files_only (`bool`, *optional*, defaults to `False`):
289
- Whether to only load local model weights and configuration files or not. If set to `True`, the model
290
- won't be downloaded from the Hub.
291
- use_auth_token (`str` or *bool*, *optional*):
292
- The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from
293
- `diffusers-cli login` (stored in `~/.huggingface`) is used.
294
- revision (`str`, *optional*, defaults to `"main"`):
295
- The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier
296
- allowed by Git.
297
- subfolder (`str`, *optional*, defaults to `""`):
298
- The subfolder location of a model file within a larger model repository on the Hub or locally.
299
-            return_unused_kwargs (`bool`, *optional*, defaults to `False`):
300
- Whether unused keyword arguments of the config are returned.
301
-            return_commit_hash (`bool`, *optional*, defaults to `False`):
302
-                Whether the `commit_hash` of the loaded configuration is returned.
303
-
304
- Returns:
305
- `dict`:
306
- A dictionary of all the parameters stored in a JSON configuration file.
307
-
308
- """
309
- cache_dir = kwargs.pop("cache_dir", DIFFUSERS_CACHE)
310
- force_download = kwargs.pop("force_download", False)
311
- resume_download = kwargs.pop("resume_download", False)
312
- proxies = kwargs.pop("proxies", None)
313
- use_auth_token = kwargs.pop("use_auth_token", None)
314
- local_files_only = kwargs.pop("local_files_only", False)
315
- revision = kwargs.pop("revision", None)
316
- _ = kwargs.pop("mirror", None)
317
- subfolder = kwargs.pop("subfolder", None)
318
- user_agent = kwargs.pop("user_agent", {})
319
-
320
- user_agent = {**user_agent, "file_type": "config"}
321
- user_agent = http_user_agent(user_agent)
322
-
323
- pretrained_model_name_or_path = str(pretrained_model_name_or_path)
324
-
325
- if cls.config_name is None:
326
- raise ValueError(
327
- "`self.config_name` is not defined. Note that one should not load a config from "
328
- "`ConfigMixin`. Please make sure to define `config_name` in a class inheriting from `ConfigMixin`"
329
- )
330
-
331
- if os.path.isfile(pretrained_model_name_or_path):
332
- config_file = pretrained_model_name_or_path
333
- elif os.path.isdir(pretrained_model_name_or_path):
334
- if os.path.isfile(os.path.join(pretrained_model_name_or_path, cls.config_name)):
335
- # Load from a PyTorch checkpoint
336
- config_file = os.path.join(pretrained_model_name_or_path, cls.config_name)
337
- elif subfolder is not None and os.path.isfile(
338
- os.path.join(pretrained_model_name_or_path, subfolder, cls.config_name)
339
- ):
340
- config_file = os.path.join(pretrained_model_name_or_path, subfolder, cls.config_name)
341
- else:
342
- raise EnvironmentError(
343
- f"Error no file named {cls.config_name} found in directory {pretrained_model_name_or_path}."
344
- )
345
- else:
346
- try:
347
- # Load from URL or cache if already cached
348
- config_file = hf_hub_download(
349
- pretrained_model_name_or_path,
350
- filename=cls.config_name,
351
- cache_dir=cache_dir,
352
- force_download=force_download,
353
- proxies=proxies,
354
- resume_download=resume_download,
355
- local_files_only=local_files_only,
356
- use_auth_token=use_auth_token,
357
- user_agent=user_agent,
358
- subfolder=subfolder,
359
- revision=revision,
360
- )
361
- except RepositoryNotFoundError:
362
- raise EnvironmentError(
363
- f"{pretrained_model_name_or_path} is not a local folder and is not a valid model identifier"
364
- " listed on 'https://huggingface.co/models'\nIf this is a private repository, make sure to pass a"
365
- " token having permission to this repo with `use_auth_token` or log in with `huggingface-cli"
366
- " login`."
367
- )
368
- except RevisionNotFoundError:
369
- raise EnvironmentError(
370
- f"{revision} is not a valid git identifier (branch name, tag name or commit id) that exists for"
371
- " this model name. Check the model page at"
372
- f" 'https://huggingface.co/{pretrained_model_name_or_path}' for available revisions."
373
- )
374
- except EntryNotFoundError:
375
- raise EnvironmentError(
376
- f"{pretrained_model_name_or_path} does not appear to have a file named {cls.config_name}."
377
- )
378
- except HTTPError as err:
379
- raise EnvironmentError(
380
- "There was a specific connection error when trying to load"
381
- f" {pretrained_model_name_or_path}:\n{err}"
382
- )
383
- except ValueError:
384
- raise EnvironmentError(
385
- f"We couldn't connect to '{HUGGINGFACE_CO_RESOLVE_ENDPOINT}' to load this model, couldn't find it"
386
- f" in the cached files and it looks like {pretrained_model_name_or_path} is not the path to a"
387
- f" directory containing a {cls.config_name} file.\nCheckout your internet connection or see how to"
388
- " run the library in offline mode at"
389
- " 'https://huggingface.co/docs/diffusers/installation#offline-mode'."
390
- )
391
- except EnvironmentError:
392
- raise EnvironmentError(
393
- f"Can't load config for '{pretrained_model_name_or_path}'. If you were trying to load it from "
394
- "'https://huggingface.co/models', make sure you don't have a local directory with the same name. "
395
- f"Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a directory "
396
- f"containing a {cls.config_name} file"
397
- )
398
-
399
- try:
400
- # Load config dict
401
- config_dict = cls._dict_from_json_file(config_file)
402
-
403
- commit_hash = extract_commit_hash(config_file)
404
- except (json.JSONDecodeError, UnicodeDecodeError):
405
- raise EnvironmentError(f"It looks like the config file at '{config_file}' is not a valid JSON file.")
406
-
407
- if not (return_unused_kwargs or return_commit_hash):
408
- return config_dict
409
-
410
- outputs = (config_dict,)
411
-
412
- if return_unused_kwargs:
413
- outputs += (kwargs,)
414
-
415
- if return_commit_hash:
416
- outputs += (commit_hash,)
417
-
418
- return outputs
419
-
420
- @staticmethod
421
- def _get_init_keys(cls):
422
- return set(dict(inspect.signature(cls.__init__).parameters).keys())
423
-
424
- @classmethod
425
- def extract_init_dict(cls, config_dict, **kwargs):
426
- # Skip keys that were not present in the original config, so default __init__ values were used
427
- used_defaults = config_dict.get("_use_default_values", [])
428
- config_dict = {k: v for k, v in config_dict.items() if k not in used_defaults and k != "_use_default_values"}
429
-
430
- # 0. Copy origin config dict
431
- original_dict = dict(config_dict.items())
432
-
433
- # 1. Retrieve expected config attributes from __init__ signature
434
- expected_keys = cls._get_init_keys(cls)
435
- expected_keys.remove("self")
436
- # remove general kwargs if present in dict
437
- if "kwargs" in expected_keys:
438
- expected_keys.remove("kwargs")
439
- # remove flax internal keys
440
- if hasattr(cls, "_flax_internal_args"):
441
- for arg in cls._flax_internal_args:
442
- expected_keys.remove(arg)
443
-
444
- # 2. Remove attributes that cannot be expected from expected config attributes
445
- # remove keys to be ignored
446
- if len(cls.ignore_for_config) > 0:
447
- expected_keys = expected_keys - set(cls.ignore_for_config)
448
-
449
- # load diffusers library to import compatible and original scheduler
450
- diffusers_library = importlib.import_module(__name__.split(".")[0])
451
-
452
- if cls.has_compatibles:
453
- compatible_classes = [c for c in cls._get_compatibles() if not isinstance(c, DummyObject)]
454
- else:
455
- compatible_classes = []
456
-
457
- expected_keys_comp_cls = set()
458
- for c in compatible_classes:
459
- expected_keys_c = cls._get_init_keys(c)
460
- expected_keys_comp_cls = expected_keys_comp_cls.union(expected_keys_c)
461
- expected_keys_comp_cls = expected_keys_comp_cls - cls._get_init_keys(cls)
462
- config_dict = {k: v for k, v in config_dict.items() if k not in expected_keys_comp_cls}
463
-
464
- # remove attributes from orig class that cannot be expected
465
- orig_cls_name = config_dict.pop("_class_name", cls.__name__)
466
- if orig_cls_name != cls.__name__ and hasattr(diffusers_library, orig_cls_name):
467
- orig_cls = getattr(diffusers_library, orig_cls_name)
468
- unexpected_keys_from_orig = cls._get_init_keys(orig_cls) - expected_keys
469
- config_dict = {k: v for k, v in config_dict.items() if k not in unexpected_keys_from_orig}
470
-
471
- # remove private attributes
472
- config_dict = {k: v for k, v in config_dict.items() if not k.startswith("_")}
473
-
474
- # 3. Create keyword arguments that will be passed to __init__ from expected keyword arguments
475
- init_dict = {}
476
- for key in expected_keys:
477
- # if config param is passed to kwarg and is present in config dict
478
- # it should overwrite existing config dict key
479
- if key in kwargs and key in config_dict:
480
- config_dict[key] = kwargs.pop(key)
481
-
482
- if key in kwargs:
483
- # overwrite key
484
- init_dict[key] = kwargs.pop(key)
485
- elif key in config_dict:
486
- # use value from config dict
487
- init_dict[key] = config_dict.pop(key)
488
-
489
- # 4. Give nice warning if unexpected values have been passed
490
- if len(config_dict) > 0:
491
- logger.warning(
492
- f"The config attributes {config_dict} were passed to {cls.__name__}, "
493
- "but are not expected and will be ignored. Please verify your "
494
- f"{cls.config_name} configuration file."
495
- )
496
-
497
- # 5. Give nice info if config attributes are initiliazed to default because they have not been passed
498
- passed_keys = set(init_dict.keys())
499
- if len(expected_keys - passed_keys) > 0:
500
- logger.info(
501
- f"{expected_keys - passed_keys} was not found in config. Values will be initialized to default values."
502
- )
503
-
504
- # 6. Define unused keyword arguments
505
- unused_kwargs = {**config_dict, **kwargs}
506
-
507
- # 7. Define "hidden" config parameters that were saved for compatible classes
508
- hidden_config_dict = {k: v for k, v in original_dict.items() if k not in init_dict}
509
-
510
- return init_dict, unused_kwargs, hidden_config_dict
511
-
512
- @classmethod
513
- def _dict_from_json_file(cls, json_file: Union[str, os.PathLike]):
514
- with open(json_file, "r", encoding="utf-8") as reader:
515
- text = reader.read()
516
- return json.loads(text)
517
-
518
- def __repr__(self):
519
- return f"{self.__class__.__name__} {self.to_json_string()}"
520
-
521
- @property
522
- def config(self) -> Dict[str, Any]:
523
- """
524
- Returns the config of the class as a frozen dictionary
525
-
526
- Returns:
527
- `Dict[str, Any]`: Config of the class.
528
- """
529
- return self._internal_dict
530
-
531
- def to_json_string(self) -> str:
532
- """
533
- Serializes the configuration instance to a JSON string.
534
-
535
- Returns:
536
- `str`:
537
- String containing all the attributes that make up the configuration instance in JSON format.
538
- """
539
- config_dict = self._internal_dict if hasattr(self, "_internal_dict") else {}
540
- config_dict["_class_name"] = self.__class__.__name__
541
- config_dict["_diffusers_version"] = __version__
542
-
543
- def to_json_saveable(value):
544
- if isinstance(value, np.ndarray):
545
- value = value.tolist()
546
- elif isinstance(value, PosixPath):
547
- value = str(value)
548
- return value
549
-
550
- config_dict = {k: to_json_saveable(v) for k, v in config_dict.items()}
551
- # Don't save "_ignore_files" or "_use_default_values"
552
- config_dict.pop("_ignore_files", None)
553
- config_dict.pop("_use_default_values", None)
554
-
555
- return json.dumps(config_dict, indent=2, sort_keys=True) + "\n"
556
-
557
- def to_json_file(self, json_file_path: Union[str, os.PathLike]):
558
- """
559
- Save the configuration instance's parameters to a JSON file.
560
-
561
- Args:
562
- json_file_path (`str` or `os.PathLike`):
563
- Path to the JSON file to save a configuration instance's parameters.
564
- """
565
- with open(json_file_path, "w", encoding="utf-8") as writer:
566
- writer.write(self.to_json_string())
567
-
568
-
569
- def register_to_config(init):
570
- r"""
571
- Decorator to apply on the init of classes inheriting from [`ConfigMixin`] so that all the arguments are
572
- automatically sent to `self.register_for_config`. To ignore a specific argument accepted by the init but that
573
- shouldn't be registered in the config, use the `ignore_for_config` class variable
574
-
575
- Warning: Once decorated, all private arguments (beginning with an underscore) are trashed and not sent to the init!
576
- """
577
-
578
- @functools.wraps(init)
579
- def inner_init(self, *args, **kwargs):
580
- # Ignore private kwargs in the init.
581
- init_kwargs = {k: v for k, v in kwargs.items() if not k.startswith("_")}
582
- config_init_kwargs = {k: v for k, v in kwargs.items() if k.startswith("_")}
583
- if not isinstance(self, ConfigMixin):
584
- raise RuntimeError(
585
- f"`@register_for_config` was applied to {self.__class__.__name__} init method, but this class does "
586
- "not inherit from `ConfigMixin`."
587
- )
588
-
589
- ignore = getattr(self, "ignore_for_config", [])
590
- # Get positional arguments aligned with kwargs
591
- new_kwargs = {}
592
- signature = inspect.signature(init)
593
- parameters = {
594
- name: p.default for i, (name, p) in enumerate(signature.parameters.items()) if i > 0 and name not in ignore
595
- }
596
- for arg, name in zip(args, parameters.keys()):
597
- new_kwargs[name] = arg
598
-
599
- # Then add all kwargs
600
- new_kwargs.update(
601
- {
602
- k: init_kwargs.get(k, default)
603
- for k, default in parameters.items()
604
- if k not in ignore and k not in new_kwargs
605
- }
606
- )
607
-
608
- # Take note of the parameters that were not present in the loaded config
609
- if len(set(new_kwargs.keys()) - set(init_kwargs)) > 0:
610
- new_kwargs["_use_default_values"] = list(set(new_kwargs.keys()) - set(init_kwargs))
611
-
612
- new_kwargs = {**config_init_kwargs, **new_kwargs}
613
- getattr(self, "register_to_config")(**new_kwargs)
614
- init(self, *args, **init_kwargs)
615
-
616
- return inner_init
617
-
618
-
619
- def flax_register_to_config(cls):
620
- original_init = cls.__init__
621
-
622
- @functools.wraps(original_init)
623
- def init(self, *args, **kwargs):
624
- if not isinstance(self, ConfigMixin):
625
- raise RuntimeError(
626
- f"`@register_for_config` was applied to {self.__class__.__name__} init method, but this class does "
627
- "not inherit from `ConfigMixin`."
628
- )
629
-
630
- # Ignore private kwargs in the init. Retrieve all passed attributes
631
- init_kwargs = dict(kwargs.items())
632
-
633
- # Retrieve default values
634
- fields = dataclasses.fields(self)
635
- default_kwargs = {}
636
- for field in fields:
637
- # ignore flax specific attributes
638
- if field.name in self._flax_internal_args:
639
- continue
640
- if type(field.default) == dataclasses._MISSING_TYPE:
641
- default_kwargs[field.name] = None
642
- else:
643
- default_kwargs[field.name] = getattr(self, field.name)
644
-
645
- # Make sure init_kwargs override default kwargs
646
- new_kwargs = {**default_kwargs, **init_kwargs}
647
- # dtype should be part of `init_kwargs`, but not `new_kwargs`
648
- if "dtype" in new_kwargs:
649
- new_kwargs.pop("dtype")
650
-
651
- # Get positional arguments aligned with kwargs
652
- for i, arg in enumerate(args):
653
- name = fields[i].name
654
- new_kwargs[name] = arg
655
-
656
- # Take note of the parameters that were not present in the loaded config
657
- if len(set(new_kwargs.keys()) - set(init_kwargs)) > 0:
658
- new_kwargs["_use_default_values"] = list(set(new_kwargs.keys()) - set(init_kwargs))
659
-
660
- getattr(self, "register_to_config")(**new_kwargs)
661
- original_init(self, *args, **kwargs)
662
-
663
- cls.__init__ = init
664
- return cls
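
A minimal sketch of how `ConfigMixin` and the `@register_to_config` decorator above fit together; the `MyScheduler` class and its arguments are illustrative and not part of the library, and the import assumes the standard `diffusers` package path rather than this vendored copy:

```python
from diffusers.configuration_utils import ConfigMixin, register_to_config

class MyScheduler(ConfigMixin):            # illustrative class, not shipped with the library
    config_name = "scheduler_config.json"  # file written by save_config / read by load_config

    @register_to_config
    def __init__(self, num_train_timesteps: int = 1000, beta_start: float = 1e-4):
        self.num_train_timesteps = num_train_timesteps
        self.beta_start = beta_start

s = MyScheduler(num_train_timesteps=500)
print(s.config.num_train_timesteps)        # 500 -- captured automatically by the decorator
s.save_config("./my_scheduler")            # writes ./my_scheduler/scheduler_config.json
s2 = MyScheduler.from_config(MyScheduler.load_config("./my_scheduler"))
```
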
4DoF/diffusers/dependency_versions_check.py DELETED
@@ -1,47 +0,0 @@
1
- # Copyright 2023 The HuggingFace Team. All rights reserved.
2
- #
3
- # Licensed under the Apache License, Version 2.0 (the "License");
4
- # you may not use this file except in compliance with the License.
5
- # You may obtain a copy of the License at
6
- #
7
- # http://www.apache.org/licenses/LICENSE-2.0
8
- #
9
- # Unless required by applicable law or agreed to in writing, software
10
- # distributed under the License is distributed on an "AS IS" BASIS,
11
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- # See the License for the specific language governing permissions and
13
- # limitations under the License.
14
- import sys
15
-
16
- from .dependency_versions_table import deps
17
- from .utils.versions import require_version, require_version_core
18
-
19
-
20
- # define which module versions we always want to check at run time
21
- # (usually the ones defined in `install_requires` in setup.py)
22
- #
23
- # order specific notes:
24
- # - tqdm must be checked before tokenizers
25
-
26
- pkgs_to_check_at_runtime = "python tqdm regex requests packaging filelock numpy tokenizers".split()
27
- if sys.version_info < (3, 7):
28
- pkgs_to_check_at_runtime.append("dataclasses")
29
- if sys.version_info < (3, 8):
30
- pkgs_to_check_at_runtime.append("importlib_metadata")
31
-
32
- for pkg in pkgs_to_check_at_runtime:
33
- if pkg in deps:
34
- if pkg == "tokenizers":
35
- # must be loaded here, or else tqdm check may fail
36
- from .utils import is_tokenizers_available
37
-
38
- if not is_tokenizers_available():
39
- continue # not required, check version only if installed
40
-
41
- require_version_core(deps[pkg])
42
- else:
43
- raise ValueError(f"can't find {pkg} in {deps.keys()}, check dependency_versions_table.py")
44
-
45
-
46
- def dep_version_check(pkg, hint=None):
47
- require_version(deps[pkg], hint)
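
For reference, a sketch of how the helper above is typically called (again assuming the standard `diffusers` package path; the `"torch"` key comes from the dependency table that follows):

```python
from diffusers.dependency_versions_check import dep_version_check

# Raises if the installed torch does not satisfy the pinned requirement
# ("torch>=1.4" in the table below at the time of this copy).
dep_version_check("torch", hint="pip install -U torch")
```
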
4DoF/diffusers/dependency_versions_table.py DELETED
@@ -1,44 +0,0 @@
1
- # THIS FILE HAS BEEN AUTOGENERATED. To update:
2
- # 1. modify the `_deps` dict in setup.py
3
- # 2. run `make deps_table_update``
4
- deps = {
5
- "Pillow": "Pillow",
6
- "accelerate": "accelerate>=0.11.0",
7
- "compel": "compel==0.1.8",
8
- "black": "black~=23.1",
9
- "datasets": "datasets",
10
- "filelock": "filelock",
11
- "flax": "flax>=0.4.1",
12
- "hf-doc-builder": "hf-doc-builder>=0.3.0",
13
- "huggingface-hub": "huggingface-hub>=0.13.2",
14
- "requests-mock": "requests-mock==1.10.0",
15
- "importlib_metadata": "importlib_metadata",
16
- "invisible-watermark": "invisible-watermark",
17
- "isort": "isort>=5.5.4",
18
- "jax": "jax>=0.2.8,!=0.3.2",
19
- "jaxlib": "jaxlib>=0.1.65",
20
- "Jinja2": "Jinja2",
21
- "k-diffusion": "k-diffusion>=0.0.12",
22
- "torchsde": "torchsde",
23
- "note_seq": "note_seq",
24
- "librosa": "librosa",
25
- "numpy": "numpy",
26
- "omegaconf": "omegaconf",
27
- "parameterized": "parameterized",
28
- "protobuf": "protobuf>=3.20.3,<4",
29
- "pytest": "pytest",
30
- "pytest-timeout": "pytest-timeout",
31
- "pytest-xdist": "pytest-xdist",
32
- "ruff": "ruff>=0.0.241",
33
- "safetensors": "safetensors",
34
- "sentencepiece": "sentencepiece>=0.1.91,!=0.1.92",
35
- "scipy": "scipy",
36
- "onnx": "onnx",
37
- "regex": "regex!=2019.12.17",
38
- "requests": "requests",
39
- "tensorboard": "tensorboard",
40
- "torch": "torch>=1.4",
41
- "torchvision": "torchvision",
42
- "transformers": "transformers>=4.25.1",
43
- "urllib3": "urllib3<=2.0.0",
44
- }
4DoF/diffusers/experimental/__init__.py DELETED
@@ -1 +0,0 @@
1
- from .rl import ValueGuidedRLPipeline
4DoF/diffusers/experimental/rl/__init__.py DELETED
@@ -1 +0,0 @@
1
- from .value_guided_sampling import ValueGuidedRLPipeline
4DoF/diffusers/experimental/rl/value_guided_sampling.py DELETED
@@ -1,152 +0,0 @@
1
- # Copyright 2023 The HuggingFace Team. All rights reserved.
2
- #
3
- # Licensed under the Apache License, Version 2.0 (the "License");
4
- # you may not use this file except in compliance with the License.
5
- # You may obtain a copy of the License at
6
- #
7
- # http://www.apache.org/licenses/LICENSE-2.0
8
- #
9
- # Unless required by applicable law or agreed to in writing, software
10
- # distributed under the License is distributed on an "AS IS" BASIS,
11
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- # See the License for the specific language governing permissions and
13
- # limitations under the License.
14
-
15
- import numpy as np
16
- import torch
17
- import tqdm
18
-
19
- from ...models.unet_1d import UNet1DModel
20
- from ...pipelines import DiffusionPipeline
21
- from ...utils import randn_tensor
22
- from ...utils.dummy_pt_objects import DDPMScheduler
23
-
24
-
25
- class ValueGuidedRLPipeline(DiffusionPipeline):
26
- r"""
27
- This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
28
- library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
29
- Pipeline for sampling actions from a diffusion model trained to predict sequences of states.
30
-
31
- Original implementation inspired by this repository: https://github.com/jannerm/diffuser.
32
-
33
- Parameters:
34
- value_function ([`UNet1DModel`]): A specialized UNet for fine-tuning trajectories base on reward.
35
- unet ([`UNet1DModel`]): U-Net architecture to denoise the encoded trajectories.
36
- scheduler ([`SchedulerMixin`]):
37
- A scheduler to be used in combination with `unet` to denoise the encoded trajectories. Default for this
38
- application is [`DDPMScheduler`].
39
- env: An environment following the OpenAI gym API to act in. For now only Hopper has pretrained models.
40
- """
41
-
42
- def __init__(
43
- self,
44
- value_function: UNet1DModel,
45
- unet: UNet1DModel,
46
- scheduler: DDPMScheduler,
47
- env,
48
- ):
49
- super().__init__()
50
- self.value_function = value_function
51
- self.unet = unet
52
- self.scheduler = scheduler
53
- self.env = env
54
- self.data = env.get_dataset()
55
- self.means = {}
56
- for key in self.data.keys():
57
- try:
58
- self.means[key] = self.data[key].mean()
59
- except: # noqa: E722
60
- pass
61
- self.stds = {}
62
- for key in self.data.keys():
63
- try:
64
- self.stds[key] = self.data[key].std()
65
- except: # noqa: E722
66
- pass
67
- self.state_dim = env.observation_space.shape[0]
68
- self.action_dim = env.action_space.shape[0]
69
-
70
- def normalize(self, x_in, key):
71
- return (x_in - self.means[key]) / self.stds[key]
72
-
73
- def de_normalize(self, x_in, key):
74
- return x_in * self.stds[key] + self.means[key]
75
-
76
- def to_torch(self, x_in):
77
- if type(x_in) is dict:
78
- return {k: self.to_torch(v) for k, v in x_in.items()}
79
- elif torch.is_tensor(x_in):
80
- return x_in.to(self.unet.device)
81
- return torch.tensor(x_in, device=self.unet.device)
82
-
83
- def reset_x0(self, x_in, cond, act_dim):
84
- for key, val in cond.items():
85
- x_in[:, key, act_dim:] = val.clone()
86
- return x_in
87
-
88
- def run_diffusion(self, x, conditions, n_guide_steps, scale):
89
- batch_size = x.shape[0]
90
- y = None
91
- for i in tqdm.tqdm(self.scheduler.timesteps):
92
- # create batch of timesteps to pass into model
93
- timesteps = torch.full((batch_size,), i, device=self.unet.device, dtype=torch.long)
94
- for _ in range(n_guide_steps):
95
- with torch.enable_grad():
96
- x.requires_grad_()
97
-
98
- # permute to match dimension for pre-trained models
99
- y = self.value_function(x.permute(0, 2, 1), timesteps).sample
100
- grad = torch.autograd.grad([y.sum()], [x])[0]
101
-
102
- posterior_variance = self.scheduler._get_variance(i)
103
- model_std = torch.exp(0.5 * posterior_variance)
104
- grad = model_std * grad
105
-
106
- grad[timesteps < 2] = 0
107
- x = x.detach()
108
- x = x + scale * grad
109
- x = self.reset_x0(x, conditions, self.action_dim)
110
-
111
- prev_x = self.unet(x.permute(0, 2, 1), timesteps).sample.permute(0, 2, 1)
112
-
113
- # TODO: verify deprecation of this kwarg
114
- x = self.scheduler.step(prev_x, i, x, predict_epsilon=False)["prev_sample"]
115
-
116
- # apply conditions to the trajectory (set the initial state)
117
- x = self.reset_x0(x, conditions, self.action_dim)
118
- x = self.to_torch(x)
119
- return x, y
120
-
121
- def __call__(self, obs, batch_size=64, planning_horizon=32, n_guide_steps=2, scale=0.1):
122
- # normalize the observations and create batch dimension
123
- obs = self.normalize(obs, "observations")
124
- obs = obs[None].repeat(batch_size, axis=0)
125
-
126
- conditions = {0: self.to_torch(obs)}
127
- shape = (batch_size, planning_horizon, self.state_dim + self.action_dim)
128
-
129
- # generate initial noise and apply our conditions (to make the trajectories start at current state)
130
- x1 = randn_tensor(shape, device=self.unet.device)
131
- x = self.reset_x0(x1, conditions, self.action_dim)
132
- x = self.to_torch(x)
133
-
134
- # run the diffusion process
135
- x, y = self.run_diffusion(x, conditions, n_guide_steps, scale)
136
-
137
- # sort output trajectories by value
138
- sorted_idx = y.argsort(0, descending=True).squeeze()
139
- sorted_values = x[sorted_idx]
140
- actions = sorted_values[:, :, : self.action_dim]
141
- actions = actions.detach().cpu().numpy()
142
- denorm_actions = self.de_normalize(actions, key="actions")
143
-
144
- # select the action with the highest value
145
- if y is not None:
146
- selected_index = 0
147
- else:
148
- # if we didn't run value guiding, select a random action
149
- selected_index = np.random.randint(0, batch_size)
150
-
151
- denorm_actions = denorm_actions[selected_index, 0]
152
- return denorm_actions
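
To make the conditioning step in this pipeline concrete, here is a small self-contained check of the `reset_x0` logic used throughout the sampler above; the tensor shapes are illustrative only:

```python
import torch

def reset_x0(x_in, cond, act_dim):
    # Same logic as the pipeline method above: overwrite the state slice of the
    # conditioned trajectory steps, leaving the action dimensions untouched.
    for key, val in cond.items():
        x_in[:, key, act_dim:] = val.clone()
    return x_in

batch, horizon, act_dim, state_dim = 2, 4, 3, 5
x = torch.zeros(batch, horizon, act_dim + state_dim)
cond = {0: torch.ones(batch, state_dim)}  # pin step 0 to the current observation
x = reset_x0(x, cond, act_dim)
assert torch.all(x[:, 0, act_dim:] == 1) and torch.all(x[:, 0, :act_dim] == 0)
```
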
4DoF/diffusers/image_processor.py DELETED
@@ -1,366 +0,0 @@
1
- # Copyright 2023 The HuggingFace Team. All rights reserved.
2
- #
3
- # Licensed under the Apache License, Version 2.0 (the "License");
4
- # you may not use this file except in compliance with the License.
5
- # You may obtain a copy of the License at
6
- #
7
- # http://www.apache.org/licenses/LICENSE-2.0
8
- #
9
- # Unless required by applicable law or agreed to in writing, software
10
- # distributed under the License is distributed on an "AS IS" BASIS,
11
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- # See the License for the specific language governing permissions and
13
- # limitations under the License.
14
-
15
- import warnings
16
- from typing import List, Optional, Union
17
-
18
- import numpy as np
19
- import PIL
20
- import torch
21
- from PIL import Image
22
-
23
- from .configuration_utils import ConfigMixin, register_to_config
24
- from .utils import CONFIG_NAME, PIL_INTERPOLATION, deprecate
25
-
26
-
27
- class VaeImageProcessor(ConfigMixin):
28
- """
29
- Image processor for VAE.
30
-
31
- Args:
32
- do_resize (`bool`, *optional*, defaults to `True`):
33
- Whether to downscale the image's (height, width) dimensions to multiples of `vae_scale_factor`. Can accept
34
- `height` and `width` arguments from the [`image_processor.VaeImageProcessor.preprocess`] method.
35
- vae_scale_factor (`int`, *optional*, defaults to `8`):
36
- VAE scale factor. If `do_resize` is `True`, the image is automatically resized to multiples of this factor.
37
- resample (`str`, *optional*, defaults to `lanczos`):
38
- Resampling filter to use when resizing the image.
39
- do_normalize (`bool`, *optional*, defaults to `True`):
40
- Whether to normalize the image to [-1,1].
41
- do_convert_rgb (`bool`, *optional*, defaults to `False`):
42
- Whether to convert the images to RGB format.
43
- """
44
-
45
- config_name = CONFIG_NAME
46
-
47
- @register_to_config
48
- def __init__(
49
- self,
50
- do_resize: bool = True,
51
- vae_scale_factor: int = 8,
52
- resample: str = "lanczos",
53
- do_normalize: bool = True,
54
- do_convert_rgb: bool = False,
55
- ):
56
- super().__init__()
57
-
58
- @staticmethod
59
- def numpy_to_pil(images: np.ndarray) -> List[PIL.Image.Image]:
60
- """
61
- Convert a numpy image or a batch of images to a PIL image.
62
- """
63
- if images.ndim == 3:
64
- images = images[None, ...]
65
- images = (images * 255).round().astype("uint8")
66
- if images.shape[-1] == 1:
67
- # special case for grayscale (single channel) images
68
- pil_images = [Image.fromarray(image.squeeze(), mode="L") for image in images]
69
- else:
70
- pil_images = [Image.fromarray(image) for image in images]
71
-
72
- return pil_images
73
-
74
- @staticmethod
75
- def pil_to_numpy(images: Union[List[PIL.Image.Image], PIL.Image.Image]) -> np.ndarray:
76
- """
77
- Convert a PIL image or a list of PIL images to NumPy arrays.
78
- """
79
- if not isinstance(images, list):
80
- images = [images]
81
- images = [np.array(image).astype(np.float32) / 255.0 for image in images]
82
- images = np.stack(images, axis=0)
83
-
84
- return images
85
-
86
- @staticmethod
87
- def numpy_to_pt(images: np.ndarray) -> torch.FloatTensor:
88
- """
89
- Convert a NumPy image to a PyTorch tensor.
90
- """
91
- if images.ndim == 3:
92
- images = images[..., None]
93
-
94
- images = torch.from_numpy(images.transpose(0, 3, 1, 2))
95
- return images
96
-
97
- @staticmethod
98
- def pt_to_numpy(images: torch.FloatTensor) -> np.ndarray:
99
- """
100
- Convert a PyTorch tensor to a NumPy image.
101
- """
102
- images = images.cpu().permute(0, 2, 3, 1).float().numpy()
103
- return images
104
-
105
- @staticmethod
106
- def normalize(images):
107
- """
108
- Normalize an image array to [-1,1].
109
- """
110
- return 2.0 * images - 1.0
111
-
112
- @staticmethod
113
- def denormalize(images):
114
- """
115
- Denormalize an image array to [0,1].
116
- """
117
- return (images / 2 + 0.5).clamp(0, 1)
118
-
119
- @staticmethod
120
- def convert_to_rgb(image: PIL.Image.Image) -> PIL.Image.Image:
121
- """
122
- Converts an image to RGB format.
123
- """
124
- image = image.convert("RGB")
125
- return image
126
-
127
- def resize(
128
- self,
129
- image: PIL.Image.Image,
130
- height: Optional[int] = None,
131
- width: Optional[int] = None,
132
- ) -> PIL.Image.Image:
133
- """
134
- Resize a PIL image. Both height and width are downscaled to the next integer multiple of `vae_scale_factor`.
135
- """
136
- if height is None:
137
- height = image.height
138
- if width is None:
139
- width = image.width
140
-
141
- width, height = (
142
- x - x % self.config.vae_scale_factor for x in (width, height)
143
- ) # resize to integer multiple of vae_scale_factor
144
- image = image.resize((width, height), resample=PIL_INTERPOLATION[self.config.resample])
145
- return image
146
-
147
- def preprocess(
148
- self,
149
- image: Union[torch.FloatTensor, PIL.Image.Image, np.ndarray],
150
- height: Optional[int] = None,
151
- width: Optional[int] = None,
152
- ) -> torch.Tensor:
153
- """
154
- Preprocess the image input. Accepted formats are PIL images, NumPy arrays or PyTorch tensors.
155
- """
156
- supported_formats = (PIL.Image.Image, np.ndarray, torch.Tensor)
157
- if isinstance(image, supported_formats):
158
- image = [image]
159
- elif not (isinstance(image, list) and all(isinstance(i, supported_formats) for i in image)):
160
- raise ValueError(
161
- f"Input is in incorrect format: {[type(i) for i in image]}. Currently, we only support {', '.join(str(t) for t in supported_formats)}"
162
- )
163
-
164
- if isinstance(image[0], PIL.Image.Image):
165
- if self.config.do_convert_rgb:
166
- image = [self.convert_to_rgb(i) for i in image]
167
- if self.config.do_resize:
168
- image = [self.resize(i, height, width) for i in image]
169
- image = self.pil_to_numpy(image) # to np
170
- image = self.numpy_to_pt(image) # to pt
171
-
172
- elif isinstance(image[0], np.ndarray):
173
- image = np.concatenate(image, axis=0) if image[0].ndim == 4 else np.stack(image, axis=0)
174
- image = self.numpy_to_pt(image)
175
- _, _, height, width = image.shape
176
- if self.config.do_resize and (
177
- height % self.config.vae_scale_factor != 0 or width % self.config.vae_scale_factor != 0
178
- ):
179
- raise ValueError(
180
- f"Currently we only support resizing for PIL image - please resize your numpy array to be divisible by {self.config.vae_scale_factor}. "
181
- f"Currently the sizes are {height} and {width}. You can also pass a PIL image instead to use the resize option in VAEImageProcessor."
182
- )
183
-
184
- elif isinstance(image[0], torch.Tensor):
185
- image = torch.cat(image, axis=0) if image[0].ndim == 4 else torch.stack(image, axis=0)
186
- _, channel, height, width = image.shape
187
-
188
- # don't need any preprocess if the image is latents
189
- if channel == 4:
190
- return image
191
-
192
- if self.config.do_resize and (
193
- height % self.config.vae_scale_factor != 0 or width % self.config.vae_scale_factor != 0
194
- ):
195
- raise ValueError(
196
- f"Currently we only support resizing for PIL image - please resize your pytorch tensor to be divisible by {self.config.vae_scale_factor}. "
197
- f"Currently the sizes are {height} and {width}. You can also pass a PIL image instead to use the resize option in VAEImageProcessor."
198
- )
199
-
200
- # expected range [0,1], normalize to [-1,1]
201
- do_normalize = self.config.do_normalize
202
- if image.min() < 0:
203
- warnings.warn(
204
- "Passing `image` as torch tensor with value range in [-1,1] is deprecated. The expected value range for image tensor is [0,1] "
205
- f"when passing as pytorch tensor or numpy Array. You passed `image` with value range [{image.min()},{image.max()}]",
206
- FutureWarning,
207
- )
208
- do_normalize = False
209
-
210
- if do_normalize:
211
- image = self.normalize(image)
212
-
213
- return image
214
-
215
- def postprocess(
216
- self,
217
- image: torch.FloatTensor,
218
- output_type: str = "pil",
219
- do_denormalize: Optional[List[bool]] = None,
220
- ):
221
- if not isinstance(image, torch.Tensor):
222
- raise ValueError(
223
- f"Input for postprocessing is in incorrect format: {type(image)}. We only support pytorch tensor"
224
- )
225
- if output_type not in ["latent", "pt", "np", "pil"]:
226
- deprecation_message = (
227
- f"the output_type {output_type} is outdated and has been set to `np`. Please make sure to set it to one of these instead: "
228
- "`pil`, `np`, `pt`, `latent`"
229
- )
230
- deprecate("Unsupported output_type", "1.0.0", deprecation_message, standard_warn=False)
231
- output_type = "np"
232
-
233
- if output_type == "latent":
234
- return image
235
-
236
- if do_denormalize is None:
237
- do_denormalize = [self.config.do_normalize] * image.shape[0]
238
-
239
- image = torch.stack(
240
- [self.denormalize(image[i]) if do_denormalize[i] else image[i] for i in range(image.shape[0])]
241
- )
242
-
243
- if output_type == "pt":
244
- return image
245
-
246
- image = self.pt_to_numpy(image)
247
-
248
- if output_type == "np":
249
- return image
250
-
251
- if output_type == "pil":
252
- return self.numpy_to_pil(image)
253
-
254
-
255
- class VaeImageProcessorLDM3D(VaeImageProcessor):
256
- """
257
- Image processor for VAE LDM3D.
258
-
259
- Args:
260
- do_resize (`bool`, *optional*, defaults to `True`):
261
- Whether to downscale the image's (height, width) dimensions to multiples of `vae_scale_factor`.
262
- vae_scale_factor (`int`, *optional*, defaults to `8`):
263
- VAE scale factor. If `do_resize` is `True`, the image is automatically resized to multiples of this factor.
264
- resample (`str`, *optional*, defaults to `lanczos`):
265
- Resampling filter to use when resizing the image.
266
- do_normalize (`bool`, *optional*, defaults to `True`):
267
- Whether to normalize the image to [-1,1].
268
- """
269
-
270
- config_name = CONFIG_NAME
271
-
272
- @register_to_config
273
- def __init__(
274
- self,
275
- do_resize: bool = True,
276
- vae_scale_factor: int = 8,
277
- resample: str = "lanczos",
278
- do_normalize: bool = True,
279
- ):
280
- super().__init__()
281
-
282
- @staticmethod
283
- def numpy_to_pil(images):
284
- """
285
- Convert a NumPy image or a batch of images to a PIL image.
286
- """
287
- if images.ndim == 3:
288
- images = images[None, ...]
289
- images = (images * 255).round().astype("uint8")
290
- if images.shape[-1] == 1:
291
- # special case for grayscale (single channel) images
292
- pil_images = [Image.fromarray(image.squeeze(), mode="L") for image in images]
293
- else:
294
- pil_images = [Image.fromarray(image[:, :, :3]) for image in images]
295
-
296
- return pil_images
297
-
298
- @staticmethod
299
- def rgblike_to_depthmap(image):
300
- """
301
- Args:
302
- image: RGB-like depth image
303
-
304
- Returns: depth map
305
-
306
- """
307
- return image[:, :, 1] * 2**8 + image[:, :, 2]
308
-
309
- def numpy_to_depth(self, images):
310
- """
311
- Convert a NumPy depth image or a batch of images to a PIL image.
312
- """
313
- if images.ndim == 3:
314
- images = images[None, ...]
315
- images_depth = images[:, :, :, 3:]
316
- if images.shape[-1] == 6:
317
- images_depth = (images_depth * 255).round().astype("uint8")
318
- pil_images = [
319
- Image.fromarray(self.rgblike_to_depthmap(image_depth), mode="I;16") for image_depth in images_depth
320
- ]
321
- elif images.shape[-1] == 4:
322
- images_depth = (images_depth * 65535.0).astype(np.uint16)
323
- pil_images = [Image.fromarray(image_depth, mode="I;16") for image_depth in images_depth]
324
- else:
325
- raise Exception("Not supported")
326
-
327
- return pil_images
328
-
329
- def postprocess(
330
- self,
331
- image: torch.FloatTensor,
332
- output_type: str = "pil",
333
- do_denormalize: Optional[List[bool]] = None,
334
- ):
335
- if not isinstance(image, torch.Tensor):
336
- raise ValueError(
337
- f"Input for postprocessing is in incorrect format: {type(image)}. We only support pytorch tensor"
338
- )
339
- if output_type not in ["latent", "pt", "np", "pil"]:
340
- deprecation_message = (
341
- f"the output_type {output_type} is outdated and has been set to `np`. Please make sure to set it to one of these instead: "
342
- "`pil`, `np`, `pt`, `latent`"
343
- )
344
- deprecate("Unsupported output_type", "1.0.0", deprecation_message, standard_warn=False)
345
- output_type = "np"
346
-
347
- if do_denormalize is None:
348
- do_denormalize = [self.config.do_normalize] * image.shape[0]
349
-
350
- image = torch.stack(
351
- [self.denormalize(image[i]) if do_denormalize[i] else image[i] for i in range(image.shape[0])]
352
- )
353
-
354
- image = self.pt_to_numpy(image)
355
-
356
- if output_type == "np":
357
- if image.shape[-1] == 6:
358
- image_depth = np.stack([self.rgblike_to_depthmap(im[:, :, 3:]) for im in image], axis=0)
359
- else:
360
- image_depth = image[:, :, :, 3:]
361
- return image[:, :, :, :3], image_depth
362
-
363
- if output_type == "pil":
364
- return self.numpy_to_pil(image), self.numpy_to_depth(image)
365
- else:
366
- raise Exception(f"This type {output_type} is not supported")
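
The deleted `image_processor.py` above centers on `VaeImageProcessor`: it converts between PIL, NumPy, and PyTorch image formats, rounds spatial sizes down to multiples of `vae_scale_factor`, and maps pixel values between [0, 1] and [-1, 1] for VAE encoding and decoding. A short round-trip sketch follows; the import path refers to the upstream `diffusers` package rather than this vendored copy, and the image size is illustrative.

```py
# Hedged round-trip sketch for VaeImageProcessor (upstream diffusers import path).
from PIL import Image
from diffusers.image_processor import VaeImageProcessor

processor = VaeImageProcessor(vae_scale_factor=8, do_normalize=True)

# preprocess: PIL -> float tensor in [-1, 1], height/width rounded down to multiples of 8
pil_image = Image.new("RGB", (130, 98), color=(255, 0, 0))  # arbitrary test image
pixel_values = processor.preprocess(pil_image)
print(pixel_values.shape)                                     # torch.Size([1, 3, 96, 128])
print(pixel_values.min().item(), pixel_values.max().item())   # -1.0 1.0

# postprocess: tensor in [-1, 1] -> list of PIL images (denormalized back to [0, 1])
images_out = processor.postprocess(pixel_values, output_type="pil")
print(images_out[0].size)                                      # (128, 96)
```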
 
4DoF/diffusers/loaders.py DELETED
@@ -1,1492 +0,0 @@
1
- # Copyright 2023 The HuggingFace Team. All rights reserved.
2
- #
3
- # Licensed under the Apache License, Version 2.0 (the "License");
4
- # you may not use this file except in compliance with the License.
5
- # You may obtain a copy of the License at
6
- #
7
- # http://www.apache.org/licenses/LICENSE-2.0
8
- #
9
- # Unless required by applicable law or agreed to in writing, software
10
- # distributed under the License is distributed on an "AS IS" BASIS,
11
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- # See the License for the specific language governing permissions and
13
- # limitations under the License.
14
- import os
15
- import warnings
16
- from collections import defaultdict
17
- from pathlib import Path
18
- from typing import Callable, Dict, List, Optional, Union
19
-
20
- import torch
21
- import torch.nn.functional as F
22
- from huggingface_hub import hf_hub_download
23
-
24
- from .models.attention_processor import (
25
- AttnAddedKVProcessor,
26
- AttnAddedKVProcessor2_0,
27
- CustomDiffusionAttnProcessor,
28
- CustomDiffusionXFormersAttnProcessor,
29
- LoRAAttnAddedKVProcessor,
30
- LoRAAttnProcessor,
31
- LoRAAttnProcessor2_0,
32
- LoRAXFormersAttnProcessor,
33
- SlicedAttnAddedKVProcessor,
34
- XFormersAttnProcessor,
35
- )
36
- from .utils import (
37
- DIFFUSERS_CACHE,
38
- HF_HUB_OFFLINE,
39
- TEXT_ENCODER_ATTN_MODULE,
40
- _get_model_file,
41
- deprecate,
42
- is_safetensors_available,
43
- is_transformers_available,
44
- logging,
45
- )
46
-
47
-
48
- if is_safetensors_available():
49
- import safetensors
50
-
51
- if is_transformers_available():
52
- from transformers import PreTrainedModel, PreTrainedTokenizer
53
-
54
-
55
- logger = logging.get_logger(__name__)
56
-
57
- TEXT_ENCODER_NAME = "text_encoder"
58
- UNET_NAME = "unet"
59
-
60
- LORA_WEIGHT_NAME = "pytorch_lora_weights.bin"
61
- LORA_WEIGHT_NAME_SAFE = "pytorch_lora_weights.safetensors"
62
-
63
- TEXT_INVERSION_NAME = "learned_embeds.bin"
64
- TEXT_INVERSION_NAME_SAFE = "learned_embeds.safetensors"
65
-
66
- CUSTOM_DIFFUSION_WEIGHT_NAME = "pytorch_custom_diffusion_weights.bin"
67
- CUSTOM_DIFFUSION_WEIGHT_NAME_SAFE = "pytorch_custom_diffusion_weights.safetensors"
68
-
69
-
70
- class AttnProcsLayers(torch.nn.Module):
71
- def __init__(self, state_dict: Dict[str, torch.Tensor]):
72
- super().__init__()
73
- self.layers = torch.nn.ModuleList(state_dict.values())
74
- self.mapping = dict(enumerate(state_dict.keys()))
75
- self.rev_mapping = {v: k for k, v in enumerate(state_dict.keys())}
76
-
77
- # .processor for unet, .self_attn for text encoder
78
- self.split_keys = [".processor", ".self_attn"]
79
-
80
- # we add a hook to state_dict() and load_state_dict() so that the
81
- # naming fits with `unet.attn_processors`
82
- def map_to(module, state_dict, *args, **kwargs):
83
- new_state_dict = {}
84
- for key, value in state_dict.items():
85
- num = int(key.split(".")[1]) # 0 is always "layers"
86
- new_key = key.replace(f"layers.{num}", module.mapping[num])
87
- new_state_dict[new_key] = value
88
-
89
- return new_state_dict
90
-
91
- def remap_key(key, state_dict):
92
- for k in self.split_keys:
93
- if k in key:
94
- return key.split(k)[0] + k
95
-
96
- raise ValueError(
97
- f"There seems to be a problem with the state_dict: {set(state_dict.keys())}. {key} has to have one of {self.split_keys}."
98
- )
99
-
100
- def map_from(module, state_dict, *args, **kwargs):
101
- all_keys = list(state_dict.keys())
102
- for key in all_keys:
103
- replace_key = remap_key(key, state_dict)
104
- new_key = key.replace(replace_key, f"layers.{module.rev_mapping[replace_key]}")
105
- state_dict[new_key] = state_dict[key]
106
- del state_dict[key]
107
-
108
- self._register_state_dict_hook(map_to)
109
- self._register_load_state_dict_pre_hook(map_from, with_module=True)
110
-
111
-
112
- class UNet2DConditionLoadersMixin:
113
- text_encoder_name = TEXT_ENCODER_NAME
114
- unet_name = UNET_NAME
115
-
116
- def load_attn_procs(self, pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]], **kwargs):
117
- r"""
118
- Load pretrained attention processor layers into [`UNet2DConditionModel`]. Attention processor layers have to be
119
- defined in
120
- [`cross_attention.py`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py)
121
- and be a `torch.nn.Module` class.
122
-
123
- Parameters:
124
- pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`):
125
- Can be either:
126
-
127
- - A string, the model id (for example `google/ddpm-celebahq-256`) of a pretrained model hosted on
128
- the Hub.
129
- - A path to a directory (for example `./my_model_directory`) containing the model weights saved
130
- with [`ModelMixin.save_pretrained`].
131
- - A [torch state
132
- dict](https://pytorch.org/tutorials/beginner/saving_loading_models.html#what-is-a-state-dict).
133
-
134
- cache_dir (`Union[str, os.PathLike]`, *optional*):
135
- Path to a directory where a downloaded pretrained model configuration is cached if the standard cache
136
- is not used.
137
- force_download (`bool`, *optional*, defaults to `False`):
138
- Whether or not to force the (re-)download of the model weights and configuration files, overriding the
139
- cached versions if they exist.
140
- resume_download (`bool`, *optional*, defaults to `False`):
141
- Whether or not to resume downloading the model weights and configuration files. If set to `False`, any
142
- incompletely downloaded files are deleted.
143
- proxies (`Dict[str, str]`, *optional*):
144
- A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
145
- 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
146
- local_files_only (`bool`, *optional*, defaults to `False`):
147
- Whether to only load local model weights and configuration files or not. If set to `True`, the model
148
- won't be downloaded from the Hub.
149
- use_auth_token (`str` or *bool*, *optional*):
150
- The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from
151
- `diffusers-cli login` (stored in `~/.huggingface`) is used.
152
- revision (`str`, *optional*, defaults to `"main"`):
153
- The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier
154
- allowed by Git.
155
- subfolder (`str`, *optional*, defaults to `""`):
156
- The subfolder location of a model file within a larger model repository on the Hub or locally.
157
- mirror (`str`, *optional*):
158
- Mirror source to resolve accessibility issues if you’re downloading a model in China. We do not
159
- guarantee the timeliness or safety of the source, and you should refer to the mirror site for more
160
- information.
161
-
162
- """
163
-
164
- cache_dir = kwargs.pop("cache_dir", DIFFUSERS_CACHE)
165
- force_download = kwargs.pop("force_download", False)
166
- resume_download = kwargs.pop("resume_download", False)
167
- proxies = kwargs.pop("proxies", None)
168
- local_files_only = kwargs.pop("local_files_only", HF_HUB_OFFLINE)
169
- use_auth_token = kwargs.pop("use_auth_token", None)
170
- revision = kwargs.pop("revision", None)
171
- subfolder = kwargs.pop("subfolder", None)
172
- weight_name = kwargs.pop("weight_name", None)
173
- use_safetensors = kwargs.pop("use_safetensors", None)
174
- # This value has the same meaning as the `--network_alpha` option in the kohya-ss trainer script.
175
- # See https://github.com/darkstorm2150/sd-scripts/blob/main/docs/train_network_README-en.md#execute-learning
176
- network_alpha = kwargs.pop("network_alpha", None)
177
-
178
- if use_safetensors and not is_safetensors_available():
179
- raise ValueError(
180
- "`use_safetensors`=True but safetensors is not installed. Please install safetensors with `pip install safetensors`."
181
- )
182
-
183
- allow_pickle = False
184
- if use_safetensors is None:
185
- use_safetensors = is_safetensors_available()
186
- allow_pickle = True
187
-
188
- user_agent = {
189
- "file_type": "attn_procs_weights",
190
- "framework": "pytorch",
191
- }
192
-
193
- model_file = None
194
- if not isinstance(pretrained_model_name_or_path_or_dict, dict):
195
- # Let's first try to load .safetensors weights
196
- if (use_safetensors and weight_name is None) or (
197
- weight_name is not None and weight_name.endswith(".safetensors")
198
- ):
199
- try:
200
- model_file = _get_model_file(
201
- pretrained_model_name_or_path_or_dict,
202
- weights_name=weight_name or LORA_WEIGHT_NAME_SAFE,
203
- cache_dir=cache_dir,
204
- force_download=force_download,
205
- resume_download=resume_download,
206
- proxies=proxies,
207
- local_files_only=local_files_only,
208
- use_auth_token=use_auth_token,
209
- revision=revision,
210
- subfolder=subfolder,
211
- user_agent=user_agent,
212
- )
213
- state_dict = safetensors.torch.load_file(model_file, device="cpu")
214
- except IOError as e:
215
- if not allow_pickle:
216
- raise e
217
- # try loading non-safetensors weights
218
- pass
219
- if model_file is None:
220
- model_file = _get_model_file(
221
- pretrained_model_name_or_path_or_dict,
222
- weights_name=weight_name or LORA_WEIGHT_NAME,
223
- cache_dir=cache_dir,
224
- force_download=force_download,
225
- resume_download=resume_download,
226
- proxies=proxies,
227
- local_files_only=local_files_only,
228
- use_auth_token=use_auth_token,
229
- revision=revision,
230
- subfolder=subfolder,
231
- user_agent=user_agent,
232
- )
233
- state_dict = torch.load(model_file, map_location="cpu")
234
- else:
235
- state_dict = pretrained_model_name_or_path_or_dict
236
-
237
- # fill attn processors
238
- attn_processors = {}
239
-
240
- is_lora = all("lora" in k for k in state_dict.keys())
241
- is_custom_diffusion = any("custom_diffusion" in k for k in state_dict.keys())
242
-
243
- if is_lora:
244
- is_new_lora_format = all(
245
- key.startswith(self.unet_name) or key.startswith(self.text_encoder_name) for key in state_dict.keys()
246
- )
247
- if is_new_lora_format:
248
- # Strip the `"unet"` prefix.
249
- is_text_encoder_present = any(key.startswith(self.text_encoder_name) for key in state_dict.keys())
250
- if is_text_encoder_present:
251
- warn_message = "The state_dict contains LoRA params corresponding to the text encoder which are not being used here. To use both UNet and text encoder related LoRA params, use [`pipe.load_lora_weights()`](https://huggingface.co/docs/diffusers/main/en/api/loaders#diffusers.loaders.LoraLoaderMixin.load_lora_weights)."
252
- warnings.warn(warn_message)
253
- unet_keys = [k for k in state_dict.keys() if k.startswith(self.unet_name)]
254
- state_dict = {k.replace(f"{self.unet_name}.", ""): v for k, v in state_dict.items() if k in unet_keys}
255
-
256
- lora_grouped_dict = defaultdict(dict)
257
- for key, value in state_dict.items():
258
- attn_processor_key, sub_key = ".".join(key.split(".")[:-3]), ".".join(key.split(".")[-3:])
259
- lora_grouped_dict[attn_processor_key][sub_key] = value
260
-
261
- for key, value_dict in lora_grouped_dict.items():
262
- rank = value_dict["to_k_lora.down.weight"].shape[0]
263
- hidden_size = value_dict["to_k_lora.up.weight"].shape[0]
264
-
265
- attn_processor = self
266
- for sub_key in key.split("."):
267
- attn_processor = getattr(attn_processor, sub_key)
268
-
269
- if isinstance(
270
- attn_processor, (AttnAddedKVProcessor, SlicedAttnAddedKVProcessor, AttnAddedKVProcessor2_0)
271
- ):
272
- cross_attention_dim = value_dict["add_k_proj_lora.down.weight"].shape[1]
273
- attn_processor_class = LoRAAttnAddedKVProcessor
274
- else:
275
- cross_attention_dim = value_dict["to_k_lora.down.weight"].shape[1]
276
- if isinstance(attn_processor, (XFormersAttnProcessor, LoRAXFormersAttnProcessor)):
277
- attn_processor_class = LoRAXFormersAttnProcessor
278
- else:
279
- attn_processor_class = (
280
- LoRAAttnProcessor2_0 if hasattr(F, "scaled_dot_product_attention") else LoRAAttnProcessor
281
- )
282
-
283
- attn_processors[key] = attn_processor_class(
284
- hidden_size=hidden_size,
285
- cross_attention_dim=cross_attention_dim,
286
- rank=rank,
287
- network_alpha=network_alpha,
288
- )
289
- attn_processors[key].load_state_dict(value_dict)
290
- elif is_custom_diffusion:
291
- custom_diffusion_grouped_dict = defaultdict(dict)
292
- for key, value in state_dict.items():
293
- if len(value) == 0:
294
- custom_diffusion_grouped_dict[key] = {}
295
- else:
296
- if "to_out" in key:
297
- attn_processor_key, sub_key = ".".join(key.split(".")[:-3]), ".".join(key.split(".")[-3:])
298
- else:
299
- attn_processor_key, sub_key = ".".join(key.split(".")[:-2]), ".".join(key.split(".")[-2:])
300
- custom_diffusion_grouped_dict[attn_processor_key][sub_key] = value
301
-
302
- for key, value_dict in custom_diffusion_grouped_dict.items():
303
- if len(value_dict) == 0:
304
- attn_processors[key] = CustomDiffusionAttnProcessor(
305
- train_kv=False, train_q_out=False, hidden_size=None, cross_attention_dim=None
306
- )
307
- else:
308
- cross_attention_dim = value_dict["to_k_custom_diffusion.weight"].shape[1]
309
- hidden_size = value_dict["to_k_custom_diffusion.weight"].shape[0]
310
- train_q_out = True if "to_q_custom_diffusion.weight" in value_dict else False
311
- attn_processors[key] = CustomDiffusionAttnProcessor(
312
- train_kv=True,
313
- train_q_out=train_q_out,
314
- hidden_size=hidden_size,
315
- cross_attention_dim=cross_attention_dim,
316
- )
317
- attn_processors[key].load_state_dict(value_dict)
318
- else:
319
- raise ValueError(
320
- f"{model_file} does not seem to be in the correct format expected by LoRA or Custom Diffusion training."
321
- )
322
-
323
- # set correct dtype & device
324
- attn_processors = {k: v.to(device=self.device, dtype=self.dtype) for k, v in attn_processors.items()}
325
-
326
- # set layers
327
- self.set_attn_processor(attn_processors)
328
-
329
- def save_attn_procs(
330
- self,
331
- save_directory: Union[str, os.PathLike],
332
- is_main_process: bool = True,
333
- weight_name: str = None,
334
- save_function: Callable = None,
335
- safe_serialization: bool = False,
336
- **kwargs,
337
- ):
338
- r"""
339
- Save an attention processor to a directory so that it can be reloaded using the
340
- [`~loaders.UNet2DConditionLoadersMixin.load_attn_procs`] method.
341
-
342
- Arguments:
343
- save_directory (`str` or `os.PathLike`):
344
- Directory to save an attention processor to. Will be created if it doesn't exist.
345
- is_main_process (`bool`, *optional*, defaults to `True`):
346
- Whether the process calling this is the main process or not. Useful during distributed training and you
347
- need to call this function on all processes. In this case, set `is_main_process=True` only on the main
348
- process to avoid race conditions.
349
- save_function (`Callable`):
350
- The function to use to save the state dictionary. Useful during distributed training when you need to
351
- replace `torch.save` with another method. Can be configured with the environment variable
352
- `DIFFUSERS_SAVE_MODE`.
353
-
354
- """
355
- weight_name = weight_name or deprecate(
356
- "weights_name",
357
- "0.20.0",
358
- "`weights_name` is deprecated, please use `weight_name` instead.",
359
- take_from=kwargs,
360
- )
361
- if os.path.isfile(save_directory):
362
- logger.error(f"Provided path ({save_directory}) should be a directory, not a file")
363
- return
364
-
365
- if save_function is None:
366
- if safe_serialization:
367
-
368
- def save_function(weights, filename):
369
- return safetensors.torch.save_file(weights, filename, metadata={"format": "pt"})
370
-
371
- else:
372
- save_function = torch.save
373
-
374
- os.makedirs(save_directory, exist_ok=True)
375
-
376
- is_custom_diffusion = any(
377
- isinstance(x, (CustomDiffusionAttnProcessor, CustomDiffusionXFormersAttnProcessor))
378
- for (_, x) in self.attn_processors.items()
379
- )
380
- if is_custom_diffusion:
381
- model_to_save = AttnProcsLayers(
382
- {
383
- y: x
384
- for (y, x) in self.attn_processors.items()
385
- if isinstance(x, (CustomDiffusionAttnProcessor, CustomDiffusionXFormersAttnProcessor))
386
- }
387
- )
388
- state_dict = model_to_save.state_dict()
389
- for name, attn in self.attn_processors.items():
390
- if len(attn.state_dict()) == 0:
391
- state_dict[name] = {}
392
- else:
393
- model_to_save = AttnProcsLayers(self.attn_processors)
394
- state_dict = model_to_save.state_dict()
395
-
396
- if weight_name is None:
397
- if safe_serialization:
398
- weight_name = CUSTOM_DIFFUSION_WEIGHT_NAME_SAFE if is_custom_diffusion else LORA_WEIGHT_NAME_SAFE
399
- else:
400
- weight_name = CUSTOM_DIFFUSION_WEIGHT_NAME if is_custom_diffusion else LORA_WEIGHT_NAME
401
-
402
- # Save the model
403
- save_function(state_dict, os.path.join(save_directory, weight_name))
404
- logger.info(f"Model weights saved in {os.path.join(save_directory, weight_name)}")
405
-
406
-
407
- class TextualInversionLoaderMixin:
408
- r"""
409
- Load textual inversion tokens and embeddings to the tokenizer and text encoder.
410
- """
411
-
412
- def maybe_convert_prompt(self, prompt: Union[str, List[str]], tokenizer: "PreTrainedTokenizer"):
413
- r"""
414
- Processes prompts that include a special token corresponding to a multi-vector textual inversion embedding to
415
- be replaced with multiple special tokens each corresponding to one of the vectors. If the prompt has no textual
416
- inversion token or if the textual inversion token is a single vector, the input prompt is returned.
417
-
418
- Parameters:
419
- prompt (`str` or list of `str`):
420
- The prompt or prompts to guide the image generation.
421
- tokenizer (`PreTrainedTokenizer`):
422
- The tokenizer responsible for encoding the prompt into input tokens.
423
-
424
- Returns:
425
- `str` or list of `str`: The converted prompt
426
- """
427
- if not isinstance(prompt, List):
428
- prompts = [prompt]
429
- else:
430
- prompts = prompt
431
-
432
- prompts = [self._maybe_convert_prompt(p, tokenizer) for p in prompts]
433
-
434
- if not isinstance(prompt, List):
435
- return prompts[0]
436
-
437
- return prompts
438
-
439
- def _maybe_convert_prompt(self, prompt: str, tokenizer: "PreTrainedTokenizer"):
440
- r"""
441
- Maybe convert a prompt into a "multi vector"-compatible prompt. If the prompt includes a token that corresponds
442
- to a multi-vector textual inversion embedding, this function will process the prompt so that the special token
443
- is replaced with multiple special tokens each corresponding to one of the vectors. If the prompt has no textual
444
- inversion token or a textual inversion token that is a single vector, the input prompt is simply returned.
445
-
446
- Parameters:
447
- prompt (`str`):
448
- The prompt to guide the image generation.
449
- tokenizer (`PreTrainedTokenizer`):
450
- The tokenizer responsible for encoding the prompt into input tokens.
451
-
452
- Returns:
453
- `str`: The converted prompt
454
- """
455
- tokens = tokenizer.tokenize(prompt)
456
- unique_tokens = set(tokens)
457
- for token in unique_tokens:
458
- if token in tokenizer.added_tokens_encoder:
459
- replacement = token
460
- i = 1
461
- while f"{token}_{i}" in tokenizer.added_tokens_encoder:
462
- replacement += f" {token}_{i}"
463
- i += 1
464
-
465
- prompt = prompt.replace(token, replacement)
466
-
467
- return prompt
468
-
469
- def load_textual_inversion(
470
- self,
471
- pretrained_model_name_or_path: Union[str, List[str], Dict[str, torch.Tensor], List[Dict[str, torch.Tensor]]],
472
- token: Optional[Union[str, List[str]]] = None,
473
- **kwargs,
474
- ):
475
- r"""
476
- Load textual inversion embeddings into the text encoder of [`StableDiffusionPipeline`] (both 🤗 Diffusers and
477
- Automatic1111 formats are supported).
478
-
479
- Parameters:
480
- pretrained_model_name_or_path (`str` or `os.PathLike` or `List[str or os.PathLike]` or `Dict` or `List[Dict]`):
481
- Can be either one of the following or a list of them:
482
-
483
- - A string, the *model id* (for example `sd-concepts-library/low-poly-hd-logos-icons`) of a
484
- pretrained model hosted on the Hub.
485
- - A path to a *directory* (for example `./my_text_inversion_directory/`) containing the textual
486
- inversion weights.
487
- - A path to a *file* (for example `./my_text_inversions.pt`) containing textual inversion weights.
488
- - A [torch state
489
- dict](https://pytorch.org/tutorials/beginner/saving_loading_models.html#what-is-a-state-dict).
490
-
491
- token (`str` or `List[str]`, *optional*):
492
- Override the token to use for the textual inversion weights. If `pretrained_model_name_or_path` is a
493
- list, then `token` must also be a list of equal length.
494
- weight_name (`str`, *optional*):
495
- Name of a custom weight file. This should be used when:
496
-
497
- - The saved textual inversion file is in 🤗 Diffusers format, but was saved under a specific weight
498
- name such as `text_inv.bin`.
499
- - The saved textual inversion file is in the Automatic1111 format.
500
- cache_dir (`Union[str, os.PathLike]`, *optional*):
501
- Path to a directory where a downloaded pretrained model configuration is cached if the standard cache
502
- is not used.
503
- force_download (`bool`, *optional*, defaults to `False`):
504
- Whether or not to force the (re-)download of the model weights and configuration files, overriding the
505
- cached versions if they exist.
506
- resume_download (`bool`, *optional*, defaults to `False`):
507
- Whether or not to resume downloading the model weights and configuration files. If set to `False`, any
508
- incompletely downloaded files are deleted.
509
- proxies (`Dict[str, str]`, *optional*):
510
- A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
511
- 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
512
- local_files_only (`bool`, *optional*, defaults to `False`):
513
- Whether to only load local model weights and configuration files or not. If set to `True`, the model
514
- won't be downloaded from the Hub.
515
- use_auth_token (`str` or *bool*, *optional*):
516
- The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from
517
- `diffusers-cli login` (stored in `~/.huggingface`) is used.
518
- revision (`str`, *optional*, defaults to `"main"`):
519
- The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier
520
- allowed by Git.
521
- subfolder (`str`, *optional*, defaults to `""`):
522
- The subfolder location of a model file within a larger model repository on the Hub or locally.
523
- mirror (`str`, *optional*):
524
- Mirror source to resolve accessibility issues if you're downloading a model in China. We do not
525
- guarantee the timeliness or safety of the source, and you should refer to the mirror site for more
526
- information.
527
-
528
- Example:
529
-
530
- To load a textual inversion embedding vector in 🤗 Diffusers format:
531
-
532
- ```py
533
- from diffusers import StableDiffusionPipeline
534
- import torch
535
-
536
- model_id = "runwayml/stable-diffusion-v1-5"
537
- pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16).to("cuda")
538
-
539
- pipe.load_textual_inversion("sd-concepts-library/cat-toy")
540
-
541
- prompt = "A <cat-toy> backpack"
542
-
543
- image = pipe(prompt, num_inference_steps=50).images[0]
544
- image.save("cat-backpack.png")
545
- ```
546
-
547
- To load a textual inversion embedding vector in Automatic1111 format, make sure to download the vector first
548
- (for example from [civitAI](https://civitai.com/models/3036?modelVersionId=9857)) and then load the vector
549
- locally:
550
-
551
- ```py
552
- from diffusers import StableDiffusionPipeline
553
- import torch
554
-
555
- model_id = "runwayml/stable-diffusion-v1-5"
556
- pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16).to("cuda")
557
-
558
- pipe.load_textual_inversion("./charturnerv2.pt", token="charturnerv2")
559
-
560
- prompt = "charturnerv2, multiple views of the same character in the same outfit, a character turnaround of a woman wearing a black jacket and red shirt, best quality, intricate details."
561
-
562
- image = pipe(prompt, num_inference_steps=50).images[0]
563
- image.save("character.png")
564
- ```
565
-
566
- """
567
- if not hasattr(self, "tokenizer") or not isinstance(self.tokenizer, PreTrainedTokenizer):
568
- raise ValueError(
569
- f"{self.__class__.__name__} requires `self.tokenizer` of type `PreTrainedTokenizer` for calling"
570
- f" `{self.load_textual_inversion.__name__}`"
571
- )
572
-
573
- if not hasattr(self, "text_encoder") or not isinstance(self.text_encoder, PreTrainedModel):
574
- raise ValueError(
575
- f"{self.__class__.__name__} requires `self.text_encoder` of type `PreTrainedModel` for calling"
576
- f" `{self.load_textual_inversion.__name__}`"
577
- )
578
-
579
- cache_dir = kwargs.pop("cache_dir", DIFFUSERS_CACHE)
580
- force_download = kwargs.pop("force_download", False)
581
- resume_download = kwargs.pop("resume_download", False)
582
- proxies = kwargs.pop("proxies", None)
583
- local_files_only = kwargs.pop("local_files_only", HF_HUB_OFFLINE)
584
- use_auth_token = kwargs.pop("use_auth_token", None)
585
- revision = kwargs.pop("revision", None)
586
- subfolder = kwargs.pop("subfolder", None)
587
- weight_name = kwargs.pop("weight_name", None)
588
- use_safetensors = kwargs.pop("use_safetensors", None)
589
-
590
- if use_safetensors and not is_safetensors_available():
591
- raise ValueError(
592
- "`use_safetensors`=True but safetensors is not installed. Please install safetensors with `pip install safetensors`."
593
- )
594
-
595
- allow_pickle = False
596
- if use_safetensors is None:
597
- use_safetensors = is_safetensors_available()
598
- allow_pickle = True
599
-
600
- user_agent = {
601
- "file_type": "text_inversion",
602
- "framework": "pytorch",
603
- }
604
-
605
- if not isinstance(pretrained_model_name_or_path, list):
606
- pretrained_model_name_or_paths = [pretrained_model_name_or_path]
607
- else:
608
- pretrained_model_name_or_paths = pretrained_model_name_or_path
609
-
610
- if isinstance(token, str):
611
- tokens = [token]
612
- elif token is None:
613
- tokens = [None] * len(pretrained_model_name_or_paths)
614
- else:
615
- tokens = token
616
-
617
- if len(pretrained_model_name_or_paths) != len(tokens):
618
- raise ValueError(
619
- f"You have passed a list of models of length {len(pretrained_model_name_or_paths)} and a list of tokens of length {len(tokens)}. "
620
- f"Make sure both lists have the same length."
621
- )
622
-
623
- valid_tokens = [t for t in tokens if t is not None]
624
- if len(set(valid_tokens)) < len(valid_tokens):
625
- raise ValueError(f"You have passed a list of tokens that contains duplicates: {tokens}")
626
-
627
- token_ids_and_embeddings = []
628
-
629
- for pretrained_model_name_or_path, token in zip(pretrained_model_name_or_paths, tokens):
630
- if not isinstance(pretrained_model_name_or_path, dict):
631
- # 1. Load textual inversion file
632
- model_file = None
633
- # Let's first try to load .safetensors weights
634
- if (use_safetensors and weight_name is None) or (
635
- weight_name is not None and weight_name.endswith(".safetensors")
636
- ):
637
- try:
638
- model_file = _get_model_file(
639
- pretrained_model_name_or_path,
640
- weights_name=weight_name or TEXT_INVERSION_NAME_SAFE,
641
- cache_dir=cache_dir,
642
- force_download=force_download,
643
- resume_download=resume_download,
644
- proxies=proxies,
645
- local_files_only=local_files_only,
646
- use_auth_token=use_auth_token,
647
- revision=revision,
648
- subfolder=subfolder,
649
- user_agent=user_agent,
650
- )
651
- state_dict = safetensors.torch.load_file(model_file, device="cpu")
652
- except Exception as e:
653
- if not allow_pickle:
654
- raise e
655
-
656
- model_file = None
657
-
658
- if model_file is None:
659
- model_file = _get_model_file(
660
- pretrained_model_name_or_path,
661
- weights_name=weight_name or TEXT_INVERSION_NAME,
662
- cache_dir=cache_dir,
663
- force_download=force_download,
664
- resume_download=resume_download,
665
- proxies=proxies,
666
- local_files_only=local_files_only,
667
- use_auth_token=use_auth_token,
668
- revision=revision,
669
- subfolder=subfolder,
670
- user_agent=user_agent,
671
- )
672
- state_dict = torch.load(model_file, map_location="cpu")
673
- else:
674
- state_dict = pretrained_model_name_or_path
675
-
676
- # 2. Load token and embedding correctly from file
677
- loaded_token = None
678
- if isinstance(state_dict, torch.Tensor):
679
- if token is None:
680
- raise ValueError(
681
- "You are trying to load a textual inversion embedding that has been saved as a PyTorch tensor. Make sure to pass the name of the corresponding token in this case: `token=...`."
682
- )
683
- embedding = state_dict
684
- elif len(state_dict) == 1:
685
- # diffusers
686
- loaded_token, embedding = next(iter(state_dict.items()))
687
- elif "string_to_param" in state_dict:
688
- # A1111
689
- loaded_token = state_dict["name"]
690
- embedding = state_dict["string_to_param"]["*"]
691
-
692
- if token is not None and loaded_token != token:
693
- logger.info(f"The loaded token: {loaded_token} is overwritten by the passed token {token}.")
694
- else:
695
- token = loaded_token
696
-
697
- embedding = embedding.to(dtype=self.text_encoder.dtype, device=self.text_encoder.device)
698
-
699
- # 3. Make sure we don't mess up the tokenizer or text encoder
700
- vocab = self.tokenizer.get_vocab()
701
- if token in vocab:
702
- raise ValueError(
703
- f"Token {token} already in tokenizer vocabulary. Please choose a different token name or remove {token} and embedding from the tokenizer and text encoder."
704
- )
705
- elif f"{token}_1" in vocab:
706
- multi_vector_tokens = [token]
707
- i = 1
708
- while f"{token}_{i}" in self.tokenizer.added_tokens_encoder:
709
- multi_vector_tokens.append(f"{token}_{i}")
710
- i += 1
711
-
712
- raise ValueError(
713
- f"Multi-vector Token {multi_vector_tokens} already in tokenizer vocabulary. Please choose a different token name or remove the {multi_vector_tokens} and embedding from the tokenizer and text encoder."
714
- )
715
-
716
- is_multi_vector = len(embedding.shape) > 1 and embedding.shape[0] > 1
717
-
718
- if is_multi_vector:
719
- tokens = [token] + [f"{token}_{i}" for i in range(1, embedding.shape[0])]
720
- embeddings = [e for e in embedding] # noqa: C416
721
- else:
722
- tokens = [token]
723
- embeddings = [embedding[0]] if len(embedding.shape) > 1 else [embedding]
724
-
725
- # add tokens and get ids
726
- self.tokenizer.add_tokens(tokens)
727
- token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
728
- token_ids_and_embeddings += zip(token_ids, embeddings)
729
-
730
- logger.info(f"Loaded textual inversion embedding for {token}.")
731
-
732
- # resize token embeddings and set all new embeddings
733
- self.text_encoder.resize_token_embeddings(len(self.tokenizer))
734
- for token_id, embedding in token_ids_and_embeddings:
735
- self.text_encoder.get_input_embeddings().weight.data[token_id] = embedding
736
-
737
-
738
- class LoraLoaderMixin:
739
- r"""
740
- Load LoRA layers into [`UNet2DConditionModel`] and
741
- [`CLIPTextModel`](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel).
742
- """
743
- text_encoder_name = TEXT_ENCODER_NAME
744
- unet_name = UNET_NAME
745
-
746
- def load_lora_weights(self, pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]], **kwargs):
747
- r"""
748
- Load pretrained LoRA attention processor layers into [`UNet2DConditionModel`] and
749
- [`CLIPTextModel`](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel).
750
-
751
- Parameters:
752
- pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`):
753
- Can be either:
754
-
755
- - A string, the *model id* (for example `google/ddpm-celebahq-256`) of a pretrained model hosted on
756
- the Hub.
757
- - A path to a *directory* (for example `./my_model_directory`) containing the model weights saved
758
- with [`ModelMixin.save_pretrained`].
759
- - A [torch state
760
- dict](https://pytorch.org/tutorials/beginner/saving_loading_models.html#what-is-a-state-dict).
761
-
762
- cache_dir (`Union[str, os.PathLike]`, *optional*):
763
- Path to a directory where a downloaded pretrained model configuration is cached if the standard cache
764
- is not used.
765
- force_download (`bool`, *optional*, defaults to `False`):
766
- Whether or not to force the (re-)download of the model weights and configuration files, overriding the
767
- cached versions if they exist.
768
- resume_download (`bool`, *optional*, defaults to `False`):
769
- Whether or not to resume downloading the model weights and configuration files. If set to `False`, any
770
- incompletely downloaded files are deleted.
771
- proxies (`Dict[str, str]`, *optional*):
772
- A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
773
- 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
774
- local_files_only (`bool`, *optional*, defaults to `False`):
775
- Whether to only load local model weights and configuration files or not. If set to `True`, the model
776
- won't be downloaded from the Hub.
777
- use_auth_token (`str` or *bool*, *optional*):
778
- The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from
779
- `diffusers-cli login` (stored in `~/.huggingface`) is used.
780
- revision (`str`, *optional*, defaults to `"main"`):
781
- The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier
782
- allowed by Git.
783
- subfolder (`str`, *optional*, defaults to `""`):
784
- The subfolder location of a model file within a larger model repository on the Hub or locally.
785
- mirror (`str`, *optional*):
786
- Mirror source to resolve accessibility issues if you're downloading a model in China. We do not
787
- guarantee the timeliness or safety of the source, and you should refer to the mirror site for more
788
- information.
789
-
790
- """
791
- # Load the main state dict first which has the LoRA layers for either of
792
- # UNet and text encoder or both.
793
- cache_dir = kwargs.pop("cache_dir", DIFFUSERS_CACHE)
794
- force_download = kwargs.pop("force_download", False)
795
- resume_download = kwargs.pop("resume_download", False)
796
- proxies = kwargs.pop("proxies", None)
797
- local_files_only = kwargs.pop("local_files_only", HF_HUB_OFFLINE)
798
- use_auth_token = kwargs.pop("use_auth_token", None)
799
- revision = kwargs.pop("revision", None)
800
- subfolder = kwargs.pop("subfolder", None)
801
- weight_name = kwargs.pop("weight_name", None)
802
- use_safetensors = kwargs.pop("use_safetensors", None)
803
-
804
- # set lora scale to a reasonable default
805
- self._lora_scale = 1.0
806
-
807
- if use_safetensors and not is_safetensors_available():
808
- raise ValueError(
809
- "`use_safetensors`=True but safetensors is not installed. Please install safetensors with `pip install safetensors`."
810
- )
811
-
812
- allow_pickle = False
813
- if use_safetensors is None:
814
- use_safetensors = is_safetensors_available()
815
- allow_pickle = True
816
-
817
- user_agent = {
818
- "file_type": "attn_procs_weights",
819
- "framework": "pytorch",
820
- }
821
-
822
- model_file = None
823
- if not isinstance(pretrained_model_name_or_path_or_dict, dict):
824
- # Let's first try to load .safetensors weights
825
- if (use_safetensors and weight_name is None) or (
826
- weight_name is not None and weight_name.endswith(".safetensors")
827
- ):
828
- try:
829
- model_file = _get_model_file(
830
- pretrained_model_name_or_path_or_dict,
831
- weights_name=weight_name or LORA_WEIGHT_NAME_SAFE,
832
- cache_dir=cache_dir,
833
- force_download=force_download,
834
- resume_download=resume_download,
835
- proxies=proxies,
836
- local_files_only=local_files_only,
837
- use_auth_token=use_auth_token,
838
- revision=revision,
839
- subfolder=subfolder,
840
- user_agent=user_agent,
841
- )
842
- state_dict = safetensors.torch.load_file(model_file, device="cpu")
843
- except IOError as e:
844
- if not allow_pickle:
845
- raise e
846
- # try loading non-safetensors weights
847
- pass
848
- if model_file is None:
849
- model_file = _get_model_file(
850
- pretrained_model_name_or_path_or_dict,
851
- weights_name=weight_name or LORA_WEIGHT_NAME,
852
- cache_dir=cache_dir,
853
- force_download=force_download,
854
- resume_download=resume_download,
855
- proxies=proxies,
856
- local_files_only=local_files_only,
857
- use_auth_token=use_auth_token,
858
- revision=revision,
859
- subfolder=subfolder,
860
- user_agent=user_agent,
861
- )
862
- state_dict = torch.load(model_file, map_location="cpu")
863
- else:
864
- state_dict = pretrained_model_name_or_path_or_dict
865
-
866
- # Convert kohya-ss Style LoRA attn procs to diffusers attn procs
867
- network_alpha = None
868
- if all((k.startswith("lora_te_") or k.startswith("lora_unet_")) for k in state_dict.keys()):
869
- state_dict, network_alpha = self._convert_kohya_lora_to_diffusers(state_dict)
870
-
871
- # If the serialization format is new (introduced in https://github.com/huggingface/diffusers/pull/2918),
872
- # then the `state_dict` keys should have `self.unet_name` and/or `self.text_encoder_name` as
873
- # their prefixes.
874
- keys = list(state_dict.keys())
875
- if all(key.startswith(self.unet_name) or key.startswith(self.text_encoder_name) for key in keys):
876
- # Load the layers corresponding to UNet.
877
- unet_keys = [k for k in keys if k.startswith(self.unet_name)]
878
- logger.info(f"Loading {self.unet_name}.")
879
- unet_lora_state_dict = {
880
- k.replace(f"{self.unet_name}.", ""): v for k, v in state_dict.items() if k in unet_keys
881
- }
882
- self.unet.load_attn_procs(unet_lora_state_dict, network_alpha=network_alpha)
883
-
884
- # Load the layers corresponding to text encoder and make necessary adjustments.
885
- text_encoder_keys = [k for k in keys if k.startswith(self.text_encoder_name)]
886
- text_encoder_lora_state_dict = {
887
- k.replace(f"{self.text_encoder_name}.", ""): v for k, v in state_dict.items() if k in text_encoder_keys
888
- }
889
- if len(text_encoder_lora_state_dict) > 0:
890
- logger.info(f"Loading {self.text_encoder_name}.")
891
- attn_procs_text_encoder = self._load_text_encoder_attn_procs(
892
- text_encoder_lora_state_dict, network_alpha=network_alpha
893
- )
894
- self._modify_text_encoder(attn_procs_text_encoder)
895
-
896
- # save lora attn procs of text encoder so that it can be easily retrieved
897
- self._text_encoder_lora_attn_procs = attn_procs_text_encoder
898
-
899
- # Otherwise, we're dealing with the old format. This means the `state_dict` should only
900
- # contain the module names of the `unet` as its keys WITHOUT any prefix.
901
- elif not all(
902
- key.startswith(self.unet_name) or key.startswith(self.text_encoder_name) for key in state_dict.keys()
903
- ):
904
- self.unet.load_attn_procs(state_dict)
905
- warn_message = "You have saved the LoRA weights using the old format. To convert the old LoRA weights to the new format, you can first load them in a dictionary and then create a new dictionary like the following: `new_state_dict = {f'unet'.{module_name}: params for module_name, params in old_state_dict.items()}`."
906
- warnings.warn(warn_message)
907
-
908
- @property
909
- def lora_scale(self) -> float:
910
- # property function that returns the lora scale which can be set at run time by the pipeline.
911
- # if _lora_scale has not been set, return 1
912
- return self._lora_scale if hasattr(self, "_lora_scale") else 1.0
913
-
914
- @property
915
- def text_encoder_lora_attn_procs(self):
916
- if hasattr(self, "_text_encoder_lora_attn_procs"):
917
- return self._text_encoder_lora_attn_procs
918
- return
919
-
920
- def _remove_text_encoder_monkey_patch(self):
921
- # Loop over the CLIPAttention module of text_encoder
922
- for name, attn_module in self.text_encoder.named_modules():
923
- if name.endswith(TEXT_ENCODER_ATTN_MODULE):
924
- # Loop over the LoRA layers
925
- for _, text_encoder_attr in self._lora_attn_processor_attr_to_text_encoder_attr.items():
926
- # Retrieve the q/k/v/out projection of CLIPAttention
927
- module = attn_module.get_submodule(text_encoder_attr)
928
- if hasattr(module, "old_forward"):
929
- # restore original `forward` to remove monkey-patch
930
- module.forward = module.old_forward
931
- delattr(module, "old_forward")
932
-
933
- def _modify_text_encoder(self, attn_processors: Dict[str, LoRAAttnProcessor]):
934
- r"""
935
- Monkey-patches the forward passes of attention modules of the text encoder.
936
-
937
- Parameters:
938
- attn_processors: Dict[str, `LoRAAttnProcessor`]:
939
- A dictionary mapping the module names and their corresponding [`~LoRAAttnProcessor`].
940
- """
941
-
942
- # First, remove any monkey-patch that might have been applied before
943
- self._remove_text_encoder_monkey_patch()
944
-
945
- # Loop over the CLIPAttention module of text_encoder
946
- for name, attn_module in self.text_encoder.named_modules():
947
- if name.endswith(TEXT_ENCODER_ATTN_MODULE):
948
- # Loop over the LoRA layers
949
- for attn_proc_attr, text_encoder_attr in self._lora_attn_processor_attr_to_text_encoder_attr.items():
950
- # Retrieve the q/k/v/out projection of CLIPAttention and its corresponding LoRA layer.
951
- module = attn_module.get_submodule(text_encoder_attr)
952
- lora_layer = attn_processors[name].get_submodule(attn_proc_attr)
953
-
954
- # save old_forward to module that can be used to remove monkey-patch
955
- old_forward = module.old_forward = module.forward
956
-
957
- # create a new scope that locks in the old_forward, lora_layer value for each new_forward function
958
- # for more detail, see https://github.com/huggingface/diffusers/pull/3490#issuecomment-1555059060
959
- def make_new_forward(old_forward, lora_layer):
960
- def new_forward(x):
961
- result = old_forward(x) + self.lora_scale * lora_layer(x)
962
- return result
963
-
964
- return new_forward
965
-
966
- # Monkey-patch.
967
- module.forward = make_new_forward(old_forward, lora_layer)
968
-
969
- @property
970
- def _lora_attn_processor_attr_to_text_encoder_attr(self):
971
- return {
972
- "to_q_lora": "q_proj",
973
- "to_k_lora": "k_proj",
974
- "to_v_lora": "v_proj",
975
- "to_out_lora": "out_proj",
976
- }
977
-
978
- def _load_text_encoder_attn_procs(
979
- self, pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]], **kwargs
980
- ):
981
- r"""
982
- Load pretrained attention processor layers for
983
- [`CLIPTextModel`](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel).
984
-
985
- <Tip warning={true}>
986
-
987
- This function is experimental and might change in the future.
988
-
989
- </Tip>
990
-
991
- Parameters:
992
- pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`):
993
- Can be either:
994
-
995
- - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
996
- Valid model ids should have an organization name, like `google/ddpm-celebahq-256`.
997
- - A path to a *directory* containing model weights saved using [`~ModelMixin.save_config`], e.g.,
998
- `./my_model_directory/`.
999
- - A [torch state
1000
- dict](https://pytorch.org/tutorials/beginner/saving_loading_models.html#what-is-a-state-dict).
1001
-
1002
- cache_dir (`Union[str, os.PathLike]`, *optional*):
1003
- Path to a directory in which a downloaded pretrained model configuration should be cached if the
1004
- standard cache should not be used.
1005
- force_download (`bool`, *optional*, defaults to `False`):
1006
- Whether or not to force the (re-)download of the model weights and configuration files, overriding the
1007
- cached versions if they exist.
1008
- resume_download (`bool`, *optional*, defaults to `False`):
1009
- Whether or not to delete incompletely received files. Will attempt to resume the download if such a
1010
- file exists.
1011
- proxies (`Dict[str, str]`, *optional*):
1012
- A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
1013
- 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
1014
- local_files_only (`bool`, *optional*, defaults to `False`):
1015
- Whether or not to only look at local files (i.e., do not try to download the model).
1016
- use_auth_token (`str` or *bool*, *optional*):
1017
- The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated
1018
- when running `diffusers-cli login` (stored in `~/.huggingface`).
1019
- revision (`str`, *optional*, defaults to `"main"`):
1020
- The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
1021
- git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
1022
- identifier allowed by git.
1023
- subfolder (`str`, *optional*, defaults to `""`):
1024
- In case the relevant files are located inside a subfolder of the model repo (either remote in
1025
- huggingface.co or downloaded locally), you can specify the folder name here.
1026
- mirror (`str`, *optional*):
1027
- Mirror source to accelerate downloads in China. If you are from China and have an accessibility
1028
- problem, you can set this option to resolve it. Note that we do not guarantee the timeliness or safety.
1029
- Please refer to the mirror site for more information.
1030
-
1031
- Returns:
1032
- `Dict[name, LoRAAttnProcessor]`: Mapping between the module names and their corresponding
1033
- [`LoRAAttnProcessor`].
1034
-
1035
- <Tip>
1036
-
1037
- It is required to be logged in (`huggingface-cli login`) when you want to use private or [gated
1038
- models](https://huggingface.co/docs/hub/models-gated#gated-models).
1039
-
1040
- </Tip>
1041
- """
1042
-
1043
- cache_dir = kwargs.pop("cache_dir", DIFFUSERS_CACHE)
1044
- force_download = kwargs.pop("force_download", False)
1045
- resume_download = kwargs.pop("resume_download", False)
1046
- proxies = kwargs.pop("proxies", None)
1047
- local_files_only = kwargs.pop("local_files_only", HF_HUB_OFFLINE)
1048
- use_auth_token = kwargs.pop("use_auth_token", None)
1049
- revision = kwargs.pop("revision", None)
1050
- subfolder = kwargs.pop("subfolder", None)
1051
- weight_name = kwargs.pop("weight_name", None)
1052
- use_safetensors = kwargs.pop("use_safetensors", None)
1053
- network_alpha = kwargs.pop("network_alpha", None)
1054
-
1055
- if use_safetensors and not is_safetensors_available():
1056
- raise ValueError(
1057
- "`use_safetensors`=True but safetensors is not installed. Please install safetensors with `pip install safetensors"
1058
- )
1059
-
1060
- allow_pickle = False
1061
- if use_safetensors is None:
1062
- use_safetensors = is_safetensors_available()
1063
- allow_pickle = True
1064
-
1065
- user_agent = {
1066
- "file_type": "attn_procs_weights",
1067
- "framework": "pytorch",
1068
- }
1069
-
1070
- model_file = None
1071
- if not isinstance(pretrained_model_name_or_path_or_dict, dict):
1072
- # Let's first try to load .safetensors weights
1073
- if (use_safetensors and weight_name is None) or (
1074
- weight_name is not None and weight_name.endswith(".safetensors")
1075
- ):
1076
- try:
1077
- model_file = _get_model_file(
1078
- pretrained_model_name_or_path_or_dict,
1079
- weights_name=weight_name or LORA_WEIGHT_NAME_SAFE,
1080
- cache_dir=cache_dir,
1081
- force_download=force_download,
1082
- resume_download=resume_download,
1083
- proxies=proxies,
1084
- local_files_only=local_files_only,
1085
- use_auth_token=use_auth_token,
1086
- revision=revision,
1087
- subfolder=subfolder,
1088
- user_agent=user_agent,
1089
- )
1090
- state_dict = safetensors.torch.load_file(model_file, device="cpu")
1091
- except IOError as e:
1092
- if not allow_pickle:
1093
- raise e
1094
- # try loading non-safetensors weights
1095
- pass
1096
- if model_file is None:
1097
- model_file = _get_model_file(
1098
- pretrained_model_name_or_path_or_dict,
1099
- weights_name=weight_name or LORA_WEIGHT_NAME,
1100
- cache_dir=cache_dir,
1101
- force_download=force_download,
1102
- resume_download=resume_download,
1103
- proxies=proxies,
1104
- local_files_only=local_files_only,
1105
- use_auth_token=use_auth_token,
1106
- revision=revision,
1107
- subfolder=subfolder,
1108
- user_agent=user_agent,
1109
- )
1110
- state_dict = torch.load(model_file, map_location="cpu")
1111
- else:
1112
- state_dict = pretrained_model_name_or_path_or_dict
1113
-
1114
- # fill attn processors
1115
- attn_processors = {}
1116
-
1117
- is_lora = all("lora" in k for k in state_dict.keys())
1118
-
1119
- if is_lora:
1120
- lora_grouped_dict = defaultdict(dict)
1121
- for key, value in state_dict.items():
1122
- attn_processor_key, sub_key = ".".join(key.split(".")[:-3]), ".".join(key.split(".")[-3:])
1123
- lora_grouped_dict[attn_processor_key][sub_key] = value
1124
-
1125
- for key, value_dict in lora_grouped_dict.items():
1126
- rank = value_dict["to_k_lora.down.weight"].shape[0]
1127
- cross_attention_dim = value_dict["to_k_lora.down.weight"].shape[1]
1128
- hidden_size = value_dict["to_k_lora.up.weight"].shape[0]
1129
-
1130
- attn_processor_class = (
1131
- LoRAAttnProcessor2_0 if hasattr(F, "scaled_dot_product_attention") else LoRAAttnProcessor
1132
- )
1133
- attn_processors[key] = attn_processor_class(
1134
- hidden_size=hidden_size,
1135
- cross_attention_dim=cross_attention_dim,
1136
- rank=rank,
1137
- network_alpha=network_alpha,
1138
- )
1139
- attn_processors[key].load_state_dict(value_dict)
1140
-
1141
- else:
1142
- raise ValueError(f"{model_file} does not seem to be in the correct format expected by LoRA training.")
1143
-
1144
- # set correct dtype & device
1145
- attn_processors = {
1146
- k: v.to(device=self.device, dtype=self.text_encoder.dtype) for k, v in attn_processors.items()
1147
- }
1148
- return attn_processors
1149
-
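
As a side note on the grouping logic above: the flat checkpoint keys are split three path components from the end, and `rank`, `cross_attention_dim` and `hidden_size` are read straight off the LoRA weight shapes. A small self-contained sketch (key names and shapes are made up for illustration; torch required):

```py
from collections import defaultdict
import torch

# hypothetical flat LoRA entries; shapes follow the (rank x in_dim) / (out_dim x rank) convention
state_dict = {
    "down_blocks.0.attn1.processor.to_k_lora.down.weight": torch.zeros(4, 320),
    "down_blocks.0.attn1.processor.to_k_lora.up.weight": torch.zeros(320, 4),
}

grouped = defaultdict(dict)
for key, value in state_dict.items():
    proc_key, sub_key = ".".join(key.split(".")[:-3]), ".".join(key.split(".")[-3:])
    grouped[proc_key][sub_key] = value

for proc_key, weights in grouped.items():
    rank = weights["to_k_lora.down.weight"].shape[0]                  # 4
    cross_attention_dim = weights["to_k_lora.down.weight"].shape[1]   # 320
    hidden_size = weights["to_k_lora.up.weight"].shape[0]             # 320
    print(proc_key, rank, cross_attention_dim, hidden_size)
```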
1150
- @classmethod
1151
- def save_lora_weights(
1152
- self,
1153
- save_directory: Union[str, os.PathLike],
1154
- unet_lora_layers: Dict[str, Union[torch.nn.Module, torch.Tensor]] = None,
1155
- text_encoder_lora_layers: Dict[str, torch.nn.Module] = None,
1156
- is_main_process: bool = True,
1157
- weight_name: str = None,
1158
- save_function: Callable = None,
1159
- safe_serialization: bool = False,
1160
- ):
1161
- r"""
1162
- Save the LoRA parameters corresponding to the UNet and text encoder.
1163
-
1164
- Arguments:
1165
- save_directory (`str` or `os.PathLike`):
1166
- Directory to save LoRA parameters to. Will be created if it doesn't exist.
1167
- unet_lora_layers (`Dict[str, torch.nn.Module]` or `Dict[str, torch.Tensor]`):
1168
- State dict of the LoRA layers corresponding to the UNet.
1169
- text_encoder_lora_layers (`Dict[str, torch.nn.Module]` or `Dict[str, torch.Tensor]`):
1170
- State dict of the LoRA layers corresponding to the `text_encoder`. Must explicitly pass the text
1171
- encoder LoRA state dict because it comes from 🤗 Transformers.
1172
- is_main_process (`bool`, *optional*, defaults to `True`):
1173
- Whether the process calling this is the main process or not. Useful during distributed training when you
1174
- need to call this function on all processes. In this case, set `is_main_process=True` only on the main
1175
- process to avoid race conditions.
1176
- save_function (`Callable`):
1177
- The function to use to save the state dictionary. Useful during distributed training when you need to
1178
- replace `torch.save` with another method. Can be configured with the environment variable
1179
- `DIFFUSERS_SAVE_MODE`.
1180
- """
1181
- if os.path.isfile(save_directory):
1182
- logger.error(f"Provided path ({save_directory}) should be a directory, not a file")
1183
- return
1184
-
1185
- if save_function is None:
1186
- if safe_serialization:
1187
-
1188
- def save_function(weights, filename):
1189
- return safetensors.torch.save_file(weights, filename, metadata={"format": "pt"})
1190
-
1191
- else:
1192
- save_function = torch.save
1193
-
1194
- os.makedirs(save_directory, exist_ok=True)
1195
-
1196
- # Create a flat dictionary.
1197
- state_dict = {}
1198
- if unet_lora_layers is not None:
1199
- weights = (
1200
- unet_lora_layers.state_dict() if isinstance(unet_lora_layers, torch.nn.Module) else unet_lora_layers
1201
- )
1202
-
1203
- unet_lora_state_dict = {f"{self.unet_name}.{module_name}": param for module_name, param in weights.items()}
1204
- state_dict.update(unet_lora_state_dict)
1205
-
1206
- if text_encoder_lora_layers is not None:
1207
- weights = (
1208
- text_encoder_lora_layers.state_dict()
1209
- if isinstance(text_encoder_lora_layers, torch.nn.Module)
1210
- else text_encoder_lora_layers
1211
- )
1212
-
1213
- text_encoder_lora_state_dict = {
1214
- f"{self.text_encoder_name}.{module_name}": param for module_name, param in weights.items()
1215
- }
1216
- state_dict.update(text_encoder_lora_state_dict)
1217
-
1218
- # Save the model
1219
- if weight_name is None:
1220
- if safe_serialization:
1221
- weight_name = LORA_WEIGHT_NAME_SAFE
1222
- else:
1223
- weight_name = LORA_WEIGHT_NAME
1224
-
1225
- save_function(state_dict, os.path.join(save_directory, weight_name))
1226
- logger.info(f"Model weights saved in {os.path.join(save_directory, weight_name)}")
1227
-
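
The flattening performed by `save_lora_weights` boils down to namespacing each sub-model's entries before a single file is written. A toy sketch of that step only (keys are placeholders; real values are LoRA weight tensors):

```py
# placeholder keys; real values are LoRA weight tensors
unet_lora_layers = {"mid_block.attn1.processor.to_q_lora.down.weight": "w0"}
text_encoder_lora_layers = {"text_model.encoder.layers.0.self_attn.to_q_lora.down.weight": "w1"}

state_dict = {}
state_dict.update({f"unet.{name}": w for name, w in unet_lora_layers.items()})
state_dict.update({f"text_encoder.{name}": w for name, w in text_encoder_lora_layers.items()})

for key in state_dict:
    print(key)
# unet.mid_block.attn1.processor.to_q_lora.down.weight
# text_encoder.text_model.encoder.layers.0.self_attn.to_q_lora.down.weight
```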
1228
- def _convert_kohya_lora_to_diffusers(self, state_dict):
1229
- unet_state_dict = {}
1230
- te_state_dict = {}
1231
- network_alpha = None
1232
-
1233
- for key, value in state_dict.items():
1234
- if "lora_down" in key:
1235
- lora_name = key.split(".")[0]
1236
- lora_name_up = lora_name + ".lora_up.weight"
1237
- lora_name_alpha = lora_name + ".alpha"
1238
- if lora_name_alpha in state_dict:
1239
- alpha = state_dict[lora_name_alpha].item()
1240
- if network_alpha is None:
1241
- network_alpha = alpha
1242
- elif network_alpha != alpha:
1243
- raise ValueError("Network alpha is not consistent")
1244
-
1245
- if lora_name.startswith("lora_unet_"):
1246
- diffusers_name = key.replace("lora_unet_", "").replace("_", ".")
1247
- diffusers_name = diffusers_name.replace("down.blocks", "down_blocks")
1248
- diffusers_name = diffusers_name.replace("mid.block", "mid_block")
1249
- diffusers_name = diffusers_name.replace("up.blocks", "up_blocks")
1250
- diffusers_name = diffusers_name.replace("transformer.blocks", "transformer_blocks")
1251
- diffusers_name = diffusers_name.replace("to.q.lora", "to_q_lora")
1252
- diffusers_name = diffusers_name.replace("to.k.lora", "to_k_lora")
1253
- diffusers_name = diffusers_name.replace("to.v.lora", "to_v_lora")
1254
- diffusers_name = diffusers_name.replace("to.out.0.lora", "to_out_lora")
1255
- if "transformer_blocks" in diffusers_name:
1256
- if "attn1" in diffusers_name or "attn2" in diffusers_name:
1257
- diffusers_name = diffusers_name.replace("attn1", "attn1.processor")
1258
- diffusers_name = diffusers_name.replace("attn2", "attn2.processor")
1259
- unet_state_dict[diffusers_name] = value
1260
- unet_state_dict[diffusers_name.replace(".down.", ".up.")] = state_dict[lora_name_up]
1261
- elif lora_name.startswith("lora_te_"):
1262
- diffusers_name = key.replace("lora_te_", "").replace("_", ".")
1263
- diffusers_name = diffusers_name.replace("text.model", "text_model")
1264
- diffusers_name = diffusers_name.replace("self.attn", "self_attn")
1265
- diffusers_name = diffusers_name.replace("q.proj.lora", "to_q_lora")
1266
- diffusers_name = diffusers_name.replace("k.proj.lora", "to_k_lora")
1267
- diffusers_name = diffusers_name.replace("v.proj.lora", "to_v_lora")
1268
- diffusers_name = diffusers_name.replace("out.proj.lora", "to_out_lora")
1269
- if "self_attn" in diffusers_name:
1270
- te_state_dict[diffusers_name] = value
1271
- te_state_dict[diffusers_name.replace(".down.", ".up.")] = state_dict[lora_name_up]
1272
-
1273
- unet_state_dict = {f"{UNET_NAME}.{module_name}": params for module_name, params in unet_state_dict.items()}
1274
- te_state_dict = {f"{TEXT_ENCODER_NAME}.{module_name}": params for module_name, params in te_state_dict.items()}
1275
- new_state_dict = {**unet_state_dict, **te_state_dict}
1276
- return new_state_dict, network_alpha
1277
-
1278
-
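
To make the renaming above concrete, here is the same replacement chain applied by hand to one hypothetical Kohya-style UNet key; the printed result is the diffusers-style key that ends up in the converted state dict:

```py
kohya_key = "lora_unet_down_blocks_0_attentions_0_transformer_blocks_0_attn1_to_q.lora_down.weight"

name = kohya_key.replace("lora_unet_", "").replace("_", ".")
name = name.replace("down.blocks", "down_blocks").replace("mid.block", "mid_block")
name = name.replace("up.blocks", "up_blocks").replace("transformer.blocks", "transformer_blocks")
name = name.replace("to.q.lora", "to_q_lora")
if "transformer_blocks" in name and ("attn1" in name or "attn2" in name):
    name = name.replace("attn1", "attn1.processor").replace("attn2", "attn2.processor")

print(f"unet.{name}")
# unet.down_blocks.0.attentions.0.transformer_blocks.0.attn1.processor.to_q_lora.down.weight
```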
1279
- class FromSingleFileMixin:
1280
- """
1281
- Load model weights saved in the `.ckpt` format into a [`DiffusionPipeline`].
1282
- """
1283
-
1284
- @classmethod
1285
- def from_ckpt(cls, *args, **kwargs):
1286
- deprecation_message = "The function `from_ckpt` is deprecated in favor of `from_single_file` and will be removed in diffusers v.0.21. Please make sure to use `StableDiffusionPipeline.from_single_file(...)` instead."
1287
- deprecate("from_ckpt", "0.21.0", deprecation_message, standard_warn=False)
1288
- return cls.from_single_file(*args, **kwargs)
1289
-
1290
- @classmethod
1291
- def from_single_file(cls, pretrained_model_link_or_path, **kwargs):
1292
- r"""
1293
- Instantiate a [`DiffusionPipeline`] from pretrained pipeline weights saved in the `.ckpt` format. The pipeline
1294
- is set in evaluation mode (`model.eval()`) by default.
1295
-
1296
- Parameters:
1297
- pretrained_model_link_or_path (`str` or `os.PathLike`, *optional*):
1298
- Can be either:
1299
- - A link to the `.ckpt` file (for example
1300
- `"https://huggingface.co/<repo_id>/blob/main/<path_to_file>.ckpt"`) on the Hub.
1301
- - A path to a *file* containing all pipeline weights.
1302
- torch_dtype (`str` or `torch.dtype`, *optional*):
1303
- Override the default `torch.dtype` and load the model with another dtype. If `"auto"` is passed, the
1304
- dtype is automatically derived from the model's weights.
1305
- force_download (`bool`, *optional*, defaults to `False`):
1306
- Whether or not to force the (re-)download of the model weights and configuration files, overriding the
1307
- cached versions if they exist.
1308
- cache_dir (`Union[str, os.PathLike]`, *optional*):
1309
- Path to a directory where a downloaded pretrained model configuration is cached if the standard cache
1310
- is not used.
1311
- resume_download (`bool`, *optional*, defaults to `False`):
1312
- Whether or not to resume downloading the model weights and configuration files. If set to `False`, any
1313
- incompletely downloaded files are deleted.
1314
- proxies (`Dict[str, str]`, *optional*):
1315
- A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
1316
- 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
1317
- local_files_only (`bool`, *optional*, defaults to `False`):
1318
- Whether to only load local model weights and configuration files or not. If set to True, the model
1319
- won't be downloaded from the Hub.
1320
- use_auth_token (`str` or *bool*, *optional*):
1321
- The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from
1322
- `diffusers-cli login` (stored in `~/.huggingface`) is used.
1323
- revision (`str`, *optional*, defaults to `"main"`):
1324
- The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier
1325
- allowed by Git.
1326
- use_safetensors (`bool`, *optional*, defaults to `None`):
1327
- If set to `None`, the safetensors weights are downloaded if they're available **and** if the
1328
- safetensors library is installed. If set to `True`, the model is forcibly loaded from safetensors
1329
- weights. If set to `False`, safetensors weights are not loaded.
1330
- extract_ema (`bool`, *optional*, defaults to `False`):
1331
- Whether to extract the EMA weights or not. Pass `True` to extract the EMA weights which usually yield
1332
- higher quality images for inference. Non-EMA weights are usually better for continuing fine-tuning.
1333
- upcast_attention (`bool`, *optional*, defaults to `None`):
1334
- Whether the attention computation should always be upcasted.
1335
- image_size (`int`, *optional*, defaults to 512):
1336
- The image size the model was trained on. Use 512 for all Stable Diffusion v1 models and the Stable
1337
- Diffusion v2 base model. Use 768 for Stable Diffusion v2.
1338
- prediction_type (`str`, *optional*):
1339
- The prediction type the model was trained on. Use `'epsilon'` for all Stable Diffusion v1 models and
1340
- the Stable Diffusion v2 base model. Use `'v_prediction'` for Stable Diffusion v2.
1341
- num_in_channels (`int`, *optional*, defaults to `None`):
1342
- The number of input channels. If `None`, it will be automatically inferred.
1343
- scheduler_type (`str`, *optional*, defaults to `"pndm"`):
1344
- Type of scheduler to use. Should be one of `["pndm", "lms", "heun", "euler", "euler-ancestral", "dpm",
1345
- "ddim"]`.
1346
- load_safety_checker (`bool`, *optional*, defaults to `True`):
1347
- Whether to load the safety checker or not.
1348
- text_encoder (`CLIPTextModel`, *optional*, defaults to `None`):
1349
- An instance of
1350
- [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel) to use,
1351
- specifically the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)
1352
- variant. If this parameter is `None`, the function will load a new instance of [CLIP] by itself, if
1353
- needed.
1354
- tokenizer (`CLIPTokenizer`, *optional*, defaults to `None`):
1355
- An instance of
1356
- [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer)
1357
- to use. If this parameter is `None`, the function will load a new instance of [CLIPTokenizer] by
1358
- itself, if needed.
1359
- kwargs (remaining dictionary of keyword arguments, *optional*):
1360
- Can be used to overwrite loadable and saveable variables (for example the pipeline components of the
1361
- specific pipeline class). The overwritten components are directly passed to the pipeline's `__init__`
1362
- method. See example below for more information.
1363
-
1364
- Examples:
1365
-
1366
- ```py
1367
- >>> from diffusers import StableDiffusionPipeline
1368
-
1369
- >>> # Download pipeline from huggingface.co and cache.
1370
- >>> pipeline = StableDiffusionPipeline.from_single_file(
1371
- ... "https://huggingface.co/WarriorMama777/OrangeMixs/blob/main/Models/AbyssOrangeMix/AbyssOrangeMix.safetensors"
1372
- ... )
1373
-
1374
- >>> # Download pipeline from local file
1375
- >>> # file is downloaded under ./v1-5-pruned-emaonly.ckpt
1376
- >>> pipeline = StableDiffusionPipeline.from_single_file("./v1-5-pruned-emaonly")
1377
-
1378
- >>> # Enable float16 and move to GPU
1379
- >>> pipeline = StableDiffusionPipeline.from_single_file(
1380
- ... "https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.ckpt",
1381
- ... torch_dtype=torch.float16,
1382
- ... )
1383
- >>> pipeline.to("cuda")
1384
- ```
1385
- """
1386
- # import here to avoid circular dependency
1387
- from .pipelines.stable_diffusion.convert_from_ckpt import download_from_original_stable_diffusion_ckpt
1388
-
1389
- cache_dir = kwargs.pop("cache_dir", DIFFUSERS_CACHE)
1390
- resume_download = kwargs.pop("resume_download", False)
1391
- force_download = kwargs.pop("force_download", False)
1392
- proxies = kwargs.pop("proxies", None)
1393
- local_files_only = kwargs.pop("local_files_only", HF_HUB_OFFLINE)
1394
- use_auth_token = kwargs.pop("use_auth_token", None)
1395
- revision = kwargs.pop("revision", None)
1396
- extract_ema = kwargs.pop("extract_ema", False)
1397
- image_size = kwargs.pop("image_size", None)
1398
- scheduler_type = kwargs.pop("scheduler_type", "pndm")
1399
- num_in_channels = kwargs.pop("num_in_channels", None)
1400
- upcast_attention = kwargs.pop("upcast_attention", None)
1401
- load_safety_checker = kwargs.pop("load_safety_checker", True)
1402
- prediction_type = kwargs.pop("prediction_type", None)
1403
- text_encoder = kwargs.pop("text_encoder", None)
1404
- tokenizer = kwargs.pop("tokenizer", None)
1405
-
1406
- torch_dtype = kwargs.pop("torch_dtype", None)
1407
-
1408
- use_safetensors = kwargs.pop("use_safetensors", None if is_safetensors_available() else False)
1409
-
1410
- pipeline_name = cls.__name__
1411
- file_extension = pretrained_model_link_or_path.rsplit(".", 1)[-1]
1412
- from_safetensors = file_extension == "safetensors"
1413
-
1414
- if from_safetensors and use_safetensors is False:
1415
- raise ValueError("Make sure to install `safetensors` with `pip install safetensors`.")
1416
-
1417
- # TODO: For now we only support stable diffusion
1418
- stable_unclip = None
1419
- model_type = None
1420
- controlnet = False
1421
-
1422
- if pipeline_name == "StableDiffusionControlNetPipeline":
1423
- # Model type will be inferred from the checkpoint.
1424
- controlnet = True
1425
- elif "StableDiffusion" in pipeline_name:
1426
- # Model type will be inferred from the checkpoint.
1427
- pass
1428
- elif pipeline_name == "StableUnCLIPPipeline":
1429
- model_type = "FrozenOpenCLIPEmbedder"
1430
- stable_unclip = "txt2img"
1431
- elif pipeline_name == "StableUnCLIPImg2ImgPipeline":
1432
- model_type = "FrozenOpenCLIPEmbedder"
1433
- stable_unclip = "img2img"
1434
- elif pipeline_name == "PaintByExamplePipeline":
1435
- model_type = "PaintByExample"
1436
- elif pipeline_name == "LDMTextToImagePipeline":
1437
- model_type = "LDMTextToImage"
1438
- else:
1439
- raise ValueError(f"Unhandled pipeline class: {pipeline_name}")
1440
-
1441
- # remove huggingface url
1442
- for prefix in ["https://huggingface.co/", "huggingface.co/", "hf.co/", "https://hf.co/"]:
1443
- if pretrained_model_link_or_path.startswith(prefix):
1444
- pretrained_model_link_or_path = pretrained_model_link_or_path[len(prefix) :]
1445
-
1446
- # Code based on diffusers.pipelines.pipeline_utils.DiffusionPipeline.from_pretrained
1447
- ckpt_path = Path(pretrained_model_link_or_path)
1448
- if not ckpt_path.is_file():
1449
- # get repo_id and (potentially nested) file path of ckpt in repo
1450
- repo_id = "/".join(ckpt_path.parts[:2])
1451
- file_path = "/".join(ckpt_path.parts[2:])
1452
-
1453
- if file_path.startswith("blob/"):
1454
- file_path = file_path[len("blob/") :]
1455
-
1456
- if file_path.startswith("main/"):
1457
- file_path = file_path[len("main/") :]
1458
-
1459
- pretrained_model_link_or_path = hf_hub_download(
1460
- repo_id,
1461
- filename=file_path,
1462
- cache_dir=cache_dir,
1463
- resume_download=resume_download,
1464
- proxies=proxies,
1465
- local_files_only=local_files_only,
1466
- use_auth_token=use_auth_token,
1467
- revision=revision,
1468
- force_download=force_download,
1469
- )
1470
-
1471
- pipe = download_from_original_stable_diffusion_ckpt(
1472
- pretrained_model_link_or_path,
1473
- pipeline_class=cls,
1474
- model_type=model_type,
1475
- stable_unclip=stable_unclip,
1476
- controlnet=controlnet,
1477
- from_safetensors=from_safetensors,
1478
- extract_ema=extract_ema,
1479
- image_size=image_size,
1480
- scheduler_type=scheduler_type,
1481
- num_in_channels=num_in_channels,
1482
- upcast_attention=upcast_attention,
1483
- load_safety_checker=load_safety_checker,
1484
- prediction_type=prediction_type,
1485
- text_encoder=text_encoder,
1486
- tokenizer=tokenizer,
1487
- )
1488
-
1489
- if torch_dtype is not None:
1490
- pipe.to(torch_dtype=torch_dtype)
1491
-
1492
- return pipe
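
For reference, the Hub-URL handling in `from_single_file` reduces a blob URL to a `(repo_id, filename)` pair before calling `hf_hub_download`. A standalone sketch of just that parsing step (URL taken from the docstring example above):

```py
from pathlib import Path

link = "https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.ckpt"
for prefix in ["https://huggingface.co/", "huggingface.co/", "hf.co/", "https://hf.co/"]:
    if link.startswith(prefix):
        link = link[len(prefix):]

parts = Path(link).parts
repo_id, file_path = "/".join(parts[:2]), "/".join(parts[2:])
if file_path.startswith("blob/"):
    file_path = file_path[len("blob/"):]
if file_path.startswith("main/"):
    file_path = file_path[len("main/"):]

print(repo_id)    # runwayml/stable-diffusion-v1-5
print(file_path)  # v1-5-pruned-emaonly.ckpt
```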
4DoF/diffusers/models/__init__.py DELETED
@@ -1,35 +0,0 @@
1
- # Copyright 2023 The HuggingFace Team. All rights reserved.
2
- #
3
- # Licensed under the Apache License, Version 2.0 (the "License");
4
- # you may not use this file except in compliance with the License.
5
- # You may obtain a copy of the License at
6
- #
7
- # http://www.apache.org/licenses/LICENSE-2.0
8
- #
9
- # Unless required by applicable law or agreed to in writing, software
10
- # distributed under the License is distributed on an "AS IS" BASIS,
11
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- # See the License for the specific language governing permissions and
13
- # limitations under the License.
14
-
15
- from ..utils import is_flax_available, is_torch_available
16
-
17
-
18
- if is_torch_available():
19
- from .autoencoder_kl import AutoencoderKL
20
- from .controlnet import ControlNetModel
21
- from .dual_transformer_2d import DualTransformer2DModel
22
- from .modeling_utils import ModelMixin
23
- from .prior_transformer import PriorTransformer
24
- from .t5_film_transformer import T5FilmDecoder
25
- from .transformer_2d import Transformer2DModel
26
- from .unet_1d import UNet1DModel
27
- from .unet_2d import UNet2DModel
28
- from .unet_2d_condition import UNet2DConditionModel
29
- from .unet_3d_condition import UNet3DConditionModel
30
- from .vq_model import VQModel
31
-
32
- if is_flax_available():
33
- from .controlnet_flax import FlaxControlNetModel
34
- from .unet_2d_condition_flax import FlaxUNet2DConditionModel
35
- from .vae_flax import FlaxAutoencoderKL
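
The `is_torch_available()` / `is_flax_available()` guards above follow the usual optional-backend pattern: probe for the framework before importing modules that need it. A minimal sketch of that pattern (the helper name mirrors the one used here, but this implementation is an assumption, not the library's):

```py
import importlib.util

def is_torch_available() -> bool:  # illustrative stand-in for the real utility
    return importlib.util.find_spec("torch") is not None

if is_torch_available():
    from torch import nn  # torch-backed models would be imported here
    print("torch backend enabled:", nn.Linear is not None)
else:
    print("torch not installed; torch-backed models are simply not exported")
```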
4DoF/diffusers/models/activations.py DELETED
@@ -1,12 +0,0 @@
1
- from torch import nn
2
-
3
-
4
- def get_activation(act_fn):
5
- if act_fn in ["swish", "silu"]:
6
- return nn.SiLU()
7
- elif act_fn == "mish":
8
- return nn.Mish()
9
- elif act_fn == "gelu":
10
- return nn.GELU()
11
- else:
12
- raise ValueError(f"Unsupported activation function: {act_fn}")
4DoF/diffusers/models/attention.py DELETED
@@ -1,392 +0,0 @@
1
- # Copyright 2023 The HuggingFace Team. All rights reserved.
2
- #
3
- # Licensed under the Apache License, Version 2.0 (the "License");
4
- # you may not use this file except in compliance with the License.
5
- # You may obtain a copy of the License at
6
- #
7
- # http://www.apache.org/licenses/LICENSE-2.0
8
- #
9
- # Unless required by applicable law or agreed to in writing, software
10
- # distributed under the License is distributed on an "AS IS" BASIS,
11
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- # See the License for the specific language governing permissions and
13
- # limitations under the License.
14
- from typing import Any, Dict, Optional
15
-
16
- import torch
17
- import torch.nn.functional as F
18
- from torch import nn
19
-
20
- from ..utils import maybe_allow_in_graph
21
- from .activations import get_activation
22
- from .attention_processor import Attention
23
- from .embeddings import CombinedTimestepLabelEmbeddings
24
-
25
-
26
- @maybe_allow_in_graph
27
- class BasicTransformerBlock(nn.Module):
28
- r"""
29
- A basic Transformer block.
30
-
31
- Parameters:
32
- dim (`int`): The number of channels in the input and output.
33
- num_attention_heads (`int`): The number of heads to use for multi-head attention.
34
- attention_head_dim (`int`): The number of channels in each head.
35
- dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
36
- cross_attention_dim (`int`, *optional*): The size of the encoder_hidden_states vector for cross attention.
37
- only_cross_attention (`bool`, *optional*):
38
- Whether to use only cross-attention layers. In this case two cross attention layers are used.
39
- double_self_attention (`bool`, *optional*):
40
- Whether to use two self-attention layers. In this case no cross attention layers are used.
41
- activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
42
- num_embeds_ada_norm (`int`, *optional*):
43
- The number of diffusion steps used during training. See `Transformer2DModel`.
44
- attention_bias (`bool`, *optional*, defaults to `False`):
45
- Configure if the attentions should contain a bias parameter.
46
- """
47
-
48
- def __init__(
49
- self,
50
- dim: int,
51
- num_attention_heads: int,
52
- attention_head_dim: int,
53
- dropout=0.0,
54
- cross_attention_dim: Optional[int] = None,
55
- activation_fn: str = "geglu",
56
- num_embeds_ada_norm: Optional[int] = None,
57
- attention_bias: bool = False,
58
- only_cross_attention: bool = False,
59
- double_self_attention: bool = False,
60
- upcast_attention: bool = False,
61
- norm_elementwise_affine: bool = True,
62
- norm_type: str = "layer_norm",
63
- final_dropout: bool = False,
64
- ):
65
- super().__init__()
66
- self.only_cross_attention = only_cross_attention
67
-
68
- self.use_ada_layer_norm_zero = (num_embeds_ada_norm is not None) and norm_type == "ada_norm_zero"
69
- self.use_ada_layer_norm = (num_embeds_ada_norm is not None) and norm_type == "ada_norm"
70
-
71
- if norm_type in ("ada_norm", "ada_norm_zero") and num_embeds_ada_norm is None:
72
- raise ValueError(
73
- f"`norm_type` is set to {norm_type}, but `num_embeds_ada_norm` is not defined. Please make sure to"
74
- f" define `num_embeds_ada_norm` if setting `norm_type` to {norm_type}."
75
- )
76
-
77
- # Define 3 blocks. Each block has its own normalization layer.
78
- # 1. Self-Attn
79
- if self.use_ada_layer_norm:
80
- self.norm1 = AdaLayerNorm(dim, num_embeds_ada_norm)
81
- elif self.use_ada_layer_norm_zero:
82
- self.norm1 = AdaLayerNormZero(dim, num_embeds_ada_norm)
83
- else:
84
- self.norm1 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine)
85
- self.attn1 = Attention(
86
- query_dim=dim,
87
- heads=num_attention_heads,
88
- dim_head=attention_head_dim,
89
- dropout=dropout,
90
- bias=attention_bias,
91
- cross_attention_dim=cross_attention_dim if only_cross_attention else None,
92
- upcast_attention=upcast_attention,
93
- )
94
-
95
- # 2. Cross-Attn
96
- if cross_attention_dim is not None or double_self_attention:
97
- # We currently only use AdaLayerNormZero for self attention where there will only be one attention block.
98
- # I.e. the number of returned modulation chunks from AdaLayerZero would not make sense if returned during
99
- # the second cross attention block.
100
- self.norm2 = (
101
- AdaLayerNorm(dim, num_embeds_ada_norm)
102
- if self.use_ada_layer_norm
103
- else nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine)
104
- )
105
- self.attn2 = Attention(
106
- query_dim=dim,
107
- cross_attention_dim=cross_attention_dim if not double_self_attention else None,
108
- heads=num_attention_heads,
109
- dim_head=attention_head_dim,
110
- dropout=dropout,
111
- bias=attention_bias,
112
- upcast_attention=upcast_attention,
113
- ) # is self-attn if encoder_hidden_states is none
114
- else:
115
- self.norm2 = None
116
- self.attn2 = None
117
-
118
- # 3. Feed-forward
119
- self.norm3 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine)
120
- self.ff = FeedForward(dim, dropout=dropout, activation_fn=activation_fn, final_dropout=final_dropout)
121
-
122
- # let chunk size default to None
123
- self._chunk_size = None
124
- self._chunk_dim = 0
125
-
126
- def set_chunk_feed_forward(self, chunk_size: Optional[int], dim: int):
127
- # Sets chunk feed-forward
128
- self._chunk_size = chunk_size
129
- self._chunk_dim = dim
130
-
131
- def forward(
132
- self,
133
- hidden_states: torch.FloatTensor,
134
- attention_mask: Optional[torch.FloatTensor] = None,
135
- encoder_hidden_states: Optional[torch.FloatTensor] = None,
136
- encoder_attention_mask: Optional[torch.FloatTensor] = None,
137
- timestep: Optional[torch.LongTensor] = None,
138
- posemb: Optional = None,
139
- cross_attention_kwargs: Dict[str, Any] = None,
140
- class_labels: Optional[torch.LongTensor] = None,
141
- ):
142
- # Notice that normalization is always applied before the real computation in the following blocks.
143
- # 1. Self-Attention
144
- if self.use_ada_layer_norm:
145
- norm_hidden_states = self.norm1(hidden_states, timestep)
146
- elif self.use_ada_layer_norm_zero:
147
- norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(
148
- hidden_states, timestep, class_labels, hidden_dtype=hidden_states.dtype
149
- )
150
- else:
151
- norm_hidden_states = self.norm1(hidden_states)
152
-
153
- cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {}
154
-
155
- attn_output = self.attn1(
156
- norm_hidden_states,
157
- encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None,
158
- attention_mask=attention_mask,
159
- posemb=posemb, # todo in self attn, posemb should be [pose_in, pose_in]?
160
- **cross_attention_kwargs,
161
- )
162
- if self.use_ada_layer_norm_zero:
163
- attn_output = gate_msa.unsqueeze(1) * attn_output
164
- hidden_states = attn_output + hidden_states
165
-
166
- # 2. Cross-Attention
167
- if self.attn2 is not None:
168
- norm_hidden_states = (
169
- self.norm2(hidden_states, timestep) if self.use_ada_layer_norm else self.norm2(hidden_states)
170
- )
171
-
172
- attn_output = self.attn2(
173
- norm_hidden_states,
174
- encoder_hidden_states=encoder_hidden_states,
175
- attention_mask=encoder_attention_mask,
176
- posemb=posemb,
177
- **cross_attention_kwargs,
178
- )
179
- hidden_states = attn_output + hidden_states
180
-
181
- # 3. Feed-forward
182
- norm_hidden_states = self.norm3(hidden_states)
183
-
184
- if self.use_ada_layer_norm_zero:
185
- norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
186
-
187
- if self._chunk_size is not None:
188
- # "feed_forward_chunk_size" can be used to save memory
189
- if norm_hidden_states.shape[self._chunk_dim] % self._chunk_size != 0:
190
- raise ValueError(
191
- f"`hidden_states` dimension to be chunked: {norm_hidden_states.shape[self._chunk_dim]} has to be divisible by chunk size: {self._chunk_size}. Make sure to set an appropriate `chunk_size` when calling `unet.enable_forward_chunking`."
192
- )
193
-
194
- num_chunks = norm_hidden_states.shape[self._chunk_dim] // self._chunk_size
195
- ff_output = torch.cat(
196
- [self.ff(hid_slice) for hid_slice in norm_hidden_states.chunk(num_chunks, dim=self._chunk_dim)],
197
- dim=self._chunk_dim,
198
- )
199
- else:
200
- ff_output = self.ff(norm_hidden_states)
201
-
202
- if self.use_ada_layer_norm_zero:
203
- ff_output = gate_mlp.unsqueeze(1) * ff_output
204
-
205
- hidden_states = ff_output + hidden_states
206
-
207
- return hidden_states
208
-
209
-
210
- class FeedForward(nn.Module):
211
- r"""
212
- A feed-forward layer.
213
-
214
- Parameters:
215
- dim (`int`): The number of channels in the input.
216
- dim_out (`int`, *optional*): The number of channels in the output. If not given, defaults to `dim`.
217
- mult (`int`, *optional*, defaults to 4): The multiplier to use for the hidden dimension.
218
- dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
219
- activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
220
- final_dropout (`bool` *optional*, defaults to False): Apply a final dropout.
221
- """
222
-
223
- def __init__(
224
- self,
225
- dim: int,
226
- dim_out: Optional[int] = None,
227
- mult: int = 4,
228
- dropout: float = 0.0,
229
- activation_fn: str = "geglu",
230
- final_dropout: bool = False,
231
- ):
232
- super().__init__()
233
- inner_dim = int(dim * mult)
234
- dim_out = dim_out if dim_out is not None else dim
235
-
236
- if activation_fn == "gelu":
237
- act_fn = GELU(dim, inner_dim)
238
- if activation_fn == "gelu-approximate":
239
- act_fn = GELU(dim, inner_dim, approximate="tanh")
240
- elif activation_fn == "geglu":
241
- act_fn = GEGLU(dim, inner_dim)
242
- elif activation_fn == "geglu-approximate":
243
- act_fn = ApproximateGELU(dim, inner_dim)
244
-
245
- self.net = nn.ModuleList([])
246
- # project in
247
- self.net.append(act_fn)
248
- # project dropout
249
- self.net.append(nn.Dropout(dropout))
250
- # project out
251
- self.net.append(nn.Linear(inner_dim, dim_out))
252
- # FF as used in Vision Transformer, MLP-Mixer, etc. have a final dropout
253
- if final_dropout:
254
- self.net.append(nn.Dropout(dropout))
255
-
256
- def forward(self, hidden_states):
257
- for module in self.net:
258
- hidden_states = module(hidden_states)
259
- return hidden_states
260
-
261
-
262
- class GELU(nn.Module):
263
- r"""
264
- GELU activation function with tanh approximation support with `approximate="tanh"`.
265
- """
266
-
267
- def __init__(self, dim_in: int, dim_out: int, approximate: str = "none"):
268
- super().__init__()
269
- self.proj = nn.Linear(dim_in, dim_out)
270
- self.approximate = approximate
271
-
272
- def gelu(self, gate):
273
- if gate.device.type != "mps":
274
- return F.gelu(gate, approximate=self.approximate)
275
- # mps: gelu is not implemented for float16
276
- return F.gelu(gate.to(dtype=torch.float32), approximate=self.approximate).to(dtype=gate.dtype)
277
-
278
- def forward(self, hidden_states):
279
- hidden_states = self.proj(hidden_states)
280
- hidden_states = self.gelu(hidden_states)
281
- return hidden_states
282
-
283
-
284
- class GEGLU(nn.Module):
285
- r"""
286
- A variant of the gated linear unit activation function from https://arxiv.org/abs/2002.05202.
287
-
288
- Parameters:
289
- dim_in (`int`): The number of channels in the input.
290
- dim_out (`int`): The number of channels in the output.
291
- """
292
-
293
- def __init__(self, dim_in: int, dim_out: int):
294
- super().__init__()
295
- self.proj = nn.Linear(dim_in, dim_out * 2)
296
-
297
- def gelu(self, gate):
298
- if gate.device.type != "mps":
299
- return F.gelu(gate)
300
- # mps: gelu is not implemented for float16
301
- return F.gelu(gate.to(dtype=torch.float32)).to(dtype=gate.dtype)
302
-
303
- def forward(self, hidden_states):
304
- hidden_states, gate = self.proj(hidden_states).chunk(2, dim=-1)
305
- return hidden_states * self.gelu(gate)
306
-
307
-
308
- class ApproximateGELU(nn.Module):
309
- """
310
- The approximate form of Gaussian Error Linear Unit (GELU)
311
-
312
- For more details, see section 2: https://arxiv.org/abs/1606.08415
313
- """
314
-
315
- def __init__(self, dim_in: int, dim_out: int):
316
- super().__init__()
317
- self.proj = nn.Linear(dim_in, dim_out)
318
-
319
- def forward(self, x):
320
- x = self.proj(x)
321
- return x * torch.sigmoid(1.702 * x)
322
-
323
-
324
- class AdaLayerNorm(nn.Module):
325
- """
326
- Norm layer modified to incorporate timestep embeddings.
327
- """
328
-
329
- def __init__(self, embedding_dim, num_embeddings):
330
- super().__init__()
331
- self.emb = nn.Embedding(num_embeddings, embedding_dim)
332
- self.silu = nn.SiLU()
333
- self.linear = nn.Linear(embedding_dim, embedding_dim * 2)
334
- self.norm = nn.LayerNorm(embedding_dim, elementwise_affine=False)
335
-
336
- def forward(self, x, timestep):
337
- emb = self.linear(self.silu(self.emb(timestep)))
338
- scale, shift = torch.chunk(emb, 2)
339
- x = self.norm(x) * (1 + scale) + shift
340
- return x
341
-
342
-
343
- class AdaLayerNormZero(nn.Module):
344
- """
345
- Norm layer adaptive layer norm zero (adaLN-Zero).
346
- """
347
-
348
- def __init__(self, embedding_dim, num_embeddings):
349
- super().__init__()
350
-
351
- self.emb = CombinedTimestepLabelEmbeddings(num_embeddings, embedding_dim)
352
-
353
- self.silu = nn.SiLU()
354
- self.linear = nn.Linear(embedding_dim, 6 * embedding_dim, bias=True)
355
- self.norm = nn.LayerNorm(embedding_dim, elementwise_affine=False, eps=1e-6)
356
-
357
- def forward(self, x, timestep, class_labels, hidden_dtype=None):
358
- emb = self.linear(self.silu(self.emb(timestep, class_labels, hidden_dtype=hidden_dtype)))
359
- shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = emb.chunk(6, dim=1)
360
- x = self.norm(x) * (1 + scale_msa[:, None]) + shift_msa[:, None]
361
- return x, gate_msa, shift_mlp, scale_mlp, gate_mlp
362
-
363
-
364
- class AdaGroupNorm(nn.Module):
365
- """
366
- GroupNorm layer modified to incorporate timestep embeddings.
367
- """
368
-
369
- def __init__(
370
- self, embedding_dim: int, out_dim: int, num_groups: int, act_fn: Optional[str] = None, eps: float = 1e-5
371
- ):
372
- super().__init__()
373
- self.num_groups = num_groups
374
- self.eps = eps
375
-
376
- if act_fn is None:
377
- self.act = None
378
- else:
379
- self.act = get_activation(act_fn)
380
-
381
- self.linear = nn.Linear(embedding_dim, out_dim * 2)
382
-
383
- def forward(self, x, emb):
384
- if self.act:
385
- emb = self.act(emb)
386
- emb = self.linear(emb)
387
- emb = emb[:, :, None, None]
388
- scale, shift = emb.chunk(2, dim=1)
389
-
390
- x = F.group_norm(x, self.num_groups, eps=self.eps)
391
- x = x * (1 + scale) + shift
392
- return x
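
As a quick numeric check of the chunked feed-forward path in `BasicTransformerBlock` above: running the feed-forward per chunk along the sequence dimension and concatenating matches a single full pass up to floating-point error, so chunking only trades peak memory for extra kernel launches. A self-contained sketch with a stand-in MLP (torch required):

```py
import torch
from torch import nn

torch.manual_seed(0)
ff = nn.Sequential(nn.Linear(16, 64), nn.GELU(), nn.Linear(64, 16))  # stand-in feed-forward
hidden_states = torch.randn(2, 8, 16)  # (batch, sequence, dim)

chunk_size, chunk_dim = 2, 1
num_chunks = hidden_states.shape[chunk_dim] // chunk_size
chunked_out = torch.cat(
    [ff(chunk) for chunk in hidden_states.chunk(num_chunks, dim=chunk_dim)], dim=chunk_dim
)

print(torch.allclose(chunked_out, ff(hidden_states), atol=1e-6))  # True
```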
4DoF/diffusers/models/attention_flax.py DELETED
@@ -1,446 +0,0 @@
1
- # Copyright 2023 The HuggingFace Team. All rights reserved.
2
- #
3
- # Licensed under the Apache License, Version 2.0 (the "License");
4
- # you may not use this file except in compliance with the License.
5
- # You may obtain a copy of the License at
6
- #
7
- # http://www.apache.org/licenses/LICENSE-2.0
8
- #
9
- # Unless required by applicable law or agreed to in writing, software
10
- # distributed under the License is distributed on an "AS IS" BASIS,
11
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- # See the License for the specific language governing permissions and
13
- # limitations under the License.
14
-
15
- import functools
16
- import math
17
-
18
- import flax.linen as nn
19
- import jax
20
- import jax.numpy as jnp
21
-
22
-
23
- def _query_chunk_attention(query, key, value, precision, key_chunk_size: int = 4096):
24
- """Multi-head dot product attention with a limited number of queries."""
25
- num_kv, num_heads, k_features = key.shape[-3:]
26
- v_features = value.shape[-1]
27
- key_chunk_size = min(key_chunk_size, num_kv)
28
- query = query / jnp.sqrt(k_features)
29
-
30
- @functools.partial(jax.checkpoint, prevent_cse=False)
31
- def summarize_chunk(query, key, value):
32
- attn_weights = jnp.einsum("...qhd,...khd->...qhk", query, key, precision=precision)
33
-
34
- max_score = jnp.max(attn_weights, axis=-1, keepdims=True)
35
- max_score = jax.lax.stop_gradient(max_score)
36
- exp_weights = jnp.exp(attn_weights - max_score)
37
-
38
- exp_values = jnp.einsum("...vhf,...qhv->...qhf", value, exp_weights, precision=precision)
39
- max_score = jnp.einsum("...qhk->...qh", max_score)
40
-
41
- return (exp_values, exp_weights.sum(axis=-1), max_score)
42
-
43
- def chunk_scanner(chunk_idx):
44
- # julienne key array
45
- key_chunk = jax.lax.dynamic_slice(
46
- operand=key,
47
- start_indices=[0] * (key.ndim - 3) + [chunk_idx, 0, 0], # [...,k,h,d]
48
- slice_sizes=list(key.shape[:-3]) + [key_chunk_size, num_heads, k_features], # [...,k,h,d]
49
- )
50
-
51
- # julienne value array
52
- value_chunk = jax.lax.dynamic_slice(
53
- operand=value,
54
- start_indices=[0] * (value.ndim - 3) + [chunk_idx, 0, 0], # [...,v,h,d]
55
- slice_sizes=list(value.shape[:-3]) + [key_chunk_size, num_heads, v_features], # [...,v,h,d]
56
- )
57
-
58
- return summarize_chunk(query, key_chunk, value_chunk)
59
-
60
- chunk_values, chunk_weights, chunk_max = jax.lax.map(f=chunk_scanner, xs=jnp.arange(0, num_kv, key_chunk_size))
61
-
62
- global_max = jnp.max(chunk_max, axis=0, keepdims=True)
63
- max_diffs = jnp.exp(chunk_max - global_max)
64
-
65
- chunk_values *= jnp.expand_dims(max_diffs, axis=-1)
66
- chunk_weights *= max_diffs
67
-
68
- all_values = chunk_values.sum(axis=0)
69
- all_weights = jnp.expand_dims(chunk_weights, -1).sum(axis=0)
70
-
71
- return all_values / all_weights
72
-
73
-
74
- def jax_memory_efficient_attention(
75
- query, key, value, precision=jax.lax.Precision.HIGHEST, query_chunk_size: int = 1024, key_chunk_size: int = 4096
76
- ):
77
- r"""
78
- Flax Memory-efficient multi-head dot product attention. https://arxiv.org/abs/2112.05682v2
79
- https://github.com/AminRezaei0x443/memory-efficient-attention
80
-
81
- Args:
82
- query (`jnp.ndarray`): (batch..., query_length, head, query_key_depth_per_head)
83
- key (`jnp.ndarray`): (batch..., key_value_length, head, query_key_depth_per_head)
84
- value (`jnp.ndarray`): (batch..., key_value_length, head, value_depth_per_head)
85
- precision (`jax.lax.Precision`, *optional*, defaults to `jax.lax.Precision.HIGHEST`):
86
- numerical precision for computation
87
- query_chunk_size (`int`, *optional*, defaults to 1024):
88
- chunk size to divide query array value must divide query_length equally without remainder
89
- key_chunk_size (`int`, *optional*, defaults to 4096):
90
- chunk size to divide key and value array value must divide key_value_length equally without remainder
91
-
92
- Returns:
93
- (`jnp.ndarray`) with shape of (batch..., query_length, head, value_depth_per_head)
94
- """
95
- num_q, num_heads, q_features = query.shape[-3:]
96
-
97
- def chunk_scanner(chunk_idx, _):
98
- # julienne query array
99
- query_chunk = jax.lax.dynamic_slice(
100
- operand=query,
101
- start_indices=([0] * (query.ndim - 3)) + [chunk_idx, 0, 0], # [...,q,h,d]
102
- slice_sizes=list(query.shape[:-3]) + [min(query_chunk_size, num_q), num_heads, q_features], # [...,q,h,d]
103
- )
104
-
105
- return (
106
- chunk_idx + query_chunk_size, # unused ignore it
107
- _query_chunk_attention(
108
- query=query_chunk, key=key, value=value, precision=precision, key_chunk_size=key_chunk_size
109
- ),
110
- )
111
-
112
- _, res = jax.lax.scan(
113
- f=chunk_scanner, init=0, xs=None, length=math.ceil(num_q / query_chunk_size) # start counter # stop counter
114
- )
115
-
116
- return jnp.concatenate(res, axis=-3) # fuse the chunked result back
117
-
118
-
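
The per-chunk `max_score` bookkeeping in `_query_chunk_attention` above is a streaming softmax: each chunk's exponentials are later rescaled by `exp(chunk_max - global_max)`, so the combined result equals the full softmax-weighted average. A 1-D numpy sketch of that identity (toy numbers, numpy only):

```py
import numpy as np

scores = np.array([0.3, 2.0, -1.0, 0.7, 1.5, -0.2])
values = np.arange(6, dtype=np.float64)

# full computation: softmax-weighted average over all positions
full = np.exp(scores - scores.max())
expected = (full * values).sum() / full.sum()

# chunked computation with per-chunk maxima, recombined afterwards
chunk_vals, chunk_weights, chunk_max = [], [], []
for s, v in zip(np.split(scores, 3), np.split(values, 3)):
    m = s.max()
    e = np.exp(s - m)
    chunk_vals.append((e * v).sum())
    chunk_weights.append(e.sum())
    chunk_max.append(m)

chunk_vals, chunk_weights, chunk_max = map(np.array, (chunk_vals, chunk_weights, chunk_max))
scale = np.exp(chunk_max - chunk_max.max())
result = (chunk_vals * scale).sum() / (chunk_weights * scale).sum()
print(np.isclose(result, expected))  # True
```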
119
- class FlaxAttention(nn.Module):
120
- r"""
121
- A Flax multi-head attention module as described in: https://arxiv.org/abs/1706.03762
122
-
123
- Parameters:
124
- query_dim (:obj:`int`):
125
- Input hidden states dimension
126
- heads (:obj:`int`, *optional*, defaults to 8):
127
- Number of heads
128
- dim_head (:obj:`int`, *optional*, defaults to 64):
129
- Hidden states dimension inside each head
130
- dropout (:obj:`float`, *optional*, defaults to 0.0):
131
- Dropout rate
132
- use_memory_efficient_attention (`bool`, *optional*, defaults to `False`):
133
- enable memory efficient attention https://arxiv.org/abs/2112.05682
134
- dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32):
135
- Parameters `dtype`
136
-
137
- """
138
- query_dim: int
139
- heads: int = 8
140
- dim_head: int = 64
141
- dropout: float = 0.0
142
- use_memory_efficient_attention: bool = False
143
- dtype: jnp.dtype = jnp.float32
144
-
145
- def setup(self):
146
- inner_dim = self.dim_head * self.heads
147
- self.scale = self.dim_head**-0.5
148
-
149
- # Weights were exported with old names {to_q, to_k, to_v, to_out}
150
- self.query = nn.Dense(inner_dim, use_bias=False, dtype=self.dtype, name="to_q")
151
- self.key = nn.Dense(inner_dim, use_bias=False, dtype=self.dtype, name="to_k")
152
- self.value = nn.Dense(inner_dim, use_bias=False, dtype=self.dtype, name="to_v")
153
-
154
- self.proj_attn = nn.Dense(self.query_dim, dtype=self.dtype, name="to_out_0")
155
- self.dropout_layer = nn.Dropout(rate=self.dropout)
156
-
157
- def reshape_heads_to_batch_dim(self, tensor):
158
- batch_size, seq_len, dim = tensor.shape
159
- head_size = self.heads
160
- tensor = tensor.reshape(batch_size, seq_len, head_size, dim // head_size)
161
- tensor = jnp.transpose(tensor, (0, 2, 1, 3))
162
- tensor = tensor.reshape(batch_size * head_size, seq_len, dim // head_size)
163
- return tensor
164
-
165
- def reshape_batch_dim_to_heads(self, tensor):
166
- batch_size, seq_len, dim = tensor.shape
167
- head_size = self.heads
168
- tensor = tensor.reshape(batch_size // head_size, head_size, seq_len, dim)
169
- tensor = jnp.transpose(tensor, (0, 2, 1, 3))
170
- tensor = tensor.reshape(batch_size // head_size, seq_len, dim * head_size)
171
- return tensor
172
-
173
- def __call__(self, hidden_states, context=None, deterministic=True):
174
- context = hidden_states if context is None else context
175
-
176
- query_proj = self.query(hidden_states)
177
- key_proj = self.key(context)
178
- value_proj = self.value(context)
179
-
180
- query_states = self.reshape_heads_to_batch_dim(query_proj)
181
- key_states = self.reshape_heads_to_batch_dim(key_proj)
182
- value_states = self.reshape_heads_to_batch_dim(value_proj)
183
-
184
- if self.use_memory_efficient_attention:
185
- query_states = query_states.transpose(1, 0, 2)
186
- key_states = key_states.transpose(1, 0, 2)
187
- value_states = value_states.transpose(1, 0, 2)
188
-
189
- # this if statement create a chunk size for each layer of the unet
190
- # the chunk size is equal to the query_length dimension of the deepest layer of the unet
191
-
192
- flatten_latent_dim = query_states.shape[-3]
193
- if flatten_latent_dim % 64 == 0:
194
- query_chunk_size = int(flatten_latent_dim / 64)
195
- elif flatten_latent_dim % 16 == 0:
196
- query_chunk_size = int(flatten_latent_dim / 16)
197
- elif flatten_latent_dim % 4 == 0:
198
- query_chunk_size = int(flatten_latent_dim / 4)
199
- else:
200
- query_chunk_size = int(flatten_latent_dim)
201
-
202
- hidden_states = jax_memory_efficient_attention(
203
- query_states, key_states, value_states, query_chunk_size=query_chunk_size, key_chunk_size=4096 * 4
204
- )
205
-
206
- hidden_states = hidden_states.transpose(1, 0, 2)
207
- else:
208
- # compute attentions
209
- attention_scores = jnp.einsum("b i d, b j d->b i j", query_states, key_states)
210
- attention_scores = attention_scores * self.scale
211
- attention_probs = nn.softmax(attention_scores, axis=2)
212
-
213
- # attend to values
214
- hidden_states = jnp.einsum("b i j, b j d -> b i d", attention_probs, value_states)
215
-
216
- hidden_states = self.reshape_batch_dim_to_heads(hidden_states)
217
- hidden_states = self.proj_attn(hidden_states)
218
- return self.dropout_layer(hidden_states, deterministic=deterministic)
219
-
220
-
221
- class FlaxBasicTransformerBlock(nn.Module):
222
- r"""
223
- A Flax transformer block layer with `GLU` (Gated Linear Unit) activation function as described in:
224
- https://arxiv.org/abs/1706.03762
225
-
226
-
227
- Parameters:
228
- dim (:obj:`int`):
229
- Inner hidden states dimension
230
- n_heads (:obj:`int`):
231
- Number of heads
232
- d_head (:obj:`int`):
233
- Hidden states dimension inside each head
234
- dropout (:obj:`float`, *optional*, defaults to 0.0):
235
- Dropout rate
236
- only_cross_attention (`bool`, defaults to `False`):
237
- Whether to only apply cross attention.
238
- dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32):
239
- Parameters `dtype`
240
- use_memory_efficient_attention (`bool`, *optional*, defaults to `False`):
241
- enable memory efficient attention https://arxiv.org/abs/2112.05682
242
- """
243
- dim: int
244
- n_heads: int
245
- d_head: int
246
- dropout: float = 0.0
247
- only_cross_attention: bool = False
248
- dtype: jnp.dtype = jnp.float32
249
- use_memory_efficient_attention: bool = False
250
-
251
- def setup(self):
252
- # self attention (or cross_attention if only_cross_attention is True)
253
- self.attn1 = FlaxAttention(
254
- self.dim, self.n_heads, self.d_head, self.dropout, self.use_memory_efficient_attention, dtype=self.dtype
255
- )
256
- # cross attention
257
- self.attn2 = FlaxAttention(
258
- self.dim, self.n_heads, self.d_head, self.dropout, self.use_memory_efficient_attention, dtype=self.dtype
259
- )
260
- self.ff = FlaxFeedForward(dim=self.dim, dropout=self.dropout, dtype=self.dtype)
261
- self.norm1 = nn.LayerNorm(epsilon=1e-5, dtype=self.dtype)
262
- self.norm2 = nn.LayerNorm(epsilon=1e-5, dtype=self.dtype)
263
- self.norm3 = nn.LayerNorm(epsilon=1e-5, dtype=self.dtype)
264
- self.dropout_layer = nn.Dropout(rate=self.dropout)
265
-
266
- def __call__(self, hidden_states, context, deterministic=True):
267
- # self attention
268
- residual = hidden_states
269
- if self.only_cross_attention:
270
- hidden_states = self.attn1(self.norm1(hidden_states), context, deterministic=deterministic)
271
- else:
272
- hidden_states = self.attn1(self.norm1(hidden_states), deterministic=deterministic)
273
- hidden_states = hidden_states + residual
274
-
275
- # cross attention
276
- residual = hidden_states
277
- hidden_states = self.attn2(self.norm2(hidden_states), context, deterministic=deterministic)
278
- hidden_states = hidden_states + residual
279
-
280
- # feed forward
281
- residual = hidden_states
282
- hidden_states = self.ff(self.norm3(hidden_states), deterministic=deterministic)
283
- hidden_states = hidden_states + residual
284
-
285
- return self.dropout_layer(hidden_states, deterministic=deterministic)
286
-
287
-
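A minimal usage sketch of `FlaxBasicTransformerBlock` as defined above, assuming `jax` and `flax` are installed; all shapes and hyperparameters are illustrative, not values used by the pipeline:

```python
import jax
import jax.numpy as jnp

# Toy sizes: inner dim 64 split over 4 heads of 16 channels each.
block = FlaxBasicTransformerBlock(dim=64, n_heads=4, d_head=16)

hidden_states = jnp.ones((2, 16, 64))  # (batch, image tokens, dim)
context = jnp.ones((2, 8, 64))         # (batch, context tokens, context dim)

# deterministic=True by default, so only the 'params' RNG is needed.
params = block.init(jax.random.PRNGKey(0), hidden_states, context)
out = block.apply(params, hidden_states, context)
assert out.shape == hidden_states.shape  # residual sub-blocks preserve the shape
```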
288
- class FlaxTransformer2DModel(nn.Module):
289
- r"""
290
- A Spatial Transformer layer with Gated Linear Unit (GLU) activation function as described in:
291
- https://arxiv.org/pdf/1506.02025.pdf
292
-
293
-
294
- Parameters:
295
- in_channels (:obj:`int`):
296
- Input number of channels
297
- n_heads (:obj:`int`):
298
- Number of heads
299
- d_head (:obj:`int`):
300
- Hidden states dimension inside each head
301
- depth (:obj:`int`, *optional*, defaults to 1):
302
- Number of transformers block
303
- dropout (:obj:`float`, *optional*, defaults to 0.0):
304
- Dropout rate
305
- use_linear_projection (`bool`, defaults to `False`): Whether to use a linear (`nn.Dense`) projection instead of a 1x1 convolution for the input and output projections.
306
- only_cross_attention (`bool`, defaults to `False`): Whether the transformer blocks should only apply cross attention.
307
- dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32):
308
- Parameters `dtype`
309
- use_memory_efficient_attention (`bool`, *optional*, defaults to `False`):
310
- enable memory efficient attention https://arxiv.org/abs/2112.05682
311
- """
312
- in_channels: int
313
- n_heads: int
314
- d_head: int
315
- depth: int = 1
316
- dropout: float = 0.0
317
- use_linear_projection: bool = False
318
- only_cross_attention: bool = False
319
- dtype: jnp.dtype = jnp.float32
320
- use_memory_efficient_attention: bool = False
321
-
322
- def setup(self):
323
- self.norm = nn.GroupNorm(num_groups=32, epsilon=1e-5)
324
-
325
- inner_dim = self.n_heads * self.d_head
326
- if self.use_linear_projection:
327
- self.proj_in = nn.Dense(inner_dim, dtype=self.dtype)
328
- else:
329
- self.proj_in = nn.Conv(
330
- inner_dim,
331
- kernel_size=(1, 1),
332
- strides=(1, 1),
333
- padding="VALID",
334
- dtype=self.dtype,
335
- )
336
-
337
- self.transformer_blocks = [
338
- FlaxBasicTransformerBlock(
339
- inner_dim,
340
- self.n_heads,
341
- self.d_head,
342
- dropout=self.dropout,
343
- only_cross_attention=self.only_cross_attention,
344
- dtype=self.dtype,
345
- use_memory_efficient_attention=self.use_memory_efficient_attention,
346
- )
347
- for _ in range(self.depth)
348
- ]
349
-
350
- if self.use_linear_projection:
351
- self.proj_out = nn.Dense(inner_dim, dtype=self.dtype)
352
- else:
353
- self.proj_out = nn.Conv(
354
- inner_dim,
355
- kernel_size=(1, 1),
356
- strides=(1, 1),
357
- padding="VALID",
358
- dtype=self.dtype,
359
- )
360
-
361
- self.dropout_layer = nn.Dropout(rate=self.dropout)
362
-
363
- def __call__(self, hidden_states, context, deterministic=True):
364
- batch, height, width, channels = hidden_states.shape
365
- residual = hidden_states
366
- hidden_states = self.norm(hidden_states)
367
- if self.use_linear_projection:
368
- hidden_states = hidden_states.reshape(batch, height * width, channels)
369
- hidden_states = self.proj_in(hidden_states)
370
- else:
371
- hidden_states = self.proj_in(hidden_states)
372
- hidden_states = hidden_states.reshape(batch, height * width, channels)
373
-
374
- for transformer_block in self.transformer_blocks:
375
- hidden_states = transformer_block(hidden_states, context, deterministic=deterministic)
376
-
377
- if self.use_linear_projection:
378
- hidden_states = self.proj_out(hidden_states)
379
- hidden_states = hidden_states.reshape(batch, height, width, channels)
380
- else:
381
- hidden_states = hidden_states.reshape(batch, height, width, channels)
382
- hidden_states = self.proj_out(hidden_states)
383
-
384
- hidden_states = hidden_states + residual
385
- return self.dropout_layer(hidden_states, deterministic=deterministic)
386
-
387
-
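Likewise, a hedged sketch of how `FlaxTransformer2DModel` flattens an NHWC feature map into tokens and folds it back. Sizes are illustrative, chosen so that `in_channels == n_heads * d_head` (needed for the residual add) and the 32 GroupNorm groups divide the channel count:

```python
import jax
import jax.numpy as jnp

model = FlaxTransformer2DModel(in_channels=64, n_heads=4, d_head=16, depth=1)

feature_map = jnp.ones((2, 8, 8, 64))  # (batch, height, width, channels), NHWC
context = jnp.ones((2, 8, 64))         # (batch, context tokens, context dim)

params = model.init(jax.random.PRNGKey(0), feature_map, context)
out = model.apply(params, feature_map, context)
assert out.shape == feature_map.shape  # 8*8 tokens are reshaped back to 8x8
```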
388
- class FlaxFeedForward(nn.Module):
389
- r"""
390
- Flax module that encapsulates two Linear layers separated by a non-linearity. It is the counterpart of PyTorch's
391
- [`FeedForward`] class, with the following simplifications:
392
- - The activation function is currently hardcoded to a gated linear unit from:
393
- https://arxiv.org/abs/2002.05202
394
- - `dim_out` is equal to `dim`.
395
- - The number of hidden dimensions is hardcoded to `dim * 4` in [`FlaxGELU`].
396
-
397
- Parameters:
398
- dim (:obj:`int`):
399
- Inner hidden states dimension
400
- dropout (:obj:`float`, *optional*, defaults to 0.0):
401
- Dropout rate
402
- dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32):
403
- Parameters `dtype`
404
- """
405
- dim: int
406
- dropout: float = 0.0
407
- dtype: jnp.dtype = jnp.float32
408
-
409
- def setup(self):
410
- # The second linear layer needs to be called
411
- # net_2 for now to match the index of the Sequential layer
412
- self.net_0 = FlaxGEGLU(self.dim, self.dropout, self.dtype)
413
- self.net_2 = nn.Dense(self.dim, dtype=self.dtype)
414
-
415
- def __call__(self, hidden_states, deterministic=True):
416
- hidden_states = self.net_0(hidden_states, deterministic=deterministic)
417
- hidden_states = self.net_2(hidden_states)
418
- return hidden_states
419
-
420
-
421
- class FlaxGEGLU(nn.Module):
422
- r"""
423
- Flax implementation of a Linear layer followed by the variant of the gated linear unit activation function from
424
- https://arxiv.org/abs/2002.05202.
425
-
426
- Parameters:
427
- dim (:obj:`int`):
428
- Input hidden states dimension
429
- dropout (:obj:`float`, *optional*, defaults to 0.0):
430
- Dropout rate
431
- dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32):
432
- Parameters `dtype`
433
- """
434
- dim: int
435
- dropout: float = 0.0
436
- dtype: jnp.dtype = jnp.float32
437
-
438
- def setup(self):
439
- inner_dim = self.dim * 4
440
- self.proj = nn.Dense(inner_dim * 2, dtype=self.dtype)
441
- self.dropout_layer = nn.Dropout(rate=self.dropout)
442
-
443
- def __call__(self, hidden_states, deterministic=True):
444
- hidden_states = self.proj(hidden_states)
445
- hidden_linear, hidden_gelu = jnp.split(hidden_states, 2, axis=2)
446
- return self.dropout_layer(hidden_linear * nn.gelu(hidden_gelu), deterministic=deterministic)
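The gating in `FlaxGEGLU` boils down to a split followed by a GELU-gated product; a small sketch of just that computation outside the module, with toy shapes:

```python
import jax.numpy as jnp
from flax import linen as nn

# GEGLU on a (batch, tokens, 2 * inner) projection: one half passes through,
# the other half gates it via GELU, giving a (batch, tokens, inner) output.
projected = jnp.ones((2, 16, 2 * 256))                      # after self.proj
hidden_linear, hidden_gelu = jnp.split(projected, 2, axis=2)
gated = hidden_linear * nn.gelu(hidden_gelu)
assert gated.shape == (2, 16, 256)
```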
4DoF/diffusers/models/attention_processor.py DELETED
@@ -1,1714 +0,0 @@
1
- # Copyright 2023 The HuggingFace Team. All rights reserved.
2
- #
3
- # Licensed under the Apache License, Version 2.0 (the "License");
4
- # you may not use this file except in compliance with the License.
5
- # You may obtain a copy of the License at
6
- #
7
- # http://www.apache.org/licenses/LICENSE-2.0
8
- #
9
- # Unless required by applicable law or agreed to in writing, software
10
- # distributed under the License is distributed on an "AS IS" BASIS,
11
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- # See the License for the specific language governing permissions and
13
- # limitations under the License.
14
- from typing import Callable, Optional, Union
15
-
16
- import torch
17
- import torch.nn.functional as F
18
- from torch import nn
19
-
20
- from ..utils import deprecate, logging, maybe_allow_in_graph
21
- from ..utils.import_utils import is_xformers_available
22
-
23
-
24
- logger = logging.get_logger(__name__) # pylint: disable=invalid-name
25
-
26
-
27
- if is_xformers_available():
28
- import xformers
29
- import xformers.ops
30
- else:
31
- xformers = None
32
-
33
-
34
- # 4DoF CaPE
35
- import einops
36
- def rotate_every_two(x):
37
- x = einops.rearrange(x, '... (d j) -> ... d j', j=2)
38
- x1, x2 = x.unbind(dim=-1)
39
- x = torch.stack((-x2, x1), dim=-1)
40
- return einops.rearrange(x, '... d j -> ... (d j)')
41
-
42
- def cape(x, p):
43
- d, l, n = x.shape[-1], p.shape[-2], p.shape[-1]
44
- assert d % (2 * n) == 0
45
- m = einops.repeat(p, 'b l n -> b l (n k)', k=d // n)
46
- return m
47
-
48
- def cape_embed(p1, p2, qq, kk):
49
- """
50
- Embed camera position encoding into attention map
51
- Args:
52
- p1: query pose b, l_q, pose_dim
53
- p2: key pose b, l_k, pose_dim
54
- qq: query feature map b, l_q, feature_dim
55
- kk: key feature map b, l_k, feature_dim
56
-
57
- Returns: CaPE-rotated query and key, b, l_q, feature_dim and b, l_k, feature_dim; their dot product gives the pose-embedded attention map b, l_q, l_k
58
-
59
- """
60
- assert p1.shape[-1] == p2.shape[-1]
61
- assert qq.shape[-1] == kk.shape[-1]
62
- assert p1.shape[0] == p2.shape[0] == qq.shape[0] == kk.shape[0]
63
- assert p1.shape[1] == qq.shape[1]
64
- assert p2.shape[1] == kk.shape[1]
65
-
66
- m1 = cape(qq, p1)
67
- m2 = cape(kk, p2)
68
-
69
- q = (qq * m1.cos()) + (rotate_every_two(qq) * m1.sin())
70
- k = (kk * m2.cos()) + (rotate_every_two(kk) * m2.sin())
71
-
72
- return q, k
73
-
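To make the tensor shapes of the 4DoF CaPE helpers above concrete, a hedged sketch with dummy tensors; the sizes are illustrative and only need to satisfy the `d % (2 * n) == 0` assertion inside `cape`:

```python
import torch

b, l_q, l_k, feat_dim, pose_dim = 2, 16, 8, 64, 4   # 64 % (2 * 4) == 0

p_q = torch.randn(b, l_q, pose_dim)   # query poses
p_k = torch.randn(b, l_k, pose_dim)   # key poses
q = torch.randn(b, l_q, feat_dim)     # query features
k = torch.randn(b, l_k, feat_dim)     # key features

q_rot, k_rot = cape_embed(p_q, p_k, q, k)
assert q_rot.shape == q.shape and k_rot.shape == k.shape

# Each feature pair is rotated by a pose-dependent angle, so norms are
# preserved and only relative pose enters the attention logits q_rot @ k_rot^T.
assert torch.allclose(q_rot.norm(dim=-1), q.norm(dim=-1), atol=1e-4)
```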
74
- @maybe_allow_in_graph
75
- class Attention(nn.Module):
76
- r"""
77
- A cross attention layer.
78
-
79
- Parameters:
80
- query_dim (`int`): The number of channels in the query.
81
- cross_attention_dim (`int`, *optional*):
82
- The number of channels in the encoder_hidden_states. If not given, defaults to `query_dim`.
83
- heads (`int`, *optional*, defaults to 8): The number of heads to use for multi-head attention.
84
- dim_head (`int`, *optional*, defaults to 64): The number of channels in each head.
85
- dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
86
- bias (`bool`, *optional*, defaults to False):
87
- Set to `True` for the query, key, and value linear layers to contain a bias parameter.
88
- """
89
-
90
- def __init__(
91
- self,
92
- query_dim: int,
93
- cross_attention_dim: Optional[int] = None,
94
- heads: int = 8,
95
- dim_head: int = 64,
96
- dropout: float = 0.0,
97
- bias=False,
98
- upcast_attention: bool = False,
99
- upcast_softmax: bool = False,
100
- cross_attention_norm: Optional[str] = None,
101
- cross_attention_norm_num_groups: int = 32,
102
- added_kv_proj_dim: Optional[int] = None,
103
- norm_num_groups: Optional[int] = None,
104
- spatial_norm_dim: Optional[int] = None,
105
- out_bias: bool = True,
106
- scale_qk: bool = True,
107
- only_cross_attention: bool = False,
108
- eps: float = 1e-5,
109
- rescale_output_factor: float = 1.0,
110
- residual_connection: bool = False,
111
- _from_deprecated_attn_block=False,
112
- processor: Optional["AttnProcessor"] = None,
113
- ):
114
- super().__init__()
115
- inner_dim = dim_head * heads
116
- cross_attention_dim = cross_attention_dim if cross_attention_dim is not None else query_dim
117
- self.upcast_attention = upcast_attention
118
- self.upcast_softmax = upcast_softmax
119
- self.rescale_output_factor = rescale_output_factor
120
- self.residual_connection = residual_connection
121
- self.dropout = dropout
122
-
123
- # we make use of this private variable to know whether this class is loaded
124
- # with a deprecated state dict so that we can convert it on the fly
125
- self._from_deprecated_attn_block = _from_deprecated_attn_block
126
-
127
- self.scale_qk = scale_qk
128
- self.scale = dim_head**-0.5 if self.scale_qk else 1.0
129
-
130
- self.heads = heads
131
- # for slice_size > 0 the attention score computation
132
- # is split across the batch axis to save memory
133
- # You can set slice_size with `set_attention_slice`
134
- self.sliceable_head_dim = heads
135
-
136
- self.added_kv_proj_dim = added_kv_proj_dim
137
- self.only_cross_attention = only_cross_attention
138
-
139
- if self.added_kv_proj_dim is None and self.only_cross_attention:
140
- raise ValueError(
141
- "`only_cross_attention` can only be set to True if `added_kv_proj_dim` is not None. Make sure to set either `only_cross_attention=False` or define `added_kv_proj_dim`."
142
- )
143
-
144
- if norm_num_groups is not None:
145
- self.group_norm = nn.GroupNorm(num_channels=query_dim, num_groups=norm_num_groups, eps=eps, affine=True)
146
- else:
147
- self.group_norm = None
148
-
149
- if spatial_norm_dim is not None:
150
- self.spatial_norm = SpatialNorm(f_channels=query_dim, zq_channels=spatial_norm_dim)
151
- else:
152
- self.spatial_norm = None
153
-
154
- if cross_attention_norm is None:
155
- self.norm_cross = None
156
- elif cross_attention_norm == "layer_norm":
157
- self.norm_cross = nn.LayerNorm(cross_attention_dim)
158
- elif cross_attention_norm == "group_norm":
159
- if self.added_kv_proj_dim is not None:
160
- # The given `encoder_hidden_states` are initially of shape
161
- # (batch_size, seq_len, added_kv_proj_dim) before being projected
162
- # to (batch_size, seq_len, cross_attention_dim). The norm is applied
163
- # before the projection, so we need to use `added_kv_proj_dim` as
164
- # the number of channels for the group norm.
165
- norm_cross_num_channels = added_kv_proj_dim
166
- else:
167
- norm_cross_num_channels = cross_attention_dim
168
-
169
- self.norm_cross = nn.GroupNorm(
170
- num_channels=norm_cross_num_channels, num_groups=cross_attention_norm_num_groups, eps=1e-5, affine=True
171
- )
172
- else:
173
- raise ValueError(
174
- f"unknown cross_attention_norm: {cross_attention_norm}. Should be None, 'layer_norm' or 'group_norm'"
175
- )
176
-
177
- self.to_q = nn.Linear(query_dim, inner_dim, bias=bias)
178
-
179
- if not self.only_cross_attention:
180
- # only relevant for the `AddedKVProcessor` classes
181
- self.to_k = nn.Linear(cross_attention_dim, inner_dim, bias=bias)
182
- self.to_v = nn.Linear(cross_attention_dim, inner_dim, bias=bias)
183
- else:
184
- self.to_k = None
185
- self.to_v = None
186
-
187
- if self.added_kv_proj_dim is not None:
188
- self.add_k_proj = nn.Linear(added_kv_proj_dim, inner_dim)
189
- self.add_v_proj = nn.Linear(added_kv_proj_dim, inner_dim)
190
-
191
- self.to_out = nn.ModuleList([])
192
- self.to_out.append(nn.Linear(inner_dim, query_dim, bias=out_bias))
193
- self.to_out.append(nn.Dropout(dropout))
194
-
195
- # set attention processor
196
- # We use the AttnProcessor2_0 by default when torch 2.x is used which uses
197
- # torch.nn.functional.scaled_dot_product_attention for native Flash/memory_efficient_attention
198
- # but only if it has the default `scale` argument. TODO remove scale_qk check when we move to torch 2.1
199
- if processor is None:
200
- processor = (
201
- AttnProcessor2_0() if hasattr(F, "scaled_dot_product_attention") and self.scale_qk else AttnProcessor()
202
- )
203
- self.set_processor(processor)
204
-
205
- def set_use_memory_efficient_attention_xformers(
206
- self, use_memory_efficient_attention_xformers: bool, attention_op: Optional[Callable] = None
207
- ):
208
- is_lora = hasattr(self, "processor") and isinstance(
209
- self.processor,
210
- (LoRAAttnProcessor, LoRAAttnProcessor2_0, LoRAXFormersAttnProcessor, LoRAAttnAddedKVProcessor),
211
- )
212
- is_custom_diffusion = hasattr(self, "processor") and isinstance(
213
- self.processor, (CustomDiffusionAttnProcessor, CustomDiffusionXFormersAttnProcessor)
214
- )
215
- is_added_kv_processor = hasattr(self, "processor") and isinstance(
216
- self.processor,
217
- (
218
- AttnAddedKVProcessor,
219
- AttnAddedKVProcessor2_0,
220
- SlicedAttnAddedKVProcessor,
221
- XFormersAttnAddedKVProcessor,
222
- LoRAAttnAddedKVProcessor,
223
- ),
224
- )
225
-
226
- if use_memory_efficient_attention_xformers:
227
- if is_added_kv_processor and (is_lora or is_custom_diffusion):
228
- raise NotImplementedError(
229
- f"Memory efficient attention is currently not supported for LoRA or custom diffuson for attention processor type {self.processor}"
230
- )
231
- if not is_xformers_available():
232
- raise ModuleNotFoundError(
233
- (
234
- "Refer to https://github.com/facebookresearch/xformers for more information on how to install"
235
- " xformers"
236
- ),
237
- name="xformers",
238
- )
239
- elif not torch.cuda.is_available():
240
- raise ValueError(
241
- "torch.cuda.is_available() should be True but is False. xformers' memory efficient attention is"
242
- " only available for GPU "
243
- )
244
- else:
245
- try:
246
- # Make sure we can run the memory efficient attention
247
- _ = xformers.ops.memory_efficient_attention(
248
- torch.randn((1, 2, 40), device="cuda"),
249
- torch.randn((1, 2, 40), device="cuda"),
250
- torch.randn((1, 2, 40), device="cuda"),
251
- )
252
- except Exception as e:
253
- raise e
254
-
255
- if is_lora:
256
- # TODO (sayakpaul): should we throw a warning if someone wants to use the xformers
257
- # variant when using PT 2.0 now that we have LoRAAttnProcessor2_0?
258
- processor = LoRAXFormersAttnProcessor(
259
- hidden_size=self.processor.hidden_size,
260
- cross_attention_dim=self.processor.cross_attention_dim,
261
- rank=self.processor.rank,
262
- attention_op=attention_op,
263
- )
264
- processor.load_state_dict(self.processor.state_dict())
265
- processor.to(self.processor.to_q_lora.up.weight.device)
266
- elif is_custom_diffusion:
267
- processor = CustomDiffusionXFormersAttnProcessor(
268
- train_kv=self.processor.train_kv,
269
- train_q_out=self.processor.train_q_out,
270
- hidden_size=self.processor.hidden_size,
271
- cross_attention_dim=self.processor.cross_attention_dim,
272
- attention_op=attention_op,
273
- )
274
- processor.load_state_dict(self.processor.state_dict())
275
- if hasattr(self.processor, "to_k_custom_diffusion"):
276
- processor.to(self.processor.to_k_custom_diffusion.weight.device)
277
- elif is_added_kv_processor:
278
- # TODO(Patrick, Suraj, William) - currently xformers doesn't work for UnCLIP
279
- # which uses this type of cross attention ONLY because the attention mask of format
280
- # [0, ..., -10.000, ..., 0, ...,] is not supported
281
- # throw warning
282
- logger.info(
283
- "Memory efficient attention with `xformers` might currently not work correctly if an attention mask is required for the attention operation."
284
- )
285
- processor = XFormersAttnAddedKVProcessor(attention_op=attention_op)
286
- else:
287
- processor = XFormersAttnProcessor(attention_op=attention_op)
288
- else:
289
- if is_lora:
290
- attn_processor_class = (
291
- LoRAAttnProcessor2_0 if hasattr(F, "scaled_dot_product_attention") else LoRAAttnProcessor
292
- )
293
- processor = attn_processor_class(
294
- hidden_size=self.processor.hidden_size,
295
- cross_attention_dim=self.processor.cross_attention_dim,
296
- rank=self.processor.rank,
297
- )
298
- processor.load_state_dict(self.processor.state_dict())
299
- processor.to(self.processor.to_q_lora.up.weight.device)
300
- elif is_custom_diffusion:
301
- processor = CustomDiffusionAttnProcessor(
302
- train_kv=self.processor.train_kv,
303
- train_q_out=self.processor.train_q_out,
304
- hidden_size=self.processor.hidden_size,
305
- cross_attention_dim=self.processor.cross_attention_dim,
306
- )
307
- processor.load_state_dict(self.processor.state_dict())
308
- if hasattr(self.processor, "to_k_custom_diffusion"):
309
- processor.to(self.processor.to_k_custom_diffusion.weight.device)
310
- else:
311
- # set attention processor
312
- # We use the AttnProcessor2_0 by default when torch 2.x is used which uses
313
- # torch.nn.functional.scaled_dot_product_attention for native Flash/memory_efficient_attention
314
- # but only if it has the default `scale` argument. TODO remove scale_qk check when we move to torch 2.1
315
- processor = (
316
- AttnProcessor2_0()
317
- if hasattr(F, "scaled_dot_product_attention") and self.scale_qk
318
- else AttnProcessor()
319
- )
320
-
321
- self.set_processor(processor)
322
-
323
- def set_attention_slice(self, slice_size):
324
- if slice_size is not None and slice_size > self.sliceable_head_dim:
325
- raise ValueError(f"slice_size {slice_size} has to be smaller or equal to {self.sliceable_head_dim}.")
326
-
327
- if slice_size is not None and self.added_kv_proj_dim is not None:
328
- processor = SlicedAttnAddedKVProcessor(slice_size)
329
- elif slice_size is not None:
330
- processor = SlicedAttnProcessor(slice_size)
331
- elif self.added_kv_proj_dim is not None:
332
- processor = AttnAddedKVProcessor()
333
- else:
334
- # set attention processor
335
- # We use the AttnProcessor2_0 by default when torch 2.x is used which uses
336
- # torch.nn.functional.scaled_dot_product_attention for native Flash/memory_efficient_attention
337
- # but only if it has the default `scale` argument. TODO remove scale_qk check when we move to torch 2.1
338
- processor = (
339
- AttnProcessor2_0() if hasattr(F, "scaled_dot_product_attention") and self.scale_qk else AttnProcessor()
340
- )
341
-
342
- self.set_processor(processor)
343
-
344
- def set_processor(self, processor: "AttnProcessor"):
345
- # if current processor is in `self._modules` and if passed `processor` is not, we need to
346
- # pop `processor` from `self._modules`
347
- if (
348
- hasattr(self, "processor")
349
- and isinstance(self.processor, torch.nn.Module)
350
- and not isinstance(processor, torch.nn.Module)
351
- ):
352
- logger.info(f"You are removing possibly trained weights of {self.processor} with {processor}")
353
- self._modules.pop("processor")
354
-
355
- self.processor = processor
356
-
357
- def forward(self, hidden_states, encoder_hidden_states=None, attention_mask=None, **cross_attention_kwargs):
358
- # The `Attention` class can call different attention processors / attention functions
359
- # here we simply pass along all tensors to the selected processor class
360
- # For standard processors that are defined here, `**cross_attention_kwargs` is empty
361
- return self.processor(
362
- self,
363
- hidden_states,
364
- encoder_hidden_states=encoder_hidden_states,
365
- attention_mask=attention_mask,
366
- **cross_attention_kwargs,
367
- )
368
-
369
- def batch_to_head_dim(self, tensor):
370
- head_size = self.heads
371
- batch_size, seq_len, dim = tensor.shape
372
- tensor = tensor.reshape(batch_size // head_size, head_size, seq_len, dim)
373
- tensor = tensor.permute(0, 2, 1, 3).reshape(batch_size // head_size, seq_len, dim * head_size)
374
- return tensor
375
-
376
- def head_to_batch_dim(self, tensor, out_dim=3):
377
- head_size = self.heads
378
- batch_size, seq_len, dim = tensor.shape
379
- tensor = tensor.reshape(batch_size, seq_len, head_size, dim // head_size)
380
- tensor = tensor.permute(0, 2, 1, 3)
381
-
382
- if out_dim == 3:
383
- tensor = tensor.reshape(batch_size * head_size, seq_len, dim // head_size)
384
-
385
- return tensor
386
-
387
- def get_attention_scores(self, query, key, attention_mask=None):
388
- dtype = query.dtype
389
- if self.upcast_attention:
390
- query = query.float()
391
- key = key.float()
392
-
393
- if attention_mask is None:
394
- baddbmm_input = torch.empty(
395
- query.shape[0], query.shape[1], key.shape[1], dtype=query.dtype, device=query.device
396
- )
397
- beta = 0
398
- else:
399
- baddbmm_input = attention_mask
400
- beta = 1
401
-
402
- attention_scores = torch.baddbmm(
403
- baddbmm_input,
404
- query,
405
- key.transpose(-1, -2),
406
- beta=beta,
407
- alpha=self.scale,
408
- )
409
- del baddbmm_input
410
-
411
- if self.upcast_softmax:
412
- attention_scores = attention_scores.float()
413
-
414
- attention_probs = attention_scores.softmax(dim=-1)
415
- del attention_scores
416
-
417
- attention_probs = attention_probs.to(dtype)
418
-
419
- return attention_probs
420
-
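For reference, `get_attention_scores` above is a fused way of computing `softmax(Q @ K^T * scale + mask)` via `torch.baddbmm`; a small numerical sanity check with illustrative shapes:

```python
import torch

q = torch.randn(2, 5, 8)     # (batch*heads, query tokens, head_dim)
k = torch.randn(2, 7, 8)     # (batch*heads, key tokens, head_dim)
mask = torch.zeros(2, 5, 7)  # additive attention bias
scale = 8 ** -0.5

scores = torch.baddbmm(mask, q, k.transpose(-1, -2), beta=1, alpha=scale)
probs = scores.softmax(dim=-1)

reference = ((q @ k.transpose(-1, -2)) * scale + mask).softmax(dim=-1)
assert torch.allclose(probs, reference, atol=1e-6)
```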
421
- def prepare_attention_mask(self, attention_mask, target_length, batch_size=None, out_dim=3):
422
- if batch_size is None:
423
- deprecate(
424
- "batch_size=None",
425
- "0.0.15",
426
- (
427
- "Not passing the `batch_size` parameter to `prepare_attention_mask` can lead to incorrect"
428
- " attention mask preparation and is deprecated behavior. Please make sure to pass `batch_size` to"
429
- " `prepare_attention_mask` when preparing the attention_mask."
430
- ),
431
- )
432
- batch_size = 1
433
-
434
- head_size = self.heads
435
- if attention_mask is None:
436
- return attention_mask
437
-
438
- current_length: int = attention_mask.shape[-1]
439
- if current_length != target_length:
440
- if attention_mask.device.type == "mps":
441
- # HACK: MPS: Does not support padding by greater than dimension of input tensor.
442
- # Instead, we can manually construct the padding tensor.
443
- padding_shape = (attention_mask.shape[0], attention_mask.shape[1], target_length)
444
- padding = torch.zeros(padding_shape, dtype=attention_mask.dtype, device=attention_mask.device)
445
- attention_mask = torch.cat([attention_mask, padding], dim=2)
446
- else:
447
- # TODO: for pipelines such as stable-diffusion, padding cross-attn mask:
448
- # we want to instead pad by (0, remaining_length), where remaining_length is:
449
- # remaining_length: int = target_length - current_length
450
- # TODO: re-enable tests/models/test_models_unet_2d_condition.py#test_model_xattn_padding
451
- attention_mask = F.pad(attention_mask, (0, target_length), value=0.0)
452
-
453
- if out_dim == 3:
454
- if attention_mask.shape[0] < batch_size * head_size:
455
- attention_mask = attention_mask.repeat_interleave(head_size, dim=0)
456
- elif out_dim == 4:
457
- attention_mask = attention_mask.unsqueeze(1)
458
- attention_mask = attention_mask.repeat_interleave(head_size, dim=1)
459
-
460
- return attention_mask
461
-
462
- def norm_encoder_hidden_states(self, encoder_hidden_states):
463
- assert self.norm_cross is not None, "self.norm_cross must be defined to call self.norm_encoder_hidden_states"
464
-
465
- if isinstance(self.norm_cross, nn.LayerNorm):
466
- encoder_hidden_states = self.norm_cross(encoder_hidden_states)
467
- elif isinstance(self.norm_cross, nn.GroupNorm):
468
- # Group norm norms along the channels dimension and expects
469
- # input to be in the shape of (N, C, *). In this case, we want
470
- # to norm along the hidden dimension, so we need to move
471
- # (batch_size, sequence_length, hidden_size) ->
472
- # (batch_size, hidden_size, sequence_length)
473
- encoder_hidden_states = encoder_hidden_states.transpose(1, 2)
474
- encoder_hidden_states = self.norm_cross(encoder_hidden_states)
475
- encoder_hidden_states = encoder_hidden_states.transpose(1, 2)
476
- else:
477
- assert False
478
-
479
- return encoder_hidden_states
480
-
481
-
482
- class AttnProcessor:
483
- r"""
484
- Default processor for performing attention-related computations.
485
- """
486
-
487
- def __call__(
488
- self,
489
- attn: Attention,
490
- hidden_states,
491
- encoder_hidden_states=None,
492
- attention_mask=None,
493
- temb=None,
494
- ):
495
- residual = hidden_states
496
-
497
- if attn.spatial_norm is not None:
498
- hidden_states = attn.spatial_norm(hidden_states, temb)
499
-
500
- input_ndim = hidden_states.ndim
501
-
502
- if input_ndim == 4:
503
- batch_size, channel, height, width = hidden_states.shape
504
- hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
505
-
506
- batch_size, sequence_length, _ = (
507
- hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
508
- )
509
- attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
510
-
511
- if attn.group_norm is not None:
512
- hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
513
-
514
- query = attn.to_q(hidden_states)
515
-
516
- if encoder_hidden_states is None:
517
- encoder_hidden_states = hidden_states
518
- elif attn.norm_cross:
519
- encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
520
-
521
- key = attn.to_k(encoder_hidden_states)
522
- value = attn.to_v(encoder_hidden_states)
523
-
524
- query = attn.head_to_batch_dim(query)
525
- key = attn.head_to_batch_dim(key)
526
- value = attn.head_to_batch_dim(value)
527
-
528
- attention_probs = attn.get_attention_scores(query, key, attention_mask)
529
- hidden_states = torch.bmm(attention_probs, value)
530
- hidden_states = attn.batch_to_head_dim(hidden_states)
531
-
532
- # linear proj
533
- hidden_states = attn.to_out[0](hidden_states)
534
- # dropout
535
- hidden_states = attn.to_out[1](hidden_states)
536
-
537
- if input_ndim == 4:
538
- hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
539
-
540
- if attn.residual_connection:
541
- hidden_states = hidden_states + residual
542
-
543
- hidden_states = hidden_states / attn.rescale_output_factor
544
-
545
- return hidden_states
546
-
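A hedged end-to-end sketch of the default path: build an `Attention` module, pin the plain `AttnProcessor`, and run one cross-attention call. All sizes are illustrative:

```python
import torch

attn = Attention(query_dim=64, cross_attention_dim=32, heads=4, dim_head=16)
attn.set_processor(AttnProcessor())  # bypass the PyTorch-2.0 default for clarity

hidden_states = torch.randn(2, 16, 64)         # (batch, query tokens, query_dim)
encoder_hidden_states = torch.randn(2, 8, 32)  # (batch, key tokens, cross_attention_dim)

out = attn(hidden_states, encoder_hidden_states=encoder_hidden_states)
assert out.shape == hidden_states.shape  # output is projected back to query_dim
```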
547
-
548
- class LoRALinearLayer(nn.Module):
549
- def __init__(self, in_features, out_features, rank=4, network_alpha=None):
550
- super().__init__()
551
-
552
- if rank > min(in_features, out_features):
553
- raise ValueError(f"LoRA rank {rank} must be less or equal than {min(in_features, out_features)}")
554
-
555
- self.down = nn.Linear(in_features, rank, bias=False)
556
- self.up = nn.Linear(rank, out_features, bias=False)
557
- # This value has the same meaning as the `--network_alpha` option in the kohya-ss trainer script.
558
- # See https://github.com/darkstorm2150/sd-scripts/blob/main/docs/train_network_README-en.md#execute-learning
559
- self.network_alpha = network_alpha
560
- self.rank = rank
561
-
562
- nn.init.normal_(self.down.weight, std=1 / rank)
563
- nn.init.zeros_(self.up.weight)
564
-
565
- def forward(self, hidden_states):
566
- orig_dtype = hidden_states.dtype
567
- dtype = self.down.weight.dtype
568
-
569
- down_hidden_states = self.down(hidden_states.to(dtype))
570
- up_hidden_states = self.up(down_hidden_states)
571
-
572
- if self.network_alpha is not None:
573
- up_hidden_states *= self.network_alpha / self.rank
574
-
575
- return up_hidden_states.to(orig_dtype)
576
-
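The LoRA layers add a scaled low-rank correction on top of each frozen projection, exactly the `attn.to_q(x) + scale * self.to_q_lora(x)` pattern used in the processors below; a minimal sketch with toy tensors:

```python
import torch
from torch import nn

base = nn.Linear(64, 64)                # frozen base projection, e.g. attn.to_q
lora = LoRALinearLayer(64, 64, rank=4)  # trainable low-rank update
x = torch.randn(2, 16, 64)
scale = 1.0

out = base(x) + scale * lora(x)         # LoRA-adapted projection
assert out.shape == (2, 16, 64)

# lora.up is zero-initialized, so at initialization the adapted output
# equals the base output and training starts from the frozen behavior.
assert torch.allclose(out, base(x))
```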
577
-
578
- class LoRAAttnProcessor(nn.Module):
579
- r"""
580
- Processor for implementing the LoRA attention mechanism.
581
-
582
- Args:
583
- hidden_size (`int`, *optional*):
584
- The hidden size of the attention layer.
585
- cross_attention_dim (`int`, *optional*):
586
- The number of channels in the `encoder_hidden_states`.
587
- rank (`int`, defaults to 4):
588
- The dimension of the LoRA update matrices.
589
- network_alpha (`int`, *optional*):
590
- Equivalent to `alpha` but its usage is specific to Kohya (A1111) style LoRAs.
591
- """
592
-
593
- def __init__(self, hidden_size, cross_attention_dim=None, rank=4, network_alpha=None):
594
- super().__init__()
595
-
596
- self.hidden_size = hidden_size
597
- self.cross_attention_dim = cross_attention_dim
598
- self.rank = rank
599
-
600
- self.to_q_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha)
601
- self.to_k_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha)
602
- self.to_v_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha)
603
- self.to_out_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha)
604
-
605
- def __call__(
606
- self, attn: Attention, hidden_states, encoder_hidden_states=None, attention_mask=None, scale=1.0, temb=None
607
- ):
608
- residual = hidden_states
609
-
610
- if attn.spatial_norm is not None:
611
- hidden_states = attn.spatial_norm(hidden_states, temb)
612
-
613
- input_ndim = hidden_states.ndim
614
-
615
- if input_ndim == 4:
616
- batch_size, channel, height, width = hidden_states.shape
617
- hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
618
-
619
- batch_size, sequence_length, _ = (
620
- hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
621
- )
622
- attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
623
-
624
- if attn.group_norm is not None:
625
- hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
626
-
627
- query = attn.to_q(hidden_states) + scale * self.to_q_lora(hidden_states)
628
- query = attn.head_to_batch_dim(query)
629
-
630
- if encoder_hidden_states is None:
631
- encoder_hidden_states = hidden_states
632
- elif attn.norm_cross:
633
- encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
634
-
635
- key = attn.to_k(encoder_hidden_states) + scale * self.to_k_lora(encoder_hidden_states)
636
- value = attn.to_v(encoder_hidden_states) + scale * self.to_v_lora(encoder_hidden_states)
637
-
638
- key = attn.head_to_batch_dim(key)
639
- value = attn.head_to_batch_dim(value)
640
-
641
- attention_probs = attn.get_attention_scores(query, key, attention_mask)
642
- hidden_states = torch.bmm(attention_probs, value)
643
- hidden_states = attn.batch_to_head_dim(hidden_states)
644
-
645
- # linear proj
646
- hidden_states = attn.to_out[0](hidden_states) + scale * self.to_out_lora(hidden_states)
647
- # dropout
648
- hidden_states = attn.to_out[1](hidden_states)
649
-
650
- if input_ndim == 4:
651
- hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
652
-
653
- if attn.residual_connection:
654
- hidden_states = hidden_states + residual
655
-
656
- hidden_states = hidden_states / attn.rescale_output_factor
657
-
658
- return hidden_states
659
-
660
-
661
- class CustomDiffusionAttnProcessor(nn.Module):
662
- r"""
663
- Processor for implementing attention for the Custom Diffusion method.
664
-
665
- Args:
666
- train_kv (`bool`, defaults to `True`):
667
- Whether to newly train the key and value matrices corresponding to the text features.
668
- train_q_out (`bool`, defaults to `True`):
669
- Whether to newly train query matrices corresponding to the latent image features.
670
- hidden_size (`int`, *optional*, defaults to `None`):
671
- The hidden size of the attention layer.
672
- cross_attention_dim (`int`, *optional*, defaults to `None`):
673
- The number of channels in the `encoder_hidden_states`.
674
- out_bias (`bool`, defaults to `True`):
675
- Whether to include the bias parameter in `train_q_out`.
676
- dropout (`float`, *optional*, defaults to 0.0):
677
- The dropout probability to use.
678
- """
679
-
680
- def __init__(
681
- self,
682
- train_kv=True,
683
- train_q_out=True,
684
- hidden_size=None,
685
- cross_attention_dim=None,
686
- out_bias=True,
687
- dropout=0.0,
688
- ):
689
- super().__init__()
690
- self.train_kv = train_kv
691
- self.train_q_out = train_q_out
692
-
693
- self.hidden_size = hidden_size
694
- self.cross_attention_dim = cross_attention_dim
695
-
696
- # `_custom_diffusion` id for easy serialization and loading.
697
- if self.train_kv:
698
- self.to_k_custom_diffusion = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False)
699
- self.to_v_custom_diffusion = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False)
700
- if self.train_q_out:
701
- self.to_q_custom_diffusion = nn.Linear(hidden_size, hidden_size, bias=False)
702
- self.to_out_custom_diffusion = nn.ModuleList([])
703
- self.to_out_custom_diffusion.append(nn.Linear(hidden_size, hidden_size, bias=out_bias))
704
- self.to_out_custom_diffusion.append(nn.Dropout(dropout))
705
-
706
- def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, attention_mask=None):
707
- batch_size, sequence_length, _ = hidden_states.shape
708
- attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
709
- if self.train_q_out:
710
- query = self.to_q_custom_diffusion(hidden_states)
711
- else:
712
- query = attn.to_q(hidden_states)
713
-
714
- if encoder_hidden_states is None:
715
- crossattn = False
716
- encoder_hidden_states = hidden_states
717
- else:
718
- crossattn = True
719
- if attn.norm_cross:
720
- encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
721
-
722
- if self.train_kv:
723
- key = self.to_k_custom_diffusion(encoder_hidden_states)
724
- value = self.to_v_custom_diffusion(encoder_hidden_states)
725
- else:
726
- key = attn.to_k(encoder_hidden_states)
727
- value = attn.to_v(encoder_hidden_states)
728
-
729
- if crossattn:
730
- detach = torch.ones_like(key)
731
- detach[:, :1, :] = detach[:, :1, :] * 0.0
732
- key = detach * key + (1 - detach) * key.detach()
733
- value = detach * value + (1 - detach) * value.detach()
734
-
735
- query = attn.head_to_batch_dim(query)
736
- key = attn.head_to_batch_dim(key)
737
- value = attn.head_to_batch_dim(value)
738
-
739
- attention_probs = attn.get_attention_scores(query, key, attention_mask)
740
- hidden_states = torch.bmm(attention_probs, value)
741
- hidden_states = attn.batch_to_head_dim(hidden_states)
742
-
743
- if self.train_q_out:
744
- # linear proj
745
- hidden_states = self.to_out_custom_diffusion[0](hidden_states)
746
- # dropout
747
- hidden_states = self.to_out_custom_diffusion[1](hidden_states)
748
- else:
749
- # linear proj
750
- hidden_states = attn.to_out[0](hidden_states)
751
- # dropout
752
- hidden_states = attn.to_out[1](hidden_states)
753
-
754
- return hidden_states
755
-
756
-
757
- class AttnAddedKVProcessor:
758
- r"""
759
- Processor for performing attention-related computations with extra learnable key and value matrices for the text
760
- encoder.
761
- """
762
-
763
- def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, attention_mask=None):
764
- residual = hidden_states
765
- hidden_states = hidden_states.view(hidden_states.shape[0], hidden_states.shape[1], -1).transpose(1, 2)
766
- batch_size, sequence_length, _ = hidden_states.shape
767
-
768
- attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
769
-
770
- if encoder_hidden_states is None:
771
- encoder_hidden_states = hidden_states
772
- elif attn.norm_cross:
773
- encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
774
-
775
- hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
776
-
777
- query = attn.to_q(hidden_states)
778
- query = attn.head_to_batch_dim(query)
779
-
780
- encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states)
781
- encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states)
782
- encoder_hidden_states_key_proj = attn.head_to_batch_dim(encoder_hidden_states_key_proj)
783
- encoder_hidden_states_value_proj = attn.head_to_batch_dim(encoder_hidden_states_value_proj)
784
-
785
- if not attn.only_cross_attention:
786
- key = attn.to_k(hidden_states)
787
- value = attn.to_v(hidden_states)
788
- key = attn.head_to_batch_dim(key)
789
- value = attn.head_to_batch_dim(value)
790
- key = torch.cat([encoder_hidden_states_key_proj, key], dim=1)
791
- value = torch.cat([encoder_hidden_states_value_proj, value], dim=1)
792
- else:
793
- key = encoder_hidden_states_key_proj
794
- value = encoder_hidden_states_value_proj
795
-
796
- attention_probs = attn.get_attention_scores(query, key, attention_mask)
797
- hidden_states = torch.bmm(attention_probs, value)
798
- hidden_states = attn.batch_to_head_dim(hidden_states)
799
-
800
- # linear proj
801
- hidden_states = attn.to_out[0](hidden_states)
802
- # dropout
803
- hidden_states = attn.to_out[1](hidden_states)
804
-
805
- hidden_states = hidden_states.transpose(-1, -2).reshape(residual.shape)
806
- hidden_states = hidden_states + residual
807
-
808
- return hidden_states
809
-
810
-
811
- class AttnAddedKVProcessor2_0:
812
- r"""
813
- Processor for performing scaled dot-product attention (enabled by default if you're using PyTorch 2.0), with extra
814
- learnable key and value matrices for the text encoder.
815
- """
816
-
817
- def __init__(self):
818
- if not hasattr(F, "scaled_dot_product_attention"):
819
- raise ImportError(
820
- "AttnAddedKVProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0."
821
- )
822
-
823
- def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, attention_mask=None):
824
- residual = hidden_states
825
- hidden_states = hidden_states.view(hidden_states.shape[0], hidden_states.shape[1], -1).transpose(1, 2)
826
- batch_size, sequence_length, _ = hidden_states.shape
827
-
828
- attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size, out_dim=4)
829
-
830
- if encoder_hidden_states is None:
831
- encoder_hidden_states = hidden_states
832
- elif attn.norm_cross:
833
- encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
834
-
835
- hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
836
-
837
- query = attn.to_q(hidden_states)
838
- query = attn.head_to_batch_dim(query, out_dim=4)
839
-
840
- encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states)
841
- encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states)
842
- encoder_hidden_states_key_proj = attn.head_to_batch_dim(encoder_hidden_states_key_proj, out_dim=4)
843
- encoder_hidden_states_value_proj = attn.head_to_batch_dim(encoder_hidden_states_value_proj, out_dim=4)
844
-
845
- if not attn.only_cross_attention:
846
- key = attn.to_k(hidden_states)
847
- value = attn.to_v(hidden_states)
848
- key = attn.head_to_batch_dim(key, out_dim=4)
849
- value = attn.head_to_batch_dim(value, out_dim=4)
850
- key = torch.cat([encoder_hidden_states_key_proj, key], dim=2)
851
- value = torch.cat([encoder_hidden_states_value_proj, value], dim=2)
852
- else:
853
- key = encoder_hidden_states_key_proj
854
- value = encoder_hidden_states_value_proj
855
-
856
- # the output of sdp = (batch, num_heads, seq_len, head_dim)
857
- # TODO: add support for attn.scale when we move to Torch 2.1
858
- hidden_states = F.scaled_dot_product_attention(
859
- query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
860
- )
861
- hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, residual.shape[1])
862
-
863
- # linear proj
864
- hidden_states = attn.to_out[0](hidden_states)
865
- # dropout
866
- hidden_states = attn.to_out[1](hidden_states)
867
-
868
- hidden_states = hidden_states.transpose(-1, -2).reshape(residual.shape)
869
- hidden_states = hidden_states + residual
870
-
871
- return hidden_states
872
-
873
-
874
- class LoRAAttnAddedKVProcessor(nn.Module):
875
- r"""
876
- Processor for implementing the LoRA attention mechanism with extra learnable key and value matrices for the text
877
- encoder.
878
-
879
- Args:
880
- hidden_size (`int`, *optional*):
881
- The hidden size of the attention layer.
882
- cross_attention_dim (`int`, *optional*, defaults to `None`):
883
- The number of channels in the `encoder_hidden_states`.
884
- rank (`int`, defaults to 4):
885
- The dimension of the LoRA update matrices.
886
-
887
- """
888
-
889
- def __init__(self, hidden_size, cross_attention_dim=None, rank=4, network_alpha=None):
890
- super().__init__()
891
-
892
- self.hidden_size = hidden_size
893
- self.cross_attention_dim = cross_attention_dim
894
- self.rank = rank
895
-
896
- self.to_q_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha)
897
- self.add_k_proj_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha)
898
- self.add_v_proj_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha)
899
- self.to_k_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha)
900
- self.to_v_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha)
901
- self.to_out_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha)
902
-
903
- def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, attention_mask=None, scale=1.0):
904
- residual = hidden_states
905
- hidden_states = hidden_states.view(hidden_states.shape[0], hidden_states.shape[1], -1).transpose(1, 2)
906
- batch_size, sequence_length, _ = hidden_states.shape
907
-
908
- attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
909
-
910
- if encoder_hidden_states is None:
911
- encoder_hidden_states = hidden_states
912
- elif attn.norm_cross:
913
- encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
914
-
915
- hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
916
-
917
- query = attn.to_q(hidden_states) + scale * self.to_q_lora(hidden_states)
918
- query = attn.head_to_batch_dim(query)
919
-
920
- encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states) + scale * self.add_k_proj_lora(
921
- encoder_hidden_states
922
- )
923
- encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states) + scale * self.add_v_proj_lora(
924
- encoder_hidden_states
925
- )
926
- encoder_hidden_states_key_proj = attn.head_to_batch_dim(encoder_hidden_states_key_proj)
927
- encoder_hidden_states_value_proj = attn.head_to_batch_dim(encoder_hidden_states_value_proj)
928
-
929
- if not attn.only_cross_attention:
930
- key = attn.to_k(hidden_states) + scale * self.to_k_lora(hidden_states)
931
- value = attn.to_v(hidden_states) + scale * self.to_v_lora(hidden_states)
932
- key = attn.head_to_batch_dim(key)
933
- value = attn.head_to_batch_dim(value)
934
- key = torch.cat([encoder_hidden_states_key_proj, key], dim=1)
935
- value = torch.cat([encoder_hidden_states_value_proj, value], dim=1)
936
- else:
937
- key = encoder_hidden_states_key_proj
938
- value = encoder_hidden_states_value_proj
939
-
940
- attention_probs = attn.get_attention_scores(query, key, attention_mask)
941
- hidden_states = torch.bmm(attention_probs, value)
942
- hidden_states = attn.batch_to_head_dim(hidden_states)
943
-
944
- # linear proj
945
- hidden_states = attn.to_out[0](hidden_states) + scale * self.to_out_lora(hidden_states)
946
- # dropout
947
- hidden_states = attn.to_out[1](hidden_states)
948
-
949
- hidden_states = hidden_states.transpose(-1, -2).reshape(residual.shape)
950
- hidden_states = hidden_states + residual
951
-
952
- return hidden_states
953
-
954
-
955
- class XFormersAttnAddedKVProcessor:
956
- r"""
957
- Processor for implementing memory efficient attention using xFormers.
958
-
959
- Args:
960
- attention_op (`Callable`, *optional*, defaults to `None`):
961
- The base
962
- [operator](https://facebookresearch.github.io/xformers/components/ops.html#xformers.ops.AttentionOpBase) to
963
- use as the attention operator. It is recommended to set to `None`, and allow xFormers to choose the best
964
- operator.
965
- """
966
-
967
- def __init__(self, attention_op: Optional[Callable] = None):
968
- self.attention_op = attention_op
969
-
970
- def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, attention_mask=None):
971
- residual = hidden_states
972
- hidden_states = hidden_states.view(hidden_states.shape[0], hidden_states.shape[1], -1).transpose(1, 2)
973
- batch_size, sequence_length, _ = hidden_states.shape
974
-
975
- attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
976
-
977
- if encoder_hidden_states is None:
978
- encoder_hidden_states = hidden_states
979
- elif attn.norm_cross:
980
- encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
981
-
982
- hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
983
-
984
- query = attn.to_q(hidden_states)
985
- query = attn.head_to_batch_dim(query)
986
-
987
- encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states)
988
- encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states)
989
- encoder_hidden_states_key_proj = attn.head_to_batch_dim(encoder_hidden_states_key_proj)
990
- encoder_hidden_states_value_proj = attn.head_to_batch_dim(encoder_hidden_states_value_proj)
991
-
992
- if not attn.only_cross_attention:
993
- key = attn.to_k(hidden_states)
994
- value = attn.to_v(hidden_states)
995
- key = attn.head_to_batch_dim(key)
996
- value = attn.head_to_batch_dim(value)
997
- key = torch.cat([encoder_hidden_states_key_proj, key], dim=1)
998
- value = torch.cat([encoder_hidden_states_value_proj, value], dim=1)
999
- else:
1000
- key = encoder_hidden_states_key_proj
1001
- value = encoder_hidden_states_value_proj
1002
-
1003
- hidden_states = xformers.ops.memory_efficient_attention(
1004
- query, key, value, attn_bias=attention_mask, op=self.attention_op, scale=attn.scale
1005
- )
1006
- hidden_states = hidden_states.to(query.dtype)
1007
- hidden_states = attn.batch_to_head_dim(hidden_states)
1008
-
1009
- # linear proj
1010
- hidden_states = attn.to_out[0](hidden_states)
1011
- # dropout
1012
- hidden_states = attn.to_out[1](hidden_states)
1013
-
1014
- hidden_states = hidden_states.transpose(-1, -2).reshape(residual.shape)
1015
- hidden_states = hidden_states + residual
1016
-
1017
- return hidden_states
1018
-
1019
-
1020
- class XFormersAttnProcessor:
1021
- r"""
1022
- Processor for implementing memory efficient attention using xFormers.
1023
-
1024
- Args:
1025
- attention_op (`Callable`, *optional*, defaults to `None`):
1026
- The base
1027
- [operator](https://facebookresearch.github.io/xformers/components/ops.html#xformers.ops.AttentionOpBase) to
1028
- use as the attention operator. It is recommended to set to `None`, and allow xFormers to choose the best
1029
- operator.
1030
- """
1031
-
1032
- def __init__(self, attention_op: Optional[Callable] = None):
1033
- self.attention_op = attention_op
1034
-
1035
- def __call__(
1036
- self,
1037
- attn: Attention,
1038
- hidden_states: torch.FloatTensor,
1039
- encoder_hidden_states: Optional[torch.FloatTensor] = None,
1040
- attention_mask: Optional[torch.FloatTensor] = None,
1041
- temb: Optional[torch.FloatTensor] = None,
1042
- posemb: Optional = None,
1043
- ):
1044
- residual = hidden_states
1045
-
1046
- if attn.spatial_norm is not None:
1047
- hidden_states = attn.spatial_norm(hidden_states, temb)
1048
-
1049
- input_ndim = hidden_states.ndim
1050
-
1051
- if input_ndim == 4:
1052
- batch_size, channel, height, width = hidden_states.shape
1053
- hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
1054
-
1055
- if posemb is not None:
1056
- # turn 2d attention into multiview attention
1057
- self_attn = encoder_hidden_states is None # check if self attn or cross attn
1058
- p_out, p_in = posemb
1059
- t_out, t_in = p_out.shape[1], p_in.shape[1] # t size
1060
- hidden_states = einops.rearrange(hidden_states, '(b t_out) l d -> b (t_out l) d', t_out=t_out)
1061
-
1062
- batch_size, key_tokens, _ = (
1063
- hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
1064
- )
1065
-
1066
- attention_mask = attn.prepare_attention_mask(attention_mask, key_tokens, batch_size)
1067
- if attention_mask is not None:
1068
- # expand our mask's singleton query_tokens dimension:
1069
- # [batch*heads, 1, key_tokens] ->
1070
- # [batch*heads, query_tokens, key_tokens]
1071
- # so that it can be added as a bias onto the attention scores that xformers computes:
1072
- # [batch*heads, query_tokens, key_tokens]
1073
- # we do this explicitly because xformers doesn't broadcast the singleton dimension for us.
1074
- _, query_tokens, _ = hidden_states.shape
1075
- attention_mask = attention_mask.expand(-1, query_tokens, -1)
1076
-
1077
- if attn.group_norm is not None:
1078
- hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
1079
-
1080
- query = attn.to_q(hidden_states)
1081
- if encoder_hidden_states is None:
1082
- encoder_hidden_states = hidden_states
1083
- elif attn.norm_cross:
1084
- encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
1085
-
1086
- key = attn.to_k(encoder_hidden_states)
1087
- value = attn.to_v(encoder_hidden_states)
1088
-
1089
-
1090
- # apply 4DoF CaPE; TODO: currently only implemented for the xFormers processor
1091
- if posemb is not None:
1092
- p_out = einops.repeat(p_out, 'b t_out d -> b (t_out l) d', l=query.shape[1]//t_out) # query shape
1093
- if self_attn:
1094
- p_in = p_out
1095
- else:
1096
- p_in = einops.repeat(p_in, 'b t_in d -> b (t_in l) d', l=key.shape[1] // t_in) # key shape
1097
- query, key = cape_embed(p_out, p_in, query, key)
1098
-
1099
- query = attn.head_to_batch_dim(query).contiguous()
1100
- key = attn.head_to_batch_dim(key).contiguous()
1101
- value = attn.head_to_batch_dim(value).contiguous()
1102
-
1103
- # self-attn (bm) l c x (bm) l c -> (bm) l c
1104
- # cross-attn (bm) l c x b (nl) c -> (bm) l c
1105
- # reuse 2d attention for multiview attention
1106
- # self-attn b (ml) c x b (ml) c -> b (ml) c
1107
- # cross-attn b (ml) c x b (nl) c -> b (ml) c
1108
- hidden_states = xformers.ops.memory_efficient_attention( # query: (bm) l c -> b (ml) c; key: b (nl) c
1109
- query, key, value, attn_bias=attention_mask, op=self.attention_op, scale=attn.scale
1110
- )
1111
- hidden_states = hidden_states.to(query.dtype)
1112
- hidden_states = attn.batch_to_head_dim(hidden_states)
1113
-
1114
- # linear proj
1115
- hidden_states = attn.to_out[0](hidden_states)
1116
- # dropout
1117
- hidden_states = attn.to_out[1](hidden_states)
1118
-
1119
- if posemb is not None:
1120
- # reshape back
1121
- hidden_states = einops.rearrange(hidden_states, 'b (t_out l) d -> (b t_out) l d', t_out=t_out)
1122
-
1123
- if input_ndim == 4:
1124
- hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
1125
-
1126
- if attn.residual_connection:
1127
- hidden_states = hidden_states + residual
1128
-
1129
- hidden_states = hidden_states / attn.rescale_output_factor
1130
-
1131
-
1132
- return hidden_states
1133
-
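The `posemb` branch above turns per-view 2D attention into multiview attention purely by reshaping: the views folded into the batch axis are moved into the token axis, attention runs over all views at once with CaPE-rotated queries and keys, and the layout is restored afterwards. A hedged einops-only sketch of that round trip (sizes are illustrative):

```python
import einops
import torch

b, m, l, c = 2, 4, 16, 64            # batch, views, tokens per view, channels
per_view = torch.randn(b * m, l, c)  # how the UNet hands tokens to the processor

# fold views into the token axis: (b m) l c -> b (m l) c
multiview = einops.rearrange(per_view, '(b m) l c -> b (m l) c', m=m)
# ... attention over all m*l tokens happens here, with CaPE-rotated q/k ...
# unfold back: b (m l) c -> (b m) l c
restored = einops.rearrange(multiview, 'b (m l) c -> (b m) l c', m=m)

assert torch.equal(per_view, restored)  # pure reshape, exact round trip
```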
1134
-
1135
- class AttnProcessor2_0:
1136
- r"""
1137
- Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0).
1138
- """
1139
-
1140
- def __init__(self):
1141
- if not hasattr(F, "scaled_dot_product_attention"):
1142
- raise ImportError("AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
1143
-
1144
- def __call__(
1145
- self,
1146
- attn: Attention,
1147
- hidden_states,
1148
- encoder_hidden_states=None,
1149
- attention_mask=None,
1150
- temb=None,
1151
- ):
1152
- residual = hidden_states
1153
-
1154
- if attn.spatial_norm is not None:
1155
- hidden_states = attn.spatial_norm(hidden_states, temb)
1156
-
1157
- input_ndim = hidden_states.ndim
1158
-
1159
- if input_ndim == 4:
1160
- batch_size, channel, height, width = hidden_states.shape
1161
- hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
1162
-
1163
- batch_size, sequence_length, _ = (
1164
- hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
1165
- )
1166
- inner_dim = hidden_states.shape[-1]
1167
-
1168
- if attention_mask is not None:
1169
- attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
1170
- # scaled_dot_product_attention expects attention_mask shape to be
1171
- # (batch, heads, source_length, target_length)
1172
- attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
1173
-
1174
- if attn.group_norm is not None:
1175
- hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
1176
-
1177
- query = attn.to_q(hidden_states)
1178
-
1179
- if encoder_hidden_states is None:
1180
- encoder_hidden_states = hidden_states
1181
- elif attn.norm_cross:
1182
- encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
1183
-
1184
- key = attn.to_k(encoder_hidden_states)
1185
- value = attn.to_v(encoder_hidden_states)
1186
-
1187
- head_dim = inner_dim // attn.heads
1188
-
1189
- query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
1190
-
1191
- key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
1192
- value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
1193
-
1194
- # the output of sdp = (batch, num_heads, seq_len, head_dim)
1195
- # TODO: add support for attn.scale when we move to Torch 2.1
1196
- hidden_states = F.scaled_dot_product_attention(
1197
- query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
1198
- )
1199
-
1200
- hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
1201
- hidden_states = hidden_states.to(query.dtype)
1202
-
1203
- # linear proj
1204
- hidden_states = attn.to_out[0](hidden_states)
1205
- # dropout
1206
- hidden_states = attn.to_out[1](hidden_states)
1207
-
1208
- if input_ndim == 4:
1209
- hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
1210
-
1211
- if attn.residual_connection:
1212
- hidden_states = hidden_states + residual
1213
-
1214
- hidden_states = hidden_states / attn.rescale_output_factor
1215
-
1216
- return hidden_states
1217
-
1218
-
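The processor above is built around `torch.nn.functional.scaled_dot_product_attention`, which expects `(batch, heads, seq_len, head_dim)` inputs; on PyTorch 2.0 it is typically installed with `model.set_attn_processor(AttnProcessor2_0())`. A minimal, self-contained sketch of the reshape-and-attend pattern (plain tensors and illustrative shapes only, no `Attention` module):

import torch
import torch.nn.functional as F

def sdpa_reshape_sketch(hidden_states: torch.Tensor, heads: int = 8) -> torch.Tensor:
    # hidden_states: (batch, seq_len, inner_dim) with inner_dim divisible by heads.
    batch, seq_len, inner_dim = hidden_states.shape
    head_dim = inner_dim // heads

    # Identity projections: this sketch only demonstrates the reshape pattern.
    query = key = value = hidden_states

    # (batch, seq_len, inner_dim) -> (batch, heads, seq_len, head_dim)
    query = query.view(batch, -1, heads, head_dim).transpose(1, 2)
    key = key.view(batch, -1, heads, head_dim).transpose(1, 2)
    value = value.view(batch, -1, heads, head_dim).transpose(1, 2)

    out = F.scaled_dot_product_attention(query, key, value, dropout_p=0.0, is_causal=False)

    # (batch, heads, seq_len, head_dim) -> (batch, seq_len, inner_dim)
    return out.transpose(1, 2).reshape(batch, -1, heads * head_dim)

x = torch.randn(2, 16, 64)
print(sdpa_reshape_sketch(x).shape)  # torch.Size([2, 16, 64])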
1219
- class LoRAXFormersAttnProcessor(nn.Module):
1220
- r"""
1221
- Processor for implementing the LoRA attention mechanism with memory efficient attention using xFormers.
1222
-
1223
- Args:
1224
- hidden_size (`int`, *optional*):
1225
- The hidden size of the attention layer.
1226
- cross_attention_dim (`int`, *optional*):
1227
- The number of channels in the `encoder_hidden_states`.
1228
- rank (`int`, defaults to 4):
1229
- The dimension of the LoRA update matrices.
1230
- attention_op (`Callable`, *optional*, defaults to `None`):
1231
- The base
1232
- [operator](https://facebookresearch.github.io/xformers/components/ops.html#xformers.ops.AttentionOpBase) to
1233
- use as the attention operator. It is recommended to set to `None`, and allow xFormers to choose the best
1234
- operator.
1235
- network_alpha (`int`, *optional*):
1236
- Equivalent to `alpha` but it's usage is specific to Kohya (A1111) style LoRAs.
1237
-
1238
- """
1239
-
1240
- def __init__(
1241
- self, hidden_size, cross_attention_dim, rank=4, attention_op: Optional[Callable] = None, network_alpha=None
1242
- ):
1243
- super().__init__()
1244
-
1245
- self.hidden_size = hidden_size
1246
- self.cross_attention_dim = cross_attention_dim
1247
- self.rank = rank
1248
- self.attention_op = attention_op
1249
-
1250
- self.to_q_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha)
1251
- self.to_k_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha)
1252
- self.to_v_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha)
1253
- self.to_out_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha)
1254
-
1255
- def __call__(
1256
- self, attn: Attention, hidden_states, encoder_hidden_states=None, attention_mask=None, scale=1.0, temb=None
1257
- ):
1258
- residual = hidden_states
1259
-
1260
- if attn.spatial_norm is not None:
1261
- hidden_states = attn.spatial_norm(hidden_states, temb)
1262
-
1263
- input_ndim = hidden_states.ndim
1264
-
1265
- if input_ndim == 4:
1266
- batch_size, channel, height, width = hidden_states.shape
1267
- hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
1268
-
1269
- batch_size, sequence_length, _ = (
1270
- hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
1271
- )
1272
- attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
1273
-
1274
- if attn.group_norm is not None:
1275
- hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
1276
-
1277
- query = attn.to_q(hidden_states) + scale * self.to_q_lora(hidden_states)
1278
- query = attn.head_to_batch_dim(query).contiguous()
1279
-
1280
- if encoder_hidden_states is None:
1281
- encoder_hidden_states = hidden_states
1282
- elif attn.norm_cross:
1283
- encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
1284
-
1285
- key = attn.to_k(encoder_hidden_states) + scale * self.to_k_lora(encoder_hidden_states)
1286
- value = attn.to_v(encoder_hidden_states) + scale * self.to_v_lora(encoder_hidden_states)
1287
-
1288
- key = attn.head_to_batch_dim(key).contiguous()
1289
- value = attn.head_to_batch_dim(value).contiguous()
1290
-
1291
- hidden_states = xformers.ops.memory_efficient_attention(
1292
- query, key, value, attn_bias=attention_mask, op=self.attention_op, scale=attn.scale
1293
- )
1294
- hidden_states = attn.batch_to_head_dim(hidden_states)
1295
-
1296
- # linear proj
1297
- hidden_states = attn.to_out[0](hidden_states) + scale * self.to_out_lora(hidden_states)
1298
- # dropout
1299
- hidden_states = attn.to_out[1](hidden_states)
1300
-
1301
- if input_ndim == 4:
1302
- hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
1303
-
1304
- if attn.residual_connection:
1305
- hidden_states = hidden_states + residual
1306
-
1307
- hidden_states = hidden_states / attn.rescale_output_factor
1308
-
1309
- return hidden_states
1310
-
1311
-
1312
- class LoRAAttnProcessor2_0(nn.Module):
1313
- r"""
1314
- Processor for implementing the LoRA attention mechanism using PyTorch 2.0's memory-efficient scaled dot-product
1315
- attention.
1316
-
1317
- Args:
1318
- hidden_size (`int`):
1319
- The hidden size of the attention layer.
1320
- cross_attention_dim (`int`, *optional*):
1321
- The number of channels in the `encoder_hidden_states`.
1322
- rank (`int`, defaults to 4):
1323
- The dimension of the LoRA update matrices.
1324
- network_alpha (`int`, *optional*):
1325
- Equivalent to `alpha` but its usage is specific to Kohya (A1111) style LoRAs.
1326
- """
1327
-
1328
- def __init__(self, hidden_size, cross_attention_dim=None, rank=4, network_alpha=None):
1329
- super().__init__()
1330
- if not hasattr(F, "scaled_dot_product_attention"):
1331
- raise ImportError("AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
1332
-
1333
- self.hidden_size = hidden_size
1334
- self.cross_attention_dim = cross_attention_dim
1335
- self.rank = rank
1336
-
1337
- self.to_q_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha)
1338
- self.to_k_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha)
1339
- self.to_v_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha)
1340
- self.to_out_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha)
1341
-
1342
- def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, attention_mask=None, scale=1.0):
1343
- residual = hidden_states
1344
-
1345
- input_ndim = hidden_states.ndim
1346
-
1347
- if input_ndim == 4:
1348
- batch_size, channel, height, width = hidden_states.shape
1349
- hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
1350
-
1351
- batch_size, sequence_length, _ = (
1352
- hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
1353
- )
1354
- inner_dim = hidden_states.shape[-1]
1355
-
1356
- if attention_mask is not None:
1357
- attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
1358
- # scaled_dot_product_attention expects attention_mask shape to be
1359
- # (batch, heads, source_length, target_length)
1360
- attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
1361
-
1362
- if attn.group_norm is not None:
1363
- hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
1364
-
1365
- query = attn.to_q(hidden_states) + scale * self.to_q_lora(hidden_states)
1366
-
1367
- if encoder_hidden_states is None:
1368
- encoder_hidden_states = hidden_states
1369
- elif attn.norm_cross:
1370
- encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
1371
-
1372
- key = attn.to_k(encoder_hidden_states) + scale * self.to_k_lora(encoder_hidden_states)
1373
- value = attn.to_v(encoder_hidden_states) + scale * self.to_v_lora(encoder_hidden_states)
1374
-
1375
- head_dim = inner_dim // attn.heads
1376
- query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
1377
- key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
1378
- value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
1379
-
1380
- # TODO: add support for attn.scale when we move to Torch 2.1
1381
- hidden_states = F.scaled_dot_product_attention(
1382
- query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
1383
- )
1384
- hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
1385
- hidden_states = hidden_states.to(query.dtype)
1386
-
1387
- # linear proj
1388
- hidden_states = attn.to_out[0](hidden_states) + scale * self.to_out_lora(hidden_states)
1389
- # dropout
1390
- hidden_states = attn.to_out[1](hidden_states)
1391
-
1392
- if input_ndim == 4:
1393
- hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
1394
-
1395
- if attn.residual_connection:
1396
- hidden_states = hidden_states + residual
1397
-
1398
- hidden_states = hidden_states / attn.rescale_output_factor
1399
-
1400
- return hidden_states
1401
-
1402
-
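Both LoRA processors above add a scaled low-rank correction, `attn.to_q(x) + scale * self.to_q_lora(x)` and so on, on top of the frozen projections. A small illustrative re-implementation of that idea (not the library's `LoRALinearLayer`, just a sketch of the same down-project/up-project structure):

import torch
import torch.nn as nn

class LowRankUpdate(nn.Module):
    """Rank-`rank` correction B(A(x)), optionally rescaled by network_alpha / rank."""

    def __init__(self, in_features, out_features, rank=4, network_alpha=None):
        super().__init__()
        self.down = nn.Linear(in_features, rank, bias=False)  # A
        self.up = nn.Linear(rank, out_features, bias=False)   # B
        self.rank = rank
        self.network_alpha = network_alpha
        nn.init.normal_(self.down.weight, std=1.0 / rank)
        nn.init.zeros_(self.up.weight)  # zero init: the correction starts as a no-op

    def forward(self, x):
        out = self.up(self.down(x))
        if self.network_alpha is not None:
            out = out * (self.network_alpha / self.rank)
        return out

# Frozen projection plus the scaled LoRA term, mirroring the query path above.
frozen_to_q = nn.Linear(64, 64)
to_q_lora = LowRankUpdate(64, 64, rank=4)
x = torch.randn(2, 16, 64)
query = frozen_to_q(x) + 1.0 * to_q_lora(x)  # scale = 1.0
print(query.shape)  # torch.Size([2, 16, 64])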
1403
- class CustomDiffusionXFormersAttnProcessor(nn.Module):
1404
- r"""
1405
- Processor for implementing memory efficient attention using xFormers for the Custom Diffusion method.
1406
-
1407
- Args:
1408
- train_kv (`bool`, defaults to `True`):
1409
- Whether to newly train the key and value matrices corresponding to the text features.
1410
- train_q_out (`bool`, defaults to `True`):
1411
- Whether to newly train query matrices corresponding to the latent image features.
1412
- hidden_size (`int`, *optional*, defaults to `None`):
1413
- The hidden size of the attention layer.
1414
- cross_attention_dim (`int`, *optional*, defaults to `None`):
1415
- The number of channels in the `encoder_hidden_states`.
1416
- out_bias (`bool`, defaults to `True`):
1417
- Whether to include the bias parameter in `train_q_out`.
1418
- dropout (`float`, *optional*, defaults to 0.0):
1419
- The dropout probability to use.
1420
- attention_op (`Callable`, *optional*, defaults to `None`):
1421
- The base
1422
- [operator](https://facebookresearch.github.io/xformers/components/ops.html#xformers.ops.AttentionOpBase) to use
1423
- as the attention operator. It is recommended to set to `None`, and allow xFormers to choose the best operator.
1424
- """
1425
-
1426
- def __init__(
1427
- self,
1428
- train_kv=True,
1429
- train_q_out=False,
1430
- hidden_size=None,
1431
- cross_attention_dim=None,
1432
- out_bias=True,
1433
- dropout=0.0,
1434
- attention_op: Optional[Callable] = None,
1435
- ):
1436
- super().__init__()
1437
- self.train_kv = train_kv
1438
- self.train_q_out = train_q_out
1439
-
1440
- self.hidden_size = hidden_size
1441
- self.cross_attention_dim = cross_attention_dim
1442
- self.attention_op = attention_op
1443
-
1444
- # `_custom_diffusion` id for easy serialization and loading.
1445
- if self.train_kv:
1446
- self.to_k_custom_diffusion = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False)
1447
- self.to_v_custom_diffusion = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False)
1448
- if self.train_q_out:
1449
- self.to_q_custom_diffusion = nn.Linear(hidden_size, hidden_size, bias=False)
1450
- self.to_out_custom_diffusion = nn.ModuleList([])
1451
- self.to_out_custom_diffusion.append(nn.Linear(hidden_size, hidden_size, bias=out_bias))
1452
- self.to_out_custom_diffusion.append(nn.Dropout(dropout))
1453
-
1454
- def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, attention_mask=None):
1455
- batch_size, sequence_length, _ = (
1456
- hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
1457
- )
1458
-
1459
- attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
1460
-
1461
- if self.train_q_out:
1462
- query = self.to_q_custom_diffusion(hidden_states)
1463
- else:
1464
- query = attn.to_q(hidden_states)
1465
-
1466
- if encoder_hidden_states is None:
1467
- crossattn = False
1468
- encoder_hidden_states = hidden_states
1469
- else:
1470
- crossattn = True
1471
- if attn.norm_cross:
1472
- encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
1473
-
1474
- if self.train_kv:
1475
- key = self.to_k_custom_diffusion(encoder_hidden_states)
1476
- value = self.to_v_custom_diffusion(encoder_hidden_states)
1477
- else:
1478
- key = attn.to_k(encoder_hidden_states)
1479
- value = attn.to_v(encoder_hidden_states)
1480
-
1481
- if crossattn:
1482
- detach = torch.ones_like(key)
1483
- detach[:, :1, :] = detach[:, :1, :] * 0.0
1484
- key = detach * key + (1 - detach) * key.detach()
1485
- value = detach * value + (1 - detach) * value.detach()
1486
-
1487
- query = attn.head_to_batch_dim(query).contiguous()
1488
- key = attn.head_to_batch_dim(key).contiguous()
1489
- value = attn.head_to_batch_dim(value).contiguous()
1490
-
1491
- hidden_states = xformers.ops.memory_efficient_attention(
1492
- query, key, value, attn_bias=attention_mask, op=self.attention_op, scale=attn.scale
1493
- )
1494
- hidden_states = hidden_states.to(query.dtype)
1495
- hidden_states = attn.batch_to_head_dim(hidden_states)
1496
-
1497
- if self.train_q_out:
1498
- # linear proj
1499
- hidden_states = self.to_out_custom_diffusion[0](hidden_states)
1500
- # dropout
1501
- hidden_states = self.to_out_custom_diffusion[1](hidden_states)
1502
- else:
1503
- # linear proj
1504
- hidden_states = attn.to_out[0](hidden_states)
1505
- # dropout
1506
- hidden_states = attn.to_out[1](hidden_states)
1507
- return hidden_states
1508
-
1509
-
1510
- class SlicedAttnProcessor:
1511
- r"""
1512
- Processor for implementing sliced attention.
1513
-
1514
- Args:
1515
- slice_size (`int`, *optional*):
1516
- The number of steps to compute attention. Uses as many slices as `attention_head_dim // slice_size`, and
1517
- `attention_head_dim` must be a multiple of the `slice_size`.
1518
- """
1519
-
1520
- def __init__(self, slice_size):
1521
- self.slice_size = slice_size
1522
-
1523
- def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, attention_mask=None):
1524
- residual = hidden_states
1525
-
1526
- input_ndim = hidden_states.ndim
1527
-
1528
- if input_ndim == 4:
1529
- batch_size, channel, height, width = hidden_states.shape
1530
- hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
1531
-
1532
- batch_size, sequence_length, _ = (
1533
- hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
1534
- )
1535
- attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
1536
-
1537
- if attn.group_norm is not None:
1538
- hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
1539
-
1540
- query = attn.to_q(hidden_states)
1541
- dim = query.shape[-1]
1542
- query = attn.head_to_batch_dim(query)
1543
-
1544
- if encoder_hidden_states is None:
1545
- encoder_hidden_states = hidden_states
1546
- elif attn.norm_cross:
1547
- encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
1548
-
1549
- key = attn.to_k(encoder_hidden_states)
1550
- value = attn.to_v(encoder_hidden_states)
1551
- key = attn.head_to_batch_dim(key)
1552
- value = attn.head_to_batch_dim(value)
1553
-
1554
- batch_size_attention, query_tokens, _ = query.shape
1555
- hidden_states = torch.zeros(
1556
- (batch_size_attention, query_tokens, dim // attn.heads), device=query.device, dtype=query.dtype
1557
- )
1558
-
1559
- for i in range(batch_size_attention // self.slice_size):
1560
- start_idx = i * self.slice_size
1561
- end_idx = (i + 1) * self.slice_size
1562
-
1563
- query_slice = query[start_idx:end_idx]
1564
- key_slice = key[start_idx:end_idx]
1565
- attn_mask_slice = attention_mask[start_idx:end_idx] if attention_mask is not None else None
1566
-
1567
- attn_slice = attn.get_attention_scores(query_slice, key_slice, attn_mask_slice)
1568
-
1569
- attn_slice = torch.bmm(attn_slice, value[start_idx:end_idx])
1570
-
1571
- hidden_states[start_idx:end_idx] = attn_slice
1572
-
1573
- hidden_states = attn.batch_to_head_dim(hidden_states)
1574
-
1575
- # linear proj
1576
- hidden_states = attn.to_out[0](hidden_states)
1577
- # dropout
1578
- hidden_states = attn.to_out[1](hidden_states)
1579
-
1580
- if input_ndim == 4:
1581
- hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
1582
-
1583
- if attn.residual_connection:
1584
- hidden_states = hidden_states + residual
1585
-
1586
- hidden_states = hidden_states / attn.rescale_output_factor
1587
-
1588
- return hidden_states
1589
-
1590
-
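The slicing loop above bounds memory: the `(batch * heads)` dimension is processed `slice_size` rows at a time, so only one slice's attention matrix exists at once. A rough standalone sketch of the same idea with plain tensors (toy shapes, softmax attention written out by hand):

import torch

def sliced_attention(query, key, value, slice_size):
    # query / key / value: (batch_heads, tokens, head_dim), already split into heads.
    batch_heads, tokens, head_dim = query.shape
    out = torch.zeros_like(query)
    scale = head_dim ** -0.5
    for start in range(0, batch_heads, slice_size):
        end = start + slice_size
        # Only this slice's (tokens x tokens) attention matrix is materialised.
        scores = torch.bmm(query[start:end], key[start:end].transpose(-1, -2)) * scale
        probs = scores.softmax(dim=-1)
        out[start:end] = torch.bmm(probs, value[start:end])
    return out

q = k = v = torch.randn(16, 77, 40)  # e.g. 2 samples x 8 heads, 77 tokens, head_dim 40
print(sliced_attention(q, k, v, slice_size=4).shape)  # torch.Size([16, 77, 40])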
1591
- class SlicedAttnAddedKVProcessor:
1592
- r"""
1593
- Processor for implementing sliced attention with extra learnable key and value matrices for the text encoder.
1594
-
1595
- Args:
1596
- slice_size (`int`, *optional*):
1597
- The number of steps to compute attention. Uses as many slices as `attention_head_dim // slice_size`, and
1598
- `attention_head_dim` must be a multiple of the `slice_size`.
1599
- """
1600
-
1601
- def __init__(self, slice_size):
1602
- self.slice_size = slice_size
1603
-
1604
- def __call__(self, attn: "Attention", hidden_states, encoder_hidden_states=None, attention_mask=None, temb=None):
1605
- residual = hidden_states
1606
-
1607
- if attn.spatial_norm is not None:
1608
- hidden_states = attn.spatial_norm(hidden_states, temb)
1609
-
1610
- hidden_states = hidden_states.view(hidden_states.shape[0], hidden_states.shape[1], -1).transpose(1, 2)
1611
-
1612
- batch_size, sequence_length, _ = hidden_states.shape
1613
-
1614
- attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
1615
-
1616
- if encoder_hidden_states is None:
1617
- encoder_hidden_states = hidden_states
1618
- elif attn.norm_cross:
1619
- encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
1620
-
1621
- hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
1622
-
1623
- query = attn.to_q(hidden_states)
1624
- dim = query.shape[-1]
1625
- query = attn.head_to_batch_dim(query)
1626
-
1627
- encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states)
1628
- encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states)
1629
-
1630
- encoder_hidden_states_key_proj = attn.head_to_batch_dim(encoder_hidden_states_key_proj)
1631
- encoder_hidden_states_value_proj = attn.head_to_batch_dim(encoder_hidden_states_value_proj)
1632
-
1633
- if not attn.only_cross_attention:
1634
- key = attn.to_k(hidden_states)
1635
- value = attn.to_v(hidden_states)
1636
- key = attn.head_to_batch_dim(key)
1637
- value = attn.head_to_batch_dim(value)
1638
- key = torch.cat([encoder_hidden_states_key_proj, key], dim=1)
1639
- value = torch.cat([encoder_hidden_states_value_proj, value], dim=1)
1640
- else:
1641
- key = encoder_hidden_states_key_proj
1642
- value = encoder_hidden_states_value_proj
1643
-
1644
- batch_size_attention, query_tokens, _ = query.shape
1645
- hidden_states = torch.zeros(
1646
- (batch_size_attention, query_tokens, dim // attn.heads), device=query.device, dtype=query.dtype
1647
- )
1648
-
1649
- for i in range(batch_size_attention // self.slice_size):
1650
- start_idx = i * self.slice_size
1651
- end_idx = (i + 1) * self.slice_size
1652
-
1653
- query_slice = query[start_idx:end_idx]
1654
- key_slice = key[start_idx:end_idx]
1655
- attn_mask_slice = attention_mask[start_idx:end_idx] if attention_mask is not None else None
1656
-
1657
- attn_slice = attn.get_attention_scores(query_slice, key_slice, attn_mask_slice)
1658
-
1659
- attn_slice = torch.bmm(attn_slice, value[start_idx:end_idx])
1660
-
1661
- hidden_states[start_idx:end_idx] = attn_slice
1662
-
1663
- hidden_states = attn.batch_to_head_dim(hidden_states)
1664
-
1665
- # linear proj
1666
- hidden_states = attn.to_out[0](hidden_states)
1667
- # dropout
1668
- hidden_states = attn.to_out[1](hidden_states)
1669
-
1670
- hidden_states = hidden_states.transpose(-1, -2).reshape(residual.shape)
1671
- hidden_states = hidden_states + residual
1672
-
1673
- return hidden_states
1674
-
1675
-
1676
- AttentionProcessor = Union[
1677
- AttnProcessor,
1678
- AttnProcessor2_0,
1679
- XFormersAttnProcessor,
1680
- SlicedAttnProcessor,
1681
- AttnAddedKVProcessor,
1682
- SlicedAttnAddedKVProcessor,
1683
- AttnAddedKVProcessor2_0,
1684
- XFormersAttnAddedKVProcessor,
1685
- LoRAAttnProcessor,
1686
- LoRAXFormersAttnProcessor,
1687
- LoRAAttnProcessor2_0,
1688
- LoRAAttnAddedKVProcessor,
1689
- CustomDiffusionAttnProcessor,
1690
- CustomDiffusionXFormersAttnProcessor,
1691
- ]
1692
-
1693
-
1694
- class SpatialNorm(nn.Module):
1695
- """
1696
- Spatially conditioned normalization as defined in https://arxiv.org/abs/2209.09002
1697
- """
1698
-
1699
- def __init__(
1700
- self,
1701
- f_channels,
1702
- zq_channels,
1703
- ):
1704
- super().__init__()
1705
- self.norm_layer = nn.GroupNorm(num_channels=f_channels, num_groups=32, eps=1e-6, affine=True)
1706
- self.conv_y = nn.Conv2d(zq_channels, f_channels, kernel_size=1, stride=1, padding=0)
1707
- self.conv_b = nn.Conv2d(zq_channels, f_channels, kernel_size=1, stride=1, padding=0)
1708
-
1709
- def forward(self, f, zq):
1710
- f_size = f.shape[-2:]
1711
- zq = F.interpolate(zq, size=f_size, mode="nearest")
1712
- norm_f = self.norm_layer(f)
1713
- new_f = norm_f * self.conv_y(zq) + self.conv_b(zq)
1714
- return new_f
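`SpatialNorm` rescales and shifts group-normalised features with a spatially varying condition: `zq` is resized to the feature resolution and mapped through two 1x1 convolutions, giving roughly `GroupNorm(f) * conv_y(zq) + conv_b(zq)`. A quick shape check with toy channel counts (the sizes below are illustrative, not taken from any particular model):

import torch
import torch.nn as nn
import torch.nn.functional as F

f = torch.randn(1, 64, 32, 32)   # decoder feature map
zq = torch.randn(1, 4, 8, 8)     # quantised latent used as the spatial condition

norm = nn.GroupNorm(num_channels=64, num_groups=32, eps=1e-6, affine=True)
conv_y = nn.Conv2d(4, 64, kernel_size=1)
conv_b = nn.Conv2d(4, 64, kernel_size=1)

zq_up = F.interpolate(zq, size=f.shape[-2:], mode="nearest")
out = norm(f) * conv_y(zq_up) + conv_b(zq_up)
print(out.shape)  # torch.Size([1, 64, 32, 32])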
4DoF/diffusers/models/autoencoder_kl.py DELETED
@@ -1,411 +0,0 @@
1
- # Copyright 2023 The HuggingFace Team. All rights reserved.
2
- #
3
- # Licensed under the Apache License, Version 2.0 (the "License");
4
- # you may not use this file except in compliance with the License.
5
- # You may obtain a copy of the License at
6
- #
7
- # http://www.apache.org/licenses/LICENSE-2.0
8
- #
9
- # Unless required by applicable law or agreed to in writing, software
10
- # distributed under the License is distributed on an "AS IS" BASIS,
11
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- # See the License for the specific language governing permissions and
13
- # limitations under the License.
14
- from dataclasses import dataclass
15
- from typing import Dict, Optional, Tuple, Union
16
-
17
- import torch
18
- import torch.nn as nn
19
-
20
- from ..configuration_utils import ConfigMixin, register_to_config
21
- from ..utils import BaseOutput, apply_forward_hook
22
- from .attention_processor import AttentionProcessor, AttnProcessor
23
- from .modeling_utils import ModelMixin
24
- from .vae import Decoder, DecoderOutput, DiagonalGaussianDistribution, Encoder
25
-
26
-
27
- @dataclass
28
- class AutoencoderKLOutput(BaseOutput):
29
- """
30
- Output of AutoencoderKL encoding method.
31
-
32
- Args:
33
- latent_dist (`DiagonalGaussianDistribution`):
34
- Encoded outputs of `Encoder` represented as the mean and logvar of `DiagonalGaussianDistribution`.
35
- `DiagonalGaussianDistribution` allows for sampling latents from the distribution.
36
- """
37
-
38
- latent_dist: "DiagonalGaussianDistribution"
39
-
40
-
41
- class AutoencoderKL(ModelMixin, ConfigMixin):
42
- r"""
43
- A VAE model with KL loss for encoding images into latents and decoding latent representations into images.
44
-
45
- This model inherits from [`ModelMixin`]. Check the superclass documentation for its generic methods implemented
46
- for all models (such as downloading or saving).
47
-
48
- Parameters:
49
- in_channels (int, *optional*, defaults to 3): Number of channels in the input image.
50
- out_channels (int, *optional*, defaults to 3): Number of channels in the output.
51
- down_block_types (`Tuple[str]`, *optional*, defaults to `("DownEncoderBlock2D",)`):
52
- Tuple of downsample block types.
53
- up_block_types (`Tuple[str]`, *optional*, defaults to `("UpDecoderBlock2D",)`):
54
- Tuple of upsample block types.
55
- block_out_channels (`Tuple[int]`, *optional*, defaults to `(64,)`):
56
- Tuple of block output channels.
57
- act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use.
58
- latent_channels (`int`, *optional*, defaults to 4): Number of channels in the latent space.
59
- sample_size (`int`, *optional*, defaults to `32`): Sample input size.
60
- scaling_factor (`float`, *optional*, defaults to 0.18215):
61
- The component-wise standard deviation of the trained latent space computed using the first batch of the
62
- training set. This is used to scale the latent space to have unit variance when training the diffusion
63
- model. The latents are scaled with the formula `z = z * scaling_factor` before being passed to the
64
- diffusion model. When decoding, the latents are scaled back to the original scale with the formula: `z = 1
65
- / scaling_factor * z`. For more details, refer to sections 4.3.2 and D.1 of the [High-Resolution Image
66
- Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752) paper.
67
- """
68
-
69
- _supports_gradient_checkpointing = True
70
-
71
- @register_to_config
72
- def __init__(
73
- self,
74
- in_channels: int = 3,
75
- out_channels: int = 3,
76
- down_block_types: Tuple[str] = ("DownEncoderBlock2D",),
77
- up_block_types: Tuple[str] = ("UpDecoderBlock2D",),
78
- block_out_channels: Tuple[int] = (64,),
79
- layers_per_block: int = 1,
80
- act_fn: str = "silu",
81
- latent_channels: int = 4,
82
- norm_num_groups: int = 32,
83
- sample_size: int = 32,
84
- scaling_factor: float = 0.18215,
85
- ):
86
- super().__init__()
87
-
88
- # pass init params to Encoder
89
- self.encoder = Encoder(
90
- in_channels=in_channels,
91
- out_channels=latent_channels,
92
- down_block_types=down_block_types,
93
- block_out_channels=block_out_channels,
94
- layers_per_block=layers_per_block,
95
- act_fn=act_fn,
96
- norm_num_groups=norm_num_groups,
97
- double_z=True,
98
- )
99
-
100
- # pass init params to Decoder
101
- self.decoder = Decoder(
102
- in_channels=latent_channels,
103
- out_channels=out_channels,
104
- up_block_types=up_block_types,
105
- block_out_channels=block_out_channels,
106
- layers_per_block=layers_per_block,
107
- norm_num_groups=norm_num_groups,
108
- act_fn=act_fn,
109
- )
110
-
111
- self.quant_conv = nn.Conv2d(2 * latent_channels, 2 * latent_channels, 1)
112
- self.post_quant_conv = nn.Conv2d(latent_channels, latent_channels, 1)
113
-
114
- self.use_slicing = False
115
- self.use_tiling = False
116
-
117
- # only relevant if vae tiling is enabled
118
- self.tile_sample_min_size = self.config.sample_size
119
- sample_size = (
120
- self.config.sample_size[0]
121
- if isinstance(self.config.sample_size, (list, tuple))
122
- else self.config.sample_size
123
- )
124
- self.tile_latent_min_size = int(sample_size / (2 ** (len(self.config.block_out_channels) - 1)))
125
- self.tile_overlap_factor = 0.25
126
-
127
- def _set_gradient_checkpointing(self, module, value=False):
128
- if isinstance(module, (Encoder, Decoder)):
129
- module.gradient_checkpointing = value
130
-
131
- def enable_tiling(self, use_tiling: bool = True):
132
- r"""
133
- Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
134
- compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
135
- processing larger images.
136
- """
137
- self.use_tiling = use_tiling
138
-
139
- def disable_tiling(self):
140
- r"""
141
- Disable tiled VAE decoding. If `enable_tiling` was previously enabled, this method will go back to computing
142
- decoding in one step.
143
- """
144
- self.enable_tiling(False)
145
-
146
- def enable_slicing(self):
147
- r"""
148
- Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
149
- compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
150
- """
151
- self.use_slicing = True
152
-
153
- def disable_slicing(self):
154
- r"""
155
- Disable sliced VAE decoding. If `enable_slicing` was previously enabled, this method will go back to computing
156
- decoding in one step.
157
- """
158
- self.use_slicing = False
159
-
160
- @property
161
- # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.attn_processors
162
- def attn_processors(self) -> Dict[str, AttentionProcessor]:
163
- r"""
164
- Returns:
165
- `dict` of attention processors: A dictionary containing all attention processors used in the model,
166
- indexed by its weight name.
167
- """
168
- # set recursively
169
- processors = {}
170
-
171
- def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
172
- if hasattr(module, "set_processor"):
173
- processors[f"{name}.processor"] = module.processor
174
-
175
- for sub_name, child in module.named_children():
176
- fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
177
-
178
- return processors
179
-
180
- for name, module in self.named_children():
181
- fn_recursive_add_processors(name, module, processors)
182
-
183
- return processors
184
-
185
- # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.set_attn_processor
186
- def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
187
- r"""
188
- Sets the attention processor to use to compute attention.
189
-
190
- Parameters:
191
- processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
192
- The instantiated processor class or a dictionary of processor classes that will be set as the processor
193
- for **all** `Attention` layers.
194
-
195
- If `processor` is a dict, the key needs to define the path to the corresponding cross attention
196
- processor. This is strongly recommended when setting trainable attention processors.
197
-
198
- """
199
- count = len(self.attn_processors.keys())
200
-
201
- if isinstance(processor, dict) and len(processor) != count:
202
- raise ValueError(
203
- f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
204
- f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
205
- )
206
-
207
- def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
208
- if hasattr(module, "set_processor"):
209
- if not isinstance(processor, dict):
210
- module.set_processor(processor)
211
- else:
212
- module.set_processor(processor.pop(f"{name}.processor"))
213
-
214
- for sub_name, child in module.named_children():
215
- fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
216
-
217
- for name, module in self.named_children():
218
- fn_recursive_attn_processor(name, module, processor)
219
-
220
- # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.set_default_attn_processor
221
- def set_default_attn_processor(self):
222
- """
223
- Disables custom attention processors and sets the default attention implementation.
224
- """
225
- self.set_attn_processor(AttnProcessor())
226
-
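`attn_processors` and `set_attn_processor` are meant to be used together: read the dict of processors keyed by weight name, then pass back a single processor for every layer or a dict reusing the same keys. A hedged sketch, assuming the upstream `diffusers` package (rather than this vendored copy) is importable and PyTorch 2.0 is installed:

from diffusers import AutoencoderKL
from diffusers.models.attention_processor import AttnProcessor2_0

vae = AutoencoderKL()  # randomly initialised, default config; enough to have attention layers

processors = vae.attn_processors
print(len(processors), "attention layers:", list(processors))

# One processor instance applied to every attention layer...
vae.set_attn_processor(AttnProcessor2_0())

# ...or a dict keyed by the same names for per-layer control.
vae.set_attn_processor({name: AttnProcessor2_0() for name in vae.attn_processors})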
227
- @apply_forward_hook
228
- def encode(self, x: torch.FloatTensor, return_dict: bool = True) -> AutoencoderKLOutput:
229
- if self.use_tiling and (x.shape[-1] > self.tile_sample_min_size or x.shape[-2] > self.tile_sample_min_size):
230
- return self.tiled_encode(x, return_dict=return_dict)
231
-
232
- if self.use_slicing and x.shape[0] > 1:
233
- encoded_slices = [self.encoder(x_slice) for x_slice in x.split(1)]
234
- h = torch.cat(encoded_slices)
235
- else:
236
- h = self.encoder(x)
237
-
238
- moments = self.quant_conv(h)
239
- posterior = DiagonalGaussianDistribution(moments)
240
-
241
- if not return_dict:
242
- return (posterior,)
243
-
244
- return AutoencoderKLOutput(latent_dist=posterior)
245
-
246
- def _decode(self, z: torch.FloatTensor, return_dict: bool = True) -> Union[DecoderOutput, torch.FloatTensor]:
247
- if self.use_tiling and (z.shape[-1] > self.tile_latent_min_size or z.shape[-2] > self.tile_latent_min_size):
248
- return self.tiled_decode(z, return_dict=return_dict)
249
-
250
- z = self.post_quant_conv(z)
251
- dec = self.decoder(z)
252
-
253
- if not return_dict:
254
- return (dec,)
255
-
256
- return DecoderOutput(sample=dec)
257
-
258
- @apply_forward_hook
259
- def decode(self, z: torch.FloatTensor, return_dict: bool = True) -> Union[DecoderOutput, torch.FloatTensor]:
260
- if self.use_slicing and z.shape[0] > 1:
261
- decoded_slices = [self._decode(z_slice).sample for z_slice in z.split(1)]
262
- decoded = torch.cat(decoded_slices)
263
- else:
264
- decoded = self._decode(z).sample
265
-
266
- if not return_dict:
267
- return (decoded,)
268
-
269
- return DecoderOutput(sample=decoded)
270
-
271
- def blend_v(self, a, b, blend_extent):
272
- blend_extent = min(a.shape[2], b.shape[2], blend_extent)
273
- for y in range(blend_extent):
274
- b[:, :, y, :] = a[:, :, -blend_extent + y, :] * (1 - y / blend_extent) + b[:, :, y, :] * (y / blend_extent)
275
- return b
276
-
277
- def blend_h(self, a, b, blend_extent):
278
- blend_extent = min(a.shape[3], b.shape[3], blend_extent)
279
- for x in range(blend_extent):
280
- b[:, :, :, x] = a[:, :, :, -blend_extent + x] * (1 - x / blend_extent) + b[:, :, :, x] * (x / blend_extent)
281
- return b
282
-
283
- def tiled_encode(self, x: torch.FloatTensor, return_dict: bool = True) -> AutoencoderKLOutput:
284
- r"""Encode a batch of images using a tiled encoder.
285
-
286
- When this option is enabled, the VAE will split the input tensor into tiles to compute encoding in several
287
- steps. This is useful to keep memory use constant regardless of image size. The end result of tiled encoding is
288
- different from non-tiled encoding because each tile uses a different encoder. To avoid tiling artifacts, the
289
- tiles overlap and are blended together to form a smooth output. You may still see tile-sized changes in the
290
- output, but they should be much less noticeable.
291
-
292
- Args:
293
- x (`torch.FloatTensor`): Input batch of images.
294
- return_dict (`bool`, *optional*, defaults to `True`):
295
- Whether or not to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple.
296
-
297
- Returns:
298
- [`~models.autoencoder_kl.AutoencoderKLOutput`] or `tuple`:
299
- If return_dict is True, a [`~models.autoencoder_kl.AutoencoderKLOutput`] is returned, otherwise a plain
300
- `tuple` is returned.
301
- """
302
- overlap_size = int(self.tile_sample_min_size * (1 - self.tile_overlap_factor))
303
- blend_extent = int(self.tile_latent_min_size * self.tile_overlap_factor)
304
- row_limit = self.tile_latent_min_size - blend_extent
305
-
306
- # Split the image into 512x512 tiles and encode them separately.
307
- rows = []
308
- for i in range(0, x.shape[2], overlap_size):
309
- row = []
310
- for j in range(0, x.shape[3], overlap_size):
311
- tile = x[:, :, i : i + self.tile_sample_min_size, j : j + self.tile_sample_min_size]
312
- tile = self.encoder(tile)
313
- tile = self.quant_conv(tile)
314
- row.append(tile)
315
- rows.append(row)
316
- result_rows = []
317
- for i, row in enumerate(rows):
318
- result_row = []
319
- for j, tile in enumerate(row):
320
- # blend the above tile and the left tile
321
- # to the current tile and add the current tile to the result row
322
- if i > 0:
323
- tile = self.blend_v(rows[i - 1][j], tile, blend_extent)
324
- if j > 0:
325
- tile = self.blend_h(row[j - 1], tile, blend_extent)
326
- result_row.append(tile[:, :, :row_limit, :row_limit])
327
- result_rows.append(torch.cat(result_row, dim=3))
328
-
329
- moments = torch.cat(result_rows, dim=2)
330
- posterior = DiagonalGaussianDistribution(moments)
331
-
332
- if not return_dict:
333
- return (posterior,)
334
-
335
- return AutoencoderKLOutput(latent_dist=posterior)
336
-
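The seam hiding relies on `blend_v`/`blend_h`: over the `blend_extent` overlap the previous tile is faded out linearly while the current tile is faded in. A tiny numeric illustration of the horizontal blend (the helper is restated here so the snippet runs on its own):

import torch

def blend_h(a, b, blend_extent):
    # Same linear cross-fade as the method above, on (batch, channels, height, width) tensors.
    blend_extent = min(a.shape[3], b.shape[3], blend_extent)
    for x in range(blend_extent):
        b[:, :, :, x] = a[:, :, :, -blend_extent + x] * (1 - x / blend_extent) + b[:, :, :, x] * (x / blend_extent)
    return b

left = torch.zeros(1, 1, 1, 8)   # right edge of the previous tile (all zeros)
right = torch.ones(1, 1, 1, 8)   # left edge of the current tile (all ones)
blended = blend_h(left, right.clone(), blend_extent=4)
print(blended[0, 0, 0])  # tensor([0.0000, 0.2500, 0.5000, 0.7500, 1.0000, 1.0000, 1.0000, 1.0000])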
337
- def tiled_decode(self, z: torch.FloatTensor, return_dict: bool = True) -> Union[DecoderOutput, torch.FloatTensor]:
338
- r"""
339
- Decode a batch of images using a tiled decoder.
340
-
341
- Args:
342
- z (`torch.FloatTensor`): Input batch of latent vectors.
343
- return_dict (`bool`, *optional*, defaults to `True`):
344
- Whether or not to return a [`~models.vae.DecoderOutput`] instead of a plain tuple.
345
-
346
- Returns:
347
- [`~models.vae.DecoderOutput`] or `tuple`:
348
- If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple` is
349
- returned.
350
- """
351
- overlap_size = int(self.tile_latent_min_size * (1 - self.tile_overlap_factor))
352
- blend_extent = int(self.tile_sample_min_size * self.tile_overlap_factor)
353
- row_limit = self.tile_sample_min_size - blend_extent
354
-
355
- # Split z into overlapping 64x64 tiles and decode them separately.
356
- # The tiles have an overlap to avoid seams between tiles.
357
- rows = []
358
- for i in range(0, z.shape[2], overlap_size):
359
- row = []
360
- for j in range(0, z.shape[3], overlap_size):
361
- tile = z[:, :, i : i + self.tile_latent_min_size, j : j + self.tile_latent_min_size]
362
- tile = self.post_quant_conv(tile)
363
- decoded = self.decoder(tile)
364
- row.append(decoded)
365
- rows.append(row)
366
- result_rows = []
367
- for i, row in enumerate(rows):
368
- result_row = []
369
- for j, tile in enumerate(row):
370
- # blend the above tile and the left tile
371
- # to the current tile and add the current tile to the result row
372
- if i > 0:
373
- tile = self.blend_v(rows[i - 1][j], tile, blend_extent)
374
- if j > 0:
375
- tile = self.blend_h(row[j - 1], tile, blend_extent)
376
- result_row.append(tile[:, :, :row_limit, :row_limit])
377
- result_rows.append(torch.cat(result_row, dim=3))
378
-
379
- dec = torch.cat(result_rows, dim=2)
380
- if not return_dict:
381
- return (dec,)
382
-
383
- return DecoderOutput(sample=dec)
384
-
385
- def forward(
386
- self,
387
- sample: torch.FloatTensor,
388
- sample_posterior: bool = False,
389
- return_dict: bool = True,
390
- generator: Optional[torch.Generator] = None,
391
- ) -> Union[DecoderOutput, torch.FloatTensor]:
392
- r"""
393
- Args:
394
- sample (`torch.FloatTensor`): Input sample.
395
- sample_posterior (`bool`, *optional*, defaults to `False`):
396
- Whether to sample from the posterior.
397
- return_dict (`bool`, *optional*, defaults to `True`):
398
- Whether or not to return a [`DecoderOutput`] instead of a plain tuple.
399
- """
400
- x = sample
401
- posterior = self.encode(x).latent_dist
402
- if sample_posterior:
403
- z = posterior.sample(generator=generator)
404
- else:
405
- z = posterior.mode()
406
- dec = self.decode(z).sample
407
-
408
- if not return_dict:
409
- return (dec,)
410
-
411
- return DecoderOutput(sample=dec)
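Putting the pieces together, typical usage follows the `scaling_factor` convention from the class docstring: multiply latents by it after encoding and divide before decoding. A hedged sketch, assuming the upstream `diffusers` package is importable and a VAE checkpoint such as `stabilityai/sd-vae-ft-mse` (an assumed example id) can be downloaded:

import torch
from diffusers import AutoencoderKL

vae = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-mse")
vae.enable_tiling()    # optional: tile large inputs to keep memory bounded
vae.enable_slicing()   # optional: decode one sample at a time

image = torch.randn(1, 3, 512, 512)  # stand-in for a preprocessed image in [-1, 1]
with torch.no_grad():
    latents = vae.encode(image).latent_dist.sample() * vae.config.scaling_factor
    recon = vae.decode(latents / vae.config.scaling_factor).sample

print(latents.shape, recon.shape)  # torch.Size([1, 4, 64, 64]) torch.Size([1, 3, 512, 512])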
4DoF/diffusers/models/controlnet.py DELETED
@@ -1,705 +0,0 @@
1
- # Copyright 2023 The HuggingFace Team. All rights reserved.
2
- #
3
- # Licensed under the Apache License, Version 2.0 (the "License");
4
- # you may not use this file except in compliance with the License.
5
- # You may obtain a copy of the License at
6
- #
7
- # http://www.apache.org/licenses/LICENSE-2.0
8
- #
9
- # Unless required by applicable law or agreed to in writing, software
10
- # distributed under the License is distributed on an "AS IS" BASIS,
11
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- # See the License for the specific language governing permissions and
13
- # limitations under the License.
14
- from dataclasses import dataclass
15
- from typing import Any, Dict, List, Optional, Tuple, Union
16
-
17
- import torch
18
- from torch import nn
19
- from torch.nn import functional as F
20
-
21
- from ..configuration_utils import ConfigMixin, register_to_config
22
- from ..utils import BaseOutput, logging
23
- from .attention_processor import AttentionProcessor, AttnProcessor
24
- from .embeddings import TimestepEmbedding, Timesteps
25
- from .modeling_utils import ModelMixin
26
- from .unet_2d_blocks import (
27
- CrossAttnDownBlock2D,
28
- DownBlock2D,
29
- UNetMidBlock2DCrossAttn,
30
- get_down_block,
31
- )
32
- from .unet_2d_condition import UNet2DConditionModel
33
-
34
-
35
- logger = logging.get_logger(__name__) # pylint: disable=invalid-name
36
-
37
-
38
- @dataclass
39
- class ControlNetOutput(BaseOutput):
40
- """
41
- The output of [`ControlNetModel`].
42
-
43
- Args:
44
- down_block_res_samples (`tuple[torch.Tensor]`):
45
- A tuple of downsample activations at different resolutions for each downsampling block. Each tensor should
46
- be of shape `(batch_size, channel * resolution, height // resolution, width // resolution)`. Output can be
47
- used to condition the original UNet's downsampling activations.
48
- mid_down_block_re_sample (`torch.Tensor`):
49
- The activation of the midde block (the lowest sample resolution). Each tensor should be of shape
50
- `(batch_size, channel * lowest_resolution, height // lowest_resolution, width // lowest_resolution)`.
51
- Output can be used to condition the original UNet's middle block activation.
52
- """
53
-
54
- down_block_res_samples: Tuple[torch.Tensor]
55
- mid_block_res_sample: torch.Tensor
56
-
57
-
58
- class ControlNetConditioningEmbedding(nn.Module):
59
- """
60
- Quoting from https://arxiv.org/abs/2302.05543: "Stable Diffusion uses a pre-processing method similar to VQ-GAN
61
- [11] to convert the entire dataset of 512 × 512 images into smaller 64 × 64 “latent images” for stabilized
62
- training. This requires ControlNets to convert image-based conditions to 64 × 64 feature space to match the
63
- convolution size. We use a tiny network E(·) of four convolution layers with 4 × 4 kernels and 2 × 2 strides
64
- (activated by ReLU, channels are 16, 32, 64, 128, initialized with Gaussian weights, trained jointly with the full
65
- model) to encode image-space conditions ... into feature maps ..."
66
- """
67
-
68
- def __init__(
69
- self,
70
- conditioning_embedding_channels: int,
71
- conditioning_channels: int = 3,
72
- block_out_channels: Tuple[int] = (16, 32, 96, 256),
73
- ):
74
- super().__init__()
75
-
76
- self.conv_in = nn.Conv2d(conditioning_channels, block_out_channels[0], kernel_size=3, padding=1)
77
-
78
- self.blocks = nn.ModuleList([])
79
-
80
- for i in range(len(block_out_channels) - 1):
81
- channel_in = block_out_channels[i]
82
- channel_out = block_out_channels[i + 1]
83
- self.blocks.append(nn.Conv2d(channel_in, channel_in, kernel_size=3, padding=1))
84
- self.blocks.append(nn.Conv2d(channel_in, channel_out, kernel_size=3, padding=1, stride=2))
85
-
86
- self.conv_out = zero_module(
87
- nn.Conv2d(block_out_channels[-1], conditioning_embedding_channels, kernel_size=3, padding=1)
88
- )
89
-
90
- def forward(self, conditioning):
91
- embedding = self.conv_in(conditioning)
92
- embedding = F.silu(embedding)
93
-
94
- for block in self.blocks:
95
- embedding = block(embedding)
96
- embedding = F.silu(embedding)
97
-
98
- embedding = self.conv_out(embedding)
99
-
100
- return embedding
101
-
102
-
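With the default `block_out_channels=(16, 32, 96, 256)` there are three stride-2 convolutions, so a 512x512 conditioning image comes out as a 64x64 feature map, matching the 64x64 latent space mentioned in the docstring. A flat shape-check sketch of the same layer pattern (the 320 output channels are an assumption matching the first UNet block):

import torch
import torch.nn as nn
import torch.nn.functional as F

channels = (16, 32, 96, 256)
conv_in = nn.Conv2d(3, channels[0], kernel_size=3, padding=1)
blocks = nn.ModuleList()
for c_in, c_out in zip(channels[:-1], channels[1:]):
    blocks.append(nn.Conv2d(c_in, c_in, kernel_size=3, padding=1))
    blocks.append(nn.Conv2d(c_in, c_out, kernel_size=3, padding=1, stride=2))
conv_out = nn.Conv2d(channels[-1], 320, kernel_size=3, padding=1)

cond = torch.randn(1, 3, 512, 512)  # conditioning image, e.g. canny edges or depth
x = F.silu(conv_in(cond))
for block in blocks:
    x = F.silu(block(x))
print(conv_out(x).shape)  # torch.Size([1, 320, 64, 64])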
103
- class ControlNetModel(ModelMixin, ConfigMixin):
104
- """
105
- A ControlNet model.
106
-
107
- Args:
108
- in_channels (`int`, defaults to 4):
109
- The number of channels in the input sample.
110
- flip_sin_to_cos (`bool`, defaults to `True`):
111
- Whether to flip the sin to cos in the time embedding.
112
- freq_shift (`int`, defaults to 0):
113
- The frequency shift to apply to the time embedding.
114
- down_block_types (`tuple[str]`, defaults to `("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D")`):
115
- The tuple of downsample blocks to use.
116
- only_cross_attention (`Union[bool, Tuple[bool]]`, defaults to `False`):
117
- block_out_channels (`tuple[int]`, defaults to `(320, 640, 1280, 1280)`):
118
- The tuple of output channels for each block.
119
- layers_per_block (`int`, defaults to 2):
120
- The number of layers per block.
121
- downsample_padding (`int`, defaults to 1):
122
- The padding to use for the downsampling convolution.
123
- mid_block_scale_factor (`float`, defaults to 1):
124
- The scale factor to use for the mid block.
125
- act_fn (`str`, defaults to "silu"):
126
- The activation function to use.
127
- norm_num_groups (`int`, *optional*, defaults to 32):
128
- The number of groups to use for the normalization. If None, normalization and activation layers are skipped
129
- in post-processing.
130
- norm_eps (`float`, defaults to 1e-5):
131
- The epsilon to use for the normalization.
132
- cross_attention_dim (`int`, defaults to 1280):
133
- The dimension of the cross attention features.
134
- attention_head_dim (`Union[int, Tuple[int]]`, defaults to 8):
135
- The dimension of the attention heads.
136
- use_linear_projection (`bool`, defaults to `False`):
137
- class_embed_type (`str`, *optional*, defaults to `None`):
138
- The type of class embedding to use which is ultimately summed with the time embeddings. Choose from None,
139
- `"timestep"`, `"identity"`, `"projection"`, or `"simple_projection"`.
140
- num_class_embeds (`int`, *optional*, defaults to 0):
141
- Input dimension of the learnable embedding matrix to be projected to `time_embed_dim`, when performing
142
- class conditioning with `class_embed_type` equal to `None`.
143
- upcast_attention (`bool`, defaults to `False`):
144
- resnet_time_scale_shift (`str`, defaults to `"default"`):
145
- Time scale shift config for ResNet blocks (see `ResnetBlock2D`). Choose from `default` or `scale_shift`.
146
- projection_class_embeddings_input_dim (`int`, *optional*, defaults to `None`):
147
- The dimension of the `class_labels` input when `class_embed_type="projection"`. Required when
148
- `class_embed_type="projection"`.
149
- controlnet_conditioning_channel_order (`str`, defaults to `"rgb"`):
150
- The channel order of conditional image. Will convert to `rgb` if it's `bgr`.
151
- conditioning_embedding_out_channels (`tuple[int]`, *optional*, defaults to `(16, 32, 96, 256)`):
152
- The tuple of output channel for each block in the `conditioning_embedding` layer.
153
- global_pool_conditions (`bool`, defaults to `False`):
154
- """
155
-
156
- _supports_gradient_checkpointing = True
157
-
158
- @register_to_config
159
- def __init__(
160
- self,
161
- in_channels: int = 4,
162
- conditioning_channels: int = 3,
163
- flip_sin_to_cos: bool = True,
164
- freq_shift: int = 0,
165
- down_block_types: Tuple[str] = (
166
- "CrossAttnDownBlock2D",
167
- "CrossAttnDownBlock2D",
168
- "CrossAttnDownBlock2D",
169
- "DownBlock2D",
170
- ),
171
- only_cross_attention: Union[bool, Tuple[bool]] = False,
172
- block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
173
- layers_per_block: int = 2,
174
- downsample_padding: int = 1,
175
- mid_block_scale_factor: float = 1,
176
- act_fn: str = "silu",
177
- norm_num_groups: Optional[int] = 32,
178
- norm_eps: float = 1e-5,
179
- cross_attention_dim: int = 1280,
180
- attention_head_dim: Union[int, Tuple[int]] = 8,
181
- num_attention_heads: Optional[Union[int, Tuple[int]]] = None,
182
- use_linear_projection: bool = False,
183
- class_embed_type: Optional[str] = None,
184
- num_class_embeds: Optional[int] = None,
185
- upcast_attention: bool = False,
186
- resnet_time_scale_shift: str = "default",
187
- projection_class_embeddings_input_dim: Optional[int] = None,
188
- controlnet_conditioning_channel_order: str = "rgb",
189
- conditioning_embedding_out_channels: Optional[Tuple[int]] = (16, 32, 96, 256),
190
- global_pool_conditions: bool = False,
191
- ):
192
- super().__init__()
193
-
194
- # If `num_attention_heads` is not defined (which is the case for most models)
195
- # it will default to `attention_head_dim`. This looks weird upon first reading it and it is.
196
- # The reason for this behavior is to correct for incorrectly named variables that were introduced
197
- # when this library was created. The incorrect naming was only discovered much later in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131
198
- # Changing `attention_head_dim` to `num_attention_heads` for 40,000+ configurations is too backwards breaking
199
- # which is why we correct for the naming here.
200
- num_attention_heads = num_attention_heads or attention_head_dim
201
-
202
- # Check inputs
203
- if len(block_out_channels) != len(down_block_types):
204
- raise ValueError(
205
- f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}."
206
- )
207
-
208
- if not isinstance(only_cross_attention, bool) and len(only_cross_attention) != len(down_block_types):
209
- raise ValueError(
210
- f"Must provide the same number of `only_cross_attention` as `down_block_types`. `only_cross_attention`: {only_cross_attention}. `down_block_types`: {down_block_types}."
211
- )
212
-
213
- if not isinstance(num_attention_heads, int) and len(num_attention_heads) != len(down_block_types):
214
- raise ValueError(
215
- f"Must provide the same number of `num_attention_heads` as `down_block_types`. `num_attention_heads`: {num_attention_heads}. `down_block_types`: {down_block_types}."
216
- )
217
-
218
- # input
219
- conv_in_kernel = 3
220
- conv_in_padding = (conv_in_kernel - 1) // 2
221
- self.conv_in = nn.Conv2d(
222
- in_channels, block_out_channels[0], kernel_size=conv_in_kernel, padding=conv_in_padding
223
- )
224
-
225
- # time
226
- time_embed_dim = block_out_channels[0] * 4
227
-
228
- self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift)
229
- timestep_input_dim = block_out_channels[0]
230
-
231
- self.time_embedding = TimestepEmbedding(
232
- timestep_input_dim,
233
- time_embed_dim,
234
- act_fn=act_fn,
235
- )
236
-
237
- # class embedding
238
- if class_embed_type is None and num_class_embeds is not None:
239
- self.class_embedding = nn.Embedding(num_class_embeds, time_embed_dim)
240
- elif class_embed_type == "timestep":
241
- self.class_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim)
242
- elif class_embed_type == "identity":
243
- self.class_embedding = nn.Identity(time_embed_dim, time_embed_dim)
244
- elif class_embed_type == "projection":
245
- if projection_class_embeddings_input_dim is None:
246
- raise ValueError(
247
- "`class_embed_type`: 'projection' requires `projection_class_embeddings_input_dim` be set"
248
- )
249
- # The projection `class_embed_type` is the same as the timestep `class_embed_type` except
250
- # 1. the `class_labels` inputs are not first converted to sinusoidal embeddings
251
- # 2. it projects from an arbitrary input dimension.
252
- #
253
- # Note that `TimestepEmbedding` is quite general, being mainly linear layers and activations.
254
- # When used for embedding actual timesteps, the timesteps are first converted to sinusoidal embeddings.
255
- # As a result, `TimestepEmbedding` can be passed arbitrary vectors.
256
- self.class_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim)
257
- else:
258
- self.class_embedding = None
259
-
260
- # control net conditioning embedding
261
- self.controlnet_cond_embedding = ControlNetConditioningEmbedding(
262
- conditioning_embedding_channels=block_out_channels[0],
263
- block_out_channels=conditioning_embedding_out_channels,
264
- conditioning_channels=conditioning_channels,
265
- )
266
-
267
- self.down_blocks = nn.ModuleList([])
268
- self.controlnet_down_blocks = nn.ModuleList([])
269
-
270
- if isinstance(only_cross_attention, bool):
271
- only_cross_attention = [only_cross_attention] * len(down_block_types)
272
-
273
- if isinstance(attention_head_dim, int):
274
- attention_head_dim = (attention_head_dim,) * len(down_block_types)
275
-
276
- if isinstance(num_attention_heads, int):
277
- num_attention_heads = (num_attention_heads,) * len(down_block_types)
278
-
279
- # down
280
- output_channel = block_out_channels[0]
281
-
282
- controlnet_block = nn.Conv2d(output_channel, output_channel, kernel_size=1)
283
- controlnet_block = zero_module(controlnet_block)
284
- self.controlnet_down_blocks.append(controlnet_block)
285
-
286
- for i, down_block_type in enumerate(down_block_types):
287
- input_channel = output_channel
288
- output_channel = block_out_channels[i]
289
- is_final_block = i == len(block_out_channels) - 1
290
-
291
- down_block = get_down_block(
292
- down_block_type,
293
- num_layers=layers_per_block,
294
- in_channels=input_channel,
295
- out_channels=output_channel,
296
- temb_channels=time_embed_dim,
297
- add_downsample=not is_final_block,
298
- resnet_eps=norm_eps,
299
- resnet_act_fn=act_fn,
300
- resnet_groups=norm_num_groups,
301
- cross_attention_dim=cross_attention_dim,
302
- num_attention_heads=num_attention_heads[i],
303
- attention_head_dim=attention_head_dim[i] if attention_head_dim[i] is not None else output_channel,
304
- downsample_padding=downsample_padding,
305
- use_linear_projection=use_linear_projection,
306
- only_cross_attention=only_cross_attention[i],
307
- upcast_attention=upcast_attention,
308
- resnet_time_scale_shift=resnet_time_scale_shift,
309
- )
310
- self.down_blocks.append(down_block)
311
-
312
- for _ in range(layers_per_block):
313
- controlnet_block = nn.Conv2d(output_channel, output_channel, kernel_size=1)
314
- controlnet_block = zero_module(controlnet_block)
315
- self.controlnet_down_blocks.append(controlnet_block)
316
-
317
- if not is_final_block:
318
- controlnet_block = nn.Conv2d(output_channel, output_channel, kernel_size=1)
319
- controlnet_block = zero_module(controlnet_block)
320
- self.controlnet_down_blocks.append(controlnet_block)
321
-
322
- # mid
323
- mid_block_channel = block_out_channels[-1]
324
-
325
- controlnet_block = nn.Conv2d(mid_block_channel, mid_block_channel, kernel_size=1)
326
- controlnet_block = zero_module(controlnet_block)
327
- self.controlnet_mid_block = controlnet_block
328
-
329
- self.mid_block = UNetMidBlock2DCrossAttn(
330
- in_channels=mid_block_channel,
331
- temb_channels=time_embed_dim,
332
- resnet_eps=norm_eps,
333
- resnet_act_fn=act_fn,
334
- output_scale_factor=mid_block_scale_factor,
335
- resnet_time_scale_shift=resnet_time_scale_shift,
336
- cross_attention_dim=cross_attention_dim,
337
- num_attention_heads=num_attention_heads[-1],
338
- resnet_groups=norm_num_groups,
339
- use_linear_projection=use_linear_projection,
340
- upcast_attention=upcast_attention,
341
- )
342
-
343
- @classmethod
344
- def from_unet(
345
- cls,
346
- unet: UNet2DConditionModel,
347
- controlnet_conditioning_channel_order: str = "rgb",
348
- conditioning_embedding_out_channels: Optional[Tuple[int]] = (16, 32, 96, 256),
349
- load_weights_from_unet: bool = True,
350
- ):
351
- r"""
352
- Instantiate a [`ControlNetModel`] from [`UNet2DConditionModel`].
353
-
354
- Parameters:
355
- unet (`UNet2DConditionModel`):
356
- The UNet model weights to copy to the [`ControlNetModel`]. All configuration options are also copied
357
- where applicable.
358
- """
359
- controlnet = cls(
360
- in_channels=unet.config.in_channels,
361
- flip_sin_to_cos=unet.config.flip_sin_to_cos,
362
- freq_shift=unet.config.freq_shift,
363
- down_block_types=unet.config.down_block_types,
364
- only_cross_attention=unet.config.only_cross_attention,
365
- block_out_channels=unet.config.block_out_channels,
366
- layers_per_block=unet.config.layers_per_block,
367
- downsample_padding=unet.config.downsample_padding,
368
- mid_block_scale_factor=unet.config.mid_block_scale_factor,
369
- act_fn=unet.config.act_fn,
370
- norm_num_groups=unet.config.norm_num_groups,
371
- norm_eps=unet.config.norm_eps,
372
- cross_attention_dim=unet.config.cross_attention_dim,
373
- attention_head_dim=unet.config.attention_head_dim,
374
- num_attention_heads=unet.config.num_attention_heads,
375
- use_linear_projection=unet.config.use_linear_projection,
376
- class_embed_type=unet.config.class_embed_type,
377
- num_class_embeds=unet.config.num_class_embeds,
378
- upcast_attention=unet.config.upcast_attention,
379
- resnet_time_scale_shift=unet.config.resnet_time_scale_shift,
380
- projection_class_embeddings_input_dim=unet.config.projection_class_embeddings_input_dim,
381
- controlnet_conditioning_channel_order=controlnet_conditioning_channel_order,
382
- conditioning_embedding_out_channels=conditioning_embedding_out_channels,
383
- )
384
-
385
- if load_weights_from_unet:
386
- controlnet.conv_in.load_state_dict(unet.conv_in.state_dict())
387
- controlnet.time_proj.load_state_dict(unet.time_proj.state_dict())
388
- controlnet.time_embedding.load_state_dict(unet.time_embedding.state_dict())
389
-
390
- if controlnet.class_embedding:
391
- controlnet.class_embedding.load_state_dict(unet.class_embedding.state_dict())
392
-
393
- controlnet.down_blocks.load_state_dict(unet.down_blocks.state_dict())
394
- controlnet.mid_block.load_state_dict(unet.mid_block.state_dict())
395
-
396
- return controlnet
397
-
398
- @property
399
- # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.attn_processors
400
- def attn_processors(self) -> Dict[str, AttentionProcessor]:
401
- r"""
402
- Returns:
403
-             `dict` of attention processors: A dictionary containing all attention processors used in the model,
404
-             indexed by their weight names.
405
- """
406
- # set recursively
407
- processors = {}
408
-
409
- def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
410
- if hasattr(module, "set_processor"):
411
- processors[f"{name}.processor"] = module.processor
412
-
413
- for sub_name, child in module.named_children():
414
- fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
415
-
416
- return processors
417
-
418
- for name, module in self.named_children():
419
- fn_recursive_add_processors(name, module, processors)
420
-
421
- return processors
422
-
423
- # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.set_attn_processor
424
- def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
425
- r"""
426
- Sets the attention processor to use to compute attention.
427
-
428
- Parameters:
429
- processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
430
- The instantiated processor class or a dictionary of processor classes that will be set as the processor
431
- for **all** `Attention` layers.
432
-
433
- If `processor` is a dict, the key needs to define the path to the corresponding cross attention
434
- processor. This is strongly recommended when setting trainable attention processors.
435
-
436
- """
437
- count = len(self.attn_processors.keys())
438
-
439
- if isinstance(processor, dict) and len(processor) != count:
440
- raise ValueError(
441
- f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
442
- f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
443
- )
444
-
445
- def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
446
- if hasattr(module, "set_processor"):
447
- if not isinstance(processor, dict):
448
- module.set_processor(processor)
449
- else:
450
- module.set_processor(processor.pop(f"{name}.processor"))
451
-
452
- for sub_name, child in module.named_children():
453
- fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
454
-
455
- for name, module in self.named_children():
456
- fn_recursive_attn_processor(name, module, processor)
457
-
458
- # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.set_default_attn_processor
459
- def set_default_attn_processor(self):
460
- """
461
- Disables custom attention processors and sets the default attention implementation.
462
- """
463
- self.set_attn_processor(AttnProcessor())
464
-
465
- # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.set_attention_slice
466
- def set_attention_slice(self, slice_size):
467
- r"""
468
- Enable sliced attention computation.
469
-
470
- When this option is enabled, the attention module splits the input tensor in slices to compute attention in
471
- several steps. This is useful for saving some memory in exchange for a small decrease in speed.
472
-
473
- Args:
474
- slice_size (`str` or `int` or `list(int)`, *optional*, defaults to `"auto"`):
475
- When `"auto"`, input to the attention heads is halved, so attention is computed in two steps. If
476
- `"max"`, maximum amount of memory is saved by running only one slice at a time. If a number is
477
- provided, uses as many slices as `attention_head_dim // slice_size`. In this case, `attention_head_dim`
478
- must be a multiple of `slice_size`.
479
- """
480
- sliceable_head_dims = []
481
-
482
- def fn_recursive_retrieve_sliceable_dims(module: torch.nn.Module):
483
- if hasattr(module, "set_attention_slice"):
484
- sliceable_head_dims.append(module.sliceable_head_dim)
485
-
486
- for child in module.children():
487
- fn_recursive_retrieve_sliceable_dims(child)
488
-
489
- # retrieve number of attention layers
490
- for module in self.children():
491
- fn_recursive_retrieve_sliceable_dims(module)
492
-
493
- num_sliceable_layers = len(sliceable_head_dims)
494
-
495
- if slice_size == "auto":
496
- # half the attention head size is usually a good trade-off between
497
- # speed and memory
498
- slice_size = [dim // 2 for dim in sliceable_head_dims]
499
- elif slice_size == "max":
500
- # make smallest slice possible
501
- slice_size = num_sliceable_layers * [1]
502
-
503
- slice_size = num_sliceable_layers * [slice_size] if not isinstance(slice_size, list) else slice_size
504
-
505
- if len(slice_size) != len(sliceable_head_dims):
506
- raise ValueError(
507
- f"You have provided {len(slice_size)}, but {self.config} has {len(sliceable_head_dims)} different"
508
- f" attention layers. Make sure to match `len(slice_size)` to be {len(sliceable_head_dims)}."
509
- )
510
-
511
- for i in range(len(slice_size)):
512
- size = slice_size[i]
513
- dim = sliceable_head_dims[i]
514
- if size is not None and size > dim:
515
- raise ValueError(f"size {size} has to be smaller or equal to {dim}.")
516
-
517
- # Recursively walk through all the children.
518
- # Any children which exposes the set_attention_slice method
519
- # gets the message
520
- def fn_recursive_set_attention_slice(module: torch.nn.Module, slice_size: List[int]):
521
- if hasattr(module, "set_attention_slice"):
522
- module.set_attention_slice(slice_size.pop())
523
-
524
- for child in module.children():
525
- fn_recursive_set_attention_slice(child, slice_size)
526
-
527
- reversed_slice_size = list(reversed(slice_size))
528
- for module in self.children():
529
- fn_recursive_set_attention_slice(module, reversed_slice_size)
530
-
531
- def _set_gradient_checkpointing(self, module, value=False):
532
- if isinstance(module, (CrossAttnDownBlock2D, DownBlock2D)):
533
- module.gradient_checkpointing = value
534
-
535
- def forward(
536
- self,
537
- sample: torch.FloatTensor,
538
- timestep: Union[torch.Tensor, float, int],
539
- encoder_hidden_states: torch.Tensor,
540
- controlnet_cond: torch.FloatTensor,
541
- conditioning_scale: float = 1.0,
542
- class_labels: Optional[torch.Tensor] = None,
543
- timestep_cond: Optional[torch.Tensor] = None,
544
- attention_mask: Optional[torch.Tensor] = None,
545
- cross_attention_kwargs: Optional[Dict[str, Any]] = None,
546
- guess_mode: bool = False,
547
- return_dict: bool = True,
548
- ) -> Union[ControlNetOutput, Tuple]:
549
- """
550
- The [`ControlNetModel`] forward method.
551
-
552
- Args:
553
- sample (`torch.FloatTensor`):
554
- The noisy input tensor.
555
- timestep (`Union[torch.Tensor, float, int]`):
556
- The number of timesteps to denoise an input.
557
- encoder_hidden_states (`torch.Tensor`):
558
- The encoder hidden states.
559
- controlnet_cond (`torch.FloatTensor`):
560
-                 The conditional input tensor of shape `(batch_size, channels, height, width)`.
561
- conditioning_scale (`float`, defaults to `1.0`):
562
- The scale factor for ControlNet outputs.
563
- class_labels (`torch.Tensor`, *optional*, defaults to `None`):
564
- Optional class labels for conditioning. Their embeddings will be summed with the timestep embeddings.
565
- timestep_cond (`torch.Tensor`, *optional*, defaults to `None`):
566
- attention_mask (`torch.Tensor`, *optional*, defaults to `None`):
567
- cross_attention_kwargs(`dict[str]`, *optional*, defaults to `None`):
568
- A kwargs dictionary that if specified is passed along to the `AttnProcessor`.
569
- guess_mode (`bool`, defaults to `False`):
570
-                 In this mode, the ControlNet encoder tries its best to recognize the content of the input even if
571
- you remove all prompts. A `guidance_scale` between 3.0 and 5.0 is recommended.
572
- return_dict (`bool`, defaults to `True`):
573
- Whether or not to return a [`~models.controlnet.ControlNetOutput`] instead of a plain tuple.
574
-
575
- Returns:
576
- [`~models.controlnet.ControlNetOutput`] **or** `tuple`:
577
- If `return_dict` is `True`, a [`~models.controlnet.ControlNetOutput`] is returned, otherwise a tuple is
578
- returned where the first element is the sample tensor.
579
- """
580
- # check channel order
581
- channel_order = self.config.controlnet_conditioning_channel_order
582
-
583
- if channel_order == "rgb":
584
- # in rgb order by default
585
- ...
586
- elif channel_order == "bgr":
587
- controlnet_cond = torch.flip(controlnet_cond, dims=[1])
588
- else:
589
- raise ValueError(f"unknown `controlnet_conditioning_channel_order`: {channel_order}")
590
-
591
- # prepare attention_mask
592
- if attention_mask is not None:
593
- attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0
594
- attention_mask = attention_mask.unsqueeze(1)
595
-
596
- # 1. time
597
- timesteps = timestep
598
- if not torch.is_tensor(timesteps):
599
- # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
600
- # This would be a good case for the `match` statement (Python 3.10+)
601
- is_mps = sample.device.type == "mps"
602
- if isinstance(timestep, float):
603
- dtype = torch.float32 if is_mps else torch.float64
604
- else:
605
- dtype = torch.int32 if is_mps else torch.int64
606
- timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)
607
- elif len(timesteps.shape) == 0:
608
- timesteps = timesteps[None].to(sample.device)
609
-
610
- # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
611
- timesteps = timesteps.expand(sample.shape[0])
612
-
613
- t_emb = self.time_proj(timesteps)
614
-
615
- # timesteps does not contain any weights and will always return f32 tensors
616
- # but time_embedding might actually be running in fp16. so we need to cast here.
617
- # there might be better ways to encapsulate this.
618
- t_emb = t_emb.to(dtype=sample.dtype)
619
-
620
- emb = self.time_embedding(t_emb, timestep_cond)
621
-
622
- if self.class_embedding is not None:
623
- if class_labels is None:
624
- raise ValueError("class_labels should be provided when num_class_embeds > 0")
625
-
626
- if self.config.class_embed_type == "timestep":
627
- class_labels = self.time_proj(class_labels)
628
-
629
- class_emb = self.class_embedding(class_labels).to(dtype=self.dtype)
630
- emb = emb + class_emb
631
-
632
- # 2. pre-process
633
- sample = self.conv_in(sample)
634
-
635
- controlnet_cond = self.controlnet_cond_embedding(controlnet_cond)
636
-
637
- sample = sample + controlnet_cond
638
-
639
- # 3. down
640
- down_block_res_samples = (sample,)
641
- for downsample_block in self.down_blocks:
642
- if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
643
- sample, res_samples = downsample_block(
644
- hidden_states=sample,
645
- temb=emb,
646
- encoder_hidden_states=encoder_hidden_states,
647
- attention_mask=attention_mask,
648
- cross_attention_kwargs=cross_attention_kwargs,
649
- )
650
- else:
651
- sample, res_samples = downsample_block(hidden_states=sample, temb=emb)
652
-
653
- down_block_res_samples += res_samples
654
-
655
- # 4. mid
656
- if self.mid_block is not None:
657
- sample = self.mid_block(
658
- sample,
659
- emb,
660
- encoder_hidden_states=encoder_hidden_states,
661
- attention_mask=attention_mask,
662
- cross_attention_kwargs=cross_attention_kwargs,
663
- )
664
-
665
- # 5. Control net blocks
666
-
667
- controlnet_down_block_res_samples = ()
668
-
669
- for down_block_res_sample, controlnet_block in zip(down_block_res_samples, self.controlnet_down_blocks):
670
- down_block_res_sample = controlnet_block(down_block_res_sample)
671
- controlnet_down_block_res_samples = controlnet_down_block_res_samples + (down_block_res_sample,)
672
-
673
- down_block_res_samples = controlnet_down_block_res_samples
674
-
675
- mid_block_res_sample = self.controlnet_mid_block(sample)
676
-
677
- # 6. scaling
678
- if guess_mode and not self.config.global_pool_conditions:
679
- scales = torch.logspace(-1, 0, len(down_block_res_samples) + 1, device=sample.device) # 0.1 to 1.0
680
-
681
- scales = scales * conditioning_scale
682
- down_block_res_samples = [sample * scale for sample, scale in zip(down_block_res_samples, scales)]
683
- mid_block_res_sample = mid_block_res_sample * scales[-1] # last one
684
- else:
685
- down_block_res_samples = [sample * conditioning_scale for sample in down_block_res_samples]
686
- mid_block_res_sample = mid_block_res_sample * conditioning_scale
687
-
688
- if self.config.global_pool_conditions:
689
- down_block_res_samples = [
690
- torch.mean(sample, dim=(2, 3), keepdim=True) for sample in down_block_res_samples
691
- ]
692
- mid_block_res_sample = torch.mean(mid_block_res_sample, dim=(2, 3), keepdim=True)
693
-
694
- if not return_dict:
695
- return (down_block_res_samples, mid_block_res_sample)
696
-
697
- return ControlNetOutput(
698
- down_block_res_samples=down_block_res_samples, mid_block_res_sample=mid_block_res_sample
699
- )
700
-
701
-
702
- def zero_module(module):
703
- for p in module.parameters():
704
- nn.init.zeros_(p)
705
- return module
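
The guess-mode branch of `ControlNetModel.forward` (step 6 above) is the least obvious part of this file: each down-block residual and the mid-block residual get a geometric weight between 0.1 and 1.0 so deeper features dominate, while the default branch applies one flat `conditioning_scale`. A minimal, self-contained sketch of that weighting in plain PyTorch; the residual count and shapes below are made-up placeholders, not values taken from the file:

import torch

# Stand-ins for the residuals a ControlNet forward pass would produce;
# the count (12 down-block residuals) and shapes are illustrative only.
down_block_res_samples = [torch.randn(1, 320, 64, 64) for _ in range(12)]
mid_block_res_sample = torch.randn(1, 1280, 8, 8)

conditioning_scale = 1.0
guess_mode = True

if guess_mode:
    # One weight per down-block residual plus one for the mid block,
    # spaced geometrically from 0.1 to 1.0 (torch.logspace is base 10).
    scales = torch.logspace(-1, 0, len(down_block_res_samples) + 1) * conditioning_scale
    down_block_res_samples = [s * w for s, w in zip(down_block_res_samples, scales)]
    mid_block_res_sample = mid_block_res_sample * scales[-1]
else:
    down_block_res_samples = [s * conditioning_scale for s in down_block_res_samples]
    mid_block_res_sample = mid_block_res_sample * conditioning_scale

print(torch.logspace(-1, 0, 13)[:3])  # tensor([0.1000, 0.1212, 0.1468])
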
4DoF/diffusers/models/controlnet_flax.py DELETED
@@ -1,394 +0,0 @@
1
- # Copyright 2023 The HuggingFace Team. All rights reserved.
2
- #
3
- # Licensed under the Apache License, Version 2.0 (the "License");
4
- # you may not use this file except in compliance with the License.
5
- # You may obtain a copy of the License at
6
- #
7
- # http://www.apache.org/licenses/LICENSE-2.0
8
- #
9
- # Unless required by applicable law or agreed to in writing, software
10
- # distributed under the License is distributed on an "AS IS" BASIS,
11
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- # See the License for the specific language governing permissions and
13
- # limitations under the License.
14
- from typing import Optional, Tuple, Union
15
-
16
- import flax
17
- import flax.linen as nn
18
- import jax
19
- import jax.numpy as jnp
20
- from flax.core.frozen_dict import FrozenDict
21
-
22
- from ..configuration_utils import ConfigMixin, flax_register_to_config
23
- from ..utils import BaseOutput
24
- from .embeddings_flax import FlaxTimestepEmbedding, FlaxTimesteps
25
- from .modeling_flax_utils import FlaxModelMixin
26
- from .unet_2d_blocks_flax import (
27
- FlaxCrossAttnDownBlock2D,
28
- FlaxDownBlock2D,
29
- FlaxUNetMidBlock2DCrossAttn,
30
- )
31
-
32
-
33
- @flax.struct.dataclass
34
- class FlaxControlNetOutput(BaseOutput):
35
- """
36
- The output of [`FlaxControlNetModel`].
37
-
38
- Args:
39
- down_block_res_samples (`jnp.ndarray`):
40
- mid_block_res_sample (`jnp.ndarray`):
41
- """
42
-
43
- down_block_res_samples: jnp.ndarray
44
- mid_block_res_sample: jnp.ndarray
45
-
46
-
47
- class FlaxControlNetConditioningEmbedding(nn.Module):
48
- conditioning_embedding_channels: int
49
- block_out_channels: Tuple[int] = (16, 32, 96, 256)
50
- dtype: jnp.dtype = jnp.float32
51
-
52
- def setup(self):
53
- self.conv_in = nn.Conv(
54
- self.block_out_channels[0],
55
- kernel_size=(3, 3),
56
- padding=((1, 1), (1, 1)),
57
- dtype=self.dtype,
58
- )
59
-
60
- blocks = []
61
- for i in range(len(self.block_out_channels) - 1):
62
- channel_in = self.block_out_channels[i]
63
- channel_out = self.block_out_channels[i + 1]
64
- conv1 = nn.Conv(
65
- channel_in,
66
- kernel_size=(3, 3),
67
- padding=((1, 1), (1, 1)),
68
- dtype=self.dtype,
69
- )
70
- blocks.append(conv1)
71
- conv2 = nn.Conv(
72
- channel_out,
73
- kernel_size=(3, 3),
74
- strides=(2, 2),
75
- padding=((1, 1), (1, 1)),
76
- dtype=self.dtype,
77
- )
78
- blocks.append(conv2)
79
- self.blocks = blocks
80
-
81
- self.conv_out = nn.Conv(
82
- self.conditioning_embedding_channels,
83
- kernel_size=(3, 3),
84
- padding=((1, 1), (1, 1)),
85
- kernel_init=nn.initializers.zeros_init(),
86
- bias_init=nn.initializers.zeros_init(),
87
- dtype=self.dtype,
88
- )
89
-
90
- def __call__(self, conditioning):
91
- embedding = self.conv_in(conditioning)
92
- embedding = nn.silu(embedding)
93
-
94
- for block in self.blocks:
95
- embedding = block(embedding)
96
- embedding = nn.silu(embedding)
97
-
98
- embedding = self.conv_out(embedding)
99
-
100
- return embedding
101
-
102
-
103
- @flax_register_to_config
104
- class FlaxControlNetModel(nn.Module, FlaxModelMixin, ConfigMixin):
105
- r"""
106
- A ControlNet model.
107
-
108
-     This model inherits from [`FlaxModelMixin`]. Check the superclass documentation for its generic methods
109
- implemented for all models (such as downloading or saving).
110
-
111
- This model is also a Flax Linen [`flax.linen.Module`](https://flax.readthedocs.io/en/latest/flax.linen.html#module)
112
- subclass. Use it as a regular Flax Linen module and refer to the Flax documentation for all matters related to its
113
- general usage and behavior.
114
-
115
- Inherent JAX features such as the following are supported:
116
-
117
- - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
118
- - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
119
- - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
120
- - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)
121
-
122
- Parameters:
123
- sample_size (`int`, *optional*):
124
- The size of the input sample.
125
- in_channels (`int`, *optional*, defaults to 4):
126
- The number of channels in the input sample.
127
- down_block_types (`Tuple[str]`, *optional*, defaults to `("FlaxCrossAttnDownBlock2D", "FlaxCrossAttnDownBlock2D", "FlaxCrossAttnDownBlock2D", "FlaxDownBlock2D")`):
128
- The tuple of downsample blocks to use.
129
- block_out_channels (`Tuple[int]`, *optional*, defaults to `(320, 640, 1280, 1280)`):
130
- The tuple of output channels for each block.
131
- layers_per_block (`int`, *optional*, defaults to 2):
132
- The number of layers per block.
133
- attention_head_dim (`int` or `Tuple[int]`, *optional*, defaults to 8):
134
- The dimension of the attention heads.
135
- num_attention_heads (`int` or `Tuple[int]`, *optional*):
136
- The number of attention heads.
137
- cross_attention_dim (`int`, *optional*, defaults to 768):
138
- The dimension of the cross attention features.
139
- dropout (`float`, *optional*, defaults to 0):
140
- Dropout probability for down, up and bottleneck blocks.
141
- flip_sin_to_cos (`bool`, *optional*, defaults to `True`):
142
- Whether to flip the sin to cos in the time embedding.
143
- freq_shift (`int`, *optional*, defaults to 0): The frequency shift to apply to the time embedding.
144
- controlnet_conditioning_channel_order (`str`, *optional*, defaults to `rgb`):
145
-             The channel order of the conditional image. Will convert to `rgb` if it's `bgr`.
146
- conditioning_embedding_out_channels (`tuple`, *optional*, defaults to `(16, 32, 96, 256)`):
147
- The tuple of output channel for each block in the `conditioning_embedding` layer.
148
- """
149
- sample_size: int = 32
150
- in_channels: int = 4
151
- down_block_types: Tuple[str] = (
152
- "CrossAttnDownBlock2D",
153
- "CrossAttnDownBlock2D",
154
- "CrossAttnDownBlock2D",
155
- "DownBlock2D",
156
- )
157
- only_cross_attention: Union[bool, Tuple[bool]] = False
158
- block_out_channels: Tuple[int] = (320, 640, 1280, 1280)
159
- layers_per_block: int = 2
160
- attention_head_dim: Union[int, Tuple[int]] = 8
161
- num_attention_heads: Optional[Union[int, Tuple[int]]] = None
162
- cross_attention_dim: int = 1280
163
- dropout: float = 0.0
164
- use_linear_projection: bool = False
165
- dtype: jnp.dtype = jnp.float32
166
- flip_sin_to_cos: bool = True
167
- freq_shift: int = 0
168
- controlnet_conditioning_channel_order: str = "rgb"
169
- conditioning_embedding_out_channels: Tuple[int] = (16, 32, 96, 256)
170
-
171
- def init_weights(self, rng: jax.random.KeyArray) -> FrozenDict:
172
- # init input tensors
173
- sample_shape = (1, self.in_channels, self.sample_size, self.sample_size)
174
- sample = jnp.zeros(sample_shape, dtype=jnp.float32)
175
- timesteps = jnp.ones((1,), dtype=jnp.int32)
176
- encoder_hidden_states = jnp.zeros((1, 1, self.cross_attention_dim), dtype=jnp.float32)
177
- controlnet_cond_shape = (1, 3, self.sample_size * 8, self.sample_size * 8)
178
- controlnet_cond = jnp.zeros(controlnet_cond_shape, dtype=jnp.float32)
179
-
180
- params_rng, dropout_rng = jax.random.split(rng)
181
- rngs = {"params": params_rng, "dropout": dropout_rng}
182
-
183
- return self.init(rngs, sample, timesteps, encoder_hidden_states, controlnet_cond)["params"]
184
-
185
- def setup(self):
186
- block_out_channels = self.block_out_channels
187
- time_embed_dim = block_out_channels[0] * 4
188
-
189
- # If `num_attention_heads` is not defined (which is the case for most models)
190
- # it will default to `attention_head_dim`. This looks weird upon first reading it and it is.
191
- # The reason for this behavior is to correct for incorrectly named variables that were introduced
192
- # when this library was created. The incorrect naming was only discovered much later in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131
193
- # Changing `attention_head_dim` to `num_attention_heads` for 40,000+ configurations is too backwards breaking
194
- # which is why we correct for the naming here.
195
- num_attention_heads = self.num_attention_heads or self.attention_head_dim
196
-
197
- # input
198
- self.conv_in = nn.Conv(
199
- block_out_channels[0],
200
- kernel_size=(3, 3),
201
- strides=(1, 1),
202
- padding=((1, 1), (1, 1)),
203
- dtype=self.dtype,
204
- )
205
-
206
- # time
207
- self.time_proj = FlaxTimesteps(
208
- block_out_channels[0], flip_sin_to_cos=self.flip_sin_to_cos, freq_shift=self.config.freq_shift
209
- )
210
- self.time_embedding = FlaxTimestepEmbedding(time_embed_dim, dtype=self.dtype)
211
-
212
- self.controlnet_cond_embedding = FlaxControlNetConditioningEmbedding(
213
- conditioning_embedding_channels=block_out_channels[0],
214
- block_out_channels=self.conditioning_embedding_out_channels,
215
- )
216
-
217
- only_cross_attention = self.only_cross_attention
218
- if isinstance(only_cross_attention, bool):
219
- only_cross_attention = (only_cross_attention,) * len(self.down_block_types)
220
-
221
- if isinstance(num_attention_heads, int):
222
- num_attention_heads = (num_attention_heads,) * len(self.down_block_types)
223
-
224
- # down
225
- down_blocks = []
226
- controlnet_down_blocks = []
227
-
228
- output_channel = block_out_channels[0]
229
-
230
- controlnet_block = nn.Conv(
231
- output_channel,
232
- kernel_size=(1, 1),
233
- padding="VALID",
234
- kernel_init=nn.initializers.zeros_init(),
235
- bias_init=nn.initializers.zeros_init(),
236
- dtype=self.dtype,
237
- )
238
- controlnet_down_blocks.append(controlnet_block)
239
-
240
- for i, down_block_type in enumerate(self.down_block_types):
241
- input_channel = output_channel
242
- output_channel = block_out_channels[i]
243
- is_final_block = i == len(block_out_channels) - 1
244
-
245
- if down_block_type == "CrossAttnDownBlock2D":
246
- down_block = FlaxCrossAttnDownBlock2D(
247
- in_channels=input_channel,
248
- out_channels=output_channel,
249
- dropout=self.dropout,
250
- num_layers=self.layers_per_block,
251
- num_attention_heads=num_attention_heads[i],
252
- add_downsample=not is_final_block,
253
- use_linear_projection=self.use_linear_projection,
254
- only_cross_attention=only_cross_attention[i],
255
- dtype=self.dtype,
256
- )
257
- else:
258
- down_block = FlaxDownBlock2D(
259
- in_channels=input_channel,
260
- out_channels=output_channel,
261
- dropout=self.dropout,
262
- num_layers=self.layers_per_block,
263
- add_downsample=not is_final_block,
264
- dtype=self.dtype,
265
- )
266
-
267
- down_blocks.append(down_block)
268
-
269
- for _ in range(self.layers_per_block):
270
- controlnet_block = nn.Conv(
271
- output_channel,
272
- kernel_size=(1, 1),
273
- padding="VALID",
274
- kernel_init=nn.initializers.zeros_init(),
275
- bias_init=nn.initializers.zeros_init(),
276
- dtype=self.dtype,
277
- )
278
- controlnet_down_blocks.append(controlnet_block)
279
-
280
- if not is_final_block:
281
- controlnet_block = nn.Conv(
282
- output_channel,
283
- kernel_size=(1, 1),
284
- padding="VALID",
285
- kernel_init=nn.initializers.zeros_init(),
286
- bias_init=nn.initializers.zeros_init(),
287
- dtype=self.dtype,
288
- )
289
- controlnet_down_blocks.append(controlnet_block)
290
-
291
- self.down_blocks = down_blocks
292
- self.controlnet_down_blocks = controlnet_down_blocks
293
-
294
- # mid
295
- mid_block_channel = block_out_channels[-1]
296
- self.mid_block = FlaxUNetMidBlock2DCrossAttn(
297
- in_channels=mid_block_channel,
298
- dropout=self.dropout,
299
- num_attention_heads=num_attention_heads[-1],
300
- use_linear_projection=self.use_linear_projection,
301
- dtype=self.dtype,
302
- )
303
-
304
- self.controlnet_mid_block = nn.Conv(
305
- mid_block_channel,
306
- kernel_size=(1, 1),
307
- padding="VALID",
308
- kernel_init=nn.initializers.zeros_init(),
309
- bias_init=nn.initializers.zeros_init(),
310
- dtype=self.dtype,
311
- )
312
-
313
- def __call__(
314
- self,
315
- sample,
316
- timesteps,
317
- encoder_hidden_states,
318
- controlnet_cond,
319
- conditioning_scale: float = 1.0,
320
- return_dict: bool = True,
321
- train: bool = False,
322
- ) -> Union[FlaxControlNetOutput, Tuple]:
323
- r"""
324
- Args:
325
- sample (`jnp.ndarray`): (batch, channel, height, width) noisy inputs tensor
326
- timestep (`jnp.ndarray` or `float` or `int`): timesteps
327
- encoder_hidden_states (`jnp.ndarray`): (batch_size, sequence_length, hidden_size) encoder hidden states
328
- controlnet_cond (`jnp.ndarray`): (batch, channel, height, width) the conditional input tensor
329
- conditioning_scale: (`float`) the scale factor for controlnet outputs
330
- return_dict (`bool`, *optional*, defaults to `True`):
331
-                 Whether or not to return a [`models.controlnet_flax.FlaxControlNetOutput`] instead of a
332
- plain tuple.
333
- train (`bool`, *optional*, defaults to `False`):
334
- Use deterministic functions and disable dropout when not training.
335
-
336
- Returns:
337
-             [`~models.controlnet_flax.FlaxControlNetOutput`] or `tuple`:
338
-             [`~models.controlnet_flax.FlaxControlNetOutput`] if `return_dict` is True, otherwise a `tuple`.
339
- When returning a tuple, the first element is the sample tensor.
340
- """
341
- channel_order = self.controlnet_conditioning_channel_order
342
- if channel_order == "bgr":
343
- controlnet_cond = jnp.flip(controlnet_cond, axis=1)
344
-
345
- # 1. time
346
- if not isinstance(timesteps, jnp.ndarray):
347
- timesteps = jnp.array([timesteps], dtype=jnp.int32)
348
- elif isinstance(timesteps, jnp.ndarray) and len(timesteps.shape) == 0:
349
- timesteps = timesteps.astype(dtype=jnp.float32)
350
- timesteps = jnp.expand_dims(timesteps, 0)
351
-
352
- t_emb = self.time_proj(timesteps)
353
- t_emb = self.time_embedding(t_emb)
354
-
355
- # 2. pre-process
356
- sample = jnp.transpose(sample, (0, 2, 3, 1))
357
- sample = self.conv_in(sample)
358
-
359
- controlnet_cond = jnp.transpose(controlnet_cond, (0, 2, 3, 1))
360
- controlnet_cond = self.controlnet_cond_embedding(controlnet_cond)
361
- sample += controlnet_cond
362
-
363
- # 3. down
364
- down_block_res_samples = (sample,)
365
- for down_block in self.down_blocks:
366
- if isinstance(down_block, FlaxCrossAttnDownBlock2D):
367
- sample, res_samples = down_block(sample, t_emb, encoder_hidden_states, deterministic=not train)
368
- else:
369
- sample, res_samples = down_block(sample, t_emb, deterministic=not train)
370
- down_block_res_samples += res_samples
371
-
372
- # 4. mid
373
- sample = self.mid_block(sample, t_emb, encoder_hidden_states, deterministic=not train)
374
-
375
-         # 5. controlnet blocks
376
- controlnet_down_block_res_samples = ()
377
- for down_block_res_sample, controlnet_block in zip(down_block_res_samples, self.controlnet_down_blocks):
378
- down_block_res_sample = controlnet_block(down_block_res_sample)
379
- controlnet_down_block_res_samples += (down_block_res_sample,)
380
-
381
- down_block_res_samples = controlnet_down_block_res_samples
382
-
383
- mid_block_res_sample = self.controlnet_mid_block(sample)
384
-
385
- # 6. scaling
386
- down_block_res_samples = [sample * conditioning_scale for sample in down_block_res_samples]
387
- mid_block_res_sample *= conditioning_scale
388
-
389
- if not return_dict:
390
- return (down_block_res_samples, mid_block_res_sample)
391
-
392
- return FlaxControlNetOutput(
393
- down_block_res_samples=down_block_res_samples, mid_block_res_sample=mid_block_res_sample
394
- )
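
For orientation, `init_weights` above builds dummy inputs whose shapes encode an important convention: `sample` lives in latent space while `controlnet_cond` is the pixel-space conditioning image, 8x larger per side (the VAE downsampling factor). A standalone shape sketch using only jax/jax.numpy; the sizes are the class defaults and nothing here touches the model itself:

import jax
import jax.numpy as jnp

sample_size, in_channels, cross_attention_dim = 32, 4, 1280  # class defaults

# Latent-space UNet input vs. pixel-space conditioning image (8x per side).
sample = jnp.zeros((1, in_channels, sample_size, sample_size), dtype=jnp.float32)
controlnet_cond = jnp.zeros((1, 3, sample_size * 8, sample_size * 8), dtype=jnp.float32)
timesteps = jnp.ones((1,), dtype=jnp.int32)
encoder_hidden_states = jnp.zeros((1, 1, cross_attention_dim), dtype=jnp.float32)

# init_weights splits one RNG key into the two streams Flax expects.
params_rng, dropout_rng = jax.random.split(jax.random.PRNGKey(0))
rngs = {"params": params_rng, "dropout": dropout_rng}

print(sample.shape, controlnet_cond.shape)  # (1, 4, 32, 32) (1, 3, 256, 256)
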
4DoF/diffusers/models/cross_attention.py DELETED
@@ -1,94 +0,0 @@
1
- # Copyright 2023 The HuggingFace Team. All rights reserved.
2
- #
3
- # Licensed under the Apache License, Version 2.0 (the "License");
4
- # you may not use this file except in compliance with the License.
5
- # You may obtain a copy of the License at
6
- #
7
- # http://www.apache.org/licenses/LICENSE-2.0
8
- #
9
- # Unless required by applicable law or agreed to in writing, software
10
- # distributed under the License is distributed on an "AS IS" BASIS,
11
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- # See the License for the specific language governing permissions and
13
- # limitations under the License.
14
- from ..utils import deprecate
15
- from .attention_processor import ( # noqa: F401
16
- Attention,
17
- AttentionProcessor,
18
- AttnAddedKVProcessor,
19
- AttnProcessor2_0,
20
- LoRAAttnProcessor,
21
- LoRALinearLayer,
22
- LoRAXFormersAttnProcessor,
23
- SlicedAttnAddedKVProcessor,
24
- SlicedAttnProcessor,
25
- XFormersAttnProcessor,
26
- )
27
- from .attention_processor import AttnProcessor as AttnProcessorRename # noqa: F401
28
-
29
-
30
- deprecate(
31
- "cross_attention",
32
- "0.20.0",
33
- "Importing from cross_attention is deprecated. Please import from diffusers.models.attention_processor instead.",
34
- standard_warn=False,
35
- )
36
-
37
-
38
- AttnProcessor = AttentionProcessor
39
-
40
-
41
- class CrossAttention(Attention):
42
- def __init__(self, *args, **kwargs):
43
- deprecation_message = f"{self.__class__.__name__} is deprecated and will be removed in `0.20.0`. Please use `from diffusers.models.attention_processor import {''.join(self.__class__.__name__.split('Cross'))} instead."
44
- deprecate("cross_attention", "0.20.0", deprecation_message, standard_warn=False)
45
- super().__init__(*args, **kwargs)
46
-
47
-
48
- class CrossAttnProcessor(AttnProcessorRename):
49
- def __init__(self, *args, **kwargs):
50
- deprecation_message = f"{self.__class__.__name__} is deprecated and will be removed in `0.20.0`. Please use `from diffusers.models.attention_processor import {''.join(self.__class__.__name__.split('Cross'))} instead."
51
- deprecate("cross_attention", "0.20.0", deprecation_message, standard_warn=False)
52
- super().__init__(*args, **kwargs)
53
-
54
-
55
- class LoRACrossAttnProcessor(LoRAAttnProcessor):
56
- def __init__(self, *args, **kwargs):
57
- deprecation_message = f"{self.__class__.__name__} is deprecated and will be removed in `0.20.0`. Please use `from diffusers.models.attention_processor import {''.join(self.__class__.__name__.split('Cross'))} instead."
58
- deprecate("cross_attention", "0.20.0", deprecation_message, standard_warn=False)
59
- super().__init__(*args, **kwargs)
60
-
61
-
62
- class CrossAttnAddedKVProcessor(AttnAddedKVProcessor):
63
- def __init__(self, *args, **kwargs):
64
- deprecation_message = f"{self.__class__.__name__} is deprecated and will be removed in `0.20.0`. Please use `from diffusers.models.attention_processor import {''.join(self.__class__.__name__.split('Cross'))} instead."
65
- deprecate("cross_attention", "0.20.0", deprecation_message, standard_warn=False)
66
- super().__init__(*args, **kwargs)
67
-
68
-
69
- class XFormersCrossAttnProcessor(XFormersAttnProcessor):
70
- def __init__(self, *args, **kwargs):
71
- deprecation_message = f"{self.__class__.__name__} is deprecated and will be removed in `0.20.0`. Please use `from diffusers.models.attention_processor import {''.join(self.__class__.__name__.split('Cross'))} instead."
72
- deprecate("cross_attention", "0.20.0", deprecation_message, standard_warn=False)
73
- super().__init__(*args, **kwargs)
74
-
75
-
76
- class LoRAXFormersCrossAttnProcessor(LoRAXFormersAttnProcessor):
77
- def __init__(self, *args, **kwargs):
78
- deprecation_message = f"{self.__class__.__name__} is deprecated and will be removed in `0.20.0`. Please use `from diffusers.models.attention_processor import {''.join(self.__class__.__name__.split('Cross'))} instead."
79
- deprecate("cross_attention", "0.20.0", deprecation_message, standard_warn=False)
80
- super().__init__(*args, **kwargs)
81
-
82
-
83
- class SlicedCrossAttnProcessor(SlicedAttnProcessor):
84
- def __init__(self, *args, **kwargs):
85
- deprecation_message = f"{self.__class__.__name__} is deprecated and will be removed in `0.20.0`. Please use `from diffusers.models.attention_processor import {''.join(self.__class__.__name__.split('Cross'))} instead."
86
- deprecate("cross_attention", "0.20.0", deprecation_message, standard_warn=False)
87
- super().__init__(*args, **kwargs)
88
-
89
-
90
- class SlicedCrossAttnAddedKVProcessor(SlicedAttnAddedKVProcessor):
91
- def __init__(self, *args, **kwargs):
92
- deprecation_message = f"{self.__class__.__name__} is deprecated and will be removed in `0.20.0`. Please use `from diffusers.models.attention_processor import {''.join(self.__class__.__name__.split('Cross'))} instead."
93
- deprecate("cross_attention", "0.20.0", deprecation_message, standard_warn=False)
94
- super().__init__(*args, **kwargs)
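
Every class in this shim follows the same pattern: subclass the renamed processor, emit a deprecation warning in `__init__`, then defer to the parent so behavior is unchanged. A generic sketch of that pattern using only the standard library; the class names here are hypothetical stand-ins, not diffusers APIs:

import warnings


class NewAttnProcessor:
    """Stand-in for the class that replaced the deprecated name."""

    def __call__(self, *args, **kwargs):
        return "processed"


class OldCrossAttnProcessor(NewAttnProcessor):
    """Deprecated alias: warns on construction, then behaves exactly like the parent."""

    def __init__(self, *args, **kwargs):
        warnings.warn(
            f"{self.__class__.__name__} is deprecated; use NewAttnProcessor instead.",
            FutureWarning,
            stacklevel=2,
        )
        super().__init__(*args, **kwargs)


proc = OldCrossAttnProcessor()  # emits a FutureWarning once per construction
print(proc())                   # "processed" -- identical behavior to the parent
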
4DoF/diffusers/models/dual_transformer_2d.py DELETED
@@ -1,151 +0,0 @@
1
- # Copyright 2023 The HuggingFace Team. All rights reserved.
2
- #
3
- # Licensed under the Apache License, Version 2.0 (the "License");
4
- # you may not use this file except in compliance with the License.
5
- # You may obtain a copy of the License at
6
- #
7
- # http://www.apache.org/licenses/LICENSE-2.0
8
- #
9
- # Unless required by applicable law or agreed to in writing, software
10
- # distributed under the License is distributed on an "AS IS" BASIS,
11
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- # See the License for the specific language governing permissions and
13
- # limitations under the License.
14
- from typing import Optional
15
-
16
- from torch import nn
17
-
18
- from .transformer_2d import Transformer2DModel, Transformer2DModelOutput
19
-
20
-
21
- class DualTransformer2DModel(nn.Module):
22
- """
23
- Dual transformer wrapper that combines two `Transformer2DModel`s for mixed inference.
24
-
25
- Parameters:
26
- num_attention_heads (`int`, *optional*, defaults to 16): The number of heads to use for multi-head attention.
27
- attention_head_dim (`int`, *optional*, defaults to 88): The number of channels in each head.
28
- in_channels (`int`, *optional*):
29
- Pass if the input is continuous. The number of channels in the input and output.
30
- num_layers (`int`, *optional*, defaults to 1): The number of layers of Transformer blocks to use.
31
- dropout (`float`, *optional*, defaults to 0.1): The dropout probability to use.
32
- cross_attention_dim (`int`, *optional*): The number of encoder_hidden_states dimensions to use.
33
- sample_size (`int`, *optional*): Pass if the input is discrete. The width of the latent images.
34
- Note that this is fixed at training time as it is used for learning a number of position embeddings. See
35
- `ImagePositionalEmbeddings`.
36
- num_vector_embeds (`int`, *optional*):
37
- Pass if the input is discrete. The number of classes of the vector embeddings of the latent pixels.
38
- Includes the class for the masked latent pixel.
39
- activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
40
- num_embeds_ada_norm ( `int`, *optional*): Pass if at least one of the norm_layers is `AdaLayerNorm`.
41
- The number of diffusion steps used during training. Note that this is fixed at training time as it is used
42
- to learn a number of embeddings that are added to the hidden states. During inference, you can denoise for
43
- up to but not more than steps than `num_embeds_ada_norm`.
44
- attention_bias (`bool`, *optional*):
45
- Configure if the TransformerBlocks' attention should contain a bias parameter.
46
- """
47
-
48
- def __init__(
49
- self,
50
- num_attention_heads: int = 16,
51
- attention_head_dim: int = 88,
52
- in_channels: Optional[int] = None,
53
- num_layers: int = 1,
54
- dropout: float = 0.0,
55
- norm_num_groups: int = 32,
56
- cross_attention_dim: Optional[int] = None,
57
- attention_bias: bool = False,
58
- sample_size: Optional[int] = None,
59
- num_vector_embeds: Optional[int] = None,
60
- activation_fn: str = "geglu",
61
- num_embeds_ada_norm: Optional[int] = None,
62
- ):
63
- super().__init__()
64
- self.transformers = nn.ModuleList(
65
- [
66
- Transformer2DModel(
67
- num_attention_heads=num_attention_heads,
68
- attention_head_dim=attention_head_dim,
69
- in_channels=in_channels,
70
- num_layers=num_layers,
71
- dropout=dropout,
72
- norm_num_groups=norm_num_groups,
73
- cross_attention_dim=cross_attention_dim,
74
- attention_bias=attention_bias,
75
- sample_size=sample_size,
76
- num_vector_embeds=num_vector_embeds,
77
- activation_fn=activation_fn,
78
- num_embeds_ada_norm=num_embeds_ada_norm,
79
- )
80
- for _ in range(2)
81
- ]
82
- )
83
-
84
- # Variables that can be set by a pipeline:
85
-
86
- # The ratio of transformer1 to transformer2's output states to be combined during inference
87
- self.mix_ratio = 0.5
88
-
89
- # The shape of `encoder_hidden_states` is expected to be
90
- # `(batch_size, condition_lengths[0]+condition_lengths[1], num_features)`
91
- self.condition_lengths = [77, 257]
92
-
93
- # Which transformer to use to encode which condition.
94
- # E.g. `(1, 0)` means that we'll use `transformers[1](conditions[0])` and `transformers[0](conditions[1])`
95
- self.transformer_index_for_condition = [1, 0]
96
-
97
- def forward(
98
- self,
99
- hidden_states,
100
- encoder_hidden_states,
101
- timestep=None,
102
- attention_mask=None,
103
- cross_attention_kwargs=None,
104
- return_dict: bool = True,
105
- ):
106
- """
107
- Args:
108
- hidden_states ( When discrete, `torch.LongTensor` of shape `(batch size, num latent pixels)`.
109
- When continuous, `torch.FloatTensor` of shape `(batch size, channel, height, width)`): Input
110
- hidden_states
111
- encoder_hidden_states ( `torch.LongTensor` of shape `(batch size, encoder_hidden_states dim)`, *optional*):
112
- Conditional embeddings for cross attention layer. If not given, cross-attention defaults to
113
- self-attention.
114
- timestep ( `torch.long`, *optional*):
115
- Optional timestep to be applied as an embedding in AdaLayerNorm's. Used to indicate denoising step.
116
- attention_mask (`torch.FloatTensor`, *optional*):
117
- Optional attention mask to be applied in Attention
118
- return_dict (`bool`, *optional*, defaults to `True`):
119
-                 Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain tuple.
120
-
121
- Returns:
122
- [`~models.transformer_2d.Transformer2DModelOutput`] or `tuple`:
123
- [`~models.transformer_2d.Transformer2DModelOutput`] if `return_dict` is True, otherwise a `tuple`. When
124
- returning a tuple, the first element is the sample tensor.
125
- """
126
- input_states = hidden_states
127
-
128
- encoded_states = []
129
- tokens_start = 0
130
- # attention_mask is not used yet
131
- for i in range(2):
132
- # for each of the two transformers, pass the corresponding condition tokens
133
- condition_state = encoder_hidden_states[:, tokens_start : tokens_start + self.condition_lengths[i]]
134
- transformer_index = self.transformer_index_for_condition[i]
135
- encoded_state = self.transformers[transformer_index](
136
- input_states,
137
- encoder_hidden_states=condition_state,
138
- timestep=timestep,
139
- cross_attention_kwargs=cross_attention_kwargs,
140
- return_dict=False,
141
- )[0]
142
- encoded_states.append(encoded_state - input_states)
143
- tokens_start += self.condition_lengths[i]
144
-
145
- output_states = encoded_states[0] * self.mix_ratio + encoded_states[1] * (1 - self.mix_ratio)
146
- output_states = output_states + input_states
147
-
148
- if not return_dict:
149
- return (output_states,)
150
-
151
- return Transformer2DModelOutput(sample=output_states)
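
The interesting part of `DualTransformer2DModel.forward` is the mixing rule: the token axis of `encoder_hidden_states` is split by `condition_lengths`, each slice is routed to the transformer picked by `transformer_index_for_condition`, and the two residuals are blended with `mix_ratio` before the input is added back. A toy sketch of just that bookkeeping; the two `nn.Linear` branches stand in for the real transformers and ignore the condition slice:

import torch

torch.manual_seed(0)
# Toy stand-ins for the two Transformer2DModel branches.
transformers = [torch.nn.Linear(8, 8), torch.nn.Linear(8, 8)]

mix_ratio = 0.5
condition_lengths = [77, 257]               # e.g. text tokens, image tokens
transformer_index_for_condition = [1, 0]    # condition 0 -> branch 1, condition 1 -> branch 0

hidden_states = torch.randn(2, 16, 8)                             # (batch, tokens, features)
encoder_hidden_states = torch.randn(2, sum(condition_lengths), 8)

deltas, start = [], 0
for i in range(2):
    cond = encoder_hidden_states[:, start : start + condition_lengths[i]]
    branch = transformers[transformer_index_for_condition[i]]
    encoded = branch(hidden_states)         # real branches cross-attend to `cond`; the toy ones ignore it
    deltas.append(encoded - hidden_states)  # keep only the residual, as in forward()
    start += condition_lengths[i]

output = deltas[0] * mix_ratio + deltas[1] * (1 - mix_ratio) + hidden_states
print(output.shape)  # torch.Size([2, 16, 8])
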
4DoF/diffusers/models/embeddings.py DELETED
@@ -1,546 +0,0 @@
1
- # Copyright 2023 The HuggingFace Team. All rights reserved.
2
- #
3
- # Licensed under the Apache License, Version 2.0 (the "License");
4
- # you may not use this file except in compliance with the License.
5
- # You may obtain a copy of the License at
6
- #
7
- # http://www.apache.org/licenses/LICENSE-2.0
8
- #
9
- # Unless required by applicable law or agreed to in writing, software
10
- # distributed under the License is distributed on an "AS IS" BASIS,
11
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- # See the License for the specific language governing permissions and
13
- # limitations under the License.
14
- import math
15
- from typing import Optional
16
-
17
- import numpy as np
18
- import torch
19
- from torch import nn
20
-
21
- from .activations import get_activation
22
-
23
-
24
- def get_timestep_embedding(
25
- timesteps: torch.Tensor,
26
- embedding_dim: int,
27
- flip_sin_to_cos: bool = False,
28
- downscale_freq_shift: float = 1,
29
- scale: float = 1,
30
- max_period: int = 10000,
31
- ):
32
- """
33
- This matches the implementation in Denoising Diffusion Probabilistic Models: Create sinusoidal timestep embeddings.
34
-
35
- :param timesteps: a 1-D Tensor of N indices, one per batch element.
36
- These may be fractional.
37
- :param embedding_dim: the dimension of the output. :param max_period: controls the minimum frequency of the
38
- embeddings. :return: an [N x dim] Tensor of positional embeddings.
39
- """
40
- assert len(timesteps.shape) == 1, "Timesteps should be a 1d-array"
41
-
42
- half_dim = embedding_dim // 2
43
- exponent = -math.log(max_period) * torch.arange(
44
- start=0, end=half_dim, dtype=torch.float32, device=timesteps.device
45
- )
46
- exponent = exponent / (half_dim - downscale_freq_shift)
47
-
48
- emb = torch.exp(exponent)
49
- emb = timesteps[:, None].float() * emb[None, :]
50
-
51
- # scale embeddings
52
- emb = scale * emb
53
-
54
- # concat sine and cosine embeddings
55
- emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=-1)
56
-
57
- # flip sine and cosine embeddings
58
- if flip_sin_to_cos:
59
- emb = torch.cat([emb[:, half_dim:], emb[:, :half_dim]], dim=-1)
60
-
61
- # zero pad
62
- if embedding_dim % 2 == 1:
63
- emb = torch.nn.functional.pad(emb, (0, 1, 0, 0))
64
- return emb
65
-
66
-
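
To make the shape conventions of `get_timestep_embedding` concrete, here is a standalone recomputation of the same sinusoid for two timesteps with `embedding_dim=8` and the default `downscale_freq_shift=1`, `flip_sin_to_cos=False`; the values are illustrative only:

import math
import torch

timesteps = torch.tensor([0.0, 500.0])                 # N = 2
embedding_dim, max_period, downscale_freq_shift = 8, 10000, 1

half_dim = embedding_dim // 2
exponent = -math.log(max_period) * torch.arange(half_dim, dtype=torch.float32)
exponent = exponent / (half_dim - downscale_freq_shift)
freqs = torch.exp(exponent)                            # (half_dim,) decreasing frequencies
args = timesteps[:, None] * freqs[None, :]             # (N, half_dim)
emb = torch.cat([torch.sin(args), torch.cos(args)], dim=-1)

print(emb.shape)   # torch.Size([2, 8]) -> [N, embedding_dim]
print(emb[0])      # t=0: all sines are 0, all cosines are 1
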
67
- def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False, extra_tokens=0):
68
- """
69
- grid_size: int of the grid height and width return: pos_embed: [grid_size*grid_size, embed_dim] or
70
- [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
71
- """
72
- grid_h = np.arange(grid_size, dtype=np.float32)
73
- grid_w = np.arange(grid_size, dtype=np.float32)
74
- grid = np.meshgrid(grid_w, grid_h) # here w goes first
75
- grid = np.stack(grid, axis=0)
76
-
77
- grid = grid.reshape([2, 1, grid_size, grid_size])
78
- pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
79
- if cls_token and extra_tokens > 0:
80
- pos_embed = np.concatenate([np.zeros([extra_tokens, embed_dim]), pos_embed], axis=0)
81
- return pos_embed
82
-
83
-
84
- def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
85
- if embed_dim % 2 != 0:
86
- raise ValueError("embed_dim must be divisible by 2")
87
-
88
- # use half of dimensions to encode grid_h
89
- emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2)
90
- emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2)
91
-
92
- emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D)
93
- return emb
94
-
95
-
96
- def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
97
- """
98
- embed_dim: output dimension for each position pos: a list of positions to be encoded: size (M,) out: (M, D)
99
- """
100
- if embed_dim % 2 != 0:
101
- raise ValueError("embed_dim must be divisible by 2")
102
-
103
- omega = np.arange(embed_dim // 2, dtype=np.float64)
104
- omega /= embed_dim / 2.0
105
- omega = 1.0 / 10000**omega # (D/2,)
106
-
107
- pos = pos.reshape(-1) # (M,)
108
- out = np.einsum("m,d->md", pos, omega) # (M, D/2), outer product
109
-
110
- emb_sin = np.sin(out) # (M, D/2)
111
- emb_cos = np.cos(out) # (M, D/2)
112
-
113
- emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D)
114
- return emb
115
-
116
-
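
The three helpers above compose as follows: the 2-D embedding runs the 1-D sin/cos embedding over the height grid and the width grid separately, using half of `embed_dim` for each, then concatenates the halves. A compact numpy sketch of that flow; the sizes are arbitrary and chosen only for the printed shape:

import numpy as np

def sincos_1d(embed_dim, pos):
    # Mirror of get_1d_sincos_pos_embed_from_grid above, kept minimal.
    omega = np.arange(embed_dim // 2, dtype=np.float64) / (embed_dim / 2.0)
    omega = 1.0 / 10000**omega                       # (D/2,)
    out = np.einsum("m,d->md", pos.reshape(-1), omega)
    return np.concatenate([np.sin(out), np.cos(out)], axis=1)   # (M, D)

embed_dim, grid_size = 16, 4
grid_h = np.arange(grid_size, dtype=np.float32)
grid_w = np.arange(grid_size, dtype=np.float32)
grid = np.stack(np.meshgrid(grid_w, grid_h), axis=0)             # (2, H, W), w goes first

emb_h = sincos_1d(embed_dim // 2, grid[0])           # (H*W, D/2)
emb_w = sincos_1d(embed_dim // 2, grid[1])           # (H*W, D/2)
pos_embed = np.concatenate([emb_h, emb_w], axis=1)

print(pos_embed.shape)   # (16, 16) == (grid_size * grid_size, embed_dim)
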
117
- class PatchEmbed(nn.Module):
118
- """2D Image to Patch Embedding"""
119
-
120
- def __init__(
121
- self,
122
- height=224,
123
- width=224,
124
- patch_size=16,
125
- in_channels=3,
126
- embed_dim=768,
127
- layer_norm=False,
128
- flatten=True,
129
- bias=True,
130
- ):
131
- super().__init__()
132
-
133
- num_patches = (height // patch_size) * (width // patch_size)
134
- self.flatten = flatten
135
- self.layer_norm = layer_norm
136
-
137
- self.proj = nn.Conv2d(
138
- in_channels, embed_dim, kernel_size=(patch_size, patch_size), stride=patch_size, bias=bias
139
- )
140
- if layer_norm:
141
- self.norm = nn.LayerNorm(embed_dim, elementwise_affine=False, eps=1e-6)
142
- else:
143
- self.norm = None
144
-
145
- pos_embed = get_2d_sincos_pos_embed(embed_dim, int(num_patches**0.5))
146
- self.register_buffer("pos_embed", torch.from_numpy(pos_embed).float().unsqueeze(0), persistent=False)
147
-
148
- def forward(self, latent):
149
- latent = self.proj(latent)
150
- if self.flatten:
151
- latent = latent.flatten(2).transpose(1, 2) # BCHW -> BNC
152
- if self.layer_norm:
153
- latent = self.norm(latent)
154
- return latent + self.pos_embed
155
-
156
-
157
- class TimestepEmbedding(nn.Module):
158
- def __init__(
159
- self,
160
- in_channels: int,
161
- time_embed_dim: int,
162
- act_fn: str = "silu",
163
- out_dim: int = None,
164
- post_act_fn: Optional[str] = None,
165
- cond_proj_dim=None,
166
- ):
167
- super().__init__()
168
-
169
- self.linear_1 = nn.Linear(in_channels, time_embed_dim)
170
-
171
- if cond_proj_dim is not None:
172
- self.cond_proj = nn.Linear(cond_proj_dim, in_channels, bias=False)
173
- else:
174
- self.cond_proj = None
175
-
176
- self.act = get_activation(act_fn)
177
-
178
- if out_dim is not None:
179
- time_embed_dim_out = out_dim
180
- else:
181
- time_embed_dim_out = time_embed_dim
182
- self.linear_2 = nn.Linear(time_embed_dim, time_embed_dim_out)
183
-
184
- if post_act_fn is None:
185
- self.post_act = None
186
- else:
187
- self.post_act = get_activation(post_act_fn)
188
-
189
- def forward(self, sample, condition=None):
190
- if condition is not None:
191
- sample = sample + self.cond_proj(condition)
192
- sample = self.linear_1(sample)
193
-
194
- if self.act is not None:
195
- sample = self.act(sample)
196
-
197
- sample = self.linear_2(sample)
198
-
199
- if self.post_act is not None:
200
- sample = self.post_act(sample)
201
- return sample
202
-
203
-
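
`TimestepEmbedding` above is, in the common case (no `cond_proj_dim`, no `post_act_fn`), just a two-layer MLP around an activation. A shape walk-through with illustrative Stable-Diffusion-like sizes (320 sinusoidal channels projected to a 1280-dim embedding; the numbers are assumptions, not read from this file):

import torch
from torch import nn

in_channels, time_embed_dim = 320, 1280   # illustrative sizes

# Equivalent of linear_1 -> act("silu") -> linear_2 in the default configuration.
mlp = nn.Sequential(
    nn.Linear(in_channels, time_embed_dim),
    nn.SiLU(),
    nn.Linear(time_embed_dim, time_embed_dim),
)

t_emb = torch.randn(2, in_channels)       # what Timesteps / get_timestep_embedding would produce
print(mlp(t_emb).shape)                   # torch.Size([2, 1280])
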
204
- class Timesteps(nn.Module):
205
- def __init__(self, num_channels: int, flip_sin_to_cos: bool, downscale_freq_shift: float):
206
- super().__init__()
207
- self.num_channels = num_channels
208
- self.flip_sin_to_cos = flip_sin_to_cos
209
- self.downscale_freq_shift = downscale_freq_shift
210
-
211
- def forward(self, timesteps):
212
- t_emb = get_timestep_embedding(
213
- timesteps,
214
- self.num_channels,
215
- flip_sin_to_cos=self.flip_sin_to_cos,
216
- downscale_freq_shift=self.downscale_freq_shift,
217
- )
218
- return t_emb
219
-
220
-
221
- class GaussianFourierProjection(nn.Module):
222
- """Gaussian Fourier embeddings for noise levels."""
223
-
224
- def __init__(
225
- self, embedding_size: int = 256, scale: float = 1.0, set_W_to_weight=True, log=True, flip_sin_to_cos=False
226
- ):
227
- super().__init__()
228
- self.weight = nn.Parameter(torch.randn(embedding_size) * scale, requires_grad=False)
229
- self.log = log
230
- self.flip_sin_to_cos = flip_sin_to_cos
231
-
232
- if set_W_to_weight:
233
- # to delete later
234
- self.W = nn.Parameter(torch.randn(embedding_size) * scale, requires_grad=False)
235
-
236
- self.weight = self.W
237
-
238
- def forward(self, x):
239
- if self.log:
240
- x = torch.log(x)
241
-
242
- x_proj = x[:, None] * self.weight[None, :] * 2 * np.pi
243
-
244
- if self.flip_sin_to_cos:
245
- out = torch.cat([torch.cos(x_proj), torch.sin(x_proj)], dim=-1)
246
- else:
247
- out = torch.cat([torch.sin(x_proj), torch.cos(x_proj)], dim=-1)
248
- return out
249
-
250
-
251
- class ImagePositionalEmbeddings(nn.Module):
252
- """
253
- Converts latent image classes into vector embeddings. Sums the vector embeddings with positional embeddings for the
254
- height and width of the latent space.
255
-
256
- For more details, see figure 10 of the dall-e paper: https://arxiv.org/abs/2102.12092
257
-
258
- For VQ-diffusion:
259
-
260
- Output vector embeddings are used as input for the transformer.
261
-
262
- Note that the vector embeddings for the transformer are different than the vector embeddings from the VQVAE.
263
-
264
- Args:
265
- num_embed (`int`):
266
- Number of embeddings for the latent pixels embeddings.
267
- height (`int`):
268
- Height of the latent image i.e. the number of height embeddings.
269
- width (`int`):
270
- Width of the latent image i.e. the number of width embeddings.
271
- embed_dim (`int`):
272
- Dimension of the produced vector embeddings. Used for the latent pixel, height, and width embeddings.
273
- """
274
-
275
- def __init__(
276
- self,
277
- num_embed: int,
278
- height: int,
279
- width: int,
280
- embed_dim: int,
281
- ):
282
- super().__init__()
283
-
284
- self.height = height
285
- self.width = width
286
- self.num_embed = num_embed
287
- self.embed_dim = embed_dim
288
-
289
- self.emb = nn.Embedding(self.num_embed, embed_dim)
290
- self.height_emb = nn.Embedding(self.height, embed_dim)
291
- self.width_emb = nn.Embedding(self.width, embed_dim)
292
-
293
- def forward(self, index):
294
- emb = self.emb(index)
295
-
296
- height_emb = self.height_emb(torch.arange(self.height, device=index.device).view(1, self.height))
297
-
298
- # 1 x H x D -> 1 x H x 1 x D
299
- height_emb = height_emb.unsqueeze(2)
300
-
301
- width_emb = self.width_emb(torch.arange(self.width, device=index.device).view(1, self.width))
302
-
303
- # 1 x W x D -> 1 x 1 x W x D
304
- width_emb = width_emb.unsqueeze(1)
305
-
306
- pos_emb = height_emb + width_emb
307
-
308
- # 1 x H x W x D -> 1 x L x D
309
- pos_emb = pos_emb.view(1, self.height * self.width, -1)
310
-
311
- emb = emb + pos_emb[:, : emb.shape[1], :]
312
-
313
- return emb
314
-
315
-
316
- class LabelEmbedding(nn.Module):
317
- """
318
- Embeds class labels into vector representations. Also handles label dropout for classifier-free guidance.
319
-
320
- Args:
321
- num_classes (`int`): The number of classes.
322
- hidden_size (`int`): The size of the vector embeddings.
323
- dropout_prob (`float`): The probability of dropping a label.
324
- """
325
-
326
- def __init__(self, num_classes, hidden_size, dropout_prob):
327
- super().__init__()
328
- use_cfg_embedding = dropout_prob > 0
329
- self.embedding_table = nn.Embedding(num_classes + use_cfg_embedding, hidden_size)
330
- self.num_classes = num_classes
331
- self.dropout_prob = dropout_prob
332
-
333
- def token_drop(self, labels, force_drop_ids=None):
334
- """
335
- Drops labels to enable classifier-free guidance.
336
- """
337
- if force_drop_ids is None:
338
- drop_ids = torch.rand(labels.shape[0], device=labels.device) < self.dropout_prob
339
- else:
340
- drop_ids = torch.tensor(force_drop_ids == 1)
341
- labels = torch.where(drop_ids, self.num_classes, labels)
342
- return labels
343
-
344
- def forward(self, labels: torch.LongTensor, force_drop_ids=None):
345
- use_dropout = self.dropout_prob > 0
346
- if (self.training and use_dropout) or (force_drop_ids is not None):
347
- labels = self.token_drop(labels, force_drop_ids)
348
- embeddings = self.embedding_table(labels)
349
- return embeddings
350
-
351
-
352
- class TextImageProjection(nn.Module):
353
- def __init__(
354
- self,
355
- text_embed_dim: int = 1024,
356
- image_embed_dim: int = 768,
357
- cross_attention_dim: int = 768,
358
- num_image_text_embeds: int = 10,
359
- ):
360
- super().__init__()
361
-
362
- self.num_image_text_embeds = num_image_text_embeds
363
- self.image_embeds = nn.Linear(image_embed_dim, self.num_image_text_embeds * cross_attention_dim)
364
- self.text_proj = nn.Linear(text_embed_dim, cross_attention_dim)
365
-
366
- def forward(self, text_embeds: torch.FloatTensor, image_embeds: torch.FloatTensor):
367
- batch_size = text_embeds.shape[0]
368
-
369
- # image
370
- image_text_embeds = self.image_embeds(image_embeds)
371
- image_text_embeds = image_text_embeds.reshape(batch_size, self.num_image_text_embeds, -1)
372
-
373
- # text
374
- text_embeds = self.text_proj(text_embeds)
375
-
376
- return torch.cat([image_text_embeds, text_embeds], dim=1)
377
-
378
-
379
- class ImageProjection(nn.Module):
380
- def __init__(
381
- self,
382
- image_embed_dim: int = 768,
383
- cross_attention_dim: int = 768,
384
- num_image_text_embeds: int = 32,
385
- ):
386
- super().__init__()
387
-
388
- self.num_image_text_embeds = num_image_text_embeds
389
- self.image_embeds = nn.Linear(image_embed_dim, self.num_image_text_embeds * cross_attention_dim)
390
- self.norm = nn.LayerNorm(cross_attention_dim)
391
-
392
- def forward(self, image_embeds: torch.FloatTensor):
393
- batch_size = image_embeds.shape[0]
394
-
395
- # image
396
- image_embeds = self.image_embeds(image_embeds)
397
- image_embeds = image_embeds.reshape(batch_size, self.num_image_text_embeds, -1)
398
- image_embeds = self.norm(image_embeds)
399
- return image_embeds
400
-
401
-
402
- class CombinedTimestepLabelEmbeddings(nn.Module):
403
- def __init__(self, num_classes, embedding_dim, class_dropout_prob=0.1):
404
- super().__init__()
405
-
406
- self.time_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=1)
407
- self.timestep_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim)
408
- self.class_embedder = LabelEmbedding(num_classes, embedding_dim, class_dropout_prob)
409
-
410
- def forward(self, timestep, class_labels, hidden_dtype=None):
411
- timesteps_proj = self.time_proj(timestep)
412
- timesteps_emb = self.timestep_embedder(timesteps_proj.to(dtype=hidden_dtype)) # (N, D)
413
-
414
- class_labels = self.class_embedder(class_labels) # (N, D)
415
-
416
- conditioning = timesteps_emb + class_labels # (N, D)
417
-
418
- return conditioning
419
-
420
-
421
- class TextTimeEmbedding(nn.Module):
422
- def __init__(self, encoder_dim: int, time_embed_dim: int, num_heads: int = 64):
423
- super().__init__()
424
- self.norm1 = nn.LayerNorm(encoder_dim)
425
- self.pool = AttentionPooling(num_heads, encoder_dim)
426
- self.proj = nn.Linear(encoder_dim, time_embed_dim)
427
- self.norm2 = nn.LayerNorm(time_embed_dim)
428
-
429
- def forward(self, hidden_states):
430
- hidden_states = self.norm1(hidden_states)
431
- hidden_states = self.pool(hidden_states)
432
- hidden_states = self.proj(hidden_states)
433
- hidden_states = self.norm2(hidden_states)
434
- return hidden_states
435
-
436
-
437
- class TextImageTimeEmbedding(nn.Module):
438
- def __init__(self, text_embed_dim: int = 768, image_embed_dim: int = 768, time_embed_dim: int = 1536):
439
- super().__init__()
440
- self.text_proj = nn.Linear(text_embed_dim, time_embed_dim)
441
- self.text_norm = nn.LayerNorm(time_embed_dim)
442
- self.image_proj = nn.Linear(image_embed_dim, time_embed_dim)
443
-
444
- def forward(self, text_embeds: torch.FloatTensor, image_embeds: torch.FloatTensor):
445
- # text
446
- time_text_embeds = self.text_proj(text_embeds)
447
- time_text_embeds = self.text_norm(time_text_embeds)
448
-
449
- # image
450
- time_image_embeds = self.image_proj(image_embeds)
451
-
452
- return time_image_embeds + time_text_embeds
453
-
454
-
455
- class ImageTimeEmbedding(nn.Module):
456
- def __init__(self, image_embed_dim: int = 768, time_embed_dim: int = 1536):
457
- super().__init__()
458
- self.image_proj = nn.Linear(image_embed_dim, time_embed_dim)
459
- self.image_norm = nn.LayerNorm(time_embed_dim)
460
-
461
- def forward(self, image_embeds: torch.FloatTensor):
462
- # image
463
- time_image_embeds = self.image_proj(image_embeds)
464
- time_image_embeds = self.image_norm(time_image_embeds)
465
- return time_image_embeds
466
-
467
-
468
- class ImageHintTimeEmbedding(nn.Module):
469
- def __init__(self, image_embed_dim: int = 768, time_embed_dim: int = 1536):
470
- super().__init__()
471
- self.image_proj = nn.Linear(image_embed_dim, time_embed_dim)
472
- self.image_norm = nn.LayerNorm(time_embed_dim)
473
- self.input_hint_block = nn.Sequential(
474
- nn.Conv2d(3, 16, 3, padding=1),
475
- nn.SiLU(),
476
- nn.Conv2d(16, 16, 3, padding=1),
477
- nn.SiLU(),
478
- nn.Conv2d(16, 32, 3, padding=1, stride=2),
479
- nn.SiLU(),
480
- nn.Conv2d(32, 32, 3, padding=1),
481
- nn.SiLU(),
482
- nn.Conv2d(32, 96, 3, padding=1, stride=2),
483
- nn.SiLU(),
484
- nn.Conv2d(96, 96, 3, padding=1),
485
- nn.SiLU(),
486
- nn.Conv2d(96, 256, 3, padding=1, stride=2),
487
- nn.SiLU(),
488
- nn.Conv2d(256, 4, 3, padding=1),
489
- )
490
-
491
- def forward(self, image_embeds: torch.FloatTensor, hint: torch.FloatTensor):
492
- # image
493
- time_image_embeds = self.image_proj(image_embeds)
494
- time_image_embeds = self.image_norm(time_image_embeds)
495
- hint = self.input_hint_block(hint)
496
- return time_image_embeds, hint
497
-
498
-
499
- class AttentionPooling(nn.Module):
500
- # Copied from https://github.com/deep-floyd/IF/blob/2f91391f27dd3c468bf174be5805b4cc92980c0b/deepfloyd_if/model/nn.py#L54
501
-
502
- def __init__(self, num_heads, embed_dim, dtype=None):
503
- super().__init__()
504
- self.dtype = dtype
505
- self.positional_embedding = nn.Parameter(torch.randn(1, embed_dim) / embed_dim**0.5)
506
- self.k_proj = nn.Linear(embed_dim, embed_dim, dtype=self.dtype)
507
- self.q_proj = nn.Linear(embed_dim, embed_dim, dtype=self.dtype)
508
- self.v_proj = nn.Linear(embed_dim, embed_dim, dtype=self.dtype)
509
- self.num_heads = num_heads
510
- self.dim_per_head = embed_dim // self.num_heads
511
-
512
- def forward(self, x):
513
- bs, length, width = x.size()
514
-
515
- def shape(x):
516
- # (bs, length, width) --> (bs, length, n_heads, dim_per_head)
517
- x = x.view(bs, -1, self.num_heads, self.dim_per_head)
518
- # (bs, length, n_heads, dim_per_head) --> (bs, n_heads, length, dim_per_head)
519
- x = x.transpose(1, 2)
520
- # (bs, n_heads, length, dim_per_head) --> (bs*n_heads, length, dim_per_head)
521
- x = x.reshape(bs * self.num_heads, -1, self.dim_per_head)
522
- # (bs*n_heads, length, dim_per_head) --> (bs*n_heads, dim_per_head, length)
523
- x = x.transpose(1, 2)
524
- return x
525
-
526
- class_token = x.mean(dim=1, keepdim=True) + self.positional_embedding.to(x.dtype)
527
- x = torch.cat([class_token, x], dim=1) # (bs, length+1, width)
528
-
529
- # (bs*n_heads, class_token_length, dim_per_head)
530
- q = shape(self.q_proj(class_token))
531
- # (bs*n_heads, length+class_token_length, dim_per_head)
532
- k = shape(self.k_proj(x))
533
- v = shape(self.v_proj(x))
534
-
535
- # (bs*n_heads, class_token_length, length+class_token_length):
536
- scale = 1 / math.sqrt(math.sqrt(self.dim_per_head))
537
- weight = torch.einsum("bct,bcs->bts", q * scale, k * scale) # More stable with f16 than dividing afterwards
538
- weight = torch.softmax(weight.float(), dim=-1).type(weight.dtype)
539
-
540
- # (bs*n_heads, dim_per_head, class_token_length)
541
- a = torch.einsum("bts,bcs->bct", weight, v)
542
-
543
- # (bs, length+1, width)
544
- a = a.reshape(bs, -1, 1).transpose(1, 2)
545
-
546
- return a[:, 0, :] # cls_token
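
For reference, the `AttentionPooling` module above collapses a sequence of encoder states into a single vector: the sequence mean (plus a learned positional embedding) acts as a query token that attends over the full sequence, and the attended result is returned as the pooled "class token". The snippet below is a minimal, self-contained sketch of that idea with assumed toy shapes; it is illustrative only and is not the deleted module's API.

```python
import torch
import torch.nn as nn

# Toy sizes (assumptions for illustration only).
bs, length, width, num_heads = 2, 77, 768, 8
dim_per_head = width // num_heads

x = torch.randn(bs, length, width)                  # encoder hidden states
q_proj, k_proj, v_proj = (nn.Linear(width, width) for _ in range(3))

query = x.mean(dim=1, keepdim=True)                 # "class token" = sequence mean
tokens = torch.cat([query, x], dim=1)               # prepend it to the sequence

def split_heads(t):
    # (bs, L, width) -> (bs, heads, L, dim_per_head)
    return t.view(bs, -1, num_heads, dim_per_head).transpose(1, 2)

q = split_heads(q_proj(query))                      # (bs, heads, 1, d)
k = split_heads(k_proj(tokens))                     # (bs, heads, L+1, d)
v = split_heads(v_proj(tokens))

attn = torch.softmax(q @ k.transpose(-1, -2) / dim_per_head**0.5, dim=-1)
pooled = (attn @ v).transpose(1, 2).reshape(bs, width)   # (bs, width)
```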
 
4DoF/diffusers/models/embeddings_flax.py DELETED
@@ -1,95 +0,0 @@
1
- # Copyright 2023 The HuggingFace Team. All rights reserved.
2
- #
3
- # Licensed under the Apache License, Version 2.0 (the "License");
4
- # you may not use this file except in compliance with the License.
5
- # You may obtain a copy of the License at
6
- #
7
- # http://www.apache.org/licenses/LICENSE-2.0
8
- #
9
- # Unless required by applicable law or agreed to in writing, software
10
- # distributed under the License is distributed on an "AS IS" BASIS,
11
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- # See the License for the specific language governing permissions and
13
- # limitations under the License.
14
- import math
15
-
16
- import flax.linen as nn
17
- import jax.numpy as jnp
18
-
19
-
20
- def get_sinusoidal_embeddings(
21
- timesteps: jnp.ndarray,
22
- embedding_dim: int,
23
- freq_shift: float = 1,
24
- min_timescale: float = 1,
25
- max_timescale: float = 1.0e4,
26
- flip_sin_to_cos: bool = False,
27
- scale: float = 1.0,
28
- ) -> jnp.ndarray:
29
- """Returns the positional encoding (same as Tensor2Tensor).
30
-
31
- Args:
32
- timesteps: a 1-D Tensor of N indices, one per batch element.
33
- These may be fractional.
34
- embedding_dim: The number of output channels.
35
- min_timescale: The smallest time unit (should probably be 0.0).
36
- max_timescale: The largest time unit.
37
- Returns:
38
- a Tensor of timing signals [N, num_channels]
39
- """
40
- assert timesteps.ndim == 1, "Timesteps should be a 1d-array"
41
- assert embedding_dim % 2 == 0, f"Embedding dimension {embedding_dim} should be even"
42
- num_timescales = float(embedding_dim // 2)
43
- log_timescale_increment = math.log(max_timescale / min_timescale) / (num_timescales - freq_shift)
44
- inv_timescales = min_timescale * jnp.exp(jnp.arange(num_timescales, dtype=jnp.float32) * -log_timescale_increment)
45
- emb = jnp.expand_dims(timesteps, 1) * jnp.expand_dims(inv_timescales, 0)
46
-
47
- # scale embeddings
48
- scaled_time = scale * emb
49
-
50
- if flip_sin_to_cos:
51
- signal = jnp.concatenate([jnp.cos(scaled_time), jnp.sin(scaled_time)], axis=1)
52
- else:
53
- signal = jnp.concatenate([jnp.sin(scaled_time), jnp.cos(scaled_time)], axis=1)
54
- signal = jnp.reshape(signal, [jnp.shape(timesteps)[0], embedding_dim])
55
- return signal
56
-
57
-
58
- class FlaxTimestepEmbedding(nn.Module):
59
- r"""
60
- Time step Embedding Module. Learns embeddings for input time steps.
61
-
62
- Args:
63
- time_embed_dim (`int`, *optional*, defaults to `32`):
64
- Time step embedding dimension
65
- dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32):
66
- Parameters `dtype`
67
- """
68
- time_embed_dim: int = 32
69
- dtype: jnp.dtype = jnp.float32
70
-
71
- @nn.compact
72
- def __call__(self, temb):
73
- temb = nn.Dense(self.time_embed_dim, dtype=self.dtype, name="linear_1")(temb)
74
- temb = nn.silu(temb)
75
- temb = nn.Dense(self.time_embed_dim, dtype=self.dtype, name="linear_2")(temb)
76
- return temb
77
-
78
-
79
- class FlaxTimesteps(nn.Module):
80
- r"""
81
- Wrapper Module for sinusoidal Time step Embeddings as described in https://arxiv.org/abs/2006.11239
82
-
83
- Args:
84
- dim (`int`, *optional*, defaults to `32`):
85
- Time step embedding dimension
86
- """
87
- dim: int = 32
88
- flip_sin_to_cos: bool = False
89
- freq_shift: float = 1
90
-
91
- @nn.compact
92
- def __call__(self, timesteps):
93
- return get_sinusoidal_embeddings(
94
- timesteps, embedding_dim=self.dim, flip_sin_to_cos=self.flip_sin_to_cos, freq_shift=self.freq_shift
95
- )
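
As a quick check on what `get_sinusoidal_embeddings` computes, the NumPy sketch below reproduces the same geometric frequency schedule for a handful of timesteps (toy values chosen for illustration; `flip_sin_to_cos=False` and `scale=1.0` assumed):

```python
import numpy as np

# Toy inputs (assumptions for illustration only).
timesteps = np.array([0.0, 10.0, 500.0])
embedding_dim, min_timescale, max_timescale, freq_shift = 8, 1.0, 1.0e4, 1.0

half = embedding_dim // 2
log_increment = np.log(max_timescale / min_timescale) / (half - freq_shift)
inv_timescales = min_timescale * np.exp(np.arange(half) * -log_increment)

angles = timesteps[:, None] * inv_timescales[None, :]        # (N, half)
signal = np.concatenate([np.sin(angles), np.cos(angles)], axis=1)
print(signal.shape)                                          # (3, 8)
```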
 
4DoF/diffusers/models/modeling_flax_pytorch_utils.py DELETED
@@ -1,118 +0,0 @@
1
- # coding=utf-8
2
- # Copyright 2023 The HuggingFace Inc. team.
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
15
- """ PyTorch - Flax general utilities."""
16
- import re
17
-
18
- import jax.numpy as jnp
19
- from flax.traverse_util import flatten_dict, unflatten_dict
20
- from jax.random import PRNGKey
21
-
22
- from ..utils import logging
23
-
24
-
25
- logger = logging.get_logger(__name__)
26
-
27
-
28
- def rename_key(key):
29
- regex = r"\w+[.]\d+"
30
- pats = re.findall(regex, key)
31
- for pat in pats:
32
- key = key.replace(pat, "_".join(pat.split(".")))
33
- return key
34
-
35
-
36
- #####################
37
- # PyTorch => Flax #
38
- #####################
39
-
40
-
41
- # Adapted from https://github.com/huggingface/transformers/blob/c603c80f46881ae18b2ca50770ef65fa4033eacd/src/transformers/modeling_flax_pytorch_utils.py#L69
42
- # and https://github.com/patil-suraj/stable-diffusion-jax/blob/main/stable_diffusion_jax/convert_diffusers_to_jax.py
43
- def rename_key_and_reshape_tensor(pt_tuple_key, pt_tensor, random_flax_state_dict):
44
- """Rename PT weight names to corresponding Flax weight names and reshape tensor if necessary"""
45
-
46
- # conv norm or layer norm
47
- renamed_pt_tuple_key = pt_tuple_key[:-1] + ("scale",)
48
- if (
49
- any("norm" in str_ for str_ in pt_tuple_key)
50
- and (pt_tuple_key[-1] == "bias")
51
- and (pt_tuple_key[:-1] + ("bias",) not in random_flax_state_dict)
52
- and (pt_tuple_key[:-1] + ("scale",) in random_flax_state_dict)
53
- ):
54
- renamed_pt_tuple_key = pt_tuple_key[:-1] + ("scale",)
55
- return renamed_pt_tuple_key, pt_tensor
56
- elif pt_tuple_key[-1] in ["weight", "gamma"] and pt_tuple_key[:-1] + ("scale",) in random_flax_state_dict:
57
- renamed_pt_tuple_key = pt_tuple_key[:-1] + ("scale",)
58
- return renamed_pt_tuple_key, pt_tensor
59
-
60
- # embedding
61
- if pt_tuple_key[-1] == "weight" and pt_tuple_key[:-1] + ("embedding",) in random_flax_state_dict:
62
- pt_tuple_key = pt_tuple_key[:-1] + ("embedding",)
63
- return renamed_pt_tuple_key, pt_tensor
64
-
65
- # conv layer
66
- renamed_pt_tuple_key = pt_tuple_key[:-1] + ("kernel",)
67
- if pt_tuple_key[-1] == "weight" and pt_tensor.ndim == 4:
68
- pt_tensor = pt_tensor.transpose(2, 3, 1, 0)
69
- return renamed_pt_tuple_key, pt_tensor
70
-
71
- # linear layer
72
- renamed_pt_tuple_key = pt_tuple_key[:-1] + ("kernel",)
73
- if pt_tuple_key[-1] == "weight":
74
- pt_tensor = pt_tensor.T
75
- return renamed_pt_tuple_key, pt_tensor
76
-
77
- # old PyTorch layer norm weight
78
- renamed_pt_tuple_key = pt_tuple_key[:-1] + ("weight",)
79
- if pt_tuple_key[-1] == "gamma":
80
- return renamed_pt_tuple_key, pt_tensor
81
-
82
- # old PyTorch layer norm bias
83
- renamed_pt_tuple_key = pt_tuple_key[:-1] + ("bias",)
84
- if pt_tuple_key[-1] == "beta":
85
- return renamed_pt_tuple_key, pt_tensor
86
-
87
- return pt_tuple_key, pt_tensor
88
-
89
-
90
- def convert_pytorch_state_dict_to_flax(pt_state_dict, flax_model, init_key=42):
91
- # Step 1: Convert pytorch tensor to numpy
92
- pt_state_dict = {k: v.numpy() for k, v in pt_state_dict.items()}
93
-
94
- # Step 2: Since the model is stateless, get random Flax params
95
- random_flax_params = flax_model.init_weights(PRNGKey(init_key))
96
-
97
- random_flax_state_dict = flatten_dict(random_flax_params)
98
- flax_state_dict = {}
99
-
100
- # Need to change some parameters name to match Flax names
101
- for pt_key, pt_tensor in pt_state_dict.items():
102
- renamed_pt_key = rename_key(pt_key)
103
- pt_tuple_key = tuple(renamed_pt_key.split("."))
104
-
105
- # Correctly rename weight parameters
106
- flax_key, flax_tensor = rename_key_and_reshape_tensor(pt_tuple_key, pt_tensor, random_flax_state_dict)
107
-
108
- if flax_key in random_flax_state_dict:
109
- if flax_tensor.shape != random_flax_state_dict[flax_key].shape:
110
- raise ValueError(
111
- f"PyTorch checkpoint seems to be incorrect. Weight {pt_key} was expected to be of shape "
112
- f"{random_flax_state_dict[flax_key].shape}, but is {flax_tensor.shape}."
113
- )
114
-
115
- # also add unexpected weight so that warning is thrown
116
- flax_state_dict[flax_key] = jnp.asarray(flax_tensor)
117
-
118
- return unflatten_dict(flax_state_dict)
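
The renaming logic above mostly encodes two layout conventions: a PyTorch `Linear.weight` of shape `(out, in)` becomes a Flax `Dense` kernel of shape `(in, out)`, and a PyTorch `Conv2d.weight` of shape `(out, in, kh, kw)` becomes a Flax `Conv` kernel of shape `(kh, kw, in, out)`. A small NumPy sketch of just those two reshapes (illustrative only):

```python
import numpy as np

# PyTorch Linear.weight (out, in) -> Flax Dense kernel (in, out)
pt_linear = np.random.randn(32, 16)
flax_dense_kernel = pt_linear.T                      # (16, 32)

# PyTorch Conv2d.weight (out, in, kh, kw) -> Flax Conv kernel (kh, kw, in, out)
pt_conv = np.random.randn(64, 3, 3, 3)
flax_conv_kernel = pt_conv.transpose(2, 3, 1, 0)     # (3, 3, 3, 64)
```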
 
4DoF/diffusers/models/modeling_flax_utils.py DELETED
@@ -1,534 +0,0 @@
1
- # coding=utf-8
2
- # Copyright 2023 The HuggingFace Inc. team.
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
15
-
16
- import os
17
- from pickle import UnpicklingError
18
- from typing import Any, Dict, Union
19
-
20
- import jax
21
- import jax.numpy as jnp
22
- import msgpack.exceptions
23
- from flax.core.frozen_dict import FrozenDict, unfreeze
24
- from flax.serialization import from_bytes, to_bytes
25
- from flax.traverse_util import flatten_dict, unflatten_dict
26
- from huggingface_hub import hf_hub_download
27
- from huggingface_hub.utils import EntryNotFoundError, RepositoryNotFoundError, RevisionNotFoundError
28
- from requests import HTTPError
29
-
30
- from .. import __version__, is_torch_available
31
- from ..utils import (
32
- CONFIG_NAME,
33
- DIFFUSERS_CACHE,
34
- FLAX_WEIGHTS_NAME,
35
- HUGGINGFACE_CO_RESOLVE_ENDPOINT,
36
- WEIGHTS_NAME,
37
- logging,
38
- )
39
- from .modeling_flax_pytorch_utils import convert_pytorch_state_dict_to_flax
40
-
41
-
42
- logger = logging.get_logger(__name__)
43
-
44
-
45
- class FlaxModelMixin:
46
- r"""
47
- Base class for all Flax models.
48
-
49
- [`FlaxModelMixin`] takes care of storing the model configuration and provides methods for loading, downloading and
50
- saving models.
51
-
52
- - **config_name** ([`str`]) -- Filename to save a model to when calling [`~FlaxModelMixin.save_pretrained`].
53
- """
54
- config_name = CONFIG_NAME
55
- _automatically_saved_args = ["_diffusers_version", "_class_name", "_name_or_path"]
56
- _flax_internal_args = ["name", "parent", "dtype"]
57
-
58
- @classmethod
59
- def _from_config(cls, config, **kwargs):
60
- """
61
- All context managers that the model should be initialized under go here.
62
- """
63
- return cls(config, **kwargs)
64
-
65
- def _cast_floating_to(self, params: Union[Dict, FrozenDict], dtype: jnp.dtype, mask: Any = None) -> Any:
66
- """
67
- Helper method to cast floating-point values of given parameter `PyTree` to given `dtype`.
68
- """
69
-
70
- # taken from https://github.com/deepmind/jmp/blob/3a8318abc3292be38582794dbf7b094e6583b192/jmp/_src/policy.py#L27
71
- def conditional_cast(param):
72
- if isinstance(param, jnp.ndarray) and jnp.issubdtype(param.dtype, jnp.floating):
73
- param = param.astype(dtype)
74
- return param
75
-
76
- if mask is None:
77
- return jax.tree_map(conditional_cast, params)
78
-
79
- flat_params = flatten_dict(params)
80
- flat_mask, _ = jax.tree_flatten(mask)
81
-
82
- for masked, key in zip(flat_mask, flat_params.keys()):
83
- if masked:
84
- param = flat_params[key]
85
- flat_params[key] = conditional_cast(param)
86
-
87
- return unflatten_dict(flat_params)
88
-
89
- def to_bf16(self, params: Union[Dict, FrozenDict], mask: Any = None):
90
- r"""
91
- Cast the floating-point `params` to `jax.numpy.bfloat16`. This returns a new `params` tree and does not cast
92
- the `params` in place.
93
-
94
- This method can be used on a TPU to explicitly convert the model parameters to bfloat16 precision to do full
95
- half-precision training or to save weights in bfloat16 for inference in order to save memory and improve speed.
96
-
97
- Arguments:
98
- params (`Union[Dict, FrozenDict]`):
99
- A `PyTree` of model parameters.
100
- mask (`Union[Dict, FrozenDict]`):
101
- A `PyTree` with same structure as the `params` tree. The leaves should be booleans. It should be `True`
102
- for params you want to cast, and `False` for those you want to skip.
103
-
104
- Examples:
105
-
106
- ```python
107
- >>> from diffusers import FlaxUNet2DConditionModel
108
-
109
- >>> # load model
110
- >>> model, params = FlaxUNet2DConditionModel.from_pretrained("runwayml/stable-diffusion-v1-5")
111
- >>> # By default, the model parameters will be in fp32 precision, to cast these to bfloat16 precision
112
- >>> params = model.to_bf16(params)
113
- >>> # If you don't want to cast certain parameters (for example layer norm bias and scale)
114
- >>> # then pass the mask as follows
115
- >>> from flax import traverse_util
116
-
117
- >>> model, params = FlaxUNet2DConditionModel.from_pretrained("runwayml/stable-diffusion-v1-5")
118
- >>> flat_params = traverse_util.flatten_dict(params)
119
- >>> mask = {
120
- ... path: (path[-2:] != ("LayerNorm", "bias") and path[-2:] != ("LayerNorm", "scale"))
121
- ... for path in flat_params
122
- ... }
123
- >>> mask = traverse_util.unflatten_dict(mask)
124
- >>> params = model.to_bf16(params, mask)
125
- ```"""
126
- return self._cast_floating_to(params, jnp.bfloat16, mask)
127
-
128
- def to_fp32(self, params: Union[Dict, FrozenDict], mask: Any = None):
129
- r"""
130
- Cast the floating-point `params` to `jax.numpy.float32`. This method can be used to explicitly convert the
131
- model parameters to fp32 precision. This returns a new `params` tree and does not cast the `params` in place.
132
-
133
- Arguments:
134
- params (`Union[Dict, FrozenDict]`):
135
- A `PyTree` of model parameters.
136
- mask (`Union[Dict, FrozenDict]`):
137
- A `PyTree` with same structure as the `params` tree. The leaves should be booleans. It should be `True`
138
- for params you want to cast, and `False` for those you want to skip.
139
-
140
- Examples:
141
-
142
- ```python
143
- >>> from diffusers import FlaxUNet2DConditionModel
144
-
145
- >>> # Download model and configuration from huggingface.co
146
- >>> model, params = FlaxUNet2DConditionModel.from_pretrained("runwayml/stable-diffusion-v1-5")
147
- >>> # By default, the model params will be in fp32, to illustrate the use of this method,
148
- >>> # we'll first cast to fp16 and back to fp32
149
- >>> params = model.to_fp16(params)
150
- >>> # now cast back to fp32
151
- >>> params = model.to_fp32(params)
152
- ```"""
153
- return self._cast_floating_to(params, jnp.float32, mask)
154
-
155
- def to_fp16(self, params: Union[Dict, FrozenDict], mask: Any = None):
156
- r"""
157
- Cast the floating-point `params` to `jax.numpy.float16`. This returns a new `params` tree and does not cast the
158
- `params` in place.
159
-
160
- This method can be used on a GPU to explicitly convert the model parameters to float16 precision to do full
161
- half-precision training or to save weights in float16 for inference in order to save memory and improve speed.
162
-
163
- Arguments:
164
- params (`Union[Dict, FrozenDict]`):
165
- A `PyTree` of model parameters.
166
- mask (`Union[Dict, FrozenDict]`):
167
- A `PyTree` with same structure as the `params` tree. The leaves should be booleans. It should be `True`
168
- for params you want to cast, and `False` for those you want to skip.
169
-
170
- Examples:
171
-
172
- ```python
173
- >>> from diffusers import FlaxUNet2DConditionModel
174
-
175
- >>> # load model
176
- >>> model, params = FlaxUNet2DConditionModel.from_pretrained("runwayml/stable-diffusion-v1-5")
177
- >>> # By default, the model params will be in fp32, to cast these to float16
178
- >>> params = model.to_fp16(params)
179
- >>> # If you don't want to cast certain parameters (for example layer norm bias and scale)
180
- >>> # then pass the mask as follows
181
- >>> from flax import traverse_util
182
-
183
- >>> model, params = FlaxUNet2DConditionModel.from_pretrained("runwayml/stable-diffusion-v1-5")
184
- >>> flat_params = traverse_util.flatten_dict(params)
185
- >>> mask = {
186
- ... path: (path[-2:] != ("LayerNorm", "bias") and path[-2:] != ("LayerNorm", "scale"))
187
- ... for path in flat_params
188
- ... }
189
- >>> mask = traverse_util.unflatten_dict(mask)
190
- >>> params = model.to_fp16(params, mask)
191
- ```"""
192
- return self._cast_floating_to(params, jnp.float16, mask)
193
-
194
- def init_weights(self, rng: jax.random.KeyArray) -> Dict:
195
- raise NotImplementedError(f"init_weights method has to be implemented for {self}")
196
-
197
- @classmethod
198
- def from_pretrained(
199
- cls,
200
- pretrained_model_name_or_path: Union[str, os.PathLike],
201
- dtype: jnp.dtype = jnp.float32,
202
- *model_args,
203
- **kwargs,
204
- ):
205
- r"""
206
- Instantiate a pretrained Flax model from a pretrained model configuration.
207
-
208
- Parameters:
209
- pretrained_model_name_or_path (`str` or `os.PathLike`):
210
- Can be either:
211
-
212
- - A string, the *model id* (for example `runwayml/stable-diffusion-v1-5`) of a pretrained model
213
- hosted on the Hub.
214
- - A path to a *directory* (for example `./my_model_directory`) containing the model weights saved
215
- using [`~FlaxModelMixin.save_pretrained`].
216
- dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`):
217
- The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on GPUs) and
218
- `jax.numpy.bfloat16` (on TPUs).
219
-
220
- This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If
221
- specified, all the computation will be performed with the given `dtype`.
222
-
223
- <Tip>
224
-
225
- This only specifies the dtype of the *computation* and does not influence the dtype of model
226
- parameters.
227
-
228
- If you wish to change the dtype of the model parameters, see [`~FlaxModelMixin.to_fp16`] and
229
- [`~FlaxModelMixin.to_bf16`].
230
-
231
- </Tip>
232
-
233
- model_args (sequence of positional arguments, *optional*):
234
- All remaining positional arguments are passed to the underlying model's `__init__` method.
235
- cache_dir (`Union[str, os.PathLike]`, *optional*):
236
- Path to a directory where a downloaded pretrained model configuration is cached if the standard cache
237
- is not used.
238
- force_download (`bool`, *optional*, defaults to `False`):
239
- Whether or not to force the (re-)download of the model weights and configuration files, overriding the
240
- cached versions if they exist.
241
- resume_download (`bool`, *optional*, defaults to `False`):
242
- Whether or not to resume downloading the model weights and configuration files. If set to `False`, any
243
- incompletely downloaded files are deleted.
244
- proxies (`Dict[str, str]`, *optional*):
245
- A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
246
- 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
247
- local_files_only(`bool`, *optional*, defaults to `False`):
248
- Whether to only load local model weights and configuration files or not. If set to `True`, the model
249
- won't be downloaded from the Hub.
250
- revision (`str`, *optional*, defaults to `"main"`):
251
- The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier
252
- allowed by Git.
253
- from_pt (`bool`, *optional*, defaults to `False`):
254
- Load the model weights from a PyTorch checkpoint save file.
255
- kwargs (remaining dictionary of keyword arguments, *optional*):
256
- Can be used to update the configuration object (after it is loaded) and initiate the model (for
257
- example, `output_attentions=True`). Behaves differently depending on whether a `config` is provided or
258
- automatically loaded:
259
-
260
- - If a configuration is provided with `config`, `kwargs` are directly passed to the underlying
261
- model's `__init__` method (we assume all relevant updates to the configuration have already been
262
- done).
263
- - If a configuration is not provided, `kwargs` are first passed to the configuration class
264
- initialization function [`~ConfigMixin.from_config`]. Each key of the `kwargs` that corresponds
265
- to a configuration attribute is used to override said attribute with the supplied `kwargs` value.
266
- Remaining keys that do not correspond to any configuration attribute are passed to the underlying
267
- model's `__init__` function.
268
-
269
- Examples:
270
-
271
- ```python
272
- >>> from diffusers import FlaxUNet2DConditionModel
273
-
274
- >>> # Download model and configuration from huggingface.co and cache.
275
- >>> model, params = FlaxUNet2DConditionModel.from_pretrained("runwayml/stable-diffusion-v1-5")
276
- >>> # Model was saved using *save_pretrained('./test/saved_model/')* (for example purposes, not runnable).
277
- >>> model, params = FlaxUNet2DConditionModel.from_pretrained("./test/saved_model/")
278
- ```
279
-
280
- If you get the error message below, you need to finetune the weights for your downstream task:
281
-
282
- ```bash
283
- Some weights of UNet2DConditionModel were not initialized from the model checkpoint at runwayml/stable-diffusion-v1-5 and are newly initialized because the shapes did not match:
284
- - conv_in.weight: found shape torch.Size([320, 4, 3, 3]) in the checkpoint and torch.Size([320, 9, 3, 3]) in the model instantiated
285
- You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
286
- ```
287
- """
288
- config = kwargs.pop("config", None)
289
- cache_dir = kwargs.pop("cache_dir", DIFFUSERS_CACHE)
290
- force_download = kwargs.pop("force_download", False)
291
- from_pt = kwargs.pop("from_pt", False)
292
- resume_download = kwargs.pop("resume_download", False)
293
- proxies = kwargs.pop("proxies", None)
294
- local_files_only = kwargs.pop("local_files_only", False)
295
- use_auth_token = kwargs.pop("use_auth_token", None)
296
- revision = kwargs.pop("revision", None)
297
- subfolder = kwargs.pop("subfolder", None)
298
-
299
- user_agent = {
300
- "diffusers": __version__,
301
- "file_type": "model",
302
- "framework": "flax",
303
- }
304
-
305
- # Load config if we don't provide a configuration
306
- config_path = config if config is not None else pretrained_model_name_or_path
307
- model, model_kwargs = cls.from_config(
308
- config_path,
309
- cache_dir=cache_dir,
310
- return_unused_kwargs=True,
311
- force_download=force_download,
312
- resume_download=resume_download,
313
- proxies=proxies,
314
- local_files_only=local_files_only,
315
- use_auth_token=use_auth_token,
316
- revision=revision,
317
- subfolder=subfolder,
318
- # model args
319
- dtype=dtype,
320
- **kwargs,
321
- )
322
-
323
- # Load model
324
- pretrained_path_with_subfolder = (
325
- pretrained_model_name_or_path
326
- if subfolder is None
327
- else os.path.join(pretrained_model_name_or_path, subfolder)
328
- )
329
- if os.path.isdir(pretrained_path_with_subfolder):
330
- if from_pt:
331
- if not os.path.isfile(os.path.join(pretrained_path_with_subfolder, WEIGHTS_NAME)):
332
- raise EnvironmentError(
333
- f"Error no file named {WEIGHTS_NAME} found in directory {pretrained_path_with_subfolder} "
334
- )
335
- model_file = os.path.join(pretrained_path_with_subfolder, WEIGHTS_NAME)
336
- elif os.path.isfile(os.path.join(pretrained_path_with_subfolder, FLAX_WEIGHTS_NAME)):
337
- # Load from a Flax checkpoint
338
- model_file = os.path.join(pretrained_path_with_subfolder, FLAX_WEIGHTS_NAME)
339
- # Check if pytorch weights exist instead
340
- elif os.path.isfile(os.path.join(pretrained_path_with_subfolder, WEIGHTS_NAME)):
341
- raise EnvironmentError(
342
- f"{WEIGHTS_NAME} file found in directory {pretrained_path_with_subfolder}. Please load the model"
343
- " using `from_pt=True`."
344
- )
345
- else:
346
- raise EnvironmentError(
347
- f"Error no file named {FLAX_WEIGHTS_NAME} or {WEIGHTS_NAME} found in directory "
348
- f"{pretrained_path_with_subfolder}."
349
- )
350
- else:
351
- try:
352
- model_file = hf_hub_download(
353
- pretrained_model_name_or_path,
354
- filename=FLAX_WEIGHTS_NAME if not from_pt else WEIGHTS_NAME,
355
- cache_dir=cache_dir,
356
- force_download=force_download,
357
- proxies=proxies,
358
- resume_download=resume_download,
359
- local_files_only=local_files_only,
360
- use_auth_token=use_auth_token,
361
- user_agent=user_agent,
362
- subfolder=subfolder,
363
- revision=revision,
364
- )
365
-
366
- except RepositoryNotFoundError:
367
- raise EnvironmentError(
368
- f"{pretrained_model_name_or_path} is not a local folder and is not a valid model identifier "
369
- "listed on 'https://huggingface.co/models'\nIf this is a private repository, make sure to pass a "
370
- "token having permission to this repo with `use_auth_token` or log in with `huggingface-cli "
371
- "login`."
372
- )
373
- except RevisionNotFoundError:
374
- raise EnvironmentError(
375
- f"{revision} is not a valid git identifier (branch name, tag name or commit id) that exists for "
376
- "this model name. Check the model page at "
377
- f"'https://huggingface.co/{pretrained_model_name_or_path}' for available revisions."
378
- )
379
- except EntryNotFoundError:
380
- raise EnvironmentError(
381
- f"{pretrained_model_name_or_path} does not appear to have a file named {FLAX_WEIGHTS_NAME}."
382
- )
383
- except HTTPError as err:
384
- raise EnvironmentError(
385
- f"There was a specific connection error when trying to load {pretrained_model_name_or_path}:\n"
386
- f"{err}"
387
- )
388
- except ValueError:
389
- raise EnvironmentError(
390
- f"We couldn't connect to '{HUGGINGFACE_CO_RESOLVE_ENDPOINT}' to load this model, couldn't find it"
391
- f" in the cached files and it looks like {pretrained_model_name_or_path} is not the path to a"
392
- f" directory containing a file named {FLAX_WEIGHTS_NAME} or {WEIGHTS_NAME}.\nCheckout your"
393
- " internet connection or see how to run the library in offline mode at"
394
- " 'https://huggingface.co/docs/transformers/installation#offline-mode'."
395
- )
396
- except EnvironmentError:
397
- raise EnvironmentError(
398
- f"Can't load the model for '{pretrained_model_name_or_path}'. If you were trying to load it from "
399
- "'https://huggingface.co/models', make sure you don't have a local directory with the same name. "
400
- f"Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a directory "
401
- f"containing a file named {FLAX_WEIGHTS_NAME} or {WEIGHTS_NAME}."
402
- )
403
-
404
- if from_pt:
405
- if is_torch_available():
406
- from .modeling_utils import load_state_dict
407
- else:
408
- raise EnvironmentError(
409
- "Can't load the model in PyTorch format because PyTorch is not installed. "
410
- "Please, install PyTorch or use native Flax weights."
411
- )
412
-
413
- # Step 1: Get the pytorch file
414
- pytorch_model_file = load_state_dict(model_file)
415
-
416
- # Step 2: Convert the weights
417
- state = convert_pytorch_state_dict_to_flax(pytorch_model_file, model)
418
- else:
419
- try:
420
- with open(model_file, "rb") as state_f:
421
- state = from_bytes(cls, state_f.read())
422
- except (UnpicklingError, msgpack.exceptions.ExtraData) as e:
423
- try:
424
- with open(model_file) as f:
425
- if f.read().startswith("version"):
426
- raise OSError(
427
- "You seem to have cloned a repository without having git-lfs installed. Please"
428
- " install git-lfs and run `git lfs install` followed by `git lfs pull` in the"
429
- " folder you cloned."
430
- )
431
- else:
432
- raise ValueError from e
433
- except (UnicodeDecodeError, ValueError):
434
- raise EnvironmentError(f"Unable to convert {model_file} to Flax deserializable object. ")
435
- # make sure all arrays are stored as jnp.ndarray
436
- # NOTE: This is to prevent a bug this will be fixed in Flax >= v0.3.4:
437
- # https://github.com/google/flax/issues/1261
438
- state = jax.tree_util.tree_map(lambda x: jax.device_put(x, jax.devices("cpu")[0]), state)
439
-
440
- # flatten dicts
441
- state = flatten_dict(state)
442
-
443
- params_shape_tree = jax.eval_shape(model.init_weights, rng=jax.random.PRNGKey(0))
444
- required_params = set(flatten_dict(unfreeze(params_shape_tree)).keys())
445
-
446
- shape_state = flatten_dict(unfreeze(params_shape_tree))
447
-
448
- missing_keys = required_params - set(state.keys())
449
- unexpected_keys = set(state.keys()) - required_params
450
-
451
- if missing_keys:
452
- logger.warning(
453
- f"The checkpoint {pretrained_model_name_or_path} is missing required keys: {missing_keys}. "
454
- "Make sure to call model.init_weights to initialize the missing weights."
455
- )
456
- cls._missing_keys = missing_keys
457
-
458
- for key in state.keys():
459
- if key in shape_state and state[key].shape != shape_state[key].shape:
460
- raise ValueError(
461
- f"Trying to load the pretrained weight for {key} failed: checkpoint has shape "
462
- f"{state[key].shape} which is incompatible with the model shape {shape_state[key].shape}. "
463
- )
464
-
465
- # remove unexpected keys to not be saved again
466
- for unexpected_key in unexpected_keys:
467
- del state[unexpected_key]
468
-
469
- if len(unexpected_keys) > 0:
470
- logger.warning(
471
- f"Some weights of the model checkpoint at {pretrained_model_name_or_path} were not used when"
472
- f" initializing {model.__class__.__name__}: {unexpected_keys}\n- This IS expected if you are"
473
- f" initializing {model.__class__.__name__} from the checkpoint of a model trained on another task or"
474
- " with another architecture."
475
- )
476
- else:
477
- logger.info(f"All model checkpoint weights were used when initializing {model.__class__.__name__}.\n")
478
-
479
- if len(missing_keys) > 0:
480
- logger.warning(
481
- f"Some weights of {model.__class__.__name__} were not initialized from the model checkpoint at"
482
- f" {pretrained_model_name_or_path} and are newly initialized: {missing_keys}\nYou should probably"
483
- " TRAIN this model on a down-stream task to be able to use it for predictions and inference."
484
- )
485
- else:
486
- logger.info(
487
- f"All the weights of {model.__class__.__name__} were initialized from the model checkpoint at"
488
- f" {pretrained_model_name_or_path}.\nIf your task is similar to the task the model of the checkpoint"
489
- f" was trained on, you can already use {model.__class__.__name__} for predictions without further"
490
- " training."
491
- )
492
-
493
- return model, unflatten_dict(state)
494
-
495
- def save_pretrained(
496
- self,
497
- save_directory: Union[str, os.PathLike],
498
- params: Union[Dict, FrozenDict],
499
- is_main_process: bool = True,
500
- ):
501
- """
502
- Save a model and its configuration file to a directory so that it can be reloaded using the
503
- [`~FlaxModelMixin.from_pretrained`] class method.
504
-
505
- Arguments:
506
- save_directory (`str` or `os.PathLike`):
507
- Directory to save a model and its configuration file to. Will be created if it doesn't exist.
508
- params (`Union[Dict, FrozenDict]`):
509
- A `PyTree` of model parameters.
510
- is_main_process (`bool`, *optional*, defaults to `True`):
511
- Whether the process calling this is the main process or not. Useful during distributed training when you
512
- need to call this function on all processes. In this case, set `is_main_process=True` only on the main
513
- process to avoid race conditions.
514
- """
515
- if os.path.isfile(save_directory):
516
- logger.error(f"Provided path ({save_directory}) should be a directory, not a file")
517
- return
518
-
519
- os.makedirs(save_directory, exist_ok=True)
520
-
521
- model_to_save = self
522
-
523
- # Attach architecture to the config
524
- # Save the config
525
- if is_main_process:
526
- model_to_save.save_config(save_directory)
527
-
528
- # save model
529
- output_model_file = os.path.join(save_directory, FLAX_WEIGHTS_NAME)
530
- with open(output_model_file, "wb") as f:
531
- model_bytes = to_bytes(params)
532
- f.write(model_bytes)
533
-
534
- logger.info(f"Model weights saved in {output_model_file}")
 
4DoF/diffusers/models/modeling_pytorch_flax_utils.py DELETED
@@ -1,161 +0,0 @@
1
- # coding=utf-8
2
- # Copyright 2023 The HuggingFace Inc. team.
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
15
- """ PyTorch - Flax general utilities."""
16
-
17
- from pickle import UnpicklingError
18
-
19
- import jax
20
- import jax.numpy as jnp
21
- import numpy as np
22
- from flax.serialization import from_bytes
23
- from flax.traverse_util import flatten_dict
24
-
25
- from ..utils import logging
26
-
27
-
28
- logger = logging.get_logger(__name__)
29
-
30
-
31
- #####################
32
- # Flax => PyTorch #
33
- #####################
34
-
35
-
36
- # from https://github.com/huggingface/transformers/blob/main/src/transformers/modeling_flax_pytorch_utils.py#L224-L352
37
- def load_flax_checkpoint_in_pytorch_model(pt_model, model_file):
38
- try:
39
- with open(model_file, "rb") as flax_state_f:
40
- flax_state = from_bytes(None, flax_state_f.read())
41
- except UnpicklingError as e:
42
- try:
43
- with open(model_file) as f:
44
- if f.read().startswith("version"):
45
- raise OSError(
46
- "You seem to have cloned a repository without having git-lfs installed. Please"
47
- " install git-lfs and run `git lfs install` followed by `git lfs pull` in the"
48
- " folder you cloned."
49
- )
50
- else:
51
- raise ValueError from e
52
- except (UnicodeDecodeError, ValueError):
53
- raise EnvironmentError(f"Unable to convert {model_file} to Flax deserializable object. ")
54
-
55
- return load_flax_weights_in_pytorch_model(pt_model, flax_state)
56
-
57
-
58
- def load_flax_weights_in_pytorch_model(pt_model, flax_state):
59
- """Load flax checkpoints in a PyTorch model"""
60
-
61
- try:
62
- import torch # noqa: F401
63
- except ImportError:
64
- logger.error(
65
- "Loading Flax weights in PyTorch requires both PyTorch and Flax to be installed. Please see"
66
- " https://pytorch.org/ and https://flax.readthedocs.io/en/latest/installation.html for installation"
67
- " instructions."
68
- )
69
- raise
70
-
71
- # check if we have bf16 weights
72
- is_type_bf16 = flatten_dict(jax.tree_util.tree_map(lambda x: x.dtype == jnp.bfloat16, flax_state)).values()
73
- if any(is_type_bf16):
74
- # convert all weights to fp32 if they are bf16 since torch.from_numpy can-not handle bf16
75
-
76
- # and bf16 is not fully supported in PT yet.
77
- logger.warning(
78
- "Found ``bfloat16`` weights in Flax model. Casting all ``bfloat16`` weights to ``float32`` "
79
- "before loading those in PyTorch model."
80
- )
81
- flax_state = jax.tree_util.tree_map(
82
- lambda params: params.astype(np.float32) if params.dtype == jnp.bfloat16 else params, flax_state
83
- )
84
-
85
- pt_model.base_model_prefix = ""
86
-
87
- flax_state_dict = flatten_dict(flax_state, sep=".")
88
- pt_model_dict = pt_model.state_dict()
89
-
90
- # keep track of unexpected & missing keys
91
- unexpected_keys = []
92
- missing_keys = set(pt_model_dict.keys())
93
-
94
- for flax_key_tuple, flax_tensor in flax_state_dict.items():
95
- flax_key_tuple_array = flax_key_tuple.split(".")
96
-
97
- if flax_key_tuple_array[-1] == "kernel" and flax_tensor.ndim == 4:
98
- flax_key_tuple_array = flax_key_tuple_array[:-1] + ["weight"]
99
- flax_tensor = jnp.transpose(flax_tensor, (3, 2, 0, 1))
100
- elif flax_key_tuple_array[-1] == "kernel":
101
- flax_key_tuple_array = flax_key_tuple_array[:-1] + ["weight"]
102
- flax_tensor = flax_tensor.T
103
- elif flax_key_tuple_array[-1] == "scale":
104
- flax_key_tuple_array = flax_key_tuple_array[:-1] + ["weight"]
105
-
106
- if "time_embedding" not in flax_key_tuple_array:
107
- for i, flax_key_tuple_string in enumerate(flax_key_tuple_array):
108
- flax_key_tuple_array[i] = (
109
- flax_key_tuple_string.replace("_0", ".0")
110
- .replace("_1", ".1")
111
- .replace("_2", ".2")
112
- .replace("_3", ".3")
113
- .replace("_4", ".4")
114
- .replace("_5", ".5")
115
- .replace("_6", ".6")
116
- .replace("_7", ".7")
117
- .replace("_8", ".8")
118
- .replace("_9", ".9")
119
- )
120
-
121
- flax_key = ".".join(flax_key_tuple_array)
122
-
123
- if flax_key in pt_model_dict:
124
- if flax_tensor.shape != pt_model_dict[flax_key].shape:
125
- raise ValueError(
126
- f"Flax checkpoint seems to be incorrect. Weight {flax_key_tuple} was expected "
127
- f"to be of shape {pt_model_dict[flax_key].shape}, but is {flax_tensor.shape}."
128
- )
129
- else:
130
- # add weight to pytorch dict
131
- flax_tensor = np.asarray(flax_tensor) if not isinstance(flax_tensor, np.ndarray) else flax_tensor
132
- pt_model_dict[flax_key] = torch.from_numpy(flax_tensor)
133
- # remove from missing keys
134
- missing_keys.remove(flax_key)
135
- else:
136
- # weight is not expected by PyTorch model
137
- unexpected_keys.append(flax_key)
138
-
139
- pt_model.load_state_dict(pt_model_dict)
140
-
141
- # re-transform missing_keys to list
142
- missing_keys = list(missing_keys)
143
-
144
- if len(unexpected_keys) > 0:
145
- logger.warning(
146
- "Some weights of the Flax model were not used when initializing the PyTorch model"
147
- f" {pt_model.__class__.__name__}: {unexpected_keys}\n- This IS expected if you are initializing"
148
- f" {pt_model.__class__.__name__} from a Flax model trained on another task or with another architecture"
149
- " (e.g. initializing a BertForSequenceClassification model from a FlaxBertForPreTraining model).\n- This"
150
- f" IS NOT expected if you are initializing {pt_model.__class__.__name__} from a Flax model that you expect"
151
- " to be exactly identical (e.g. initializing a BertForSequenceClassification model from a"
152
- " FlaxBertForSequenceClassification model)."
153
- )
154
- if len(missing_keys) > 0:
155
- logger.warning(
156
- f"Some weights of {pt_model.__class__.__name__} were not initialized from the Flax model and are newly"
157
- f" initialized: {missing_keys}\nYou should probably TRAIN this model on a down-stream task to be able to"
158
- " use it for predictions and inference."
159
- )
160
-
161
- return pt_model
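
Going the other way, `load_flax_weights_in_pytorch_model` undoes the same layout conventions before copying tensors into the PyTorch state dict: a 4-D Flax `kernel` is transposed back to `(out, in, kh, kw)` and a 2-D kernel back to `(out, in)`. A NumPy sketch of those inverse transposes (illustrative only):

```python
import numpy as np

# Flax Conv kernel (kh, kw, in, out) -> PyTorch Conv2d.weight (out, in, kh, kw)
flax_conv = np.random.randn(3, 3, 3, 64)
pt_conv_weight = flax_conv.transpose(3, 2, 0, 1)     # (64, 3, 3, 3)

# Flax Dense kernel (in, out) -> PyTorch Linear.weight (out, in)
flax_dense = np.random.randn(16, 32)
pt_linear_weight = flax_dense.T                      # (32, 16)
```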
 
4DoF/diffusers/models/modeling_utils.py DELETED
@@ -1,980 +0,0 @@
1
- # coding=utf-8
2
- # Copyright 2023 The HuggingFace Inc. team.
3
- # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
4
- #
5
- # Licensed under the Apache License, Version 2.0 (the "License");
6
- # you may not use this file except in compliance with the License.
7
- # You may obtain a copy of the License at
8
- #
9
- # http://www.apache.org/licenses/LICENSE-2.0
10
- #
11
- # Unless required by applicable law or agreed to in writing, software
12
- # distributed under the License is distributed on an "AS IS" BASIS,
13
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
- # See the License for the specific language governing permissions and
15
- # limitations under the License.
16
-
17
- import inspect
18
- import itertools
19
- import os
20
- import re
21
- from functools import partial
22
- from typing import Any, Callable, List, Optional, Tuple, Union
23
-
24
- import torch
25
- from torch import Tensor, device, nn
26
-
27
- from .. import __version__
28
- from ..utils import (
29
- CONFIG_NAME,
30
- DIFFUSERS_CACHE,
31
- FLAX_WEIGHTS_NAME,
32
- HF_HUB_OFFLINE,
33
- SAFETENSORS_WEIGHTS_NAME,
34
- WEIGHTS_NAME,
35
- _add_variant,
36
- _get_model_file,
37
- deprecate,
38
- is_accelerate_available,
39
- is_safetensors_available,
40
- is_torch_version,
41
- logging,
42
- )
43
-
44
-
45
- logger = logging.get_logger(__name__)
46
-
47
-
48
- if is_torch_version(">=", "1.9.0"):
49
- _LOW_CPU_MEM_USAGE_DEFAULT = True
50
- else:
51
- _LOW_CPU_MEM_USAGE_DEFAULT = False
52
-
53
-
54
- if is_accelerate_available():
55
- import accelerate
56
- from accelerate.utils import set_module_tensor_to_device
57
- from accelerate.utils.versions import is_torch_version
58
-
59
- if is_safetensors_available():
60
- import safetensors
61
-
62
-
63
- def get_parameter_device(parameter: torch.nn.Module):
64
- try:
65
- parameters_and_buffers = itertools.chain(parameter.parameters(), parameter.buffers())
66
- return next(parameters_and_buffers).device
67
- except StopIteration:
68
- # For torch.nn.DataParallel compatibility in PyTorch 1.5
69
-
70
- def find_tensor_attributes(module: torch.nn.Module) -> List[Tuple[str, Tensor]]:
71
- tuples = [(k, v) for k, v in module.__dict__.items() if torch.is_tensor(v)]
72
- return tuples
73
-
74
- gen = parameter._named_members(get_members_fn=find_tensor_attributes)
75
- first_tuple = next(gen)
76
- return first_tuple[1].device
77
-
78
-
79
- def get_parameter_dtype(parameter: torch.nn.Module):
80
- try:
81
- params = tuple(parameter.parameters())
82
- if len(params) > 0:
83
- return params[0].dtype
84
-
85
- buffers = tuple(parameter.buffers())
86
- if len(buffers) > 0:
87
- return buffers[0].dtype
88
-
89
- except StopIteration:
90
- # For torch.nn.DataParallel compatibility in PyTorch 1.5
91
-
92
- def find_tensor_attributes(module: torch.nn.Module) -> List[Tuple[str, Tensor]]:
93
- tuples = [(k, v) for k, v in module.__dict__.items() if torch.is_tensor(v)]
94
- return tuples
95
-
96
- gen = parameter._named_members(get_members_fn=find_tensor_attributes)
97
- first_tuple = next(gen)
98
- return first_tuple[1].dtype
99
-
100
-
101
- def load_state_dict(checkpoint_file: Union[str, os.PathLike], variant: Optional[str] = None):
102
- """
103
- Reads a checkpoint file, returning properly formatted errors if they arise.
104
- """
105
- try:
106
- if os.path.basename(checkpoint_file) == _add_variant(WEIGHTS_NAME, variant):
107
- return torch.load(checkpoint_file, map_location="cpu")
108
- else:
109
- return safetensors.torch.load_file(checkpoint_file, device="cpu")
110
- except Exception as e:
111
- try:
112
- with open(checkpoint_file) as f:
113
- if f.read().startswith("version"):
114
- raise OSError(
115
- "You seem to have cloned a repository without having git-lfs installed. Please install "
116
- "git-lfs and run `git lfs install` followed by `git lfs pull` in the folder "
117
- "you cloned."
118
- )
119
- else:
120
- raise ValueError(
121
- f"Unable to locate the file {checkpoint_file} which is necessary to load this pretrained "
122
- "model. Make sure you have saved the model properly."
123
- ) from e
124
- except (UnicodeDecodeError, ValueError):
125
- raise OSError(
126
- f"Unable to load weights from checkpoint file for '{checkpoint_file}' "
127
- f"at '{checkpoint_file}'. "
128
- "If you tried to load a PyTorch model from a TF 2.0 checkpoint, please set from_tf=True."
129
- )
130
-
131
-
132
- def _load_state_dict_into_model(model_to_load, state_dict):
133
- # Convert old format to new format if needed from a PyTorch state_dict
134
- # copy state_dict so _load_from_state_dict can modify it
135
- state_dict = state_dict.copy()
136
- error_msgs = []
137
-
138
- # PyTorch's `_load_from_state_dict` does not copy parameters in a module's descendants
139
- # so we need to apply the function recursively.
140
- def load(module: torch.nn.Module, prefix=""):
141
- args = (state_dict, prefix, {}, True, [], [], error_msgs)
142
- module._load_from_state_dict(*args)
143
-
144
- for name, child in module._modules.items():
145
- if child is not None:
146
- load(child, prefix + name + ".")
147
-
148
- load(model_to_load)
149
-
150
- return error_msgs
151
-
152
-
153
- class ModelMixin(torch.nn.Module):
154
- r"""
155
- Base class for all models.
156
-
157
- [`ModelMixin`] takes care of storing the model configuration and provides methods for loading, downloading and
158
- saving models.
159
-
160
- - **config_name** ([`str`]) -- Filename to save a model to when calling [`~models.ModelMixin.save_pretrained`].
161
- """
162
- config_name = CONFIG_NAME
163
- _automatically_saved_args = ["_diffusers_version", "_class_name", "_name_or_path"]
164
- _supports_gradient_checkpointing = False
165
- _keys_to_ignore_on_load_unexpected = None
166
-
167
- def __init__(self):
168
- super().__init__()
169
-
170
- def __getattr__(self, name: str) -> Any:
171
- """The only reason we overwrite `getattr` here is to gracefully deprecate accessing
172
- config attributes directly. See https://github.com/huggingface/diffusers/pull/3129 We need to overwrite
173
- `__getattr__` here in addition so that we don't trigger `torch.nn.Module`'s `__getattr__`:
174
- https://pytorch.org/docs/stable/_modules/torch/nn/modules/module.html#Module
175
- """
176
-
177
- is_in_config = "_internal_dict" in self.__dict__ and hasattr(self.__dict__["_internal_dict"], name)
178
- is_attribute = name in self.__dict__
179
-
180
- if is_in_config and not is_attribute:
181
- deprecation_message = f"Accessing config attribute `{name}` directly via '{type(self).__name__}' object attribute is deprecated. Please access '{name}' through '{type(self).__name__}'s config object instead, e.g. 'unet.config.{name}'."
182
- deprecate("direct config name access", "1.0.0", deprecation_message, standard_warn=False, stacklevel=3)
183
- return self._internal_dict[name]
184
-
185
- # call PyTorch's https://pytorch.org/docs/stable/_modules/torch/nn/modules/module.html#Module
186
- return super().__getattr__(name)
187
-
188
- @property
189
- def is_gradient_checkpointing(self) -> bool:
190
- """
191
- Whether gradient checkpointing is activated for this model or not.
192
- """
193
- return any(hasattr(m, "gradient_checkpointing") and m.gradient_checkpointing for m in self.modules())
194
-
195
- def enable_gradient_checkpointing(self):
196
- """
197
- Activates gradient checkpointing for the current model (may be referred to as *activation checkpointing* or
198
- *checkpoint activations* in other frameworks).
199
- """
200
- if not self._supports_gradient_checkpointing:
201
- raise ValueError(f"{self.__class__.__name__} does not support gradient checkpointing.")
202
- self.apply(partial(self._set_gradient_checkpointing, value=True))
203
-
204
- def disable_gradient_checkpointing(self):
205
- """
206
- Deactivates gradient checkpointing for the current model (may be referred to as *activation checkpointing* or
207
- *checkpoint activations* in other frameworks).
208
- """
209
- if self._supports_gradient_checkpointing:
210
- self.apply(partial(self._set_gradient_checkpointing, value=False))
211
-
212
- def set_use_memory_efficient_attention_xformers(
213
- self, valid: bool, attention_op: Optional[Callable] = None
214
- ) -> None:
215
- # Recursively walk through all the children.
216
- # Any children which exposes the set_use_memory_efficient_attention_xformers method
217
- # gets the message
218
- def fn_recursive_set_mem_eff(module: torch.nn.Module):
219
- if hasattr(module, "set_use_memory_efficient_attention_xformers"):
220
- module.set_use_memory_efficient_attention_xformers(valid, attention_op)
221
-
222
- for child in module.children():
223
- fn_recursive_set_mem_eff(child)
224
-
225
- for module in self.children():
226
- if isinstance(module, torch.nn.Module):
227
- fn_recursive_set_mem_eff(module)
228
-
229
- def enable_xformers_memory_efficient_attention(self, attention_op: Optional[Callable] = None):
230
- r"""
231
- Enable memory efficient attention from [xFormers](https://facebookresearch.github.io/xformers/).
232
-
233
- When this option is enabled, you should observe lower GPU memory usage and a potential speed up during
234
- inference. Speed up during training is not guaranteed.
235
-
236
- <Tip warning={true}>
237
-
238
- ⚠️ When memory efficient attention and sliced attention are both enabled, memory efficient attention takes
239
- precedence.
240
-
241
- </Tip>
242
-
243
- Parameters:
244
- attention_op (`Callable`, *optional*):
245
- Override the default `None` operator for use as `op` argument to the
246
- [`memory_efficient_attention()`](https://facebookresearch.github.io/xformers/components/ops.html#xformers.ops.memory_efficient_attention)
247
- function of xFormers.
248
-
249
- Examples:
250
-
251
- ```py
252
- >>> import torch
253
- >>> from diffusers import UNet2DConditionModel
254
- >>> from xformers.ops import MemoryEfficientAttentionFlashAttentionOp
255
-
256
- >>> model = UNet2DConditionModel.from_pretrained(
257
- ... "stabilityai/stable-diffusion-2-1", subfolder="unet", torch_dtype=torch.float16
258
- ... )
259
- >>> model = model.to("cuda")
260
- >>> model.enable_xformers_memory_efficient_attention(attention_op=MemoryEfficientAttentionFlashAttentionOp)
261
- ```
262
- """
263
- self.set_use_memory_efficient_attention_xformers(True, attention_op)
264
-
265
- def disable_xformers_memory_efficient_attention(self):
266
- r"""
267
- Disable memory efficient attention from [xFormers](https://facebookresearch.github.io/xformers/).
268
- """
269
- self.set_use_memory_efficient_attention_xformers(False)
270
-
271
- def save_pretrained(
272
- self,
273
- save_directory: Union[str, os.PathLike],
274
- is_main_process: bool = True,
275
- save_function: Callable = None,
276
- safe_serialization: bool = False,
277
- variant: Optional[str] = None,
278
- ):
279
- """
280
- Save a model and its configuration file to a directory so that it can be reloaded using the
281
- [`~models.ModelMixin.from_pretrained`] class method.
282
-
283
- Arguments:
284
- save_directory (`str` or `os.PathLike`):
285
- Directory to save a model and its configuration file to. Will be created if it doesn't exist.
286
- is_main_process (`bool`, *optional*, defaults to `True`):
287
- Whether the process calling this is the main process or not. Useful during distributed training when you
288
- need to call this function on all processes. In this case, set `is_main_process=True` only on the main
289
- process to avoid race conditions.
290
- save_function (`Callable`):
291
- The function to use to save the state dictionary. Useful during distributed training when you need to
292
- replace `torch.save` with another method. Can be configured with the environment variable
293
- `DIFFUSERS_SAVE_MODE`.
294
- safe_serialization (`bool`, *optional*, defaults to `False`):
295
- Whether to save the model using `safetensors` or the traditional PyTorch way with `pickle`.
296
- variant (`str`, *optional*):
297
- If specified, weights are saved in the format `diffusion_pytorch_model.<variant>.bin`.
298
- """
299
- if safe_serialization and not is_safetensors_available():
300
- raise ImportError("`safe_serialization` requires the `safetensors` library: `pip install safetensors`.")
301
-
302
- if os.path.isfile(save_directory):
303
- logger.error(f"Provided path ({save_directory}) should be a directory, not a file")
304
- return
305
-
306
- os.makedirs(save_directory, exist_ok=True)
307
-
308
- model_to_save = self
309
-
310
- # Attach architecture to the config
311
- # Save the config
312
- if is_main_process:
313
- model_to_save.save_config(save_directory)
314
-
315
- # Save the model
316
- state_dict = model_to_save.state_dict()
317
-
318
- weights_name = SAFETENSORS_WEIGHTS_NAME if safe_serialization else WEIGHTS_NAME
319
- weights_name = _add_variant(weights_name, variant)
320
-
321
- # Save the model
322
- if safe_serialization:
323
- safetensors.torch.save_file(
324
- state_dict, os.path.join(save_directory, weights_name), metadata={"format": "pt"}
325
- )
326
- else:
327
- torch.save(state_dict, os.path.join(save_directory, weights_name))
328
-
329
- logger.info(f"Model weights saved in {os.path.join(save_directory, weights_name)}")
330
-
331
- @classmethod
332
- def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], **kwargs):
333
- r"""
334
- Instantiate a pretrained PyTorch model from a pretrained model configuration.
335
-
336
- The model is set in evaluation mode - `model.eval()` - by default, and dropout modules are deactivated. To
337
- train the model, set it back in training mode with `model.train()`.
338
-
339
- Parameters:
340
- pretrained_model_name_or_path (`str` or `os.PathLike`, *optional*):
341
- Can be either:
342
-
343
- - A string, the *model id* (for example `google/ddpm-celebahq-256`) of a pretrained model hosted on
344
- the Hub.
345
- - A path to a *directory* (for example `./my_model_directory`) containing the model weights saved
346
- with [`~ModelMixin.save_pretrained`].
347
-
348
- cache_dir (`Union[str, os.PathLike]`, *optional*):
349
- Path to a directory where a downloaded pretrained model configuration is cached if the standard cache
350
- is not used.
351
- torch_dtype (`str` or `torch.dtype`, *optional*):
352
- Override the default `torch.dtype` and load the model with another dtype. If `"auto"` is passed, the
353
- dtype is automatically derived from the model's weights.
354
- force_download (`bool`, *optional*, defaults to `False`):
355
- Whether or not to force the (re-)download of the model weights and configuration files, overriding the
356
- cached versions if they exist.
357
- resume_download (`bool`, *optional*, defaults to `False`):
358
- Whether or not to resume downloading the model weights and configuration files. If set to `False`, any
359
- incompletely downloaded files are deleted.
360
- proxies (`Dict[str, str]`, *optional*):
361
- A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
362
- 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
363
- output_loading_info (`bool`, *optional*, defaults to `False`):
364
- Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages.
365
- local_files_only(`bool`, *optional*, defaults to `False`):
366
- Whether to only load local model weights and configuration files or not. If set to `True`, the model
367
- won't be downloaded from the Hub.
368
- use_auth_token (`str` or *bool*, *optional*):
369
- The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from
370
- `diffusers-cli login` (stored in `~/.huggingface`) is used.
371
- revision (`str`, *optional*, defaults to `"main"`):
372
- The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier
373
- allowed by Git.
374
- from_flax (`bool`, *optional*, defaults to `False`):
375
- Load the model weights from a Flax checkpoint save file.
376
- subfolder (`str`, *optional*, defaults to `""`):
377
- The subfolder location of a model file within a larger model repository on the Hub or locally.
378
- mirror (`str`, *optional*):
379
- Mirror source to resolve accessibility issues if you're downloading a model in China. We do not
380
- guarantee the timeliness or safety of the source, and you should refer to the mirror site for more
381
- information.
382
- device_map (`str` or `Dict[str, Union[int, str, torch.device]]`, *optional*):
383
- A map that specifies where each submodule should go. It doesn't need to be defined for each
384
- parameter/buffer name; once a given module name is inside, every submodule of it will be sent to the
385
- same device.
386
-
387
- Set `device_map="auto"` to have 🤗 Accelerate automatically compute the most optimized `device_map`. For
388
- more information about each option see [designing a device
389
- map](https://hf.co/docs/accelerate/main/en/usage_guides/big_modeling#designing-a-device-map).
390
- max_memory (`Dict`, *optional*):
391
- A dictionary device identifier for the maximum memory. Will default to the maximum memory available for
392
- each GPU and the available CPU RAM if unset.
393
- offload_folder (`str` or `os.PathLike`, *optional*):
394
- The path to offload weights if `device_map` contains the value `"disk"`.
395
- offload_state_dict (`bool`, *optional*):
396
- If `True`, temporarily offloads the CPU state dict to the hard drive to avoid running out of CPU RAM if
397
- the weight of the CPU state dict + the biggest shard of the checkpoint does not fit. Defaults to `True`
398
- when there is some disk offload.
399
- low_cpu_mem_usage (`bool`, *optional*, defaults to `True` if torch version >= 1.9.0 else `False`):
400
- Speed up model loading by only loading the pretrained weights and not initializing the weights. This also
401
- tries to not use more than 1x model size in CPU memory (including peak memory) while loading the model.
402
- Only supported for PyTorch >= 1.9.0. If you are using an older version of PyTorch, setting this
403
- argument to `True` will raise an error.
404
- variant (`str`, *optional*):
405
- Load weights from a specified `variant` filename such as `"fp16"` or `"ema"`. This is ignored when
406
- loading `from_flax`.
407
- use_safetensors (`bool`, *optional*, defaults to `None`):
408
- If set to `None`, the `safetensors` weights are downloaded if they're available **and** if the
409
- `safetensors` library is installed. If set to `True`, the model is forcibly loaded from `safetensors`
410
- weights. If set to `False`, `safetensors` weights are not loaded.
411
-
412
- <Tip>
413
-
414
- To use private or [gated models](https://huggingface.co/docs/hub/models-gated#gated-models), log-in with
415
- `huggingface-cli login`. You can also activate the special
416
- ["offline-mode"](https://huggingface.co/diffusers/installation.html#offline-mode) to use this method in a
417
- firewalled environment.
418
-
419
- </Tip>
420
-
421
- Example:
422
-
423
- ```py
424
- from diffusers import UNet2DConditionModel
425
-
426
- unet = UNet2DConditionModel.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="unet")
427
- ```
428
-
429
- If you get the error message below, you need to finetune the weights for your downstream task:
430
-
431
- ```bash
432
- Some weights of UNet2DConditionModel were not initialized from the model checkpoint at runwayml/stable-diffusion-v1-5 and are newly initialized because the shapes did not match:
433
- - conv_in.weight: found shape torch.Size([320, 4, 3, 3]) in the checkpoint and torch.Size([320, 9, 3, 3]) in the model instantiated
434
- You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
435
- ```
436
- """
437
- cache_dir = kwargs.pop("cache_dir", DIFFUSERS_CACHE)
438
- ignore_mismatched_sizes = kwargs.pop("ignore_mismatched_sizes", False)
439
- force_download = kwargs.pop("force_download", False)
440
- from_flax = kwargs.pop("from_flax", False)
441
- resume_download = kwargs.pop("resume_download", False)
442
- proxies = kwargs.pop("proxies", None)
443
- output_loading_info = kwargs.pop("output_loading_info", False)
444
- local_files_only = kwargs.pop("local_files_only", HF_HUB_OFFLINE)
445
- use_auth_token = kwargs.pop("use_auth_token", None)
446
- revision = kwargs.pop("revision", None)
447
- torch_dtype = kwargs.pop("torch_dtype", None)
448
- subfolder = kwargs.pop("subfolder", None)
449
- device_map = kwargs.pop("device_map", None)
450
- max_memory = kwargs.pop("max_memory", None)
451
- offload_folder = kwargs.pop("offload_folder", None)
452
- offload_state_dict = kwargs.pop("offload_state_dict", False)
453
- low_cpu_mem_usage = kwargs.pop("low_cpu_mem_usage", _LOW_CPU_MEM_USAGE_DEFAULT)
454
- variant = kwargs.pop("variant", None)
455
- use_safetensors = kwargs.pop("use_safetensors", None)
456
-
457
- if use_safetensors and not is_safetensors_available():
458
- raise ValueError(
459
- "`use_safetensors`=True but safetensors is not installed. Please install safetensors with `pip install safetensors`."
460
- )
461
-
462
- allow_pickle = False
463
- if use_safetensors is None:
464
- use_safetensors = is_safetensors_available()
465
- allow_pickle = True
466
-
467
- if low_cpu_mem_usage and not is_accelerate_available():
468
- low_cpu_mem_usage = False
469
- logger.warning(
470
- "Cannot initialize model with low cpu memory usage because `accelerate` was not found in the"
471
- " environment. Defaulting to `low_cpu_mem_usage=False`. It is strongly recommended to install"
472
- " `accelerate` for faster and less memory-intense model loading. You can do so with: \n```\npip"
473
- " install accelerate\n```\n."
474
- )
475
-
476
- if device_map is not None and not is_accelerate_available():
477
- raise NotImplementedError(
478
- "Loading and dispatching requires `accelerate`. Please make sure to install accelerate or set"
479
- " `device_map=None`. You can install accelerate with `pip install accelerate`."
480
- )
481
-
482
- # Check if we can handle device_map and dispatching the weights
483
- if device_map is not None and not is_torch_version(">=", "1.9.0"):
484
- raise NotImplementedError(
485
- "Loading and dispatching requires torch >= 1.9.0. Please either update your PyTorch version or set"
486
- " `device_map=None`."
487
- )
488
-
489
- if low_cpu_mem_usage is True and not is_torch_version(">=", "1.9.0"):
490
- raise NotImplementedError(
491
- "Low memory initialization requires torch >= 1.9.0. Please either update your PyTorch version or set"
492
- " `low_cpu_mem_usage=False`."
493
- )
494
-
495
- if low_cpu_mem_usage is False and device_map is not None:
496
- raise ValueError(
497
- f"You cannot set `low_cpu_mem_usage` to `False` while using device_map={device_map} for loading and"
498
- " dispatching. Please make sure to set `low_cpu_mem_usage=True`."
499
- )
500
-
501
- # Load config if we don't provide a configuration
502
- config_path = pretrained_model_name_or_path
503
-
504
- user_agent = {
505
- "diffusers": __version__,
506
- "file_type": "model",
507
- "framework": "pytorch",
508
- }
509
-
510
- # load config
511
- config, unused_kwargs, commit_hash = cls.load_config(
512
- config_path,
513
- cache_dir=cache_dir,
514
- return_unused_kwargs=True,
515
- return_commit_hash=True,
516
- force_download=force_download,
517
- resume_download=resume_download,
518
- proxies=proxies,
519
- local_files_only=local_files_only,
520
- use_auth_token=use_auth_token,
521
- revision=revision,
522
- subfolder=subfolder,
523
- device_map=device_map,
524
- max_memory=max_memory,
525
- offload_folder=offload_folder,
526
- offload_state_dict=offload_state_dict,
527
- user_agent=user_agent,
528
- **kwargs,
529
- )
530
-
531
- # load model
532
- model_file = None
533
- if from_flax:
534
- model_file = _get_model_file(
535
- pretrained_model_name_or_path,
536
- weights_name=FLAX_WEIGHTS_NAME,
537
- cache_dir=cache_dir,
538
- force_download=force_download,
539
- resume_download=resume_download,
540
- proxies=proxies,
541
- local_files_only=local_files_only,
542
- use_auth_token=use_auth_token,
543
- revision=revision,
544
- subfolder=subfolder,
545
- user_agent=user_agent,
546
- commit_hash=commit_hash,
547
- )
548
- model = cls.from_config(config, **unused_kwargs)
549
-
550
- # Convert the weights
551
- from .modeling_pytorch_flax_utils import load_flax_checkpoint_in_pytorch_model
552
-
553
- model = load_flax_checkpoint_in_pytorch_model(model, model_file)
554
- else:
555
- if use_safetensors:
556
- try:
557
- model_file = _get_model_file(
558
- pretrained_model_name_or_path,
559
- weights_name=_add_variant(SAFETENSORS_WEIGHTS_NAME, variant),
560
- cache_dir=cache_dir,
561
- force_download=force_download,
562
- resume_download=resume_download,
563
- proxies=proxies,
564
- local_files_only=local_files_only,
565
- use_auth_token=use_auth_token,
566
- revision=revision,
567
- subfolder=subfolder,
568
- user_agent=user_agent,
569
- commit_hash=commit_hash,
570
- )
571
- except IOError as e:
572
- if not allow_pickle:
573
- raise e
574
- pass
575
- if model_file is None:
576
- model_file = _get_model_file(
577
- pretrained_model_name_or_path,
578
- weights_name=_add_variant(WEIGHTS_NAME, variant),
579
- cache_dir=cache_dir,
580
- force_download=force_download,
581
- resume_download=resume_download,
582
- proxies=proxies,
583
- local_files_only=local_files_only,
584
- use_auth_token=use_auth_token,
585
- revision=revision,
586
- subfolder=subfolder,
587
- user_agent=user_agent,
588
- commit_hash=commit_hash,
589
- )
590
-
591
- if low_cpu_mem_usage:
592
- # Instantiate model with empty weights
593
- with accelerate.init_empty_weights():
594
- model = cls.from_config(config, **unused_kwargs)
595
-
596
- # if device_map is None, load the state dict and move the params from meta device to the cpu
597
- if device_map is None:
598
- param_device = "cpu"
599
- state_dict = load_state_dict(model_file, variant=variant)
600
- model._convert_deprecated_attention_blocks(state_dict)
601
- # move the params from meta device to cpu
602
- missing_keys = set(model.state_dict().keys()) - set(state_dict.keys())
603
- if len(missing_keys) > 0:
604
- raise ValueError(
605
- f"Cannot load {cls} from {pretrained_model_name_or_path} because the following keys are"
606
- f" missing: \n {', '.join(missing_keys)}. \n Please make sure to pass"
607
- " `low_cpu_mem_usage=False` and `device_map=None` if you want to randomly initialize"
608
- " those weights or else make sure your checkpoint file is correct."
609
- )
610
- unexpected_keys = []
611
-
612
- empty_state_dict = model.state_dict()
613
- for param_name, param in state_dict.items():
614
- accepts_dtype = "dtype" in set(
615
- inspect.signature(set_module_tensor_to_device).parameters.keys()
616
- )
617
-
618
- if param_name not in empty_state_dict:
619
- unexpected_keys.append(param_name)
620
- continue
621
-
622
- if empty_state_dict[param_name].shape != param.shape:
623
- raise ValueError(
624
- f"Cannot load {pretrained_model_name_or_path} because {param_name} expected shape {empty_state_dict[param_name].shape}, but got {param.shape}. If you want to instead overwrite randomly initialized weights, please make sure to pass both `low_cpu_mem_usage=False` and `ignore_mismatched_sizes=True`. For more information, see also: https://github.com/huggingface/diffusers/issues/1619#issuecomment-1345604389 as an example."
625
- )
626
-
627
- if accepts_dtype:
628
- set_module_tensor_to_device(
629
- model, param_name, param_device, value=param, dtype=torch_dtype
630
- )
631
- else:
632
- set_module_tensor_to_device(model, param_name, param_device, value=param)
633
-
634
- if cls._keys_to_ignore_on_load_unexpected is not None:
635
- for pat in cls._keys_to_ignore_on_load_unexpected:
636
- unexpected_keys = [k for k in unexpected_keys if re.search(pat, k) is None]
637
-
638
- if len(unexpected_keys) > 0:
639
- logger.warn(
640
- f"Some weights of the model checkpoint were not used when initializing {cls.__name__}: \n {', '.join(unexpected_keys)}"
641
- )
642
-
643
- else: # else let accelerate handle loading and dispatching.
644
- # Load weights and dispatch according to the device_map
645
- # by default the device_map is None and the weights are loaded on the CPU
646
- try:
647
- accelerate.load_checkpoint_and_dispatch(
648
- model,
649
- model_file,
650
- device_map,
651
- max_memory=max_memory,
652
- offload_folder=offload_folder,
653
- offload_state_dict=offload_state_dict,
654
- dtype=torch_dtype,
655
- )
656
- except AttributeError as e:
657
- # When using accelerate loading, we do not have the ability to load the state
658
- # dict and rename the weight names manually. Additionally, accelerate skips
659
- # torch loading conventions and directly writes into `module.{_buffers, _parameters}`
660
- # (which look like they should be private variables?), so we can't use the standard hooks
661
- # to rename parameters on load. We need to mimic the original weight names so the correct
662
- # attributes are available. After we have loaded the weights, we convert the deprecated
663
- # names to the new non-deprecated names. Then we _greatly encourage_ the user to convert
664
- # the weights so we don't have to do this again.
665
-
666
- if "'Attention' object has no attribute" in str(e):
667
- logger.warn(
668
- f"Taking `{str(e)}` while using `accelerate.load_checkpoint_and_dispatch` to mean {pretrained_model_name_or_path}"
669
- " was saved with deprecated attention block weight names. We will load it with the deprecated attention block"
670
- " so we don't have to do the on-the-fly renaming in the future. If the model is from a hub checkpoint,"
671
- " so we don't have to do the on the fly renaming in the future. If the model is from a hub checkpoint,"
672
- " please also re-upload it or open a PR on the original repository."
673
- )
674
- model._temp_convert_self_to_deprecated_attention_blocks()
675
- accelerate.load_checkpoint_and_dispatch(
676
- model,
677
- model_file,
678
- device_map,
679
- max_memory=max_memory,
680
- offload_folder=offload_folder,
681
- offload_state_dict=offload_state_dict,
682
- dtype=torch_dtype,
683
- )
684
- model._undo_temp_convert_self_to_deprecated_attention_blocks()
685
- else:
686
- raise e
687
-
688
- loading_info = {
689
- "missing_keys": [],
690
- "unexpected_keys": [],
691
- "mismatched_keys": [],
692
- "error_msgs": [],
693
- }
694
- else:
695
- model = cls.from_config(config, **unused_kwargs)
696
-
697
- state_dict = load_state_dict(model_file, variant=variant)
698
- model._convert_deprecated_attention_blocks(state_dict)
699
-
700
- model, missing_keys, unexpected_keys, mismatched_keys, error_msgs = cls._load_pretrained_model(
701
- model,
702
- state_dict,
703
- model_file,
704
- pretrained_model_name_or_path,
705
- ignore_mismatched_sizes=ignore_mismatched_sizes,
706
- )
707
-
708
- loading_info = {
709
- "missing_keys": missing_keys,
710
- "unexpected_keys": unexpected_keys,
711
- "mismatched_keys": mismatched_keys,
712
- "error_msgs": error_msgs,
713
- }
714
-
715
- if torch_dtype is not None and not isinstance(torch_dtype, torch.dtype):
716
- raise ValueError(
717
- f"{torch_dtype} needs to be of type `torch.dtype`, e.g. `torch.float16`, but is {type(torch_dtype)}."
718
- )
719
- elif torch_dtype is not None:
720
- model = model.to(torch_dtype)
721
-
722
- model.register_to_config(_name_or_path=pretrained_model_name_or_path)
723
-
724
- # Set model in evaluation mode to deactivate DropOut modules by default
725
- model.eval()
726
- if output_loading_info:
727
- return model, loading_info
728
-
729
- return model
730
-
731
- @classmethod
732
- def _load_pretrained_model(
733
- cls,
734
- model,
735
- state_dict,
736
- resolved_archive_file,
737
- pretrained_model_name_or_path,
738
- ignore_mismatched_sizes=False,
739
- ):
740
- # Retrieve missing & unexpected_keys
741
- model_state_dict = model.state_dict()
742
- loaded_keys = list(state_dict.keys())
743
-
744
- expected_keys = list(model_state_dict.keys())
745
-
746
- original_loaded_keys = loaded_keys
747
-
748
- missing_keys = list(set(expected_keys) - set(loaded_keys))
749
- unexpected_keys = list(set(loaded_keys) - set(expected_keys))
750
-
751
- # Make sure we are able to load base models as well as derived models (with heads)
752
- model_to_load = model
753
-
754
- def _find_mismatched_keys(
755
- state_dict,
756
- model_state_dict,
757
- loaded_keys,
758
- ignore_mismatched_sizes,
759
- ):
760
- mismatched_keys = []
761
- if ignore_mismatched_sizes:
762
- for checkpoint_key in loaded_keys:
763
- model_key = checkpoint_key
764
-
765
- if (
766
- model_key in model_state_dict
767
- and state_dict[checkpoint_key].shape != model_state_dict[model_key].shape
768
- ):
769
- mismatched_keys.append(
770
- (checkpoint_key, state_dict[checkpoint_key].shape, model_state_dict[model_key].shape)
771
- )
772
- del state_dict[checkpoint_key]
773
- return mismatched_keys
774
-
775
- if state_dict is not None:
776
- # Whole checkpoint
777
- mismatched_keys = _find_mismatched_keys(
778
- state_dict,
779
- model_state_dict,
780
- original_loaded_keys,
781
- ignore_mismatched_sizes,
782
- )
783
- error_msgs = _load_state_dict_into_model(model_to_load, state_dict)
784
-
785
- if len(error_msgs) > 0:
786
- error_msg = "\n\t".join(error_msgs)
787
- if "size mismatch" in error_msg:
788
- error_msg += (
789
- "\n\tYou may consider adding `ignore_mismatched_sizes=True` in the model `from_pretrained` method."
790
- )
791
- raise RuntimeError(f"Error(s) in loading state_dict for {model.__class__.__name__}:\n\t{error_msg}")
792
-
793
- if len(unexpected_keys) > 0:
794
- logger.warning(
795
- f"Some weights of the model checkpoint at {pretrained_model_name_or_path} were not used when"
796
- f" initializing {model.__class__.__name__}: {unexpected_keys}\n- This IS expected if you are"
797
- f" initializing {model.__class__.__name__} from the checkpoint of a model trained on another task"
798
- " or with another architecture (e.g. initializing a BertForSequenceClassification model from a"
799
- " BertForPreTraining model).\n- This IS NOT expected if you are initializing"
800
- f" {model.__class__.__name__} from the checkpoint of a model that you expect to be exactly"
801
- " identical (initializing a BertForSequenceClassification model from a"
802
- " BertForSequenceClassification model)."
803
- )
804
- else:
805
- logger.info(f"All model checkpoint weights were used when initializing {model.__class__.__name__}.\n")
806
- if len(missing_keys) > 0:
807
- logger.warning(
808
- f"Some weights of {model.__class__.__name__} were not initialized from the model checkpoint at"
809
- f" {pretrained_model_name_or_path} and are newly initialized: {missing_keys}\nYou should probably"
810
- " TRAIN this model on a down-stream task to be able to use it for predictions and inference."
811
- )
812
- elif len(mismatched_keys) == 0:
813
- logger.info(
814
- f"All the weights of {model.__class__.__name__} were initialized from the model checkpoint at"
815
- f" {pretrained_model_name_or_path}.\nIf your task is similar to the task the model of the"
816
- f" checkpoint was trained on, you can already use {model.__class__.__name__} for predictions"
817
- " without further training."
818
- )
819
- if len(mismatched_keys) > 0:
820
- mismatched_warning = "\n".join(
821
- [
822
- f"- {key}: found shape {shape1} in the checkpoint and {shape2} in the model instantiated"
823
- for key, shape1, shape2 in mismatched_keys
824
- ]
825
- )
826
- logger.warning(
827
- f"Some weights of {model.__class__.__name__} were not initialized from the model checkpoint at"
828
- f" {pretrained_model_name_or_path} and are newly initialized because the shapes did not"
829
- f" match:\n{mismatched_warning}\nYou should probably TRAIN this model on a down-stream task to be"
830
- " able to use it for predictions and inference."
831
- )
832
-
833
- return model, missing_keys, unexpected_keys, mismatched_keys, error_msgs
834
-
835
- @property
836
- def device(self) -> device:
837
- """
838
- `torch.device`: The device on which the module is (assuming that all the module parameters are on the same
839
- device).
840
- """
841
- return get_parameter_device(self)
842
-
843
- @property
844
- def dtype(self) -> torch.dtype:
845
- """
846
- `torch.dtype`: The dtype of the module (assuming that all the module parameters have the same dtype).
847
- """
848
- return get_parameter_dtype(self)
849
-
850
- def num_parameters(self, only_trainable: bool = False, exclude_embeddings: bool = False) -> int:
851
- """
852
- Get number of (trainable or non-embedding) parameters in the module.
853
-
854
- Args:
855
- only_trainable (`bool`, *optional*, defaults to `False`):
856
- Whether or not to return only the number of trainable parameters.
857
- exclude_embeddings (`bool`, *optional*, defaults to `False`):
858
- Whether or not to return only the number of non-embedding parameters.
859
-
860
- Returns:
861
- `int`: The number of parameters.
862
-
863
- Example:
864
-
865
- ```py
866
- from diffusers import UNet2DConditionModel
867
-
868
- model_id = "runwayml/stable-diffusion-v1-5"
869
- unet = UNet2DConditionModel.from_pretrained(model_id, subfolder="unet")
870
- unet.num_parameters(only_trainable=True)
871
- 859520964
872
- ```
873
- """
874
-
875
- if exclude_embeddings:
876
- embedding_param_names = [
877
- f"{name}.weight"
878
- for name, module_type in self.named_modules()
879
- if isinstance(module_type, torch.nn.Embedding)
880
- ]
881
- non_embedding_parameters = [
882
- parameter for name, parameter in self.named_parameters() if name not in embedding_param_names
883
- ]
884
- return sum(p.numel() for p in non_embedding_parameters if p.requires_grad or not only_trainable)
885
- else:
886
- return sum(p.numel() for p in self.parameters() if p.requires_grad or not only_trainable)
887
-
888
- def _convert_deprecated_attention_blocks(self, state_dict):
889
- deprecated_attention_block_paths = []
890
-
891
- def recursive_find_attn_block(name, module):
892
- if hasattr(module, "_from_deprecated_attn_block") and module._from_deprecated_attn_block:
893
- deprecated_attention_block_paths.append(name)
894
-
895
- for sub_name, sub_module in module.named_children():
896
- sub_name = sub_name if name == "" else f"{name}.{sub_name}"
897
- recursive_find_attn_block(sub_name, sub_module)
898
-
899
- recursive_find_attn_block("", self)
900
-
901
- # NOTE: we have to check if the deprecated parameters are in the state dict
902
- # because it is possible we are loading from a state dict that was already
903
- # converted
904
-
905
- for path in deprecated_attention_block_paths:
906
- # group_norm path stays the same
907
-
908
- # query -> to_q
909
- if f"{path}.query.weight" in state_dict:
910
- state_dict[f"{path}.to_q.weight"] = state_dict.pop(f"{path}.query.weight")
911
- if f"{path}.query.bias" in state_dict:
912
- state_dict[f"{path}.to_q.bias"] = state_dict.pop(f"{path}.query.bias")
913
-
914
- # key -> to_k
915
- if f"{path}.key.weight" in state_dict:
916
- state_dict[f"{path}.to_k.weight"] = state_dict.pop(f"{path}.key.weight")
917
- if f"{path}.key.bias" in state_dict:
918
- state_dict[f"{path}.to_k.bias"] = state_dict.pop(f"{path}.key.bias")
919
-
920
- # value -> to_v
921
- if f"{path}.value.weight" in state_dict:
922
- state_dict[f"{path}.to_v.weight"] = state_dict.pop(f"{path}.value.weight")
923
- if f"{path}.value.bias" in state_dict:
924
- state_dict[f"{path}.to_v.bias"] = state_dict.pop(f"{path}.value.bias")
925
-
926
- # proj_attn -> to_out.0
927
- if f"{path}.proj_attn.weight" in state_dict:
928
- state_dict[f"{path}.to_out.0.weight"] = state_dict.pop(f"{path}.proj_attn.weight")
929
- if f"{path}.proj_attn.bias" in state_dict:
930
- state_dict[f"{path}.to_out.0.bias"] = state_dict.pop(f"{path}.proj_attn.bias")
931
-
932
- def _temp_convert_self_to_deprecated_attention_blocks(self):
933
- deprecated_attention_block_modules = []
934
-
935
- def recursive_find_attn_block(module):
936
- if hasattr(module, "_from_deprecated_attn_block") and module._from_deprecated_attn_block:
937
- deprecated_attention_block_modules.append(module)
938
-
939
- for sub_module in module.children():
940
- recursive_find_attn_block(sub_module)
941
-
942
- recursive_find_attn_block(self)
943
-
944
- for module in deprecated_attention_block_modules:
945
- module.query = module.to_q
946
- module.key = module.to_k
947
- module.value = module.to_v
948
- module.proj_attn = module.to_out[0]
949
-
950
- # We don't _have_ to delete the old attributes, but it's helpful to ensure
951
- # that _all_ the weights are loaded into the new attributes and we're not
952
- # making an incorrect assumption that this model should be converted when
953
- # it really shouldn't be.
954
- del module.to_q
955
- del module.to_k
956
- del module.to_v
957
- del module.to_out
958
-
959
- def _undo_temp_convert_self_to_deprecated_attention_blocks(self):
960
- deprecated_attention_block_modules = []
961
-
962
- def recursive_find_attn_block(module):
963
- if hasattr(module, "_from_deprecated_attn_block") and module._from_deprecated_attn_block:
964
- deprecated_attention_block_modules.append(module)
965
-
966
- for sub_module in module.children():
967
- recursive_find_attn_block(sub_module)
968
-
969
- recursive_find_attn_block(self)
970
-
971
- for module in deprecated_attention_block_modules:
972
- module.to_q = module.query
973
- module.to_k = module.key
974
- module.to_v = module.value
975
- module.to_out = nn.ModuleList([module.proj_attn, nn.Dropout(module.dropout)])
976
-
977
- del module.query
978
- del module.key
979
- del module.value
980
- del module.proj_attn
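For reference, the `ModelMixin` saving/loading API removed above follows the standard `diffusers` interface. Below is a minimal round-trip sketch, assuming the upstream `diffusers` and `safetensors` packages are still installed; the model id comes from the docstring example above, and `./my_unet` is a hypothetical output directory.

```py
import torch
from diffusers import UNet2DConditionModel

# Download a pretrained UNet; the `subfolder` layout matches the docstring example above.
unet = UNet2DConditionModel.from_pretrained(
    "runwayml/stable-diffusion-v1-5", subfolder="unet", torch_dtype=torch.float16
)

# Save with safetensors serialization; `variant` is inserted into the weights filename.
unet.save_pretrained("./my_unet", safe_serialization=True, variant="fp16")

# Reload the same weights from disk, matching the saved `variant`.
unet = UNet2DConditionModel.from_pretrained("./my_unet", variant="fp16", torch_dtype=torch.float16)
```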
 
4DoF/diffusers/models/prior_transformer.py DELETED
@@ -1,364 +0,0 @@
1
- from dataclasses import dataclass
2
- from typing import Dict, Optional, Union
3
-
4
- import torch
5
- import torch.nn.functional as F
6
- from torch import nn
7
-
8
- from ..configuration_utils import ConfigMixin, register_to_config
9
- from ..utils import BaseOutput
10
- from .attention import BasicTransformerBlock
11
- from .attention_processor import AttentionProcessor, AttnProcessor
12
- from .embeddings import TimestepEmbedding, Timesteps
13
- from .modeling_utils import ModelMixin
14
-
15
-
16
- @dataclass
17
- class PriorTransformerOutput(BaseOutput):
18
- """
19
- The output of [`PriorTransformer`].
20
-
21
- Args:
22
- predicted_image_embedding (`torch.FloatTensor` of shape `(batch_size, embedding_dim)`):
23
- The predicted CLIP image embedding conditioned on the CLIP text embedding input.
24
- """
25
-
26
- predicted_image_embedding: torch.FloatTensor
27
-
28
-
29
- class PriorTransformer(ModelMixin, ConfigMixin):
30
- """
31
- A Prior Transformer model.
32
-
33
- Parameters:
34
- num_attention_heads (`int`, *optional*, defaults to 32): The number of heads to use for multi-head attention.
35
- attention_head_dim (`int`, *optional*, defaults to 64): The number of channels in each head.
36
- num_layers (`int`, *optional*, defaults to 20): The number of layers of Transformer blocks to use.
37
- embedding_dim (`int`, *optional*, defaults to 768): The dimension of the model input `hidden_states`
38
- num_embeddings (`int`, *optional*, defaults to 77):
39
- The number of embeddings of the model input `hidden_states`
40
- additional_embeddings (`int`, *optional*, defaults to 4): The number of additional tokens appended to the
41
- projected `hidden_states`. The actual length of the used `hidden_states` is `num_embeddings +
42
- additional_embeddings`.
43
- dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
44
- time_embed_act_fn (`str`, *optional*, defaults to 'silu'):
45
- The activation function to use to create timestep embeddings.
46
- norm_in_type (`str`, *optional*, defaults to None): The normalization layer to apply on hidden states before
47
- passing to Transformer blocks. Set it to `None` if normalization is not needed.
48
- embedding_proj_norm_type (`str`, *optional*, defaults to None):
49
- The normalization layer to apply on the input `proj_embedding`. Set it to `None` if normalization is not
50
- needed.
51
- encoder_hid_proj_type (`str`, *optional*, defaults to `linear`):
52
- The projection layer to apply on the input `encoder_hidden_states`. Set it to `None` if
53
- `encoder_hidden_states` is `None`.
54
- added_emb_type (`str`, *optional*, defaults to `prd`): Additional embeddings to condition the model.
55
- Choose from `prd` or `None`. if choose `prd`, it will prepend a token indicating the (quantized) dot
56
- product between the text embedding and image embedding as proposed in the unclip paper
57
- https://arxiv.org/abs/2204.06125 If it is `None`, no additional embeddings will be prepended.
58
- time_embed_dim (`int, *optional*, defaults to None): The dimension of timestep embeddings.
59
- If None, will be set to `num_attention_heads * attention_head_dim`
60
- embedding_proj_dim (`int`, *optional*, default to None):
61
- The dimension of `proj_embedding`. If None, will be set to `embedding_dim`.
62
- clip_embed_dim (`int`, *optional*, default to None):
63
- The dimension of the output. If None, will be set to `embedding_dim`.
64
- """
65
-
66
- @register_to_config
67
- def __init__(
68
- self,
69
- num_attention_heads: int = 32,
70
- attention_head_dim: int = 64,
71
- num_layers: int = 20,
72
- embedding_dim: int = 768,
73
- num_embeddings=77,
74
- additional_embeddings=4,
75
- dropout: float = 0.0,
76
- time_embed_act_fn: str = "silu",
77
- norm_in_type: Optional[str] = None, # layer
78
- embedding_proj_norm_type: Optional[str] = None, # layer
79
- encoder_hid_proj_type: Optional[str] = "linear", # linear
80
- added_emb_type: Optional[str] = "prd", # prd
81
- time_embed_dim: Optional[int] = None,
82
- embedding_proj_dim: Optional[int] = None,
83
- clip_embed_dim: Optional[int] = None,
84
- ):
85
- super().__init__()
86
- self.num_attention_heads = num_attention_heads
87
- self.attention_head_dim = attention_head_dim
88
- inner_dim = num_attention_heads * attention_head_dim
89
- self.additional_embeddings = additional_embeddings
90
-
91
- time_embed_dim = time_embed_dim or inner_dim
92
- embedding_proj_dim = embedding_proj_dim or embedding_dim
93
- clip_embed_dim = clip_embed_dim or embedding_dim
94
-
95
- self.time_proj = Timesteps(inner_dim, True, 0)
96
- self.time_embedding = TimestepEmbedding(inner_dim, time_embed_dim, out_dim=inner_dim, act_fn=time_embed_act_fn)
97
-
98
- self.proj_in = nn.Linear(embedding_dim, inner_dim)
99
-
100
- if embedding_proj_norm_type is None:
101
- self.embedding_proj_norm = None
102
- elif embedding_proj_norm_type == "layer":
103
- self.embedding_proj_norm = nn.LayerNorm(embedding_proj_dim)
104
- else:
105
- raise ValueError(f"unsupported embedding_proj_norm_type: {embedding_proj_norm_type}")
106
-
107
- self.embedding_proj = nn.Linear(embedding_proj_dim, inner_dim)
108
-
109
- if encoder_hid_proj_type is None:
110
- self.encoder_hidden_states_proj = None
111
- elif encoder_hid_proj_type == "linear":
112
- self.encoder_hidden_states_proj = nn.Linear(embedding_dim, inner_dim)
113
- else:
114
- raise ValueError(f"unsupported encoder_hid_proj_type: {encoder_hid_proj_type}")
115
-
116
- self.positional_embedding = nn.Parameter(torch.zeros(1, num_embeddings + additional_embeddings, inner_dim))
117
-
118
- if added_emb_type == "prd":
119
- self.prd_embedding = nn.Parameter(torch.zeros(1, 1, inner_dim))
120
- elif added_emb_type is None:
121
- self.prd_embedding = None
122
- else:
123
- raise ValueError(
124
- f"`added_emb_type`: {added_emb_type} is not supported. Make sure to choose one of `'prd'` or `None`."
125
- )
126
-
127
- self.transformer_blocks = nn.ModuleList(
128
- [
129
- BasicTransformerBlock(
130
- inner_dim,
131
- num_attention_heads,
132
- attention_head_dim,
133
- dropout=dropout,
134
- activation_fn="gelu",
135
- attention_bias=True,
136
- )
137
- for d in range(num_layers)
138
- ]
139
- )
140
-
141
- if norm_in_type == "layer":
142
- self.norm_in = nn.LayerNorm(inner_dim)
143
- elif norm_in_type is None:
144
- self.norm_in = None
145
- else:
146
- raise ValueError(f"Unsupported norm_in_type: {norm_in_type}.")
147
-
148
- self.norm_out = nn.LayerNorm(inner_dim)
149
-
150
- self.proj_to_clip_embeddings = nn.Linear(inner_dim, clip_embed_dim)
151
-
152
- causal_attention_mask = torch.full(
153
- [num_embeddings + additional_embeddings, num_embeddings + additional_embeddings], -10000.0
154
- )
155
- causal_attention_mask.triu_(1)
156
- causal_attention_mask = causal_attention_mask[None, ...]
157
- self.register_buffer("causal_attention_mask", causal_attention_mask, persistent=False)
158
-
159
- self.clip_mean = nn.Parameter(torch.zeros(1, clip_embed_dim))
160
- self.clip_std = nn.Parameter(torch.zeros(1, clip_embed_dim))
161
-
162
- @property
163
- # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.attn_processors
164
- def attn_processors(self) -> Dict[str, AttentionProcessor]:
165
- r"""
166
- Returns:
167
- `dict` of attention processors: A dictionary containing all attention processors used in the model with
168
- indexed by its weight name.
169
- """
170
- # set recursively
171
- processors = {}
172
-
173
- def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
174
- if hasattr(module, "set_processor"):
175
- processors[f"{name}.processor"] = module.processor
176
-
177
- for sub_name, child in module.named_children():
178
- fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
179
-
180
- return processors
181
-
182
- for name, module in self.named_children():
183
- fn_recursive_add_processors(name, module, processors)
184
-
185
- return processors
186
-
187
- # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.set_attn_processor
188
- def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
189
- r"""
190
- Sets the attention processor to use to compute attention.
191
-
192
- Parameters:
193
- processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
194
- The instantiated processor class or a dictionary of processor classes that will be set as the processor
195
- for **all** `Attention` layers.
196
-
197
- If `processor` is a dict, the key needs to define the path to the corresponding cross attention
198
- processor. This is strongly recommended when setting trainable attention processors.
199
-
200
- """
201
- count = len(self.attn_processors.keys())
202
-
203
- if isinstance(processor, dict) and len(processor) != count:
204
- raise ValueError(
205
- f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
206
- f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
207
- )
208
-
209
- def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
210
- if hasattr(module, "set_processor"):
211
- if not isinstance(processor, dict):
212
- module.set_processor(processor)
213
- else:
214
- module.set_processor(processor.pop(f"{name}.processor"))
215
-
216
- for sub_name, child in module.named_children():
217
- fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
218
-
219
- for name, module in self.named_children():
220
- fn_recursive_attn_processor(name, module, processor)
221
-
222
- # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.set_default_attn_processor
223
- def set_default_attn_processor(self):
224
- """
225
- Disables custom attention processors and sets the default attention implementation.
226
- """
227
- self.set_attn_processor(AttnProcessor())
228
-
229
- def forward(
230
- self,
231
- hidden_states,
232
- timestep: Union[torch.Tensor, float, int],
233
- proj_embedding: torch.FloatTensor,
234
- encoder_hidden_states: Optional[torch.FloatTensor] = None,
235
- attention_mask: Optional[torch.BoolTensor] = None,
236
- return_dict: bool = True,
237
- ):
238
- """
239
- The [`PriorTransformer`] forward method.
240
-
241
- Args:
242
- hidden_states (`torch.FloatTensor` of shape `(batch_size, embedding_dim)`):
243
- The currently predicted image embeddings.
244
- timestep (`torch.LongTensor`):
245
- Current denoising step.
246
- proj_embedding (`torch.FloatTensor` of shape `(batch_size, embedding_dim)`):
247
- Projected embedding vector the denoising process is conditioned on.
248
- encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, num_embeddings, embedding_dim)`):
249
- Hidden states of the text embeddings the denoising process is conditioned on.
250
- attention_mask (`torch.BoolTensor` of shape `(batch_size, num_embeddings)`):
251
- Text mask for the text embeddings.
252
- return_dict (`bool`, *optional*, defaults to `True`):
253
- Whether or not to return a [`~models.prior_transformer.PriorTransformerOutput`] instead of a plain
254
- tuple.
255
-
256
- Returns:
257
- [`~models.prior_transformer.PriorTransformerOutput`] or `tuple`:
258
- If return_dict is True, a [`~models.prior_transformer.PriorTransformerOutput`] is returned, otherwise a
259
- tuple is returned where the first element is the sample tensor.
260
- """
261
- batch_size = hidden_states.shape[0]
262
-
263
- timesteps = timestep
264
- if not torch.is_tensor(timesteps):
265
- timesteps = torch.tensor([timesteps], dtype=torch.long, device=hidden_states.device)
266
- elif torch.is_tensor(timesteps) and len(timesteps.shape) == 0:
267
- timesteps = timesteps[None].to(hidden_states.device)
268
-
269
- # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
270
- timesteps = timesteps * torch.ones(batch_size, dtype=timesteps.dtype, device=timesteps.device)
271
-
272
- timesteps_projected = self.time_proj(timesteps)
273
-
274
- # timesteps does not contain any weights and will always return f32 tensors
275
- # but time_embedding might be fp16, so we need to cast here.
276
- timesteps_projected = timesteps_projected.to(dtype=self.dtype)
277
- time_embeddings = self.time_embedding(timesteps_projected)
278
-
279
- if self.embedding_proj_norm is not None:
280
- proj_embedding = self.embedding_proj_norm(proj_embedding)
281
-
282
- proj_embeddings = self.embedding_proj(proj_embedding)
283
- if self.encoder_hidden_states_proj is not None and encoder_hidden_states is not None:
284
- encoder_hidden_states = self.encoder_hidden_states_proj(encoder_hidden_states)
285
- elif self.encoder_hidden_states_proj is not None and encoder_hidden_states is None:
286
- raise ValueError("`encoder_hidden_states_proj` requires `encoder_hidden_states` to be set")
287
-
288
- hidden_states = self.proj_in(hidden_states)
289
-
290
- positional_embeddings = self.positional_embedding.to(hidden_states.dtype)
291
-
292
- additional_embeds = []
293
- additional_embeddings_len = 0
294
-
295
- if encoder_hidden_states is not None:
296
- additional_embeds.append(encoder_hidden_states)
297
- additional_embeddings_len += encoder_hidden_states.shape[1]
298
-
299
- if len(proj_embeddings.shape) == 2:
300
- proj_embeddings = proj_embeddings[:, None, :]
301
-
302
- if len(hidden_states.shape) == 2:
303
- hidden_states = hidden_states[:, None, :]
304
-
305
- additional_embeds = additional_embeds + [
306
- proj_embeddings,
307
- time_embeddings[:, None, :],
308
- hidden_states,
309
- ]
310
-
311
- if self.prd_embedding is not None:
312
- prd_embedding = self.prd_embedding.to(hidden_states.dtype).expand(batch_size, -1, -1)
313
- additional_embeds.append(prd_embedding)
314
-
315
- hidden_states = torch.cat(
316
- additional_embeds,
317
- dim=1,
318
- )
319
-
320
- # Allow positional_embedding to not include the `addtional_embeddings` and instead pad it with zeros for these additional tokens
321
- additional_embeddings_len = additional_embeddings_len + proj_embeddings.shape[1] + 1
322
- if positional_embeddings.shape[1] < hidden_states.shape[1]:
323
- positional_embeddings = F.pad(
324
- positional_embeddings,
325
- (
326
- 0,
327
- 0,
328
- additional_embeddings_len,
329
- self.prd_embedding.shape[1] if self.prd_embedding is not None else 0,
330
- ),
331
- value=0.0,
332
- )
333
-
334
- hidden_states = hidden_states + positional_embeddings
335
-
336
- if attention_mask is not None:
337
- attention_mask = (1 - attention_mask.to(hidden_states.dtype)) * -10000.0
338
- attention_mask = F.pad(attention_mask, (0, self.additional_embeddings), value=0.0)
339
- attention_mask = (attention_mask[:, None, :] + self.causal_attention_mask).to(hidden_states.dtype)
340
- attention_mask = attention_mask.repeat_interleave(self.config.num_attention_heads, dim=0)
341
-
342
- if self.norm_in is not None:
343
- hidden_states = self.norm_in(hidden_states)
344
-
345
- for block in self.transformer_blocks:
346
- hidden_states = block(hidden_states, attention_mask=attention_mask)
347
-
348
- hidden_states = self.norm_out(hidden_states)
349
-
350
- if self.prd_embedding is not None:
351
- hidden_states = hidden_states[:, -1]
352
- else:
353
- hidden_states = hidden_states[:, additional_embeddings_len:]
354
-
355
- predicted_image_embedding = self.proj_to_clip_embeddings(hidden_states)
356
-
357
- if not return_dict:
358
- return (predicted_image_embedding,)
359
-
360
- return PriorTransformerOutput(predicted_image_embedding=predicted_image_embedding)
361
-
362
- def post_process_latents(self, prior_latents):
363
- prior_latents = (prior_latents * self.clip_std) + self.clip_mean
364
- return prior_latents
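For reference, the forward contract documented in the deleted `PriorTransformer` docstring can be smoke-tested as below. This is a rough sketch against the upstream `diffusers` implementation; all sizes are arbitrary toy values chosen only so the shapes line up.

```py
import torch
from diffusers import PriorTransformer

# Toy configuration: inner_dim = num_attention_heads * attention_head_dim = 16.
prior = PriorTransformer(
    num_attention_heads=2,
    attention_head_dim=8,
    num_layers=2,
    embedding_dim=32,
    num_embeddings=4,
    additional_embeddings=4,
)

batch_size = 1
hidden_states = torch.randn(batch_size, 32)             # current image-embedding estimate
proj_embedding = torch.randn(batch_size, 32)            # conditioning embedding
encoder_hidden_states = torch.randn(batch_size, 4, 32)  # text hidden states (num_embeddings tokens)

out = prior(
    hidden_states,
    timestep=1,
    proj_embedding=proj_embedding,
    encoder_hidden_states=encoder_hidden_states,
)
print(out.predicted_image_embedding.shape)  # torch.Size([1, 32]) (clip_embed_dim defaults to embedding_dim)
```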