Commit cd3b424
kyleleey committed
1 Parent(s): 2b1cca8

remove unused pkgs
Files changed:
- requirements.txt +0 -2
- video3d/diffusion/sd.py +0 -252
- video3d/diffusion/sd_utils.py +0 -123
- video3d/diffusion/vsd.py +0 -323
- video3d/model_ddp.py +2 -199
requirements.txt CHANGED
@@ -1,6 +1,4 @@
 ConfigArgParse==1.5.3
-core==1.0.1
-diffusers==0.20.0
 einops==0.4.1
 faiss==1.7.3
 fire==0.5.0
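
Note: since this commit drops core and diffusers from requirements.txt together with the modules that imported them, a quick repository scan is one way to confirm nothing else still depends on those packages. The snippet below is an illustrative check only, not part of the commit:

# Illustrative sanity check (not part of this commit): scan the repo for imports
# of the packages being dropped from requirements.txt.
import pathlib
import re

removed = ["core", "diffusers"]
pattern = re.compile(rf"^\s*(?:import|from)\s+({'|'.join(removed)})\b", re.MULTILINE)

for path in pathlib.Path(".").rglob("*.py"):
    text = path.read_text(errors="ignore")
    for match in pattern.finditer(text):
        print(f"{path}: still imports {match.group(1)}")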
video3d/diffusion/sd.py DELETED
@@ -1,252 +0,0 @@
-import os
-# os.environ['HUGGINGFACE_HUB_CACHE'] = '/work/tomj/cache/huggingface_hub'
-# os.environ['HF_HOME'] = '/work/tomj/cache/huggingface_hub'
-os.environ['HUGGINGFACE_HUB_CACHE'] = '/viscam/u/zzli'
-os.environ['HF_HOME'] = '/viscam/u/zzli'
-
-from transformers import CLIPTextModel, CLIPTokenizer, logging
-from diffusers import AutoencoderKL, UNet2DConditionModel, PNDMScheduler, DDIMScheduler
-
-# Suppress partial model loading warning
-logging.set_verbosity_error()
-
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-
-from torch.cuda.amp import custom_bwd, custom_fwd
-
-class SpecifyGradient(torch.autograd.Function):
-    @staticmethod
-    @custom_fwd
-    def forward(ctx, input_tensor, gt_grad):
-        ctx.save_for_backward(gt_grad)
-        return torch.zeros([1], device=input_tensor.device, dtype=input_tensor.dtype)  # Dummy loss value
-
-    @staticmethod
-    @custom_bwd
-    def backward(ctx, grad):
-        gt_grad, = ctx.saved_tensors
-        batch_size = len(gt_grad)
-        return gt_grad / batch_size, None
-
-def seed_everything(seed):
-    torch.manual_seed(seed)
-    torch.cuda.manual_seed(seed)
-
-
-class StableDiffusion(nn.Module):
-    def __init__(self, device, sd_version='2.1', hf_key=None, torch_dtype=torch.float32):
-        super().__init__()
-
-        self.device = device
-        self.sd_version = sd_version
-        self.torch_dtype = torch_dtype
-
-        print(f'[INFO] loading stable diffusion...')
-
-        if hf_key is not None:
-            print(f'[INFO] using hugging face custom model key: {hf_key}')
-            model_key = hf_key
-        elif self.sd_version == '2.1':
-            model_key = "stabilityai/stable-diffusion-2-1-base"
-        elif self.sd_version == '2.0':
-            model_key = "stabilityai/stable-diffusion-2-base"
-        elif self.sd_version == '1.5':
-            model_key = "runwayml/stable-diffusion-v1-5"
-        else:
-            raise ValueError(f'Stable-diffusion version {self.sd_version} not supported.')
-
-        # Create model
-        self.vae = AutoencoderKL.from_pretrained(model_key, subfolder="vae", torch_dtype=torch_dtype).to(self.device)
-        self.tokenizer = CLIPTokenizer.from_pretrained(model_key, subfolder="tokenizer")
-        self.text_encoder = CLIPTextModel.from_pretrained(model_key, subfolder="text_encoder").to(self.device)
-        self.unet = UNet2DConditionModel.from_pretrained(model_key, subfolder="unet", torch_dtype=torch_dtype).to(self.device)
-
-        self.scheduler = DDIMScheduler.from_pretrained(model_key, subfolder="scheduler")
-        # self.scheduler = PNDMScheduler.from_pretrained(model_key, subfolder="scheduler")
-
-        self.num_train_timesteps = self.scheduler.config.num_train_timesteps
-        self.alphas = self.scheduler.alphas_cumprod.to(self.device)  # for convenience
-
-        print(f'[INFO] loaded stable diffusion!')
-
-    def get_text_embeds(self, prompt, negative_prompt):
-        # prompt, negative_prompt: [str]
-
-        # Tokenize text and get embeddings
-        text_input = self.tokenizer(prompt, padding='max_length', max_length=self.tokenizer.model_max_length, truncation=True, return_tensors='pt')
-
-        with torch.no_grad():
-            text_embeddings = self.text_encoder(text_input.input_ids.to(self.device))[0]
-
-        # Do the same for unconditional embeddings
-        uncond_input = self.tokenizer(negative_prompt, padding='max_length', max_length=self.tokenizer.model_max_length, return_tensors='pt')
-
-        with torch.no_grad():
-            uncond_embeddings = self.text_encoder(uncond_input.input_ids.to(self.device))[0]
-
-        # Cat for final embeddings
-        text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
-        return text_embeddings
-
-    def train_step(self, text_embeddings, pred_rgb,
-                   guidance_scale=100, loss_weight=1.0, min_step_pct=0.02, max_step_pct=0.98, return_aux=False):
-        pred_rgb = pred_rgb.to(self.torch_dtype)
-        text_embeddings = text_embeddings.to(self.torch_dtype)
-        b = pred_rgb.shape[0]
-
-        # interp to 512x512 to be fed into vae.
-
-        # _t = time.time()
-        pred_rgb_512 = F.interpolate(pred_rgb, (512, 512), mode='bilinear', align_corners=False)
-        # torch.cuda.synchronize(); print(f'[TIME] guiding: interp {time.time() - _t:.4f}s')
-
-        # timestep ~ U(0.02, 0.98) to avoid very high/low noise level
-        min_step = int(self.num_train_timesteps * min_step_pct)
-        max_step = int(self.num_train_timesteps * max_step_pct)
-        t = torch.randint(min_step, max_step + 1, [b], dtype=torch.long, device=self.device)
-
-        # encode image into latents with vae, requires grad!
-        # _t = time.time()
-        latents = self.encode_imgs(pred_rgb_512)
-        # torch.cuda.synchronize(); print(f'[TIME] guiding: vae enc {time.time() - _t:.4f}s')
-
-        # predict the noise residual with unet, NO grad!
-        # _t = time.time()
-        with torch.no_grad():
-            # add noise
-            noise = torch.randn_like(latents)
-            latents_noisy = self.scheduler.add_noise(latents, noise, t)
-            # pred noise
-            latent_model_input = torch.cat([latents_noisy] * 2)
-            t_input = torch.cat([t, t])
-            noise_pred = self.unet(latent_model_input, t_input, encoder_hidden_states=text_embeddings).sample
-        # torch.cuda.synchronize(); print(f'[TIME] guiding: unet {time.time() - _t:.4f}s')
-
-        # perform guidance (high scale from paper!)
-        noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
-        # noise_pred = noise_pred_text + guidance_scale * (noise_pred_text - noise_pred_uncond)
-        noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
-
-        # w(t), sigma_t^2
-        w = (1 - self.alphas[t])
-        # w = self.alphas[t] ** 0.5 * (1 - self.alphas[t])
-        grad = loss_weight * w[:, None, None, None] * (noise_pred - noise)
-
-        # clip grad for stable training?
-        # grad = grad.clamp(-10, 10)
-        grad = torch.nan_to_num(grad)
-
-        # since we omitted an item in grad, we need to use the custom function to specify the gradient
-        # _t = time.time()
-        # loss = SpecifyGradient.apply(latents, grad)
-        # torch.cuda.synchronize(); print(f'[TIME] guiding: backward {time.time() - _t:.4f}s')
-
-        targets = (latents - grad).detach()
-        loss = 0.5 * F.mse_loss(latents.float(), targets, reduction='sum') / latents.shape[0]
-
-        if return_aux:
-            aux = {'grad': grad, 't': t, 'w': w}
-            return loss, aux
-        else:
-            return loss
-
-
-    def produce_latents(self, text_embeddings, height=512, width=512, num_inference_steps=50, guidance_scale=7.5, latents=None):
-
-        if latents is None:
-            latents = torch.randn((text_embeddings.shape[0] // 2, self.unet.config.in_channels, height // 8, width // 8), device=self.device)
-
-        self.scheduler.set_timesteps(num_inference_steps)
-
-        with torch.autocast('cuda'):
-            for i, t in enumerate(self.scheduler.timesteps):
-                # expand the latents if we are doing classifier-free guidance to avoid doing two forward passes.
-                latent_model_input = torch.cat([latents] * 2)
-
-                # predict the noise residual
-                with torch.no_grad():
-                    noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings)['sample']
-
-                # perform guidance
-                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
-                noise_pred = noise_pred_text + guidance_scale * (noise_pred_text - noise_pred_uncond)
-
-                # compute the previous noisy sample x_t -> x_t-1
-                latents = self.scheduler.step(noise_pred, t, latents)['prev_sample']
-
-        return latents
-
-    def decode_latents(self, latents):
-
-        latents = 1 / self.vae.config.scaling_factor * latents
-
-        with torch.no_grad():
-            imgs = self.vae.decode(latents).sample
-
-        imgs = (imgs / 2 + 0.5).clamp(0, 1)
-
-        return imgs
-
-    def encode_imgs(self, imgs):
-        # imgs: [B, 3, H, W]
-
-        imgs = 2 * imgs - 1
-
-        posterior = self.vae.encode(imgs).latent_dist
-        latents = posterior.sample() * self.vae.config.scaling_factor
-
-        return latents
-
-    def prompt_to_img(self, prompts, negative_prompts='', height=512, width=512, num_inference_steps=50, guidance_scale=7.5, latents=None):
-
-        if isinstance(prompts, str):
-            prompts = [prompts]
-
-        if isinstance(negative_prompts, str):
-            negative_prompts = [negative_prompts]
-
-        # Prompts -> text embeds
-        text_embeds = self.get_text_embeds(prompts, negative_prompts)  # [2, 77, 768]
-
-        # Text embeds -> img latents
-        latents = self.produce_latents(text_embeds, height=height, width=width, latents=latents, num_inference_steps=num_inference_steps, guidance_scale=guidance_scale)  # [1, 4, 64, 64]
-
-        # Img latents -> imgs
-        imgs = self.decode_latents(latents)  # [1, 3, 512, 512]
-
-        # Img to Numpy
-        imgs = imgs.detach().cpu().permute(0, 2, 3, 1).numpy()
-        imgs = (imgs * 255).round().astype('uint8')
-
-        return imgs
-
-
-if __name__ == '__main__':
-    import argparse
-    import matplotlib.pyplot as plt
-
-    parser = argparse.ArgumentParser()
-    parser.add_argument('prompt', type=str)
-    parser.add_argument('--negative', default='', type=str)
-    parser.add_argument('--sd_version', type=str, default='2.1', choices=['1.5', '2.0', '2.1'], help="stable diffusion version")
-    parser.add_argument('--hf_key', type=str, default=None, help="hugging face Stable diffusion model key")
-    parser.add_argument('-H', type=int, default=512)
-    parser.add_argument('-W', type=int, default=512)
-    parser.add_argument('--seed', type=int, default=0)
-    parser.add_argument('--steps', type=int, default=50)
-    opt = parser.parse_args()
-
-    seed_everything(opt.seed)
-
-    device = torch.device('cuda')
-
-    sd = StableDiffusion(device, opt.sd_version, opt.hf_key)
-
-    imgs = sd.prompt_to_img(opt.prompt, opt.negative, opt.H, opt.W, opt.steps)
-
-    # visualize image
-    plt.imshow(imgs[0])
-    plt.show()
-    plt.savefig(f'{opt.prompt}.png')
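
Note: the deleted train_step injects the score-distillation gradient without backpropagating through the UNet. It builds a detached target, latents - grad, and minimizes a scaled MSE, which is equivalent to the older SpecifyGradient trick left in the comments. A minimal standalone check of that equivalence (illustrative only, not code from the repo):

# Illustrative check: with a detached target, d(loss)/d(latents) equals the injected gradient.
import torch
import torch.nn.functional as F

latents = torch.randn(2, 4, 64, 64, requires_grad=True)
grad = torch.randn_like(latents)          # stands in for w(t) * (noise_pred - noise)

targets = (latents - grad).detach()
loss = 0.5 * F.mse_loss(latents, targets, reduction='sum') / latents.shape[0]
loss.backward()

# latents.grad equals grad / batch_size, matching what SpecifyGradient.backward returned.
assert torch.allclose(latents.grad, grad / latents.shape[0])
print("SDS surrogate loss reproduces the injected gradient.")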
video3d/diffusion/sd_utils.py DELETED
@@ -1,123 +0,0 @@
-import torch
-import numpy as np
-import random
-import torch.nn.functional as F
-
-from ..render.light import DirectionalLight
-
-def safe_normalize(x, eps=1e-20):
-    return x / torch.sqrt(torch.clamp(torch.sum(x * x, -1, keepdim=True), min=eps))
-
-def get_view_direction(thetas, phis, overhead, front, phi_offset=0):
-    # phis [B,]; thetas: [B,]
-    # front = 0          [360 - front / 2, front / 2)
-    # side (left) = 1    [front / 2, 180 - front / 2)
-    # back = 2           [180 - front / 2, 180 + front / 2)
-    # side (right) = 3   [180 + front / 2, 360 - front / 2)
-    # top = 4            [0, overhead]
-    # bottom = 5         [180-overhead, 180]
-    res = torch.zeros(thetas.shape[0], dtype=torch.long)
-
-    # first determine by phis
-    phi_offset = np.deg2rad(phi_offset)
-    phis = phis + phi_offset
-    phis = phis % (2 * np.pi)
-    half_front = front / 2
-
-    res[(phis >= (2*np.pi - half_front)) | (phis < half_front)] = 0
-    res[(phis >= half_front) & (phis < (np.pi - half_front))] = 1
-    res[(phis >= (np.pi - half_front)) & (phis < (np.pi + half_front))] = 2
-    res[(phis >= (np.pi + half_front)) & (phis < (2*np.pi - half_front))] = 3
-
-    # override by thetas
-    res[thetas <= overhead] = 4
-    res[thetas >= (np.pi - overhead)] = 5
-    return res
-
-
-def view_direction_id_to_text(view_direction_id):
-    dir_texts = ['front', 'side', 'back', 'side', 'overhead', 'bottom']
-    return [dir_texts[i] for i in view_direction_id]
-
-
-def append_text_direction(prompts, dir_texts):
-    return [f'{prompt}, {dir_text} view' for prompt, dir_text in zip(prompts, dir_texts)]
-
-
-def rand_lights(camera_dir, fixed_ambient, fixed_diffuse):
-    size = camera_dir.shape[0]
-    device = camera_dir.device
-    random_fixed_dir = F.normalize(torch.randn_like(camera_dir) + camera_dir, dim=-1)  # Centered around camera_dir
-    random_fixed_intensity = torch.tensor([fixed_ambient, fixed_diffuse], device=device)[None, :].repeat(size, 1)  # ambient, diffuse
-    return DirectionalLight(mlp_in=1, mlp_layers=1, mlp_hidden_size=1,  # Dummy values
-                            intensity_min_max=[0.5, 1], fixed_dir=random_fixed_dir, fixed_intensity=random_fixed_intensity).to(device)
-
-def rand_poses(size, device, radius_range=[1, 1], theta_range=[0, 120], phi_range=[0, 360], cam_z_offset=10, return_dirs=False, angle_overhead=30, angle_front=60, phi_offset=0, jitter=False, uniform_sphere_rate=0.5):
-    ''' generate random poses from an orbit camera
-    Args:
-        size: batch size of generated poses.
-        device: where to allocate the output.
-        radius_range: [min, max]
-        theta_range: [min, max], should be in [0, pi]
-        phi_range: [min, max], should be in [0, 2 * pi]
-    Return:
-        poses: [size, 4, 4]
-    '''
-
-    theta_range = np.deg2rad(theta_range)
-    phi_range = np.deg2rad(phi_range)
-    angle_overhead = np.deg2rad(angle_overhead)
-    angle_front = np.deg2rad(angle_front)
-
-    radius = torch.rand(size, device=device) * (radius_range[1] - radius_range[0]) + radius_range[0]
-
-    phis = torch.rand(size, device=device) * (phi_range[1] - phi_range[0]) + phi_range[0]
-    if random.random() < uniform_sphere_rate:
-        # based on http://corysimon.github.io/articles/uniformdistn-on-sphere/
-        # acos takes in [-1, 1], first convert theta range to fit in [-1, 1]
-        theta_range = torch.from_numpy(np.array(theta_range)).to(device)
-        theta_amplitude_range = torch.cos(theta_range)
-        # sample uniformly in amplitude space range
-        thetas_amplitude = torch.rand(size, device=device) * (theta_amplitude_range[1] - theta_amplitude_range[0]) + theta_amplitude_range[0]
-        # convert back
-        thetas = torch.acos(thetas_amplitude)
-    else:
-        thetas = torch.rand(size, device=device) * (theta_range[1] - theta_range[0]) + theta_range[0]
-
-    centers = -torch.stack([
-        radius * torch.sin(thetas) * torch.sin(phis),
-        radius * torch.cos(thetas),
-        radius * torch.sin(thetas) * torch.cos(phis),
-    ], dim=-1)  # [B, 3]
-
-    targets = 0
-
-    # jitters
-    if jitter:
-        centers = centers + (torch.rand_like(centers) * 0.2 - 0.1)
-        targets = targets + torch.randn_like(centers) * 0.2
-
-    # lookat
-    forward_vector = safe_normalize(targets - centers)
-    up_vector = torch.FloatTensor([0, 1, 0]).to(device).unsqueeze(0).repeat(size, 1)
-    right_vector = safe_normalize(torch.cross(up_vector, forward_vector, dim=-1))
-
-    if jitter:
-        up_noise = torch.randn_like(up_vector) * 0.02
-    else:
-        up_noise = 0
-
-    up_vector = safe_normalize(torch.cross(forward_vector, right_vector, dim=-1) + up_noise)
-
-    poses = torch.stack([right_vector, up_vector, forward_vector], dim=-1)
-    radius = radius[..., None] - cam_z_offset
-    translations = torch.cat([torch.zeros_like(radius), torch.zeros_like(radius), radius], dim=-1)
-    poses = torch.cat([poses.view(-1, 9), translations], dim=-1)
-
-    if return_dirs:
-        dirs = get_view_direction(thetas, phis, angle_overhead, angle_front, phi_offset=phi_offset)
-        dirs = view_direction_id_to_text(dirs)
-    else:
-        dirs = None
-
-    return poses, dirs
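
Note: the deleted rand_poses mixes two theta-sampling strategies, plain uniform in angle and uniform over the sphere's surface area via inverse-CDF sampling of cos(theta) (the linked corysimon note). A compact sketch of the latter, written here only to illustrate the idea; the function name and range values are not from the repo:

# Illustrative sketch: sample theta uniformly over the sphere cap by drawing
# cos(theta) uniformly and taking acos (area element ~ sin(theta) d(theta)).
import torch

def sample_thetas_uniform_on_sphere(size, theta_min, theta_max):
    cos_range = torch.cos(torch.tensor([theta_min, theta_max]))
    u = torch.rand(size) * (cos_range[1] - cos_range[0]) + cos_range[0]
    return torch.acos(u)

thetas = sample_thetas_uniform_on_sphere(10000, 0.0, 2.0943951)  # [0, 120 deg] in radians
print(thetas.min().item(), thetas.max().item())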
video3d/diffusion/vsd.py DELETED
@@ -1,323 +0,0 @@
-import os
-os.environ['HUGGINGFACE_HUB_CACHE'] = '/viscam/u/zzli'
-os.environ['HF_HOME'] = '/viscam/u/zzli'
-
-from transformers import CLIPTextModel, CLIPTokenizer, logging
-from diffusers import AutoencoderKL, UNet2DConditionModel, PNDMScheduler, DDIMScheduler
-
-from diffusers.loaders import AttnProcsLayers
-from diffusers.models.attention_processor import LoRAAttnProcessor
-from diffusers.models.embeddings import TimestepEmbedding
-from diffusers.utils.import_utils import is_xformers_available
-
-# Suppress partial model loading warning
-logging.set_verbosity_error()
-
-import gc
-import random
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-import tinycudann as tcnn
-from video3d.diffusion.sd import StableDiffusion
-from torch.cuda.amp import custom_bwd, custom_fwd
-
-
-def seed_everything(seed):
-    torch.manual_seed(seed)
-    torch.cuda.manual_seed(seed)
-
-def cleanup():
-    gc.collect()
-    torch.cuda.empty_cache()
-    tcnn.free_temporary_memory()
-
-class StableDiffusion_VSD(StableDiffusion):
-    def __init__(self, device, sd_version='2.1', hf_key=None, torch_dtype=torch.float32, lora_n_timestamp_samples=1):
-        super().__init__(device, sd_version=sd_version, hf_key=hf_key, torch_dtype=torch_dtype)
-
-        # self.device = device
-        # self.sd_version = sd_version
-        # self.torch_dtype = torch_dtype
-
-        if hf_key is not None:
-            print(f'[INFO] using hugging face custom model key: {hf_key}')
-            model_key = hf_key
-        elif self.sd_version == '2.1':
-            model_key = "stabilityai/stable-diffusion-2-1-base"
-        elif self.sd_version == '2.0':
-            model_key = "stabilityai/stable-diffusion-2-base"
-        elif self.sd_version == '1.5':
-            model_key = "runwayml/stable-diffusion-v1-5"
-        else:
-            raise ValueError(f'Stable-diffusion version {self.sd_version} not supported.')
-
-        # # Create model
-        # self.vae = AutoencoderKL.from_pretrained(model_key, subfolder="vae", torch_dtype=torch_dtype).to(self.device)
-        # self.tokenizer = CLIPTokenizer.from_pretrained(model_key, subfolder="tokenizer")
-        # self.text_encoder = CLIPTextModel.from_pretrained(model_key, subfolder="text_encoder").to(self.device)
-        # self.unet = UNet2DConditionModel.from_pretrained(model_key, subfolder="unet", torch_dtype=torch_dtype).to(self.device)
-
-        # self.scheduler = DDIMScheduler.from_pretrained(model_key, subfolder="scheduler")
-        # # self.scheduler = PNDMScheduler.from_pretrained(model_key, subfolder="scheduler")
-
-        # self.num_train_timesteps = self.scheduler.config.num_train_timesteps
-        # self.alphas = self.scheduler.alphas_cumprod.to(self.device)  # for convenience
-
-        print(f'[INFO] loading stable diffusion VSD modules...')
-
-        self.unet_lora = UNet2DConditionModel.from_pretrained(model_key, subfolder="unet", torch_dtype=torch_dtype).to(self.device)
-        cleanup()
-
-        for p in self.vae.parameters():
-            p.requires_grad_(False)
-        for p in self.text_encoder.parameters():
-            p.requires_grad_(False)
-        for p in self.unet.parameters():
-            p.requires_grad_(False)
-        for p in self.unet_lora.parameters():
-            p.requires_grad_(False)
-
-        # set up LoRA layers
-        lora_attn_procs = {}
-        for name in self.unet_lora.attn_processors.keys():
-            cross_attention_dim = (
-                None
-                if name.endswith("attn1.processor")
-                else self.unet_lora.config.cross_attention_dim
-            )
-            if name.startswith("mid_block"):
-                hidden_size = self.unet_lora.config.block_out_channels[-1]
-            elif name.startswith("up_blocks"):
-                block_id = int(name[len("up_blocks.")])
-                hidden_size = list(reversed(self.unet_lora.config.block_out_channels))[
-                    block_id
-                ]
-            elif name.startswith("down_blocks"):
-                block_id = int(name[len("down_blocks.")])
-                hidden_size = self.unet_lora.config.block_out_channels[block_id]
-
-            lora_attn_procs[name] = LoRAAttnProcessor(
-                hidden_size=hidden_size, cross_attention_dim=cross_attention_dim
-            )
-
-        self.unet_lora.set_attn_processor(lora_attn_procs)
-
-        self.lora_layers = AttnProcsLayers(self.unet_lora.attn_processors).to(
-            self.device
-        )
-        self.lora_layers._load_state_dict_pre_hooks.clear()
-        self.lora_layers._state_dict_hooks.clear()
-        self.lora_n_timestamp_samples = lora_n_timestamp_samples
-        self.scheduler_lora = DDIMScheduler.from_pretrained(model_key, subfolder="scheduler")
-
-        print(f'[INFO] loaded stable diffusion VSD modules!')
-
-    def train_lora(
-        self,
-        latents,
-        text_embeddings,
-        camera_condition
-    ):
-        B = latents.shape[0]
-        lora_n_timestamp_samples = self.lora_n_timestamp_samples
-        latents = latents.detach().repeat(lora_n_timestamp_samples, 1, 1, 1)
-
-        t = torch.randint(
-            int(self.num_train_timesteps * 0.0),
-            int(self.num_train_timesteps * 1.0),
-            [B * lora_n_timestamp_samples],
-            dtype=torch.long,
-            device=self.device,
-        )
-
-        noise = torch.randn_like(latents)
-        noisy_latents = self.scheduler_lora.add_noise(latents, noise, t)
-        if self.scheduler_lora.config.prediction_type == "epsilon":
-            target = noise
-        elif self.scheduler_lora.config.prediction_type == "v_prediction":
-            target = self.scheduler_lora.get_velocity(latents, noise, t)
-        else:
-            raise ValueError(
-                f"Unknown prediction type {self.scheduler_lora.config.prediction_type}"
-            )
-
-        # use view-independent text embeddings in LoRA
-        _, text_embeddings_cond = text_embeddings.chunk(2)
-
-        if random.random() < 0.1:
-            camera_condition = torch.zeros_like(camera_condition)
-
-        noise_pred = self.unet_lora(
-            noisy_latents,
-            t,
-            encoder_hidden_states=text_embeddings_cond.repeat(
-                lora_n_timestamp_samples, 1, 1
-            ),
-            class_labels=camera_condition.reshape(B, -1).repeat(
-                lora_n_timestamp_samples, 1
-            ),
-            cross_attention_kwargs={"scale": 1.0}
-        ).sample
-
-        loss_lora = 0.5 * F.mse_loss(noise_pred.float(), target.float(), reduction="mean")
-        return loss_lora
-
-
-    def train_step(
-        self,
-        text_embeddings,
-        text_embeddings_vd,
-        pred_rgb,
-        camera_condition,
-        im_features,
-        guidance_scale=7.5,
-        guidance_scale_lora=7.5,
-        loss_weight=1.0,
-        min_step_pct=0.02,
-        max_step_pct=0.98,
-        return_aux=False
-    ):
-        pred_rgb = pred_rgb.to(self.torch_dtype)
-        text_embeddings = text_embeddings.to(self.torch_dtype)
-        text_embeddings_vd = text_embeddings_vd.to(self.torch_dtype)
-        camera_condition = camera_condition.to(self.torch_dtype)
-        im_features = im_features.to(self.torch_dtype)
-
-        # condition_label = camera_condition
-        condition_label = im_features
-
-        b = pred_rgb.shape[0]
-
-        # interp to 512x512 to be fed into vae.
-        # _t = time.time()
-        pred_rgb_512 = F.interpolate(pred_rgb, (512, 512), mode='bilinear', align_corners=False)
-        # torch.cuda.synchronize(); print(f'[TIME] guiding: interp {time.time() - _t:.4f}s')
-
-        # timestep ~ U(0.02, 0.98) to avoid very high/low noise level
-        min_step = int(self.num_train_timesteps * min_step_pct)
-        max_step = int(self.num_train_timesteps * max_step_pct)
-        t = torch.randint(min_step, max_step + 1, [b], dtype=torch.long, device=self.device)
-
-        # encode image into latents with vae, requires grad!
-        # _t = time.time()
-        latents = self.encode_imgs(pred_rgb_512)
-        # torch.cuda.synchronize(); print(f'[TIME] guiding: vae enc {time.time() - _t:.4f}s')
-
-        # predict the noise residual with unet, NO grad!
-        # _t = time.time()
-        with torch.no_grad():
-            # add noise
-            noise = torch.randn_like(latents)
-            latents_noisy = self.scheduler.add_noise(latents, noise, t)
-            # pred noise
-            latent_model_input = torch.cat([latents_noisy] * 2)
-
-            # disable unet class embedding here
-            cls_embedding = self.unet.class_embedding
-            self.unet.class_embedding = None
-
-            cross_attention_kwargs = None
-            noise_pred_pretrain = self.unet(
-                latent_model_input,
-                torch.cat([t, t]),
-                encoder_hidden_states=text_embeddings_vd,
-                class_labels=None,
-                cross_attention_kwargs=cross_attention_kwargs
-            ).sample
-
-            self.unet.class_embedding = cls_embedding
-
-            # use view-independent text embeddings in LoRA
-            _, text_embeddings_cond = text_embeddings.chunk(2)
-
-            noise_pred_est = self.unet_lora(
-                latent_model_input,
-                torch.cat([t, t]),
-                encoder_hidden_states=torch.cat([text_embeddings_cond] * 2),
-                class_labels=torch.cat(
-                    [
-                        condition_label.reshape(b, -1),
-                        torch.zeros_like(condition_label.reshape(b, -1)),
-                    ],
-                    dim=0,
-                ),
-                cross_attention_kwargs={"scale": 1.0},
-            ).sample
-
-        noise_pred_pretrain_uncond, noise_pred_pretrain_text = noise_pred_pretrain.chunk(2)
-
-        noise_pred_pretrain = noise_pred_pretrain_uncond + guidance_scale * (
-            noise_pred_pretrain_text - noise_pred_pretrain_uncond
-        )
-
-        assert self.scheduler.config.prediction_type == "epsilon"
-        if self.scheduler_lora.config.prediction_type == "v_prediction":
-            alphas_cumprod = self.scheduler_lora.alphas_cumprod.to(
-                device=latents_noisy.device, dtype=latents_noisy.dtype
-            )
-            alpha_t = alphas_cumprod[t] ** 0.5
-            sigma_t = (1 - alphas_cumprod[t]) ** 0.5
-
-            noise_pred_est = latent_model_input * torch.cat([sigma_t] * 2, dim=0).reshape(
-                -1, 1, 1, 1
-            ) + noise_pred_est * torch.cat([alpha_t] * 2, dim=0).reshape(-1, 1, 1, 1)
-
-        noise_pred_est_uncond, noise_pred_est_camera = noise_pred_est.chunk(2)
-
-        noise_pred_est = noise_pred_est_uncond + guidance_scale_lora * (
-            noise_pred_est_camera - noise_pred_est_uncond
-        )
-
-        # w(t), sigma_t^2
-        w = (1 - self.alphas[t])
-        # w = self.alphas[t] ** 0.5 * (1 - self.alphas[t])
-        grad = loss_weight * w[:, None, None, None] * (noise_pred_pretrain - noise_pred_est)
-
-        grad = torch.nan_to_num(grad)
-
-        targets = (latents - grad).detach()
-        loss_vsd = 0.5 * F.mse_loss(latents.float(), targets, reduction='sum') / latents.shape[0]
-
-        loss_lora = self.train_lora(latents, text_embeddings, condition_label)
-
-        loss = {
-            'loss_vsd': loss_vsd,
-            'loss_lora': loss_lora
-        }
-
-        if return_aux:
-            aux = {'grad': grad, 't': t, 'w': w}
-            return loss, aux
-        else:
-            return loss
-
-
-
-if __name__ == '__main__':
-    import argparse
-    import matplotlib.pyplot as plt
-
-    parser = argparse.ArgumentParser()
-    parser.add_argument('prompt', type=str)
-    parser.add_argument('--negative', default='', type=str)
-    parser.add_argument('--sd_version', type=str, default='2.1', choices=['1.5', '2.0', '2.1'], help="stable diffusion version")
-    parser.add_argument('--hf_key', type=str, default=None, help="hugging face Stable diffusion model key")
-    parser.add_argument('-H', type=int, default=512)
-    parser.add_argument('-W', type=int, default=512)
-    parser.add_argument('--seed', type=int, default=0)
-    parser.add_argument('--steps', type=int, default=50)
-    opt = parser.parse_args()
-
-    seed_everything(opt.seed)
-
-    device = torch.device('cuda')
-
-    sd = StableDiffusion_VSD(device, opt.sd_version, opt.hf_key)
-
-    imgs = sd.prompt_to_img(opt.prompt, opt.negative, opt.H, opt.W, opt.steps)
-
-    # visualize image
-    plt.imshow(imgs[0])
-    plt.show()
-    plt.savefig(f'{opt.prompt}.png')
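
Note: the two deleted guidance modules differ only in what the pretrained noise prediction is compared against. A schematic summary (illustrative sketch, not code from the repo; names are chosen here for clarity):

# SDS (sd.py):  grad = w(t) * (eps_pretrained(x_t, y) - noise)
# VSD (vsd.py): grad = w(t) * (eps_pretrained(x_t, y) - eps_lora(x_t, y, c))
# where eps_lora is the LoRA-adapted UNet trained in parallel (train_lora) to model
# the distribution of the current renders, and c is the per-instance condition label.
def vsd_grad(w_t, eps_pretrained, eps_lora, loss_weight=1.0):
    """Assemble the VSD gradient the same way the deleted train_step does."""
    return loss_weight * w_t[:, None, None, None] * (eps_pretrained - eps_lora)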
video3d/model_ddp.py CHANGED
@@ -41,10 +41,6 @@ from .render import mesh
 from .render import light
 from .render import render
 
-from .diffusion.sd import StableDiffusion
-from .diffusion.vsd import StableDiffusion_VSD
-from .diffusion.sd_utils import rand_poses, rand_lights, append_text_direction
-
 EPS = 1e-7
 
 
@@ -1269,53 +1265,8 @@ class Unsup3DDDP:
 
         self.enable_sds = cfgs.get('enable_sds', False)
         self.enable_vsd = cfgs.get('enable_vsd', False)
-
-
-
-        # decide if use SDS or VSD
-        if self.enable_vsd:
-            # self.stable_diffusion = misc.LazyClass(StableDiffusion_VSD, device=self.device, torch_dtype=diffusion_torch_dtype)
-            self.stable_diffusion = StableDiffusion_VSD(device=self.device, torch_dtype=diffusion_torch_dtype)
-            self.diffusion_guidance_scale_lora = cfgs.get('diffusion_guidance_scale_lora', 1.)
-            self.diffusion_guidance_scale = cfgs.get('diffusion_guidance_scale', 7.5)
-        else:
-            self.stable_diffusion = misc.LazyClass(StableDiffusion, device=self.device, torch_dtype=diffusion_torch_dtype)
-            self.diffusion_guidance_scale = cfgs.get('diffusion_guidance_scale', 100.)
-
-        self.diffusion_loss_weight = cfgs.get('diffusion_loss_weight', 1.)
-        self.diffusion_num_random_cameras = cfgs.get('diffusion_num_random_cameras', 1)
-
-        # For prompts
-        self.diffusion_prompt = cfgs.get('diffusion_prompt', '')
-        self.diffusion_negative_prompt = cfgs.get('diffusion_negative_prompt', '')
-
-        # For image sampling
-        self.diffusion_albedo_ratio = cfgs.get('diffusion_albedo_ratio', 0.2)
-        self.diffusion_shading_ratio = cfgs.get('diffusion_shading_ratio', 0.4)
-        self.diffusion_light_ambient = cfgs.get('diffusion_light_ambient', 0.5)
-        self.diffusion_light_diffuse = cfgs.get('diffusion_light_diffuse', 0.8)
-        self.diffusion_radius_range = cfgs.get('diffusion_radius_range', [0.8, 1.4])
-        self.diffusion_uniform_sphere_rate = cfgs.get('diffusion_uniform_sphere_rate', 0.5)
-        self.diffusion_theta_range = cfgs.get('diffusion_theta_range', [0, 120])
-        self.diffusion_phi_offset = cfgs.get('diffusion_phi_offset', 180)
-        self.diffusion_resolution = cfgs.get('diffusion_resolution', 256)
-
-        print('-----------------------------------------------')
-        print(f"!!!!!! the phi offset for diffusion is set as {self.diffusion_phi_offset}!!!!!!!!!!!!!")
-        print('-----------------------------------------------')
-
-        # For randomizing light
-        self.diffusion_random_light = cfgs.get('diffusion_random_light', False)
-        self.diffusion_light_ambient = cfgs.get('diffusion_light_ambient', 0.5)
-        self.diffusion_light_diffuse = cfgs.get('diffusion_light_diffuse', 0.8)
-
-        # For noise scheduling
-        self.diffusion_max_step = cfgs.get('diffusion_max_step', 0.98)
-
-        # For view-dependent prompting
-        self.diffusion_append_prompt_directions = cfgs.get('diffusion_append_prompt_directions', False)
-        self.diffusion_angle_overhead = cfgs.get('diffusion_angle_overhead', 30)
-        self.diffusion_angle_front = cfgs.get('diffusion_angle_front', 60)
+        self.enable_sds = False
+        self.enable_vsd = False
 
     @staticmethod
     def get_data_loaders(cfgs, dataset, in_image_size=256, out_image_size=256, batch_size=64, num_workers=4, run_train=False, run_test=False, train_data_dir=None, val_data_dir=None, test_data_dir=None, flow_bool=False):
@@ -2017,141 +1968,6 @@ class Unsup3DDDP:
 
         return losses, aux
 
-    def score_distillation_sampling(self, shape, texture, resolution, im_features, light, prior_shape, random_light=False, prompts=None, classes_vectors=None, im_features_map=None, w2c_pred=None):
-        num_instances = im_features.shape[0]
-        n_total_random_cameras = num_instances * self.diffusion_num_random_cameras
-
-        poses, dirs = rand_poses(
-            n_total_random_cameras, self.device, radius_range=self.diffusion_radius_range, uniform_sphere_rate=self.diffusion_uniform_sphere_rate,
-            cam_z_offset=self.cam_pos_z_offset, theta_range=self.diffusion_theta_range, phi_offset=self.diffusion_phi_offset, return_dirs=True,
-            angle_front=self.diffusion_angle_front, angle_overhead=self.diffusion_angle_overhead,
-        )
-        mvp, w2c, campos = self.netInstance.get_camera_extrinsics_from_pose(poses, crop_fov_approx=self.crop_fov_approx)
-
-        if random_light:
-            lights = rand_lights(campos, fixed_ambient=self.diffusion_light_ambient, fixed_diffuse=self.diffusion_light_diffuse)
-        else:
-            lights = light
-
-        proj = util.perspective(self.crop_fov_approx / 180 * np.pi, 1, n=0.1, f=1000.0).repeat(num_instances, 1, 1).to(self.device)
-        original_mvp = torch.bmm(proj, w2c_pred)
-
-        im_features = im_features.repeat(self.diffusion_num_random_cameras, 1) if im_features is not None else None
-        num_shapes = shape.v_pos.shape[0]
-        assert n_total_random_cameras % num_shapes == 0
-        shape = shape.extend(n_total_random_cameras // num_shapes)
-
-        bg_color = torch.rand((n_total_random_cameras, 3), device=self.device)  # channel-wise random
-        background = repeat(bg_color, 'b c -> b h w c', h=resolution[0], w=resolution[1])
-
-        # only train the texture
-        safe_detach = lambda x: x.detach() if x is not None else None
-        shape = safe_detach(shape)
-        im_features = safe_detach(im_features)
-        im_features_map = safe_detach(im_features_map)
-
-        set_requires_grad(texture, True)
-        set_requires_grad(light, True)
-
-        image_pred, mask_pred, _, _, albedo, shading = self.render(
-            shape,
-            texture,
-            mvp,
-            w2c,
-            campos,
-            resolution,
-            im_features=im_features,
-            light=lights,
-            prior_shape=prior_shape,
-            dino_pred=None,
-            spp=self.renderer_spp,
-            bg_image=background,
-            im_features_map={"original_mvp": original_mvp, "im_features_map": im_features_map} if im_features_map is not None else None
-        )
-        if self.enable_vsd:
-            if prompts is None:
-                prompts = n_total_random_cameras * [self.diffusion_prompt]
-            else:
-                if '_' in prompts:
-                    prompts = prompts.replace('_', ' ')
-                prompts = n_total_random_cameras * [prompts]
-
-            prompts = ['a high-resolution DSLR image of ' + x for x in prompts]
-            assert self.diffusion_append_prompt_directions
-            # TODO: check if this implementation is aligned with stable-diffusion-prompt-processor
-            prompts_vd = append_text_direction(prompts, dirs)
-            negative_prompts = n_total_random_cameras * [self.diffusion_negative_prompt]
-
-            text_embeddings = self.stable_diffusion.get_text_embeds(prompts, negative_prompts)  # [BB, 77, 768]
-            text_embeddings_vd = self.stable_diffusion.get_text_embeds(prompts_vd, negative_prompts)
-
-            camera_condition_type = 'c2w'
-            if camera_condition_type == 'c2w':
-                camera_condition = torch.linalg.inv(w2c).detach()
-            elif camera_condition_type == 'mvp':
-                camera_condition = mvp.detach()
-            else:
-                raise NotImplementedError
-
-            # Alternate among albedo, shading, and image
-            rand = torch.rand(n_total_random_cameras, device=self.device)
-            rendered_component = torch.zeros_like(image_pred)
-            mask_pred = mask_pred[:, None]
-            background = rearrange(background, 'b h w c -> b c h w')
-            albedo_flag = rand > (1 - self.diffusion_albedo_ratio)
-            rendered_component[albedo_flag] = albedo[albedo_flag] * mask_pred[albedo_flag] + (1 - mask_pred[albedo_flag]) * background[albedo_flag]
-            shading_flag = (rand > (1 - self.diffusion_albedo_ratio - self.diffusion_shading_ratio)) & (rand <= (1 - self.diffusion_albedo_ratio))
-            rendered_component[shading_flag] = shading.repeat(1, 3, 1, 1)[shading_flag] / 2 * mask_pred[shading_flag] + (1 - mask_pred[shading_flag]) * background[shading_flag]
-            rendered_component[~(albedo_flag | shading_flag)] = image_pred[~(albedo_flag | shading_flag)]
-
-            condition_label = classes_vectors
-            # condition_label = im_features
-
-            sd_loss, sd_aux = self.stable_diffusion.train_step(
-                text_embeddings,
-                text_embeddings_vd,
-                rendered_component,
-                camera_condition,  # TODO: can we input category condition in lora?
-                condition_label,
-                guidance_scale=self.diffusion_guidance_scale,
-                guidance_scale_lora=self.diffusion_guidance_scale_lora,
-                loss_weight=self.diffusion_loss_weight,
-                max_step_pct=self.diffusion_max_step,
-                return_aux=True
-            )
-
-            aux = {'loss': sd_loss['loss_vsd'], 'loss_lora': sd_loss['loss_lora'], 'dirs': dirs, 'sd_aux': sd_aux, 'rendered_shape': shape}
-
-        else:
-            # Prompt to text embeds
-            if prompts is None:
-                prompts = n_total_random_cameras * [self.diffusion_prompt]
-            else:
-                if '_' in prompts:
-                    prompts = prompts.replace('_', ' ')
-                prompts = n_total_random_cameras * [prompts]
-            prompts = ['a high-resolution DSLR image of ' + x for x in prompts]
-            if self.diffusion_append_prompt_directions:
-                prompts = append_text_direction(prompts, dirs)
-            negative_prompts = n_total_random_cameras * [self.diffusion_negative_prompt]
-            text_embeddings = self.stable_diffusion.get_text_embeds(prompts, negative_prompts)  # [2, 77, 768]
-
-            # Alternate among albedo, shading, and image
-            rand = torch.rand(n_total_random_cameras, device=self.device)
-            rendered_component = torch.zeros_like(image_pred)
-            mask_pred = mask_pred[:, None]
-            background = rearrange(background, 'b h w c -> b c h w')
-            albedo_flag = rand > (1 - self.diffusion_albedo_ratio)
-            rendered_component[albedo_flag] = albedo[albedo_flag] * mask_pred[albedo_flag] + (1 - mask_pred[albedo_flag]) * background[albedo_flag]
-            shading_flag = (rand > (1 - self.diffusion_albedo_ratio - self.diffusion_shading_ratio)) & (rand <= (1 - self.diffusion_albedo_ratio))
-            rendered_component[shading_flag] = shading.repeat(1, 3, 1, 1)[shading_flag] / 2 * mask_pred[shading_flag] + (1 - mask_pred[shading_flag]) * background[shading_flag]
-            rendered_component[~(albedo_flag | shading_flag)] = image_pred[~(albedo_flag | shading_flag)]
-            sd_loss, sd_aux = self.stable_diffusion.train_step(
-                text_embeddings, rendered_component, guidance_scale=self.diffusion_guidance_scale, loss_weight=self.diffusion_loss_weight, max_step_pct=self.diffusion_max_step, return_aux=True)
-            aux = {'loss': sd_loss, 'dirs': dirs, 'sd_aux': sd_aux, 'rendered_shape': shape}
-
-        return rendered_component, aux
-
     def parse_dict_definition(self, dict_config, total_iter):
         '''
         The dict_config is a diction-based configuration with ascending order
@@ -2987,19 +2803,6 @@ class Unsup3DDDP:
             final_losses[name] = loss.mean()
         final_losses['logit_loss'] = ((expandF(rot_logit) - logit_loss_target.detach())**2.).mean()
 
-        ## score distillation sampling
-        sds_random_images = None
-        if self.enable_sds:
-            prompts = None
-            if classes_vectors is not None:
-                prompts = category_name[0]
-            sds_random_images, sds_aux = self.score_distillation_sampling(shape, texture, [self.diffusion_resolution, self.diffusion_resolution], im_features, light, prior_shape, prompts=prompts, classes_vectors=class_vector[None, :].expand(batch_size * num_frames, -1), im_features_map=im_features_map, w2c_pred=w2c)
-            if self.enable_vsd:
-                final_losses.update({'vsd_loss': sds_aux['loss']})
-                final_losses.update({'vsd_lora_loss': sds_aux['loss_lora']})
-            else:
-                final_losses.update({'sds_loss': sds_aux['loss']})
-
         ## mask distribution loss
         mask_distribution_aux = None
         if self.enable_mask_distribution:
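
Note: with this commit the SDS/VSD branch is disabled outright; the two flags read from cfgs are immediately overridden to False, and the guidance setup that depended on them, including the misc.LazyClass deferred wrapper around StableDiffusion, is deleted. For readers unfamiliar with that pattern, a generic lazy-construction wrapper looks roughly like the following. This is a hypothetical sketch, not the project's actual misc.LazyClass:

# Hypothetical sketch of a lazy-construction wrapper in the spirit of misc.LazyClass
# (the real helper is not part of this diff; names and behavior here are assumptions).
class LazyClass:
    def __init__(self, cls, **kwargs):
        self._cls = cls
        self._kwargs = kwargs
        self._instance = None

    def _get(self):
        if self._instance is None:
            self._instance = self._cls(**self._kwargs)  # build on first access
        return self._instance

    def __getattr__(self, name):
        # forward attribute access to the lazily built instance
        return getattr(self._get(), name)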