jadechoghari committed
Commit ef48aca · Parent: a1c4d22

Update pipeline_spad.py

pipeline_spad.py CHANGED (+28 -37)
@@ -12,7 +12,7 @@ from .geometry import get_batch_from_spherical
 class SPADPipeline(DiffusionPipeline):
     def __init__(self, unet, vae, text_encoder, tokenizer, scheduler):
         super().__init__()
-
+
         self.register_modules(
             unet=unet,
             vae=vae,
@@ -20,26 +20,26 @@ class SPADPipeline(DiffusionPipeline):
             tokenizer=tokenizer,
             scheduler=scheduler
         )
-
+
         self.cfg_conds = ["txt", "cam", "epi", "plucker"]
         self.cfg_scales = [7.5, 1.0, 1.0, 1.0] # Default scales, adjust as needed
         self.use_abs_extrinsics = False
         self.use_intrinsic = False
-
+
         self.cc_projection = nn.Sequential(
             nn.Linear(4 if not self.use_intrinsic else 8, 1280),
             nn.SiLU(),
             nn.Linear(1280, 1280),
-        )
+        ).to(device)
         nn.init.zeros_(self.cc_projection[-1].weight)
         nn.init.zeros_(self.cc_projection[-1].bias)
 
 
     def generate_camera_batch(self, elevations, azimuths, use_abs=False):
         batch = get_batch_from_spherical(elevations, azimuths)
-
+
         abs_cams = [torch.tensor([theta, azimuth, 3.5]) for theta, azimuth in zip(elevations, azimuths)]
-
+
         debug_cams = [[] for _ in range(len(azimuths))]
         for i, icam in enumerate(abs_cams):
             for j, jcam in enumerate(abs_cams):
@@ -49,15 +49,15 @@ class SPADPipeline(DiffusionPipeline):
                 dcam = icam - jcam
                 dcam = torch.tensor([dcam[0].item(), math.sin(dcam[1].item()), math.cos(dcam[1].item()), dcam[2].item()])
                 debug_cams[i].append(dcam)
-
+
         batch["cam"] = torch.stack([torch.stack(dc) for dc in debug_cams])
-
+
         # Add intrinsics to the batch
         focal = 1 / np.tan(0.702769935131073 / 2)
         intrinsics = np.diag(np.array([focal, focal, 1])).astype(np.float32)
         intrinsics = torch.from_numpy(intrinsics).unsqueeze(0).float().repeat(batch["cam"].shape[0], 1, 1)
         batch["render_intrinsics_flat"] = intrinsics[:, [0,1,0,1], [0,1,-1,-1]]
-
+
         return batch
 
     def get_gaussian_image(self, blob_width=256, blob_height=256, sigma=0.5):
@@ -68,15 +68,15 @@ class SPADPipeline(DiffusionPipeline):
         if gaussian_blob.max() > 0:
             gaussian_blob = 255.0 * (gaussian_blob - gaussian_blob.min()) / gaussian_blob.max()
         gaussian_blob = 255.0 - gaussian_blob
-
+
         gaussian_blob = (gaussian_blob / 255.0) * 2.0 - 1.0
         gaussian_blob = np.expand_dims(gaussian_blob, axis=-1).repeat(3,-1)
         gaussian_blob = torch.from_numpy(gaussian_blob)
-
+
         return gaussian_blob
 
     @torch.no_grad()
-    def __call__(self, prompt, num_inference_steps=50, guidance_scale=7.5, num_images_per_prompt=1,
+    def __call__(self, prompt, num_inference_steps=50, guidance_scale=7.5, num_images_per_prompt=1,
                  elevations=None, azimuths=None, blob_sigma=0.5, **kwargs):
         batch_size = len(prompt) if isinstance(prompt, list) else 1
         device = self.device
@@ -85,7 +85,7 @@ class SPADPipeline(DiffusionPipeline):
         if elevations is None or azimuths is None:
             elevations = [45] * 4
             azimuths = [0, 90, 180, 270]
-
+
         n_views = len(elevations)
         camera_batch = self.generate_camera_batch(elevations, azimuths, use_abs=self.use_abs_extrinsics)
         camera_batch = {k: v[None].repeat_interleave(batch_size, dim=0).to(device) for k, v in camera_batch.items()}
@@ -104,7 +104,7 @@ class SPADPipeline(DiffusionPipeline):
         uncond_embeddings = self.text_encoder(uncond_input.input_ids.to(device))[0]
 
         # Encode camera data
-        camera_embeddings = self.cc_projection(camera_batch["cam"])
+        camera_embeddings = self.cc_projection(camera_batch["cam"]).to(device)
 
         # Prepare latents
         latent_height, latent_width = self.vae.config.sample_size // 8, self.vae.config.sample_size // 8
@@ -127,45 +127,37 @@ class SPADPipeline(DiffusionPipeline):
 
         latent_height, latent_width = 64, 64 # Fixed to match the required shape [batch_size, 1, 4, 64, 64]
         n_objects = 2;
-        latents = torch.randn(n_objects, n_views,
+        latents = torch.randn(n_objects, n_views, 4, 64, 64, device=device, dtype=self.unet.dtype)
 
         # Set up scheduler
         # self.scheduler.set_timesteps(num_inference_steps)
-        self.scheduler.set_timesteps(
+        self.scheduler.set_timesteps(50)
         # Repeat text_embeddings to match the desired dimensions
         text_embeddings = text_embeddings.repeat(n_objects, 1, 1) # Shape: [2, max_seq_len, 512]
 
         # Reshape text_embeddings to match [n_objects, n_views, max_seq_len, 512]
         text_embeddings = text_embeddings.unsqueeze(1).repeat(1, n_views, 1, 1)
+        camera_embeddings = camera_embeddings.repeat(n_objects, 1, 1, 1)
         # Denoising loop
         for t in tqdm(self.scheduler.timesteps):
             # Expand timesteps to match shape [batch_size, 1, 1]
             # timesteps = torch.full((batch_size, 1, 1), t, device=device, dtype=torch.long)
             timesteps = torch.full((n_objects, n_views), t, device=device, dtype=torch.long)
-
-            # # Repeat text_embeddings to match the desired dimensions
-            # text_embeddings = text_embeddings.repeat(n_objects, 1, 1) # Shape: [2, max_seq_len, 512]
-
-            # # Reshape text_embeddings to match [n_objects, n_views, max_seq_len, 512]
-            # text_embeddings = text_embeddings.unsqueeze(1).repeat(1, n_views, 1, 1)
-
-            # print("old cam shape: ", camera_embeddings.shape)
-            camera_embeddings = camera_embeddings.repeat(n_objects, 1, 1, 1)
-            # print("cam emb shape: ", camera_embeddings.shape)
+
             # Prepare context
             context = [
                 # text_embeddings.unsqueeze(1), # [batch_size, 1, max_seq_len, 768]
                 # camera_embeddings.unsqueeze(1) * 0.0, # [batch_size, 1, 1280] * 0.0
                 # epi_constraint_masks # Keep this as is for now
-                text_embeddings, # [n_objects, n_views, max_seq_len, 768]
-                camera_embeddings # [n_objects, n_views, 1280]
-                torch.ones(n_objects, n_views, 6, 32, 32)
+                text_embeddings.to(device), # [n_objects, n_views, max_seq_len, 768]
+                camera_embeddings, # [n_objects, n_views, 1280]
+                torch.ones(n_objects, n_views, 6, 32, 32).to(device)
             ]
 
             # Predict noise residual
             noise_pred = self.unet(
-                latents, # Shape: [batch_size, 1, 4, 64, 64]
-                timesteps=timesteps, # Shape: [batch_size, 1, 1]
+                latents.to(device), # Shape: [batch_size, 1, 4, 64, 64]
+                timesteps=timesteps.to(device), # Shape: [batch_size, 1, 1]
                 context=context
             )
 
@@ -179,19 +171,18 @@ class SPADPipeline(DiffusionPipeline):
 
         # reduce latents
         #EXPERIMENTAL
-
-
-        latents = latents.view(-1, latents.shape[2], latents.shape[3], latents.shape[4])
+        latents_reshaped = latents[:, 0, :, :, :] # Selecting the first view
+
         # Decode latents
-        images = self.vae.decode(
+        images = self.vae.decode(latents_reshaped / self.vae.config.scaling_factor, return_dict=False)[0]
 
         # Post-process images
         images = (images / 2 + 0.5).clamp(0, 1)
 
         if images.dim() == 5:
-
+            images_output = images.cpu().permute(0, 1, 3, 4, 2).float().numpy() # For 5D tensors
         elif images.dim() == 4:
-
+            images_output = images.cpu().permute(0, 2, 3, 1).float().numpy() # For 4D tensors
         else:
             raise ValueError(f"Unexpected image dimensions: {images.shape}")
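For readers skimming the diff, the pairwise camera encoding that feeds `cc_projection` can be exercised on its own. The sketch below is a standalone reconstruction of what `generate_camera_batch` computes for the default views used in `__call__`; it is illustrative only and not part of the commit.

# Standalone sketch (not part of the commit) of the relative-camera encoding
# built in generate_camera_batch: each ordered pair of views (i, j) becomes
# [d_elevation, sin(d_azimuth), cos(d_azimuth), d_radius] -- the 4-dim input
# expected by cc_projection's nn.Linear(4, 1280) when use_intrinsic is False.
import math
import torch

elevations = [45.0] * 4              # defaults from __call__
azimuths = [0.0, 90.0, 180.0, 270.0]
radius = 3.5                         # fixed camera distance used in the commit

abs_cams = [torch.tensor([e, a, radius]) for e, a in zip(elevations, azimuths)]

rows = []
for icam in abs_cams:
    row = []
    for jcam in abs_cams:
        d = icam - jcam
        # Note: like the committed code, this passes the raw azimuth
        # difference to sin/cos, so the unit of `azimuths` (degrees here)
        # is used as-is.
        row.append(torch.tensor([d[0].item(),
                                 math.sin(d[1].item()),
                                 math.cos(d[1].item()),
                                 d[2].item()]))
    rows.append(torch.stack(row))

cam = torch.stack(rows)
print(cam.shape)                     # torch.Size([4, 4, 4]) -> [n_views, n_views, 4]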
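A minimal usage sketch follows. The repo id and the `custom_pipeline` loading path are assumptions for illustration; only the `__call__` arguments mirror the signature visible in the diff, and the return format depends on the tail of the file, which this commit does not touch. Note also that the new `).to(device)` in `__init__` references a `device` name that is not defined in any shown hunk, so running this depends on the rest of the file providing it.

# Hypothetical usage sketch -- the repo id below is a placeholder, and
# loading via custom_pipeline assumes the repo ships pipeline_spad.py
# as a diffusers custom pipeline.
import torch
from diffusers import DiffusionPipeline

pipe = DiffusionPipeline.from_pretrained(
    "user/spad-model",                  # placeholder repo id
    custom_pipeline="user/spad-model",  # assumed custom-pipeline source
    torch_dtype=torch.float16,
)
pipe.to("cuda")

# Arguments mirror the __call__ signature shown in the diff; elevations and
# azimuths default to four views at 45 degrees elevation, 90 degrees apart.
out = pipe(
    prompt="a ceramic teapot",
    num_inference_steps=50,
    guidance_scale=7.5,
    elevations=[45] * 4,
    azimuths=[0, 90, 180, 270],
)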