from abc import ABC, abstractmethod
from typing import Optional, Sequence, Tuple, Union

import torch
from diffusers.configuration_utils import ConfigMixin
from einops import rearrange
from torch import Tensor

from xora.utils.torch_utils import append_dims


class Patchifier(ConfigMixin, ABC):
    """Base class for converting 5-D latents to and from patch tokens.

    The patch size is stored as ``(1, patch_size, patch_size)`` in
    (frames, height, width) order, i.e. the temporal patch is always 1
    when constructed through ``__init__``.
    """

    def __init__(self, patch_size: int):
        super().__init__()
        self._patch_size = (1, patch_size, patch_size)

    @abstractmethod
    def patchify(
        self, latents: Tensor, frame_rates: Tensor, scale_grid: bool
    ) -> Tuple[Tensor, Tensor]:
        """Turn latents into a sequence of patch tokens.

        NOTE(review): ``SymmetricPatchifier.patchify`` in this module accepts
        only ``latents`` and returns a single tensor, which does not match
        this declared signature -- confirm which contract callers rely on.
        """
        pass

    @abstractmethod
    def unpatchify(
        self,
        latents: Tensor,
        output_height: int,
        output_width: int,
        output_num_frames: int,
        out_channels: int,
    ) -> Tuple[Tensor, Tensor]:
        """Turn a sequence of patch tokens back into latents."""
        pass

    @property
    def patch_size(self) -> Tuple[int, int, int]:
        """Patch size as a ``(frames, height, width)`` tuple."""
        return self._patch_size

    def get_grid(
        self,
        orig_num_frames: int,
        orig_height: int,
        orig_width: int,
        batch_size: int,
        scale_grid: Optional[Sequence[Union[Tensor, float]]],
        device,
    ) -> Tensor:
        """Build per-patch (frame, row, column) coordinates.

        Args:
            orig_num_frames: Frame count before patching.
            orig_height: Height before patching.
            orig_width: Width before patching.
            batch_size: Number of times the grid is repeated along dim 0.
            scale_grid: ``None`` to keep raw patch indices, otherwise an
                indexable with three entries (frame, height, width). Each
                entry is either a ``Tensor`` or a plain number.
            device: Forwarded to ``torch.arange``.

        Returns:
            A float32 tensor of shape ``(batch_size, 3, f * h * w)`` where
            ``f``, ``h`` and ``w`` are the floor-divided patch counts along
            each axis. Channel 0 holds the frame coordinate, channel 1 the
            row coordinate and channel 2 the column coordinate. When
            ``scale_grid`` is given, channel ``i`` is multiplied by
            ``scale_grid[i] * patch_size[i]``.
        """
        num_f = orig_num_frames // self._patch_size[0]
        num_h = orig_height // self._patch_size[1]
        num_w = orig_width // self._patch_size[2]

        axes = [
            torch.arange(n, dtype=torch.float32, device=device)
            for n in (num_f, num_h, num_w)
        ]
        # "ij" is what the legacy default did; passing it explicitly keeps
        # the (f, h, w) layout and avoids the deprecation warning.
        grid = torch.stack(torch.meshgrid(*axes, indexing="ij"), dim=0)
        grid = grid.unsqueeze(0).repeat(batch_size, 1, 1, 1, 1)

        if scale_grid is not None:
            for i in range(3):
                scale = scale_grid[i]
                if isinstance(scale, Tensor):
                    # grid[:, i] has grid.ndim - 1 dims; append_dims is
                    # presumably padding `scale` with trailing singleton
                    # dims so it broadcasts -- TODO confirm against helper.
                    scale = append_dims(scale, grid.ndim - 1)
                grid[:, i, ...] = grid[:, i, ...] * scale * self._patch_size[i]

        grid = rearrange(grid, "b c f h w -> b c (f h w)", b=batch_size)
        return grid


class SymmetricPatchifier(Patchifier):
    """Patchifier whose ``patchify`` and ``unpatchify`` are pure reshapes."""

    def patchify(
        self,
        latents: Tensor,
    ) -> Tensor:
        """Flatten ``(b, c, F, H, W)`` latents into patch tokens.

        Each axis of ``F``, ``H``, ``W`` must be divisible by the matching
        patch size entry (``rearrange`` raises otherwise).

        Returns:
            Tensor of shape ``(b, f * h * w, c * p1 * p2 * p3)``.
        """
        p1, p2, p3 = self._patch_size
        return rearrange(
            latents,
            "b c (f p1) (h p2) (w p3) -> b (f h w) (c p1 p2 p3)",
            p1=p1,
            p2=p2,
            p3=p3,
        )

    def unpatchify(
        self,
        latents: Tensor,
        output_height: int,
        output_width: int,
        output_num_frames: int,
        out_channels: int,
    ) -> Tensor:
        """Reassemble patch tokens into ``(b, c, F, H, W)`` latents.

        Args:
            latents: Tokens of shape ``(b, f * h * w, c * p1 * p2 * p3)``.
            output_height: Height of the reassembled output.
            output_width: Width of the reassembled output.
            output_num_frames: Frame count of the reassembled output.
            out_channels: Currently unused; the channel count is inferred
                from the token width by ``rearrange``.

        Returns:
            Tensor of shape ``(b, c, F, H, W)``.
        """
        p1, p2, p3 = self._patch_size
        # Mirrors the (c p1 p2 p3) layout produced by `patchify`. With the
        # temporal patch fixed at 1 this equals the previous
        # "(c p q) -> c f (h p) (w q)" pattern.
        return rearrange(
            latents,
            "b (f h w) (c p1 p2 p3) -> b c (f p1) (h p2) (w p3)",
            f=output_num_frames // p1,
            h=output_height // p2,
            w=output_width // p3,
            p1=p1,
            p2=p2,
            p3=p3,
        )