from dataclasses import dataclass
from typing import Optional

import torch
import torch.nn as nn
from einops import rearrange
from jaxtyping import Float
from torch import Tensor

from sf3d.models.tokenizers.dinov2 import Dinov2Model
from sf3d.models.transformers.attention import Modulation
from sf3d.models.utils import BaseModule


class DINOV2SingleImageTokenizer(BaseModule):
    @dataclass
    class Config(BaseModule.Config):
        pretrained_model_name_or_path: str = "facebook/dinov2-large"
        width: int = 512
        height: int = 512
        modulation_cond_dim: int = 768

    cfg: Config

    def configure(self) -> None:
        # Load the pretrained DINOv2 backbone and freeze it; only the
        # modulation layers added below remain trainable.
        self.model = Dinov2Model.from_pretrained(
            self.cfg.pretrained_model_name_or_path
        )

        for p in self.model.parameters():
            p.requires_grad_(False)
        self.model.eval()

        self.model.set_gradient_checkpointing(False)

        # Attach an AdaLN-style modulation pair (for norm1 and norm2) to every
        # encoder layer so an external condition can modulate the frozen backbone.
        modulations = []
        for layer in self.model.encoder.layer:
            norm1_modulation = Modulation(
                self.model.config.hidden_size,
                self.cfg.modulation_cond_dim,
                zero_init=True,
                single_layer=True,
            )
            norm2_modulation = Modulation(
                self.model.config.hidden_size,
                self.cfg.modulation_cond_dim,
                zero_init=True,
                single_layer=True,
            )
            layer.register_ada_norm_modulation(norm1_modulation, norm2_modulation)
            modulations += [norm1_modulation, norm2_modulation]
        self.modulations = nn.ModuleList(modulations)

        # ImageNet normalization statistics, shaped for "B N C H W" inputs.
        self.register_buffer(
            "image_mean",
            torch.as_tensor([0.485, 0.456, 0.406]).reshape(1, 1, 3, 1, 1),
            persistent=False,
        )
        self.register_buffer(
            "image_std",
            torch.as_tensor([0.229, 0.224, 0.225]).reshape(1, 1, 3, 1, 1),
            persistent=False,
        )

    def forward(
        self,
        images: Float[Tensor, "B *N C H W"],
        modulation_cond: Optional[Float[Tensor, "B *N Cc"]],
        **kwargs,
    ) -> Float[Tensor, "B *N Ct Nt"]:
        model = self.model

        # Accept both packed ("B C H W") and multi-view ("B N C H W") inputs;
        # packed inputs get a singleton view axis that is removed again at the end.
        packed = False
        if images.ndim == 4:
            packed = True
            images = images.unsqueeze(1)
            if modulation_cond is not None:
                assert modulation_cond.ndim == 2
                modulation_cond = modulation_cond.unsqueeze(1)

        batch_size, n_input_views = images.shape[:2]
        images = (images - self.image_mean) / self.image_std

        # Fold the view axis into the batch for the backbone forward pass.
        out = model(
            rearrange(images, "B N C H W -> (B N) C H W"),
            modulation_cond=rearrange(modulation_cond, "B N Cc -> (B N) Cc")
            if modulation_cond is not None
            else None,
        )

        # Return per-patch tokens as (B, N, hidden_size, num_tokens).
        local_features = out.last_hidden_state
        local_features = local_features.permute(0, 2, 1)
        local_features = rearrange(
            local_features, "(B N) Ct Nt -> B N Ct Nt", B=batch_size
        )
        if packed:
            local_features = local_features.squeeze(1)

        return local_features

    def detokenize(self, *args, **kwargs):
        raise NotImplementedError
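

# Minimal usage sketch (not part of the original module). It assumes the sf3d
# package is importable, that BaseModule accepts a plain dict as its config
# (the threestudio-style construction pattern), and that the
# "facebook/dinov2-large" weights can be fetched from the Hugging Face hub.
if __name__ == "__main__":
    tokenizer = DINOV2SingleImageTokenizer({"modulation_cond_dim": 768})

    images = torch.rand(2, 3, 512, 512)  # packed input: B C H W in [0, 1]
    cond = torch.rand(2, 768)            # one modulation vector per image

    with torch.no_grad():
        tokens = tokenizer(images, modulation_cond=cond)

    # Packed inputs come back packed: (B, hidden_size, num_patch_tokens).
    print(tokens.shape)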