from dataclasses import dataclass
import math
import importlib
from typing import Optional

import torch
import torch.nn as nn

import craftsman
from craftsman.utils.base import BaseModule
from craftsman.models.denoisers.utils import *


@craftsman.register("pixart-denoiser")
class PixArtDinoDenoiser(BaseModule):
    @dataclass
    class Config(BaseModule.Config):
        pretrained_model_name_or_path: Optional[str] = None
        input_channels: int = 32
        output_channels: int = 32
        n_ctx: int = 512
        width: int = 768
        layers: int = 28
        heads: int = 16
        context_dim: int = 1024
        n_views: int = 1
        context_ln: bool = True
        skip_ln: bool = False
        init_scale: float = 0.25
        use_checkpoint: bool = False
        drop_path: float = 0.0
        variance_type: str = ""
        img_pos_embed: bool = False
        clip_weight: float = 1.0
        dino_weight: float = 1.0
        dit_block: str = ""

    cfg: Config

    def configure(self) -> None:
        super().configure()

        # timestep embedding
        self.time_embed = TimestepEmbedder(self.cfg.width)

        # x embedding
        self.x_embed = nn.Linear(self.cfg.input_channels, self.cfg.width, bias=True)

        # context embedding: separate projections for the CLIP and DINO token streams
        if self.cfg.context_ln:
            self.clip_embed = nn.Sequential(
                nn.LayerNorm(self.cfg.context_dim),
                nn.Linear(self.cfg.context_dim, self.cfg.width),
            )
            self.dino_embed = nn.Sequential(
                nn.LayerNorm(self.cfg.context_dim),
                nn.Linear(self.cfg.context_dim, self.cfg.width),
            )
        else:
            self.clip_embed = nn.Linear(self.cfg.context_dim, self.cfg.width)
            self.dino_embed = nn.Linear(self.cfg.context_dim, self.cfg.width)

        init_scale = self.cfg.init_scale * math.sqrt(1.0 / self.cfg.width)
        # stochastic depth: drop-path rate increases linearly with block depth
        drop_path = [x.item() for x in torch.linspace(0, self.cfg.drop_path, self.cfg.layers)]
        ditblock = getattr(importlib.import_module("craftsman.models.denoisers.utils"), self.cfg.dit_block)
        self.blocks = nn.ModuleList([
            ditblock(
                width=self.cfg.width,
                heads=self.cfg.heads,
                init_scale=init_scale,
                qkv_bias=True,  # assumed value: the original passed the drop_path float here, which looks like a typo
                use_flash=True,
                drop_path=drop_path[i],
            )
            for i in range(self.cfg.layers)
        ])

        # adaLN-style timestep modulation: 6 parameter sets per block
        self.t_block = nn.Sequential(
            nn.SiLU(),
            nn.Linear(self.cfg.width, 6 * self.cfg.width, bias=True),
        )

        # final layer; learned-variance models predict twice as many channels
        if self.cfg.variance_type.upper() in ["LEARNED", "LEARNED_RANGE"]:
            self.output_channels = self.cfg.output_channels * 2
        else:
            self.output_channels = self.cfg.output_channels
        self.final_layer = T2IFinalLayer(self.cfg.width, self.output_channels)

        self.identity_initialize()

        if self.cfg.pretrained_model_name_or_path:
            print(f"Loading pretrained model from {self.cfg.pretrained_model_name_or_path}")
            ckpt = torch.load(self.cfg.pretrained_model_name_or_path, map_location="cpu")["state_dict"]
            # keep only the denoiser weights and strip their prefix
            self.denoiser_ckpt = {}
            for k, v in ckpt.items():
                if k.startswith("denoiser_model."):
                    self.denoiser_ckpt[k.replace("denoiser_model.", "")] = v
            self.load_state_dict(self.denoiser_ckpt, strict=False)

    def forward_with_dpmsolver(self, model_input, timestep, context):
        """
        The DPM-Solver does not need the variance prediction.
        """
        # https://github.com/openai/glide-text2im/blob/main/notebooks/text2im.ipynb
        model_out = self.forward(model_input, timestep, context)
        if self.cfg.variance_type.upper() in ["LEARNED", "LEARNED_RANGE"]:
            # drop the variance half of the output channels
            return model_out.chunk(2, dim=-1)[0]
        else:
            return model_out

    def identity_initialize(self):
        # zero-init the output projections so every residual block starts as an identity map
        for block in self.blocks:
            nn.init.constant_(block.attn.c_proj.weight, 0)
            nn.init.constant_(block.attn.c_proj.bias, 0)
            nn.init.constant_(block.cross_attn.c_proj.weight, 0)
            nn.init.constant_(block.cross_attn.c_proj.bias, 0)
            nn.init.constant_(block.mlp.c_proj.weight, 0)
            nn.init.constant_(block.mlp.c_proj.bias, 0)

    def forward(self,
                model_input: torch.FloatTensor,
                timestep: torch.LongTensor,
                context: torch.FloatTensor):
        r"""
        Args:
            model_input (torch.FloatTensor): [bs, n_data, c]
            timestep (torch.LongTensor): [bs,]
            context (torch.FloatTensor): [bs, context_tokens, c]

        Returns:
            sample (torch.FloatTensor): [bs, n_data, c]
        """
        B, n_data, _ = model_input.shape

        # 1. time embedding
        t_emb = self.time_embed(timestep)

        # 2. condition projection: split the packed context into its CLIP and DINO
        #    halves per view, project each, and fuse them with a weighted sum
        context = context.view(B, self.cfg.n_views, -1, self.cfg.context_dim)
        clip_feat, dino_feat = context.chunk(2, dim=2)
        clip_cond = self.clip_embed(clip_feat.contiguous().view(B, -1, self.cfg.context_dim))
        dino_cond = self.dino_embed(dino_feat.contiguous().view(B, -1, self.cfg.context_dim))
        visual_cond = self.cfg.clip_weight * clip_cond + self.cfg.dino_weight * dino_cond

        # 3. denoiser blocks, modulated by the timestep embedding
        latent = self.x_embed(model_input)
        t0 = self.t_block(t_emb).unsqueeze(dim=1)
        for block in self.blocks:
            latent = auto_grad_checkpoint(block, latent, visual_cond, t0)

        latent = self.final_layer(latent, t_emb)

        return latent
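

if __name__ == "__main__":
    # Minimal smoke-test sketch, not part of the original module. It illustrates
    # the layout forward() expects for the conditioning tensor: for each view,
    # the CLIP tokens are followed by the DINO tokens along the token axis, so
    # the view(...).chunk(2, dim=2) in forward() can split them back apart.
    # The config values and the "DiTBlock" class name are assumptions, as is
    # constructing a BaseModule directly from a plain dict (common in
    # threestudio-style codebases).
    B, n_views, n_tokens, context_dim = 2, 1, 257, 1024

    clip_feat = torch.randn(B, n_views, n_tokens, context_dim)
    dino_feat = torch.randn(B, n_views, n_tokens, context_dim)
    # [B, n_views * 2 * n_tokens, context_dim]: per-view CLIP tokens, then DINO tokens
    context = torch.cat([clip_feat, dino_feat], dim=2).view(B, -1, context_dim)

    model = PixArtDinoDenoiser({
        "input_channels": 32,
        "output_channels": 32,
        "width": 768,
        "layers": 2,               # shrunk for a quick test
        "heads": 16,
        "context_dim": context_dim,
        "n_views": n_views,
        "dit_block": "DiTBlock",   # assumed: must name a block class in denoisers.utils
    })

    x = torch.randn(B, 512, 32)        # [bs, n_data, input_channels]
    t = torch.randint(0, 1000, (B,))   # diffusion timesteps
    out = model(x, t, context)
    print(out.shape)                   # expected: torch.Size([2, 512, 32])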