StreamingSVD

Runtime error

App Files Files Community

StreamingSVD / models /control /controlnet.py

lev1

Initial commit

8fd2f2f 3 months ago

raw

history blame

23.1 kB

	import torch
	import torch.nn as nn
	from typing import List, Optional, Union
	from models.svd.sgm.util import default
	from models.svd.sgm.modules.video_attention import SpatialVideoTransformer
	from models.svd.sgm.modules.diffusionmodules.openaimodel import *
	from models.diffusion.video_model import VideoResBlock, VideoUNet
	from einops import repeat, rearrange
	from models.svd.sgm.modules.diffusionmodules.wrappers import OpenAIWrapper


	class Merger(nn.Module):
	    """
	    Merges the controlnet latents with the conditioning embedding (encoding of control frames).

	    Both inputs arrive with batch and frame folded into the leading axis
	    (``(B F) C H W``). They are unfolded to ``B F C H W``, the conditioning
	    signal is optionally padded along the frame axis so it covers as many
	    frames as ``x``, the two are merged, and the result is folded back to
	    ``(B F) C H W``.

	    Args:
	        merge_mode: How the two tensors are combined. Only ``"addition"`` is
	            implemented.
	        input_channels: Accepted for interface compatibility; not used.
	        frame_expansion: How missing conditioning frames are filled when the
	            conditioning signal has fewer frames than ``x``:
	            ``"last_frame"`` repeats the last conditioning frame, ``"zero"``
	            pads with zeros, ``"none"`` performs no padding.
	    """

	    def __init__(self, merge_mode: str = "addition", input_channels=0, frame_expansion="last_frame") -> None:
	        super().__init__()
	        self.merge_mode = merge_mode
	        self.frame_expansion = frame_expansion

	    def forward(self, x, condition_signal, num_video_frames, num_video_frames_conditional):
	        """Merge ``condition_signal`` into ``x``.

	        Args:
	            x: Tensor of shape ``(B * num_video_frames, C, H, W)``.
	            condition_signal: Tensor of shape ``(B * F_cond, C, H, W)`` with the
	                same ``B`` as ``x``.
	            num_video_frames: Number of frames per sample in ``x``.
	            num_video_frames_conditional: Kept for interface compatibility. The
	                number of missing frames is derived from the tensors
	                themselves, so this value may be ``None``.

	        Returns:
	            Tensor of shape ``(B * num_video_frames, C, H, W)``.

	        Raises:
	            NotImplementedError: If frames have to be filled up with an unknown
	                ``frame_expansion`` mode, or if ``merge_mode`` is unknown.
	        """
	        # "(B F) C H W -> B F C H W": the batch index is the slow axis.
	        x = x.reshape(-1, num_video_frames, *x.shape[1:])
	        condition_signal = condition_signal.reshape(
	            x.shape[0], -1, *condition_signal.shape[1:])

	        missing_frames = x.shape[1] - condition_signal.shape[1]
	        if missing_frames > 0 and self.frame_expansion != "none":
	            if self.frame_expansion == "last_frame":
	                fillup_latent = condition_signal[:, -1:].repeat_interleave(
	                    missing_frames, dim=1)
	            elif self.frame_expansion == "zero":
	                # Size the padding from the actual frame deficit so that both
	                # expansion modes agree and no extra argument is required.
	                fillup_latent = torch.zeros(
	                    (x.shape[0], missing_frames, *x.shape[2:]), device=x.device, dtype=x.dtype)
	            else:
	                raise NotImplementedError(
	                    f"Frame expansion {self.frame_expansion} not implemented.")
	            condition_signal = torch.cat(
	                [condition_signal, fillup_latent], dim=1)

	        if self.merge_mode == "addition":
	            out = x + condition_signal
	        else:
	            raise NotImplementedError(
	                f"Merging mode {self.merge_mode} not implemented.")

	        # "B F C H W -> (B F) C H W"
	        return out.flatten(0, 1)


	class ControlNetConditioningEmbedding(nn.Module):
	    """
	    Small convolutional encoder for the ControlNet conditioning input.

	    Quoting from https://arxiv.org/abs/2302.05543: "Stable Diffusion uses a pre-processing method similar to VQ-GAN
	    [11] to convert the entire dataset of 512 × 512 images into smaller 64 × 64 “latent images” for stabilized
	    training. This requires ControlNets to convert image-based conditions to 64 × 64 feature space to match the
	    convolution size. We use a tiny network E(·) of four convolution layers with 4 × 4 kernels and 2 × 2 strides
	    (activated by ReLU, channels are 16, 32, 64, 128, initialized with Gaussian weights, trained jointly with the full
	    model) to encode image-space conditions ... into feature maps ..."

	    This implementation uses 3 x 3 convolutions with SiLU activations. For each
	    pair of consecutive entries in ``block_out_channels`` it stacks one
	    channel-preserving convolution and one channel-changing convolution (with
	    stride 2 when ``downsample`` is set).

	    Args:
	        conditioning_embedding_channels: Channels produced by the output conv.
	        conditioning_channels: Channels of the conditioning input.
	        block_out_channels: Channel widths of the intermediate stages.
	        downsample: Use stride 2 (instead of 1) in every channel-changing conv.
	        final_3d_conv: Only stored and reported via ``print``; no 3D
	            convolution is built in this class.
	        zero_init: Zero-initialise all parameters of the output convolution.
	        use_controlnet_mask: Widens the output convolution's input by one channel.
	            NOTE(review): ``forward`` does not append a mask channel, so
	            enabling this looks incompatible with ``forward`` as written - confirm.
	        use_normalization: Apply a LayerNorm over the channel axis after every
	            intermediate convolution.
	    """

	    def __init__(
	        self,
	        conditioning_embedding_channels: int,
	        conditioning_channels: int = 3,
	        block_out_channels: Tuple[int] = (16, 32, 96, 256),
	        downsample: bool = True,
	        final_3d_conv: bool = False,
	        zero_init: bool = True,
	        use_controlnet_mask: bool = False,
	        use_normalization: bool = False,
	    ):
	        super().__init__()

	        self.final_3d_conv = final_3d_conv
	        self.conv_in = nn.Conv2d(
	            conditioning_channels, block_out_channels[0], kernel_size=3, padding=1)
	        if final_3d_conv:
	            print("USING 3D CONV in ControlNET")

	        self.blocks = nn.ModuleList([])
	        if use_normalization:
	            self.norms = nn.ModuleList([])
	        self.use_normalization = use_normalization

	        reduce_stride = 2 if downsample else 1

	        # Walk over consecutive (in, out) channel pairs.
	        for width_in, width_out in zip(block_out_channels[:-1], block_out_channels[1:]):
	            self.blocks.append(
	                nn.Conv2d(width_in, width_in, kernel_size=3, padding=1))
	            if use_normalization:
	                self.norms.append(nn.LayerNorm((width_in)))
	            self.blocks.append(
	                nn.Conv2d(width_in, width_out, kernel_size=3, padding=1, stride=reduce_stride))
	            if use_normalization:
	                self.norms.append(nn.LayerNorm((width_out)))

	        projection = nn.Conv2d(
	            block_out_channels[-1] + int(use_controlnet_mask),
	            conditioning_embedding_channels,
	            kernel_size=3,
	            padding=1,
	        )
	        if zero_init:
	            # Start from an all-zero output so the encoder initially contributes nothing.
	            for weight in projection.parameters():
	                nn.init.zeros_(weight)
	        self.conv_out = projection

	    def forward(self, conditioning):
	        """Encode ``conditioning`` and return the output of ``conv_out``."""
	        hidden = F.silu(self.conv_in(conditioning))

	        if not self.use_normalization:
	            for conv in self.blocks:
	                hidden = F.silu(conv(hidden))
	            return self.conv_out(hidden)

	        for conv, layer_norm in zip(self.blocks, self.norms):
	            hidden = conv(hidden)
	            # LayerNorm normalises the last axis: move channels last and back.
	            hidden = layer_norm(hidden.movedim(-3, -1)).movedim(-1, -3)
	            hidden = F.silu(hidden)
	        return self.conv_out(hidden)


	class ControlNet(nn.Module):
	    """ControlNet built from the input (down) blocks and middle block of a video UNet.

	    The network embeds the timestep (and optional class/vector conditioning),
	    encodes the control frames with ``ControlNetConditioningEmbedding``, runs
	    ``x`` through the input blocks, merges the encoded control frames into the
	    output of the first input block via ``Merger``, and finally applies the
	    middle block. ``forward`` returns the activations after every input block
	    together with the middle-block activation.

	    Most constructor arguments are stored on the instance and forwarded
	    unchanged to ``VideoResBlock``, ``SpatialVideoTransformer`` and
	    ``Downsample``; see those classes for their meaning. ControlNet-specific
	    arguments:

	    Args:
	        conditioning_embedding_out_channels: Intermediate channel widths of the
	            control-frame encoder. ``None`` selects ``(16, 32, 96, 256)``.
	        condition_encoder: Name of the encoder variant; only its ``"3DConv"``
	            suffix is inspected and passed on as ``final_3d_conv``.
	        use_controlnet_mask: Forwarded to the control-frame encoder.
	        downsample_controlnet_cond: If true the control input is expected with
	            3 channels and is downsampled by the encoder; otherwise 4 input
	            channels are expected and the encoder keeps the resolution.
	        use_image_encoder_normalization: Enables LayerNorm in the encoder.
	        zero_conv_mode: Must be ``"Identity"``.
	        frame_expansion: Forwarded to ``Merger``.
	        merging_mode: Forwarded to ``Merger`` as ``merge_mode``.
	    """

	    def __init__(
	        self,
	        in_channels: int,
	        model_channels: int,
	        out_channels: int,
	        num_res_blocks: int,
	        attention_resolutions: Union[List[int], int],
	        dropout: float = 0.0,
	        channel_mult: List[int] = (1, 2, 4, 8),
	        conv_resample: bool = True,
	        dims: int = 2,
	        num_classes: Optional[Union[int, str]] = None,
	        use_checkpoint: bool = False,
	        num_heads: int = -1,
	        num_head_channels: int = -1,
	        num_heads_upsample: int = -1,
	        use_scale_shift_norm: bool = False,
	        resblock_updown: bool = False,
	        transformer_depth: Union[List[int], int] = 1,
	        transformer_depth_middle: Optional[int] = None,
	        context_dim: Optional[int] = None,
	        time_downup: bool = False,
	        time_context_dim: Optional[int] = None,
	        extra_ff_mix_layer: bool = False,
	        use_spatial_context: bool = False,
	        merge_strategy: str = "fixed",
	        merge_factor: float = 0.5,
	        spatial_transformer_attn_type: str = "softmax",
	        video_kernel_size: Union[int, List[int]] = 3,
	        use_linear_in_transformer: bool = False,
	        adm_in_channels: Optional[int] = None,
	        disable_temporal_crossattention: bool = False,
	        max_ddpm_temb_period: int = 10000,
	        conditioning_embedding_out_channels: Optional[Tuple[int, ...]] = (
	            16, 32, 96, 256),
	        condition_encoder: str = "",
	        use_controlnet_mask: bool = False,
	        downsample_controlnet_cond: bool = True,
	        use_image_encoder_normalization: bool = False,
	        zero_conv_mode: str = "Identity",
	        frame_expansion: str = "none",
	        merging_mode: str = "addition",
	    ):
	        super().__init__()
	        assert zero_conv_mode == "Identity", "Zero convolution not implemented"

	        assert context_dim is not None

	        if num_heads_upsample == -1:
	            num_heads_upsample = num_heads

	        # Exactly one of num_heads / num_head_channels may be left at -1.
	        if num_heads == -1:
	            assert num_head_channels != -1

	        if num_head_channels == -1:
	            assert num_heads != -1

	        self.in_channels = in_channels
	        self.model_channels = model_channels
	        self.out_channels = out_channels
	        if isinstance(transformer_depth, int):
	            transformer_depth = len(channel_mult) * [transformer_depth]
	        transformer_depth_middle = default(
	            transformer_depth_middle, transformer_depth[-1]
	        )

	        self.num_res_blocks = num_res_blocks
	        self.attention_resolutions = attention_resolutions
	        self.dropout = dropout
	        self.channel_mult = channel_mult
	        self.conv_resample = conv_resample
	        self.num_classes = num_classes
	        self.use_checkpoint = use_checkpoint
	        self.num_heads = num_heads
	        self.num_head_channels = num_head_channels
	        self.num_heads_upsample = num_heads_upsample
	        self.dims = dims
	        self.use_scale_shift_norm = use_scale_shift_norm
	        self.resblock_updown = resblock_updown
	        self.transformer_depth = transformer_depth
	        self.transformer_depth_middle = transformer_depth_middle
	        self.context_dim = context_dim
	        self.time_downup = time_downup
	        self.time_context_dim = time_context_dim
	        self.extra_ff_mix_layer = extra_ff_mix_layer
	        self.use_spatial_context = use_spatial_context
	        self.merge_strategy = merge_strategy
	        self.merge_factor = merge_factor
	        self.spatial_transformer_attn_type = spatial_transformer_attn_type
	        self.video_kernel_size = video_kernel_size
	        self.use_linear_in_transformer = use_linear_in_transformer
	        self.adm_in_channels = adm_in_channels
	        self.disable_temporal_crossattention = disable_temporal_crossattention
	        self.max_ddpm_temb_period = max_ddpm_temb_period

	        time_embed_dim = model_channels * 4
	        self.time_embed = nn.Sequential(
	            linear(model_channels, time_embed_dim),
	            nn.SiLU(),
	            linear(time_embed_dim, time_embed_dim),
	        )

	        if self.num_classes is not None:
	            if isinstance(self.num_classes, int):
	                self.label_emb = nn.Embedding(num_classes, time_embed_dim)
	            elif self.num_classes == "continuous":
	                print("setting up linear c_adm embedding layer")
	                self.label_emb = nn.Linear(1, time_embed_dim)
	            elif self.num_classes == "timestep":
	                self.label_emb = nn.Sequential(
	                    Timestep(model_channels),
	                    nn.Sequential(
	                        linear(model_channels, time_embed_dim),
	                        nn.SiLU(),
	                        linear(time_embed_dim, time_embed_dim),
	                    ),
	                )
	            elif self.num_classes == "sequential":
	                assert adm_in_channels is not None
	                self.label_emb = nn.Sequential(
	                    nn.Sequential(
	                        linear(adm_in_channels, time_embed_dim),
	                        nn.SiLU(),
	                        linear(time_embed_dim, time_embed_dim),
	                    )
	                )
	            else:
	                raise ValueError(
	                    f"Unsupported num_classes: {self.num_classes!r}")

	        self.input_blocks = nn.ModuleList(
	            [
	                TimestepEmbedSequential(
	                    conv_nd(dims, in_channels, model_channels, 3, padding=1)
	                )
	            ]
	        )
	        self._feature_size = model_channels

	        def resolve_heads(ch):
	            # Return (num_heads, dim_head) for an attention layer of width ``ch``.
	            if num_head_channels == -1:
	                return num_heads, ch // num_heads
	            return ch // num_head_channels, num_head_channels

	        def make_attention(ch, depth):
	            # Spatio-temporal transformer of width ``ch`` using the shared settings.
	            heads, dim_head = resolve_heads(ch)
	            return SpatialVideoTransformer(
	                ch,
	                heads,
	                dim_head,
	                depth=depth,
	                context_dim=context_dim,
	                time_context_dim=time_context_dim,
	                dropout=dropout,
	                ff_in=extra_ff_mix_layer,
	                use_spatial_context=use_spatial_context,
	                merge_strategy=merge_strategy,
	                merge_factor=merge_factor,
	                checkpoint=use_checkpoint,
	                use_linear=use_linear_in_transformer,
	                attn_mode=spatial_transformer_attn_type,
	                disable_self_attn=False,
	                disable_temporal_crossattention=disable_temporal_crossattention,
	                max_time_embed_period=max_ddpm_temb_period,
	            )

	        def make_resblock(ch, out_ch, down=False):
	            # Video residual block mapping ``ch`` -> ``out_ch`` channels.
	            return VideoResBlock(
	                merge_factor=merge_factor,
	                merge_strategy=merge_strategy,
	                video_kernel_size=video_kernel_size,
	                channels=ch,
	                emb_channels=time_embed_dim,
	                dropout=dropout,
	                out_channels=out_ch,
	                dims=dims,
	                use_checkpoint=use_checkpoint,
	                use_scale_shift_norm=use_scale_shift_norm,
	                down=down,
	                up=False,
	            )

	        ch = model_channels
	        ds = 1  # current downsampling factor, compared against attention_resolutions
	        last_level = len(channel_mult) - 1
	        for level, mult in enumerate(channel_mult):
	            for _ in range(num_res_blocks):
	                layers = [make_resblock(ch, mult * model_channels)]
	                ch = mult * model_channels
	                if ds in attention_resolutions:
	                    layers.append(
	                        make_attention(ch, transformer_depth[level]))
	                self.input_blocks.append(TimestepEmbedSequential(*layers))
	                self._feature_size += ch
	            if level != last_level:
	                # Every level except the last ends with a downsampling stage
	                # that keeps the channel count.
	                ds *= 2
	                if resblock_updown:
	                    downsampler = make_resblock(ch, ch, down=True)
	                else:
	                    downsampler = Downsample(
	                        ch,
	                        conv_resample,
	                        dims=dims,
	                        out_channels=ch,
	                        third_down=time_downup,
	                    )
	                self.input_blocks.append(
	                    TimestepEmbedSequential(downsampler))
	                self._feature_size += ch

	        self.middle_block = TimestepEmbedSequential(
	            make_resblock(ch, None),
	            make_attention(ch, transformer_depth_middle),
	            make_resblock(ch, None),
	        )
	        self._feature_size += ch

	        self.merger = Merger(
	            merge_mode=merging_mode, input_channels=model_channels, frame_expansion=frame_expansion)

	        # ``from_unet`` passes None when the caller does not choose the widths.
	        if conditioning_embedding_out_channels is None:
	            conditioning_embedding_out_channels = (16, 32, 96, 256)

	        # The encoded control frames are merged into the output of the first
	        # input block, which has ``model_channels`` channels.
	        self.controlnet_cond_embedding = ControlNetConditioningEmbedding(
	            conditioning_embedding_channels=model_channels,
	            conditioning_channels=3 if downsample_controlnet_cond else 4,
	            block_out_channels=conditioning_embedding_out_channels,
	            downsample=downsample_controlnet_cond,
	            final_3d_conv=condition_encoder.endswith("3DConv"),
	            use_controlnet_mask=use_controlnet_mask,
	            use_normalization=use_image_encoder_normalization,
	        )

	    def forward(
	        self,
	        x: torch.Tensor,
	        timesteps: torch.Tensor,
	        controlnet_cond: torch.Tensor,
	        context: Optional[torch.Tensor] = None,
	        y: Optional[torch.Tensor] = None,
	        time_context: Optional[torch.Tensor] = None,
	        num_video_frames: Optional[int] = None,
	        num_video_frames_conditional: Optional[int] = None,
	        image_only_indicator: Optional[torch.Tensor] = None,
	    ):
	        """Run the ControlNet and collect its residual features.

	        Args:
	            x: Input latents; batch and frames are folded into the first axis.
	            timesteps: Diffusion timesteps used for the time embedding.
	            controlnet_cond: Control frames, encoded by
	                ``controlnet_cond_embedding`` before merging.
	            context: Conditioning passed to every block as ``context``.
	            y: Additional conditioning for ``label_emb``; must be given if and
	                only if ``num_classes`` was set.
	            time_context: Passed to every block as ``time_context``.
	            num_video_frames: Frames per sample in ``x``.
	            num_video_frames_conditional: Frames per sample in
	                ``controlnet_cond``; forwarded to the merger.
	            image_only_indicator: Passed to every block unchanged.

	        Returns:
	            Tuple ``(down_block_res_samples, mid_block_res_sample)``: the list
	            of activations after every input block (the first one taken after
	            merging the control signal) and the middle-block activation.
	        """
	        assert (y is not None) == (
	            self.num_classes is not None
	        ), "must specify y if and only if the model is class-conditional -> no, relax this TODO"
	        t_emb = timestep_embedding(
	            timesteps, self.model_channels, repeat_only=False).to(x.dtype)

	        emb = self.time_embed(t_emb)

	        # TODO restrict y to [:self.num_frames] (conditional frames)

	        if self.num_classes is not None:
	            assert y.shape[0] == x.shape[0]
	            emb = emb + self.label_emb(y)

	        controlnet_cond = self.controlnet_cond_embedding(controlnet_cond)

	        down_block_res_samples = []
	        h = x
	        for idx, module in enumerate(self.input_blocks):
	            h = module(
	                h,
	                emb,
	                context=context,
	                image_only_indicator=image_only_indicator,
	                time_context=time_context,
	                num_video_frames=num_video_frames,
	            )
	            if idx == 0:
	                # Inject the control signal right after the input convolution.
	                h = self.merger(h, controlnet_cond, num_video_frames=num_video_frames,
	                                num_video_frames_conditional=num_video_frames_conditional)

	            down_block_res_samples.append(h)

	        mid_block_res_sample = self.middle_block(
	            h,
	            emb,
	            context=context,
	            image_only_indicator=image_only_indicator,
	            time_context=time_context,
	            num_video_frames=num_video_frames,
	        )

	        return (down_block_res_samples, mid_block_res_sample)

	    @classmethod
	    def from_unet(cls,
	                  model: OpenAIWrapper,
	                  merging_mode: str = "addition",
	                  zero_conv_mode: str = "Identity",
	                  frame_expansion: str = "none",
	                  downsample_controlnet_cond: bool = True,
	                  use_image_encoder_normalization: bool = False,
	                  use_controlnet_mask: bool = False,
	                  condition_encoder: str = "",
	                  conditioning_embedding_out_channels: Optional[List[int]] = None,

	                  ):
	        """Create a ControlNet with the hyperparameters of ``model.diffusion_model``.

	        Only constructor arguments are taken from the UNet; this method does
	        not copy any weights. ``conditioning_embedding_out_channels=None``
	        selects the constructor's default encoder widths.
	        """
	        unet: VideoUNet = model.diffusion_model

	        controlnet = cls(in_channels=unet.in_channels,
	                         model_channels=unet.model_channels,
	                         out_channels=unet.out_channels,
	                         num_res_blocks=unet.num_res_blocks,
	                         attention_resolutions=unet.attention_resolutions,
	                         dropout=unet.dropout,
	                         channel_mult=unet.channel_mult,
	                         conv_resample=unet.conv_resample,
	                         dims=unet.dims,
	                         num_classes=unet.num_classes,
	                         use_checkpoint=unet.use_checkpoint,
	                         num_heads=unet.num_heads,
	                         num_head_channels=unet.num_head_channels,
	                         num_heads_upsample=unet.num_heads_upsample,
	                         use_scale_shift_norm=unet.use_scale_shift_norm,
	                         resblock_updown=unet.resblock_updown,
	                         transformer_depth=unet.transformer_depth,
	                         transformer_depth_middle=unet.transformer_depth_middle,
	                         context_dim=unet.context_dim,
	                         time_downup=unet.time_downup,
	                         time_context_dim=unet.time_context_dim,
	                         extra_ff_mix_layer=unet.extra_ff_mix_layer,
	                         use_spatial_context=unet.use_spatial_context,
	                         merge_strategy=unet.merge_strategy,
	                         merge_factor=unet.merge_factor,
	                         spatial_transformer_attn_type=unet.spatial_transformer_attn_type,
	                         video_kernel_size=unet.video_kernel_size,
	                         use_linear_in_transformer=unet.use_linear_in_transformer,
	                         adm_in_channels=unet.adm_in_channels,
	                         disable_temporal_crossattention=unet.disable_temporal_crossattention,
	                         max_ddpm_temb_period=unet.max_ddpm_temb_period,  # up to here unet params
	                         merging_mode=merging_mode,
	                         zero_conv_mode=zero_conv_mode,
	                         frame_expansion=frame_expansion,
	                         downsample_controlnet_cond=downsample_controlnet_cond,
	                         use_image_encoder_normalization=use_image_encoder_normalization,
	                         use_controlnet_mask=use_controlnet_mask,
	                         condition_encoder=condition_encoder,
	                         conditioning_embedding_out_channels=conditioning_embedding_out_channels,
	                         )

	        return controlnet


	def zero_module(module, reset=True):
	    """Optionally zero every parameter of ``module`` in place and return it.

	    Args:
	        module: Module whose parameters may be reset.
	        reset: When false the module is returned untouched.

	    Returns:
	        The same ``module`` object that was passed in.
	    """
	    if not reset:
	        return module
	    for param in module.parameters():
	        nn.init.zeros_(param)
	    return module