# StreamingSVD / config.yaml
# pytorch_lightning==2.2.2
seed_everything: 33
trainer:
  accelerator: auto
  strategy: auto
  devices: '1'
  num_nodes: 1
  precision: 16-mixed
  logger: false
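# The trainer block above configures a single-device, 16-bit mixed-precision
# run with logging disabled, i.e. an inference-oriented Lightning setup.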
model:
  class_path: diffusion_trainer.streaming_svd.StreamingSVD
  init_args:
    vfi:
      class_path: modules.params.vfi.VFIParams
      init_args:
        ckpt_path_local: checkpoint/VFI/ours.pkl
        ckpt_path_global: https://drive.google.com/file/d/1XCNoyhA1RX3m8W-XJK8H8inH47l36kxP/view?usp=sharing
    i2v_enhance:
      class_path: modules.params.i2v_enhance.I2VEnhanceParams
      init_args:
        ckpt_path_local: checkpoint/i2v_enhance/
        ckpt_path_global: ali-vilab/i2vgen-xl
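    # vfi and i2v_enhance are checkpoint descriptors: ckpt_path_local is tried
    # first, with ckpt_path_global presumably serving as the download fallback.
    # VFI most likely stands for video frame interpolation; i2v_enhance points
    # at I2VGen-XL (ali-vilab/i2vgen-xl) for image-to-video enhancement.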
    module_loader:
      class_path: modules.loader.module_loader.GenericModuleLoader
      init_args:
        pipeline_repo: stabilityai/stable-video-diffusion-img2vid-xt
        pipeline_obj: streamingt2v_pipeline
        set_prediction_type: ''
        module_names:
        - network_config
        - model
        - controlnet
        - denoiser
        - conditioner
        - first_stage_model
        - sampler
        - svd_pipeline
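        # Each module named above is resolved against the matching entry in
        # module_config below when the pipeline is assembled.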
        module_config:
          controlnet:
            class_path: modules.loader.module_loader_config.ModuleLoaderConfig
            init_args:
              loader_cls_path: models.control.controlnet.ControlNet
              cls_func: from_unet
              cls_func_fast_dev_run: ''
              kwargs_diffusers: null
              model_params:
                merging_mode: addition
                zero_conv_mode: Identity
                frame_expansion: none
                downsample_controlnet_cond: true
                use_image_encoder_normalization: true
                use_controlnet_mask: false
                condition_encoder: ''
                conditioning_embedding_out_channels:
                - 32
                - 96
                - 256
                - 512
              kwargs_diff_trainer_params: null
              args: []
              dependent_modules:
                model: model
              dependent_modules_cloned: null
              state_dict_path: ''
              strict_loading: true
              state_dict_filters: []
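          # The ControlNet is derived from the UNet via ControlNet.from_unet;
          # dependent_modules wires in the module named 'model' for that call.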
          network_config:
            class_path: models.diffusion.video_model.VideoUNet
            init_args:
              in_channels: 8
              model_channels: 320
              out_channels: 4
              num_res_blocks: 2
              num_conditional_frames: null
              attention_resolutions:
              - 4
              - 2
              - 1
              dropout: 0.0
              channel_mult:
              - 1
              - 2
              - 4
              - 4
              conv_resample: true
              dims: 2
              num_classes: sequential
              use_checkpoint: false
              num_heads: -1
              num_head_channels: 64
              num_heads_upsample: -1
              use_scale_shift_norm: false
              resblock_updown: false
              transformer_depth: 1
              transformer_depth_middle: null
              context_dim: 1024
              time_downup: false
              time_context_dim: null
              extra_ff_mix_layer: true
              use_spatial_context: true
              merge_strategy: learned_with_images
              merge_factor: 0.5
              spatial_transformer_attn_type: softmax-xformers
              video_kernel_size:
              - 3
              - 1
              - 1
              use_linear_in_transformer: true
              adm_in_channels: 768
              disable_temporal_crossattention: false
              max_ddpm_temb_period: 10000
              merging_mode: attention_cross_attention
              controlnet_mode: true
              use_apm: false
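          # VideoUNet takes 8 input channels; in SVD this is typically the
          # 4-channel noisy latent concatenated with the 4-channel VAE latent
          # of the conditioning frame.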
          model:
            class_path: modules.loader.module_loader_config.ModuleLoaderConfig
            init_args:
              loader_cls_path: models.svd.sgm.modules.diffusionmodules.wrappers.OpenAIWrapper
              cls_func: ''
              cls_func_fast_dev_run: ''
              kwargs_diffusers:
                compile_model: false
              model_params: null
              model_params_fast_dev_run: null
              kwargs_diff_trainer_params: null
              args: []
              dependent_modules:
                diffusion_model: network_config
              dependent_modules_cloned: null
              state_dict_path: ''
              strict_loading: true
              state_dict_filters: []
          denoiser:
            class_path: models.svd.sgm.modules.diffusionmodules.denoiser.Denoiser
            init_args:
              scaling_config:
                target: models.svd.sgm.modules.diffusionmodules.denoiser_scaling.VScalingWithEDMcNoise
          sampler:
            class_path: models.svd.sgm.modules.diffusionmodules.sampling.EulerEDMSampler
            init_args:
              s_churn: 0.0
              s_tmin: 0.0
              s_tmax: .inf
              s_noise: 1.0
              discretization_config:
                target: models.diffusion.discretizer.AlignYourSteps
                params:
                  sigma_max: 700.0
              num_steps: 30
              guider_config:
                target: models.svd.sgm.modules.diffusionmodules.guiders.LinearPredictionGuider
                params:
                  max_scale: 3.0
                  min_scale: 1.5
                  num_frames: 25
              verbose: false
              device: cuda
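          # 30 Euler-EDM sampling steps on an Align-Your-Steps noise schedule;
          # the LinearPredictionGuider ramps classifier-free guidance linearly
          # from min_scale 1.5 to max_scale 3.0 across the 25 frames.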
          conditioner:
            class_path: models.svd.sgm.modules.GeneralConditioner
            init_args:
              emb_models:
              - is_trainable: false
                input_key: cond_frames_without_noise
                target: models.svd.sgm.modules.encoders.modules.FrozenOpenCLIPImagePredictionEmbedder
                params:
                  n_cond_frames: 1
                  n_copies: 1
                  open_clip_embedding_config:
                    target: models.svd.sgm.modules.encoders.modules.FrozenOpenCLIPImageEmbedder
                    params:
                      freeze: true
              - input_key: fps_id
                is_trainable: false
                target: models.svd.sgm.modules.encoders.modules.ConcatTimestepEmbedderND
                params:
                  outdim: 256
              - input_key: motion_bucket_id
                is_trainable: false
                target: models.svd.sgm.modules.encoders.modules.ConcatTimestepEmbedderND
                params:
                  outdim: 256
              - input_key: cond_frames
                is_trainable: false
                target: models.svd.sgm.modules.encoders.modules.VideoPredictionEmbedderWithEncoder
                params:
                  disable_encoder_autocast: true
                  n_cond_frames: 1
                  n_copies: 1
                  is_ae: true
                  encoder_config:
                    target: models.svd.sgm.models.autoencoder.AutoencoderKLModeOnly
                    params:
                      embed_dim: 4
                      monitor: val/rec_loss
                      ddconfig:
                        attn_type: vanilla-xformers
                        double_z: true
                        z_channels: 4
                        resolution: 256
                        in_channels: 3
                        out_ch: 3
                        ch: 128
                        ch_mult:
                        - 1
                        - 2
                        - 4
                        - 4
                        num_res_blocks: 2
                        attn_resolutions: []
                        dropout: 0.0
                      lossconfig:
                        target: torch.nn.Identity
              - input_key: cond_aug
                is_trainable: false
                target: models.svd.sgm.modules.encoders.modules.ConcatTimestepEmbedderND
                params:
                  outdim: 256
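          # Standard SVD conditioning: a CLIP image embedding of the anchor
          # frame, fps_id and motion_bucket_id timestep embeddings, the
          # VAE-encoded conditioning frame, and the noise-augmentation level
          # cond_aug.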
          first_stage_model:
            class_path: models.svd.sgm.AutoencodingEngine
            init_args:
              encoder_config:
                target: models.svd.sgm.modules.diffusionmodules.model.Encoder
                params:
                  attn_type: vanilla
                  double_z: true
                  z_channels: 4
                  resolution: 256
                  in_channels: 3
                  out_ch: 3
                  ch: 128
                  ch_mult:
                  - 1
                  - 2
                  - 4
                  - 4
                  num_res_blocks: 2
                  attn_resolutions: []
                  dropout: 0.0
              decoder_config:
                target: models.svd.sgm.modules.autoencoding.temporal_ae.VideoDecoder
                params:
                  attn_type: vanilla
                  double_z: true
                  z_channels: 4
                  resolution: 256
                  in_channels: 3
                  out_ch: 3
                  ch: 128
                  ch_mult:
                  - 1
                  - 2
                  - 4
                  - 4
                  num_res_blocks: 2
                  attn_resolutions: []
                  dropout: 0.0
                  video_kernel_size:
                  - 3
                  - 1
                  - 1
              loss_config:
                target: torch.nn.Identity
              regularizer_config:
                target: models.svd.sgm.modules.autoencoding.regularizers.DiagonalGaussianRegularizer
              optimizer_config: null
              lr_g_factor: 1.0
              trainable_ae_params: null
              ae_optimizer_args: null
              trainable_disc_params: null
              disc_optimizer_args: null
              disc_start_iter: 0
              diff_boost_factor: 3.0
              ckpt_engine: null
              ckpt_path: null
              additional_decode_keys: null
              ema_decay: null
              monitor: null
              input_key: jpg
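          # Standard SD-style VAE encoder paired with SVD's temporal
          # VideoDecoder for temporally consistent frame decoding.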
          svd_pipeline:
            class_path: modules.loader.module_loader_config.ModuleLoaderConfig
            init_args:
              loader_cls_path: diffusers.StableVideoDiffusionPipeline
              cls_func: from_pretrained
              cls_func_fast_dev_run: ''
              kwargs_diffusers:
                torch_dtype: torch.float16
                variant: fp16
                use_safetensors: true
              model_params: null
              model_params_fast_dev_run: null
              kwargs_diff_trainer_params: null
              args:
              - stabilityai/stable-video-diffusion-img2vid-xt
              dependent_modules: null
              dependent_modules_cloned: null
              state_dict_path: ''
              strict_loading: true
              state_dict_filters: []
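          # svd_pipeline loads the complete diffusers SVD-XT pipeline
          # (fp16, safetensors) alongside the individually assembled modules
          # above.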
        root_cls: null
    diff_trainer_params:
      class_path: modules.params.diffusion_trainer.params_streaming_diff_trainer.DiffusionTrainerParams
      init_args:
        scale_factor: 0.18215
        streamingsvd_ckpt:
          class_path: modules.params.diffusion_trainer.params_streaming_diff_trainer.CheckpointDescriptor
          init_args:
            ckpt_path_local: checkpoint/StreamingSVD/model.safetensors
            ckpt_path_global: PAIR/StreamingSVD/resolve/main/model.safetensors
        disable_first_stage_autocast: true
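    # scale_factor 0.18215 is the usual Stable Diffusion latent scaling
    # constant; ckpt_path_global presumably serves as the Hugging Face Hub
    # download fallback when the local StreamingSVD checkpoint is missing.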
    inference_params:
      class_path: modules.params.diffusion.inference_params.T2VInferenceParams
      init_args:
        n_autoregressive_generations: 2 # Number of autoregressive generation steps for StreamingSVD
        num_conditional_frames: 7 # Is this used?
        anchor_frames: '6' # Use frame (value + 1) as the CLIP-conditioning anchor for StreamingSVD
        reset_seed_per_generation: true # If true, the seed is reset before every generation
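
# A LightningCLI config in this format is typically consumed by the repo's
# entry-point script, along the lines of (hypothetical invocation; the actual
# script name may differ):
#
#   python main.py --config config.yaml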