---
# pytorch_lightning==2.2.2
seed_everything: 33
trainer:
  accelerator: auto
  strategy: auto
  devices: '1'
  num_nodes: 1
  precision: 16-mixed
  logger: false
model:
  class_path: diffusion_trainer.streaming_svd.StreamingSVD
  init_args:
    vfi:
      class_path: modules.params.vfi.VFIParams
      init_args:
        ckpt_path_local: checkpoint/VFI/ours.pkl
        ckpt_path_global: https://drive.google.com/file/d/1XCNoyhA1RX3m8W-XJK8H8inH47l36kxP/view?usp=sharing
    i2v_enhance:
      class_path: modules.params.i2v_enhance.I2VEnhanceParams
      init_args:
        ckpt_path_local: checkpoint/i2v_enhance/
        ckpt_path_global: ali-vilab/i2vgen-xl
    module_loader:
      class_path: modules.loader.module_loader.GenericModuleLoader
      init_args:
        pipeline_repo: stabilityai/stable-video-diffusion-img2vid-xt
        pipeline_obj: streamingt2v_pipeline
        set_prediction_type: ''
        module_names:
        - network_config
        - model
        - controlnet
        - denoiser
        - conditioner
        - first_stage_model
        - sampler
        - svd_pipeline
        module_config:
          controlnet:
            class_path: modules.loader.module_loader_config.ModuleLoaderConfig
            init_args:
              loader_cls_path: models.control.controlnet.ControlNet
              cls_func: from_unet
              cls_func_fast_dev_run: ''
              kwargs_diffusers: null
              model_params:
                merging_mode: addition
                zero_conv_mode: Identity
                frame_expansion: none
                downsample_controlnet_cond: true
                use_image_encoder_normalization: true
                use_controlnet_mask: false
                condition_encoder: ''
                conditioning_embedding_out_channels:
                - 32
                - 96
                - 256
                - 512
              kwargs_diff_trainer_params: null
              args: []
              dependent_modules:
                model: model
              dependent_modules_cloned: null
              state_dict_path: ''
              strict_loading: true
              state_dict_filters: []
          network_config:
            class_path: models.diffusion.video_model.VideoUNet
            init_args:
              in_channels: 8
              model_channels: 320
              out_channels: 4
              num_res_blocks: 2
              num_conditional_frames: null
              attention_resolutions:
              - 4
              - 2
              - 1
              dropout: 0.0
              channel_mult:
              - 1
              - 2
              - 4
              - 4
              conv_resample: true
              dims: 2
              num_classes: sequential
              use_checkpoint: false
              num_heads: -1
              num_head_channels: 64
              num_heads_upsample: -1
              use_scale_shift_norm: false
              resblock_updown: false
              transformer_depth: 1
              transformer_depth_middle: null
              context_dim: 1024
              time_downup: false
              time_context_dim: null
              extra_ff_mix_layer: true
              use_spatial_context: true
              merge_strategy: learned_with_images
              merge_factor: 0.5
              spatial_transformer_attn_type: softmax-xformers
              video_kernel_size:
              - 3
              - 1
              - 1
              use_linear_in_transformer: true
              adm_in_channels: 768
              disable_temporal_crossattention: false
              max_ddpm_temb_period: 10000
              merging_mode: attention_cross_attention
              controlnet_mode: true
              use_apm: false
          model:
            class_path: modules.loader.module_loader_config.ModuleLoaderConfig
            init_args:
              loader_cls_path: models.svd.sgm.modules.diffusionmodules.wrappers.OpenAIWrapper
              cls_func: ''
              cls_func_fast_dev_run: ''
              kwargs_diffusers:
                compile_model: false
              model_params: null
              model_params_fast_dev_run: null
              kwargs_diff_trainer_params: null
              args: []
              dependent_modules:
                diffusion_model: network_config
              dependent_modules_cloned: null
              state_dict_path: ''
              strict_loading: true
              state_dict_filters: []
          denoiser:
            class_path: models.svd.sgm.modules.diffusionmodules.denoiser.Denoiser
            init_args:
              scaling_config:
                target: models.svd.sgm.modules.diffusionmodules.denoiser_scaling.VScalingWithEDMcNoise
          sampler:
            class_path: models.svd.sgm.modules.diffusionmodules.sampling.EulerEDMSampler
            init_args:
              s_churn: 0.0
              s_tmin: 0.0
              s_tmax: .inf
              s_noise: 1.0
              discretization_config:
                target: models.diffusion.discretizer.AlignYourSteps
                params:
                  sigma_max: 700.0
              num_steps: 30
              guider_config:
                target: models.svd.sgm.modules.diffusionmodules.guiders.LinearPredictionGuider
                params:
                  max_scale: 3.0
                  min_scale: 1.5
                  num_frames: 25
              verbose: false
              device: cuda
          conditioner:
            class_path: models.svd.sgm.modules.GeneralConditioner
            init_args:
              emb_models:
              - is_trainable: false
                input_key: cond_frames_without_noise
                target: models.svd.sgm.modules.encoders.modules.FrozenOpenCLIPImagePredictionEmbedder
                params:
                  n_cond_frames: 1
                  n_copies: 1
                  open_clip_embedding_config:
                    target: models.svd.sgm.modules.encoders.modules.FrozenOpenCLIPImageEmbedder
                    params:
                      freeze: true
              - input_key: fps_id
                is_trainable: false
                target: models.svd.sgm.modules.encoders.modules.ConcatTimestepEmbedderND
                params:
                  outdim: 256
              - input_key: motion_bucket_id
                is_trainable: false
                target: models.svd.sgm.modules.encoders.modules.ConcatTimestepEmbedderND
                params:
                  outdim: 256
              - input_key: cond_frames
                is_trainable: false
                target: models.svd.sgm.modules.encoders.modules.VideoPredictionEmbedderWithEncoder
                params:
                  disable_encoder_autocast: true
                  n_cond_frames: 1
                  n_copies: 1
                  is_ae: true
                  encoder_config:
                    target: models.svd.sgm.models.autoencoder.AutoencoderKLModeOnly
                    params:
                      embed_dim: 4
                      monitor: val/rec_loss
                      ddconfig:
                        attn_type: vanilla-xformers
                        double_z: true
                        z_channels: 4
                        resolution: 256
                        in_channels: 3
                        out_ch: 3
                        ch: 128
                        ch_mult:
                        - 1
                        - 2
                        - 4
                        - 4
                        num_res_blocks: 2
                        attn_resolutions: []
                        dropout: 0.0
                      lossconfig:
                        target: torch.nn.Identity
              - input_key: cond_aug
                is_trainable: false
                target: models.svd.sgm.modules.encoders.modules.ConcatTimestepEmbedderND
                params:
                  outdim: 256
          first_stage_model:
            class_path: models.svd.sgm.AutoencodingEngine
            init_args:
              encoder_config:
                target: models.svd.sgm.modules.diffusionmodules.model.Encoder
                params:
                  attn_type: vanilla
                  double_z: true
                  z_channels: 4
                  resolution: 256
                  in_channels: 3
                  out_ch: 3
                  ch: 128
                  ch_mult:
                  - 1
                  - 2
                  - 4
                  - 4
                  num_res_blocks: 2
                  attn_resolutions: []
                  dropout: 0.0
              decoder_config:
                target: models.svd.sgm.modules.autoencoding.temporal_ae.VideoDecoder
                params:
                  attn_type: vanilla
                  double_z: true
                  z_channels: 4
                  resolution: 256
                  in_channels: 3
                  out_ch: 3
                  ch: 128
                  ch_mult:
                  - 1
                  - 2
                  - 4
                  - 4
                  num_res_blocks: 2
                  attn_resolutions: []
                  dropout: 0.0
                  video_kernel_size:
                  - 3
                  - 1
                  - 1
              loss_config:
                target: torch.nn.Identity
              regularizer_config:
                target: models.svd.sgm.modules.autoencoding.regularizers.DiagonalGaussianRegularizer
              optimizer_config: null
              lr_g_factor: 1.0
              trainable_ae_params: null
              ae_optimizer_args: null
              trainable_disc_params: null
              disc_optimizer_args: null
              disc_start_iter: 0
              diff_boost_factor: 3.0
              ckpt_engine: null
              ckpt_path: null
              additional_decode_keys: null
              ema_decay: null
              monitor: null
              input_key: jpg
          svd_pipeline:
            class_path: modules.loader.module_loader_config.ModuleLoaderConfig
            init_args:
              loader_cls_path: diffusers.StableVideoDiffusionPipeline
              cls_func: from_pretrained
              cls_func_fast_dev_run: ''
              kwargs_diffusers:
                torch_dtype: torch.float16
                variant: fp16
                use_safetensors: true
              model_params: null
              model_params_fast_dev_run: null
              kwargs_diff_trainer_params: null
              args:
              - stabilityai/stable-video-diffusion-img2vid-xt
              dependent_modules: null
              dependent_modules_cloned: null
              state_dict_path: ''
              strict_loading: true
              state_dict_filters: []
        root_cls: null
    diff_trainer_params:
      class_path: modules.params.diffusion_trainer.params_streaming_diff_trainer.DiffusionTrainerParams
      init_args:
        scale_factor: 0.18215
        streamingsvd_ckpt:
          class_path: modules.params.diffusion_trainer.params_streaming_diff_trainer.CheckpointDescriptor
          init_args:
            ckpt_path_local: checkpoint/StreamingSVD/model.safetensors
            ckpt_path_global: PAIR/StreamingSVD/resolve/main/model.safetensors
        disable_first_stage_autocast: true
    inference_params:
      class_path: modules.params.diffusion.inference_params.T2VInferenceParams
      init_args:
        n_autoregressive_generations: 2  # number of autoregressive generation passes for StreamingSVD
        num_conditional_frames: 7  # TODO(review): verify this value is actually consumed
        anchor_frames: '6'  # take frame (anchor_frames + 1) as the CLIP encoding for StreamingSVD
        reset_seed_per_generation: true # If true, the seed is reset on every generation