# pytorch_lightning==2.0.9
seed_everything: 33
trainer:
  accelerator: auto
  strategy: auto
  devices: '8'
  num_nodes: 1
  precision: 16-mixed
  logger: null
  callbacks:
  - class_path: pytorch_lightning.callbacks.RichModelSummary
    init_args:
      max_depth: 1
  - class_path: pytorch_lightning.callbacks.RichProgressBar
    init_args:
      refresh_rate: 1
      leave: false
      theme:
        description: white
        progress_bar: '#6206E0'
        progress_bar_finished: '#6206E0'
        progress_bar_pulse: '#6206E0'
        batch_progress: white
        time: grey54
        processing_speed: grey70
        metrics: white
      console_kwargs: null
  fast_dev_run: false
  max_epochs: 5000
  min_epochs: null
  max_steps: 2020000
  min_steps: null
  max_time: null
  limit_train_batches: null
  limit_val_batches: 512
  limit_test_batches: null
  limit_predict_batches: null
  overfit_batches: 0.0
  val_check_interval: 8000
  check_val_every_n_epoch: 1
  num_sanity_val_steps: null
  log_every_n_steps: 10
  enable_checkpointing: null
  enable_progress_bar: null
  enable_model_summary: null
  accumulate_grad_batches: 8
  gradient_clip_val: 1
  gradient_clip_algorithm: norm
  deterministic: null
  benchmark: null
  inference_mode: true
  use_distributed_sampler: true
  profiler: null
  detect_anomaly: false
  barebones: false
  plugins: null
  sync_batchnorm: false
  reload_dataloaders_every_n_epochs: 0
  default_root_dir: null
model:
  inference_params:
    class_path: t2v_enhanced.model.pl_module_params_controlnet.InferenceParams
    init_args:
      width: 256
      height: 256
      video_length: 16
      guidance_scale: 7.5
      use_dec_scaling: true
      frame_rate: 8
      num_inference_steps: 50
      eta: 1.0
      n_autoregressive_generations: 1
      mode: long_video
      start_from_real_input: true
      eval_loss_metrics: false
      scheduler_cls: ''
      negative_prompt: ''
      conditioning_from_all_past: false
      validation_samples: 80
      conditioning_type: last_chunk
      result_formats:
      - eval_gif
      - gif
      - mp4
      concat_video: true
  opt_params:
    class_path: t2v_enhanced.model.pl_module_params_controlnet.OptimizerParams
    init_args:
      learning_rate: 5.0e-05
      layers_config:
        class_path: t2v_enhanced.model.requires_grad_setter.LayerConfig
        init_args:
          gradient_setup:
          - - false
            - - vae
          - - false
            - - text_encoder
          - - false
            - - image_encoder
          - - true
            - - resampler
          - - true
            - - unet
          - - true
            - - base_model
          - - false
            - - base_model
              - transformer_in
          - - false
            - - base_model
              - temp_attentions
          - - false
            - - base_model
              - temp_convs
      layers_config_base: null
      use_warmup: false
      warmup_steps: 10000
      warmup_start_factor: 1.0e-05
      learning_rate_spatial: 0.0
      use_8_bit_adam: false
      noise_generator: null
      noise_decomposition: null
      perceptual_loss: false
      noise_offset: 0.0
      split_opt_by_node: false
      reset_prediction_type_to_eps: false
      train_val_sampler_may_differ: true
      measure_similarity: false
      similarity_loss: false
      similarity_loss_weight: 1.0
      loss_conditional_weight: 0.0
      loss_conditional_weight_convex: false
      loss_conditional_change_after_step: 0
      mask_conditional_frames: false
      sample_from_noise: true
      mask_alternating: false
      uncondition_freq: -1
      no_text_condition_control: false
      inject_image_into_input: false
      inject_at_T: false
      resampling_steps: 1
      control_freq_in_resample: 1
      resample_to_T: false
      adaptive_loss_reweight: false
      load_resampler_from_ckpt: ''
      skip_controlnet_branch: false
      use_fps_conditioning: false
      num_frame_embeddings_range: 16
      start_frame_training: 16
      start_frame_ctrl: 16
      load_trained_base_model_and_resampler_from_ckpt: ''
      load_trained_controlnet_from_ckpt: ''
  unet_params:
    class_path: t2v_enhanced.model.pl_module_params_controlnet.UNetParams
    init_args:
      conditioning_embedding_out_channels:
      - 32
      - 96
      - 256
      - 512
      ckpt_spatial_layers: ''
      pipeline_repo: damo-vilab/text-to-video-ms-1.7b
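      # Note: damo-vilab/text-to-video-ms-1.7b is the ModelScope text-to-video
      # 1.7b checkpoint on the Hugging Face Hub; presumably the base pipeline
      # weights are loaded from this repo id (see unet_from_diffusers below).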
      unet_from_diffusers: true
      spatial_latent_input: false
      num_frame_conditioning: 1
      pipeline_class: t2v_enhanced.model.model.controlnet.pipeline_text_to_video_w_controlnet_synth.TextToVideoSDPipeline
      frame_expansion: none
      downsample_controlnet_cond: true
      num_frames: 16
      pre_transformer_in_cond: false
      num_tranformers: 1
      zero_conv_3d: false
      merging_mode: addition
      compute_only_conditioned_frames: false
      condition_encoder: ''
      zero_conv_mode: Identity
      clean_model: true
      merging_mode_base: attention_cross_attention
      attention_mask_params: null
      attention_mask_params_base: null
      modelscope_input_format: true
      temporal_self_attention_only_on_conditioning: false
      temporal_self_attention_mask_included_itself: false
      use_post_merger_zero_conv: false
      weight_control_sample: 1.0
      use_controlnet_mask: false
      random_mask_shift: false
      random_mask: false
      use_resampler: true
      unet_from_pipe: false
      unet_operates_on_2d: false
      image_encoder: CLIP
      use_standard_attention_processor: false
      num_frames_before_chunk: 0
      resampler_type: single_frame
      resampler_cls: t2v_enhanced.model.diffusers_conditional.models.controlnet.image_embedder.ImgEmbContextResampler
      resampler_merging_layers: 4
      image_encoder_obj:
        class_path: t2v_enhanced.model.diffusers_conditional.models.controlnet.image_embedder.FrozenOpenCLIPImageEmbedder
        init_args:
          arch: ViT-H-14
          version: laion2b_s32b_b79k
          device: cuda
          max_length: 77
          freeze: true
          antialias: true
          ucg_rate: 0.0
          unsqueeze_dim: false
          repeat_to_max_len: false
          num_image_crops: 0
          output_tokens: false
      cfg_text_image: false
      aggregation: last_out
      resampler_random_shift: true
      img_cond_alpha_per_frame: false
      num_control_input_frames: 8
      use_image_encoder_normalization: false
      use_of: false
      ema_param: -1.0
      concat: false
      use_image_tokens_main: true
      use_image_tokens_ctrl: false
result_fol: results
exp_name: my_exp_name
run_name: my_run_name
scale_lr: false
matmul_precision: high
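
# Usage sketch: a config with the "# pytorch_lightning==2.0.9" header above is
# produced and consumed by pytorch_lightning.cli.LightningCLI. Assuming the repo
# exposes a standard LightningCLI entry point (the script path below is an
# assumption, not taken from this file):
#
#   python t2v_enhanced/train.py --config config.yaml
#
# Individual keys can be overridden from the command line with dotted flags, e.g.:
#
#   python t2v_enhanced/train.py --config config.yaml --trainer.devices 4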