# pytorch_lightning==2.0.9
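# Training configuration for the t2v_enhanced ControlNet-based text-to-video
# model. A Lightning CLI config like this is typically consumed via the
# standard --config flag, e.g. `python train.py --config <this file>`
# (the entrypoint script name is an assumption; check the repo's README).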
seed_everything: 33
trainer:
  accelerator: auto
  strategy: auto
  devices: '8'
  num_nodes: 1
  precision: 16-mixed
  logger: null
  callbacks:
  - class_path: pytorch_lightning.callbacks.RichModelSummary
    init_args:
      max_depth: 1
  - class_path: pytorch_lightning.callbacks.RichProgressBar
    init_args:
      refresh_rate: 1
      leave: false
      theme:
        description: white
        progress_bar: '#6206E0'
        progress_bar_finished: '#6206E0'
        progress_bar_pulse: '#6206E0'
        batch_progress: white
        time: grey54
        processing_speed: grey70
        metrics: white
      console_kwargs: null
  fast_dev_run: false
  max_epochs: 5000
  min_epochs: null
  max_steps: 2020000
  min_steps: null
  max_time: null
  limit_train_batches: null
  limit_val_batches: 512
  limit_test_batches: null
  limit_predict_batches: null
  overfit_batches: 0.0
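  # Validation runs every 8000 training batches (int val_check_interval) and
  # is capped at 512 validation batches per run (limit_val_batches above).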
  val_check_interval: 8000
  check_val_every_n_epoch: 1
  num_sanity_val_steps: null
  log_every_n_steps: 10
  enable_checkpointing: null
  enable_progress_bar: null
  enable_model_summary: null
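  # Gradients are accumulated over 8 batches and clipped to an L2 norm of 1;
  # with 8 devices this gives an effective batch of 64x the per-device
  # dataloader batch size.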
  accumulate_grad_batches: 8
  gradient_clip_val: 1
  gradient_clip_algorithm: norm
  deterministic: null
  benchmark: null
  inference_mode: true
  use_distributed_sampler: true
  profiler: null
  detect_anomaly: false
  barebones: false
  plugins: null
  sync_batchnorm: false
  reload_dataloaders_every_n_epochs: 0
  default_root_dir: null
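# Model section: three parameter groups (inference-time sampling, optimizer
# and layer freezing, UNet/ControlNet architecture), each instantiated from
# its t2v_enhanced.model.pl_module_params_controlnet config class.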
model:
  inference_params:
    class_path: t2v_enhanced.model.pl_module_params_controlnet.InferenceParams
    init_args:
      width: 256
      height: 256
      video_length: 16
      guidance_scale: 7.5
      use_dec_scaling: true
      frame_rate: 8
      num_inference_steps: 50
      eta: 1.0
      n_autoregressive_generations: 1
      mode: long_video
      start_from_real_input: true
      eval_loss_metrics: false
      scheduler_cls: ''
      negative_prompt: ''
      conditioning_from_all_past: false
      validation_samples: 80
      conditioning_type: last_chunk
      result_formats:
      - eval_gif
      - gif
      - mp4
      concat_video: true
  opt_params:
    class_path: t2v_enhanced.model.pl_module_params_controlnet.OptimizerParams
    init_args:
      learning_rate: 5.0e-05
      layers_config:
        class_path: t2v_enhanced.model.requires_grad_setter.LayerConfig
        init_args:
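          # Each entry pairs a requires_grad flag with a module-name path:
          # the VAE, text encoder, and image encoder stay frozen; the
          # resampler, UNet, and base model train, except the base model's
          # transformer_in, temp_attentions, and temp_convs blocks, which are
          # re-frozen by the later entries. (This reading of LayerConfig's
          # semantics is inferred from the names, not verified in the code.)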
          gradient_setup:
          - - false
            - - vae
          - - false
            - - text_encoder
          - - false
            - - image_encoder
          - - true
            - - resampler
          - - true
            - - unet
          - - true
            - - base_model
          - - false
            - - base_model
              - transformer_in
          - - false
            - - base_model
              - temp_attentions
          - - false
            - - base_model
              - temp_convs
      layers_config_base: null
      use_warmup: false
      warmup_steps: 10000
      warmup_start_factor: 1.0e-05
      learning_rate_spatial: 0.0
      use_8_bit_adam: false
      noise_generator: null
      noise_decomposition: null
      perceptual_loss: false
      noise_offset: 0.0
      split_opt_by_node: false
      reset_prediction_type_to_eps: false
      train_val_sampler_may_differ: true
      measure_similarity: false
      similarity_loss: false
      similarity_loss_weight: 1.0
      loss_conditional_weight: 0.0
      loss_conditional_weight_convex: false
      loss_conditional_change_after_step: 0
      mask_conditional_frames: false
      sample_from_noise: true
      mask_alternating: false
      uncondition_freq: -1
      no_text_condition_control: false
      inject_image_into_input: false
      inject_at_T: false
      resampling_steps: 1
      control_freq_in_resample: 1
      resample_to_T: false
      adaptive_loss_reweight: false
      load_resampler_from_ckpt: ''
      skip_controlnet_branch: false
      use_fps_conditioning: false
      num_frame_embeddings_range: 16
      start_frame_training: 16
      start_frame_ctrl: 16
      load_trained_base_model_and_resampler_from_ckpt: ''
      load_trained_controlnet_from_ckpt: ''
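  # UNet/ControlNet branch. The flags below indicate the spatial layers come
  # from ModelScope's damo-vilab/text-to-video-ms-1.7b pipeline, and the
  # ControlNet side conditions on num_control_input_frames past frames with
  # downsampled conditioning input.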
  unet_params:
    class_path: t2v_enhanced.model.pl_module_params_controlnet.UNetParams
    init_args:
      conditioning_embedding_out_channels:
      - 32
      - 96
      - 256
      - 512
      ckpt_spatial_layers: ''
      pipeline_repo: damo-vilab/text-to-video-ms-1.7b
      unet_from_diffusers: true
      spatial_latent_input: false
      num_frame_conditioning: 1
      pipeline_class: t2v_enhanced.model.model.controlnet.pipeline_text_to_video_w_controlnet_synth.TextToVideoSDPipeline
      frame_expansion: none
      downsample_controlnet_cond: true
      num_frames: 16
      pre_transformer_in_cond: false
      num_tranformers: 1
      zero_conv_3d: false
      merging_mode: addition
      compute_only_conditioned_frames: false
      condition_encoder: ''
      zero_conv_mode: Identity
      clean_model: true
      merging_mode_base: attention_cross_attention
      attention_mask_params: null
      attention_mask_params_base: null
      modelscope_input_format: true
      temporal_self_attention_only_on_conditioning: false
      temporal_self_attention_mask_included_itself: false
      use_post_merger_zero_conv: false
      weight_control_sample: 1.0
      use_controlnet_mask: false
      random_mask_shift: false
      random_mask: false
      use_resampler: true
      unet_from_pipe: false
      unet_operates_on_2d: false
      image_encoder: CLIP
      use_standard_attention_processor: false
      num_frames_before_chunk: 0
      resampler_type: single_frame
      resampler_cls: t2v_enhanced.model.diffusers_conditional.models.controlnet.image_embedder.ImgEmbContextResampler
      resampler_merging_layers: 4
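      # Frozen OpenCLIP ViT-H-14 image encoder (laion2b_s32b_b79k weights)
      # that produces the image embeddings fed to the ImgEmbContextResampler
      # named above.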
      image_encoder_obj:
        class_path: t2v_enhanced.model.diffusers_conditional.models.controlnet.image_embedder.FrozenOpenCLIPImageEmbedder
        init_args:
          arch: ViT-H-14
          version: laion2b_s32b_b79k
          device: cuda
          max_length: 77
          freeze: true
          antialias: true
          ucg_rate: 0.0
          unsqueeze_dim: false
          repeat_to_max_len: false
          num_image_crops: 0
          output_tokens: false
      cfg_text_image: false
      aggregation: last_out
      resampler_random_shift: true
      img_cond_alpha_per_frame: false
      num_control_input_frames: 8
      use_image_encoder_normalization: false
      use_of: false
      ema_param: -1.0
      concat: false
      use_image_tokens_main: true
      use_image_tokens_ctrl: false
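# Output locations and run naming. matmul_precision: high is presumably
# forwarded to torch.set_float32_matmul_precision('high'), which enables
# TF32 matmuls on Ampere-class GPUs.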
result_fol: results
exp_name: my_exp_name
run_name: my_run_name
scale_lr: false
matmul_precision: high