Spaces:

AIGC-Audio
/

Make-An-Audio-3

Running on Zero

App Files Files Community

Make-An-Audio-3 / configs /video2audio-cfm1-cfg-LargeDiT1-moe.yaml

3v324v23

Add config

28cda0c 5 months ago

raw

history blame

3.98 kB

	model:
	base_learning_rate: 3.0e-06
	target: ldm.models.diffusion.cfm1_audio.CFM
	params:
	linear_start: 0.00085
	linear_end: 0.012
	num_timesteps_cond: 1
	log_every_t: 200
	timesteps: 1000
	first_stage_key: "mix_spec"
	cond_stage_key: "mix_video_feat"
	mel_dim: 20
	mel_length: 256
	channels: 0
	cond_stage_trainable: True
	conditioning_key: crossattn
	monitor: val/loss_simple_ema
	scale_by_std: true
	use_ema: false
	scheduler_config:
	target: ldm.lr_scheduler.LambdaLinearScheduler
	params:
	warm_up_steps:
	- 10000
	cycle_lengths:
	- 10000000000000
	f_start:
	- 1.0e-06
	f_max:
	- 1.0
	f_min:
	- 1.0
	unet_config:
	target: ldm.modules.diffusionmodules.flag_large_dit_moe.VideoFlagLargeDiT
	params:
	in_channels: 20
	context_dim: 768
	hidden_size: 768
	num_heads: 32
	depth: 16
	max_len: 1000
	num_experts: 4


	first_stage_config:
	target: ldm.models.autoencoder1d.AutoencoderKL
	params:
	embed_dim: 20
	monitor: val/rec_loss
	ckpt_path: /apdcephfs_intern/share_1316500/nlphuang/results/Text_to_audio/ldm_src/ckpt/epoch=000032.ckpt
	ddconfig:
	double_z: true
	in_channels: 80
	out_ch: 80
	z_channels: 20
	kernel_size: 5
	ch: 384
	ch_mult:
	- 1
	- 2
	- 4
	num_res_blocks: 2
	attn_layers:
	- 3
	down_layers:
	- 0
	dropout: 0.0
	lossconfig:
	target: torch.nn.Identity
	cond_stage_config:
	target: ldm.modules.encoders.modules.Video_Feat_Encoder_NoPosembed
	params:
	origin_dim: 512
	embed_dim: 768
	seq_len: 40

	lightning:
	callbacks:
	image_logger:
	target: main.AudioLogger
	params:
	sample_rate: 16000
	for_specs: true
	increase_log_steps: false
	batch_frequency: 5000
	max_images: 8
	melvmin: -5
	melvmax: 1.5
	vocoder_cfg:
	target: vocoder.bigvgan.models.VocoderBigVGAN
	params:
	ckpt_vocoder: /apdcephfs_intern/share_1316500/nlphuang/results/Text_to_audio/ldm_src/ckpt/bigvnat
	trainer:
	benchmark: True
	gradient_clip_val: 1.0
	modelcheckpoint:
	params:
	monitor: epoch
	mode: max
	save_top_k: 10
	every_n_epochs: 5

	data:
	target: main.DataModuleFromConfig
	params:
	batch_size: 16 # originally 220
	num_workers: 10
	wrap: True
	train:
	target: ldm.data.video_spec_maa2_dataset.audio_video_spec_fullset_Dataset_Train
	params:
	dataset_cfg:
	dataset1:
	dataset_name: VGGSound
	data_dir: /apdcephfs_intern/share_1316500/nlphuang/data/video_to_audio/vggsound/
	video_dir: /apdcephfs_intern/share_1316500/nlphuang/data/video_to_audio/vggsound/ # Not Necessary Except for Inference
	split_txt_path: /apdcephfs_intern/share_1316500/nlphuang/data/video_to_audio/vggsound/split_txt

	feat_type: CAVP_feat
	sr: 16000
	duration: 10
	truncate: 131072
	fps: 4
	hop_len: 256
	drop: 0.2

	validation:
	target: ldm.data.video_spec_maa2_dataset.audio_video_spec_fullset_Dataset_Valid
	params:
	dataset_cfg:
	dataset1:
	dataset_name: VGGSound
	data_dir: /apdcephfs_intern/share_1316500/nlphuang/data/video_to_audio/vggsound/
	video_dir: /apdcephfs_intern/share_1316500/nlphuang/data/video_to_audio/vggsound/ # Not Necessary Except for Inference
	split_txt_path: /apdcephfs_intern/share_1316500/nlphuang/data/video_to_audio/vggsound/split_txt

	feat_type: CAVP_feat
	sr: 16000
	duration: 10
	truncate: 131072
	fps: 4
	hop_len: 256