Spaces:
Paused
Paused
model_name: EzAudio-L
model:
  mae: True
  mae_prob: 0.25
  mask_ratio: [0.25, 1.0]
  mask_span: 10
  img_size: 500
  patch_size: 1
  in_chans: 257
  out_chans: 128
  input_type: '1d'
  embed_dim: 1024
  depth: 24
  num_heads: 16
  mlp_ratio: 4.0
  qkv_bias: false
  qk_scale: null
  qk_norm: layernorm
  norm_layer: layernorm
  act_layer: geglu
  context_norm: true
  use_checkpoint: true
  time_fusion: 'ada_lora_bias'
  ada_lora_rank: 32
  ada_lora_alpha: 32
  cls_dim: null
  context_dim: 1024
  context_fusion: 'cross'
  context_max_length: null
  context_pe_method: 'none'
  pe_method: 'none'
  rope_mode: 'shared'
  use_conv: true
  skip: true
  skip_norm: true
autoencoder:
  name: stable_vae
  dim: 128
  sr: 24000
  latent_sr: 50
  q_first: true
  scale: 1.0
  shift: 0.0
text_encoder:
  model: google/flan-t5-large
  max_length: 100
cfg: 0.1  # NOTE(review): original nesting lost in extraction; may belong under text_encoder — confirm
diff:
  num_train_timesteps: 1000
  beta_schedule: 'scaled_linear'
  beta_start: 0.00085
  beta_end: 0.012
  prediction_type: 'v_prediction'
  rescale_betas_zero_snr: true
  timestep_spacing: 'trailing'
  clip_sample: false