File size: 1,813 Bytes
8c92a11
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
{
  "base_config": "config/base.json",
  "model_type": "AudioLDM",
  "task_type": "tta",
  "dataset": [
    "AudioCaps"
  ],
  "preprocess": {
    // Features used for model training (every use_* flag below is disabled in this config)
    "use_spkid": false,
    "use_uv": false,
    "use_frame_pitch": false,
    "use_phone_pitch": false,
    "use_frame_energy": false,
    "use_phone_energy": false,
    "use_mel": false,
    "use_audio": false,
    "use_label": false,
    "use_one_hot": false,
    "cond_mask_prob": 0.1
  },
  // model: hyperparameters for the audioldm, autoencoderkl, and noise_scheduler components
  "model": {
    "audioldm": {
      "image_size": 32,
      "in_channels": 4,
      "out_channels": 4,
      "model_channels": 256,
      "attention_resolutions": [
        4,
        2,
        1
      ],
      "num_res_blocks": 2,
      "channel_mult": [
        1,
        2,
        4
      ],
      "num_heads": 8,
      "use_spatial_transformer": true,
      "transformer_depth": 1,
      "context_dim": 768,
      "use_checkpoint": true,
      "legacy": false
    },
    "autoencoderkl": {
      "ch": 128,
      "ch_mult": [
        1,
        1,
        2,
        2,
        4
      ],
      "num_res_blocks": 2,
      "in_channels": 1,
      "z_channels": 4,
      "out_ch": 1,
      "double_z": true
    },
    "noise_scheduler": {
      "num_train_timesteps": 1000,
      "beta_start": 0.00085,
      "beta_end": 0.012,
      "beta_schedule": "scaled_linear",
      "clip_sample": false,
      "steps_offset": 1,
      "set_alpha_to_one": false,
      "skip_prk_steps": true,
      "prediction_type": "epsilon"
    }
  },
  // train: learning-rate scheduler (lronPlateau) and Adam optimizer (adam) settings
  "train": {
    "lronPlateau": {
      "factor": 0.9,
      "patience": 100,
      "min_lr": 4.0e-5,
      "verbose": true
    },
    "adam": {
      "lr": 5.0e-5,
      "betas": [
        0.9,
        0.999
      ],
      "weight_decay": 1.0e-2,
      "eps": 1.0e-8
    }
  }
}