File size: 4,114 Bytes
8c92a11
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
{
    "base_config": "config/svc/base.json",
    "model": {
        "condition_encoder": {
            "merge_mode": "add",
            // Prosody Features
            "use_f0": true,
            "use_uv": true,
            "use_energy": true,
            // Quantization (0 means no quantization)
            "input_melody_dim": 1,
            "n_bins_melody": 256,
            "output_melody_dim": 384,
            "input_loudness_dim": 1,
            "n_bins_loudness": 256,
            "output_loudness_dim": 384,
            // Semantic Features
            "use_whisper": false,
            "use_contentvec": false,
            "use_wenet": false,
            "use_mert": false,
            "whisper_dim": 1024,
            "contentvec_dim": 256,
            "mert_dim": 256,
            "wenet_dim": 512,
            "content_encoder_dim": 384,
            // Speaker Features
            "output_singer_dim": 384,
            "singer_table_size": 512,
            "use_spkid": true
        },
        "diffusion": {
            "scheduler": "ddpm",
            "scheduler_settings": {
                "num_train_timesteps": 1000,
                "beta_start": 1.0e-4,
                "beta_end": 0.02,
                "beta_schedule": "linear"
            },
            // Diffusion steps encoder
            "step_encoder": {
                "dim_raw_embedding": 128,
                "dim_hidden_layer": 512,
                "activation": "SiLU",
                "num_layer": 2,
                "max_period": 10000
            },
            // Diffusion decoder
            "model_type": "bidilconv",
            // model_type options: bidilconv, unet2d (TODO: unet1d)
            "bidilconv": {
                "base_channel": 384,
                "n_res_block": 20,
                "conv_kernel_size": 3,
                "dilation_cycle_length": 4,
                // dilation_cycle_length: as a special case, 1 means no dilation
                "conditioner_size": 384
            },
            "unet2d": {
                "in_channels": 1,
                "out_channels": 1,
                "down_block_types": [
                    "CrossAttnDownBlock2D",
                    "CrossAttnDownBlock2D",
                    "CrossAttnDownBlock2D",
                    "DownBlock2D"
                ],
                "mid_block_type": "UNetMidBlock2DCrossAttn",
                "up_block_types": [
                    "UpBlock2D",
                    "CrossAttnUpBlock2D",
                    "CrossAttnUpBlock2D",
                    "CrossAttnUpBlock2D"
                ],
                "only_cross_attention": false
            }
        }
    },
    "train": {
        // Basic settings
        "batch_size": 64,
        "gradient_accumulation_step": 1,
        "max_epoch": -1,
        // max_epoch: -1 means no limit
        "save_checkpoint_stride": [
            5,
            20
        ],
        // save_checkpoint_stride: the unit is epochs
        "keep_last": [
            3,
            -1
        ],
        // keep_last: -1 means infinite; if only one number is given, it is broadcast
        "run_eval": [
            false,
            true
        ],
        // run_eval: if only one value is given, it is broadcast
        // Fix the random seed
        "random_seed": 10086,
        // Batchsampler
        "sampler": {
            "holistic_shuffle": true,
            "drop_last": true
        },
        // Dataloader
        "dataloader": {
            "num_worker": 32,
            "pin_memory": true
        },
        // Trackers
        "tracker": [
            "tensorboard"
            // "wandb",
            // "cometml",
            // "mlflow",
        ],
        // Optimizer
        "optimizer": "AdamW",
        "adamw": {
            "lr": 4.0e-4
            // nn model lr
        },
        // LR Scheduler
        "scheduler": "ReduceLROnPlateau",
        "reducelronplateau": {
            "factor": 0.8,
            "patience": 10,
            // patience: the unit is epochs
            "min_lr": 1.0e-4
        }
    },
    "inference": {
        "diffusion": {
            "scheduler": "pndm",
            "scheduler_settings": {
                "num_inference_timesteps": 1000
            }
        }
    }
}