rinflan committed
Commit a6df73d
Parent: 5f84dff

Upload 15 files

configs/_base_/archs/diff_svc.py ADDED
@@ -0,0 +1,52 @@
+from fish_diffusion.utils.pitch import pitch_to_scale
+
+sampling_rate = 44100
+mel_channels = 128
+hidden_size = 256
+
+model = dict(
+    type="DiffSVC",
+    diffusion=dict(
+        type="GaussianDiffusion",
+        mel_channels=mel_channels,
+        noise_schedule="linear",
+        timesteps=1000,
+        max_beta=0.01,
+        s=0.008,
+        noise_loss="smoothed-l1",
+        denoiser=dict(
+            type="WaveNetDenoiser",
+            mel_channels=mel_channels,
+            d_encoder=hidden_size,
+            residual_channels=512,
+            residual_layers=20,
+        ),
+        spec_stats_path="dataset/stats.json",
+        sampler_interval=10,
+    ),
+    text_encoder=dict(
+        type="NaiveProjectionEncoder",
+        input_size=256,
+        output_size=hidden_size,
+    ),
+    speaker_encoder=dict(
+        type="NaiveProjectionEncoder",
+        input_size=10,
+        output_size=hidden_size,
+        use_embedding=True,
+    ),
+    pitch_encoder=dict(
+        type="NaiveProjectionEncoder",
+        input_size=1,
+        output_size=hidden_size,
+        use_embedding=False,
+        preprocessing=pitch_to_scale,
+    ),
+    vocoder=dict(
+        type="NsfHifiGAN",
+        checkpoint_path="checkpoints/nsf_hifigan/model",
+        sampling_rate=sampling_rate,
+        mel_channels=mel_channels,
+        use_natural_log=True,
+    ),
+)
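These configs follow the OpenMMLab plain-Python convention that the _base_/_delete_ keys elsewhere in this commit imply: each file defines ordinary dicts, and a `type` key names a class to be built from a registry with the remaining keys as constructor kwargs. A minimal sketch of loading one (mmengine's Config.fromfile is a real API; that fish_diffusion resolves `type` against its own registries is an assumption):

    from mmengine import Config

    # _base_ files, if present, are merged into the child config recursively.
    cfg = Config.fromfile("configs/_base_/archs/diff_svc.py")

    print(cfg.model.type)                 # "DiffSVC"
    print(cfg.model.diffusion.timesteps)  # 1000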
configs/_base_/archs/diff_svc_v2.py ADDED
@@ -0,0 +1,65 @@
+"""
+DiffSVC architecture with WaveNet denoiser and NSF-HiFiGAN vocoder.
+
+Compared to v1, this version:
+- No longer needs spec stats.
+- Adds a dilation cycle to the WaveNet denoiser.
+- Uses the log10 mel spectrogram.
+- Matches the DiffSinger architecture more closely.
+"""
+
+from fish_diffusion.utils.pitch import pitch_to_scale
+
+sampling_rate = 44100
+mel_channels = 128
+hidden_size = 256
+
+model = dict(
+    type="DiffSVC",
+    diffusion=dict(
+        type="GaussianDiffusion",
+        mel_channels=mel_channels,
+        noise_schedule="linear",
+        timesteps=1000,
+        max_beta=0.01,
+        s=0.008,
+        noise_loss="l1",
+        denoiser=dict(
+            type="WaveNetDenoiser",
+            mel_channels=mel_channels,
+            d_encoder=hidden_size,
+            residual_channels=512,
+            residual_layers=20,
+            dilation_cycle=4,
+            use_linear_bias=True,
+        ),
+        sampler_interval=10,
+        spec_min=[-5],
+        spec_max=[0],
+    ),
+    text_encoder=dict(
+        type="NaiveProjectionEncoder",
+        input_size=256,
+        output_size=hidden_size,
+    ),
+    speaker_encoder=dict(
+        type="NaiveProjectionEncoder",
+        input_size=10,
+        output_size=hidden_size,
+        use_embedding=True,
+    ),
+    pitch_encoder=dict(
+        type="NaiveProjectionEncoder",
+        input_size=1,
+        output_size=hidden_size,
+        use_embedding=False,
+        preprocessing=pitch_to_scale,
+    ),
+    vocoder=dict(
+        type="NsfHifiGAN",
+        checkpoint_path="checkpoints/nsf_hifigan/model",
+        sampling_rate=sampling_rate,
+        mel_channels=mel_channels,
+        use_natural_log=False,
+    ),
+)
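The fixed spec_min/spec_max replace v1's per-dataset stats file: the (log10) mel spectrogram is presumably normalized into [-1, 1] before diffusion and denormalized after sampling, the same way DiffSinger does it. A hedged sketch of that mapping (function names illustrative):

    import torch

    def norm_spec(mel: torch.Tensor, spec_min=-5.0, spec_max=0.0) -> torch.Tensor:
        # Map log10-mel values from [spec_min, spec_max] onto [-1, 1].
        return (mel - spec_min) / (spec_max - spec_min) * 2 - 1

    def denorm_spec(x: torch.Tensor, spec_min=-5.0, spec_max=0.0) -> torch.Tensor:
        # Inverse mapping, applied to sampled spectrograms before vocoding.
        return (x + 1) / 2 * (spec_max - spec_min) + spec_min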
configs/_base_/datasets/audio_folder.py ADDED
@@ -0,0 +1,27 @@
+dataset = dict(
+    train=dict(
+        type="AudioFolderDataset",
+        path="dataset/train",
+        speaker_id=0,
+    ),
+    valid=dict(
+        type="AudioFolderDataset",
+        path="dataset/valid",
+        speaker_id=0,
+    ),
+)
+
+dataloader = dict(
+    train=dict(
+        batch_size=16,
+        shuffle=True,
+        num_workers=2,
+        persistent_workers=True,
+    ),
+    valid=dict(
+        batch_size=2,
+        shuffle=False,
+        num_workers=2,
+        persistent_workers=True,
+    ),
+)
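The dataloader dicts carry standard torch.utils.data.DataLoader keyword arguments, so the training script can presumably unpack them directly once the dataset is built. A sketch under that assumption (the AudioFolderDataset constructor kwargs are inferred from the config above):

    from torch.utils.data import DataLoader

    from fish_diffusion.datasets.audio_folder import AudioFolderDataset

    train_dataset = AudioFolderDataset(path="dataset/train", speaker_id=0)

    # The train dict above unpacks cleanly into DataLoader kwargs.
    train_loader = DataLoader(
        train_dataset,
        collate_fn=AudioFolderDataset.collate_fn,
        batch_size=16,
        shuffle=True,
        num_workers=2,
        persistent_workers=True,
    )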
configs/_base_/schedulers/step.py ADDED
@@ -0,0 +1,13 @@
+optimizer = dict(
+    type="AdamW",
+    lr=8e-4,
+    weight_decay=1e-2,
+    betas=(0.9, 0.98),
+    eps=1e-9,
+)
+
+scheduler = dict(
+    type="StepLR",
+    step_size=50000,
+    gamma=0.5,
+)
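The type names map one-to-one onto torch.optim classes, so the trainer presumably instantiates them as below (a sketch; the stand-in module is illustrative):

    import torch
    import torch.nn as nn

    net = nn.Linear(256, 128)  # stand-in for the actual model

    optimizer = torch.optim.AdamW(
        net.parameters(), lr=8e-4, weight_decay=1e-2, betas=(0.9, 0.98), eps=1e-9
    )
    # Halve the learning rate every 50k optimizer steps.
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=50000, gamma=0.5)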
configs/_base_/schedulers/warmup_cosine.py ADDED
@@ -0,0 +1,24 @@
+from fish_diffusion.schedulers.warmup_cosine_scheduler import (
+    LambdaWarmUpCosineScheduler,
+)
+
+lambda_func = LambdaWarmUpCosineScheduler(
+    warm_up_steps=1000,
+    lr_min=1e-4,
+    lr_max=8e-4,
+    lr_start=1e-5,
+    max_decay_steps=150000,
+)
+
+optimizer = dict(
+    type="AdamW",
+    lr=1.0,
+    weight_decay=1e-2,
+    betas=(0.9, 0.98),
+    eps=1e-9,
+)
+
+scheduler = dict(
+    type="LambdaLR",
+    lr_lambda=lambda_func,
+)
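Note the base lr=1.0: with torch's LambdaLR the effective rate is base_lr * lr_lambda(step), so here the lambda's return value is the absolute learning rate. A hedged sketch of what LambdaWarmUpCosineScheduler presumably computes (it mirrors the latent-diffusion scheduler of the same name; the actual fish_diffusion implementation may differ in detail):

    import math

    class WarmUpCosineSketch:
        """Linear warmup from lr_start to lr_max, then cosine decay to lr_min."""

        def __init__(self, warm_up_steps, lr_min, lr_max, lr_start, max_decay_steps):
            self.warm_up_steps = warm_up_steps
            self.lr_min, self.lr_max, self.lr_start = lr_min, lr_max, lr_start
            self.max_decay_steps = max_decay_steps

        def __call__(self, step: int) -> float:
            if step < self.warm_up_steps:
                # Linear ramp: lr_start -> lr_max over warm_up_steps.
                return self.lr_start + (self.lr_max - self.lr_start) * step / self.warm_up_steps
            # Cosine decay: lr_max -> lr_min, held at lr_min after max_decay_steps.
            t = min((step - self.warm_up_steps) / (self.max_decay_steps - self.warm_up_steps), 1.0)
            return self.lr_min + 0.5 * (self.lr_max - self.lr_min) * (1 + math.cos(t * math.pi))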
configs/_base_/schedulers/warmup_cosine_finetune.py ADDED
@@ -0,0 +1,24 @@
+from fish_diffusion.schedulers.warmup_cosine_scheduler import (
+    LambdaWarmUpCosineScheduler,
+)
+
+lambda_func = LambdaWarmUpCosineScheduler(
+    warm_up_steps=1000,
+    lr_min=1e-4,
+    lr_max=4e-4,
+    lr_start=1e-5,
+    max_decay_steps=5000,
+)
+
+optimizer = dict(
+    type="AdamW",
+    lr=1.0,
+    weight_decay=1e-2,
+    betas=(0.9, 0.98),
+    eps=1e-9,
+)
+
+scheduler = dict(
+    type="LambdaLR",
+    lr_lambda=lambda_func,
+)
configs/_base_/trainers/base.py ADDED
@@ -0,0 +1,34 @@
+import sys
+
+import torch
+from pytorch_lightning.callbacks import LearningRateMonitor, ModelCheckpoint
+from pytorch_lightning.strategies import DDPStrategy
+
+trainer = dict(
+    accelerator="gpu",
+    devices=-1,
+    gradient_clip_val=0.5,
+    log_every_n_steps=10,
+    val_check_interval=5000,
+    check_val_every_n_epoch=None,
+    max_steps=300000,
+    # Warning: if training with fs2 produces NaN losses, use bf16 or fp32 instead
+    precision=16,
+    callbacks=[
+        ModelCheckpoint(
+            filename="{epoch}-{step}-{valid_loss:.2f}",
+            every_n_train_steps=5000,
+            save_top_k=-1,
+        ),
+        LearningRateMonitor(logging_interval="step"),
+    ],
+)
+
+# Use DDP for multi-GPU training
+if torch.cuda.is_available() and torch.cuda.device_count() > 1:
+    # Use gloo on Windows, where NCCL is unavailable
+    process_group_backend = "nccl" if sys.platform != "win32" else "gloo"
+
+    trainer["strategy"] = DDPStrategy(
+        find_unused_parameters=True, process_group_backend=process_group_backend
+    )
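If the fs2/NaN warning above applies, the precision flag can be overridden from any config that inherits this one; both values below are accepted by PyTorch Lightning 1.x (a sketch of such an override):

    trainer = dict(
        precision="bf16",  # bfloat16 mixed precision; more NaN-robust than fp16
        # precision=32,    # or fall back to full fp32
    )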
configs/svc_cn_hubert_soft.py ADDED
@@ -0,0 +1,13 @@
+from fish_diffusion.datasets.audio_folder import AudioFolderDataset
+
+_base_ = [
+    "./svc_hubert_soft.py",
+]
+
+preprocessing = dict(
+    text_features_extractor=dict(
+        _delete_=True,
+        type="ChineseHubertSoft",
+        pretrained=True,
+    ),
+)
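The _base_ list and _delete_ flag follow the OpenMMLab config-inheritance convention: a child dict is merged into the base key by key, and _delete_=True discards the base entry first so the child replaces it outright. Here the base defines text_features_extractor = dict(type="HubertSoft"); with _delete_=True the merged result carries only the ChineseHubertSoft keys. A quick check (assuming mmengine-style loading, as sketched earlier):

    from mmengine import Config

    cfg = Config.fromfile("configs/svc_cn_hubert_soft.py")
    print(cfg.preprocessing.text_features_extractor)
    # -> {'type': 'ChineseHubertSoft', 'pretrained': True}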
configs/svc_cn_hubert_soft_finetune.py ADDED
@@ -0,0 +1,77 @@
+from pytorch_lightning.callbacks import LearningRateMonitor, ModelCheckpoint
+
+from fish_diffusion.datasets.audio_folder import AudioFolderDataset
+
+_base_ = [
+    "./_base_/archs/diff_svc_v2.py",
+    "./_base_/trainers/base.py",
+    "./_base_/schedulers/warmup_cosine_finetune.py",
+    "./_base_/datasets/audio_folder.py",
+]
+
+speaker_mapping = {
+    "Placeholder": 0,
+}
+
+dataset = dict(
+    train=dict(
+        _delete_=True,  # Delete the default train dataset
+        type="ConcatDataset",
+        datasets=[
+            dict(
+                type="AudioFolderDataset",
+                path="dataset/train",
+                speaker_id=speaker_mapping["Placeholder"],
+            ),
+        ],
+        # Are there any other ways to do this?
+        collate_fn=AudioFolderDataset.collate_fn,
+    ),
+    valid=dict(
+        _delete_=True,  # Delete the default valid dataset
+        type="ConcatDataset",
+        datasets=[
+            dict(
+                type="AudioFolderDataset",
+                path="dataset/valid",
+                speaker_id=speaker_mapping["Placeholder"],
+            ),
+        ],
+        collate_fn=AudioFolderDataset.collate_fn,
+    ),
+)
+
+model = dict(
+    speaker_encoder=dict(
+        input_size=len(speaker_mapping),
+    ),
+    text_encoder=dict(
+        type="NaiveProjectionEncoder",
+        input_size=256,
+        output_size=256,
+    ),
+)
+
+preprocessing = dict(
+    text_features_extractor=dict(
+        type="ChineseHubertSoft",
+        pretrained=True,
+        gate_size=25,
+    ),
+    pitch_extractor=dict(
+        type="ParselMouthPitchExtractor",
+    ),
+)
+
+# Validate every 1000 steps; checkpoints are still saved every 5000 steps
+trainer = dict(
+    val_check_interval=1000,
+    callbacks=[
+        ModelCheckpoint(
+            filename="{epoch}-{step}-{valid_loss:.2f}",
+            every_n_train_steps=5000,
+            save_top_k=-1,
+        ),
+        LearningRateMonitor(logging_interval="step"),
+    ],
+)
configs/svc_cn_hubert_soft_finetune_crepe.py ADDED
@@ -0,0 +1,77 @@
+from pytorch_lightning.callbacks import LearningRateMonitor, ModelCheckpoint
+
+from fish_diffusion.datasets.audio_folder import AudioFolderDataset
+
+_base_ = [
+    "./_base_/archs/diff_svc_v2.py",
+    "./_base_/trainers/base.py",
+    "./_base_/schedulers/warmup_cosine_finetune.py",
+    "./_base_/datasets/audio_folder.py",
+]
+
+speaker_mapping = {
+    "Placeholder": 0,
+}
+
+dataset = dict(
+    train=dict(
+        _delete_=True,  # Delete the default train dataset
+        type="ConcatDataset",
+        datasets=[
+            dict(
+                type="AudioFolderDataset",
+                path="dataset/train",
+                speaker_id=speaker_mapping["Placeholder"],
+            ),
+        ],
+        # Are there any other ways to do this?
+        collate_fn=AudioFolderDataset.collate_fn,
+    ),
+    valid=dict(
+        _delete_=True,  # Delete the default valid dataset
+        type="ConcatDataset",
+        datasets=[
+            dict(
+                type="AudioFolderDataset",
+                path="dataset/valid",
+                speaker_id=speaker_mapping["Placeholder"],
+            ),
+        ],
+        collate_fn=AudioFolderDataset.collate_fn,
+    ),
+)
+
+model = dict(
+    speaker_encoder=dict(
+        input_size=len(speaker_mapping),
+    ),
+    text_encoder=dict(
+        type="NaiveProjectionEncoder",
+        input_size=256,
+        output_size=256,
+    ),
+)
+
+preprocessing = dict(
+    text_features_extractor=dict(
+        type="ChineseHubertSoft",
+        pretrained=True,
+        gate_size=25,
+    ),
+    pitch_extractor=dict(
+        type="CrepePitchExtractor",
+    ),
+)
+
+# Validate every 1000 steps; checkpoints are still saved every 5000 steps
+trainer = dict(
+    val_check_interval=1000,
+    callbacks=[
+        ModelCheckpoint(
+            filename="{epoch}-{step}-{valid_loss:.2f}",
+            every_n_train_steps=5000,
+            save_top_k=-1,
+        ),
+        LearningRateMonitor(logging_interval="step"),
+    ],
+)
configs/svc_hubert_soft.py ADDED
@@ -0,0 +1,21 @@
+# Warning: this config introduced a breaking change on Feb 12, 2023.
+# It updates the arch from diff_svc to diff_svc_v2 and switches to the cosine scheduler.
+
+_base_ = [
+    "./_base_/archs/diff_svc_v2.py",
+    "./_base_/trainers/base.py",
+    "./_base_/schedulers/warmup_cosine.py",
+    "./_base_/datasets/audio_folder.py",
+]
+
+
+preprocessing = dict(
+    text_features_extractor=dict(
+        type="HubertSoft",
+    ),
+    pitch_extractor=dict(
+        # ParselMouth is much faster than Crepe;
+        # however, Crepe may perform better in some cases
+        type="ParselMouthPitchExtractor",
+    ),
+)
configs/svc_hubert_soft_diff_svc.py ADDED
@@ -0,0 +1,58 @@
+from functools import partial
+
+import numpy as np
+
+from fish_diffusion.utils.pitch import pitch_to_coarse
+
+_base_ = [
+    "./_base_/archs/diff_svc_v2.py",
+    "./_base_/trainers/base.py",
+    "./_base_/schedulers/step.py",
+    "./_base_/datasets/audio_folder.py",
+]
+
+hidden_size = 256
+
+model = dict(
+    type="DiffSVC",
+    speaker_encoder=dict(
+        _delete_=True,
+        # This is currently not used; all params will be zeroed
+        type="NaiveProjectionEncoder",
+        input_size=10,
+        output_size=hidden_size,
+        use_embedding=True,
+    ),
+    pitch_encoder=dict(
+        _delete_=True,
+        type="NaiveProjectionEncoder",
+        input_size=300,
+        output_size=hidden_size,
+        use_embedding=True,
+        # Match the pretrained model, which uses a 40.0 Hz minimum pitch
+        preprocessing=partial(
+            pitch_to_coarse, f0_mel_min=1127 * np.log(1 + 40.0 / 700)
+        ),
+    ),
+    text_encoder=dict(
+        _delete_=True,
+        type="IdentityEncoder",
+    ),
+    diffusion=dict(
+        denoiser=dict(
+            residual_channels=384,
+        ),
+    ),
+)
+
+preprocessing = dict(
+    # Choose either ParselMouthPitchExtractor or CrepePitchExtractor here
+    pitch_extractor=dict(
+        type="CrepePitchExtractor",
+        f0_min=40.0,
+        f0_max=1100.0,
+    ),
+    text_features_extractor=dict(
+        type="HubertSoft",
+    ),
+)
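The f0_mel_min expression is the mel-scale transform m = 1127 * ln(1 + f / 700) evaluated at 40 Hz, which pitch_to_coarse presumably uses to quantize F0 into the integer bins the 300-way pitch embedding expects, as the original diff-svc does. A hedged sketch of that quantization (bin count and clipping details are illustrative):

    import numpy as np

    F0_MEL_MIN = 1127 * np.log(1 + 40.0 / 700)    # mel value of 40 Hz
    F0_MEL_MAX = 1127 * np.log(1 + 1100.0 / 700)  # mel value of 1100 Hz

    def pitch_to_coarse_sketch(f0: np.ndarray, f0_bin: int = 256) -> np.ndarray:
        # Hz -> mel, then map [F0_MEL_MIN, F0_MEL_MAX] onto bins 1..f0_bin-1;
        # bin 0 stays reserved for unvoiced frames (f0 == 0).
        f0_mel = 1127 * np.log(1 + f0 / 700)
        voiced = f0_mel > 0
        f0_mel[voiced] = (f0_mel[voiced] - F0_MEL_MIN) * (f0_bin - 2) / (
            F0_MEL_MAX - F0_MEL_MIN
        ) + 1
        return np.rint(np.clip(f0_mel, 0, f0_bin - 1)).astype(np.int64)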
configs/svc_hubert_soft_multi_speakers.py ADDED
@@ -0,0 +1,37 @@
+from fish_diffusion.datasets.audio_folder import AudioFolderDataset
+
+_base_ = [
+    "./svc_hubert_soft.py",
+]
+
+dataset = dict(
+    train=dict(
+        _delete_=True,  # Delete the default train dataset
+        type="ConcatDataset",
+        datasets=[
+            dict(
+                type="AudioFolderDataset",
+                path="dataset/speaker_0",
+                speaker_id=0,
+            ),
+            dict(
+                type="AudioFolderDataset",
+                path="dataset/speaker_1",
+                speaker_id=1,
+            ),
+        ],
+        # Are there any other ways to do this?
+        collate_fn=AudioFolderDataset.collate_fn,
+    ),
+    valid=dict(
+        type="AudioFolderDataset",
+        path="dataset/valid",
+        speaker_id=0,
+    ),
+)
+
+model = dict(
+    speaker_encoder=dict(
+        input_size=2,  # 2 speakers
+    ),
+)
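With use_embedding=True inherited from the base arch, input_size is the number of speaker IDs rather than a feature dimension, so the speaker encoder presumably reduces to an embedding table. A sketch of that assumption (NaiveProjectionEncoder's internals are not shown in this commit):

    import torch
    import torch.nn as nn

    # One learned 256-dim vector per speaker ID.
    speaker_encoder = nn.Embedding(num_embeddings=2, embedding_dim=256)

    speaker_ids = torch.tensor([0, 1, 1])            # one ID per batch item
    speaker_features = speaker_encoder(speaker_ids)  # shape: (3, 256)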
configs/svs_baseline.py ADDED
@@ -0,0 +1,113 @@
+# Warning: this config is under development and subject to change.
+
+_base_ = [
+    "./_base_/archs/diff_svc_v2.py",
+    "./_base_/trainers/base.py",
+    "./_base_/schedulers/warmup_cosine.py",
+    "./_base_/datasets/audio_folder.py",
+]
+
+phonemes = [
+    "AP",
+    "SP",
+    "E",
+    "En",
+    "a",
+    "ai",
+    "an",
+    "ang",
+    "ao",
+    "b",
+    "c",
+    "ch",
+    "d",
+    "e",
+    "ei",
+    "en",
+    "eng",
+    "er",
+    "f",
+    "g",
+    "h",
+    "i",
+    "i0",
+    "ia",
+    "ian",
+    "iang",
+    "iao",
+    "ie",
+    "in",
+    "ing",
+    "iong",
+    "ir",
+    "iu",
+    "j",
+    "k",
+    "l",
+    "m",
+    "n",
+    "o",
+    "ong",
+    "ou",
+    "p",
+    "q",
+    "r",
+    "s",
+    "sh",
+    "t",
+    "u",
+    "ua",
+    "uai",
+    "uan",
+    "uang",
+    "ui",
+    "un",
+    "uo",
+    "v",
+    "van",
+    "ve",
+    "vn",
+    "w",
+    "x",
+    "y",
+    "z",
+    "zh",
+]
+
+preprocessing = dict(
+    text_features_extractor=dict(
+        type="OpenCpopTranscriptionToPhonemesDuration",
+        phonemes=phonemes,
+        transcription_path="dataset/transcriptions.txt",
+    ),
+    pitch_extractor=dict(
+        type="ParselMouthPitchExtractor",
+    ),
+)
+
+model = dict(
+    type="DiffSinger",
+    text_encoder=dict(
+        _delete_=True,
+        type="NaiveProjectionEncoder",
+        input_size=len(phonemes) * 2 + 2,
+        output_size=256,
+    ),
+    diffusion=dict(
+        max_beta=0.02,
+    ),
+)
+
+dataset = dict(
+    _delete_=True,
+    train=dict(
+        type="AudioFolderDataset",
+        path="dataset/diff-singer/train",
+        speaker_id=0,
+    ),
+    valid=dict(
+        type="AudioFolderDataset",
+        path="dataset/diff-singer/valid",
+        speaker_id=0,
+    ),
+)
configs/train_my_config.py ADDED
@@ -0,0 +1,4 @@
+_base_ = [
+    "./svc_cn_hubert_soft_finetune.py",
+]
+