pyp1 committed on
Commit
74f4df0
1 Parent(s): 4137d85

Push model using huggingface_hub.

Browse files
Files changed (3) hide show
  1. README.md +10 -1
  2. config.json +142 -142
  3. model.safetensors +1 -1
README.md CHANGED
@@ -1,3 +1,12 @@
1
  ---
2
- license: cc-by-nc-sa-4.0
 
 
 
 
 
3
  ---
 
 
 
 
 
1
  ---
2
+ tags:
3
+ - Text-to-Speech
4
+ - VoiceCraft
5
+ - pytorch_model_hub_mixin
6
+ - model_hub_mixin
7
+ repo_url: https://github.com/jasonppy/VoiceCraft
8
  ---
9
+
10
+ This model has been pushed to the Hub using **huggingface_hub**:
11
+ - Repo: https://github.com/jasonppy/VoiceCraft
12
+ - Docs: [More Information Needed]
config.json CHANGED
@@ -1,173 +1,173 @@
1
  {
2
- "seed": 1,
3
- "precision": "float16",
4
- "num_workers": 8,
5
- "resume": false,
6
- "tb_write_every_n_steps": 100,
7
- "print_every_n_steps": 400,
8
- "val_every_n_steps": 1600,
9
- "lr": 1e-05,
10
  "batch_size": 100,
11
- "weight_decay": 0.0,
12
- "warmup_fraction": 0.1,
13
- "num_epochs": 10,
14
- "num_steps": 500000,
15
- "gradient_accumulation_steps": 24,
16
- "gradient_clip_val": 1.0,
17
- "early_stop_step": 3200,
18
- "early_stop_threshold": -1.0,
19
- "exp_dir": "/data/scratch/pyp/exp_pyp/VoiceCraft/gigaspeech/tts_enhanced_330M",
20
  "dataset": "gigaspeech",
21
  "dataset_dir": "/data/scratch/pyp/datasets/gigaspeech_phn_enc_manifest/xl",
22
- "pseudo_epoch_size": 3000,
23
- "phn_folder_name": "phonemes",
24
- "encodec_folder_name": "encodec_16khz_4codebooks",
25
- "manifest_name": "manifest_large16khz_lessambi",
26
- "pad_x": 0,
27
- "max_num_tokens": 20000,
28
- "val_max_num_tokens": 6000,
29
- "num_buckets": 10,
30
  "dynamic_batching": 1,
31
- "audio_max_length": 16.0,
32
- "audio_min_length": 1.0,
33
- "text_max_length": 400,
34
- "text_min_length": 10.0,
35
  "encodec_sr": 50,
36
- "mask_len_min": 1,
37
- "mask_len_max": 600,
38
- "drop_long": 1,
39
  "eos": 2051,
40
- "reduced_eog": 1,
41
- "special_first": 0,
42
- "n_special": 4,
43
- "codebook_weight": "[2,1,1,1]",
44
- "empty_token": 2048,
45
- "optimizer_name": "AdamW",
46
- "reduce_lr_start_step": 3000,
47
- "reduce_lr_start_epoch": 4,
48
- "clipping_update_period": 1000,
49
  "max_mask_portion": 0.9,
50
  "max_n_spans": 3,
51
- "shuffle_mask_embedding": 0,
52
- "mask_sample_dist": "poisson1",
53
  "min_gap": 5,
54
  "n_codebooks": 4,
55
- "text_vocab_size": 120,
56
- "text_pad_token": 120,
57
- "audio_vocab_size": 2048,
58
- "eog": 2049,
59
- "audio_pad_token": 2050,
60
- "d_model": 1024,
61
- "audio_embedding_dim": 1024,
62
- "text_embedding_dropout": 0.0,
63
- "audio_embedding_dropout": 0.0,
64
- "text_positional_embedding_dropout": 0.0,
65
- "audio_positional_embedding_dropout": 0.0,
66
- "trm_dropout": 0.0,
67
  "nhead": 16,
 
68
  "num_decoder_layers": 24,
69
- "load_model_from": "./pretrained_models/giga330M.pth",
 
 
 
 
70
  "phn2num": {
71
- "\u0251\u02d0": 0,
72
- "u": 1,
 
 
 
 
 
 
 
 
 
 
 
 
 
73
  "a\u026a\u025a": 2,
74
- "\u0254": 3,
75
- "x": 4,
76
- "\u0279": 5,
 
 
77
  "e\u026a": 6,
78
- "\u00f0": 7,
79
- "n\u02b2": 8,
 
 
 
 
 
 
 
 
80
  "m": 9,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81
  "\u00e7": 10,
82
- "\u025b\u0279": 11,
83
- "\u0329": 12,
 
 
 
 
 
 
 
84
  "\u0254\u026a": 13,
85
- "h": 14,
86
- "_": 15,
 
87
  "\u0259l": 16,
88
- "!": 17,
 
89
  "\u025b": 18,
90
- "w": 19,
91
- "b": 20,
92
  "\u025c\u02d0": 21,
93
- "z": 22,
94
- "n": 23,
 
 
 
 
 
 
95
  "\u027e": 24,
96
- "o\u028a": 25,
97
  "\u0283": 26,
98
- "i": 27,
99
  "\u028a\u0279": 28,
100
- "\u0254\u02d0": 29,
101
- "\u03b8": 30,
102
- "v": 31,
103
- "\u00e6": 32,
104
- "\u0254\u02d0\u0279": 33,
105
- "p": 34,
106
- "\u025a": 35,
107
- "a\u028a": 36,
108
- "\u0261\u02b2": 37,
109
  "\u028c": 38,
110
- "<MUSIC>": 39,
111
- "o\u02d0\u0279": 40,
112
- "k": 41,
113
- "i\u0259": 42,
114
- "\u028a": 43,
115
- "\u0251\u02d0\u0279": 44,
116
- "\u0303": 45,
117
- "\u026c": 46,
118
- "u\u02d0": 47,
119
- "a\u026a": 48,
120
- "\u0261": 49,
121
- "\u00e6\u00e6": 50,
122
- "i\u02d0\u02d0": 51,
123
- "<NOISE>": 52,
124
- "<SIL>": 53,
125
- "\u0259": 54,
126
  "\u0292": 55,
127
- "a\u026a\u0259": 56,
128
- "d\u0292": 57,
129
- "\u014b": 58,
130
  "\u0294": 59,
131
- "<OTHER>": 60,
132
- "\u0251": 61,
 
133
  "\u1d7b": 62,
134
- "l": 63,
135
- ",": 64,
136
- "\u026a": 65,
137
- "s": 66,
138
- "j": 67,
139
- "i\u02d0": 68,
140
- "f": 69,
141
- "\u0250": 70,
142
- "\u0250\u0250": 71,
143
- "d": 72,
144
- "t": 73,
145
- "o\u02d0": 74,
146
- "t\u0283": 75,
147
- "\u026a\u0279": 76,
148
- ".": 77,
149
- "?": 78,
150
- "r": 79,
151
- "1": 80,
152
- ";": 81,
153
- "\u025b\u02d0": 82,
154
- "\u0252": 83,
155
- "kh": 84,
156
- "e": 85,
157
- "o": 86,
158
- "t\u0255": 87,
159
- "\u00ab": 88,
160
- "\u00bb": 89,
161
- "\u0259\u028a": 90,
162
- "\u026f": 91,
163
- "\u2026": 92,
164
- ":": 93,
165
- "t\u02b0": 94,
166
- "\u00bf": 95,
167
- "q": 96,
168
- "\"": 97,
169
- "\u00a1": 98,
170
  "\u2014": 99,
171
- "\u026a\u02d0": 100
172
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
173
  }
 
1
  {
2
+ "audio_embedding_dim": 1024,
3
+ "audio_embedding_dropout": 0.0,
4
+ "audio_max_length": 16.0,
5
+ "audio_min_length": 1.0,
6
+ "audio_pad_token": 2050,
7
+ "audio_positional_embedding_dropout": 0.0,
8
+ "audio_vocab_size": 2048,
 
9
  "batch_size": 100,
10
+ "clipping_update_period": 1000,
11
+ "codebook_weight": "[2,1,1,1]",
12
+ "d_model": 1024,
 
 
 
 
 
 
13
  "dataset": "gigaspeech",
14
  "dataset_dir": "/data/scratch/pyp/datasets/gigaspeech_phn_enc_manifest/xl",
15
+ "drop_long": 1,
 
 
 
 
 
 
 
16
  "dynamic_batching": 1,
17
+ "early_stop_step": 3200,
18
+ "early_stop_threshold": -1.0,
19
+ "empty_token": 2048,
20
+ "encodec_folder_name": "encodec_16khz_4codebooks",
21
  "encodec_sr": 50,
22
+ "eog": 2049,
 
 
23
  "eos": 2051,
24
+ "exp_dir": "/data/scratch/pyp/exp_pyp/VoiceCraft/gigaspeech/tts_enhanced_330M",
25
+ "gradient_accumulation_steps": 24,
26
+ "gradient_clip_val": 1.0,
27
+ "load_model_from": "./pretrained_models/giga330M.pth",
28
+ "lr": 1e-05,
29
+ "manifest_name": "manifest_large16khz_lessambi",
30
+ "mask_len_max": 600,
31
+ "mask_len_min": 1,
32
+ "mask_sample_dist": "poisson1",
33
  "max_mask_portion": 0.9,
34
  "max_n_spans": 3,
35
+ "max_num_tokens": 20000,
 
36
  "min_gap": 5,
37
  "n_codebooks": 4,
38
+ "n_special": 4,
 
 
 
 
 
 
 
 
 
 
 
39
  "nhead": 16,
40
+ "num_buckets": 10,
41
  "num_decoder_layers": 24,
42
+ "num_epochs": 10,
43
+ "num_steps": 500000,
44
+ "num_workers": 8,
45
+ "optimizer_name": "AdamW",
46
+ "pad_x": 0,
47
  "phn2num": {
48
+ "!": 17,
49
+ "\"": 97,
50
+ ",": 64,
51
+ ".": 77,
52
+ "1": 80,
53
+ ":": 93,
54
+ ";": 81,
55
+ "<MUSIC>": 39,
56
+ "<NOISE>": 52,
57
+ "<OTHER>": 60,
58
+ "<SIL>": 53,
59
+ "?": 78,
60
+ "_": 15,
61
+ "a\u026a": 48,
62
+ "a\u026a\u0259": 56,
63
  "a\u026a\u025a": 2,
64
+ "a\u028a": 36,
65
+ "b": 20,
66
+ "d": 72,
67
+ "d\u0292": 57,
68
+ "e": 85,
69
  "e\u026a": 6,
70
+ "f": 69,
71
+ "h": 14,
72
+ "i": 27,
73
+ "i\u0259": 42,
74
+ "i\u02d0": 68,
75
+ "i\u02d0\u02d0": 51,
76
+ "j": 67,
77
+ "k": 41,
78
+ "kh": 84,
79
+ "l": 63,
80
  "m": 9,
81
+ "n": 23,
82
+ "n\u02b2": 8,
83
+ "o": 86,
84
+ "o\u028a": 25,
85
+ "o\u02d0": 74,
86
+ "o\u02d0\u0279": 40,
87
+ "p": 34,
88
+ "q": 96,
89
+ "r": 79,
90
+ "s": 66,
91
+ "t": 73,
92
+ "t\u0255": 87,
93
+ "t\u0283": 75,
94
+ "t\u02b0": 94,
95
+ "u": 1,
96
+ "u\u02d0": 47,
97
+ "v": 31,
98
+ "w": 19,
99
+ "x": 4,
100
+ "z": 22,
101
+ "\u00a1": 98,
102
+ "\u00ab": 88,
103
+ "\u00bb": 89,
104
+ "\u00bf": 95,
105
+ "\u00e6": 32,
106
+ "\u00e6\u00e6": 50,
107
  "\u00e7": 10,
108
+ "\u00f0": 7,
109
+ "\u014b": 58,
110
+ "\u0250": 70,
111
+ "\u0250\u0250": 71,
112
+ "\u0251": 61,
113
+ "\u0251\u02d0": 0,
114
+ "\u0251\u02d0\u0279": 44,
115
+ "\u0252": 83,
116
+ "\u0254": 3,
117
  "\u0254\u026a": 13,
118
+ "\u0254\u02d0": 29,
119
+ "\u0254\u02d0\u0279": 33,
120
+ "\u0259": 54,
121
  "\u0259l": 16,
122
+ "\u0259\u028a": 90,
123
+ "\u025a": 35,
124
  "\u025b": 18,
125
+ "\u025b\u0279": 11,
126
+ "\u025b\u02d0": 82,
127
  "\u025c\u02d0": 21,
128
+ "\u0261": 49,
129
+ "\u0261\u02b2": 37,
130
+ "\u026a": 65,
131
+ "\u026a\u0279": 76,
132
+ "\u026a\u02d0": 100,
133
+ "\u026c": 46,
134
+ "\u026f": 91,
135
+ "\u0279": 5,
136
  "\u027e": 24,
 
137
  "\u0283": 26,
138
+ "\u028a": 43,
139
  "\u028a\u0279": 28,
 
 
 
 
 
 
 
 
 
140
  "\u028c": 38,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
141
  "\u0292": 55,
 
 
 
142
  "\u0294": 59,
143
+ "\u0303": 45,
144
+ "\u0329": 12,
145
+ "\u03b8": 30,
146
  "\u1d7b": 62,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
147
  "\u2014": 99,
148
+ "\u2026": 92
149
+ },
150
+ "phn_folder_name": "phonemes",
151
+ "precision": "float16",
152
+ "print_every_n_steps": 400,
153
+ "pseudo_epoch_size": 3000,
154
+ "reduce_lr_start_epoch": 4,
155
+ "reduce_lr_start_step": 3000,
156
+ "reduced_eog": 1,
157
+ "resume": false,
158
+ "seed": 1,
159
+ "shuffle_mask_embedding": 0,
160
+ "special_first": 0,
161
+ "tb_write_every_n_steps": 100,
162
+ "text_embedding_dropout": 0.0,
163
+ "text_max_length": 400,
164
+ "text_min_length": 10.0,
165
+ "text_pad_token": 120,
166
+ "text_positional_embedding_dropout": 0.0,
167
+ "text_vocab_size": 120,
168
+ "trm_dropout": 0.0,
169
+ "val_every_n_steps": 1600,
170
+ "val_max_num_tokens": 6000,
171
+ "warmup_fraction": 0.1,
172
+ "weight_decay": 0.0
173
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1b0d1729b106fab1c4a87b65c9a57fb4f614ed7eb35f3a132484c831b98445c1
3
  size 1293853416
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:80a128677d3be5a846c5cbb2b45ed1f8a4bd4983b40923e56580c6f2a4557f00
3
  size 1293853416