multilingual dataset
- scripts/prepare_pretrain_dataset.py +294 -17
- scripts/pretrain-model.yaml +7 -10
scripts/prepare_pretrain_dataset.py
@@ -42,15 +42,294 @@ datasets_configs = [
     {'path': 'VMware/open-instruct', 'format': '{instruction} {response}'},

     # multilingual
+    # *[
+    #     {'path': 'saillab/taco-datasets', 'data_dir': data_dir, 'split': f'train[{i}%:{i + 1}%]', 'format': '{instruction} {input} {output}'}
+    #     for data_dir in [
+    #         'multilingual-instruction-tuning-dataset /multilingual-alpaca-52k-gpt-4',
+    #         'multilingual-instruction-tuning-dataset /multilinugal-dolly-15k',
+    #     ]
+    #     for i in range(0, 100, 10)
+    # ],
     *[
-        {'path': 'saillab/taco-datasets', 'data_dir': data_dir, 'split': 'train…
+        {'path': 'saillab/taco-datasets', 'data_dir': data_dir, 'split': f'train', 'format': '{instruction} {input} {output}'}
         for data_dir in [
-            'multilingual-instruction-tuning-dataset /multilingual-alpaca-52k-gpt-4',
-            'multilingual-instruction-tuning-dataset /multilinugal-dolly-15k',
+            f'multilingual-instruction-tuning-dataset /multilingual-alpaca-52k-gpt-4/{n}'
+            for n in [
+                'Afrikaans',
+                'Albanian',
+                'Amharic',
+                'Arabic',
+                'Armenian',
+                'Assamese',
+                'Aymara',
+                'Azerbaijani',
+                'Bambara',
+                'Basque',
+                'Belarusian',
+                'Bengali',
+                'Bhojpuri',
+                'Bosnian',
+                'Bulgarian',
+                'Catalan',
+                'Cebuano',
+                'Chichewa',
+                'ChineseSimplified',
+                'ChineseTraditional',
+                'Corsican',
+                'Croatian',
+                'Czech',
+                'Danish',
+                'Divehi',
+                'Dogri',
+                'Dutch',
+                'Esperanto',
+                'Estonian',
+                'Ewe',
+                'Filipino',
+                'Finnish',
+                'French',
+                'Frisian',
+                'Galician',
+                'Georgian',
+                'German',
+                'Greek',
+                'Guarani',
+                'Gujarati',
+                'Haitian_Creole',
+                'Hausa',
+                'Hawaiian',
+                'Hebrew',
+                'Hindi',
+                'Hmong',
+                'Hungarian',
+                'Icelandic',
+                'Igbo',
+                'Ilocano',
+                'Indonesian',
+                'Irish',
+                'Italian',
+                'Japanese',
+                'Javanese',
+                'Kannada',
+                'Kazakh',
+                'Khmer',
+                'Kinyarwanda',
+                'Konkani',
+                'Korean',
+                'Krio',
+                'Kurdish_Kurmanji',
+                'Kurdish_Sorani',
+                'Kyrgyz',
+                'Lao',
+                'Latin',
+                'Latvian',
+                'Lingala',
+                'Lithuanian',
+                'Luganda',
+                'Luxembourgish',
+                'Macedonian',
+                'Maithili',
+                'Malagasy',
+                'Malay',
+                'Malayalam',
+                'Maltese',
+                'Maori',
+                'Marathi',
+                'Meiteilon_Manipuri',
+                'Mizo',
+                'Mongolian',
+                'Myanmar_Burmese',
+                'Nepali',
+                'Norwegian',
+                'Odia_Oriya',
+                'Oromo',
+                'Pashto',
+                'Persian',
+                'Polish',
+                'Portuguese',
+                'Punjabi',
+                'Quechua',
+                'Romanian',
+                'Russian',
+                'Samoan',
+                'Sanskrit',
+                'ScottishGaelic',
+                'Sepedi',
+                'Serbian',
+                'Sesotho',
+                'Shona',
+                'Sindhi',
+                'Sinhala',
+                'Slovak',
+                'Slovenian',
+                'Somali',
+                'Spanish',
+                'Sundanese',
+                'Swahili',
+                'Swedish',
+                'Tajik',
+                'Tamil',
+                'Tatar',
+                'Telugu',
+                'Thai',
+                'Tigrinya',
+                'Tsonga',
+                'Turkish',
+                'Turkmen',
+                'Twi',
+                'Ukrainian',
+                'Urdu',
+                'Uyghur',
+                'Uzbek',
+                'Vietnamese',
+                'Welsh',
+                'Xhosa',
+                'Yiddish',
+                'Yoruba',
+                'Zulu',
+            ]
+        ]
+    ],
+    *[
+        {'path': 'saillab/taco-datasets', 'data_dir': 'multilingual-instruction-tuning-dataset /multilinugal-dolly-15k/', 'data_files': n, 'split': f'train', 'format': '{instruction} {input} {output}'}
+        for n in [
+            'Afrikaans.json',
+            'Albanian.json',
+            'Amharic.json',
+            'Arabic.json',
+            'Armenian.json',
+            'Assamese.json',
+            'Aymara.json',
+            'Azerbaijani.json',
+            'Bambara.json',
+            'Basque.json',
+            'Belarusian.json',
+            'Bengali.json',
+            'Bhojpuri.json',
+            'Bosnian.json',
+            'Bulgarian.json',
+            'Catalan.json',
+            'Cebuano.json',
+            'Chichewa.json',
+            'ChineseSimplified.json',
+            'ChineseTraditional.json',
+            'Corsican.json',
+            'Croatian.json',
+            'Czech.json',
+            'Danish.json',
+            'Dhivehi.json',
+            'Dogri.json',
+            'Dutch.json',
+            'English.json',
+            'Esperanto.json',
+            'Estonian.json',
+            'Ewe.json',
+            'Filipino.json',
+            'Finnish.json',
+            'French.json',
+            'Frisian.json',
+            'Galician.json',
+            'Georgian.json',
+            'German.json',
+            'Greek.json',
+            'Guarani.json',
+            'Gujarati.json',
+            'Haitian_Creole.json',
+            'Hausa.json',
+            'Hawaiian.json',
+            'Hebrew.json',
+            'Hindi.json',
+            'Hmong.json',
+            'Hungarian.json',
+            'Icelandic.json',
+            'Igbo.json',
+            'Ilocano.json',
+            'Indonesian.json',
+            'Irish.json',
+            'Italian.json',
+            'Japanese.json',
+            'Javanese.json',
+            'Kannada.json',
+            'Kazakh.json',
+            'Khmer.json',
+            'Kinyarwanda.json',
+            'Konkani.json',
+            'Korean.json',
+            'Krio.json',
+            'Kurdish_Kurmanji.json',
+            'Kurdish_Sorani.json',
+            'Kyrgyz.json',
+            'Lao.json',
+            'Latin.json',
+            'Latvian.json',
+            'Lingala.json',
+            'Lithuanian.json',
+            'Luganda.json',
+            'Luxembourgish.json',
+            'Macedonian.json',
+            'Maithili.json',
+            'Malagasy.json',
+            'Malayalam.json',
+            'Malay.json',
+            'Maltese.json',
+            'Maori.json',
+            'Marathi.json',
+            'Meiteilon_Manipuri.json',
+            'Mizo.json',
+            'Mongolian.json',
+            'Myanmar_Burmese.json',
+            'Nepali.json',
+            'Norwegian.json',
+            'Odia_Oriya.json',
+            'Oromo.json',
+            'Pashto.json',
+            'Persian.json',
+            'Polish.json',
+            'Portuguese.json',
+            'Punjabi.json',
+            'Quechua.json',
+            'Romanian.json',
+            'Russian.json',
+            'Samoan.json',
+            'Sanskrit.json',
+            'ScottishGaelic.json',
+            'Sepedi.json',
+            'Serbian.json',
+            'Sesotho.json',
+            'Shona.json',
+            'Sindhi.json',
+            'Sinhala.json',
+            'Slovak.json',
+            'Slovenian.json',
+            'Somali.json',
+            'Spanish.json',
+            'Sundanese.json',
+            'Swahili.json',
+            'Swedish.json',
+            'Tajik.json',
+            'Tamil.json',
+            'Tatar.json',
+            'Telugu.json',
+            'Thai.json',
+            'Tigrinya.json',
+            'Tsonga.json',
+            'Turkish.json',
+            'Turkmen.json',
+            'Twi.json',
+            'Ukrainian.json',
+            'Urdu.json',
+            'Uyghur.json',
+            'Uzbek.json',
+            'Vietnamese.json',
+            'Welsh.json',
+            'Xhosa.json',
+            'Yiddish.json',
+            'Yoruba.json',
+            'Zulu.json',
         ]
     ],
     *[
-        {'path': 'xu-song/cc100-samples', 'name': name, 'split': 'train[…
+        {'path': 'xu-song/cc100-samples', 'name': name, 'split': f'train[{i}%:{i + 1}%]', 'format': '{text}'}
         for name in [
             'am', 'ar', 'as', 'az', 'be', 'bg', 'bn', 'bn_rom', 'br',
             'bs', 'ca', 'cs', 'cy', 'da', 'de', 'el', 'en', 'eo', 'es',
@@ -66,17 +345,12 @@ datasets_configs = [
             'ur_rom', 'uz', 'vi', 'wo', 'xh', 'yi', 'yo',
             'zh-Hans', 'zh-Hant', 'zu',
         ]
+        for i in range(0, 100, 10)
+    ],
+    *[
+        {'path': 'jordiclive/wikipedia-summary-dataset', 'split': f'train[{i}%:{i + 5}%]', 'format': '{summary}'}
+        for i in range(0, 100, 5)
     ],
-    # *[
-    #     {'path': 'Salesforce/wikitext', 'name': name, 'split': 'train+validation+test', 'format': '{text}'}
-    #     for name in [
-    #         'wikitext-103-raw-v1',
-    #         'wikitext-103-v1',
-    #         'wikitext-2-raw-v1',
-    #         'wikitext-2-v1',
-    #     ]
-    # ],
-    {'path': 'jordiclive/wikipedia-summary-dataset', 'format': '{summary}'},
     # {'path': 'ontocord/fineweb-permissive-multilingual-2m', 'split': 'train[:5%]', 'format': '{text}'},

     # general
@@ -86,9 +360,9 @@ datasets_configs = [
     # {'path': 'keivalya/MedQuad-MedicalQnADataset', 'split': 'train', 'format': '{Question} {Answer}'},
     # {'path': 'NousResearch/CharacterCodex', 'split': 'train', 'format': '{scenario} {description}'},
     # {'path': 'nampdn-ai/tiny-textbooks', 'split': 'train+test', 'format': '{textbook}'},
-
+
     # code
-    {'path': 'nampdn-ai/tiny-codes', 'split': 'train[:5%]', 'format': '{prompt} {response}'},
+    # {'path': 'nampdn-ai/tiny-codes', 'split': 'train[:5%]', 'format': '{prompt} {response}'},
     *[
         {'path': 'bigcode/the-stack-smol-xs', 'name': name, 'format': '{content}'}
         for name in [
@@ -118,7 +392,10 @@ datasets_configs = [

     # math
     {'path': 'fblgit/simple-math', 'revision': 'refs/convert/parquet', 'split': 'train+test', 'format': '{instruction} = {output}'},
-
+    *[
+        {'path': 'gair-prox/open-web-math-pro', 'split': f'train[{i}%:{i + 1}%]', 'format': '{text}'}
+        for i in range(0, 100, 20)
+    ],
     {'path': 'rvv-karma/Math-QA', 'split': 'train+val+test', 'format': '{question} {answer}'},
     {'path': 'ajibawa-2023/Maths-College', 'split': 'train', 'format': '{instruction} {output}'},
     {'path': 'microsoft/orca-math-word-problems-200k', 'format': '{question} {answer}'},
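Each `datasets_configs` entry is a set of Hugging Face `datasets.load_dataset()` keyword arguments plus a `format` template naming the columns to concatenate. The new f-string splits shard a corpus into percent slices: `range(0, 100, 10)` with `train[{i}%:{i + 1}%]` keeps ten 1% stripes (a spread-out 10% sample), while `range(0, 100, 5)` with `train[{i}%:{i + 5}%]` covers the whole split in twenty chunks. Below is a minimal sketch of how one such entry could be consumed; the actual loader is outside this hunk, and `iter_formatted_rows` is a hypothetical name used only for illustration.

```python
# Sketch only: the real consumer lives elsewhere in prepare_pretrain_dataset.py;
# 'iter_formatted_rows' is a hypothetical helper, not code from this commit.
from datasets import load_dataset

def iter_formatted_rows(config: dict):
    kwargs = dict(config)
    template = kwargs.pop('format')            # e.g. '{instruction} {input} {output}'
    dataset = load_dataset(kwargs.pop('path'), **kwargs)
    for row in dataset:
        # str.format pulls the named columns out of the row dict.
        yield template.format(**row)

# One of the new cc100 slices: rows 10%..11% of the 'ar' train split.
config = {'path': 'xu-song/cc100-samples', 'name': 'ar',
          'split': 'train[10%:11%]', 'format': '{text}'}
for text in iter_formatted_rows(config):
    print(text[:80])
    break
```

Splitting one logical dataset into many small slice configs keeps each `load_dataset` call (and its cache shard) small, and interleaves languages in the resulting training stream instead of concatenating one full corpus after another.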
scripts/pretrain-model.yaml
@@ -8,10 +8,10 @@ model_config:
   padded_vocab_size: 38400
   vocab_size: 38400
   block_size: 8192
-  n_layer:
+  n_layer: 32
   n_head: 32
   head_size: null
-  n_embd:
+  n_embd: 256
   n_query_groups: 8
   rotary_percentage: 1.0
   parallel_residual: false
@@ -19,7 +19,7 @@ model_config:
   norm_class_name: "RMSNorm"
   norm_eps: 1e-05
   mlp_class_name: "LLaMAMLP"
-  intermediate_size:
+  intermediate_size: 1024
   rope_base: 500000
   # rope_adjustments:
   #   factor: 32.0
@@ -77,17 +77,16 @@ train:

   # Total number of tokens to train on (type: Optional[int], default: 3000000000000)
   # max_tokens: 3000000000000
-
-  max_tokens: 11422750857 # 796399 * 2049 * 7
+  max_tokens: 8159107755 # 796399 * 2049 * 5

   # Limits the number of optimizer steps to run. (type: Optional[int], default: null)
   max_steps:

   # Limits the length of samples. Off by default (type: Optional[int], default: null)
-  max_seq_length:
+  max_seq_length: 2049

   # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: False)
-  tie_embeddings:
+  tie_embeddings: true

   # (type: Optional[float], default: 1.0)
   max_norm: 1.0
@@ -121,11 +120,9 @@ optimizer:

   init_args:
     # (type: float, default: 0.001)
-
-    lr: 1e-4
+    lr: 1e-3

     # (type: float, default: 0.01)
-    # weight_decay: 0.01
     weight_decay: 0.1

     # (type: tuple, default: (0.9,0.999))
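The `max_tokens` comments encode the budget as sequences × tokens-per-sequence × passes: 796,399 packed sequences of 2,049 tokens each (matching the new `max_seq_length: 2049`, presumably a 2,048-token block plus one shifted label token), with this commit cutting the schedule from seven passes to five. Both figures check out:

```python
# Sanity-check the token-budget comments in pretrain-model.yaml.
sequences, seq_len = 796_399, 2_049                 # seq_len matches max_seq_length

assert sequences * seq_len * 7 == 11_422_750_857    # old budget: 7 passes
assert sequences * seq_len * 5 == 8_159_107_755     # new budget: 5 passes
print(f"per pass: {sequences * seq_len:,} tokens")  # per pass: 1,631,821,551 tokens
```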
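With `n_layer`, `n_embd`, and `intermediate_size` filled in, the model's rough size follows directly from the config. Below is a back-of-the-envelope count, assuming the standard LitGPT Llama-style layout (no biases, `head_size = n_embd / n_head` when null, grouped-query K/V projections, and the embedding matrix shared with the LM head via `tie_embeddings: true`); treat it as an estimate, not the framework's own accounting.

```python
# Rough parameter count for the model_config above (sketch; assumes the
# usual LitGPT Llama-style layout and ignores any extra buffers).
n_layer, n_embd, n_head, n_query_groups = 32, 256, 32, 8
intermediate_size, vocab = 1024, 38_400
head_size = n_embd // n_head                    # head_size: null -> 256 / 32 = 8

q   = n_embd * n_head * head_size               # query projection
kv  = 2 * n_embd * n_query_groups * head_size   # grouped K and V projections
o   = n_head * head_size * n_embd               # output projection
mlp = 3 * n_embd * intermediate_size            # LLaMAMLP: fc_1, fc_2, proj
norms = 2 * n_embd                              # two RMSNorm weight vectors

block = q + kv + o + mlp + norms
embed = vocab * n_embd                          # shared with lm_head (tie_embeddings: true)
total = n_layer * block + embed + n_embd        # + final norm

print(f"{total / 1e6:.1f}M parameters")         # ~40.3M
```

That puts this run in the ~40M-parameter range, with the transformer blocks (~30.4M) outweighing the 38,400 × 256 embedding table (~9.8M) about three to one.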