pretrain model
Browse files
scripts/prepare_contrain_dataset.py
CHANGED
@@ -28,4 +28,6 @@ https://huggingface.co/datasets/KingNish/reasoning-base-20k
|
|
28 |
https://huggingface.co/datasets/Magpie-Align/Magpie-Reasoning-150K
|
29 |
https://huggingface.co/datasets/ai2-adapt-dev/openmath-2-math
|
30 |
https://huggingface.co/datasets/thesven/gsm8k-reasoning
|
|
|
|
|
31 |
"""
|
|
|
28 |
https://huggingface.co/datasets/Magpie-Align/Magpie-Reasoning-150K
|
29 |
https://huggingface.co/datasets/ai2-adapt-dev/openmath-2-math
|
30 |
https://huggingface.co/datasets/thesven/gsm8k-reasoning
|
31 |
+
|
32 |
+
https://huggingface.co/datasets/codeparrot/self-instruct-starcoder
|
33 |
"""
|
scripts/prepare_pretrain_dataset.py
CHANGED
@@ -55,277 +55,68 @@ datasets_configs = [
|
|
55 |
for data_dir in [
|
56 |
f'multilingual-instruction-tuning-dataset /multilingual-alpaca-52k-gpt-4/{n}'
|
57 |
for n in [
|
58 |
-
'Afrikaans',
|
59 |
-
'
|
60 |
-
'
|
61 |
-
'
|
62 |
-
'
|
63 |
-
'
|
64 |
-
'
|
65 |
-
'
|
66 |
-
'
|
67 |
-
'
|
68 |
-
'
|
69 |
-
'
|
70 |
-
'
|
71 |
-
'
|
72 |
-
'
|
73 |
-
'
|
74 |
-
'
|
75 |
-
'
|
76 |
-
'
|
77 |
-
'
|
78 |
-
'
|
79 |
-
'
|
80 |
-
'
|
81 |
-
'Danish',
|
82 |
-
'Divehi',
|
83 |
-
'Dogri',
|
84 |
-
'Dutch',
|
85 |
-
'Esperanto',
|
86 |
-
'Estonian',
|
87 |
-
'Ewe',
|
88 |
-
'Filipino',
|
89 |
-
'Finnish',
|
90 |
-
'French',
|
91 |
-
'Frisian',
|
92 |
-
'Galician',
|
93 |
-
'Georgian',
|
94 |
-
'German',
|
95 |
-
'Greek',
|
96 |
-
'Guarani',
|
97 |
-
'Gujarati',
|
98 |
-
'Haitian_Creole',
|
99 |
-
'Hausa',
|
100 |
-
'Hawaiian',
|
101 |
-
'Hebrew',
|
102 |
-
'Hindi',
|
103 |
-
'Hmong',
|
104 |
-
'Hungarian',
|
105 |
-
'Icelandic',
|
106 |
-
'Igbo',
|
107 |
-
'Ilocano',
|
108 |
-
'Indonesian',
|
109 |
-
'Irish',
|
110 |
-
'Italian',
|
111 |
-
'Japanese',
|
112 |
-
'Javanese',
|
113 |
-
'Kannada',
|
114 |
-
'Kazakh',
|
115 |
-
'Khmer',
|
116 |
-
'Kinyarwanda',
|
117 |
-
'Konkani',
|
118 |
-
'Korean',
|
119 |
-
'Krio',
|
120 |
-
'Kurdish_Kurmanji',
|
121 |
-
'Kurdish_Sorani',
|
122 |
-
'Kyrgyz',
|
123 |
-
'Lao',
|
124 |
-
'Latin',
|
125 |
-
'Latvian',
|
126 |
-
'Lingala',
|
127 |
-
'Lithuanian',
|
128 |
-
'Luganda',
|
129 |
-
'Luxembourgish',
|
130 |
-
'Macedonian',
|
131 |
-
'Maithili',
|
132 |
-
'Malagasy',
|
133 |
-
'Malay',
|
134 |
-
'Malayalam',
|
135 |
-
'Maltese',
|
136 |
-
'Maori',
|
137 |
-
'Marathi',
|
138 |
-
'Meiteilon_Manipuri',
|
139 |
-
'Mizo',
|
140 |
-
'Mongolian',
|
141 |
-
'Myanmar_Burmese',
|
142 |
-
'Nepali',
|
143 |
-
'Norwegian',
|
144 |
-
'Odia_Oriya',
|
145 |
-
'Oromo',
|
146 |
-
'Pashto',
|
147 |
-
'Persian',
|
148 |
-
'Polish',
|
149 |
-
'Portuguese',
|
150 |
-
'Punjabi',
|
151 |
-
'Quechua',
|
152 |
-
'Romanian',
|
153 |
-
'Russian',
|
154 |
-
'Samoan',
|
155 |
-
'Sanskrit',
|
156 |
-
'ScottishGaelic',
|
157 |
-
'Sepedi',
|
158 |
-
'Serbian',
|
159 |
-
'Sesotho',
|
160 |
-
'Shona',
|
161 |
-
'Sindhi',
|
162 |
-
'Sinhala',
|
163 |
-
'Slovak',
|
164 |
-
'Slovenian',
|
165 |
-
'Somali',
|
166 |
-
'Spanish',
|
167 |
-
'Sundanese',
|
168 |
-
'Swahili',
|
169 |
-
'Swedish',
|
170 |
-
'Tajik',
|
171 |
-
'Tamil',
|
172 |
-
'Tatar',
|
173 |
-
'Telugu',
|
174 |
-
'Thai',
|
175 |
-
'Tigrinya',
|
176 |
-
'Tsonga',
|
177 |
-
'Turkish',
|
178 |
-
'Turkmen',
|
179 |
-
'Twi',
|
180 |
-
'Ukrainian',
|
181 |
-
'Urdu',
|
182 |
-
'Uyghur',
|
183 |
-
'Uzbek',
|
184 |
-
'Vietnamese',
|
185 |
-
'Welsh',
|
186 |
-
'Xhosa',
|
187 |
-
'Yiddish',
|
188 |
-
'Yoruba',
|
189 |
-
'Zulu',
|
190 |
]
|
191 |
]
|
192 |
],
|
193 |
*[
|
194 |
{'path': 'saillab/taco-datasets', 'data_dir': 'multilingual-instruction-tuning-dataset /multilinugal-dolly-15k/', 'data_files': n, 'split': f'train', 'format': '{instruction} {input} {output}'}
|
195 |
for n in [
|
196 |
-
'Afrikaans.json',
|
197 |
-
'
|
198 |
-
'
|
199 |
-
'
|
200 |
-
'
|
201 |
-
'
|
202 |
-
'
|
203 |
-
'
|
204 |
-
'
|
205 |
-
'
|
206 |
-
'
|
207 |
-
'
|
208 |
-
'
|
209 |
-
'
|
210 |
-
'
|
211 |
-
'
|
212 |
-
'
|
213 |
-
'
|
214 |
-
'
|
215 |
-
'
|
216 |
-
'
|
217 |
-
'
|
218 |
-
'
|
219 |
-
'
|
220 |
-
'
|
221 |
-
'
|
222 |
-
'
|
223 |
-
'
|
224 |
-
'
|
225 |
-
'
|
226 |
-
'
|
227 |
-
'
|
228 |
-
'
|
229 |
-
'French.json',
|
230 |
-
'Frisian.json',
|
231 |
-
'Galician.json',
|
232 |
-
'Georgian.json',
|
233 |
-
'German.json',
|
234 |
-
'Greek.json',
|
235 |
-
'Guarani.json',
|
236 |
-
'Gujarati.json',
|
237 |
-
'Haitian_Creole.json',
|
238 |
-
'Hausa.json',
|
239 |
-
'Hawaiian.json',
|
240 |
-
'Hebrew.json',
|
241 |
-
'Hindi.json',
|
242 |
-
'Hmong.json',
|
243 |
-
'Hungarian.json',
|
244 |
-
'Icelandic.json',
|
245 |
-
'Igbo.json',
|
246 |
-
'Ilocano.json',
|
247 |
-
'Indonesian.json',
|
248 |
-
'Irish.json',
|
249 |
-
'Italian.json',
|
250 |
-
'Japanese.json',
|
251 |
-
'Javanese.json',
|
252 |
-
'Kannada.json',
|
253 |
-
'Kazakh.json',
|
254 |
-
'Khmer.json',
|
255 |
-
'Kinyarwanda.json',
|
256 |
-
'Konkani.json',
|
257 |
-
'Korean.json',
|
258 |
-
'Krio.json',
|
259 |
-
'Kurdish_Kurmanji.json',
|
260 |
-
'Kurdish_Sorani.json',
|
261 |
-
'Kyrgyz.json',
|
262 |
-
'Lao.json',
|
263 |
-
'Latin.json',
|
264 |
-
'Latvian.json',
|
265 |
-
'Lingala.json',
|
266 |
-
'Lithuanian.json',
|
267 |
-
'Luganda.json',
|
268 |
-
'Luxembourgish.json',
|
269 |
-
'Macedonian.json',
|
270 |
-
'Maithili.json',
|
271 |
-
'Malagasy.json',
|
272 |
-
'Malayalam.json',
|
273 |
-
'Malay.json',
|
274 |
-
'Maltese.json',
|
275 |
-
'Maori.json',
|
276 |
-
'Marathi.json',
|
277 |
-
'Meiteilon_Manipuri.json',
|
278 |
-
'Mizo.json',
|
279 |
-
'Mongolian.json',
|
280 |
-
'Myanmar_Burmese.json',
|
281 |
-
'Nepali.json',
|
282 |
-
'Norwegian.json',
|
283 |
-
'Odia_Oriya.json',
|
284 |
-
'Oromo.json',
|
285 |
-
'Pashto.json',
|
286 |
-
'Persian.json',
|
287 |
-
'Polish.json',
|
288 |
-
'Portuguese.json',
|
289 |
-
'Punjabi.json',
|
290 |
-
'Quechua.json',
|
291 |
-
'Romanian.json',
|
292 |
-
'Russian.json',
|
293 |
-
'Samoan.json',
|
294 |
-
'Sanskrit.json',
|
295 |
-
'ScottishGaelic.json',
|
296 |
-
'Sepedi.json',
|
297 |
-
'Serbian.json',
|
298 |
-
'Sesotho.json',
|
299 |
-
'Shona.json',
|
300 |
-
'Sindhi.json',
|
301 |
-
'Sinhala.json',
|
302 |
-
'Slovak.json',
|
303 |
-
'Slovenian.json',
|
304 |
-
'Somali.json',
|
305 |
-
'Spanish.json',
|
306 |
-
'Sundanese.json',
|
307 |
-
'Swahili.json',
|
308 |
-
'Swedish.json',
|
309 |
-
'Tajik.json',
|
310 |
-
'Tamil.json',
|
311 |
-
'Tatar.json',
|
312 |
-
'Telugu.json',
|
313 |
-
'Thai.json',
|
314 |
-
'Tigrinya.json',
|
315 |
-
'Tsonga.json',
|
316 |
-
'Turkish.json',
|
317 |
-
'Turkmen.json',
|
318 |
-
'Twi.json',
|
319 |
-
'Ukrainian.json',
|
320 |
-
'Urdu.json',
|
321 |
-
'Uyghur.json',
|
322 |
-
'Uzbek.json',
|
323 |
-
'Vietnamese.json',
|
324 |
-
'Welsh.json',
|
325 |
-
'Xhosa.json',
|
326 |
-
'Yiddish.json',
|
327 |
-
'Yoruba.json',
|
328 |
-
'Zulu.json',
|
329 |
]
|
330 |
],
|
331 |
*[
|
|
|
55 |
for data_dir in [
|
56 |
f'multilingual-instruction-tuning-dataset /multilingual-alpaca-52k-gpt-4/{n}'
|
57 |
for n in [
|
58 |
+
'Afrikaans', 'Albanian', 'Amharic', 'Arabic', 'Armenian', 'Assamese',
|
59 |
+
'Aymara', 'Azerbaijani', 'Bambara', 'Basque', 'Belarusian', 'Bengali',
|
60 |
+
'Bhojpuri', 'Bosnian', 'Bulgarian', 'Catalan', 'Cebuano', 'Chichewa',
|
61 |
+
'ChineseSimplified', 'ChineseTraditional', 'Corsican', 'Croatian',
|
62 |
+
'Czech', 'Danish', 'Divehi', 'Dogri', 'Dutch', 'Esperanto', 'Estonian',
|
63 |
+
'Ewe', 'Filipino', 'Finnish', 'French', 'Frisian', 'Galician',
|
64 |
+
'Georgian', 'German', 'Greek', 'Guarani', 'Gujarati', 'Haitian_Creole',
|
65 |
+
'Hausa', 'Hawaiian', 'Hebrew', 'Hindi', 'Hmong', 'Hungarian',
|
66 |
+
'Icelandic', 'Igbo', 'Ilocano', 'Indonesian', 'Irish', 'Italian',
|
67 |
+
'Japanese', 'Javanese', 'Kannada', 'Kazakh', 'Khmer', 'Kinyarwanda',
|
68 |
+
'Konkani', 'Korean', 'Krio', 'Kurdish_Kurmanji', 'Kurdish_Sorani',
|
69 |
+
'Kyrgyz', 'Lao', 'Latin', 'Latvian', 'Lingala', 'Lithuanian',
|
70 |
+
'Luganda', 'Luxembourgish', 'Macedonian', 'Maithili', 'Malagasy',
|
71 |
+
'Malay', 'Malayalam', 'Maltese', 'Maori', 'Marathi', 'Meiteilon_Manipuri',
|
72 |
+
'Mizo', 'Mongolian', 'Myanmar_Burmese', 'Nepali', 'Norwegian',
|
73 |
+
'Odia_Oriya', 'Oromo', 'Pashto', 'Persian', 'Polish', 'Portuguese',
|
74 |
+
'Punjabi', 'Quechua', 'Romanian', 'Russian', 'Samoan', 'Sanskrit',
|
75 |
+
'ScottishGaelic', 'Sepedi', 'Serbian', 'Sesotho', 'Shona', 'Sindhi',
|
76 |
+
'Sinhala', 'Slovak', 'Slovenian', 'Somali', 'Spanish', 'Sundanese',
|
77 |
+
'Swahili', 'Swedish', 'Tajik', 'Tamil', 'Tatar', 'Telugu', 'Thai',
|
78 |
+
'Tigrinya', 'Tsonga', 'Turkish', 'Turkmen', 'Twi', 'Ukrainian',
|
79 |
+
'Urdu', 'Uyghur', 'Uzbek', 'Vietnamese', 'Welsh', 'Xhosa',
|
80 |
+
'Yiddish', 'Yoruba', 'Zulu',
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
81 |
]
|
82 |
]
|
83 |
],
|
84 |
*[
|
85 |
{'path': 'saillab/taco-datasets', 'data_dir': 'multilingual-instruction-tuning-dataset /multilinugal-dolly-15k/', 'data_files': n, 'split': f'train', 'format': '{instruction} {input} {output}'}
|
86 |
for n in [
|
87 |
+
'Afrikaans.json', 'Albanian.json', 'Amharic.json', 'Arabic.json', 'Armenian.json',
|
88 |
+
'Assamese.json', 'Aymara.json', 'Azerbaijani.json', 'Bambara.json', 'Basque.json',
|
89 |
+
'Belarusian.json', 'Bengali.json', 'Bhojpuri.json', 'Bosnian.json', 'Bulgarian.json',
|
90 |
+
'Catalan.json', 'Cebuano.json', 'Chichewa.json', 'ChineseSimplified.json',
|
91 |
+
'ChineseTraditional.json', 'Corsican.json', 'Croatian.json', 'Czech.json',
|
92 |
+
'Danish.json', 'Dhivehi.json', 'Dogri.json', 'Dutch.json', 'English.json',
|
93 |
+
'Esperanto.json', 'Estonian.json', 'Ewe.json', 'Filipino.json',
|
94 |
+
'Finnish.json', 'French.json', 'Frisian.json', 'Galician.json',
|
95 |
+
'Georgian.json', 'German.json', 'Greek.json', 'Guarani.json',
|
96 |
+
'Gujarati.json', 'Haitian_Creole.json', 'Hausa.json', 'Hawaiian.json',
|
97 |
+
'Hebrew.json', 'Hindi.json', 'Hmong.json', 'Hungarian.json',
|
98 |
+
'Icelandic.json', 'Igbo.json', 'Ilocano.json', 'Indonesian.json',
|
99 |
+
'Irish.json', 'Italian.json', 'Japanese.json', 'Javanese.json',
|
100 |
+
'Kannada.json', 'Kazakh.json', 'Khmer.json', 'Kinyarwanda.json',
|
101 |
+
'Konkani.json', 'Korean.json', 'Krio.json', 'Kurdish_Kurmanji.json',
|
102 |
+
'Kurdish_Sorani.json', 'Kyrgyz.json', 'Lao.json', 'Latin.json',
|
103 |
+
'Latvian.json', 'Lingala.json', 'Lithuanian.json', 'Luganda.json',
|
104 |
+
'Luxembourgish.json', 'Macedonian.json', 'Maithili.json',
|
105 |
+
'Malagasy.json', 'Malayalam.json', 'Malay.json', 'Maltese.json',
|
106 |
+
'Maori.json', 'Marathi.json', 'Meiteilon_Manipuri.json',
|
107 |
+
'Mizo.json', 'Mongolian.json', 'Myanmar_Burmese.json',
|
108 |
+
'Nepali.json', 'Norwegian.json', 'Odia_Oriya.json', 'Oromo.json',
|
109 |
+
'Pashto.json', 'Persian.json', 'Polish.json', 'Portuguese.json',
|
110 |
+
'Punjabi.json', 'Quechua.json', 'Romanian.json', 'Russian.json',
|
111 |
+
'Samoan.json', 'Sanskrit.json', 'ScottishGaelic.json', 'Sepedi.json',
|
112 |
+
'Serbian.json', 'Sesotho.json', 'Shona.json', 'Sindhi.json',
|
113 |
+
'Sinhala.json', 'Slovak.json', 'Slovenian.json', 'Somali.json',
|
114 |
+
'Spanish.json', 'Sundanese.json', 'Swahili.json', 'Swedish.json',
|
115 |
+
'Tajik.json', 'Tamil.json', 'Tatar.json', 'Telugu.json', 'Thai.json',
|
116 |
+
'Tigrinya.json', 'Tsonga.json', 'Turkish.json', 'Turkmen.json',
|
117 |
+
'Twi.json', 'Ukrainian.json', 'Urdu.json', 'Uyghur.json', 'Uzbek.json',
|
118 |
+
'Vietnamese.json', 'Welsh.json', 'Xhosa.json', 'Yiddish.json',
|
119 |
+
'Yoruba.json', 'Zulu.json',
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
120 |
]
|
121 |
],
|
122 |
*[
|
scripts/pretrain-model.yaml
CHANGED
@@ -67,7 +67,7 @@ train:
|
|
67 |
|
68 |
# Number of samples per data-parallel rank (type: int, default: 4)
|
69 |
# micro_batch_size: 16
|
70 |
-
micro_batch_size:
|
71 |
|
72 |
# Number of iterations with learning rate warmup active (type: int, default: 2000)
|
73 |
lr_warmup_steps: 2000
|
|
|
67 |
|
68 |
# Number of samples per data-parallel rank (type: int, default: 4)
|
69 |
# micro_batch_size: 16
|
70 |
+
micro_batch_size: 12
|
71 |
|
72 |
# Number of iterations with learning rate warmup active (type: int, default: 2000)
|
73 |
lr_warmup_steps: 2000
|