mtasic85 committed on
Commit
6d9772d
1 Parent(s): a4a75cd

pretrain model

Browse files
scripts/prepare_contrain_dataset.py CHANGED
@@ -28,4 +28,6 @@ https://huggingface.co/datasets/KingNish/reasoning-base-20k
28
  https://huggingface.co/datasets/Magpie-Align/Magpie-Reasoning-150K
29
  https://huggingface.co/datasets/ai2-adapt-dev/openmath-2-math
30
  https://huggingface.co/datasets/thesven/gsm8k-reasoning
 
 
31
  """
 
28
  https://huggingface.co/datasets/Magpie-Align/Magpie-Reasoning-150K
29
  https://huggingface.co/datasets/ai2-adapt-dev/openmath-2-math
30
  https://huggingface.co/datasets/thesven/gsm8k-reasoning
31
+
32
+ https://huggingface.co/datasets/codeparrot/self-instruct-starcoder
33
  """
scripts/prepare_pretrain_dataset.py CHANGED
@@ -55,277 +55,68 @@ datasets_configs = [
55
  for data_dir in [
56
  f'multilingual-instruction-tuning-dataset /multilingual-alpaca-52k-gpt-4/{n}'
57
  for n in [
58
- 'Afrikaans',
59
- 'Albanian',
60
- 'Amharic',
61
- 'Arabic',
62
- 'Armenian',
63
- 'Assamese',
64
- 'Aymara',
65
- 'Azerbaijani',
66
- 'Bambara',
67
- 'Basque',
68
- 'Belarusian',
69
- 'Bengali',
70
- 'Bhojpuri',
71
- 'Bosnian',
72
- 'Bulgarian',
73
- 'Catalan',
74
- 'Cebuano',
75
- 'Chichewa',
76
- 'ChineseSimplified',
77
- 'ChineseTraditional',
78
- 'Corsican',
79
- 'Croatian',
80
- 'Czech',
81
- 'Danish',
82
- 'Divehi',
83
- 'Dogri',
84
- 'Dutch',
85
- 'Esperanto',
86
- 'Estonian',
87
- 'Ewe',
88
- 'Filipino',
89
- 'Finnish',
90
- 'French',
91
- 'Frisian',
92
- 'Galician',
93
- 'Georgian',
94
- 'German',
95
- 'Greek',
96
- 'Guarani',
97
- 'Gujarati',
98
- 'Haitian_Creole',
99
- 'Hausa',
100
- 'Hawaiian',
101
- 'Hebrew',
102
- 'Hindi',
103
- 'Hmong',
104
- 'Hungarian',
105
- 'Icelandic',
106
- 'Igbo',
107
- 'Ilocano',
108
- 'Indonesian',
109
- 'Irish',
110
- 'Italian',
111
- 'Japanese',
112
- 'Javanese',
113
- 'Kannada',
114
- 'Kazakh',
115
- 'Khmer',
116
- 'Kinyarwanda',
117
- 'Konkani',
118
- 'Korean',
119
- 'Krio',
120
- 'Kurdish_Kurmanji',
121
- 'Kurdish_Sorani',
122
- 'Kyrgyz',
123
- 'Lao',
124
- 'Latin',
125
- 'Latvian',
126
- 'Lingala',
127
- 'Lithuanian',
128
- 'Luganda',
129
- 'Luxembourgish',
130
- 'Macedonian',
131
- 'Maithili',
132
- 'Malagasy',
133
- 'Malay',
134
- 'Malayalam',
135
- 'Maltese',
136
- 'Maori',
137
- 'Marathi',
138
- 'Meiteilon_Manipuri',
139
- 'Mizo',
140
- 'Mongolian',
141
- 'Myanmar_Burmese',
142
- 'Nepali',
143
- 'Norwegian',
144
- 'Odia_Oriya',
145
- 'Oromo',
146
- 'Pashto',
147
- 'Persian',
148
- 'Polish',
149
- 'Portuguese',
150
- 'Punjabi',
151
- 'Quechua',
152
- 'Romanian',
153
- 'Russian',
154
- 'Samoan',
155
- 'Sanskrit',
156
- 'ScottishGaelic',
157
- 'Sepedi',
158
- 'Serbian',
159
- 'Sesotho',
160
- 'Shona',
161
- 'Sindhi',
162
- 'Sinhala',
163
- 'Slovak',
164
- 'Slovenian',
165
- 'Somali',
166
- 'Spanish',
167
- 'Sundanese',
168
- 'Swahili',
169
- 'Swedish',
170
- 'Tajik',
171
- 'Tamil',
172
- 'Tatar',
173
- 'Telugu',
174
- 'Thai',
175
- 'Tigrinya',
176
- 'Tsonga',
177
- 'Turkish',
178
- 'Turkmen',
179
- 'Twi',
180
- 'Ukrainian',
181
- 'Urdu',
182
- 'Uyghur',
183
- 'Uzbek',
184
- 'Vietnamese',
185
- 'Welsh',
186
- 'Xhosa',
187
- 'Yiddish',
188
- 'Yoruba',
189
- 'Zulu',
190
  ]
191
  ]
192
  ],
193
  *[
194
  {'path': 'saillab/taco-datasets', 'data_dir': 'multilingual-instruction-tuning-dataset /multilinugal-dolly-15k/', 'data_files': n, 'split': f'train', 'format': '{instruction} {input} {output}'}
195
  for n in [
196
- 'Afrikaans.json',
197
- 'Albanian.json',
198
- 'Amharic.json',
199
- 'Arabic.json',
200
- 'Armenian.json',
201
- 'Assamese.json',
202
- 'Aymara.json',
203
- 'Azerbaijani.json',
204
- 'Bambara.json',
205
- 'Basque.json',
206
- 'Belarusian.json',
207
- 'Bengali.json',
208
- 'Bhojpuri.json',
209
- 'Bosnian.json',
210
- 'Bulgarian.json',
211
- 'Catalan.json',
212
- 'Cebuano.json',
213
- 'Chichewa.json',
214
- 'ChineseSimplified.json',
215
- 'ChineseTraditional.json',
216
- 'Corsican.json',
217
- 'Croatian.json',
218
- 'Czech.json',
219
- 'Danish.json',
220
- 'Dhivehi.json',
221
- 'Dogri.json',
222
- 'Dutch.json',
223
- 'English.json',
224
- 'Esperanto.json',
225
- 'Estonian.json',
226
- 'Ewe.json',
227
- 'Filipino.json',
228
- 'Finnish.json',
229
- 'French.json',
230
- 'Frisian.json',
231
- 'Galician.json',
232
- 'Georgian.json',
233
- 'German.json',
234
- 'Greek.json',
235
- 'Guarani.json',
236
- 'Gujarati.json',
237
- 'Haitian_Creole.json',
238
- 'Hausa.json',
239
- 'Hawaiian.json',
240
- 'Hebrew.json',
241
- 'Hindi.json',
242
- 'Hmong.json',
243
- 'Hungarian.json',
244
- 'Icelandic.json',
245
- 'Igbo.json',
246
- 'Ilocano.json',
247
- 'Indonesian.json',
248
- 'Irish.json',
249
- 'Italian.json',
250
- 'Japanese.json',
251
- 'Javanese.json',
252
- 'Kannada.json',
253
- 'Kazakh.json',
254
- 'Khmer.json',
255
- 'Kinyarwanda.json',
256
- 'Konkani.json',
257
- 'Korean.json',
258
- 'Krio.json',
259
- 'Kurdish_Kurmanji.json',
260
- 'Kurdish_Sorani.json',
261
- 'Kyrgyz.json',
262
- 'Lao.json',
263
- 'Latin.json',
264
- 'Latvian.json',
265
- 'Lingala.json',
266
- 'Lithuanian.json',
267
- 'Luganda.json',
268
- 'Luxembourgish.json',
269
- 'Macedonian.json',
270
- 'Maithili.json',
271
- 'Malagasy.json',
272
- 'Malayalam.json',
273
- 'Malay.json',
274
- 'Maltese.json',
275
- 'Maori.json',
276
- 'Marathi.json',
277
- 'Meiteilon_Manipuri.json',
278
- 'Mizo.json',
279
- 'Mongolian.json',
280
- 'Myanmar_Burmese.json',
281
- 'Nepali.json',
282
- 'Norwegian.json',
283
- 'Odia_Oriya.json',
284
- 'Oromo.json',
285
- 'Pashto.json',
286
- 'Persian.json',
287
- 'Polish.json',
288
- 'Portuguese.json',
289
- 'Punjabi.json',
290
- 'Quechua.json',
291
- 'Romanian.json',
292
- 'Russian.json',
293
- 'Samoan.json',
294
- 'Sanskrit.json',
295
- 'ScottishGaelic.json',
296
- 'Sepedi.json',
297
- 'Serbian.json',
298
- 'Sesotho.json',
299
- 'Shona.json',
300
- 'Sindhi.json',
301
- 'Sinhala.json',
302
- 'Slovak.json',
303
- 'Slovenian.json',
304
- 'Somali.json',
305
- 'Spanish.json',
306
- 'Sundanese.json',
307
- 'Swahili.json',
308
- 'Swedish.json',
309
- 'Tajik.json',
310
- 'Tamil.json',
311
- 'Tatar.json',
312
- 'Telugu.json',
313
- 'Thai.json',
314
- 'Tigrinya.json',
315
- 'Tsonga.json',
316
- 'Turkish.json',
317
- 'Turkmen.json',
318
- 'Twi.json',
319
- 'Ukrainian.json',
320
- 'Urdu.json',
321
- 'Uyghur.json',
322
- 'Uzbek.json',
323
- 'Vietnamese.json',
324
- 'Welsh.json',
325
- 'Xhosa.json',
326
- 'Yiddish.json',
327
- 'Yoruba.json',
328
- 'Zulu.json',
329
  ]
330
  ],
331
  *[
 
55
  for data_dir in [
56
  f'multilingual-instruction-tuning-dataset /multilingual-alpaca-52k-gpt-4/{n}'
57
  for n in [
58
+ 'Afrikaans', 'Albanian', 'Amharic', 'Arabic', 'Armenian', 'Assamese',
59
+ 'Aymara', 'Azerbaijani', 'Bambara', 'Basque', 'Belarusian', 'Bengali',
60
+ 'Bhojpuri', 'Bosnian', 'Bulgarian', 'Catalan', 'Cebuano', 'Chichewa',
61
+ 'ChineseSimplified', 'ChineseTraditional', 'Corsican', 'Croatian',
62
+ 'Czech', 'Danish', 'Divehi', 'Dogri', 'Dutch', 'Esperanto', 'Estonian',
63
+ 'Ewe', 'Filipino', 'Finnish', 'French', 'Frisian', 'Galician',
64
+ 'Georgian', 'German', 'Greek', 'Guarani', 'Gujarati', 'Haitian_Creole',
65
+ 'Hausa', 'Hawaiian', 'Hebrew', 'Hindi', 'Hmong', 'Hungarian',
66
+ 'Icelandic', 'Igbo', 'Ilocano', 'Indonesian', 'Irish', 'Italian',
67
+ 'Japanese', 'Javanese', 'Kannada', 'Kazakh', 'Khmer', 'Kinyarwanda',
68
+ 'Konkani', 'Korean', 'Krio', 'Kurdish_Kurmanji', 'Kurdish_Sorani',
69
+ 'Kyrgyz', 'Lao', 'Latin', 'Latvian', 'Lingala', 'Lithuanian',
70
+ 'Luganda', 'Luxembourgish', 'Macedonian', 'Maithili', 'Malagasy',
71
+ 'Malay', 'Malayalam', 'Maltese', 'Maori', 'Marathi', 'Meiteilon_Manipuri',
72
+ 'Mizo', 'Mongolian', 'Myanmar_Burmese', 'Nepali', 'Norwegian',
73
+ 'Odia_Oriya', 'Oromo', 'Pashto', 'Persian', 'Polish', 'Portuguese',
74
+ 'Punjabi', 'Quechua', 'Romanian', 'Russian', 'Samoan', 'Sanskrit',
75
+ 'ScottishGaelic', 'Sepedi', 'Serbian', 'Sesotho', 'Shona', 'Sindhi',
76
+ 'Sinhala', 'Slovak', 'Slovenian', 'Somali', 'Spanish', 'Sundanese',
77
+ 'Swahili', 'Swedish', 'Tajik', 'Tamil', 'Tatar', 'Telugu', 'Thai',
78
+ 'Tigrinya', 'Tsonga', 'Turkish', 'Turkmen', 'Twi', 'Ukrainian',
79
+ 'Urdu', 'Uyghur', 'Uzbek', 'Vietnamese', 'Welsh', 'Xhosa',
80
+ 'Yiddish', 'Yoruba', 'Zulu',
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81
  ]
82
  ]
83
  ],
84
  *[
85
  {'path': 'saillab/taco-datasets', 'data_dir': 'multilingual-instruction-tuning-dataset /multilinugal-dolly-15k/', 'data_files': n, 'split': f'train', 'format': '{instruction} {input} {output}'}
86
  for n in [
87
+ 'Afrikaans.json', 'Albanian.json', 'Amharic.json', 'Arabic.json', 'Armenian.json',
88
+ 'Assamese.json', 'Aymara.json', 'Azerbaijani.json', 'Bambara.json', 'Basque.json',
89
+ 'Belarusian.json', 'Bengali.json', 'Bhojpuri.json', 'Bosnian.json', 'Bulgarian.json',
90
+ 'Catalan.json', 'Cebuano.json', 'Chichewa.json', 'ChineseSimplified.json',
91
+ 'ChineseTraditional.json', 'Corsican.json', 'Croatian.json', 'Czech.json',
92
+ 'Danish.json', 'Dhivehi.json', 'Dogri.json', 'Dutch.json', 'English.json',
93
+ 'Esperanto.json', 'Estonian.json', 'Ewe.json', 'Filipino.json',
94
+ 'Finnish.json', 'French.json', 'Frisian.json', 'Galician.json',
95
+ 'Georgian.json', 'German.json', 'Greek.json', 'Guarani.json',
96
+ 'Gujarati.json', 'Haitian_Creole.json', 'Hausa.json', 'Hawaiian.json',
97
+ 'Hebrew.json', 'Hindi.json', 'Hmong.json', 'Hungarian.json',
98
+ 'Icelandic.json', 'Igbo.json', 'Ilocano.json', 'Indonesian.json',
99
+ 'Irish.json', 'Italian.json', 'Japanese.json', 'Javanese.json',
100
+ 'Kannada.json', 'Kazakh.json', 'Khmer.json', 'Kinyarwanda.json',
101
+ 'Konkani.json', 'Korean.json', 'Krio.json', 'Kurdish_Kurmanji.json',
102
+ 'Kurdish_Sorani.json', 'Kyrgyz.json', 'Lao.json', 'Latin.json',
103
+ 'Latvian.json', 'Lingala.json', 'Lithuanian.json', 'Luganda.json',
104
+ 'Luxembourgish.json', 'Macedonian.json', 'Maithili.json',
105
+ 'Malagasy.json', 'Malayalam.json', 'Malay.json', 'Maltese.json',
106
+ 'Maori.json', 'Marathi.json', 'Meiteilon_Manipuri.json',
107
+ 'Mizo.json', 'Mongolian.json', 'Myanmar_Burmese.json',
108
+ 'Nepali.json', 'Norwegian.json', 'Odia_Oriya.json', 'Oromo.json',
109
+ 'Pashto.json', 'Persian.json', 'Polish.json', 'Portuguese.json',
110
+ 'Punjabi.json', 'Quechua.json', 'Romanian.json', 'Russian.json',
111
+ 'Samoan.json', 'Sanskrit.json', 'ScottishGaelic.json', 'Sepedi.json',
112
+ 'Serbian.json', 'Sesotho.json', 'Shona.json', 'Sindhi.json',
113
+ 'Sinhala.json', 'Slovak.json', 'Slovenian.json', 'Somali.json',
114
+ 'Spanish.json', 'Sundanese.json', 'Swahili.json', 'Swedish.json',
115
+ 'Tajik.json', 'Tamil.json', 'Tatar.json', 'Telugu.json', 'Thai.json',
116
+ 'Tigrinya.json', 'Tsonga.json', 'Turkish.json', 'Turkmen.json',
117
+ 'Twi.json', 'Ukrainian.json', 'Urdu.json', 'Uyghur.json', 'Uzbek.json',
118
+ 'Vietnamese.json', 'Welsh.json', 'Xhosa.json', 'Yiddish.json',
119
+ 'Yoruba.json', 'Zulu.json',
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
120
  ]
121
  ],
122
  *[
scripts/pretrain-model.yaml CHANGED
@@ -67,7 +67,7 @@ train:
67
 
68
  # Number of samples per data-parallel rank (type: int, default: 4)
69
  # micro_batch_size: 16
70
- micro_batch_size: 4
71
 
72
  # Number of iterations with learning rate warmup active (type: int, default: 2000)
73
  lr_warmup_steps: 2000
 
67
 
68
  # Number of samples per data-parallel rank (type: int, default: 4)
69
  # micro_batch_size: 16
70
+ micro_batch_size: 12
71
 
72
  # Number of iterations with learning rate warmup active (type: int, default: 2000)
73
  lr_warmup_steps: 2000