mtasic85 committed
Commit: 39a4b5b
1 Parent(s): adf4b14

multilingual dataset
scripts/prepare_pretrain_dataset.py CHANGED
@@ -42,15 +42,294 @@ datasets_configs = [
     {'path': 'VMware/open-instruct', 'format': '{instruction} {response}'},
 
     # multilingual
+    # *[
+    #     {'path': 'saillab/taco-datasets', 'data_dir': data_dir, 'split': f'train[{i}%:{i + 1}%]', 'format': '{instruction} {input} {output}'}
+    #     for data_dir in [
+    #         'multilingual-instruction-tuning-dataset /multilingual-alpaca-52k-gpt-4',
+    #         'multilingual-instruction-tuning-dataset /multilinugal-dolly-15k',
+    #     ]
+    #     for i in range(0, 100, 10)
+    # ],
     *[
-        {'path': 'saillab/taco-datasets', 'data_dir': data_dir, 'split': 'train[:10%]', 'format': '{instruction} {input} {output}'}
+        {'path': 'saillab/taco-datasets', 'data_dir': data_dir, 'split': f'train', 'format': '{instruction} {input} {output}'}
         for data_dir in [
-            'multilingual-instruction-tuning-dataset /multilingual-alpaca-52k-gpt-4',
-            'multilingual-instruction-tuning-dataset /multilinugal-dolly-15k',
+            f'multilingual-instruction-tuning-dataset /multilingual-alpaca-52k-gpt-4/{n}'
+            for n in [
+                'Afrikaans',
+                'Albanian',
+                'Amharic',
+                'Arabic',
+                'Armenian',
+                'Assamese',
+                'Aymara',
+                'Azerbaijani',
+                'Bambara',
+                'Basque',
+                'Belarusian',
+                'Bengali',
+                'Bhojpuri',
+                'Bosnian',
+                'Bulgarian',
+                'Catalan',
+                'Cebuano',
+                'Chichewa',
+                'ChineseSimplified',
+                'ChineseTraditional',
+                'Corsican',
+                'Croatian',
+                'Czech',
+                'Danish',
+                'Divehi',
+                'Dogri',
+                'Dutch',
+                'Esperanto',
+                'Estonian',
+                'Ewe',
+                'Filipino',
+                'Finnish',
+                'French',
+                'Frisian',
+                'Galician',
+                'Georgian',
+                'German',
+                'Greek',
+                'Guarani',
+                'Gujarati',
+                'Haitian_Creole',
+                'Hausa',
+                'Hawaiian',
+                'Hebrew',
+                'Hindi',
+                'Hmong',
+                'Hungarian',
+                'Icelandic',
+                'Igbo',
+                'Ilocano',
+                'Indonesian',
+                'Irish',
+                'Italian',
+                'Japanese',
+                'Javanese',
+                'Kannada',
+                'Kazakh',
+                'Khmer',
+                'Kinyarwanda',
+                'Konkani',
+                'Korean',
+                'Krio',
+                'Kurdish_Kurmanji',
+                'Kurdish_Sorani',
+                'Kyrgyz',
+                'Lao',
+                'Latin',
+                'Latvian',
+                'Lingala',
+                'Lithuanian',
+                'Luganda',
+                'Luxembourgish',
+                'Macedonian',
+                'Maithili',
+                'Malagasy',
+                'Malay',
+                'Malayalam',
+                'Maltese',
+                'Maori',
+                'Marathi',
+                'Meiteilon_Manipuri',
+                'Mizo',
+                'Mongolian',
+                'Myanmar_Burmese',
+                'Nepali',
+                'Norwegian',
+                'Odia_Oriya',
+                'Oromo',
+                'Pashto',
+                'Persian',
+                'Polish',
+                'Portuguese',
+                'Punjabi',
+                'Quechua',
+                'Romanian',
+                'Russian',
+                'Samoan',
+                'Sanskrit',
+                'ScottishGaelic',
+                'Sepedi',
+                'Serbian',
+                'Sesotho',
+                'Shona',
+                'Sindhi',
+                'Sinhala',
+                'Slovak',
+                'Slovenian',
+                'Somali',
+                'Spanish',
+                'Sundanese',
+                'Swahili',
+                'Swedish',
+                'Tajik',
+                'Tamil',
+                'Tatar',
+                'Telugu',
+                'Thai',
+                'Tigrinya',
+                'Tsonga',
+                'Turkish',
+                'Turkmen',
+                'Twi',
+                'Ukrainian',
+                'Urdu',
+                'Uyghur',
+                'Uzbek',
+                'Vietnamese',
+                'Welsh',
+                'Xhosa',
+                'Yiddish',
+                'Yoruba',
+                'Zulu',
+            ]
+        ]
+    ],
+    *[
+        {'path': 'saillab/taco-datasets', 'data_dir': 'multilingual-instruction-tuning-dataset /multilinugal-dolly-15k/', 'data_files': n, 'split': f'train', 'format': '{instruction} {input} {output}'}
+        for n in [
+            'Afrikaans.json',
+            'Albanian.json',
+            'Amharic.json',
+            'Arabic.json',
+            'Armenian.json',
+            'Assamese.json',
+            'Aymara.json',
+            'Azerbaijani.json',
+            'Bambara.json',
+            'Basque.json',
+            'Belarusian.json',
+            'Bengali.json',
+            'Bhojpuri.json',
+            'Bosnian.json',
+            'Bulgarian.json',
+            'Catalan.json',
+            'Cebuano.json',
+            'Chichewa.json',
+            'ChineseSimplified.json',
+            'ChineseTraditional.json',
+            'Corsican.json',
+            'Croatian.json',
+            'Czech.json',
+            'Danish.json',
+            'Dhivehi.json',
+            'Dogri.json',
+            'Dutch.json',
+            'English.json',
+            'Esperanto.json',
+            'Estonian.json',
+            'Ewe.json',
+            'Filipino.json',
+            'Finnish.json',
+            'French.json',
+            'Frisian.json',
+            'Galician.json',
+            'Georgian.json',
+            'German.json',
+            'Greek.json',
+            'Guarani.json',
+            'Gujarati.json',
+            'Haitian_Creole.json',
+            'Hausa.json',
+            'Hawaiian.json',
+            'Hebrew.json',
+            'Hindi.json',
+            'Hmong.json',
+            'Hungarian.json',
+            'Icelandic.json',
+            'Igbo.json',
+            'Ilocano.json',
+            'Indonesian.json',
+            'Irish.json',
+            'Italian.json',
+            'Japanese.json',
+            'Javanese.json',
+            'Kannada.json',
+            'Kazakh.json',
+            'Khmer.json',
+            'Kinyarwanda.json',
+            'Konkani.json',
+            'Korean.json',
+            'Krio.json',
+            'Kurdish_Kurmanji.json',
+            'Kurdish_Sorani.json',
+            'Kyrgyz.json',
+            'Lao.json',
+            'Latin.json',
+            'Latvian.json',
+            'Lingala.json',
+            'Lithuanian.json',
+            'Luganda.json',
+            'Luxembourgish.json',
+            'Macedonian.json',
+            'Maithili.json',
+            'Malagasy.json',
+            'Malayalam.json',
+            'Malay.json',
+            'Maltese.json',
+            'Maori.json',
+            'Marathi.json',
+            'Meiteilon_Manipuri.json',
+            'Mizo.json',
+            'Mongolian.json',
+            'Myanmar_Burmese.json',
+            'Nepali.json',
+            'Norwegian.json',
+            'Odia_Oriya.json',
+            'Oromo.json',
+            'Pashto.json',
+            'Persian.json',
+            'Polish.json',
+            'Portuguese.json',
+            'Punjabi.json',
+            'Quechua.json',
+            'Romanian.json',
+            'Russian.json',
+            'Samoan.json',
+            'Sanskrit.json',
+            'ScottishGaelic.json',
+            'Sepedi.json',
+            'Serbian.json',
+            'Sesotho.json',
+            'Shona.json',
+            'Sindhi.json',
+            'Sinhala.json',
+            'Slovak.json',
+            'Slovenian.json',
+            'Somali.json',
+            'Spanish.json',
+            'Sundanese.json',
+            'Swahili.json',
+            'Swedish.json',
+            'Tajik.json',
+            'Tamil.json',
+            'Tatar.json',
+            'Telugu.json',
+            'Thai.json',
+            'Tigrinya.json',
+            'Tsonga.json',
+            'Turkish.json',
+            'Turkmen.json',
+            'Twi.json',
+            'Ukrainian.json',
+            'Urdu.json',
+            'Uyghur.json',
+            'Uzbek.json',
+            'Vietnamese.json',
+            'Welsh.json',
+            'Xhosa.json',
+            'Yiddish.json',
+            'Yoruba.json',
+            'Zulu.json',
         ]
     ],
     *[
-        {'path': 'xu-song/cc100-samples', 'name': name, 'split': 'train[:10%]', 'format': '{text}'}
+        {'path': 'xu-song/cc100-samples', 'name': name, 'split': f'train[{i}%:{i + 1}%]', 'format': '{text}'}
         for name in [
             'am', 'ar', 'as', 'az', 'be', 'bg', 'bn', 'bn_rom', 'br',
             'bs', 'ca', 'cs', 'cy', 'da', 'de', 'el', 'en', 'eo', 'es',
@@ -66,17 +345,12 @@ datasets_configs = [
             'ur_rom', 'uz', 'vi', 'wo', 'xh', 'yi', 'yo',
             'zh-Hans', 'zh-Hant', 'zu',
         ]
+        for i in range(0, 100, 10)
+    ],
+    *[
+        {'path': 'jordiclive/wikipedia-summary-dataset', 'split': f'train[{i}%:{i + 5}%]', 'format': '{summary}'}
+        for i in range(0, 100, 5)
     ],
-    # *[
-    #     {'path': 'Salesforce/wikitext', 'name': name, 'split': 'train+validation+test', 'format': '{text}'}
-    #     for name in [
-    #         'wikitext-103-raw-v1',
-    #         'wikitext-103-v1',
-    #         'wikitext-2-raw-v1',
-    #         'wikitext-2-v1',
-    #     ]
-    # ],
-    {'path': 'jordiclive/wikipedia-summary-dataset', 'format': '{summary}'},
     # {'path': 'ontocord/fineweb-permissive-multilingual-2m', 'split': 'train[:5%]', 'format': '{text}'},
 
     # general
@@ -86,9 +360,9 @@ datasets_configs = [
     # {'path': 'keivalya/MedQuad-MedicalQnADataset', 'split': 'train', 'format': '{Question} {Answer}'},
     # {'path': 'NousResearch/CharacterCodex', 'split': 'train', 'format': '{scenario} {description}'},
     # {'path': 'nampdn-ai/tiny-textbooks', 'split': 'train+test', 'format': '{textbook}'},
-
+
     # code
-    {'path': 'nampdn-ai/tiny-codes', 'split': 'train[:5%]', 'format': '{prompt} {response}'},
+    # {'path': 'nampdn-ai/tiny-codes', 'split': 'train[:5%]', 'format': '{prompt} {response}'},
     *[
         {'path': 'bigcode/the-stack-smol-xs', 'name': name, 'format': '{content}'}
         for name in [
@@ -118,7 +392,10 @@ datasets_configs = [
 
     # math
     {'path': 'fblgit/simple-math', 'revision': 'refs/convert/parquet', 'split': 'train+test', 'format': '{instruction} = {output}'},
-    {'path': 'gair-prox/open-web-math-pro', 'split': 'train[:5%]', 'format': '{text}'},
+    *[
+        {'path': 'gair-prox/open-web-math-pro', 'split': f'train[{i}%:{i + 1}%]', 'format': '{text}'}
+        for i in range(0, 100, 20)
+    ],
     {'path': 'rvv-karma/Math-QA', 'split': 'train+val+test', 'format': '{question} {answer}'},
    {'path': 'ajibawa-2023/Maths-College', 'split': 'train', 'format': '{instruction} {output}'},
     {'path': 'microsoft/orca-math-word-problems-200k', 'format': '{question} {answer}'},
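Note on the rewritten entries: they lean on percent-sliced splits (e.g. f'train[{i}%:{i + 1}%]') and per-language data_dir / data_files selection instead of a single 'train[:10%]' slice. The rest of prepare_pretrain_dataset.py is outside this diff, so the snippet below is only a minimal sketch of how one such entry could be consumed with datasets.load_dataset; the helper name iter_formatted_rows is invented for illustration and is not part of the repository.

from datasets import load_dataset


def iter_formatted_rows(config: dict):
    # Assumption: every key except 'format' maps directly onto load_dataset() kwargs
    # (path, name, data_dir, data_files, split, revision).
    kwargs = dict(config)
    template = kwargs.pop('format')
    path = kwargs.pop('path')

    # Entries without a 'split' return a DatasetDict; concatenating its splits is
    # not handled in this sketch.
    dataset = load_dataset(path, **kwargs)

    for row in dataset:
        # e.g. '{instruction} {input} {output}' -> one whitespace-joined training string
        yield template.format(**row)


# Usage example with one of the cc100 entries above (a 1% slice of the 'am' subset).
sample = {'path': 'xu-song/cc100-samples', 'name': 'am', 'split': 'train[0%:1%]', 'format': '{text}'}
for i, text in enumerate(iter_formatted_rows(sample)):
    print(text[:80])
    if i >= 2:
        break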
scripts/pretrain-model.yaml CHANGED
@@ -8,10 +8,10 @@ model_config:
   padded_vocab_size: 38400
   vocab_size: 38400
   block_size: 8192
-  n_layer: 5
+  n_layer: 32
   n_head: 32
   head_size: null
-  n_embd: 1024
+  n_embd: 256
   n_query_groups: 8
   rotary_percentage: 1.0
   parallel_residual: false
@@ -19,7 +19,7 @@ model_config:
   norm_class_name: "RMSNorm"
   norm_eps: 1e-05
   mlp_class_name: "LLaMAMLP"
-  intermediate_size: 4096
+  intermediate_size: 1024
   rope_base: 500000
   # rope_adjustments:
   #   factor: 32.0
@@ -77,17 +77,16 @@ train:
 
   # Total number of tokens to train on (type: Optional[int], default: 3000000000000)
   # max_tokens: 3000000000000
-  # max_tokens: 8159107755 # 796399 * 2049 * 5
-  max_tokens: 11422750857 # 796399 * 2049 * 7
+  max_tokens: 8159107755 # 796399 * 2049 * 5
 
   # Limits the number of optimizer steps to run. (type: Optional[int], default: null)
   max_steps:
 
   # Limits the length of samples. Off by default (type: Optional[int], default: null)
-  max_seq_length:
+  max_seq_length: 2049
 
   # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: False)
-  tie_embeddings:
+  tie_embeddings: true
 
   # (type: Optional[float], default: 1.0)
   max_norm: 1.0
@@ -121,11 +120,9 @@ optimizer:
 
   init_args:
     # (type: float, default: 0.001)
-    # lr: 1e-3
-    lr: 1e-4
+    lr: 1e-3
 
     # (type: float, default: 0.01)
-    # weight_decay: 0.01
     weight_decay: 0.1
 
     # (type: tuple, default: (0.9,0.999))
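Note on the YAML change: it trades width for depth (n_layer 5 -> 32, n_embd 1024 -> 256, intermediate_size 4096 -> 1024), ties the embeddings, fixes max_seq_length at 2049, and drops max_tokens back to 8,159,107,755, which matches the comment (796,399 * 2,049 * 5). The sketch below is a rough back-of-the-envelope parameter count for the two shapes, assuming litgpt-style LLaMA blocks without biases and that tie_embeddings previously fell back to its default of False; it is an estimate, not a figure taken from the repository.

def approx_params(vocab, n_layer, n_embd, n_head, n_query_groups, intermediate, tie_embeddings):
    # Rough estimate: fused QKV + output projection, LLaMAMLP (fc_1, fc_2, proj),
    # two RMSNorms per block, token embeddings, final norm, optional untied LM head.
    head_size = n_embd // n_head
    attn = n_embd * (n_head + 2 * n_query_groups) * head_size + (n_head * head_size) * n_embd
    mlp = 3 * n_embd * intermediate
    block = attn + mlp + 2 * n_embd
    lm_head = 0 if tie_embeddings else vocab * n_embd
    return vocab * n_embd + n_layer * block + n_embd + lm_head


old = approx_params(38400, 5, 1024, 32, 8, 4096, tie_embeddings=False)  # previous config
new = approx_params(38400, 32, 256, 32, 8, 1024, tie_embeddings=True)   # this commit
print(f'old ~{old / 1e6:.0f}M params, new ~{new / 1e6:.0f}M params')    # roughly 155M vs 40M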