mtasic85 committed
Commit 716dba4
1 Parent(s): 049be21

general pretrain data generation

scripts/prepare_pretrain_dataset.py CHANGED
@@ -1,4 +1,5 @@
import gc
+from typing import Optional

from datasets import load_dataset
from litdata import optimize, TokensLoader
@@ -6,11 +7,12 @@ from litgpt.tokenizer import Tokenizer
from functools import partial


-def batch_iterator(name=None):
+"""
+def batch_iterator_1(path=None):
    # text
-    if name in (None, 'saillab/taco-datasets'):
+    if path in (None, 'saillab/taco-datasets'):
        dataset = (
-            load_dataset(name, data_dir=data_dir, split='train')
+            load_dataset(path, data_dir=data_dir, split='train')
            for data_dir in [
                'multilingual-instruction-tuning-dataset /multilingual-alpaca-52k-gpt-4',
                'multilingual-instruction-tuning-dataset /multilinugal-dolly-15k',
@@ -32,9 +34,9 @@ def batch_iterator(name=None):
        gc.collect()

    # text
-    if name in (None, 'xu-song/cc100-samples'):
+    if path in (None, 'xu-song/cc100-samples'):
        dataset = (
-            load_dataset(name, lang, split='train')
+            load_dataset(path, lang, split='train')
            for lang in [
                'am', 'ar', 'as', 'az', 'be', 'bg', 'bn', 'bn_rom', 'br',
                'bs', 'ca', 'cs', 'cy', 'da', 'de', 'el', 'en', 'eo', 'es',
@@ -60,8 +62,8 @@ def batch_iterator(name=None):
        gc.collect()

    # text
-    if name in (None, 'ontocord/fineweb-permissive-multilingual-2m'):
-        dataset = load_dataset(name, split='train')
+    if path in (None, 'ontocord/fineweb-permissive-multilingual-2m'):
+        dataset = load_dataset(path, split='train')

        for row in dataset['text']:
            yield row
@@ -70,8 +72,8 @@ def batch_iterator(name=None):
        gc.collect()

    # text
-    if name in (None, 'MuskumPillerum/General-Knowledge'):
-        dataset = load_dataset(name, split='train')
+    if path in (None, 'MuskumPillerum/General-Knowledge'):
+        dataset = load_dataset(path, split='train')

        for row in dataset:
            if not row['Question'] or not row['Answer']:
@@ -83,9 +85,9 @@ def batch_iterator(name=None):
        gc.collect()

    # text
-    if name in (None, 'yirenc/general_knowledge_boolean'):
+    if path in (None, 'yirenc/general_knowledge_boolean'):
        for split in ['train', 'validation']:
-            dataset = load_dataset(name, split=split)
+            dataset = load_dataset(path, split=split)

            for row in dataset:
                yield row['question'] + '? ' + str(row['answer']) + '. ' + row['passage']
@@ -94,9 +96,9 @@ def batch_iterator(name=None):
            gc.collect()

    # text
-    if name in (None, 'nampdn-ai/tiny-textbooks'):
+    if path in (None, 'nampdn-ai/tiny-textbooks'):
        for split in ['train', 'test']:
-            dataset = load_dataset(name, split=split)
+            dataset = load_dataset(path, split=split)

            for row in dataset['textbook']:
                yield row
@@ -105,8 +107,8 @@ def batch_iterator(name=None):
            gc.collect()

    # code
-    if name in (None, 'nampdn-ai/tiny-codes'):
-        dataset = load_dataset(name, split='train')
+    if path in (None, 'nampdn-ai/tiny-codes'):
+        dataset = load_dataset(path, split='train')

        for row in dataset:
            yield (
@@ -119,9 +121,9 @@ def batch_iterator(name=None):
        gc.collect()

    # code
-    if name in (None, 'bigcode/the-stack-smol-xs'):
+    if path in (None, 'bigcode/the-stack-smol-xs'):
        dataset = (
-            load_dataset(name, lang, split='train', trust_remote_code=True)
+            load_dataset(path, lang, split='train', trust_remote_code=True)
            for lang in [
                'ada', 'agda', 'alloy', 'antlr', 'applescript', 'assembly',
                'augeas', 'awk', 'batchfile', 'bison', 'bluespec', 'c',
@@ -150,8 +152,8 @@ def batch_iterator(name=None):
        gc.collect()

    # text + code
-    if name in (None, 'm-a-p/CodeFeedback-Filtered-Instruction'):
-        dataset = load_dataset(name, split='train')
+    if path in (None, 'm-a-p/CodeFeedback-Filtered-Instruction'):
+        dataset = load_dataset(path, split='train')

        for row in dataset:
            yield (
@@ -164,8 +166,8 @@ def batch_iterator(name=None):
        gc.collect()

    # code
-    if name in (None, 'jtatman/python-code-dataset-500k'):
-        dataset = load_dataset(name, split='train')
+    if path in (None, 'jtatman/python-code-dataset-500k'):
+        dataset = load_dataset(path, split='train')

        for row in dataset:
            yield (
@@ -178,8 +180,8 @@ def batch_iterator(name=None):
        gc.collect()

    # code
-    if name in (None, 'iamtarun/python_code_instructions_18k_alpaca'):
-        dataset = load_dataset(name, split='train')
+    if path in (None, 'iamtarun/python_code_instructions_18k_alpaca'):
+        dataset = load_dataset(path, split='train')

        for row in dataset:
            yield (
@@ -194,9 +196,9 @@ def batch_iterator(name=None):
        gc.collect()

    # code
-    if name in (None, 'HuggingFaceH4/CodeAlpaca_20K'):
+    if path in (None, 'HuggingFaceH4/CodeAlpaca_20K'):
        for split in ['train', 'test']:
-            dataset = load_dataset(name, split=split)
+            dataset = load_dataset(path, split=split)

            for row in dataset:
                yield (
@@ -209,8 +211,8 @@ def batch_iterator(name=None):
            gc.collect()

    # math
-    if name in (None, 'gair-prox/open-web-math-pro'):
-        dataset = load_dataset(name, split='train')
+    if path in (None, 'gair-prox/open-web-math-pro'):
+        dataset = load_dataset(path, split='train')

        for row in dataset['text']:
            yield row
@@ -219,9 +221,9 @@ def batch_iterator(name=None):
        gc.collect()

    # math
-    if name in (None, 'rvv-karma/Math-QA'):
+    if path in (None, 'rvv-karma/Math-QA'):
        for split in ['train', 'val', 'test']:
-            dataset = load_dataset(name, split=split)
+            dataset = load_dataset(path, split=split)

            for row in dataset:
                yield (
@@ -234,8 +236,8 @@ def batch_iterator(name=None):
            gc.collect()

    # math
-    if name in (None, 'ajibawa-2023/Maths-College'):
-        dataset = load_dataset(name, split='train')
+    if path in (None, 'ajibawa-2023/Maths-College'):
+        dataset = load_dataset(path, split='train')

        for row in dataset:
            yield (
@@ -248,8 +250,8 @@ def batch_iterator(name=None):
        gc.collect()

    # math
-    if name in (None, 'microsoft/orca-math-word-problems-200k'):
-        dataset = load_dataset(name, split='train')
+    if path in (None, 'microsoft/orca-math-word-problems-200k'):
+        dataset = load_dataset(path, split='train')

        for row in dataset:
            yield (
@@ -262,9 +264,9 @@ def batch_iterator(name=None):
        gc.collect()

    # math
-    if name in (None, 'fblgit/simple-math'):
+    if path in (None, 'fblgit/simple-math'):
        for split in ['train', 'test']:
-            dataset = load_dataset(name, revision='refs/convert/parquet', split=split)
+            dataset = load_dataset(path, revision='refs/convert/parquet', split=split)

            for row in dataset:
                yield (
@@ -277,8 +279,8 @@ def batch_iterator(name=None):
            gc.collect()

    # reasoning
-    if name in (None, 'SkunkworksAI/reasoning-0.01'):
-        dataset = load_dataset(name, split='train')
+    if path in (None, 'SkunkworksAI/reasoning-0.01'):
+        dataset = load_dataset(path, split='train')

        for row in dataset:
            yield (
@@ -293,8 +295,8 @@ def batch_iterator(name=None):
        gc.collect()

    # emoji
-    if name in (None, 'badrex/llm-emoji-dataset'):
-        dataset = load_dataset(name, split='train')
+    if path in (None, 'badrex/llm-emoji-dataset'):
+        dataset = load_dataset(path, split='train')

        for row in dataset:
            yield (
@@ -311,14 +313,36 @@ def batch_iterator(name=None):

        del dataset
        gc.collect()
-
-
-def tokenize_fn(dataset_name, tokenizer=None):
-    for text in batch_iterator(dataset_name):
+"""
+
+def batch_iterator(path: str,
+                   name: Optional[str]=None,
+                   data_dir: Optional[str]=None,
+                   data_files: Optional[str]=None,
+                   revision: Optional[str]=None,
+                   split: str='train',
+                   format: Optional[str]=None):
+    assert format is not None
+
+    dataset = load_dataset(path=path,
+                           name=name,
+                           data_dir=data_dir,
+                           data_files=data_files,
+                           revision=revision,
+                           split=split,
+                           trust_remote_code=True)
+
+    for row in dataset:
+        text = format.format(**row)
+        yield text
+
+
+def tokenize_fn(datasets_config, tokenizer=None):
+    for text in batch_iterator(**datasets_config):
        text_ids = tokenizer.encode(text, bos=False, eos=True)
        yield text_ids

-
+"""
datasets_names = [
    'saillab/taco-datasets',
    # 'xu-song/cc100-samples',
@@ -340,12 +364,80 @@ datasets_names = [
    # 'SkunkworksAI/reasoning-0.01',
    'badrex/llm-emoji-dataset',
]
+"""
+
+datasets_configs = [
+    {'path': 'yahma/alpaca-cleaned', 'format': '{instruction} {input} {output}'},
+    {'path': 'gbharti/wealth-alpaca_lora', 'format': '{instruction} {input} {output}'},
+    *[
+        {'path': 'saillab/taco-datasets', 'data_dir': data_dir, 'split': 'train[:10%]', 'format': '{instruction} {input} {output}'}
+        for data_dir in [
+            'multilingual-instruction-tuning-dataset /multilingual-alpaca-52k-gpt-4',
+            'multilingual-instruction-tuning-dataset /multilinugal-dolly-15k',
+        ]
+    ],
+    *[
+        {'path': 'xu-song/cc100-samples', 'name': name, 'split': 'train[:10%]', 'format': '{text}'}
+        for name in [
+            'am', 'ar', 'as', 'az', 'be', 'bg', 'bn', 'bn_rom', 'br',
+            'bs', 'ca', 'cs', 'cy', 'da', 'de', 'el', 'en', 'eo', 'es',
+            'et', 'eu', 'fa', 'ff', 'fi', 'fr', 'fy', 'ga', 'gd', 'gl',
+            'gn', 'gu', 'ha', 'he', 'hi', 'hi_rom', 'hr', 'ht', 'hu',
+            'hy', 'id', 'ig', 'is', 'it', 'ja', 'jv', 'ka', 'kk', 'km',
+            'kn', 'ko', 'ku', 'ky', 'la', 'lg', 'li', 'ln', 'lo', 'lt',
+            'lv', 'mg', 'mk', 'ml', 'mn', 'mr', 'ms', 'my', 'my_zaw',
+            'ne', 'nl', 'no', 'ns', 'om', 'or', 'pa', 'pl', 'ps', 'pt',
+            'qu', 'rm', 'ro', 'ru', 'sa', 'si', 'sc', 'sd', 'sk', 'sl',
+            'so', 'sq', 'sr', 'ss', 'su', 'sv', 'sw', 'ta', 'ta_rom',
+            'te', 'te_rom', 'th', 'tl', 'tn', 'tr', 'ug', 'uk', 'ur',
+            'ur_rom', 'uz', 'vi', 'wo', 'xh', 'yi', 'yo',
+            'zh-Hans', 'zh-Hant', 'zu',
+        ]
+    ],
+    {'path': 'ontocord/fineweb-permissive-multilingual-2m', 'split': 'train[:5%]', 'format': '{text}'},
+    {'path': 'MuskumPillerum/General-Knowledge', 'format': '{Question} {Answer}'},
+    {'path': 'yirenc/general_knowledge_boolean', 'split': 'train+validation', 'format': '{question}? {answer}. {passage}'},
+    {'path': 'nampdn-ai/tiny-textbooks', 'split': 'train+test', 'format': '{textbook}'},
+    {'path': 'nampdn-ai/tiny-codes', 'split': 'train[:5%]', 'format': '{prompt} {response}'},
+    *[
+        {'path': 'bigcode/the-stack-smol-xs', 'name': name, 'format': '{content}'}
+        for name in [
+            'ada', 'agda', 'alloy', 'antlr', 'applescript', 'assembly',
+            'augeas', 'awk', 'batchfile', 'bison', 'bluespec', 'c',
+            'c++', 'c-sharp', 'clojure', 'cmake', 'coffeescript', 'common-lisp',
+            'css', 'cuda', 'dart', 'dockerfile', 'elixir',
+            'elm', 'emacs-lisp','erlang', 'f-sharp', 'fortran', 'glsl', 'go',
+            'groovy', 'haskell','html', 'idris', 'isabelle', 'java',
+            'java-server-pages', 'javascript', 'julia', 'kotlin', 'lean',
+            'literate-agda', 'literate-coffeescript', 'literate-haskell',
+            'lua', 'makefile', 'maple', 'markdown', 'mathematica', 'matlab',
+            'ocaml', 'pascal', 'perl', 'php', 'powershell', 'prolog',
+            'protocol-buffer', 'python', 'r', 'racket', 'restructuredtext',
+            'rmarkdown', 'ruby', 'rust', 'sas', 'scala', 'scheme',
+            'shell', 'smalltalk', 'solidity', 'sparql', 'sql', 'stan',
+            'standard-ml', 'stata', 'systemverilog', 'tcl', 'tcsh', 'tex',
+            'thrift', 'typescript', 'verilog', 'vhdl', 'visual-basic', 'xslt',
+            'yacc', 'zig',
+        ]
+    ],
+    {'path': 'm-a-p/CodeFeedback-Filtered-Instruction', 'split': 'train', 'format': '{query} {answer}'},
+    {'path': 'jtatman/python-code-dataset-500k', 'format': '{instruction} {output}'},
+    {'path': 'iamtarun/python_code_instructions_18k_alpaca', 'format': '{instruction} {input} {output}'},
+    {'path': 'HuggingFaceH4/CodeAlpaca_20K', 'split': 'train+test', 'format': '{prompt} {completion}'},
+    {'path': 'gair-prox/open-web-math-pro', 'split': 'train[:5%]', 'format': '{text}'},
+    {'path': 'rvv-karma/Math-QA', 'split': 'train+val+test', 'format': '{question} {answer}'},
+    {'path': 'ajibawa-2023/Maths-College', 'split': 'train[:10%]', 'format': '{instruction} {output}'},
+    {'path': 'microsoft/orca-math-word-problems-200k', 'format': '{question} {answer}'},
+    {'path': 'fblgit/simple-math', 'revision': 'refs/convert/parquet', 'split': 'train+test', 'format': '{instruction} = {output}'},
+    {'path': 'SkunkworksAI/reasoning-0.01', 'format': '{instruction} {reasoning} {output}'},
+    {'path': 'badrex/llm-emoji-dataset', 'format': '{character} {unicode} {short description} {tags} {LLM description}'},
+]

outputs = optimize(
    fn=partial(tokenize_fn, tokenizer=Tokenizer('..')),
-    inputs=datasets_names,
+    inputs=datasets_configs,
    output_dir='../pretrain-data/',
    # Number of tokens to store by chunks. This is roughly 64MB of tokens per chunk.
    chunk_size=(2049 * 8012),
-    num_workers=16,
+    num_workers=32,
)
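
For context on the change above: the per-dataset branches of the old `batch_iterator` are retired (kept only as a commented-out reference) in favor of a single config-driven iterator. Each entry in `datasets_configs` is a dict of `load_dataset()` keyword arguments plus a `format` template that is filled in from every row, and `optimize()` now fans out over those configs with `num_workers=32`. A minimal sketch of that flow (the helper name `iter_texts` and the standalone usage below are illustrative, not part of the commit):

from datasets import load_dataset

def iter_texts(config: dict):
    # Split off the row template; the remaining keys are load_dataset() kwargs.
    config = dict(config)
    template = config.pop('format')
    dataset = load_dataset(**config)

    for row in dataset:
        # e.g. '{Question} {Answer}' -> one flat pretraining string per row
        yield template.format(**row)

# Illustrative usage with one of the committed config entries:
example = {'path': 'MuskumPillerum/General-Knowledge', 'split': 'train', 'format': '{Question} {Answer}'}
print(next(iter_texts(example)))

The committed `tokenize_fn` then encodes each yielded string with litgpt's `Tokenizer`, and litdata's `optimize()` shards the resulting token stream into roughly 64 MB chunks, presumably to be streamed back later via `litdata.StreamingDataset` with the imported `TokensLoader`.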
scripts/pretrain-model.yaml CHANGED
@@ -1,13 +1,13 @@
# The name of the model to pretrain. Choose from names in ``litgpt.config``. Mutually exclusive with
# ``model_config``. (type: Optional[str], default: null)
-model_name: "tiny-llama-1.1b"
+model_name: "Llama-3.2-1B"

# A ``litgpt.Config`` object to define the model architecture. Mutually exclusive with
# ``model_config``. (type: Optional[Config], default: null)
model_config:
  padded_vocab_size: 38400
  vocab_size: 38400
-  block_size: 131072
+  block_size: 8192
  n_layer: 5
  n_head: 32
  head_size: null
@@ -20,7 +20,12 @@ model_config:
  norm_eps: 1e-05
  mlp_class_name: "LLaMAMLP"
  intermediate_size: 3584
-  rope_base: 1000000
+  rope_base: 500000
+  rope_adjustments:
+    factor: 32.0
+    low_freq_factor: 1.0
+    high_freq_factor: 4.0
+    original_max_seq_len: 8192

# Directory in which to save checkpoints and logs. If running in a Lightning Studio Job, look for it in
# /teamspace/jobs/<job-name>/share. (type: <class 'Path'>, default: out/pretrain)
@@ -71,7 +76,7 @@ train:

  # Total number of tokens to train on (type: Optional[int], default: 3000000000000)
  # max_tokens: 3000000000000
-  max_tokens: ??? # ? * 2049 * 3
+  max_tokens: ??? # ? * 2049 * 5

  # Limits the number of optimizer steps to run. (type: Optional[int], default: null)
  max_steps:
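
The other notable change here is the positional-encoding setup: `block_size` shrinks from 131072 to 8192, `rope_base` drops from 1000000 to 500000, and a `rope_adjustments` block is added. These four keys parameterize a Llama-3.x-style RoPE frequency rescaling, where low-frequency (long-wavelength) components are slowed down by `factor` while high-frequency components are preserved. A rough sketch of that rule (it mirrors the published Llama 3.1 scaling; litgpt's exact implementation may differ in details):

import math
import torch

def adjust_rope_frequencies(inv_freq: torch.Tensor,
                            factor: float = 32.0,
                            low_freq_factor: float = 1.0,
                            high_freq_factor: float = 4.0,
                            original_max_seq_len: int = 8192) -> torch.Tensor:
    # Wavelengths longer than original_max_seq_len / low_freq_factor are divided by `factor`,
    # wavelengths shorter than original_max_seq_len / high_freq_factor are kept as-is,
    # and the band in between is smoothly interpolated.
    wavelen = 2 * math.pi / inv_freq
    low_freq_wavelen = original_max_seq_len / low_freq_factor
    high_freq_wavelen = original_max_seq_len / high_freq_factor

    smooth = (original_max_seq_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
    adjusted = torch.where(wavelen > low_freq_wavelen, inv_freq / factor, inv_freq)
    in_between = (wavelen <= low_freq_wavelen) & (wavelen >= high_freq_wavelen)
    return torch.where(in_between, (1 - smooth) * inv_freq / factor + smooth * inv_freq, adjusted)

With `factor: 32.0` and `original_max_seq_len: 8192`, the lowest-frequency rotary components are stretched 32x while anything with a wavelength below 8192 / 4 positions is left untouched.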