general pretrain data generation
Browse files- scripts/prepare_pretrain_dataset.py +138 -46
- scripts/pretrain-model.yaml +9 -4
scripts/prepare_pretrain_dataset.py
CHANGED
@@ -1,4 +1,5 @@
|
|
1 |
import gc
|
|
|
2 |
|
3 |
from datasets import load_dataset
|
4 |
from litdata import optimize, TokensLoader
|
@@ -6,11 +7,12 @@ from litgpt.tokenizer import Tokenizer
|
|
6 |
from functools import partial
|
7 |
|
8 |
|
9 |
-
|
|
|
10 |
# text
|
11 |
-
if
|
12 |
dataset = (
|
13 |
-
load_dataset(
|
14 |
for data_dir in [
|
15 |
'multilingual-instruction-tuning-dataset /multilingual-alpaca-52k-gpt-4',
|
16 |
'multilingual-instruction-tuning-dataset /multilinugal-dolly-15k',
|
@@ -32,9 +34,9 @@ def batch_iterator(name=None):
|
|
32 |
gc.collect()
|
33 |
|
34 |
# text
|
35 |
-
if
|
36 |
dataset = (
|
37 |
-
load_dataset(
|
38 |
for lang in [
|
39 |
'am', 'ar', 'as', 'az', 'be', 'bg', 'bn', 'bn_rom', 'br',
|
40 |
'bs', 'ca', 'cs', 'cy', 'da', 'de', 'el', 'en', 'eo', 'es',
|
@@ -60,8 +62,8 @@ def batch_iterator(name=None):
|
|
60 |
gc.collect()
|
61 |
|
62 |
# text
|
63 |
-
if
|
64 |
-
dataset = load_dataset(
|
65 |
|
66 |
for row in dataset['text']:
|
67 |
yield row
|
@@ -70,8 +72,8 @@ def batch_iterator(name=None):
|
|
70 |
gc.collect()
|
71 |
|
72 |
# text
|
73 |
-
if
|
74 |
-
dataset = load_dataset(
|
75 |
|
76 |
for row in dataset:
|
77 |
if not row['Question'] or not row['Answer']:
|
@@ -83,9 +85,9 @@ def batch_iterator(name=None):
|
|
83 |
gc.collect()
|
84 |
|
85 |
# text
|
86 |
-
if
|
87 |
for split in ['train', 'validation']:
|
88 |
-
dataset = load_dataset(
|
89 |
|
90 |
for row in dataset:
|
91 |
yield row['question'] + '? ' + str(row['answer']) + '. ' + row['passage']
|
@@ -94,9 +96,9 @@ def batch_iterator(name=None):
|
|
94 |
gc.collect()
|
95 |
|
96 |
# text
|
97 |
-
if
|
98 |
for split in ['train', 'test']:
|
99 |
-
dataset = load_dataset(
|
100 |
|
101 |
for row in dataset['textbook']:
|
102 |
yield row
|
@@ -105,8 +107,8 @@ def batch_iterator(name=None):
|
|
105 |
gc.collect()
|
106 |
|
107 |
# code
|
108 |
-
if
|
109 |
-
dataset = load_dataset(
|
110 |
|
111 |
for row in dataset:
|
112 |
yield (
|
@@ -119,9 +121,9 @@ def batch_iterator(name=None):
|
|
119 |
gc.collect()
|
120 |
|
121 |
# code
|
122 |
-
if
|
123 |
dataset = (
|
124 |
-
load_dataset(
|
125 |
for lang in [
|
126 |
'ada', 'agda', 'alloy', 'antlr', 'applescript', 'assembly',
|
127 |
'augeas', 'awk', 'batchfile', 'bison', 'bluespec', 'c',
|
@@ -150,8 +152,8 @@ def batch_iterator(name=None):
|
|
150 |
gc.collect()
|
151 |
|
152 |
# text + code
|
153 |
-
if
|
154 |
-
dataset = load_dataset(
|
155 |
|
156 |
for row in dataset:
|
157 |
yield (
|
@@ -164,8 +166,8 @@ def batch_iterator(name=None):
|
|
164 |
gc.collect()
|
165 |
|
166 |
# code
|
167 |
-
if
|
168 |
-
dataset = load_dataset(
|
169 |
|
170 |
for row in dataset:
|
171 |
yield (
|
@@ -178,8 +180,8 @@ def batch_iterator(name=None):
|
|
178 |
gc.collect()
|
179 |
|
180 |
# code
|
181 |
-
if
|
182 |
-
dataset = load_dataset(
|
183 |
|
184 |
for row in dataset:
|
185 |
yield (
|
@@ -194,9 +196,9 @@ def batch_iterator(name=None):
|
|
194 |
gc.collect()
|
195 |
|
196 |
# code
|
197 |
-
if
|
198 |
for split in ['train', 'test']:
|
199 |
-
dataset = load_dataset(
|
200 |
|
201 |
for row in dataset:
|
202 |
yield (
|
@@ -209,8 +211,8 @@ def batch_iterator(name=None):
|
|
209 |
gc.collect()
|
210 |
|
211 |
# math
|
212 |
-
if
|
213 |
-
dataset = load_dataset(
|
214 |
|
215 |
for row in dataset['text']:
|
216 |
yield row
|
@@ -219,9 +221,9 @@ def batch_iterator(name=None):
|
|
219 |
gc.collect()
|
220 |
|
221 |
# math
|
222 |
-
if
|
223 |
for split in ['train', 'val', 'test']:
|
224 |
-
dataset = load_dataset(
|
225 |
|
226 |
for row in dataset:
|
227 |
yield (
|
@@ -234,8 +236,8 @@ def batch_iterator(name=None):
|
|
234 |
gc.collect()
|
235 |
|
236 |
# math
|
237 |
-
if
|
238 |
-
dataset = load_dataset(
|
239 |
|
240 |
for row in dataset:
|
241 |
yield (
|
@@ -248,8 +250,8 @@ def batch_iterator(name=None):
|
|
248 |
gc.collect()
|
249 |
|
250 |
# math
|
251 |
-
if
|
252 |
-
dataset = load_dataset(
|
253 |
|
254 |
for row in dataset:
|
255 |
yield (
|
@@ -262,9 +264,9 @@ def batch_iterator(name=None):
|
|
262 |
gc.collect()
|
263 |
|
264 |
# math
|
265 |
-
if
|
266 |
for split in ['train', 'test']:
|
267 |
-
dataset = load_dataset(
|
268 |
|
269 |
for row in dataset:
|
270 |
yield (
|
@@ -277,8 +279,8 @@ def batch_iterator(name=None):
|
|
277 |
gc.collect()
|
278 |
|
279 |
# reasoning
|
280 |
-
if
|
281 |
-
dataset = load_dataset(
|
282 |
|
283 |
for row in dataset:
|
284 |
yield (
|
@@ -293,8 +295,8 @@ def batch_iterator(name=None):
|
|
293 |
gc.collect()
|
294 |
|
295 |
# emoji
|
296 |
-
if
|
297 |
-
dataset = load_dataset(
|
298 |
|
299 |
for row in dataset:
|
300 |
yield (
|
@@ -311,14 +313,36 @@ def batch_iterator(name=None):
|
|
311 |
|
312 |
del dataset
|
313 |
gc.collect()
|
314 |
-
|
315 |
-
|
316 |
-
def
|
317 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
318 |
text_ids = tokenizer.encode(text, bos=False, eos=True)
|
319 |
yield text_ids
|
320 |
|
321 |
-
|
322 |
datasets_names = [
|
323 |
'saillab/taco-datasets',
|
324 |
# 'xu-song/cc100-samples',
|
@@ -340,12 +364,80 @@ datasets_names = [
|
|
340 |
# 'SkunkworksAI/reasoning-0.01',
|
341 |
'badrex/llm-emoji-dataset',
|
342 |
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
343 |
|
344 |
outputs = optimize(
|
345 |
fn=partial(tokenize_fn, tokenizer=Tokenizer('..')),
|
346 |
-
inputs=
|
347 |
output_dir='../pretrain-data/',
|
348 |
# Number of tokens to store by chunks. This is roughly 64MB of tokens per chunk.
|
349 |
chunk_size=(2049 * 8012),
|
350 |
-
num_workers=
|
351 |
)
|
|
|
1 |
import gc
|
2 |
+
from typing import Optional
|
3 |
|
4 |
from datasets import load_dataset
|
5 |
from litdata import optimize, TokensLoader
|
|
|
7 |
from functools import partial
|
8 |
|
9 |
|
10 |
+
"""
|
11 |
+
def batch_iterator_1(path=None):
|
12 |
# text
|
13 |
+
if path in (None, 'saillab/taco-datasets'):
|
14 |
dataset = (
|
15 |
+
load_dataset(path, data_dir=data_dir, split='train')
|
16 |
for data_dir in [
|
17 |
'multilingual-instruction-tuning-dataset /multilingual-alpaca-52k-gpt-4',
|
18 |
'multilingual-instruction-tuning-dataset /multilinugal-dolly-15k',
|
|
|
34 |
gc.collect()
|
35 |
|
36 |
# text
|
37 |
+
if path in (None, 'xu-song/cc100-samples'):
|
38 |
dataset = (
|
39 |
+
load_dataset(path, lang, split='train')
|
40 |
for lang in [
|
41 |
'am', 'ar', 'as', 'az', 'be', 'bg', 'bn', 'bn_rom', 'br',
|
42 |
'bs', 'ca', 'cs', 'cy', 'da', 'de', 'el', 'en', 'eo', 'es',
|
|
|
62 |
gc.collect()
|
63 |
|
64 |
# text
|
65 |
+
if path in (None, 'ontocord/fineweb-permissive-multilingual-2m'):
|
66 |
+
dataset = load_dataset(path, split='train')
|
67 |
|
68 |
for row in dataset['text']:
|
69 |
yield row
|
|
|
72 |
gc.collect()
|
73 |
|
74 |
# text
|
75 |
+
if path in (None, 'MuskumPillerum/General-Knowledge'):
|
76 |
+
dataset = load_dataset(path, split='train')
|
77 |
|
78 |
for row in dataset:
|
79 |
if not row['Question'] or not row['Answer']:
|
|
|
85 |
gc.collect()
|
86 |
|
87 |
# text
|
88 |
+
if path in (None, 'yirenc/general_knowledge_boolean'):
|
89 |
for split in ['train', 'validation']:
|
90 |
+
dataset = load_dataset(path, split=split)
|
91 |
|
92 |
for row in dataset:
|
93 |
yield row['question'] + '? ' + str(row['answer']) + '. ' + row['passage']
|
|
|
96 |
gc.collect()
|
97 |
|
98 |
# text
|
99 |
+
if path in (None, 'nampdn-ai/tiny-textbooks'):
|
100 |
for split in ['train', 'test']:
|
101 |
+
dataset = load_dataset(path, split=split)
|
102 |
|
103 |
for row in dataset['textbook']:
|
104 |
yield row
|
|
|
107 |
gc.collect()
|
108 |
|
109 |
# code
|
110 |
+
if path in (None, 'nampdn-ai/tiny-codes'):
|
111 |
+
dataset = load_dataset(path, split='train')
|
112 |
|
113 |
for row in dataset:
|
114 |
yield (
|
|
|
121 |
gc.collect()
|
122 |
|
123 |
# code
|
124 |
+
if path in (None, 'bigcode/the-stack-smol-xs'):
|
125 |
dataset = (
|
126 |
+
load_dataset(path, lang, split='train', trust_remote_code=True)
|
127 |
for lang in [
|
128 |
'ada', 'agda', 'alloy', 'antlr', 'applescript', 'assembly',
|
129 |
'augeas', 'awk', 'batchfile', 'bison', 'bluespec', 'c',
|
|
|
152 |
gc.collect()
|
153 |
|
154 |
# text + code
|
155 |
+
if path in (None, 'm-a-p/CodeFeedback-Filtered-Instruction'):
|
156 |
+
dataset = load_dataset(path, split='train')
|
157 |
|
158 |
for row in dataset:
|
159 |
yield (
|
|
|
166 |
gc.collect()
|
167 |
|
168 |
# code
|
169 |
+
if path in (None, 'jtatman/python-code-dataset-500k'):
|
170 |
+
dataset = load_dataset(path, split='train')
|
171 |
|
172 |
for row in dataset:
|
173 |
yield (
|
|
|
180 |
gc.collect()
|
181 |
|
182 |
# code
|
183 |
+
if path in (None, 'iamtarun/python_code_instructions_18k_alpaca'):
|
184 |
+
dataset = load_dataset(path, split='train')
|
185 |
|
186 |
for row in dataset:
|
187 |
yield (
|
|
|
196 |
gc.collect()
|
197 |
|
198 |
# code
|
199 |
+
if path in (None, 'HuggingFaceH4/CodeAlpaca_20K'):
|
200 |
for split in ['train', 'test']:
|
201 |
+
dataset = load_dataset(path, split=split)
|
202 |
|
203 |
for row in dataset:
|
204 |
yield (
|
|
|
211 |
gc.collect()
|
212 |
|
213 |
# math
|
214 |
+
if path in (None, 'gair-prox/open-web-math-pro'):
|
215 |
+
dataset = load_dataset(path, split='train')
|
216 |
|
217 |
for row in dataset['text']:
|
218 |
yield row
|
|
|
221 |
gc.collect()
|
222 |
|
223 |
# math
|
224 |
+
if path in (None, 'rvv-karma/Math-QA'):
|
225 |
for split in ['train', 'val', 'test']:
|
226 |
+
dataset = load_dataset(path, split=split)
|
227 |
|
228 |
for row in dataset:
|
229 |
yield (
|
|
|
236 |
gc.collect()
|
237 |
|
238 |
# math
|
239 |
+
if path in (None, 'ajibawa-2023/Maths-College'):
|
240 |
+
dataset = load_dataset(path, split='train')
|
241 |
|
242 |
for row in dataset:
|
243 |
yield (
|
|
|
250 |
gc.collect()
|
251 |
|
252 |
# math
|
253 |
+
if path in (None, 'microsoft/orca-math-word-problems-200k'):
|
254 |
+
dataset = load_dataset(path, split='train')
|
255 |
|
256 |
for row in dataset:
|
257 |
yield (
|
|
|
264 |
gc.collect()
|
265 |
|
266 |
# math
|
267 |
+
if path in (None, 'fblgit/simple-math'):
|
268 |
for split in ['train', 'test']:
|
269 |
+
dataset = load_dataset(path, revision='refs/convert/parquet', split=split)
|
270 |
|
271 |
for row in dataset:
|
272 |
yield (
|
|
|
279 |
gc.collect()
|
280 |
|
281 |
# reasoning
|
282 |
+
if path in (None, 'SkunkworksAI/reasoning-0.01'):
|
283 |
+
dataset = load_dataset(path, split='train')
|
284 |
|
285 |
for row in dataset:
|
286 |
yield (
|
|
|
295 |
gc.collect()
|
296 |
|
297 |
# emoji
|
298 |
+
if path in (None, 'badrex/llm-emoji-dataset'):
|
299 |
+
dataset = load_dataset(path, split='train')
|
300 |
|
301 |
for row in dataset:
|
302 |
yield (
|
|
|
313 |
|
314 |
del dataset
|
315 |
gc.collect()
|
316 |
+
"""
|
317 |
+
|
318 |
+
def batch_iterator(path: str,
                   name: Optional[str]=None,
                   data_dir: Optional[str]=None,
                   data_files: Optional[str]=None,
                   revision: Optional[str]=None,
                   split: str='train',
                   format: Optional[str]=None):
    """Yield one formatted text sample per row of a dataset.

    The dataset is loaded with ``load_dataset`` and every row is rendered
    through ``format.format(**row)``, so the placeholders in ``format`` must
    be keys of the row mapping.

    Args:
        path: Dataset path, forwarded to ``load_dataset``.
        name: Optional configuration name, forwarded to ``load_dataset``.
        data_dir: Optional data directory, forwarded to ``load_dataset``.
        data_files: Optional data file(s), forwarded to ``load_dataset``.
        revision: Optional revision, forwarded to ``load_dataset``.
        split: Split expression, forwarded to ``load_dataset``
            (defaults to ``'train'``).
        format: ``str.format`` template applied to every row. Required,
            even though it defaults to ``None`` for keyword-only use in
            config dictionaries.

    Yields:
        The rendered text of each row, in dataset order.

    Raises:
        ValueError: If ``format`` is ``None``. Because this is a generator,
            the check runs when the first item is requested, not at call time.
    """
    # Raise explicitly instead of using ``assert``: assertions are removed
    # under ``python -O``, which would turn a missing template into an
    # opaque ``AttributeError`` on ``None.format`` further down.
    if format is None:
        raise ValueError(
            "batch_iterator() requires a 'format' template, e.g. '{text}'"
        )

    # NOTE(review): trust_remote_code=True is passed for every dataset, not
    # only for the ones that need a loading script -- confirm this is intended.
    dataset = load_dataset(path=path,
                           name=name,
                           data_dir=data_dir,
                           data_files=data_files,
                           revision=revision,
                           split=split,
                           trust_remote_code=True)

    for row in dataset:
        # NOTE(review): a row value of None is rendered as the literal text
        # 'None' by str.format; rows are not filtered here.
        text = format.format(**row)
        yield text
|
338 |
+
|
339 |
+
|
340 |
+
def tokenize_fn(datasets_config, tokenizer=None):
    """Generate the encoded form of every text sample of one dataset config.

    Args:
        datasets_config: Mapping that is expanded into the keyword arguments
            of ``batch_iterator``.
        tokenizer: Object providing ``encode(text, bos=..., eos=...)``. It
            defaults to ``None`` only so it can be bound with
            ``functools.partial``; a real tokenizer must be supplied.

    Yields:
        Whatever ``tokenizer.encode`` returns for each sample, in order.
    """
    samples = batch_iterator(**datasets_config)

    for sample in samples:
        # Encoded with eos=True and bos=False, exactly as requested below.
        yield tokenizer.encode(sample, bos=False, eos=True)
|
344 |
|
345 |
+
"""
|
346 |
datasets_names = [
|
347 |
'saillab/taco-datasets',
|
348 |
# 'xu-song/cc100-samples',
|
|
|
364 |
# 'SkunkworksAI/reasoning-0.01',
|
365 |
'badrex/llm-emoji-dataset',
|
366 |
]
|
367 |
+
"""
|
368 |
+
|
369 |
+
# One dictionary per dataset load. Each dictionary is expanded into the
# keyword arguments of `batch_iterator`, so the allowed keys are `path`,
# `name`, `data_dir`, `data_files`, `revision`, `split` and `format`.
# Entries without `split` fall back to batch_iterator's default, 'train'.
# `format` is a str.format template whose placeholders are looked up as keys
# of each dataset row (presumably the dataset's column names -- confirm
# against each dataset card).
datasets_configs = [
    # instruction / input / output records
    {'path': 'yahma/alpaca-cleaned', 'format': '{instruction} {input} {output}'},
    {'path': 'gbharti/wealth-alpaca_lora', 'format': '{instruction} {input} {output}'},
    # text
    # NOTE(review): the space before '/' and the spelling 'multilinugal' are
    # the same strings the earlier, commented-out loader used; presumably they
    # mirror the upstream repository layout -- do not "fix" without checking.
    *[
        {'path': 'saillab/taco-datasets', 'data_dir': data_dir, 'split': 'train[:10%]', 'format': '{instruction} {input} {output}'}
        for data_dir in [
            'multilingual-instruction-tuning-dataset /multilingual-alpaca-52k-gpt-4',
            'multilingual-instruction-tuning-dataset /multilinugal-dolly-15k',
        ]
    ],
    # text: one entry per cc100 language configuration
    *[
        {'path': 'xu-song/cc100-samples', 'name': name, 'split': 'train[:10%]', 'format': '{text}'}
        for name in [
            'am', 'ar', 'as', 'az', 'be', 'bg', 'bn', 'bn_rom', 'br',
            'bs', 'ca', 'cs', 'cy', 'da', 'de', 'el', 'en', 'eo', 'es',
            'et', 'eu', 'fa', 'ff', 'fi', 'fr', 'fy', 'ga', 'gd', 'gl',
            'gn', 'gu', 'ha', 'he', 'hi', 'hi_rom', 'hr', 'ht', 'hu',
            'hy', 'id', 'ig', 'is', 'it', 'ja', 'jv', 'ka', 'kk', 'km',
            'kn', 'ko', 'ku', 'ky', 'la', 'lg', 'li', 'ln', 'lo', 'lt',
            'lv', 'mg', 'mk', 'ml', 'mn', 'mr', 'ms', 'my', 'my_zaw',
            'ne', 'nl', 'no', 'ns', 'om', 'or', 'pa', 'pl', 'ps', 'pt',
            'qu', 'rm', 'ro', 'ru', 'sa', 'si', 'sc', 'sd', 'sk', 'sl',
            'so', 'sq', 'sr', 'ss', 'su', 'sv', 'sw', 'ta', 'ta_rom',
            'te', 'te_rom', 'th', 'tl', 'tn', 'tr', 'ug', 'uk', 'ur',
            'ur_rom', 'uz', 'vi', 'wo', 'xh', 'yi', 'yo',
            'zh-Hans', 'zh-Hant', 'zu',
        ]
    ],
    # text
    {'path': 'ontocord/fineweb-permissive-multilingual-2m', 'split': 'train[:5%]', 'format': '{text}'},
    # NOTE(review): the earlier loader skipped rows with an empty Question or
    # Answer; this entry does not filter them -- confirm that is acceptable.
    {'path': 'MuskumPillerum/General-Knowledge', 'format': '{Question} {Answer}'},
    {'path': 'yirenc/general_knowledge_boolean', 'split': 'train+validation', 'format': '{question}? {answer}. {passage}'},
    {'path': 'nampdn-ai/tiny-textbooks', 'split': 'train+test', 'format': '{textbook}'},
    # code
    {'path': 'nampdn-ai/tiny-codes', 'split': 'train[:5%]', 'format': '{prompt} {response}'},
    # code: one entry per programming-language configuration
    *[
        {'path': 'bigcode/the-stack-smol-xs', 'name': name, 'format': '{content}'}
        for name in [
            'ada', 'agda', 'alloy', 'antlr', 'applescript', 'assembly',
            'augeas', 'awk', 'batchfile', 'bison', 'bluespec', 'c',
            'c++', 'c-sharp', 'clojure', 'cmake', 'coffeescript', 'common-lisp',
            'css', 'cuda', 'dart', 'dockerfile', 'elixir',
            'elm', 'emacs-lisp','erlang', 'f-sharp', 'fortran', 'glsl', 'go',
            'groovy', 'haskell','html', 'idris', 'isabelle', 'java',
            'java-server-pages', 'javascript', 'julia', 'kotlin', 'lean',
            'literate-agda', 'literate-coffeescript', 'literate-haskell',
            'lua', 'makefile', 'maple', 'markdown', 'mathematica', 'matlab',
            'ocaml', 'pascal', 'perl', 'php', 'powershell', 'prolog',
            'protocol-buffer', 'python', 'r', 'racket', 'restructuredtext',
            'rmarkdown', 'ruby', 'rust', 'sas', 'scala', 'scheme',
            'shell', 'smalltalk', 'solidity', 'sparql', 'sql', 'stan',
            'standard-ml', 'stata', 'systemverilog', 'tcl', 'tcsh', 'tex',
            'thrift', 'typescript', 'verilog', 'vhdl', 'visual-basic', 'xslt',
            'yacc', 'zig',
        ]
    ],
    # text + code
    {'path': 'm-a-p/CodeFeedback-Filtered-Instruction', 'split': 'train', 'format': '{query} {answer}'},
    # code
    {'path': 'jtatman/python-code-dataset-500k', 'format': '{instruction} {output}'},
    {'path': 'iamtarun/python_code_instructions_18k_alpaca', 'format': '{instruction} {input} {output}'},
    {'path': 'HuggingFaceH4/CodeAlpaca_20K', 'split': 'train+test', 'format': '{prompt} {completion}'},
    # math
    {'path': 'gair-prox/open-web-math-pro', 'split': 'train[:5%]', 'format': '{text}'},
    {'path': 'rvv-karma/Math-QA', 'split': 'train+val+test', 'format': '{question} {answer}'},
    {'path': 'ajibawa-2023/Maths-College', 'split': 'train[:10%]', 'format': '{instruction} {output}'},
    {'path': 'microsoft/orca-math-word-problems-200k', 'format': '{question} {answer}'},
    {'path': 'fblgit/simple-math', 'revision': 'refs/convert/parquet', 'split': 'train+test', 'format': '{instruction} = {output}'},
    # reasoning
    {'path': 'SkunkworksAI/reasoning-0.01', 'format': '{instruction} {reasoning} {output}'},
    # emoji: placeholders containing spaces are looked up verbatim as row keys
    {'path': 'badrex/llm-emoji-dataset', 'format': '{character} {unicode} {short description} {tags} {LLM description}'},
]
|
435 |
|
436 |
# Run the tokenization over every entry of `datasets_configs` and write the
# result under `output_dir`. This executes at import time (there is no
# `__main__` guard).
# NOTE(review): `TokensLoader` is imported at the top of the file but is not
# passed to `optimize` here -- confirm whether `item_loader=TokensLoader()` is
# required by the installed litdata version.
outputs = optimize(
    # `tokenize_fn` with the tokenizer bound once; the Tokenizer is built from
    # the parent directory ('..'), presumably a litgpt checkpoint dir -- confirm.
    fn=partial(tokenize_fn, tokenizer=Tokenizer('..')),
    # Presumably each list element is handed to `fn` as `datasets_config`.
    inputs=datasets_configs,
    output_dir='../pretrain-data/',
    # Number of tokens to store by chunks. This is roughly 64MB of tokens per chunk.
    chunk_size=(2049 * 8012),
    num_workers=32,
)
|
scripts/pretrain-model.yaml
CHANGED
@@ -1,13 +1,13 @@
|
|
1 |
# The name of the model to pretrain. Choose from names in ``litgpt.config``. Mutually exclusive with
|
2 |
# ``model_config``. (type: Optional[str], default: null)
|
3 |
-
model_name: "
|
4 |
|
5 |
# A ``litgpt.Config`` object to define the model architecture. Mutually exclusive with
|
6 |
# ``model_name``. (type: Optional[Config], default: null)
|
7 |
model_config:
|
8 |
padded_vocab_size: 38400
|
9 |
vocab_size: 38400
|
10 |
-
block_size:
|
11 |
n_layer: 5
|
12 |
n_head: 32
|
13 |
head_size: null
|
@@ -20,7 +20,12 @@ model_config:
|
|
20 |
norm_eps: 1e-05
|
21 |
mlp_class_name: "LLaMAMLP"
|
22 |
intermediate_size: 3584
|
23 |
-
rope_base:
|
|
|
|
|
|
|
|
|
|
|
24 |
|
25 |
# Directory in which to save checkpoints and logs. If running in a Lightning Studio Job, look for it in
|
26 |
# /teamspace/jobs/<job-name>/share. (type: <class 'Path'>, default: out/pretrain)
|
@@ -71,7 +76,7 @@ train:
|
|
71 |
|
72 |
# Total number of tokens to train on (type: Optional[int], default: 3000000000000)
|
73 |
# max_tokens: 3000000000000
|
74 |
-
max_tokens: ??? # ? * 2049 *
|
75 |
|
76 |
# Limits the number of optimizer steps to run. (type: Optional[int], default: null)
|
77 |
max_steps:
|
|
|
1 |
# The name of the model to pretrain. Choose from names in ``litgpt.config``. Mutually exclusive with
|
2 |
# ``model_config``. (type: Optional[str], default: null)
|
3 |
+
model_name: "Llama-3.2-1B"
|
4 |
|
5 |
# A ``litgpt.Config`` object to define the model architecture. Mutually exclusive with
|
6 |
# ``model_config``. (type: Optional[Config], default: null)
|
7 |
model_config:
|
8 |
padded_vocab_size: 38400
|
9 |
vocab_size: 38400
|
10 |
+
block_size: 8192
|
11 |
n_layer: 5
|
12 |
n_head: 32
|
13 |
head_size: null
|
|
|
20 |
norm_eps: 1e-05
|
21 |
mlp_class_name: "LLaMAMLP"
|
22 |
intermediate_size: 3584
|
23 |
+
rope_base: 500000
|
24 |
+
rope_adjustments:
|
25 |
+
factor: 32.0
|
26 |
+
low_freq_factor: 1.0
|
27 |
+
high_freq_factor: 4.0
|
28 |
+
original_max_seq_len: 8192
|
29 |
|
30 |
# Directory in which to save checkpoints and logs. If running in a Lightning Studio Job, look for it in
|
31 |
# /teamspace/jobs/<job-name>/share. (type: <class 'Path'>, default: out/pretrain)
|
|
|
76 |
|
77 |
# Total number of tokens to train on (type: Optional[int], default: 3000000000000)
|
78 |
# max_tokens: 3000000000000
|
79 |
+
max_tokens: ??? # ? * 2049 * 5
|
80 |
|
81 |
# Limits the number of optimizer steps to run. (type: Optional[int], default: null)
|
82 |
max_steps:
|