mtasic85 committed
Commit 1816ac6
1 Parent(s): abd5982

pretrain model
.gitignore CHANGED
@@ -163,4 +163,6 @@ cython_debug/
 .DS_Store
 .ruff_cache
 venv*/
-data/
+data/
+pretrain-data/
+contrain-data/
scripts/prepare_pretrain_dataset.py CHANGED
@@ -233,7 +233,7 @@ datasets_names = [
 outputs = optimize(
     fn=partial(tokenize_fn, tokenizer=Tokenizer('..')),
     inputs=datasets_names,
-    output_dir='../data/',
+    output_dir='../pretrain-data/',
     # Number of tokens to store by chunks. This is roughly 64MB of tokens per chunk.
     chunk_size=(4097 * 4006),
     num_workers=16,
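
The chunk_size comment is easy to verify: a quick arithmetic check, assuming tokens are stored as 4-byte integers (an assumption about litdata's on-disk format, not something the script states):

tokens_per_chunk = 4097 * 4006          # 16,412,582 tokens per chunk
bytes_per_chunk = tokens_per_chunk * 4  # 65,650,328 bytes, assuming 4-byte token ids
print(f'{bytes_per_chunk / 1e6:.1f} MB per chunk')  # ~65.7 MB, i.e. "roughly 64MB"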
scripts/pretrain-model.yaml CHANGED
@@ -46,7 +46,7 @@ data:
   class_path: LitData

   init_args:
-    data_path: "../data/"
+    data_path: "../pretrain-data/"
     num_workers: 16

 # Training-related arguments. See ``litgpt.args.TrainArgs`` for details
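
For context, a minimal sketch of how shards written by optimize() can be read back, assuming litdata's streaming API; the block_size value is illustrative and not taken from this config:

from litdata.streaming import StreamingDataset, TokensLoader

# Stream the pretraining shards produced by scripts/prepare_pretrain_dataset.py.
# block_size is a hypothetical context length, not a value from pretrain-model.yaml.
dataset = StreamingDataset(
    input_dir='../pretrain-data/',
    item_loader=TokensLoader(block_size=2048),
)
print(len(dataset), 'blocks')

In the repo itself this reading is handled by the LitData module configured above; training would then typically be launched by pointing the litgpt CLI at this file (e.g. litgpt pretrain --config scripts/pretrain-model.yaml, assuming current litgpt CLI conventions).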