hlzhang109 committed on
Commit
31e0125
1 Parent(s): bf823f3

Upload folder using huggingface_hub

Browse files
models/random_1b/config.yaml CHANGED
@@ -16,18 +16,18 @@ update_reference: false
16
  learner_model: null
17
  train_online: false
18
  fix_reference: false
19
- run_name: olmo_32180564_4
20
  seed: 0
21
  epoch: null
22
  dry_run: false
23
  model:
24
- d_model: 1024
25
- n_heads: 16
26
  n_kv_heads: null
27
  clip_qkv: null
28
- n_layers: 12
29
  mlp_ratio: 4
30
- mlp_hidden_size: 4096
31
  activation_type: gelu
32
  block_type: sequential
33
  block_group_size: 1
@@ -255,7 +255,7 @@ data:
255
  label_mask_paths: null
256
  pad_direction: right
257
  generate_attention_mask: false
258
- num_workers: 0
259
  drop_last: true
260
  pin_memory: true
261
  prefetch_factor: 16
@@ -265,6 +265,7 @@ data:
265
  extra_data_paths: null
266
  extra_data_key: null
267
  load_extra_data_to_ram: false
 
268
  restore_dataloader: true
269
  fast_forward_batches: null
270
  evaluators:
@@ -288,6 +289,7 @@ evaluators:
288
  extra_data_paths: null
289
  extra_data_key: null
290
  load_extra_data_to_ram: false
 
291
  device_eval_batch_size: null
292
  subset_num_batches: null
293
  sft_use_label: false
@@ -312,6 +314,7 @@ evaluators:
312
  extra_data_paths: null
313
  extra_data_key: null
314
  load_extra_data_to_ram: false
 
315
  device_eval_batch_size: null
316
  subset_num_batches: null
317
  sft_use_label: false
@@ -342,6 +345,7 @@ evaluators:
342
  extra_data_paths: null
343
  extra_data_key: null
344
  load_extra_data_to_ram: false
 
345
  device_eval_batch_size: null
346
  subset_num_batches: null
347
  sft_use_label: false
@@ -364,6 +368,7 @@ evaluators:
364
  extra_data_paths: null
365
  extra_data_key: null
366
  load_extra_data_to_ram: false
 
367
  device_eval_batch_size: null
368
  subset_num_batches: null
369
  sft_use_label: false
@@ -386,6 +391,7 @@ evaluators:
386
  extra_data_paths: null
387
  extra_data_key: null
388
  load_extra_data_to_ram: false
 
389
  device_eval_batch_size: null
390
  subset_num_batches: null
391
  sft_use_label: false
@@ -408,6 +414,7 @@ evaluators:
408
  extra_data_paths: null
409
  extra_data_key: null
410
  load_extra_data_to_ram: false
 
411
  device_eval_batch_size: null
412
  subset_num_batches: null
413
  sft_use_label: false
@@ -430,6 +437,7 @@ evaluators:
430
  extra_data_paths: null
431
  extra_data_key: null
432
  load_extra_data_to_ram: false
 
433
  device_eval_batch_size: null
434
  subset_num_batches: null
435
  sft_use_label: false
@@ -452,6 +460,7 @@ evaluators:
452
  extra_data_paths: null
453
  extra_data_key: null
454
  load_extra_data_to_ram: false
 
455
  device_eval_batch_size: null
456
  subset_num_batches: null
457
  sft_use_label: false
@@ -474,6 +483,7 @@ evaluators:
474
  extra_data_paths: null
475
  extra_data_key: null
476
  load_extra_data_to_ram: false
 
477
  device_eval_batch_size: null
478
  subset_num_batches: null
479
  sft_use_label: false
@@ -496,6 +506,7 @@ evaluators:
496
  extra_data_paths: null
497
  extra_data_key: null
498
  load_extra_data_to_ram: false
 
499
  device_eval_batch_size: null
500
  subset_num_batches: null
501
  sft_use_label: false
@@ -518,6 +529,7 @@ evaluators:
518
  extra_data_paths: null
519
  extra_data_key: null
520
  load_extra_data_to_ram: false
 
521
  device_eval_batch_size: null
522
  subset_num_batches: null
523
  sft_use_label: false
@@ -540,6 +552,7 @@ evaluators:
540
  extra_data_paths: null
541
  extra_data_key: null
542
  load_extra_data_to_ram: false
 
543
  device_eval_batch_size: null
544
  subset_num_batches: 1000
545
  sft_use_label: false
@@ -562,6 +575,7 @@ evaluators:
562
  extra_data_paths: null
563
  extra_data_key: null
564
  load_extra_data_to_ram: false
 
565
  device_eval_batch_size: null
566
  subset_num_batches: 1000
567
  sft_use_label: false
@@ -584,6 +598,7 @@ evaluators:
584
  extra_data_paths: null
585
  extra_data_key: null
586
  load_extra_data_to_ram: false
 
587
  device_eval_batch_size: null
588
  subset_num_batches: 1000
589
  sft_use_label: false
@@ -606,6 +621,7 @@ evaluators:
606
  extra_data_paths: null
607
  extra_data_key: null
608
  load_extra_data_to_ram: false
 
609
  device_eval_batch_size: null
610
  subset_num_batches: 1000
611
  sft_use_label: false
@@ -628,6 +644,7 @@ evaluators:
628
  extra_data_paths: null
629
  extra_data_key: null
630
  load_extra_data_to_ram: false
 
631
  device_eval_batch_size: null
632
  subset_num_batches: 1000
633
  sft_use_label: false
@@ -650,6 +667,7 @@ evaluators:
650
  extra_data_paths: null
651
  extra_data_key: null
652
  load_extra_data_to_ram: false
 
653
  device_eval_batch_size: null
654
  subset_num_batches: 1000
655
  sft_use_label: false
@@ -672,6 +690,7 @@ evaluators:
672
  extra_data_paths: null
673
  extra_data_key: null
674
  load_extra_data_to_ram: false
 
675
  device_eval_batch_size: null
676
  subset_num_batches: 1000
677
  sft_use_label: false
@@ -694,6 +713,7 @@ evaluators:
694
  extra_data_paths: null
695
  extra_data_key: null
696
  load_extra_data_to_ram: false
 
697
  device_eval_batch_size: null
698
  subset_num_batches: 1000
699
  sft_use_label: false
@@ -702,7 +722,7 @@ eval_interval: 2000
702
  tokenizer:
703
  identifier: allenai/eleuther-ai-gpt-neox-20b-pii-special
704
  truncate_direction: right
705
- save_folder: /n/vast-scratch/kempner_sham_lab/data-olmo/ckpts/32180564_4/
706
  remote_save_folder: null
707
  canceled_check_interval: 50
708
  save_interval: 5000
@@ -713,7 +733,7 @@ save_num_unsharded_checkpoints_to_keep: 1
713
  save_overwrite: true
714
  force_save_unsharded: false
715
  no_pre_train_checkpoint: false
716
- load_path: null
717
  load_path_sharded_checkpointer: null
718
  reset_optimizer_state: false
719
  reset_trainer_state: false
@@ -721,8 +741,8 @@ sharded_checkpointer: torch_legacy
721
  new_style_checkpoints: null
722
  max_duration: 192000
723
  global_train_batch_size: 256
724
- device_train_batch_size: 256
725
- device_train_microbatch_size: 128
726
  device_eval_batch_size: 64
727
  eval_subset_num_batches: 100
728
  eval_on_load: false
@@ -733,8 +753,8 @@ precision: amp_bf16
733
  wandb:
734
  project: data-olmo
735
  entity: harvardml
736
- group: pretrain-150-9
737
- name: olmo_32180564_4
738
  tags:
739
  - watching
740
  log_artifacts: false
@@ -756,7 +776,7 @@ fsdp:
756
  precision: mixed
757
  hybrid_sharding_num_model_replicas: null
758
  softmax_auxiliary_loss: true
759
- time_limit: 210000.0
760
  extra_steps_after_cancel: 10
761
  early_stopping_factor: null
762
  save_data_indices: true
 
16
  learner_model: null
17
  train_online: false
18
  fix_reference: false
19
+ run_name: olmo_33178628_1
20
  seed: 0
21
  epoch: null
22
  dry_run: false
23
  model:
24
+ d_model: 2048
25
+ n_heads: 32
26
  n_kv_heads: null
27
  clip_qkv: null
28
+ n_layers: 24
29
  mlp_ratio: 4
30
+ mlp_hidden_size: 8192
31
  activation_type: gelu
32
  block_type: sequential
33
  block_group_size: 1
 
255
  label_mask_paths: null
256
  pad_direction: right
257
  generate_attention_mask: false
258
+ num_workers: 16
259
  drop_last: true
260
  pin_memory: true
261
  prefetch_factor: 16
 
265
  extra_data_paths: null
266
  extra_data_key: null
267
  load_extra_data_to_ram: false
268
+ index_path: null
269
  restore_dataloader: true
270
  fast_forward_batches: null
271
  evaluators:
 
289
  extra_data_paths: null
290
  extra_data_key: null
291
  load_extra_data_to_ram: false
292
+ index_path: null
293
  device_eval_batch_size: null
294
  subset_num_batches: null
295
  sft_use_label: false
 
314
  extra_data_paths: null
315
  extra_data_key: null
316
  load_extra_data_to_ram: false
317
+ index_path: null
318
  device_eval_batch_size: null
319
  subset_num_batches: null
320
  sft_use_label: false
 
345
  extra_data_paths: null
346
  extra_data_key: null
347
  load_extra_data_to_ram: false
348
+ index_path: null
349
  device_eval_batch_size: null
350
  subset_num_batches: null
351
  sft_use_label: false
 
368
  extra_data_paths: null
369
  extra_data_key: null
370
  load_extra_data_to_ram: false
371
+ index_path: null
372
  device_eval_batch_size: null
373
  subset_num_batches: null
374
  sft_use_label: false
 
391
  extra_data_paths: null
392
  extra_data_key: null
393
  load_extra_data_to_ram: false
394
+ index_path: null
395
  device_eval_batch_size: null
396
  subset_num_batches: null
397
  sft_use_label: false
 
414
  extra_data_paths: null
415
  extra_data_key: null
416
  load_extra_data_to_ram: false
417
+ index_path: null
418
  device_eval_batch_size: null
419
  subset_num_batches: null
420
  sft_use_label: false
 
437
  extra_data_paths: null
438
  extra_data_key: null
439
  load_extra_data_to_ram: false
440
+ index_path: null
441
  device_eval_batch_size: null
442
  subset_num_batches: null
443
  sft_use_label: false
 
460
  extra_data_paths: null
461
  extra_data_key: null
462
  load_extra_data_to_ram: false
463
+ index_path: null
464
  device_eval_batch_size: null
465
  subset_num_batches: null
466
  sft_use_label: false
 
483
  extra_data_paths: null
484
  extra_data_key: null
485
  load_extra_data_to_ram: false
486
+ index_path: null
487
  device_eval_batch_size: null
488
  subset_num_batches: null
489
  sft_use_label: false
 
506
  extra_data_paths: null
507
  extra_data_key: null
508
  load_extra_data_to_ram: false
509
+ index_path: null
510
  device_eval_batch_size: null
511
  subset_num_batches: null
512
  sft_use_label: false
 
529
  extra_data_paths: null
530
  extra_data_key: null
531
  load_extra_data_to_ram: false
532
+ index_path: null
533
  device_eval_batch_size: null
534
  subset_num_batches: null
535
  sft_use_label: false
 
552
  extra_data_paths: null
553
  extra_data_key: null
554
  load_extra_data_to_ram: false
555
+ index_path: null
556
  device_eval_batch_size: null
557
  subset_num_batches: 1000
558
  sft_use_label: false
 
575
  extra_data_paths: null
576
  extra_data_key: null
577
  load_extra_data_to_ram: false
578
+ index_path: null
579
  device_eval_batch_size: null
580
  subset_num_batches: 1000
581
  sft_use_label: false
 
598
  extra_data_paths: null
599
  extra_data_key: null
600
  load_extra_data_to_ram: false
601
+ index_path: null
602
  device_eval_batch_size: null
603
  subset_num_batches: 1000
604
  sft_use_label: false
 
621
  extra_data_paths: null
622
  extra_data_key: null
623
  load_extra_data_to_ram: false
624
+ index_path: null
625
  device_eval_batch_size: null
626
  subset_num_batches: 1000
627
  sft_use_label: false
 
644
  extra_data_paths: null
645
  extra_data_key: null
646
  load_extra_data_to_ram: false
647
+ index_path: null
648
  device_eval_batch_size: null
649
  subset_num_batches: 1000
650
  sft_use_label: false
 
667
  extra_data_paths: null
668
  extra_data_key: null
669
  load_extra_data_to_ram: false
670
+ index_path: null
671
  device_eval_batch_size: null
672
  subset_num_batches: 1000
673
  sft_use_label: false
 
690
  extra_data_paths: null
691
  extra_data_key: null
692
  load_extra_data_to_ram: false
693
+ index_path: null
694
  device_eval_batch_size: null
695
  subset_num_batches: 1000
696
  sft_use_label: false
 
713
  extra_data_paths: null
714
  extra_data_key: null
715
  load_extra_data_to_ram: false
716
+ index_path: null
717
  device_eval_batch_size: null
718
  subset_num_batches: 1000
719
  sft_use_label: false
 
722
  tokenizer:
723
  identifier: allenai/eleuther-ai-gpt-neox-20b-pii-special
724
  truncate_direction: right
725
+ save_folder: /n/holyscratch01/sham_lab/data-olmo/ckpts/33178628_1/
726
  remote_save_folder: null
727
  canceled_check_interval: 50
728
  save_interval: 5000
 
733
  save_overwrite: true
734
  force_save_unsharded: false
735
  no_pre_train_checkpoint: false
736
+ load_path: /n/holyscratch01/sham_lab/data-olmo/ckpts/32845610_1/step105000
737
  load_path_sharded_checkpointer: null
738
  reset_optimizer_state: false
739
  reset_trainer_state: false
 
741
  new_style_checkpoints: null
742
  max_duration: 192000
743
  global_train_batch_size: 256
744
+ device_train_batch_size: 64
745
+ device_train_microbatch_size: 32
746
  device_eval_batch_size: 64
747
  eval_subset_num_batches: 100
748
  eval_on_load: false
 
753
  wandb:
754
  project: data-olmo
755
  entity: harvardml
756
+ group: pretrain-150-1b-1-test
757
+ name: olmo_33178628_1
758
  tags:
759
  - watching
760
  log_artifacts: false
 
776
  precision: mixed
777
  hybrid_sharding_num_model_replicas: null
778
  softmax_auxiliary_loss: true
779
+ time_limit: 2100000.0
780
  extra_steps_after_cancel: 10
781
  early_stopping_factor: null
782
  save_data_indices: true
models/random_1b/model.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a7845427fd34bb744773b5bd005b61960a176c5fbfb0254b3f1d94882348f292
3
- size 1016309727
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:863777eca7cd7dab0a013edb4469e24de6af6dc806dea3939f73bda271eaa71a
3
+ size 5656891654
models/random_1b/optim.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8fbb2627c922cb0dd673e13f4ba9d487f3b0ca1abd70c7674cafc2f2ef43e3a0
3
- size 2032626246
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dd5cff86b087843aab6600528ed375274c616e8d40853b1099221811c0a70ae1
3
+ size 11313806634
models/random_1b/train.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b15d20b2e3f494f0cf5b5c53be57b9bb618477202d15fe3e2ce3a5ead7465a8e
3
- size 14668
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2ac8c6b6f9f76b9fd9c21a898d148e291f91f17c5d919aa1b6d2327233d96c2a
3
+ size 14604