Commit 31e0125
hlzhang109 committed
1 parent: bf823f3
Upload folder using huggingface_hub

Files changed:
- models/random_1b/config.yaml (+33 -13)
- models/random_1b/model.pt (+2 -2)
- models/random_1b/optim.pt (+2 -2)
- models/random_1b/train.pt (+2 -2)
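Per the commit message, the folder was pushed with the huggingface_hub client. Below is a minimal sketch of that kind of upload, assuming an already-authenticated client; the repo_id is a placeholder, since the target repository is not named in this excerpt.

from huggingface_hub import HfApi

api = HfApi()  # assumes credentials from huggingface-cli login or the HF_TOKEN env var
api.upload_folder(
    folder_path="models/random_1b",       # local checkpoint folder
    path_in_repo="models/random_1b",      # keep the same layout in the repo
    repo_id="hlzhang109/<repo-name>",     # placeholder; actual repo id not shown here
    repo_type="model",
    commit_message="Upload folder using huggingface_hub",
)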
models/random_1b/config.yaml
CHANGED
@@ -16,18 +16,18 @@ update_reference: false
 learner_model: null
 train_online: false
 fix_reference: false
-run_name:
+run_name: olmo_33178628_1
 seed: 0
 epoch: null
 dry_run: false
 model:
-d_model:
-n_heads:
+d_model: 2048
+n_heads: 32
 n_kv_heads: null
 clip_qkv: null
-n_layers:
+n_layers: 24
 mlp_ratio: 4
-mlp_hidden_size:
+mlp_hidden_size: 8192
 activation_type: gelu
 block_type: sequential
 block_group_size: 1
@@ -255,7 +255,7 @@ data:
 label_mask_paths: null
 pad_direction: right
 generate_attention_mask: false
-num_workers:
+num_workers: 16
 drop_last: true
 pin_memory: true
 prefetch_factor: 16
@@ -265,6 +265,7 @@ data:
 extra_data_paths: null
 extra_data_key: null
 load_extra_data_to_ram: false
+index_path: null
 restore_dataloader: true
 fast_forward_batches: null
 evaluators:
@@ -288,6 +289,7 @@ evaluators:
 extra_data_paths: null
 extra_data_key: null
 load_extra_data_to_ram: false
+index_path: null
 device_eval_batch_size: null
 subset_num_batches: null
 sft_use_label: false

[The same one-line addition, "index_path: null" inserted after "load_extra_data_to_ram", is repeated in the eighteen remaining evaluator hunks (@@ -312, -342, -364, -386, -408, -430, -452, -474, -496, -518, -540, -562, -584, -606, -628, -650, -672 and -694). Their context is otherwise identical, except that the last eight of those blocks show subset_num_batches: 1000 rather than null.]

@@ -702,7 +722,7 @@ eval_interval: 2000
 tokenizer:
 identifier: allenai/eleuther-ai-gpt-neox-20b-pii-special
 truncate_direction: right
-save_folder: /n/
+save_folder: /n/holyscratch01/sham_lab/data-olmo/ckpts/33178628_1/
 remote_save_folder: null
 canceled_check_interval: 50
 save_interval: 5000
@@ -713,7 +733,7 @@ save_num_unsharded_checkpoints_to_keep: 1
 save_overwrite: true
 force_save_unsharded: false
 no_pre_train_checkpoint: false
-load_path:
+load_path: /n/holyscratch01/sham_lab/data-olmo/ckpts/32845610_1/step105000
 load_path_sharded_checkpointer: null
 reset_optimizer_state: false
 reset_trainer_state: false
@@ -721,8 +741,8 @@ sharded_checkpointer: torch_legacy
 new_style_checkpoints: null
 max_duration: 192000
 global_train_batch_size: 256
-device_train_batch_size:
-device_train_microbatch_size:
+device_train_batch_size: 64
+device_train_microbatch_size: 32
 device_eval_batch_size: 64
 eval_subset_num_batches: 100
 eval_on_load: false
@@ -733,8 +753,8 @@ precision: amp_bf16
 wandb:
 project: data-olmo
 entity: harvardml
-group: pretrain-150-
-name:
+group: pretrain-150-1b-1-test
+name: olmo_33178628_1
 tags:
 - watching
 log_artifacts: false
@@ -756,7 +776,7 @@ fsdp:
 precision: mixed
 hybrid_sharding_num_model_replicas: null
 softmax_auxiliary_loss: true
-time_limit:
+time_limit: 2100000.0
 extra_steps_after_cancel: 10
 early_stopping_factor: null
 save_data_indices: true
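For reference, the values filled in above pin down the shape of the run. The sketch below does the arithmetic; the vocabulary size (50,304, the padded GPT-NeoX-20B vocabulary) and the assumptions of untied input/output embeddings, no biases, and a plain two-matrix GELU MLP are mine and are not stated in the config.

# Rough model size implied by the new config values (layer norms ignored).
d_model, n_heads, n_layers, mlp_hidden = 2048, 32, 24, 8192
vocab_size = 50_304  # assumption: padded GPT-NeoX-20B vocabulary

head_dim = d_model // n_heads                  # 64
attn = 4 * d_model * d_model                   # q, k, v and output projections
mlp = 2 * d_model * mlp_hidden                 # up- and down-projection
per_layer = attn + mlp                         # ~50.3M
embeddings = 2 * vocab_size * d_model          # input + output, assumed untied
total = n_layers * per_layer + embeddings
print(f"head_dim={head_dim}, ~{total / 1e9:.2f}B parameters")  # ~1.41B

# Batch bookkeeping from the trainer settings.
global_bs, device_bs, micro_bs = 256, 64, 32
print("data-parallel ranks:", global_bs // device_bs)      # 4
print("grad accumulation steps:", device_bs // micro_bs)   # 2

The roughly 1.4B total is consistent with the folder name random_1b.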
models/random_1b/model.pt
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:863777eca7cd7dab0a013edb4469e24de6af6dc806dea3939f73bda271eaa71a
+size 5656891654
models/random_1b/optim.pt
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:dd5cff86b087843aab6600528ed375274c616e8d40853b1099221811c0a70ae1
+size 11313806634
models/random_1b/train.pt
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:2ac8c6b6f9f76b9fd9c21a898d148e291f91f17c5d919aa1b6d2327233d96c2a
+size 14604
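The .pt files themselves live in Git LFS, so the repository only stores pointer files carrying an object hash and a byte size. Those sizes are informative on their own: model.pt at 5,656,891,654 bytes is roughly 1.41B fp32 parameters, matching the estimate above, and optim.pt is almost exactly twice that, which is what two fp32 Adam-style moment buffers per parameter would occupy (an inference, not something stated in the commit). A minimal sketch for reading a pointer file, assuming the checkout still contains the pointer rather than the smudged payload:

from pathlib import Path

def read_lfs_pointer(path: str) -> dict:
    # Parse a Git LFS pointer file ("key value" per line) into a dict.
    fields = {}
    for line in Path(path).read_text().splitlines():
        key, _, value = line.partition(" ")
        fields[key] = value
    return fields

# Only meaningful when the LFS smudge filter has NOT replaced the pointer
# with the real payload (e.g. a GIT_LFS_SKIP_SMUDGE=1 clone).
info = read_lfs_pointer("models/random_1b/train.pt")
print(info)  # {'version': 'https://git-lfs.github.com/spec/v1', 'oid': 'sha256:2ac8c6...', 'size': '14604'}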