sagadre
committed on
Commit
•
4c44549
1
Parent(s):
6ade3a7
7b
Browse files
c4_original-open_lm_7b-1.0/checkpoints/epoch_17.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b04707ca718e40d012b9bb43b38c01b2f5684884d81144a744a4eaf7df43e138
|
3 |
+
size 27560991570
|
c4_original-open_lm_7b-1.0/params.txt
ADDED
@@ -0,0 +1,123 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
accum_freq: 1
|
2 |
+
attn_activation: None
|
3 |
+
attn_name: auto
|
4 |
+
attn_seq_scalar: None
|
5 |
+
attn_seq_scalar_alpha: None
|
6 |
+
average: None
|
7 |
+
average_coefficients: None
|
8 |
+
beta1: 0.9
|
9 |
+
beta2: 0.95
|
10 |
+
checkpoint_path: /admin/home-sy/dcnlp_logs/c4_original-open_lm_7b-1.0/checkpoints
|
11 |
+
copy_codebase: False
|
12 |
+
data_key: txt
|
13 |
+
dataset_manifest: None
|
14 |
+
dataset_resampled: False
|
15 |
+
dataset_type: auto
|
16 |
+
ddp_static_graph: False
|
17 |
+
debug: False
|
18 |
+
delete_previous_checkpoint: True
|
19 |
+
device: cuda:0
|
20 |
+
disable_buffer: False
|
21 |
+
dist_backend: nccl
|
22 |
+
dist_url: env://
|
23 |
+
distill_model: None
|
24 |
+
distill_pretrained: None
|
25 |
+
distributed: False
|
26 |
+
epochs: 5
|
27 |
+
epochs_cooldown: None
|
28 |
+
eps: 1e-08
|
29 |
+
experimental_meta_device: False
|
30 |
+
ffn_type: swiglu
|
31 |
+
force_distributed: False
|
32 |
+
force_min_lr: 0.0
|
33 |
+
fsdp: False
|
34 |
+
fsdp_amp: False
|
35 |
+
fsdp_backward_prefetch: False
|
36 |
+
fsdp_checkpoint: False
|
37 |
+
fsdp_cpu_offload: False
|
38 |
+
fsdp_hybrid: False
|
39 |
+
fsdp_hybrid_o2: False
|
40 |
+
fsdp_limit_all_gathers: False
|
41 |
+
fsdp_pure_bf16: False
|
42 |
+
fsdp_use_orig_params: False
|
43 |
+
global_batch_size: 1
|
44 |
+
global_val_batch_size: 1
|
45 |
+
grad_checkpointing: False
|
46 |
+
grad_clip_norm: 1.0
|
47 |
+
hf_fsdp_block: None
|
48 |
+
hf_model: None
|
49 |
+
hf_seq_len: None
|
50 |
+
ignore_parse_errors: False
|
51 |
+
load_pretrained_state: False
|
52 |
+
local_rank: 0
|
53 |
+
log_every_n_steps: 20
|
54 |
+
log_level: 20
|
55 |
+
log_local: False
|
56 |
+
log_logit_mean: False
|
57 |
+
log_path: /admin/home-sy/dcnlp_logs/c4_original-open_lm_7b-1.0/out.log
|
58 |
+
logs: /admin/home-sy/dcnlp_logs
|
59 |
+
lr: 0.0003
|
60 |
+
lr_cooldown_end: 3e-05
|
61 |
+
lr_cooldown_power: 1.0
|
62 |
+
lr_scheduler: cosine
|
63 |
+
model: open_lm_7b
|
64 |
+
model_norm: lp_layer_norm
|
65 |
+
moe_capacity_factor: 1.25
|
66 |
+
moe_expert_model_parallelism: False
|
67 |
+
moe_freq: 0
|
68 |
+
moe_loss_weight: 0.1
|
69 |
+
moe_num_experts: None
|
70 |
+
moe_top_k: 2
|
71 |
+
moe_weight_parallelism: False
|
72 |
+
multiple_data_passes: False
|
73 |
+
name: c4_original-open_lm_7b-1.0
|
74 |
+
no_set_device_rank: False
|
75 |
+
optimizer: adamw
|
76 |
+
per_gpu_batch_size: 1
|
77 |
+
per_gpu_val_batch_size: 1
|
78 |
+
positional_embedding_type: rotary
|
79 |
+
precision: amp_bfloat16
|
80 |
+
pretrained: None
|
81 |
+
qk_norm: True
|
82 |
+
rank: 0
|
83 |
+
remote_sync: s3://dcnlp-west/dcnlp_experiments_v3
|
84 |
+
remote_sync_frequency: 300
|
85 |
+
remote_sync_protocol: s3
|
86 |
+
report_to:
|
87 |
+
resume: s3://dcnlp-west/dcnlp_experiments_v3/c4_original-open_lm_7b-1.0/checkpoints/epoch_17.pt
|
88 |
+
save_frequency: 1
|
89 |
+
save_most_recent: False
|
90 |
+
seed: 124
|
91 |
+
seq_len: 2048
|
92 |
+
skip_scheduler: False
|
93 |
+
squash_mask_left: True
|
94 |
+
target_mask_individual: 50400
|
95 |
+
target_mask_left: 50300
|
96 |
+
tensorboard: False
|
97 |
+
tensorboard_path:
|
98 |
+
torchcompile: False
|
99 |
+
torchscript: False
|
100 |
+
trace: False
|
101 |
+
train_data: None
|
102 |
+
train_data_mix_weights: None
|
103 |
+
train_data_upsampling_factors: None
|
104 |
+
train_num_samples: None
|
105 |
+
use_bn_sync: False
|
106 |
+
use_bnb_linear: None
|
107 |
+
val_data: ['training/eval_data/val_tok_mult/de-en/val_de-en_000.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_010.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_020.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_030.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_040.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_050.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_060.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_070.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_080.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_090.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_100.tar']
|
108 |
+
val_data_key: ['json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz']
|
109 |
+
val_frequency: 5
|
110 |
+
val_iter_ci: 10000
|
111 |
+
val_max_pop_ci: 300000
|
112 |
+
val_num_samples: None
|
113 |
+
val_seq_ci: True
|
114 |
+
val_tok_ci: True
|
115 |
+
vocab_size: 50432
|
116 |
+
wandb: False
|
117 |
+
wandb_notes:
|
118 |
+
wandb_project_name: open-lm
|
119 |
+
warmup: 5000
|
120 |
+
wd: 0.33
|
121 |
+
workers: 2
|
122 |
+
world_size: 1
|
123 |
+
z_loss_coefficient: 0.0001
|
rpj-open_lm_7b-1.0/checkpoints/epoch_39.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c1c1a2008f6f48a3a406b047a4f3ba689fe4c00c50362477c416c2807cdca19f
|
3 |
+
size 27560991506
|
rpj-open_lm_7b-1.0/params.txt
ADDED
@@ -0,0 +1,100 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
accum_freq: 4
|
2 |
+
average: None
|
3 |
+
average_coefficients: None
|
4 |
+
batch_size: 16
|
5 |
+
beta1: 0.9
|
6 |
+
beta2: 0.95
|
7 |
+
checkpoint_path: ./logs/vaishaal_open_lm_7b_cc1_without_replacement_137B_tokens_h100_rpj_only/checkpoints
|
8 |
+
copy_codebase: False
|
9 |
+
data_key: json
|
10 |
+
dataset_manifest: ['s3://permanent-813987666268/users/vaishaal/mlr/open_lm/rpj_tokenized_upsampled_eleutherai/manifest.jsonl']
|
11 |
+
dataset_resampled: False
|
12 |
+
dataset_type: auto
|
13 |
+
ddp_static_graph: False
|
14 |
+
debug: False
|
15 |
+
delete_previous_checkpoint: False
|
16 |
+
device: cuda:0
|
17 |
+
disable_buffer: False
|
18 |
+
dist_backend: nccl
|
19 |
+
dist_url: env://
|
20 |
+
distill_model: None
|
21 |
+
distill_pretrained: None
|
22 |
+
distributed: True
|
23 |
+
epochs: 64
|
24 |
+
epochs_cooldown: None
|
25 |
+
eps: 1e-08
|
26 |
+
ffn_type: swiglu
|
27 |
+
force_min_lr: 0.0
|
28 |
+
fsdp: True
|
29 |
+
fsdp_amp: False
|
30 |
+
fsdp_backward_prefetch: False
|
31 |
+
fsdp_checkpoint: False
|
32 |
+
fsdp_cpu_offload: False
|
33 |
+
fsdp_hybrid: False
|
34 |
+
fsdp_hybrid_o2: False
|
35 |
+
fsdp_limit_all_gathers: True
|
36 |
+
fsdp_pure_bf16: True
|
37 |
+
fsdp_use_orig_params: False
|
38 |
+
grad_checkpointing: False
|
39 |
+
grad_clip_norm: 1.0
|
40 |
+
hf_fsdp_block: None
|
41 |
+
hf_model: None
|
42 |
+
hf_seq_len: None
|
43 |
+
ignore_parse_errors: True
|
44 |
+
load_pretrained_state: False
|
45 |
+
local_rank: 0
|
46 |
+
log_every_n_steps: 20
|
47 |
+
log_level: 20
|
48 |
+
log_local: False
|
49 |
+
log_logit_mean: False
|
50 |
+
log_path: ./logs/vaishaal_open_lm_7b_cc1_without_replacement_137B_tokens_h100_rpj_only/out.log
|
51 |
+
logs: ./logs/
|
52 |
+
lr: 0.0003
|
53 |
+
lr_cooldown_end: 3e-05
|
54 |
+
lr_cooldown_power: 1.0
|
55 |
+
lr_scheduler: cosine
|
56 |
+
model: open_lm_7b
|
57 |
+
model_norm: lp_layer_norm
|
58 |
+
name: vaishaal_open_lm_7b_cc1_without_replacement_137B_tokens_h100_rpj_only
|
59 |
+
no_set_device_rank: False
|
60 |
+
optimizer: adamw
|
61 |
+
positional_embedding_type: rotary
|
62 |
+
precision: amp_bfloat16
|
63 |
+
pretrained: None
|
64 |
+
qk_norm: True
|
65 |
+
rank: 0
|
66 |
+
remote_sync: s3://permanent-813987666268/users/vaishaal/mlr/open_lm/checkpoints
|
67 |
+
remote_sync_frequency: 300
|
68 |
+
remote_sync_protocol: s3
|
69 |
+
report_to: wandb
|
70 |
+
resume: s3://permanent-813987666268/users/vaishaal/mlr/open_lm/checkpoints/vaishaal_open_lm_7b_cc1_without_replacement_137B_tokens_h100_rpj_only/checkpoints/epoch_23.pt
|
71 |
+
save_frequency: 1
|
72 |
+
save_most_recent: False
|
73 |
+
seed: 124
|
74 |
+
seq_len: 2048
|
75 |
+
skip_scheduler: False
|
76 |
+
target_mask_individual: None
|
77 |
+
target_mask_left: None
|
78 |
+
tensorboard: False
|
79 |
+
tensorboard_path:
|
80 |
+
torchcompile: False
|
81 |
+
torchscript: False
|
82 |
+
trace: False
|
83 |
+
train_data: None
|
84 |
+
train_data_mix_weights: None
|
85 |
+
train_data_upsampling_factors: None
|
86 |
+
train_num_samples: 1052856
|
87 |
+
use_bn_sync: False
|
88 |
+
use_bnb_linear: None
|
89 |
+
val_data: None
|
90 |
+
val_frequency: 1
|
91 |
+
val_num_samples: None
|
92 |
+
vocab_size: 50432
|
93 |
+
wandb: True
|
94 |
+
wandb_notes:
|
95 |
+
wandb_project_name: open_lm
|
96 |
+
warmup: 5000
|
97 |
+
wd: 0.1
|
98 |
+
workers: 4
|
99 |
+
world_size: 64
|
100 |
+
z_loss_coefficient: 0.0001
|
rw_original-open_lm_7b-1.0/checkpoints/epoch_47.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:169754a84143f9b9956684e0596a67a8379576beee50e186220abe277d0dd422
|
3 |
+
size 27560991506
|
rw_original-open_lm_7b-1.0/params.txt
ADDED
@@ -0,0 +1,102 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
accum_freq: 4
|
2 |
+
average: None
|
3 |
+
average_coefficients: None
|
4 |
+
batch_size: 16
|
5 |
+
beta1: 0.9
|
6 |
+
beta2: 0.95
|
7 |
+
checkpoint_path: ./logs/refined_web_7b_CC1_experiment_16_nodes/checkpoints
|
8 |
+
copy_codebase: False
|
9 |
+
data_key: json.gz
|
10 |
+
dataset_manifest: ['s3://permanent-813987666268/users/vaishaal/mlr/refined_web_tokenized/manifest.jsonl']
|
11 |
+
dataset_resampled: False
|
12 |
+
dataset_type: auto
|
13 |
+
ddp_static_graph: False
|
14 |
+
debug: False
|
15 |
+
delete_previous_checkpoint: False
|
16 |
+
device: cuda:0
|
17 |
+
disable_buffer: False
|
18 |
+
dist_backend: nccl
|
19 |
+
dist_url: env://
|
20 |
+
distill_model: None
|
21 |
+
distill_pretrained: None
|
22 |
+
distributed: True
|
23 |
+
epochs: 64
|
24 |
+
epochs_cooldown: None
|
25 |
+
eps: 1e-08
|
26 |
+
ffn_type: swiglu
|
27 |
+
force_min_lr: 0.0
|
28 |
+
fsdp: True
|
29 |
+
fsdp_amp: False
|
30 |
+
fsdp_backward_prefetch: False
|
31 |
+
fsdp_checkpoint: False
|
32 |
+
fsdp_cpu_offload: False
|
33 |
+
fsdp_hybrid: False
|
34 |
+
fsdp_hybrid_o2: False
|
35 |
+
fsdp_limit_all_gathers: True
|
36 |
+
fsdp_pure_bf16: True
|
37 |
+
fsdp_use_orig_params: False
|
38 |
+
grad_checkpointing: False
|
39 |
+
grad_clip_norm: 1.0
|
40 |
+
hf_fsdp_block: None
|
41 |
+
hf_model: None
|
42 |
+
hf_seq_len: None
|
43 |
+
ignore_parse_errors: True
|
44 |
+
load_pretrained_state: False
|
45 |
+
local_rank: 0
|
46 |
+
log_every_n_steps: 20
|
47 |
+
log_level: 20
|
48 |
+
log_local: False
|
49 |
+
log_logit_mean: False
|
50 |
+
log_path: ./logs/refined_web_7b_CC1_experiment_16_nodes/out.log
|
51 |
+
logs: ./logs/
|
52 |
+
lr: 0.0003
|
53 |
+
lr_cooldown_end: 3e-05
|
54 |
+
lr_cooldown_power: 1.0
|
55 |
+
lr_scheduler: cosine
|
56 |
+
model: open_lm_7b
|
57 |
+
model_norm: lp_layer_norm
|
58 |
+
name: refined_web_7b_CC1_experiment_16_nodes
|
59 |
+
no_set_device_rank: False
|
60 |
+
optimizer: adamw
|
61 |
+
positional_embedding_type: rotary
|
62 |
+
precision: amp_bfloat16
|
63 |
+
pretrained: None
|
64 |
+
qk_norm: True
|
65 |
+
rank: 0
|
66 |
+
remote_sync: s3://permanent-813987666268/users/vaishaal/mlr/open_lm/checkpoints
|
67 |
+
remote_sync_frequency: 300
|
68 |
+
remote_sync_protocol: s3
|
69 |
+
report_to: wandb
|
70 |
+
resume: None
|
71 |
+
save_frequency: 1
|
72 |
+
save_most_recent: False
|
73 |
+
seed: 124
|
74 |
+
seq_len: 2048
|
75 |
+
skip_scheduler: False
|
76 |
+
target_mask_individual: None
|
77 |
+
target_mask_left: None
|
78 |
+
tensorboard: False
|
79 |
+
tensorboard_path:
|
80 |
+
torchcompile: False
|
81 |
+
torchscript: False
|
82 |
+
trace: False
|
83 |
+
train_data: None
|
84 |
+
train_data_mix_weights: None
|
85 |
+
train_data_upsampling_factors: None
|
86 |
+
train_num_samples: 1052856
|
87 |
+
use_bn_sync: False
|
88 |
+
use_bnb_linear: None
|
89 |
+
val_batch_size: None
|
90 |
+
val_data: None
|
91 |
+
val_data_key: txt
|
92 |
+
val_frequency: 1
|
93 |
+
val_num_samples: None
|
94 |
+
vocab_size: 50432
|
95 |
+
wandb: True
|
96 |
+
wandb_notes:
|
97 |
+
wandb_project_name: open_lm
|
98 |
+
warmup: 5000
|
99 |
+
wd: 0.1
|
100 |
+
workers: 4
|
101 |
+
world_size: 128
|
102 |
+
z_loss_coefficient: 0.0001
|