accum_freq: 1
attn_activation: None
attn_name: auto
attn_seq_scalar: None
attn_seq_scalar_alpha: None
average: None
average_coefficients: None
beta1: 0.9
beta2: 0.95
checkpoint_path: /admin/home-sy/dcnlp_logs/c4_original-open_lm_7b-1.0/checkpoints
copy_codebase: False
data_key: txt
dataset_manifest: None
dataset_resampled: False
dataset_type: auto
ddp_static_graph: False
debug: False
delete_previous_checkpoint: True
device: cuda:0
disable_buffer: False
dist_backend: nccl
dist_url: env://
distill_model: None
distill_pretrained: None
distributed: False
epochs: 5
epochs_cooldown: None
eps: 1e-08
experimental_meta_device: False
ffn_type: swiglu
force_distributed: False
force_min_lr: 0.0
fsdp: False
fsdp_amp: False
fsdp_backward_prefetch: False
fsdp_checkpoint: False
fsdp_cpu_offload: False
fsdp_hybrid: False
fsdp_hybrid_o2: False
fsdp_limit_all_gathers: False
fsdp_pure_bf16: False
fsdp_use_orig_params: False
global_batch_size: 1
global_val_batch_size: 1
grad_checkpointing: False
grad_clip_norm: 1.0
hf_fsdp_block: None
hf_model: None
hf_seq_len: None
ignore_parse_errors: False
load_pretrained_state: False
local_rank: 0
log_every_n_steps: 20
log_level: 20
log_local: False
log_logit_mean: False
log_path: /admin/home-sy/dcnlp_logs/c4_original-open_lm_7b-1.0/out.log
logs: /admin/home-sy/dcnlp_logs
lr: 0.0003
lr_cooldown_end: 3e-05
lr_cooldown_power: 1.0
lr_scheduler: cosine
model: open_lm_7b
model_norm: lp_layer_norm
moe_capacity_factor: 1.25
moe_expert_model_parallelism: False
moe_freq: 0
moe_loss_weight: 0.1
moe_num_experts: None
moe_top_k: 2
moe_weight_parallelism: False
multiple_data_passes: False
name: c4_original-open_lm_7b-1.0
no_set_device_rank: False
optimizer: adamw
per_gpu_batch_size: 1
per_gpu_val_batch_size: 1
positional_embedding_type: rotary
precision: amp_bfloat16
pretrained: None
qk_norm: True
rank: 0
remote_sync: s3://dcnlp-west/dcnlp_experiments_v3
remote_sync_frequency: 300
remote_sync_protocol: s3
report_to: 
resume: s3://dcnlp-west/dcnlp_experiments_v3/c4_original-open_lm_7b-1.0/checkpoints/epoch_17.pt
save_frequency: 1
save_most_recent: False
seed: 124
seq_len: 2048
skip_scheduler: False
squash_mask_left: True
target_mask_individual: 50400
target_mask_left: 50300
tensorboard: False
tensorboard_path: 
torchcompile: False
torchscript: False
trace: False
train_data: None
train_data_mix_weights: None
train_data_upsampling_factors: None
train_num_samples: None
use_bn_sync: False
use_bnb_linear: None
val_data: ['training/eval_data/val_tok_mult/de-en/val_de-en_000.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_010.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_020.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_030.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_040.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_050.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_060.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_070.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_080.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_090.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_100.tar']
val_data_key: ['json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz']
val_frequency: 5
val_iter_ci: 10000
val_max_pop_ci: 300000
val_num_samples: None
val_seq_ci: True
val_tok_ci: True
vocab_size: 50432
wandb: False
wandb_notes: 
wandb_project_name: open-lm
warmup: 5000
wd: 0.33
workers: 2
world_size: 1
z_loss_coefficient: 0.0001
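
The above is the full argument dump open_lm logs for this run, one "key: value" pair per line, with values printed as Python reprs (None, booleans, numbers, lists of paths). Below is a minimal sketch of reading such a dump back into a dict; parse_params and the params.txt file name are hypothetical, not part of open_lm's API, and values that are not valid Python literals (e.g. cuda:0, s3:// URLs, bare run names) are kept as plain strings.

import ast

def parse_params(path):
    """Parse an open_lm-style 'key: value' params dump into a dict."""
    params = {}
    with open(path) as f:
        for line in f:
            if ":" not in line:
                continue  # skip blank or malformed lines
            key, _, value = line.partition(":")  # split on the first colon only
            value = value.strip()
            try:
                # Values are printed as Python reprs: None, True/False,
                # ints, floats, and lists of strings all round-trip here.
                params[key.strip()] = ast.literal_eval(value)
            except (ValueError, SyntaxError):
                # Bare strings like 'cuda:0' or 's3://...' and empty values
                # are not valid literals; keep them as-is.
                params[key.strip()] = value
    return params

params = parse_params("params.txt")  # hypothetical file holding the dump above
assert params["lr"] == 0.0003 and params["model"] == "open_lm_7b"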