GregorZiegltrumAA committed on
Commit
52f8912
1 parent: b174439
config.yml ADDED
@@ -0,0 +1,101 @@
+ optimizer:
+   allreduce_bucket_size: 500000000
+   beta1: 0.9
+   beta2: 0.95
+   debug_log: false
+   eps: 1e-08
+   gradient_clipping: 0.0
+   zero: true
+   zero_save_static: false
+ topology:
+   activation_checkpointing_type: disabled
+   global_batch_size: 1024
+   gradient_accumulation_steps: 2
+   micro_batch_size: 2
+   model_parallel_size: 1
+   pipe_parallel_size: 1
+   pipe_partition_method: balanced
+   pipe_partition_overwrite: null
+   sequence_parallel: false
+ trainer:
+   seed: 42
+   train_iterations: 72000
+ training:
+   allow_missing_params_in_optimizer: true
+   training_groups:
+   - group_name: param_group
+     independent_weight_decay: true
+     learning_rate_scheduler:
+       learning_rate: 11.313708498984761
+       learning_rate_decay_iters: 72000
+       learning_rate_decay_style: cosine
+       learning_rate_minimum: 1.131370849898476
+       learning_rate_warmup_steps: 500
+     parameters_exclude:
+     - norm
+     weight_decay: 0.0001221
+ transformer_architecture:
+   attention_bias: false
+   attention_num_kv_heads: null
+   attention_qkv_in_one: true
+   dropout_after_attention: 0.0
+   dropout_after_mlp: 0.0
+   dropout_attention_probs: 0.0
+   dropout_embedding: 0.0
+   dropout_image_encoder: 0.0
+   fp8_config_attention:
+     dtypes_forward:
+       left_dtype: e4m3
+       right_dtype: e4m3
+     dtypes_grad_input:
+       left_dtype: e5m2
+       right_dtype: e4m3
+     dtypes_grad_weight:
+       left_dtype: e4m3
+       right_dtype: e5m2
+   fp8_config_mlp:
+     dtypes_forward:
+       left_dtype: e4m3
+       right_dtype: e4m3
+     dtypes_grad_input:
+       left_dtype: e5m2
+       right_dtype: e4m3
+     dtypes_grad_weight:
+       left_dtype: e4m3
+       right_dtype: e5m2
+   hidden_size: 3072
+   image_encoder: false
+   key_query_norm: false
+   layernorm:
+     layernorm_epsilon: 1e-05
+     optimization_type: torch
+   local_attention_window_size: null
+   masked_softmax:
+     kernel: flash_attention
+     scale: 1.0
+     softmax_in_fp32: false
+   mlp_bias: false
+   mlp_factor: 2.6666666666666665
+   mlp_type: swiglu
+   norm_type: rms
+   num_attention_heads: 24
+   num_layers: 24
+   num_local_attention_heads: 0
+   precision: bfloat16
+   relative_position_embedding_type: rotary_complex
+   reset_attention_mask: false
+   reset_position_ids: false
+   rotary_embedding_base: 10000
+   rotary_percentage: 1.0
+   sequence_length: 4096
+   umup:
+     act_mult: 1.0
+     attn_mult: 1.0
+     enable: true
+     loss_mult: 1.0
+     normalize_depth_to_num_layers: true
+     residual_attn_ratio: 0.25
+     residual_mult: 1.0
+   vocab_file: null
+   vocab_size: 65536
+   weight_tying: false
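The config above fully determines several derived quantities. Below is a minimal sanity-check sketch, not part of this repository: it assumes PyYAML is installed and the YAML above is saved as `config.yml`, and it derives the data-parallel size from the usual relation global_batch_size = micro_batch_size × gradient_accumulation_steps × data_parallel_size.

```python
# Sanity-check sketch for the config above (not part of this repo).
import yaml

with open("config.yml") as f:
    cfg = yaml.safe_load(f)

topo = cfg["topology"]
arch = cfg["transformer_architecture"]

# The config does not store data_parallel_size; derive it from the
# usual batch-size relation.
data_parallel = topo["global_batch_size"] // (
    topo["micro_batch_size"] * topo["gradient_accumulation_steps"]
)
head_dim = arch["hidden_size"] // arch["num_attention_heads"]
# mlp_factor is 8/3, the usual SwiGLU factor; round() undoes float error.
mlp_hidden = round(arch["hidden_size"] * arch["mlp_factor"])
tokens_per_step = topo["global_batch_size"] * arch["sequence_length"]

print(data_parallel)    # 1024 / (2 * 2) = 256 data-parallel ranks
print(head_dim)         # 3072 / 24 = 128
print(mlp_hidden)       # 3072 * 8/3 = 8192
print(tokens_per_step)  # 1024 * 4096 = 4,194,304 tokens per step
```

Note also that `learning_rate: 11.313708498984761` is numerically 8·√2, with `learning_rate_minimum` exactly a tenth of it. A raw Adam step size that large would diverge; it is plausible here only because `umup.enable: true` puts the rate in u-μP units, where per-tensor multipliers rescale the effective step.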
model_state_layer_0_EmbeddingInput.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a901d0236a34da6bc606004f0255a30ba66d62da8dbb566fea4d4529804bcc4b
+ size 402654667
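Each `.pt` entry in this commit is tracked with Git LFS, so what is committed is a three-line pointer file (spec version, SHA-256 object id, byte size) rather than the tensor payload itself; `git lfs pull` fetches the real files. A small parser sketch for pointers laid out exactly as above (illustrative, not part of this repository):

```python
# Parse a Git LFS pointer file: three "key value" lines
# (version / oid / size) committed in place of the real payload.
def parse_lfs_pointer(path: str) -> dict:
    fields = {}
    with open(path) as f:
        for line in f:
            key, _, value = line.strip().partition(" ")
            fields[key] = value
    assert fields["version"] == "https://git-lfs.github.com/spec/v1"
    algo, _, digest = fields["oid"].partition(":")
    return {"algo": algo, "digest": digest, "size": int(fields["size"])}

# Before `git lfs pull`, the .pt file on disk is the pointer itself:
info = parse_lfs_pointer("model_state_layer_0_EmbeddingInput.pt")
print(info)  # {'algo': 'sha256', 'digest': 'a901…', 'size': 402654667}
```

The embedding shard's 402,654,667 bytes are consistent with a bfloat16 embedding matrix of vocab_size × hidden_size = 65536 × 3072 × 2 bytes ≈ 402.65 MB, plus a little serialization overhead.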
model_state_layer_10_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0d828f28b9d082199d826abab8b8f10bbe841ce72487927bc66e8d76bc778d35
+ size 226507949
model_state_layer_11_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:57d40e3a8554d1dfc5f1138379262477d68456a87dfdad41d4405727fb7e28c3
+ size 226507949
model_state_layer_12_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9503d6f99647d05eec3cadcae0992d0d51ad41783334fc88e66a76ee04422c3f
+ size 226507949
model_state_layer_13_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:835d76100270b439061a0cf199119cb3edda9d203a77bcd7444df5565895c525
+ size 226507949
model_state_layer_14_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:eaf2f8f6eb56400363feb1aa628d6fc8f3cdde4532aa68496666a7b736733730
+ size 226507949
model_state_layer_15_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c5b8711a96fe4b89d11ab292720a08412250dfe35caf0c699b988af61acb84a2
+ size 226507949
model_state_layer_16_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9fd6fd36e71f05a284e7962797541f80f2fe3ffc50f098dae565ea23742260cf
+ size 226507949
model_state_layer_17_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c5c43368a0042ac8febf3d99ff662597da93ac6003985f80e5335d8269f425ba
+ size 226507949
model_state_layer_18_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:923f718fc9d25a45be76dde2e64ad7afbd73c40dd810b7bbd6d4cd1dfbb1060e
+ size 226507949
model_state_layer_19_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:59604cc1cef38e882888990aebb39010bd3504c77fff183540c1a7745d937d26
+ size 226507949
model_state_layer_1_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ac4c94cbab681cd212180b2670e3449273476f702f9c8b4fe9b961713ace4bac
+ size 226507938
model_state_layer_20_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ecbbf53e61d01f3e0e38c2cc7319993eba9c0e82bd2100096885ae5880686283
+ size 226507949
model_state_layer_21_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a340c4a94c0cd87b550deb81049828725768d9ae00180168ef104846d5989f23
+ size 226507949
model_state_layer_22_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9244195aa6b7133280576713117b27daa951e1053e5f2368e70ad12a027dc092
+ size 226507949
model_state_layer_23_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cdd23a9ccbddb21beeee6eb59997c5d940275948b9a916d8aa8bf8dece8a1520
+ size 226507949
model_state_layer_24_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ffdecb077d4aaa6f84dfc4f96e56a241049804fa3097f74476f3eec6da69ae6a
+ size 226507949
model_state_layer_25_LayerNormWrapper.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d2b3c754dd51d475a2b59fe34562670ccffc8c8f005bffc176906fca2531106b
+ size 7602
model_state_layer_26_TransformerLMHead.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:618073a4542175f11ec86cae8ee9f0fe69632ef803e40d31a7d91a35d76ea8e1
+ size 402654632
model_state_layer_2_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6fcfc338db6a9725e547b0f54fc8d858001e129c8e50016a15b3e65d58b43e60
+ size 226507938
model_state_layer_3_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5a9e2e222f89a1742b3805eb8209ed273eb24fe2abcaddb3a285de04a5614bb8
+ size 226507938
model_state_layer_4_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c14971561e2bf4d4aa247f0cd4ff23cecce3c64e3383e90f1fe142d8fc253150
+ size 226507938
model_state_layer_5_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:80497d07209df4f4e7a60b14f4e5751b4ba3508a29cc028c7bda481b342e7384
+ size 226507938
model_state_layer_6_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:00dbda8871dd75c7e2c3418cd6e19956dc09e87db097840f9b10f90fbb70ff8a
+ size 226507938
model_state_layer_7_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3e7fe38db8e431414b119029adcf763a449b307db041117a7d44187d867247e3
+ size 226507938
model_state_layer_8_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6f12c5ceb054b376f1c072b64fc21c3e24d4632299325e61a98913f4b86c4f22
+ size 226507938
model_state_layer_9_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6b7edbd79f85f6887eda6fabc4a143593271c04a7d0790ad664407ee9035ebb3
+ size 226507938
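Taken together, the shards mirror the pipeline layout implied by the config: layer 0 is the input embedding, layers 1–24 are the 24 transformer blocks (each ≈ 226.5 MB, i.e. ≈ 113 M bfloat16 parameters per block), layer 25 the final norm, and layer 26 the LM head. Below is a hedged sketch of stitching the shards back into one state dict after `git lfs pull`; the key layout inside each shard is specific to the scaling codebase, so verify against its own loading utilities before relying on this:

```python
# Illustrative only: collect per-layer checkpoint shards into one dict.
# Assumes each .pt file deserializes to a flat {name: tensor} mapping,
# which is an assumption about the scaling codebase, not a documented API.
import re
from pathlib import Path

import torch

def layer_index(p: Path) -> int:
    return int(re.search(r"layer_(\d+)_", p.name).group(1))

shards = sorted(Path(".").glob("model_state_layer_*.pt"), key=layer_index)

state = {}
for shard in shards:
    layer_state = torch.load(shard, map_location="cpu")
    # Namespace each shard by its layer index to avoid key collisions.
    state.update(
        {f"layer_{layer_index(shard)}.{k}": v for k, v in layer_state.items()}
    )

print(f"{len(shards)} shards loaded")  # expect 27: embedding + 24 blocks + norm + head
```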
vocab.json ADDED
The diff for this file is too large to render; see the raw file.