Text Generation
scaling
GregorZiegltrumAA committed
Commit db5f1ac
1 Parent(s): 6ef7e2b
config.yml ADDED
@@ -0,0 +1,81 @@
+ optimizer:
+   allreduce_bucket_size: 500000000
+   beta1: 0.9
+   beta2: 0.95
+   debug_log: false
+   eps: 1e-08
+   gradient_clipping: 0.0
+   zero: true
+   zero_save_static: false
+ topology:
+   activation_checkpointing_type: disabled
+   global_batch_size: 1024
+   gradient_accumulation_steps: 4
+   micro_batch_size: 2
+   model_parallel_size: 1
+   pipe_parallel_size: 1
+   pipe_partition_method: balanced
+   pipe_partition_overwrite: null
+   sequence_parallel: false
+ trainer:
+   seed: 42
+   train_iterations: 72000
+ training:
+   allow_missing_params_in_optimizer: true
+   training_groups:
+   - group_name: param_group
+     independent_weight_decay: true
+     learning_rate_scheduler:
+       learning_rate: 11.313708498984761
+       learning_rate_decay_iters: 72000
+       learning_rate_decay_style: cosine
+       learning_rate_minimum: 1.131370849898476
+       learning_rate_warmup_steps: 500
+     parameters_exclude:
+     - norm
+     weight_decay: 0.0001221
+ transformer_architecture:
+   attention_bias: false
+   attention_num_kv_heads: null
+   attention_qkv_in_one: true
+   dropout_after_attention: 0.0
+   dropout_after_mlp: 0.0
+   dropout_attention_probs: 0.0
+   dropout_embedding: 0.0
+   dropout_image_encoder: 0.0
+   hidden_size: 2048
+   image_encoder: false
+   key_query_norm: false
+   layernorm:
+     layernorm_epsilon: 1e-05
+     optimization_type: torch
+   local_attention_window_size: null
+   masked_softmax:
+     kernel: flash_attention
+     scale: 1.0
+     softmax_in_fp32: false
+   mlp_bias: false
+   mlp_factor: 2.6640625
+   mlp_type: swiglu
+   norm_type: rms
+   num_attention_heads: 16
+   num_layers: 16
+   num_local_attention_heads: 0
+   precision: bfloat16
+   relative_position_embedding_type: rotary_complex
+   reset_attention_mask: false
+   reset_position_ids: false
+   rotary_embedding_base: 10000
+   rotary_percentage: 1.0
+   sequence_length: 4096
+   umup:
+     act_mult: 1.0
+     attn_mult: 1.0
+     enable: true
+     loss_mult: 1.0
+     normalize_depth_to_num_layers: true
+     residual_attn_ratio: 0.25
+     residual_mult: 1.0
+   vocab_file: null
+   vocab_size: 65536
+   weight_tying: false
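A few quantities are implied by config.yml rather than stated. The data-parallel degree follows from global_batch_size = micro_batch_size × gradient_accumulation_steps × data_parallel_size, giving 1024 / (2 × 4) = 128, and the learning rate 11.313708498984761 is exactly 8√2 with a minimum of one tenth of that, consistent with the u-μP recipe enabled under `umup`. A minimal sketch of these derivations, assuming only PyYAML (the scaling framework ships its own config loader):

```python
# Minimal sketch: derive quantities the config implies but does not state.
# Assumes only PyYAML; not part of the scaling framework's own tooling.
import yaml

with open("config.yml") as f:
    cfg = yaml.safe_load(f)

topo = cfg["topology"]
arch = cfg["transformer_architecture"]

# global_batch_size = micro_batch_size * gradient_accumulation_steps * dp_size
dp_size = topo["global_batch_size"] // (
    topo["micro_batch_size"] * topo["gradient_accumulation_steps"]
)
print(f"implied data-parallel size: {dp_size}")  # 1024 // (2 * 4) = 128

# Tokens seen over the full cosine schedule.
tokens = (
    cfg["trainer"]["train_iterations"]
    * topo["global_batch_size"]
    * arch["sequence_length"]
)
print(f"total training tokens: {tokens:,}")  # 72000 * 1024 * 4096 ≈ 3.02e11
```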
model_state_layer_0_EmbeddingInput.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8cdb22e1ad6bf895a44f788c7bb683773a56573c0aefb8ec6d0fd2410e48ca75
+ size 268436939
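Each weight file in this commit is stored as a Git LFS pointer: three lines naming the spec version, the sha256 OID of the real blob, and its byte size. The embedding shard's size is a useful sanity check: a 65536 × 2048 bfloat16 embedding table occupies 65536 × 2048 × 2 = 268,435,456 bytes, about 1.5 KB short of the pointer's 268,436,939 bytes, the remainder being torch.save serialization overhead. A hedged verification sketch (the helper name is invented for illustration):

```python
# Sketch: verify a downloaded blob against its LFS pointer (sha256 + size).
# The OID and size are taken from this commit; the helper is hypothetical.
import hashlib
from pathlib import Path

def verify_lfs_blob(path: Path, oid: str, size: int) -> bool:
    data = path.read_bytes()
    return len(data) == size and hashlib.sha256(data).hexdigest() == oid

ok = verify_lfs_blob(
    Path("model_state_layer_0_EmbeddingInput.pt"),
    oid="8cdb22e1ad6bf895a44f788c7bb683773a56573c0aefb8ec6d0fd2410e48ca75",
    size=268436939,
)
print("embedding shard intact:", ok)
```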
model_state_layer_10_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:05d46de64463bca0fc6d167cc316703756b50a1d5f4044e8569d701b46d41c5a
+ size 100609197
model_state_layer_11_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:240c617a90769284ac27d1c792698ee875d4f5c72fa429d1a55771f5f46210bd
+ size 100609197
model_state_layer_12_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0897dd6efb4ecd7f10b4e1e7ac204f1d10cbe0e78e2ab0fd69031dd9c4e6f5f0
+ size 100609197
model_state_layer_13_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6ea3e47b4dee532aa774dfa2bf67af7c58116e4a740d8fcfbdccf503569d87fe
+ size 100609197
model_state_layer_14_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:433caebfc684ded77dac848b1ae4c925cd85f492f36b8ddeae34ea82b6ed9690
+ size 100609197
model_state_layer_15_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bfc7c0b890c25502f914ee7d07a658f6e84456e1aeea5247b7f6854bc1968a50
+ size 100609197
model_state_layer_16_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d85077733722b7efea50eca2369142d9c868a225301adf2e3e52166d2a2122f1
+ size 100609197
model_state_layer_17_LayerNormWrapper.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e657e00cad67691e2875f1bb3b943a2d86c17a326840572359f08e08e2f5b6cc
+ size 5554
model_state_layer_18_TransformerLMHead.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4f261f4a1308044dae2335331c8f15918cf98df5515c6e32bc436121789f04de
+ size 268436904
model_state_layer_1_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d87fb862beecb45ef3ab83432ac1e946f85b226efc3903c495ab8e762a254841
+ size 100609186
model_state_layer_2_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b87d937ed72a842ce54cc3c507d3df9d43177dcb11e590ff5b20127388833789
+ size 100609186
model_state_layer_3_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8e395c9ff1d91f6bbcd96d2888cac54c565371b335266b974ad710bc606c10a7
+ size 100609186
model_state_layer_4_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:47780084f8521bdcfc6416c6415d0c041065c6e9f627d216e57f6a6dc35ae728
+ size 100609186
model_state_layer_5_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:480697d6bd96a1fe5bfe2e36896b73ef7ec6dd7926d5f3abec720fce99e60ae7
+ size 100609186
model_state_layer_6_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ca0d80a2d3acd080792d5d4cd742190f30d7a8bd101e6e9580b25cde343de431
+ size 100609186
model_state_layer_7_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:56d74776b86b39f203c61b49d8536707af34bf439f0b16cc0bd94f191b44053a
+ size 100609186
model_state_layer_8_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:224acd2cca341ba02d5505bfe6a4de87a5f2d12e430e1f4ebb280945546be237
+ size 100609186
model_state_layer_9_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1c3cce9a4c9561082841634f4b3fcc298144732f688494cb7e5155176b44efac
+ size 100609186
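The file naming, model_state_layer_<index>_<ModuleClass>.pt, indicates a layer-sharded checkpoint: the input embedding at index 0, sixteen TransformerLayers at 1-16, the final norm at 17, and an untied LM head at 18 (matching weight_tying: false). A rough sketch for inspecting the shards with plain torch.load; the scaling framework presumably provides its own loading utilities, so this is illustrative only:

```python
# Sketch: inspect the layer-sharded checkpoint with plain torch.
# The shard layout is inferred from the file names in this commit;
# whether each shard is a flat tensor dict is an assumption.
from pathlib import Path
import torch

total_params = 0
for shard in sorted(
    Path(".").glob("model_state_layer_*.pt"),
    key=lambda p: int(p.stem.split("_")[3]),  # layer index from the name
):
    state = torch.load(shard, map_location="cpu")
    n = sum(t.numel() for t in state.values() if torch.is_tensor(t))
    total_params += n
    print(f"{shard.name}: {n:,} params")

# Expect roughly 1.07B parameters: two 65536 x 2048 matrices (embedding
# and untied LM head) plus 16 transformer layers of ~50M parameters each,
# consistent with the ~100.6 MB bfloat16 shard sizes above.
print(f"total: {total_params:,}")
```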
vocab.json ADDED
The diff for this file is too large to render.