Text Generation · scaling

GregorZiegltrumAA committed
Commit aa41939
Parent: c6a60d4

Push umup-research-7b-bf16 Model

Files changed (38):
  1. LICENSE +31 -0
  2. config.yml +81 -0
  3. model_state_layer_0_EmbeddingInput.pt +3 -0
  4. model_state_layer_10_TransformerLayer.pt +3 -0
  5. model_state_layer_11_TransformerLayer.pt +3 -0
  6. model_state_layer_12_TransformerLayer.pt +3 -0
  7. model_state_layer_13_TransformerLayer.pt +3 -0
  8. model_state_layer_14_TransformerLayer.pt +3 -0
  9. model_state_layer_15_TransformerLayer.pt +3 -0
  10. model_state_layer_16_TransformerLayer.pt +3 -0
  11. model_state_layer_17_TransformerLayer.pt +3 -0
  12. model_state_layer_18_TransformerLayer.pt +3 -0
  13. model_state_layer_19_TransformerLayer.pt +3 -0
  14. model_state_layer_1_TransformerLayer.pt +3 -0
  15. model_state_layer_20_TransformerLayer.pt +3 -0
  16. model_state_layer_21_TransformerLayer.pt +3 -0
  17. model_state_layer_22_TransformerLayer.pt +3 -0
  18. model_state_layer_23_TransformerLayer.pt +3 -0
  19. model_state_layer_24_TransformerLayer.pt +3 -0
  20. model_state_layer_25_TransformerLayer.pt +3 -0
  21. model_state_layer_26_TransformerLayer.pt +3 -0
  22. model_state_layer_27_TransformerLayer.pt +3 -0
  23. model_state_layer_28_TransformerLayer.pt +3 -0
  24. model_state_layer_29_TransformerLayer.pt +3 -0
  25. model_state_layer_2_TransformerLayer.pt +3 -0
  26. model_state_layer_30_TransformerLayer.pt +3 -0
  27. model_state_layer_31_TransformerLayer.pt +3 -0
  28. model_state_layer_32_TransformerLayer.pt +3 -0
  29. model_state_layer_33_LayerNormWrapper.pt +3 -0
  30. model_state_layer_34_TransformerLMHead.pt +3 -0
  31. model_state_layer_3_TransformerLayer.pt +3 -0
  32. model_state_layer_4_TransformerLayer.pt +3 -0
  33. model_state_layer_5_TransformerLayer.pt +3 -0
  34. model_state_layer_6_TransformerLayer.pt +3 -0
  35. model_state_layer_7_TransformerLayer.pt +3 -0
  36. model_state_layer_8_TransformerLayer.pt +3 -0
  37. model_state_layer_9_TransformerLayer.pt +3 -0
  38. vocab.json +0 -0
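The checkpoint is sharded one file per pipeline stage: an input embedding, 32 transformer layers, a final LayerNormWrapper, and a TransformerLMHead, plus config.yml and vocab.json. Below is a minimal sketch of how these shards could be gathered into a single state dict after download; the assumption that each .pt file deserializes to a flat tensor dict, and the key-prefix scheme, are ours rather than the scaling library's own loader.

```python
# Hypothetical sketch: gather the per-layer shards listed above into one
# state dict. The file naming (model_state_layer_<i>_<ClassName>.pt) is taken
# from this commit; the flat-tensor-dict assumption and the "layer_<i>." key
# prefix are illustrative only.
import re
from pathlib import Path

import torch

LAYER_FILE = re.compile(r"model_state_layer_(\d+)_(\w+)\.pt")

def load_sharded_state(repo_dir: str) -> dict:
    """Merge all layer shards, prefixing each key with its layer index."""
    merged = {}
    for path in sorted(Path(repo_dir).glob("model_state_layer_*.pt")):
        match = LAYER_FILE.fullmatch(path.name)
        if match is None:
            continue
        layer_idx = int(match.group(1))
        shard = torch.load(path, map_location="cpu")
        for key, tensor in shard.items():
            merged[f"layer_{layer_idx}.{key}"] = tensor
    return merged
```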
LICENSE CHANGED
@@ -0,0 +1,31 @@
+ The following applies to all files in this repository, unless otherwise noted:
+
+ Copyright (c) 2024 IPAI Aleph Alpha Research GmbH. All rights reserved.
+
+ This project is licensed under the terms of the Open Aleph License 1.0, available at
+ https://github.com/Aleph-Alpha/.github/blob/main/oal.pdf
+
+ ---
+ Excerpt from the license text:
+
+ Subject to the terms and conditions of this License, the Licensor grants you a non-exclusive, worldwide,
+ non-transferable, non-sublicensable, and royalty-free limited right to use, copy, modify, distribute, make
+ otherwise publicly available, and reproduce the Works and Derivative Works under Licensor’s copyright,
+ for any Non-Commercial and Non-Administrative purpose.
+ You may not use, copy, modify, distribute, make otherwise publicly available, reproduce, or sublicense the
+ Works or Derivative Works except as expressly provided under and in accordance with this License.
+ Your rights granted under this License will automatically terminate if you fail to comply with any of the
+ terms of this License.
+
+ EXCEPT FOR DAMAGES CAUSED BY INTENT OR FRAUDULENTLY CONCEALED
+ DEFECTS, AND EXCEPT FOR DAMAGES RESULTING FROM BREACH OF ANY
+ WARRANTY OR GUARANTEE EXPRESSLY GIVEN BY LICENSOR IN THE OPEN ALEPH LICENSE,
+ IN NO EVENT WILL LICENSOR BE LIABLE TO YOU ON ANY LEGAL THEORY FOR ANY
+ DAMAGES ARISING OUT OF THE OPEN ALEPH LICENSE OR THE USE OF THE WORK. ANY
+ MANDATORY STATUTORY LIABILITY UNDER APPLICABLE LAW REMAINS
+ UNAFFECTED.
+
+ EXCEPT AS EXPRESSLY STATED IN THIS LICENSE OR REQUIRED BY APPLICABLE
+ LAW, THE WORKS ARE PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES
+ OF ANY KIND INCLUDING, WITHOUT LIMITATION, ANY WARRANTIES REGARDING
+ THE CONTENTS, ACCURACY, OR FITNESS FOR A PARTICULAR PURPOSE.
config.yml ADDED
@@ -0,0 +1,81 @@
+ optimizer:
+   allreduce_bucket_size: 500000000
+   beta1: 0.9
+   beta2: 0.95
+   debug_log: false
+   eps: 1e-08
+   gradient_clipping: 0.0
+   zero: true
+   zero_save_static: false
+ topology:
+   activation_checkpointing_type: disabled
+   global_batch_size: 1024
+   gradient_accumulation_steps: 4
+   micro_batch_size: 2
+   model_parallel_size: 1
+   pipe_parallel_size: 2
+   pipe_partition_method: balanced
+   pipe_partition_overwrite: null
+   sequence_parallel: false
+ trainer:
+   seed: 42
+   train_iterations: 72000
+ training:
+   allow_missing_params_in_optimizer: true
+ training_groups:
+ - group_name: param_group
+   independent_weight_decay: true
+   learning_rate_scheduler:
+     learning_rate: 11.313708498984761
+     learning_rate_decay_iters: 72000
+     learning_rate_decay_style: cosine
+     learning_rate_minimum: 1.131370849898476
+     learning_rate_warmup_steps: 500
+   parameters_exclude:
+   - norm
+   weight_decay: 0.0001221
+ transformer_architecture:
+   attention_bias: false
+   attention_num_kv_heads: null
+   attention_qkv_in_one: true
+   dropout_after_attention: 0.0
+   dropout_after_mlp: 0.0
+   dropout_attention_probs: 0.0
+   dropout_embedding: 0.0
+   dropout_image_encoder: 0.0
+   hidden_size: 4096
+   image_encoder: false
+   key_query_norm: false
+   layernorm:
+     layernorm_epsilon: 1e-05
+     optimization_type: torch
+   local_attention_window_size: null
+   masked_softmax:
+     kernel: flash_attention
+     scale: 1.0
+     softmax_in_fp32: false
+   mlp_bias: false
+   mlp_factor: 2.66796875
+   mlp_type: swiglu
+   norm_type: rms
+   num_attention_heads: 32
+   num_layers: 32
+   num_local_attention_heads: 0
+   precision: bfloat16
+   relative_position_embedding_type: rotary_complex
+   reset_attention_mask: false
+   reset_position_ids: false
+   rotary_embedding_base: 10000
+   rotary_percentage: 1.0
+   sequence_length: 4096
+   umup:
+     act_mult: 1.0
+     attn_mult: 1.0
+     enable: true
+     loss_mult: 1.0
+     normalize_depth_to_num_layers: true
+     residual_attn_ratio: 0.25
+     residual_mult: 1.0
+   vocab_file: null
+   vocab_size: 65536
+   weight_tying: false
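Two details of this config stand out. The peak learning rate 11.313708498984761 equals 8·√2 and the floor is exactly one tenth of it; such magnitudes are sensible because umup.enable: true puts the model in the unit-scaled u-μP parametrization, not standard Adam scaling. Below is a minimal sketch of the warmup-plus-cosine schedule those fields describe; the exact formula the scaling trainer applies (for instance whether decay progress counts from step 0 or from the end of warmup) is an assumption, while the constants are copied from config.yml.

```python
# Minimal sketch of the schedule in learning_rate_scheduler above: linear
# warmup for 500 steps, then cosine decay to the floor over 72000 iterations.
# Assumption: decay progress is measured from step 0; the trainer may differ.
import math

PEAK_LR = 11.313708498984761   # 8 * sqrt(2), per config.yml
MIN_LR = 1.131370849898476     # PEAK_LR / 10
WARMUP_STEPS = 500
DECAY_ITERS = 72000

def learning_rate(step: int) -> float:
    if step < WARMUP_STEPS:
        return PEAK_LR * step / WARMUP_STEPS             # linear warmup
    progress = min(step, DECAY_ITERS) / DECAY_ITERS      # fraction of decay window
    cosine = 0.5 * (1.0 + math.cos(math.pi * progress))  # 1 -> 0 over the window
    return MIN_LR + (PEAK_LR - MIN_LR) * cosine

# Topology sanity check: assuming the usual relation
# global_batch_size = micro_batch_size * gradient_accumulation_steps * data_parallel,
# 1024 = 2 * 4 * 128 implies a data-parallel degree of 128 (the config does not
# state the data-parallel size explicitly).
```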
model_state_layer_0_EmbeddingInput.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:50a11727fdd6c0cd2b1409a7a04028b628e30d0dcbe49738b73ae90c4ad0309f
+ size 536872395
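Each weight file in this commit is stored as a Git LFS pointer: three lines giving the spec version, the SHA-256 of the actual blob, and its size in bytes. The pointer format is the documented git-lfs spec; the helper name and file paths in this sketch are illustrative.

```python
# Validate a downloaded blob against its LFS pointer (sha256 oid + byte size).
import hashlib
import os

def verify_lfs_object(blob_path: str, expected_oid: str, expected_size: int) -> bool:
    """Return True if the blob matches the pointer's oid and size."""
    if os.path.getsize(blob_path) != expected_size:
        return False
    digest = hashlib.sha256()
    with open(blob_path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):  # 1 MiB chunks
            digest.update(chunk)
    return digest.hexdigest() == expected_oid

# Example, using the oid/size of model_state_layer_0_EmbeddingInput.pt above:
ok = verify_lfs_object(
    "model_state_layer_0_EmbeddingInput.pt",
    "50a11727fdd6c0cd2b1409a7a04028b628e30d0dcbe49738b73ae90c4ad0309f",
    536872395,
)
```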
model_state_layer_10_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b1856712a163f3bb2e76e157852dccca39ad777388b938e8d23020d4b606e97f
+ size 402803885
model_state_layer_11_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2d8e3321d227ebd57e1a20056af0655eb63475a1d0c981e2b0faa8da9af64779
+ size 402803885
model_state_layer_12_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:68587ad93e4c85d95fdff7db8566ccf6d2f6ee7943e4ce9a9dd943e02628ae1d
+ size 402803885
model_state_layer_13_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5b0724591604b036f6353227ef66a86c6376807eec5d20acd97e87e0b00cfc30
+ size 402803885
model_state_layer_14_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c41dd92a37d6669644d5bc5ddec972c5f2c1be1ff8248e2c6e026378c5f8b506
+ size 402803885
model_state_layer_15_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a5f809d3951280bcdf69680619d353027de30d9e2f79f78534eaa145c62b1a46
+ size 402803885
model_state_layer_16_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:76fdb6005103e77bd13b2f3ed2aaf3ff72e0394e8abba9ff8e2b6bf08e45e41d
+ size 402803885
model_state_layer_17_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3da5bae6473a50ce4ba93b9b3a44fe9d6639f8ab19116b9f0a890310a96c86b4
+ size 402803885
model_state_layer_18_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4d32a0bf03907c75d3b3506d0003810025b6f8630fc1d51973fb0a8f99b92e12
+ size 402803885
model_state_layer_19_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a1db3834367a8952535823f065c7afb543265c8885476af7c394fdfcd80536e0
+ size 402803885
model_state_layer_1_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e306d46c4d3bc6a909a9a41bfb2a6d35c02b1cd723032b9c01a64221827e72df
+ size 402803874
model_state_layer_20_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:91555d19572fc4d92f02f07ae22010d0b3deb9a47c40b4a3f816a83f7d01a7e8
+ size 402803885
model_state_layer_21_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ad2ff7040e3d0a4f32153cade487420647bb137e40e4a813cec32ce49ca6c89b
+ size 402803885
model_state_layer_22_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fcf21aee365527775fcadcaa6435518d5dde4fd87c0b491231c7b47bc0b96e1a
+ size 402803885
model_state_layer_23_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:311818cd45871fe7e70b3c15dfcdd32858782087fcee3c05e996befa9681c425
+ size 402803885
model_state_layer_24_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a545837a45b56d64ea9457bbf2e81813493b24c06dec2b0d09f7c285b36d9d05
+ size 402803885
model_state_layer_25_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e4a0f22b08c98c4f226b3132d89953eafded322168b4f82d332f8c61f33218d0
+ size 402803885
model_state_layer_26_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ebe8b1941b00359b490725f1945b8852e779d9b31a5647380ef56e2205fa1c0e
+ size 402803885
model_state_layer_27_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:dc6d3736cfd822feb8cb9e3fcfb130e61dafd81b72d44b5f2806c6f9592ea740
+ size 402803885
model_state_layer_28_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:de8a023eafd6d7fd26597a4211e38842d4bca0469fe12f4bc3983d819559c374
+ size 402803885
model_state_layer_29_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4dddb585312e189a38b1c0846e3bffd880e7b8efb4c635ea4017f2d263265b16
+ size 402803885
model_state_layer_2_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:594864c71d8a130d4589570e51dd26b3ebdb217e3f13b6f0a2a6bbd2d94d87f2
+ size 402803874
model_state_layer_30_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:46824d80771a1846a44798d8f0226ac78bb22c4cfc44517e3b051e2da1e95bd4
+ size 402803885
model_state_layer_31_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7463a4fd21f076b103bfabf26c4dc86e3bc513d53090479a54fe425712db8404
+ size 402803885
model_state_layer_32_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0c81942bda3514a776127a6cf3a73bde6f914701a14a102bf81bdee2fe0947a8
+ size 402803885
model_state_layer_33_LayerNormWrapper.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0fef3ad9a684ad078dc8af6a7034775dfb3fb66550f2a44945928150a3c162ed
+ size 9650
model_state_layer_34_TransformerLMHead.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6c18ad5c74173ae583b004c7d2407fe64086b306be0507fb1f75697ff97e6762
+ size 536872360
model_state_layer_3_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:efea6b1179adebecae463901f4927674a6cf8beb944e6a4022624a49f1c09667
+ size 402803874
model_state_layer_4_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:49ed243afbe090c3c5aecf226d72f4efb6c53d0d3feb760bac1b2ae03144675b
+ size 402803874
model_state_layer_5_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5350ced23424502f73eff2520e3611e8d14b70046a6a20e6cecfe65716be4033
+ size 402803874
model_state_layer_6_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:85e4e1d047265f79defed2bfd1366a154aef7ddb6477cdca02fa3bf70a6ec878
+ size 402803874
model_state_layer_7_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4562c41c44b0a6834a4c9d7660d81de57959569b3933e33107e825204b31a06b
+ size 402803874
model_state_layer_8_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3c26c2dab1c57b0e70599b50db631a6db37f144e0c5682304eacd6bb84674cab
+ size 402803874
model_state_layer_9_TransformerLayer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:03b996b9a1ecb8c844684cc862d72a4ce9cd02fce5c63cc39921fffe429bb1ec
+ size 402803874
vocab.json ADDED
The diff for this file is too large to render.
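The vocabulary file is not rendered above, but config.yml declares vocab_size: 65536. Assuming vocab.json uses the common flat token-to-id mapping (an assumption; the actual layout is not shown here), a quick consistency check:

```python
# Hypothetical sketch: cross-check vocab.json against vocab_size from config.yml,
# assuming a flat {token: id} JSON mapping.
import json

with open("vocab.json", encoding="utf-8") as f:
    vocab = json.load(f)

assert len(vocab) == 65536, f"unexpected vocab size: {len(vocab)}"
```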