GregorZiegltrumAA committed
Commit aa41939 • 1 Parent(s): c6a60d4

Push umup-research-7b-bf16 Model

Files changed:
- LICENSE +31 -0
- config.yml +81 -0
- model_state_layer_0_EmbeddingInput.pt +3 -0
- model_state_layer_10_TransformerLayer.pt +3 -0
- model_state_layer_11_TransformerLayer.pt +3 -0
- model_state_layer_12_TransformerLayer.pt +3 -0
- model_state_layer_13_TransformerLayer.pt +3 -0
- model_state_layer_14_TransformerLayer.pt +3 -0
- model_state_layer_15_TransformerLayer.pt +3 -0
- model_state_layer_16_TransformerLayer.pt +3 -0
- model_state_layer_17_TransformerLayer.pt +3 -0
- model_state_layer_18_TransformerLayer.pt +3 -0
- model_state_layer_19_TransformerLayer.pt +3 -0
- model_state_layer_1_TransformerLayer.pt +3 -0
- model_state_layer_20_TransformerLayer.pt +3 -0
- model_state_layer_21_TransformerLayer.pt +3 -0
- model_state_layer_22_TransformerLayer.pt +3 -0
- model_state_layer_23_TransformerLayer.pt +3 -0
- model_state_layer_24_TransformerLayer.pt +3 -0
- model_state_layer_25_TransformerLayer.pt +3 -0
- model_state_layer_26_TransformerLayer.pt +3 -0
- model_state_layer_27_TransformerLayer.pt +3 -0
- model_state_layer_28_TransformerLayer.pt +3 -0
- model_state_layer_29_TransformerLayer.pt +3 -0
- model_state_layer_2_TransformerLayer.pt +3 -0
- model_state_layer_30_TransformerLayer.pt +3 -0
- model_state_layer_31_TransformerLayer.pt +3 -0
- model_state_layer_32_TransformerLayer.pt +3 -0
- model_state_layer_33_LayerNormWrapper.pt +3 -0
- model_state_layer_34_TransformerLMHead.pt +3 -0
- model_state_layer_3_TransformerLayer.pt +3 -0
- model_state_layer_4_TransformerLayer.pt +3 -0
- model_state_layer_5_TransformerLayer.pt +3 -0
- model_state_layer_6_TransformerLayer.pt +3 -0
- model_state_layer_7_TransformerLayer.pt +3 -0
- model_state_layer_8_TransformerLayer.pt +3 -0
- model_state_layer_9_TransformerLayer.pt +3 -0
- vocab.json +0 -0
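
The checkpoint is sharded one file per layer rather than stored as a single state dict. A minimal sketch for inspecting the shards with plain PyTorch, assuming a local download of this repository and that each .pt file holds a flat tensor dict (the sorting helper below is illustrative, not part of an official loader):

```python
import re
from pathlib import Path

import torch

CKPT_DIR = Path(".")  # assumed: local clone with the LFS files pulled

# Sort shards by numeric layer index; plain lexicographic order would
# put model_state_layer_10_... before model_state_layer_2_...
shards = sorted(
    CKPT_DIR.glob("model_state_layer_*.pt"),
    key=lambda p: int(re.search(r"layer_(\d+)_", p.name).group(1)),
)

for shard in shards:
    # map_location="cpu" lets this run without a GPU.
    state = torch.load(shard, map_location="cpu")
    n_params = sum(t.numel() for t in state.values() if torch.is_tensor(t))
    print(f"{shard.name}: {len(state)} tensors, {n_params / 1e6:.1f}M params")
```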
LICENSE
ADDED
@@ -0,0 +1,31 @@
+ The following applies to all files in this repository, unless otherwise noted:
+
+ Copyright (c) 2024 IPAI Aleph Alpha Research GmbH. All rights reserved.
+
+ This project is licensed under the terms of the Open Aleph License 1.0, available at
+ https://github.com/Aleph-Alpha/.github/blob/main/oal.pdf
+
+ ---
+ Excerpt from the license text:
+
+ Subject to the terms and conditions of this License, the Licensor grants you a non-exclusive, worldwide,
+ non-transferable, non-sublicensable, and royalty-free limited right to use, copy, modify, distribute, make
+ otherwise publicly available, and reproduce the Works and Derivative Works under Licensor’s copyright,
+ for any Non-Commercial and Non-Administrative purpose.
+ You may not use, copy, modify, distribute, make otherwise publicly available, reproduce, or sublicense the
+ Works or Derivative Works except as expressly provided under and in accordance with this License.
+ Your rights granted under this License will automatically terminate if you fail to comply with any of the
+ terms of this License.
+
+ EXCEPT FOR DAMAGES CAUSED BY INTENT OR FRAUDULENTLY CONCEALED
+ DEFECTS, AND EXCEPT FOR DAMAGES RESULTING FROM BREACH OF ANY
+ WARRANTY OR GUARANTEE EXPRESSLY GIVEN BY LICENSOR IN THE OPEN ALEPH LICENSE,
+ IN NO EVENT WILL LICENSOR BE LIABLE TO YOU ON ANY LEGAL THEORY FOR ANY
+ DAMAGES ARISING OUT OF THE OPEN ALEPH LICENSE OR THE USE OF THE WORK. ANY
+ MANDATORY STATUTORY LIABILITY UNDER APPLICABLE LAW REMAINS
+ UNAFFECTED.
+
+ EXCEPT AS EXPRESSLY STATED IN THIS LICENSE OR REQUIRED BY APPLICABLE
+ LAW, THE WORKS ARE PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES
+ OF ANY KIND INCLUDING, WITHOUT LIMITATION, ANY WARRANTIES REGARDING
+ THE CONTENTS, ACCURACY, OR FITNESS FOR A PARTICULAR PURPOSE.
config.yml
ADDED
@@ -0,0 +1,81 @@
+ optimizer:
+   allreduce_bucket_size: 500000000
+   beta1: 0.9
+   beta2: 0.95
+   debug_log: false
+   eps: 1e-08
+   gradient_clipping: 0.0
+   zero: true
+   zero_save_static: false
+ topology:
+   activation_checkpointing_type: disabled
+   global_batch_size: 1024
+   gradient_accumulation_steps: 4
+   micro_batch_size: 2
+   model_parallel_size: 1
+   pipe_parallel_size: 2
+   pipe_partition_method: balanced
+   pipe_partition_overwrite: null
+   sequence_parallel: false
+ trainer:
+   seed: 42
+   train_iterations: 72000
+ training:
+   allow_missing_params_in_optimizer: true
+ training_groups:
+ - group_name: param_group
+   independent_weight_decay: true
+   learning_rate_scheduler:
+     learning_rate: 11.313708498984761
+     learning_rate_decay_iters: 72000
+     learning_rate_decay_style: cosine
+     learning_rate_minimum: 1.131370849898476
+     learning_rate_warmup_steps: 500
+   parameters_exclude:
+   - norm
+   weight_decay: 0.0001221
+ transformer_architecture:
+   attention_bias: false
+   attention_num_kv_heads: null
+   attention_qkv_in_one: true
+   dropout_after_attention: 0.0
+   dropout_after_mlp: 0.0
+   dropout_attention_probs: 0.0
+   dropout_embedding: 0.0
+   dropout_image_encoder: 0.0
+   hidden_size: 4096
+   image_encoder: false
+   key_query_norm: false
+   layernorm:
+     layernorm_epsilon: 1e-05
+     optimization_type: torch
+   local_attention_window_size: null
+   masked_softmax:
+     kernel: flash_attention
+     scale: 1.0
+     softmax_in_fp32: false
+   mlp_bias: false
+   mlp_factor: 2.66796875
+   mlp_type: swiglu
+   norm_type: rms
+   num_attention_heads: 32
+   num_layers: 32
+   num_local_attention_heads: 0
+   precision: bfloat16
+   relative_position_embedding_type: rotary_complex
+   reset_attention_mask: false
+   reset_position_ids: false
+   rotary_embedding_base: 10000
+   rotary_percentage: 1.0
+   sequence_length: 4096
+   umup:
+     act_mult: 1.0
+     attn_mult: 1.0
+     enable: true
+     loss_mult: 1.0
+     normalize_depth_to_num_layers: true
+     residual_attn_ratio: 0.25
+     residual_mult: 1.0
+   vocab_file: null
+   vocab_size: 65536
+   weight_tying: false
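
The topology and architecture values above can be cross-checked with a short script; a sketch assuming PyYAML, with the data-parallel degree derived from the batch settings since it is not stored in the file:

```python
import yaml

with open("config.yml") as f:
    cfg = yaml.safe_load(f)

topo = cfg["topology"]
arch = cfg["transformer_architecture"]

# global_batch_size = micro_batch_size * gradient_accumulation_steps * data_parallel_size
dp = topo["global_batch_size"] // (
    topo["micro_batch_size"] * topo["gradient_accumulation_steps"]
)
print(f"implied data-parallel size: {dp}")  # 1024 // (2 * 4) = 128

# Rough per-layer parameter count: fused QKV plus output projection (4 * h^2)
# and a SwiGLU MLP with gate/up/down projections (3 * h * mlp_inner).
h = arch["hidden_size"]
mlp_inner = int(arch["mlp_factor"] * h)  # 2.66796875 * 4096 = 10928
per_layer = 4 * h * h + 3 * h * mlp_inner
print(f"~{per_layer * 2 / 1e6:.0f} MB per layer in bfloat16")
# ~403 MB, consistent with the ~402,803,885-byte TransformerLayer shards below.
```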
model_state_layer_0_EmbeddingInput.pt
ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:50a11727fdd6c0cd2b1409a7a04028b628e30d0dcbe49738b73ae90c4ad0309f
+ size 536872395
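
Each weight file in this commit is a Git LFS pointer in the three-line format above, so a downloaded shard can be verified against the pointer's oid and size. A sketch of such a check (the helper name is illustrative; the oid and size are taken from the pointer above):

```python
import hashlib
from pathlib import Path

def verify_lfs_object(path: str, expected_oid: str, expected_size: int) -> bool:
    """Check a downloaded file against its LFS pointer's sha256 oid and byte size."""
    p = Path(path)
    if p.stat().st_size != expected_size:
        return False
    digest = hashlib.sha256()
    with p.open("rb") as f:
        # Stream in 1 MiB chunks; the shards are several hundred MB each.
        for chunk in iter(lambda: f.read(1 << 20), b""):
            digest.update(chunk)
    return digest.hexdigest() == expected_oid

print(verify_lfs_object(
    "model_state_layer_0_EmbeddingInput.pt",
    "50a11727fdd6c0cd2b1409a7a04028b628e30d0dcbe49738b73ae90c4ad0309f",
    536872395,
))
```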
model_state_layer_10_TransformerLayer.pt
ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b1856712a163f3bb2e76e157852dccca39ad777388b938e8d23020d4b606e97f
+ size 402803885
model_state_layer_11_TransformerLayer.pt
ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2d8e3321d227ebd57e1a20056af0655eb63475a1d0c981e2b0faa8da9af64779
+ size 402803885
model_state_layer_12_TransformerLayer.pt
ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:68587ad93e4c85d95fdff7db8566ccf6d2f6ee7943e4ce9a9dd943e02628ae1d
+ size 402803885
model_state_layer_13_TransformerLayer.pt
ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5b0724591604b036f6353227ef66a86c6376807eec5d20acd97e87e0b00cfc30
+ size 402803885
model_state_layer_14_TransformerLayer.pt
ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c41dd92a37d6669644d5bc5ddec972c5f2c1be1ff8248e2c6e026378c5f8b506
+ size 402803885
model_state_layer_15_TransformerLayer.pt
ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a5f809d3951280bcdf69680619d353027de30d9e2f79f78534eaa145c62b1a46
+ size 402803885
model_state_layer_16_TransformerLayer.pt
ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:76fdb6005103e77bd13b2f3ed2aaf3ff72e0394e8abba9ff8e2b6bf08e45e41d
+ size 402803885
model_state_layer_17_TransformerLayer.pt
ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3da5bae6473a50ce4ba93b9b3a44fe9d6639f8ab19116b9f0a890310a96c86b4
+ size 402803885
model_state_layer_18_TransformerLayer.pt
ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4d32a0bf03907c75d3b3506d0003810025b6f8630fc1d51973fb0a8f99b92e12
+ size 402803885
model_state_layer_19_TransformerLayer.pt
ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a1db3834367a8952535823f065c7afb543265c8885476af7c394fdfcd80536e0
+ size 402803885
model_state_layer_1_TransformerLayer.pt
ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e306d46c4d3bc6a909a9a41bfb2a6d35c02b1cd723032b9c01a64221827e72df
+ size 402803874
model_state_layer_20_TransformerLayer.pt
ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:91555d19572fc4d92f02f07ae22010d0b3deb9a47c40b4a3f816a83f7d01a7e8
+ size 402803885
model_state_layer_21_TransformerLayer.pt
ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ad2ff7040e3d0a4f32153cade487420647bb137e40e4a813cec32ce49ca6c89b
+ size 402803885
model_state_layer_22_TransformerLayer.pt
ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fcf21aee365527775fcadcaa6435518d5dde4fd87c0b491231c7b47bc0b96e1a
+ size 402803885
model_state_layer_23_TransformerLayer.pt
ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:311818cd45871fe7e70b3c15dfcdd32858782087fcee3c05e996befa9681c425
+ size 402803885
model_state_layer_24_TransformerLayer.pt
ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a545837a45b56d64ea9457bbf2e81813493b24c06dec2b0d09f7c285b36d9d05
+ size 402803885
model_state_layer_25_TransformerLayer.pt
ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e4a0f22b08c98c4f226b3132d89953eafded322168b4f82d332f8c61f33218d0
+ size 402803885
model_state_layer_26_TransformerLayer.pt
ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ebe8b1941b00359b490725f1945b8852e779d9b31a5647380ef56e2205fa1c0e
+ size 402803885
model_state_layer_27_TransformerLayer.pt
ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:dc6d3736cfd822feb8cb9e3fcfb130e61dafd81b72d44b5f2806c6f9592ea740
+ size 402803885
model_state_layer_28_TransformerLayer.pt
ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:de8a023eafd6d7fd26597a4211e38842d4bca0469fe12f4bc3983d819559c374
+ size 402803885
model_state_layer_29_TransformerLayer.pt
ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4dddb585312e189a38b1c0846e3bffd880e7b8efb4c635ea4017f2d263265b16
+ size 402803885
model_state_layer_2_TransformerLayer.pt
ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:594864c71d8a130d4589570e51dd26b3ebdb217e3f13b6f0a2a6bbd2d94d87f2
+ size 402803874
model_state_layer_30_TransformerLayer.pt
ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:46824d80771a1846a44798d8f0226ac78bb22c4cfc44517e3b051e2da1e95bd4
+ size 402803885
model_state_layer_31_TransformerLayer.pt
ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7463a4fd21f076b103bfabf26c4dc86e3bc513d53090479a54fe425712db8404
+ size 402803885
model_state_layer_32_TransformerLayer.pt
ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0c81942bda3514a776127a6cf3a73bde6f914701a14a102bf81bdee2fe0947a8
+ size 402803885
model_state_layer_33_LayerNormWrapper.pt
ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0fef3ad9a684ad078dc8af6a7034775dfb3fb66550f2a44945928150a3c162ed
+ size 9650
model_state_layer_34_TransformerLMHead.pt
ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6c18ad5c74173ae583b004c7d2407fe64086b306be0507fb1f75697ff97e6762
+ size 536872360
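
With weight_tying: false in config.yml, the input embedding (layer 0) and this LM head each store their own vocab_size × hidden_size matrix, which the shard sizes bear out (a rough check that ignores the small pickle/metadata overhead):

```python
# 65536 vocab * 4096 hidden * 2 bytes (bfloat16) per untied embedding matrix
print(65536 * 4096 * 2)  # 536870912, close to the 536872395 / 536872360 byte shards
```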
model_state_layer_3_TransformerLayer.pt
ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:efea6b1179adebecae463901f4927674a6cf8beb944e6a4022624a49f1c09667
+ size 402803874
model_state_layer_4_TransformerLayer.pt
ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:49ed243afbe090c3c5aecf226d72f4efb6c53d0d3feb760bac1b2ae03144675b
+ size 402803874
model_state_layer_5_TransformerLayer.pt
ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5350ced23424502f73eff2520e3611e8d14b70046a6a20e6cecfe65716be4033
+ size 402803874
model_state_layer_6_TransformerLayer.pt
ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:85e4e1d047265f79defed2bfd1366a154aef7ddb6477cdca02fa3bf70a6ec878
+ size 402803874
model_state_layer_7_TransformerLayer.pt
ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4562c41c44b0a6834a4c9d7660d81de57959569b3933e33107e825204b31a06b
+ size 402803874
model_state_layer_8_TransformerLayer.pt
ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3c26c2dab1c57b0e70599b50db631a6db37f144e0c5682304eacd6bb84674cab
+ size 402803874
model_state_layer_9_TransformerLayer.pt
ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:03b996b9a1ecb8c844684cc862d72a4ce9cd02fce5c63cc39921fffe429bb1ec
+ size 402803874
vocab.json
ADDED
The diff for this file is too large to render.