maxidl committed on
Commit
9b9c86f
1 Parent(s): 7d9eab6

Upload folder using huggingface_hub

Browse files
README.md ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: mistralai/Mixtral-8x7B-v0.1
3
+ tags:
4
+ - alignment-handbook
5
+ - generated_from_trainer
6
+ datasets:
7
+ - maxidl/distilabel-capybara-dpo-7k-binarized_en_de
8
+ model-index:
9
+ - name: Mixtral-8x7B-orpo-en-de
10
+ results: []
11
+ ---
12
+
13
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
14
+ should probably proofread and complete it, then remove this comment. -->
15
+
16
+ # Mixtral-8x7B-orpo-en-de
17
+
18
+ This model is a fine-tuned version of [mistralai/Mixtral-8x7B-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1) on the [maxidl/distilabel-capybara-dpo-7k-binarized_en_de](https://huggingface.co/datasets/maxidl/distilabel-capybara-dpo-7k-binarized_en_de) dataset.
19
+
20
+ ## Model description
21
+
22
+ More information needed
23
+
24
+ ## Intended uses & limitations
25
+
26
+ More information needed
27
+
28
+ ## Training and evaluation data
29
+
30
+ More information needed
31
+
32
+ ## Training procedure
33
+
34
+ ### Training hyperparameters
35
+
36
+ The following hyperparameters were used during training:
37
+ - learning_rate: 5e-06
38
+ - train_batch_size: 1
39
+ - eval_batch_size: 8
40
+ - seed: 42
41
+ - distributed_type: multi-GPU
42
+ - num_devices: 32
43
+ - total_train_batch_size: 32
44
+ - total_eval_batch_size: 256
45
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
46
+ - lr_scheduler_type: inverse_sqrt
47
+ - lr_scheduler_warmup_steps: 100
48
+ - num_epochs: 3
49
+
50
+ ### Training results
51
+
52
+
53
+
54
+ ### Framework versions
55
+
56
+ - Transformers 4.39.3
57
+ - Pytorch 2.1.2
58
+ - Datasets 2.18.0
59
+ - Tokenizers 0.15.2
all_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 3.0,
3
+ "train_loss": 0.7946326331969004,
4
+ "train_runtime": 13847.9433,
5
+ "train_samples": 11530,
6
+ "train_samples_per_second": 2.498,
7
+ "train_steps_per_second": 0.078
8
+ }
config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "mistralai/Mixtral-8x7B-v0.1",
3
+ "architectures": [
4
+ "MixtralForCausalLM"
5
+ ],
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 1,
8
+ "eos_token_id": 2,
9
+ "hidden_act": "silu",
10
+ "hidden_size": 4096,
11
+ "initializer_range": 0.02,
12
+ "intermediate_size": 14336,
13
+ "max_position_embeddings": 32768,
14
+ "model_type": "mixtral",
15
+ "num_attention_heads": 32,
16
+ "num_experts_per_tok": 2,
17
+ "num_hidden_layers": 32,
18
+ "num_key_value_heads": 8,
19
+ "num_local_experts": 8,
20
+ "output_router_logits": false,
21
+ "rms_norm_eps": 1e-05,
22
+ "rope_theta": 1000000.0,
23
+ "router_aux_loss_coef": 0.02,
24
+ "sliding_window": null,
25
+ "tie_word_embeddings": false,
26
+ "torch_dtype": "bfloat16",
27
+ "transformers_version": "4.39.3",
28
+ "use_cache": true,
29
+ "vocab_size": 32000
30
+ }
generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "transformers_version": "4.39.3"
6
+ }
model-00001-of-00019.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:51393fb3b45364b573b4cf5513bbd4431283c21b849c6f8c17d801a55f8de984
3
+ size 4892809584
model-00002-of-00019.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b4b27fbc0cf9b5243617a54f3deacf70ed122e8271643ac2163b125e42870b49
3
+ size 4983004016
model-00003-of-00019.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fbdec71b5aaee6bce11df1921a7a196d4f66eea1c0839d45c9af67f8ddb2d1a8
3
+ size 4983004016
model-00004-of-00019.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a36e5ac47e97addf38a57aaedba39ef68c21bbf0fada7c127215e84651a031b5
3
+ size 4899035200
model-00005-of-00019.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9a6869b63aae37d3463fcbc68a954c767b167589e1442e1d1a6695843e6202c7
3
+ size 4983004016
model-00006-of-00019.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3a5eb01932b13df54db976532b458789137900c868f4a03219e476bd5fcb5926
3
+ size 4983004016
model-00007-of-00019.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:33414b08e79dcc96c69060d6998b489c027fc0074f506e8cad25fd8cf31a6793
3
+ size 4899035248
model-00008-of-00019.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:625556c52cbe9d2f2b7d52992a4a412fe971ccd9192354c7625a176649c7e4c4
3
+ size 4983004072
model-00009-of-00019.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c3798961900f1f38b3745a4e5bf8d8973b111658a54f7f5b2fdc8f02d1e9335d
3
+ size 4983004072
model-00010-of-00019.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c77ae3a1f4f4731d579a9f2493d806131374e743df725b869178031f76c6233d
3
+ size 4899035248
model-00011-of-00019.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9d449cda1b608e08861d44551af97db229dfdeb7de6792271a7f67416c9f57eb
3
+ size 4983004072
model-00012-of-00019.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:371074ab118bf30286f80b005823d3c5f8dc05e7a745e9da9e06c5ddcb7a35f1
3
+ size 4983004072
model-00013-of-00019.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9340735cc666eee867eb130759e545cd362b3810821568a1265d0bf6923414e4
3
+ size 4983004072
model-00014-of-00019.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:13a9bfe510c93873d741f64578d931e0803ba7a9cfd6f3361b2f1ca994efa89a
3
+ size 4899035248
model-00015-of-00019.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aacc70437fe0c54b0d0e41504afaab791f1843b5a51d16925f7080fe5f381945
3
+ size 4983004072
model-00016-of-00019.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:46adf38fcddc63372268a2e300d84f28447fb250312dd6ce1cb3a1e027cacae3
3
+ size 4983004072
model-00017-of-00019.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d4dd2769aea4107a0ac0af4971bee7ba1f6c73942181f4f11a489d25fbaf1ffe
3
+ size 4899035248
model-00018-of-00019.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c1011698d9c4309063e5e436bcfb1a848fd5e5d78dcc73e0132d0827d36a354f
3
+ size 4983004072
model-00019-of-00019.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1c3a2f9568c12911277d9908ea3593ba6eda441d41a0fbdf0b9b59c86c05f81d
3
+ size 4221679088
model.safetensors.index.json ADDED
@@ -0,0 +1,1002 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_size": 93405585408
4
+ },
5
+ "weight_map": {
6
+ "lm_head.weight": "model-00019-of-00019.safetensors",
7
+ "model.embed_tokens.weight": "model-00001-of-00019.safetensors",
8
+ "model.layers.0.block_sparse_moe.experts.0.w1.weight": "model-00001-of-00019.safetensors",
9
+ "model.layers.0.block_sparse_moe.experts.0.w2.weight": "model-00001-of-00019.safetensors",
10
+ "model.layers.0.block_sparse_moe.experts.0.w3.weight": "model-00001-of-00019.safetensors",
11
+ "model.layers.0.block_sparse_moe.experts.1.w1.weight": "model-00001-of-00019.safetensors",
12
+ "model.layers.0.block_sparse_moe.experts.1.w2.weight": "model-00001-of-00019.safetensors",
13
+ "model.layers.0.block_sparse_moe.experts.1.w3.weight": "model-00001-of-00019.safetensors",
14
+ "model.layers.0.block_sparse_moe.experts.2.w1.weight": "model-00001-of-00019.safetensors",
15
+ "model.layers.0.block_sparse_moe.experts.2.w2.weight": "model-00001-of-00019.safetensors",
16
+ "model.layers.0.block_sparse_moe.experts.2.w3.weight": "model-00001-of-00019.safetensors",
17
+ "model.layers.0.block_sparse_moe.experts.3.w1.weight": "model-00001-of-00019.safetensors",
18
+ "model.layers.0.block_sparse_moe.experts.3.w2.weight": "model-00001-of-00019.safetensors",
19
+ "model.layers.0.block_sparse_moe.experts.3.w3.weight": "model-00001-of-00019.safetensors",
20
+ "model.layers.0.block_sparse_moe.experts.4.w1.weight": "model-00001-of-00019.safetensors",
21
+ "model.layers.0.block_sparse_moe.experts.4.w2.weight": "model-00001-of-00019.safetensors",
22
+ "model.layers.0.block_sparse_moe.experts.4.w3.weight": "model-00001-of-00019.safetensors",
23
+ "model.layers.0.block_sparse_moe.experts.5.w1.weight": "model-00001-of-00019.safetensors",
24
+ "model.layers.0.block_sparse_moe.experts.5.w2.weight": "model-00001-of-00019.safetensors",
25
+ "model.layers.0.block_sparse_moe.experts.5.w3.weight": "model-00001-of-00019.safetensors",
26
+ "model.layers.0.block_sparse_moe.experts.6.w1.weight": "model-00001-of-00019.safetensors",
27
+ "model.layers.0.block_sparse_moe.experts.6.w2.weight": "model-00001-of-00019.safetensors",
28
+ "model.layers.0.block_sparse_moe.experts.6.w3.weight": "model-00001-of-00019.safetensors",
29
+ "model.layers.0.block_sparse_moe.experts.7.w1.weight": "model-00001-of-00019.safetensors",
30
+ "model.layers.0.block_sparse_moe.experts.7.w2.weight": "model-00001-of-00019.safetensors",
31
+ "model.layers.0.block_sparse_moe.experts.7.w3.weight": "model-00001-of-00019.safetensors",
32
+ "model.layers.0.block_sparse_moe.gate.weight": "model-00001-of-00019.safetensors",
33
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00019.safetensors",
34
+ "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00019.safetensors",
35
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00019.safetensors",
36
+ "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00019.safetensors",
37
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00019.safetensors",
38
+ "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00019.safetensors",
39
+ "model.layers.1.block_sparse_moe.experts.0.w1.weight": "model-00001-of-00019.safetensors",
40
+ "model.layers.1.block_sparse_moe.experts.0.w2.weight": "model-00001-of-00019.safetensors",
41
+ "model.layers.1.block_sparse_moe.experts.0.w3.weight": "model-00001-of-00019.safetensors",
42
+ "model.layers.1.block_sparse_moe.experts.1.w1.weight": "model-00001-of-00019.safetensors",
43
+ "model.layers.1.block_sparse_moe.experts.1.w2.weight": "model-00001-of-00019.safetensors",
44
+ "model.layers.1.block_sparse_moe.experts.1.w3.weight": "model-00001-of-00019.safetensors",
45
+ "model.layers.1.block_sparse_moe.experts.2.w1.weight": "model-00001-of-00019.safetensors",
46
+ "model.layers.1.block_sparse_moe.experts.2.w2.weight": "model-00001-of-00019.safetensors",
47
+ "model.layers.1.block_sparse_moe.experts.2.w3.weight": "model-00001-of-00019.safetensors",
48
+ "model.layers.1.block_sparse_moe.experts.3.w1.weight": "model-00001-of-00019.safetensors",
49
+ "model.layers.1.block_sparse_moe.experts.3.w2.weight": "model-00001-of-00019.safetensors",
50
+ "model.layers.1.block_sparse_moe.experts.3.w3.weight": "model-00001-of-00019.safetensors",
51
+ "model.layers.1.block_sparse_moe.experts.4.w1.weight": "model-00001-of-00019.safetensors",
52
+ "model.layers.1.block_sparse_moe.experts.4.w2.weight": "model-00001-of-00019.safetensors",
53
+ "model.layers.1.block_sparse_moe.experts.4.w3.weight": "model-00002-of-00019.safetensors",
54
+ "model.layers.1.block_sparse_moe.experts.5.w1.weight": "model-00002-of-00019.safetensors",
55
+ "model.layers.1.block_sparse_moe.experts.5.w2.weight": "model-00002-of-00019.safetensors",
56
+ "model.layers.1.block_sparse_moe.experts.5.w3.weight": "model-00002-of-00019.safetensors",
57
+ "model.layers.1.block_sparse_moe.experts.6.w1.weight": "model-00002-of-00019.safetensors",
58
+ "model.layers.1.block_sparse_moe.experts.6.w2.weight": "model-00002-of-00019.safetensors",
59
+ "model.layers.1.block_sparse_moe.experts.6.w3.weight": "model-00002-of-00019.safetensors",
60
+ "model.layers.1.block_sparse_moe.experts.7.w1.weight": "model-00002-of-00019.safetensors",
61
+ "model.layers.1.block_sparse_moe.experts.7.w2.weight": "model-00002-of-00019.safetensors",
62
+ "model.layers.1.block_sparse_moe.experts.7.w3.weight": "model-00002-of-00019.safetensors",
63
+ "model.layers.1.block_sparse_moe.gate.weight": "model-00001-of-00019.safetensors",
64
+ "model.layers.1.input_layernorm.weight": "model-00002-of-00019.safetensors",
65
+ "model.layers.1.post_attention_layernorm.weight": "model-00002-of-00019.safetensors",
66
+ "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00019.safetensors",
67
+ "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00019.safetensors",
68
+ "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00019.safetensors",
69
+ "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00019.safetensors",
70
+ "model.layers.10.block_sparse_moe.experts.0.w1.weight": "model-00006-of-00019.safetensors",
71
+ "model.layers.10.block_sparse_moe.experts.0.w2.weight": "model-00006-of-00019.safetensors",
72
+ "model.layers.10.block_sparse_moe.experts.0.w3.weight": "model-00006-of-00019.safetensors",
73
+ "model.layers.10.block_sparse_moe.experts.1.w1.weight": "model-00007-of-00019.safetensors",
74
+ "model.layers.10.block_sparse_moe.experts.1.w2.weight": "model-00007-of-00019.safetensors",
75
+ "model.layers.10.block_sparse_moe.experts.1.w3.weight": "model-00007-of-00019.safetensors",
76
+ "model.layers.10.block_sparse_moe.experts.2.w1.weight": "model-00007-of-00019.safetensors",
77
+ "model.layers.10.block_sparse_moe.experts.2.w2.weight": "model-00007-of-00019.safetensors",
78
+ "model.layers.10.block_sparse_moe.experts.2.w3.weight": "model-00007-of-00019.safetensors",
79
+ "model.layers.10.block_sparse_moe.experts.3.w1.weight": "model-00007-of-00019.safetensors",
80
+ "model.layers.10.block_sparse_moe.experts.3.w2.weight": "model-00007-of-00019.safetensors",
81
+ "model.layers.10.block_sparse_moe.experts.3.w3.weight": "model-00007-of-00019.safetensors",
82
+ "model.layers.10.block_sparse_moe.experts.4.w1.weight": "model-00007-of-00019.safetensors",
83
+ "model.layers.10.block_sparse_moe.experts.4.w2.weight": "model-00007-of-00019.safetensors",
84
+ "model.layers.10.block_sparse_moe.experts.4.w3.weight": "model-00007-of-00019.safetensors",
85
+ "model.layers.10.block_sparse_moe.experts.5.w1.weight": "model-00007-of-00019.safetensors",
86
+ "model.layers.10.block_sparse_moe.experts.5.w2.weight": "model-00007-of-00019.safetensors",
87
+ "model.layers.10.block_sparse_moe.experts.5.w3.weight": "model-00007-of-00019.safetensors",
88
+ "model.layers.10.block_sparse_moe.experts.6.w1.weight": "model-00007-of-00019.safetensors",
89
+ "model.layers.10.block_sparse_moe.experts.6.w2.weight": "model-00007-of-00019.safetensors",
90
+ "model.layers.10.block_sparse_moe.experts.6.w3.weight": "model-00007-of-00019.safetensors",
91
+ "model.layers.10.block_sparse_moe.experts.7.w1.weight": "model-00007-of-00019.safetensors",
92
+ "model.layers.10.block_sparse_moe.experts.7.w2.weight": "model-00007-of-00019.safetensors",
93
+ "model.layers.10.block_sparse_moe.experts.7.w3.weight": "model-00007-of-00019.safetensors",
94
+ "model.layers.10.block_sparse_moe.gate.weight": "model-00006-of-00019.safetensors",
95
+ "model.layers.10.input_layernorm.weight": "model-00007-of-00019.safetensors",
96
+ "model.layers.10.post_attention_layernorm.weight": "model-00007-of-00019.safetensors",
97
+ "model.layers.10.self_attn.k_proj.weight": "model-00006-of-00019.safetensors",
98
+ "model.layers.10.self_attn.o_proj.weight": "model-00006-of-00019.safetensors",
99
+ "model.layers.10.self_attn.q_proj.weight": "model-00006-of-00019.safetensors",
100
+ "model.layers.10.self_attn.v_proj.weight": "model-00006-of-00019.safetensors",
101
+ "model.layers.11.block_sparse_moe.experts.0.w1.weight": "model-00007-of-00019.safetensors",
102
+ "model.layers.11.block_sparse_moe.experts.0.w2.weight": "model-00007-of-00019.safetensors",
103
+ "model.layers.11.block_sparse_moe.experts.0.w3.weight": "model-00007-of-00019.safetensors",
104
+ "model.layers.11.block_sparse_moe.experts.1.w1.weight": "model-00007-of-00019.safetensors",
105
+ "model.layers.11.block_sparse_moe.experts.1.w2.weight": "model-00007-of-00019.safetensors",
106
+ "model.layers.11.block_sparse_moe.experts.1.w3.weight": "model-00007-of-00019.safetensors",
107
+ "model.layers.11.block_sparse_moe.experts.2.w1.weight": "model-00007-of-00019.safetensors",
108
+ "model.layers.11.block_sparse_moe.experts.2.w2.weight": "model-00007-of-00019.safetensors",
109
+ "model.layers.11.block_sparse_moe.experts.2.w3.weight": "model-00007-of-00019.safetensors",
110
+ "model.layers.11.block_sparse_moe.experts.3.w1.weight": "model-00007-of-00019.safetensors",
111
+ "model.layers.11.block_sparse_moe.experts.3.w2.weight": "model-00007-of-00019.safetensors",
112
+ "model.layers.11.block_sparse_moe.experts.3.w3.weight": "model-00007-of-00019.safetensors",
113
+ "model.layers.11.block_sparse_moe.experts.4.w1.weight": "model-00007-of-00019.safetensors",
114
+ "model.layers.11.block_sparse_moe.experts.4.w2.weight": "model-00007-of-00019.safetensors",
115
+ "model.layers.11.block_sparse_moe.experts.4.w3.weight": "model-00007-of-00019.safetensors",
116
+ "model.layers.11.block_sparse_moe.experts.5.w1.weight": "model-00007-of-00019.safetensors",
117
+ "model.layers.11.block_sparse_moe.experts.5.w2.weight": "model-00007-of-00019.safetensors",
118
+ "model.layers.11.block_sparse_moe.experts.5.w3.weight": "model-00007-of-00019.safetensors",
119
+ "model.layers.11.block_sparse_moe.experts.6.w1.weight": "model-00007-of-00019.safetensors",
120
+ "model.layers.11.block_sparse_moe.experts.6.w2.weight": "model-00007-of-00019.safetensors",
121
+ "model.layers.11.block_sparse_moe.experts.6.w3.weight": "model-00008-of-00019.safetensors",
122
+ "model.layers.11.block_sparse_moe.experts.7.w1.weight": "model-00008-of-00019.safetensors",
123
+ "model.layers.11.block_sparse_moe.experts.7.w2.weight": "model-00008-of-00019.safetensors",
124
+ "model.layers.11.block_sparse_moe.experts.7.w3.weight": "model-00008-of-00019.safetensors",
125
+ "model.layers.11.block_sparse_moe.gate.weight": "model-00007-of-00019.safetensors",
126
+ "model.layers.11.input_layernorm.weight": "model-00008-of-00019.safetensors",
127
+ "model.layers.11.post_attention_layernorm.weight": "model-00008-of-00019.safetensors",
128
+ "model.layers.11.self_attn.k_proj.weight": "model-00007-of-00019.safetensors",
129
+ "model.layers.11.self_attn.o_proj.weight": "model-00007-of-00019.safetensors",
130
+ "model.layers.11.self_attn.q_proj.weight": "model-00007-of-00019.safetensors",
131
+ "model.layers.11.self_attn.v_proj.weight": "model-00007-of-00019.safetensors",
132
+ "model.layers.12.block_sparse_moe.experts.0.w1.weight": "model-00008-of-00019.safetensors",
133
+ "model.layers.12.block_sparse_moe.experts.0.w2.weight": "model-00008-of-00019.safetensors",
134
+ "model.layers.12.block_sparse_moe.experts.0.w3.weight": "model-00008-of-00019.safetensors",
135
+ "model.layers.12.block_sparse_moe.experts.1.w1.weight": "model-00008-of-00019.safetensors",
136
+ "model.layers.12.block_sparse_moe.experts.1.w2.weight": "model-00008-of-00019.safetensors",
137
+ "model.layers.12.block_sparse_moe.experts.1.w3.weight": "model-00008-of-00019.safetensors",
138
+ "model.layers.12.block_sparse_moe.experts.2.w1.weight": "model-00008-of-00019.safetensors",
139
+ "model.layers.12.block_sparse_moe.experts.2.w2.weight": "model-00008-of-00019.safetensors",
140
+ "model.layers.12.block_sparse_moe.experts.2.w3.weight": "model-00008-of-00019.safetensors",
141
+ "model.layers.12.block_sparse_moe.experts.3.w1.weight": "model-00008-of-00019.safetensors",
142
+ "model.layers.12.block_sparse_moe.experts.3.w2.weight": "model-00008-of-00019.safetensors",
143
+ "model.layers.12.block_sparse_moe.experts.3.w3.weight": "model-00008-of-00019.safetensors",
144
+ "model.layers.12.block_sparse_moe.experts.4.w1.weight": "model-00008-of-00019.safetensors",
145
+ "model.layers.12.block_sparse_moe.experts.4.w2.weight": "model-00008-of-00019.safetensors",
146
+ "model.layers.12.block_sparse_moe.experts.4.w3.weight": "model-00008-of-00019.safetensors",
147
+ "model.layers.12.block_sparse_moe.experts.5.w1.weight": "model-00008-of-00019.safetensors",
148
+ "model.layers.12.block_sparse_moe.experts.5.w2.weight": "model-00008-of-00019.safetensors",
149
+ "model.layers.12.block_sparse_moe.experts.5.w3.weight": "model-00008-of-00019.safetensors",
150
+ "model.layers.12.block_sparse_moe.experts.6.w1.weight": "model-00008-of-00019.safetensors",
151
+ "model.layers.12.block_sparse_moe.experts.6.w2.weight": "model-00008-of-00019.safetensors",
152
+ "model.layers.12.block_sparse_moe.experts.6.w3.weight": "model-00008-of-00019.safetensors",
153
+ "model.layers.12.block_sparse_moe.experts.7.w1.weight": "model-00008-of-00019.safetensors",
154
+ "model.layers.12.block_sparse_moe.experts.7.w2.weight": "model-00008-of-00019.safetensors",
155
+ "model.layers.12.block_sparse_moe.experts.7.w3.weight": "model-00008-of-00019.safetensors",
156
+ "model.layers.12.block_sparse_moe.gate.weight": "model-00008-of-00019.safetensors",
157
+ "model.layers.12.input_layernorm.weight": "model-00008-of-00019.safetensors",
158
+ "model.layers.12.post_attention_layernorm.weight": "model-00008-of-00019.safetensors",
159
+ "model.layers.12.self_attn.k_proj.weight": "model-00008-of-00019.safetensors",
160
+ "model.layers.12.self_attn.o_proj.weight": "model-00008-of-00019.safetensors",
161
+ "model.layers.12.self_attn.q_proj.weight": "model-00008-of-00019.safetensors",
162
+ "model.layers.12.self_attn.v_proj.weight": "model-00008-of-00019.safetensors",
163
+ "model.layers.13.block_sparse_moe.experts.0.w1.weight": "model-00008-of-00019.safetensors",
164
+ "model.layers.13.block_sparse_moe.experts.0.w2.weight": "model-00008-of-00019.safetensors",
165
+ "model.layers.13.block_sparse_moe.experts.0.w3.weight": "model-00008-of-00019.safetensors",
166
+ "model.layers.13.block_sparse_moe.experts.1.w1.weight": "model-00008-of-00019.safetensors",
167
+ "model.layers.13.block_sparse_moe.experts.1.w2.weight": "model-00008-of-00019.safetensors",
168
+ "model.layers.13.block_sparse_moe.experts.1.w3.weight": "model-00008-of-00019.safetensors",
169
+ "model.layers.13.block_sparse_moe.experts.2.w1.weight": "model-00008-of-00019.safetensors",
170
+ "model.layers.13.block_sparse_moe.experts.2.w2.weight": "model-00008-of-00019.safetensors",
171
+ "model.layers.13.block_sparse_moe.experts.2.w3.weight": "model-00008-of-00019.safetensors",
172
+ "model.layers.13.block_sparse_moe.experts.3.w1.weight": "model-00008-of-00019.safetensors",
173
+ "model.layers.13.block_sparse_moe.experts.3.w2.weight": "model-00008-of-00019.safetensors",
174
+ "model.layers.13.block_sparse_moe.experts.3.w3.weight": "model-00008-of-00019.safetensors",
175
+ "model.layers.13.block_sparse_moe.experts.4.w1.weight": "model-00008-of-00019.safetensors",
176
+ "model.layers.13.block_sparse_moe.experts.4.w2.weight": "model-00009-of-00019.safetensors",
177
+ "model.layers.13.block_sparse_moe.experts.4.w3.weight": "model-00009-of-00019.safetensors",
178
+ "model.layers.13.block_sparse_moe.experts.5.w1.weight": "model-00009-of-00019.safetensors",
179
+ "model.layers.13.block_sparse_moe.experts.5.w2.weight": "model-00009-of-00019.safetensors",
180
+ "model.layers.13.block_sparse_moe.experts.5.w3.weight": "model-00009-of-00019.safetensors",
181
+ "model.layers.13.block_sparse_moe.experts.6.w1.weight": "model-00009-of-00019.safetensors",
182
+ "model.layers.13.block_sparse_moe.experts.6.w2.weight": "model-00009-of-00019.safetensors",
183
+ "model.layers.13.block_sparse_moe.experts.6.w3.weight": "model-00009-of-00019.safetensors",
184
+ "model.layers.13.block_sparse_moe.experts.7.w1.weight": "model-00009-of-00019.safetensors",
185
+ "model.layers.13.block_sparse_moe.experts.7.w2.weight": "model-00009-of-00019.safetensors",
186
+ "model.layers.13.block_sparse_moe.experts.7.w3.weight": "model-00009-of-00019.safetensors",
187
+ "model.layers.13.block_sparse_moe.gate.weight": "model-00008-of-00019.safetensors",
188
+ "model.layers.13.input_layernorm.weight": "model-00009-of-00019.safetensors",
189
+ "model.layers.13.post_attention_layernorm.weight": "model-00009-of-00019.safetensors",
190
+ "model.layers.13.self_attn.k_proj.weight": "model-00008-of-00019.safetensors",
191
+ "model.layers.13.self_attn.o_proj.weight": "model-00008-of-00019.safetensors",
192
+ "model.layers.13.self_attn.q_proj.weight": "model-00008-of-00019.safetensors",
193
+ "model.layers.13.self_attn.v_proj.weight": "model-00008-of-00019.safetensors",
194
+ "model.layers.14.block_sparse_moe.experts.0.w1.weight": "model-00009-of-00019.safetensors",
195
+ "model.layers.14.block_sparse_moe.experts.0.w2.weight": "model-00009-of-00019.safetensors",
196
+ "model.layers.14.block_sparse_moe.experts.0.w3.weight": "model-00009-of-00019.safetensors",
197
+ "model.layers.14.block_sparse_moe.experts.1.w1.weight": "model-00009-of-00019.safetensors",
198
+ "model.layers.14.block_sparse_moe.experts.1.w2.weight": "model-00009-of-00019.safetensors",
199
+ "model.layers.14.block_sparse_moe.experts.1.w3.weight": "model-00009-of-00019.safetensors",
200
+ "model.layers.14.block_sparse_moe.experts.2.w1.weight": "model-00009-of-00019.safetensors",
201
+ "model.layers.14.block_sparse_moe.experts.2.w2.weight": "model-00009-of-00019.safetensors",
202
+ "model.layers.14.block_sparse_moe.experts.2.w3.weight": "model-00009-of-00019.safetensors",
203
+ "model.layers.14.block_sparse_moe.experts.3.w1.weight": "model-00009-of-00019.safetensors",
204
+ "model.layers.14.block_sparse_moe.experts.3.w2.weight": "model-00009-of-00019.safetensors",
205
+ "model.layers.14.block_sparse_moe.experts.3.w3.weight": "model-00009-of-00019.safetensors",
206
+ "model.layers.14.block_sparse_moe.experts.4.w1.weight": "model-00009-of-00019.safetensors",
207
+ "model.layers.14.block_sparse_moe.experts.4.w2.weight": "model-00009-of-00019.safetensors",
208
+ "model.layers.14.block_sparse_moe.experts.4.w3.weight": "model-00009-of-00019.safetensors",
209
+ "model.layers.14.block_sparse_moe.experts.5.w1.weight": "model-00009-of-00019.safetensors",
210
+ "model.layers.14.block_sparse_moe.experts.5.w2.weight": "model-00009-of-00019.safetensors",
211
+ "model.layers.14.block_sparse_moe.experts.5.w3.weight": "model-00009-of-00019.safetensors",
212
+ "model.layers.14.block_sparse_moe.experts.6.w1.weight": "model-00009-of-00019.safetensors",
213
+ "model.layers.14.block_sparse_moe.experts.6.w2.weight": "model-00009-of-00019.safetensors",
214
+ "model.layers.14.block_sparse_moe.experts.6.w3.weight": "model-00009-of-00019.safetensors",
215
+ "model.layers.14.block_sparse_moe.experts.7.w1.weight": "model-00009-of-00019.safetensors",
216
+ "model.layers.14.block_sparse_moe.experts.7.w2.weight": "model-00009-of-00019.safetensors",
217
+ "model.layers.14.block_sparse_moe.experts.7.w3.weight": "model-00009-of-00019.safetensors",
218
+ "model.layers.14.block_sparse_moe.gate.weight": "model-00009-of-00019.safetensors",
219
+ "model.layers.14.input_layernorm.weight": "model-00009-of-00019.safetensors",
220
+ "model.layers.14.post_attention_layernorm.weight": "model-00009-of-00019.safetensors",
221
+ "model.layers.14.self_attn.k_proj.weight": "model-00009-of-00019.safetensors",
222
+ "model.layers.14.self_attn.o_proj.weight": "model-00009-of-00019.safetensors",
223
+ "model.layers.14.self_attn.q_proj.weight": "model-00009-of-00019.safetensors",
224
+ "model.layers.14.self_attn.v_proj.weight": "model-00009-of-00019.safetensors",
225
+ "model.layers.15.block_sparse_moe.experts.0.w1.weight": "model-00009-of-00019.safetensors",
226
+ "model.layers.15.block_sparse_moe.experts.0.w2.weight": "model-00009-of-00019.safetensors",
227
+ "model.layers.15.block_sparse_moe.experts.0.w3.weight": "model-00009-of-00019.safetensors",
228
+ "model.layers.15.block_sparse_moe.experts.1.w1.weight": "model-00009-of-00019.safetensors",
229
+ "model.layers.15.block_sparse_moe.experts.1.w2.weight": "model-00009-of-00019.safetensors",
230
+ "model.layers.15.block_sparse_moe.experts.1.w3.weight": "model-00009-of-00019.safetensors",
231
+ "model.layers.15.block_sparse_moe.experts.2.w1.weight": "model-00010-of-00019.safetensors",
232
+ "model.layers.15.block_sparse_moe.experts.2.w2.weight": "model-00010-of-00019.safetensors",
233
+ "model.layers.15.block_sparse_moe.experts.2.w3.weight": "model-00010-of-00019.safetensors",
234
+ "model.layers.15.block_sparse_moe.experts.3.w1.weight": "model-00010-of-00019.safetensors",
235
+ "model.layers.15.block_sparse_moe.experts.3.w2.weight": "model-00010-of-00019.safetensors",
236
+ "model.layers.15.block_sparse_moe.experts.3.w3.weight": "model-00010-of-00019.safetensors",
237
+ "model.layers.15.block_sparse_moe.experts.4.w1.weight": "model-00010-of-00019.safetensors",
238
+ "model.layers.15.block_sparse_moe.experts.4.w2.weight": "model-00010-of-00019.safetensors",
239
+ "model.layers.15.block_sparse_moe.experts.4.w3.weight": "model-00010-of-00019.safetensors",
240
+ "model.layers.15.block_sparse_moe.experts.5.w1.weight": "model-00010-of-00019.safetensors",
241
+ "model.layers.15.block_sparse_moe.experts.5.w2.weight": "model-00010-of-00019.safetensors",
242
+ "model.layers.15.block_sparse_moe.experts.5.w3.weight": "model-00010-of-00019.safetensors",
243
+ "model.layers.15.block_sparse_moe.experts.6.w1.weight": "model-00010-of-00019.safetensors",
244
+ "model.layers.15.block_sparse_moe.experts.6.w2.weight": "model-00010-of-00019.safetensors",
245
+ "model.layers.15.block_sparse_moe.experts.6.w3.weight": "model-00010-of-00019.safetensors",
246
+ "model.layers.15.block_sparse_moe.experts.7.w1.weight": "model-00010-of-00019.safetensors",
247
+ "model.layers.15.block_sparse_moe.experts.7.w2.weight": "model-00010-of-00019.safetensors",
248
+ "model.layers.15.block_sparse_moe.experts.7.w3.weight": "model-00010-of-00019.safetensors",
249
+ "model.layers.15.block_sparse_moe.gate.weight": "model-00009-of-00019.safetensors",
250
+ "model.layers.15.input_layernorm.weight": "model-00010-of-00019.safetensors",
251
+ "model.layers.15.post_attention_layernorm.weight": "model-00010-of-00019.safetensors",
252
+ "model.layers.15.self_attn.k_proj.weight": "model-00009-of-00019.safetensors",
253
+ "model.layers.15.self_attn.o_proj.weight": "model-00009-of-00019.safetensors",
254
+ "model.layers.15.self_attn.q_proj.weight": "model-00009-of-00019.safetensors",
255
+ "model.layers.15.self_attn.v_proj.weight": "model-00009-of-00019.safetensors",
256
+ "model.layers.16.block_sparse_moe.experts.0.w1.weight": "model-00010-of-00019.safetensors",
257
+ "model.layers.16.block_sparse_moe.experts.0.w2.weight": "model-00010-of-00019.safetensors",
258
+ "model.layers.16.block_sparse_moe.experts.0.w3.weight": "model-00010-of-00019.safetensors",
259
+ "model.layers.16.block_sparse_moe.experts.1.w1.weight": "model-00010-of-00019.safetensors",
260
+ "model.layers.16.block_sparse_moe.experts.1.w2.weight": "model-00010-of-00019.safetensors",
261
+ "model.layers.16.block_sparse_moe.experts.1.w3.weight": "model-00010-of-00019.safetensors",
262
+ "model.layers.16.block_sparse_moe.experts.2.w1.weight": "model-00010-of-00019.safetensors",
263
+ "model.layers.16.block_sparse_moe.experts.2.w2.weight": "model-00010-of-00019.safetensors",
264
+ "model.layers.16.block_sparse_moe.experts.2.w3.weight": "model-00010-of-00019.safetensors",
265
+ "model.layers.16.block_sparse_moe.experts.3.w1.weight": "model-00010-of-00019.safetensors",
266
+ "model.layers.16.block_sparse_moe.experts.3.w2.weight": "model-00010-of-00019.safetensors",
267
+ "model.layers.16.block_sparse_moe.experts.3.w3.weight": "model-00010-of-00019.safetensors",
268
+ "model.layers.16.block_sparse_moe.experts.4.w1.weight": "model-00010-of-00019.safetensors",
269
+ "model.layers.16.block_sparse_moe.experts.4.w2.weight": "model-00010-of-00019.safetensors",
270
+ "model.layers.16.block_sparse_moe.experts.4.w3.weight": "model-00010-of-00019.safetensors",
271
+ "model.layers.16.block_sparse_moe.experts.5.w1.weight": "model-00010-of-00019.safetensors",
272
+ "model.layers.16.block_sparse_moe.experts.5.w2.weight": "model-00010-of-00019.safetensors",
273
+ "model.layers.16.block_sparse_moe.experts.5.w3.weight": "model-00010-of-00019.safetensors",
274
+ "model.layers.16.block_sparse_moe.experts.6.w1.weight": "model-00010-of-00019.safetensors",
275
+ "model.layers.16.block_sparse_moe.experts.6.w2.weight": "model-00010-of-00019.safetensors",
276
+ "model.layers.16.block_sparse_moe.experts.6.w3.weight": "model-00010-of-00019.safetensors",
277
+ "model.layers.16.block_sparse_moe.experts.7.w1.weight": "model-00010-of-00019.safetensors",
278
+ "model.layers.16.block_sparse_moe.experts.7.w2.weight": "model-00010-of-00019.safetensors",
279
+ "model.layers.16.block_sparse_moe.experts.7.w3.weight": "model-00011-of-00019.safetensors",
280
+ "model.layers.16.block_sparse_moe.gate.weight": "model-00010-of-00019.safetensors",
281
+ "model.layers.16.input_layernorm.weight": "model-00011-of-00019.safetensors",
282
+ "model.layers.16.post_attention_layernorm.weight": "model-00011-of-00019.safetensors",
283
+ "model.layers.16.self_attn.k_proj.weight": "model-00010-of-00019.safetensors",
284
+ "model.layers.16.self_attn.o_proj.weight": "model-00010-of-00019.safetensors",
285
+ "model.layers.16.self_attn.q_proj.weight": "model-00010-of-00019.safetensors",
286
+ "model.layers.16.self_attn.v_proj.weight": "model-00010-of-00019.safetensors",
287
+ "model.layers.17.block_sparse_moe.experts.0.w1.weight": "model-00011-of-00019.safetensors",
288
+ "model.layers.17.block_sparse_moe.experts.0.w2.weight": "model-00011-of-00019.safetensors",
289
+ "model.layers.17.block_sparse_moe.experts.0.w3.weight": "model-00011-of-00019.safetensors",
290
+ "model.layers.17.block_sparse_moe.experts.1.w1.weight": "model-00011-of-00019.safetensors",
291
+ "model.layers.17.block_sparse_moe.experts.1.w2.weight": "model-00011-of-00019.safetensors",
292
+ "model.layers.17.block_sparse_moe.experts.1.w3.weight": "model-00011-of-00019.safetensors",
293
+ "model.layers.17.block_sparse_moe.experts.2.w1.weight": "model-00011-of-00019.safetensors",
294
+ "model.layers.17.block_sparse_moe.experts.2.w2.weight": "model-00011-of-00019.safetensors",
295
+ "model.layers.17.block_sparse_moe.experts.2.w3.weight": "model-00011-of-00019.safetensors",
296
+ "model.layers.17.block_sparse_moe.experts.3.w1.weight": "model-00011-of-00019.safetensors",
297
+ "model.layers.17.block_sparse_moe.experts.3.w2.weight": "model-00011-of-00019.safetensors",
298
+ "model.layers.17.block_sparse_moe.experts.3.w3.weight": "model-00011-of-00019.safetensors",
299
+ "model.layers.17.block_sparse_moe.experts.4.w1.weight": "model-00011-of-00019.safetensors",
300
+ "model.layers.17.block_sparse_moe.experts.4.w2.weight": "model-00011-of-00019.safetensors",
301
+ "model.layers.17.block_sparse_moe.experts.4.w3.weight": "model-00011-of-00019.safetensors",
302
+ "model.layers.17.block_sparse_moe.experts.5.w1.weight": "model-00011-of-00019.safetensors",
303
+ "model.layers.17.block_sparse_moe.experts.5.w2.weight": "model-00011-of-00019.safetensors",
304
+ "model.layers.17.block_sparse_moe.experts.5.w3.weight": "model-00011-of-00019.safetensors",
305
+ "model.layers.17.block_sparse_moe.experts.6.w1.weight": "model-00011-of-00019.safetensors",
306
+ "model.layers.17.block_sparse_moe.experts.6.w2.weight": "model-00011-of-00019.safetensors",
307
+ "model.layers.17.block_sparse_moe.experts.6.w3.weight": "model-00011-of-00019.safetensors",
308
+ "model.layers.17.block_sparse_moe.experts.7.w1.weight": "model-00011-of-00019.safetensors",
309
+ "model.layers.17.block_sparse_moe.experts.7.w2.weight": "model-00011-of-00019.safetensors",
310
+ "model.layers.17.block_sparse_moe.experts.7.w3.weight": "model-00011-of-00019.safetensors",
311
+ "model.layers.17.block_sparse_moe.gate.weight": "model-00011-of-00019.safetensors",
312
+ "model.layers.17.input_layernorm.weight": "model-00011-of-00019.safetensors",
313
+ "model.layers.17.post_attention_layernorm.weight": "model-00011-of-00019.safetensors",
314
+ "model.layers.17.self_attn.k_proj.weight": "model-00011-of-00019.safetensors",
315
+ "model.layers.17.self_attn.o_proj.weight": "model-00011-of-00019.safetensors",
316
+ "model.layers.17.self_attn.q_proj.weight": "model-00011-of-00019.safetensors",
317
+ "model.layers.17.self_attn.v_proj.weight": "model-00011-of-00019.safetensors",
318
+ "model.layers.18.block_sparse_moe.experts.0.w1.weight": "model-00011-of-00019.safetensors",
319
+ "model.layers.18.block_sparse_moe.experts.0.w2.weight": "model-00011-of-00019.safetensors",
320
+ "model.layers.18.block_sparse_moe.experts.0.w3.weight": "model-00011-of-00019.safetensors",
321
+ "model.layers.18.block_sparse_moe.experts.1.w1.weight": "model-00011-of-00019.safetensors",
322
+ "model.layers.18.block_sparse_moe.experts.1.w2.weight": "model-00011-of-00019.safetensors",
323
+ "model.layers.18.block_sparse_moe.experts.1.w3.weight": "model-00011-of-00019.safetensors",
324
+ "model.layers.18.block_sparse_moe.experts.2.w1.weight": "model-00011-of-00019.safetensors",
325
+ "model.layers.18.block_sparse_moe.experts.2.w2.weight": "model-00011-of-00019.safetensors",
326
+ "model.layers.18.block_sparse_moe.experts.2.w3.weight": "model-00011-of-00019.safetensors",
327
+ "model.layers.18.block_sparse_moe.experts.3.w1.weight": "model-00011-of-00019.safetensors",
328
+ "model.layers.18.block_sparse_moe.experts.3.w2.weight": "model-00011-of-00019.safetensors",
329
+ "model.layers.18.block_sparse_moe.experts.3.w3.weight": "model-00011-of-00019.safetensors",
330
+ "model.layers.18.block_sparse_moe.experts.4.w1.weight": "model-00011-of-00019.safetensors",
331
+ "model.layers.18.block_sparse_moe.experts.4.w2.weight": "model-00011-of-00019.safetensors",
332
+ "model.layers.18.block_sparse_moe.experts.4.w3.weight": "model-00011-of-00019.safetensors",
333
+ "model.layers.18.block_sparse_moe.experts.5.w1.weight": "model-00011-of-00019.safetensors",
334
+ "model.layers.18.block_sparse_moe.experts.5.w2.weight": "model-00012-of-00019.safetensors",
335
+ "model.layers.18.block_sparse_moe.experts.5.w3.weight": "model-00012-of-00019.safetensors",
336
+ "model.layers.18.block_sparse_moe.experts.6.w1.weight": "model-00012-of-00019.safetensors",
337
+ "model.layers.18.block_sparse_moe.experts.6.w2.weight": "model-00012-of-00019.safetensors",
338
+ "model.layers.18.block_sparse_moe.experts.6.w3.weight": "model-00012-of-00019.safetensors",
339
+ "model.layers.18.block_sparse_moe.experts.7.w1.weight": "model-00012-of-00019.safetensors",
340
+ "model.layers.18.block_sparse_moe.experts.7.w2.weight": "model-00012-of-00019.safetensors",
341
+ "model.layers.18.block_sparse_moe.experts.7.w3.weight": "model-00012-of-00019.safetensors",
342
+ "model.layers.18.block_sparse_moe.gate.weight": "model-00011-of-00019.safetensors",
343
+ "model.layers.18.input_layernorm.weight": "model-00012-of-00019.safetensors",
344
+ "model.layers.18.post_attention_layernorm.weight": "model-00012-of-00019.safetensors",
345
+ "model.layers.18.self_attn.k_proj.weight": "model-00011-of-00019.safetensors",
346
+ "model.layers.18.self_attn.o_proj.weight": "model-00011-of-00019.safetensors",
347
+ "model.layers.18.self_attn.q_proj.weight": "model-00011-of-00019.safetensors",
348
+ "model.layers.18.self_attn.v_proj.weight": "model-00011-of-00019.safetensors",
349
+ "model.layers.19.block_sparse_moe.experts.0.w1.weight": "model-00012-of-00019.safetensors",
350
+ "model.layers.19.block_sparse_moe.experts.0.w2.weight": "model-00012-of-00019.safetensors",
351
+ "model.layers.19.block_sparse_moe.experts.0.w3.weight": "model-00012-of-00019.safetensors",
352
+ "model.layers.19.block_sparse_moe.experts.1.w1.weight": "model-00012-of-00019.safetensors",
353
+ "model.layers.19.block_sparse_moe.experts.1.w2.weight": "model-00012-of-00019.safetensors",
354
+ "model.layers.19.block_sparse_moe.experts.1.w3.weight": "model-00012-of-00019.safetensors",
355
+ "model.layers.19.block_sparse_moe.experts.2.w1.weight": "model-00012-of-00019.safetensors",
356
+ "model.layers.19.block_sparse_moe.experts.2.w2.weight": "model-00012-of-00019.safetensors",
357
+ "model.layers.19.block_sparse_moe.experts.2.w3.weight": "model-00012-of-00019.safetensors",
358
+ "model.layers.19.block_sparse_moe.experts.3.w1.weight": "model-00012-of-00019.safetensors",
359
+ "model.layers.19.block_sparse_moe.experts.3.w2.weight": "model-00012-of-00019.safetensors",
360
+ "model.layers.19.block_sparse_moe.experts.3.w3.weight": "model-00012-of-00019.safetensors",
361
+ "model.layers.19.block_sparse_moe.experts.4.w1.weight": "model-00012-of-00019.safetensors",
362
+ "model.layers.19.block_sparse_moe.experts.4.w2.weight": "model-00012-of-00019.safetensors",
363
+ "model.layers.19.block_sparse_moe.experts.4.w3.weight": "model-00012-of-00019.safetensors",
364
+ "model.layers.19.block_sparse_moe.experts.5.w1.weight": "model-00012-of-00019.safetensors",
365
+ "model.layers.19.block_sparse_moe.experts.5.w2.weight": "model-00012-of-00019.safetensors",
366
+ "model.layers.19.block_sparse_moe.experts.5.w3.weight": "model-00012-of-00019.safetensors",
367
+ "model.layers.19.block_sparse_moe.experts.6.w1.weight": "model-00012-of-00019.safetensors",
368
+ "model.layers.19.block_sparse_moe.experts.6.w2.weight": "model-00012-of-00019.safetensors",
369
+ "model.layers.19.block_sparse_moe.experts.6.w3.weight": "model-00012-of-00019.safetensors",
370
+ "model.layers.19.block_sparse_moe.experts.7.w1.weight": "model-00012-of-00019.safetensors",
371
+ "model.layers.19.block_sparse_moe.experts.7.w2.weight": "model-00012-of-00019.safetensors",
372
+ "model.layers.19.block_sparse_moe.experts.7.w3.weight": "model-00012-of-00019.safetensors",
373
+ "model.layers.19.block_sparse_moe.gate.weight": "model-00012-of-00019.safetensors",
374
+ "model.layers.19.input_layernorm.weight": "model-00012-of-00019.safetensors",
375
+ "model.layers.19.post_attention_layernorm.weight": "model-00012-of-00019.safetensors",
376
+ "model.layers.19.self_attn.k_proj.weight": "model-00012-of-00019.safetensors",
377
+ "model.layers.19.self_attn.o_proj.weight": "model-00012-of-00019.safetensors",
378
+ "model.layers.19.self_attn.q_proj.weight": "model-00012-of-00019.safetensors",
379
+ "model.layers.19.self_attn.v_proj.weight": "model-00012-of-00019.safetensors",
380
+ "model.layers.2.block_sparse_moe.experts.0.w1.weight": "model-00002-of-00019.safetensors",
381
+ "model.layers.2.block_sparse_moe.experts.0.w2.weight": "model-00002-of-00019.safetensors",
382
+ "model.layers.2.block_sparse_moe.experts.0.w3.weight": "model-00002-of-00019.safetensors",
383
+ "model.layers.2.block_sparse_moe.experts.1.w1.weight": "model-00002-of-00019.safetensors",
384
+ "model.layers.2.block_sparse_moe.experts.1.w2.weight": "model-00002-of-00019.safetensors",
385
+ "model.layers.2.block_sparse_moe.experts.1.w3.weight": "model-00002-of-00019.safetensors",
386
+ "model.layers.2.block_sparse_moe.experts.2.w1.weight": "model-00002-of-00019.safetensors",
387
+ "model.layers.2.block_sparse_moe.experts.2.w2.weight": "model-00002-of-00019.safetensors",
388
+ "model.layers.2.block_sparse_moe.experts.2.w3.weight": "model-00002-of-00019.safetensors",
389
+ "model.layers.2.block_sparse_moe.experts.3.w1.weight": "model-00002-of-00019.safetensors",
390
+ "model.layers.2.block_sparse_moe.experts.3.w2.weight": "model-00002-of-00019.safetensors",
391
+ "model.layers.2.block_sparse_moe.experts.3.w3.weight": "model-00002-of-00019.safetensors",
392
+ "model.layers.2.block_sparse_moe.experts.4.w1.weight": "model-00002-of-00019.safetensors",
393
+ "model.layers.2.block_sparse_moe.experts.4.w2.weight": "model-00002-of-00019.safetensors",
394
+ "model.layers.2.block_sparse_moe.experts.4.w3.weight": "model-00002-of-00019.safetensors",
395
+ "model.layers.2.block_sparse_moe.experts.5.w1.weight": "model-00002-of-00019.safetensors",
396
+ "model.layers.2.block_sparse_moe.experts.5.w2.weight": "model-00002-of-00019.safetensors",
397
+ "model.layers.2.block_sparse_moe.experts.5.w3.weight": "model-00002-of-00019.safetensors",
398
+ "model.layers.2.block_sparse_moe.experts.6.w1.weight": "model-00002-of-00019.safetensors",
399
+ "model.layers.2.block_sparse_moe.experts.6.w2.weight": "model-00002-of-00019.safetensors",
400
+ "model.layers.2.block_sparse_moe.experts.6.w3.weight": "model-00002-of-00019.safetensors",
401
+ "model.layers.2.block_sparse_moe.experts.7.w1.weight": "model-00002-of-00019.safetensors",
402
+ "model.layers.2.block_sparse_moe.experts.7.w2.weight": "model-00002-of-00019.safetensors",
403
+ "model.layers.2.block_sparse_moe.experts.7.w3.weight": "model-00002-of-00019.safetensors",
404
+ "model.layers.2.block_sparse_moe.gate.weight": "model-00002-of-00019.safetensors",
405
+ "model.layers.2.input_layernorm.weight": "model-00002-of-00019.safetensors",
406
+ "model.layers.2.post_attention_layernorm.weight": "model-00002-of-00019.safetensors",
407
+ "model.layers.2.self_attn.k_proj.weight": "model-00002-of-00019.safetensors",
408
+ "model.layers.2.self_attn.o_proj.weight": "model-00002-of-00019.safetensors",
409
+ "model.layers.2.self_attn.q_proj.weight": "model-00002-of-00019.safetensors",
410
+ "model.layers.2.self_attn.v_proj.weight": "model-00002-of-00019.safetensors",
411
+ "model.layers.20.block_sparse_moe.experts.0.w1.weight": "model-00012-of-00019.safetensors",
412
+ "model.layers.20.block_sparse_moe.experts.0.w2.weight": "model-00012-of-00019.safetensors",
413
+ "model.layers.20.block_sparse_moe.experts.0.w3.weight": "model-00012-of-00019.safetensors",
414
+ "model.layers.20.block_sparse_moe.experts.1.w1.weight": "model-00012-of-00019.safetensors",
415
+ "model.layers.20.block_sparse_moe.experts.1.w2.weight": "model-00012-of-00019.safetensors",
416
+ "model.layers.20.block_sparse_moe.experts.1.w3.weight": "model-00012-of-00019.safetensors",
417
+ "model.layers.20.block_sparse_moe.experts.2.w1.weight": "model-00012-of-00019.safetensors",
418
+ "model.layers.20.block_sparse_moe.experts.2.w2.weight": "model-00012-of-00019.safetensors",
419
+ "model.layers.20.block_sparse_moe.experts.2.w3.weight": "model-00012-of-00019.safetensors",
420
+ "model.layers.20.block_sparse_moe.experts.3.w1.weight": "model-00013-of-00019.safetensors",
421
+ "model.layers.20.block_sparse_moe.experts.3.w2.weight": "model-00013-of-00019.safetensors",
422
+ "model.layers.20.block_sparse_moe.experts.3.w3.weight": "model-00013-of-00019.safetensors",
423
+ "model.layers.20.block_sparse_moe.experts.4.w1.weight": "model-00013-of-00019.safetensors",
424
+ "model.layers.20.block_sparse_moe.experts.4.w2.weight": "model-00013-of-00019.safetensors",
425
+ "model.layers.20.block_sparse_moe.experts.4.w3.weight": "model-00013-of-00019.safetensors",
426
+ "model.layers.20.block_sparse_moe.experts.5.w1.weight": "model-00013-of-00019.safetensors",
427
+ "model.layers.20.block_sparse_moe.experts.5.w2.weight": "model-00013-of-00019.safetensors",
428
+ "model.layers.20.block_sparse_moe.experts.5.w3.weight": "model-00013-of-00019.safetensors",
429
+ "model.layers.20.block_sparse_moe.experts.6.w1.weight": "model-00013-of-00019.safetensors",
430
+ "model.layers.20.block_sparse_moe.experts.6.w2.weight": "model-00013-of-00019.safetensors",
431
+ "model.layers.20.block_sparse_moe.experts.6.w3.weight": "model-00013-of-00019.safetensors",
432
+ "model.layers.20.block_sparse_moe.experts.7.w1.weight": "model-00013-of-00019.safetensors",
433
+ "model.layers.20.block_sparse_moe.experts.7.w2.weight": "model-00013-of-00019.safetensors",
434
+ "model.layers.20.block_sparse_moe.experts.7.w3.weight": "model-00013-of-00019.safetensors",
435
+ "model.layers.20.block_sparse_moe.gate.weight": "model-00012-of-00019.safetensors",
436
+ "model.layers.20.input_layernorm.weight": "model-00013-of-00019.safetensors",
437
+ "model.layers.20.post_attention_layernorm.weight": "model-00013-of-00019.safetensors",
438
+ "model.layers.20.self_attn.k_proj.weight": "model-00012-of-00019.safetensors",
439
+ "model.layers.20.self_attn.o_proj.weight": "model-00012-of-00019.safetensors",
440
+ "model.layers.20.self_attn.q_proj.weight": "model-00012-of-00019.safetensors",
441
+ "model.layers.20.self_attn.v_proj.weight": "model-00012-of-00019.safetensors",
442
+ "model.layers.21.block_sparse_moe.experts.0.w1.weight": "model-00013-of-00019.safetensors",
443
+ "model.layers.21.block_sparse_moe.experts.0.w2.weight": "model-00013-of-00019.safetensors",
444
+ "model.layers.21.block_sparse_moe.experts.0.w3.weight": "model-00013-of-00019.safetensors",
445
+ "model.layers.21.block_sparse_moe.experts.1.w1.weight": "model-00013-of-00019.safetensors",
446
+ "model.layers.21.block_sparse_moe.experts.1.w2.weight": "model-00013-of-00019.safetensors",
447
+ "model.layers.21.block_sparse_moe.experts.1.w3.weight": "model-00013-of-00019.safetensors",
448
+ "model.layers.21.block_sparse_moe.experts.2.w1.weight": "model-00013-of-00019.safetensors",
449
+ "model.layers.21.block_sparse_moe.experts.2.w2.weight": "model-00013-of-00019.safetensors",
450
+ "model.layers.21.block_sparse_moe.experts.2.w3.weight": "model-00013-of-00019.safetensors",
451
+ "model.layers.21.block_sparse_moe.experts.3.w1.weight": "model-00013-of-00019.safetensors",
452
+ "model.layers.21.block_sparse_moe.experts.3.w2.weight": "model-00013-of-00019.safetensors",
453
+ "model.layers.21.block_sparse_moe.experts.3.w3.weight": "model-00013-of-00019.safetensors",
454
+ "model.layers.21.block_sparse_moe.experts.4.w1.weight": "model-00013-of-00019.safetensors",
455
+ "model.layers.21.block_sparse_moe.experts.4.w2.weight": "model-00013-of-00019.safetensors",
456
+ "model.layers.21.block_sparse_moe.experts.4.w3.weight": "model-00013-of-00019.safetensors",
457
+ "model.layers.21.block_sparse_moe.experts.5.w1.weight": "model-00013-of-00019.safetensors",
458
+ "model.layers.21.block_sparse_moe.experts.5.w2.weight": "model-00013-of-00019.safetensors",
459
+ "model.layers.21.block_sparse_moe.experts.5.w3.weight": "model-00013-of-00019.safetensors",
460
+ "model.layers.21.block_sparse_moe.experts.6.w1.weight": "model-00013-of-00019.safetensors",
461
+ "model.layers.21.block_sparse_moe.experts.6.w2.weight": "model-00013-of-00019.safetensors",
462
+ "model.layers.21.block_sparse_moe.experts.6.w3.weight": "model-00013-of-00019.safetensors",
463
+ "model.layers.21.block_sparse_moe.experts.7.w1.weight": "model-00013-of-00019.safetensors",
464
+ "model.layers.21.block_sparse_moe.experts.7.w2.weight": "model-00013-of-00019.safetensors",
465
+ "model.layers.21.block_sparse_moe.experts.7.w3.weight": "model-00013-of-00019.safetensors",
466
+ "model.layers.21.block_sparse_moe.gate.weight": "model-00013-of-00019.safetensors",
467
+ "model.layers.21.input_layernorm.weight": "model-00013-of-00019.safetensors",
468
+ "model.layers.21.post_attention_layernorm.weight": "model-00013-of-00019.safetensors",
469
+ "model.layers.21.self_attn.k_proj.weight": "model-00013-of-00019.safetensors",
470
+ "model.layers.21.self_attn.o_proj.weight": "model-00013-of-00019.safetensors",
471
+ "model.layers.21.self_attn.q_proj.weight": "model-00013-of-00019.safetensors",
472
+ "model.layers.21.self_attn.v_proj.weight": "model-00013-of-00019.safetensors",
473
+ "model.layers.22.block_sparse_moe.experts.0.w1.weight": "model-00013-of-00019.safetensors",
474
+ "model.layers.22.block_sparse_moe.experts.0.w2.weight": "model-00013-of-00019.safetensors",
475
+ "model.layers.22.block_sparse_moe.experts.0.w3.weight": "model-00014-of-00019.safetensors",
476
+ "model.layers.22.block_sparse_moe.experts.1.w1.weight": "model-00014-of-00019.safetensors",
477
+ "model.layers.22.block_sparse_moe.experts.1.w2.weight": "model-00014-of-00019.safetensors",
478
+ "model.layers.22.block_sparse_moe.experts.1.w3.weight": "model-00014-of-00019.safetensors",
479
+ "model.layers.22.block_sparse_moe.experts.2.w1.weight": "model-00014-of-00019.safetensors",
480
+ "model.layers.22.block_sparse_moe.experts.2.w2.weight": "model-00014-of-00019.safetensors",
481
+ "model.layers.22.block_sparse_moe.experts.2.w3.weight": "model-00014-of-00019.safetensors",
482
+ "model.layers.22.block_sparse_moe.experts.3.w1.weight": "model-00014-of-00019.safetensors",
483
+ "model.layers.22.block_sparse_moe.experts.3.w2.weight": "model-00014-of-00019.safetensors",
484
+ "model.layers.22.block_sparse_moe.experts.3.w3.weight": "model-00014-of-00019.safetensors",
485
+ "model.layers.22.block_sparse_moe.experts.4.w1.weight": "model-00014-of-00019.safetensors",
486
+ "model.layers.22.block_sparse_moe.experts.4.w2.weight": "model-00014-of-00019.safetensors",
487
+ "model.layers.22.block_sparse_moe.experts.4.w3.weight": "model-00014-of-00019.safetensors",
488
+ "model.layers.22.block_sparse_moe.experts.5.w1.weight": "model-00014-of-00019.safetensors",
489
+ "model.layers.22.block_sparse_moe.experts.5.w2.weight": "model-00014-of-00019.safetensors",
490
+ "model.layers.22.block_sparse_moe.experts.5.w3.weight": "model-00014-of-00019.safetensors",
491
+ "model.layers.22.block_sparse_moe.experts.6.w1.weight": "model-00014-of-00019.safetensors",
492
+ "model.layers.22.block_sparse_moe.experts.6.w2.weight": "model-00014-of-00019.safetensors",
493
+ "model.layers.22.block_sparse_moe.experts.6.w3.weight": "model-00014-of-00019.safetensors",
494
+ "model.layers.22.block_sparse_moe.experts.7.w1.weight": "model-00014-of-00019.safetensors",
495
+ "model.layers.22.block_sparse_moe.experts.7.w2.weight": "model-00014-of-00019.safetensors",
496
+ "model.layers.22.block_sparse_moe.experts.7.w3.weight": "model-00014-of-00019.safetensors",
497
+ "model.layers.22.block_sparse_moe.gate.weight": "model-00013-of-00019.safetensors",
498
+ "model.layers.22.input_layernorm.weight": "model-00014-of-00019.safetensors",
499
+ "model.layers.22.post_attention_layernorm.weight": "model-00014-of-00019.safetensors",
500
+ "model.layers.22.self_attn.k_proj.weight": "model-00013-of-00019.safetensors",
501
+ "model.layers.22.self_attn.o_proj.weight": "model-00013-of-00019.safetensors",
502
+ "model.layers.22.self_attn.q_proj.weight": "model-00013-of-00019.safetensors",
503
+ "model.layers.22.self_attn.v_proj.weight": "model-00013-of-00019.safetensors",
504
+ "model.layers.23.block_sparse_moe.experts.0.w1.weight": "model-00014-of-00019.safetensors",
505
+ "model.layers.23.block_sparse_moe.experts.0.w2.weight": "model-00014-of-00019.safetensors",
506
+ "model.layers.23.block_sparse_moe.experts.0.w3.weight": "model-00014-of-00019.safetensors",
507
+ "model.layers.23.block_sparse_moe.experts.1.w1.weight": "model-00014-of-00019.safetensors",
508
+ "model.layers.23.block_sparse_moe.experts.1.w2.weight": "model-00014-of-00019.safetensors",
509
+ "model.layers.23.block_sparse_moe.experts.1.w3.weight": "model-00014-of-00019.safetensors",
510
+ "model.layers.23.block_sparse_moe.experts.2.w1.weight": "model-00014-of-00019.safetensors",
511
+ "model.layers.23.block_sparse_moe.experts.2.w2.weight": "model-00014-of-00019.safetensors",
512
+ "model.layers.23.block_sparse_moe.experts.2.w3.weight": "model-00014-of-00019.safetensors",
513
+ "model.layers.23.block_sparse_moe.experts.3.w1.weight": "model-00014-of-00019.safetensors",
514
+ "model.layers.23.block_sparse_moe.experts.3.w2.weight": "model-00014-of-00019.safetensors",
515
+ "model.layers.23.block_sparse_moe.experts.3.w3.weight": "model-00014-of-00019.safetensors",
516
+ "model.layers.23.block_sparse_moe.experts.4.w1.weight": "model-00014-of-00019.safetensors",
517
+ "model.layers.23.block_sparse_moe.experts.4.w2.weight": "model-00014-of-00019.safetensors",
518
+ "model.layers.23.block_sparse_moe.experts.4.w3.weight": "model-00014-of-00019.safetensors",
519
+ "model.layers.23.block_sparse_moe.experts.5.w1.weight": "model-00014-of-00019.safetensors",
520
+ "model.layers.23.block_sparse_moe.experts.5.w2.weight": "model-00014-of-00019.safetensors",
521
+ "model.layers.23.block_sparse_moe.experts.5.w3.weight": "model-00014-of-00019.safetensors",
522
+ "model.layers.23.block_sparse_moe.experts.6.w1.weight": "model-00014-of-00019.safetensors",
523
+ "model.layers.23.block_sparse_moe.experts.6.w2.weight": "model-00015-of-00019.safetensors",
524
+ "model.layers.23.block_sparse_moe.experts.6.w3.weight": "model-00015-of-00019.safetensors",
525
+ "model.layers.23.block_sparse_moe.experts.7.w1.weight": "model-00015-of-00019.safetensors",
526
+ "model.layers.23.block_sparse_moe.experts.7.w2.weight": "model-00015-of-00019.safetensors",
527
+ "model.layers.23.block_sparse_moe.experts.7.w3.weight": "model-00015-of-00019.safetensors",
528
+ "model.layers.23.block_sparse_moe.gate.weight": "model-00014-of-00019.safetensors",
529
+ "model.layers.23.input_layernorm.weight": "model-00015-of-00019.safetensors",
530
+ "model.layers.23.post_attention_layernorm.weight": "model-00015-of-00019.safetensors",
531
+ "model.layers.23.self_attn.k_proj.weight": "model-00014-of-00019.safetensors",
532
+ "model.layers.23.self_attn.o_proj.weight": "model-00014-of-00019.safetensors",
533
+ "model.layers.23.self_attn.q_proj.weight": "model-00014-of-00019.safetensors",
534
+ "model.layers.23.self_attn.v_proj.weight": "model-00014-of-00019.safetensors",
535
+ "model.layers.24.block_sparse_moe.experts.0.w1.weight": "model-00015-of-00019.safetensors",
536
+ "model.layers.24.block_sparse_moe.experts.0.w2.weight": "model-00015-of-00019.safetensors",
537
+ "model.layers.24.block_sparse_moe.experts.0.w3.weight": "model-00015-of-00019.safetensors",
538
+ "model.layers.24.block_sparse_moe.experts.1.w1.weight": "model-00015-of-00019.safetensors",
539
+ "model.layers.24.block_sparse_moe.experts.1.w2.weight": "model-00015-of-00019.safetensors",
540
+ "model.layers.24.block_sparse_moe.experts.1.w3.weight": "model-00015-of-00019.safetensors",
541
+ "model.layers.24.block_sparse_moe.experts.2.w1.weight": "model-00015-of-00019.safetensors",
542
+ "model.layers.24.block_sparse_moe.experts.2.w2.weight": "model-00015-of-00019.safetensors",
543
+ "model.layers.24.block_sparse_moe.experts.2.w3.weight": "model-00015-of-00019.safetensors",
544
+ "model.layers.24.block_sparse_moe.experts.3.w1.weight": "model-00015-of-00019.safetensors",
545
+ "model.layers.24.block_sparse_moe.experts.3.w2.weight": "model-00015-of-00019.safetensors",
546
+ "model.layers.24.block_sparse_moe.experts.3.w3.weight": "model-00015-of-00019.safetensors",
547
+ "model.layers.24.block_sparse_moe.experts.4.w1.weight": "model-00015-of-00019.safetensors",
548
+ "model.layers.24.block_sparse_moe.experts.4.w2.weight": "model-00015-of-00019.safetensors",
549
+ "model.layers.24.block_sparse_moe.experts.4.w3.weight": "model-00015-of-00019.safetensors",
550
+ "model.layers.24.block_sparse_moe.experts.5.w1.weight": "model-00015-of-00019.safetensors",
551
+ "model.layers.24.block_sparse_moe.experts.5.w2.weight": "model-00015-of-00019.safetensors",
552
+ "model.layers.24.block_sparse_moe.experts.5.w3.weight": "model-00015-of-00019.safetensors",
553
+ "model.layers.24.block_sparse_moe.experts.6.w1.weight": "model-00015-of-00019.safetensors",
554
+ "model.layers.24.block_sparse_moe.experts.6.w2.weight": "model-00015-of-00019.safetensors",
555
+ "model.layers.24.block_sparse_moe.experts.6.w3.weight": "model-00015-of-00019.safetensors",
556
+ "model.layers.24.block_sparse_moe.experts.7.w1.weight": "model-00015-of-00019.safetensors",
557
+ "model.layers.24.block_sparse_moe.experts.7.w2.weight": "model-00015-of-00019.safetensors",
558
+ "model.layers.24.block_sparse_moe.experts.7.w3.weight": "model-00015-of-00019.safetensors",
559
+ "model.layers.24.block_sparse_moe.gate.weight": "model-00015-of-00019.safetensors",
560
+ "model.layers.24.input_layernorm.weight": "model-00015-of-00019.safetensors",
561
+ "model.layers.24.post_attention_layernorm.weight": "model-00015-of-00019.safetensors",
562
+ "model.layers.24.self_attn.k_proj.weight": "model-00015-of-00019.safetensors",
563
+ "model.layers.24.self_attn.o_proj.weight": "model-00015-of-00019.safetensors",
564
+ "model.layers.24.self_attn.q_proj.weight": "model-00015-of-00019.safetensors",
565
+ "model.layers.24.self_attn.v_proj.weight": "model-00015-of-00019.safetensors",
566
+ "model.layers.25.block_sparse_moe.experts.0.w1.weight": "model-00015-of-00019.safetensors",
567
+ "model.layers.25.block_sparse_moe.experts.0.w2.weight": "model-00015-of-00019.safetensors",
568
+ "model.layers.25.block_sparse_moe.experts.0.w3.weight": "model-00015-of-00019.safetensors",
569
+ "model.layers.25.block_sparse_moe.experts.1.w1.weight": "model-00015-of-00019.safetensors",
570
+ "model.layers.25.block_sparse_moe.experts.1.w2.weight": "model-00015-of-00019.safetensors",
571
+ "model.layers.25.block_sparse_moe.experts.1.w3.weight": "model-00015-of-00019.safetensors",
572
+ "model.layers.25.block_sparse_moe.experts.2.w1.weight": "model-00015-of-00019.safetensors",
573
+ "model.layers.25.block_sparse_moe.experts.2.w2.weight": "model-00015-of-00019.safetensors",
574
+ "model.layers.25.block_sparse_moe.experts.2.w3.weight": "model-00015-of-00019.safetensors",
575
+ "model.layers.25.block_sparse_moe.experts.3.w1.weight": "model-00015-of-00019.safetensors",
576
+ "model.layers.25.block_sparse_moe.experts.3.w2.weight": "model-00015-of-00019.safetensors",
577
+ "model.layers.25.block_sparse_moe.experts.3.w3.weight": "model-00015-of-00019.safetensors",
578
+ "model.layers.25.block_sparse_moe.experts.4.w1.weight": "model-00016-of-00019.safetensors",
579
+ "model.layers.25.block_sparse_moe.experts.4.w2.weight": "model-00016-of-00019.safetensors",
580
+ "model.layers.25.block_sparse_moe.experts.4.w3.weight": "model-00016-of-00019.safetensors",
581
+ "model.layers.25.block_sparse_moe.experts.5.w1.weight": "model-00016-of-00019.safetensors",
582
+ "model.layers.25.block_sparse_moe.experts.5.w2.weight": "model-00016-of-00019.safetensors",
583
+ "model.layers.25.block_sparse_moe.experts.5.w3.weight": "model-00016-of-00019.safetensors",
584
+ "model.layers.25.block_sparse_moe.experts.6.w1.weight": "model-00016-of-00019.safetensors",
585
+ "model.layers.25.block_sparse_moe.experts.6.w2.weight": "model-00016-of-00019.safetensors",
586
+ "model.layers.25.block_sparse_moe.experts.6.w3.weight": "model-00016-of-00019.safetensors",
587
+ "model.layers.25.block_sparse_moe.experts.7.w1.weight": "model-00016-of-00019.safetensors",
588
+ "model.layers.25.block_sparse_moe.experts.7.w2.weight": "model-00016-of-00019.safetensors",
589
+ "model.layers.25.block_sparse_moe.experts.7.w3.weight": "model-00016-of-00019.safetensors",
590
+ "model.layers.25.block_sparse_moe.gate.weight": "model-00015-of-00019.safetensors",
591
+ "model.layers.25.input_layernorm.weight": "model-00016-of-00019.safetensors",
592
+ "model.layers.25.post_attention_layernorm.weight": "model-00016-of-00019.safetensors",
593
+ "model.layers.25.self_attn.k_proj.weight": "model-00015-of-00019.safetensors",
594
+ "model.layers.25.self_attn.o_proj.weight": "model-00015-of-00019.safetensors",
595
+ "model.layers.25.self_attn.q_proj.weight": "model-00015-of-00019.safetensors",
596
+ "model.layers.25.self_attn.v_proj.weight": "model-00015-of-00019.safetensors",
597
+ "model.layers.26.block_sparse_moe.experts.0.w1.weight": "model-00016-of-00019.safetensors",
598
+ "model.layers.26.block_sparse_moe.experts.0.w2.weight": "model-00016-of-00019.safetensors",
599
+ "model.layers.26.block_sparse_moe.experts.0.w3.weight": "model-00016-of-00019.safetensors",
600
+ "model.layers.26.block_sparse_moe.experts.1.w1.weight": "model-00016-of-00019.safetensors",
601
+ "model.layers.26.block_sparse_moe.experts.1.w2.weight": "model-00016-of-00019.safetensors",
602
+ "model.layers.26.block_sparse_moe.experts.1.w3.weight": "model-00016-of-00019.safetensors",
603
+ "model.layers.26.block_sparse_moe.experts.2.w1.weight": "model-00016-of-00019.safetensors",
604
+ "model.layers.26.block_sparse_moe.experts.2.w2.weight": "model-00016-of-00019.safetensors",
605
+ "model.layers.26.block_sparse_moe.experts.2.w3.weight": "model-00016-of-00019.safetensors",
606
+ "model.layers.26.block_sparse_moe.experts.3.w1.weight": "model-00016-of-00019.safetensors",
607
+ "model.layers.26.block_sparse_moe.experts.3.w2.weight": "model-00016-of-00019.safetensors",
608
+ "model.layers.26.block_sparse_moe.experts.3.w3.weight": "model-00016-of-00019.safetensors",
609
+ "model.layers.26.block_sparse_moe.experts.4.w1.weight": "model-00016-of-00019.safetensors",
610
+ "model.layers.26.block_sparse_moe.experts.4.w2.weight": "model-00016-of-00019.safetensors",
611
+ "model.layers.26.block_sparse_moe.experts.4.w3.weight": "model-00016-of-00019.safetensors",
612
+ "model.layers.26.block_sparse_moe.experts.5.w1.weight": "model-00016-of-00019.safetensors",
613
+ "model.layers.26.block_sparse_moe.experts.5.w2.weight": "model-00016-of-00019.safetensors",
614
+ "model.layers.26.block_sparse_moe.experts.5.w3.weight": "model-00016-of-00019.safetensors",
615
+ "model.layers.26.block_sparse_moe.experts.6.w1.weight": "model-00016-of-00019.safetensors",
616
+ "model.layers.26.block_sparse_moe.experts.6.w2.weight": "model-00016-of-00019.safetensors",
617
+ "model.layers.26.block_sparse_moe.experts.6.w3.weight": "model-00016-of-00019.safetensors",
618
+ "model.layers.26.block_sparse_moe.experts.7.w1.weight": "model-00016-of-00019.safetensors",
619
+ "model.layers.26.block_sparse_moe.experts.7.w2.weight": "model-00016-of-00019.safetensors",
620
+ "model.layers.26.block_sparse_moe.experts.7.w3.weight": "model-00016-of-00019.safetensors",
621
+ "model.layers.26.block_sparse_moe.gate.weight": "model-00016-of-00019.safetensors",
622
+ "model.layers.26.input_layernorm.weight": "model-00016-of-00019.safetensors",
623
+ "model.layers.26.post_attention_layernorm.weight": "model-00016-of-00019.safetensors",
624
+ "model.layers.26.self_attn.k_proj.weight": "model-00016-of-00019.safetensors",
625
+ "model.layers.26.self_attn.o_proj.weight": "model-00016-of-00019.safetensors",
626
+ "model.layers.26.self_attn.q_proj.weight": "model-00016-of-00019.safetensors",
627
+ "model.layers.26.self_attn.v_proj.weight": "model-00016-of-00019.safetensors",
628
+ "model.layers.27.block_sparse_moe.experts.0.w1.weight": "model-00016-of-00019.safetensors",
629
+ "model.layers.27.block_sparse_moe.experts.0.w2.weight": "model-00016-of-00019.safetensors",
630
+ "model.layers.27.block_sparse_moe.experts.0.w3.weight": "model-00016-of-00019.safetensors",
631
+ "model.layers.27.block_sparse_moe.experts.1.w1.weight": "model-00016-of-00019.safetensors",
632
+ "model.layers.27.block_sparse_moe.experts.1.w2.weight": "model-00016-of-00019.safetensors",
633
+ "model.layers.27.block_sparse_moe.experts.1.w3.weight": "model-00017-of-00019.safetensors",
634
+ "model.layers.27.block_sparse_moe.experts.2.w1.weight": "model-00017-of-00019.safetensors",
635
+ "model.layers.27.block_sparse_moe.experts.2.w2.weight": "model-00017-of-00019.safetensors",
636
+ "model.layers.27.block_sparse_moe.experts.2.w3.weight": "model-00017-of-00019.safetensors",
637
+ "model.layers.27.block_sparse_moe.experts.3.w1.weight": "model-00017-of-00019.safetensors",
638
+ "model.layers.27.block_sparse_moe.experts.3.w2.weight": "model-00017-of-00019.safetensors",
639
+ "model.layers.27.block_sparse_moe.experts.3.w3.weight": "model-00017-of-00019.safetensors",
640
+ "model.layers.27.block_sparse_moe.experts.4.w1.weight": "model-00017-of-00019.safetensors",
641
+ "model.layers.27.block_sparse_moe.experts.4.w2.weight": "model-00017-of-00019.safetensors",
642
+ "model.layers.27.block_sparse_moe.experts.4.w3.weight": "model-00017-of-00019.safetensors",
643
+ "model.layers.27.block_sparse_moe.experts.5.w1.weight": "model-00017-of-00019.safetensors",
644
+ "model.layers.27.block_sparse_moe.experts.5.w2.weight": "model-00017-of-00019.safetensors",
645
+ "model.layers.27.block_sparse_moe.experts.5.w3.weight": "model-00017-of-00019.safetensors",
646
+ "model.layers.27.block_sparse_moe.experts.6.w1.weight": "model-00017-of-00019.safetensors",
647
+ "model.layers.27.block_sparse_moe.experts.6.w2.weight": "model-00017-of-00019.safetensors",
648
+ "model.layers.27.block_sparse_moe.experts.6.w3.weight": "model-00017-of-00019.safetensors",
649
+ "model.layers.27.block_sparse_moe.experts.7.w1.weight": "model-00017-of-00019.safetensors",
650
+ "model.layers.27.block_sparse_moe.experts.7.w2.weight": "model-00017-of-00019.safetensors",
651
+ "model.layers.27.block_sparse_moe.experts.7.w3.weight": "model-00017-of-00019.safetensors",
652
+ "model.layers.27.block_sparse_moe.gate.weight": "model-00016-of-00019.safetensors",
653
+ "model.layers.27.input_layernorm.weight": "model-00017-of-00019.safetensors",
654
+ "model.layers.27.post_attention_layernorm.weight": "model-00017-of-00019.safetensors",
655
+ "model.layers.27.self_attn.k_proj.weight": "model-00016-of-00019.safetensors",
656
+ "model.layers.27.self_attn.o_proj.weight": "model-00016-of-00019.safetensors",
657
+ "model.layers.27.self_attn.q_proj.weight": "model-00016-of-00019.safetensors",
658
+ "model.layers.27.self_attn.v_proj.weight": "model-00016-of-00019.safetensors",
659
+ "model.layers.28.block_sparse_moe.experts.0.w1.weight": "model-00017-of-00019.safetensors",
660
+ "model.layers.28.block_sparse_moe.experts.0.w2.weight": "model-00017-of-00019.safetensors",
661
+ "model.layers.28.block_sparse_moe.experts.0.w3.weight": "model-00017-of-00019.safetensors",
662
+ "model.layers.28.block_sparse_moe.experts.1.w1.weight": "model-00017-of-00019.safetensors",
663
+ "model.layers.28.block_sparse_moe.experts.1.w2.weight": "model-00017-of-00019.safetensors",
664
+ "model.layers.28.block_sparse_moe.experts.1.w3.weight": "model-00017-of-00019.safetensors",
665
+ "model.layers.28.block_sparse_moe.experts.2.w1.weight": "model-00017-of-00019.safetensors",
666
+ "model.layers.28.block_sparse_moe.experts.2.w2.weight": "model-00017-of-00019.safetensors",
667
+ "model.layers.28.block_sparse_moe.experts.2.w3.weight": "model-00017-of-00019.safetensors",
668
+ "model.layers.28.block_sparse_moe.experts.3.w1.weight": "model-00017-of-00019.safetensors",
669
+ "model.layers.28.block_sparse_moe.experts.3.w2.weight": "model-00017-of-00019.safetensors",
670
+ "model.layers.28.block_sparse_moe.experts.3.w3.weight": "model-00017-of-00019.safetensors",
671
+ "model.layers.28.block_sparse_moe.experts.4.w1.weight": "model-00017-of-00019.safetensors",
672
+ "model.layers.28.block_sparse_moe.experts.4.w2.weight": "model-00017-of-00019.safetensors",
673
+ "model.layers.28.block_sparse_moe.experts.4.w3.weight": "model-00017-of-00019.safetensors",
674
+ "model.layers.28.block_sparse_moe.experts.5.w1.weight": "model-00017-of-00019.safetensors",
675
+ "model.layers.28.block_sparse_moe.experts.5.w2.weight": "model-00017-of-00019.safetensors",
676
+ "model.layers.28.block_sparse_moe.experts.5.w3.weight": "model-00017-of-00019.safetensors",
677
+ "model.layers.28.block_sparse_moe.experts.6.w1.weight": "model-00017-of-00019.safetensors",
678
+ "model.layers.28.block_sparse_moe.experts.6.w2.weight": "model-00017-of-00019.safetensors",
679
+ "model.layers.28.block_sparse_moe.experts.6.w3.weight": "model-00017-of-00019.safetensors",
680
+ "model.layers.28.block_sparse_moe.experts.7.w1.weight": "model-00017-of-00019.safetensors",
681
+ "model.layers.28.block_sparse_moe.experts.7.w2.weight": "model-00018-of-00019.safetensors",
682
+ "model.layers.28.block_sparse_moe.experts.7.w3.weight": "model-00018-of-00019.safetensors",
683
+ "model.layers.28.block_sparse_moe.gate.weight": "model-00017-of-00019.safetensors",
684
+ "model.layers.28.input_layernorm.weight": "model-00018-of-00019.safetensors",
685
+ "model.layers.28.post_attention_layernorm.weight": "model-00018-of-00019.safetensors",
686
+ "model.layers.28.self_attn.k_proj.weight": "model-00017-of-00019.safetensors",
687
+ "model.layers.28.self_attn.o_proj.weight": "model-00017-of-00019.safetensors",
688
+ "model.layers.28.self_attn.q_proj.weight": "model-00017-of-00019.safetensors",
689
+ "model.layers.28.self_attn.v_proj.weight": "model-00017-of-00019.safetensors",
690
+ "model.layers.29.block_sparse_moe.experts.0.w1.weight": "model-00018-of-00019.safetensors",
691
+ "model.layers.29.block_sparse_moe.experts.0.w2.weight": "model-00018-of-00019.safetensors",
692
+ "model.layers.29.block_sparse_moe.experts.0.w3.weight": "model-00018-of-00019.safetensors",
693
+ "model.layers.29.block_sparse_moe.experts.1.w1.weight": "model-00018-of-00019.safetensors",
694
+ "model.layers.29.block_sparse_moe.experts.1.w2.weight": "model-00018-of-00019.safetensors",
695
+ "model.layers.29.block_sparse_moe.experts.1.w3.weight": "model-00018-of-00019.safetensors",
696
+ "model.layers.29.block_sparse_moe.experts.2.w1.weight": "model-00018-of-00019.safetensors",
697
+ "model.layers.29.block_sparse_moe.experts.2.w2.weight": "model-00018-of-00019.safetensors",
698
+ "model.layers.29.block_sparse_moe.experts.2.w3.weight": "model-00018-of-00019.safetensors",
699
+ "model.layers.29.block_sparse_moe.experts.3.w1.weight": "model-00018-of-00019.safetensors",
700
+ "model.layers.29.block_sparse_moe.experts.3.w2.weight": "model-00018-of-00019.safetensors",
701
+ "model.layers.29.block_sparse_moe.experts.3.w3.weight": "model-00018-of-00019.safetensors",
702
+ "model.layers.29.block_sparse_moe.experts.4.w1.weight": "model-00018-of-00019.safetensors",
703
+ "model.layers.29.block_sparse_moe.experts.4.w2.weight": "model-00018-of-00019.safetensors",
704
+ "model.layers.29.block_sparse_moe.experts.4.w3.weight": "model-00018-of-00019.safetensors",
705
+ "model.layers.29.block_sparse_moe.experts.5.w1.weight": "model-00018-of-00019.safetensors",
706
+ "model.layers.29.block_sparse_moe.experts.5.w2.weight": "model-00018-of-00019.safetensors",
707
+ "model.layers.29.block_sparse_moe.experts.5.w3.weight": "model-00018-of-00019.safetensors",
708
+ "model.layers.29.block_sparse_moe.experts.6.w1.weight": "model-00018-of-00019.safetensors",
709
+ "model.layers.29.block_sparse_moe.experts.6.w2.weight": "model-00018-of-00019.safetensors",
710
+ "model.layers.29.block_sparse_moe.experts.6.w3.weight": "model-00018-of-00019.safetensors",
711
+ "model.layers.29.block_sparse_moe.experts.7.w1.weight": "model-00018-of-00019.safetensors",
712
+ "model.layers.29.block_sparse_moe.experts.7.w2.weight": "model-00018-of-00019.safetensors",
713
+ "model.layers.29.block_sparse_moe.experts.7.w3.weight": "model-00018-of-00019.safetensors",
714
+ "model.layers.29.block_sparse_moe.gate.weight": "model-00018-of-00019.safetensors",
715
+ "model.layers.29.input_layernorm.weight": "model-00018-of-00019.safetensors",
716
+ "model.layers.29.post_attention_layernorm.weight": "model-00018-of-00019.safetensors",
717
+ "model.layers.29.self_attn.k_proj.weight": "model-00018-of-00019.safetensors",
718
+ "model.layers.29.self_attn.o_proj.weight": "model-00018-of-00019.safetensors",
719
+ "model.layers.29.self_attn.q_proj.weight": "model-00018-of-00019.safetensors",
720
+ "model.layers.29.self_attn.v_proj.weight": "model-00018-of-00019.safetensors",
721
+ "model.layers.3.block_sparse_moe.experts.0.w1.weight": "model-00002-of-00019.safetensors",
722
+ "model.layers.3.block_sparse_moe.experts.0.w2.weight": "model-00002-of-00019.safetensors",
723
+ "model.layers.3.block_sparse_moe.experts.0.w3.weight": "model-00002-of-00019.safetensors",
724
+ "model.layers.3.block_sparse_moe.experts.1.w1.weight": "model-00002-of-00019.safetensors",
725
+ "model.layers.3.block_sparse_moe.experts.1.w2.weight": "model-00002-of-00019.safetensors",
726
+ "model.layers.3.block_sparse_moe.experts.1.w3.weight": "model-00002-of-00019.safetensors",
727
+ "model.layers.3.block_sparse_moe.experts.2.w1.weight": "model-00002-of-00019.safetensors",
728
+ "model.layers.3.block_sparse_moe.experts.2.w2.weight": "model-00003-of-00019.safetensors",
729
+ "model.layers.3.block_sparse_moe.experts.2.w3.weight": "model-00003-of-00019.safetensors",
730
+ "model.layers.3.block_sparse_moe.experts.3.w1.weight": "model-00003-of-00019.safetensors",
731
+ "model.layers.3.block_sparse_moe.experts.3.w2.weight": "model-00003-of-00019.safetensors",
732
+ "model.layers.3.block_sparse_moe.experts.3.w3.weight": "model-00003-of-00019.safetensors",
733
+ "model.layers.3.block_sparse_moe.experts.4.w1.weight": "model-00003-of-00019.safetensors",
734
+ "model.layers.3.block_sparse_moe.experts.4.w2.weight": "model-00003-of-00019.safetensors",
735
+ "model.layers.3.block_sparse_moe.experts.4.w3.weight": "model-00003-of-00019.safetensors",
736
+ "model.layers.3.block_sparse_moe.experts.5.w1.weight": "model-00003-of-00019.safetensors",
737
+ "model.layers.3.block_sparse_moe.experts.5.w2.weight": "model-00003-of-00019.safetensors",
738
+ "model.layers.3.block_sparse_moe.experts.5.w3.weight": "model-00003-of-00019.safetensors",
739
+ "model.layers.3.block_sparse_moe.experts.6.w1.weight": "model-00003-of-00019.safetensors",
740
+ "model.layers.3.block_sparse_moe.experts.6.w2.weight": "model-00003-of-00019.safetensors",
741
+ "model.layers.3.block_sparse_moe.experts.6.w3.weight": "model-00003-of-00019.safetensors",
742
+ "model.layers.3.block_sparse_moe.experts.7.w1.weight": "model-00003-of-00019.safetensors",
743
+ "model.layers.3.block_sparse_moe.experts.7.w2.weight": "model-00003-of-00019.safetensors",
744
+ "model.layers.3.block_sparse_moe.experts.7.w3.weight": "model-00003-of-00019.safetensors",
745
+ "model.layers.3.block_sparse_moe.gate.weight": "model-00002-of-00019.safetensors",
746
+ "model.layers.3.input_layernorm.weight": "model-00003-of-00019.safetensors",
747
+ "model.layers.3.post_attention_layernorm.weight": "model-00003-of-00019.safetensors",
748
+ "model.layers.3.self_attn.k_proj.weight": "model-00002-of-00019.safetensors",
749
+ "model.layers.3.self_attn.o_proj.weight": "model-00002-of-00019.safetensors",
750
+ "model.layers.3.self_attn.q_proj.weight": "model-00002-of-00019.safetensors",
751
+ "model.layers.3.self_attn.v_proj.weight": "model-00002-of-00019.safetensors",
752
+ "model.layers.30.block_sparse_moe.experts.0.w1.weight": "model-00018-of-00019.safetensors",
753
+ "model.layers.30.block_sparse_moe.experts.0.w2.weight": "model-00018-of-00019.safetensors",
754
+ "model.layers.30.block_sparse_moe.experts.0.w3.weight": "model-00018-of-00019.safetensors",
755
+ "model.layers.30.block_sparse_moe.experts.1.w1.weight": "model-00018-of-00019.safetensors",
756
+ "model.layers.30.block_sparse_moe.experts.1.w2.weight": "model-00018-of-00019.safetensors",
757
+ "model.layers.30.block_sparse_moe.experts.1.w3.weight": "model-00018-of-00019.safetensors",
758
+ "model.layers.30.block_sparse_moe.experts.2.w1.weight": "model-00018-of-00019.safetensors",
759
+ "model.layers.30.block_sparse_moe.experts.2.w2.weight": "model-00018-of-00019.safetensors",
760
+ "model.layers.30.block_sparse_moe.experts.2.w3.weight": "model-00018-of-00019.safetensors",
761
+ "model.layers.30.block_sparse_moe.experts.3.w1.weight": "model-00018-of-00019.safetensors",
762
+ "model.layers.30.block_sparse_moe.experts.3.w2.weight": "model-00018-of-00019.safetensors",
763
+ "model.layers.30.block_sparse_moe.experts.3.w3.weight": "model-00018-of-00019.safetensors",
764
+ "model.layers.30.block_sparse_moe.experts.4.w1.weight": "model-00018-of-00019.safetensors",
765
+ "model.layers.30.block_sparse_moe.experts.4.w2.weight": "model-00018-of-00019.safetensors",
766
+ "model.layers.30.block_sparse_moe.experts.4.w3.weight": "model-00018-of-00019.safetensors",
767
+ "model.layers.30.block_sparse_moe.experts.5.w1.weight": "model-00019-of-00019.safetensors",
768
+ "model.layers.30.block_sparse_moe.experts.5.w2.weight": "model-00019-of-00019.safetensors",
769
+ "model.layers.30.block_sparse_moe.experts.5.w3.weight": "model-00019-of-00019.safetensors",
770
+ "model.layers.30.block_sparse_moe.experts.6.w1.weight": "model-00019-of-00019.safetensors",
771
+ "model.layers.30.block_sparse_moe.experts.6.w2.weight": "model-00019-of-00019.safetensors",
772
+ "model.layers.30.block_sparse_moe.experts.6.w3.weight": "model-00019-of-00019.safetensors",
773
+ "model.layers.30.block_sparse_moe.experts.7.w1.weight": "model-00019-of-00019.safetensors",
774
+ "model.layers.30.block_sparse_moe.experts.7.w2.weight": "model-00019-of-00019.safetensors",
775
+ "model.layers.30.block_sparse_moe.experts.7.w3.weight": "model-00019-of-00019.safetensors",
776
+ "model.layers.30.block_sparse_moe.gate.weight": "model-00018-of-00019.safetensors",
777
+ "model.layers.30.input_layernorm.weight": "model-00019-of-00019.safetensors",
778
+ "model.layers.30.post_attention_layernorm.weight": "model-00019-of-00019.safetensors",
779
+ "model.layers.30.self_attn.k_proj.weight": "model-00018-of-00019.safetensors",
780
+ "model.layers.30.self_attn.o_proj.weight": "model-00018-of-00019.safetensors",
781
+ "model.layers.30.self_attn.q_proj.weight": "model-00018-of-00019.safetensors",
782
+ "model.layers.30.self_attn.v_proj.weight": "model-00018-of-00019.safetensors",
783
+ "model.layers.31.block_sparse_moe.experts.0.w1.weight": "model-00019-of-00019.safetensors",
784
+ "model.layers.31.block_sparse_moe.experts.0.w2.weight": "model-00019-of-00019.safetensors",
785
+ "model.layers.31.block_sparse_moe.experts.0.w3.weight": "model-00019-of-00019.safetensors",
786
+ "model.layers.31.block_sparse_moe.experts.1.w1.weight": "model-00019-of-00019.safetensors",
787
+ "model.layers.31.block_sparse_moe.experts.1.w2.weight": "model-00019-of-00019.safetensors",
788
+ "model.layers.31.block_sparse_moe.experts.1.w3.weight": "model-00019-of-00019.safetensors",
789
+ "model.layers.31.block_sparse_moe.experts.2.w1.weight": "model-00019-of-00019.safetensors",
790
+ "model.layers.31.block_sparse_moe.experts.2.w2.weight": "model-00019-of-00019.safetensors",
791
+ "model.layers.31.block_sparse_moe.experts.2.w3.weight": "model-00019-of-00019.safetensors",
792
+ "model.layers.31.block_sparse_moe.experts.3.w1.weight": "model-00019-of-00019.safetensors",
793
+ "model.layers.31.block_sparse_moe.experts.3.w2.weight": "model-00019-of-00019.safetensors",
794
+ "model.layers.31.block_sparse_moe.experts.3.w3.weight": "model-00019-of-00019.safetensors",
795
+ "model.layers.31.block_sparse_moe.experts.4.w1.weight": "model-00019-of-00019.safetensors",
796
+ "model.layers.31.block_sparse_moe.experts.4.w2.weight": "model-00019-of-00019.safetensors",
797
+ "model.layers.31.block_sparse_moe.experts.4.w3.weight": "model-00019-of-00019.safetensors",
798
+ "model.layers.31.block_sparse_moe.experts.5.w1.weight": "model-00019-of-00019.safetensors",
799
+ "model.layers.31.block_sparse_moe.experts.5.w2.weight": "model-00019-of-00019.safetensors",
800
+ "model.layers.31.block_sparse_moe.experts.5.w3.weight": "model-00019-of-00019.safetensors",
801
+ "model.layers.31.block_sparse_moe.experts.6.w1.weight": "model-00019-of-00019.safetensors",
802
+ "model.layers.31.block_sparse_moe.experts.6.w2.weight": "model-00019-of-00019.safetensors",
803
+ "model.layers.31.block_sparse_moe.experts.6.w3.weight": "model-00019-of-00019.safetensors",
804
+ "model.layers.31.block_sparse_moe.experts.7.w1.weight": "model-00019-of-00019.safetensors",
805
+ "model.layers.31.block_sparse_moe.experts.7.w2.weight": "model-00019-of-00019.safetensors",
806
+ "model.layers.31.block_sparse_moe.experts.7.w3.weight": "model-00019-of-00019.safetensors",
807
+ "model.layers.31.block_sparse_moe.gate.weight": "model-00019-of-00019.safetensors",
808
+ "model.layers.31.input_layernorm.weight": "model-00019-of-00019.safetensors",
809
+ "model.layers.31.post_attention_layernorm.weight": "model-00019-of-00019.safetensors",
810
+ "model.layers.31.self_attn.k_proj.weight": "model-00019-of-00019.safetensors",
811
+ "model.layers.31.self_attn.o_proj.weight": "model-00019-of-00019.safetensors",
812
+ "model.layers.31.self_attn.q_proj.weight": "model-00019-of-00019.safetensors",
813
+ "model.layers.31.self_attn.v_proj.weight": "model-00019-of-00019.safetensors",
814
+ "model.layers.4.block_sparse_moe.experts.0.w1.weight": "model-00003-of-00019.safetensors",
815
+ "model.layers.4.block_sparse_moe.experts.0.w2.weight": "model-00003-of-00019.safetensors",
816
+ "model.layers.4.block_sparse_moe.experts.0.w3.weight": "model-00003-of-00019.safetensors",
817
+ "model.layers.4.block_sparse_moe.experts.1.w1.weight": "model-00003-of-00019.safetensors",
818
+ "model.layers.4.block_sparse_moe.experts.1.w2.weight": "model-00003-of-00019.safetensors",
819
+ "model.layers.4.block_sparse_moe.experts.1.w3.weight": "model-00003-of-00019.safetensors",
820
+ "model.layers.4.block_sparse_moe.experts.2.w1.weight": "model-00003-of-00019.safetensors",
821
+ "model.layers.4.block_sparse_moe.experts.2.w2.weight": "model-00003-of-00019.safetensors",
822
+ "model.layers.4.block_sparse_moe.experts.2.w3.weight": "model-00003-of-00019.safetensors",
823
+ "model.layers.4.block_sparse_moe.experts.3.w1.weight": "model-00003-of-00019.safetensors",
824
+ "model.layers.4.block_sparse_moe.experts.3.w2.weight": "model-00003-of-00019.safetensors",
825
+ "model.layers.4.block_sparse_moe.experts.3.w3.weight": "model-00003-of-00019.safetensors",
826
+ "model.layers.4.block_sparse_moe.experts.4.w1.weight": "model-00003-of-00019.safetensors",
827
+ "model.layers.4.block_sparse_moe.experts.4.w2.weight": "model-00003-of-00019.safetensors",
828
+ "model.layers.4.block_sparse_moe.experts.4.w3.weight": "model-00003-of-00019.safetensors",
829
+ "model.layers.4.block_sparse_moe.experts.5.w1.weight": "model-00003-of-00019.safetensors",
830
+ "model.layers.4.block_sparse_moe.experts.5.w2.weight": "model-00003-of-00019.safetensors",
831
+ "model.layers.4.block_sparse_moe.experts.5.w3.weight": "model-00003-of-00019.safetensors",
832
+ "model.layers.4.block_sparse_moe.experts.6.w1.weight": "model-00003-of-00019.safetensors",
833
+ "model.layers.4.block_sparse_moe.experts.6.w2.weight": "model-00003-of-00019.safetensors",
834
+ "model.layers.4.block_sparse_moe.experts.6.w3.weight": "model-00003-of-00019.safetensors",
835
+ "model.layers.4.block_sparse_moe.experts.7.w1.weight": "model-00003-of-00019.safetensors",
836
+ "model.layers.4.block_sparse_moe.experts.7.w2.weight": "model-00003-of-00019.safetensors",
837
+ "model.layers.4.block_sparse_moe.experts.7.w3.weight": "model-00003-of-00019.safetensors",
838
+ "model.layers.4.block_sparse_moe.gate.weight": "model-00003-of-00019.safetensors",
839
+ "model.layers.4.input_layernorm.weight": "model-00003-of-00019.safetensors",
840
+ "model.layers.4.post_attention_layernorm.weight": "model-00003-of-00019.safetensors",
841
+ "model.layers.4.self_attn.k_proj.weight": "model-00003-of-00019.safetensors",
842
+ "model.layers.4.self_attn.o_proj.weight": "model-00003-of-00019.safetensors",
843
+ "model.layers.4.self_attn.q_proj.weight": "model-00003-of-00019.safetensors",
844
+ "model.layers.4.self_attn.v_proj.weight": "model-00003-of-00019.safetensors",
845
+ "model.layers.5.block_sparse_moe.experts.0.w1.weight": "model-00004-of-00019.safetensors",
846
+ "model.layers.5.block_sparse_moe.experts.0.w2.weight": "model-00004-of-00019.safetensors",
847
+ "model.layers.5.block_sparse_moe.experts.0.w3.weight": "model-00004-of-00019.safetensors",
848
+ "model.layers.5.block_sparse_moe.experts.1.w1.weight": "model-00004-of-00019.safetensors",
849
+ "model.layers.5.block_sparse_moe.experts.1.w2.weight": "model-00004-of-00019.safetensors",
850
+ "model.layers.5.block_sparse_moe.experts.1.w3.weight": "model-00004-of-00019.safetensors",
851
+ "model.layers.5.block_sparse_moe.experts.2.w1.weight": "model-00004-of-00019.safetensors",
852
+ "model.layers.5.block_sparse_moe.experts.2.w2.weight": "model-00004-of-00019.safetensors",
853
+ "model.layers.5.block_sparse_moe.experts.2.w3.weight": "model-00004-of-00019.safetensors",
854
+ "model.layers.5.block_sparse_moe.experts.3.w1.weight": "model-00004-of-00019.safetensors",
855
+ "model.layers.5.block_sparse_moe.experts.3.w2.weight": "model-00004-of-00019.safetensors",
856
+ "model.layers.5.block_sparse_moe.experts.3.w3.weight": "model-00004-of-00019.safetensors",
857
+ "model.layers.5.block_sparse_moe.experts.4.w1.weight": "model-00004-of-00019.safetensors",
858
+ "model.layers.5.block_sparse_moe.experts.4.w2.weight": "model-00004-of-00019.safetensors",
859
+ "model.layers.5.block_sparse_moe.experts.4.w3.weight": "model-00004-of-00019.safetensors",
860
+ "model.layers.5.block_sparse_moe.experts.5.w1.weight": "model-00004-of-00019.safetensors",
861
+ "model.layers.5.block_sparse_moe.experts.5.w2.weight": "model-00004-of-00019.safetensors",
862
+ "model.layers.5.block_sparse_moe.experts.5.w3.weight": "model-00004-of-00019.safetensors",
863
+ "model.layers.5.block_sparse_moe.experts.6.w1.weight": "model-00004-of-00019.safetensors",
864
+ "model.layers.5.block_sparse_moe.experts.6.w2.weight": "model-00004-of-00019.safetensors",
865
+ "model.layers.5.block_sparse_moe.experts.6.w3.weight": "model-00004-of-00019.safetensors",
866
+ "model.layers.5.block_sparse_moe.experts.7.w1.weight": "model-00004-of-00019.safetensors",
867
+ "model.layers.5.block_sparse_moe.experts.7.w2.weight": "model-00004-of-00019.safetensors",
868
+ "model.layers.5.block_sparse_moe.experts.7.w3.weight": "model-00004-of-00019.safetensors",
869
+ "model.layers.5.block_sparse_moe.gate.weight": "model-00003-of-00019.safetensors",
870
+ "model.layers.5.input_layernorm.weight": "model-00004-of-00019.safetensors",
871
+ "model.layers.5.post_attention_layernorm.weight": "model-00004-of-00019.safetensors",
872
+ "model.layers.5.self_attn.k_proj.weight": "model-00003-of-00019.safetensors",
873
+ "model.layers.5.self_attn.o_proj.weight": "model-00003-of-00019.safetensors",
874
+ "model.layers.5.self_attn.q_proj.weight": "model-00003-of-00019.safetensors",
875
+ "model.layers.5.self_attn.v_proj.weight": "model-00003-of-00019.safetensors",
876
+ "model.layers.6.block_sparse_moe.experts.0.w1.weight": "model-00004-of-00019.safetensors",
877
+ "model.layers.6.block_sparse_moe.experts.0.w2.weight": "model-00004-of-00019.safetensors",
878
+ "model.layers.6.block_sparse_moe.experts.0.w3.weight": "model-00004-of-00019.safetensors",
879
+ "model.layers.6.block_sparse_moe.experts.1.w1.weight": "model-00004-of-00019.safetensors",
880
+ "model.layers.6.block_sparse_moe.experts.1.w2.weight": "model-00004-of-00019.safetensors",
881
+ "model.layers.6.block_sparse_moe.experts.1.w3.weight": "model-00004-of-00019.safetensors",
882
+ "model.layers.6.block_sparse_moe.experts.2.w1.weight": "model-00004-of-00019.safetensors",
883
+ "model.layers.6.block_sparse_moe.experts.2.w2.weight": "model-00004-of-00019.safetensors",
884
+ "model.layers.6.block_sparse_moe.experts.2.w3.weight": "model-00004-of-00019.safetensors",
885
+ "model.layers.6.block_sparse_moe.experts.3.w1.weight": "model-00004-of-00019.safetensors",
886
+ "model.layers.6.block_sparse_moe.experts.3.w2.weight": "model-00004-of-00019.safetensors",
887
+ "model.layers.6.block_sparse_moe.experts.3.w3.weight": "model-00004-of-00019.safetensors",
888
+ "model.layers.6.block_sparse_moe.experts.4.w1.weight": "model-00004-of-00019.safetensors",
889
+ "model.layers.6.block_sparse_moe.experts.4.w2.weight": "model-00004-of-00019.safetensors",
890
+ "model.layers.6.block_sparse_moe.experts.4.w3.weight": "model-00004-of-00019.safetensors",
891
+ "model.layers.6.block_sparse_moe.experts.5.w1.weight": "model-00004-of-00019.safetensors",
892
+ "model.layers.6.block_sparse_moe.experts.5.w2.weight": "model-00004-of-00019.safetensors",
893
+ "model.layers.6.block_sparse_moe.experts.5.w3.weight": "model-00005-of-00019.safetensors",
894
+ "model.layers.6.block_sparse_moe.experts.6.w1.weight": "model-00005-of-00019.safetensors",
895
+ "model.layers.6.block_sparse_moe.experts.6.w2.weight": "model-00005-of-00019.safetensors",
896
+ "model.layers.6.block_sparse_moe.experts.6.w3.weight": "model-00005-of-00019.safetensors",
897
+ "model.layers.6.block_sparse_moe.experts.7.w1.weight": "model-00005-of-00019.safetensors",
898
+ "model.layers.6.block_sparse_moe.experts.7.w2.weight": "model-00005-of-00019.safetensors",
899
+ "model.layers.6.block_sparse_moe.experts.7.w3.weight": "model-00005-of-00019.safetensors",
900
+ "model.layers.6.block_sparse_moe.gate.weight": "model-00004-of-00019.safetensors",
901
+ "model.layers.6.input_layernorm.weight": "model-00005-of-00019.safetensors",
902
+ "model.layers.6.post_attention_layernorm.weight": "model-00005-of-00019.safetensors",
903
+ "model.layers.6.self_attn.k_proj.weight": "model-00004-of-00019.safetensors",
904
+ "model.layers.6.self_attn.o_proj.weight": "model-00004-of-00019.safetensors",
905
+ "model.layers.6.self_attn.q_proj.weight": "model-00004-of-00019.safetensors",
906
+ "model.layers.6.self_attn.v_proj.weight": "model-00004-of-00019.safetensors",
907
+ "model.layers.7.block_sparse_moe.experts.0.w1.weight": "model-00005-of-00019.safetensors",
908
+ "model.layers.7.block_sparse_moe.experts.0.w2.weight": "model-00005-of-00019.safetensors",
909
+ "model.layers.7.block_sparse_moe.experts.0.w3.weight": "model-00005-of-00019.safetensors",
910
+ "model.layers.7.block_sparse_moe.experts.1.w1.weight": "model-00005-of-00019.safetensors",
911
+ "model.layers.7.block_sparse_moe.experts.1.w2.weight": "model-00005-of-00019.safetensors",
912
+ "model.layers.7.block_sparse_moe.experts.1.w3.weight": "model-00005-of-00019.safetensors",
913
+ "model.layers.7.block_sparse_moe.experts.2.w1.weight": "model-00005-of-00019.safetensors",
914
+ "model.layers.7.block_sparse_moe.experts.2.w2.weight": "model-00005-of-00019.safetensors",
915
+ "model.layers.7.block_sparse_moe.experts.2.w3.weight": "model-00005-of-00019.safetensors",
916
+ "model.layers.7.block_sparse_moe.experts.3.w1.weight": "model-00005-of-00019.safetensors",
917
+ "model.layers.7.block_sparse_moe.experts.3.w2.weight": "model-00005-of-00019.safetensors",
918
+ "model.layers.7.block_sparse_moe.experts.3.w3.weight": "model-00005-of-00019.safetensors",
919
+ "model.layers.7.block_sparse_moe.experts.4.w1.weight": "model-00005-of-00019.safetensors",
920
+ "model.layers.7.block_sparse_moe.experts.4.w2.weight": "model-00005-of-00019.safetensors",
921
+ "model.layers.7.block_sparse_moe.experts.4.w3.weight": "model-00005-of-00019.safetensors",
922
+ "model.layers.7.block_sparse_moe.experts.5.w1.weight": "model-00005-of-00019.safetensors",
923
+ "model.layers.7.block_sparse_moe.experts.5.w2.weight": "model-00005-of-00019.safetensors",
924
+ "model.layers.7.block_sparse_moe.experts.5.w3.weight": "model-00005-of-00019.safetensors",
925
+ "model.layers.7.block_sparse_moe.experts.6.w1.weight": "model-00005-of-00019.safetensors",
926
+ "model.layers.7.block_sparse_moe.experts.6.w2.weight": "model-00005-of-00019.safetensors",
927
+ "model.layers.7.block_sparse_moe.experts.6.w3.weight": "model-00005-of-00019.safetensors",
928
+ "model.layers.7.block_sparse_moe.experts.7.w1.weight": "model-00005-of-00019.safetensors",
929
+ "model.layers.7.block_sparse_moe.experts.7.w2.weight": "model-00005-of-00019.safetensors",
930
+ "model.layers.7.block_sparse_moe.experts.7.w3.weight": "model-00005-of-00019.safetensors",
931
+ "model.layers.7.block_sparse_moe.gate.weight": "model-00005-of-00019.safetensors",
932
+ "model.layers.7.input_layernorm.weight": "model-00005-of-00019.safetensors",
933
+ "model.layers.7.post_attention_layernorm.weight": "model-00005-of-00019.safetensors",
934
+ "model.layers.7.self_attn.k_proj.weight": "model-00005-of-00019.safetensors",
935
+ "model.layers.7.self_attn.o_proj.weight": "model-00005-of-00019.safetensors",
936
+ "model.layers.7.self_attn.q_proj.weight": "model-00005-of-00019.safetensors",
937
+ "model.layers.7.self_attn.v_proj.weight": "model-00005-of-00019.safetensors",
938
+ "model.layers.8.block_sparse_moe.experts.0.w1.weight": "model-00005-of-00019.safetensors",
939
+ "model.layers.8.block_sparse_moe.experts.0.w2.weight": "model-00005-of-00019.safetensors",
940
+ "model.layers.8.block_sparse_moe.experts.0.w3.weight": "model-00005-of-00019.safetensors",
941
+ "model.layers.8.block_sparse_moe.experts.1.w1.weight": "model-00005-of-00019.safetensors",
942
+ "model.layers.8.block_sparse_moe.experts.1.w2.weight": "model-00005-of-00019.safetensors",
943
+ "model.layers.8.block_sparse_moe.experts.1.w3.weight": "model-00005-of-00019.safetensors",
944
+ "model.layers.8.block_sparse_moe.experts.2.w1.weight": "model-00005-of-00019.safetensors",
945
+ "model.layers.8.block_sparse_moe.experts.2.w2.weight": "model-00005-of-00019.safetensors",
946
+ "model.layers.8.block_sparse_moe.experts.2.w3.weight": "model-00005-of-00019.safetensors",
947
+ "model.layers.8.block_sparse_moe.experts.3.w1.weight": "model-00005-of-00019.safetensors",
948
+ "model.layers.8.block_sparse_moe.experts.3.w2.weight": "model-00006-of-00019.safetensors",
949
+ "model.layers.8.block_sparse_moe.experts.3.w3.weight": "model-00006-of-00019.safetensors",
950
+ "model.layers.8.block_sparse_moe.experts.4.w1.weight": "model-00006-of-00019.safetensors",
951
+ "model.layers.8.block_sparse_moe.experts.4.w2.weight": "model-00006-of-00019.safetensors",
952
+ "model.layers.8.block_sparse_moe.experts.4.w3.weight": "model-00006-of-00019.safetensors",
953
+ "model.layers.8.block_sparse_moe.experts.5.w1.weight": "model-00006-of-00019.safetensors",
954
+ "model.layers.8.block_sparse_moe.experts.5.w2.weight": "model-00006-of-00019.safetensors",
955
+ "model.layers.8.block_sparse_moe.experts.5.w3.weight": "model-00006-of-00019.safetensors",
956
+ "model.layers.8.block_sparse_moe.experts.6.w1.weight": "model-00006-of-00019.safetensors",
957
+ "model.layers.8.block_sparse_moe.experts.6.w2.weight": "model-00006-of-00019.safetensors",
958
+ "model.layers.8.block_sparse_moe.experts.6.w3.weight": "model-00006-of-00019.safetensors",
959
+ "model.layers.8.block_sparse_moe.experts.7.w1.weight": "model-00006-of-00019.safetensors",
960
+ "model.layers.8.block_sparse_moe.experts.7.w2.weight": "model-00006-of-00019.safetensors",
961
+ "model.layers.8.block_sparse_moe.experts.7.w3.weight": "model-00006-of-00019.safetensors",
962
+ "model.layers.8.block_sparse_moe.gate.weight": "model-00005-of-00019.safetensors",
963
+ "model.layers.8.input_layernorm.weight": "model-00006-of-00019.safetensors",
964
+ "model.layers.8.post_attention_layernorm.weight": "model-00006-of-00019.safetensors",
965
+ "model.layers.8.self_attn.k_proj.weight": "model-00005-of-00019.safetensors",
966
+ "model.layers.8.self_attn.o_proj.weight": "model-00005-of-00019.safetensors",
967
+ "model.layers.8.self_attn.q_proj.weight": "model-00005-of-00019.safetensors",
968
+ "model.layers.8.self_attn.v_proj.weight": "model-00005-of-00019.safetensors",
969
+ "model.layers.9.block_sparse_moe.experts.0.w1.weight": "model-00006-of-00019.safetensors",
970
+ "model.layers.9.block_sparse_moe.experts.0.w2.weight": "model-00006-of-00019.safetensors",
971
+ "model.layers.9.block_sparse_moe.experts.0.w3.weight": "model-00006-of-00019.safetensors",
972
+ "model.layers.9.block_sparse_moe.experts.1.w1.weight": "model-00006-of-00019.safetensors",
973
+ "model.layers.9.block_sparse_moe.experts.1.w2.weight": "model-00006-of-00019.safetensors",
974
+ "model.layers.9.block_sparse_moe.experts.1.w3.weight": "model-00006-of-00019.safetensors",
975
+ "model.layers.9.block_sparse_moe.experts.2.w1.weight": "model-00006-of-00019.safetensors",
976
+ "model.layers.9.block_sparse_moe.experts.2.w2.weight": "model-00006-of-00019.safetensors",
977
+ "model.layers.9.block_sparse_moe.experts.2.w3.weight": "model-00006-of-00019.safetensors",
978
+ "model.layers.9.block_sparse_moe.experts.3.w1.weight": "model-00006-of-00019.safetensors",
979
+ "model.layers.9.block_sparse_moe.experts.3.w2.weight": "model-00006-of-00019.safetensors",
980
+ "model.layers.9.block_sparse_moe.experts.3.w3.weight": "model-00006-of-00019.safetensors",
981
+ "model.layers.9.block_sparse_moe.experts.4.w1.weight": "model-00006-of-00019.safetensors",
982
+ "model.layers.9.block_sparse_moe.experts.4.w2.weight": "model-00006-of-00019.safetensors",
983
+ "model.layers.9.block_sparse_moe.experts.4.w3.weight": "model-00006-of-00019.safetensors",
984
+ "model.layers.9.block_sparse_moe.experts.5.w1.weight": "model-00006-of-00019.safetensors",
985
+ "model.layers.9.block_sparse_moe.experts.5.w2.weight": "model-00006-of-00019.safetensors",
986
+ "model.layers.9.block_sparse_moe.experts.5.w3.weight": "model-00006-of-00019.safetensors",
987
+ "model.layers.9.block_sparse_moe.experts.6.w1.weight": "model-00006-of-00019.safetensors",
988
+ "model.layers.9.block_sparse_moe.experts.6.w2.weight": "model-00006-of-00019.safetensors",
989
+ "model.layers.9.block_sparse_moe.experts.6.w3.weight": "model-00006-of-00019.safetensors",
990
+ "model.layers.9.block_sparse_moe.experts.7.w1.weight": "model-00006-of-00019.safetensors",
991
+ "model.layers.9.block_sparse_moe.experts.7.w2.weight": "model-00006-of-00019.safetensors",
992
+ "model.layers.9.block_sparse_moe.experts.7.w3.weight": "model-00006-of-00019.safetensors",
993
+ "model.layers.9.block_sparse_moe.gate.weight": "model-00006-of-00019.safetensors",
994
+ "model.layers.9.input_layernorm.weight": "model-00006-of-00019.safetensors",
995
+ "model.layers.9.post_attention_layernorm.weight": "model-00006-of-00019.safetensors",
996
+ "model.layers.9.self_attn.k_proj.weight": "model-00006-of-00019.safetensors",
997
+ "model.layers.9.self_attn.o_proj.weight": "model-00006-of-00019.safetensors",
998
+ "model.layers.9.self_attn.q_proj.weight": "model-00006-of-00019.safetensors",
999
+ "model.layers.9.self_attn.v_proj.weight": "model-00006-of-00019.safetensors",
1000
+ "model.norm.weight": "model-00019-of-00019.safetensors"
1001
+ }
1002
+ }
runs/Apr16_09-58-18_ggpu159/events.out.tfevents.1713254855.ggpu159.2274579.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6799c0a5d9ba97971562b9c8a96180bcd436d00997cf8a68b82c4d4f2d2c92d2
3
+ size 98258
special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "</s>",
17
+ "unk_token": {
18
+ "content": "<unk>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dadfd56d766715c61d2ef780a525ab43b8e6da4de6865bda3d95fdef5e134055
3
+ size 493443
tokenizer_config.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "added_tokens_decoder": {
5
+ "0": {
6
+ "content": "<unk>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "1": {
14
+ "content": "<s>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "2": {
22
+ "content": "</s>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ }
29
+ },
30
+ "additional_special_tokens": [],
31
+ "bos_token": "<s>",
32
+ "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}",
33
+ "clean_up_tokenization_spaces": false,
34
+ "eos_token": "</s>",
35
+ "legacy": true,
36
+ "model_max_length": 1000000000000000019884624838656,
37
+ "pad_token": "</s>",
38
+ "sp_model_kwargs": {},
39
+ "spaces_between_special_tokens": false,
40
+ "tokenizer_class": "LlamaTokenizer",
41
+ "unk_token": "<unk>",
42
+ "use_default_system_prompt": false
43
+ }
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 3.0,
3
+ "train_loss": 0.7946326331969004,
4
+ "train_runtime": 13847.9433,
5
+ "train_samples": 11530,
6
+ "train_samples_per_second": 2.498,
7
+ "train_steps_per_second": 0.078
8
+ }
trainer_state.json ADDED
@@ -0,0 +1,1974 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 3.0,
5
+ "eval_steps": 500,
6
+ "global_step": 1083,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.03,
13
+ "grad_norm": 129536.0,
14
+ "learning_rate": 5.000000000000001e-07,
15
+ "log_odds_chosen": 0.2670513093471527,
16
+ "log_odds_ratio": -0.5927332043647766,
17
+ "logits/chosen": 4.400671482086182,
18
+ "logits/rejected": 4.455313682556152,
19
+ "logps/chosen": -0.7621899843215942,
20
+ "logps/rejected": -0.9112717509269714,
21
+ "loss": 1.7067,
22
+ "nll_loss": 1.2433254718780518,
23
+ "rewards/accuracies": 0.699999988079071,
24
+ "rewards/chosen": -0.03810950368642807,
25
+ "rewards/margins": 0.007454085163772106,
26
+ "rewards/rejected": -0.04556358978152275,
27
+ "step": 10
28
+ },
29
+ {
30
+ "epoch": 0.06,
31
+ "grad_norm": 372736.0,
32
+ "learning_rate": 1.0000000000000002e-06,
33
+ "log_odds_chosen": 0.6244336366653442,
34
+ "log_odds_ratio": -0.49496006965637207,
35
+ "logits/chosen": 4.977335453033447,
36
+ "logits/rejected": 4.971369743347168,
37
+ "logps/chosen": -0.8002179861068726,
38
+ "logps/rejected": -1.1836291551589966,
39
+ "loss": 1.9551,
40
+ "nll_loss": 1.6530841588974,
41
+ "rewards/accuracies": 0.800000011920929,
42
+ "rewards/chosen": -0.040010906755924225,
43
+ "rewards/margins": 0.019170550629496574,
44
+ "rewards/rejected": -0.05918145924806595,
45
+ "step": 20
46
+ },
47
+ {
48
+ "epoch": 0.08,
49
+ "grad_norm": 198656.0,
50
+ "learning_rate": 1.5e-06,
51
+ "log_odds_chosen": 0.22288911044597626,
52
+ "log_odds_ratio": -0.6639400720596313,
53
+ "logits/chosen": 4.187830924987793,
54
+ "logits/rejected": 4.613868713378906,
55
+ "logps/chosen": -0.8369601368904114,
56
+ "logps/rejected": -0.9581049680709839,
57
+ "loss": 2.0627,
58
+ "nll_loss": 2.1578612327575684,
59
+ "rewards/accuracies": 0.6000000238418579,
60
+ "rewards/chosen": -0.04184800758957863,
61
+ "rewards/margins": 0.006057241465896368,
62
+ "rewards/rejected": -0.04790525510907173,
63
+ "step": 30
64
+ },
65
+ {
66
+ "epoch": 0.11,
67
+ "grad_norm": 9792.0,
68
+ "learning_rate": 2.0000000000000003e-06,
69
+ "log_odds_chosen": 0.2988000512123108,
70
+ "log_odds_ratio": -0.5841063261032104,
71
+ "logits/chosen": 3.96022367477417,
72
+ "logits/rejected": 5.4938645362854,
73
+ "logps/chosen": -0.9292069673538208,
74
+ "logps/rejected": -1.1179070472717285,
75
+ "loss": 1.7005,
76
+ "nll_loss": 2.407797336578369,
77
+ "rewards/accuracies": 0.800000011920929,
78
+ "rewards/chosen": -0.0464603491127491,
79
+ "rewards/margins": 0.00943500641733408,
80
+ "rewards/rejected": -0.0558953583240509,
81
+ "step": 40
82
+ },
83
+ {
84
+ "epoch": 0.14,
85
+ "grad_norm": 30592.0,
86
+ "learning_rate": 2.5e-06,
87
+ "log_odds_chosen": 0.17792589962482452,
88
+ "log_odds_ratio": -0.6552326083183289,
89
+ "logits/chosen": 3.499319076538086,
90
+ "logits/rejected": 3.937030792236328,
91
+ "logps/chosen": -0.9279796481132507,
92
+ "logps/rejected": -1.08034348487854,
93
+ "loss": 1.4593,
94
+ "nll_loss": 1.7558667659759521,
95
+ "rewards/accuracies": 0.6000000238418579,
96
+ "rewards/chosen": -0.046398986130952835,
97
+ "rewards/margins": 0.007618183735758066,
98
+ "rewards/rejected": -0.054017167538404465,
99
+ "step": 50
100
+ },
101
+ {
102
+ "epoch": 0.17,
103
+ "grad_norm": 118272.0,
104
+ "learning_rate": 3e-06,
105
+ "log_odds_chosen": 0.4226877689361572,
106
+ "log_odds_ratio": -0.6192396283149719,
107
+ "logits/chosen": 3.7764244079589844,
108
+ "logits/rejected": 3.936429262161255,
109
+ "logps/chosen": -1.0400720834732056,
110
+ "logps/rejected": -1.394266963005066,
111
+ "loss": 1.833,
112
+ "nll_loss": 2.083087921142578,
113
+ "rewards/accuracies": 0.6000000238418579,
114
+ "rewards/chosen": -0.05200360342860222,
115
+ "rewards/margins": 0.017709745094180107,
116
+ "rewards/rejected": -0.06971334666013718,
117
+ "step": 60
118
+ },
119
+ {
120
+ "epoch": 0.19,
121
+ "grad_norm": 21760.0,
122
+ "learning_rate": 3.5e-06,
123
+ "log_odds_chosen": -0.028518375009298325,
124
+ "log_odds_ratio": -0.7405228614807129,
125
+ "logits/chosen": 3.4970717430114746,
126
+ "logits/rejected": 3.979139804840088,
127
+ "logps/chosen": -0.9001835584640503,
128
+ "logps/rejected": -0.8650320172309875,
129
+ "loss": 1.89,
130
+ "nll_loss": 2.863569498062134,
131
+ "rewards/accuracies": 0.6000000238418579,
132
+ "rewards/chosen": -0.045009177178144455,
133
+ "rewards/margins": -0.0017575754318386316,
134
+ "rewards/rejected": -0.043251603841781616,
135
+ "step": 70
136
+ },
137
+ {
138
+ "epoch": 0.22,
139
+ "grad_norm": 18432.0,
140
+ "learning_rate": 4.000000000000001e-06,
141
+ "log_odds_chosen": 0.20453815162181854,
142
+ "log_odds_ratio": -0.674200713634491,
143
+ "logits/chosen": 5.348904132843018,
144
+ "logits/rejected": 5.157883167266846,
145
+ "logps/chosen": -0.9463122487068176,
146
+ "logps/rejected": -1.1002007722854614,
147
+ "loss": 1.6847,
148
+ "nll_loss": 1.1438477039337158,
149
+ "rewards/accuracies": 0.6000000238418579,
150
+ "rewards/chosen": -0.04731561243534088,
151
+ "rewards/margins": 0.007694427855312824,
152
+ "rewards/rejected": -0.05501004308462143,
153
+ "step": 80
154
+ },
155
+ {
156
+ "epoch": 0.25,
157
+ "grad_norm": 40448.0,
158
+ "learning_rate": 4.5e-06,
159
+ "log_odds_chosen": 0.04343784973025322,
160
+ "log_odds_ratio": -0.6950091123580933,
161
+ "logits/chosen": 3.8532798290252686,
162
+ "logits/rejected": 4.101541519165039,
163
+ "logps/chosen": -0.8504306077957153,
164
+ "logps/rejected": -0.8647142648696899,
165
+ "loss": 2.0381,
166
+ "nll_loss": 1.738358736038208,
167
+ "rewards/accuracies": 0.6000000238418579,
168
+ "rewards/chosen": -0.042521532624959946,
169
+ "rewards/margins": 0.0007141813402995467,
170
+ "rewards/rejected": -0.04323571175336838,
171
+ "step": 90
172
+ },
173
+ {
174
+ "epoch": 0.28,
175
+ "grad_norm": 26880.0,
176
+ "learning_rate": 5e-06,
177
+ "log_odds_chosen": 0.0891367569565773,
178
+ "log_odds_ratio": -0.6793378591537476,
179
+ "logits/chosen": 4.815893650054932,
180
+ "logits/rejected": 4.600794315338135,
181
+ "logps/chosen": -0.8133799433708191,
182
+ "logps/rejected": -0.8459997177124023,
183
+ "loss": 1.6753,
184
+ "nll_loss": 0.9453862309455872,
185
+ "rewards/accuracies": 0.699999988079071,
186
+ "rewards/chosen": -0.040668997913599014,
187
+ "rewards/margins": 0.0016309904167428613,
188
+ "rewards/rejected": -0.042299989610910416,
189
+ "step": 100
190
+ },
191
+ {
192
+ "epoch": 0.3,
193
+ "grad_norm": 398.0,
194
+ "learning_rate": 4.767312946227961e-06,
195
+ "log_odds_chosen": 0.2612980902194977,
196
+ "log_odds_ratio": -0.6079257130622864,
197
+ "logits/chosen": 4.852027416229248,
198
+ "logits/rejected": 5.277764320373535,
199
+ "logps/chosen": -0.7108517289161682,
200
+ "logps/rejected": -0.8703864812850952,
201
+ "loss": 1.084,
202
+ "nll_loss": 0.8523097038269043,
203
+ "rewards/accuracies": 0.699999988079071,
204
+ "rewards/chosen": -0.03554258495569229,
205
+ "rewards/margins": 0.007976744323968887,
206
+ "rewards/rejected": -0.04351933300495148,
207
+ "step": 110
208
+ },
209
+ {
210
+ "epoch": 0.33,
211
+ "grad_norm": 3.40625,
212
+ "learning_rate": 4.564354645876385e-06,
213
+ "log_odds_chosen": 0.6404939889907837,
214
+ "log_odds_ratio": -0.44226688146591187,
215
+ "logits/chosen": 4.647592544555664,
216
+ "logits/rejected": 4.496420860290527,
217
+ "logps/chosen": -0.6522791981697083,
218
+ "logps/rejected": -1.016757607460022,
219
+ "loss": 0.7584,
220
+ "nll_loss": 0.7253808975219727,
221
+ "rewards/accuracies": 0.8999999761581421,
222
+ "rewards/chosen": -0.03261395916342735,
223
+ "rewards/margins": 0.018223917111754417,
224
+ "rewards/rejected": -0.05083787441253662,
225
+ "step": 120
226
+ },
227
+ {
228
+ "epoch": 0.36,
229
+ "grad_norm": 2.484375,
230
+ "learning_rate": 4.385290096535147e-06,
231
+ "log_odds_chosen": 0.40130940079689026,
232
+ "log_odds_ratio": -0.5459367632865906,
233
+ "logits/chosen": 3.2656028270721436,
234
+ "logits/rejected": 4.282083034515381,
235
+ "logps/chosen": -0.7449339032173157,
236
+ "logps/rejected": -1.005130410194397,
237
+ "loss": 0.7507,
238
+ "nll_loss": 0.6981132626533508,
239
+ "rewards/accuracies": 0.800000011920929,
240
+ "rewards/chosen": -0.037246692925691605,
241
+ "rewards/margins": 0.013009825721383095,
242
+ "rewards/rejected": -0.05025652050971985,
243
+ "step": 130
244
+ },
245
+ {
246
+ "epoch": 0.39,
247
+ "grad_norm": 2.453125,
248
+ "learning_rate": 4.2257712736425835e-06,
249
+ "log_odds_chosen": 0.167119100689888,
250
+ "log_odds_ratio": -0.781502366065979,
251
+ "logits/chosen": 4.5941596031188965,
252
+ "logits/rejected": 3.490717649459839,
253
+ "logps/chosen": -0.849592387676239,
254
+ "logps/rejected": -0.8088783025741577,
255
+ "loss": 0.7401,
256
+ "nll_loss": 0.8015066981315613,
257
+ "rewards/accuracies": 0.6000000238418579,
258
+ "rewards/chosen": -0.04247962310910225,
259
+ "rewards/margins": -0.0020357039757072926,
260
+ "rewards/rejected": -0.040443919599056244,
261
+ "step": 140
262
+ },
263
+ {
264
+ "epoch": 0.42,
265
+ "grad_norm": 2.25,
266
+ "learning_rate": 4.082482904638631e-06,
267
+ "log_odds_chosen": 0.01697809062898159,
268
+ "log_odds_ratio": -0.7280154228210449,
269
+ "logits/chosen": 3.6419081687927246,
270
+ "logits/rejected": 4.471678733825684,
271
+ "logps/chosen": -0.8600558042526245,
272
+ "logps/rejected": -0.8721025586128235,
273
+ "loss": 0.7625,
274
+ "nll_loss": 0.6598131656646729,
275
+ "rewards/accuracies": 0.800000011920929,
276
+ "rewards/chosen": -0.043002791702747345,
277
+ "rewards/margins": 0.0006023403257131577,
278
+ "rewards/rejected": -0.04360513016581535,
279
+ "step": 150
280
+ },
281
+ {
282
+ "epoch": 0.44,
283
+ "grad_norm": 1.703125,
284
+ "learning_rate": 3.952847075210474e-06,
285
+ "log_odds_chosen": 0.21897530555725098,
286
+ "log_odds_ratio": -0.6258933544158936,
287
+ "logits/chosen": 5.113783836364746,
288
+ "logits/rejected": 5.189225196838379,
289
+ "logps/chosen": -0.9097244143486023,
290
+ "logps/rejected": -1.0366160869598389,
291
+ "loss": 0.6979,
292
+ "nll_loss": 0.8092159032821655,
293
+ "rewards/accuracies": 0.5,
294
+ "rewards/chosen": -0.04548622667789459,
295
+ "rewards/margins": 0.0063445777632296085,
296
+ "rewards/rejected": -0.05183080583810806,
297
+ "step": 160
298
+ },
299
+ {
300
+ "epoch": 0.47,
301
+ "grad_norm": 1.875,
302
+ "learning_rate": 3.834824944236852e-06,
303
+ "log_odds_chosen": 0.354801744222641,
304
+ "log_odds_ratio": -0.5426136255264282,
305
+ "logits/chosen": 3.2793831825256348,
306
+ "logits/rejected": 4.266524314880371,
307
+ "logps/chosen": -0.6823564171791077,
308
+ "logps/rejected": -0.8897114992141724,
309
+ "loss": 0.7135,
310
+ "nll_loss": 0.6986647844314575,
311
+ "rewards/accuracies": 0.8999999761581421,
312
+ "rewards/chosen": -0.03411782532930374,
313
+ "rewards/margins": 0.01036775577813387,
314
+ "rewards/rejected": -0.04448557645082474,
315
+ "step": 170
316
+ },
317
+ {
318
+ "epoch": 0.5,
319
+ "grad_norm": 1.734375,
320
+ "learning_rate": 3.72677996249965e-06,
321
+ "log_odds_chosen": 0.6848920583724976,
322
+ "log_odds_ratio": -0.48101407289505005,
323
+ "logits/chosen": 3.578697681427002,
324
+ "logits/rejected": 3.992805004119873,
325
+ "logps/chosen": -0.8704670667648315,
326
+ "logps/rejected": -1.3336102962493896,
327
+ "loss": 0.7157,
328
+ "nll_loss": 0.7008849382400513,
329
+ "rewards/accuracies": 0.800000011920929,
330
+ "rewards/chosen": -0.04352335259318352,
331
+ "rewards/margins": 0.023157157003879547,
332
+ "rewards/rejected": -0.06668050587177277,
333
+ "step": 180
334
+ },
335
+ {
336
+ "epoch": 0.53,
337
+ "grad_norm": 2.21875,
338
+ "learning_rate": 3.6273812505500587e-06,
339
+ "log_odds_chosen": 0.5638074278831482,
340
+ "log_odds_ratio": -0.5914163589477539,
341
+ "logits/chosen": 3.691683530807495,
342
+ "logits/rejected": 5.060125827789307,
343
+ "logps/chosen": -0.7228246927261353,
344
+ "logps/rejected": -0.9139431118965149,
345
+ "loss": 0.7447,
346
+ "nll_loss": 0.6561239957809448,
347
+ "rewards/accuracies": 0.5,
348
+ "rewards/chosen": -0.03614123538136482,
349
+ "rewards/margins": 0.009555922821164131,
350
+ "rewards/rejected": -0.0456971600651741,
351
+ "step": 190
352
+ },
353
+ {
354
+ "epoch": 0.55,
355
+ "grad_norm": 1.890625,
356
+ "learning_rate": 3.5355339059327378e-06,
357
+ "log_odds_chosen": 0.4812771677970886,
358
+ "log_odds_ratio": -0.5113085508346558,
359
+ "logits/chosen": 3.8571255207061768,
360
+ "logits/rejected": 4.72374153137207,
361
+ "logps/chosen": -0.7712750434875488,
362
+ "logps/rejected": -0.9863548278808594,
363
+ "loss": 0.7045,
364
+ "nll_loss": 0.6923601031303406,
365
+ "rewards/accuracies": 0.800000011920929,
366
+ "rewards/chosen": -0.03856375068426132,
367
+ "rewards/margins": 0.010753988288342953,
368
+ "rewards/rejected": -0.04931773990392685,
369
+ "step": 200
370
+ },
371
+ {
372
+ "epoch": 0.58,
373
+ "grad_norm": 2.265625,
374
+ "learning_rate": 3.450327796711771e-06,
375
+ "log_odds_chosen": 0.40674924850463867,
376
+ "log_odds_ratio": -0.5564650297164917,
377
+ "logits/chosen": 4.317850112915039,
378
+ "logits/rejected": 5.421323299407959,
379
+ "logps/chosen": -0.6844028234481812,
380
+ "logps/rejected": -0.8700541257858276,
381
+ "loss": 0.7064,
382
+ "nll_loss": 0.6271991729736328,
383
+ "rewards/accuracies": 0.800000011920929,
384
+ "rewards/chosen": -0.034220144152641296,
385
+ "rewards/margins": 0.009282568469643593,
386
+ "rewards/rejected": -0.04350270703434944,
387
+ "step": 210
388
+ },
389
+ {
390
+ "epoch": 0.61,
391
+ "grad_norm": 2.140625,
392
+ "learning_rate": 3.3709993123162106e-06,
393
+ "log_odds_chosen": 0.5274091958999634,
394
+ "log_odds_ratio": -0.4882450997829437,
395
+ "logits/chosen": 4.161227703094482,
396
+ "logits/rejected": 4.616461277008057,
397
+ "logps/chosen": -0.7146799564361572,
398
+ "logps/rejected": -1.0237388610839844,
399
+ "loss": 0.6941,
400
+ "nll_loss": 0.6821693181991577,
401
+ "rewards/accuracies": 0.800000011920929,
402
+ "rewards/chosen": -0.03573400154709816,
403
+ "rewards/margins": 0.01545293815433979,
404
+ "rewards/rejected": -0.0511869378387928,
405
+ "step": 220
406
+ },
407
+ {
408
+ "epoch": 0.64,
409
+ "grad_norm": 1.8984375,
410
+ "learning_rate": 3.296902366978936e-06,
411
+ "log_odds_chosen": 0.7328251004219055,
412
+ "log_odds_ratio": -0.5591307878494263,
413
+ "logits/chosen": 1.1708077192306519,
414
+ "logits/rejected": 4.127654075622559,
415
+ "logps/chosen": -0.6000097990036011,
416
+ "logps/rejected": -0.7907406091690063,
417
+ "loss": 0.7189,
418
+ "nll_loss": 0.5180870294570923,
419
+ "rewards/accuracies": 0.800000011920929,
420
+ "rewards/chosen": -0.030000487342476845,
421
+ "rewards/margins": 0.009536544792354107,
422
+ "rewards/rejected": -0.039537034928798676,
423
+ "step": 230
424
+ },
425
+ {
426
+ "epoch": 0.66,
427
+ "grad_norm": 2.390625,
428
+ "learning_rate": 3.2274861218395142e-06,
429
+ "log_odds_chosen": 0.30477339029312134,
430
+ "log_odds_ratio": -0.6216728091239929,
431
+ "logits/chosen": 4.052184104919434,
432
+ "logits/rejected": 4.9365081787109375,
433
+ "logps/chosen": -0.9283322095870972,
434
+ "logps/rejected": -1.132279634475708,
435
+ "loss": 0.7021,
436
+ "nll_loss": 0.74329674243927,
437
+ "rewards/accuracies": 0.6000000238418579,
438
+ "rewards/chosen": -0.04641660675406456,
439
+ "rewards/margins": 0.010197381488978863,
440
+ "rewards/rejected": -0.056613992899656296,
441
+ "step": 240
442
+ },
443
+ {
444
+ "epoch": 0.69,
445
+ "grad_norm": 1.7890625,
446
+ "learning_rate": 3.1622776601683796e-06,
447
+ "log_odds_chosen": 0.1895931363105774,
448
+ "log_odds_ratio": -0.6505658626556396,
449
+ "logits/chosen": 4.295980930328369,
450
+ "logits/rejected": 4.204621315002441,
451
+ "logps/chosen": -0.8170398473739624,
452
+ "logps/rejected": -0.962540328502655,
453
+ "loss": 0.6787,
454
+ "nll_loss": 0.7326347231864929,
455
+ "rewards/accuracies": 0.6000000238418579,
456
+ "rewards/chosen": -0.04085199162364006,
457
+ "rewards/margins": 0.007275022566318512,
458
+ "rewards/rejected": -0.04812701791524887,
459
+ "step": 250
460
+ },
461
+ {
462
+ "epoch": 0.72,
463
+ "grad_norm": 2.375,
464
+ "learning_rate": 3.1008683647302113e-06,
465
+ "log_odds_chosen": 0.643774688243866,
466
+ "log_odds_ratio": -0.5065270066261292,
467
+ "logits/chosen": 3.066134452819824,
468
+ "logits/rejected": 5.140274524688721,
469
+ "logps/chosen": -0.6760501861572266,
470
+ "logps/rejected": -1.0565364360809326,
471
+ "loss": 0.7083,
472
+ "nll_loss": 0.5696229934692383,
473
+ "rewards/accuracies": 0.800000011920929,
474
+ "rewards/chosen": -0.03380250930786133,
475
+ "rewards/margins": 0.019024310633540154,
476
+ "rewards/rejected": -0.05282682180404663,
477
+ "step": 260
478
+ },
479
+ {
480
+ "epoch": 0.75,
481
+ "grad_norm": 1.921875,
482
+ "learning_rate": 3.0429030972509227e-06,
483
+ "log_odds_chosen": 0.11027543246746063,
484
+ "log_odds_ratio": -0.7136895656585693,
485
+ "logits/chosen": 3.8440871238708496,
486
+ "logits/rejected": 5.5886664390563965,
487
+ "logps/chosen": -0.9002073407173157,
488
+ "logps/rejected": -0.8725664019584656,
489
+ "loss": 0.6887,
490
+ "nll_loss": 0.6460383534431458,
491
+ "rewards/accuracies": 0.6000000238418579,
492
+ "rewards/chosen": -0.045010361820459366,
493
+ "rewards/margins": -0.0013820428866893053,
494
+ "rewards/rejected": -0.04362832009792328,
495
+ "step": 270
496
+ },
497
+ {
498
+ "epoch": 0.78,
499
+ "grad_norm": 2.4375,
500
+ "learning_rate": 2.988071523335984e-06,
501
+ "log_odds_chosen": 0.19881311058998108,
502
+ "log_odds_ratio": -0.645098865032196,
503
+ "logits/chosen": 3.4119772911071777,
504
+ "logits/rejected": 3.858569622039795,
505
+ "logps/chosen": -0.6236559748649597,
506
+ "logps/rejected": -0.6720820665359497,
507
+ "loss": 0.6831,
508
+ "nll_loss": 0.5412070155143738,
509
+ "rewards/accuracies": 0.699999988079071,
510
+ "rewards/chosen": -0.031182799488306046,
511
+ "rewards/margins": 0.0024213031865656376,
512
+ "rewards/rejected": -0.033604104071855545,
513
+ "step": 280
514
+ },
515
+ {
516
+ "epoch": 0.8,
517
+ "grad_norm": 1.796875,
518
+ "learning_rate": 2.9361010975735177e-06,
519
+ "log_odds_chosen": 0.8000429272651672,
520
+ "log_odds_ratio": -0.5133927464485168,
521
+ "logits/chosen": 3.1116268634796143,
522
+ "logits/rejected": 4.1175031661987305,
523
+ "logps/chosen": -0.6604128479957581,
524
+ "logps/rejected": -0.9541538953781128,
525
+ "loss": 0.7189,
526
+ "nll_loss": 0.6099768877029419,
527
+ "rewards/accuracies": 0.699999988079071,
528
+ "rewards/chosen": -0.033020637929439545,
529
+ "rewards/margins": 0.014687063172459602,
530
+ "rewards/rejected": -0.047707699239254,
531
+ "step": 290
532
+ },
533
+ {
534
+ "epoch": 0.83,
535
+ "grad_norm": 2.140625,
536
+ "learning_rate": 2.8867513459481293e-06,
537
+ "log_odds_chosen": 0.27028822898864746,
538
+ "log_odds_ratio": -0.5845714807510376,
539
+ "logits/chosen": 4.801875114440918,
540
+ "logits/rejected": 4.0431060791015625,
541
+ "logps/chosen": -0.9752612113952637,
542
+ "logps/rejected": -1.1325619220733643,
543
+ "loss": 0.7067,
544
+ "nll_loss": 0.8859332799911499,
545
+ "rewards/accuracies": 0.699999988079071,
546
+ "rewards/chosen": -0.048763059079647064,
547
+ "rewards/margins": 0.007865036837756634,
548
+ "rewards/rejected": -0.05662809684872627,
549
+ "step": 300
550
+ },
551
+ {
552
+ "epoch": 0.86,
553
+ "grad_norm": 2.203125,
554
+ "learning_rate": 2.839809171235324e-06,
555
+ "log_odds_chosen": 0.3040723502635956,
556
+ "log_odds_ratio": -0.5971912145614624,
557
+ "logits/chosen": 3.540631055831909,
558
+ "logits/rejected": 4.819446563720703,
559
+ "logps/chosen": -0.618737518787384,
560
+ "logps/rejected": -0.8253359794616699,
561
+ "loss": 0.7267,
562
+ "nll_loss": 0.5355499982833862,
563
+ "rewards/accuracies": 0.6000000238418579,
564
+ "rewards/chosen": -0.030936872586607933,
565
+ "rewards/margins": 0.01032992172986269,
566
+ "rewards/rejected": -0.0412667952477932,
567
+ "step": 310
568
+ },
569
+ {
570
+ "epoch": 0.89,
571
+ "grad_norm": 2.09375,
572
+ "learning_rate": 2.7950849718747376e-06,
573
+ "log_odds_chosen": 0.4747926592826843,
574
+ "log_odds_ratio": -0.6426443457603455,
575
+ "logits/chosen": 4.275177955627441,
576
+ "logits/rejected": 5.392439365386963,
577
+ "logps/chosen": -0.9479950070381165,
578
+ "logps/rejected": -1.074181318283081,
579
+ "loss": 0.7026,
580
+ "nll_loss": 0.81135094165802,
581
+ "rewards/accuracies": 0.699999988079071,
582
+ "rewards/chosen": -0.047399748116731644,
583
+ "rewards/margins": 0.006309316959232092,
584
+ "rewards/rejected": -0.053709059953689575,
585
+ "step": 320
586
+ },
587
+ {
588
+ "epoch": 0.91,
589
+ "grad_norm": 2.046875,
590
+ "learning_rate": 2.752409412815902e-06,
591
+ "log_odds_chosen": 0.05597902089357376,
592
+ "log_odds_ratio": -0.6852544546127319,
593
+ "logits/chosen": 4.019442558288574,
594
+ "logits/rejected": 4.838677406311035,
595
+ "logps/chosen": -0.8511530756950378,
596
+ "logps/rejected": -0.8828445672988892,
597
+ "loss": 0.7044,
598
+ "nll_loss": 0.694568932056427,
599
+ "rewards/accuracies": 0.5,
600
+ "rewards/chosen": -0.04255765676498413,
601
+ "rewards/margins": 0.001584571204148233,
602
+ "rewards/rejected": -0.0441422276198864,
603
+ "step": 330
604
+ },
605
+ {
606
+ "epoch": 0.94,
607
+ "grad_norm": 2.078125,
608
+ "learning_rate": 2.711630722733202e-06,
609
+ "log_odds_chosen": 0.4387446343898773,
610
+ "log_odds_ratio": -0.5828881859779358,
611
+ "logits/chosen": 5.145287990570068,
612
+ "logits/rejected": 4.682337760925293,
613
+ "logps/chosen": -0.8366853594779968,
614
+ "logps/rejected": -1.0714147090911865,
615
+ "loss": 0.7106,
616
+ "nll_loss": 0.7823752164840698,
617
+ "rewards/accuracies": 0.6000000238418579,
618
+ "rewards/chosen": -0.0418342649936676,
619
+ "rewards/margins": 0.01173648051917553,
620
+ "rewards/rejected": -0.05357074737548828,
621
+ "step": 340
622
+ },
623
+ {
624
+ "epoch": 0.97,
625
+ "grad_norm": 1.640625,
626
+ "learning_rate": 2.6726124191242444e-06,
627
+ "log_odds_chosen": 0.646438717842102,
628
+ "log_odds_ratio": -0.4320458471775055,
629
+ "logits/chosen": 3.6300628185272217,
630
+ "logits/rejected": 4.811415195465088,
631
+ "logps/chosen": -0.697738528251648,
632
+ "logps/rejected": -1.0509469509124756,
633
+ "loss": 0.7094,
634
+ "nll_loss": 0.6207277178764343,
635
+ "rewards/accuracies": 1.0,
636
+ "rewards/chosen": -0.0348869264125824,
637
+ "rewards/margins": 0.017660418525338173,
638
+ "rewards/rejected": -0.05254734680056572,
639
+ "step": 350
640
+ },
641
+ {
642
+ "epoch": 1.0,
643
+ "grad_norm": 1.859375,
644
+ "learning_rate": 2.6352313834736496e-06,
645
+ "log_odds_chosen": 0.2544732391834259,
646
+ "log_odds_ratio": -0.6412532925605774,
647
+ "logits/chosen": 3.9895572662353516,
648
+ "logits/rejected": 4.779620170593262,
649
+ "logps/chosen": -0.9361473321914673,
650
+ "logps/rejected": -1.1174229383468628,
651
+ "loss": 0.7128,
652
+ "nll_loss": 0.7654691934585571,
653
+ "rewards/accuracies": 0.699999988079071,
654
+ "rewards/chosen": -0.04680735990405083,
655
+ "rewards/margins": 0.009063784033060074,
656
+ "rewards/rejected": -0.0558711476624012,
657
+ "step": 360
658
+ },
659
+ {
660
+ "epoch": 1.02,
661
+ "grad_norm": 2.0,
662
+ "learning_rate": 2.599376224550182e-06,
663
+ "log_odds_chosen": 0.40497034788131714,
664
+ "log_odds_ratio": -0.5424339175224304,
665
+ "logits/chosen": 3.386862277984619,
666
+ "logits/rejected": 4.2417168617248535,
667
+ "logps/chosen": -0.58955979347229,
668
+ "logps/rejected": -0.7452008128166199,
669
+ "loss": 0.6815,
670
+ "nll_loss": 0.5987212061882019,
671
+ "rewards/accuracies": 0.800000011920929,
672
+ "rewards/chosen": -0.029477989301085472,
673
+ "rewards/margins": 0.007782056927680969,
674
+ "rewards/rejected": -0.03726004809141159,
675
+ "step": 370
676
+ },
677
+ {
678
+ "epoch": 1.05,
679
+ "grad_norm": 1.796875,
680
+ "learning_rate": 2.564945880212886e-06,
681
+ "log_odds_chosen": -0.028061389923095703,
682
+ "log_odds_ratio": -0.8072420954704285,
683
+ "logits/chosen": 3.5768370628356934,
684
+ "logits/rejected": 5.4233012199401855,
685
+ "logps/chosen": -0.7529481053352356,
686
+ "logps/rejected": -0.8536372184753418,
687
+ "loss": 0.6656,
688
+ "nll_loss": 0.5306578278541565,
689
+ "rewards/accuracies": 0.6000000238418579,
690
+ "rewards/chosen": -0.03764740750193596,
691
+ "rewards/margins": 0.005034452769905329,
692
+ "rewards/rejected": -0.04268186166882515,
693
+ "step": 380
694
+ },
695
+ {
696
+ "epoch": 1.08,
697
+ "grad_norm": 2.046875,
698
+ "learning_rate": 2.5318484177091667e-06,
699
+ "log_odds_chosen": 0.4068288803100586,
700
+ "log_odds_ratio": -0.5929966568946838,
701
+ "logits/chosen": 3.2761809825897217,
702
+ "logits/rejected": 4.4970245361328125,
703
+ "logps/chosen": -0.6948614120483398,
704
+ "logps/rejected": -0.865265965461731,
705
+ "loss": 0.6794,
706
+ "nll_loss": 0.6040965914726257,
707
+ "rewards/accuracies": 0.5,
708
+ "rewards/chosen": -0.03474307060241699,
709
+ "rewards/margins": 0.008520226925611496,
710
+ "rewards/rejected": -0.04326330125331879,
711
+ "step": 390
712
+ },
713
+ {
714
+ "epoch": 1.11,
715
+ "grad_norm": 2.015625,
716
+ "learning_rate": 2.5e-06,
717
+ "log_odds_chosen": 0.1341879665851593,
718
+ "log_odds_ratio": -0.6464511752128601,
719
+ "logits/chosen": 3.4316935539245605,
720
+ "logits/rejected": 3.2954697608947754,
721
+ "logps/chosen": -0.8758536577224731,
722
+ "logps/rejected": -0.933131992816925,
723
+ "loss": 0.6744,
724
+ "nll_loss": 0.7761722803115845,
725
+ "rewards/accuracies": 0.699999988079071,
726
+ "rewards/chosen": -0.04379267990589142,
727
+ "rewards/margins": 0.002863916801288724,
728
+ "rewards/rejected": -0.046656593680381775,
729
+ "step": 400
730
+ },
731
+ {
732
+ "epoch": 1.14,
733
+ "grad_norm": 2.15625,
734
+ "learning_rate": 2.4693239916239746e-06,
735
+ "log_odds_chosen": 0.2867754399776459,
736
+ "log_odds_ratio": -0.5857530236244202,
737
+ "logits/chosen": 3.6360816955566406,
738
+ "logits/rejected": 3.9970450401306152,
739
+ "logps/chosen": -0.7139376997947693,
740
+ "logps/rejected": -0.8725380897521973,
741
+ "loss": 0.6869,
742
+ "nll_loss": 0.6628342866897583,
743
+ "rewards/accuracies": 0.699999988079071,
744
+ "rewards/chosen": -0.035696886479854584,
745
+ "rewards/margins": 0.007930012419819832,
746
+ "rewards/rejected": -0.043626900762319565,
747
+ "step": 410
748
+ },
749
+ {
750
+ "epoch": 1.16,
751
+ "grad_norm": 2.15625,
752
+ "learning_rate": 2.4397501823713327e-06,
753
+ "log_odds_chosen": 0.4246703088283539,
754
+ "log_odds_ratio": -0.5430588722229004,
755
+ "logits/chosen": 3.829016923904419,
756
+ "logits/rejected": 4.135481834411621,
757
+ "logps/chosen": -0.7444478869438171,
758
+ "logps/rejected": -0.9897812008857727,
759
+ "loss": 0.6709,
760
+ "nll_loss": 0.6274694204330444,
761
+ "rewards/accuracies": 0.800000011920929,
762
+ "rewards/chosen": -0.037222400307655334,
763
+ "rewards/margins": 0.012266661040484905,
764
+ "rewards/rejected": -0.049489058554172516,
765
+ "step": 420
766
+ },
767
+ {
768
+ "epoch": 1.19,
769
+ "grad_norm": 2.140625,
770
+ "learning_rate": 2.411214110852061e-06,
771
+ "log_odds_chosen": 0.25163400173187256,
772
+ "log_odds_ratio": -0.6242859363555908,
773
+ "logits/chosen": 3.951763868331909,
774
+ "logits/rejected": 4.234454154968262,
775
+ "logps/chosen": -0.7341341376304626,
776
+ "logps/rejected": -0.9003491401672363,
777
+ "loss": 0.6944,
778
+ "nll_loss": 0.7275547385215759,
779
+ "rewards/accuracies": 0.4000000059604645,
780
+ "rewards/chosen": -0.03670670837163925,
781
+ "rewards/margins": 0.008310755714774132,
782
+ "rewards/rejected": -0.045017458498477936,
783
+ "step": 430
784
+ },
785
+ {
786
+ "epoch": 1.22,
787
+ "grad_norm": 1.8203125,
788
+ "learning_rate": 2.3836564731139807e-06,
789
+ "log_odds_chosen": 1.0159021615982056,
790
+ "log_odds_ratio": -0.37116050720214844,
791
+ "logits/chosen": 4.0213727951049805,
792
+ "logits/rejected": 3.6271164417266846,
793
+ "logps/chosen": -0.5240938067436218,
794
+ "logps/rejected": -0.8907585144042969,
795
+ "loss": 0.6649,
796
+ "nll_loss": 0.6113199591636658,
797
+ "rewards/accuracies": 0.8999999761581421,
798
+ "rewards/chosen": -0.02620469406247139,
799
+ "rewards/margins": 0.018333233892917633,
800
+ "rewards/rejected": -0.04453792795538902,
801
+ "step": 440
802
+ },
803
+ {
804
+ "epoch": 1.25,
805
+ "grad_norm": 1.9765625,
806
+ "learning_rate": 2.357022603955159e-06,
807
+ "log_odds_chosen": 0.6340302228927612,
808
+ "log_odds_ratio": -0.5088221430778503,
809
+ "logits/chosen": 3.0994677543640137,
810
+ "logits/rejected": 3.617880344390869,
811
+ "logps/chosen": -0.6344050168991089,
812
+ "logps/rejected": -0.9070758819580078,
813
+ "loss": 0.6749,
814
+ "nll_loss": 0.6057204604148865,
815
+ "rewards/accuracies": 0.800000011920929,
816
+ "rewards/chosen": -0.03172025829553604,
817
+ "rewards/margins": 0.013633537106215954,
818
+ "rewards/rejected": -0.04535379260778427,
819
+ "step": 450
820
+ },
821
+ {
822
+ "epoch": 1.27,
823
+ "grad_norm": 2.375,
824
+ "learning_rate": 2.3312620206007847e-06,
825
+ "log_odds_chosen": 0.713164746761322,
826
+ "log_odds_ratio": -0.5116228461265564,
827
+ "logits/chosen": 3.274913787841797,
828
+ "logits/rejected": 3.9031119346618652,
829
+ "logps/chosen": -0.7013699412345886,
830
+ "logps/rejected": -0.9506866335868835,
831
+ "loss": 0.689,
832
+ "nll_loss": 0.6167888045310974,
833
+ "rewards/accuracies": 0.800000011920929,
834
+ "rewards/chosen": -0.03506850078701973,
835
+ "rewards/margins": 0.012465836480259895,
836
+ "rewards/rejected": -0.047534335404634476,
837
+ "step": 460
838
+ },
839
+ {
840
+ "epoch": 1.3,
841
+ "grad_norm": 1.859375,
842
+ "learning_rate": 2.3063280200722128e-06,
843
+ "log_odds_chosen": 0.276517391204834,
844
+ "log_odds_ratio": -0.6000176668167114,
845
+ "logits/chosen": 4.087462425231934,
846
+ "logits/rejected": 4.489025592803955,
847
+ "logps/chosen": -0.764947772026062,
848
+ "logps/rejected": -0.9075348973274231,
849
+ "loss": 0.7095,
850
+ "nll_loss": 0.673345685005188,
851
+ "rewards/accuracies": 0.699999988079071,
852
+ "rewards/chosen": -0.03824738413095474,
853
+ "rewards/margins": 0.007129358593374491,
854
+ "rewards/rejected": -0.045376747846603394,
855
+ "step": 470
856
+ },
857
+ {
858
+ "epoch": 1.33,
859
+ "grad_norm": 2.03125,
860
+ "learning_rate": 2.2821773229381924e-06,
861
+ "log_odds_chosen": 1.0662429332733154,
862
+ "log_odds_ratio": -0.34629103541374207,
863
+ "logits/chosen": 2.857253313064575,
864
+ "logits/rejected": 4.2227373123168945,
865
+ "logps/chosen": -0.6380993127822876,
866
+ "logps/rejected": -1.1461809873580933,
867
+ "loss": 0.6967,
868
+ "nll_loss": 0.6230236291885376,
869
+ "rewards/accuracies": 1.0,
870
+ "rewards/chosen": -0.03190496563911438,
871
+ "rewards/margins": 0.02540409006178379,
872
+ "rewards/rejected": -0.05730905383825302,
873
+ "step": 480
874
+ },
875
+ {
876
+ "epoch": 1.36,
877
+ "grad_norm": 1.8515625,
878
+ "learning_rate": 2.2587697572631284e-06,
879
+ "log_odds_chosen": 0.05213805288076401,
880
+ "log_odds_ratio": -0.7282466888427734,
881
+ "logits/chosen": 3.712087631225586,
882
+ "logits/rejected": 3.26226806640625,
883
+ "logps/chosen": -0.8317926526069641,
884
+ "logps/rejected": -0.8364424705505371,
885
+ "loss": 0.6909,
886
+ "nll_loss": 0.7362145185470581,
887
+ "rewards/accuracies": 0.5,
888
+ "rewards/chosen": -0.041589636355638504,
889
+ "rewards/margins": 0.00023248679644893855,
890
+ "rewards/rejected": -0.041822124272584915,
891
+ "step": 490
892
+ },
893
+ {
894
+ "epoch": 1.39,
895
+ "grad_norm": 1.9921875,
896
+ "learning_rate": 2.23606797749979e-06,
897
+ "log_odds_chosen": 0.6301153302192688,
898
+ "log_odds_ratio": -0.4543212354183197,
899
+ "logits/chosen": 3.4883971214294434,
900
+ "logits/rejected": 4.507528781890869,
901
+ "logps/chosen": -0.8292129635810852,
902
+ "logps/rejected": -1.210298776626587,
903
+ "loss": 0.6785,
904
+ "nll_loss": 0.6745945811271667,
905
+ "rewards/accuracies": 1.0,
906
+ "rewards/chosen": -0.04146064817905426,
907
+ "rewards/margins": 0.019054297357797623,
908
+ "rewards/rejected": -0.06051494926214218,
909
+ "step": 500
910
+ },
911
+ {
912
+ "epoch": 1.41,
913
+ "grad_norm": 1.75,
914
+ "learning_rate": 2.2140372138502386e-06,
915
+ "log_odds_chosen": -0.06262560188770294,
916
+ "log_odds_ratio": -0.8653733134269714,
917
+ "logits/chosen": 3.5743117332458496,
918
+ "logits/rejected": 3.30938720703125,
919
+ "logps/chosen": -0.7883298993110657,
920
+ "logps/rejected": -0.8334089517593384,
921
+ "loss": 0.6784,
922
+ "nll_loss": 0.7287472486495972,
923
+ "rewards/accuracies": 0.6000000238418579,
924
+ "rewards/chosen": -0.03941649571061134,
925
+ "rewards/margins": 0.002253949176520109,
926
+ "rewards/rejected": -0.04167044535279274,
927
+ "step": 510
928
+ },
929
+ {
930
+ "epoch": 1.44,
931
+ "grad_norm": 2.453125,
932
+ "learning_rate": 2.1926450482675734e-06,
933
+ "log_odds_chosen": 0.7889357209205627,
934
+ "log_odds_ratio": -0.5113512277603149,
935
+ "logits/chosen": 3.6222686767578125,
936
+ "logits/rejected": 4.588388919830322,
937
+ "logps/chosen": -0.6055595278739929,
938
+ "logps/rejected": -0.8102161288261414,
939
+ "loss": 0.6911,
940
+ "nll_loss": 0.7365372180938721,
941
+ "rewards/accuracies": 0.699999988079071,
942
+ "rewards/chosen": -0.030277978628873825,
943
+ "rewards/margins": 0.01023282390087843,
944
+ "rewards/rejected": -0.04051080346107483,
945
+ "step": 520
946
+ },
947
+ {
948
+ "epoch": 1.47,
949
+ "grad_norm": 1.671875,
950
+ "learning_rate": 2.1718612138153473e-06,
951
+ "log_odds_chosen": 0.4299934506416321,
952
+ "log_odds_ratio": -0.5557712316513062,
953
+ "logits/chosen": 4.014830112457275,
954
+ "logits/rejected": 5.017902374267578,
955
+ "logps/chosen": -0.7874509692192078,
956
+ "logps/rejected": -1.018026351928711,
957
+ "loss": 0.667,
958
+ "nll_loss": 0.6355669498443604,
959
+ "rewards/accuracies": 0.800000011920929,
960
+ "rewards/chosen": -0.03937254846096039,
961
+ "rewards/margins": 0.011528769508004189,
962
+ "rewards/rejected": -0.05090131610631943,
963
+ "step": 530
964
+ },
965
+ {
966
+ "epoch": 1.5,
967
+ "grad_norm": 2.296875,
968
+ "learning_rate": 2.151657414559676e-06,
969
+ "log_odds_chosen": 0.2829347550868988,
970
+ "log_odds_ratio": -0.6682294607162476,
971
+ "logits/chosen": 3.943626880645752,
972
+ "logits/rejected": 4.587673187255859,
973
+ "logps/chosen": -0.6128166913986206,
974
+ "logps/rejected": -0.7399159073829651,
975
+ "loss": 0.6921,
976
+ "nll_loss": 0.5586395263671875,
977
+ "rewards/accuracies": 0.5,
978
+ "rewards/chosen": -0.03064083494246006,
979
+ "rewards/margins": 0.006354962475597858,
980
+ "rewards/rejected": -0.03699579834938049,
981
+ "step": 540
982
+ },
983
+ {
984
+ "epoch": 1.52,
985
+ "grad_norm": 2.34375,
986
+ "learning_rate": 2.132007163556104e-06,
987
+ "log_odds_chosen": 0.12430952489376068,
988
+ "log_odds_ratio": -0.691026508808136,
989
+ "logits/chosen": 4.236235618591309,
990
+ "logits/rejected": 4.258731842041016,
991
+ "logps/chosen": -0.8816810846328735,
992
+ "logps/rejected": -0.8798414468765259,
993
+ "loss": 0.6621,
994
+ "nll_loss": 0.7372727394104004,
995
+ "rewards/accuracies": 0.6000000238418579,
996
+ "rewards/chosen": -0.044084057211875916,
997
+ "rewards/margins": -9.19781596167013e-05,
998
+ "rewards/rejected": -0.04399207606911659,
999
+ "step": 550
1000
+ },
1001
+ {
1002
+ "epoch": 1.55,
1003
+ "grad_norm": 2.0625,
1004
+ "learning_rate": 2.1128856368212917e-06,
1005
+ "log_odds_chosen": 0.45417746901512146,
1006
+ "log_odds_ratio": -0.6252912878990173,
1007
+ "logits/chosen": 3.6231613159179688,
1008
+ "logits/rejected": 4.052757263183594,
1009
+ "logps/chosen": -0.8618866801261902,
1010
+ "logps/rejected": -1.0115950107574463,
1011
+ "loss": 0.6866,
1012
+ "nll_loss": 0.7204962372779846,
1013
+ "rewards/accuracies": 0.6000000238418579,
1014
+ "rewards/chosen": -0.04309433326125145,
1015
+ "rewards/margins": 0.007485417183488607,
1016
+ "rewards/rejected": -0.05057975649833679,
1017
+ "step": 560
1018
+ },
1019
+ {
1020
+ "epoch": 1.58,
1021
+ "grad_norm": 1.5078125,
1022
+ "learning_rate": 2.0942695414584777e-06,
1023
+ "log_odds_chosen": 0.2981715500354767,
1024
+ "log_odds_ratio": -0.6889346837997437,
1025
+ "logits/chosen": 4.225995063781738,
1026
+ "logits/rejected": 5.418367862701416,
1027
+ "logps/chosen": -0.8141951560974121,
1028
+ "logps/rejected": -0.9664665460586548,
1029
+ "loss": 0.682,
1030
+ "nll_loss": 0.7680364847183228,
1031
+ "rewards/accuracies": 0.699999988079071,
1032
+ "rewards/chosen": -0.040709760040044785,
1033
+ "rewards/margins": 0.007613567169755697,
1034
+ "rewards/rejected": -0.04832332953810692,
1035
+ "step": 570
1036
+ },
1037
+ {
1038
+ "epoch": 1.61,
1039
+ "grad_norm": 2.078125,
1040
+ "learning_rate": 2.0761369963434992e-06,
1041
+ "log_odds_chosen": 0.09669248759746552,
1042
+ "log_odds_ratio": -0.6933160424232483,
1043
+ "logits/chosen": 4.034695625305176,
1044
+ "logits/rejected": 3.9514598846435547,
1045
+ "logps/chosen": -0.7859554886817932,
1046
+ "logps/rejected": -0.8040157556533813,
1047
+ "loss": 0.6702,
1048
+ "nll_loss": 0.6275065541267395,
1049
+ "rewards/accuracies": 0.6000000238418579,
1050
+ "rewards/chosen": -0.03929777070879936,
1051
+ "rewards/margins": 0.0009030139190144837,
1052
+ "rewards/rejected": -0.04020078480243683,
1053
+ "step": 580
1054
+ },
1055
+ {
1056
+ "epoch": 1.63,
1057
+ "grad_norm": 2.046875,
1058
+ "learning_rate": 2.058467423981546e-06,
1059
+ "log_odds_chosen": 0.7078760266304016,
1060
+ "log_odds_ratio": -0.41859808564186096,
1061
+ "logits/chosen": 4.595616817474365,
1062
+ "logits/rejected": 5.63283634185791,
1063
+ "logps/chosen": -0.7666529417037964,
1064
+ "logps/rejected": -1.1932477951049805,
1065
+ "loss": 0.6744,
1066
+ "nll_loss": 0.6559887528419495,
1067
+ "rewards/accuracies": 1.0,
1068
+ "rewards/chosen": -0.03833264485001564,
1069
+ "rewards/margins": 0.021329741925001144,
1070
+ "rewards/rejected": -0.059662383049726486,
1071
+ "step": 590
1072
+ },
1073
+ {
1074
+ "epoch": 1.66,
1075
+ "grad_norm": 1.984375,
1076
+ "learning_rate": 2.0412414523193154e-06,
1077
+ "log_odds_chosen": 0.46980589628219604,
1078
+ "log_odds_ratio": -0.507270336151123,
1079
+ "logits/chosen": 4.159191131591797,
1080
+ "logits/rejected": 4.8198137283325195,
1081
+ "logps/chosen": -0.6551539897918701,
1082
+ "logps/rejected": -0.8997269868850708,
1083
+ "loss": 0.6999,
1084
+ "nll_loss": 0.5972849726676941,
1085
+ "rewards/accuracies": 0.8999999761581421,
1086
+ "rewards/chosen": -0.032757699489593506,
1087
+ "rewards/margins": 0.012228650972247124,
1088
+ "rewards/rejected": -0.04498635232448578,
1089
+ "step": 600
1090
+ },
1091
+ {
1092
+ "epoch": 1.69,
1093
+ "grad_norm": 1.828125,
1094
+ "learning_rate": 2.0244408254472904e-06,
1095
+ "log_odds_chosen": 0.8159920573234558,
1096
+ "log_odds_ratio": -0.44673436880111694,
1097
+ "logits/chosen": 1.4881597757339478,
1098
+ "logits/rejected": 3.2742371559143066,
1099
+ "logps/chosen": -0.5382428169250488,
1100
+ "logps/rejected": -0.8298252820968628,
1101
+ "loss": 0.6778,
1102
+ "nll_loss": 0.47871965169906616,
1103
+ "rewards/accuracies": 0.699999988079071,
1104
+ "rewards/chosen": -0.0269121415913105,
1105
+ "rewards/margins": 0.014579126611351967,
1106
+ "rewards/rejected": -0.04149126634001732,
1107
+ "step": 610
1108
+ },
1109
+ {
1110
+ "epoch": 1.72,
1111
+ "grad_norm": 2.0625,
1112
+ "learning_rate": 2.0080483222562476e-06,
1113
+ "log_odds_chosen": 0.4831489026546478,
1114
+ "log_odds_ratio": -0.6106674075126648,
1115
+ "logits/chosen": 2.6716971397399902,
1116
+ "logits/rejected": 4.190335273742676,
1117
+ "logps/chosen": -0.7028285264968872,
1118
+ "logps/rejected": -0.8107720613479614,
1119
+ "loss": 0.7,
1120
+ "nll_loss": 0.6031611561775208,
1121
+ "rewards/accuracies": 0.699999988079071,
1122
+ "rewards/chosen": -0.03514142706990242,
1123
+ "rewards/margins": 0.005397176835685968,
1124
+ "rewards/rejected": -0.04053860157728195,
1125
+ "step": 620
1126
+ },
1127
+ {
1128
+ "epoch": 1.75,
1129
+ "grad_norm": 2.390625,
1130
+ "learning_rate": 1.9920476822239895e-06,
1131
+ "log_odds_chosen": 1.1681692600250244,
1132
+ "log_odds_ratio": -0.42384225130081177,
1133
+ "logits/chosen": 1.9299933910369873,
1134
+ "logits/rejected": 4.495340824127197,
1135
+ "logps/chosen": -0.6006202697753906,
1136
+ "logps/rejected": -0.8768091201782227,
1137
+ "loss": 0.6826,
1138
+ "nll_loss": 0.5617047548294067,
1139
+ "rewards/accuracies": 0.699999988079071,
1140
+ "rewards/chosen": -0.03003101423382759,
1141
+ "rewards/margins": 0.013809445314109325,
1142
+ "rewards/rejected": -0.04384046047925949,
1143
+ "step": 630
1144
+ },
1145
+ {
1146
+ "epoch": 1.77,
1147
+ "grad_norm": 1.9921875,
1148
+ "learning_rate": 1.976423537605237e-06,
1149
+ "log_odds_chosen": 0.20114317536354065,
1150
+ "log_odds_ratio": -0.6850827932357788,
1151
+ "logits/chosen": 3.9372570514678955,
1152
+ "logits/rejected": 4.620089530944824,
1153
+ "logps/chosen": -0.7602877616882324,
1154
+ "logps/rejected": -0.798611581325531,
1155
+ "loss": 0.693,
1156
+ "nll_loss": 0.6903423070907593,
1157
+ "rewards/accuracies": 0.699999988079071,
1158
+ "rewards/chosen": -0.03801439329981804,
1159
+ "rewards/margins": 0.0019161909585818648,
1160
+ "rewards/rejected": -0.03993058204650879,
1161
+ "step": 640
1162
+ },
1163
+ {
1164
+ "epoch": 1.8,
1165
+ "grad_norm": 2.15625,
1166
+ "learning_rate": 1.961161351381841e-06,
1167
+ "log_odds_chosen": 0.7238461971282959,
1168
+ "log_odds_ratio": -0.49284863471984863,
1169
+ "logits/chosen": 3.1412081718444824,
1170
+ "logits/rejected": 4.667212963104248,
1171
+ "logps/chosen": -0.7989972829818726,
1172
+ "logps/rejected": -1.1198698282241821,
1173
+ "loss": 0.6811,
1174
+ "nll_loss": 0.6139580607414246,
1175
+ "rewards/accuracies": 0.800000011920929,
1176
+ "rewards/chosen": -0.03994986042380333,
1177
+ "rewards/margins": 0.01604362577199936,
1178
+ "rewards/rejected": -0.05599348992109299,
1179
+ "step": 650
1180
+ },
1181
+ {
1182
+ "epoch": 1.83,
1183
+ "grad_norm": 2.0625,
1184
+ "learning_rate": 1.9462473604038077e-06,
1185
+ "log_odds_chosen": 0.8220060467720032,
1186
+ "log_odds_ratio": -0.3908359408378601,
1187
+ "logits/chosen": 3.9316563606262207,
1188
+ "logits/rejected": 5.108726501464844,
1189
+ "logps/chosen": -0.6555695533752441,
1190
+ "logps/rejected": -1.0596764087677002,
1191
+ "loss": 0.6864,
1192
+ "nll_loss": 0.6063144207000732,
1193
+ "rewards/accuracies": 1.0,
1194
+ "rewards/chosen": -0.032778479158878326,
1195
+ "rewards/margins": 0.020205341279506683,
1196
+ "rewards/rejected": -0.05298382043838501,
1197
+ "step": 660
1198
+ },
1199
+ {
1200
+ "epoch": 1.86,
1201
+ "grad_norm": 2.015625,
1202
+ "learning_rate": 1.9316685232156397e-06,
1203
+ "log_odds_chosen": 0.2806245982646942,
1204
+ "log_odds_ratio": -0.6073696613311768,
1205
+ "logits/chosen": 3.3105571269989014,
1206
+ "logits/rejected": 3.892564058303833,
1207
+ "logps/chosen": -0.7684590816497803,
1208
+ "logps/rejected": -0.9330514073371887,
1209
+ "loss": 0.6861,
1210
+ "nll_loss": 0.6917561888694763,
1211
+ "rewards/accuracies": 0.800000011920929,
1212
+ "rewards/chosen": -0.038422949612140656,
1213
+ "rewards/margins": 0.008229617960751057,
1214
+ "rewards/rejected": -0.046652574092149734,
1215
+ "step": 670
1216
+ },
1217
+ {
1218
+ "epoch": 1.88,
1219
+ "grad_norm": 2.234375,
1220
+ "learning_rate": 1.917412472118426e-06,
1221
+ "log_odds_chosen": 0.7242671251296997,
1222
+ "log_odds_ratio": -0.5751112699508667,
1223
+ "logits/chosen": 3.4888782501220703,
1224
+ "logits/rejected": 3.981926679611206,
1225
+ "logps/chosen": -0.7546848654747009,
1226
+ "logps/rejected": -0.9648480415344238,
1227
+ "loss": 0.708,
1228
+ "nll_loss": 0.7166061401367188,
1229
+ "rewards/accuracies": 0.699999988079071,
1230
+ "rewards/chosen": -0.03773424029350281,
1231
+ "rewards/margins": 0.010508162900805473,
1232
+ "rewards/rejected": -0.04824240505695343,
1233
+ "step": 680
1234
+ },
1235
+ {
1236
+ "epoch": 1.91,
1237
+ "grad_norm": 1.7265625,
1238
+ "learning_rate": 1.9034674690672024e-06,
1239
+ "log_odds_chosen": 0.007441395428031683,
1240
+ "log_odds_ratio": -0.8017139434814453,
1241
+ "logits/chosen": 4.016451358795166,
1242
+ "logits/rejected": 4.500775337219238,
1243
+ "logps/chosen": -0.6954795718193054,
1244
+ "logps/rejected": -0.7557869553565979,
1245
+ "loss": 0.6758,
1246
+ "nll_loss": 0.5994603633880615,
1247
+ "rewards/accuracies": 0.5,
1248
+ "rewards/chosen": -0.03477397933602333,
1249
+ "rewards/margins": 0.0030153680127114058,
1250
+ "rewards/rejected": -0.037789348512887955,
1251
+ "step": 690
1252
+ },
1253
+ {
1254
+ "epoch": 1.94,
1255
+ "grad_norm": 2.109375,
1256
+ "learning_rate": 1.8898223650461362e-06,
1257
+ "log_odds_chosen": 0.8856722712516785,
1258
+ "log_odds_ratio": -0.3986029624938965,
1259
+ "logits/chosen": 1.7587060928344727,
1260
+ "logits/rejected": 4.666166305541992,
1261
+ "logps/chosen": -0.550857424736023,
1262
+ "logps/rejected": -0.8934911489486694,
1263
+ "loss": 0.6606,
1264
+ "nll_loss": 0.4851749539375305,
1265
+ "rewards/accuracies": 0.8999999761581421,
1266
+ "rewards/chosen": -0.027542870491743088,
1267
+ "rewards/margins": 0.017131686210632324,
1268
+ "rewards/rejected": -0.044674552977085114,
1269
+ "step": 700
1270
+ },
1271
+ {
1272
+ "epoch": 1.97,
1273
+ "grad_norm": 1.7265625,
1274
+ "learning_rate": 1.876466562602004e-06,
1275
+ "log_odds_chosen": 0.24165010452270508,
1276
+ "log_odds_ratio": -0.6184555292129517,
1277
+ "logits/chosen": 2.516463279724121,
1278
+ "logits/rejected": 3.698896884918213,
1279
+ "logps/chosen": -0.5866281390190125,
1280
+ "logps/rejected": -0.6856867074966431,
1281
+ "loss": 0.7084,
1282
+ "nll_loss": 0.6121121644973755,
1283
+ "rewards/accuracies": 0.6000000238418579,
1284
+ "rewards/chosen": -0.029331404715776443,
1285
+ "rewards/margins": 0.004952927120029926,
1286
+ "rewards/rejected": -0.034284330904483795,
1287
+ "step": 710
1288
+ },
1289
+ {
1290
+ "epoch": 1.99,
1291
+ "grad_norm": 2.03125,
1292
+ "learning_rate": 1.863389981249825e-06,
1293
+ "log_odds_chosen": 0.913441002368927,
1294
+ "log_odds_ratio": -0.47331008315086365,
1295
+ "logits/chosen": 2.567636251449585,
1296
+ "logits/rejected": 4.045866012573242,
1297
+ "logps/chosen": -0.5542882084846497,
1298
+ "logps/rejected": -0.8390460014343262,
1299
+ "loss": 0.6859,
1300
+ "nll_loss": 0.4496838450431824,
1301
+ "rewards/accuracies": 0.8999999761581421,
1302
+ "rewards/chosen": -0.027714407071471214,
1303
+ "rewards/margins": 0.014237893745303154,
1304
+ "rewards/rejected": -0.04195230454206467,
1305
+ "step": 720
1306
+ },
1307
+ {
1308
+ "epoch": 2.02,
1309
+ "grad_norm": 2.0,
1310
+ "learning_rate": 1.8505830254940132e-06,
1311
+ "log_odds_chosen": 0.7949598431587219,
1312
+ "log_odds_ratio": -0.45839613676071167,
1313
+ "logits/chosen": 3.325202226638794,
1314
+ "logits/rejected": 4.260237216949463,
1315
+ "logps/chosen": -0.7130070924758911,
1316
+ "logps/rejected": -1.053581953048706,
1317
+ "loss": 0.6691,
1318
+ "nll_loss": 0.6019625663757324,
1319
+ "rewards/accuracies": 0.699999988079071,
1320
+ "rewards/chosen": -0.035650357604026794,
1321
+ "rewards/margins": 0.01702873967587948,
1322
+ "rewards/rejected": -0.05267909914255142,
1323
+ "step": 730
1324
+ },
1325
+ {
1326
+ "epoch": 2.05,
1327
+ "grad_norm": 1.96875,
1328
+ "learning_rate": 1.8380365552345197e-06,
1329
+ "log_odds_chosen": 0.5770705938339233,
1330
+ "log_odds_ratio": -0.5122253894805908,
1331
+ "logits/chosen": 3.619288682937622,
1332
+ "logits/rejected": 4.384647369384766,
1333
+ "logps/chosen": -0.8298980593681335,
1334
+ "logps/rejected": -1.167079210281372,
1335
+ "loss": 0.6766,
1336
+ "nll_loss": 0.704396665096283,
1337
+ "rewards/accuracies": 0.800000011920929,
1338
+ "rewards/chosen": -0.041494905948638916,
1339
+ "rewards/margins": 0.016859063878655434,
1340
+ "rewards/rejected": -0.0583539679646492,
1341
+ "step": 740
1342
+ },
1343
+ {
1344
+ "epoch": 2.08,
1345
+ "grad_norm": 2.0625,
1346
+ "learning_rate": 1.8257418583505536e-06,
1347
+ "log_odds_chosen": 0.6968498826026917,
1348
+ "log_odds_ratio": -0.4758426547050476,
1349
+ "logits/chosen": 3.848654270172119,
1350
+ "logits/rejected": 2.3218348026275635,
1351
+ "logps/chosen": -0.67643803358078,
1352
+ "logps/rejected": -0.8964241743087769,
1353
+ "loss": 0.7021,
1354
+ "nll_loss": 0.6553528904914856,
1355
+ "rewards/accuracies": 0.8999999761581421,
1356
+ "rewards/chosen": -0.03382190316915512,
1357
+ "rewards/margins": 0.010999304242432117,
1358
+ "rewards/rejected": -0.04482121020555496,
1359
+ "step": 750
1360
+ },
1361
+ {
1362
+ "epoch": 2.11,
1363
+ "grad_norm": 1.9296875,
1364
+ "learning_rate": 1.8136906252750293e-06,
1365
+ "log_odds_chosen": 0.2059679478406906,
1366
+ "log_odds_ratio": -0.6639232039451599,
1367
+ "logits/chosen": 4.111894130706787,
1368
+ "logits/rejected": 4.707150459289551,
1369
+ "logps/chosen": -0.7494447231292725,
1370
+ "logps/rejected": -0.8692378997802734,
1371
+ "loss": 0.6765,
1372
+ "nll_loss": 0.6483981013298035,
1373
+ "rewards/accuracies": 0.699999988079071,
1374
+ "rewards/chosen": -0.03747224062681198,
1375
+ "rewards/margins": 0.005989660043269396,
1376
+ "rewards/rejected": -0.04346190020442009,
1377
+ "step": 760
1378
+ },
1379
+ {
1380
+ "epoch": 2.13,
1381
+ "grad_norm": 1.8203125,
1382
+ "learning_rate": 1.801874925391118e-06,
1383
+ "log_odds_chosen": 0.5316376686096191,
1384
+ "log_odds_ratio": -0.5046476721763611,
1385
+ "logits/chosen": 2.8869481086730957,
1386
+ "logits/rejected": 2.6999757289886475,
1387
+ "logps/chosen": -0.5816842317581177,
1388
+ "logps/rejected": -0.7666364908218384,
1389
+ "loss": 0.6631,
1390
+ "nll_loss": 0.6736694574356079,
1391
+ "rewards/accuracies": 0.8999999761581421,
1392
+ "rewards/chosen": -0.029084209352731705,
1393
+ "rewards/margins": 0.009247615933418274,
1394
+ "rewards/rejected": -0.03833182901144028,
1395
+ "step": 770
1396
+ },
1397
+ {
1398
+ "epoch": 2.16,
1399
+ "grad_norm": 2.03125,
1400
+ "learning_rate": 1.7902871850985824e-06,
1401
+ "log_odds_chosen": 0.37722089886665344,
1402
+ "log_odds_ratio": -0.5441932082176208,
1403
+ "logits/chosen": 3.213557720184326,
1404
+ "logits/rejected": 4.203583240509033,
1405
+ "logps/chosen": -0.7685849070549011,
1406
+ "logps/rejected": -1.0165854692459106,
1407
+ "loss": 0.6767,
1408
+ "nll_loss": 0.6813093423843384,
1409
+ "rewards/accuracies": 0.800000011920929,
1410
+ "rewards/chosen": -0.038429249078035355,
1411
+ "rewards/margins": 0.012400026433169842,
1412
+ "rewards/rejected": -0.05082928016781807,
1413
+ "step": 780
1414
+ },
1415
+ {
1416
+ "epoch": 2.19,
1417
+ "grad_norm": 2.0625,
1418
+ "learning_rate": 1.7789201674120502e-06,
1419
+ "log_odds_chosen": 1.2822726964950562,
1420
+ "log_odds_ratio": -0.43989676237106323,
1421
+ "logits/chosen": 2.3955116271972656,
1422
+ "logits/rejected": 4.735221862792969,
1423
+ "logps/chosen": -0.7316259145736694,
1424
+ "logps/rejected": -1.1169257164001465,
1425
+ "loss": 0.6817,
1426
+ "nll_loss": 0.5958443880081177,
1427
+ "rewards/accuracies": 0.800000011920929,
1428
+ "rewards/chosen": -0.03658129647374153,
1429
+ "rewards/margins": 0.019264988601207733,
1430
+ "rewards/rejected": -0.055846281349658966,
1431
+ "step": 790
1432
+ },
1433
+ {
1434
+ "epoch": 2.22,
1435
+ "grad_norm": 2.296875,
1436
+ "learning_rate": 1.7677669529663689e-06,
1437
+ "log_odds_chosen": 0.651089072227478,
1438
+ "log_odds_ratio": -0.44289833307266235,
1439
+ "logits/chosen": 3.480353593826294,
1440
+ "logits/rejected": 4.546199798583984,
1441
+ "logps/chosen": -0.6184307932853699,
1442
+ "logps/rejected": -0.884985625743866,
1443
+ "loss": 0.6483,
1444
+ "nll_loss": 0.5650686025619507,
1445
+ "rewards/accuracies": 0.8999999761581421,
1446
+ "rewards/chosen": -0.030921539291739464,
1447
+ "rewards/margins": 0.01332774292677641,
1448
+ "rewards/rejected": -0.0442492850124836,
1449
+ "step": 800
1450
+ },
1451
+ {
1452
+ "epoch": 2.24,
1453
+ "grad_norm": 1.9765625,
1454
+ "learning_rate": 1.7568209223157664e-06,
1455
+ "log_odds_chosen": 0.0874546617269516,
1456
+ "log_odds_ratio": -0.6833248138427734,
1457
+ "logits/chosen": 3.4896228313446045,
1458
+ "logits/rejected": 4.298434734344482,
1459
+ "logps/chosen": -0.7224100828170776,
1460
+ "logps/rejected": -0.7926791906356812,
1461
+ "loss": 0.6619,
1462
+ "nll_loss": 0.5892444849014282,
1463
+ "rewards/accuracies": 0.5,
1464
+ "rewards/chosen": -0.03612050786614418,
1465
+ "rewards/margins": 0.0035134553909301758,
1466
+ "rewards/rejected": -0.039633963257074356,
1467
+ "step": 810
1468
+ },
1469
+ {
1470
+ "epoch": 2.27,
1471
+ "grad_norm": 1.953125,
1472
+ "learning_rate": 1.7460757394239458e-06,
1473
+ "log_odds_chosen": 0.7751902341842651,
1474
+ "log_odds_ratio": -0.5393909215927124,
1475
+ "logits/chosen": 4.474148750305176,
1476
+ "logits/rejected": 5.107138156890869,
1477
+ "logps/chosen": -0.8380101919174194,
1478
+ "logps/rejected": -1.0609526634216309,
1479
+ "loss": 0.685,
1480
+ "nll_loss": 0.7294074296951294,
1481
+ "rewards/accuracies": 0.6000000238418579,
1482
+ "rewards/chosen": -0.04190050810575485,
1483
+ "rewards/margins": 0.01114712655544281,
1484
+ "rewards/rejected": -0.05304763838648796,
1485
+ "step": 820
1486
+ },
1487
+ {
1488
+ "epoch": 2.3,
1489
+ "grad_norm": 1.9765625,
1490
+ "learning_rate": 1.7355253362515584e-06,
1491
+ "log_odds_chosen": 0.5033993721008301,
1492
+ "log_odds_ratio": -0.5074852108955383,
1493
+ "logits/chosen": 2.511612892150879,
1494
+ "logits/rejected": 3.992374897003174,
1495
+ "logps/chosen": -0.5894848704338074,
1496
+ "logps/rejected": -0.8681418299674988,
1497
+ "loss": 0.6802,
1498
+ "nll_loss": 0.5586022138595581,
1499
+ "rewards/accuracies": 0.800000011920929,
1500
+ "rewards/chosen": -0.029474247246980667,
1501
+ "rewards/margins": 0.013932851143181324,
1502
+ "rewards/rejected": -0.043407101184129715,
1503
+ "step": 830
1504
+ },
1505
+ {
1506
+ "epoch": 2.33,
1507
+ "grad_norm": 2.046875,
1508
+ "learning_rate": 1.7251638983558855e-06,
1509
+ "log_odds_chosen": 0.6550665497779846,
1510
+ "log_odds_ratio": -0.49995166063308716,
1511
+ "logits/chosen": 3.1267733573913574,
1512
+ "logits/rejected": 3.973337173461914,
1513
+ "logps/chosen": -0.65223228931427,
1514
+ "logps/rejected": -0.904323399066925,
1515
+ "loss": 0.6808,
1516
+ "nll_loss": 0.6052195429801941,
1517
+ "rewards/accuracies": 0.800000011920929,
1518
+ "rewards/chosen": -0.03261161595582962,
1519
+ "rewards/margins": 0.012604555115103722,
1520
+ "rewards/rejected": -0.04521617293357849,
1521
+ "step": 840
1522
+ },
1523
+ {
1524
+ "epoch": 2.35,
1525
+ "grad_norm": 1.6796875,
1526
+ "learning_rate": 1.7149858514250883e-06,
1527
+ "log_odds_chosen": 0.28233373165130615,
1528
+ "log_odds_ratio": -0.6105042695999146,
1529
+ "logits/chosen": 4.673365116119385,
1530
+ "logits/rejected": 4.622339248657227,
1531
+ "logps/chosen": -0.839856743812561,
1532
+ "logps/rejected": -0.9960880279541016,
1533
+ "loss": 0.7006,
1534
+ "nll_loss": 0.7127640247344971,
1535
+ "rewards/accuracies": 0.699999988079071,
1536
+ "rewards/chosen": -0.041992831975221634,
1537
+ "rewards/margins": 0.0078115700744092464,
1538
+ "rewards/rejected": -0.04980440065264702,
1539
+ "step": 850
1540
+ },
1541
+ {
1542
+ "epoch": 2.38,
1543
+ "grad_norm": 2.140625,
1544
+ "learning_rate": 1.704985848676184e-06,
1545
+ "log_odds_chosen": 0.3673107922077179,
1546
+ "log_odds_ratio": -0.6019551753997803,
1547
+ "logits/chosen": 4.395766258239746,
1548
+ "logits/rejected": 4.125451564788818,
1549
+ "logps/chosen": -0.7563544511795044,
1550
+ "logps/rejected": -0.9725953936576843,
1551
+ "loss": 0.6872,
1552
+ "nll_loss": 0.6850851774215698,
1553
+ "rewards/accuracies": 0.699999988079071,
1554
+ "rewards/chosen": -0.03781772404909134,
1555
+ "rewards/margins": 0.01081205252557993,
1556
+ "rewards/rejected": -0.048629771918058395,
1557
+ "step": 860
1558
+ },
1559
+ {
1560
+ "epoch": 2.41,
1561
+ "grad_norm": 1.90625,
1562
+ "learning_rate": 1.6951587590520263e-06,
1563
+ "log_odds_chosen": 0.316839337348938,
1564
+ "log_odds_ratio": -0.5757532119750977,
1565
+ "logits/chosen": 3.451542615890503,
1566
+ "logits/rejected": 4.92800760269165,
1567
+ "logps/chosen": -0.7138621807098389,
1568
+ "logps/rejected": -0.900250256061554,
1569
+ "loss": 0.6756,
1570
+ "nll_loss": 0.6208301782608032,
1571
+ "rewards/accuracies": 0.6000000238418579,
1572
+ "rewards/chosen": -0.03569310903549194,
1573
+ "rewards/margins": 0.009319400414824486,
1574
+ "rewards/rejected": -0.04501251131296158,
1575
+ "step": 870
1576
+ },
1577
+ {
1578
+ "epoch": 2.44,
1579
+ "grad_norm": 2.171875,
1580
+ "learning_rate": 1.6854996561581053e-06,
1581
+ "log_odds_chosen": 0.6411941051483154,
1582
+ "log_odds_ratio": -0.4790283739566803,
1583
+ "logits/chosen": 3.6001486778259277,
1584
+ "logits/rejected": 4.4620466232299805,
1585
+ "logps/chosen": -0.7167496085166931,
1586
+ "logps/rejected": -1.0959062576293945,
1587
+ "loss": 0.666,
1588
+ "nll_loss": 0.5813517570495605,
1589
+ "rewards/accuracies": 0.699999988079071,
1590
+ "rewards/chosen": -0.035837478935718536,
1591
+ "rewards/margins": 0.01895783841609955,
1592
+ "rewards/rejected": -0.05479532480239868,
1593
+ "step": 880
1594
+ },
1595
+ {
1596
+ "epoch": 2.47,
1597
+ "grad_norm": 2.125,
1598
+ "learning_rate": 1.6760038078849776e-06,
1599
+ "log_odds_chosen": 0.29189780354499817,
1600
+ "log_odds_ratio": -0.5954831838607788,
1601
+ "logits/chosen": 3.923008680343628,
1602
+ "logits/rejected": 3.9894745349884033,
1603
+ "logps/chosen": -0.7887361645698547,
1604
+ "logps/rejected": -0.9427839517593384,
1605
+ "loss": 0.6461,
1606
+ "nll_loss": 0.669800877571106,
1607
+ "rewards/accuracies": 0.699999988079071,
1608
+ "rewards/chosen": -0.039436809718608856,
1609
+ "rewards/margins": 0.007702393922954798,
1610
+ "rewards/rejected": -0.04713919758796692,
1611
+ "step": 890
1612
+ },
1613
+ {
1614
+ "epoch": 2.49,
1615
+ "grad_norm": 2.0625,
1616
+ "learning_rate": 1.6666666666666667e-06,
1617
+ "log_odds_chosen": 0.47135716676712036,
1618
+ "log_odds_ratio": -0.5324128270149231,
1619
+ "logits/chosen": 3.2858338356018066,
1620
+ "logits/rejected": 4.336097240447998,
1621
+ "logps/chosen": -0.6921708583831787,
1622
+ "logps/rejected": -0.9751580357551575,
1623
+ "loss": 0.6725,
1624
+ "nll_loss": 0.5913586020469666,
1625
+ "rewards/accuracies": 0.699999988079071,
1626
+ "rewards/chosen": -0.034608542919158936,
1627
+ "rewards/margins": 0.014149360358715057,
1628
+ "rewards/rejected": -0.048757899552583694,
1629
+ "step": 900
1630
+ },
1631
+ {
1632
+ "epoch": 2.52,
1633
+ "grad_norm": 2.203125,
1634
+ "learning_rate": 1.6574838603294898e-06,
1635
+ "log_odds_chosen": 0.3408917784690857,
1636
+ "log_odds_ratio": -0.6049419641494751,
1637
+ "logits/chosen": 4.834186553955078,
1638
+ "logits/rejected": 4.803133964538574,
1639
+ "logps/chosen": -0.9150100946426392,
1640
+ "logps/rejected": -1.1368293762207031,
1641
+ "loss": 0.6901,
1642
+ "nll_loss": 0.7697237730026245,
1643
+ "rewards/accuracies": 0.6000000238418579,
1644
+ "rewards/chosen": -0.04575050249695778,
1645
+ "rewards/margins": 0.011090965941548347,
1646
+ "rewards/rejected": -0.056841470301151276,
1647
+ "step": 910
1648
+ },
1649
+ {
1650
+ "epoch": 2.55,
1651
+ "grad_norm": 1.9296875,
1652
+ "learning_rate": 1.648451183489468e-06,
1653
+ "log_odds_chosen": 0.5196985602378845,
1654
+ "log_odds_ratio": -0.48116031289100647,
1655
+ "logits/chosen": 4.574516773223877,
1656
+ "logits/rejected": 4.646053314208984,
1657
+ "logps/chosen": -0.754222571849823,
1658
+ "logps/rejected": -1.0455068349838257,
1659
+ "loss": 0.6679,
1660
+ "nll_loss": 0.6747040748596191,
1661
+ "rewards/accuracies": 1.0,
1662
+ "rewards/chosen": -0.03771113231778145,
1663
+ "rewards/margins": 0.0145642114803195,
1664
+ "rewards/rejected": -0.05227534845471382,
1665
+ "step": 920
1666
+ },
1667
+ {
1668
+ "epoch": 2.58,
1669
+ "grad_norm": 1.6171875,
1670
+ "learning_rate": 1.6395645894598825e-06,
1671
+ "log_odds_chosen": 0.6613763570785522,
1672
+ "log_odds_ratio": -0.44903701543807983,
1673
+ "logits/chosen": 4.178925514221191,
1674
+ "logits/rejected": 4.050136566162109,
1675
+ "logps/chosen": -0.6592333912849426,
1676
+ "logps/rejected": -0.9929401278495789,
1677
+ "loss": 0.6283,
1678
+ "nll_loss": 0.648184061050415,
1679
+ "rewards/accuracies": 0.8999999761581421,
1680
+ "rewards/chosen": -0.03296167403459549,
1681
+ "rewards/margins": 0.016685333102941513,
1682
+ "rewards/rejected": -0.049647007137537,
1683
+ "step": 930
1684
+ },
1685
+ {
1686
+ "epoch": 2.6,
1687
+ "grad_norm": 2.359375,
1688
+ "learning_rate": 1.6308201826336057e-06,
1689
+ "log_odds_chosen": 0.609188437461853,
1690
+ "log_odds_ratio": -0.46795234084129333,
1691
+ "logits/chosen": 2.6147894859313965,
1692
+ "logits/rejected": 3.9961628913879395,
1693
+ "logps/chosen": -0.6610501408576965,
1694
+ "logps/rejected": -1.0103614330291748,
1695
+ "loss": 0.6824,
1696
+ "nll_loss": 0.5963402390480042,
1697
+ "rewards/accuracies": 0.8999999761581421,
1698
+ "rewards/chosen": -0.033052511513233185,
1699
+ "rewards/margins": 0.017465557903051376,
1700
+ "rewards/rejected": -0.05051806569099426,
1701
+ "step": 940
1702
+ },
1703
+ {
1704
+ "epoch": 2.63,
1705
+ "grad_norm": 2.21875,
1706
+ "learning_rate": 1.6222142113076255e-06,
1707
+ "log_odds_chosen": 0.3838690519332886,
1708
+ "log_odds_ratio": -0.6736805438995361,
1709
+ "logits/chosen": 4.249535083770752,
1710
+ "logits/rejected": 4.508255958557129,
1711
+ "logps/chosen": -0.7882817983627319,
1712
+ "logps/rejected": -0.894466757774353,
1713
+ "loss": 0.6759,
1714
+ "nll_loss": 0.6239164471626282,
1715
+ "rewards/accuracies": 0.6000000238418579,
1716
+ "rewards/chosen": -0.03941408917307854,
1717
+ "rewards/margins": 0.005309252999722958,
1718
+ "rewards/rejected": -0.04472333937883377,
1719
+ "step": 950
1720
+ },
1721
+ {
1722
+ "epoch": 2.66,
1723
+ "grad_norm": 2.03125,
1724
+ "learning_rate": 1.6137430609197571e-06,
1725
+ "log_odds_chosen": 0.7146129012107849,
1726
+ "log_odds_ratio": -0.4838590621948242,
1727
+ "logits/chosen": 3.506737470626831,
1728
+ "logits/rejected": 4.8492279052734375,
1729
+ "logps/chosen": -0.7767097353935242,
1730
+ "logps/rejected": -1.2055903673171997,
1731
+ "loss": 0.6529,
1732
+ "nll_loss": 0.6371511220932007,
1733
+ "rewards/accuracies": 0.8999999761581421,
1734
+ "rewards/chosen": -0.03883548825979233,
1735
+ "rewards/margins": 0.02144402638077736,
1736
+ "rewards/rejected": -0.060279518365859985,
1737
+ "step": 960
1738
+ },
1739
+ {
1740
+ "epoch": 2.69,
1741
+ "grad_norm": 1.671875,
1742
+ "learning_rate": 1.605403247669839e-06,
1743
+ "log_odds_chosen": 0.9047861099243164,
1744
+ "log_odds_ratio": -0.46373167634010315,
1745
+ "logits/chosen": 3.7943649291992188,
1746
+ "logits/rejected": 4.658207416534424,
1747
+ "logps/chosen": -0.644945502281189,
1748
+ "logps/rejected": -0.9372779726982117,
1749
+ "loss": 0.6799,
1750
+ "nll_loss": 0.6254302263259888,
1751
+ "rewards/accuracies": 0.699999988079071,
1752
+ "rewards/chosen": -0.03224727883934975,
1753
+ "rewards/margins": 0.014616618864238262,
1754
+ "rewards/rejected": -0.046863894909620285,
1755
+ "step": 970
1756
+ },
1757
+ {
1758
+ "epoch": 2.71,
1759
+ "grad_norm": 1.984375,
1760
+ "learning_rate": 1.59719141249985e-06,
1761
+ "log_odds_chosen": 1.218353509902954,
1762
+ "log_odds_ratio": -0.5012712478637695,
1763
+ "logits/chosen": 3.276193141937256,
1764
+ "logits/rejected": 3.3000216484069824,
1765
+ "logps/chosen": -0.660887598991394,
1766
+ "logps/rejected": -1.0681812763214111,
1767
+ "loss": 0.6796,
1768
+ "nll_loss": 0.7227128148078918,
1769
+ "rewards/accuracies": 0.800000011920929,
1770
+ "rewards/chosen": -0.03304437920451164,
1771
+ "rewards/margins": 0.020364681258797646,
1772
+ "rewards/rejected": -0.05340906232595444,
1773
+ "step": 980
1774
+ },
1775
+ {
1776
+ "epoch": 2.74,
1777
+ "grad_norm": 2.1875,
1778
+ "learning_rate": 1.5891043154093205e-06,
1779
+ "log_odds_chosen": 0.7260122299194336,
1780
+ "log_odds_ratio": -0.4358360171318054,
1781
+ "logits/chosen": 2.1915860176086426,
1782
+ "logits/rejected": 5.122056007385254,
1783
+ "logps/chosen": -0.6457678079605103,
1784
+ "logps/rejected": -1.0166635513305664,
1785
+ "loss": 0.6739,
1786
+ "nll_loss": 0.5569924712181091,
1787
+ "rewards/accuracies": 0.8999999761581421,
1788
+ "rewards/chosen": -0.03228839114308357,
1789
+ "rewards/margins": 0.018544789403676987,
1790
+ "rewards/rejected": -0.05083317309617996,
1791
+ "step": 990
1792
+ },
1793
+ {
1794
+ "epoch": 2.77,
1795
+ "grad_norm": 1.859375,
1796
+ "learning_rate": 1.5811388300841898e-06,
1797
+ "log_odds_chosen": 0.6579316854476929,
1798
+ "log_odds_ratio": -0.5773683190345764,
1799
+ "logits/chosen": 3.0676589012145996,
1800
+ "logits/rejected": 3.216492176055908,
1801
+ "logps/chosen": -0.7558283805847168,
1802
+ "logps/rejected": -0.9228881597518921,
1803
+ "loss": 0.6639,
1804
+ "nll_loss": 0.7178717255592346,
1805
+ "rewards/accuracies": 0.800000011920929,
1806
+ "rewards/chosen": -0.0377914197742939,
1807
+ "rewards/margins": 0.008352992124855518,
1808
+ "rewards/rejected": -0.046144407242536545,
1809
+ "step": 1000
1810
+ },
1811
+ {
1812
+ "epoch": 2.8,
1813
+ "grad_norm": 2.0625,
1814
+ "learning_rate": 1.5732919388188816e-06,
1815
+ "log_odds_chosen": 0.13750185072422028,
1816
+ "log_odds_ratio": -0.6798086166381836,
1817
+ "logits/chosen": 4.563652515411377,
1818
+ "logits/rejected": 4.581662654876709,
1819
+ "logps/chosen": -0.8638485670089722,
1820
+ "logps/rejected": -0.9687344431877136,
1821
+ "loss": 0.6914,
1822
+ "nll_loss": 0.8300592303276062,
1823
+ "rewards/accuracies": 0.6000000238418579,
1824
+ "rewards/chosen": -0.04319242760539055,
1825
+ "rewards/margins": 0.005244298838078976,
1826
+ "rewards/rejected": -0.0484367236495018,
1827
+ "step": 1010
1828
+ },
1829
+ {
1830
+ "epoch": 2.83,
1831
+ "grad_norm": 2.0,
1832
+ "learning_rate": 1.565560727712874e-06,
1833
+ "log_odds_chosen": 0.2685840129852295,
1834
+ "log_odds_ratio": -0.594863772392273,
1835
+ "logits/chosen": 3.9020438194274902,
1836
+ "logits/rejected": 3.4271602630615234,
1837
+ "logps/chosen": -0.849018394947052,
1838
+ "logps/rejected": -0.9738754034042358,
1839
+ "loss": 0.6944,
1840
+ "nll_loss": 0.799597978591919,
1841
+ "rewards/accuracies": 0.699999988079071,
1842
+ "rewards/chosen": -0.0424509234726429,
1843
+ "rewards/margins": 0.006242851726710796,
1844
+ "rewards/rejected": -0.04869377613067627,
1845
+ "step": 1020
1846
+ },
1847
+ {
1848
+ "epoch": 2.85,
1849
+ "grad_norm": 2.234375,
1850
+ "learning_rate": 1.5579423821243897e-06,
1851
+ "log_odds_chosen": 0.6145926117897034,
1852
+ "log_odds_ratio": -0.5494765043258667,
1853
+ "logits/chosen": 3.713395357131958,
1854
+ "logits/rejected": 3.5231990814208984,
1855
+ "logps/chosen": -0.7322524785995483,
1856
+ "logps/rejected": -0.8949598073959351,
1857
+ "loss": 0.6596,
1858
+ "nll_loss": 0.676146388053894,
1859
+ "rewards/accuracies": 0.800000011920929,
1860
+ "rewards/chosen": -0.0366126224398613,
1861
+ "rewards/margins": 0.008135369047522545,
1862
+ "rewards/rejected": -0.044747985899448395,
1863
+ "step": 1030
1864
+ },
1865
+ {
1866
+ "epoch": 2.88,
1867
+ "grad_norm": 2.296875,
1868
+ "learning_rate": 1.5504341823651056e-06,
1869
+ "log_odds_chosen": 0.3906387686729431,
1870
+ "log_odds_ratio": -0.5653647780418396,
1871
+ "logits/chosen": 2.943089485168457,
1872
+ "logits/rejected": 4.423019886016846,
1873
+ "logps/chosen": -0.6953160166740417,
1874
+ "logps/rejected": -0.895278811454773,
1875
+ "loss": 0.6726,
1876
+ "nll_loss": 0.6234105229377747,
1877
+ "rewards/accuracies": 0.699999988079071,
1878
+ "rewards/chosen": -0.03476579859852791,
1879
+ "rewards/margins": 0.009998142719268799,
1880
+ "rewards/rejected": -0.044763945043087006,
1881
+ "step": 1040
1882
+ },
1883
+ {
1884
+ "epoch": 2.91,
1885
+ "grad_norm": 1.8515625,
1886
+ "learning_rate": 1.5430334996209192e-06,
1887
+ "log_odds_chosen": 0.8320558667182922,
1888
+ "log_odds_ratio": -0.4963892102241516,
1889
+ "logits/chosen": 3.3935484886169434,
1890
+ "logits/rejected": 3.5727221965789795,
1891
+ "logps/chosen": -0.6379098296165466,
1892
+ "logps/rejected": -0.9367613792419434,
1893
+ "loss": 0.6757,
1894
+ "nll_loss": 0.6095770597457886,
1895
+ "rewards/accuracies": 0.800000011920929,
1896
+ "rewards/chosen": -0.03189549595117569,
1897
+ "rewards/margins": 0.014942578971385956,
1898
+ "rewards/rejected": -0.04683807119727135,
1899
+ "step": 1050
1900
+ },
1901
+ {
1902
+ "epoch": 2.94,
1903
+ "grad_norm": 1.625,
1904
+ "learning_rate": 1.5357377920848783e-06,
1905
+ "log_odds_chosen": -0.04924366623163223,
1906
+ "log_odds_ratio": -0.7443690299987793,
1907
+ "logits/chosen": 4.197890281677246,
1908
+ "logits/rejected": 3.850818157196045,
1909
+ "logps/chosen": -0.8803885579109192,
1910
+ "logps/rejected": -0.8369588851928711,
1911
+ "loss": 0.6633,
1912
+ "nll_loss": 0.7210748791694641,
1913
+ "rewards/accuracies": 0.6000000238418579,
1914
+ "rewards/chosen": -0.0440194308757782,
1915
+ "rewards/margins": -0.0021714833565056324,
1916
+ "rewards/rejected": -0.04184794798493385,
1917
+ "step": 1060
1918
+ },
1919
+ {
1920
+ "epoch": 2.96,
1921
+ "grad_norm": 1.9921875,
1922
+ "learning_rate": 1.5285446012893579e-06,
1923
+ "log_odds_chosen": 0.3693309426307678,
1924
+ "log_odds_ratio": -0.5573875904083252,
1925
+ "logits/chosen": 3.786158800125122,
1926
+ "logits/rejected": 3.326815366744995,
1927
+ "logps/chosen": -0.6857789158821106,
1928
+ "logps/rejected": -0.9004032015800476,
1929
+ "loss": 0.6727,
1930
+ "nll_loss": 0.7129669189453125,
1931
+ "rewards/accuracies": 0.699999988079071,
1932
+ "rewards/chosen": -0.03428894653916359,
1933
+ "rewards/margins": 0.010731215588748455,
1934
+ "rewards/rejected": -0.04502015933394432,
1935
+ "step": 1070
1936
+ },
1937
+ {
1938
+ "epoch": 2.99,
1939
+ "grad_norm": 2.171875,
1940
+ "learning_rate": 1.5214515486254614e-06,
1941
+ "log_odds_chosen": 0.3639599680900574,
1942
+ "log_odds_ratio": -0.5332378149032593,
1943
+ "logits/chosen": 3.4862148761749268,
1944
+ "logits/rejected": 3.6384239196777344,
1945
+ "logps/chosen": -0.776229202747345,
1946
+ "logps/rejected": -0.9858113527297974,
1947
+ "loss": 0.6785,
1948
+ "nll_loss": 0.6706089973449707,
1949
+ "rewards/accuracies": 1.0,
1950
+ "rewards/chosen": -0.03881146013736725,
1951
+ "rewards/margins": 0.010479103773832321,
1952
+ "rewards/rejected": -0.04929056018590927,
1953
+ "step": 1080
1954
+ },
1955
+ {
1956
+ "epoch": 3.0,
1957
+ "step": 1083,
1958
+ "total_flos": 0.0,
1959
+ "train_loss": 0.7946326331969004,
1960
+ "train_runtime": 13847.9433,
1961
+ "train_samples_per_second": 2.498,
1962
+ "train_steps_per_second": 0.078
1963
+ }
1964
+ ],
1965
+ "logging_steps": 10,
1966
+ "max_steps": 1083,
1967
+ "num_input_tokens_seen": 0,
1968
+ "num_train_epochs": 3,
1969
+ "save_steps": 500,
1970
+ "total_flos": 0.0,
1971
+ "train_batch_size": 1,
1972
+ "trial_name": null,
1973
+ "trial_params": null
1974
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e45ae9ccd4109329d97d7beb8bb0110e3b2da4ae2755b964730ea162015d6eec
3
+ size 5304