nlee-208 committed on
Commit
2a55780
1 Parent(s): 02efb8d

Model save

Browse files
README.md ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ base_model: mistralai/Mistral-7B-Instruct-v0.2
4
+ tags:
5
+ - trl
6
+ - dpo
7
+ - generated_from_trainer
8
+ model-index:
9
+ - name: uf-mistral-it-dpo-iopo-iter1-full
10
+ results: []
11
+ ---
12
+
13
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
14
+ should probably proofread and complete it, then remove this comment. -->
15
+
16
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="200" height="32"/>](https://wandb.ai/nlee28/lucky/runs/3vm2mk1w)
17
+ # uf-mistral-it-dpo-iopo-iter1-full
18
+
19
+ This model is a fine-tuned version of [mistralai/Mistral-7B-Instruct-v0.2](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2) on an unknown dataset.
20
+
21
+ ## Model description
22
+
23
+ More information needed
24
+
25
+ ## Intended uses & limitations
26
+
27
+ More information needed
28
+
29
+ ## Training and evaluation data
30
+
31
+ More information needed
32
+
33
+ ## Training procedure
34
+
35
+ ### Training hyperparameters
36
+
37
+ The following hyperparameters were used during training:
38
+ - learning_rate: 5e-07
39
+ - train_batch_size: 4
40
+ - eval_batch_size: 8
41
+ - seed: 42
42
+ - distributed_type: multi-GPU
43
+ - num_devices: 2
44
+ - gradient_accumulation_steps: 4
45
+ - total_train_batch_size: 32
46
+ - total_eval_batch_size: 16
47
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
48
+ - lr_scheduler_type: cosine
49
+ - lr_scheduler_warmup_ratio: 0.1
50
+ - num_epochs: 1
51
+
52
+ ### Training results
53
+
54
+
55
+
56
+ ### Framework versions
57
+
58
+ - Transformers 4.42.4
59
+ - PyTorch 2.1.2.post303
60
+ - Datasets 2.18.0
61
+ - Tokenizers 0.19.1
all_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 0.9998519176662224,
3
+ "total_flos": 0.0,
4
+ "train_loss": 0.17761736296081995,
5
+ "train_runtime": 39274.1948,
6
+ "train_samples": 54020,
7
+ "train_samples_per_second": 1.375,
8
+ "train_steps_per_second": 0.043
9
+ }
generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "transformers_version": "4.42.4"
6
+ }
model-00001-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1b6adac529813d1bf1618bb1855fff10ff3c88a24d60c17a34071a53e0784c43
3
+ size 4943162336
model-00002-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1ccb5a9bfe035142bd2d48518ceb07c019bebb830d2a6764ccff00a18cfb7dea
3
+ size 4999819336
model-00003-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b870b0836e4851c7daaf6a56b31067727f20bef5a992a6f378eec79a1302c26e
3
+ size 4540516344
model.safetensors.index.json ADDED
@@ -0,0 +1,298 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_size": 14483464192
4
+ },
5
+ "weight_map": {
6
+ "lm_head.weight": "model-00003-of-00003.safetensors",
7
+ "model.embed_tokens.weight": "model-00001-of-00003.safetensors",
8
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors",
9
+ "model.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
10
+ "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
11
+ "model.layers.0.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
12
+ "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
13
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
14
+ "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
15
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
16
+ "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
17
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors",
18
+ "model.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
19
+ "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
20
+ "model.layers.1.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
21
+ "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
22
+ "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
23
+ "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
24
+ "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
25
+ "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
26
+ "model.layers.10.input_layernorm.weight": "model-00002-of-00003.safetensors",
27
+ "model.layers.10.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
28
+ "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
29
+ "model.layers.10.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
30
+ "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
31
+ "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
32
+ "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
33
+ "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
34
+ "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
35
+ "model.layers.11.input_layernorm.weight": "model-00002-of-00003.safetensors",
36
+ "model.layers.11.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
37
+ "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
38
+ "model.layers.11.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
39
+ "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
40
+ "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
41
+ "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
42
+ "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
43
+ "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
44
+ "model.layers.12.input_layernorm.weight": "model-00002-of-00003.safetensors",
45
+ "model.layers.12.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
46
+ "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
47
+ "model.layers.12.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
48
+ "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
49
+ "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
50
+ "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
51
+ "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
52
+ "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
53
+ "model.layers.13.input_layernorm.weight": "model-00002-of-00003.safetensors",
54
+ "model.layers.13.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
55
+ "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
56
+ "model.layers.13.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
57
+ "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
58
+ "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
59
+ "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
60
+ "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
61
+ "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
62
+ "model.layers.14.input_layernorm.weight": "model-00002-of-00003.safetensors",
63
+ "model.layers.14.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
64
+ "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
65
+ "model.layers.14.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
66
+ "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
67
+ "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
68
+ "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
69
+ "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
70
+ "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
71
+ "model.layers.15.input_layernorm.weight": "model-00002-of-00003.safetensors",
72
+ "model.layers.15.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
73
+ "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
74
+ "model.layers.15.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
75
+ "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
76
+ "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
77
+ "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
78
+ "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
79
+ "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
80
+ "model.layers.16.input_layernorm.weight": "model-00002-of-00003.safetensors",
81
+ "model.layers.16.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
82
+ "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
83
+ "model.layers.16.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
84
+ "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
85
+ "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
86
+ "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
87
+ "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
88
+ "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
89
+ "model.layers.17.input_layernorm.weight": "model-00002-of-00003.safetensors",
90
+ "model.layers.17.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
91
+ "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
92
+ "model.layers.17.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
93
+ "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
94
+ "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
95
+ "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
96
+ "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
97
+ "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
98
+ "model.layers.18.input_layernorm.weight": "model-00002-of-00003.safetensors",
99
+ "model.layers.18.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
100
+ "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
101
+ "model.layers.18.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
102
+ "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
103
+ "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
104
+ "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
105
+ "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
106
+ "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
107
+ "model.layers.19.input_layernorm.weight": "model-00002-of-00003.safetensors",
108
+ "model.layers.19.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
109
+ "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
110
+ "model.layers.19.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
111
+ "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
112
+ "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
113
+ "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
114
+ "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
115
+ "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
116
+ "model.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors",
117
+ "model.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
118
+ "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
119
+ "model.layers.2.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
120
+ "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
121
+ "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
122
+ "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
123
+ "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
124
+ "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
125
+ "model.layers.20.input_layernorm.weight": "model-00002-of-00003.safetensors",
126
+ "model.layers.20.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
127
+ "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
128
+ "model.layers.20.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
129
+ "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
130
+ "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
131
+ "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
132
+ "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
133
+ "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
134
+ "model.layers.21.input_layernorm.weight": "model-00002-of-00003.safetensors",
135
+ "model.layers.21.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
136
+ "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
137
+ "model.layers.21.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
138
+ "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
139
+ "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
140
+ "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
141
+ "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
142
+ "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
143
+ "model.layers.22.input_layernorm.weight": "model-00003-of-00003.safetensors",
144
+ "model.layers.22.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
145
+ "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
146
+ "model.layers.22.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
147
+ "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
148
+ "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
149
+ "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
150
+ "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
151
+ "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
152
+ "model.layers.23.input_layernorm.weight": "model-00003-of-00003.safetensors",
153
+ "model.layers.23.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
154
+ "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
155
+ "model.layers.23.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
156
+ "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
157
+ "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
158
+ "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
159
+ "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
160
+ "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
161
+ "model.layers.24.input_layernorm.weight": "model-00003-of-00003.safetensors",
162
+ "model.layers.24.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
163
+ "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
164
+ "model.layers.24.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
165
+ "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
166
+ "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
167
+ "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
168
+ "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
169
+ "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
170
+ "model.layers.25.input_layernorm.weight": "model-00003-of-00003.safetensors",
171
+ "model.layers.25.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
172
+ "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
173
+ "model.layers.25.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
174
+ "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
175
+ "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
176
+ "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
177
+ "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
178
+ "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
179
+ "model.layers.26.input_layernorm.weight": "model-00003-of-00003.safetensors",
180
+ "model.layers.26.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
181
+ "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
182
+ "model.layers.26.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
183
+ "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
184
+ "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
185
+ "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
186
+ "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
187
+ "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
188
+ "model.layers.27.input_layernorm.weight": "model-00003-of-00003.safetensors",
189
+ "model.layers.27.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
190
+ "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
191
+ "model.layers.27.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
192
+ "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
193
+ "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
194
+ "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
195
+ "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
196
+ "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
197
+ "model.layers.28.input_layernorm.weight": "model-00003-of-00003.safetensors",
198
+ "model.layers.28.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
199
+ "model.layers.28.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
200
+ "model.layers.28.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
201
+ "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
202
+ "model.layers.28.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
203
+ "model.layers.28.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
204
+ "model.layers.28.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
205
+ "model.layers.28.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
206
+ "model.layers.29.input_layernorm.weight": "model-00003-of-00003.safetensors",
207
+ "model.layers.29.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
208
+ "model.layers.29.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
209
+ "model.layers.29.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
210
+ "model.layers.29.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
211
+ "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
212
+ "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
213
+ "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
214
+ "model.layers.29.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
215
+ "model.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors",
216
+ "model.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
217
+ "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
218
+ "model.layers.3.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
219
+ "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
220
+ "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
221
+ "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
222
+ "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
223
+ "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
224
+ "model.layers.30.input_layernorm.weight": "model-00003-of-00003.safetensors",
225
+ "model.layers.30.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
226
+ "model.layers.30.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
227
+ "model.layers.30.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
228
+ "model.layers.30.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
229
+ "model.layers.30.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
230
+ "model.layers.30.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
231
+ "model.layers.30.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
232
+ "model.layers.30.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
233
+ "model.layers.31.input_layernorm.weight": "model-00003-of-00003.safetensors",
234
+ "model.layers.31.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
235
+ "model.layers.31.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
236
+ "model.layers.31.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
237
+ "model.layers.31.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
238
+ "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
239
+ "model.layers.31.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
240
+ "model.layers.31.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
241
+ "model.layers.31.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
242
+ "model.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors",
243
+ "model.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
244
+ "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
245
+ "model.layers.4.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
246
+ "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
247
+ "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
248
+ "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
249
+ "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
250
+ "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
251
+ "model.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors",
252
+ "model.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
253
+ "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
254
+ "model.layers.5.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
255
+ "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
256
+ "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
257
+ "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
258
+ "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
259
+ "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
260
+ "model.layers.6.input_layernorm.weight": "model-00001-of-00003.safetensors",
261
+ "model.layers.6.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
262
+ "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
263
+ "model.layers.6.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
264
+ "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
265
+ "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
266
+ "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
267
+ "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
268
+ "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
269
+ "model.layers.7.input_layernorm.weight": "model-00001-of-00003.safetensors",
270
+ "model.layers.7.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
271
+ "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
272
+ "model.layers.7.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
273
+ "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
274
+ "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
275
+ "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
276
+ "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
277
+ "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
278
+ "model.layers.8.input_layernorm.weight": "model-00001-of-00003.safetensors",
279
+ "model.layers.8.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
280
+ "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
281
+ "model.layers.8.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
282
+ "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
283
+ "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
284
+ "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
285
+ "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
286
+ "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
287
+ "model.layers.9.input_layernorm.weight": "model-00001-of-00003.safetensors",
288
+ "model.layers.9.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
289
+ "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
290
+ "model.layers.9.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
291
+ "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
292
+ "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
293
+ "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
294
+ "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
295
+ "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
296
+ "model.norm.weight": "model-00003-of-00003.safetensors"
297
+ }
298
+ }
runs/Jul22_01-29-07_gpu-1/events.out.tfevents.1721579981.gpu-1.907562.0 CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4283b6ef8c13b670e3d2abd23275bb9d64daf2b19f46a7e3fe6d7fb54de4f111
3
- size 121582
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5b76435acc7b55713d2aefa653c0d9a0c5d0b40946c3666e28b5aa5c9336ddf7
3
+ size 121936
train_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 0.9998519176662224,
3
+ "total_flos": 0.0,
4
+ "train_loss": 0.17761736296081995,
5
+ "train_runtime": 39274.1948,
6
+ "train_samples": 54020,
7
+ "train_samples_per_second": 1.375,
8
+ "train_steps_per_second": 0.043
9
+ }
trainer_state.json ADDED
@@ -0,0 +1,2562 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 0.9998519176662224,
5
+ "eval_steps": 500,
6
+ "global_step": 1688,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.005923293351103213,
13
+ "grad_norm": 504.85714834798955,
14
+ "learning_rate": 2.9585798816568044e-08,
15
+ "logits/chosen": -2.2128281593322754,
16
+ "logits/rejected": -2.1649556159973145,
17
+ "logps/chosen": -334.58282470703125,
18
+ "logps/rejected": -174.33193969726562,
19
+ "loss": 0.694,
20
+ "rewards/accuracies": 0.41874998807907104,
21
+ "rewards/chosen": 0.003443267662078142,
22
+ "rewards/margins": 0.00374796474352479,
23
+ "rewards/rejected": -0.0003046986530534923,
24
+ "step": 10
25
+ },
26
+ {
27
+ "epoch": 0.011846586702206426,
28
+ "grad_norm": 519.6103666653933,
29
+ "learning_rate": 5.917159763313609e-08,
30
+ "logits/chosen": -2.2768917083740234,
31
+ "logits/rejected": -2.2541089057922363,
32
+ "logps/chosen": -442.1549377441406,
33
+ "logps/rejected": -184.51959228515625,
34
+ "loss": 0.6858,
35
+ "rewards/accuracies": 0.5625,
36
+ "rewards/chosen": 0.013772351667284966,
37
+ "rewards/margins": 0.025344645604491234,
38
+ "rewards/rejected": -0.011572292074561119,
39
+ "step": 20
40
+ },
41
+ {
42
+ "epoch": 0.01776988005330964,
43
+ "grad_norm": 433.16385797649355,
44
+ "learning_rate": 8.875739644970414e-08,
45
+ "logits/chosen": -2.2564902305603027,
46
+ "logits/rejected": -2.263812780380249,
47
+ "logps/chosen": -363.1085510253906,
48
+ "logps/rejected": -180.28768920898438,
49
+ "loss": 0.6461,
50
+ "rewards/accuracies": 0.7124999761581421,
51
+ "rewards/chosen": 0.08519863337278366,
52
+ "rewards/margins": 0.12328235059976578,
53
+ "rewards/rejected": -0.038083698600530624,
54
+ "step": 30
55
+ },
56
+ {
57
+ "epoch": 0.023693173404412852,
58
+ "grad_norm": 312.3325020676633,
59
+ "learning_rate": 1.1834319526627217e-07,
60
+ "logits/chosen": -2.258511543273926,
61
+ "logits/rejected": -2.21596097946167,
62
+ "logps/chosen": -358.8316650390625,
63
+ "logps/rejected": -164.7125244140625,
64
+ "loss": 0.5831,
65
+ "rewards/accuracies": 0.6812499761581421,
66
+ "rewards/chosen": 0.21487005054950714,
67
+ "rewards/margins": 0.3081914782524109,
68
+ "rewards/rejected": -0.09332143515348434,
69
+ "step": 40
70
+ },
71
+ {
72
+ "epoch": 0.029616466755516067,
73
+ "grad_norm": 290.61310964931636,
74
+ "learning_rate": 1.4792899408284022e-07,
75
+ "logits/chosen": -2.248854875564575,
76
+ "logits/rejected": -2.2407047748565674,
77
+ "logps/chosen": -362.57843017578125,
78
+ "logps/rejected": -178.95758056640625,
79
+ "loss": 0.4954,
80
+ "rewards/accuracies": 0.8062499761581421,
81
+ "rewards/chosen": 0.38509249687194824,
82
+ "rewards/margins": 0.6190904974937439,
83
+ "rewards/rejected": -0.23399806022644043,
84
+ "step": 50
85
+ },
86
+ {
87
+ "epoch": 0.03553976010661928,
88
+ "grad_norm": 303.52300542572084,
89
+ "learning_rate": 1.7751479289940827e-07,
90
+ "logits/chosen": -2.21785044670105,
91
+ "logits/rejected": -2.182326316833496,
92
+ "logps/chosen": -348.1097412109375,
93
+ "logps/rejected": -162.1182403564453,
94
+ "loss": 0.4517,
95
+ "rewards/accuracies": 0.8125,
96
+ "rewards/chosen": 0.6382587552070618,
97
+ "rewards/margins": 0.9943715929985046,
98
+ "rewards/rejected": -0.3561127781867981,
99
+ "step": 60
100
+ },
101
+ {
102
+ "epoch": 0.04146305345772249,
103
+ "grad_norm": 251.78388219726764,
104
+ "learning_rate": 2.0710059171597633e-07,
105
+ "logits/chosen": -2.2731988430023193,
106
+ "logits/rejected": -2.23414945602417,
107
+ "logps/chosen": -337.77349853515625,
108
+ "logps/rejected": -182.30088806152344,
109
+ "loss": 0.4179,
110
+ "rewards/accuracies": 0.8125,
111
+ "rewards/chosen": 0.6993478536605835,
112
+ "rewards/margins": 1.2595398426055908,
113
+ "rewards/rejected": -0.5601919889450073,
114
+ "step": 70
115
+ },
116
+ {
117
+ "epoch": 0.047386346808825704,
118
+ "grad_norm": 246.56195042754825,
119
+ "learning_rate": 2.3668639053254435e-07,
120
+ "logits/chosen": -2.2068114280700684,
121
+ "logits/rejected": -2.209404468536377,
122
+ "logps/chosen": -408.40484619140625,
123
+ "logps/rejected": -202.7127685546875,
124
+ "loss": 0.3483,
125
+ "rewards/accuracies": 0.856249988079071,
126
+ "rewards/chosen": 0.9628815650939941,
127
+ "rewards/margins": 1.7825168371200562,
128
+ "rewards/rejected": -0.8196353912353516,
129
+ "step": 80
130
+ },
131
+ {
132
+ "epoch": 0.05330964015992892,
133
+ "grad_norm": 188.01879345063887,
134
+ "learning_rate": 2.662721893491124e-07,
135
+ "logits/chosen": -2.2164900302886963,
136
+ "logits/rejected": -2.2228798866271973,
137
+ "logps/chosen": -360.1303405761719,
138
+ "logps/rejected": -179.77536010742188,
139
+ "loss": 0.338,
140
+ "rewards/accuracies": 0.824999988079071,
141
+ "rewards/chosen": 0.9752426147460938,
142
+ "rewards/margins": 1.890472650527954,
143
+ "rewards/rejected": -0.9152299761772156,
144
+ "step": 90
145
+ },
146
+ {
147
+ "epoch": 0.059232933511032135,
148
+ "grad_norm": 381.23591220338665,
149
+ "learning_rate": 2.9585798816568045e-07,
150
+ "logits/chosen": -2.2488455772399902,
151
+ "logits/rejected": -2.2343482971191406,
152
+ "logps/chosen": -347.54180908203125,
153
+ "logps/rejected": -175.11048889160156,
154
+ "loss": 0.3559,
155
+ "rewards/accuracies": 0.793749988079071,
156
+ "rewards/chosen": 1.015298843383789,
157
+ "rewards/margins": 1.9129976034164429,
158
+ "rewards/rejected": -0.897698700428009,
159
+ "step": 100
160
+ },
161
+ {
162
+ "epoch": 0.06515622686213535,
163
+ "grad_norm": 183.36363122937976,
164
+ "learning_rate": 3.254437869822485e-07,
165
+ "logits/chosen": -2.3256828784942627,
166
+ "logits/rejected": -2.2990269660949707,
167
+ "logps/chosen": -384.78863525390625,
168
+ "logps/rejected": -195.26393127441406,
169
+ "loss": 0.2998,
170
+ "rewards/accuracies": 0.84375,
171
+ "rewards/chosen": 1.2486822605133057,
172
+ "rewards/margins": 2.4611282348632812,
173
+ "rewards/rejected": -1.2124459743499756,
174
+ "step": 110
175
+ },
176
+ {
177
+ "epoch": 0.07107952021323856,
178
+ "grad_norm": 196.55190107882535,
179
+ "learning_rate": 3.5502958579881655e-07,
180
+ "logits/chosen": -2.2839980125427246,
181
+ "logits/rejected": -2.2439706325531006,
182
+ "logps/chosen": -316.72900390625,
183
+ "logps/rejected": -187.08209228515625,
184
+ "loss": 0.2897,
185
+ "rewards/accuracies": 0.875,
186
+ "rewards/chosen": 1.0929539203643799,
187
+ "rewards/margins": 2.364865779876709,
188
+ "rewards/rejected": -1.2719120979309082,
189
+ "step": 120
190
+ },
191
+ {
192
+ "epoch": 0.07700281356434177,
193
+ "grad_norm": 187.394753566322,
194
+ "learning_rate": 3.8461538461538463e-07,
195
+ "logits/chosen": -2.3191745281219482,
196
+ "logits/rejected": -2.3147594928741455,
197
+ "logps/chosen": -341.1308288574219,
198
+ "logps/rejected": -192.97549438476562,
199
+ "loss": 0.2532,
200
+ "rewards/accuracies": 0.887499988079071,
201
+ "rewards/chosen": 1.4034380912780762,
202
+ "rewards/margins": 3.091428756713867,
203
+ "rewards/rejected": -1.6879905462265015,
204
+ "step": 130
205
+ },
206
+ {
207
+ "epoch": 0.08292610691544498,
208
+ "grad_norm": 116.14861473966326,
209
+ "learning_rate": 4.1420118343195265e-07,
210
+ "logits/chosen": -2.361495018005371,
211
+ "logits/rejected": -2.321951150894165,
212
+ "logps/chosen": -333.0472412109375,
213
+ "logps/rejected": -179.1328582763672,
214
+ "loss": 0.2321,
215
+ "rewards/accuracies": 0.8812500238418579,
216
+ "rewards/chosen": 0.9758448600769043,
217
+ "rewards/margins": 3.051640510559082,
218
+ "rewards/rejected": -2.0757956504821777,
219
+ "step": 140
220
+ },
221
+ {
222
+ "epoch": 0.0888494002665482,
223
+ "grad_norm": 259.05273624826594,
224
+ "learning_rate": 4.437869822485207e-07,
225
+ "logits/chosen": -2.3843483924865723,
226
+ "logits/rejected": -2.3312084674835205,
227
+ "logps/chosen": -323.70526123046875,
228
+ "logps/rejected": -197.47764587402344,
229
+ "loss": 0.2627,
230
+ "rewards/accuracies": 0.8999999761581421,
231
+ "rewards/chosen": 0.864323616027832,
232
+ "rewards/margins": 3.313027858734131,
233
+ "rewards/rejected": -2.448704242706299,
234
+ "step": 150
235
+ },
236
+ {
237
+ "epoch": 0.09477269361765141,
238
+ "grad_norm": 260.57944130025356,
239
+ "learning_rate": 4.733727810650887e-07,
240
+ "logits/chosen": -2.3698296546936035,
241
+ "logits/rejected": -2.3505420684814453,
242
+ "logps/chosen": -360.91864013671875,
243
+ "logps/rejected": -198.164794921875,
244
+ "loss": 0.2367,
245
+ "rewards/accuracies": 0.887499988079071,
246
+ "rewards/chosen": 1.216972827911377,
247
+ "rewards/margins": 3.8299670219421387,
248
+ "rewards/rejected": -2.612994432449341,
249
+ "step": 160
250
+ },
251
+ {
252
+ "epoch": 0.10069598696875463,
253
+ "grad_norm": 150.81084803740066,
254
+ "learning_rate": 4.999994653198566e-07,
255
+ "logits/chosen": -2.3464887142181396,
256
+ "logits/rejected": -2.3302454948425293,
257
+ "logps/chosen": -335.73529052734375,
258
+ "logps/rejected": -194.806640625,
259
+ "loss": 0.2132,
260
+ "rewards/accuracies": 0.9125000238418579,
261
+ "rewards/chosen": 0.9059770703315735,
262
+ "rewards/margins": 3.8774585723876953,
263
+ "rewards/rejected": -2.9714818000793457,
264
+ "step": 170
265
+ },
266
+ {
267
+ "epoch": 0.10661928031985785,
268
+ "grad_norm": 280.1531227472986,
269
+ "learning_rate": 4.999353064699471e-07,
270
+ "logits/chosen": -2.364449977874756,
271
+ "logits/rejected": -2.3326315879821777,
272
+ "logps/chosen": -389.11236572265625,
273
+ "logps/rejected": -215.023681640625,
274
+ "loss": 0.236,
275
+ "rewards/accuracies": 0.9125000238418579,
276
+ "rewards/chosen": 0.8287426233291626,
277
+ "rewards/margins": 4.475917816162109,
278
+ "rewards/rejected": -3.647174835205078,
279
+ "step": 180
280
+ },
281
+ {
282
+ "epoch": 0.11254257367096106,
283
+ "grad_norm": 175.0444339450233,
284
+ "learning_rate": 4.99764243036258e-07,
285
+ "logits/chosen": -2.3520586490631104,
286
+ "logits/rejected": -2.3261897563934326,
287
+ "logps/chosen": -359.8037109375,
288
+ "logps/rejected": -211.98666381835938,
289
+ "loss": 0.1991,
290
+ "rewards/accuracies": 0.8999999761581421,
291
+ "rewards/chosen": 0.8636599779129028,
292
+ "rewards/margins": 5.269904136657715,
293
+ "rewards/rejected": -4.40624475479126,
294
+ "step": 190
295
+ },
296
+ {
297
+ "epoch": 0.11846586702206427,
298
+ "grad_norm": 237.8569924192833,
299
+ "learning_rate": 4.994863481875841e-07,
300
+ "logits/chosen": -2.3697867393493652,
301
+ "logits/rejected": -2.3272385597229004,
302
+ "logps/chosen": -382.9794006347656,
303
+ "logps/rejected": -224.5498046875,
304
+ "loss": 0.2205,
305
+ "rewards/accuracies": 0.9125000238418579,
306
+ "rewards/chosen": 0.797897219657898,
307
+ "rewards/margins": 5.273079872131348,
308
+ "rewards/rejected": -4.47518253326416,
309
+ "step": 200
310
+ },
311
+ {
312
+ "epoch": 0.12438916037316748,
313
+ "grad_norm": 283.96288222349347,
314
+ "learning_rate": 4.991017407876165e-07,
315
+ "logits/chosen": -2.3532214164733887,
316
+ "logits/rejected": -2.2996296882629395,
317
+ "logps/chosen": -370.9749450683594,
318
+ "logps/rejected": -198.25497436523438,
319
+ "loss": 0.194,
320
+ "rewards/accuracies": 0.893750011920929,
321
+ "rewards/chosen": 0.7681259512901306,
322
+ "rewards/margins": 5.055164337158203,
323
+ "rewards/rejected": -4.2870378494262695,
324
+ "step": 210
325
+ },
326
+ {
327
+ "epoch": 0.1303124537242707,
328
+ "grad_norm": 386.78866139109,
329
+ "learning_rate": 4.98610585344102e-07,
330
+ "logits/chosen": -2.400724172592163,
331
+ "logits/rejected": -2.375039577484131,
332
+ "logps/chosen": -351.29156494140625,
333
+ "logps/rejected": -214.4847412109375,
334
+ "loss": 0.2662,
335
+ "rewards/accuracies": 0.887499988079071,
336
+ "rewards/chosen": 0.7083452939987183,
337
+ "rewards/margins": 4.722594261169434,
338
+ "rewards/rejected": -4.014249324798584,
339
+ "step": 220
340
+ },
341
+ {
342
+ "epoch": 0.13623574707537392,
343
+ "grad_norm": 441.8804995826954,
344
+ "learning_rate": 4.980130919384768e-07,
345
+ "logits/chosen": -2.423753261566162,
346
+ "logits/rejected": -2.3783416748046875,
347
+ "logps/chosen": -399.88494873046875,
348
+ "logps/rejected": -210.5980682373047,
349
+ "loss": 0.2184,
350
+ "rewards/accuracies": 0.8812500238418579,
351
+ "rewards/chosen": 0.6179746389389038,
352
+ "rewards/margins": 5.342424392700195,
353
+ "rewards/rejected": -4.72445011138916,
354
+ "step": 230
355
+ },
356
+ {
357
+ "epoch": 0.14215904042647712,
358
+ "grad_norm": 327.95375338533654,
359
+ "learning_rate": 4.973095161360105e-07,
360
+ "logits/chosen": -2.3384299278259277,
361
+ "logits/rejected": -2.323362350463867,
362
+ "logps/chosen": -333.81536865234375,
363
+ "logps/rejected": -228.82601928710938,
364
+ "loss": 0.1762,
365
+ "rewards/accuracies": 0.9375,
366
+ "rewards/chosen": 0.22780442237854004,
367
+ "rewards/margins": 5.158236980438232,
368
+ "rewards/rejected": -4.93043327331543,
369
+ "step": 240
370
+ },
371
+ {
372
+ "epoch": 0.14808233377758034,
373
+ "grad_norm": 200.53822627799985,
374
+ "learning_rate": 4.965001588764913e-07,
375
+ "logits/chosen": -2.385148525238037,
376
+ "logits/rejected": -2.3672657012939453,
377
+ "logps/chosen": -397.1217041015625,
378
+ "logps/rejected": -228.70773315429688,
379
+ "loss": 0.1876,
380
+ "rewards/accuracies": 0.949999988079071,
381
+ "rewards/chosen": 1.1027851104736328,
382
+ "rewards/margins": 5.892121315002441,
383
+ "rewards/rejected": -4.789336204528809,
384
+ "step": 250
385
+ },
386
+ {
387
+ "epoch": 0.15400562712868354,
388
+ "grad_norm": 47.97910034361157,
389
+ "learning_rate": 4.955853663455072e-07,
390
+ "logits/chosen": -2.438140392303467,
391
+ "logits/rejected": -2.4087047576904297,
392
+ "logps/chosen": -373.9306945800781,
393
+ "logps/rejected": -242.48764038085938,
394
+ "loss": 0.1722,
395
+ "rewards/accuracies": 0.9125000238418579,
396
+ "rewards/chosen": 0.6679338216781616,
397
+ "rewards/margins": 5.903395175933838,
398
+ "rewards/rejected": -5.235461235046387,
399
+ "step": 260
400
+ },
401
+ {
402
+ "epoch": 0.15992892047978677,
403
+ "grad_norm": 131.31461307719647,
404
+ "learning_rate": 4.945655298263713e-07,
405
+ "logits/chosen": -2.3899612426757812,
406
+ "logits/rejected": -2.356353282928467,
407
+ "logps/chosen": -375.282470703125,
408
+ "logps/rejected": -228.1762237548828,
409
+ "loss": 0.1561,
410
+ "rewards/accuracies": 0.949999988079071,
411
+ "rewards/chosen": 0.6925370097160339,
412
+ "rewards/margins": 5.915196418762207,
413
+ "rewards/rejected": -5.222658634185791,
414
+ "step": 270
415
+ },
416
+ {
417
+ "epoch": 0.16585221383088997,
418
+ "grad_norm": 216.04443101572159,
419
+ "learning_rate": 4.934410855327585e-07,
420
+ "logits/chosen": -2.4134862422943115,
421
+ "logits/rejected": -2.404572010040283,
422
+ "logps/chosen": -332.552490234375,
423
+ "logps/rejected": -228.0837860107422,
424
+ "loss": 0.1927,
425
+ "rewards/accuracies": 0.9125000238418579,
426
+ "rewards/chosen": 0.5202642679214478,
427
+ "rewards/margins": 6.256397724151611,
428
+ "rewards/rejected": -5.736133575439453,
429
+ "step": 280
430
+ },
431
+ {
432
+ "epoch": 0.1717755071819932,
433
+ "grad_norm": 124.31837233503076,
434
+ "learning_rate": 4.922125144221252e-07,
435
+ "logits/chosen": -2.430438995361328,
436
+ "logits/rejected": -2.398848056793213,
437
+ "logps/chosen": -409.75347900390625,
438
+ "logps/rejected": -227.51846313476562,
439
+ "loss": 0.1461,
440
+ "rewards/accuracies": 0.925000011920929,
441
+ "rewards/chosen": 0.5862516164779663,
442
+ "rewards/margins": 6.339666843414307,
443
+ "rewards/rejected": -5.753414630889893,
444
+ "step": 290
445
+ },
446
+ {
447
+ "epoch": 0.1776988005330964,
448
+ "grad_norm": 90.59877362683102,
449
+ "learning_rate": 4.90880341989989e-07,
450
+ "logits/chosen": -2.381986141204834,
451
+ "logits/rejected": -2.354752540588379,
452
+ "logps/chosen": -372.0549011230469,
453
+ "logps/rejected": -238.38900756835938,
454
+ "loss": 0.2123,
455
+ "rewards/accuracies": 0.9312499761581421,
456
+ "rewards/chosen": 0.7634764909744263,
457
+ "rewards/margins": 6.4423956871032715,
458
+ "rewards/rejected": -5.678919792175293,
459
+ "step": 300
460
+ },
461
+ {
462
+ "epoch": 0.18362209388419962,
463
+ "grad_norm": 214.55771845505996,
464
+ "learning_rate": 4.894451380451589e-07,
465
+ "logits/chosen": -2.4163644313812256,
466
+ "logits/rejected": -2.3825597763061523,
467
+ "logps/chosen": -325.26171875,
468
+ "logps/rejected": -230.36776733398438,
469
+ "loss": 0.173,
470
+ "rewards/accuracies": 0.918749988079071,
471
+ "rewards/chosen": 0.395336776971817,
472
+ "rewards/margins": 5.5172624588012695,
473
+ "rewards/rejected": -5.121925354003906,
474
+ "step": 310
475
+ },
476
+ {
477
+ "epoch": 0.18954538723530281,
478
+ "grad_norm": 143.07726831372327,
479
+ "learning_rate": 4.879075164660124e-07,
480
+ "logits/chosen": -2.4872541427612305,
481
+ "logits/rejected": -2.4530301094055176,
482
+ "logps/chosen": -337.13592529296875,
483
+ "logps/rejected": -217.49795532226562,
484
+ "loss": 0.1866,
485
+ "rewards/accuracies": 0.90625,
486
+ "rewards/chosen": 0.2758009433746338,
487
+ "rewards/margins": 5.386232852935791,
488
+ "rewards/rejected": -5.110430717468262,
489
+ "step": 320
490
+ },
491
+ {
492
+ "epoch": 0.19546868058640604,
493
+ "grad_norm": 96.96348568375632,
494
+ "learning_rate": 4.862681349379212e-07,
495
+ "logits/chosen": -2.5405662059783936,
496
+ "logits/rejected": -2.506821870803833,
497
+ "logps/chosen": -343.34637451171875,
498
+ "logps/rejected": -218.04287719726562,
499
+ "loss": 0.1596,
500
+ "rewards/accuracies": 0.9375,
501
+ "rewards/chosen": 0.5433809161186218,
502
+ "rewards/margins": 5.724984645843506,
503
+ "rewards/rejected": -5.181603908538818,
504
+ "step": 330
505
+ },
506
+ {
507
+ "epoch": 0.20139197393750927,
508
+ "grad_norm": 236.3908842304634,
509
+ "learning_rate": 4.8452769467194e-07,
510
+ "logits/chosen": -2.4440038204193115,
511
+ "logits/rejected": -2.4228639602661133,
512
+ "logps/chosen": -398.2060852050781,
513
+ "logps/rejected": -238.4206085205078,
514
+ "loss": 0.161,
515
+ "rewards/accuracies": 0.90625,
516
+ "rewards/chosen": 0.39312082529067993,
517
+ "rewards/margins": 6.064854145050049,
518
+ "rewards/rejected": -5.6717329025268555,
519
+ "step": 340
520
+ },
521
+ {
522
+ "epoch": 0.20731526728861246,
523
+ "grad_norm": 262.31086254004623,
524
+ "learning_rate": 4.82686940104879e-07,
525
+ "logits/chosen": -2.439896583557129,
526
+ "logits/rejected": -2.415922164916992,
527
+ "logps/chosen": -339.29217529296875,
528
+ "logps/rejected": -242.05789184570312,
529
+ "loss": 0.1722,
530
+ "rewards/accuracies": 0.9375,
531
+ "rewards/chosen": -0.6545895338058472,
532
+ "rewards/margins": 5.870488166809082,
533
+ "rewards/rejected": -6.525076866149902,
534
+ "step": 350
535
+ },
536
+ {
537
+ "epoch": 0.2132385606397157,
538
+ "grad_norm": 222.15082465425456,
539
+ "learning_rate": 4.807466585808856e-07,
540
+ "logits/chosen": -2.379359006881714,
541
+ "logits/rejected": -2.360146999359131,
542
+ "logps/chosen": -316.02349853515625,
543
+ "logps/rejected": -233.2420654296875,
544
+ "loss": 0.156,
545
+ "rewards/accuracies": 0.9375,
546
+ "rewards/chosen": -0.11594714224338531,
547
+ "rewards/margins": 6.157732963562012,
548
+ "rewards/rejected": -6.273680210113525,
549
+ "step": 360
550
+ },
551
+ {
552
+ "epoch": 0.2191618539908189,
553
+ "grad_norm": 242.78071949212634,
554
+ "learning_rate": 4.787076800146752e-07,
555
+ "logits/chosen": -2.4132204055786133,
556
+ "logits/rejected": -2.403979778289795,
557
+ "logps/chosen": -359.83953857421875,
558
+ "logps/rejected": -234.0885772705078,
559
+ "loss": 0.1843,
560
+ "rewards/accuracies": 0.8812500238418579,
561
+ "rewards/chosen": -0.31123560667037964,
562
+ "rewards/margins": 5.990903377532959,
563
+ "rewards/rejected": -6.302138328552246,
564
+ "step": 370
565
+ },
566
+ {
567
+ "epoch": 0.22508514734192211,
568
+ "grad_norm": 199.5431768901427,
569
+ "learning_rate": 4.765708765365526e-07,
570
+ "logits/chosen": -2.3680121898651123,
571
+ "logits/rejected": -2.37663197517395,
572
+ "logps/chosen": -356.29052734375,
573
+ "logps/rejected": -248.9377899169922,
574
+ "loss": 0.1632,
575
+ "rewards/accuracies": 0.9375,
576
+ "rewards/chosen": -0.06563782691955566,
577
+ "rewards/margins": 6.597121238708496,
578
+ "rewards/rejected": -6.662759304046631,
579
+ "step": 380
580
+ },
581
+ {
582
+ "epoch": 0.2310084406930253,
583
+ "grad_norm": 252.4218453368612,
584
+ "learning_rate": 4.7433716211937587e-07,
585
+ "logits/chosen": -2.3765156269073486,
586
+ "logits/rejected": -2.3698158264160156,
587
+ "logps/chosen": -355.7226867675781,
588
+ "logps/rejected": -231.8157501220703,
589
+ "loss": 0.1849,
590
+ "rewards/accuracies": 0.9125000238418579,
591
+ "rewards/chosen": 0.5422400236129761,
592
+ "rewards/margins": 6.704817295074463,
593
+ "rewards/rejected": -6.1625776290893555,
594
+ "step": 390
595
+ },
596
+ {
597
+ "epoch": 0.23693173404412854,
598
+ "grad_norm": 145.48012597761922,
599
+ "learning_rate": 4.720074921876245e-07,
600
+ "logits/chosen": -2.398963451385498,
601
+ "logits/rejected": -2.394768238067627,
602
+ "logps/chosen": -322.42864990234375,
603
+ "logps/rejected": -235.3686065673828,
604
+ "loss": 0.1707,
605
+ "rewards/accuracies": 0.918749988079071,
606
+ "rewards/chosen": -0.2241075038909912,
607
+ "rewards/margins": 6.238755226135254,
608
+ "rewards/rejected": -6.462862491607666,
609
+ "step": 400
610
+ },
611
+ {
612
+ "epoch": 0.24285502739523174,
613
+ "grad_norm": 292.71968928639006,
614
+ "learning_rate": 4.6958286320873593e-07,
615
+ "logits/chosen": -2.43870210647583,
616
+ "logits/rejected": -2.3871872425079346,
617
+ "logps/chosen": -397.65423583984375,
618
+ "logps/rejected": -255.60720825195312,
619
+ "loss": 0.1541,
620
+ "rewards/accuracies": 0.949999988079071,
621
+ "rewards/chosen": 0.2829202711582184,
622
+ "rewards/margins": 7.782283782958984,
623
+ "rewards/rejected": -7.499364376068115,
624
+ "step": 410
625
+ },
626
+ {
627
+ "epoch": 0.24877832074633496,
628
+ "grad_norm": 224.80127439466196,
629
+ "learning_rate": 4.6706431226688804e-07,
630
+ "logits/chosen": -2.416389226913452,
631
+ "logits/rejected": -2.3854172229766846,
632
+ "logps/chosen": -403.90765380859375,
633
+ "logps/rejected": -268.164794921875,
634
+ "loss": 0.1664,
635
+ "rewards/accuracies": 0.918749988079071,
636
+ "rewards/chosen": -0.07748878002166748,
637
+ "rewards/margins": 8.120170593261719,
638
+ "rewards/rejected": -8.197659492492676,
639
+ "step": 420
640
+ },
641
+ {
642
+ "epoch": 0.2547016140974382,
643
+ "grad_norm": 314.70972896563933,
644
+ "learning_rate": 4.6445291661940777e-07,
645
+ "logits/chosen": -2.3671650886535645,
646
+ "logits/rejected": -2.3478543758392334,
647
+ "logps/chosen": -364.58673095703125,
648
+ "logps/rejected": -255.23056030273438,
649
+ "loss": 0.2109,
650
+ "rewards/accuracies": 0.918749988079071,
651
+ "rewards/chosen": 0.149729922413826,
652
+ "rewards/margins": 7.63533878326416,
653
+ "rewards/rejected": -7.485608100891113,
654
+ "step": 430
655
+ },
656
+ {
657
+ "epoch": 0.2606249074485414,
658
+ "grad_norm": 123.44333079620083,
659
+ "learning_rate": 4.6174979323599715e-07,
660
+ "logits/chosen": -2.418224811553955,
661
+ "logits/rejected": -2.4098281860351562,
662
+ "logps/chosen": -365.10400390625,
663
+ "logps/rejected": -231.907470703125,
664
+ "loss": 0.2034,
665
+ "rewards/accuracies": 0.9375,
666
+ "rewards/chosen": 0.7242127060890198,
667
+ "rewards/margins": 6.6777753829956055,
668
+ "rewards/rejected": -5.9535627365112305,
669
+ "step": 440
670
+ },
671
+ {
672
+ "epoch": 0.2665482007996446,
673
+ "grad_norm": 185.88217229096983,
674
+ "learning_rate": 4.5895609832097277e-07,
675
+ "logits/chosen": -2.4694488048553467,
676
+ "logits/rejected": -2.4495654106140137,
677
+ "logps/chosen": -340.63934326171875,
678
+ "logps/rejected": -227.07791137695312,
679
+ "loss": 0.1729,
680
+ "rewards/accuracies": 0.9437500238418579,
681
+ "rewards/chosen": 0.8438789248466492,
682
+ "rewards/margins": 6.674716949462891,
683
+ "rewards/rejected": -5.830838680267334,
684
+ "step": 450
685
+ },
686
+ {
687
+ "epoch": 0.27247149415074784,
688
+ "grad_norm": 179.72268316546433,
689
+ "learning_rate": 4.560730268187236e-07,
690
+ "logits/chosen": -2.389660358428955,
691
+ "logits/rejected": -2.396211624145508,
692
+ "logps/chosen": -332.08612060546875,
693
+ "logps/rejected": -227.017578125,
694
+ "loss": 0.1946,
695
+ "rewards/accuracies": 0.9375,
696
+ "rewards/chosen": 0.7443017959594727,
697
+ "rewards/margins": 6.060439586639404,
698
+ "rewards/rejected": -5.316138744354248,
699
+ "step": 460
700
+ },
701
+ {
702
+ "epoch": 0.27839478750185104,
703
+ "grad_norm": 110.66902509862196,
704
+ "learning_rate": 4.531018119025989e-07,
705
+ "logits/chosen": -2.4120264053344727,
706
+ "logits/rejected": -2.4028701782226562,
707
+ "logps/chosen": -314.36895751953125,
708
+ "logps/rejected": -218.5847930908203,
709
+ "loss": 0.1598,
710
+ "rewards/accuracies": 0.9125000238418579,
711
+ "rewards/chosen": 0.032317258417606354,
712
+ "rewards/margins": 5.494265556335449,
713
+ "rewards/rejected": -5.461948871612549,
714
+ "step": 470
715
+ },
716
+ {
717
+ "epoch": 0.28431808085295424,
718
+ "grad_norm": 214.74519309520247,
719
+ "learning_rate": 4.5004372444744376e-07,
720
+ "logits/chosen": -2.426173686981201,
721
+ "logits/rejected": -2.408202648162842,
722
+ "logps/chosen": -353.477294921875,
723
+ "logps/rejected": -240.3783721923828,
724
+ "loss": 0.2256,
725
+ "rewards/accuracies": 0.8812500238418579,
726
+ "rewards/chosen": -0.20403608679771423,
727
+ "rewards/margins": 6.32252311706543,
728
+ "rewards/rejected": -6.526559352874756,
729
+ "step": 480
730
+ },
731
+ {
732
+ "epoch": 0.29024137420405743,
733
+ "grad_norm": 191.0207951068833,
734
+ "learning_rate": 4.4690007248600967e-07,
735
+ "logits/chosen": -2.422132968902588,
736
+ "logits/rejected": -2.40049409866333,
737
+ "logps/chosen": -396.6956481933594,
738
+ "logps/rejected": -240.05593872070312,
739
+ "loss": 0.1501,
740
+ "rewards/accuracies": 0.949999988079071,
741
+ "rewards/chosen": 0.5469577312469482,
742
+ "rewards/margins": 7.562119483947754,
743
+ "rewards/rejected": -7.015161037445068,
744
+ "step": 490
745
+ },
746
+ {
747
+ "epoch": 0.2961646675551607,
748
+ "grad_norm": 98.30324866742373,
749
+ "learning_rate": 4.436722006494701e-07,
750
+ "logits/chosen": -2.439347505569458,
751
+ "logits/rejected": -2.4101297855377197,
752
+ "logps/chosen": -355.91400146484375,
753
+ "logps/rejected": -236.58407592773438,
754
+ "loss": 0.1413,
755
+ "rewards/accuracies": 0.949999988079071,
756
+ "rewards/chosen": 0.17143169045448303,
757
+ "rewards/margins": 7.068209648132324,
758
+ "rewards/rejected": -6.896778106689453,
759
+ "step": 500
760
+ },
761
+ {
762
+ "epoch": 0.3020879609062639,
763
+ "grad_norm": 172.09787160323572,
764
+ "learning_rate": 4.4036148959228356e-07,
765
+ "logits/chosen": -2.4719622135162354,
766
+ "logits/rejected": -2.4255988597869873,
767
+ "logps/chosen": -408.50079345703125,
768
+ "logps/rejected": -240.2022247314453,
769
+ "loss": 0.1282,
770
+ "rewards/accuracies": 0.9375,
771
+ "rewards/chosen": 0.17151732742786407,
772
+ "rewards/margins": 7.430660247802734,
773
+ "rewards/rejected": -7.259142875671387,
774
+ "step": 510
775
+ },
776
+ {
777
+ "epoch": 0.3080112542573671,
778
+ "grad_norm": 213.31450495313408,
779
+ "learning_rate": 4.3696935540164705e-07,
780
+ "logits/chosen": -2.4813694953918457,
781
+ "logits/rejected": -2.4537417888641357,
782
+ "logps/chosen": -305.8951110839844,
783
+ "logps/rejected": -212.90701293945312,
784
+ "loss": 0.2087,
785
+ "rewards/accuracies": 0.8999999761581421,
786
+ "rewards/chosen": -0.024075621739029884,
787
+ "rewards/margins": 6.6571245193481445,
788
+ "rewards/rejected": -6.6812005043029785,
789
+ "step": 520
790
+ },
791
+ {
792
+ "epoch": 0.3139345476084703,
793
+ "grad_norm": 83.43235360842944,
794
+ "learning_rate": 4.334972489917947e-07,
795
+ "logits/chosen": -2.4770638942718506,
796
+ "logits/rejected": -2.4384360313415527,
797
+ "logps/chosen": -343.23797607421875,
798
+ "logps/rejected": -231.5509033203125,
799
+ "loss": 0.138,
800
+ "rewards/accuracies": 0.9375,
801
+ "rewards/chosen": 0.25519540905952454,
802
+ "rewards/margins": 7.639869689941406,
803
+ "rewards/rejected": -7.3846755027771,
804
+ "step": 530
805
+ },
806
+ {
807
+ "epoch": 0.31985784095957354,
808
+ "grad_norm": 48.96905584146253,
809
+ "learning_rate": 4.299466554833997e-07,
810
+ "logits/chosen": -2.462367534637451,
811
+ "logits/rejected": -2.433013439178467,
812
+ "logps/chosen": -331.65179443359375,
813
+ "logps/rejected": -237.80703735351562,
814
+ "loss": 0.2046,
815
+ "rewards/accuracies": 0.9375,
816
+ "rewards/chosen": -0.43259042501449585,
817
+ "rewards/margins": 7.181746482849121,
818
+ "rewards/rejected": -7.614336967468262,
819
+ "step": 540
820
+ },
821
+ {
822
+ "epoch": 0.32578113431067673,
823
+ "grad_norm": 157.94476432481386,
824
+ "learning_rate": 4.263190935683449e-07,
825
+ "logits/chosen": -2.454803943634033,
826
+ "logits/rejected": -2.4091312885284424,
827
+ "logps/chosen": -346.77508544921875,
828
+ "logps/rejected": -241.5703125,
829
+ "loss": 0.1686,
830
+ "rewards/accuracies": 0.949999988079071,
831
+ "rewards/chosen": 0.16140493750572205,
832
+ "rewards/margins": 8.025360107421875,
833
+ "rewards/rejected": -7.863955497741699,
834
+ "step": 550
835
+ },
836
+ {
837
+ "epoch": 0.33170442766177993,
838
+ "grad_norm": 144.6392465996477,
839
+ "learning_rate": 4.2261611486013437e-07,
840
+ "logits/chosen": -2.361588478088379,
841
+ "logits/rejected": -2.340947389602661,
842
+ "logps/chosen": -376.0790710449219,
843
+ "logps/rejected": -261.4156799316406,
844
+ "loss": 0.1378,
845
+ "rewards/accuracies": 0.949999988079071,
846
+ "rewards/chosen": -0.15833595395088196,
847
+ "rewards/margins": 7.758933067321777,
848
+ "rewards/rejected": -7.917269706726074,
849
+ "step": 560
850
+ },
851
+ {
852
+ "epoch": 0.3376277210128832,
853
+ "grad_norm": 65.04958454200869,
854
+ "learning_rate": 4.188393032302233e-07,
855
+ "logits/chosen": -2.3772928714752197,
856
+ "logits/rejected": -2.3414306640625,
857
+ "logps/chosen": -364.4604187011719,
858
+ "logps/rejected": -261.02935791015625,
859
+ "loss": 0.1429,
860
+ "rewards/accuracies": 0.956250011920929,
861
+ "rewards/chosen": -0.16394826769828796,
862
+ "rewards/margins": 8.307961463928223,
863
+ "rewards/rejected": -8.471909523010254,
864
+ "step": 570
865
+ },
866
+ {
867
+ "epoch": 0.3435510143639864,
868
+ "grad_norm": 39.92370071225025,
869
+ "learning_rate": 4.1499027413055e-07,
870
+ "logits/chosen": -2.4158213138580322,
871
+ "logits/rejected": -2.400824546813965,
872
+ "logps/chosen": -355.3429260253906,
873
+ "logps/rejected": -234.93331909179688,
874
+ "loss": 0.1184,
875
+ "rewards/accuracies": 0.949999988079071,
876
+ "rewards/chosen": 0.20681849122047424,
877
+ "rewards/margins": 8.111043930053711,
878
+ "rewards/rejected": -7.904225826263428,
879
+ "step": 580
880
+ },
881
+ {
882
+ "epoch": 0.3494743077150896,
883
+ "grad_norm": 113.63629001311939,
884
+ "learning_rate": 4.1107067390256056e-07,
885
+ "logits/chosen": -2.3974995613098145,
886
+ "logits/rejected": -2.385651111602783,
887
+ "logps/chosen": -371.2859802246094,
888
+ "logps/rejected": -251.9286346435547,
889
+ "loss": 0.1818,
890
+ "rewards/accuracies": 0.956250011920929,
891
+ "rewards/chosen": -0.022083889693021774,
892
+ "rewards/margins": 8.15199089050293,
893
+ "rewards/rejected": -8.174076080322266,
894
+ "step": 590
895
+ },
896
+ {
897
+ "epoch": 0.3553976010661928,
898
+ "grad_norm": 76.65150743146755,
899
+ "learning_rate": 4.0708217907302047e-07,
900
+ "logits/chosen": -2.446026563644409,
901
+ "logits/rejected": -2.446206569671631,
902
+ "logps/chosen": -390.41510009765625,
903
+ "logps/rejected": -249.01260375976562,
904
+ "loss": 0.1302,
905
+ "rewards/accuracies": 0.9750000238418579,
906
+ "rewards/chosen": 0.1251598298549652,
907
+ "rewards/margins": 8.266278266906738,
908
+ "rewards/rejected": -8.141119003295898,
909
+ "step": 600
910
+ },
911
+ {
912
+ "epoch": 0.36132089441729603,
913
+ "grad_norm": 154.61613568075327,
914
+ "learning_rate": 4.030264956369157e-07,
915
+ "logits/chosen": -2.431563377380371,
916
+ "logits/rejected": -2.402116298675537,
917
+ "logps/chosen": -367.7977600097656,
918
+ "logps/rejected": -256.6099548339844,
919
+ "loss": 0.1303,
920
+ "rewards/accuracies": 0.9437500238418579,
921
+ "rewards/chosen": -0.1916094869375229,
922
+ "rewards/margins": 8.200936317443848,
923
+ "rewards/rejected": -8.392545700073242,
924
+ "step": 610
925
+ },
926
+ {
927
+ "epoch": 0.36724418776839923,
928
+ "grad_norm": 365.6788351549646,
929
+ "learning_rate": 3.989053583277492e-07,
930
+ "logits/chosen": -2.437472105026245,
931
+ "logits/rejected": -2.424696445465088,
932
+ "logps/chosen": -364.15374755859375,
933
+ "logps/rejected": -273.16595458984375,
934
+ "loss": 0.2123,
935
+ "rewards/accuracies": 0.9312499761581421,
936
+ "rewards/chosen": -1.120966911315918,
937
+ "rewards/margins": 7.895718574523926,
938
+ "rewards/rejected": -9.016683578491211,
939
+ "step": 620
940
+ },
941
+ {
942
+ "epoch": 0.37316748111950243,
943
+ "grad_norm": 170.20992171320333,
944
+ "learning_rate": 3.947205298755447e-07,
945
+ "logits/chosen": -2.448801279067993,
946
+ "logits/rejected": -2.427565813064575,
947
+ "logps/chosen": -411.0587463378906,
948
+ "logps/rejected": -271.62652587890625,
949
+ "loss": 0.139,
950
+ "rewards/accuracies": 0.9437500238418579,
951
+ "rewards/chosen": -0.17977580428123474,
952
+ "rewards/margins": 8.981353759765625,
953
+ "rewards/rejected": -9.161130905151367,
954
+ "step": 630
955
+ },
956
+ {
957
+ "epoch": 0.37909077447060563,
958
+ "grad_norm": 177.25293521329584,
959
+ "learning_rate": 3.9047380025287634e-07,
960
+ "logits/chosen": -2.372959613800049,
961
+ "logits/rejected": -2.3525047302246094,
962
+ "logps/chosen": -356.154296875,
963
+ "logps/rejected": -274.4513854980469,
964
+ "loss": 0.1744,
965
+ "rewards/accuracies": 0.9375,
966
+ "rewards/chosen": -0.8354924321174622,
967
+ "rewards/margins": 8.103582382202148,
968
+ "rewards/rejected": -8.93907356262207,
969
+ "step": 640
970
+ },
971
+ {
972
+ "epoch": 0.3850140678217089,
973
+ "grad_norm": 360.5455787004992,
974
+ "learning_rate": 3.8616698590924523e-07,
975
+ "logits/chosen": -2.4072346687316895,
976
+ "logits/rejected": -2.383105993270874,
977
+ "logps/chosen": -351.6363220214844,
978
+ "logps/rejected": -239.79788208007812,
979
+ "loss": 0.2141,
980
+ "rewards/accuracies": 0.90625,
981
+ "rewards/chosen": -0.6761496663093567,
982
+ "rewards/margins": 7.148809909820557,
983
+ "rewards/rejected": -7.824960231781006,
984
+ "step": 650
985
+ },
986
+ {
987
+ "epoch": 0.3909373611728121,
988
+ "grad_norm": 270.4274889017812,
989
+ "learning_rate": 3.8180192899413123e-07,
990
+ "logits/chosen": -2.4213879108428955,
991
+ "logits/rejected": -2.4075303077697754,
992
+ "logps/chosen": -417.3751525878906,
993
+ "logps/rejected": -265.44854736328125,
994
+ "loss": 0.1753,
995
+ "rewards/accuracies": 0.925000011920929,
996
+ "rewards/chosen": 0.6145623326301575,
997
+ "rewards/margins": 9.045318603515625,
998
+ "rewards/rejected": -8.430756568908691,
999
+ "step": 660
1000
+ },
1001
+ {
1002
+ "epoch": 0.3968606545239153,
1003
+ "grad_norm": 167.55864604832982,
1004
+ "learning_rate": 3.7738049656905225e-07,
1005
+ "logits/chosen": -2.42012619972229,
1006
+ "logits/rejected": -2.407224416732788,
1007
+ "logps/chosen": -367.69439697265625,
1008
+ "logps/rejected": -252.48849487304688,
1009
+ "loss": 0.1777,
1010
+ "rewards/accuracies": 0.9437500238418579,
1011
+ "rewards/chosen": 0.16578371822834015,
1012
+ "rewards/margins": 8.239130020141602,
1013
+ "rewards/rejected": -8.073347091674805,
1014
+ "step": 670
1015
+ },
1016
+ {
1017
+ "epoch": 0.40278394787501853,
1018
+ "grad_norm": 199.41539357853415,
1019
+ "learning_rate": 3.7290457980896787e-07,
1020
+ "logits/chosen": -2.481067180633545,
1021
+ "logits/rejected": -2.4814565181732178,
1022
+ "logps/chosen": -396.1634216308594,
1023
+ "logps/rejected": -250.05990600585938,
1024
+ "loss": 0.1805,
1025
+ "rewards/accuracies": 0.9375,
1026
+ "rewards/chosen": -0.06804431229829788,
1027
+ "rewards/margins": 7.441187381744385,
1028
+ "rewards/rejected": -7.5092315673828125,
1029
+ "step": 680
1030
+ },
1031
+ {
1032
+ "epoch": 0.40870724122612173,
1033
+ "grad_norm": 127.14212704342435,
1034
+ "learning_rate": 3.68376093193369e-07,
1035
+ "logits/chosen": -2.447186231613159,
1036
+ "logits/rejected": -2.441771984100342,
1037
+ "logps/chosen": -361.73516845703125,
1038
+ "logps/rejected": -253.1171417236328,
1039
+ "loss": 0.2343,
1040
+ "rewards/accuracies": 0.9312499761581421,
1041
+ "rewards/chosen": 0.23877505958080292,
1042
+ "rewards/margins": 8.379014015197754,
1043
+ "rewards/rejected": -8.140238761901855,
1044
+ "step": 690
1045
+ },
1046
+ {
1047
+ "epoch": 0.41463053457722493,
1048
+ "grad_norm": 156.38290182401465,
1049
+ "learning_rate": 3.637969736873992e-07,
1050
+ "logits/chosen": -2.468182325363159,
1051
+ "logits/rejected": -2.4434332847595215,
1052
+ "logps/chosen": -365.93463134765625,
1053
+ "logps/rejected": -235.07699584960938,
1054
+ "loss": 0.1846,
1055
+ "rewards/accuracies": 0.9312499761581421,
1056
+ "rewards/chosen": 0.2839972972869873,
1057
+ "rewards/margins": 7.469516754150391,
1058
+ "rewards/rejected": -7.185519218444824,
1059
+ "step": 700
1060
+ },
1061
+ {
1062
+ "epoch": 0.42055382792832813,
1063
+ "grad_norm": 80.92449178173787,
1064
+ "learning_rate": 3.591691799133587e-07,
1065
+ "logits/chosen": -2.4627020359039307,
1066
+ "logits/rejected": -2.4086921215057373,
1067
+ "logps/chosen": -399.5291442871094,
1068
+ "logps/rejected": -245.04385375976562,
1069
+ "loss": 0.1367,
1070
+ "rewards/accuracies": 0.9437500238418579,
1071
+ "rewards/chosen": 0.26195061206817627,
1072
+ "rewards/margins": 8.103439331054688,
1073
+ "rewards/rejected": -7.841488838195801,
1074
+ "step": 710
1075
+ },
1076
+ {
1077
+ "epoch": 0.4264771212794314,
1078
+ "grad_norm": 164.79458174222637,
1079
+ "learning_rate": 3.5449469131294476e-07,
1080
+ "logits/chosen": -2.4840645790100098,
1081
+ "logits/rejected": -2.4384331703186035,
1082
+ "logps/chosen": -356.69952392578125,
1083
+ "logps/rejected": -245.77041625976562,
1084
+ "loss": 0.1222,
1085
+ "rewards/accuracies": 0.9437500238418579,
1086
+ "rewards/chosen": -0.7162678837776184,
1087
+ "rewards/margins": 7.782705783843994,
1088
+ "rewards/rejected": -8.49897289276123,
1089
+ "step": 720
1090
+ },
1091
+ {
1092
+ "epoch": 0.4324004146305346,
1093
+ "grad_norm": 172.5370543885783,
1094
+ "learning_rate": 3.497755073005868e-07,
1095
+ "logits/chosen": -2.47273850440979,
1096
+ "logits/rejected": -2.4551682472229004,
1097
+ "logps/chosen": -394.3757629394531,
1098
+ "logps/rejected": -252.3651885986328,
1099
+ "loss": 0.1914,
1100
+ "rewards/accuracies": 0.96875,
1101
+ "rewards/chosen": -0.40301981568336487,
1102
+ "rewards/margins": 7.574839115142822,
1103
+ "rewards/rejected": -7.9778594970703125,
1104
+ "step": 730
1105
+ },
1106
+ {
1107
+ "epoch": 0.4383237079816378,
1108
+ "grad_norm": 133.2758942704595,
1109
+ "learning_rate": 3.4501364640823926e-07,
1110
+ "logits/chosen": -2.5142226219177246,
1111
+ "logits/rejected": -2.48344087600708,
1112
+ "logps/chosen": -351.15582275390625,
1113
+ "logps/rejected": -255.9786376953125,
1114
+ "loss": 0.1843,
1115
+ "rewards/accuracies": 0.9312499761581421,
1116
+ "rewards/chosen": -0.5328763723373413,
1117
+ "rewards/margins": 7.858250617980957,
1118
+ "rewards/rejected": -8.39112663269043,
1119
+ "step": 740
1120
+ },
1121
+ {
1122
+ "epoch": 0.44424700133274103,
1123
+ "grad_norm": 151.90351812794674,
1124
+ "learning_rate": 3.402111454219966e-07,
1125
+ "logits/chosen": -2.4132637977600098,
1126
+ "logits/rejected": -2.3917102813720703,
1127
+ "logps/chosen": -364.5772705078125,
1128
+ "logps/rejected": -241.1504364013672,
1129
+ "loss": 0.2091,
1130
+ "rewards/accuracies": 0.893750011920929,
1131
+ "rewards/chosen": -0.33990636467933655,
1132
+ "rewards/margins": 7.563208103179932,
1133
+ "rewards/rejected": -7.90311336517334,
1134
+ "step": 750
1135
+ },
1136
+ {
1137
+ "epoch": 0.45017029468384423,
1138
+ "grad_norm": 175.38346833521052,
1139
+ "learning_rate": 3.353700585109005e-07,
1140
+ "logits/chosen": -2.4272594451904297,
1141
+ "logits/rejected": -2.4220120906829834,
1142
+ "logps/chosen": -374.5780334472656,
1143
+ "logps/rejected": -245.2948760986328,
1144
+ "loss": 0.1494,
1145
+ "rewards/accuracies": 0.96875,
1146
+ "rewards/chosen": 0.7049421668052673,
1147
+ "rewards/margins": 8.787199020385742,
1148
+ "rewards/rejected": -8.082255363464355,
1149
+ "step": 760
1150
+ },
1151
+ {
1152
+ "epoch": 0.45609358803494743,
1153
+ "grad_norm": 105.96376988895295,
1154
+ "learning_rate": 3.304924563483129e-07,
1155
+ "logits/chosen": -2.398015260696411,
1156
+ "logits/rejected": -2.389971971511841,
1157
+ "logps/chosen": -383.84344482421875,
1158
+ "logps/rejected": -238.3228759765625,
1159
+ "loss": 0.1439,
1160
+ "rewards/accuracies": 0.949999988079071,
1161
+ "rewards/chosen": 0.1009693592786789,
1162
+ "rewards/margins": 8.56285285949707,
1163
+ "rewards/rejected": -8.461882591247559,
1164
+ "step": 770
1165
+ },
1166
+ {
1167
+ "epoch": 0.4620168813860506,
1168
+ "grad_norm": 210.20985866366246,
1169
+ "learning_rate": 3.255804252262283e-07,
1170
+ "logits/chosen": -2.450796604156494,
1171
+ "logits/rejected": -2.423675060272217,
1172
+ "logps/chosen": -340.67572021484375,
1173
+ "logps/rejected": -249.12155151367188,
1174
+ "loss": 0.1576,
1175
+ "rewards/accuracies": 0.9312499761581421,
1176
+ "rewards/chosen": -0.6850844621658325,
1177
+ "rewards/margins": 7.610543251037598,
1178
+ "rewards/rejected": -8.29562759399414,
1179
+ "step": 780
1180
+ },
1181
+ {
1182
+ "epoch": 0.4679401747371539,
1183
+ "grad_norm": 38.37233058357723,
1184
+ "learning_rate": 3.2063606616290626e-07,
1185
+ "logits/chosen": -2.3822109699249268,
1186
+ "logits/rejected": -2.354607105255127,
1187
+ "logps/chosen": -382.82598876953125,
1188
+ "logps/rejected": -249.13320922851562,
1189
+ "loss": 0.1334,
1190
+ "rewards/accuracies": 0.9624999761581421,
1191
+ "rewards/chosen": 0.23880510032176971,
1192
+ "rewards/margins": 9.092573165893555,
1193
+ "rewards/rejected": -8.853767395019531,
1194
+ "step": 790
1195
+ },
1196
+ {
1197
+ "epoch": 0.4738634680882571,
1198
+ "grad_norm": 116.83040603007437,
1199
+ "learning_rate": 3.1566149400420523e-07,
1200
+ "logits/chosen": -2.448554515838623,
1201
+ "logits/rejected": -2.4384427070617676,
1202
+ "logps/chosen": -364.5985107421875,
1203
+ "logps/rejected": -256.18109130859375,
1204
+ "loss": 0.1401,
1205
+ "rewards/accuracies": 0.956250011920929,
1206
+ "rewards/chosen": 0.08134280145168304,
1207
+ "rewards/margins": 8.44970989227295,
1208
+ "rewards/rejected": -8.368366241455078,
1209
+ "step": 800
1210
+ },
1211
+ {
1212
+ "epoch": 0.4797867614393603,
1213
+ "grad_norm": 120.81771343753893,
1214
+ "learning_rate": 3.1065883651900087e-07,
1215
+ "logits/chosen": -2.403247356414795,
1216
+ "logits/rejected": -2.3994150161743164,
1217
+ "logps/chosen": -369.07373046875,
1218
+ "logps/rejected": -260.661865234375,
1219
+ "loss": 0.1515,
1220
+ "rewards/accuracies": 0.956250011920929,
1221
+ "rewards/chosen": 0.24362042546272278,
1222
+ "rewards/margins": 8.64600658416748,
1223
+ "rewards/rejected": -8.402385711669922,
1224
+ "step": 810
1225
+ },
1226
+ {
1227
+ "epoch": 0.4857100547904635,
1228
+ "grad_norm": 163.57574382904446,
1229
+ "learning_rate": 3.056302334890786e-07,
1230
+ "logits/chosen": -2.472740888595581,
1231
+ "logits/rejected": -2.447871685028076,
1232
+ "logps/chosen": -364.73382568359375,
1233
+ "logps/rejected": -243.4113311767578,
1234
+ "loss": 0.1651,
1235
+ "rewards/accuracies": 0.918749988079071,
1236
+ "rewards/chosen": 0.1390383541584015,
1237
+ "rewards/margins": 7.885109901428223,
1238
+ "rewards/rejected": -7.746070861816406,
1239
+ "step": 820
1240
+ },
1241
+ {
1242
+ "epoch": 0.49163334814156673,
1243
+ "grad_norm": 110.291493965559,
1244
+ "learning_rate": 3.0057783579388586e-07,
1245
+ "logits/chosen": -2.3814120292663574,
1246
+ "logits/rejected": -2.3707213401794434,
1247
+ "logps/chosen": -380.00115966796875,
1248
+ "logps/rejected": -259.52813720703125,
1249
+ "loss": 0.1149,
1250
+ "rewards/accuracies": 0.9750000238418579,
1251
+ "rewards/chosen": -0.010458474978804588,
1252
+ "rewards/margins": 8.064732551574707,
1253
+ "rewards/rejected": -8.075190544128418,
1254
+ "step": 830
1255
+ },
1256
+ {
1257
+ "epoch": 0.4975566414926699,
1258
+ "grad_norm": 149.32298193701925,
1259
+ "learning_rate": 2.9550380449053907e-07,
1260
+ "logits/chosen": -2.4556055068969727,
1261
+ "logits/rejected": -2.4424943923950195,
1262
+ "logps/chosen": -353.40087890625,
1263
+ "logps/rejected": -249.0240020751953,
1264
+ "loss": 0.1836,
1265
+ "rewards/accuracies": 0.925000011920929,
1266
+ "rewards/chosen": -0.6178598403930664,
1267
+ "rewards/margins": 6.908316135406494,
1268
+ "rewards/rejected": -7.526176452636719,
1269
+ "step": 840
1270
+ },
1271
+ {
1272
+ "epoch": 0.5034799348437732,
1273
+ "grad_norm": 298.0495441928926,
1274
+ "learning_rate": 2.904103098894767e-07,
1275
+ "logits/chosen": -2.4620985984802246,
1276
+ "logits/rejected": -2.4430508613586426,
1277
+ "logps/chosen": -381.40338134765625,
1278
+ "logps/rejected": -244.94735717773438,
1279
+ "loss": 0.1312,
1280
+ "rewards/accuracies": 0.9312499761581421,
1281
+ "rewards/chosen": -0.1066194549202919,
1282
+ "rewards/margins": 8.322629928588867,
1283
+ "rewards/rejected": -8.42924976348877,
1284
+ "step": 850
1285
+ },
1286
+ {
1287
+ "epoch": 0.5094032281948764,
1288
+ "grad_norm": 44.65987676821851,
1289
+ "learning_rate": 2.852995306261545e-07,
1290
+ "logits/chosen": -2.4761202335357666,
1291
+ "logits/rejected": -2.458343982696533,
1292
+ "logps/chosen": -416.10015869140625,
1293
+ "logps/rejected": -269.155029296875,
1294
+ "loss": 0.1904,
1295
+ "rewards/accuracies": 0.949999988079071,
1296
+ "rewards/chosen": -0.5461829900741577,
1297
+ "rewards/margins": 8.20594310760498,
1298
+ "rewards/rejected": -8.75212574005127,
1299
+ "step": 860
1300
+ },
1301
+ {
1302
+ "epoch": 0.5153265215459796,
1303
+ "grad_norm": 138.61047314539718,
1304
+ "learning_rate": 2.801736527291797e-07,
1305
+ "logits/chosen": -2.4353713989257812,
1306
+ "logits/rejected": -2.440162420272827,
1307
+ "logps/chosen": -368.56671142578125,
1308
+ "logps/rejected": -262.32000732421875,
1309
+ "loss": 0.1165,
1310
+ "rewards/accuracies": 0.956250011920929,
1311
+ "rewards/chosen": -0.6010546684265137,
1312
+ "rewards/margins": 8.25703239440918,
1313
+ "rewards/rejected": -8.858087539672852,
1314
+ "step": 870
1315
+ },
1316
+ {
1317
+ "epoch": 0.5212498148970828,
1318
+ "grad_norm": 32.99592732987078,
1319
+ "learning_rate": 2.750348686852836e-07,
1320
+ "logits/chosen": -2.4713826179504395,
1321
+ "logits/rejected": -2.4460389614105225,
1322
+ "logps/chosen": -395.4850769042969,
1323
+ "logps/rejected": -263.14697265625,
1324
+ "loss": 0.0963,
1325
+ "rewards/accuracies": 0.956250011920929,
1326
+ "rewards/chosen": -0.47209230065345764,
1327
+ "rewards/margins": 8.793320655822754,
1328
+ "rewards/rejected": -9.265413284301758,
1329
+ "step": 880
1330
+ },
1331
+ {
1332
+ "epoch": 0.527173108248186,
1333
+ "grad_norm": 225.08525680844807,
1334
+ "learning_rate": 2.69885376501531e-07,
1335
+ "logits/chosen": -2.4371562004089355,
1336
+ "logits/rejected": -2.4292545318603516,
1337
+ "logps/chosen": -368.02349853515625,
1338
+ "logps/rejected": -282.25390625,
1339
+ "loss": 0.0939,
1340
+ "rewards/accuracies": 0.956250011920929,
1341
+ "rewards/chosen": -0.43533068895339966,
1342
+ "rewards/margins": 8.678312301635742,
1343
+ "rewards/rejected": -9.113642692565918,
1344
+ "step": 890
1345
+ },
1346
+ {
1347
+ "epoch": 0.5330964015992892,
1348
+ "grad_norm": 117.84513266560047,
1349
+ "learning_rate": 2.647273787651687e-07,
1350
+ "logits/chosen": -2.4487619400024414,
1351
+ "logits/rejected": -2.401111125946045,
1352
+ "logps/chosen": -412.74359130859375,
1353
+ "logps/rejected": -252.1238250732422,
1354
+ "loss": 0.1775,
1355
+ "rewards/accuracies": 0.9437500238418579,
1356
+ "rewards/chosen": -0.4897252917289734,
1357
+ "rewards/margins": 8.060205459594727,
1358
+ "rewards/rejected": -8.549931526184082,
1359
+ "step": 900
1360
+ },
1361
+ {
1362
+ "epoch": 0.5390196949503924,
1363
+ "grad_norm": 109.88865766710877,
1364
+ "learning_rate": 2.5956308170151526e-07,
1365
+ "logits/chosen": -2.403801441192627,
1366
+ "logits/rejected": -2.3616394996643066,
1367
+ "logps/chosen": -408.7337951660156,
1368
+ "logps/rejected": -258.9072570800781,
1369
+ "loss": 0.1848,
1370
+ "rewards/accuracies": 0.9312499761581421,
1371
+ "rewards/chosen": -0.19426563382148743,
1372
+ "rewards/margins": 8.246191024780273,
1373
+ "rewards/rejected": -8.44045639038086,
1374
+ "step": 910
1375
+ },
1376
+ {
1377
+ "epoch": 0.5449429883014957,
1378
+ "grad_norm": 129.4752751715463,
1379
+ "learning_rate": 2.543946942302944e-07,
1380
+ "logits/chosen": -2.4435765743255615,
1381
+ "logits/rejected": -2.440274477005005,
1382
+ "logps/chosen": -338.3968505859375,
1383
+ "logps/rejected": -252.08511352539062,
1384
+ "loss": 0.1796,
1385
+ "rewards/accuracies": 0.918749988079071,
1386
+ "rewards/chosen": -0.5266432762145996,
1387
+ "rewards/margins": 7.261737823486328,
1388
+ "rewards/rejected": -7.7883806228637695,
1389
+ "step": 920
1390
+ },
1391
+ {
1392
+ "epoch": 0.5508662816525989,
1393
+ "grad_norm": 121.9458874503311,
1394
+ "learning_rate": 2.492244270208158e-07,
1395
+ "logits/chosen": -2.441021680831909,
1396
+ "logits/rejected": -2.4099669456481934,
1397
+ "logps/chosen": -378.41082763671875,
1398
+ "logps/rejected": -243.3146209716797,
1399
+ "loss": 0.1649,
1400
+ "rewards/accuracies": 0.9125000238418579,
1401
+ "rewards/chosen": -0.03578373044729233,
1402
+ "rewards/margins": 7.419234275817871,
1403
+ "rewards/rejected": -7.455018520355225,
1404
+ "step": 930
1405
+ },
1406
+ {
1407
+ "epoch": 0.5567895750037021,
1408
+ "grad_norm": 100.11022642436808,
1409
+ "learning_rate": 2.440544915464078e-07,
1410
+ "logits/chosen": -2.3852946758270264,
1411
+ "logits/rejected": -2.384293794631958,
1412
+ "logps/chosen": -395.0909118652344,
1413
+ "logps/rejected": -283.81109619140625,
1414
+ "loss": 0.1261,
1415
+ "rewards/accuracies": 0.9375,
1416
+ "rewards/chosen": -0.41745632886886597,
1417
+ "rewards/margins": 8.085180282592773,
1418
+ "rewards/rejected": -8.502635955810547,
1419
+ "step": 940
1420
+ },
1421
+ {
1422
+ "epoch": 0.5627128683548053,
1423
+ "grad_norm": 239.01205162989254,
1424
+ "learning_rate": 2.3888709913850593e-07,
1425
+ "logits/chosen": -2.415767192840576,
1426
+ "logits/rejected": -2.4106380939483643,
1427
+ "logps/chosen": -308.04681396484375,
1428
+ "logps/rejected": -246.5779266357422,
1429
+ "loss": 0.2266,
1430
+ "rewards/accuracies": 0.893750011920929,
1431
+ "rewards/chosen": -0.8913130760192871,
1432
+ "rewards/margins": 7.0888872146606445,
1433
+ "rewards/rejected": -7.980200290679932,
1434
+ "step": 950
1435
+ },
1436
+ {
1437
+ "epoch": 0.5686361617059085,
1438
+ "grad_norm": 63.97848384116781,
1439
+ "learning_rate": 2.337244600408025e-07,
1440
+ "logits/chosen": -2.3705592155456543,
1441
+ "logits/rejected": -2.3697023391723633,
1442
+ "logps/chosen": -387.23468017578125,
1443
+ "logps/rejected": -259.0006408691406,
1444
+ "loss": 0.1191,
1445
+ "rewards/accuracies": 0.956250011920929,
1446
+ "rewards/chosen": -0.3846256732940674,
1447
+ "rewards/margins": 8.353618621826172,
1448
+ "rewards/rejected": -8.738243103027344,
1449
+ "step": 960
1450
+ },
1451
+ {
1452
+ "epoch": 0.5745594550570117,
1453
+ "grad_norm": 117.6322527275228,
1454
+ "learning_rate": 2.2856878246386085e-07,
1455
+ "logits/chosen": -2.3312573432922363,
1456
+ "logits/rejected": -2.337036371231079,
1457
+ "logps/chosen": -396.6022644042969,
1458
+ "logps/rejected": -264.2213439941406,
1459
+ "loss": 0.1741,
1460
+ "rewards/accuracies": 0.9125000238418579,
1461
+ "rewards/chosen": -0.13234379887580872,
1462
+ "rewards/margins": 8.489058494567871,
1463
+ "rewards/rejected": -8.621402740478516,
1464
+ "step": 970
1465
+ },
1466
+ {
1467
+ "epoch": 0.5804827484081149,
1468
+ "grad_norm": 104.9036090682925,
1469
+ "learning_rate": 2.2342227164060035e-07,
1470
+ "logits/chosen": -2.3327765464782715,
1471
+ "logits/rejected": -2.3284411430358887,
1472
+ "logps/chosen": -389.34027099609375,
1473
+ "logps/rejected": -267.3720703125,
1474
+ "loss": 0.0761,
1475
+ "rewards/accuracies": 0.987500011920929,
1476
+ "rewards/chosen": 0.05067775771021843,
1477
+ "rewards/margins": 8.64169692993164,
1478
+ "rewards/rejected": -8.591019630432129,
1479
+ "step": 980
1480
+ },
1481
+ {
1482
+ "epoch": 0.5864060417592182,
1483
+ "grad_norm": 159.50861121207825,
1484
+ "learning_rate": 2.182871288830533e-07,
1485
+ "logits/chosen": -2.286221742630005,
1486
+ "logits/rejected": -2.3094284534454346,
1487
+ "logps/chosen": -356.39630126953125,
1488
+ "logps/rejected": -273.9451599121094,
1489
+ "loss": 0.1836,
1490
+ "rewards/accuracies": 0.9375,
1491
+ "rewards/chosen": -0.7623409032821655,
1492
+ "rewards/margins": 7.787258148193359,
1493
+ "rewards/rejected": -8.549599647521973,
1494
+ "step": 990
1495
+ },
1496
+ {
1497
+ "epoch": 0.5923293351103214,
1498
+ "grad_norm": 133.61865184419023,
1499
+ "learning_rate": 2.131655506408007e-07,
1500
+ "logits/chosen": -2.295893430709839,
1501
+ "logits/rejected": -2.2877814769744873,
1502
+ "logps/chosen": -390.1469421386719,
1503
+ "logps/rejected": -258.03076171875,
1504
+ "loss": 0.1684,
1505
+ "rewards/accuracies": 0.9375,
1506
+ "rewards/chosen": -0.3687962293624878,
1507
+ "rewards/margins": 8.301212310791016,
1508
+ "rewards/rejected": -8.670007705688477,
1509
+ "step": 1000
1510
+ },
1511
+ {
1512
+ "epoch": 0.5982526284614246,
1513
+ "grad_norm": 281.753571055999,
1514
+ "learning_rate": 2.0805972756148643e-07,
1515
+ "logits/chosen": -2.3332886695861816,
1516
+ "logits/rejected": -2.3315188884735107,
1517
+ "logps/chosen": -409.4767150878906,
1518
+ "logps/rejected": -261.0493469238281,
1519
+ "loss": 0.1785,
1520
+ "rewards/accuracies": 0.925000011920929,
1521
+ "rewards/chosen": -0.04291856661438942,
1522
+ "rewards/margins": 8.258207321166992,
1523
+ "rewards/rejected": -8.301126480102539,
1524
+ "step": 1010
1525
+ },
1526
+ {
1527
+ "epoch": 0.6041759218125278,
1528
+ "grad_norm": 284.7843229605494,
1529
+ "learning_rate": 2.0297184355381432e-07,
1530
+ "logits/chosen": -2.3366830348968506,
1531
+ "logits/rejected": -2.3286259174346924,
1532
+ "logps/chosen": -380.1471862792969,
1533
+ "logps/rejected": -256.72039794921875,
1534
+ "loss": 0.1423,
1535
+ "rewards/accuracies": 0.9624999761581421,
1536
+ "rewards/chosen": 0.037542905658483505,
1537
+ "rewards/margins": 8.408895492553711,
1538
+ "rewards/rejected": -8.37135124206543,
1539
+ "step": 1020
1540
+ },
1541
+ {
1542
+ "epoch": 0.610099215163631,
1543
+ "grad_norm": 211.9761172407754,
1544
+ "learning_rate": 1.9790407485342638e-07,
1545
+ "logits/chosen": -2.2436161041259766,
1546
+ "logits/rejected": -2.2300117015838623,
1547
+ "logps/chosen": -398.33917236328125,
1548
+ "logps/rejected": -265.8940734863281,
1549
+ "loss": 0.1086,
1550
+ "rewards/accuracies": 0.956250011920929,
1551
+ "rewards/chosen": -0.20479007065296173,
1552
+ "rewards/margins": 8.657182693481445,
1553
+ "rewards/rejected": -8.861973762512207,
1554
+ "step": 1030
1555
+ },
1556
+ {
1557
+ "epoch": 0.6160225085147342,
1558
+ "grad_norm": 170.42441601315315,
1559
+ "learning_rate": 1.928585890920641e-07,
1560
+ "logits/chosen": -2.310929536819458,
1561
+ "logits/rejected": -2.305142879486084,
1562
+ "logps/chosen": -329.87359619140625,
1563
+ "logps/rejected": -256.7166748046875,
1564
+ "loss": 0.1104,
1565
+ "rewards/accuracies": 0.96875,
1566
+ "rewards/chosen": -0.6634355783462524,
1567
+ "rewards/margins": 8.081941604614258,
1568
+ "rewards/rejected": -8.745377540588379,
1569
+ "step": 1040
1570
+ },
1571
+ {
1572
+ "epoch": 0.6219458018658374,
1573
+ "grad_norm": 128.54909030583076,
1574
+ "learning_rate": 1.8783754437040902e-07,
1575
+ "logits/chosen": -2.3843979835510254,
1576
+ "logits/rejected": -2.374628782272339,
1577
+ "logps/chosen": -384.445068359375,
1578
+ "logps/rejected": -262.91778564453125,
1579
+ "loss": 0.1279,
1580
+ "rewards/accuracies": 0.956250011920929,
1581
+ "rewards/chosen": -0.14523005485534668,
1582
+ "rewards/margins": 9.181239128112793,
1583
+ "rewards/rejected": -9.326468467712402,
1584
+ "step": 1050
1585
+ },
1586
+ {
1587
+ "epoch": 0.6278690952169406,
1588
+ "grad_norm": 307.8448068760573,
1589
+ "learning_rate": 1.8284308833500118e-07,
1590
+ "logits/chosen": -2.3700027465820312,
1591
+ "logits/rejected": -2.3581337928771973,
1592
+ "logps/chosen": -355.612060546875,
1593
+ "logps/rejected": -268.969970703125,
1594
+ "loss": 0.133,
1595
+ "rewards/accuracies": 0.949999988079071,
1596
+ "rewards/chosen": -0.7148059010505676,
1597
+ "rewards/margins": 8.257380485534668,
1598
+ "rewards/rejected": -8.972186088562012,
1599
+ "step": 1060
1600
+ },
1601
+ {
1602
+ "epoch": 0.6337923885680439,
1603
+ "grad_norm": 214.38553119597148,
1604
+ "learning_rate": 1.7787735725962756e-07,
1605
+ "logits/chosen": -2.358398199081421,
1606
+ "logits/rejected": -2.3408474922180176,
1607
+ "logps/chosen": -337.56427001953125,
1608
+ "logps/rejected": -256.54119873046875,
1609
+ "loss": 0.2134,
1610
+ "rewards/accuracies": 0.918749988079071,
1611
+ "rewards/chosen": -1.1531927585601807,
1612
+ "rewards/margins": 7.760645389556885,
1613
+ "rewards/rejected": -8.913838386535645,
1614
+ "step": 1070
1615
+ },
1616
+ {
1617
+ "epoch": 0.6397156819191471,
1618
+ "grad_norm": 175.56218680703643,
1619
+ "learning_rate": 1.7294247513157616e-07,
1620
+ "logits/chosen": -2.3990116119384766,
1621
+ "logits/rejected": -2.3715598583221436,
1622
+ "logps/chosen": -430.0047302246094,
1623
+ "logps/rejected": -277.5758361816406,
1624
+ "loss": 0.1585,
1625
+ "rewards/accuracies": 0.949999988079071,
1626
+ "rewards/chosen": -0.28831297159194946,
1627
+ "rewards/margins": 9.353979110717773,
1628
+ "rewards/rejected": -9.642291069030762,
1629
+ "step": 1080
1630
+ },
1631
+ {
1632
+ "epoch": 0.6456389752702503,
1633
+ "grad_norm": 251.12704474839504,
1634
+ "learning_rate": 1.6804055274314494e-07,
1635
+ "logits/chosen": -2.3074963092803955,
1636
+ "logits/rejected": -2.3215384483337402,
1637
+ "logps/chosen": -360.34307861328125,
1638
+ "logps/rejected": -262.5448913574219,
1639
+ "loss": 0.144,
1640
+ "rewards/accuracies": 0.9624999761581421,
1641
+ "rewards/chosen": -0.527413010597229,
1642
+ "rewards/margins": 7.982884883880615,
1643
+ "rewards/rejected": -8.510297775268555,
1644
+ "step": 1090
1645
+ },
1646
+ {
1647
+ "epoch": 0.6515622686213535,
1648
+ "grad_norm": 199.1781794411605,
1649
+ "learning_rate": 1.6317368678879496e-07,
1650
+ "logits/chosen": -2.358985662460327,
1651
+ "logits/rejected": -2.3370273113250732,
1652
+ "logps/chosen": -427.634521484375,
1653
+ "logps/rejected": -271.89276123046875,
1654
+ "loss": 0.1672,
1655
+ "rewards/accuracies": 0.90625,
1656
+ "rewards/chosen": -0.5357122421264648,
1657
+ "rewards/margins": 8.577353477478027,
1658
+ "rewards/rejected": -9.113065719604492,
1659
+ "step": 1100
1660
+ },
1661
+ {
1662
+ "epoch": 0.6574855619724567,
1663
+ "grad_norm": 117.97757864532069,
1664
+ "learning_rate": 1.5834395896833281e-07,
1665
+ "logits/chosen": -2.4176013469696045,
1666
+ "logits/rejected": -2.4249093532562256,
1667
+ "logps/chosen": -401.90667724609375,
1668
+ "logps/rejected": -275.45086669921875,
1669
+ "loss": 0.1541,
1670
+ "rewards/accuracies": 0.9624999761581421,
1671
+ "rewards/chosen": -0.13858655095100403,
1672
+ "rewards/margins": 9.566910743713379,
1673
+ "rewards/rejected": -9.705496788024902,
1674
+ "step": 1110
1675
+ },
1676
+ {
1677
+ "epoch": 0.6634088553235599,
1678
+ "grad_norm": 258.70885157576424,
1679
+ "learning_rate": 1.535534350965075e-07,
1680
+ "logits/chosen": -2.3172340393066406,
1681
+ "logits/rejected": -2.309823989868164,
1682
+ "logps/chosen": -340.2889709472656,
1683
+ "logps/rejected": -265.4190979003906,
1684
+ "loss": 0.1473,
1685
+ "rewards/accuracies": 0.9375,
1686
+ "rewards/chosen": -1.003075361251831,
1687
+ "rewards/margins": 8.131772994995117,
1688
+ "rewards/rejected": -9.134848594665527,
1689
+ "step": 1120
1690
+ },
1691
+ {
1692
+ "epoch": 0.6693321486746631,
1693
+ "grad_norm": 201.96502150689784,
1694
+ "learning_rate": 1.4880416421940154e-07,
1695
+ "logits/chosen": -2.4160609245300293,
1696
+ "logits/rejected": -2.4233767986297607,
1697
+ "logps/chosen": -405.70684814453125,
1698
+ "logps/rejected": -260.34912109375,
1699
+ "loss": 0.1724,
1700
+ "rewards/accuracies": 0.9375,
1701
+ "rewards/chosen": 0.08480462431907654,
1702
+ "rewards/margins": 8.918376922607422,
1703
+ "rewards/rejected": -8.833572387695312,
1704
+ "step": 1130
1705
+ },
1706
+ {
1707
+ "epoch": 0.6752554420257664,
1708
+ "grad_norm": 181.74682648839334,
1709
+ "learning_rate": 1.4409817773799459e-07,
1710
+ "logits/chosen": -2.4215950965881348,
1711
+ "logits/rejected": -2.4049665927886963,
1712
+ "logps/chosen": -366.7375183105469,
1713
+ "logps/rejected": -254.3837432861328,
1714
+ "loss": 0.0756,
1715
+ "rewards/accuracies": 0.96875,
1716
+ "rewards/chosen": -0.5654062032699585,
1717
+ "rewards/margins": 8.568774223327637,
1718
+ "rewards/rejected": -9.134181022644043,
1719
+ "step": 1140
1720
+ },
1721
+ {
1722
+ "epoch": 0.6811787353768696,
1723
+ "grad_norm": 67.27024810633608,
1724
+ "learning_rate": 1.3943748853927385e-07,
1725
+ "logits/chosen": -2.405170202255249,
1726
+ "logits/rejected": -2.371568202972412,
1727
+ "logps/chosen": -390.42718505859375,
1728
+ "logps/rejected": -272.73919677734375,
1729
+ "loss": 0.1351,
1730
+ "rewards/accuracies": 0.9437500238418579,
1731
+ "rewards/chosen": -0.37090998888015747,
1732
+ "rewards/margins": 8.646610260009766,
1733
+ "rewards/rejected": -9.017520904541016,
1734
+ "step": 1150
1735
+ },
1736
+ {
1737
+ "epoch": 0.6871020287279728,
1738
+ "grad_norm": 55.76657618504751,
1739
+ "learning_rate": 1.3482409013526436e-07,
1740
+ "logits/chosen": -2.441098213195801,
1741
+ "logits/rejected": -2.4157867431640625,
1742
+ "logps/chosen": -384.2054443359375,
1743
+ "logps/rejected": -268.0479736328125,
1744
+ "loss": 0.1027,
1745
+ "rewards/accuracies": 0.9624999761581421,
1746
+ "rewards/chosen": -0.44497281312942505,
1747
+ "rewards/margins": 8.480072021484375,
1748
+ "rewards/rejected": -8.925044059753418,
1749
+ "step": 1160
1750
+ },
1751
+ {
1752
+ "epoch": 0.693025322079076,
1753
+ "grad_norm": 25.040104944811635,
1754
+ "learning_rate": 1.302599558103456e-07,
1755
+ "logits/chosen": -2.3982086181640625,
1756
+ "logits/rejected": -2.385756015777588,
1757
+ "logps/chosen": -382.56939697265625,
1758
+ "logps/rejected": -260.25616455078125,
1759
+ "loss": 0.1179,
1760
+ "rewards/accuracies": 0.949999988079071,
1761
+ "rewards/chosen": -0.5692565441131592,
1762
+ "rewards/margins": 8.688024520874023,
1763
+ "rewards/rejected": -9.257280349731445,
1764
+ "step": 1170
1765
+ },
1766
+ {
1767
+ "epoch": 0.6989486154301792,
1768
+ "grad_norm": 296.46624172556943,
1769
+ "learning_rate": 1.257470377772214e-07,
1770
+ "logits/chosen": -2.3739066123962402,
1771
+ "logits/rejected": -2.3816215991973877,
1772
+ "logps/chosen": -385.3483581542969,
1773
+ "logps/rejected": -271.0584411621094,
1774
+ "loss": 0.1722,
1775
+ "rewards/accuracies": 0.956250011920929,
1776
+ "rewards/chosen": 0.0011343419319018722,
1777
+ "rewards/margins": 9.203813552856445,
1778
+ "rewards/rejected": -9.202679634094238,
1779
+ "step": 1180
1780
+ },
1781
+ {
1782
+ "epoch": 0.7048719087812824,
1783
+ "grad_norm": 62.58357200837184,
1784
+ "learning_rate": 1.2128726634190046e-07,
1785
+ "logits/chosen": -2.4125571250915527,
1786
+ "logits/rejected": -2.3731584548950195,
1787
+ "logps/chosen": -360.48260498046875,
1788
+ "logps/rejected": -257.780517578125,
1789
+ "loss": 0.1151,
1790
+ "rewards/accuracies": 0.949999988079071,
1791
+ "rewards/chosen": 0.00764580350369215,
1792
+ "rewards/margins": 8.383014678955078,
1793
+ "rewards/rejected": -8.375368118286133,
1794
+ "step": 1190
1795
+ },
1796
+ {
1797
+ "epoch": 0.7107952021323856,
1798
+ "grad_norm": 104.3167050157556,
1799
+ "learning_rate": 1.1688254907804992e-07,
1800
+ "logits/chosen": -2.4090094566345215,
1801
+ "logits/rejected": -2.4037535190582275,
1802
+ "logps/chosen": -446.6856384277344,
1803
+ "logps/rejected": -272.03125,
1804
+ "loss": 0.1451,
1805
+ "rewards/accuracies": 0.918749988079071,
1806
+ "rewards/chosen": 0.24328398704528809,
1807
+ "rewards/margins": 8.856546401977539,
1808
+ "rewards/rejected": -8.613263130187988,
1809
+ "step": 1200
1810
+ },
1811
+ {
1812
+ "epoch": 0.7167184954834889,
1813
+ "grad_norm": 167.28701038200384,
1814
+ "learning_rate": 1.1253477001106956e-07,
1815
+ "logits/chosen": -2.4815921783447266,
1816
+ "logits/rejected": -2.4600720405578613,
1817
+ "logps/chosen": -373.46539306640625,
1818
+ "logps/rejected": -249.20217895507812,
1819
+ "loss": 0.1822,
1820
+ "rewards/accuracies": 0.956250011920929,
1821
+ "rewards/chosen": 0.3091612756252289,
1822
+ "rewards/margins": 8.595071792602539,
1823
+ "rewards/rejected": -8.285909652709961,
1824
+ "step": 1210
1825
+ },
1826
+ {
1827
+ "epoch": 0.7226417888345921,
1828
+ "grad_norm": 103.53904259555343,
1829
+ "learning_rate": 1.0824578881224065e-07,
1830
+ "logits/chosen": -2.4788851737976074,
1831
+ "logits/rejected": -2.473902940750122,
1832
+ "logps/chosen": -348.9431457519531,
1833
+ "logps/rejected": -238.5637664794922,
1834
+ "loss": 0.1396,
1835
+ "rewards/accuracies": 0.9375,
1836
+ "rewards/chosen": -0.04494175314903259,
1837
+ "rewards/margins": 7.9483323097229,
1838
+ "rewards/rejected": -7.993273735046387,
1839
+ "step": 1220
1840
+ },
1841
+ {
1842
+ "epoch": 0.7285650821856953,
1843
+ "grad_norm": 60.68963545277756,
1844
+ "learning_rate": 1.0401744000328918e-07,
1845
+ "logits/chosen": -2.423682451248169,
1846
+ "logits/rejected": -2.424591302871704,
1847
+ "logps/chosen": -362.5735168457031,
1848
+ "logps/rejected": -250.6082305908203,
1849
+ "loss": 0.0947,
1850
+ "rewards/accuracies": 0.96875,
1851
+ "rewards/chosen": 0.03537973761558533,
1852
+ "rewards/margins": 8.45991325378418,
1853
+ "rewards/rejected": -8.42453384399414,
1854
+ "step": 1230
1855
+ },
1856
+ {
1857
+ "epoch": 0.7344883755367985,
1858
+ "grad_norm": 178.49917347104252,
1859
+ "learning_rate": 9.985153217170902e-08,
1860
+ "logits/chosen": -2.441357135772705,
1861
+ "logits/rejected": -2.414604902267456,
1862
+ "logps/chosen": -331.5675964355469,
1863
+ "logps/rejected": -249.1182861328125,
1864
+ "loss": 0.1617,
1865
+ "rewards/accuracies": 0.9375,
1866
+ "rewards/chosen": -0.4034241735935211,
1867
+ "rewards/margins": 7.731410026550293,
1868
+ "rewards/rejected": -8.134834289550781,
1869
+ "step": 1240
1870
+ },
1871
+ {
1872
+ "epoch": 0.7404116688879017,
1873
+ "grad_norm": 150.3556045078429,
1874
+ "learning_rate": 9.574984719717553e-08,
1875
+ "logits/chosen": -2.402959108352661,
1876
+ "logits/rejected": -2.3822813034057617,
1877
+ "logps/chosen": -344.8562316894531,
1878
+ "logps/rejected": -261.72198486328125,
1879
+ "loss": 0.1141,
1880
+ "rewards/accuracies": 0.9437500238418579,
1881
+ "rewards/chosen": -0.2789481282234192,
1882
+ "rewards/margins": 8.856972694396973,
1883
+ "rewards/rejected": -9.135921478271484,
1884
+ "step": 1250
1885
+ },
1886
+ {
1887
+ "epoch": 0.7463349622390049,
1888
+ "grad_norm": 107.9110175024558,
1889
+ "learning_rate": 9.171413948938459e-08,
1890
+ "logits/chosen": -2.4185938835144043,
1891
+ "logits/rejected": -2.4270195960998535,
1892
+ "logps/chosen": -351.55206298828125,
1893
+ "logps/rejected": -245.40463256835938,
1894
+ "loss": 0.1194,
1895
+ "rewards/accuracies": 0.9750000238418579,
1896
+ "rewards/chosen": -0.38523998856544495,
1897
+ "rewards/margins": 8.319580078125,
1898
+ "rewards/rejected": -8.70482063293457,
1899
+ "step": 1260
1900
+ },
1901
+ {
1902
+ "epoch": 0.7522582555901081,
1903
+ "grad_norm": 201.2050831618457,
1904
+ "learning_rate": 8.774613523764049e-08,
1905
+ "logits/chosen": -2.417402744293213,
1906
+ "logits/rejected": -2.3926000595092773,
1907
+ "logps/chosen": -295.12255859375,
1908
+ "logps/rejected": -243.2334442138672,
1909
+ "loss": 0.1174,
1910
+ "rewards/accuracies": 0.9437500238418579,
1911
+ "rewards/chosen": -0.7759536504745483,
1912
+ "rewards/margins": 7.477177619934082,
1913
+ "rewards/rejected": -8.253131866455078,
1914
+ "step": 1270
1915
+ },
1916
+ {
1917
+ "epoch": 0.7581815489412113,
1918
+ "grad_norm": 118.14466939117713,
1919
+ "learning_rate": 8.384753167251412e-08,
1920
+ "logits/chosen": -2.4332449436187744,
1921
+ "logits/rejected": -2.4204506874084473,
1922
+ "logps/chosen": -347.2571716308594,
1923
+ "logps/rejected": -239.56527709960938,
1924
+ "loss": 0.1154,
1925
+ "rewards/accuracies": 0.9312499761581421,
1926
+ "rewards/chosen": -0.4777204990386963,
1927
+ "rewards/margins": 7.958296775817871,
1928
+ "rewards/rejected": -8.436017990112305,
1929
+ "step": 1280
1930
+ },
1931
+ {
1932
+ "epoch": 0.7641048422923146,
1933
+ "grad_norm": 115.9052038218948,
1934
+ "learning_rate": 8.001999633988942e-08,
1935
+ "logits/chosen": -2.409921169281006,
1936
+ "logits/rejected": -2.38704776763916,
1937
+ "logps/chosen": -343.9907531738281,
1938
+ "logps/rejected": -241.80130004882812,
1939
+ "loss": 0.1142,
1940
+ "rewards/accuracies": 0.9624999761581421,
1941
+ "rewards/chosen": -0.26248881220817566,
1942
+ "rewards/margins": 8.137993812561035,
1943
+ "rewards/rejected": -8.400481224060059,
1944
+ "step": 1290
1945
+ },
1946
+ {
1947
+ "epoch": 0.7700281356434178,
1948
+ "grad_norm": 44.99756218547529,
1949
+ "learning_rate": 7.62651663877042e-08,
1950
+ "logits/chosen": -2.431091547012329,
1951
+ "logits/rejected": -2.4076178073883057,
1952
+ "logps/chosen": -413.41339111328125,
1953
+ "logps/rejected": -272.5387268066406,
1954
+ "loss": 0.1154,
1955
+ "rewards/accuracies": 0.949999988079071,
1956
+ "rewards/chosen": -0.2713491916656494,
1957
+ "rewards/margins": 8.43775749206543,
1958
+ "rewards/rejected": -8.709107398986816,
1959
+ "step": 1300
1960
+ },
1961
+ {
1962
+ "epoch": 0.775951428994521,
1963
+ "grad_norm": 271.4269072656633,
1964
+ "learning_rate": 7.258464786569549e-08,
1965
+ "logits/chosen": -2.382718086242676,
1966
+ "logits/rejected": -2.3769688606262207,
1967
+ "logps/chosen": -327.7436218261719,
1968
+ "logps/rejected": -232.5410614013672,
1969
+ "loss": 0.1171,
1970
+ "rewards/accuracies": 0.9312499761581421,
1971
+ "rewards/chosen": -0.5554865002632141,
1972
+ "rewards/margins": 7.468624114990234,
1973
+ "rewards/rejected": -8.024110794067383,
1974
+ "step": 1310
1975
+ },
1976
+ {
1977
+ "epoch": 0.7818747223456242,
1978
+ "grad_norm": 101.64320308184249,
1979
+ "learning_rate": 6.898001503844483e-08,
1980
+ "logits/chosen": -2.417436361312866,
1981
+ "logits/rejected": -2.4156861305236816,
1982
+ "logps/chosen": -355.09088134765625,
1983
+ "logps/rejected": -256.650634765625,
1984
+ "loss": 0.0912,
1985
+ "rewards/accuracies": 0.9624999761581421,
1986
+ "rewards/chosen": -0.4817001223564148,
1987
+ "rewards/margins": 8.431524276733398,
1988
+ "rewards/rejected": -8.913224220275879,
1989
+ "step": 1320
1990
+ },
1991
+ {
1992
+ "epoch": 0.7877980156967274,
1993
+ "grad_norm": 70.70476612750028,
1994
+ "learning_rate": 6.545280971202014e-08,
1995
+ "logits/chosen": -2.3889780044555664,
1996
+ "logits/rejected": -2.3656888008117676,
1997
+ "logps/chosen": -353.8454895019531,
1998
+ "logps/rejected": -263.7891540527344,
1999
+ "loss": 0.1696,
2000
+ "rewards/accuracies": 0.9312499761581421,
2001
+ "rewards/chosen": -0.36209869384765625,
2002
+ "rewards/margins": 8.301480293273926,
2003
+ "rewards/rejected": -8.663579940795898,
2004
+ "step": 1330
2005
+ },
2006
+ {
2007
+ "epoch": 0.7937213090478306,
2008
+ "grad_norm": 115.20773379897521,
2009
+ "learning_rate": 6.200454057450022e-08,
2010
+ "logits/chosen": -2.4155004024505615,
2011
+ "logits/rejected": -2.3988840579986572,
2012
+ "logps/chosen": -358.6512756347656,
2013
+ "logps/rejected": -260.19317626953125,
2014
+ "loss": 0.1052,
2015
+ "rewards/accuracies": 0.949999988079071,
2016
+ "rewards/chosen": -0.13175497949123383,
2017
+ "rewards/margins": 8.193792343139648,
2018
+ "rewards/rejected": -8.325546264648438,
2019
+ "step": 1340
2020
+ },
2021
+ {
2022
+ "epoch": 0.7996446023989338,
2023
+ "grad_norm": 348.05960789906396,
2024
+ "learning_rate": 5.863668255066492e-08,
2025
+ "logits/chosen": -2.415785312652588,
2026
+ "logits/rejected": -2.402935028076172,
2027
+ "logps/chosen": -328.21258544921875,
2028
+ "logps/rejected": -232.1653289794922,
2029
+ "loss": 0.0952,
2030
+ "rewards/accuracies": 0.949999988079071,
2031
+ "rewards/chosen": -0.03695619851350784,
2032
+ "rewards/margins": 8.016546249389648,
2033
+ "rewards/rejected": -8.053503036499023,
2034
+ "step": 1350
2035
+ },
2036
+ {
2037
+ "epoch": 0.8055678957500371,
2038
+ "grad_norm": 164.95215219581857,
2039
+ "learning_rate": 5.53506761711274e-08,
2040
+ "logits/chosen": -2.4385123252868652,
2041
+ "logits/rejected": -2.410877227783203,
2042
+ "logps/chosen": -429.80712890625,
2043
+ "logps/rejected": -269.0475769042969,
2044
+ "loss": 0.184,
2045
+ "rewards/accuracies": 0.918749988079071,
2046
+ "rewards/chosen": 0.22647914290428162,
2047
+ "rewards/margins": 9.079859733581543,
2048
+ "rewards/rejected": -8.85338020324707,
2049
+ "step": 1360
2050
+ },
2051
+ {
2052
+ "epoch": 0.8114911891011403,
2053
+ "grad_norm": 153.76387128912395,
2054
+ "learning_rate": 5.2147926956177174e-08,
2055
+ "logits/chosen": -2.400052309036255,
2056
+ "logits/rejected": -2.3625330924987793,
2057
+ "logps/chosen": -375.1744079589844,
2058
+ "logps/rejected": -251.15213012695312,
2059
+ "loss": 0.1689,
2060
+ "rewards/accuracies": 0.9312499761581421,
2061
+ "rewards/chosen": -0.37147119641304016,
2062
+ "rewards/margins": 7.78417444229126,
2063
+ "rewards/rejected": -8.155645370483398,
2064
+ "step": 1370
2065
+ },
2066
+ {
2067
+ "epoch": 0.8174144824522435,
2068
+ "grad_norm": 61.83899130563118,
2069
+ "learning_rate": 4.902980481459834e-08,
2070
+ "logits/chosen": -2.410595655441284,
2071
+ "logits/rejected": -2.3901000022888184,
2072
+ "logps/chosen": -425.96466064453125,
2073
+ "logps/rejected": -264.555908203125,
2074
+ "loss": 0.1532,
2075
+ "rewards/accuracies": 0.925000011920929,
2076
+ "rewards/chosen": 0.039742302149534225,
2077
+ "rewards/margins": 8.688650131225586,
2078
+ "rewards/rejected": -8.648908615112305,
2079
+ "step": 1380
2080
+ },
2081
+ {
2082
+ "epoch": 0.8233377758033467,
2083
+ "grad_norm": 66.74057591923928,
2084
+ "learning_rate": 4.5997643457719646e-08,
2085
+ "logits/chosen": -2.343575954437256,
2086
+ "logits/rejected": -2.366262912750244,
2087
+ "logps/chosen": -375.5441589355469,
2088
+ "logps/rejected": -252.6354522705078,
2089
+ "loss": 0.1891,
2090
+ "rewards/accuracies": 0.925000011920929,
2091
+ "rewards/chosen": -0.5915766954421997,
2092
+ "rewards/margins": 8.14863395690918,
2093
+ "rewards/rejected": -8.74021053314209,
2094
+ "step": 1390
2095
+ },
2096
+ {
2097
+ "epoch": 0.8292610691544499,
2098
+ "grad_norm": 114.35276279118753,
2099
+ "learning_rate": 4.305273982894772e-08,
2100
+ "logits/chosen": -2.423459529876709,
2101
+ "logits/rejected": -2.391444683074951,
2102
+ "logps/chosen": -352.93212890625,
2103
+ "logps/rejected": -244.15249633789062,
2104
+ "loss": 0.104,
2105
+ "rewards/accuracies": 0.956250011920929,
2106
+ "rewards/chosen": -0.4639635980129242,
2107
+ "rewards/margins": 8.43293285369873,
2108
+ "rewards/rejected": -8.896896362304688,
2109
+ "step": 1400
2110
+ },
2111
+ {
2112
+ "epoch": 0.8351843625055531,
2113
+ "grad_norm": 289.8483422636645,
2114
+ "learning_rate": 4.0196353549026786e-08,
2115
+ "logits/chosen": -2.4569976329803467,
2116
+ "logits/rejected": -2.421668529510498,
2117
+ "logps/chosen": -413.7970275878906,
2118
+ "logps/rejected": -260.17559814453125,
2119
+ "loss": 0.0994,
2120
+ "rewards/accuracies": 0.956250011920929,
2121
+ "rewards/chosen": -0.029109030961990356,
2122
+ "rewards/margins": 8.420695304870605,
2123
+ "rewards/rejected": -8.449804306030273,
2124
+ "step": 1410
2125
+ },
2126
+ {
2127
+ "epoch": 0.8411076558566563,
2128
+ "grad_norm": 268.194865051895,
2129
+ "learning_rate": 3.742970637726181e-08,
2130
+ "logits/chosen": -2.401829242706299,
2131
+ "logits/rejected": -2.3853981494903564,
2132
+ "logps/chosen": -336.02154541015625,
2133
+ "logps/rejected": -260.9532775878906,
2134
+ "loss": 0.1105,
2135
+ "rewards/accuracies": 0.949999988079071,
2136
+ "rewards/chosen": -0.7859910726547241,
2137
+ "rewards/margins": 7.957832336425781,
2138
+ "rewards/rejected": -8.743824005126953,
2139
+ "step": 1420
2140
+ },
2141
+ {
2142
+ "epoch": 0.8470309492077596,
2143
+ "grad_norm": 108.72556342354649,
2144
+ "learning_rate": 3.4753981688937284e-08,
2145
+ "logits/chosen": -2.4282424449920654,
2146
+ "logits/rejected": -2.4077298641204834,
2147
+ "logps/chosen": -372.74859619140625,
2148
+ "logps/rejected": -255.5635986328125,
2149
+ "loss": 0.1373,
2150
+ "rewards/accuracies": 0.9624999761581421,
2151
+ "rewards/chosen": -0.21870934963226318,
2152
+ "rewards/margins": 8.647294998168945,
2153
+ "rewards/rejected": -8.866004943847656,
2154
+ "step": 1430
2155
+ },
2156
+ {
2157
+ "epoch": 0.8529542425588628,
2158
+ "grad_norm": 91.89889805093581,
2159
+ "learning_rate": 3.217032396915265e-08,
2160
+ "logits/chosen": -2.392305850982666,
2161
+ "logits/rejected": -2.3840205669403076,
2162
+ "logps/chosen": -384.5450439453125,
2163
+ "logps/rejected": -271.7799987792969,
2164
+ "loss": 0.1026,
2165
+ "rewards/accuracies": 0.956250011920929,
2166
+ "rewards/chosen": 0.14383617043495178,
2167
+ "rewards/margins": 8.799039840698242,
2168
+ "rewards/rejected": -8.655204772949219,
2169
+ "step": 1440
2170
+ },
2171
+ {
2172
+ "epoch": 0.858877535909966,
2173
+ "grad_norm": 110.87871865085356,
2174
+ "learning_rate": 2.9679838323293404e-08,
2175
+ "logits/chosen": -2.4049434661865234,
2176
+ "logits/rejected": -2.3713347911834717,
2177
+ "logps/chosen": -388.73480224609375,
2178
+ "logps/rejected": -259.99884033203125,
2179
+ "loss": 0.1238,
2180
+ "rewards/accuracies": 0.96875,
2181
+ "rewards/chosen": -0.4035407602787018,
2182
+ "rewards/margins": 8.943741798400879,
2183
+ "rewards/rejected": -9.347283363342285,
2184
+ "step": 1450
2185
+ },
2186
+ {
2187
+ "epoch": 0.8648008292610692,
2188
+ "grad_norm": 173.62149862701466,
2189
+ "learning_rate": 2.728359000434488e-08,
2190
+ "logits/chosen": -2.460850715637207,
2191
+ "logits/rejected": -2.4532432556152344,
2192
+ "logps/chosen": -408.2056884765625,
2193
+ "logps/rejected": -253.97256469726562,
2194
+ "loss": 0.1465,
2195
+ "rewards/accuracies": 0.925000011920929,
2196
+ "rewards/chosen": -0.10670559108257294,
2197
+ "rewards/margins": 8.27380084991455,
2198
+ "rewards/rejected": -8.38050651550293,
2199
+ "step": 1460
2200
+ },
2201
+ {
2202
+ "epoch": 0.8707241226121724,
2203
+ "grad_norm": 71.42558720517053,
2204
+ "learning_rate": 2.498260395725302e-08,
2205
+ "logits/chosen": -2.43896484375,
2206
+ "logits/rejected": -2.4272048473358154,
2207
+ "logps/chosen": -360.89898681640625,
2208
+ "logps/rejected": -257.8056945800781,
2209
+ "loss": 0.1013,
2210
+ "rewards/accuracies": 0.9624999761581421,
2211
+ "rewards/chosen": -0.2727930545806885,
2212
+ "rewards/margins": 8.105804443359375,
2213
+ "rewards/rejected": -8.378597259521484,
2214
+ "step": 1470
2215
+ },
2216
+ {
2217
+ "epoch": 0.8766474159632756,
2218
+ "grad_norm": 101.27358766803283,
2219
+ "learning_rate": 2.2777864380525426e-08,
2220
+ "logits/chosen": -2.4488790035247803,
2221
+ "logits/rejected": -2.4199492931365967,
2222
+ "logps/chosen": -368.7762145996094,
2223
+ "logps/rejected": -244.88949584960938,
2224
+ "loss": 0.1181,
2225
+ "rewards/accuracies": 0.949999988079071,
2226
+ "rewards/chosen": 0.07807255536317825,
2227
+ "rewards/margins": 8.640520095825195,
2228
+ "rewards/rejected": -8.56244945526123,
2229
+ "step": 1480
2230
+ },
2231
+ {
2232
+ "epoch": 0.8825707093143788,
2233
+ "grad_norm": 248.7079758081644,
2234
+ "learning_rate": 2.0670314305261423e-08,
2235
+ "logits/chosen": -2.443664073944092,
2236
+ "logits/rejected": -2.4383292198181152,
2237
+ "logps/chosen": -358.8103332519531,
2238
+ "logps/rejected": -255.68338012695312,
2239
+ "loss": 0.1486,
2240
+ "rewards/accuracies": 0.918749988079071,
2241
+ "rewards/chosen": -0.19392237067222595,
2242
+ "rewards/margins": 8.529777526855469,
2243
+ "rewards/rejected": -8.723699569702148,
2244
+ "step": 1490
2245
+ },
2246
+ {
2247
+ "epoch": 0.8884940026654821,
2248
+ "grad_norm": 228.603556403638,
2249
+ "learning_rate": 1.866085519178995e-08,
2250
+ "logits/chosen": -2.430896282196045,
2251
+ "logits/rejected": -2.39713716506958,
2252
+ "logps/chosen": -370.627685546875,
2253
+ "logps/rejected": -263.00537109375,
2254
+ "loss": 0.147,
2255
+ "rewards/accuracies": 0.949999988079071,
2256
+ "rewards/chosen": -0.3709534704685211,
2257
+ "rewards/margins": 8.486063003540039,
2258
+ "rewards/rejected": -8.85701847076416,
2259
+ "step": 1500
2260
+ },
2261
+ {
2262
+ "epoch": 0.8944172960165853,
2263
+ "grad_norm": 98.42860918890575,
2264
+ "learning_rate": 1.675034654408894e-08,
2265
+ "logits/chosen": -2.434990167617798,
2266
+ "logits/rejected": -2.408191204071045,
2267
+ "logps/chosen": -367.3474426269531,
2268
+ "logps/rejected": -252.94888305664062,
2269
+ "loss": 0.0803,
2270
+ "rewards/accuracies": 0.9937499761581421,
2271
+ "rewards/chosen": -0.12121151387691498,
2272
+ "rewards/margins": 8.94743537902832,
2273
+ "rewards/rejected": -9.068646430969238,
2274
+ "step": 1510
2275
+ },
2276
+ {
2277
+ "epoch": 0.9003405893676885,
2278
+ "grad_norm": 139.6690107400583,
2279
+ "learning_rate": 1.4939605542150595e-08,
2280
+ "logits/chosen": -2.399888277053833,
2281
+ "logits/rejected": -2.36537504196167,
2282
+ "logps/chosen": -379.11102294921875,
2283
+ "logps/rejected": -256.906005859375,
2284
+ "loss": 0.0731,
2285
+ "rewards/accuracies": 0.9750000238418579,
2286
+ "rewards/chosen": -0.062218405306339264,
2287
+ "rewards/margins": 8.892073631286621,
2288
+ "rewards/rejected": -8.954293251037598,
2289
+ "step": 1520
2290
+ },
2291
+ {
2292
+ "epoch": 0.9062638827187917,
2293
+ "grad_norm": 67.8318737973644,
2294
+ "learning_rate": 1.3229406692449791e-08,
2295
+ "logits/chosen": -2.479100227355957,
2296
+ "logits/rejected": -2.4435877799987793,
2297
+ "logps/chosen": -313.84320068359375,
2298
+ "logps/rejected": -239.1044921875,
2299
+ "loss": 0.1053,
2300
+ "rewards/accuracies": 0.9624999761581421,
2301
+ "rewards/chosen": -0.45267003774642944,
2302
+ "rewards/margins": 7.700293064117432,
2303
+ "rewards/rejected": -8.152963638305664,
2304
+ "step": 1530
2305
+ },
2306
+ {
2307
+ "epoch": 0.9121871760698949,
2308
+ "grad_norm": 133.6230788419226,
2309
+ "learning_rate": 1.162048149666503e-08,
2310
+ "logits/chosen": -2.3948609828948975,
2311
+ "logits/rejected": -2.3574843406677246,
2312
+ "logps/chosen": -375.52130126953125,
2313
+ "logps/rejected": -248.8327178955078,
2314
+ "loss": 0.1221,
2315
+ "rewards/accuracies": 0.9375,
2316
+ "rewards/chosen": -0.5586664080619812,
2317
+ "rewards/margins": 8.581660270690918,
2318
+ "rewards/rejected": -9.140327453613281,
2319
+ "step": 1540
2320
+ },
2321
+ {
2322
+ "epoch": 0.918110469420998,
2323
+ "grad_norm": 81.24757118228953,
2324
+ "learning_rate": 1.0113518138794047e-08,
2325
+ "logits/chosen": -2.424410581588745,
2326
+ "logits/rejected": -2.394204616546631,
2327
+ "logps/chosen": -375.75787353515625,
2328
+ "logps/rejected": -253.07760620117188,
2329
+ "loss": 0.2091,
2330
+ "rewards/accuracies": 0.918749988079071,
2331
+ "rewards/chosen": -0.5351217985153198,
2332
+ "rewards/margins": 7.9864182472229,
2333
+ "rewards/rejected": -8.521539688110352,
2334
+ "step": 1550
2335
+ },
2336
+ {
2337
+ "epoch": 0.9240337627721013,
2338
+ "grad_norm": 335.2650242487096,
2339
+ "learning_rate": 8.709161190797565e-09,
2340
+ "logits/chosen": -2.427302122116089,
2341
+ "logits/rejected": -2.419010639190674,
2342
+ "logps/chosen": -371.4006652832031,
2343
+ "logps/rejected": -262.81365966796875,
2344
+ "loss": 0.1524,
2345
+ "rewards/accuracies": 0.956250011920929,
2346
+ "rewards/chosen": -0.2929847836494446,
2347
+ "rewards/margins": 8.156495094299316,
2348
+ "rewards/rejected": -8.449480056762695,
2349
+ "step": 1560
2350
+ },
2351
+ {
2352
+ "epoch": 0.9299570561232045,
2353
+ "grad_norm": 196.1060901854074,
2354
+ "learning_rate": 7.408011336897141e-09,
2355
+ "logits/chosen": -2.4616658687591553,
2356
+ "logits/rejected": -2.4268431663513184,
2357
+ "logps/chosen": -371.90484619140625,
2358
+ "logps/rejected": -251.7766876220703,
2359
+ "loss": 0.1275,
2360
+ "rewards/accuracies": 0.949999988079071,
2361
+ "rewards/chosen": -0.5105483531951904,
2362
+ "rewards/margins": 8.057133674621582,
2363
+ "rewards/rejected": -8.567682266235352,
2364
+ "step": 1570
2365
+ },
2366
+ {
2367
+ "epoch": 0.9358803494743078,
2368
+ "grad_norm": 58.4219582518985,
2369
+ "learning_rate": 6.210625116645135e-09,
2370
+ "logits/chosen": -2.403099775314331,
2371
+ "logits/rejected": -2.384831666946411,
2372
+ "logps/chosen": -417.583251953125,
2373
+ "logps/rejected": -272.24603271484375,
2374
+ "loss": 0.1594,
2375
+ "rewards/accuracies": 0.918749988079071,
2376
+ "rewards/chosen": -0.04987464100122452,
2377
+ "rewards/margins": 8.769102096557617,
2378
+ "rewards/rejected": -8.818976402282715,
2379
+ "step": 1580
2380
+ },
2381
+ {
2382
+ "epoch": 0.941803642825411,
2383
+ "grad_norm": 144.546751800466,
2384
+ "learning_rate": 5.117514686876378e-09,
2385
+ "logits/chosen": -2.456322193145752,
2386
+ "logits/rejected": -2.416351795196533,
2387
+ "logps/chosen": -351.6046447753906,
2388
+ "logps/rejected": -273.55303955078125,
2389
+ "loss": 0.1241,
2390
+ "rewards/accuracies": 0.9624999761581421,
2391
+ "rewards/chosen": -0.4101320207118988,
2392
+ "rewards/margins": 9.183103561401367,
2393
+ "rewards/rejected": -9.593236923217773,
2394
+ "step": 1590
2395
+ },
2396
+ {
2397
+ "epoch": 0.9477269361765142,
2398
+ "grad_norm": 120.48952563193308,
2399
+ "learning_rate": 4.1291476026441565e-09,
2400
+ "logits/chosen": -2.456437110900879,
2401
+ "logits/rejected": -2.4311928749084473,
2402
+ "logps/chosen": -365.16168212890625,
2403
+ "logps/rejected": -253.9194793701172,
2404
+ "loss": 0.1579,
2405
+ "rewards/accuracies": 0.9437500238418579,
2406
+ "rewards/chosen": -0.06400365382432938,
2407
+ "rewards/margins": 8.453577041625977,
2408
+ "rewards/rejected": -8.51758098602295,
2409
+ "step": 1600
2410
+ },
2411
+ {
2412
+ "epoch": 0.9536502295276174,
2413
+ "grad_norm": 106.80281991069545,
2414
+ "learning_rate": 3.2459466172331253e-09,
2415
+ "logits/chosen": -2.3881659507751465,
2416
+ "logits/rejected": -2.397707939147949,
2417
+ "logps/chosen": -372.622314453125,
2418
+ "logps/rejected": -245.3422393798828,
2419
+ "loss": 0.1586,
2420
+ "rewards/accuracies": 0.956250011920929,
2421
+ "rewards/chosen": -0.32000666856765747,
2422
+ "rewards/margins": 7.897205352783203,
2423
+ "rewards/rejected": -8.217211723327637,
2424
+ "step": 1610
2425
+ },
2426
+ {
2427
+ "epoch": 0.9595735228787206,
2428
+ "grad_norm": 193.24252371332162,
2429
+ "learning_rate": 2.4682895013354854e-09,
2430
+ "logits/chosen": -2.4109058380126953,
2431
+ "logits/rejected": -2.413801670074463,
2432
+ "logps/chosen": -360.99072265625,
2433
+ "logps/rejected": -256.06756591796875,
2434
+ "loss": 0.091,
2435
+ "rewards/accuracies": 0.9624999761581421,
2436
+ "rewards/chosen": -0.6014559268951416,
2437
+ "rewards/margins": 8.314714431762695,
2438
+ "rewards/rejected": -8.916170120239258,
2439
+ "step": 1620
2440
+ },
2441
+ {
2442
+ "epoch": 0.9654968162298238,
2443
+ "grad_norm": 182.40318024580077,
2444
+ "learning_rate": 1.7965088814675677e-09,
2445
+ "logits/chosen": -2.384791851043701,
2446
+ "logits/rejected": -2.382201671600342,
2447
+ "logps/chosen": -375.5494689941406,
2448
+ "logps/rejected": -269.6348876953125,
2449
+ "loss": 0.1368,
2450
+ "rewards/accuracies": 0.9125000238418579,
2451
+ "rewards/chosen": -0.9617233276367188,
2452
+ "rewards/margins": 8.054598808288574,
2453
+ "rewards/rejected": -9.016322135925293,
2454
+ "step": 1630
2455
+ },
2456
+ {
2457
+ "epoch": 0.971420109580927,
2458
+ "grad_norm": 142.9169068872898,
2459
+ "learning_rate": 1.2308920976958348e-09,
2460
+ "logits/chosen": -2.448244094848633,
2461
+ "logits/rejected": -2.4210681915283203,
2462
+ "logps/chosen": -364.89703369140625,
2463
+ "logps/rejected": -260.6488342285156,
2464
+ "loss": 0.1134,
2465
+ "rewards/accuracies": 0.949999988079071,
2466
+ "rewards/chosen": -0.6372100114822388,
2467
+ "rewards/margins": 8.421440124511719,
2468
+ "rewards/rejected": -9.058650016784668,
2469
+ "step": 1640
2470
+ },
2471
+ {
2472
+ "epoch": 0.9773434029320303,
2473
+ "grad_norm": 72.39563192418329,
2474
+ "learning_rate": 7.716810807330276e-10,
2475
+ "logits/chosen": -2.4073166847229004,
2476
+ "logits/rejected": -2.365865707397461,
2477
+ "logps/chosen": -390.2394104003906,
2478
+ "logps/rejected": -262.54150390625,
2479
+ "loss": 0.0787,
2480
+ "rewards/accuracies": 0.96875,
2481
+ "rewards/chosen": -0.4426153302192688,
2482
+ "rewards/margins": 8.694158554077148,
2483
+ "rewards/rejected": -9.136773109436035,
2484
+ "step": 1650
2485
+ },
2486
+ {
2487
+ "epoch": 0.9832666962831335,
2488
+ "grad_norm": 109.48295555626903,
2489
+ "learning_rate": 4.190722484575804e-10,
2490
+ "logits/chosen": -2.4156811237335205,
2491
+ "logits/rejected": -2.381577968597412,
2492
+ "logps/chosen": -399.725341796875,
2493
+ "logps/rejected": -263.4638366699219,
2494
+ "loss": 0.1397,
2495
+ "rewards/accuracies": 0.956250011920929,
2496
+ "rewards/chosen": -0.32644200325012207,
2497
+ "rewards/margins": 8.409704208374023,
2498
+ "rewards/rejected": -8.736146926879883,
2499
+ "step": 1660
2500
+ },
2501
+ {
2502
+ "epoch": 0.9891899896342367,
2503
+ "grad_norm": 137.41353285844747,
2504
+ "learning_rate": 1.732164218998522e-10,
2505
+ "logits/chosen": -2.3911380767822266,
2506
+ "logits/rejected": -2.386594533920288,
2507
+ "logps/chosen": -363.5224609375,
2508
+ "logps/rejected": -253.3739776611328,
2509
+ "loss": 0.125,
2510
+ "rewards/accuracies": 0.949999988079071,
2511
+ "rewards/chosen": -0.11769469082355499,
2512
+ "rewards/margins": 8.378522872924805,
2513
+ "rewards/rejected": -8.496217727661133,
2514
+ "step": 1670
2515
+ },
2516
+ {
2517
+ "epoch": 0.9951132829853399,
2518
+ "grad_norm": 198.01011960831866,
2519
+ "learning_rate": 3.4218760731730136e-11,
2520
+ "logits/chosen": -2.465393543243408,
2521
+ "logits/rejected": -2.44226336479187,
2522
+ "logps/chosen": -390.7846984863281,
2523
+ "logps/rejected": -258.70721435546875,
2524
+ "loss": 0.1317,
2525
+ "rewards/accuracies": 0.9375,
2526
+ "rewards/chosen": -0.5431281924247742,
2527
+ "rewards/margins": 8.081016540527344,
2528
+ "rewards/rejected": -8.624144554138184,
2529
+ "step": 1680
2530
+ },
2531
+ {
2532
+ "epoch": 0.9998519176662224,
2533
+ "step": 1688,
2534
+ "total_flos": 0.0,
2535
+ "train_loss": 0.17761736296081995,
2536
+ "train_runtime": 39274.1948,
2537
+ "train_samples_per_second": 1.375,
2538
+ "train_steps_per_second": 0.043
2539
+ }
2540
+ ],
2541
+ "logging_steps": 10,
2542
+ "max_steps": 1688,
2543
+ "num_input_tokens_seen": 0,
2544
+ "num_train_epochs": 1,
2545
+ "save_steps": 500,
2546
+ "stateful_callbacks": {
2547
+ "TrainerControl": {
2548
+ "args": {
2549
+ "should_epoch_stop": false,
2550
+ "should_evaluate": false,
2551
+ "should_log": false,
2552
+ "should_save": true,
2553
+ "should_training_stop": true
2554
+ },
2555
+ "attributes": {}
2556
+ }
2557
+ },
2558
+ "total_flos": 0.0,
2559
+ "train_batch_size": 4,
2560
+ "trial_name": null,
2561
+ "trial_params": null
2562
+ }