tanliboy committed on
Commit
f0b2c3a
1 Parent(s): 32994ad

Model save

Browse files
README.md CHANGED
@@ -3,16 +3,10 @@ library_name: transformers
3
  license: llama3.2
4
  base_model: tanliboy/llama-3.2-3b-sft
5
  tags:
6
- - alignment-handbook
7
- - trl
8
- - dpo
9
- - generated_from_trainer
10
  - trl
11
  - dpo
 
12
  - generated_from_trainer
13
- datasets:
14
- - HuggingFaceH4/ultrafeedback_binarized
15
- - HuggingFaceH4/orca_dpo_pairs
16
  model-index:
17
  - name: llama-3.2-3b-dpo
18
  results: []
@@ -23,17 +17,17 @@ should probably proofread and complete it, then remove this comment. -->
23
 
24
  # llama-3.2-3b-dpo
25
 
26
- This model is a fine-tuned version of [tanliboy/llama-3.2-3b-sft](https://huggingface.co/tanliboy/llama-3.2-3b-sft) on the HuggingFaceH4/ultrafeedback_binarized and the HuggingFaceH4/orca_dpo_pairs datasets.
27
  It achieves the following results on the evaluation set:
28
- - Loss: 0.4863
29
- - Rewards/chosen: -1.4532
30
- - Rewards/rejected: -2.6152
31
- - Rewards/accuracies: 0.7215
32
- - Rewards/margins: 1.1620
33
- - Logps/rejected: -561.0516
34
- - Logps/chosen: -483.2492
35
- - Logits/rejected: 0.5144
36
- - Logits/chosen: 0.3599
37
 
38
  ## Model description
39
 
@@ -52,7 +46,7 @@ More information needed
52
  ### Training hyperparameters
53
 
54
  The following hyperparameters were used during training:
55
- - learning_rate: 5e-07
56
  - train_batch_size: 4
57
  - eval_batch_size: 4
58
  - seed: 42
@@ -63,18 +57,19 @@ The following hyperparameters were used during training:
63
  - total_eval_batch_size: 32
64
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
65
  - lr_scheduler_type: cosine
66
- - lr_scheduler_warmup_ratio: 0.1
67
- - num_epochs: 1
68
 
69
  ### Training results
70
 
71
  | Training Loss | Epoch | Step | Validation Loss | Rewards/chosen | Rewards/rejected | Rewards/accuracies | Rewards/margins | Logps/rejected | Logps/chosen | Logits/rejected | Logits/chosen |
72
  |:-------------:|:------:|:----:|:---------------:|:--------------:|:----------------:|:------------------:|:---------------:|:--------------:|:------------:|:---------------:|:-------------:|
73
- | 0.6115 | 0.1741 | 100 | 0.6000 | -0.3357 | -0.5534 | 0.6329 | 0.2176 | -354.8722 | -371.5030 | 0.8149 | 0.6445 |
74
- | 0.5416 | 0.3483 | 200 | 0.5221 | -1.2141 | -1.9406 | 0.6741 | 0.7265 | -493.5951 | -459.3420 | 0.6298 | 0.4810 |
75
- | 0.5158 | 0.5224 | 300 | 0.5048 | -1.3617 | -2.3705 | 0.7057 | 1.0088 | -536.5855 | -474.1037 | 0.4784 | 0.3320 |
76
- | 0.501 | 0.6966 | 400 | 0.4906 | -1.4306 | -2.5214 | 0.7152 | 1.0908 | -551.6774 | -480.9883 | 0.5108 | 0.3615 |
77
- | 0.4806 | 0.8707 | 500 | 0.4864 | -1.4549 | -2.6071 | 0.7247 | 1.1522 | -560.2480 | -483.4225 | 0.5181 | 0.3637 |
 
78
 
79
 
80
  ### Framework versions
 
3
  license: llama3.2
4
  base_model: tanliboy/llama-3.2-3b-sft
5
  tags:
 
 
 
 
6
  - trl
7
  - dpo
8
+ - alignment-handbook
9
  - generated_from_trainer
 
 
 
10
  model-index:
11
  - name: llama-3.2-3b-dpo
12
  results: []
 
17
 
18
  # llama-3.2-3b-dpo
19
 
20
+ This model is a fine-tuned version of [tanliboy/llama-3.2-3b-sft](https://huggingface.co/tanliboy/llama-3.2-3b-sft) on an unspecified dataset (no dataset name was recorded when this model card was generated).
21
  It achieves the following results on the evaluation set:
22
+ - Loss: 0.6284
23
+ - Rewards/chosen: 0.8504
24
+ - Rewards/rejected: -3.7058
25
+ - Rewards/accuracies: 0.7437
26
+ - Rewards/margins: 4.5562
27
+ - Logps/rejected: -368.9125
28
+ - Logps/chosen: -337.3143
29
+ - Logits/rejected: 0.4571
30
+ - Logits/chosen: 0.3820
31
 
32
  ## Model description
33
 
 
46
  ### Training hyperparameters
47
 
48
  The following hyperparameters were used during training:
49
+ - learning_rate: 1e-06
50
  - train_batch_size: 4
51
  - eval_batch_size: 4
52
  - seed: 42
 
57
  - total_eval_batch_size: 32
58
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
59
  - lr_scheduler_type: cosine
60
+ - lr_scheduler_warmup_ratio: 0.03
61
+ - num_epochs: 3
62
 
63
  ### Training results
64
 
65
  | Training Loss | Epoch | Step | Validation Loss | Rewards/chosen | Rewards/rejected | Rewards/accuracies | Rewards/margins | Logps/rejected | Logps/chosen | Logits/rejected | Logits/chosen |
66
  |:-------------:|:------:|:----:|:---------------:|:--------------:|:----------------:|:------------------:|:---------------:|:--------------:|:------------:|:---------------:|:-------------:|
67
+ | 0.5801 | 0.4739 | 100 | 0.6840 | 0.6485 | -2.9389 | 0.6899 | 3.5875 | -361.2435 | -339.3325 | 0.6783 | 0.6103 |
68
+ | 0.537 | 0.9479 | 200 | 0.6514 | 0.2045 | -4.0315 | 0.7278 | 4.2360 | -372.1696 | -343.7731 | 0.5648 | 0.4948 |
69
+ | 0.4787 | 1.4218 | 300 | 0.6387 | 0.4099 | -3.9882 | 0.7215 | 4.3981 | -371.7361 | -341.7187 | 0.5326 | 0.4589 |
70
+ | 0.4559 | 1.8957 | 400 | 0.6332 | 0.7690 | -3.6688 | 0.7342 | 4.4379 | -368.5425 | -338.1277 | 0.4841 | 0.4110 |
71
+ | 0.4028 | 2.3697 | 500 | 0.6289 | 0.7479 | -3.8379 | 0.7405 | 4.5857 | -370.2327 | -338.3392 | 0.4475 | 0.3731 |
72
+ | 0.4029 | 2.8436 | 600 | 0.6284 | 0.8504 | -3.7058 | 0.7437 | 4.5562 | -368.9125 | -337.3143 | 0.4571 | 0.3820 |
73
 
74
 
75
  ### Framework versions
all_results.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "epoch": 0.999564649542882,
3
  "eval_logits/chosen": 0.3599458634853363,
4
  "eval_logits/rejected": 0.514390230178833,
5
  "eval_logps/chosen": -483.24920654296875,
@@ -14,9 +14,9 @@
14
  "eval_samples_per_second": 44.381,
15
  "eval_steps_per_second": 1.402,
16
  "total_flos": 0.0,
17
- "train_loss": 0.5408797243330952,
18
- "train_runtime": 4250.8819,
19
- "train_samples": 73493,
20
- "train_samples_per_second": 17.289,
21
- "train_steps_per_second": 0.135
22
  }
 
1
  {
2
+ "epoch": 3.0,
3
  "eval_logits/chosen": 0.3599458634853363,
4
  "eval_logits/rejected": 0.514390230178833,
5
  "eval_logps/chosen": -483.24920654296875,
 
14
  "eval_samples_per_second": 44.381,
15
  "eval_steps_per_second": 1.402,
16
  "total_flos": 0.0,
17
+ "train_loss": 0.5009220597491634,
18
+ "train_runtime": 6227.6413,
19
+ "train_samples": 26990,
20
+ "train_samples_per_second": 13.002,
21
+ "train_steps_per_second": 0.102
22
  }
model-00001-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4a15a798615ab3d1c74b309a9b52b739cf50a081ce1afcb020fd57eaea81a92b
3
  size 4965799096
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a949d0d551d426eda481df55c9ce61d9e9f886cf169e5144df329731c5587075
3
  size 4965799096
model-00002-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:17e2fdba19d954562a5f4d4ce7307ec582677840c6cdc9bb8072338b63a699d2
3
  size 1459729952
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:34d352336d2f3d15b0fdace3f3e6e2ba4213d9a93cc7ff41fc3f50a8dd9beca7
3
  size 1459729952
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "epoch": 0.999564649542882,
3
  "total_flos": 0.0,
4
- "train_loss": 0.5408797243330952,
5
- "train_runtime": 4250.8819,
6
- "train_samples": 73493,
7
- "train_samples_per_second": 17.289,
8
- "train_steps_per_second": 0.135
9
  }
 
1
  {
2
+ "epoch": 3.0,
3
  "total_flos": 0.0,
4
+ "train_loss": 0.5009220597491634,
5
+ "train_runtime": 6227.6413,
6
+ "train_samples": 26990,
7
+ "train_samples_per_second": 13.002,
8
+ "train_steps_per_second": 0.102
9
  }
trainer_state.json CHANGED
@@ -1,22 +1,22 @@
1
  {
2
- "best_metric": null,
3
- "best_model_checkpoint": null,
4
- "epoch": 0.999564649542882,
5
  "eval_steps": 100,
6
- "global_step": 574,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.0017414018284719198,
13
- "grad_norm": 3.3411689380276015,
14
- "learning_rate": 8.620689655172413e-09,
15
- "logits/chosen": 0.4204842150211334,
16
- "logits/rejected": 0.797350287437439,
17
- "logps/chosen": -397.1048889160156,
18
- "logps/rejected": -273.9348449707031,
19
- "loss": 0.6931,
20
  "rewards/accuracies": 0.0,
21
  "rewards/chosen": 0.0,
22
  "rewards/margins": 0.0,
@@ -24,954 +24,1060 @@
24
  "step": 1
25
  },
26
  {
27
- "epoch": 0.0174140182847192,
28
- "grad_norm": 3.2818454280142397,
29
- "learning_rate": 8.620689655172414e-08,
30
- "logits/chosen": 0.6469573974609375,
31
- "logits/rejected": 0.773016631603241,
32
- "logps/chosen": -312.58782958984375,
33
- "logps/rejected": -294.86376953125,
34
- "loss": 0.6932,
35
- "rewards/accuracies": 0.4305555522441864,
36
- "rewards/chosen": -0.0005739983171224594,
37
- "rewards/margins": 4.1095463529927656e-05,
38
- "rewards/rejected": -0.0006150936824269593,
39
  "step": 10
40
  },
41
  {
42
- "epoch": 0.0348280365694384,
43
- "grad_norm": 3.251396782118138,
44
- "learning_rate": 1.7241379310344828e-07,
45
- "logits/chosen": 0.7936784029006958,
46
- "logits/rejected": 0.7803818583488464,
47
- "logps/chosen": -298.35137939453125,
48
- "logps/rejected": -303.12591552734375,
49
- "loss": 0.6931,
50
- "rewards/accuracies": 0.46875,
51
- "rewards/chosen": -0.0012876250548288226,
52
- "rewards/margins": -0.00017336892778985202,
53
- "rewards/rejected": -0.0011142559815198183,
54
  "step": 20
55
  },
56
  {
57
- "epoch": 0.052242054854157595,
58
- "grad_norm": 3.046183099717941,
59
- "learning_rate": 2.586206896551724e-07,
60
- "logits/chosen": 0.7200006246566772,
61
- "logits/rejected": 0.7392618060112,
62
- "logps/chosen": -327.70428466796875,
63
- "logps/rejected": -313.39422607421875,
64
- "loss": 0.6925,
65
- "rewards/accuracies": 0.6312500238418579,
66
- "rewards/chosen": 0.00010628718882799149,
67
- "rewards/margins": 0.0027407719753682613,
68
- "rewards/rejected": -0.00263448478654027,
69
  "step": 30
70
  },
71
  {
72
- "epoch": 0.0696560731388768,
73
- "grad_norm": 3.340902174817753,
74
- "learning_rate": 3.4482758620689656e-07,
75
- "logits/chosen": 0.7511029839515686,
76
- "logits/rejected": 0.8245828747749329,
77
- "logps/chosen": -295.2296142578125,
78
- "logps/rejected": -329.61358642578125,
79
- "loss": 0.6901,
80
- "rewards/accuracies": 0.65625,
81
- "rewards/chosen": 0.003507365705445409,
82
- "rewards/margins": 0.0062782615423202515,
83
- "rewards/rejected": -0.0027708951383829117,
84
  "step": 40
85
  },
86
  {
87
- "epoch": 0.087070091423596,
88
- "grad_norm": 3.0385013323870447,
89
- "learning_rate": 4.310344827586206e-07,
90
- "logits/chosen": 0.7283368110656738,
91
- "logits/rejected": 0.7367040514945984,
92
- "logps/chosen": -288.52020263671875,
93
- "logps/rejected": -298.375,
94
- "loss": 0.6846,
95
- "rewards/accuracies": 0.737500011920929,
96
- "rewards/chosen": 0.012100273743271828,
97
- "rewards/margins": 0.021190276369452477,
98
- "rewards/rejected": -0.009090004488825798,
99
  "step": 50
100
  },
101
  {
102
- "epoch": 0.10448410970831519,
103
- "grad_norm": 2.8950641460333264,
104
- "learning_rate": 4.999814661783161e-07,
105
- "logits/chosen": 0.7964831590652466,
106
- "logits/rejected": 0.8126887083053589,
107
- "logps/chosen": -291.2521057128906,
108
- "logps/rejected": -293.8636779785156,
109
- "loss": 0.6749,
110
- "rewards/accuracies": 0.6937500238418579,
111
- "rewards/chosen": 0.023344680666923523,
112
- "rewards/margins": 0.03975962847471237,
113
- "rewards/rejected": -0.0164149459451437,
114
  "step": 60
115
  },
116
  {
117
- "epoch": 0.12189812799303439,
118
- "grad_norm": 3.225678300931263,
119
- "learning_rate": 4.993330709158879e-07,
120
- "logits/chosen": 0.863906979560852,
121
- "logits/rejected": 0.9115025401115417,
122
- "logps/chosen": -317.2396240234375,
123
- "logps/rejected": -319.9203186035156,
124
- "loss": 0.6645,
125
- "rewards/accuracies": 0.625,
126
- "rewards/chosen": 0.026739483699202538,
127
- "rewards/margins": 0.04891226813197136,
128
- "rewards/rejected": -0.02217279188334942,
129
  "step": 70
130
  },
131
  {
132
- "epoch": 0.1393121462777536,
133
- "grad_norm": 3.4663859481682495,
134
- "learning_rate": 4.977607307587086e-07,
135
- "logits/chosen": 0.6847103834152222,
136
- "logits/rejected": 0.8103203773498535,
137
- "logps/chosen": -331.44964599609375,
138
- "logps/rejected": -311.62713623046875,
139
- "loss": 0.6447,
140
- "rewards/accuracies": 0.668749988079071,
141
- "rewards/chosen": 0.003575064241886139,
142
- "rewards/margins": 0.11137789487838745,
143
- "rewards/rejected": -0.10780283063650131,
144
  "step": 80
145
  },
146
  {
147
- "epoch": 0.1567261645624728,
148
- "grad_norm": 3.1824594089594873,
149
- "learning_rate": 4.952702722730485e-07,
150
- "logits/chosen": 0.6765316724777222,
151
- "logits/rejected": 0.885266125202179,
152
- "logps/chosen": -344.87274169921875,
153
- "logps/rejected": -334.42303466796875,
154
- "loss": 0.6273,
155
- "rewards/accuracies": 0.6812499761581421,
156
- "rewards/chosen": -0.05379386991262436,
157
- "rewards/margins": 0.2033984214067459,
158
- "rewards/rejected": -0.25719231367111206,
159
  "step": 90
160
  },
161
  {
162
- "epoch": 0.174140182847192,
163
- "grad_norm": 3.792697956177081,
164
- "learning_rate": 4.918709242643563e-07,
165
- "logits/chosen": 0.6538839936256409,
166
- "logits/rejected": 0.75486820936203,
167
- "logps/chosen": -327.5845642089844,
168
- "logps/rejected": -333.3156433105469,
169
- "loss": 0.6115,
170
- "rewards/accuracies": 0.637499988079071,
171
- "rewards/chosen": -0.18015703558921814,
172
- "rewards/margins": 0.2053672820329666,
173
- "rewards/rejected": -0.38552433252334595,
174
  "step": 100
175
  },
176
  {
177
- "epoch": 0.174140182847192,
178
- "eval_logits/chosen": 0.6445267200469971,
179
- "eval_logits/rejected": 0.8148620128631592,
180
- "eval_logps/chosen": -371.50299072265625,
181
- "eval_logps/rejected": -354.8721618652344,
182
- "eval_loss": 0.599998414516449,
183
- "eval_rewards/accuracies": 0.6329113841056824,
184
- "eval_rewards/chosen": -0.3357231020927429,
185
- "eval_rewards/margins": 0.21764038503170013,
186
- "eval_rewards/rejected": -0.553363561630249,
187
- "eval_runtime": 57.4976,
188
- "eval_samples_per_second": 43.48,
189
- "eval_steps_per_second": 1.374,
190
  "step": 100
191
  },
192
  {
193
- "epoch": 0.19155420113191118,
194
- "grad_norm": 3.786481240905226,
195
- "learning_rate": 4.875752835783956e-07,
196
- "logits/chosen": 0.531981348991394,
197
- "logits/rejected": 0.6186498403549194,
198
- "logps/chosen": -389.45556640625,
199
- "logps/rejected": -391.052734375,
200
- "loss": 0.5967,
201
- "rewards/accuracies": 0.731249988079071,
202
- "rewards/chosen": -0.2725273370742798,
203
- "rewards/margins": 0.3243584632873535,
204
- "rewards/rejected": -0.5968858003616333,
205
  "step": 110
206
  },
207
  {
208
- "epoch": 0.20896821941663038,
209
- "grad_norm": 4.181016924469078,
210
- "learning_rate": 4.823992684215516e-07,
211
- "logits/chosen": 0.48549872636795044,
212
- "logits/rejected": 0.6067858338356018,
213
- "logps/chosen": -375.8733825683594,
214
- "logps/rejected": -405.65240478515625,
215
- "loss": 0.5621,
216
- "rewards/accuracies": 0.7749999761581421,
217
- "rewards/chosen": -0.5050081014633179,
218
- "rewards/margins": 0.46570420265197754,
219
- "rewards/rejected": -0.9707123041152954,
220
  "step": 120
221
  },
222
  {
223
- "epoch": 0.22638223770134958,
224
- "grad_norm": 4.23287712214465,
225
- "learning_rate": 4.7636205937328664e-07,
226
- "logits/chosen": 0.5461466908454895,
227
- "logits/rejected": 0.7331717610359192,
228
- "logps/chosen": -405.69677734375,
229
- "logps/rejected": -414.6841735839844,
230
- "loss": 0.5759,
231
- "rewards/accuracies": 0.7437499761581421,
232
- "rewards/chosen": -0.6262551546096802,
233
- "rewards/margins": 0.4949806332588196,
234
- "rewards/rejected": -1.1212358474731445,
235
  "step": 130
236
  },
237
  {
238
- "epoch": 0.24379625598606877,
239
- "grad_norm": 5.73858119497669,
240
- "learning_rate": 4.69486028309334e-07,
241
- "logits/chosen": 0.5876033902168274,
242
- "logits/rejected": 0.6097403168678284,
243
- "logps/chosen": -379.1561279296875,
244
- "logps/rejected": -430.0502014160156,
245
- "loss": 0.5551,
246
- "rewards/accuracies": 0.762499988079071,
247
- "rewards/chosen": -0.7041950225830078,
248
- "rewards/margins": 0.4728410840034485,
249
- "rewards/rejected": -1.1770360469818115,
250
  "step": 140
251
  },
252
  {
253
- "epoch": 0.26121027427078797,
254
- "grad_norm": 6.624624616809953,
255
- "learning_rate": 4.61796655499015e-07,
256
- "logits/chosen": 0.4104432165622711,
257
- "logits/rejected": 0.6294438242912292,
258
- "logps/chosen": -416.1566467285156,
259
- "logps/rejected": -426.4996032714844,
260
- "loss": 0.5562,
261
- "rewards/accuracies": 0.675000011920929,
262
- "rewards/chosen": -0.7248625755310059,
263
- "rewards/margins": 0.550578236579895,
264
- "rewards/rejected": -1.2754409313201904,
265
  "step": 150
266
  },
267
  {
268
- "epoch": 0.2786242925555072,
269
- "grad_norm": 6.025999930652061,
270
- "learning_rate": 4.5332243518389136e-07,
271
- "logits/chosen": 0.45941343903541565,
272
- "logits/rejected": 0.6137918829917908,
273
- "logps/chosen": -421.73095703125,
274
- "logps/rejected": -463.8855895996094,
275
- "loss": 0.5323,
276
- "rewards/accuracies": 0.7562500238418579,
277
- "rewards/chosen": -0.8197100758552551,
278
- "rewards/margins": 0.7435392141342163,
279
- "rewards/rejected": -1.5632489919662476,
280
  "step": 160
281
  },
282
  {
283
- "epoch": 0.29603831084022636,
284
- "grad_norm": 6.778470707105329,
285
- "learning_rate": 4.4409476998764364e-07,
286
- "logits/chosen": 0.4422511160373688,
287
- "logits/rejected": 0.636109471321106,
288
- "logps/chosen": -439.1053161621094,
289
- "logps/rejected": -487.4310607910156,
290
- "loss": 0.5382,
291
- "rewards/accuracies": 0.6875,
292
- "rewards/chosen": -1.0064823627471924,
293
- "rewards/margins": 0.7615920305252075,
294
- "rewards/rejected": -1.7680743932724,
295
  "step": 170
296
  },
297
  {
298
- "epoch": 0.3134523291249456,
299
- "grad_norm": 5.155340194107305,
300
- "learning_rate": 4.3414785454846093e-07,
301
- "logits/chosen": 0.4746861457824707,
302
- "logits/rejected": 0.49304714798927307,
303
- "logps/chosen": -429.70355224609375,
304
- "logps/rejected": -487.3233337402344,
305
- "loss": 0.5377,
306
- "rewards/accuracies": 0.699999988079071,
307
- "rewards/chosen": -0.9950485229492188,
308
- "rewards/margins": 0.7225963473320007,
309
- "rewards/rejected": -1.7176446914672852,
310
  "step": 180
311
  },
312
  {
313
- "epoch": 0.3308663474096648,
314
- "grad_norm": 7.739271405731777,
315
- "learning_rate": 4.235185488051585e-07,
316
- "logits/chosen": 0.4361630082130432,
317
- "logits/rejected": 0.5531889200210571,
318
- "logps/chosen": -457.5230407714844,
319
- "logps/rejected": -510.4833068847656,
320
- "loss": 0.5269,
321
  "rewards/accuracies": 0.7250000238418579,
322
- "rewards/chosen": -1.0585914850234985,
323
- "rewards/margins": 0.7955325245857239,
324
- "rewards/rejected": -1.8541240692138672,
325
  "step": 190
326
  },
327
  {
328
- "epoch": 0.348280365694384,
329
- "grad_norm": 6.703891062008893,
330
- "learning_rate": 4.1224624140658336e-07,
331
- "logits/chosen": 0.4384649395942688,
332
- "logits/rejected": 0.5880831480026245,
333
- "logps/chosen": -461.74053955078125,
334
- "logps/rejected": -518.26806640625,
335
- "loss": 0.5416,
336
- "rewards/accuracies": 0.71875,
337
- "rewards/chosen": -1.2263882160186768,
338
- "rewards/margins": 0.9702332615852356,
339
- "rewards/rejected": -2.1966214179992676,
340
  "step": 200
341
  },
342
  {
343
- "epoch": 0.348280365694384,
344
- "eval_logits/chosen": 0.48104235529899597,
345
- "eval_logits/rejected": 0.6297945976257324,
346
- "eval_logps/chosen": -459.3420104980469,
347
- "eval_logps/rejected": -493.5951232910156,
348
- "eval_loss": 0.5221381187438965,
349
- "eval_rewards/accuracies": 0.6740506291389465,
350
- "eval_rewards/chosen": -1.214113473892212,
351
- "eval_rewards/margins": 0.7264798283576965,
352
- "eval_rewards/rejected": -1.9405934810638428,
353
- "eval_runtime": 56.9644,
354
- "eval_samples_per_second": 43.887,
355
- "eval_steps_per_second": 1.387,
356
  "step": 200
357
  },
358
  {
359
- "epoch": 0.3656943839791032,
360
- "grad_norm": 6.9743912275745075,
361
- "learning_rate": 4.003727037504676e-07,
362
- "logits/chosen": 0.5726699829101562,
363
- "logits/rejected": 0.6551696062088013,
364
- "logps/chosen": -398.76287841796875,
365
- "logps/rejected": -471.94219970703125,
366
- "loss": 0.525,
367
- "rewards/accuracies": 0.762499988079071,
368
- "rewards/chosen": -1.0725631713867188,
369
- "rewards/margins": 0.7981334924697876,
370
- "rewards/rejected": -1.8706966638565063,
371
  "step": 210
372
  },
373
  {
374
- "epoch": 0.38310840226382237,
375
- "grad_norm": 6.816210344544152,
376
- "learning_rate": 3.879419351926115e-07,
377
- "logits/chosen": 0.48985353112220764,
378
- "logits/rejected": 0.5055981874465942,
379
- "logps/chosen": -444.0646057128906,
380
- "logps/rejected": -542.5759887695312,
381
- "loss": 0.5119,
382
- "rewards/accuracies": 0.7562500238418579,
383
- "rewards/chosen": -1.1069988012313843,
384
- "rewards/margins": 1.0880978107452393,
385
- "rewards/rejected": -2.195096731185913,
386
  "step": 220
387
  },
388
  {
389
- "epoch": 0.4005224205485416,
390
- "grad_norm": 6.1503561448910595,
391
- "learning_rate": 3.75e-07,
392
- "logits/chosen": 0.40112408995628357,
393
- "logits/rejected": 0.4424813389778137,
394
- "logps/chosen": -434.7146911621094,
395
- "logps/rejected": -532.7242431640625,
396
- "loss": 0.5219,
397
- "rewards/accuracies": 0.7749999761581421,
398
- "rewards/chosen": -1.1246516704559326,
399
- "rewards/margins": 1.0327972173690796,
400
- "rewards/rejected": -2.1574490070343018,
401
  "step": 230
402
  },
403
  {
404
- "epoch": 0.41793643883326076,
405
- "grad_norm": 6.948961329300032,
406
- "learning_rate": 3.615948566520498e-07,
407
- "logits/chosen": 0.2614263594150543,
408
- "logits/rejected": 0.316015362739563,
409
- "logps/chosen": -478.199462890625,
410
- "logps/rejected": -555.8576049804688,
411
- "loss": 0.5478,
412
- "rewards/accuracies": 0.6625000238418579,
413
- "rewards/chosen": -1.3383628129959106,
414
- "rewards/margins": 0.8917714953422546,
415
- "rewards/rejected": -2.2301342487335205,
416
  "step": 240
417
  },
418
  {
419
- "epoch": 0.43535045711798,
420
- "grad_norm": 6.214549773976368,
421
- "learning_rate": 3.4777618012253895e-07,
422
- "logits/chosen": 0.5206897854804993,
423
- "logits/rejected": 0.4764159321784973,
424
- "logps/chosen": -427.69720458984375,
425
- "logps/rejected": -528.8837280273438,
426
- "loss": 0.5137,
427
- "rewards/accuracies": 0.762499988079071,
428
- "rewards/chosen": -1.1115195751190186,
429
- "rewards/margins": 0.9832460284233093,
430
- "rewards/rejected": -2.0947654247283936,
431
  "step": 250
432
  },
433
  {
434
- "epoch": 0.45276447540269915,
435
- "grad_norm": 10.426891895491888,
436
- "learning_rate": 3.3359517780078315e-07,
437
- "logits/chosen": 0.31379422545433044,
438
- "logits/rejected": 0.4527011811733246,
439
- "logps/chosen": -460.69390869140625,
440
- "logps/rejected": -530.2111206054688,
441
- "loss": 0.521,
442
- "rewards/accuracies": 0.706250011920929,
443
- "rewards/chosen": -1.0801621675491333,
444
- "rewards/margins": 0.9811126589775085,
445
- "rewards/rejected": -2.061274766921997,
446
  "step": 260
447
  },
448
  {
449
- "epoch": 0.4701784936874184,
450
- "grad_norm": 6.486890040446789,
451
- "learning_rate": 3.191043997341929e-07,
452
- "logits/chosen": 0.41157540678977966,
453
- "logits/rejected": 0.48239168524742126,
454
- "logps/chosen": -449.7867736816406,
455
- "logps/rejected": -511.8810119628906,
456
- "loss": 0.5079,
457
- "rewards/accuracies": 0.731249988079071,
458
- "rewards/chosen": -1.1862131357192993,
459
- "rewards/margins": 0.9392406344413757,
460
- "rewards/rejected": -2.1254539489746094,
461
  "step": 270
462
  },
463
  {
464
- "epoch": 0.48759251197213754,
465
- "grad_norm": 6.17919759443457,
466
- "learning_rate": 3.0435754389538925e-07,
467
- "logits/chosen": 0.4492722153663635,
468
- "logits/rejected": 0.477822482585907,
469
- "logps/chosen": -405.44488525390625,
470
- "logps/rejected": -496.3724060058594,
471
- "loss": 0.5069,
472
  "rewards/accuracies": 0.793749988079071,
473
- "rewards/chosen": -1.1837854385375977,
474
- "rewards/margins": 0.9472991824150085,
475
- "rewards/rejected": -2.13108491897583,
476
  "step": 280
477
  },
478
  {
479
- "epoch": 0.5050065302568568,
480
- "grad_norm": 6.347632610711749,
481
- "learning_rate": 2.8940925719549335e-07,
482
- "logits/chosen": 0.35942569375038147,
483
- "logits/rejected": 0.4725337624549866,
484
- "logps/chosen": -454.9664611816406,
485
- "logps/rejected": -541.79052734375,
486
- "loss": 0.5089,
487
- "rewards/accuracies": 0.737500011920929,
488
- "rewards/chosen": -1.3150675296783447,
489
- "rewards/margins": 1.0351736545562744,
490
- "rewards/rejected": -2.350241184234619,
491
  "step": 290
492
  },
493
  {
494
- "epoch": 0.5224205485415759,
495
- "grad_norm": 9.01871162756941,
496
- "learning_rate": 2.7431493298096725e-07,
497
- "logits/chosen": 0.3180525004863739,
498
- "logits/rejected": 0.4156918525695801,
499
- "logps/chosen": -477.7978515625,
500
- "logps/rejected": -595.4729614257812,
501
- "loss": 0.5158,
502
- "rewards/accuracies": 0.7562500238418579,
503
- "rewards/chosen": -1.484802007675171,
504
- "rewards/margins": 1.233532190322876,
505
- "rewards/rejected": -2.718334197998047,
506
  "step": 300
507
  },
508
  {
509
- "epoch": 0.5224205485415759,
510
- "eval_logits/chosen": 0.3319605886936188,
511
- "eval_logits/rejected": 0.4784182906150818,
512
- "eval_logps/chosen": -474.1037292480469,
513
- "eval_logps/rejected": -536.5855102539062,
514
- "eval_loss": 0.5047817826271057,
515
- "eval_rewards/accuracies": 0.7056962251663208,
516
- "eval_rewards/chosen": -1.3617302179336548,
517
- "eval_rewards/margins": 1.0087664127349854,
518
- "eval_rewards/rejected": -2.3704965114593506,
519
- "eval_runtime": 56.8943,
520
- "eval_samples_per_second": 43.941,
521
- "eval_steps_per_second": 1.389,
522
  "step": 300
523
  },
524
  {
525
- "epoch": 0.5398345668262952,
526
- "grad_norm": 9.655940482231035,
527
- "learning_rate": 2.5913050576441473e-07,
528
- "logits/chosen": 0.28561896085739136,
529
- "logits/rejected": 0.4566105902194977,
530
- "logps/chosen": -485.2098083496094,
531
- "logps/rejected": -578.7564086914062,
532
- "loss": 0.4937,
533
- "rewards/accuracies": 0.7437499761581421,
534
- "rewards/chosen": -1.3072640895843506,
535
- "rewards/margins": 1.2091175317764282,
536
- "rewards/rejected": -2.5163815021514893,
537
  "step": 310
538
  },
539
  {
540
- "epoch": 0.5572485851110144,
541
- "grad_norm": 6.61161140792094,
542
- "learning_rate": 2.439122439500026e-07,
543
- "logits/chosen": 0.2611474394798279,
544
- "logits/rejected": 0.4488650858402252,
545
- "logps/chosen": -484.05462646484375,
546
- "logps/rejected": -537.9361572265625,
547
- "loss": 0.499,
548
- "rewards/accuracies": 0.75,
549
- "rewards/chosen": -1.2471102476119995,
550
- "rewards/margins": 1.0742133855819702,
551
- "rewards/rejected": -2.3213236331939697,
552
  "step": 320
553
  },
554
  {
555
- "epoch": 0.5746626033957336,
556
- "grad_norm": 7.51197628735834,
557
- "learning_rate": 2.2871654132159104e-07,
558
- "logits/chosen": 0.2992505431175232,
559
- "logits/rejected": 0.41670387983322144,
560
- "logps/chosen": -504.9584045410156,
561
- "logps/rejected": -583.96923828125,
562
- "loss": 0.5024,
563
- "rewards/accuracies": 0.6937500238418579,
564
- "rewards/chosen": -1.4169723987579346,
565
- "rewards/margins": 1.1618797779083252,
566
- "rewards/rejected": -2.5788521766662598,
567
  "step": 330
568
  },
569
  {
570
- "epoch": 0.5920766216804527,
571
- "grad_norm": 7.130014982854212,
572
- "learning_rate": 2.1359970806624884e-07,
573
- "logits/chosen": 0.2984461784362793,
574
- "logits/rejected": 0.363652765750885,
575
- "logps/chosen": -478.5108337402344,
576
- "logps/rejected": -577.00048828125,
577
- "loss": 0.5178,
578
- "rewards/accuracies": 0.78125,
579
- "rewards/chosen": -1.4051122665405273,
580
- "rewards/margins": 0.9606796503067017,
581
- "rewards/rejected": -2.3657917976379395,
582
  "step": 340
583
  },
584
  {
585
- "epoch": 0.6094906399651719,
586
- "grad_norm": 7.111868751712803,
587
- "learning_rate": 1.9861776210754986e-07,
588
- "logits/chosen": 0.4120015501976013,
589
- "logits/rejected": 0.5323067903518677,
590
- "logps/chosen": -501.5419921875,
591
- "logps/rejected": -561.2122802734375,
592
- "loss": 0.5137,
593
- "rewards/accuracies": 0.737500011920929,
594
- "rewards/chosen": -1.5511784553527832,
595
- "rewards/margins": 1.0202207565307617,
596
- "rewards/rejected": -2.571399211883545,
597
  "step": 350
598
  },
599
  {
600
- "epoch": 0.6269046582498912,
601
- "grad_norm": 8.698410343780237,
602
- "learning_rate": 1.8382622152190158e-07,
603
- "logits/chosen": 0.3698151707649231,
604
- "logits/rejected": 0.4354040026664734,
605
- "logps/chosen": -461.7838439941406,
606
- "logps/rejected": -519.1609497070312,
607
- "loss": 0.4968,
608
- "rewards/accuracies": 0.762499988079071,
609
- "rewards/chosen": -1.2663956880569458,
610
- "rewards/margins": 0.9138597249984741,
611
- "rewards/rejected": -2.18025541305542,
612
  "step": 360
613
  },
614
  {
615
- "epoch": 0.6443186765346104,
616
- "grad_norm": 7.210125130445963,
617
- "learning_rate": 1.692798988071385e-07,
618
- "logits/chosen": 0.34133654832839966,
619
- "logits/rejected": 0.43571311235427856,
620
- "logps/chosen": -427.91424560546875,
621
- "logps/rejected": -496.55206298828125,
622
- "loss": 0.5107,
623
- "rewards/accuracies": 0.762499988079071,
624
- "rewards/chosen": -1.1491998434066772,
625
- "rewards/margins": 0.9772473573684692,
626
- "rewards/rejected": -2.1264472007751465,
627
  "step": 370
628
  },
629
  {
630
- "epoch": 0.6617326948193296,
631
- "grad_norm": 6.808319521539246,
632
- "learning_rate": 1.5503269776575362e-07,
633
- "logits/chosen": 0.4110635221004486,
634
- "logits/rejected": 0.3942858576774597,
635
- "logps/chosen": -437.7587890625,
636
- "logps/rejected": -536.278564453125,
637
- "loss": 0.4949,
638
- "rewards/accuracies": 0.737500011920929,
639
- "rewards/chosen": -1.3278793096542358,
640
- "rewards/margins": 0.8986042141914368,
641
- "rewards/rejected": -2.2264838218688965,
642
  "step": 380
643
  },
644
  {
645
- "epoch": 0.6791467131040487,
646
- "grad_norm": 5.0994020171282886,
647
- "learning_rate": 1.411374137554522e-07,
648
- "logits/chosen": 0.21764016151428223,
649
- "logits/rejected": 0.3392140865325928,
650
- "logps/chosen": -478.3634338378906,
651
- "logps/rejected": -557.5911865234375,
652
- "loss": 0.5066,
653
- "rewards/accuracies": 0.7250000238418579,
654
- "rewards/chosen": -1.41581130027771,
655
- "rewards/margins": 1.0703743696212769,
656
- "rewards/rejected": -2.4861857891082764,
657
  "step": 390
658
  },
659
  {
660
- "epoch": 0.696560731388768,
661
- "grad_norm": 5.872014380298252,
662
- "learning_rate": 1.2764553804722867e-07,
663
- "logits/chosen": 0.3027791380882263,
664
- "logits/rejected": 0.3619641661643982,
665
- "logps/chosen": -507.347900390625,
666
- "logps/rejected": -654.2447509765625,
667
- "loss": 0.501,
668
- "rewards/accuracies": 0.731249988079071,
669
- "rewards/chosen": -1.6098072528839111,
670
- "rewards/margins": 1.409834384918213,
671
- "rewards/rejected": -3.019641399383545,
672
  "step": 400
673
  },
674
  {
675
- "epoch": 0.696560731388768,
676
- "eval_logits/chosen": 0.36147814989089966,
677
- "eval_logits/rejected": 0.5108113884925842,
678
- "eval_logps/chosen": -480.98834228515625,
679
- "eval_logps/rejected": -551.6774291992188,
680
- "eval_loss": 0.4905659258365631,
681
- "eval_rewards/accuracies": 0.7151898741722107,
682
- "eval_rewards/chosen": -1.4305766820907593,
683
- "eval_rewards/margins": 1.09083890914917,
684
- "eval_rewards/rejected": -2.5214157104492188,
685
- "eval_runtime": 56.8059,
686
- "eval_samples_per_second": 44.01,
687
- "eval_steps_per_second": 1.391,
688
  "step": 400
689
  },
690
  {
691
- "epoch": 0.7139747496734872,
692
- "grad_norm": 7.0264805606792855,
693
- "learning_rate": 1.1460706701595385e-07,
694
- "logits/chosen": 0.3922923505306244,
695
- "logits/rejected": 0.43147850036621094,
696
- "logps/chosen": -446.358154296875,
697
- "logps/rejected": -545.4576416015625,
698
- "loss": 0.5176,
699
- "rewards/accuracies": 0.6875,
700
- "rewards/chosen": -1.2909159660339355,
701
- "rewards/margins": 1.0999126434326172,
702
- "rewards/rejected": -2.3908286094665527,
703
  "step": 410
704
  },
705
  {
706
- "epoch": 0.7313887679582064,
707
- "grad_norm": 7.312076711655367,
708
- "learning_rate": 1.0207031687054663e-07,
709
- "logits/chosen": 0.3949499726295471,
710
- "logits/rejected": 0.4116601347923279,
711
- "logps/chosen": -459.15606689453125,
712
- "logps/rejected": -557.7392578125,
713
- "loss": 0.4935,
714
- "rewards/accuracies": 0.6875,
715
- "rewards/chosen": -1.3280662298202515,
716
- "rewards/margins": 1.038793683052063,
717
- "rewards/rejected": -2.3668599128723145,
718
  "step": 420
719
  },
720
  {
721
- "epoch": 0.7488027862429255,
722
- "grad_norm": 8.57419806945796,
723
- "learning_rate": 9.008174461027723e-08,
724
- "logits/chosen": 0.18675744533538818,
725
- "logits/rejected": 0.36219197511672974,
726
- "logps/chosen": -502.193603515625,
727
- "logps/rejected": -595.4912719726562,
728
- "loss": 0.4973,
729
- "rewards/accuracies": 0.737500011920929,
730
- "rewards/chosen": -1.3678879737854004,
731
- "rewards/margins": 1.271985650062561,
732
- "rewards/rejected": -2.639873743057251,
733
  "step": 430
734
  },
735
  {
736
- "epoch": 0.7662168045276447,
737
- "grad_norm": 7.593562856079916,
738
- "learning_rate": 7.86857758706802e-08,
739
- "logits/chosen": 0.391216903924942,
740
- "logits/rejected": 0.4270710051059723,
741
- "logps/chosen": -434.24713134765625,
742
- "logps/rejected": -548.2892456054688,
743
- "loss": 0.4988,
744
- "rewards/accuracies": 0.71875,
745
- "rewards/chosen": -1.390061378479004,
746
- "rewards/margins": 0.9570953249931335,
747
- "rewards/rejected": -2.3471570014953613,
748
  "step": 440
749
  },
750
  {
751
- "epoch": 0.783630822812364,
752
- "grad_norm": 9.701744270423196,
753
- "learning_rate": 6.792464029702102e-08,
754
- "logits/chosen": 0.40113019943237305,
755
- "logits/rejected": 0.47713106870651245,
756
- "logps/chosen": -432.3943786621094,
757
- "logps/rejected": -561.9862670898438,
758
- "loss": 0.4717,
759
- "rewards/accuracies": 0.762499988079071,
760
- "rewards/chosen": -1.339185357093811,
761
- "rewards/margins": 1.3107116222381592,
762
- "rewards/rejected": -2.6498968601226807,
763
  "step": 450
764
  },
765
  {
766
- "epoch": 0.8010448410970832,
767
- "grad_norm": 8.67365127688629,
768
- "learning_rate": 5.7838215055366954e-08,
769
- "logits/chosen": 0.3222309350967407,
770
- "logits/rejected": 0.4205148220062256,
771
- "logps/chosen": -430.9859924316406,
772
- "logps/rejected": -525.9786376953125,
773
- "loss": 0.4975,
774
- "rewards/accuracies": 0.7562500238418579,
775
- "rewards/chosen": -1.3063308000564575,
776
- "rewards/margins": 0.9936330914497375,
777
- "rewards/rejected": -2.2999637126922607,
778
  "step": 460
779
  },
780
  {
781
- "epoch": 0.8184588593818024,
782
- "grad_norm": 8.357079349594828,
783
- "learning_rate": 4.846387706115931e-08,
784
- "logits/chosen": 0.30623525381088257,
785
- "logits/rejected": 0.3187192380428314,
786
- "logps/chosen": -481.569580078125,
787
- "logps/rejected": -676.182373046875,
788
- "loss": 0.4907,
789
- "rewards/accuracies": 0.8125,
790
- "rewards/chosen": -1.4792513847351074,
791
- "rewards/margins": 2.092211961746216,
792
- "rewards/rejected": -3.5714633464813232,
793
  "step": 470
794
  },
795
  {
796
- "epoch": 0.8358728776665215,
797
- "grad_norm": 7.391474804049522,
798
- "learning_rate": 3.9836364472876555e-08,
799
- "logits/chosen": 0.32915499806404114,
800
- "logits/rejected": 0.469025194644928,
801
- "logps/chosen": -479.5523376464844,
802
- "logps/rejected": -578.352294921875,
803
- "loss": 0.5104,
804
- "rewards/accuracies": 0.7437499761581421,
805
- "rewards/chosen": -1.3519645929336548,
806
- "rewards/margins": 1.1279245615005493,
807
- "rewards/rejected": -2.479889392852783,
808
  "step": 480
809
  },
810
  {
811
- "epoch": 0.8532868959512407,
812
- "grad_norm": 6.99188787539793,
813
- "learning_rate": 3.198764796404807e-08,
814
- "logits/chosen": 0.3297652006149292,
815
- "logits/rejected": 0.5077934265136719,
816
- "logps/chosen": -462.5154724121094,
817
- "logps/rejected": -589.4317016601562,
818
- "loss": 0.4871,
819
- "rewards/accuracies": 0.78125,
820
- "rewards/chosen": -1.5567705631256104,
821
- "rewards/margins": 1.4095462560653687,
822
- "rewards/rejected": -2.9663164615631104,
823
  "step": 490
824
  },
825
  {
826
- "epoch": 0.87070091423596,
827
- "grad_norm": 6.675925102880909,
828
- "learning_rate": 2.494681225064066e-08,
829
- "logits/chosen": 0.25453683733940125,
830
- "logits/rejected": 0.31085866689682007,
831
- "logps/chosen": -464.4830017089844,
832
- "logps/rejected": -590.9197387695312,
833
- "loss": 0.4806,
834
- "rewards/accuracies": 0.7250000238418579,
835
- "rewards/chosen": -1.5319304466247559,
836
- "rewards/margins": 1.2942126989364624,
837
- "rewards/rejected": -2.8261430263519287,
838
  "step": 500
839
  },
840
  {
841
- "epoch": 0.87070091423596,
842
- "eval_logits/chosen": 0.36373192071914673,
843
- "eval_logits/rejected": 0.5181233286857605,
844
- "eval_logps/chosen": -483.4225158691406,
845
- "eval_logps/rejected": -560.248046875,
846
- "eval_loss": 0.4863789975643158,
847
- "eval_rewards/accuracies": 0.7246835231781006,
848
- "eval_rewards/chosen": -1.4549182653427124,
849
- "eval_rewards/margins": 1.1522036790847778,
850
- "eval_rewards/rejected": -2.6071219444274902,
851
- "eval_runtime": 57.0421,
852
- "eval_samples_per_second": 43.827,
853
- "eval_steps_per_second": 1.385,
854
  "step": 500
855
  },
856
  {
857
- "epoch": 0.8881149325206792,
858
- "grad_norm": 7.8390265932386445,
859
- "learning_rate": 1.8739948312837012e-08,
860
- "logits/chosen": 0.2649977207183838,
861
- "logits/rejected": 0.36522507667541504,
862
- "logps/chosen": -476.8448791503906,
863
- "logps/rejected": -576.7594604492188,
864
- "loss": 0.4867,
865
- "rewards/accuracies": 0.78125,
866
- "rewards/chosen": -1.3562562465667725,
867
- "rewards/margins": 1.2045637369155884,
868
- "rewards/rejected": -2.5608201026916504,
869
  "step": 510
870
  },
871
  {
872
- "epoch": 0.9055289508053983,
873
- "grad_norm": 8.11481406815297,
874
- "learning_rate": 1.3390056710597647e-08,
875
- "logits/chosen": 0.4297245144844055,
876
- "logits/rejected": 0.45524507761001587,
877
- "logps/chosen": -461.51593017578125,
878
- "logps/rejected": -598.4251708984375,
879
- "loss": 0.493,
880
- "rewards/accuracies": 0.768750011920929,
881
- "rewards/chosen": -1.5076782703399658,
882
- "rewards/margins": 1.4993212223052979,
883
- "rewards/rejected": -3.0069994926452637,
884
  "step": 520
885
  },
886
  {
887
- "epoch": 0.9229429690901175,
888
- "grad_norm": 13.292030068430211,
889
- "learning_rate": 8.916962351285363e-09,
890
- "logits/chosen": 0.37979334592819214,
891
- "logits/rejected": 0.4294493794441223,
892
- "logps/chosen": -445.51361083984375,
893
- "logps/rejected": -526.9692993164062,
894
- "loss": 0.5026,
895
- "rewards/accuracies": 0.7562500238418579,
896
- "rewards/chosen": -1.3705123662948608,
897
- "rewards/margins": 0.9539819955825806,
898
- "rewards/rejected": -2.3244943618774414,
899
  "step": 530
900
  },
901
  {
902
- "epoch": 0.9403569873748368,
903
- "grad_norm": 7.831877125937131,
904
- "learning_rate": 5.337241025194728e-09,
905
- "logits/chosen": 0.3343893885612488,
906
- "logits/rejected": 0.41524171829223633,
907
- "logps/chosen": -470.4335021972656,
908
- "logps/rejected": -616.5140380859375,
909
- "loss": 0.4923,
910
- "rewards/accuracies": 0.8062499761581421,
911
- "rewards/chosen": -1.6075522899627686,
912
- "rewards/margins": 1.6572930812835693,
913
- "rewards/rejected": -3.264845609664917,
914
  "step": 540
915
  },
916
  {
917
- "epoch": 0.957771005659556,
918
- "grad_norm": 6.987219814527263,
919
- "learning_rate": 2.664157981222437e-09,
920
- "logits/chosen": 0.20853932201862335,
921
- "logits/rejected": 0.2358679324388504,
922
- "logps/chosen": -495.81256103515625,
923
- "logps/rejected": -627.6561889648438,
924
- "loss": 0.4959,
925
- "rewards/accuracies": 0.75,
926
- "rewards/chosen": -1.4556350708007812,
927
- "rewards/margins": 1.3767093420028687,
928
- "rewards/rejected": -2.8323445320129395,
929
  "step": 550
930
  },
931
  {
932
- "epoch": 0.9751850239442751,
933
- "grad_norm": 8.183878735354227,
934
- "learning_rate": 9.076187702954652e-10,
935
- "logits/chosen": 0.2713332772254944,
936
- "logits/rejected": 0.35139814019203186,
937
- "logps/chosen": -478.97515869140625,
938
- "logps/rejected": -582.2161254882812,
939
- "loss": 0.4943,
940
- "rewards/accuracies": 0.7437499761581421,
941
- "rewards/chosen": -1.5067349672317505,
942
- "rewards/margins": 1.3343360424041748,
943
- "rewards/rejected": -2.8410706520080566,
944
  "step": 560
945
  },
946
  {
947
- "epoch": 0.9925990422289943,
948
- "grad_norm": 7.7106115387615395,
949
- "learning_rate": 7.413253871516035e-11,
950
- "logits/chosen": 0.25796160101890564,
951
- "logits/rejected": 0.3311242461204529,
952
- "logps/chosen": -475.98175048828125,
953
- "logps/rejected": -590.1162109375,
954
- "loss": 0.4887,
955
- "rewards/accuracies": 0.75,
956
- "rewards/chosen": -1.4784862995147705,
957
- "rewards/margins": 1.1988673210144043,
958
- "rewards/rejected": -2.677353620529175,
959
  "step": 570
960
  },
961
  {
962
- "epoch": 0.999564649542882,
963
- "step": 574,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
964
  "total_flos": 0.0,
965
- "train_loss": 0.5408797243330952,
966
- "train_runtime": 4250.8819,
967
- "train_samples_per_second": 17.289,
968
- "train_steps_per_second": 0.135
969
  }
970
  ],
971
  "logging_steps": 10,
972
- "max_steps": 574,
973
  "num_input_tokens_seen": 0,
974
- "num_train_epochs": 1,
975
  "save_steps": 500,
976
  "stateful_callbacks": {
977
  "TrainerControl": {
@@ -979,8 +1085,8 @@
979
  "should_epoch_stop": false,
980
  "should_evaluate": false,
981
  "should_log": false,
982
- "should_save": false,
983
- "should_training_stop": false
984
  },
985
  "attributes": {}
986
  }
 
1
  {
2
+ "best_metric": 0.6289177536964417,
3
+ "best_model_checkpoint": "models/llama-3.2-3b-sft-dpo/checkpoint-500",
4
+ "epoch": 3.0,
5
  "eval_steps": 100,
6
+ "global_step": 633,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.004739336492890996,
13
+ "grad_norm": 18.306584799400138,
14
+ "learning_rate": 5.2631578947368416e-08,
15
+ "logits/chosen": 1.1032867431640625,
16
+ "logits/rejected": 1.1176480054855347,
17
+ "logps/chosen": -175.54205322265625,
18
+ "logps/rejected": -196.64266967773438,
19
+ "loss": 1.0,
20
  "rewards/accuracies": 0.0,
21
  "rewards/chosen": 0.0,
22
  "rewards/margins": 0.0,
 
24
  "step": 1
25
  },
26
  {
27
+ "epoch": 0.04739336492890995,
28
+ "grad_norm": 18.19518017806804,
29
+ "learning_rate": 5.263157894736842e-07,
30
+ "logits/chosen": 0.6209686994552612,
31
+ "logits/rejected": 0.7449740171432495,
32
+ "logps/chosen": -350.8912658691406,
33
+ "logps/rejected": -307.96142578125,
34
+ "loss": 0.9979,
35
+ "rewards/accuracies": 0.4861111044883728,
36
+ "rewards/chosen": 0.00011829059076262638,
37
+ "rewards/margins": 0.016186419874429703,
38
+ "rewards/rejected": -0.016068127006292343,
39
  "step": 10
40
  },
41
  {
42
+ "epoch": 0.0947867298578199,
43
+ "grad_norm": 15.415652807377189,
44
+ "learning_rate": 9.99993455114332e-07,
45
+ "logits/chosen": 0.9229280352592468,
46
+ "logits/rejected": 0.8609384298324585,
47
+ "logps/chosen": -252.894775390625,
48
+ "logps/rejected": -263.6702575683594,
49
+ "loss": 0.9588,
50
+ "rewards/accuracies": 0.7124999761581421,
51
+ "rewards/chosen": 0.0033816881477832794,
52
+ "rewards/margins": 0.16803663969039917,
53
+ "rewards/rejected": -0.164654940366745,
54
  "step": 20
55
  },
56
  {
57
+ "epoch": 0.14218009478672985,
58
+ "grad_norm": 12.850588595957225,
59
+ "learning_rate": 9.992082761369566e-07,
60
+ "logits/chosen": 0.8715411424636841,
61
+ "logits/rejected": 0.8170267343521118,
62
+ "logps/chosen": -296.8494567871094,
63
+ "logps/rejected": -305.7926025390625,
64
+ "loss": 0.8133,
65
+ "rewards/accuracies": 0.793749988079071,
66
+ "rewards/chosen": 0.6128842830657959,
67
+ "rewards/margins": 1.1374889612197876,
68
+ "rewards/rejected": -0.5246046781539917,
69
  "step": 30
70
  },
71
  {
72
+ "epoch": 0.1895734597156398,
73
+ "grad_norm": 14.501186311778227,
74
+ "learning_rate": 9.971164749660148e-07,
75
+ "logits/chosen": 0.9155582189559937,
76
+ "logits/rejected": 0.9567469358444214,
77
+ "logps/chosen": -313.08514404296875,
78
+ "logps/rejected": -309.0679626464844,
79
+ "loss": 0.7405,
80
+ "rewards/accuracies": 0.737500011920929,
81
+ "rewards/chosen": 0.23792271316051483,
82
+ "rewards/margins": 2.1163926124572754,
83
+ "rewards/rejected": -1.878469467163086,
84
  "step": 40
85
  },
86
  {
87
+ "epoch": 0.23696682464454977,
88
+ "grad_norm": 11.740811645701724,
89
+ "learning_rate": 9.937235266586424e-07,
90
+ "logits/chosen": 0.6986435651779175,
91
+ "logits/rejected": 0.8309999704360962,
92
+ "logps/chosen": -319.8310852050781,
93
+ "logps/rejected": -317.59918212890625,
94
+ "loss": 0.6552,
95
+ "rewards/accuracies": 0.793749988079071,
96
+ "rewards/chosen": 0.6028285622596741,
97
+ "rewards/margins": 3.663621425628662,
98
+ "rewards/rejected": -3.060793161392212,
99
  "step": 50
100
  },
101
  {
102
+ "epoch": 0.2843601895734597,
103
+ "grad_norm": 14.434952077378005,
104
+ "learning_rate": 9.890383118800284e-07,
105
+ "logits/chosen": 0.7444020509719849,
106
+ "logits/rejected": 0.7484663724899292,
107
+ "logps/chosen": -327.59576416015625,
108
+ "logps/rejected": -349.929931640625,
109
+ "loss": 0.6285,
110
+ "rewards/accuracies": 0.706250011920929,
111
+ "rewards/chosen": 0.3002261221408844,
112
+ "rewards/margins": 3.5275771617889404,
113
+ "rewards/rejected": -3.227351427078247,
114
  "step": 60
115
  },
116
  {
117
+ "epoch": 0.33175355450236965,
118
+ "grad_norm": 10.030890442911925,
119
+ "learning_rate": 9.830730936592615e-07,
120
+ "logits/chosen": 0.7815200090408325,
121
+ "logits/rejected": 0.7069059610366821,
122
+ "logps/chosen": -252.94921875,
123
+ "logps/rejected": -323.2224426269531,
124
+ "loss": 0.6106,
125
+ "rewards/accuracies": 0.7875000238418579,
126
+ "rewards/chosen": 1.3401187658309937,
127
+ "rewards/margins": 5.26017427444458,
128
+ "rewards/rejected": -3.920055866241455,
129
  "step": 70
130
  },
131
  {
132
+ "epoch": 0.3791469194312796,
133
+ "grad_norm": 12.131364583934603,
134
+ "learning_rate": 9.758434852922123e-07,
135
+ "logits/chosen": 0.7100412249565125,
136
+ "logits/rejected": 0.6621907353401184,
137
+ "logps/chosen": -271.33331298828125,
138
+ "logps/rejected": -328.0660705566406,
139
+ "loss": 0.59,
140
+ "rewards/accuracies": 0.71875,
141
+ "rewards/chosen": 0.908360481262207,
142
+ "rewards/margins": 4.926724910736084,
143
+ "rewards/rejected": -4.018364429473877,
144
  "step": 80
145
  },
146
  {
147
+ "epoch": 0.4265402843601896,
148
+ "grad_norm": 11.822232959802975,
149
+ "learning_rate": 9.673684094754685e-07,
150
+ "logits/chosen": 0.6003296375274658,
151
+ "logits/rejected": 0.6765642762184143,
152
+ "logps/chosen": -293.85015869140625,
153
+ "logps/rejected": -305.929443359375,
154
+ "loss": 0.586,
155
+ "rewards/accuracies": 0.768750011920929,
156
+ "rewards/chosen": 1.231705904006958,
157
+ "rewards/margins": 4.982685089111328,
158
+ "rewards/rejected": -3.750978946685791,
159
  "step": 90
160
  },
161
  {
162
+ "epoch": 0.47393364928909953,
163
+ "grad_norm": 9.616291876594419,
164
+ "learning_rate": 9.576700487782773e-07,
165
+ "logits/chosen": 0.6642001867294312,
166
+ "logits/rejected": 0.6596721410751343,
167
+ "logps/chosen": -326.2373046875,
168
+ "logps/rejected": -381.3326110839844,
169
+ "loss": 0.5801,
170
+ "rewards/accuracies": 0.824999988079071,
171
+ "rewards/chosen": 1.7316535711288452,
172
+ "rewards/margins": 6.260350704193115,
173
+ "rewards/rejected": -4.5286970138549805,
174
  "step": 100
175
  },
176
  {
177
+ "epoch": 0.47393364928909953,
178
+ "eval_logits/chosen": 0.610289990901947,
179
+ "eval_logits/rejected": 0.6783497929573059,
180
+ "eval_logps/chosen": -339.33251953125,
181
+ "eval_logps/rejected": -361.24346923828125,
182
+ "eval_loss": 0.6839759349822998,
183
+ "eval_rewards/accuracies": 0.6898733973503113,
184
+ "eval_rewards/chosen": 0.6485355496406555,
185
+ "eval_rewards/margins": 3.587477684020996,
186
+ "eval_rewards/rejected": -2.9389421939849854,
187
+ "eval_runtime": 76.922,
188
+ "eval_samples_per_second": 32.5,
189
+ "eval_steps_per_second": 1.027,
190
  "step": 100
191
  },
192
  {
193
+ "epoch": 0.5213270142180095,
194
+ "grad_norm": 11.519611398516883,
195
+ "learning_rate": 9.467737875821367e-07,
196
+ "logits/chosen": 0.659843385219574,
197
+ "logits/rejected": 0.6010033488273621,
198
+ "logps/chosen": -293.62200927734375,
199
+ "logps/rejected": -334.9098205566406,
200
+ "loss": 0.5742,
201
+ "rewards/accuracies": 0.8062499761581421,
202
+ "rewards/chosen": 1.1434353590011597,
203
+ "rewards/margins": 5.331825256347656,
204
+ "rewards/rejected": -4.188389301300049,
205
  "step": 110
206
  },
207
  {
208
+ "epoch": 0.5687203791469194,
209
+ "grad_norm": 10.75922014108817,
210
+ "learning_rate": 9.347081456399957e-07,
211
+ "logits/chosen": 0.6637296676635742,
212
+ "logits/rejected": 0.5958945155143738,
213
+ "logps/chosen": -272.2585144042969,
214
+ "logps/rejected": -393.41949462890625,
215
+ "loss": 0.5821,
216
+ "rewards/accuracies": 0.800000011920929,
217
+ "rewards/chosen": 0.9803568124771118,
218
+ "rewards/margins": 6.413501739501953,
219
+ "rewards/rejected": -5.433144569396973,
220
  "step": 120
221
  },
222
  {
223
+ "epoch": 0.6161137440758294,
224
+ "grad_norm": 11.497074098204886,
225
+ "learning_rate": 9.215047034289715e-07,
226
+ "logits/chosen": 0.6836856603622437,
227
+ "logits/rejected": 0.6638469696044922,
228
+ "logps/chosen": -275.0943603515625,
229
+ "logps/rejected": -332.6889343261719,
230
+ "loss": 0.5752,
231
+ "rewards/accuracies": 0.800000011920929,
232
+ "rewards/chosen": 1.4476346969604492,
233
+ "rewards/margins": 6.094024658203125,
234
+ "rewards/rejected": -4.646389961242676,
235
  "step": 130
236
  },
237
  {
238
+ "epoch": 0.6635071090047393,
239
+ "grad_norm": 9.658859904375,
240
+ "learning_rate": 9.07198019491959e-07,
241
+ "logits/chosen": 0.61662757396698,
242
+ "logits/rejected": 0.5779851675033569,
243
+ "logps/chosen": -272.382080078125,
244
+ "logps/rejected": -355.6089172363281,
245
+ "loss": 0.5468,
246
+ "rewards/accuracies": 0.793749988079071,
247
+ "rewards/chosen": 0.8889511227607727,
248
+ "rewards/margins": 5.594452857971191,
249
+ "rewards/rejected": -4.705502510070801,
250
  "step": 140
251
  },
252
  {
253
+ "epoch": 0.7109004739336493,
254
+ "grad_norm": 10.07652231167762,
255
+ "learning_rate": 8.918255399844853e-07,
256
+ "logits/chosen": 0.5373108983039856,
257
+ "logits/rejected": 0.654308021068573,
258
+ "logps/chosen": -330.0559997558594,
259
+ "logps/rejected": -349.55224609375,
260
+ "loss": 0.5738,
261
+ "rewards/accuracies": 0.7562500238418579,
262
+ "rewards/chosen": 0.3335852324962616,
263
+ "rewards/margins": 4.550914287567139,
264
+ "rewards/rejected": -4.217329502105713,
265
  "step": 150
266
  },
267
  {
268
+ "epoch": 0.7582938388625592,
269
+ "grad_norm": 8.965490487953566,
270
+ "learning_rate": 8.754275006635572e-07,
271
+ "logits/chosen": 0.565764844417572,
272
+ "logits/rejected": 0.539226233959198,
273
+ "logps/chosen": -269.29742431640625,
274
+ "logps/rejected": -355.60589599609375,
275
+ "loss": 0.5997,
276
+ "rewards/accuracies": 0.731249988079071,
277
+ "rewards/chosen": 0.5406277179718018,
278
+ "rewards/margins": 5.479567527770996,
279
+ "rewards/rejected": -4.938939571380615,
280
  "step": 160
281
  },
282
  {
283
+ "epoch": 0.8056872037914692,
284
+ "grad_norm": 9.437674903727038,
285
+ "learning_rate": 8.580468215750391e-07,
286
+ "logits/chosen": 0.6932438611984253,
287
+ "logits/rejected": 0.636594831943512,
288
+ "logps/chosen": -296.7684631347656,
289
+ "logps/rejected": -367.45318603515625,
290
+ "loss": 0.5783,
291
+ "rewards/accuracies": 0.768750011920929,
292
+ "rewards/chosen": 1.147369146347046,
293
+ "rewards/margins": 5.5389909744262695,
294
+ "rewards/rejected": -4.391622066497803,
295
  "step": 170
296
  },
297
  {
298
+ "epoch": 0.8530805687203792,
299
+ "grad_norm": 8.5658002946873,
300
+ "learning_rate": 8.39728994715202e-07,
301
+ "logits/chosen": 0.6020892858505249,
302
+ "logits/rejected": 0.5168766379356384,
303
+ "logps/chosen": -288.558349609375,
304
+ "logps/rejected": -348.62640380859375,
305
+ "loss": 0.5531,
306
+ "rewards/accuracies": 0.731249988079071,
307
+ "rewards/chosen": 0.6757786870002747,
308
+ "rewards/margins": 5.149857997894287,
309
+ "rewards/rejected": -4.474079132080078,
310
  "step": 180
311
  },
312
  {
313
+ "epoch": 0.9004739336492891,
314
+ "grad_norm": 11.065263225689659,
315
+ "learning_rate": 8.20521964960477e-07,
316
+ "logits/chosen": 0.6599653363227844,
317
+ "logits/rejected": 0.6458830237388611,
318
+ "logps/chosen": -289.4867858886719,
319
+ "logps/rejected": -342.56243896484375,
320
+ "loss": 0.5439,
321
  "rewards/accuracies": 0.7250000238418579,
322
+ "rewards/chosen": 1.274778962135315,
323
+ "rewards/margins": 6.3435516357421875,
324
+ "rewards/rejected": -5.068772792816162,
325
  "step": 190
326
  },
327
  {
328
+ "epoch": 0.9478672985781991,
329
+ "grad_norm": 8.426424572195439,
330
+ "learning_rate": 8.0047600457707e-07,
331
+ "logits/chosen": 0.6277160048484802,
332
+ "logits/rejected": 0.6192003488540649,
333
+ "logps/chosen": -318.033447265625,
334
+ "logps/rejected": -377.3500061035156,
335
+ "loss": 0.537,
336
+ "rewards/accuracies": 0.7749999761581421,
337
+ "rewards/chosen": 1.3354051113128662,
338
+ "rewards/margins": 6.755140781402588,
339
+ "rewards/rejected": -5.419735431671143,
340
  "step": 200
341
  },
342
  {
343
+ "epoch": 0.9478672985781991,
344
+ "eval_logits/chosen": 0.494819700717926,
345
+ "eval_logits/rejected": 0.5648438930511475,
346
+ "eval_logps/chosen": -343.7730712890625,
347
+ "eval_logps/rejected": -372.1695861816406,
348
+ "eval_loss": 0.6514427661895752,
349
+ "eval_rewards/accuracies": 0.7278481125831604,
350
+ "eval_rewards/chosen": 0.20448331534862518,
351
+ "eval_rewards/margins": 4.236032485961914,
352
+ "eval_rewards/rejected": -4.031548976898193,
353
+ "eval_runtime": 74.0508,
354
+ "eval_samples_per_second": 33.761,
355
+ "eval_steps_per_second": 1.067,
356
  "step": 200
357
  },
358
  {
359
+ "epoch": 0.995260663507109,
360
+ "grad_norm": 9.878709661135902,
361
+ "learning_rate": 7.796435816388898e-07,
362
+ "logits/chosen": 0.6760674118995667,
363
+ "logits/rejected": 0.6518660187721252,
364
+ "logps/chosen": -284.24749755859375,
365
+ "logps/rejected": -363.0601501464844,
366
+ "loss": 0.554,
367
+ "rewards/accuracies": 0.7749999761581421,
368
+ "rewards/chosen": 0.6821473836898804,
369
+ "rewards/margins": 6.51880407333374,
370
+ "rewards/rejected": -5.8366570472717285,
371
  "step": 210
372
  },
373
  {
374
+ "epoch": 1.042654028436019,
375
+ "grad_norm": 10.875728154843127,
376
+ "learning_rate": 7.580792226981954e-07,
377
+ "logits/chosen": 0.5221652984619141,
378
+ "logits/rejected": 0.44479990005493164,
379
+ "logps/chosen": -281.39190673828125,
380
+ "logps/rejected": -370.33941650390625,
381
+ "loss": 0.4911,
382
+ "rewards/accuracies": 0.800000011920929,
383
+ "rewards/chosen": 2.0442254543304443,
384
+ "rewards/margins": 7.068573951721191,
385
+ "rewards/rejected": -5.024348258972168,
386
  "step": 220
387
  },
388
  {
389
+ "epoch": 1.0900473933649288,
390
+ "grad_norm": 10.04148994728917,
391
+ "learning_rate": 7.358393700684032e-07,
392
+ "logits/chosen": 0.5540430545806885,
393
+ "logits/rejected": 0.5128260850906372,
394
+ "logps/chosen": -279.4583435058594,
395
+ "logps/rejected": -350.32684326171875,
396
+ "loss": 0.5022,
397
+ "rewards/accuracies": 0.793749988079071,
398
+ "rewards/chosen": 0.9357398152351379,
399
+ "rewards/margins": 5.9159369468688965,
400
+ "rewards/rejected": -4.980198383331299,
401
  "step": 230
402
  },
403
  {
404
+ "epoch": 1.1374407582938388,
405
+ "grad_norm": 11.466420945945197,
406
+ "learning_rate": 7.129822340926043e-07,
407
+ "logits/chosen": 0.5252267122268677,
408
+ "logits/rejected": 0.6392233371734619,
409
+ "logps/chosen": -300.5268859863281,
410
+ "logps/rejected": -328.5356750488281,
411
+ "loss": 0.4908,
412
+ "rewards/accuracies": 0.8187500238418579,
413
+ "rewards/chosen": 1.1534405946731567,
414
+ "rewards/margins": 6.1857991218566895,
415
+ "rewards/rejected": -5.032358169555664,
416
  "step": 240
417
  },
418
  {
419
+ "epoch": 1.1848341232227488,
420
+ "grad_norm": 9.714339627017372,
421
+ "learning_rate": 6.895676407844586e-07,
422
+ "logits/chosen": 0.5342652797698975,
423
+ "logits/rejected": 0.5475658178329468,
424
+ "logps/chosen": -275.02972412109375,
425
+ "logps/rejected": -325.74993896484375,
426
+ "loss": 0.4508,
427
+ "rewards/accuracies": 0.8374999761581421,
428
+ "rewards/chosen": 2.0915255546569824,
429
+ "rewards/margins": 6.8750715255737305,
430
+ "rewards/rejected": -4.783546447753906,
431
  "step": 250
432
  },
433
  {
434
+ "epoch": 1.2322274881516588,
435
+ "grad_norm": 8.702659887264469,
436
+ "learning_rate": 6.656568752402521e-07,
437
+ "logits/chosen": 0.4584909975528717,
438
+ "logits/rejected": 0.5478152632713318,
439
+ "logps/chosen": -314.6927185058594,
440
+ "logps/rejected": -357.88226318359375,
441
+ "loss": 0.4621,
442
+ "rewards/accuracies": 0.8062499761581421,
443
+ "rewards/chosen": 1.3858213424682617,
444
+ "rewards/margins": 6.8659563064575195,
445
+ "rewards/rejected": -5.480134963989258,
446
  "step": 260
447
  },
448
  {
449
+ "epoch": 1.2796208530805688,
450
+ "grad_norm": 10.924278197277149,
451
+ "learning_rate": 6.413125212319663e-07,
452
+ "logits/chosen": 0.6362992525100708,
453
+ "logits/rejected": 0.6484791040420532,
454
+ "logps/chosen": -285.7840270996094,
455
+ "logps/rejected": -360.7676086425781,
456
+ "loss": 0.4712,
457
+ "rewards/accuracies": 0.8687499761581421,
458
+ "rewards/chosen": 2.0224599838256836,
459
+ "rewards/margins": 7.362783908843994,
460
+ "rewards/rejected": -5.3403239250183105,
461
  "step": 270
462
  },
463
  {
464
+ "epoch": 1.3270142180094786,
465
+ "grad_norm": 9.286266066829205,
466
+ "learning_rate": 6.165982974012104e-07,
467
+ "logits/chosen": 0.48062658309936523,
468
+ "logits/rejected": 0.4873732626438141,
469
+ "logps/chosen": -345.07586669921875,
470
+ "logps/rejected": -393.88165283203125,
471
+ "loss": 0.4628,
472
  "rewards/accuracies": 0.793749988079071,
473
+ "rewards/chosen": 1.449973225593567,
474
+ "rewards/margins": 7.039644718170166,
475
+ "rewards/rejected": -5.589670658111572,
476
  "step": 280
477
  },
478
  {
479
+ "epoch": 1.3744075829383886,
480
+ "grad_norm": 9.83819564198541,
481
+ "learning_rate": 5.915788904827553e-07,
482
+ "logits/chosen": 0.43026304244995117,
483
+ "logits/rejected": 0.459343820810318,
484
+ "logps/chosen": -294.733154296875,
485
+ "logps/rejected": -363.80340576171875,
486
+ "loss": 0.4507,
487
+ "rewards/accuracies": 0.831250011920929,
488
+ "rewards/chosen": 1.6585981845855713,
489
+ "rewards/margins": 6.437933444976807,
490
+ "rewards/rejected": -4.779335021972656,
491
  "step": 290
492
  },
493
  {
494
+ "epoch": 1.4218009478672986,
495
+ "grad_norm": 8.577071743246128,
496
+ "learning_rate": 5.663197859941938e-07,
497
+ "logits/chosen": 0.6086027026176453,
498
+ "logits/rejected": 0.6251193881034851,
499
+ "logps/chosen": -262.66644287109375,
500
+ "logps/rejected": -320.42974853515625,
501
+ "loss": 0.4787,
502
+ "rewards/accuracies": 0.7875000238418579,
503
+ "rewards/chosen": 1.298060655593872,
504
+ "rewards/margins": 6.941515922546387,
505
+ "rewards/rejected": -5.643455505371094,
506
  "step": 300
507
  },
508
  {
509
+ "epoch": 1.4218009478672986,
510
+ "eval_logits/chosen": 0.45885032415390015,
511
+ "eval_logits/rejected": 0.5325651168823242,
512
+ "eval_logps/chosen": -341.7187194824219,
513
+ "eval_logps/rejected": -371.7361145019531,
514
+ "eval_loss": 0.6386769413948059,
515
+ "eval_rewards/accuracies": 0.7215189933776855,
516
+ "eval_rewards/chosen": 0.40991881489753723,
517
+ "eval_rewards/margins": 4.398120880126953,
518
+ "eval_rewards/rejected": -3.98820161819458,
519
+ "eval_runtime": 72.3153,
520
+ "eval_samples_per_second": 34.571,
521
+ "eval_steps_per_second": 1.092,
522
  "step": 300
523
  },
524
  {
525
+ "epoch": 1.4691943127962086,
526
+ "grad_norm": 12.642599504555136,
527
+ "learning_rate": 5.408870968348749e-07,
528
+ "logits/chosen": 0.46862930059432983,
529
+ "logits/rejected": 0.45317015051841736,
530
+ "logps/chosen": -269.1434631347656,
531
+ "logps/rejected": -348.3428955078125,
532
+ "loss": 0.4684,
533
+ "rewards/accuracies": 0.8187500238418579,
534
+ "rewards/chosen": 1.3798935413360596,
535
+ "rewards/margins": 6.562399864196777,
536
+ "rewards/rejected": -5.182506561279297,
537
  "step": 310
538
  },
539
  {
540
+ "epoch": 1.5165876777251186,
541
+ "grad_norm": 9.79584839845262,
542
+ "learning_rate": 5.153473902427354e-07,
543
+ "logits/chosen": 0.47858723998069763,
544
+ "logits/rejected": 0.5644794702529907,
545
+ "logps/chosen": -321.48345947265625,
546
+ "logps/rejected": -343.6278991699219,
547
+ "loss": 0.4803,
548
+ "rewards/accuracies": 0.793749988079071,
549
+ "rewards/chosen": 1.1607013940811157,
550
+ "rewards/margins": 5.799595832824707,
551
+ "rewards/rejected": -4.638894557952881,
552
  "step": 320
553
  },
554
  {
555
+ "epoch": 1.5639810426540284,
556
+ "grad_norm": 8.875212778872154,
557
+ "learning_rate": 4.897675135619516e-07,
558
+ "logits/chosen": 0.47927242517471313,
559
+ "logits/rejected": 0.605729341506958,
560
+ "logps/chosen": -296.8520812988281,
561
+ "logps/rejected": -339.26220703125,
562
+ "loss": 0.48,
563
+ "rewards/accuracies": 0.793749988079071,
564
+ "rewards/chosen": 1.206688404083252,
565
+ "rewards/margins": 6.4211745262146,
566
+ "rewards/rejected": -5.214486598968506,
567
  "step": 330
568
  },
569
  {
570
+ "epoch": 1.6113744075829384,
571
+ "grad_norm": 9.788751062324735,
572
+ "learning_rate": 4.642144192774429e-07,
573
+ "logits/chosen": 0.6517030000686646,
574
+ "logits/rejected": 0.6343492269515991,
575
+ "logps/chosen": -256.8311767578125,
576
+ "logps/rejected": -318.10504150390625,
577
+ "loss": 0.4687,
578
+ "rewards/accuracies": 0.8374999761581421,
579
+ "rewards/chosen": 1.4574129581451416,
580
+ "rewards/margins": 7.180891513824463,
581
+ "rewards/rejected": -5.723478317260742,
582
  "step": 340
583
  },
584
  {
585
+ "epoch": 1.6587677725118484,
586
+ "grad_norm": 8.123068784558978,
587
+ "learning_rate": 4.387549897741825e-07,
588
+ "logits/chosen": 0.43539008498191833,
589
+ "logits/rejected": 0.4823547303676605,
590
+ "logps/chosen": -322.7386474609375,
591
+ "logps/rejected": -349.6393127441406,
592
+ "loss": 0.4903,
593
+ "rewards/accuracies": 0.8187500238418579,
594
+ "rewards/chosen": 1.6534090042114258,
595
+ "rewards/margins": 6.494222164154053,
596
+ "rewards/rejected": -4.840813159942627,
597
  "step": 350
598
  },
599
  {
600
+ "epoch": 1.7061611374407581,
601
+ "grad_norm": 10.106462346167355,
602
+ "learning_rate": 4.1345586227998634e-07,
603
+ "logits/chosen": 0.4860106110572815,
604
+ "logits/rejected": 0.48908883333206177,
605
+ "logps/chosen": -289.710693359375,
606
+ "logps/rejected": -384.22686767578125,
607
+ "loss": 0.446,
608
+ "rewards/accuracies": 0.887499988079071,
609
+ "rewards/chosen": 1.587738275527954,
610
+ "rewards/margins": 7.2089128494262695,
611
+ "rewards/rejected": -5.6211748123168945,
612
  "step": 360
613
  },
614
  {
615
+ "epoch": 1.7535545023696684,
616
+ "grad_norm": 10.81635763601606,
617
+ "learning_rate": 3.883832544499735e-07,
618
+ "logits/chosen": 0.5913195013999939,
619
+ "logits/rejected": 0.5606914758682251,
620
+ "logps/chosen": -292.9503173828125,
621
+ "logps/rejected": -390.93878173828125,
622
+ "loss": 0.4592,
623
+ "rewards/accuracies": 0.7749999761581421,
624
+ "rewards/chosen": 1.614689588546753,
625
+ "rewards/margins": 6.656731605529785,
626
+ "rewards/rejected": -5.042041301727295,
627
  "step": 370
628
  },
629
  {
630
+ "epoch": 1.8009478672985781,
631
+ "grad_norm": 10.495084061438284,
632
+ "learning_rate": 3.636027910492114e-07,
633
+ "logits/chosen": 0.4658740162849426,
634
+ "logits/rejected": 0.5308722257614136,
635
+ "logps/chosen": -305.28753662109375,
636
+ "logps/rejected": -352.7513122558594,
637
+ "loss": 0.4648,
638
+ "rewards/accuracies": 0.8062499761581421,
639
+ "rewards/chosen": 1.0712064504623413,
640
+ "rewards/margins": 6.167966365814209,
641
+ "rewards/rejected": -5.096759796142578,
642
  "step": 380
643
  },
644
  {
645
+ "epoch": 1.8483412322274881,
646
+ "grad_norm": 11.413974134819627,
647
+ "learning_rate": 3.3917933218718566e-07,
648
+ "logits/chosen": 0.6185089349746704,
649
+ "logits/rejected": 0.6838531494140625,
650
+ "logps/chosen": -284.1628112792969,
651
+ "logps/rejected": -333.17657470703125,
652
+ "loss": 0.4426,
653
+ "rewards/accuracies": 0.8125,
654
+ "rewards/chosen": 1.4776874780654907,
655
+ "rewards/margins": 6.398137092590332,
656
+ "rewards/rejected": -4.920449733734131,
657
  "step": 390
658
  },
659
  {
660
+ "epoch": 1.8957345971563981,
661
+ "grad_norm": 9.664147195442332,
662
+ "learning_rate": 3.151768035536698e-07,
663
+ "logits/chosen": 0.6407091617584229,
664
+ "logits/rejected": 0.6542560458183289,
665
+ "logps/chosen": -284.20037841796875,
666
+ "logps/rejected": -345.27880859375,
667
+ "loss": 0.4559,
668
+ "rewards/accuracies": 0.824999988079071,
669
+ "rewards/chosen": 2.0247559547424316,
670
+ "rewards/margins": 7.09304141998291,
671
+ "rewards/rejected": -5.0682854652404785,
672
  "step": 400
673
  },
674
  {
675
+ "epoch": 1.8957345971563981,
676
+ "eval_logits/chosen": 0.41101595759391785,
677
+ "eval_logits/rejected": 0.4840773642063141,
678
+ "eval_logps/chosen": -338.1277160644531,
679
+ "eval_logps/rejected": -368.54248046875,
680
+ "eval_loss": 0.6332134008407593,
681
+ "eval_rewards/accuracies": 0.7341772317886353,
682
+ "eval_rewards/chosen": 0.7690173983573914,
683
+ "eval_rewards/margins": 4.437857151031494,
684
+ "eval_rewards/rejected": -3.668839931488037,
685
+ "eval_runtime": 72.5998,
686
+ "eval_samples_per_second": 34.435,
687
+ "eval_steps_per_second": 1.088,
688
  "step": 400
689
  },
690
  {
691
+ "epoch": 1.943127962085308,
692
+ "grad_norm": 10.263641095491934,
693
+ "learning_rate": 2.9165802910033603e-07,
694
+ "logits/chosen": 0.5565508604049683,
695
+ "logits/rejected": 0.5877315402030945,
696
+ "logps/chosen": -328.7551574707031,
697
+ "logps/rejected": -364.5121154785156,
698
+ "loss": 0.4644,
699
+ "rewards/accuracies": 0.78125,
700
+ "rewards/chosen": 1.852020502090454,
701
+ "rewards/margins": 6.0383710861206055,
702
+ "rewards/rejected": -4.186350345611572,
703
  "step": 410
704
  },
705
  {
706
+ "epoch": 1.9905213270142181,
707
+ "grad_norm": 8.889403142715599,
708
+ "learning_rate": 2.686845666060415e-07,
709
+ "logits/chosen": 0.5102426409721375,
710
+ "logits/rejected": 0.43454083800315857,
711
+ "logps/chosen": -271.08160400390625,
712
+ "logps/rejected": -369.26458740234375,
713
+ "loss": 0.461,
714
+ "rewards/accuracies": 0.793749988079071,
715
+ "rewards/chosen": 1.6376615762710571,
716
+ "rewards/margins": 7.588493347167969,
717
+ "rewards/rejected": -5.950831413269043,
718
  "step": 420
719
  },
720
  {
721
+ "epoch": 2.037914691943128,
722
+ "grad_norm": 7.4495856256114195,
723
+ "learning_rate": 2.4631654655618287e-07,
724
+ "logits/chosen": 0.37354058027267456,
725
+ "logits/rejected": 0.4436867833137512,
726
+ "logps/chosen": -310.15802001953125,
727
+ "logps/rejected": -382.03253173828125,
728
+ "loss": 0.3945,
729
+ "rewards/accuracies": 0.8125,
730
+ "rewards/chosen": 1.8288238048553467,
731
+ "rewards/margins": 7.114483833312988,
732
+ "rewards/rejected": -5.2856597900390625,
733
  "step": 430
734
  },
735
  {
736
+ "epoch": 2.085308056872038,
737
+ "grad_norm": 8.829254132221473,
738
+ "learning_rate": 2.2461251475783155e-07,
739
+ "logits/chosen": 0.5162326693534851,
740
+ "logits/rejected": 0.4021889567375183,
741
+ "logps/chosen": -288.923095703125,
742
+ "logps/rejected": -389.34979248046875,
743
+ "loss": 0.3748,
744
+ "rewards/accuracies": 0.8500000238418579,
745
+ "rewards/chosen": 1.8741111755371094,
746
+ "rewards/margins": 7.6665802001953125,
747
+ "rewards/rejected": -5.792468547821045,
748
  "step": 440
749
  },
750
  {
751
+ "epoch": 2.132701421800948,
752
+ "grad_norm": 8.156529944948277,
753
+ "learning_rate": 2.0362927910258986e-07,
754
+ "logits/chosen": 0.45688456296920776,
755
+ "logits/rejected": 0.4526469111442566,
756
+ "logps/chosen": -253.50131225585938,
757
+ "logps/rejected": -349.1957702636719,
758
+ "loss": 0.4147,
759
+ "rewards/accuracies": 0.875,
760
+ "rewards/chosen": 2.0875327587127686,
761
+ "rewards/margins": 8.09435749053955,
762
+ "rewards/rejected": -6.006823539733887,
763
  "step": 450
764
  },
765
  {
766
+ "epoch": 2.1800947867298577,
767
+ "grad_norm": 7.824692642426332,
768
+ "learning_rate": 1.8342176087824573e-07,
769
+ "logits/chosen": 0.4325633645057678,
770
+ "logits/rejected": 0.3565566837787628,
771
+ "logps/chosen": -284.46624755859375,
772
+ "logps/rejected": -372.12091064453125,
773
+ "loss": 0.3992,
774
+ "rewards/accuracies": 0.8500000238418579,
775
+ "rewards/chosen": 1.8221031427383423,
776
+ "rewards/margins": 7.619426727294922,
777
+ "rewards/rejected": -5.797322750091553,
778
  "step": 460
779
  },
780
  {
781
+ "epoch": 2.227488151658768,
782
+ "grad_norm": 13.407256371457692,
783
+ "learning_rate": 1.6404285101840565e-07,
784
+ "logits/chosen": 0.3386808931827545,
785
+ "logits/rejected": 0.47734910249710083,
786
+ "logps/chosen": -331.7251892089844,
787
+ "logps/rejected": -367.4866638183594,
788
+ "loss": 0.3822,
789
+ "rewards/accuracies": 0.8374999761581421,
790
+ "rewards/chosen": 1.9130542278289795,
791
+ "rewards/margins": 7.692631721496582,
792
+ "rewards/rejected": -5.779577732086182,
793
  "step": 470
794
  },
795
  {
796
+ "epoch": 2.2748815165876777,
797
+ "grad_norm": 10.86707059625683,
798
+ "learning_rate": 1.455432716663517e-07,
799
+ "logits/chosen": 0.36686116456985474,
800
+ "logits/rejected": 0.48829737305641174,
801
+ "logps/chosen": -285.77008056640625,
802
+ "logps/rejected": -328.3174743652344,
803
+ "loss": 0.4089,
804
+ "rewards/accuracies": 0.856249988079071,
805
+ "rewards/chosen": 1.7794748544692993,
806
+ "rewards/margins": 6.214818477630615,
807
+ "rewards/rejected": -4.435343265533447,
808
  "step": 480
809
  },
810
  {
811
+ "epoch": 2.322274881516588,
812
+ "grad_norm": 9.830177502454013,
813
+ "learning_rate": 1.2797144341546883e-07,
814
+ "logits/chosen": 0.3986554741859436,
815
+ "logits/rejected": 0.44396382570266724,
816
+ "logps/chosen": -321.13818359375,
817
+ "logps/rejected": -390.934326171875,
818
+ "loss": 0.4219,
819
+ "rewards/accuracies": 0.8187500238418579,
820
+ "rewards/chosen": 1.6029850244522095,
821
+ "rewards/margins": 7.5643768310546875,
822
+ "rewards/rejected": -5.961391448974609,
823
  "step": 490
824
  },
825
  {
826
+ "epoch": 2.3696682464454977,
827
+ "grad_norm": 9.42905977432162,
828
+ "learning_rate": 1.1137335857372043e-07,
829
+ "logits/chosen": 0.4437794089317322,
830
+ "logits/rejected": 0.42870789766311646,
831
+ "logps/chosen": -287.81451416015625,
832
+ "logps/rejected": -374.01873779296875,
833
+ "loss": 0.4028,
834
+ "rewards/accuracies": 0.856249988079071,
835
+ "rewards/chosen": 2.1330111026763916,
836
+ "rewards/margins": 7.767390251159668,
837
+ "rewards/rejected": -5.6343793869018555,
838
  "step": 500
839
  },
840
  {
841
+ "epoch": 2.3696682464454977,
842
+ "eval_logits/chosen": 0.3730663061141968,
843
+ "eval_logits/rejected": 0.4475269019603729,
844
+ "eval_logps/chosen": -338.3392028808594,
845
+ "eval_logps/rejected": -370.232666015625,
846
+ "eval_loss": 0.6289177536964417,
847
+ "eval_rewards/accuracies": 0.7405063509941101,
848
+ "eval_rewards/chosen": 0.7478683590888977,
849
+ "eval_rewards/margins": 4.585729122161865,
850
+ "eval_rewards/rejected": -3.8378612995147705,
851
+ "eval_runtime": 73.3012,
852
+ "eval_samples_per_second": 34.106,
853
+ "eval_steps_per_second": 1.078,
854
  "step": 500
855
  },
856
  {
857
+ "epoch": 2.4170616113744074,
858
+ "grad_norm": 10.06462647313331,
859
+ "learning_rate": 9.579246078389403e-08,
860
+ "logits/chosen": 0.5295278429985046,
861
+ "logits/rejected": 0.43623122572898865,
862
+ "logps/chosen": -258.68963623046875,
863
+ "logps/rejected": -339.7721252441406,
864
+ "loss": 0.3858,
865
+ "rewards/accuracies": 0.84375,
866
+ "rewards/chosen": 1.592254400253296,
867
+ "rewards/margins": 7.2217698097229,
868
+ "rewards/rejected": -5.629514694213867,
869
  "step": 510
870
  },
871
  {
872
+ "epoch": 2.4644549763033177,
873
+ "grad_norm": 9.022052721765009,
874
+ "learning_rate": 8.126953131469228e-08,
875
+ "logits/chosen": 0.44106584787368774,
876
+ "logits/rejected": 0.39466392993927,
877
+ "logps/chosen": -303.3637390136719,
878
+ "logps/rejected": -370.74114990234375,
879
+ "loss": 0.4143,
880
+ "rewards/accuracies": 0.875,
881
+ "rewards/chosen": 1.8263496160507202,
882
+ "rewards/margins": 7.823184013366699,
883
+ "rewards/rejected": -5.996834754943848,
884
  "step": 520
885
  },
886
  {
887
+ "epoch": 2.5118483412322274,
888
+ "grad_norm": 8.021054640921763,
889
+ "learning_rate": 6.784258232029472e-08,
890
+ "logits/chosen": 0.3634105622768402,
891
+ "logits/rejected": 0.3859165608882904,
892
+ "logps/chosen": -307.2467041015625,
893
+ "logps/rejected": -376.1995849609375,
894
+ "loss": 0.3822,
895
+ "rewards/accuracies": 0.8687499761581421,
896
+ "rewards/chosen": 2.497091770172119,
897
+ "rewards/margins": 7.9943437576293945,
898
+ "rewards/rejected": -5.497252464294434,
899
  "step": 530
900
  },
901
  {
902
+ "epoch": 2.5592417061611377,
903
+ "grad_norm": 10.013425700067337,
904
+ "learning_rate": 5.554675734776665e-08,
905
+ "logits/chosen": 0.5024563074111938,
906
+ "logits/rejected": 0.5056658387184143,
907
+ "logps/chosen": -276.1619567871094,
908
+ "logps/rejected": -368.4447021484375,
909
+ "loss": 0.4035,
910
+ "rewards/accuracies": 0.831250011920929,
911
+ "rewards/chosen": 1.820339560508728,
912
+ "rewards/margins": 8.141976356506348,
913
+ "rewards/rejected": -6.321636199951172,
914
  "step": 540
915
  },
916
  {
917
+ "epoch": 2.6066350710900474,
918
+ "grad_norm": 9.209955480260117,
919
+ "learning_rate": 4.4414239352730867e-08,
920
+ "logits/chosen": 0.42310771346092224,
921
+ "logits/rejected": 0.48689502477645874,
922
+ "logps/chosen": -313.3210754394531,
923
+ "logps/rejected": -351.4210205078125,
924
+ "loss": 0.406,
925
+ "rewards/accuracies": 0.893750011920929,
926
+ "rewards/chosen": 2.1306679248809814,
927
+ "rewards/margins": 7.7195258140563965,
928
+ "rewards/rejected": -5.588858127593994,
929
  "step": 550
930
  },
931
  {
932
+ "epoch": 2.654028436018957,
933
+ "grad_norm": 9.959818332708023,
934
+ "learning_rate": 3.447416646405632e-08,
935
+ "logits/chosen": 0.5685544610023499,
936
+ "logits/rejected": 0.5256290435791016,
937
+ "logps/chosen": -287.7798156738281,
938
+ "logps/rejected": -380.33685302734375,
939
+ "loss": 0.4009,
940
+ "rewards/accuracies": 0.893750011920929,
941
+ "rewards/chosen": 1.8459497690200806,
942
+ "rewards/margins": 7.295513153076172,
943
+ "rewards/rejected": -5.449563503265381,
944
  "step": 560
945
  },
946
  {
947
+ "epoch": 2.7014218009478674,
948
+ "grad_norm": 8.593809820816018,
949
+ "learning_rate": 2.575255571804391e-08,
950
+ "logits/chosen": 0.41258078813552856,
951
+ "logits/rejected": 0.4132450222969055,
952
+ "logps/chosen": -287.94476318359375,
953
+ "logps/rejected": -369.03656005859375,
954
+ "loss": 0.4,
955
+ "rewards/accuracies": 0.8125,
956
+ "rewards/chosen": 1.5231783390045166,
957
+ "rewards/margins": 7.392594814300537,
958
+ "rewards/rejected": -5.8694167137146,
959
  "step": 570
960
  },
961
  {
962
+ "epoch": 2.748815165876777,
963
+ "grad_norm": 9.646946039027634,
964
+ "learning_rate": 1.8272234961725084e-08,
965
+ "logits/chosen": 0.48128992319107056,
966
+ "logits/rejected": 0.4887717366218567,
967
+ "logps/chosen": -303.7729797363281,
968
+ "logps/rejected": -359.5372314453125,
969
+ "loss": 0.3912,
970
+ "rewards/accuracies": 0.90625,
971
+ "rewards/chosen": 2.173060655593872,
972
+ "rewards/margins": 8.012847900390625,
973
+ "rewards/rejected": -5.839787006378174,
974
+ "step": 580
975
+ },
976
+ {
977
+ "epoch": 2.7962085308056874,
978
+ "grad_norm": 11.09612482230785,
979
+ "learning_rate": 1.2052783103508102e-08,
980
+ "logits/chosen": 0.5081132650375366,
981
+ "logits/rejected": 0.5602059364318848,
982
+ "logps/chosen": -270.61737060546875,
983
+ "logps/rejected": -335.85577392578125,
984
+ "loss": 0.3991,
985
+ "rewards/accuracies": 0.8125,
986
+ "rewards/chosen": 1.619431495666504,
987
+ "rewards/margins": 6.8268561363220215,
988
+ "rewards/rejected": -5.207424163818359,
989
+ "step": 590
990
+ },
991
+ {
992
+ "epoch": 2.843601895734597,
993
+ "grad_norm": 8.273064520857158,
994
+ "learning_rate": 7.1104788675613315e-09,
995
+ "logits/chosen": 0.32943224906921387,
996
+ "logits/rejected": 0.4085375666618347,
997
+ "logps/chosen": -288.88995361328125,
998
+ "logps/rejected": -364.12860107421875,
999
+ "loss": 0.4029,
1000
+ "rewards/accuracies": 0.893750011920929,
1001
+ "rewards/chosen": 2.0637223720550537,
1002
+ "rewards/margins": 7.937726020812988,
1003
+ "rewards/rejected": -5.874002933502197,
1004
+ "step": 600
1005
+ },
1006
+ {
1007
+ "epoch": 2.843601895734597,
1008
+ "eval_logits/chosen": 0.38198891282081604,
1009
+ "eval_logits/rejected": 0.45711585879325867,
1010
+ "eval_logps/chosen": -337.3143310546875,
1011
+ "eval_logps/rejected": -368.9125061035156,
1012
+ "eval_loss": 0.6283919215202332,
1013
+ "eval_rewards/accuracies": 0.7436708807945251,
1014
+ "eval_rewards/chosen": 0.8503568768501282,
1015
+ "eval_rewards/margins": 4.556199073791504,
1016
+ "eval_rewards/rejected": -3.7058422565460205,
1017
+ "eval_runtime": 73.7958,
1018
+ "eval_samples_per_second": 33.877,
1019
+ "eval_steps_per_second": 1.071,
1020
+ "step": 600
1021
+ },
1022
+ {
1023
+ "epoch": 2.890995260663507,
1024
+ "grad_norm": 9.238913123295514,
1025
+ "learning_rate": 3.4582581860612137e-09,
1026
+ "logits/chosen": 0.43385523557662964,
1027
+ "logits/rejected": 0.43230634927749634,
1028
+ "logps/chosen": -292.0911865234375,
1029
+ "logps/rejected": -353.61590576171875,
1030
+ "loss": 0.3884,
1031
+ "rewards/accuracies": 0.831250011920929,
1032
+ "rewards/chosen": 1.989989995956421,
1033
+ "rewards/margins": 6.724064826965332,
1034
+ "rewards/rejected": -4.734074115753174,
1035
+ "step": 610
1036
+ },
1037
+ {
1038
+ "epoch": 2.938388625592417,
1039
+ "grad_norm": 9.407237089972764,
1040
+ "learning_rate": 1.1056803408273085e-09,
1041
+ "logits/chosen": 0.48387131094932556,
1042
+ "logits/rejected": 0.4587581753730774,
1043
+ "logps/chosen": -282.6869201660156,
1044
+ "logps/rejected": -344.5205078125,
1045
+ "loss": 0.4089,
1046
+ "rewards/accuracies": 0.8374999761581421,
1047
+ "rewards/chosen": 1.806133508682251,
1048
+ "rewards/margins": 7.467283725738525,
1049
+ "rewards/rejected": -5.661149978637695,
1050
+ "step": 620
1051
+ },
1052
+ {
1053
+ "epoch": 2.985781990521327,
1054
+ "grad_norm": 8.481488205996529,
1055
+ "learning_rate": 5.890294296428955e-11,
1056
+ "logits/chosen": 0.44664233922958374,
1057
+ "logits/rejected": 0.5504810810089111,
1058
+ "logps/chosen": -319.47119140625,
1059
+ "logps/rejected": -348.36090087890625,
1060
+ "loss": 0.3848,
1061
+ "rewards/accuracies": 0.862500011920929,
1062
+ "rewards/chosen": 2.1828243732452393,
1063
+ "rewards/margins": 6.884246826171875,
1064
+ "rewards/rejected": -4.701422214508057,
1065
+ "step": 630
1066
+ },
1067
+ {
1068
+ "epoch": 3.0,
1069
+ "step": 633,
1070
  "total_flos": 0.0,
1071
+ "train_loss": 0.5009220597491634,
1072
+ "train_runtime": 6227.6413,
1073
+ "train_samples_per_second": 13.002,
1074
+ "train_steps_per_second": 0.102
1075
  }
1076
  ],
1077
  "logging_steps": 10,
1078
+ "max_steps": 633,
1079
  "num_input_tokens_seen": 0,
1080
+ "num_train_epochs": 3,
1081
  "save_steps": 500,
1082
  "stateful_callbacks": {
1083
  "TrainerControl": {
 
1085
  "should_epoch_stop": false,
1086
  "should_evaluate": false,
1087
  "should_log": false,
1088
+ "should_save": true,
1089
+ "should_training_stop": true
1090
  },
1091
  "attributes": {}
1092
  }