yakazimir committed on
Commit
42a6f26
1 Parent(s): 83a7abc

Model save

Browse files
Files changed (5) hide show
  1. README.md +75 -0
  2. all_results.json +9 -0
  3. generation_config.json +12 -0
  4. train_results.json +9 -0
  5. trainer_state.json +1515 -0
README.md ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: transformers
3
+ license: llama3
4
+ base_model: meta-llama/Meta-Llama-3-8B-Instruct
5
+ tags:
6
+ - trl
7
+ - simpo
8
+ - generated_from_trainer
9
+ model-index:
10
+ - name: llama3_orpo_best_entropy
11
+ results: []
12
+ ---
13
+
14
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
15
+ should probably proofread and complete it, then remove this comment. -->
16
+
17
+ # llama3_orpo_best_entropy
18
+
19
+ This model is a fine-tuned version of [meta-llama/Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) on an unknown dataset.
20
+ It achieves the following results on the evaluation set:
21
+ - Loss: 2.5608
22
+ - Rewards/chosen: -12.8797
23
+ - Rewards/rejected: -17.3972
24
+ - Rewards/accuracies: 0.8072
25
+ - Rewards/margins: 4.5175
26
+ - Logps/rejected: -1.7397
27
+ - Logps/chosen: -1.2880
28
+ - Logits/rejected: -1.3473
29
+ - Logits/chosen: -1.3813
30
+ - Semantic Entropy: 0.7719
31
+
32
+ ## Model description
33
+
34
+ More information needed
35
+
36
+ ## Intended uses & limitations
37
+
38
+ More information needed
39
+
40
+ ## Training and evaluation data
41
+
42
+ More information needed
43
+
44
+ ## Training procedure
45
+
46
+ ### Training hyperparameters
47
+
48
+ The following hyperparameters were used during training:
49
+ - learning_rate: 1e-06
50
+ - train_batch_size: 2
51
+ - eval_batch_size: 4
52
+ - seed: 42
53
+ - distributed_type: multi-GPU
54
+ - num_devices: 4
55
+ - gradient_accumulation_steps: 16
56
+ - total_train_batch_size: 128
57
+ - total_eval_batch_size: 16
58
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
59
+ - lr_scheduler_type: cosine
60
+ - lr_scheduler_warmup_ratio: 0.1
61
+ - num_epochs: 1.0
62
+
63
+ ### Training results
64
+
65
+ | Training Loss | Epoch | Step | Validation Loss | Rewards/chosen | Rewards/rejected | Rewards/accuracies | Rewards/margins | Logps/rejected | Logps/chosen | Logits/rejected | Logits/chosen | Semantic Entropy |
66
+ |:-------------:|:------:|:----:|:---------------:|:--------------:|:----------------:|:------------------:|:---------------:|:--------------:|:------------:|:---------------:|:-------------:|:----------------:|
67
+ | 2.3262 | 0.8743 | 400 | 2.5608 | -12.8797 | -17.3972 | 0.8072 | 4.5175 | -1.7397 | -1.2880 | -1.3473 | -1.3813 | 0.7719 |
68
+
69
+
70
+ ### Framework versions
71
+
72
+ - Transformers 4.44.2
73
+ - Pytorch 2.2.2+cu121
74
+ - Datasets 2.18.0
75
+ - Tokenizers 0.19.1
all_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 0.9989071038251366,
3
+ "total_flos": 0.0,
4
+ "train_loss": 3.4314852449513107,
5
+ "train_runtime": 5934.8281,
6
+ "train_samples": 58558,
7
+ "train_samples_per_second": 9.867,
8
+ "train_steps_per_second": 0.077
9
+ }
generation_config.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 128000,
3
+ "do_sample": true,
4
+ "eos_token_id": [
5
+ 128001,
6
+ 128009
7
+ ],
8
+ "max_length": 4096,
9
+ "temperature": 0.6,
10
+ "top_p": 0.9,
11
+ "transformers_version": "4.44.2"
12
+ }
train_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 0.9989071038251366,
3
+ "total_flos": 0.0,
4
+ "train_loss": 3.4314852449513107,
5
+ "train_runtime": 5934.8281,
6
+ "train_samples": 58558,
7
+ "train_samples_per_second": 9.867,
8
+ "train_steps_per_second": 0.077
9
+ }
trainer_state.json ADDED
@@ -0,0 +1,1515 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 0.9989071038251366,
5
+ "eval_steps": 400,
6
+ "global_step": 457,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.01092896174863388,
13
+ "grad_norm": 358.5620562461214,
14
+ "learning_rate": 1.0869565217391303e-07,
15
+ "logits/chosen": -1.0113575458526611,
16
+ "logits/rejected": -1.0064939260482788,
17
+ "logps/chosen": -0.2803983986377716,
18
+ "logps/rejected": -0.2860395908355713,
19
+ "loss": 5.2315,
20
+ "rewards/accuracies": 0.53125,
21
+ "rewards/chosen": -2.8039839267730713,
22
+ "rewards/margins": 0.056411754339933395,
23
+ "rewards/rejected": -2.860395669937134,
24
+ "semantic_entropy": 0.7518940567970276,
25
+ "step": 5
26
+ },
27
+ {
28
+ "epoch": 0.02185792349726776,
29
+ "grad_norm": 233.11198507849488,
30
+ "learning_rate": 2.1739130434782607e-07,
31
+ "logits/chosen": -1.056563138961792,
32
+ "logits/rejected": -1.0053507089614868,
33
+ "logps/chosen": -0.2568749487400055,
34
+ "logps/rejected": -0.27021342515945435,
35
+ "loss": 5.206,
36
+ "rewards/accuracies": 0.5375000238418579,
37
+ "rewards/chosen": -2.568748950958252,
38
+ "rewards/margins": 0.13338527083396912,
39
+ "rewards/rejected": -2.702134370803833,
40
+ "semantic_entropy": 0.7094504237174988,
41
+ "step": 10
42
+ },
43
+ {
44
+ "epoch": 0.03278688524590164,
45
+ "grad_norm": 210.79380420075688,
46
+ "learning_rate": 3.260869565217391e-07,
47
+ "logits/chosen": -1.0091139078140259,
48
+ "logits/rejected": -0.9631060361862183,
49
+ "logps/chosen": -0.2674282491207123,
50
+ "logps/rejected": -0.27336788177490234,
51
+ "loss": 5.1278,
52
+ "rewards/accuracies": 0.512499988079071,
53
+ "rewards/chosen": -2.6742825508117676,
54
+ "rewards/margins": 0.05939612537622452,
55
+ "rewards/rejected": -2.7336785793304443,
56
+ "semantic_entropy": 0.7274739742279053,
57
+ "step": 15
58
+ },
59
+ {
60
+ "epoch": 0.04371584699453552,
61
+ "grad_norm": 375.0299222500816,
62
+ "learning_rate": 4.3478260869565214e-07,
63
+ "logits/chosen": -0.9487798810005188,
64
+ "logits/rejected": -0.8998070955276489,
65
+ "logps/chosen": -0.2723819613456726,
66
+ "logps/rejected": -0.28497135639190674,
67
+ "loss": 5.222,
68
+ "rewards/accuracies": 0.5687500238418579,
69
+ "rewards/chosen": -2.7238194942474365,
70
+ "rewards/margins": 0.1258939504623413,
71
+ "rewards/rejected": -2.8497138023376465,
72
+ "semantic_entropy": 0.7452942132949829,
73
+ "step": 20
74
+ },
75
+ {
76
+ "epoch": 0.0546448087431694,
77
+ "grad_norm": 299.8105605992341,
78
+ "learning_rate": 5.434782608695652e-07,
79
+ "logits/chosen": -0.9505411386489868,
80
+ "logits/rejected": -0.8759678602218628,
81
+ "logps/chosen": -0.27557066082954407,
82
+ "logps/rejected": -0.29382389783859253,
83
+ "loss": 5.1223,
84
+ "rewards/accuracies": 0.5874999761581421,
85
+ "rewards/chosen": -2.755706548690796,
86
+ "rewards/margins": 0.1825324147939682,
87
+ "rewards/rejected": -2.9382388591766357,
88
+ "semantic_entropy": 0.7546485662460327,
89
+ "step": 25
90
+ },
91
+ {
92
+ "epoch": 0.06557377049180328,
93
+ "grad_norm": 300.511189676041,
94
+ "learning_rate": 6.521739130434782e-07,
95
+ "logits/chosen": -1.0557540655136108,
96
+ "logits/rejected": -0.990186333656311,
97
+ "logps/chosen": -0.267598032951355,
98
+ "logps/rejected": -0.28400346636772156,
99
+ "loss": 5.1993,
100
+ "rewards/accuracies": 0.518750011920929,
101
+ "rewards/chosen": -2.6759800910949707,
102
+ "rewards/margins": 0.1640542596578598,
103
+ "rewards/rejected": -2.8400347232818604,
104
+ "semantic_entropy": 0.7248786091804504,
105
+ "step": 30
106
+ },
107
+ {
108
+ "epoch": 0.07650273224043716,
109
+ "grad_norm": 186.14457687250982,
110
+ "learning_rate": 7.608695652173913e-07,
111
+ "logits/chosen": -1.0076847076416016,
112
+ "logits/rejected": -0.9405019879341125,
113
+ "logps/chosen": -0.2580474615097046,
114
+ "logps/rejected": -0.279694139957428,
115
+ "loss": 5.0812,
116
+ "rewards/accuracies": 0.4937500059604645,
117
+ "rewards/chosen": -2.580474615097046,
118
+ "rewards/margins": 0.2164669930934906,
119
+ "rewards/rejected": -2.7969415187835693,
120
+ "semantic_entropy": 0.7201561331748962,
121
+ "step": 35
122
+ },
123
+ {
124
+ "epoch": 0.08743169398907104,
125
+ "grad_norm": 223.67562662995005,
126
+ "learning_rate": 8.695652173913043e-07,
127
+ "logits/chosen": -0.9655061960220337,
128
+ "logits/rejected": -0.9045132398605347,
129
+ "logps/chosen": -0.2820321023464203,
130
+ "logps/rejected": -0.2990434765815735,
131
+ "loss": 5.3116,
132
+ "rewards/accuracies": 0.518750011920929,
133
+ "rewards/chosen": -2.820321559906006,
134
+ "rewards/margins": 0.17011362314224243,
135
+ "rewards/rejected": -2.9904348850250244,
136
+ "semantic_entropy": 0.7597802877426147,
137
+ "step": 40
138
+ },
139
+ {
140
+ "epoch": 0.09836065573770492,
141
+ "grad_norm": 131.00541589477967,
142
+ "learning_rate": 9.782608695652173e-07,
143
+ "logits/chosen": -1.0179851055145264,
144
+ "logits/rejected": -0.9359539151191711,
145
+ "logps/chosen": -0.2855134606361389,
146
+ "logps/rejected": -0.3078162968158722,
147
+ "loss": 4.919,
148
+ "rewards/accuracies": 0.4937500059604645,
149
+ "rewards/chosen": -2.8551342487335205,
150
+ "rewards/margins": 0.22302868962287903,
151
+ "rewards/rejected": -3.078162908554077,
152
+ "semantic_entropy": 0.7599790096282959,
153
+ "step": 45
154
+ },
155
+ {
156
+ "epoch": 0.1092896174863388,
157
+ "grad_norm": 360.5211336376954,
158
+ "learning_rate": 9.997663088532014e-07,
159
+ "logits/chosen": -0.9701619148254395,
160
+ "logits/rejected": -0.887865424156189,
161
+ "logps/chosen": -0.28138467669487,
162
+ "logps/rejected": -0.28913217782974243,
163
+ "loss": 5.1014,
164
+ "rewards/accuracies": 0.48750001192092896,
165
+ "rewards/chosen": -2.813847064971924,
166
+ "rewards/margins": 0.07747481018304825,
167
+ "rewards/rejected": -2.891321897506714,
168
+ "semantic_entropy": 0.7510823607444763,
169
+ "step": 50
170
+ },
171
+ {
172
+ "epoch": 0.12021857923497267,
173
+ "grad_norm": 162.42445549849919,
174
+ "learning_rate": 9.98817312944725e-07,
175
+ "logits/chosen": -1.0028311014175415,
176
+ "logits/rejected": -0.8850187063217163,
177
+ "logps/chosen": -0.27895885705947876,
178
+ "logps/rejected": -0.3143305778503418,
179
+ "loss": 4.8519,
180
+ "rewards/accuracies": 0.5562499761581421,
181
+ "rewards/chosen": -2.7895889282226562,
182
+ "rewards/margins": 0.3537173271179199,
183
+ "rewards/rejected": -3.143306255340576,
184
+ "semantic_entropy": 0.7608937621116638,
185
+ "step": 55
186
+ },
187
+ {
188
+ "epoch": 0.13114754098360656,
189
+ "grad_norm": 120.127890644715,
190
+ "learning_rate": 9.971397915250336e-07,
191
+ "logits/chosen": -1.0246554613113403,
192
+ "logits/rejected": -0.9786936640739441,
193
+ "logps/chosen": -0.26926669478416443,
194
+ "logps/rejected": -0.3113505244255066,
195
+ "loss": 4.6493,
196
+ "rewards/accuracies": 0.612500011920929,
197
+ "rewards/chosen": -2.692667245864868,
198
+ "rewards/margins": 0.42083826661109924,
199
+ "rewards/rejected": -3.1135053634643555,
200
+ "semantic_entropy": 0.7576217651367188,
201
+ "step": 60
202
+ },
203
+ {
204
+ "epoch": 0.14207650273224043,
205
+ "grad_norm": 214.34275301709462,
206
+ "learning_rate": 9.94736194623663e-07,
207
+ "logits/chosen": -0.9859918355941772,
208
+ "logits/rejected": -0.9181090593338013,
209
+ "logps/chosen": -0.3084973096847534,
210
+ "logps/rejected": -0.3399081826210022,
211
+ "loss": 4.9942,
212
+ "rewards/accuracies": 0.550000011920929,
213
+ "rewards/chosen": -3.084973096847534,
214
+ "rewards/margins": 0.31410855054855347,
215
+ "rewards/rejected": -3.3990814685821533,
216
+ "semantic_entropy": 0.8060176968574524,
217
+ "step": 65
218
+ },
219
+ {
220
+ "epoch": 0.15300546448087432,
221
+ "grad_norm": 544.9393207205336,
222
+ "learning_rate": 9.916100327075037e-07,
223
+ "logits/chosen": -0.9521551132202148,
224
+ "logits/rejected": -0.9334642291069031,
225
+ "logps/chosen": -0.29637694358825684,
226
+ "logps/rejected": -0.3240143656730652,
227
+ "loss": 4.7857,
228
+ "rewards/accuracies": 0.5687500238418579,
229
+ "rewards/chosen": -2.9637694358825684,
230
+ "rewards/margins": 0.27637460827827454,
231
+ "rewards/rejected": -3.2401435375213623,
232
+ "semantic_entropy": 0.7712014317512512,
233
+ "step": 70
234
+ },
235
+ {
236
+ "epoch": 0.16393442622950818,
237
+ "grad_norm": 123.2584567983735,
238
+ "learning_rate": 9.877658715537428e-07,
239
+ "logits/chosen": -0.9457874298095703,
240
+ "logits/rejected": -0.9289388656616211,
241
+ "logps/chosen": -0.31344500184059143,
242
+ "logps/rejected": -0.3473803997039795,
243
+ "loss": 4.8544,
244
+ "rewards/accuracies": 0.625,
245
+ "rewards/chosen": -3.1344501972198486,
246
+ "rewards/margins": 0.33935409784317017,
247
+ "rewards/rejected": -3.473804473876953,
248
+ "semantic_entropy": 0.8020299077033997,
249
+ "step": 75
250
+ },
251
+ {
252
+ "epoch": 0.17486338797814208,
253
+ "grad_norm": 136.55956527115663,
254
+ "learning_rate": 9.832093255815216e-07,
255
+ "logits/chosen": -0.9409273266792297,
256
+ "logits/rejected": -0.8797443509101868,
257
+ "logps/chosen": -0.3109641373157501,
258
+ "logps/rejected": -0.3346250057220459,
259
+ "loss": 4.7853,
260
+ "rewards/accuracies": 0.606249988079071,
261
+ "rewards/chosen": -3.1096413135528564,
262
+ "rewards/margins": 0.23660895228385925,
263
+ "rewards/rejected": -3.346250534057617,
264
+ "semantic_entropy": 0.7786868810653687,
265
+ "step": 80
266
+ },
267
+ {
268
+ "epoch": 0.18579234972677597,
269
+ "grad_norm": 202.99615488786125,
270
+ "learning_rate": 9.779470496520441e-07,
271
+ "logits/chosen": -0.9390329122543335,
272
+ "logits/rejected": -0.89063560962677,
273
+ "logps/chosen": -0.31133827567100525,
274
+ "logps/rejected": -0.3710102438926697,
275
+ "loss": 4.6394,
276
+ "rewards/accuracies": 0.643750011920929,
277
+ "rewards/chosen": -3.1133828163146973,
278
+ "rewards/margins": 0.5967196226119995,
279
+ "rewards/rejected": -3.7101027965545654,
280
+ "semantic_entropy": 0.8020746111869812,
281
+ "step": 85
282
+ },
283
+ {
284
+ "epoch": 0.19672131147540983,
285
+ "grad_norm": 147.8274369377715,
286
+ "learning_rate": 9.719867293491144e-07,
287
+ "logits/chosen": -0.9997242093086243,
288
+ "logits/rejected": -0.9218411445617676,
289
+ "logps/chosen": -0.33917468786239624,
290
+ "logps/rejected": -0.3771332800388336,
291
+ "loss": 4.7466,
292
+ "rewards/accuracies": 0.581250011920929,
293
+ "rewards/chosen": -3.391746997833252,
294
+ "rewards/margins": 0.37958595156669617,
295
+ "rewards/rejected": -3.7713329792022705,
296
+ "semantic_entropy": 0.8518031239509583,
297
+ "step": 90
298
+ },
299
+ {
300
+ "epoch": 0.20765027322404372,
301
+ "grad_norm": 150.5965714109422,
302
+ "learning_rate": 9.653370697542987e-07,
303
+ "logits/chosen": -0.949033260345459,
304
+ "logits/rejected": -0.9504354596138,
305
+ "logps/chosen": -0.34006524085998535,
306
+ "logps/rejected": -0.3670746684074402,
307
+ "loss": 4.5023,
308
+ "rewards/accuracies": 0.574999988079071,
309
+ "rewards/chosen": -3.4006524085998535,
310
+ "rewards/margins": 0.27009397745132446,
311
+ "rewards/rejected": -3.6707465648651123,
312
+ "semantic_entropy": 0.8378095626831055,
313
+ "step": 95
314
+ },
315
+ {
316
+ "epoch": 0.2185792349726776,
317
+ "grad_norm": 205.55603229224332,
318
+ "learning_rate": 9.580077827331037e-07,
319
+ "logits/chosen": -0.9659525752067566,
320
+ "logits/rejected": -0.9214147329330444,
321
+ "logps/chosen": -0.37815189361572266,
322
+ "logps/rejected": -0.4408392012119293,
323
+ "loss": 4.4615,
324
+ "rewards/accuracies": 0.637499988079071,
325
+ "rewards/chosen": -3.7815189361572266,
326
+ "rewards/margins": 0.6268730163574219,
327
+ "rewards/rejected": -4.408391952514648,
328
+ "semantic_entropy": 0.8833344578742981,
329
+ "step": 100
330
+ },
331
+ {
332
+ "epoch": 0.22950819672131148,
333
+ "grad_norm": 168.46156689250674,
334
+ "learning_rate": 9.500095727507419e-07,
335
+ "logits/chosen": -1.0187479257583618,
336
+ "logits/rejected": -0.9876689910888672,
337
+ "logps/chosen": -0.3568420708179474,
338
+ "logps/rejected": -0.4102093577384949,
339
+ "loss": 4.4882,
340
+ "rewards/accuracies": 0.625,
341
+ "rewards/chosen": -3.568420886993408,
342
+ "rewards/margins": 0.5336726903915405,
343
+ "rewards/rejected": -4.102093696594238,
344
+ "semantic_entropy": 0.8596333265304565,
345
+ "step": 105
346
+ },
347
+ {
348
+ "epoch": 0.24043715846994534,
349
+ "grad_norm": 163.05795657074705,
350
+ "learning_rate": 9.413541212382004e-07,
351
+ "logits/chosen": -1.013091802597046,
352
+ "logits/rejected": -0.9950464367866516,
353
+ "logps/chosen": -0.37098902463912964,
354
+ "logps/rejected": -0.45781344175338745,
355
+ "loss": 4.3876,
356
+ "rewards/accuracies": 0.6625000238418579,
357
+ "rewards/chosen": -3.709890842437744,
358
+ "rewards/margins": 0.8682438135147095,
359
+ "rewards/rejected": -4.578134059906006,
360
+ "semantic_entropy": 0.9028980135917664,
361
+ "step": 110
362
+ },
363
+ {
364
+ "epoch": 0.25136612021857924,
365
+ "grad_norm": 159.92608074418774,
366
+ "learning_rate": 9.320540695314438e-07,
367
+ "logits/chosen": -1.0206435918807983,
368
+ "logits/rejected": -0.9817934036254883,
369
+ "logps/chosen": -0.37152332067489624,
370
+ "logps/rejected": -0.4835759103298187,
371
+ "loss": 4.1862,
372
+ "rewards/accuracies": 0.699999988079071,
373
+ "rewards/chosen": -3.715233325958252,
374
+ "rewards/margins": 1.1205257177352905,
375
+ "rewards/rejected": -4.835759162902832,
376
+ "semantic_entropy": 0.8893026113510132,
377
+ "step": 115
378
+ },
379
+ {
380
+ "epoch": 0.26229508196721313,
381
+ "grad_norm": 126.53869344949423,
382
+ "learning_rate": 9.221230004086721e-07,
383
+ "logits/chosen": -1.0430892705917358,
384
+ "logits/rejected": -0.9731811285018921,
385
+ "logps/chosen": -0.38018742203712463,
386
+ "logps/rejected": -0.44267600774765015,
387
+ "loss": 4.3042,
388
+ "rewards/accuracies": 0.706250011920929,
389
+ "rewards/chosen": -3.8018736839294434,
390
+ "rewards/margins": 0.6248863935470581,
391
+ "rewards/rejected": -4.426759719848633,
392
+ "semantic_entropy": 0.9051868319511414,
393
+ "step": 120
394
+ },
395
+ {
396
+ "epoch": 0.273224043715847,
397
+ "grad_norm": 330.21792860177044,
398
+ "learning_rate": 9.11575418252596e-07,
399
+ "logits/chosen": -0.9472485780715942,
400
+ "logits/rejected": -0.9132539629936218,
401
+ "logps/chosen": -0.396454393863678,
402
+ "logps/rejected": -0.4733741283416748,
403
+ "loss": 4.0912,
404
+ "rewards/accuracies": 0.6937500238418579,
405
+ "rewards/chosen": -3.9645442962646484,
406
+ "rewards/margins": 0.7691973447799683,
407
+ "rewards/rejected": -4.733741283416748,
408
+ "semantic_entropy": 0.9098461866378784,
409
+ "step": 125
410
+ },
411
+ {
412
+ "epoch": 0.28415300546448086,
413
+ "grad_norm": 136.7218535491149,
414
+ "learning_rate": 9.004267278667031e-07,
415
+ "logits/chosen": -0.9847833514213562,
416
+ "logits/rejected": -0.9780336618423462,
417
+ "logps/chosen": -0.4162030816078186,
418
+ "logps/rejected": -0.5500742793083191,
419
+ "loss": 4.1012,
420
+ "rewards/accuracies": 0.731249988079071,
421
+ "rewards/chosen": -4.162030220031738,
422
+ "rewards/margins": 1.3387129306793213,
423
+ "rewards/rejected": -5.500743389129639,
424
+ "semantic_entropy": 0.9023580551147461,
425
+ "step": 130
426
+ },
427
+ {
428
+ "epoch": 0.29508196721311475,
429
+ "grad_norm": 114.59260330847327,
430
+ "learning_rate": 8.886932119764565e-07,
431
+ "logits/chosen": -1.018243670463562,
432
+ "logits/rejected": -0.934001624584198,
433
+ "logps/chosen": -0.40604203939437866,
434
+ "logps/rejected": -0.521537184715271,
435
+ "loss": 3.933,
436
+ "rewards/accuracies": 0.675000011920929,
437
+ "rewards/chosen": -4.060420036315918,
438
+ "rewards/margins": 1.154951810836792,
439
+ "rewards/rejected": -5.215372562408447,
440
+ "semantic_entropy": 0.9238536953926086,
441
+ "step": 135
442
+ },
443
+ {
444
+ "epoch": 0.30601092896174864,
445
+ "grad_norm": 151.22961611030988,
446
+ "learning_rate": 8.763920074482809e-07,
447
+ "logits/chosen": -1.02057683467865,
448
+ "logits/rejected": -0.9660438299179077,
449
+ "logps/chosen": -0.43255481123924255,
450
+ "logps/rejected": -0.5811036229133606,
451
+ "loss": 3.5202,
452
+ "rewards/accuracies": 0.71875,
453
+ "rewards/chosen": -4.32554817199707,
454
+ "rewards/margins": 1.485487699508667,
455
+ "rewards/rejected": -5.811036109924316,
456
+ "semantic_entropy": 0.9533751606941223,
457
+ "step": 140
458
+ },
459
+ {
460
+ "epoch": 0.31693989071038253,
461
+ "grad_norm": 171.9485188405326,
462
+ "learning_rate": 8.635410802610723e-07,
463
+ "logits/chosen": -1.0028008222579956,
464
+ "logits/rejected": -0.9838630557060242,
465
+ "logps/chosen": -0.4235480725765228,
466
+ "logps/rejected": -0.49707216024398804,
467
+ "loss": 3.7809,
468
+ "rewards/accuracies": 0.7250000238418579,
469
+ "rewards/chosen": -4.235480785369873,
470
+ "rewards/margins": 0.7352409958839417,
471
+ "rewards/rejected": -4.970722198486328,
472
+ "semantic_entropy": 0.9417489767074585,
473
+ "step": 145
474
+ },
475
+ {
476
+ "epoch": 0.32786885245901637,
477
+ "grad_norm": 135.1994551251087,
478
+ "learning_rate": 8.501591992667849e-07,
479
+ "logits/chosen": -1.0660603046417236,
480
+ "logits/rejected": -1.0330188274383545,
481
+ "logps/chosen": -0.458204448223114,
482
+ "logps/rejected": -0.6369711756706238,
483
+ "loss": 3.6343,
484
+ "rewards/accuracies": 0.762499988079071,
485
+ "rewards/chosen": -4.58204460144043,
486
+ "rewards/margins": 1.7876676321029663,
487
+ "rewards/rejected": -6.369711875915527,
488
+ "semantic_entropy": 0.9522278904914856,
489
+ "step": 150
490
+ },
491
+ {
492
+ "epoch": 0.33879781420765026,
493
+ "grad_norm": 123.13149743090504,
494
+ "learning_rate": 8.362659087784152e-07,
495
+ "logits/chosen": -0.9992470741271973,
496
+ "logits/rejected": -0.946983814239502,
497
+ "logps/chosen": -0.45955339074134827,
498
+ "logps/rejected": -0.5732384324073792,
499
+ "loss": 3.6764,
500
+ "rewards/accuracies": 0.6812499761581421,
501
+ "rewards/chosen": -4.5955328941345215,
502
+ "rewards/margins": 1.1368510723114014,
503
+ "rewards/rejected": -5.732384204864502,
504
+ "semantic_entropy": 0.9510468244552612,
505
+ "step": 155
506
+ },
507
+ {
508
+ "epoch": 0.34972677595628415,
509
+ "grad_norm": 245.75242739092204,
510
+ "learning_rate": 8.218815000254231e-07,
511
+ "logits/chosen": -1.0509494543075562,
512
+ "logits/rejected": -0.9942834973335266,
513
+ "logps/chosen": -0.5233258008956909,
514
+ "logps/rejected": -0.6250703930854797,
515
+ "loss": 3.7281,
516
+ "rewards/accuracies": 0.6937500238418579,
517
+ "rewards/chosen": -5.233258247375488,
518
+ "rewards/margins": 1.0174460411071777,
519
+ "rewards/rejected": -6.250703811645508,
520
+ "semantic_entropy": 0.9671458005905151,
521
+ "step": 160
522
+ },
523
+ {
524
+ "epoch": 0.36065573770491804,
525
+ "grad_norm": 158.28325663577223,
526
+ "learning_rate": 8.07026981518276e-07,
527
+ "logits/chosen": -1.0312683582305908,
528
+ "logits/rejected": -0.9780334234237671,
529
+ "logps/chosen": -0.5141728520393372,
530
+ "logps/rejected": -0.6252259016036987,
531
+ "loss": 3.6091,
532
+ "rewards/accuracies": 0.7437499761581421,
533
+ "rewards/chosen": -5.141728401184082,
534
+ "rewards/margins": 1.1105303764343262,
535
+ "rewards/rejected": -6.252258777618408,
536
+ "semantic_entropy": 0.9860242605209351,
537
+ "step": 165
538
+ },
539
+ {
540
+ "epoch": 0.37158469945355194,
541
+ "grad_norm": 136.70967385020978,
542
+ "learning_rate": 7.917240483654e-07,
543
+ "logits/chosen": -1.0271486043930054,
544
+ "logits/rejected": -0.9650506973266602,
545
+ "logps/chosen": -0.5105966329574585,
546
+ "logps/rejected": -0.6147508025169373,
547
+ "loss": 3.7493,
548
+ "rewards/accuracies": 0.71875,
549
+ "rewards/chosen": -5.105965614318848,
550
+ "rewards/margins": 1.0415422916412354,
551
+ "rewards/rejected": -6.147508144378662,
552
+ "semantic_entropy": 0.9853399991989136,
553
+ "step": 170
554
+ },
555
+ {
556
+ "epoch": 0.3825136612021858,
557
+ "grad_norm": 192.89943041726585,
558
+ "learning_rate": 7.759950505873521e-07,
559
+ "logits/chosen": -1.0819661617279053,
560
+ "logits/rejected": -1.050621747970581,
561
+ "logps/chosen": -0.5413715839385986,
562
+ "logps/rejected": -0.6315657496452332,
563
+ "loss": 3.5161,
564
+ "rewards/accuracies": 0.65625,
565
+ "rewards/chosen": -5.413715362548828,
566
+ "rewards/margins": 0.9019424319267273,
567
+ "rewards/rejected": -6.315657615661621,
568
+ "semantic_entropy": 0.9645811319351196,
569
+ "step": 175
570
+ },
571
+ {
572
+ "epoch": 0.39344262295081966,
573
+ "grad_norm": 152.89832261081025,
574
+ "learning_rate": 7.598629604744872e-07,
575
+ "logits/chosen": -1.0869873762130737,
576
+ "logits/rejected": -1.0785080194473267,
577
+ "logps/chosen": -0.5242325067520142,
578
+ "logps/rejected": -0.7012667655944824,
579
+ "loss": 3.3521,
580
+ "rewards/accuracies": 0.7875000238418579,
581
+ "rewards/chosen": -5.242325305938721,
582
+ "rewards/margins": 1.7703425884246826,
583
+ "rewards/rejected": -7.012667655944824,
584
+ "semantic_entropy": 1.0011693239212036,
585
+ "step": 180
586
+ },
587
+ {
588
+ "epoch": 0.40437158469945356,
589
+ "grad_norm": 150.16456326171243,
590
+ "learning_rate": 7.433513390357989e-07,
591
+ "logits/chosen": -1.1106479167938232,
592
+ "logits/rejected": -1.1203057765960693,
593
+ "logps/chosen": -0.5613775253295898,
594
+ "logps/rejected": -0.7369329333305359,
595
+ "loss": 3.3325,
596
+ "rewards/accuracies": 0.793749988079071,
597
+ "rewards/chosen": -5.61377477645874,
598
+ "rewards/margins": 1.755553960800171,
599
+ "rewards/rejected": -7.36932897567749,
600
+ "semantic_entropy": 1.0098798274993896,
601
+ "step": 185
602
+ },
603
+ {
604
+ "epoch": 0.41530054644808745,
605
+ "grad_norm": 177.03981164325452,
606
+ "learning_rate": 7.264843015879321e-07,
607
+ "logits/chosen": -1.118817925453186,
608
+ "logits/rejected": -1.075656771659851,
609
+ "logps/chosen": -0.5620681047439575,
610
+ "logps/rejected": -0.7573049664497375,
611
+ "loss": 3.3889,
612
+ "rewards/accuracies": 0.7875000238418579,
613
+ "rewards/chosen": -5.620680332183838,
614
+ "rewards/margins": 1.9523694515228271,
615
+ "rewards/rejected": -7.573049068450928,
616
+ "semantic_entropy": 0.998263955116272,
617
+ "step": 190
618
+ },
619
+ {
620
+ "epoch": 0.4262295081967213,
621
+ "grad_norm": 126.76170022708793,
622
+ "learning_rate": 7.092864825346266e-07,
623
+ "logits/chosen": -1.1385692358016968,
624
+ "logits/rejected": -1.1158863306045532,
625
+ "logps/chosen": -0.6738488078117371,
626
+ "logps/rejected": -0.8919061422348022,
627
+ "loss": 3.431,
628
+ "rewards/accuracies": 0.762499988079071,
629
+ "rewards/chosen": -6.73848819732666,
630
+ "rewards/margins": 2.180572986602783,
631
+ "rewards/rejected": -8.919061660766602,
632
+ "semantic_entropy": 0.9914792776107788,
633
+ "step": 195
634
+ },
635
+ {
636
+ "epoch": 0.4371584699453552,
637
+ "grad_norm": 146.12776815167663,
638
+ "learning_rate": 6.917829993880302e-07,
639
+ "logits/chosen": -1.1107165813446045,
640
+ "logits/rejected": -1.0282893180847168,
641
+ "logps/chosen": -0.6398170590400696,
642
+ "logps/rejected": -0.8206952810287476,
643
+ "loss": 3.2848,
644
+ "rewards/accuracies": 0.78125,
645
+ "rewards/chosen": -6.398170471191406,
646
+ "rewards/margins": 1.8087825775146484,
647
+ "rewards/rejected": -8.206953048706055,
648
+ "semantic_entropy": 1.0069334506988525,
649
+ "step": 200
650
+ },
651
+ {
652
+ "epoch": 0.44808743169398907,
653
+ "grad_norm": 114.70543290599623,
654
+ "learning_rate": 6.739994160844309e-07,
655
+ "logits/chosen": -1.0968120098114014,
656
+ "logits/rejected": -1.1100647449493408,
657
+ "logps/chosen": -0.6191304326057434,
658
+ "logps/rejected": -0.8075233697891235,
659
+ "loss": 3.113,
660
+ "rewards/accuracies": 0.8062499761581421,
661
+ "rewards/chosen": -6.1913042068481445,
662
+ "rewards/margins": 1.8839294910430908,
663
+ "rewards/rejected": -8.075233459472656,
664
+ "semantic_entropy": 1.0141515731811523,
665
+ "step": 205
666
+ },
667
+ {
668
+ "epoch": 0.45901639344262296,
669
+ "grad_norm": 207.23320066468338,
670
+ "learning_rate": 6.559617056479827e-07,
671
+ "logits/chosen": -1.1229215860366821,
672
+ "logits/rejected": -1.1227028369903564,
673
+ "logps/chosen": -0.6795850396156311,
674
+ "logps/rejected": -0.9186260104179382,
675
+ "loss": 3.1442,
676
+ "rewards/accuracies": 0.7875000238418579,
677
+ "rewards/chosen": -6.7958502769470215,
678
+ "rewards/margins": 2.3904099464416504,
679
+ "rewards/rejected": -9.186259269714355,
680
+ "semantic_entropy": 0.9720403552055359,
681
+ "step": 210
682
+ },
683
+ {
684
+ "epoch": 0.46994535519125685,
685
+ "grad_norm": 154.50502979258272,
686
+ "learning_rate": 6.376962122569567e-07,
687
+ "logits/chosen": -1.1284302473068237,
688
+ "logits/rejected": -1.0769437551498413,
689
+ "logps/chosen": -0.6974600553512573,
690
+ "logps/rejected": -0.9393995404243469,
691
+ "loss": 3.4054,
692
+ "rewards/accuracies": 0.800000011920929,
693
+ "rewards/chosen": -6.974600315093994,
694
+ "rewards/margins": 2.4193947315216064,
695
+ "rewards/rejected": -9.39399528503418,
696
+ "semantic_entropy": 1.0159441232681274,
697
+ "step": 215
698
+ },
699
+ {
700
+ "epoch": 0.4808743169398907,
701
+ "grad_norm": 138.6943703552703,
702
+ "learning_rate": 6.192296127679192e-07,
703
+ "logits/chosen": -1.1995110511779785,
704
+ "logits/rejected": -1.1735769510269165,
705
+ "logps/chosen": -0.7303147912025452,
706
+ "logps/rejected": -0.9362783432006836,
707
+ "loss": 3.1146,
708
+ "rewards/accuracies": 0.75,
709
+ "rewards/chosen": -7.303147792816162,
710
+ "rewards/margins": 2.0596346855163574,
711
+ "rewards/rejected": -9.362783432006836,
712
+ "semantic_entropy": 0.9887905120849609,
713
+ "step": 220
714
+ },
715
+ {
716
+ "epoch": 0.4918032786885246,
717
+ "grad_norm": 120.75185480754473,
718
+ "learning_rate": 6.005888777540319e-07,
719
+ "logits/chosen": -1.2185839414596558,
720
+ "logits/rejected": -1.1783679723739624,
721
+ "logps/chosen": -0.7471610307693481,
722
+ "logps/rejected": -0.9852075576782227,
723
+ "loss": 3.1221,
724
+ "rewards/accuracies": 0.8125,
725
+ "rewards/chosen": -7.471610069274902,
726
+ "rewards/margins": 2.380465030670166,
727
+ "rewards/rejected": -9.852075576782227,
728
+ "semantic_entropy": 0.9979068040847778,
729
+ "step": 225
730
+ },
731
+ {
732
+ "epoch": 0.5027322404371585,
733
+ "grad_norm": 122.09354415954165,
734
+ "learning_rate": 5.818012321143773e-07,
735
+ "logits/chosen": -1.1167972087860107,
736
+ "logits/rejected": -1.1158543825149536,
737
+ "logps/chosen": -0.7506524920463562,
738
+ "logps/rejected": -1.0082188844680786,
739
+ "loss": 3.1675,
740
+ "rewards/accuracies": 0.8125,
741
+ "rewards/chosen": -7.50652551651001,
742
+ "rewards/margins": 2.5756633281707764,
743
+ "rewards/rejected": -10.082188606262207,
744
+ "semantic_entropy": 0.993320643901825,
745
+ "step": 230
746
+ },
747
+ {
748
+ "epoch": 0.5136612021857924,
749
+ "grad_norm": 147.5670346142303,
750
+ "learning_rate": 5.628941153118388e-07,
751
+ "logits/chosen": -1.1291579008102417,
752
+ "logits/rejected": -1.0918009281158447,
753
+ "logps/chosen": -0.775924563407898,
754
+ "logps/rejected": -1.0093990564346313,
755
+ "loss": 2.9612,
756
+ "rewards/accuracies": 0.824999988079071,
757
+ "rewards/chosen": -7.759246826171875,
758
+ "rewards/margins": 2.3347439765930176,
759
+ "rewards/rejected": -10.093989372253418,
760
+ "semantic_entropy": 0.981053352355957,
761
+ "step": 235
762
+ },
763
+ {
764
+ "epoch": 0.5245901639344263,
765
+ "grad_norm": 133.14046441252583,
766
+ "learning_rate": 5.438951412976098e-07,
767
+ "logits/chosen": -1.1797640323638916,
768
+ "logits/rejected": -1.1900447607040405,
769
+ "logps/chosen": -0.7702494859695435,
770
+ "logps/rejected": -1.056715488433838,
771
+ "loss": 2.7098,
772
+ "rewards/accuracies": 0.8187500238418579,
773
+ "rewards/chosen": -7.702495574951172,
774
+ "rewards/margins": 2.864659547805786,
775
+ "rewards/rejected": -10.567155838012695,
776
+ "semantic_entropy": 0.991602897644043,
777
+ "step": 240
778
+ },
779
+ {
780
+ "epoch": 0.5355191256830601,
781
+ "grad_norm": 133.0444183672,
782
+ "learning_rate": 5.248320581808619e-07,
783
+ "logits/chosen": -1.1087061166763306,
784
+ "logits/rejected": -1.0653207302093506,
785
+ "logps/chosen": -0.792129397392273,
786
+ "logps/rejected": -1.075703501701355,
787
+ "loss": 2.9413,
788
+ "rewards/accuracies": 0.7749999761581421,
789
+ "rewards/chosen": -7.921294212341309,
790
+ "rewards/margins": 2.835740566253662,
791
+ "rewards/rejected": -10.757036209106445,
792
+ "semantic_entropy": 0.9732475280761719,
793
+ "step": 245
794
+ },
795
+ {
796
+ "epoch": 0.546448087431694,
797
+ "grad_norm": 122.1994564086186,
798
+ "learning_rate": 5.057327077024744e-07,
799
+ "logits/chosen": -1.1738090515136719,
800
+ "logits/rejected": -1.1409178972244263,
801
+ "logps/chosen": -0.806613564491272,
802
+ "logps/rejected": -1.0335161685943604,
803
+ "loss": 3.0864,
804
+ "rewards/accuracies": 0.737500011920929,
805
+ "rewards/chosen": -8.06613540649414,
806
+ "rewards/margins": 2.2690269947052,
807
+ "rewards/rejected": -10.335162162780762,
808
+ "semantic_entropy": 0.9622586965560913,
809
+ "step": 250
810
+ },
811
+ {
812
+ "epoch": 0.5573770491803278,
813
+ "grad_norm": 232.10642636329976,
814
+ "learning_rate": 4.866249845720132e-07,
815
+ "logits/chosen": -1.1730918884277344,
816
+ "logits/rejected": -1.1448460817337036,
817
+ "logps/chosen": -0.8790150880813599,
818
+ "logps/rejected": -1.1891063451766968,
819
+ "loss": 2.8134,
820
+ "rewards/accuracies": 0.800000011920929,
821
+ "rewards/chosen": -8.790151596069336,
822
+ "rewards/margins": 3.1009116172790527,
823
+ "rewards/rejected": -11.891061782836914,
824
+ "semantic_entropy": 0.9595390558242798,
825
+ "step": 255
826
+ },
827
+ {
828
+ "epoch": 0.5683060109289617,
829
+ "grad_norm": 131.2755348822229,
830
+ "learning_rate": 4.675367957273505e-07,
831
+ "logits/chosen": -1.141157865524292,
832
+ "logits/rejected": -1.1335737705230713,
833
+ "logps/chosen": -0.8586977124214172,
834
+ "logps/rejected": -1.1454424858093262,
835
+ "loss": 2.8892,
836
+ "rewards/accuracies": 0.8062499761581421,
837
+ "rewards/chosen": -8.586977005004883,
838
+ "rewards/margins": 2.8674476146698,
839
+ "rewards/rejected": -11.454424858093262,
840
+ "semantic_entropy": 0.9425627589225769,
841
+ "step": 260
842
+ },
843
+ {
844
+ "epoch": 0.5792349726775956,
845
+ "grad_norm": 140.64129624293926,
846
+ "learning_rate": 4.4849601957642285e-07,
847
+ "logits/chosen": -1.1671048402786255,
848
+ "logits/rejected": -1.1342850923538208,
849
+ "logps/chosen": -0.884295642375946,
850
+ "logps/rejected": -1.1824935674667358,
851
+ "loss": 2.8754,
852
+ "rewards/accuracies": 0.7875000238418579,
853
+ "rewards/chosen": -8.84295654296875,
854
+ "rewards/margins": 2.9819793701171875,
855
+ "rewards/rejected": -11.824935913085938,
856
+ "semantic_entropy": 0.9442464709281921,
857
+ "step": 265
858
+ },
859
+ {
860
+ "epoch": 0.5901639344262295,
861
+ "grad_norm": 147.4714622434341,
862
+ "learning_rate": 4.295304652806592e-07,
863
+ "logits/chosen": -1.161628246307373,
864
+ "logits/rejected": -1.1403006315231323,
865
+ "logps/chosen": -0.9028175473213196,
866
+ "logps/rejected": -1.2475910186767578,
867
+ "loss": 2.6617,
868
+ "rewards/accuracies": 0.8125,
869
+ "rewards/chosen": -9.028175354003906,
870
+ "rewards/margins": 3.447734832763672,
871
+ "rewards/rejected": -12.475909233093262,
872
+ "semantic_entropy": 0.9366561770439148,
873
+ "step": 270
874
+ },
875
+ {
876
+ "epoch": 0.6010928961748634,
877
+ "grad_norm": 103.487924065158,
878
+ "learning_rate": 4.106678321395433e-07,
879
+ "logits/chosen": -1.149213194847107,
880
+ "logits/rejected": -1.091584324836731,
881
+ "logps/chosen": -0.944200873374939,
882
+ "logps/rejected": -1.1400898694992065,
883
+ "loss": 2.9059,
884
+ "rewards/accuracies": 0.668749988079071,
885
+ "rewards/chosen": -9.442008972167969,
886
+ "rewards/margins": 1.9588892459869385,
887
+ "rewards/rejected": -11.400897979736328,
888
+ "semantic_entropy": 0.9353906512260437,
889
+ "step": 275
890
+ },
891
+ {
892
+ "epoch": 0.6120218579234973,
893
+ "grad_norm": 134.66475933979152,
894
+ "learning_rate": 3.9193566913562915e-07,
895
+ "logits/chosen": -1.112555742263794,
896
+ "logits/rejected": -1.1183173656463623,
897
+ "logps/chosen": -0.9673782587051392,
898
+ "logps/rejected": -1.330810308456421,
899
+ "loss": 2.8642,
900
+ "rewards/accuracies": 0.731249988079071,
901
+ "rewards/chosen": -9.673782348632812,
902
+ "rewards/margins": 3.6343204975128174,
903
+ "rewards/rejected": -13.30810260772705,
904
+ "semantic_entropy": 0.8982528448104858,
905
+ "step": 280
906
+ },
907
+ {
908
+ "epoch": 0.6229508196721312,
909
+ "grad_norm": 122.90153644042246,
910
+ "learning_rate": 3.7336133469909623e-07,
911
+ "logits/chosen": -1.2511584758758545,
912
+ "logits/rejected": -1.2238370180130005,
913
+ "logps/chosen": -0.9596298933029175,
914
+ "logps/rejected": -1.3390361070632935,
915
+ "loss": 2.5195,
916
+ "rewards/accuracies": 0.8374999761581421,
917
+ "rewards/chosen": -9.59630012512207,
918
+ "rewards/margins": 3.794060468673706,
919
+ "rewards/rejected": -13.390359878540039,
920
+ "semantic_entropy": 0.9061362147331238,
921
+ "step": 285
922
+ },
923
+ {
924
+ "epoch": 0.6338797814207651,
925
+ "grad_norm": 155.0095981855771,
926
+ "learning_rate": 3.549719567506076e-07,
927
+ "logits/chosen": -1.190308690071106,
928
+ "logits/rejected": -1.1568708419799805,
929
+ "logps/chosen": -1.0390634536743164,
930
+ "logps/rejected": -1.3806968927383423,
931
+ "loss": 2.8348,
932
+ "rewards/accuracies": 0.8062499761581421,
933
+ "rewards/chosen": -10.390633583068848,
934
+ "rewards/margins": 3.4163336753845215,
935
+ "rewards/rejected": -13.806968688964844,
936
+ "semantic_entropy": 0.8884953260421753,
937
+ "step": 290
938
+ },
939
+ {
940
+ "epoch": 0.644808743169399,
941
+ "grad_norm": 128.49285257186182,
942
+ "learning_rate": 3.3679439308082774e-07,
943
+ "logits/chosen": -1.1692029237747192,
944
+ "logits/rejected": -1.172719120979309,
945
+ "logps/chosen": -1.0511713027954102,
946
+ "logps/rejected": -1.4550367593765259,
947
+ "loss": 2.2826,
948
+ "rewards/accuracies": 0.856249988079071,
949
+ "rewards/chosen": -10.511713027954102,
950
+ "rewards/margins": 4.0386552810668945,
951
+ "rewards/rejected": -14.55036735534668,
952
+ "semantic_entropy": 0.878921389579773,
953
+ "step": 295
954
+ },
955
+ {
956
+ "epoch": 0.6557377049180327,
957
+ "grad_norm": 113.97800775884723,
958
+ "learning_rate": 3.1885519212446716e-07,
959
+ "logits/chosen": -1.2200191020965576,
960
+ "logits/rejected": -1.209221601486206,
961
+ "logps/chosen": -1.0968992710113525,
962
+ "logps/rejected": -1.4734210968017578,
963
+ "loss": 2.5534,
964
+ "rewards/accuracies": 0.793749988079071,
965
+ "rewards/chosen": -10.968993186950684,
966
+ "rewards/margins": 3.7652194499969482,
967
+ "rewards/rejected": -14.734212875366211,
968
+ "semantic_entropy": 0.8491713404655457,
969
+ "step": 300
970
+ },
971
+ {
972
+ "epoch": 0.6666666666666666,
973
+ "grad_norm": 153.40901841678263,
974
+ "learning_rate": 3.0118055418614295e-07,
975
+ "logits/chosen": -1.210296392440796,
976
+ "logits/rejected": -1.166100263595581,
977
+ "logps/chosen": -1.0915873050689697,
978
+ "logps/rejected": -1.5003201961517334,
979
+ "loss": 2.7077,
980
+ "rewards/accuracies": 0.8187500238418579,
981
+ "rewards/chosen": -10.915873527526855,
982
+ "rewards/margins": 4.087328910827637,
983
+ "rewards/rejected": -15.003199577331543,
984
+ "semantic_entropy": 0.851865291595459,
985
+ "step": 305
986
+ },
987
+ {
988
+ "epoch": 0.6775956284153005,
989
+ "grad_norm": 188.92294277142176,
990
+ "learning_rate": 2.83796293174686e-07,
991
+ "logits/chosen": -1.144047498703003,
992
+ "logits/rejected": -1.1552207469940186,
993
+ "logps/chosen": -1.1141725778579712,
994
+ "logps/rejected": -1.5596559047698975,
995
+ "loss": 2.9239,
996
+ "rewards/accuracies": 0.800000011920929,
997
+ "rewards/chosen": -11.141725540161133,
998
+ "rewards/margins": 4.454832077026367,
999
+ "rewards/rejected": -15.5965576171875,
1000
+ "semantic_entropy": 0.8461610078811646,
1001
+ "step": 310
1002
+ },
1003
+ {
1004
+ "epoch": 0.6885245901639344,
1005
+ "grad_norm": 158.1416950642293,
1006
+ "learning_rate": 2.6672779890178046e-07,
1007
+ "logits/chosen": -1.1891381740570068,
1008
+ "logits/rejected": -1.1934657096862793,
1009
+ "logps/chosen": -1.1909846067428589,
1010
+ "logps/rejected": -1.4780324697494507,
1011
+ "loss": 2.6254,
1012
+ "rewards/accuracies": 0.75,
1013
+ "rewards/chosen": -11.9098482131958,
1014
+ "rewards/margins": 2.8704779148101807,
1015
+ "rewards/rejected": -14.780324935913086,
1016
+ "semantic_entropy": 0.8265172243118286,
1017
+ "step": 315
1018
+ },
1019
+ {
1020
+ "epoch": 0.6994535519125683,
1021
+ "grad_norm": 117.90491663822478,
1022
+ "learning_rate": 2.500000000000001e-07,
1023
+ "logits/chosen": -1.2632883787155151,
1024
+ "logits/rejected": -1.2213891744613647,
1025
+ "logps/chosen": -1.2055654525756836,
1026
+ "logps/rejected": -1.6077735424041748,
1027
+ "loss": 2.6783,
1028
+ "rewards/accuracies": 0.800000011920929,
1029
+ "rewards/chosen": -12.05565357208252,
1030
+ "rewards/margins": 4.022082805633545,
1031
+ "rewards/rejected": -16.077735900878906,
1032
+ "semantic_entropy": 0.8192012906074524,
1033
+ "step": 320
1034
+ },
1035
+ {
1036
+ "epoch": 0.7103825136612022,
1037
+ "grad_norm": 164.40962706267163,
1038
+ "learning_rate": 2.3363732751439923e-07,
1039
+ "logits/chosen": -1.2160775661468506,
1040
+ "logits/rejected": -1.205322027206421,
1041
+ "logps/chosen": -1.1661893129348755,
1042
+ "logps/rejected": -1.5277663469314575,
1043
+ "loss": 2.6624,
1044
+ "rewards/accuracies": 0.793749988079071,
1045
+ "rewards/chosen": -11.661892890930176,
1046
+ "rewards/margins": 3.615769863128662,
1047
+ "rewards/rejected": -15.27766227722168,
1048
+ "semantic_entropy": 0.823623538017273,
1049
+ "step": 325
1050
+ },
1051
+ {
1052
+ "epoch": 0.7213114754098361,
1053
+ "grad_norm": 107.16070244814924,
1054
+ "learning_rate": 2.1766367922083283e-07,
1055
+ "logits/chosen": -1.1548130512237549,
1056
+ "logits/rejected": -1.1359449625015259,
1057
+ "logps/chosen": -1.1202858686447144,
1058
+ "logps/rejected": -1.626604676246643,
1059
+ "loss": 2.5554,
1060
+ "rewards/accuracies": 0.84375,
1061
+ "rewards/chosen": -11.20285701751709,
1062
+ "rewards/margins": 5.063187599182129,
1063
+ "rewards/rejected": -16.26604461669922,
1064
+ "semantic_entropy": 0.8284898996353149,
1065
+ "step": 330
1066
+ },
1067
+ {
1068
+ "epoch": 0.73224043715847,
1069
+ "grad_norm": 143.54805923440733,
1070
+ "learning_rate": 2.021023847231202e-07,
1071
+ "logits/chosen": -1.1354625225067139,
1072
+ "logits/rejected": -1.1055997610092163,
1073
+ "logps/chosen": -1.2420397996902466,
1074
+ "logps/rejected": -1.6342121362686157,
1075
+ "loss": 2.5194,
1076
+ "rewards/accuracies": 0.8374999761581421,
1077
+ "rewards/chosen": -12.42039680480957,
1078
+ "rewards/margins": 3.9217231273651123,
1079
+ "rewards/rejected": -16.342121124267578,
1080
+ "semantic_entropy": 0.8055655360221863,
1081
+ "step": 335
1082
+ },
1083
+ {
1084
+ "epoch": 0.7431693989071039,
1085
+ "grad_norm": 143.90601843179704,
1086
+ "learning_rate": 1.869761713800254e-07,
1087
+ "logits/chosen": -1.159055471420288,
1088
+ "logits/rejected": -1.1217933893203735,
1089
+ "logps/chosen": -1.216088056564331,
1090
+ "logps/rejected": -1.6224826574325562,
1091
+ "loss": 2.6871,
1092
+ "rewards/accuracies": 0.768750011920929,
1093
+ "rewards/chosen": -12.160881996154785,
1094
+ "rewards/margins": 4.0639448165893555,
1095
+ "rewards/rejected": -16.22482681274414,
1096
+ "semantic_entropy": 0.7940818667411804,
1097
+ "step": 340
1098
+ },
1099
+ {
1100
+ "epoch": 0.7540983606557377,
1101
+ "grad_norm": 146.77126754239146,
1102
+ "learning_rate": 1.7230713111182164e-07,
1103
+ "logits/chosen": -1.2234665155410767,
1104
+ "logits/rejected": -1.2274234294891357,
1105
+ "logps/chosen": -1.2689043283462524,
1106
+ "logps/rejected": -1.6999661922454834,
1107
+ "loss": 2.6683,
1108
+ "rewards/accuracies": 0.8062499761581421,
1109
+ "rewards/chosen": -12.689043998718262,
1110
+ "rewards/margins": 4.310617923736572,
1111
+ "rewards/rejected": -16.99966049194336,
1112
+ "semantic_entropy": 0.7804916501045227,
1113
+ "step": 345
1114
+ },
1115
+ {
1116
+ "epoch": 0.7650273224043715,
1117
+ "grad_norm": 156.59132030322806,
1118
+ "learning_rate": 1.5811668813491696e-07,
1119
+ "logits/chosen": -1.2142775058746338,
1120
+ "logits/rejected": -1.1987954378128052,
1121
+ "logps/chosen": -1.1977766752243042,
1122
+ "logps/rejected": -1.5605593919754028,
1123
+ "loss": 2.6148,
1124
+ "rewards/accuracies": 0.7875000238418579,
1125
+ "rewards/chosen": -11.977766036987305,
1126
+ "rewards/margins": 3.62782621383667,
1127
+ "rewards/rejected": -15.605592727661133,
1128
+ "semantic_entropy": 0.8086638450622559,
1129
+ "step": 350
1130
+ },
1131
+ {
1132
+ "epoch": 0.7759562841530054,
1133
+ "grad_norm": 144.3372503884412,
1134
+ "learning_rate": 1.4442556767166369e-07,
1135
+ "logits/chosen": -1.1762679815292358,
1136
+ "logits/rejected": -1.1479089260101318,
1137
+ "logps/chosen": -1.193447470664978,
1138
+ "logps/rejected": -1.6105226278305054,
1139
+ "loss": 2.5697,
1140
+ "rewards/accuracies": 0.84375,
1141
+ "rewards/chosen": -11.934475898742676,
1142
+ "rewards/margins": 4.170751094818115,
1143
+ "rewards/rejected": -16.105228424072266,
1144
+ "semantic_entropy": 0.8080552220344543,
1145
+ "step": 355
1146
+ },
1147
+ {
1148
+ "epoch": 0.7868852459016393,
1149
+ "grad_norm": 148.5272314733488,
1150
+ "learning_rate": 1.312537656810549e-07,
1151
+ "logits/chosen": -1.135874629020691,
1152
+ "logits/rejected": -1.1405253410339355,
1153
+ "logps/chosen": -1.242232322692871,
1154
+ "logps/rejected": -1.6316314935684204,
1155
+ "loss": 2.629,
1156
+ "rewards/accuracies": 0.75,
1157
+ "rewards/chosen": -12.422323226928711,
1158
+ "rewards/margins": 3.8939926624298096,
1159
+ "rewards/rejected": -16.316316604614258,
1160
+ "semantic_entropy": 0.8038153648376465,
1161
+ "step": 360
1162
+ },
1163
+ {
1164
+ "epoch": 0.7978142076502732,
1165
+ "grad_norm": 189.12080572010774,
1166
+ "learning_rate": 1.1862051965451214e-07,
1167
+ "logits/chosen": -1.216966152191162,
1168
+ "logits/rejected": -1.2227869033813477,
1169
+ "logps/chosen": -1.2868871688842773,
1170
+ "logps/rejected": -1.718698263168335,
1171
+ "loss": 2.5421,
1172
+ "rewards/accuracies": 0.7875000238418579,
1173
+ "rewards/chosen": -12.868871688842773,
1174
+ "rewards/margins": 4.318110466003418,
1175
+ "rewards/rejected": -17.18698501586914,
1176
+ "semantic_entropy": 0.7714166045188904,
1177
+ "step": 365
1178
+ },
1179
+ {
1180
+ "epoch": 0.8087431693989071,
1181
+ "grad_norm": 130.34992194933218,
1182
+ "learning_rate": 1.0654428051942138e-07,
1183
+ "logits/chosen": -1.2347556352615356,
1184
+ "logits/rejected": -1.2042248249053955,
1185
+ "logps/chosen": -1.3064343929290771,
1186
+ "logps/rejected": -1.777515172958374,
1187
+ "loss": 2.6716,
1188
+ "rewards/accuracies": 0.78125,
1189
+ "rewards/chosen": -13.06434440612793,
1190
+ "rewards/margins": 4.710807800292969,
1191
+ "rewards/rejected": -17.7751522064209,
1192
+ "semantic_entropy": 0.7663524150848389,
1193
+ "step": 370
1194
+ },
1195
+ {
1196
+ "epoch": 0.819672131147541,
1197
+ "grad_norm": 120.52547478396558,
1198
+ "learning_rate": 9.504268569144763e-08,
1199
+ "logits/chosen": -1.2310011386871338,
1200
+ "logits/rejected": -1.179221749305725,
1201
+ "logps/chosen": -1.2675514221191406,
1202
+ "logps/rejected": -1.7146999835968018,
1203
+ "loss": 2.5273,
1204
+ "rewards/accuracies": 0.8187500238418579,
1205
+ "rewards/chosen": -12.675516128540039,
1206
+ "rewards/margins": 4.4714837074279785,
1207
+ "rewards/rejected": -17.14699935913086,
1208
+ "semantic_entropy": 0.7719418406486511,
1209
+ "step": 375
1210
+ },
1211
+ {
1212
+ "epoch": 0.8306010928961749,
1213
+ "grad_norm": 134.85584452412903,
1214
+ "learning_rate": 8.413253331499049e-08,
1215
+ "logits/chosen": -1.1171958446502686,
1216
+ "logits/rejected": -1.1349446773529053,
1217
+ "logps/chosen": -1.2904092073440552,
1218
+ "logps/rejected": -1.6963565349578857,
1219
+ "loss": 2.5116,
1220
+ "rewards/accuracies": 0.84375,
1221
+ "rewards/chosen": -12.904090881347656,
1222
+ "rewards/margins": 4.059475898742676,
1223
+ "rewards/rejected": -16.96356773376465,
1224
+ "semantic_entropy": 0.7810186147689819,
1225
+ "step": 380
1226
+ },
1227
+ {
1228
+ "epoch": 0.8415300546448088,
1229
+ "grad_norm": 137.9880749898867,
1230
+ "learning_rate": 7.382975772939865e-08,
1231
+ "logits/chosen": -1.219100832939148,
1232
+ "logits/rejected": -1.2109915018081665,
1233
+ "logps/chosen": -1.3701751232147217,
1234
+ "logps/rejected": -1.809171438217163,
1235
+ "loss": 2.8231,
1236
+ "rewards/accuracies": 0.831250011920929,
1237
+ "rewards/chosen": -13.701749801635742,
1238
+ "rewards/margins": 4.389963626861572,
1239
+ "rewards/rejected": -18.09171485900879,
1240
+ "semantic_entropy": 0.7575483918190002,
1241
+ "step": 385
1242
+ },
1243
+ {
1244
+ "epoch": 0.8524590163934426,
1245
+ "grad_norm": 192.7458723286622,
1246
+ "learning_rate": 6.414940619677734e-08,
1247
+ "logits/chosen": -1.2024834156036377,
1248
+ "logits/rejected": -1.1897705793380737,
1249
+ "logps/chosen": -1.2859004735946655,
1250
+ "logps/rejected": -1.8171262741088867,
1251
+ "loss": 2.4732,
1252
+ "rewards/accuracies": 0.8374999761581421,
1253
+ "rewards/chosen": -12.85900592803955,
1254
+ "rewards/margins": 5.312258243560791,
1255
+ "rewards/rejected": -18.171262741088867,
1256
+ "semantic_entropy": 0.77564936876297,
1257
+ "step": 390
1258
+ },
1259
+ {
1260
+ "epoch": 0.8633879781420765,
1261
+ "grad_norm": 182.36527261020933,
1262
+ "learning_rate": 5.5105616925376296e-08,
1263
+ "logits/chosen": -1.1923558712005615,
1264
+ "logits/rejected": -1.1765515804290771,
1265
+ "logps/chosen": -1.3574202060699463,
1266
+ "logps/rejected": -1.7072795629501343,
1267
+ "loss": 2.4702,
1268
+ "rewards/accuracies": 0.768750011920929,
1269
+ "rewards/chosen": -13.574200630187988,
1270
+ "rewards/margins": 3.4985928535461426,
1271
+ "rewards/rejected": -17.07279396057129,
1272
+ "semantic_entropy": 0.7606626749038696,
1273
+ "step": 395
1274
+ },
1275
+ {
1276
+ "epoch": 0.8743169398907104,
1277
+ "grad_norm": 156.32689835666156,
1278
+ "learning_rate": 4.6711598420656976e-08,
1279
+ "logits/chosen": -1.144141435623169,
1280
+ "logits/rejected": -1.1195942163467407,
1281
+ "logps/chosen": -1.3408299684524536,
1282
+ "logps/rejected": -1.8310232162475586,
1283
+ "loss": 2.3262,
1284
+ "rewards/accuracies": 0.8687499761581421,
1285
+ "rewards/chosen": -13.408299446105957,
1286
+ "rewards/margins": 4.901931285858154,
1287
+ "rewards/rejected": -18.310230255126953,
1288
+ "semantic_entropy": 0.7442251443862915,
1289
+ "step": 400
1290
+ },
1291
+ {
1292
+ "epoch": 0.8743169398907104,
1293
+ "eval_logits/chosen": -1.3813101053237915,
1294
+ "eval_logits/rejected": -1.3473328351974487,
1295
+ "eval_logps/chosen": -1.2879669666290283,
1296
+ "eval_logps/rejected": -1.739721655845642,
1297
+ "eval_loss": 2.5607941150665283,
1298
+ "eval_rewards/accuracies": 0.8072289228439331,
1299
+ "eval_rewards/chosen": -12.879671096801758,
1300
+ "eval_rewards/margins": 4.517546653747559,
1301
+ "eval_rewards/rejected": -17.397218704223633,
1302
+ "eval_runtime": 36.6792,
1303
+ "eval_samples_per_second": 35.933,
1304
+ "eval_semantic_entropy": 0.7719007730484009,
1305
+ "eval_steps_per_second": 2.263,
1306
+ "step": 400
1307
+ },
1308
+ {
1309
+ "epoch": 0.8852459016393442,
1310
+ "grad_norm": 148.89570610138972,
1311
+ "learning_rate": 3.897961019419516e-08,
1312
+ "logits/chosen": -1.1572606563568115,
1313
+ "logits/rejected": -1.1004865169525146,
1314
+ "logps/chosen": -1.2273566722869873,
1315
+ "logps/rejected": -1.640681266784668,
1316
+ "loss": 2.4505,
1317
+ "rewards/accuracies": 0.7875000238418579,
1318
+ "rewards/chosen": -12.273566246032715,
1319
+ "rewards/margins": 4.13324499130249,
1320
+ "rewards/rejected": -16.406810760498047,
1321
+ "semantic_entropy": 0.7908967137336731,
1322
+ "step": 405
1323
+ },
1324
+ {
1325
+ "epoch": 0.8961748633879781,
1326
+ "grad_norm": 135.13513698899922,
1327
+ "learning_rate": 3.192094485859526e-08,
1328
+ "logits/chosen": -1.164189100265503,
1329
+ "logits/rejected": -1.1957439184188843,
1330
+ "logps/chosen": -1.340441346168518,
1331
+ "logps/rejected": -1.8204329013824463,
1332
+ "loss": 2.5453,
1333
+ "rewards/accuracies": 0.78125,
1334
+ "rewards/chosen": -13.404413223266602,
1335
+ "rewards/margins": 4.799916744232178,
1336
+ "rewards/rejected": -18.204330444335938,
1337
+ "semantic_entropy": 0.7424927949905396,
1338
+ "step": 410
1339
+ },
1340
+ {
1341
+ "epoch": 0.907103825136612,
1342
+ "grad_norm": 138.1252447502212,
1343
+ "learning_rate": 2.5545911634565265e-08,
1344
+ "logits/chosen": -1.2133328914642334,
1345
+ "logits/rejected": -1.217905879020691,
1346
+ "logps/chosen": -1.3356597423553467,
1347
+ "logps/rejected": -1.8359012603759766,
1348
+ "loss": 2.719,
1349
+ "rewards/accuracies": 0.862500011920929,
1350
+ "rewards/chosen": -13.356595993041992,
1351
+ "rewards/margins": 5.002416133880615,
1352
+ "rewards/rejected": -18.359012603759766,
1353
+ "semantic_entropy": 0.7578593492507935,
1354
+ "step": 415
1355
+ },
1356
+ {
1357
+ "epoch": 0.9180327868852459,
1358
+ "grad_norm": 133.36843556957237,
1359
+ "learning_rate": 1.9863821294241522e-08,
1360
+ "logits/chosen": -1.2045748233795166,
1361
+ "logits/rejected": -1.188718557357788,
1362
+ "logps/chosen": -1.3029712438583374,
1363
+ "logps/rejected": -1.7860324382781982,
1364
+ "loss": 2.3706,
1365
+ "rewards/accuracies": 0.8374999761581421,
1366
+ "rewards/chosen": -13.029711723327637,
1367
+ "rewards/margins": 4.830612659454346,
1368
+ "rewards/rejected": -17.86032485961914,
1369
+ "semantic_entropy": 0.7500916123390198,
1370
+ "step": 420
1371
+ },
1372
+ {
1373
+ "epoch": 0.9289617486338798,
1374
+ "grad_norm": 139.42311056578998,
1375
+ "learning_rate": 1.4882972562753615e-08,
1376
+ "logits/chosen": -1.2088757753372192,
1377
+ "logits/rejected": -1.2028796672821045,
1378
+ "logps/chosen": -1.4182978868484497,
1379
+ "logps/rejected": -1.899735450744629,
1380
+ "loss": 2.6805,
1381
+ "rewards/accuracies": 0.800000011920929,
1382
+ "rewards/chosen": -14.182978630065918,
1383
+ "rewards/margins": 4.814376354217529,
1384
+ "rewards/rejected": -18.997356414794922,
1385
+ "semantic_entropy": 0.7167907953262329,
1386
+ "step": 425
1387
+ },
1388
+ {
1389
+ "epoch": 0.9398907103825137,
1390
+ "grad_norm": 138.05690419269808,
1391
+ "learning_rate": 1.0610639997888915e-08,
1392
+ "logits/chosen": -1.1360652446746826,
1393
+ "logits/rejected": -1.1394312381744385,
1394
+ "logps/chosen": -1.265080451965332,
1395
+ "logps/rejected": -1.7739555835723877,
1396
+ "loss": 2.2297,
1397
+ "rewards/accuracies": 0.8999999761581421,
1398
+ "rewards/chosen": -12.65080451965332,
1399
+ "rewards/margins": 5.0887532234191895,
1400
+ "rewards/rejected": -17.73955535888672,
1401
+ "semantic_entropy": 0.7776240110397339,
1402
+ "step": 430
1403
+ },
1404
+ {
1405
+ "epoch": 0.9508196721311475,
1406
+ "grad_norm": 156.31606318677163,
1407
+ "learning_rate": 7.053063365559997e-09,
1408
+ "logits/chosen": -1.1962147951126099,
1409
+ "logits/rejected": -1.2261369228363037,
1410
+ "logps/chosen": -1.330643892288208,
1411
+ "logps/rejected": -1.8400895595550537,
1412
+ "loss": 2.397,
1413
+ "rewards/accuracies": 0.8687499761581421,
1414
+ "rewards/chosen": -13.306437492370605,
1415
+ "rewards/margins": 5.094459056854248,
1416
+ "rewards/rejected": -18.400897979736328,
1417
+ "semantic_entropy": 0.7367173433303833,
1418
+ "step": 435
1419
+ },
1420
+ {
1421
+ "epoch": 0.9617486338797814,
1422
+ "grad_norm": 137.08276067025474,
1423
+ "learning_rate": 4.215438526591064e-09,
1424
+ "logits/chosen": -1.1730583906173706,
1425
+ "logits/rejected": -1.140211582183838,
1426
+ "logps/chosen": -1.3757076263427734,
1427
+ "logps/rejected": -1.7452484369277954,
1428
+ "loss": 2.5885,
1429
+ "rewards/accuracies": 0.78125,
1430
+ "rewards/chosen": -13.757074356079102,
1431
+ "rewards/margins": 3.6954073905944824,
1432
+ "rewards/rejected": -17.452484130859375,
1433
+ "semantic_entropy": 0.7384070754051208,
1434
+ "step": 440
1435
+ },
1436
+ {
1437
+ "epoch": 0.9726775956284153,
1438
+ "grad_norm": 153.84420514847153,
1439
+ "learning_rate": 2.1019098481337426e-09,
1440
+ "logits/chosen": -1.2059695720672607,
1441
+ "logits/rejected": -1.1932657957077026,
1442
+ "logps/chosen": -1.2917059659957886,
1443
+ "logps/rejected": -1.7843118906021118,
1444
+ "loss": 2.4083,
1445
+ "rewards/accuracies": 0.8125,
1446
+ "rewards/chosen": -12.917058944702148,
1447
+ "rewards/margins": 4.926059246063232,
1448
+ "rewards/rejected": -17.843120574951172,
1449
+ "semantic_entropy": 0.7641780376434326,
1450
+ "step": 445
1451
+ },
1452
+ {
1453
+ "epoch": 0.9836065573770492,
1454
+ "grad_norm": 169.02746393168292,
1455
+ "learning_rate": 7.155641507955445e-10,
1456
+ "logits/chosen": -1.122040033340454,
1457
+ "logits/rejected": -1.1226763725280762,
1458
+ "logps/chosen": -1.3791415691375732,
1459
+ "logps/rejected": -1.798413872718811,
1460
+ "loss": 2.6799,
1461
+ "rewards/accuracies": 0.8125,
1462
+ "rewards/chosen": -13.791415214538574,
1463
+ "rewards/margins": 4.192723274230957,
1464
+ "rewards/rejected": -17.98413848876953,
1465
+ "semantic_entropy": 0.7384847402572632,
1466
+ "step": 450
1467
+ },
1468
+ {
1469
+ "epoch": 0.994535519125683,
1470
+ "grad_norm": 160.91531446057363,
1471
+ "learning_rate": 5.842620032053824e-11,
1472
+ "logits/chosen": -1.1409598588943481,
1473
+ "logits/rejected": -1.139664649963379,
1474
+ "logps/chosen": -1.3967525959014893,
1475
+ "logps/rejected": -1.73647940158844,
1476
+ "loss": 2.866,
1477
+ "rewards/accuracies": 0.75,
1478
+ "rewards/chosen": -13.96752643585205,
1479
+ "rewards/margins": 3.3972675800323486,
1480
+ "rewards/rejected": -17.364791870117188,
1481
+ "semantic_entropy": 0.7418167591094971,
1482
+ "step": 455
1483
+ },
1484
+ {
1485
+ "epoch": 0.9989071038251366,
1486
+ "step": 457,
1487
+ "total_flos": 0.0,
1488
+ "train_loss": 3.4314852449513107,
1489
+ "train_runtime": 5934.8281,
1490
+ "train_samples_per_second": 9.867,
1491
+ "train_steps_per_second": 0.077
1492
+ }
1493
+ ],
1494
+ "logging_steps": 5,
1495
+ "max_steps": 457,
1496
+ "num_input_tokens_seen": 0,
1497
+ "num_train_epochs": 1,
1498
+ "save_steps": 1000000,
1499
+ "stateful_callbacks": {
1500
+ "TrainerControl": {
1501
+ "args": {
1502
+ "should_epoch_stop": false,
1503
+ "should_evaluate": false,
1504
+ "should_log": false,
1505
+ "should_save": true,
1506
+ "should_training_stop": true
1507
+ },
1508
+ "attributes": {}
1509
+ }
1510
+ },
1511
+ "total_flos": 0.0,
1512
+ "train_batch_size": 2,
1513
+ "trial_name": null,
1514
+ "trial_params": null
1515
+ }