tweety-mistral-7b-dpo / trainer_state.json
g8a9's picture
Upload folder using huggingface_hub
c2f275b verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9997810698387214,
"eval_steps": 100,
"global_step": 3425,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00029190688170473617,
"grad_norm": 0.013427734375,
"learning_rate": 1.457725947521866e-08,
"logits/chosen": -2.4752657413482666,
"logits/rejected": -2.4752657413482666,
"logps/chosen": -328.9035949707031,
"logps/rejected": -328.9035949707031,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.002919068817047362,
"grad_norm": 0.0147705078125,
"learning_rate": 1.457725947521866e-07,
"logits/chosen": -2.395798683166504,
"logits/rejected": -2.395798683166504,
"logps/chosen": -317.85565185546875,
"logps/rejected": -317.85565185546875,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.0003186435205861926,
"rewards/margins": 0.0,
"rewards/rejected": -0.0003186435205861926,
"step": 10
},
{
"epoch": 0.005838137634094724,
"grad_norm": 0.01318359375,
"learning_rate": 2.915451895043732e-07,
"logits/chosen": -2.4440758228302,
"logits/rejected": -2.4440758228302,
"logps/chosen": -301.12921142578125,
"logps/rejected": -301.12921142578125,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -7.847430242691189e-05,
"rewards/margins": 0.0,
"rewards/rejected": -7.847430242691189e-05,
"step": 20
},
{
"epoch": 0.008757206451142086,
"grad_norm": 0.01177978515625,
"learning_rate": 4.373177842565598e-07,
"logits/chosen": -2.441359519958496,
"logits/rejected": -2.441359519958496,
"logps/chosen": -317.1576843261719,
"logps/rejected": -317.1576843261719,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.00025945488596335053,
"rewards/margins": 0.0,
"rewards/rejected": -0.00025945488596335053,
"step": 30
},
{
"epoch": 0.011676275268189448,
"grad_norm": 0.0167236328125,
"learning_rate": 5.830903790087464e-07,
"logits/chosen": -2.455430269241333,
"logits/rejected": -2.455430269241333,
"logps/chosen": -328.7832946777344,
"logps/rejected": -328.7832946777344,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.00034936360316351056,
"rewards/margins": 0.0,
"rewards/rejected": -0.00034936360316351056,
"step": 40
},
{
"epoch": 0.014595344085236809,
"grad_norm": 0.012939453125,
"learning_rate": 7.288629737609331e-07,
"logits/chosen": -2.406463384628296,
"logits/rejected": -2.406463384628296,
"logps/chosen": -303.563232421875,
"logps/rejected": -303.563232421875,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0001031260471791029,
"rewards/margins": 0.0,
"rewards/rejected": 0.0001031260471791029,
"step": 50
},
{
"epoch": 0.01751441290228417,
"grad_norm": 0.016357421875,
"learning_rate": 8.746355685131196e-07,
"logits/chosen": -2.4401960372924805,
"logits/rejected": -2.4401960372924805,
"logps/chosen": -284.1253967285156,
"logps/rejected": -284.1253967285156,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.00043843849562108517,
"rewards/margins": 0.0,
"rewards/rejected": -0.00043843849562108517,
"step": 60
},
{
"epoch": 0.02043348171933153,
"grad_norm": 0.01153564453125,
"learning_rate": 1.0204081632653063e-06,
"logits/chosen": -2.423875093460083,
"logits/rejected": -2.423875093460083,
"logps/chosen": -280.09442138671875,
"logps/rejected": -280.09442138671875,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.00031216375646181405,
"rewards/margins": 0.0,
"rewards/rejected": -0.00031216375646181405,
"step": 70
},
{
"epoch": 0.023352550536378896,
"grad_norm": 0.01214599609375,
"learning_rate": 1.1661807580174927e-06,
"logits/chosen": -2.404435396194458,
"logits/rejected": -2.404435396194458,
"logps/chosen": -267.2549743652344,
"logps/rejected": -267.2549743652344,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0006922121392562985,
"rewards/margins": 0.0,
"rewards/rejected": 0.0006922121392562985,
"step": 80
},
{
"epoch": 0.026271619353426257,
"grad_norm": 0.0146484375,
"learning_rate": 1.3119533527696792e-06,
"logits/chosen": -2.416917324066162,
"logits/rejected": -2.416917324066162,
"logps/chosen": -333.58563232421875,
"logps/rejected": -333.58563232421875,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0005137195694260299,
"rewards/margins": 0.0,
"rewards/rejected": 0.0005137195694260299,
"step": 90
},
{
"epoch": 0.029190688170473617,
"grad_norm": 0.0189208984375,
"learning_rate": 1.4577259475218661e-06,
"logits/chosen": -2.4351730346679688,
"logits/rejected": -2.4351730346679688,
"logps/chosen": -339.3778381347656,
"logps/rejected": -339.3778381347656,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0005722829955630004,
"rewards/margins": 0.0,
"rewards/rejected": 0.0005722829955630004,
"step": 100
},
{
"epoch": 0.029190688170473617,
"eval_logits/chosen": -2.394068479537964,
"eval_logits/rejected": -2.394068479537964,
"eval_logps/chosen": -306.389892578125,
"eval_logps/rejected": -306.389892578125,
"eval_loss": 0.6931472420692444,
"eval_rewards/accuracies": 0.0,
"eval_rewards/chosen": 0.0008870832389220595,
"eval_rewards/margins": 0.0,
"eval_rewards/rejected": 0.0008870832389220595,
"eval_runtime": 2666.9983,
"eval_samples_per_second": 2.283,
"eval_steps_per_second": 0.286,
"step": 100
},
{
"epoch": 0.03210975698752098,
"grad_norm": 0.015869140625,
"learning_rate": 1.6034985422740526e-06,
"logits/chosen": -2.420276165008545,
"logits/rejected": -2.420276165008545,
"logps/chosen": -306.0760803222656,
"logps/rejected": -306.0760803222656,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0014700460014864802,
"rewards/margins": 0.0,
"rewards/rejected": 0.0014700460014864802,
"step": 110
},
{
"epoch": 0.03502882580456834,
"grad_norm": 0.01544189453125,
"learning_rate": 1.7492711370262391e-06,
"logits/chosen": -2.4616119861602783,
"logits/rejected": -2.4616119861602783,
"logps/chosen": -328.64129638671875,
"logps/rejected": -328.64129638671875,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.001054848893545568,
"rewards/margins": 0.0,
"rewards/rejected": 0.001054848893545568,
"step": 120
},
{
"epoch": 0.037947894621615706,
"grad_norm": 0.0250244140625,
"learning_rate": 1.895043731778426e-06,
"logits/chosen": -2.404423236846924,
"logits/rejected": -2.404423236846924,
"logps/chosen": -339.0644836425781,
"logps/rejected": -339.0644836425781,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0002889078459702432,
"rewards/margins": 0.0,
"rewards/rejected": 0.0002889078459702432,
"step": 130
},
{
"epoch": 0.04086696343866306,
"grad_norm": 0.0137939453125,
"learning_rate": 2.0408163265306125e-06,
"logits/chosen": -2.4294090270996094,
"logits/rejected": -2.4294090270996094,
"logps/chosen": -299.0234375,
"logps/rejected": -299.0234375,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0003939162997994572,
"rewards/margins": 0.0,
"rewards/rejected": 0.0003939162997994572,
"step": 140
},
{
"epoch": 0.04378603225571043,
"grad_norm": 0.01470947265625,
"learning_rate": 2.1865889212827988e-06,
"logits/chosen": -2.4415223598480225,
"logits/rejected": -2.4415223598480225,
"logps/chosen": -317.4403991699219,
"logps/rejected": -317.4403991699219,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0011137222172692418,
"rewards/margins": 0.0,
"rewards/rejected": 0.0011137222172692418,
"step": 150
},
{
"epoch": 0.04670510107275779,
"grad_norm": 0.01202392578125,
"learning_rate": 2.3323615160349855e-06,
"logits/chosen": -2.433961868286133,
"logits/rejected": -2.433961868286133,
"logps/chosen": -315.8016662597656,
"logps/rejected": -315.8016662597656,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.00034690109896473587,
"rewards/margins": 0.0,
"rewards/rejected": 0.00034690109896473587,
"step": 160
},
{
"epoch": 0.04962416988980515,
"grad_norm": 0.01226806640625,
"learning_rate": 2.478134110787172e-06,
"logits/chosen": -2.4214272499084473,
"logits/rejected": -2.4214272499084473,
"logps/chosen": -304.0071105957031,
"logps/rejected": -304.0071105957031,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.00021128072694409639,
"rewards/margins": 0.0,
"rewards/rejected": 0.00021128072694409639,
"step": 170
},
{
"epoch": 0.05254323870685251,
"grad_norm": 0.01318359375,
"learning_rate": 2.6239067055393585e-06,
"logits/chosen": -2.410125255584717,
"logits/rejected": -2.410125255584717,
"logps/chosen": -329.052978515625,
"logps/rejected": -329.052978515625,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -8.667710062582046e-05,
"rewards/margins": 0.0,
"rewards/rejected": -8.667710062582046e-05,
"step": 180
},
{
"epoch": 0.05546230752389988,
"grad_norm": 0.0111083984375,
"learning_rate": 2.7696793002915456e-06,
"logits/chosen": -2.412470579147339,
"logits/rejected": -2.412470579147339,
"logps/chosen": -302.9618225097656,
"logps/rejected": -302.9618225097656,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0005764733068645,
"rewards/margins": 0.0,
"rewards/rejected": 0.0005764733068645,
"step": 190
},
{
"epoch": 0.058381376340947234,
"grad_norm": 0.013671875,
"learning_rate": 2.9154518950437323e-06,
"logits/chosen": -2.3948373794555664,
"logits/rejected": -2.3948373794555664,
"logps/chosen": -312.7694091796875,
"logps/rejected": -312.7694091796875,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.0006709109293296933,
"rewards/margins": 0.0,
"rewards/rejected": -0.0006709109293296933,
"step": 200
},
{
"epoch": 0.058381376340947234,
"eval_logits/chosen": -2.3945627212524414,
"eval_logits/rejected": -2.3945627212524414,
"eval_logps/chosen": -306.5539245605469,
"eval_logps/rejected": -306.5539245605469,
"eval_loss": 0.6931472420692444,
"eval_rewards/accuracies": 0.0,
"eval_rewards/chosen": -0.0007532919407822192,
"eval_rewards/margins": 0.0,
"eval_rewards/rejected": -0.0007532919407822192,
"eval_runtime": 2667.9395,
"eval_samples_per_second": 2.283,
"eval_steps_per_second": 0.286,
"step": 200
},
{
"epoch": 0.0613004451579946,
"grad_norm": 0.0120849609375,
"learning_rate": 3.0612244897959185e-06,
"logits/chosen": -2.445885181427002,
"logits/rejected": -2.445885181427002,
"logps/chosen": -316.7839050292969,
"logps/rejected": -316.7839050292969,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.0010994909098371863,
"rewards/margins": 0.0,
"rewards/rejected": -0.0010994909098371863,
"step": 210
},
{
"epoch": 0.06421951397504196,
"grad_norm": 0.011962890625,
"learning_rate": 3.2069970845481052e-06,
"logits/chosen": -2.4333603382110596,
"logits/rejected": -2.4333603382110596,
"logps/chosen": -277.94915771484375,
"logps/rejected": -277.94915771484375,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.00041724619222804904,
"rewards/margins": 0.0,
"rewards/rejected": -0.00041724619222804904,
"step": 220
},
{
"epoch": 0.06713858279208933,
"grad_norm": 0.0145263671875,
"learning_rate": 3.352769679300292e-06,
"logits/chosen": -2.4338879585266113,
"logits/rejected": -2.4338879585266113,
"logps/chosen": -325.23455810546875,
"logps/rejected": -325.23455810546875,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.0012703577522188425,
"rewards/margins": 0.0,
"rewards/rejected": -0.0012703577522188425,
"step": 230
},
{
"epoch": 0.07005765160913668,
"grad_norm": 0.0157470703125,
"learning_rate": 3.4985422740524782e-06,
"logits/chosen": -2.413400173187256,
"logits/rejected": -2.413400173187256,
"logps/chosen": -309.69403076171875,
"logps/rejected": -309.69403076171875,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.001970961457118392,
"rewards/margins": 0.0,
"rewards/rejected": -0.001970961457118392,
"step": 240
},
{
"epoch": 0.07297672042618404,
"grad_norm": 0.01422119140625,
"learning_rate": 3.644314868804665e-06,
"logits/chosen": -2.4458959102630615,
"logits/rejected": -2.4458959102630615,
"logps/chosen": -304.130615234375,
"logps/rejected": -304.130615234375,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.0047234781086444855,
"rewards/margins": 0.0,
"rewards/rejected": -0.0047234781086444855,
"step": 250
},
{
"epoch": 0.07589578924323141,
"grad_norm": 0.01324462890625,
"learning_rate": 3.790087463556852e-06,
"logits/chosen": -2.4266982078552246,
"logits/rejected": -2.4266982078552246,
"logps/chosen": -286.97076416015625,
"logps/rejected": -286.97076416015625,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.006642763502895832,
"rewards/margins": 0.0,
"rewards/rejected": -0.006642763502895832,
"step": 260
},
{
"epoch": 0.07881485806027877,
"grad_norm": 0.015625,
"learning_rate": 3.935860058309039e-06,
"logits/chosen": -2.436506748199463,
"logits/rejected": -2.436506748199463,
"logps/chosen": -310.330322265625,
"logps/rejected": -310.330322265625,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.007435324601829052,
"rewards/margins": 0.0,
"rewards/rejected": -0.007435324601829052,
"step": 270
},
{
"epoch": 0.08173392687732613,
"grad_norm": 0.01495361328125,
"learning_rate": 4.081632653061225e-06,
"logits/chosen": -2.394254446029663,
"logits/rejected": -2.394254446029663,
"logps/chosen": -304.8192443847656,
"logps/rejected": -304.8192443847656,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.00737042585387826,
"rewards/margins": 0.0,
"rewards/rejected": -0.00737042585387826,
"step": 280
},
{
"epoch": 0.0846529956943735,
"grad_norm": 0.0130615234375,
"learning_rate": 4.227405247813411e-06,
"logits/chosen": -2.4005939960479736,
"logits/rejected": -2.4005939960479736,
"logps/chosen": -288.9790954589844,
"logps/rejected": -288.9790954589844,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.0066338046453893185,
"rewards/margins": 0.0,
"rewards/rejected": -0.0066338046453893185,
"step": 290
},
{
"epoch": 0.08757206451142086,
"grad_norm": 0.01458740234375,
"learning_rate": 4.3731778425655976e-06,
"logits/chosen": -2.4416656494140625,
"logits/rejected": -2.4416656494140625,
"logps/chosen": -288.1855773925781,
"logps/rejected": -288.1855773925781,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.0070955632254481316,
"rewards/margins": 0.0,
"rewards/rejected": -0.0070955632254481316,
"step": 300
},
{
"epoch": 0.08757206451142086,
"eval_logits/chosen": -2.3941876888275146,
"eval_logits/rejected": -2.3941876888275146,
"eval_logps/chosen": -307.0490417480469,
"eval_logps/rejected": -307.0490417480469,
"eval_loss": 0.6931472420692444,
"eval_rewards/accuracies": 0.0,
"eval_rewards/chosen": -0.005704815499484539,
"eval_rewards/margins": 0.0,
"eval_rewards/rejected": -0.005704815499484539,
"eval_runtime": 2667.7916,
"eval_samples_per_second": 2.283,
"eval_steps_per_second": 0.286,
"step": 300
},
{
"epoch": 0.09049113332846821,
"grad_norm": 0.01153564453125,
"learning_rate": 4.518950437317785e-06,
"logits/chosen": -2.420503854751587,
"logits/rejected": -2.420503854751587,
"logps/chosen": -276.64093017578125,
"logps/rejected": -276.64093017578125,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.007363935001194477,
"rewards/margins": 0.0,
"rewards/rejected": -0.007363935001194477,
"step": 310
},
{
"epoch": 0.09341020214551558,
"grad_norm": 0.0185546875,
"learning_rate": 4.664723032069971e-06,
"logits/chosen": -2.4066500663757324,
"logits/rejected": -2.4066500663757324,
"logps/chosen": -315.653076171875,
"logps/rejected": -315.653076171875,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.006720393896102905,
"rewards/margins": 0.0,
"rewards/rejected": -0.006720393896102905,
"step": 320
},
{
"epoch": 0.09632927096256294,
"grad_norm": 0.015625,
"learning_rate": 4.810495626822158e-06,
"logits/chosen": -2.445965528488159,
"logits/rejected": -2.445965528488159,
"logps/chosen": -324.6703796386719,
"logps/rejected": -324.6703796386719,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.007294761948287487,
"rewards/margins": 0.0,
"rewards/rejected": -0.007294761948287487,
"step": 330
},
{
"epoch": 0.0992483397796103,
"grad_norm": 0.01446533203125,
"learning_rate": 4.956268221574344e-06,
"logits/chosen": -2.4288485050201416,
"logits/rejected": -2.4288485050201416,
"logps/chosen": -323.6286926269531,
"logps/rejected": -323.6286926269531,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.009048479609191418,
"rewards/margins": 0.0,
"rewards/rejected": -0.009048479609191418,
"step": 340
},
{
"epoch": 0.10216740859665767,
"grad_norm": 0.01458740234375,
"learning_rate": 4.999936358746211e-06,
"logits/chosen": -2.4309639930725098,
"logits/rejected": -2.4309639930725098,
"logps/chosen": -271.655029296875,
"logps/rejected": -271.655029296875,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.009400355629622936,
"rewards/margins": 0.0,
"rewards/rejected": -0.009400355629622936,
"step": 350
},
{
"epoch": 0.10508647741370503,
"grad_norm": 0.0152587890625,
"learning_rate": 4.99962465428288e-06,
"logits/chosen": -2.4447290897369385,
"logits/rejected": -2.4447290897369385,
"logps/chosen": -303.4416198730469,
"logps/rejected": -303.4416198730469,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.013941009528934956,
"rewards/margins": 0.0,
"rewards/rejected": -0.013941009528934956,
"step": 360
},
{
"epoch": 0.10800554623075238,
"grad_norm": 0.0185546875,
"learning_rate": 4.999053229746866e-06,
"logits/chosen": -2.440117359161377,
"logits/rejected": -2.440117359161377,
"logps/chosen": -290.806884765625,
"logps/rejected": -290.806884765625,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.01759205386042595,
"rewards/margins": 0.0,
"rewards/rejected": -0.01759205386042595,
"step": 370
},
{
"epoch": 0.11092461504779975,
"grad_norm": 0.01263427734375,
"learning_rate": 4.9982221445112535e-06,
"logits/chosen": -2.4275150299072266,
"logits/rejected": -2.4275150299072266,
"logps/chosen": -320.67938232421875,
"logps/rejected": -320.67938232421875,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.018790820613503456,
"rewards/margins": 0.0,
"rewards/rejected": -0.018790820613503456,
"step": 380
},
{
"epoch": 0.11384368386484711,
"grad_norm": 0.01397705078125,
"learning_rate": 4.997131484928813e-06,
"logits/chosen": -2.414685010910034,
"logits/rejected": -2.414685010910034,
"logps/chosen": -301.1441650390625,
"logps/rejected": -301.1441650390625,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.015089067630469799,
"rewards/margins": 0.0,
"rewards/rejected": -0.015089067630469799,
"step": 390
},
{
"epoch": 0.11676275268189447,
"grad_norm": 0.01458740234375,
"learning_rate": 4.995781364323035e-06,
"logits/chosen": -2.391239643096924,
"logits/rejected": -2.391239643096924,
"logps/chosen": -285.70941162109375,
"logps/rejected": -285.70941162109375,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.010374903678894043,
"rewards/margins": 0.0,
"rewards/rejected": -0.010374903678894043,
"step": 400
},
{
"epoch": 0.11676275268189447,
"eval_logits/chosen": -2.393982172012329,
"eval_logits/rejected": -2.393982172012329,
"eval_logps/chosen": -307.3796081542969,
"eval_logps/rejected": -307.3796081542969,
"eval_loss": 0.6931472420692444,
"eval_rewards/accuracies": 0.0,
"eval_rewards/chosen": -0.009010241366922855,
"eval_rewards/margins": 0.0,
"eval_rewards/rejected": -0.009010241366922855,
"eval_runtime": 2667.3233,
"eval_samples_per_second": 2.283,
"eval_steps_per_second": 0.286,
"step": 400
},
{
"epoch": 0.11968182149894184,
"grad_norm": 0.01300048828125,
"learning_rate": 4.994171922976349e-06,
"logits/chosen": -2.4642019271850586,
"logits/rejected": -2.4642019271850586,
"logps/chosen": -298.46978759765625,
"logps/rejected": -298.46978759765625,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.009510824456810951,
"rewards/margins": 0.0,
"rewards/rejected": -0.009510824456810951,
"step": 410
},
{
"epoch": 0.1226008903159892,
"grad_norm": 0.0159912109375,
"learning_rate": 4.992303328115551e-06,
"logits/chosen": -2.420297145843506,
"logits/rejected": -2.420297145843506,
"logps/chosen": -306.69610595703125,
"logps/rejected": -306.69610595703125,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.0014959282707422972,
"rewards/margins": 0.0,
"rewards/rejected": -0.0014959282707422972,
"step": 420
},
{
"epoch": 0.12551995913303657,
"grad_norm": 0.0159912109375,
"learning_rate": 4.990175773894428e-06,
"logits/chosen": -2.46386981010437,
"logits/rejected": -2.46386981010437,
"logps/chosen": -281.81097412109375,
"logps/rejected": -281.81097412109375,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.008724676445126534,
"rewards/margins": 0.0,
"rewards/rejected": -0.008724676445126534,
"step": 430
},
{
"epoch": 0.1284390279500839,
"grad_norm": 0.01287841796875,
"learning_rate": 4.987789481373586e-06,
"logits/chosen": -2.406324625015259,
"logits/rejected": -2.406324625015259,
"logps/chosen": -297.7574157714844,
"logps/rejected": -297.7574157714844,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.006952273193746805,
"rewards/margins": 0.0,
"rewards/rejected": -0.006952273193746805,
"step": 440
},
{
"epoch": 0.13135809676713128,
"grad_norm": 0.015869140625,
"learning_rate": 4.985144698497477e-06,
"logits/chosen": -2.4094862937927246,
"logits/rejected": -2.4094862937927246,
"logps/chosen": -294.4402160644531,
"logps/rejected": -294.4402160644531,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.009783747605979443,
"rewards/margins": 0.0,
"rewards/rejected": -0.009783747605979443,
"step": 450
},
{
"epoch": 0.13427716558417865,
"grad_norm": 0.015625,
"learning_rate": 4.982241700068639e-06,
"logits/chosen": -2.448880434036255,
"logits/rejected": -2.448880434036255,
"logps/chosen": -312.9103088378906,
"logps/rejected": -312.9103088378906,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.010099256411194801,
"rewards/margins": 0.0,
"rewards/rejected": -0.010099256411194801,
"step": 460
},
{
"epoch": 0.137196234401226,
"grad_norm": 0.014404296875,
"learning_rate": 4.979080787719144e-06,
"logits/chosen": -2.4513556957244873,
"logits/rejected": -2.4513556957244873,
"logps/chosen": -330.3889465332031,
"logps/rejected": -330.3889465332031,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.012815428897738457,
"rewards/margins": 0.0,
"rewards/rejected": -0.012815428897738457,
"step": 470
},
{
"epoch": 0.14011530321827337,
"grad_norm": 0.013427734375,
"learning_rate": 4.975662289879257e-06,
"logits/chosen": -2.3824195861816406,
"logits/rejected": -2.3824195861816406,
"logps/chosen": -324.45654296875,
"logps/rejected": -324.45654296875,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.010385606437921524,
"rewards/margins": 0.0,
"rewards/rejected": -0.010385606437921524,
"step": 480
},
{
"epoch": 0.14303437203532074,
"grad_norm": 0.016845703125,
"learning_rate": 4.971986561743308e-06,
"logits/chosen": -2.388378620147705,
"logits/rejected": -2.388378620147705,
"logps/chosen": -292.9872131347656,
"logps/rejected": -292.9872131347656,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.00819515809416771,
"rewards/margins": 0.0,
"rewards/rejected": -0.00819515809416771,
"step": 490
},
{
"epoch": 0.14595344085236808,
"grad_norm": 0.01348876953125,
"learning_rate": 4.96805398523279e-06,
"logits/chosen": -2.438722610473633,
"logits/rejected": -2.438722610473633,
"logps/chosen": -333.7470397949219,
"logps/rejected": -333.7470397949219,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.007836517877876759,
"rewards/margins": 0.0,
"rewards/rejected": -0.007836517877876759,
"step": 500
},
{
"epoch": 0.14595344085236808,
"eval_logits/chosen": -2.3937265872955322,
"eval_logits/rejected": -2.3937265872955322,
"eval_logps/chosen": -307.1580505371094,
"eval_logps/rejected": -307.1580505371094,
"eval_loss": 0.6931472420692444,
"eval_rewards/accuracies": 0.0,
"eval_rewards/chosen": -0.006794503424316645,
"eval_rewards/margins": 0.0,
"eval_rewards/rejected": -0.006794503424316645,
"eval_runtime": 2668.7964,
"eval_samples_per_second": 2.282,
"eval_steps_per_second": 0.286,
"step": 500
},
{
"epoch": 0.14887250966941545,
"grad_norm": 0.0146484375,
"learning_rate": 4.963864968956674e-06,
"logits/chosen": -2.4363291263580322,
"logits/rejected": -2.4363291263580322,
"logps/chosen": -295.4735412597656,
"logps/rejected": -295.4735412597656,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.008334552869200706,
"rewards/margins": 0.0,
"rewards/rejected": -0.008334552869200706,
"step": 510
},
{
"epoch": 0.15179157848646282,
"grad_norm": 0.0113525390625,
"learning_rate": 4.959419948168952e-06,
"logits/chosen": -2.4209957122802734,
"logits/rejected": -2.4209957122802734,
"logps/chosen": -252.09475708007812,
"logps/rejected": -252.09475708007812,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.005244333762675524,
"rewards/margins": 0.0,
"rewards/rejected": -0.005244333762675524,
"step": 520
},
{
"epoch": 0.15471064730351017,
"grad_norm": 0.011962890625,
"learning_rate": 4.954719384723416e-06,
"logits/chosen": -2.4421539306640625,
"logits/rejected": -2.4421539306640625,
"logps/chosen": -290.62939453125,
"logps/rejected": -290.62939453125,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.006143758539110422,
"rewards/margins": 0.0,
"rewards/rejected": -0.006143758539110422,
"step": 530
},
{
"epoch": 0.15762971612055754,
"grad_norm": 0.0155029296875,
"learning_rate": 4.949763767025665e-06,
"logits/chosen": -2.433292865753174,
"logits/rejected": -2.433292865753174,
"logps/chosen": -301.56488037109375,
"logps/rejected": -301.56488037109375,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.007085380610078573,
"rewards/margins": 0.0,
"rewards/rejected": -0.007085380610078573,
"step": 540
},
{
"epoch": 0.1605487849376049,
"grad_norm": 0.01513671875,
"learning_rate": 4.944553609982363e-06,
"logits/chosen": -2.397106647491455,
"logits/rejected": -2.397106647491455,
"logps/chosen": -274.3099670410156,
"logps/rejected": -274.3099670410156,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.002214896958321333,
"rewards/margins": 0.0,
"rewards/rejected": -0.002214896958321333,
"step": 550
},
{
"epoch": 0.16346785375465225,
"grad_norm": 0.0152587890625,
"learning_rate": 4.939089454947734e-06,
"logits/chosen": -2.417797088623047,
"logits/rejected": -2.417797088623047,
"logps/chosen": -299.5130615234375,
"logps/rejected": -299.5130615234375,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.005161653272807598,
"rewards/margins": 0.0,
"rewards/rejected": -0.005161653272807598,
"step": 560
},
{
"epoch": 0.16638692257169962,
"grad_norm": 0.01507568359375,
"learning_rate": 4.933371869667315e-06,
"logits/chosen": -2.4109036922454834,
"logits/rejected": -2.4109036922454834,
"logps/chosen": -279.4015808105469,
"logps/rejected": -279.4015808105469,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.0024168032687157393,
"rewards/margins": 0.0,
"rewards/rejected": -0.0024168032687157393,
"step": 570
},
{
"epoch": 0.169305991388747,
"grad_norm": 0.00885009765625,
"learning_rate": 4.9274014482189654e-06,
"logits/chosen": -2.4315690994262695,
"logits/rejected": -2.4315690994262695,
"logps/chosen": -309.34234619140625,
"logps/rejected": -309.34234619140625,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.004016817547380924,
"rewards/margins": 0.0,
"rewards/rejected": -0.004016817547380924,
"step": 580
},
{
"epoch": 0.17222506020579434,
"grad_norm": 0.017578125,
"learning_rate": 4.9211788109511405e-06,
"logits/chosen": -2.460508108139038,
"logits/rejected": -2.460508108139038,
"logps/chosen": -334.00933837890625,
"logps/rejected": -334.00933837890625,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.005641533527523279,
"rewards/margins": 0.0,
"rewards/rejected": -0.005641533527523279,
"step": 590
},
{
"epoch": 0.1751441290228417,
"grad_norm": 0.016845703125,
"learning_rate": 4.914704604418435e-06,
"logits/chosen": -2.4566855430603027,
"logits/rejected": -2.4566855430603027,
"logps/chosen": -307.21331787109375,
"logps/rejected": -307.21331787109375,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.0077440254390239716,
"rewards/margins": 0.0,
"rewards/rejected": -0.0077440254390239716,
"step": 600
},
{
"epoch": 0.1751441290228417,
"eval_logits/chosen": -2.394993782043457,
"eval_logits/rejected": -2.394993782043457,
"eval_logps/chosen": -306.9631042480469,
"eval_logps/rejected": -306.9631042480469,
"eval_loss": 0.6931472420692444,
"eval_rewards/accuracies": 0.0,
"eval_rewards/chosen": -0.004845078103244305,
"eval_rewards/margins": 0.0,
"eval_rewards/rejected": -0.004845078103244305,
"eval_runtime": 2667.3075,
"eval_samples_per_second": 2.283,
"eval_steps_per_second": 0.286,
"step": 600
},
{
"epoch": 0.17806319783988908,
"grad_norm": 0.01312255859375,
"learning_rate": 4.907979501314402e-06,
"logits/chosen": -2.452761173248291,
"logits/rejected": -2.452761173248291,
"logps/chosen": -293.330078125,
"logps/rejected": -293.330078125,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.005413960665464401,
"rewards/margins": 0.0,
"rewards/rejected": -0.005413960665464401,
"step": 610
},
{
"epoch": 0.18098226665693642,
"grad_norm": 0.013427734375,
"learning_rate": 4.901004200401659e-06,
"logits/chosen": -2.415590763092041,
"logits/rejected": -2.415590763092041,
"logps/chosen": -316.59185791015625,
"logps/rejected": -316.59185791015625,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.009168794378638268,
"rewards/margins": 0.0,
"rewards/rejected": -0.009168794378638268,
"step": 620
},
{
"epoch": 0.1839013354739838,
"grad_norm": 0.017333984375,
"learning_rate": 4.893779426439285e-06,
"logits/chosen": -2.4269957542419434,
"logits/rejected": -2.4269957542419434,
"logps/chosen": -330.297607421875,
"logps/rejected": -330.297607421875,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.008635496720671654,
"rewards/margins": 0.0,
"rewards/rejected": -0.008635496720671654,
"step": 630
},
{
"epoch": 0.18682040429103117,
"grad_norm": 0.0137939453125,
"learning_rate": 4.886305930107512e-06,
"logits/chosen": -2.4132332801818848,
"logits/rejected": -2.4132332801818848,
"logps/chosen": -334.0628967285156,
"logps/rejected": -334.0628967285156,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.00843154825270176,
"rewards/margins": 0.0,
"rewards/rejected": -0.00843154825270176,
"step": 640
},
{
"epoch": 0.1897394731080785,
"grad_norm": 0.0162353515625,
"learning_rate": 4.878584487929731e-06,
"logits/chosen": -2.393531084060669,
"logits/rejected": -2.393531084060669,
"logps/chosen": -312.2678527832031,
"logps/rejected": -312.2678527832031,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.008157333359122276,
"rewards/margins": 0.0,
"rewards/rejected": -0.008157333359122276,
"step": 650
},
{
"epoch": 0.19265854192512588,
"grad_norm": 0.01141357421875,
"learning_rate": 4.8706159021918046e-06,
"logits/chosen": -2.4334394931793213,
"logits/rejected": -2.4334394931793213,
"logps/chosen": -313.9178466796875,
"logps/rejected": -313.9178466796875,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.010157248005270958,
"rewards/margins": 0.0,
"rewards/rejected": -0.010157248005270958,
"step": 660
},
{
"epoch": 0.19557761074217325,
"grad_norm": 0.01446533203125,
"learning_rate": 4.86240100085871e-06,
"logits/chosen": -2.4123024940490723,
"logits/rejected": -2.4123024940490723,
"logps/chosen": -330.71856689453125,
"logps/rejected": -330.71856689453125,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.01049681194126606,
"rewards/margins": 0.0,
"rewards/rejected": -0.01049681194126606,
"step": 670
},
{
"epoch": 0.1984966795592206,
"grad_norm": 0.0145263671875,
"learning_rate": 4.853940637488505e-06,
"logits/chosen": -2.4219470024108887,
"logits/rejected": -2.4219470024108887,
"logps/chosen": -347.1614990234375,
"logps/rejected": -347.1614990234375,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.010124921798706055,
"rewards/margins": 0.0,
"rewards/rejected": -0.010124921798706055,
"step": 680
},
{
"epoch": 0.20141574837626797,
"grad_norm": 0.0140380859375,
"learning_rate": 4.84523569114365e-06,
"logits/chosen": -2.441845417022705,
"logits/rejected": -2.441845417022705,
"logps/chosen": -268.2397766113281,
"logps/rejected": -268.2397766113281,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.013552245683968067,
"rewards/margins": 0.0,
"rewards/rejected": -0.013552245683968067,
"step": 690
},
{
"epoch": 0.20433481719331534,
"grad_norm": 0.020751953125,
"learning_rate": 4.8362870662996574e-06,
"logits/chosen": -2.408205509185791,
"logits/rejected": -2.408205509185791,
"logps/chosen": -313.0887756347656,
"logps/rejected": -313.0887756347656,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.01138794980943203,
"rewards/margins": 0.0,
"rewards/rejected": -0.01138794980943203,
"step": 700
},
{
"epoch": 0.20433481719331534,
"eval_logits/chosen": -2.394869565963745,
"eval_logits/rejected": -2.394869565963745,
"eval_logps/chosen": -307.6349182128906,
"eval_logps/rejected": -307.6349182128906,
"eval_loss": 0.6931472420692444,
"eval_rewards/accuracies": 0.0,
"eval_rewards/chosen": -0.011563203297555447,
"eval_rewards/margins": 0.0,
"eval_rewards/rejected": -0.011563203297555447,
"eval_runtime": 2685.1829,
"eval_samples_per_second": 2.268,
"eval_steps_per_second": 0.284,
"step": 700
},
{
"epoch": 0.20725388601036268,
"grad_norm": 0.015380859375,
"learning_rate": 4.827095692751124e-06,
"logits/chosen": -2.4306788444519043,
"logits/rejected": -2.4306788444519043,
"logps/chosen": -295.8254089355469,
"logps/rejected": -295.8254089355469,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.009687040001153946,
"rewards/margins": 0.0,
"rewards/rejected": -0.009687040001153946,
"step": 710
},
{
"epoch": 0.21017295482741005,
"grad_norm": 0.0135498046875,
"learning_rate": 4.817662525515116e-06,
"logits/chosen": -2.399963855743408,
"logits/rejected": -2.399963855743408,
"logps/chosen": -285.0207824707031,
"logps/rejected": -285.0207824707031,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.010509965009987354,
"rewards/margins": 0.0,
"rewards/rejected": -0.010509965009987354,
"step": 720
},
{
"epoch": 0.21309202364445742,
"grad_norm": 0.01275634765625,
"learning_rate": 4.807988544731944e-06,
"logits/chosen": -2.4015610218048096,
"logits/rejected": -2.4015610218048096,
"logps/chosen": -301.6191711425781,
"logps/rejected": -301.6191711425781,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.0023958988022059202,
"rewards/margins": 0.0,
"rewards/rejected": -0.0023958988022059202,
"step": 730
},
{
"epoch": 0.21601109246150477,
"grad_norm": 0.0120849609375,
"learning_rate": 4.7980747555633174e-06,
"logits/chosen": -2.421522617340088,
"logits/rejected": -2.421522617340088,
"logps/chosen": -300.5765380859375,
"logps/rejected": -300.5765380859375,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.0066505610011518,
"rewards/margins": 0.0,
"rewards/rejected": -0.0066505610011518,
"step": 740
},
{
"epoch": 0.21893016127855214,
"grad_norm": 0.0167236328125,
"learning_rate": 4.787922188087907e-06,
"logits/chosen": -2.3898696899414062,
"logits/rejected": -2.3898696899414062,
"logps/chosen": -312.099853515625,
"logps/rejected": -312.099853515625,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.009563307277858257,
"rewards/margins": 0.0,
"rewards/rejected": -0.009563307277858257,
"step": 750
},
{
"epoch": 0.2218492300955995,
"grad_norm": 0.0185546875,
"learning_rate": 4.7775318971943165e-06,
"logits/chosen": -2.368053674697876,
"logits/rejected": -2.368053674697876,
"logps/chosen": -280.77703857421875,
"logps/rejected": -280.77703857421875,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.008711813017725945,
"rewards/margins": 0.0,
"rewards/rejected": -0.008711813017725945,
"step": 760
},
{
"epoch": 0.22476829891264685,
"grad_norm": 0.01434326171875,
"learning_rate": 4.766904962471477e-06,
"logits/chosen": -2.428321361541748,
"logits/rejected": -2.428321361541748,
"logps/chosen": -283.40704345703125,
"logps/rejected": -283.40704345703125,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.0074463835917413235,
"rewards/margins": 0.0,
"rewards/rejected": -0.0074463835917413235,
"step": 770
},
{
"epoch": 0.22768736772969422,
"grad_norm": 0.020751953125,
"learning_rate": 4.756042488096472e-06,
"logits/chosen": -2.421441078186035,
"logits/rejected": -2.421441078186035,
"logps/chosen": -283.1347961425781,
"logps/rejected": -283.1347961425781,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.009277506731450558,
"rewards/margins": 0.0,
"rewards/rejected": -0.009277506731450558,
"step": 780
},
{
"epoch": 0.2306064365467416,
"grad_norm": 0.0169677734375,
"learning_rate": 4.744945602719806e-06,
"logits/chosen": -2.4225807189941406,
"logits/rejected": -2.4225807189941406,
"logps/chosen": -296.5173645019531,
"logps/rejected": -296.5173645019531,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.009408360347151756,
"rewards/margins": 0.0,
"rewards/rejected": -0.009408360347151756,
"step": 790
},
{
"epoch": 0.23352550536378894,
"grad_norm": 0.01495361328125,
"learning_rate": 4.733615459348143e-06,
"logits/chosen": -2.3777918815612793,
"logits/rejected": -2.3777918815612793,
"logps/chosen": -337.0318298339844,
"logps/rejected": -337.0318298339844,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.012588550336658955,
"rewards/margins": 0.0,
"rewards/rejected": -0.012588550336658955,
"step": 800
},
{
"epoch": 0.23352550536378894,
"eval_logits/chosen": -2.394713878631592,
"eval_logits/rejected": -2.394713878631592,
"eval_logps/chosen": -307.6956787109375,
"eval_logps/rejected": -307.6956787109375,
"eval_loss": 0.6931472420692444,
"eval_rewards/accuracies": 0.0,
"eval_rewards/chosen": -0.012170875445008278,
"eval_rewards/margins": 0.0,
"eval_rewards/rejected": -0.012170875445008278,
"eval_runtime": 2762.1462,
"eval_samples_per_second": 2.205,
"eval_steps_per_second": 0.276,
"step": 800
},
{
"epoch": 0.2364445741808363,
"grad_norm": 0.0145263671875,
"learning_rate": 4.722053235224495e-06,
"logits/chosen": -2.4402616024017334,
"logits/rejected": -2.4402616024017334,
"logps/chosen": -333.5353698730469,
"logps/rejected": -333.5353698730469,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.008296088315546513,
"rewards/margins": 0.0,
"rewards/rejected": -0.008296088315546513,
"step": 810
},
{
"epoch": 0.23936364299788368,
"grad_norm": 0.0128173828125,
"learning_rate": 4.710260131705908e-06,
"logits/chosen": -2.411567211151123,
"logits/rejected": -2.411567211151123,
"logps/chosen": -274.9350280761719,
"logps/rejected": -274.9350280761719,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.015997527167201042,
"rewards/margins": 0.0,
"rewards/rejected": -0.015997527167201042,
"step": 820
},
{
"epoch": 0.24228271181493102,
"grad_norm": 0.01531982421875,
"learning_rate": 4.698237374138634e-06,
"logits/chosen": -2.420203447341919,
"logits/rejected": -2.420203447341919,
"logps/chosen": -312.3550720214844,
"logps/rejected": -312.3550720214844,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.015846502035856247,
"rewards/margins": 0.0,
"rewards/rejected": -0.015846502035856247,
"step": 830
},
{
"epoch": 0.2452017806319784,
"grad_norm": 0.01513671875,
"learning_rate": 4.685986211730816e-06,
"logits/chosen": -2.3960068225860596,
"logits/rejected": -2.3960068225860596,
"logps/chosen": -331.6641845703125,
"logps/rejected": -331.6641845703125,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.01894356682896614,
"rewards/margins": 0.0,
"rewards/rejected": -0.01894356682896614,
"step": 840
},
{
"epoch": 0.24812084944902577,
"grad_norm": 0.01165771484375,
"learning_rate": 4.6735079174226864e-06,
"logits/chosen": -2.408433198928833,
"logits/rejected": -2.408433198928833,
"logps/chosen": -269.3624572753906,
"logps/rejected": -269.3624572753906,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.009970271959900856,
"rewards/margins": 0.0,
"rewards/rejected": -0.009970271959900856,
"step": 850
},
{
"epoch": 0.25103991826607314,
"grad_norm": 0.01483154296875,
"learning_rate": 4.660803787754306e-06,
"logits/chosen": -2.416790723800659,
"logits/rejected": -2.416790723800659,
"logps/chosen": -302.0819396972656,
"logps/rejected": -302.0819396972656,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.010707431472837925,
"rewards/margins": 0.0,
"rewards/rejected": -0.010707431472837925,
"step": 860
},
{
"epoch": 0.2539589870831205,
"grad_norm": 0.0157470703125,
"learning_rate": 4.647875142730853e-06,
"logits/chosen": -2.3868987560272217,
"logits/rejected": -2.3868987560272217,
"logps/chosen": -299.74444580078125,
"logps/rejected": -299.74444580078125,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.012594198808073997,
"rewards/margins": 0.0,
"rewards/rejected": -0.012594198808073997,
"step": 870
},
{
"epoch": 0.2568780559001678,
"grad_norm": 0.0140380859375,
"learning_rate": 4.634723325685462e-06,
"logits/chosen": -2.442610263824463,
"logits/rejected": -2.442610263824463,
"logps/chosen": -308.396240234375,
"logps/rejected": -308.396240234375,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.011057281866669655,
"rewards/margins": 0.0,
"rewards/rejected": -0.011057281866669655,
"step": 880
},
{
"epoch": 0.2597971247172152,
"grad_norm": 0.0157470703125,
"learning_rate": 4.621349703139651e-06,
"logits/chosen": -2.4502758979797363,
"logits/rejected": -2.4502758979797363,
"logps/chosen": -327.5845031738281,
"logps/rejected": -327.5845031738281,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.012428502552211285,
"rewards/margins": 0.0,
"rewards/rejected": -0.012428502552211285,
"step": 890
},
{
"epoch": 0.26271619353426257,
"grad_norm": 0.01519775390625,
"learning_rate": 4.6077556646613365e-06,
"logits/chosen": -2.4429335594177246,
"logits/rejected": -2.4429335594177246,
"logps/chosen": -309.44598388671875,
"logps/rejected": -309.44598388671875,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.008300786837935448,
"rewards/margins": 0.0,
"rewards/rejected": -0.008300786837935448,
"step": 900
},
{
"epoch": 0.26271619353426257,
"eval_logits/chosen": -2.396768093109131,
"eval_logits/rejected": -2.396768093109131,
"eval_logps/chosen": -307.1708068847656,
"eval_logps/rejected": -307.1708068847656,
"eval_loss": 0.6931472420692444,
"eval_rewards/accuracies": 0.0,
"eval_rewards/chosen": -0.0069224112667143345,
"eval_rewards/margins": 0.0,
"eval_rewards/rejected": -0.0069224112667143345,
"eval_runtime": 2667.1988,
"eval_samples_per_second": 2.283,
"eval_steps_per_second": 0.286,
"step": 900
},
{
"epoch": 0.2656352623513099,
"grad_norm": 0.0150146484375,
"learning_rate": 4.593942622720449e-06,
"logits/chosen": -2.431570529937744,
"logits/rejected": -2.431570529937744,
"logps/chosen": -333.9033203125,
"logps/rejected": -333.9033203125,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.008790754713118076,
"rewards/margins": 0.0,
"rewards/rejected": -0.008790754713118076,
"step": 910
},
{
"epoch": 0.2685543311683573,
"grad_norm": 0.011474609375,
"learning_rate": 4.579912012542172e-06,
"logits/chosen": -2.4538259506225586,
"logits/rejected": -2.4538259506225586,
"logps/chosen": -330.14776611328125,
"logps/rejected": -330.14776611328125,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.010161884129047394,
"rewards/margins": 0.0,
"rewards/rejected": -0.010161884129047394,
"step": 920
},
{
"epoch": 0.27147339998540465,
"grad_norm": 0.0164794921875,
"learning_rate": 4.565665291957821e-06,
"logits/chosen": -2.412051200866699,
"logits/rejected": -2.412051200866699,
"logps/chosen": -300.0600891113281,
"logps/rejected": -300.0600891113281,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.010114507749676704,
"rewards/margins": 0.0,
"rewards/rejected": -0.010114507749676704,
"step": 930
},
{
"epoch": 0.274392468802452,
"grad_norm": 0.0125732421875,
"learning_rate": 4.551203941253367e-06,
"logits/chosen": -2.4353108406066895,
"logits/rejected": -2.4353108406066895,
"logps/chosen": -288.15032958984375,
"logps/rejected": -288.15032958984375,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.01036372222006321,
"rewards/margins": 0.0,
"rewards/rejected": -0.01036372222006321,
"step": 940
},
{
"epoch": 0.2773115376194994,
"grad_norm": 0.01434326171875,
"learning_rate": 4.5365294630156264e-06,
"logits/chosen": -2.4350383281707764,
"logits/rejected": -2.4350383281707764,
"logps/chosen": -319.06195068359375,
"logps/rejected": -319.06195068359375,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.011402562260627747,
"rewards/margins": 0.0,
"rewards/rejected": -0.011402562260627747,
"step": 950
},
{
"epoch": 0.28023060643654674,
"grad_norm": 0.012451171875,
"learning_rate": 4.521643381976142e-06,
"logits/chosen": -2.428330898284912,
"logits/rejected": -2.428330898284912,
"logps/chosen": -322.0547790527344,
"logps/rejected": -322.0547790527344,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.012878289446234703,
"rewards/margins": 0.0,
"rewards/rejected": -0.012878289446234703,
"step": 960
},
{
"epoch": 0.2831496752535941,
"grad_norm": 0.013671875,
"learning_rate": 4.506547244852756e-06,
"logits/chosen": -2.4220213890075684,
"logits/rejected": -2.4220213890075684,
"logps/chosen": -298.77056884765625,
"logps/rejected": -298.77056884765625,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.012091143056750298,
"rewards/margins": 0.0,
"rewards/rejected": -0.012091143056750298,
"step": 970
},
{
"epoch": 0.2860687440706415,
"grad_norm": 0.0145263671875,
"learning_rate": 4.491242620188898e-06,
"logits/chosen": -2.400778293609619,
"logits/rejected": -2.400778293609619,
"logps/chosen": -302.7762756347656,
"logps/rejected": -302.7762756347656,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.01696743816137314,
"rewards/margins": 0.0,
"rewards/rejected": -0.01696743816137314,
"step": 980
},
{
"epoch": 0.2889878128876888,
"grad_norm": 0.012451171875,
"learning_rate": 4.475731098190611e-06,
"logits/chosen": -2.4159862995147705,
"logits/rejected": -2.4159862995147705,
"logps/chosen": -278.34356689453125,
"logps/rejected": -278.34356689453125,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.014010600745677948,
"rewards/margins": 0.0,
"rewards/rejected": -0.014010600745677948,
"step": 990
},
{
"epoch": 0.29190688170473617,
"grad_norm": 0.0145263671875,
"learning_rate": 4.4600142905613216e-06,
"logits/chosen": -2.416891098022461,
"logits/rejected": -2.416891098022461,
"logps/chosen": -310.4523620605469,
"logps/rejected": -310.4523620605469,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.01909947767853737,
"rewards/margins": 0.0,
"rewards/rejected": -0.01909947767853737,
"step": 1000
},
{
"epoch": 0.29190688170473617,
"eval_logits/chosen": -2.3967111110687256,
"eval_logits/rejected": -2.3967111110687256,
"eval_logps/chosen": -308.2130432128906,
"eval_logps/rejected": -308.2130432128906,
"eval_loss": 0.6931472420692444,
"eval_rewards/accuracies": 0.0,
"eval_rewards/chosen": -0.017344659194350243,
"eval_rewards/margins": 0.0,
"eval_rewards/rejected": -0.017344659194350243,
"eval_runtime": 2668.0913,
"eval_samples_per_second": 2.283,
"eval_steps_per_second": 0.286,
"step": 1000
},
{
"epoch": 0.29482595052178356,
"grad_norm": 0.029296875,
"learning_rate": 4.444093830334381e-06,
"logits/chosen": -2.395017147064209,
"logits/rejected": -2.395017147064209,
"logps/chosen": -330.1224670410156,
"logps/rejected": -330.1224670410156,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.013958754017949104,
"rewards/margins": 0.0,
"rewards/rejected": -0.013958754017949104,
"step": 1010
},
{
"epoch": 0.2977450193388309,
"grad_norm": 0.01611328125,
"learning_rate": 4.427971371703378e-06,
"logits/chosen": -2.4404492378234863,
"logits/rejected": -2.4404492378234863,
"logps/chosen": -314.79888916015625,
"logps/rejected": -314.79888916015625,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.027685949578881264,
"rewards/margins": 0.0,
"rewards/rejected": -0.027685949578881264,
"step": 1020
},
{
"epoch": 0.30066408815587825,
"grad_norm": 0.01263427734375,
"learning_rate": 4.411648589850276e-06,
"logits/chosen": -2.4368889331817627,
"logits/rejected": -2.4368889331817627,
"logps/chosen": -299.6970520019531,
"logps/rejected": -299.6970520019531,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.01648426428437233,
"rewards/margins": 0.0,
"rewards/rejected": -0.01648426428437233,
"step": 1030
},
{
"epoch": 0.30358315697292565,
"grad_norm": 0.01416015625,
"learning_rate": 4.395127180771342e-06,
"logits/chosen": -2.4541175365448,
"logits/rejected": -2.4541175365448,
"logps/chosen": -326.87841796875,
"logps/rejected": -326.87841796875,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.020237499848008156,
"rewards/margins": 0.0,
"rewards/rejected": -0.020237499848008156,
"step": 1040
},
{
"epoch": 0.306502225789973,
"grad_norm": 0.01318359375,
"learning_rate": 4.378408861100937e-06,
"logits/chosen": -2.415283203125,
"logits/rejected": -2.415283203125,
"logps/chosen": -261.1552429199219,
"logps/rejected": -261.1552429199219,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.01741962879896164,
"rewards/margins": 0.0,
"rewards/rejected": -0.01741962879896164,
"step": 1050
},
{
"epoch": 0.30942129460702034,
"grad_norm": 0.01416015625,
"learning_rate": 4.361495367933144e-06,
"logits/chosen": -2.396031141281128,
"logits/rejected": -2.396031141281128,
"logps/chosen": -322.30377197265625,
"logps/rejected": -322.30377197265625,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.014474359340965748,
"rewards/margins": 0.0,
"rewards/rejected": -0.014474359340965748,
"step": 1060
},
{
"epoch": 0.31234036342406774,
"grad_norm": 0.0181884765625,
"learning_rate": 4.344388458641283e-06,
"logits/chosen": -2.4288814067840576,
"logits/rejected": -2.4288814067840576,
"logps/chosen": -324.64501953125,
"logps/rejected": -324.64501953125,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.025701653212308884,
"rewards/margins": 0.0,
"rewards/rejected": -0.025701653212308884,
"step": 1070
},
{
"epoch": 0.3152594322411151,
"grad_norm": 0.0164794921875,
"learning_rate": 4.32708991069531e-06,
"logits/chosen": -2.411003589630127,
"logits/rejected": -2.411003589630127,
"logps/chosen": -318.289794921875,
"logps/rejected": -318.289794921875,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.02725202962756157,
"rewards/margins": 0.0,
"rewards/rejected": -0.02725202962756157,
"step": 1080
},
{
"epoch": 0.3181785010581624,
"grad_norm": 0.01312255859375,
"learning_rate": 4.309601521477134e-06,
"logits/chosen": -2.437730550765991,
"logits/rejected": -2.437730550765991,
"logps/chosen": -318.1125793457031,
"logps/rejected": -318.1125793457031,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.035508617758750916,
"rewards/margins": 0.0,
"rewards/rejected": -0.035508617758750916,
"step": 1090
},
{
"epoch": 0.3210975698752098,
"grad_norm": 0.01373291015625,
"learning_rate": 4.291925108093856e-06,
"logits/chosen": -2.4134514331817627,
"logits/rejected": -2.4134514331817627,
"logps/chosen": -306.98712158203125,
"logps/rejected": -306.98712158203125,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.02741456963121891,
"rewards/margins": 0.0,
"rewards/rejected": -0.02741456963121891,
"step": 1100
},
{
"epoch": 0.3210975698752098,
"eval_logits/chosen": -2.3970751762390137,
"eval_logits/rejected": -2.3970751762390137,
"eval_logps/chosen": -309.472412109375,
"eval_logps/rejected": -309.472412109375,
"eval_loss": 0.6931472420692444,
"eval_rewards/accuracies": 0.0,
"eval_rewards/chosen": -0.029938040301203728,
"eval_rewards/margins": 0.0,
"eval_rewards/rejected": -0.029938040301203728,
"eval_runtime": 2667.8688,
"eval_samples_per_second": 2.283,
"eval_steps_per_second": 0.286,
"step": 1100
},
{
"epoch": 0.32401663869225716,
"grad_norm": 0.0120849609375,
"learning_rate": 4.274062507188978e-06,
"logits/chosen": -2.413846492767334,
"logits/rejected": -2.413846492767334,
"logps/chosen": -319.53887939453125,
"logps/rejected": -319.53887939453125,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.03637847676873207,
"rewards/margins": 0.0,
"rewards/rejected": -0.03637847676873207,
"step": 1110
},
{
"epoch": 0.3269357075093045,
"grad_norm": 0.0130615234375,
"learning_rate": 4.256015574751555e-06,
"logits/chosen": -2.443239212036133,
"logits/rejected": -2.443239212036133,
"logps/chosen": -302.9671630859375,
"logps/rejected": -302.9671630859375,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.032804206013679504,
"rewards/margins": 0.0,
"rewards/rejected": -0.032804206013679504,
"step": 1120
},
{
"epoch": 0.3298547763263519,
"grad_norm": 0.0159912109375,
"learning_rate": 4.2377861859233604e-06,
"logits/chosen": -2.4368813037872314,
"logits/rejected": -2.4368813037872314,
"logps/chosen": -277.4005126953125,
"logps/rejected": -277.4005126953125,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.030909573659300804,
"rewards/margins": 0.0,
"rewards/rejected": -0.030909573659300804,
"step": 1130
},
{
"epoch": 0.33277384514339925,
"grad_norm": 0.01263427734375,
"learning_rate": 4.219376234804047e-06,
"logits/chosen": -2.4358789920806885,
"logits/rejected": -2.4358789920806885,
"logps/chosen": -294.87567138671875,
"logps/rejected": -294.87567138671875,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.033531349152326584,
"rewards/margins": 0.0,
"rewards/rejected": -0.033531349152326584,
"step": 1140
},
{
"epoch": 0.3356929139604466,
"grad_norm": 0.01519775390625,
"learning_rate": 4.200787634254345e-06,
"logits/chosen": -2.458458662033081,
"logits/rejected": -2.458458662033081,
"logps/chosen": -284.5567321777344,
"logps/rejected": -284.5567321777344,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.029438916593790054,
"rewards/margins": 0.0,
"rewards/rejected": -0.029438916593790054,
"step": 1150
},
{
"epoch": 0.338611982777494,
"grad_norm": 0.0157470703125,
"learning_rate": 4.18202231569731e-06,
"logits/chosen": -2.465770721435547,
"logits/rejected": -2.465770721435547,
"logps/chosen": -325.60443115234375,
"logps/rejected": -325.60443115234375,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.03488076478242874,
"rewards/margins": 0.0,
"rewards/rejected": -0.03488076478242874,
"step": 1160
},
{
"epoch": 0.34153105159454133,
"grad_norm": 0.0194091796875,
"learning_rate": 4.163082228917639e-06,
"logits/chosen": -2.42230224609375,
"logits/rejected": -2.42230224609375,
"logps/chosen": -332.96807861328125,
"logps/rejected": -332.96807861328125,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.03761008754372597,
"rewards/margins": 0.0,
"rewards/rejected": -0.03761008754372597,
"step": 1170
},
{
"epoch": 0.3444501204115887,
"grad_norm": 0.01519775390625,
"learning_rate": 4.143969341859083e-06,
"logits/chosen": -2.4006218910217285,
"logits/rejected": -2.4006218910217285,
"logps/chosen": -298.38372802734375,
"logps/rejected": -298.38372802734375,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.027944693341851234,
"rewards/margins": 0.0,
"rewards/rejected": -0.027944693341851234,
"step": 1180
},
{
"epoch": 0.3473691892286361,
"grad_norm": 0.0167236328125,
"learning_rate": 4.124685640419967e-06,
"logits/chosen": -2.4376044273376465,
"logits/rejected": -2.4376044273376465,
"logps/chosen": -339.3370666503906,
"logps/rejected": -339.3370666503906,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.04471305012702942,
"rewards/margins": 0.0,
"rewards/rejected": -0.04471305012702942,
"step": 1190
},
{
"epoch": 0.3502882580456834,
"grad_norm": 0.015625,
"learning_rate": 4.105233128246849e-06,
"logits/chosen": -2.4307379722595215,
"logits/rejected": -2.4307379722595215,
"logps/chosen": -314.7157287597656,
"logps/rejected": -314.7157287597656,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.04377968981862068,
"rewards/margins": 0.0,
"rewards/rejected": -0.04377968981862068,
"step": 1200
},
{
"epoch": 0.3502882580456834,
"eval_logits/chosen": -2.3975985050201416,
"eval_logits/rejected": -2.3975985050201416,
"eval_logps/chosen": -310.0194091796875,
"eval_logps/rejected": -310.0194091796875,
"eval_loss": 0.6931472420692444,
"eval_rewards/accuracies": 0.0,
"eval_rewards/chosen": -0.035407647490501404,
"eval_rewards/margins": 0.0,
"eval_rewards/rejected": -0.035407647490501404,
"eval_runtime": 2667.8638,
"eval_samples_per_second": 2.283,
"eval_steps_per_second": 0.286,
"step": 1200
},
{
"epoch": 0.35320732686273076,
"grad_norm": 0.01373291015625,
"learning_rate": 4.085613826526338e-06,
"logits/chosen": -2.4104952812194824,
"logits/rejected": -2.4104952812194824,
"logps/chosen": -307.89056396484375,
"logps/rejected": -307.89056396484375,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.034878071397542953,
"rewards/margins": 0.0,
"rewards/rejected": -0.034878071397542953,
"step": 1210
},
{
"epoch": 0.35612639567977816,
"grad_norm": 0.0130615234375,
"learning_rate": 4.065829773775082e-06,
"logits/chosen": -2.454697847366333,
"logits/rejected": -2.454697847366333,
"logps/chosen": -331.95556640625,
"logps/rejected": -331.95556640625,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.035688284784555435,
"rewards/margins": 0.0,
"rewards/rejected": -0.035688284784555435,
"step": 1220
},
{
"epoch": 0.3590454644968255,
"grad_norm": 0.01318359375,
"learning_rate": 4.045883025627957e-06,
"logits/chosen": -2.416503429412842,
"logits/rejected": -2.416503429412842,
"logps/chosen": -317.5516662597656,
"logps/rejected": -317.5516662597656,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.036794569343328476,
"rewards/margins": 0.0,
"rewards/rejected": -0.036794569343328476,
"step": 1230
},
{
"epoch": 0.36196453331387285,
"grad_norm": 0.0159912109375,
"learning_rate": 4.025775654624481e-06,
"logits/chosen": -2.431762218475342,
"logits/rejected": -2.431762218475342,
"logps/chosen": -286.4144592285156,
"logps/rejected": -286.4144592285156,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.0327475443482399,
"rewards/margins": 0.0,
"rewards/rejected": -0.0327475443482399,
"step": 1240
},
{
"epoch": 0.36488360213092025,
"grad_norm": 0.01373291015625,
"learning_rate": 4.005509749993471e-06,
"logits/chosen": -2.4348835945129395,
"logits/rejected": -2.4348835945129395,
"logps/chosen": -264.43670654296875,
"logps/rejected": -264.43670654296875,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.03447514772415161,
"rewards/margins": 0.0,
"rewards/rejected": -0.03447514772415161,
"step": 1250
},
{
"epoch": 0.3678026709479676,
"grad_norm": 0.01544189453125,
"learning_rate": 3.985087417435964e-06,
"logits/chosen": -2.4379494190216064,
"logits/rejected": -2.4379494190216064,
"logps/chosen": -306.0783386230469,
"logps/rejected": -306.0783386230469,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.03243451565504074,
"rewards/margins": 0.0,
"rewards/rejected": -0.03243451565504074,
"step": 1260
},
{
"epoch": 0.37072173976501493,
"grad_norm": 0.01318359375,
"learning_rate": 3.964510778906425e-06,
"logits/chosen": -2.434380292892456,
"logits/rejected": -2.434380292892456,
"logps/chosen": -316.9388427734375,
"logps/rejected": -316.9388427734375,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.038867734372615814,
"rewards/margins": 0.0,
"rewards/rejected": -0.038867734372615814,
"step": 1270
},
{
"epoch": 0.37364080858206233,
"grad_norm": 0.0142822265625,
"learning_rate": 3.943781972392269e-06,
"logits/chosen": -2.4212710857391357,
"logits/rejected": -2.4212710857391357,
"logps/chosen": -326.74237060546875,
"logps/rejected": -326.74237060546875,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.038525618612766266,
"rewards/margins": 0.0,
"rewards/rejected": -0.038525618612766266,
"step": 1280
},
{
"epoch": 0.3765598773991097,
"grad_norm": 0.016357421875,
"learning_rate": 3.922903151691716e-06,
"logits/chosen": -2.450032949447632,
"logits/rejected": -2.450032949447632,
"logps/chosen": -329.82073974609375,
"logps/rejected": -329.82073974609375,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.037473224103450775,
"rewards/margins": 0.0,
"rewards/rejected": -0.037473224103450775,
"step": 1290
},
{
"epoch": 0.379478946216157,
"grad_norm": 0.018310546875,
"learning_rate": 3.901876486190008e-06,
"logits/chosen": -2.4351401329040527,
"logits/rejected": -2.4351401329040527,
"logps/chosen": -315.5516662597656,
"logps/rejected": -315.5516662597656,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.03452508896589279,
"rewards/margins": 0.0,
"rewards/rejected": -0.03452508896589279,
"step": 1300
},
{
"epoch": 0.379478946216157,
"eval_logits/chosen": -2.3963370323181152,
"eval_logits/rejected": -2.3963370323181152,
"eval_logps/chosen": -309.5113525390625,
"eval_logps/rejected": -309.5113525390625,
"eval_loss": 0.6931472420692444,
"eval_rewards/accuracies": 0.0,
"eval_rewards/chosen": -0.030327608808875084,
"eval_rewards/margins": 0.0,
"eval_rewards/rejected": -0.030327608808875084,
"eval_runtime": 2666.8225,
"eval_samples_per_second": 2.284,
"eval_steps_per_second": 0.286,
"step": 1300
},
{
"epoch": 0.3823980150332044,
"grad_norm": 0.018798828125,
"learning_rate": 3.880704160633995e-06,
"logits/chosen": -2.4444994926452637,
"logits/rejected": -2.4444994926452637,
"logps/chosen": -295.88348388671875,
"logps/rejected": -295.88348388671875,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.036420173943042755,
"rewards/margins": 0.0,
"rewards/rejected": -0.036420173943042755,
"step": 1310
},
{
"epoch": 0.38531708385025176,
"grad_norm": 0.013671875,
"learning_rate": 3.859388374905136e-06,
"logits/chosen": -2.41549015045166,
"logits/rejected": -2.41549015045166,
"logps/chosen": -291.2346496582031,
"logps/rejected": -291.2346496582031,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.03046022728085518,
"rewards/margins": 0.0,
"rewards/rejected": -0.03046022728085518,
"step": 1320
},
{
"epoch": 0.3882361526672991,
"grad_norm": 0.0152587890625,
"learning_rate": 3.837931343790924e-06,
"logits/chosen": -2.4401891231536865,
"logits/rejected": -2.4401891231536865,
"logps/chosen": -297.060791015625,
"logps/rejected": -297.060791015625,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.02374189719557762,
"rewards/margins": 0.0,
"rewards/rejected": -0.02374189719557762,
"step": 1330
},
{
"epoch": 0.3911552214843465,
"grad_norm": 0.0152587890625,
"learning_rate": 3.8163352967547575e-06,
"logits/chosen": -2.4282491207122803,
"logits/rejected": -2.4282491207122803,
"logps/chosen": -350.7884216308594,
"logps/rejected": -350.7884216308594,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.031809043139219284,
"rewards/margins": 0.0,
"rewards/rejected": -0.031809043139219284,
"step": 1340
},
{
"epoch": 0.39407429030139385,
"grad_norm": 0.01190185546875,
"learning_rate": 3.7946024777042974e-06,
"logits/chosen": -2.423346996307373,
"logits/rejected": -2.423346996307373,
"logps/chosen": -300.26800537109375,
"logps/rejected": -300.26800537109375,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.029370862990617752,
"rewards/margins": 0.0,
"rewards/rejected": -0.029370862990617752,
"step": 1350
},
{
"epoch": 0.3969933591184412,
"grad_norm": 0.01953125,
"learning_rate": 3.7727351447583095e-06,
"logits/chosen": -2.397026538848877,
"logits/rejected": -2.397026538848877,
"logps/chosen": -318.9501647949219,
"logps/rejected": -318.9501647949219,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.030634000897407532,
"rewards/margins": 0.0,
"rewards/rejected": -0.030634000897407532,
"step": 1360
},
{
"epoch": 0.3999124279354886,
"grad_norm": 0.01385498046875,
"learning_rate": 3.750735570012043e-06,
"logits/chosen": -2.438441276550293,
"logits/rejected": -2.438441276550293,
"logps/chosen": -330.5710754394531,
"logps/rejected": -330.5710754394531,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.03722671791911125,
"rewards/margins": 0.0,
"rewards/rejected": -0.03722671791911125,
"step": 1370
},
{
"epoch": 0.40283149675253593,
"grad_norm": 0.01806640625,
"learning_rate": 3.7286060393011513e-06,
"logits/chosen": -2.419067144393921,
"logits/rejected": -2.419067144393921,
"logps/chosen": -314.528564453125,
"logps/rejected": -314.528564453125,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.032639987766742706,
"rewards/margins": 0.0,
"rewards/rejected": -0.032639987766742706,
"step": 1380
},
{
"epoch": 0.4057505655695833,
"grad_norm": 0.01904296875,
"learning_rate": 3.7063488519641825e-06,
"logits/chosen": -2.4223015308380127,
"logits/rejected": -2.4223015308380127,
"logps/chosen": -329.4114685058594,
"logps/rejected": -329.4114685058594,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.03504698723554611,
"rewards/margins": 0.0,
"rewards/rejected": -0.03504698723554611,
"step": 1390
},
{
"epoch": 0.4086696343866307,
"grad_norm": 0.0162353515625,
"learning_rate": 3.6839663206036715e-06,
"logits/chosen": -2.4432168006896973,
"logits/rejected": -2.4432168006896973,
"logps/chosen": -293.8369445800781,
"logps/rejected": -293.8369445800781,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.031177738681435585,
"rewards/margins": 0.0,
"rewards/rejected": -0.031177738681435585,
"step": 1400
},
{
"epoch": 0.4086696343866307,
"eval_logits/chosen": -2.395508050918579,
"eval_logits/rejected": -2.395508050918579,
"eval_logps/chosen": -309.2061462402344,
"eval_logps/rejected": -309.2061462402344,
"eval_loss": 0.6931472420692444,
"eval_rewards/accuracies": 0.0,
"eval_rewards/chosen": -0.02727527543902397,
"eval_rewards/margins": 0.0,
"eval_rewards/rejected": -0.02727527543902397,
"eval_runtime": 2667.2175,
"eval_samples_per_second": 2.283,
"eval_steps_per_second": 0.286,
"step": 1400
},
{
"epoch": 0.411588703203678,
"grad_norm": 0.01239013671875,
"learning_rate": 3.6614607708458532e-06,
"logits/chosen": -2.418804883956909,
"logits/rejected": -2.418804883956909,
"logps/chosen": -295.696533203125,
"logps/rejected": -295.696533203125,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.024944758042693138,
"rewards/margins": 0.0,
"rewards/rejected": -0.024944758042693138,
"step": 1410
},
{
"epoch": 0.41450777202072536,
"grad_norm": 0.0146484375,
"learning_rate": 3.6388345410990195e-06,
"logits/chosen": -2.4199652671813965,
"logits/rejected": -2.4199652671813965,
"logps/chosen": -341.0202331542969,
"logps/rejected": -341.0202331542969,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.030235985293984413,
"rewards/margins": 0.0,
"rewards/rejected": -0.030235985293984413,
"step": 1420
},
{
"epoch": 0.41742684083777276,
"grad_norm": 0.01141357421875,
"learning_rate": 3.6160899823105518e-06,
"logits/chosen": -2.4291069507598877,
"logits/rejected": -2.4291069507598877,
"logps/chosen": -287.2336730957031,
"logps/rejected": -287.2336730957031,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.0277925543487072,
"rewards/margins": 0.0,
"rewards/rejected": -0.0277925543487072,
"step": 1430
},
{
"epoch": 0.4203459096548201,
"grad_norm": 0.0140380859375,
"learning_rate": 3.5932294577226468e-06,
"logits/chosen": -2.440561532974243,
"logits/rejected": -2.440561532974243,
"logps/chosen": -276.7684020996094,
"logps/rejected": -276.7684020996094,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.017870336771011353,
"rewards/margins": 0.0,
"rewards/rejected": -0.017870336771011353,
"step": 1440
},
{
"epoch": 0.42326497847186745,
"grad_norm": 0.0118408203125,
"learning_rate": 3.5702553426267704e-06,
"logits/chosen": -2.449218988418579,
"logits/rejected": -2.449218988418579,
"logps/chosen": -305.78814697265625,
"logps/rejected": -305.78814697265625,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.024231892079114914,
"rewards/margins": 0.0,
"rewards/rejected": -0.024231892079114914,
"step": 1450
},
{
"epoch": 0.42618404728891485,
"grad_norm": 0.015625,
"learning_rate": 3.547170024116854e-06,
"logits/chosen": -2.4015636444091797,
"logits/rejected": -2.4015636444091797,
"logps/chosen": -281.1402893066406,
"logps/rejected": -281.1402893066406,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.027333328500390053,
"rewards/margins": 0.0,
"rewards/rejected": -0.027333328500390053,
"step": 1460
},
{
"epoch": 0.4291031161059622,
"grad_norm": 0.0164794921875,
"learning_rate": 3.5239759008412666e-06,
"logits/chosen": -2.461341381072998,
"logits/rejected": -2.461341381072998,
"logps/chosen": -315.0804443359375,
"logps/rejected": -315.0804443359375,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.026240995153784752,
"rewards/margins": 0.0,
"rewards/rejected": -0.026240995153784752,
"step": 1470
},
{
"epoch": 0.43202218492300953,
"grad_norm": 0.0164794921875,
"learning_rate": 3.500675382753588e-06,
"logits/chosen": -2.420381784439087,
"logits/rejected": -2.420381784439087,
"logps/chosen": -310.7515563964844,
"logps/rejected": -310.7515563964844,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.023152858018875122,
"rewards/margins": 0.0,
"rewards/rejected": -0.023152858018875122,
"step": 1480
},
{
"epoch": 0.43494125374005693,
"grad_norm": 0.01336669921875,
"learning_rate": 3.477270890862204e-06,
"logits/chosen": -2.3881866931915283,
"logits/rejected": -2.3881866931915283,
"logps/chosen": -318.3128356933594,
"logps/rejected": -318.3128356933594,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.030725980177521706,
"rewards/margins": 0.0,
"rewards/rejected": -0.030725980177521706,
"step": 1490
},
{
"epoch": 0.4378603225571043,
"grad_norm": 0.0140380859375,
"learning_rate": 3.453764856978758e-06,
"logits/chosen": -2.409209728240967,
"logits/rejected": -2.409209728240967,
"logps/chosen": -331.4593200683594,
"logps/rejected": -331.4593200683594,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.022285277023911476,
"rewards/margins": 0.0,
"rewards/rejected": -0.022285277023911476,
"step": 1500
},
{
"epoch": 0.4378603225571043,
"eval_logits/chosen": -2.394321918487549,
"eval_logits/rejected": -2.394321918487549,
"eval_logps/chosen": -308.9651794433594,
"eval_logps/rejected": -308.9651794433594,
"eval_loss": 0.6931472420692444,
"eval_rewards/accuracies": 0.0,
"eval_rewards/chosen": -0.024866018444299698,
"eval_rewards/margins": 0.0,
"eval_rewards/rejected": -0.024866018444299698,
"eval_runtime": 2666.7789,
"eval_samples_per_second": 2.284,
"eval_steps_per_second": 0.286,
"step": 1500
},
{
"epoch": 0.4407793913741516,
"grad_norm": 0.01312255859375,
"learning_rate": 3.4301597234654733e-06,
"logits/chosen": -2.4193215370178223,
"logits/rejected": -2.4193215370178223,
"logps/chosen": -304.951171875,
"logps/rejected": -304.951171875,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.02852988801896572,
"rewards/margins": 0.0,
"rewards/rejected": -0.02852988801896572,
"step": 1510
},
{
"epoch": 0.443698460191199,
"grad_norm": 0.0177001953125,
"learning_rate": 3.406457942981384e-06,
"logits/chosen": -2.430614948272705,
"logits/rejected": -2.430614948272705,
"logps/chosen": -333.06988525390625,
"logps/rejected": -333.06988525390625,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.024759288877248764,
"rewards/margins": 0.0,
"rewards/rejected": -0.024759288877248764,
"step": 1520
},
{
"epoch": 0.44661752900824636,
"grad_norm": 0.0133056640625,
"learning_rate": 3.3826619782274954e-06,
"logits/chosen": -2.43021559715271,
"logits/rejected": -2.43021559715271,
"logps/chosen": -284.0345153808594,
"logps/rejected": -284.0345153808594,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.025433775037527084,
"rewards/margins": 0.0,
"rewards/rejected": -0.025433775037527084,
"step": 1530
},
{
"epoch": 0.4495365978252937,
"grad_norm": 0.0142822265625,
"learning_rate": 3.3587743016909013e-06,
"logits/chosen": -2.439312219619751,
"logits/rejected": -2.439312219619751,
"logps/chosen": -320.015380859375,
"logps/rejected": -320.015380859375,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.02944205328822136,
"rewards/margins": 0.0,
"rewards/rejected": -0.02944205328822136,
"step": 1540
},
{
"epoch": 0.4524556666423411,
"grad_norm": 0.044677734375,
"learning_rate": 3.334797395387882e-06,
"logits/chosen": -2.4262938499450684,
"logits/rejected": -2.4262938499450684,
"logps/chosen": -329.60504150390625,
"logps/rejected": -329.60504150390625,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.027106398716568947,
"rewards/margins": 0.0,
"rewards/rejected": -0.027106398716568947,
"step": 1550
},
{
"epoch": 0.45537473545938845,
"grad_norm": 0.01226806640625,
"learning_rate": 3.3107337506060145e-06,
"logits/chosen": -2.4414420127868652,
"logits/rejected": -2.4414420127868652,
"logps/chosen": -289.9877014160156,
"logps/rejected": -289.9877014160156,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.028158003464341164,
"rewards/margins": 0.0,
"rewards/rejected": -0.028158003464341164,
"step": 1560
},
{
"epoch": 0.4582938042764358,
"grad_norm": 0.0301513671875,
"learning_rate": 3.2865858676453172e-06,
"logits/chosen": -2.434182643890381,
"logits/rejected": -2.434182643890381,
"logps/chosen": -306.0428466796875,
"logps/rejected": -306.0428466796875,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.02475564181804657,
"rewards/margins": 0.0,
"rewards/rejected": -0.02475564181804657,
"step": 1570
},
{
"epoch": 0.4612128730934832,
"grad_norm": 0.0098876953125,
"learning_rate": 3.2623562555584633e-06,
"logits/chosen": -2.430816411972046,
"logits/rejected": -2.430816411972046,
"logps/chosen": -281.2196960449219,
"logps/rejected": -281.2196960449219,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.02929893136024475,
"rewards/margins": 0.0,
"rewards/rejected": -0.02929893136024475,
"step": 1580
},
{
"epoch": 0.46413194191053053,
"grad_norm": 0.024658203125,
"learning_rate": 3.2380474318900766e-06,
"logits/chosen": -2.4165406227111816,
"logits/rejected": -2.4165406227111816,
"logps/chosen": -310.68511962890625,
"logps/rejected": -310.68511962890625,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.03376628831028938,
"rewards/margins": 0.0,
"rewards/rejected": -0.03376628831028938,
"step": 1590
},
{
"epoch": 0.4670510107275779,
"grad_norm": 0.016845703125,
"learning_rate": 3.2136619224151533e-06,
"logits/chosen": -2.4508678913116455,
"logits/rejected": -2.4508678913116455,
"logps/chosen": -327.84619140625,
"logps/rejected": -327.84619140625,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.03426826745271683,
"rewards/margins": 0.0,
"rewards/rejected": -0.03426826745271683,
"step": 1600
},
{
"epoch": 0.4670510107275779,
"eval_logits/chosen": -2.3953943252563477,
"eval_logits/rejected": -2.3953943252563477,
"eval_logps/chosen": -309.15863037109375,
"eval_logps/rejected": -309.15863037109375,
"eval_loss": 0.6931472420692444,
"eval_rewards/accuracies": 0.0,
"eval_rewards/chosen": -0.026800233870744705,
"eval_rewards/margins": 0.0,
"eval_rewards/rejected": -0.026800233870744705,
"eval_runtime": 2666.9806,
"eval_samples_per_second": 2.283,
"eval_steps_per_second": 0.286,
"step": 1600
},
{
"epoch": 0.4699700795446253,
"grad_norm": 0.014892578125,
"learning_rate": 3.1892022608766215e-06,
"logits/chosen": -2.361971378326416,
"logits/rejected": -2.361971378326416,
"logps/chosen": -299.3944396972656,
"logps/rejected": -299.3944396972656,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.0262086633592844,
"rewards/margins": 0.0,
"rewards/rejected": -0.0262086633592844,
"step": 1610
},
{
"epoch": 0.4728891483616726,
"grad_norm": 0.01422119140625,
"learning_rate": 3.16467098872208e-06,
"logits/chosen": -2.4706971645355225,
"logits/rejected": -2.4706971645355225,
"logps/chosen": -332.5861511230469,
"logps/rejected": -332.5861511230469,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.034811943769454956,
"rewards/margins": 0.0,
"rewards/rejected": -0.034811943769454956,
"step": 1620
},
{
"epoch": 0.47580821717871996,
"grad_norm": 0.032470703125,
"learning_rate": 3.140070654839728e-06,
"logits/chosen": -2.4026148319244385,
"logits/rejected": -2.4026148319244385,
"logps/chosen": -296.76605224609375,
"logps/rejected": -296.76605224609375,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.02368254028260708,
"rewards/margins": 0.0,
"rewards/rejected": -0.02368254028260708,
"step": 1630
},
{
"epoch": 0.47872728599576736,
"grad_norm": 0.0242919921875,
"learning_rate": 3.115403815293532e-06,
"logits/chosen": -2.43617582321167,
"logits/rejected": -2.43617582321167,
"logps/chosen": -342.2427062988281,
"logps/rejected": -342.2427062988281,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.036105576902627945,
"rewards/margins": 0.0,
"rewards/rejected": -0.036105576902627945,
"step": 1640
},
{
"epoch": 0.4816463548128147,
"grad_norm": 0.0113525390625,
"learning_rate": 3.0906730330576345e-06,
"logits/chosen": -2.4739155769348145,
"logits/rejected": -2.4739155769348145,
"logps/chosen": -332.26678466796875,
"logps/rejected": -332.26678466796875,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.028261488303542137,
"rewards/margins": 0.0,
"rewards/rejected": -0.028261488303542137,
"step": 1650
},
{
"epoch": 0.48456542362986205,
"grad_norm": 0.017333984375,
"learning_rate": 3.065880877750059e-06,
"logits/chosen": -2.427436351776123,
"logits/rejected": -2.427436351776123,
"logps/chosen": -304.4495544433594,
"logps/rejected": -304.4495544433594,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.03417867794632912,
"rewards/margins": 0.0,
"rewards/rejected": -0.03417867794632912,
"step": 1660
},
{
"epoch": 0.48748449244690945,
"grad_norm": 0.01226806640625,
"learning_rate": 3.041029925365711e-06,
"logits/chosen": -2.4058425426483154,
"logits/rejected": -2.4058425426483154,
"logps/chosen": -308.30072021484375,
"logps/rejected": -308.30072021484375,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.035630594938993454,
"rewards/margins": 0.0,
"rewards/rejected": -0.035630594938993454,
"step": 1670
},
{
"epoch": 0.4904035612639568,
"grad_norm": 0.0126953125,
"learning_rate": 3.0161227580087282e-06,
"logits/chosen": -2.433281421661377,
"logits/rejected": -2.433281421661377,
"logps/chosen": -342.0614013671875,
"logps/rejected": -342.0614013671875,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.03289630264043808,
"rewards/margins": 0.0,
"rewards/rejected": -0.03289630264043808,
"step": 1680
},
{
"epoch": 0.49332263008100413,
"grad_norm": 0.0123291015625,
"learning_rate": 2.9911619636241862e-06,
"logits/chosen": -2.4333884716033936,
"logits/rejected": -2.4333884716033936,
"logps/chosen": -322.1616516113281,
"logps/rejected": -322.1616516113281,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.034327663481235504,
"rewards/margins": 0.0,
"rewards/rejected": -0.034327663481235504,
"step": 1690
},
{
"epoch": 0.49624169889805153,
"grad_norm": 0.01275634765625,
"learning_rate": 2.966150135729203e-06,
"logits/chosen": -2.38623046875,
"logits/rejected": -2.38623046875,
"logps/chosen": -335.8984680175781,
"logps/rejected": -335.8984680175781,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.03050742670893669,
"rewards/margins": 0.0,
"rewards/rejected": -0.03050742670893669,
"step": 1700
},
{
"epoch": 0.49624169889805153,
"eval_logits/chosen": -2.3913044929504395,
"eval_logits/rejected": -2.3913044929504395,
"eval_logps/chosen": -309.405517578125,
"eval_logps/rejected": -309.405517578125,
"eval_loss": 0.6931472420692444,
"eval_rewards/accuracies": 0.0,
"eval_rewards/chosen": -0.029269486665725708,
"eval_rewards/margins": 0.0,
"eval_rewards/rejected": -0.029269486665725708,
"eval_runtime": 2669.612,
"eval_samples_per_second": 2.281,
"eval_steps_per_second": 0.285,
"step": 1700
},
{
"epoch": 0.4991607677150989,
"grad_norm": 0.01324462890625,
"learning_rate": 2.9410898731434667e-06,
"logits/chosen": -2.41214919090271,
"logits/rejected": -2.41214919090271,
"logps/chosen": -302.40887451171875,
"logps/rejected": -302.40887451171875,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.028926188126206398,
"rewards/margins": 0.0,
"rewards/rejected": -0.028926188126206398,
"step": 1710
},
{
"epoch": 0.5020798365321463,
"grad_norm": 0.0152587890625,
"learning_rate": 2.9159837797192003e-06,
"logits/chosen": -2.415527820587158,
"logits/rejected": -2.415527820587158,
"logps/chosen": -329.7999267578125,
"logps/rejected": -329.7999267578125,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.03768650442361832,
"rewards/margins": 0.0,
"rewards/rejected": -0.03768650442361832,
"step": 1720
},
{
"epoch": 0.5049989053491936,
"grad_norm": 0.014404296875,
"learning_rate": 2.890834464070623e-06,
"logits/chosen": -2.4205574989318848,
"logits/rejected": -2.4205574989318848,
"logps/chosen": -309.94329833984375,
"logps/rejected": -309.94329833984375,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.03702525794506073,
"rewards/margins": 0.0,
"rewards/rejected": -0.03702525794506073,
"step": 1730
},
{
"epoch": 0.507917974166241,
"grad_norm": 0.013671875,
"learning_rate": 2.865644539302896e-06,
"logits/chosen": -2.389092206954956,
"logits/rejected": -2.389092206954956,
"logps/chosen": -339.6660461425781,
"logps/rejected": -339.6660461425781,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.029835382476449013,
"rewards/margins": 0.0,
"rewards/rejected": -0.029835382476449013,
"step": 1740
},
{
"epoch": 0.5108370429832884,
"grad_norm": 0.01300048828125,
"learning_rate": 2.840416622740617e-06,
"logits/chosen": -2.444392681121826,
"logits/rejected": -2.444392681121826,
"logps/chosen": -318.47296142578125,
"logps/rejected": -318.47296142578125,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.03087993524968624,
"rewards/margins": 0.0,
"rewards/rejected": -0.03087993524968624,
"step": 1750
},
{
"epoch": 0.5137561118003356,
"grad_norm": 0.01263427734375,
"learning_rate": 2.8151533356558673e-06,
"logits/chosen": -2.4179341793060303,
"logits/rejected": -2.4179341793060303,
"logps/chosen": -295.8548889160156,
"logps/rejected": -295.8548889160156,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.032246123999357224,
"rewards/margins": 0.0,
"rewards/rejected": -0.032246123999357224,
"step": 1760
},
{
"epoch": 0.516675180617383,
"grad_norm": 0.014892578125,
"learning_rate": 2.7898573029958563e-06,
"logits/chosen": -2.377382516860962,
"logits/rejected": -2.377382516860962,
"logps/chosen": -305.41656494140625,
"logps/rejected": -305.41656494140625,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.03165289759635925,
"rewards/margins": 0.0,
"rewards/rejected": -0.03165289759635925,
"step": 1770
},
{
"epoch": 0.5195942494344304,
"grad_norm": 0.0103759765625,
"learning_rate": 2.7645311531101763e-06,
"logits/chosen": -2.412802219390869,
"logits/rejected": -2.412802219390869,
"logps/chosen": -312.50067138671875,
"logps/rejected": -312.50067138671875,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.034763775765895844,
"rewards/margins": 0.0,
"rewards/rejected": -0.034763775765895844,
"step": 1780
},
{
"epoch": 0.5225133182514777,
"grad_norm": 0.0135498046875,
"learning_rate": 2.7391775174777084e-06,
"logits/chosen": -2.419868230819702,
"logits/rejected": -2.419868230819702,
"logps/chosen": -310.26922607421875,
"logps/rejected": -310.26922607421875,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.035530101507902145,
"rewards/margins": 0.0,
"rewards/rejected": -0.035530101507902145,
"step": 1790
},
{
"epoch": 0.5254323870685251,
"grad_norm": 0.0167236328125,
"learning_rate": 2.713799030433203e-06,
"logits/chosen": -2.423767566680908,
"logits/rejected": -2.423767566680908,
"logps/chosen": -308.0718688964844,
"logps/rejected": -308.0718688964844,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.03834807127714157,
"rewards/margins": 0.0,
"rewards/rejected": -0.03834807127714157,
"step": 1800
},
{
"epoch": 0.5254323870685251,
"eval_logits/chosen": -2.392709732055664,
"eval_logits/rejected": -2.392709732055664,
"eval_logps/chosen": -310.26434326171875,
"eval_logps/rejected": -310.26434326171875,
"eval_loss": 0.6931472420692444,
"eval_rewards/accuracies": 0.0,
"eval_rewards/chosen": -0.037857454270124435,
"eval_rewards/margins": 0.0,
"eval_rewards/rejected": -0.037857454270124435,
"eval_runtime": 2669.2359,
"eval_samples_per_second": 2.282,
"eval_steps_per_second": 0.285,
"step": 1800
},
{
"epoch": 0.5283514558855725,
"grad_norm": 0.01373291015625,
"learning_rate": 2.688398328893561e-06,
"logits/chosen": -2.4216887950897217,
"logits/rejected": -2.4216887950897217,
"logps/chosen": -307.491455078125,
"logps/rejected": -307.491455078125,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.03987189009785652,
"rewards/margins": 0.0,
"rewards/rejected": -0.03987189009785652,
"step": 1810
},
{
"epoch": 0.5312705247026198,
"grad_norm": 0.013916015625,
"learning_rate": 2.6629780520838526e-06,
"logits/chosen": -2.389004945755005,
"logits/rejected": -2.389004945755005,
"logps/chosen": -314.912353515625,
"logps/rejected": -314.912353515625,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.03697946295142174,
"rewards/margins": 0.0,
"rewards/rejected": -0.03697946295142174,
"step": 1820
},
{
"epoch": 0.5341895935196672,
"grad_norm": 0.016845703125,
"learning_rate": 2.637540841263088e-06,
"logits/chosen": -2.4251251220703125,
"logits/rejected": -2.4251251220703125,
"logps/chosen": -309.82611083984375,
"logps/rejected": -309.82611083984375,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.041506171226501465,
"rewards/margins": 0.0,
"rewards/rejected": -0.041506171226501465,
"step": 1830
},
{
"epoch": 0.5371086623367146,
"grad_norm": 0.0130615234375,
"learning_rate": 2.6120893394497825e-06,
"logits/chosen": -2.4095826148986816,
"logits/rejected": -2.4095826148986816,
"logps/chosen": -290.29876708984375,
"logps/rejected": -290.29876708984375,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.03885159641504288,
"rewards/margins": 0.0,
"rewards/rejected": -0.03885159641504288,
"step": 1840
},
{
"epoch": 0.5400277311537619,
"grad_norm": 0.0203857421875,
"learning_rate": 2.586626191147337e-06,
"logits/chosen": -2.414461612701416,
"logits/rejected": -2.414461612701416,
"logps/chosen": -298.74444580078125,
"logps/rejected": -298.74444580078125,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.035465486347675323,
"rewards/margins": 0.0,
"rewards/rejected": -0.035465486347675323,
"step": 1850
},
{
"epoch": 0.5429467999708093,
"grad_norm": 0.0142822265625,
"learning_rate": 2.5611540420692666e-06,
"logits/chosen": -2.4189705848693848,
"logits/rejected": -2.4189705848693848,
"logps/chosen": -361.6686706542969,
"logps/rejected": -361.6686706542969,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.04054202139377594,
"rewards/margins": 0.0,
"rewards/rejected": -0.04054202139377594,
"step": 1860
},
{
"epoch": 0.5458658687878567,
"grad_norm": 0.01446533203125,
"learning_rate": 2.5356755388642973e-06,
"logits/chosen": -2.4053876399993896,
"logits/rejected": -2.4053876399993896,
"logps/chosen": -290.9534606933594,
"logps/rejected": -290.9534606933594,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.037081241607666016,
"rewards/margins": 0.0,
"rewards/rejected": -0.037081241607666016,
"step": 1870
},
{
"epoch": 0.548784937604904,
"grad_norm": 0.01611328125,
"learning_rate": 2.510193328841375e-06,
"logits/chosen": -2.4209909439086914,
"logits/rejected": -2.4209909439086914,
"logps/chosen": -304.0765075683594,
"logps/rejected": -304.0765075683594,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.03415703400969505,
"rewards/margins": 0.0,
"rewards/rejected": -0.03415703400969505,
"step": 1880
},
{
"epoch": 0.5517040064219514,
"grad_norm": 0.0164794921875,
"learning_rate": 2.484710059694594e-06,
"logits/chosen": -2.4459662437438965,
"logits/rejected": -2.4459662437438965,
"logps/chosen": -274.7349548339844,
"logps/rejected": -274.7349548339844,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.03464942425489426,
"rewards/margins": 0.0,
"rewards/rejected": -0.03464942425489426,
"step": 1890
},
{
"epoch": 0.5546230752389988,
"grad_norm": 0.01348876953125,
"learning_rate": 2.4592283792280977e-06,
"logits/chosen": -2.384141206741333,
"logits/rejected": -2.384141206741333,
"logps/chosen": -293.96533203125,
"logps/rejected": -293.96533203125,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.04034542292356491,
"rewards/margins": 0.0,
"rewards/rejected": -0.04034542292356491,
"step": 1900
},
{
"epoch": 0.5546230752389988,
"eval_logits/chosen": -2.3927481174468994,
"eval_logits/rejected": -2.3927481174468994,
"eval_logps/chosen": -310.4163818359375,
"eval_logps/rejected": -310.4163818359375,
"eval_loss": 0.6931472420692444,
"eval_rewards/accuracies": 0.0,
"eval_rewards/chosen": -0.03937768191099167,
"eval_rewards/margins": 0.0,
"eval_rewards/rejected": -0.03937768191099167,
"eval_runtime": 2717.9911,
"eval_samples_per_second": 2.241,
"eval_steps_per_second": 0.28,
"step": 1900
},
{
"epoch": 0.5575421440560461,
"grad_norm": 0.01123046875,
"learning_rate": 2.433750935080959e-06,
"logits/chosen": -2.438390016555786,
"logits/rejected": -2.438390016555786,
"logps/chosen": -282.78106689453125,
"logps/rejected": -282.78106689453125,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.05149908736348152,
"rewards/margins": 0.0,
"rewards/rejected": -0.05149908736348152,
"step": 1910
},
{
"epoch": 0.5604612128730935,
"grad_norm": 0.011962890625,
"learning_rate": 2.408280374452083e-06,
"logits/chosen": -2.4534342288970947,
"logits/rejected": -2.4534342288970947,
"logps/chosen": -306.63946533203125,
"logps/rejected": -306.63946533203125,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.04204695671796799,
"rewards/margins": 0.0,
"rewards/rejected": -0.04204695671796799,
"step": 1920
},
{
"epoch": 0.5633802816901409,
"grad_norm": 0.01385498046875,
"learning_rate": 2.3828193438251497e-06,
"logits/chosen": -2.4302496910095215,
"logits/rejected": -2.4302496910095215,
"logps/chosen": -328.1105651855469,
"logps/rejected": -328.1105651855469,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.03839876502752304,
"rewards/margins": 0.0,
"rewards/rejected": -0.03839876502752304,
"step": 1930
},
{
"epoch": 0.5662993505071882,
"grad_norm": 0.01513671875,
"learning_rate": 2.3573704886936414e-06,
"logits/chosen": -2.4566609859466553,
"logits/rejected": -2.4566609859466553,
"logps/chosen": -314.76910400390625,
"logps/rejected": -314.76910400390625,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.04071163386106491,
"rewards/margins": 0.0,
"rewards/rejected": -0.04071163386106491,
"step": 1940
},
{
"epoch": 0.5692184193242356,
"grad_norm": 0.01397705078125,
"learning_rate": 2.331936453285957e-06,
"logits/chosen": -2.414055109024048,
"logits/rejected": -2.414055109024048,
"logps/chosen": -346.7576904296875,
"logps/rejected": -346.7576904296875,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.036326270550489426,
"rewards/margins": 0.0,
"rewards/rejected": -0.036326270550489426,
"step": 1950
},
{
"epoch": 0.572137488141283,
"grad_norm": 0.0157470703125,
"learning_rate": 2.3065198802906767e-06,
"logits/chosen": -2.4286112785339355,
"logits/rejected": -2.4286112785339355,
"logps/chosen": -339.60064697265625,
"logps/rejected": -339.60064697265625,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.04548191279172897,
"rewards/margins": 0.0,
"rewards/rejected": -0.04548191279172897,
"step": 1960
},
{
"epoch": 0.5750565569583302,
"grad_norm": 0.01141357421875,
"learning_rate": 2.2811234105819714e-06,
"logits/chosen": -2.4342637062072754,
"logits/rejected": -2.4342637062072754,
"logps/chosen": -314.4915771484375,
"logps/rejected": -314.4915771484375,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.03697306662797928,
"rewards/margins": 0.0,
"rewards/rejected": -0.03697306662797928,
"step": 1970
},
{
"epoch": 0.5779756257753776,
"grad_norm": 0.01495361328125,
"learning_rate": 2.2557496829452056e-06,
"logits/chosen": -2.387324810028076,
"logits/rejected": -2.387324810028076,
"logps/chosen": -349.37835693359375,
"logps/rejected": -349.37835693359375,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.04250973090529442,
"rewards/margins": 0.0,
"rewards/rejected": -0.04250973090529442,
"step": 1980
},
{
"epoch": 0.580894694592425,
"grad_norm": 0.0152587890625,
"learning_rate": 2.230401333802763e-06,
"logits/chosen": -2.412137985229492,
"logits/rejected": -2.412137985229492,
"logps/chosen": -310.9895324707031,
"logps/rejected": -310.9895324707031,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.040226660668849945,
"rewards/margins": 0.0,
"rewards/rejected": -0.040226660668849945,
"step": 1990
},
{
"epoch": 0.5838137634094723,
"grad_norm": 0.01483154296875,
"learning_rate": 2.205080996940108e-06,
"logits/chosen": -2.4124810695648193,
"logits/rejected": -2.4124810695648193,
"logps/chosen": -273.5890197753906,
"logps/rejected": -273.5890197753906,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.04197770729660988,
"rewards/margins": 0.0,
"rewards/rejected": -0.04197770729660988,
"step": 2000
},
{
"epoch": 0.5838137634094723,
"eval_logits/chosen": -2.392037868499756,
"eval_logits/rejected": -2.392037868499756,
"eval_logps/chosen": -310.4427185058594,
"eval_logps/rejected": -310.4427185058594,
"eval_loss": 0.6931472420692444,
"eval_rewards/accuracies": 0.0,
"eval_rewards/chosen": -0.039641354233026505,
"eval_rewards/margins": 0.0,
"eval_rewards/rejected": -0.039641354233026505,
"eval_runtime": 2711.551,
"eval_samples_per_second": 2.246,
"eval_steps_per_second": 0.281,
"step": 2000
},
{
"epoch": 0.5867328322265197,
"grad_norm": 0.01214599609375,
"learning_rate": 2.1797913032321283e-06,
"logits/chosen": -2.420572519302368,
"logits/rejected": -2.420572519302368,
"logps/chosen": -277.4279479980469,
"logps/rejected": -277.4279479980469,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.03539072722196579,
"rewards/margins": 0.0,
"rewards/rejected": -0.03539072722196579,
"step": 2010
},
{
"epoch": 0.5896519010435671,
"grad_norm": 0.0157470703125,
"learning_rate": 2.1545348803697745e-06,
"logits/chosen": -2.4433321952819824,
"logits/rejected": -2.4433321952819824,
"logps/chosen": -281.5128479003906,
"logps/rejected": -281.5128479003906,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.016543438658118248,
"rewards/margins": 0.0,
"rewards/rejected": 0.016543438658118248,
"step": 2020
},
{
"epoch": 0.5925709698606144,
"grad_norm": 0.015869140625,
"learning_rate": 2.1293143525870396e-06,
"logits/chosen": -2.435228109359741,
"logits/rejected": -2.435228109359741,
"logps/chosen": -315.1198425292969,
"logps/rejected": -315.1198425292969,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.04324204847216606,
"rewards/margins": 0.0,
"rewards/rejected": -0.04324204847216606,
"step": 2030
},
{
"epoch": 0.5954900386776618,
"grad_norm": 0.0133056640625,
"learning_rate": 2.1041323403882836e-06,
"logits/chosen": -2.458317995071411,
"logits/rejected": -2.458317995071411,
"logps/chosen": -314.63482666015625,
"logps/rejected": -314.63482666015625,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.039002105593681335,
"rewards/margins": 0.0,
"rewards/rejected": -0.039002105593681335,
"step": 2040
},
{
"epoch": 0.5984091074947092,
"grad_norm": 0.0164794921875,
"learning_rate": 2.078991460275958e-06,
"logits/chosen": -2.4496326446533203,
"logits/rejected": -2.4496326446533203,
"logps/chosen": -295.86199951171875,
"logps/rejected": -295.86199951171875,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.03856682404875755,
"rewards/margins": 0.0,
"rewards/rejected": -0.03856682404875755,
"step": 2050
},
{
"epoch": 0.6013281763117565,
"grad_norm": 0.01409912109375,
"learning_rate": 2.0538943244787452e-06,
"logits/chosen": -2.440256118774414,
"logits/rejected": -2.440256118774414,
"logps/chosen": -302.68463134765625,
"logps/rejected": -302.68463134765625,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.040991030633449554,
"rewards/margins": 0.0,
"rewards/rejected": -0.040991030633449554,
"step": 2060
},
{
"epoch": 0.6042472451288039,
"grad_norm": 0.01226806640625,
"learning_rate": 2.0288435406801293e-06,
"logits/chosen": -2.4207422733306885,
"logits/rejected": -2.4207422733306885,
"logps/chosen": -347.23297119140625,
"logps/rejected": -347.23297119140625,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.03826383873820305,
"rewards/margins": 0.0,
"rewards/rejected": -0.03826383873820305,
"step": 2070
},
{
"epoch": 0.6071663139458513,
"grad_norm": 0.01275634765625,
"learning_rate": 2.0038417117474574e-06,
"logits/chosen": -2.4277267456054688,
"logits/rejected": -2.4277267456054688,
"logps/chosen": -314.09674072265625,
"logps/rejected": -314.09674072265625,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.05563684552907944,
"rewards/margins": 0.0,
"rewards/rejected": -0.05563684552907944,
"step": 2080
},
{
"epoch": 0.6100853827628986,
"grad_norm": 0.01251220703125,
"learning_rate": 1.9788914354614853e-06,
"logits/chosen": -2.4430274963378906,
"logits/rejected": -2.4430274963378906,
"logps/chosen": -280.791015625,
"logps/rejected": -280.791015625,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.039162371307611465,
"rewards/margins": 0.0,
"rewards/rejected": -0.039162371307611465,
"step": 2090
},
{
"epoch": 0.613004451579946,
"grad_norm": 0.0159912109375,
"learning_rate": 1.9539953042464656e-06,
"logits/chosen": -2.4126973152160645,
"logits/rejected": -2.4126973152160645,
"logps/chosen": -341.8514709472656,
"logps/rejected": -341.8514709472656,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.04444648697972298,
"rewards/margins": 0.0,
"rewards/rejected": -0.04444648697972298,
"step": 2100
},
{
"epoch": 0.613004451579946,
"eval_logits/chosen": -2.390094041824341,
"eval_logits/rejected": -2.390094041824341,
"eval_logps/chosen": -310.71502685546875,
"eval_logps/rejected": -310.71502685546875,
"eval_loss": 0.6931472420692444,
"eval_rewards/accuracies": 0.0,
"eval_rewards/chosen": -0.042364299297332764,
"eval_rewards/margins": 0.0,
"eval_rewards/rejected": -0.042364299297332764,
"eval_runtime": 2698.9127,
"eval_samples_per_second": 2.256,
"eval_steps_per_second": 0.282,
"step": 2100
},
{
"epoch": 0.6159235203969934,
"grad_norm": 0.0126953125,
"learning_rate": 1.929155904900778e-06,
"logits/chosen": -2.442920207977295,
"logits/rejected": -2.442920207977295,
"logps/chosen": -336.13153076171875,
"logps/rejected": -336.13153076171875,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.04605261981487274,
"rewards/margins": 0.0,
"rewards/rejected": -0.04605261981487274,
"step": 2110
},
{
"epoch": 0.6188425892140407,
"grad_norm": 0.0128173828125,
"learning_rate": 1.9043758183281548e-06,
"logits/chosen": -2.398139476776123,
"logits/rejected": -2.398139476776123,
"logps/chosen": -297.93353271484375,
"logps/rejected": -297.93353271484375,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.03661734238266945,
"rewards/margins": 0.0,
"rewards/rejected": -0.03661734238266945,
"step": 2120
},
{
"epoch": 0.6217616580310881,
"grad_norm": 0.0162353515625,
"learning_rate": 1.8796576192695198e-06,
"logits/chosen": -2.4115586280822754,
"logits/rejected": -2.4115586280822754,
"logps/chosen": -283.5032653808594,
"logps/rejected": -283.5032653808594,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.05024952441453934,
"rewards/margins": 0.0,
"rewards/rejected": -0.05024952441453934,
"step": 2130
},
{
"epoch": 0.6246807268481355,
"grad_norm": 0.01611328125,
"learning_rate": 1.8550038760354559e-06,
"logits/chosen": -2.4140570163726807,
"logits/rejected": -2.4140570163726807,
"logps/chosen": -328.29241943359375,
"logps/rejected": -328.29241943359375,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.03783569857478142,
"rewards/margins": 0.0,
"rewards/rejected": -0.03783569857478142,
"step": 2140
},
{
"epoch": 0.6275997956651828,
"grad_norm": 0.01470947265625,
"learning_rate": 1.8304171502393542e-06,
"logits/chosen": -2.4498252868652344,
"logits/rejected": -2.4498252868652344,
"logps/chosen": -333.46807861328125,
"logps/rejected": -333.46807861328125,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.05009561777114868,
"rewards/margins": 0.0,
"rewards/rejected": -0.05009561777114868,
"step": 2150
},
{
"epoch": 0.6305188644822302,
"grad_norm": 0.0198974609375,
"learning_rate": 1.8058999965312484e-06,
"logits/chosen": -2.3965957164764404,
"logits/rejected": -2.3965957164764404,
"logps/chosen": -306.3211669921875,
"logps/rejected": -306.3211669921875,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.04475449398159981,
"rewards/margins": 0.0,
"rewards/rejected": -0.04475449398159981,
"step": 2160
},
{
"epoch": 0.6334379332992776,
"grad_norm": 0.016357421875,
"learning_rate": 1.7814549623323828e-06,
"logits/chosen": -2.400684356689453,
"logits/rejected": -2.400684356689453,
"logps/chosen": -286.625,
"logps/rejected": -286.625,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.043368883430957794,
"rewards/margins": 0.0,
"rewards/rejected": -0.043368883430957794,
"step": 2170
},
{
"epoch": 0.6363570021163248,
"grad_norm": 0.01531982421875,
"learning_rate": 1.7570845875705205e-06,
"logits/chosen": -2.4366753101348877,
"logits/rejected": -2.4366753101348877,
"logps/chosen": -338.27679443359375,
"logps/rejected": -338.27679443359375,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.05562018230557442,
"rewards/margins": 0.0,
"rewards/rejected": -0.05562018230557442,
"step": 2180
},
{
"epoch": 0.6392760709333722,
"grad_norm": 0.0162353515625,
"learning_rate": 1.7327914044160388e-06,
"logits/chosen": -2.449612617492676,
"logits/rejected": -2.449612617492676,
"logps/chosen": -316.91766357421875,
"logps/rejected": -316.91766357421875,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.04444233328104019,
"rewards/margins": 0.0,
"rewards/rejected": -0.04444233328104019,
"step": 2190
},
{
"epoch": 0.6421951397504196,
"grad_norm": 0.0145263671875,
"learning_rate": 1.7085779370188276e-06,
"logits/chosen": -2.3980746269226074,
"logits/rejected": -2.3980746269226074,
"logps/chosen": -308.85906982421875,
"logps/rejected": -308.85906982421875,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.0481327660381794,
"rewards/margins": 0.0,
"rewards/rejected": -0.0481327660381794,
"step": 2200
},
{
"epoch": 0.6421951397504196,
"eval_logits/chosen": -2.3910679817199707,
"eval_logits/rejected": -2.3910679817199707,
"eval_logps/chosen": -311.0310363769531,
"eval_logps/rejected": -311.0310363769531,
"eval_loss": 0.6931472420692444,
"eval_rewards/accuracies": 0.0,
"eval_rewards/chosen": -0.04552413523197174,
"eval_rewards/margins": 0.0,
"eval_rewards/rejected": -0.04552413523197174,
"eval_runtime": 2707.3295,
"eval_samples_per_second": 2.249,
"eval_steps_per_second": 0.281,
"step": 2200
},
{
"epoch": 0.6451142085674669,
"grad_norm": 0.016845703125,
"learning_rate": 1.6844467012460193e-06,
"logits/chosen": -2.429086446762085,
"logits/rejected": -2.429086446762085,
"logps/chosen": -306.8155822753906,
"logps/rejected": -306.8155822753906,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.04282836988568306,
"rewards/margins": 0.0,
"rewards/rejected": -0.04282836988568306,
"step": 2210
},
{
"epoch": 0.6480332773845143,
"grad_norm": 0.014404296875,
"learning_rate": 1.6604002044205825e-06,
"logits/chosen": -2.4325811862945557,
"logits/rejected": -2.4325811862945557,
"logps/chosen": -337.0578308105469,
"logps/rejected": -337.0578308105469,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.04457175359129906,
"rewards/margins": 0.0,
"rewards/rejected": -0.04457175359129906,
"step": 2220
},
{
"epoch": 0.6509523462015617,
"grad_norm": 0.01397705078125,
"learning_rate": 1.6364409450608018e-06,
"logits/chosen": -2.4428985118865967,
"logits/rejected": -2.4428985118865967,
"logps/chosen": -308.55657958984375,
"logps/rejected": -308.55657958984375,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.04731472209095955,
"rewards/margins": 0.0,
"rewards/rejected": -0.04731472209095955,
"step": 2230
},
{
"epoch": 0.653871415018609,
"grad_norm": 0.013427734375,
"learning_rate": 1.6125714126206736e-06,
"logits/chosen": -2.4196009635925293,
"logits/rejected": -2.4196009635925293,
"logps/chosen": -348.8056335449219,
"logps/rejected": -348.8056335449219,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.049455009400844574,
"rewards/margins": 0.0,
"rewards/rejected": -0.049455009400844574,
"step": 2240
},
{
"epoch": 0.6567904838356564,
"grad_norm": 0.01556396484375,
"learning_rate": 1.5887940872312391e-06,
"logits/chosen": -2.4100897312164307,
"logits/rejected": -2.4100897312164307,
"logps/chosen": -320.3233642578125,
"logps/rejected": -320.3233642578125,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.04676957428455353,
"rewards/margins": 0.0,
"rewards/rejected": -0.04676957428455353,
"step": 2250
},
{
"epoch": 0.6597095526527038,
"grad_norm": 0.0147705078125,
"learning_rate": 1.5651114394428955e-06,
"logits/chosen": -2.4624266624450684,
"logits/rejected": -2.4624266624450684,
"logps/chosen": -344.6718444824219,
"logps/rejected": -344.6718444824219,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.0538489893078804,
"rewards/margins": 0.0,
"rewards/rejected": -0.0538489893078804,
"step": 2260
},
{
"epoch": 0.6626286214697511,
"grad_norm": 0.01251220703125,
"learning_rate": 1.5415259299686903e-06,
"logits/chosen": -2.4147191047668457,
"logits/rejected": -2.4147191047668457,
"logps/chosen": -316.6529235839844,
"logps/rejected": -316.6529235839844,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.041484713554382324,
"rewards/margins": 0.0,
"rewards/rejected": -0.041484713554382324,
"step": 2270
},
{
"epoch": 0.6655476902867985,
"grad_norm": 0.01348876953125,
"learning_rate": 1.5180400094286496e-06,
"logits/chosen": -2.440053939819336,
"logits/rejected": -2.440053939819336,
"logps/chosen": -309.5370178222656,
"logps/rejected": -309.5370178222656,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.04570756107568741,
"rewards/margins": 0.0,
"rewards/rejected": -0.04570756107568741,
"step": 2280
},
{
"epoch": 0.6684667591038459,
"grad_norm": 0.017822265625,
"learning_rate": 1.494656118095149e-06,
"logits/chosen": -2.407764434814453,
"logits/rejected": -2.407764434814453,
"logps/chosen": -320.51263427734375,
"logps/rejected": -320.51263427734375,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.04706931859254837,
"rewards/margins": 0.0,
"rewards/rejected": -0.04706931859254837,
"step": 2290
},
{
"epoch": 0.6713858279208932,
"grad_norm": 0.0120849609375,
"learning_rate": 1.4713766856393557e-06,
"logits/chosen": -2.420919895172119,
"logits/rejected": -2.420919895172119,
"logps/chosen": -295.04547119140625,
"logps/rejected": -295.04547119140625,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.05071335285902023,
"rewards/margins": 0.0,
"rewards/rejected": -0.05071335285902023,
"step": 2300
},
{
"epoch": 0.6713858279208932,
"eval_logits/chosen": -2.391244411468506,
"eval_logits/rejected": -2.391244411468506,
"eval_logps/chosen": -310.7880554199219,
"eval_logps/rejected": -310.7880554199219,
"eval_loss": 0.6931472420692444,
"eval_rewards/accuracies": 0.0,
"eval_rewards/chosen": -0.04309455305337906,
"eval_rewards/margins": 0.0,
"eval_rewards/rejected": -0.04309455305337906,
"eval_runtime": 2670.058,
"eval_samples_per_second": 2.281,
"eval_steps_per_second": 0.285,
"step": 2300
},
{
"epoch": 0.6743048967379406,
"grad_norm": 0.0198974609375,
"learning_rate": 1.448204130878785e-06,
"logits/chosen": -2.3968968391418457,
"logits/rejected": -2.3968968391418457,
"logps/chosen": -287.2406005859375,
"logps/rejected": -287.2406005859375,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.04974811524152756,
"rewards/margins": 0.0,
"rewards/rejected": -0.04974811524152756,
"step": 2310
},
{
"epoch": 0.677223965554988,
"grad_norm": 0.013916015625,
"learning_rate": 1.425140861525967e-06,
"logits/chosen": -2.407982587814331,
"logits/rejected": -2.407982587814331,
"logps/chosen": -346.8302307128906,
"logps/rejected": -346.8302307128906,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.045040082186460495,
"rewards/margins": 0.0,
"rewards/rejected": -0.045040082186460495,
"step": 2320
},
{
"epoch": 0.6801430343720353,
"grad_norm": 0.01531982421875,
"learning_rate": 1.4021892739382853e-06,
"logits/chosen": -2.4366557598114014,
"logits/rejected": -2.4366557598114014,
"logps/chosen": -315.5507507324219,
"logps/rejected": -315.5507507324219,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.053034014999866486,
"rewards/margins": 0.0,
"rewards/rejected": -0.053034014999866486,
"step": 2330
},
{
"epoch": 0.6830621031890827,
"grad_norm": 0.013916015625,
"learning_rate": 1.3793517528689804e-06,
"logits/chosen": -2.40993070602417,
"logits/rejected": -2.40993070602417,
"logps/chosen": -322.5754699707031,
"logps/rejected": -322.5754699707031,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.04859765246510506,
"rewards/margins": 0.0,
"rewards/rejected": -0.04859765246510506,
"step": 2340
},
{
"epoch": 0.6859811720061301,
"grad_norm": 0.0167236328125,
"learning_rate": 1.3566306712193704e-06,
"logits/chosen": -2.4204134941101074,
"logits/rejected": -2.4204134941101074,
"logps/chosen": -349.4993896484375,
"logps/rejected": -349.4993896484375,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.05103806406259537,
"rewards/margins": 0.0,
"rewards/rejected": -0.05103806406259537,
"step": 2350
},
{
"epoch": 0.6889002408231774,
"grad_norm": 0.01531982421875,
"learning_rate": 1.3340283897922911e-06,
"logits/chosen": -2.4295237064361572,
"logits/rejected": -2.4295237064361572,
"logps/chosen": -330.99005126953125,
"logps/rejected": -330.99005126953125,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.04879484325647354,
"rewards/margins": 0.0,
"rewards/rejected": -0.04879484325647354,
"step": 2360
},
{
"epoch": 0.6918193096402248,
"grad_norm": 0.0146484375,
"learning_rate": 1.3115472570468058e-06,
"logits/chosen": -2.4285712242126465,
"logits/rejected": -2.4285712242126465,
"logps/chosen": -336.67364501953125,
"logps/rejected": -336.67364501953125,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.04440216347575188,
"rewards/margins": 0.0,
"rewards/rejected": -0.04440216347575188,
"step": 2370
},
{
"epoch": 0.6947383784572722,
"grad_norm": 0.0162353515625,
"learning_rate": 1.2891896088541928e-06,
"logits/chosen": -2.405956745147705,
"logits/rejected": -2.405956745147705,
"logps/chosen": -338.88739013671875,
"logps/rejected": -338.88739013671875,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.047105275094509125,
"rewards/margins": 0.0,
"rewards/rejected": -0.047105275094509125,
"step": 2380
},
{
"epoch": 0.6976574472743194,
"grad_norm": 0.0169677734375,
"learning_rate": 1.266957768255232e-06,
"logits/chosen": -2.422194719314575,
"logits/rejected": -2.422194719314575,
"logps/chosen": -318.286865234375,
"logps/rejected": -318.286865234375,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.04666005074977875,
"rewards/margins": 0.0,
"rewards/rejected": -0.04666005074977875,
"step": 2390
},
{
"epoch": 0.7005765160913668,
"grad_norm": 0.0142822265625,
"learning_rate": 1.2448540452188432e-06,
"logits/chosen": -2.3955206871032715,
"logits/rejected": -2.3955206871032715,
"logps/chosen": -314.3586120605469,
"logps/rejected": -314.3586120605469,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.04044215753674507,
"rewards/margins": 0.0,
"rewards/rejected": -0.04044215753674507,
"step": 2400
},
{
"epoch": 0.7005765160913668,
"eval_logits/chosen": -2.3899266719818115,
"eval_logits/rejected": -2.3899266719818115,
"eval_logps/chosen": -310.6455078125,
"eval_logps/rejected": -310.6455078125,
"eval_loss": 0.6931472420692444,
"eval_rewards/accuracies": 0.0,
"eval_rewards/chosen": -0.04166920483112335,
"eval_rewards/margins": 0.0,
"eval_rewards/rejected": -0.04166920483112335,
"eval_runtime": 2668.7744,
"eval_samples_per_second": 2.282,
"eval_steps_per_second": 0.286,
"step": 2400
},
{
"epoch": 0.7034955849084142,
"grad_norm": 0.01446533203125,
"learning_rate": 1.2228807364020617e-06,
"logits/chosen": -2.4090027809143066,
"logits/rejected": -2.4090027809143066,
"logps/chosen": -268.48944091796875,
"logps/rejected": -268.48944091796875,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.037643421441316605,
"rewards/margins": 0.0,
"rewards/rejected": -0.037643421441316605,
"step": 2410
},
{
"epoch": 0.7064146537254615,
"grad_norm": 0.012451171875,
"learning_rate": 1.2010401249114166e-06,
"logits/chosen": -2.4060184955596924,
"logits/rejected": -2.4060184955596924,
"logps/chosen": -338.2677001953125,
"logps/rejected": -338.2677001953125,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.035085879266262054,
"rewards/margins": 0.0,
"rewards/rejected": -0.035085879266262054,
"step": 2420
},
{
"epoch": 0.7093337225425089,
"grad_norm": 0.0206298828125,
"learning_rate": 1.1793344800656995e-06,
"logits/chosen": -2.3857572078704834,
"logits/rejected": -2.3857572078704834,
"logps/chosen": -325.4837646484375,
"logps/rejected": -325.4837646484375,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.03704181686043739,
"rewards/margins": 0.0,
"rewards/rejected": -0.03704181686043739,
"step": 2430
},
{
"epoch": 0.7122527913595563,
"grad_norm": 0.01544189453125,
"learning_rate": 1.1577660571601796e-06,
"logits/chosen": -2.396127223968506,
"logits/rejected": -2.396127223968506,
"logps/chosen": -321.38897705078125,
"logps/rejected": -321.38897705078125,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.0452260822057724,
"rewards/margins": 0.0,
"rewards/rejected": -0.0452260822057724,
"step": 2440
},
{
"epoch": 0.7151718601766036,
"grad_norm": 0.0137939453125,
"learning_rate": 1.1363370972322694e-06,
"logits/chosen": -2.4177489280700684,
"logits/rejected": -2.4177489280700684,
"logps/chosen": -296.6512756347656,
"logps/rejected": -296.6512756347656,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.04763947054743767,
"rewards/margins": 0.0,
"rewards/rejected": -0.04763947054743767,
"step": 2450
},
{
"epoch": 0.718090928993651,
"grad_norm": 0.0142822265625,
"learning_rate": 1.115049826828669e-06,
"logits/chosen": -2.4321625232696533,
"logits/rejected": -2.4321625232696533,
"logps/chosen": -306.14141845703125,
"logps/rejected": -306.14141845703125,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.04333222657442093,
"rewards/margins": 0.0,
"rewards/rejected": -0.04333222657442093,
"step": 2460
},
{
"epoch": 0.7210099978106984,
"grad_norm": 0.01483154296875,
"learning_rate": 1.0939064577740266e-06,
"logits/chosen": -2.4054694175720215,
"logits/rejected": -2.4054694175720215,
"logps/chosen": -301.36334228515625,
"logps/rejected": -301.36334228515625,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.041240572929382324,
"rewards/margins": 0.0,
"rewards/rejected": -0.041240572929382324,
"step": 2470
},
{
"epoch": 0.7239290666277457,
"grad_norm": 0.0159912109375,
"learning_rate": 1.0729091869411137e-06,
"logits/chosen": -2.4020252227783203,
"logits/rejected": -2.4020252227783203,
"logps/chosen": -332.1387023925781,
"logps/rejected": -332.1387023925781,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.043921731412410736,
"rewards/margins": 0.0,
"rewards/rejected": -0.043921731412410736,
"step": 2480
},
{
"epoch": 0.7268481354447931,
"grad_norm": 0.013427734375,
"learning_rate": 1.0520601960225708e-06,
"logits/chosen": -2.421534299850464,
"logits/rejected": -2.421534299850464,
"logps/chosen": -314.00311279296875,
"logps/rejected": -314.00311279296875,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.01183334831148386,
"rewards/margins": 0.0,
"rewards/rejected": -0.01183334831148386,
"step": 2490
},
{
"epoch": 0.7297672042618405,
"grad_norm": 0.020751953125,
"learning_rate": 1.0313616513042133e-06,
"logits/chosen": -2.4747350215911865,
"logits/rejected": -2.4747350215911865,
"logps/chosen": -319.47918701171875,
"logps/rejected": -319.47918701171875,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.055976878851652145,
"rewards/margins": 0.0,
"rewards/rejected": -0.055976878851652145,
"step": 2500
},
{
"epoch": 0.7297672042618405,
"eval_logits/chosen": -2.3914709091186523,
"eval_logits/rejected": -2.3914709091186523,
"eval_logps/chosen": -310.819580078125,
"eval_logps/rejected": -310.819580078125,
"eval_loss": 0.6931472420692444,
"eval_rewards/accuracies": 0.0,
"eval_rewards/chosen": -0.04341000318527222,
"eval_rewards/margins": 0.0,
"eval_rewards/rejected": -0.04341000318527222,
"eval_runtime": 2669.3967,
"eval_samples_per_second": 2.281,
"eval_steps_per_second": 0.285,
"step": 2500
},
{
"epoch": 0.7326862730788878,
"grad_norm": 0.0145263671875,
"learning_rate": 1.0108157034399532e-06,
"logits/chosen": -2.4052977561950684,
"logits/rejected": -2.4052977561950684,
"logps/chosen": -298.67474365234375,
"logps/rejected": -298.67474365234375,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.04686864838004112,
"rewards/margins": 0.0,
"rewards/rejected": -0.04686864838004112,
"step": 2510
},
{
"epoch": 0.7356053418959352,
"grad_norm": 0.0179443359375,
"learning_rate": 9.90424487228334e-07,
"logits/chosen": -2.411712646484375,
"logits/rejected": -2.411712646484375,
"logps/chosen": -322.70428466796875,
"logps/rejected": -322.70428466796875,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.043312918394804,
"rewards/margins": 0.0,
"rewards/rejected": -0.043312918394804,
"step": 2520
},
{
"epoch": 0.7385244107129826,
"grad_norm": 0.01611328125,
"learning_rate": 9.701901213907192e-07,
"logits/chosen": -2.4330382347106934,
"logits/rejected": -2.4330382347106934,
"logps/chosen": -324.5224609375,
"logps/rejected": -324.5224609375,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.05447854846715927,
"rewards/margins": 0.0,
"rewards/rejected": -0.05447854846715927,
"step": 2530
},
{
"epoch": 0.7414434795300299,
"grad_norm": 0.01416015625,
"learning_rate": 9.501147083511511e-07,
"logits/chosen": -2.45332407951355,
"logits/rejected": -2.45332407951355,
"logps/chosen": -321.7140808105469,
"logps/rejected": -321.7140808105469,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.0516216978430748,
"rewards/margins": 0.0,
"rewards/rejected": -0.0516216978430748,
"step": 2540
},
{
"epoch": 0.7443625483470773,
"grad_norm": 0.015625,
"learning_rate": 9.302003340178962e-07,
"logits/chosen": -2.417236804962158,
"logits/rejected": -2.417236804962158,
"logps/chosen": -333.95574951171875,
"logps/rejected": -333.95574951171875,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.0465201810002327,
"rewards/margins": 0.0,
"rewards/rejected": -0.0465201810002327,
"step": 2550
},
{
"epoch": 0.7472816171641247,
"grad_norm": 0.01422119140625,
"learning_rate": 9.10449067566718e-07,
"logits/chosen": -2.459394931793213,
"logits/rejected": -2.459394931793213,
"logps/chosen": -303.9725646972656,
"logps/rejected": -303.9725646972656,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.04736841470003128,
"rewards/margins": 0.0,
"rewards/rejected": -0.04736841470003128,
"step": 2560
},
{
"epoch": 0.750200685981172,
"grad_norm": 0.01513671875,
"learning_rate": 8.908629612258765e-07,
"logits/chosen": -2.435121774673462,
"logits/rejected": -2.435121774673462,
"logps/chosen": -300.51055908203125,
"logps/rejected": -300.51055908203125,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.04969844967126846,
"rewards/margins": 0.0,
"rewards/rejected": -0.04969844967126846,
"step": 2570
},
{
"epoch": 0.7531197547982194,
"grad_norm": 0.0145263671875,
"learning_rate": 8.714440500628999e-07,
"logits/chosen": -2.393557071685791,
"logits/rejected": -2.393557071685791,
"logps/chosen": -305.946044921875,
"logps/rejected": -305.946044921875,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.035433102399110794,
"rewards/margins": 0.0,
"rewards/rejected": -0.035433102399110794,
"step": 2580
},
{
"epoch": 0.7560388236152668,
"grad_norm": 0.01385498046875,
"learning_rate": 8.521943517731276e-07,
"logits/chosen": -2.394944667816162,
"logits/rejected": -2.394944667816162,
"logps/chosen": -329.5417175292969,
"logps/rejected": -329.5417175292969,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.029879886656999588,
"rewards/margins": 0.0,
"rewards/rejected": -0.029879886656999588,
"step": 2590
},
{
"epoch": 0.758957892432314,
"grad_norm": 0.01513671875,
"learning_rate": 8.33115866470069e-07,
"logits/chosen": -2.3986093997955322,
"logits/rejected": -2.3986093997955322,
"logps/chosen": -297.0606994628906,
"logps/rejected": -297.0606994628906,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.04173046723008156,
"rewards/margins": 0.0,
"rewards/rejected": -0.04173046723008156,
"step": 2600
},
{
"epoch": 0.758957892432314,
"eval_logits/chosen": -2.3918919563293457,
"eval_logits/rejected": -2.3918919563293457,
"eval_logps/chosen": -310.8546447753906,
"eval_logps/rejected": -310.8546447753906,
"eval_loss": 0.6931472420692444,
"eval_rewards/accuracies": 0.0,
"eval_rewards/chosen": -0.0437602661550045,
"eval_rewards/margins": 0.0,
"eval_rewards/rejected": -0.0437602661550045,
"eval_runtime": 2682.4018,
"eval_samples_per_second": 2.27,
"eval_steps_per_second": 0.284,
"step": 2600
},
{
"epoch": 0.7618769612493614,
"grad_norm": 0.01544189453125,
"learning_rate": 8.142105764775824e-07,
"logits/chosen": -2.384005546569824,
"logits/rejected": -2.384005546569824,
"logps/chosen": -327.1615295410156,
"logps/rejected": -327.1615295410156,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.051810719072818756,
"rewards/margins": 0.0,
"rewards/rejected": -0.051810719072818756,
"step": 2610
},
{
"epoch": 0.7647960300664088,
"grad_norm": 0.01458740234375,
"learning_rate": 7.954804461239054e-07,
"logits/chosen": -2.444282054901123,
"logits/rejected": -2.444282054901123,
"logps/chosen": -314.5889587402344,
"logps/rejected": -314.5889587402344,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.04727676510810852,
"rewards/margins": 0.0,
"rewards/rejected": -0.04727676510810852,
"step": 2620
},
{
"epoch": 0.7677150988834561,
"grad_norm": 0.016357421875,
"learning_rate": 7.769274215375544e-07,
"logits/chosen": -2.432978391647339,
"logits/rejected": -2.432978391647339,
"logps/chosen": -293.0484924316406,
"logps/rejected": -293.0484924316406,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.0413900688290596,
"rewards/margins": 0.0,
"rewards/rejected": -0.0413900688290596,
"step": 2630
},
{
"epoch": 0.7706341677005035,
"grad_norm": 0.01446533203125,
"learning_rate": 7.585534304451103e-07,
"logits/chosen": -2.444913387298584,
"logits/rejected": -2.444913387298584,
"logps/chosen": -330.8976135253906,
"logps/rejected": -330.8976135253906,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.043237775564193726,
"rewards/margins": 0.0,
"rewards/rejected": -0.043237775564193726,
"step": 2640
},
{
"epoch": 0.7735532365175509,
"grad_norm": 0.01312255859375,
"learning_rate": 7.403603819709288e-07,
"logits/chosen": -2.4194247722625732,
"logits/rejected": -2.4194247722625732,
"logps/chosen": -302.08465576171875,
"logps/rejected": -302.08465576171875,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.04694979637861252,
"rewards/margins": 0.0,
"rewards/rejected": -0.04694979637861252,
"step": 2650
},
{
"epoch": 0.7764723053345982,
"grad_norm": 0.014404296875,
"learning_rate": 7.223501664387664e-07,
"logits/chosen": -2.440764904022217,
"logits/rejected": -2.440764904022217,
"logps/chosen": -280.7825622558594,
"logps/rejected": -280.7825622558594,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.036083877086639404,
"rewards/margins": 0.0,
"rewards/rejected": -0.036083877086639404,
"step": 2660
},
{
"epoch": 0.7793913741516456,
"grad_norm": 0.01458740234375,
"learning_rate": 7.045246551753779e-07,
"logits/chosen": -2.4197888374328613,
"logits/rejected": -2.4197888374328613,
"logps/chosen": -323.67938232421875,
"logps/rejected": -323.67938232421875,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.043979812413454056,
"rewards/margins": 0.0,
"rewards/rejected": -0.043979812413454056,
"step": 2670
},
{
"epoch": 0.782310442968693,
"grad_norm": 0.0142822265625,
"learning_rate": 6.868857003160709e-07,
"logits/chosen": -2.470567226409912,
"logits/rejected": -2.470567226409912,
"logps/chosen": -356.6578369140625,
"logps/rejected": -356.6578369140625,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.05309978872537613,
"rewards/margins": 0.0,
"rewards/rejected": -0.05309978872537613,
"step": 2680
},
{
"epoch": 0.7852295117857403,
"grad_norm": 0.0150146484375,
"learning_rate": 6.69435134612266e-07,
"logits/chosen": -2.4125561714172363,
"logits/rejected": -2.4125561714172363,
"logps/chosen": -302.1919250488281,
"logps/rejected": -302.1919250488281,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.04638701677322388,
"rewards/margins": 0.0,
"rewards/rejected": -0.04638701677322388,
"step": 2690
},
{
"epoch": 0.7881485806027877,
"grad_norm": 0.013427734375,
"learning_rate": 6.521747712410687e-07,
"logits/chosen": -2.431802988052368,
"logits/rejected": -2.431802988052368,
"logps/chosen": -319.6323547363281,
"logps/rejected": -319.6323547363281,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.04552530124783516,
"rewards/margins": 0.0,
"rewards/rejected": -0.04552530124783516,
"step": 2700
},
{
"epoch": 0.7881485806027877,
"eval_logits/chosen": -2.3916165828704834,
"eval_logits/rejected": -2.3916165828704834,
"eval_logps/chosen": -310.8406677246094,
"eval_logps/rejected": -310.8406677246094,
"eval_loss": 0.6931472420692444,
"eval_rewards/accuracies": 0.0,
"eval_rewards/chosen": -0.04362065717577934,
"eval_rewards/margins": 0.0,
"eval_rewards/rejected": -0.04362065717577934,
"eval_runtime": 2682.1268,
"eval_samples_per_second": 2.271,
"eval_steps_per_second": 0.284,
"step": 2700
},
{
"epoch": 0.7910676494198351,
"grad_norm": 0.0250244140625,
"learning_rate": 6.351064036168708e-07,
"logits/chosen": -2.4238877296447754,
"logits/rejected": -2.4238877296447754,
"logps/chosen": -338.21759033203125,
"logps/rejected": -338.21759033203125,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.048554692417383194,
"rewards/margins": 0.0,
"rewards/rejected": -0.048554692417383194,
"step": 2710
},
{
"epoch": 0.7939867182368824,
"grad_norm": 0.01416015625,
"learning_rate": 6.182318052050102e-07,
"logits/chosen": -2.398974895477295,
"logits/rejected": -2.398974895477295,
"logps/chosen": -329.53106689453125,
"logps/rejected": -329.53106689453125,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.05249527841806412,
"rewards/margins": 0.0,
"rewards/rejected": -0.05249527841806412,
"step": 2720
},
{
"epoch": 0.7969057870539298,
"grad_norm": 0.019287109375,
"learning_rate": 6.015527293374979e-07,
"logits/chosen": -2.4338581562042236,
"logits/rejected": -2.4338581562042236,
"logps/chosen": -334.1202087402344,
"logps/rejected": -334.1202087402344,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.04841463267803192,
"rewards/margins": 0.0,
"rewards/rejected": -0.04841463267803192,
"step": 2730
},
{
"epoch": 0.7998248558709772,
"grad_norm": 0.014404296875,
"learning_rate": 5.850709090308459e-07,
"logits/chosen": -2.4255330562591553,
"logits/rejected": -2.4255330562591553,
"logps/chosen": -295.30523681640625,
"logps/rejected": -295.30523681640625,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.0433431938290596,
"rewards/margins": 0.0,
"rewards/rejected": -0.0433431938290596,
"step": 2740
},
{
"epoch": 0.8027439246880245,
"grad_norm": 0.0133056640625,
"learning_rate": 5.687880568059961e-07,
"logits/chosen": -2.3997416496276855,
"logits/rejected": -2.3997416496276855,
"logps/chosen": -314.76361083984375,
"logps/rejected": -314.76361083984375,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.04872073233127594,
"rewards/margins": 0.0,
"rewards/rejected": -0.04872073233127594,
"step": 2750
},
{
"epoch": 0.8056629935050719,
"grad_norm": 0.01422119140625,
"learning_rate": 5.527058645103842e-07,
"logits/chosen": -2.3996376991271973,
"logits/rejected": -2.3996376991271973,
"logps/chosen": -376.6802673339844,
"logps/rejected": -376.6802673339844,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.0522351935505867,
"rewards/margins": 0.0,
"rewards/rejected": -0.0522351935505867,
"step": 2760
},
{
"epoch": 0.8085820623221193,
"grad_norm": 0.0159912109375,
"learning_rate": 5.368260031421526e-07,
"logits/chosen": -2.4533755779266357,
"logits/rejected": -2.4533755779266357,
"logps/chosen": -338.7648010253906,
"logps/rejected": -338.7648010253906,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.04105439782142639,
"rewards/margins": 0.0,
"rewards/rejected": -0.04105439782142639,
"step": 2770
},
{
"epoch": 0.8115011311391666,
"grad_norm": 0.01263427734375,
"learning_rate": 5.211501226765242e-07,
"logits/chosen": -2.43373441696167,
"logits/rejected": -2.43373441696167,
"logps/chosen": -285.7012023925781,
"logps/rejected": -285.7012023925781,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.02375207468867302,
"rewards/margins": 0.0,
"rewards/rejected": -0.02375207468867302,
"step": 2780
},
{
"epoch": 0.814420199956214,
"grad_norm": 0.0184326171875,
"learning_rate": 5.056798518943678e-07,
"logits/chosen": -2.4133718013763428,
"logits/rejected": -2.4133718013763428,
"logps/chosen": -315.09210205078125,
"logps/rejected": -315.09210205078125,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.053018081933259964,
"rewards/margins": 0.0,
"rewards/rejected": -0.053018081933259964,
"step": 2790
},
{
"epoch": 0.8173392687732614,
"grad_norm": 0.01397705078125,
"learning_rate": 4.904167982129591e-07,
"logits/chosen": -2.423839569091797,
"logits/rejected": -2.423839569091797,
"logps/chosen": -294.44683837890625,
"logps/rejected": -294.44683837890625,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.04831403121352196,
"rewards/margins": 0.0,
"rewards/rejected": -0.04831403121352196,
"step": 2800
},
{
"epoch": 0.8173392687732614,
"eval_logits/chosen": -2.3914895057678223,
"eval_logits/rejected": -2.3914895057678223,
"eval_logps/chosen": -310.798095703125,
"eval_logps/rejected": -310.798095703125,
"eval_loss": 0.6931472420692444,
"eval_rewards/accuracies": 0.0,
"eval_rewards/chosen": -0.04319505766034126,
"eval_rewards/margins": 0.0,
"eval_rewards/rejected": -0.04319505766034126,
"eval_runtime": 2682.0962,
"eval_samples_per_second": 2.271,
"eval_steps_per_second": 0.284,
"step": 2800
},
{
"epoch": 0.8202583375903086,
"grad_norm": 0.0126953125,
"learning_rate": 4.7536254751896493e-07,
"logits/chosen": -2.4333229064941406,
"logits/rejected": -2.4333229064941406,
"logps/chosen": -315.96234130859375,
"logps/rejected": -315.96234130859375,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.05122748017311096,
"rewards/margins": 0.0,
"rewards/rejected": -0.05122748017311096,
"step": 2810
},
{
"epoch": 0.823177406407356,
"grad_norm": 0.0167236328125,
"learning_rate": 4.6051866400366354e-07,
"logits/chosen": -2.4289793968200684,
"logits/rejected": -2.4289793968200684,
"logps/chosen": -344.29608154296875,
"logps/rejected": -344.29608154296875,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.04764155298471451,
"rewards/margins": 0.0,
"rewards/rejected": -0.04764155298471451,
"step": 2820
},
{
"epoch": 0.8260964752244034,
"grad_norm": 0.0166015625,
"learning_rate": 4.4588669000042133e-07,
"logits/chosen": -2.4046084880828857,
"logits/rejected": -2.4046084880828857,
"logps/chosen": -325.74957275390625,
"logps/rejected": -325.74957275390625,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.035486068576574326,
"rewards/margins": 0.0,
"rewards/rejected": -0.035486068576574326,
"step": 2830
},
{
"epoch": 0.8290155440414507,
"grad_norm": 0.016845703125,
"learning_rate": 4.3146814582443605e-07,
"logits/chosen": -2.418729066848755,
"logits/rejected": -2.418729066848755,
"logps/chosen": -327.8818359375,
"logps/rejected": -327.8818359375,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.046850480139255524,
"rewards/margins": 0.0,
"rewards/rejected": -0.046850480139255524,
"step": 2840
},
{
"epoch": 0.8319346128584981,
"grad_norm": 0.0135498046875,
"learning_rate": 4.1726452961477147e-07,
"logits/chosen": -2.416329860687256,
"logits/rejected": -2.416329860687256,
"logps/chosen": -319.5370178222656,
"logps/rejected": -319.5370178222656,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.0506584569811821,
"rewards/margins": 0.0,
"rewards/rejected": -0.0506584569811821,
"step": 2850
},
{
"epoch": 0.8348536816755455,
"grad_norm": 0.0146484375,
"learning_rate": 4.0327731717869775e-07,
"logits/chosen": -2.4376559257507324,
"logits/rejected": -2.4376559257507324,
"logps/chosen": -272.7819519042969,
"logps/rejected": -272.7819519042969,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.05310596153140068,
"rewards/margins": 0.0,
"rewards/rejected": -0.05310596153140068,
"step": 2860
},
{
"epoch": 0.8377727504925928,
"grad_norm": 0.0191650390625,
"learning_rate": 3.8950796183834516e-07,
"logits/chosen": -2.4388468265533447,
"logits/rejected": -2.4388468265533447,
"logps/chosen": -345.3861389160156,
"logps/rejected": -345.3861389160156,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.05019260570406914,
"rewards/margins": 0.0,
"rewards/rejected": -0.05019260570406914,
"step": 2870
},
{
"epoch": 0.8406918193096402,
"grad_norm": 0.01495361328125,
"learning_rate": 3.759578942797029e-07,
"logits/chosen": -2.4550201892852783,
"logits/rejected": -2.4550201892852783,
"logps/chosen": -306.2907409667969,
"logps/rejected": -306.2907409667969,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.046652454882860184,
"rewards/margins": 0.0,
"rewards/rejected": -0.046652454882860184,
"step": 2880
},
{
"epoch": 0.8436108881266876,
"grad_norm": 0.0126953125,
"learning_rate": 3.6262852240396356e-07,
"logits/chosen": -2.446690082550049,
"logits/rejected": -2.446690082550049,
"logps/chosen": -310.69708251953125,
"logps/rejected": -310.69708251953125,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.04568660259246826,
"rewards/margins": 0.0,
"rewards/rejected": -0.04568660259246826,
"step": 2890
},
{
"epoch": 0.8465299569437349,
"grad_norm": 0.014404296875,
"learning_rate": 3.4952123118123735e-07,
"logits/chosen": -2.402627468109131,
"logits/rejected": -2.402627468109131,
"logps/chosen": -312.1624755859375,
"logps/rejected": -312.1624755859375,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.045014895498752594,
"rewards/margins": 0.0,
"rewards/rejected": -0.045014895498752594,
"step": 2900
},
{
"epoch": 0.8465299569437349,
"eval_logits/chosen": -2.391954183578491,
"eval_logits/rejected": -2.391954183578491,
"eval_logps/chosen": -310.79425048828125,
"eval_logps/rejected": -310.79425048828125,
"eval_loss": 0.6931472420692444,
"eval_rewards/accuracies": 0.0,
"eval_rewards/chosen": -0.043156567960977554,
"eval_rewards/margins": 0.0,
"eval_rewards/rejected": -0.043156567960977554,
"eval_runtime": 2682.3759,
"eval_samples_per_second": 2.27,
"eval_steps_per_second": 0.284,
"step": 2900
},
{
"epoch": 0.8494490257607823,
"grad_norm": 0.01470947265625,
"learning_rate": 3.3663738250664853e-07,
"logits/chosen": -2.416839122772217,
"logits/rejected": -2.416839122772217,
"logps/chosen": -342.91815185546875,
"logps/rejected": -342.91815185546875,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.051371246576309204,
"rewards/margins": 0.0,
"rewards/rejected": -0.051371246576309204,
"step": 2910
},
{
"epoch": 0.8523680945778297,
"grad_norm": 0.0159912109375,
"learning_rate": 3.239783150588283e-07,
"logits/chosen": -2.3476662635803223,
"logits/rejected": -2.3476662635803223,
"logps/chosen": -304.71368408203125,
"logps/rejected": -304.71368408203125,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.005339882802218199,
"rewards/margins": 0.0,
"rewards/rejected": 0.005339882802218199,
"step": 2920
},
{
"epoch": 0.855287163394877,
"grad_norm": 0.01409912109375,
"learning_rate": 3.1154534416082573e-07,
"logits/chosen": -2.416965961456299,
"logits/rejected": -2.416965961456299,
"logps/chosen": -299.3199157714844,
"logps/rejected": -299.3199157714844,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.04180007427930832,
"rewards/margins": 0.0,
"rewards/rejected": -0.04180007427930832,
"step": 2930
},
{
"epoch": 0.8582062322119244,
"grad_norm": 0.01055908203125,
"learning_rate": 2.9933976164343514e-07,
"logits/chosen": -2.4285387992858887,
"logits/rejected": -2.4285387992858887,
"logps/chosen": -303.32183837890625,
"logps/rejected": -303.32183837890625,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.04410778731107712,
"rewards/margins": 0.0,
"rewards/rejected": -0.04410778731107712,
"step": 2940
},
{
"epoch": 0.8611253010289718,
"grad_norm": 0.0162353515625,
"learning_rate": 2.873628357109745e-07,
"logits/chosen": -2.4083211421966553,
"logits/rejected": -2.4083211421966553,
"logps/chosen": -326.7142028808594,
"logps/rejected": -326.7142028808594,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.046554580330848694,
"rewards/margins": 0.0,
"rewards/rejected": -0.046554580330848694,
"step": 2950
},
{
"epoch": 0.8640443698460191,
"grad_norm": 0.01324462890625,
"learning_rate": 2.7561581080951195e-07,
"logits/chosen": -2.4226157665252686,
"logits/rejected": -2.4226157665252686,
"logps/chosen": -292.55767822265625,
"logps/rejected": -292.55767822265625,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.04246639460325241,
"rewards/margins": 0.0,
"rewards/rejected": -0.04246639460325241,
"step": 2960
},
{
"epoch": 0.8669634386630665,
"grad_norm": 0.01361083984375,
"learning_rate": 2.640999074975645e-07,
"logits/chosen": -2.43457293510437,
"logits/rejected": -2.43457293510437,
"logps/chosen": -298.2882385253906,
"logps/rejected": -298.2882385253906,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.04169774800539017,
"rewards/margins": 0.0,
"rewards/rejected": -0.04169774800539017,
"step": 2970
},
{
"epoch": 0.8698825074801139,
"grad_norm": 0.01708984375,
"learning_rate": 2.5281632231927786e-07,
"logits/chosen": -2.473017930984497,
"logits/rejected": -2.473017930984497,
"logps/chosen": -307.8494567871094,
"logps/rejected": -307.8494567871094,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.045904386788606644,
"rewards/margins": 0.0,
"rewards/rejected": -0.045904386788606644,
"step": 2980
},
{
"epoch": 0.8728015762971612,
"grad_norm": 0.014404296875,
"learning_rate": 2.417662276800997e-07,
"logits/chosen": -2.4377925395965576,
"logits/rejected": -2.4377925395965576,
"logps/chosen": -329.8043518066406,
"logps/rejected": -329.8043518066406,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.052566416561603546,
"rewards/margins": 0.0,
"rewards/rejected": -0.052566416561603546,
"step": 2990
},
{
"epoch": 0.8757206451142086,
"grad_norm": 0.01226806640625,
"learning_rate": 2.30950771724964e-07,
"logits/chosen": -2.4452061653137207,
"logits/rejected": -2.4452061653137207,
"logps/chosen": -316.7723388671875,
"logps/rejected": -316.7723388671875,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.048340607434511185,
"rewards/margins": 0.0,
"rewards/rejected": -0.048340607434511185,
"step": 3000
},
{
"epoch": 0.8757206451142086,
"eval_logits/chosen": -2.3918232917785645,
"eval_logits/rejected": -2.3918232917785645,
"eval_logps/chosen": -310.78662109375,
"eval_logps/rejected": -310.78662109375,
"eval_loss": 0.6931472420692444,
"eval_rewards/accuracies": 0.0,
"eval_rewards/chosen": -0.04308019578456879,
"eval_rewards/margins": 0.0,
"eval_rewards/rejected": -0.04308019578456879,
"eval_runtime": 2681.8561,
"eval_samples_per_second": 2.271,
"eval_steps_per_second": 0.284,
"step": 3000
},
{
"epoch": 0.878639713931256,
"grad_norm": 0.017822265625,
"learning_rate": 2.2037107821899272e-07,
"logits/chosen": -2.414727210998535,
"logits/rejected": -2.414727210998535,
"logps/chosen": -343.22796630859375,
"logps/rejected": -343.22796630859375,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.047485075891017914,
"rewards/margins": 0.0,
"rewards/rejected": -0.047485075891017914,
"step": 3010
},
{
"epoch": 0.8815587827483032,
"grad_norm": 0.01708984375,
"learning_rate": 2.100282464307357e-07,
"logits/chosen": -2.4386258125305176,
"logits/rejected": -2.4386258125305176,
"logps/chosen": -305.25250244140625,
"logps/rejected": -305.25250244140625,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.04185379669070244,
"rewards/margins": 0.0,
"rewards/rejected": -0.04185379669070244,
"step": 3020
},
{
"epoch": 0.8844778515653506,
"grad_norm": 0.016357421875,
"learning_rate": 1.999233510179488e-07,
"logits/chosen": -2.4112370014190674,
"logits/rejected": -2.4112370014190674,
"logps/chosen": -339.65093994140625,
"logps/rejected": -339.65093994140625,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.044059764593839645,
"rewards/margins": 0.0,
"rewards/rejected": -0.044059764593839645,
"step": 3030
},
{
"epoch": 0.887396920382398,
"grad_norm": 0.012939453125,
"learning_rate": 1.9005744191593678e-07,
"logits/chosen": -2.4179887771606445,
"logits/rejected": -2.4179887771606445,
"logps/chosen": -297.5303649902344,
"logps/rejected": -297.5303649902344,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.03384246677160263,
"rewards/margins": 0.0,
"rewards/rejected": -0.03384246677160263,
"step": 3040
},
{
"epoch": 0.8903159891994453,
"grad_norm": 0.0120849609375,
"learning_rate": 1.8043154422845794e-07,
"logits/chosen": -2.4646730422973633,
"logits/rejected": -2.4646730422973633,
"logps/chosen": -295.91790771484375,
"logps/rejected": -295.91790771484375,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.04882458597421646,
"rewards/margins": 0.0,
"rewards/rejected": -0.04882458597421646,
"step": 3050
},
{
"epoch": 0.8932350580164927,
"grad_norm": 0.0186767578125,
"learning_rate": 1.7104665812121445e-07,
"logits/chosen": -2.423285961151123,
"logits/rejected": -2.423285961151123,
"logps/chosen": -297.9593505859375,
"logps/rejected": -297.9593505859375,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.04218859225511551,
"rewards/margins": 0.0,
"rewards/rejected": -0.04218859225511551,
"step": 3060
},
{
"epoch": 0.8961541268335401,
"grad_norm": 0.0164794921875,
"learning_rate": 1.619037587179309e-07,
"logits/chosen": -2.3985249996185303,
"logits/rejected": -2.3985249996185303,
"logps/chosen": -332.85809326171875,
"logps/rejected": -332.85809326171875,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.048540227115154266,
"rewards/margins": 0.0,
"rewards/rejected": -0.048540227115154266,
"step": 3070
},
{
"epoch": 0.8990731956505874,
"grad_norm": 0.0172119140625,
"learning_rate": 1.5300379599903408e-07,
"logits/chosen": -2.4070308208465576,
"logits/rejected": -2.4070308208465576,
"logps/chosen": -310.7314147949219,
"logps/rejected": -310.7314147949219,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.03895800933241844,
"rewards/margins": 0.0,
"rewards/rejected": -0.03895800933241844,
"step": 3080
},
{
"epoch": 0.9019922644676348,
"grad_norm": 0.013671875,
"learning_rate": 1.44347694702949e-07,
"logits/chosen": -2.3916313648223877,
"logits/rejected": -2.3916313648223877,
"logps/chosen": -288.28106689453125,
"logps/rejected": -288.28106689453125,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.03293871134519577,
"rewards/margins": 0.0,
"rewards/rejected": -0.03293871134519577,
"step": 3090
},
{
"epoch": 0.9049113332846822,
"grad_norm": 0.017822265625,
"learning_rate": 1.359363542300124e-07,
"logits/chosen": -2.4147801399230957,
"logits/rejected": -2.4147801399230957,
"logps/chosen": -295.56768798828125,
"logps/rejected": -295.56768798828125,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.04364749416708946,
"rewards/margins": 0.0,
"rewards/rejected": -0.04364749416708946,
"step": 3100
},
{
"epoch": 0.9049113332846822,
"eval_logits/chosen": -2.390821933746338,
"eval_logits/rejected": -2.390821933746338,
"eval_logps/chosen": -310.7793884277344,
"eval_logps/rejected": -310.7793884277344,
"eval_loss": 0.6931472420692444,
"eval_rewards/accuracies": 0.0,
"eval_rewards/chosen": -0.04300786182284355,
"eval_rewards/margins": 0.0,
"eval_rewards/rejected": -0.04300786182284355,
"eval_runtime": 2681.9583,
"eval_samples_per_second": 2.271,
"eval_steps_per_second": 0.284,
"step": 3100
},
{
"epoch": 0.9078304021017295,
"grad_norm": 0.0152587890625,
"learning_rate": 1.2777064854902487e-07,
"logits/chosen": -2.44869065284729,
"logits/rejected": -2.44869065284729,
"logps/chosen": -324.82257080078125,
"logps/rejected": -324.82257080078125,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.04247719421982765,
"rewards/margins": 0.0,
"rewards/rejected": -0.04247719421982765,
"step": 3110
},
{
"epoch": 0.9107494709187769,
"grad_norm": 0.023681640625,
"learning_rate": 1.1985142610643902e-07,
"logits/chosen": -2.4080257415771484,
"logits/rejected": -2.4080257415771484,
"logps/chosen": -321.1974792480469,
"logps/rejected": -321.1974792480469,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.05160089209675789,
"rewards/margins": 0.0,
"rewards/rejected": -0.05160089209675789,
"step": 3120
},
{
"epoch": 0.9136685397358243,
"grad_norm": 0.01275634765625,
"learning_rate": 1.121795097382064e-07,
"logits/chosen": -2.422560691833496,
"logits/rejected": -2.422560691833496,
"logps/chosen": -335.0086975097656,
"logps/rejected": -335.0086975097656,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.04736005887389183,
"rewards/margins": 0.0,
"rewards/rejected": -0.04736005887389183,
"step": 3130
},
{
"epoch": 0.9165876085528716,
"grad_norm": 0.0169677734375,
"learning_rate": 1.0475569658427803e-07,
"logits/chosen": -2.438781261444092,
"logits/rejected": -2.438781261444092,
"logps/chosen": -311.33868408203125,
"logps/rejected": -311.33868408203125,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.03844516724348068,
"rewards/margins": 0.0,
"rewards/rejected": -0.03844516724348068,
"step": 3140
},
{
"epoch": 0.919506677369919,
"grad_norm": 0.02001953125,
"learning_rate": 9.758075800578193e-08,
"logits/chosen": -2.4374260902404785,
"logits/rejected": -2.4374260902404785,
"logps/chosen": -300.9288635253906,
"logps/rejected": -300.9288635253906,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.0434752032160759,
"rewards/margins": 0.0,
"rewards/rejected": -0.0434752032160759,
"step": 3150
},
{
"epoch": 0.9224257461869664,
"grad_norm": 0.01544189453125,
"learning_rate": 9.06554395048742e-08,
"logits/chosen": -2.4104561805725098,
"logits/rejected": -2.4104561805725098,
"logps/chosen": -310.27789306640625,
"logps/rejected": -310.27789306640625,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.04003220796585083,
"rewards/margins": 0.0,
"rewards/rejected": -0.04003220796585083,
"step": 3160
},
{
"epoch": 0.9253448150040137,
"grad_norm": 0.01416015625,
"learning_rate": 8.398046064727855e-08,
"logits/chosen": -2.448122262954712,
"logits/rejected": -2.448122262954712,
"logps/chosen": -303.9940185546875,
"logps/rejected": -303.9940185546875,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.04497329518198967,
"rewards/margins": 0.0,
"rewards/rejected": -0.04497329518198967,
"step": 3170
},
{
"epoch": 0.9282638838210611,
"grad_norm": 0.0140380859375,
"learning_rate": 7.755651498752265e-08,
"logits/chosen": -2.4395852088928223,
"logits/rejected": -2.4395852088928223,
"logps/chosen": -292.140380859375,
"logps/rejected": -292.140380859375,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.04522908851504326,
"rewards/margins": 0.0,
"rewards/rejected": -0.04522908851504326,
"step": 3180
},
{
"epoch": 0.9311829526381085,
"grad_norm": 0.016357421875,
"learning_rate": 7.138426999687171e-08,
"logits/chosen": -2.4227964878082275,
"logits/rejected": -2.4227964878082275,
"logps/chosen": -333.205810546875,
"logps/rejected": -333.205810546875,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.04331531375646591,
"rewards/margins": 0.0,
"rewards/rejected": -0.04331531375646591,
"step": 3190
},
{
"epoch": 0.9341020214551558,
"grad_norm": 0.0177001953125,
"learning_rate": 6.546436699398029e-08,
"logits/chosen": -2.4100470542907715,
"logits/rejected": -2.4100470542907715,
"logps/chosen": -334.2508850097656,
"logps/rejected": -334.2508850097656,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.043238040059804916,
"rewards/margins": 0.0,
"rewards/rejected": -0.043238040059804916,
"step": 3200
},
{
"epoch": 0.9341020214551558,
"eval_logits/chosen": -2.391075849533081,
"eval_logits/rejected": -2.391075849533081,
"eval_logps/chosen": -310.7811584472656,
"eval_logps/rejected": -310.7811584472656,
"eval_loss": 0.6931472420692444,
"eval_rewards/accuracies": 0.0,
"eval_rewards/chosen": -0.043025679886341095,
"eval_rewards/margins": 0.0,
"eval_rewards/rejected": -0.043025679886341095,
"eval_runtime": 2682.606,
"eval_samples_per_second": 2.27,
"eval_steps_per_second": 0.284,
"step": 3200
},
{
"epoch": 0.9370210902722031,
"grad_norm": 0.0150146484375,
"learning_rate": 5.979742107825287e-08,
"logits/chosen": -2.3894600868225098,
"logits/rejected": -2.3894600868225098,
"logps/chosen": -313.91131591796875,
"logps/rejected": -313.91131591796875,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.04388252645730972,
"rewards/margins": 0.0,
"rewards/rejected": -0.04388252645730972,
"step": 3210
},
{
"epoch": 0.9399401590892505,
"grad_norm": 0.01446533203125,
"learning_rate": 5.4384021065936045e-08,
"logits/chosen": -2.408024549484253,
"logits/rejected": -2.408024549484253,
"logps/chosen": -288.5419006347656,
"logps/rejected": -288.5419006347656,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.0435100793838501,
"rewards/margins": 0.0,
"rewards/rejected": -0.0435100793838501,
"step": 3220
},
{
"epoch": 0.9428592279062978,
"grad_norm": 0.033447265625,
"learning_rate": 4.9224729428935806e-08,
"logits/chosen": -2.423318862915039,
"logits/rejected": -2.423318862915039,
"logps/chosen": -309.74176025390625,
"logps/rejected": -309.74176025390625,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.04652589559555054,
"rewards/margins": 0.0,
"rewards/rejected": -0.04652589559555054,
"step": 3230
},
{
"epoch": 0.9457782967233452,
"grad_norm": 0.011962890625,
"learning_rate": 4.432008223637596e-08,
"logits/chosen": -2.4209766387939453,
"logits/rejected": -2.4209766387939453,
"logps/chosen": -299.3330078125,
"logps/rejected": -299.3330078125,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.04555036872625351,
"rewards/margins": 0.0,
"rewards/rejected": -0.04555036872625351,
"step": 3240
},
{
"epoch": 0.9486973655403926,
"grad_norm": 0.01953125,
"learning_rate": 3.967058909889937e-08,
"logits/chosen": -2.397352457046509,
"logits/rejected": -2.397352457046509,
"logps/chosen": -313.8124694824219,
"logps/rejected": -313.8124694824219,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.03916158154606819,
"rewards/margins": 0.0,
"rewards/rejected": -0.03916158154606819,
"step": 3250
},
{
"epoch": 0.9516164343574399,
"grad_norm": 0.014404296875,
"learning_rate": 3.5276733115715556e-08,
"logits/chosen": -2.448172092437744,
"logits/rejected": -2.448172092437744,
"logps/chosen": -305.734130859375,
"logps/rejected": -305.734130859375,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.050101179629564285,
"rewards/margins": 0.0,
"rewards/rejected": -0.050101179629564285,
"step": 3260
},
{
"epoch": 0.9545355031744873,
"grad_norm": 0.01397705078125,
"learning_rate": 3.11389708244067e-08,
"logits/chosen": -2.4387991428375244,
"logits/rejected": -2.4387991428375244,
"logps/chosen": -325.77374267578125,
"logps/rejected": -325.77374267578125,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.04632676765322685,
"rewards/margins": 0.0,
"rewards/rejected": -0.04632676765322685,
"step": 3270
},
{
"epoch": 0.9574545719915347,
"grad_norm": 0.0130615234375,
"learning_rate": 2.7257732153490313e-08,
"logits/chosen": -2.3997585773468018,
"logits/rejected": -2.3997585773468018,
"logps/chosen": -323.36962890625,
"logps/rejected": -323.36962890625,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.04246259480714798,
"rewards/margins": 0.0,
"rewards/rejected": -0.04246259480714798,
"step": 3280
},
{
"epoch": 0.960373640808582,
"grad_norm": 0.01226806640625,
"learning_rate": 2.3633420377749684e-08,
"logits/chosen": -2.404913902282715,
"logits/rejected": -2.404913902282715,
"logps/chosen": -309.89715576171875,
"logps/rejected": -309.89715576171875,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.05698453634977341,
"rewards/margins": 0.0,
"rewards/rejected": -0.05698453634977341,
"step": 3290
},
{
"epoch": 0.9632927096256294,
"grad_norm": 0.013671875,
"learning_rate": 2.0266412076330457e-08,
"logits/chosen": -2.431570529937744,
"logits/rejected": -2.431570529937744,
"logps/chosen": -297.8599548339844,
"logps/rejected": -297.8599548339844,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.052738986909389496,
"rewards/margins": 0.0,
"rewards/rejected": -0.052738986909389496,
"step": 3300
},
{
"epoch": 0.9632927096256294,
"eval_logits/chosen": -2.3914639949798584,
"eval_logits/rejected": -2.3914639949798584,
"eval_logps/chosen": -310.7767333984375,
"eval_logps/rejected": -310.7767333984375,
"eval_loss": 0.6931472420692444,
"eval_rewards/accuracies": 0.0,
"eval_rewards/chosen": -0.0429811105132103,
"eval_rewards/margins": 0.0,
"eval_rewards/rejected": -0.0429811105132103,
"eval_runtime": 2682.6978,
"eval_samples_per_second": 2.27,
"eval_steps_per_second": 0.284,
"step": 3300
},
{
"epoch": 0.9662117784426768,
"grad_norm": 0.01361083984375,
"learning_rate": 1.7157057093614704e-08,
"logits/chosen": -2.452519178390503,
"logits/rejected": -2.452519178390503,
"logps/chosen": -296.8190002441406,
"logps/rejected": -296.8190002441406,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.041969865560531616,
"rewards/margins": 0.0,
"rewards/rejected": -0.041969865560531616,
"step": 3310
},
{
"epoch": 0.9691308472597241,
"grad_norm": 0.01422119140625,
"learning_rate": 1.430567850286807e-08,
"logits/chosen": -2.4390811920166016,
"logits/rejected": -2.4390811920166016,
"logps/chosen": -339.14801025390625,
"logps/rejected": -339.14801025390625,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.0495796874165535,
"rewards/margins": 0.0,
"rewards/rejected": -0.0495796874165535,
"step": 3320
},
{
"epoch": 0.9720499160767715,
"grad_norm": 0.017333984375,
"learning_rate": 1.1712572572674386e-08,
"logits/chosen": -2.3779425621032715,
"logits/rejected": -2.3779425621032715,
"logps/chosen": -342.52117919921875,
"logps/rejected": -342.52117919921875,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.03777734562754631,
"rewards/margins": 0.0,
"rewards/rejected": -0.03777734562754631,
"step": 3330
},
{
"epoch": 0.9749689848938189,
"grad_norm": 0.0166015625,
"learning_rate": 9.378008736149746e-09,
"logits/chosen": -2.408357620239258,
"logits/rejected": -2.408357620239258,
"logps/chosen": -321.5648193359375,
"logps/rejected": -321.5648193359375,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.03947510942816734,
"rewards/margins": 0.0,
"rewards/rejected": -0.03947510942816734,
"step": 3340
},
{
"epoch": 0.9778880537108662,
"grad_norm": 0.01275634765625,
"learning_rate": 7.30222956294907e-09,
"logits/chosen": -2.456228733062744,
"logits/rejected": -2.456228733062744,
"logps/chosen": -322.9805603027344,
"logps/rejected": -322.9805603027344,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.04836040362715721,
"rewards/margins": 0.0,
"rewards/rejected": -0.04836040362715721,
"step": 3350
},
{
"epoch": 0.9808071225279136,
"grad_norm": 0.015380859375,
"learning_rate": 5.485450734061259e-09,
"logits/chosen": -2.395473003387451,
"logits/rejected": -2.395473003387451,
"logps/chosen": -292.87994384765625,
"logps/rejected": -292.87994384765625,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.0445740707218647,
"rewards/margins": 0.0,
"rewards/rejected": -0.0445740707218647,
"step": 3360
},
{
"epoch": 0.983726191344961,
"grad_norm": 0.01544189453125,
"learning_rate": 3.927861019399903e-09,
"logits/chosen": -2.406294345855713,
"logits/rejected": -2.406294345855713,
"logps/chosen": -288.55987548828125,
"logps/rejected": -288.55987548828125,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.03885042294859886,
"rewards/margins": 0.0,
"rewards/rejected": -0.03885042294859886,
"step": 3370
},
{
"epoch": 0.9866452601620083,
"grad_norm": 0.0155029296875,
"learning_rate": 2.629622258188691e-09,
"logits/chosen": -2.4149577617645264,
"logits/rejected": -2.4149577617645264,
"logps/chosen": -282.56585693359375,
"logps/rejected": -282.56585693359375,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.0403311550617218,
"rewards/margins": 0.0,
"rewards/rejected": -0.0403311550617218,
"step": 3380
},
{
"epoch": 0.9895643289790557,
"grad_norm": 0.01544189453125,
"learning_rate": 1.5908693421465282e-09,
"logits/chosen": -2.4097559452056885,
"logits/rejected": -2.4097559452056885,
"logps/chosen": -284.1174011230469,
"logps/rejected": -284.1174011230469,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.04213656857609749,
"rewards/margins": 0.0,
"rewards/rejected": -0.04213656857609749,
"step": 3390
},
{
"epoch": 0.9924833977961031,
"grad_norm": 0.0137939453125,
"learning_rate": 8.11710201470417e-10,
"logits/chosen": -2.4348714351654053,
"logits/rejected": -2.4348714351654053,
"logps/chosen": -325.41339111328125,
"logps/rejected": -325.41339111328125,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.04575268179178238,
"rewards/margins": 0.0,
"rewards/rejected": -0.04575268179178238,
"step": 3400
},
{
"epoch": 0.9924833977961031,
"eval_logits/chosen": -2.3908708095550537,
"eval_logits/rejected": -2.3908708095550537,
"eval_logps/chosen": -310.7832336425781,
"eval_logps/rejected": -310.7832336425781,
"eval_loss": 0.6931472420692444,
"eval_rewards/accuracies": 0.0,
"eval_rewards/chosen": -0.04304642230272293,
"eval_rewards/margins": 0.0,
"eval_rewards/rejected": -0.04304642230272293,
"eval_runtime": 2747.5083,
"eval_samples_per_second": 2.217,
"eval_steps_per_second": 0.277,
"step": 3400
},
{
"epoch": 0.9954024666131503,
"grad_norm": 0.01165771484375,
"learning_rate": 2.922257936230355e-10,
"logits/chosen": -2.409545421600342,
"logits/rejected": -2.409545421600342,
"logps/chosen": -264.7913818359375,
"logps/rejected": -264.7913818359375,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.038275159895420074,
"rewards/margins": 0.0,
"rewards/rejected": -0.038275159895420074,
"step": 3410
},
{
"epoch": 0.9983215354301977,
"grad_norm": 0.0167236328125,
"learning_rate": 3.247009491946784e-11,
"logits/chosen": -2.429719924926758,
"logits/rejected": -2.429719924926758,
"logps/chosen": -340.23126220703125,
"logps/rejected": -340.23126220703125,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -0.04289505258202553,
"rewards/margins": 0.0,
"rewards/rejected": -0.04289505258202553,
"step": 3420
},
{
"epoch": 0.9997810698387214,
"step": 3425,
"total_flos": 0.0,
"train_loss": 0.18720042388804636,
"train_runtime": 41876.0871,
"train_samples_per_second": 1.309,
"train_steps_per_second": 0.082
}
],
"logging_steps": 10,
"max_steps": 3425,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}