llama3-wpo-lora / trainer_state.json
Wenboz's picture
Model save
28a42b5 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9997382884061764,
"eval_steps": 100,
"global_step": 955,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0010468463752944255,
"grad_norm": 4.4375,
"learning_rate": 5.208333333333333e-08,
"logits/chosen": -0.3494967222213745,
"logits/rejected": -0.3728627860546112,
"logps/chosen": -285.8127136230469,
"logps/ref_response": -0.3494967222213745,
"logps/rejected": -212.7957000732422,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.010468463752944255,
"grad_norm": 4.5,
"learning_rate": 5.208333333333334e-07,
"logits/chosen": -0.540075421333313,
"logits/rejected": -0.54986971616745,
"logps/chosen": -315.31512451171875,
"logps/ref_response": -0.5399107336997986,
"logps/rejected": -278.0267639160156,
"loss": 0.6929,
"rewards/accuracies": 0.4583333432674408,
"rewards/chosen": 0.001649973331950605,
"rewards/margins": 0.0034635968040674925,
"rewards/rejected": -0.0018136235885322094,
"step": 10
},
{
"epoch": 0.02093692750588851,
"grad_norm": 4.15625,
"learning_rate": 1.0416666666666667e-06,
"logits/chosen": -0.5037816762924194,
"logits/rejected": -0.5245965719223022,
"logps/chosen": -306.7390441894531,
"logps/ref_response": -0.5032420754432678,
"logps/rejected": -271.2138671875,
"loss": 0.6934,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": 0.003458543913438916,
"rewards/margins": 0.0031067535746842623,
"rewards/rejected": 0.0003517906297929585,
"step": 20
},
{
"epoch": 0.031405391258832765,
"grad_norm": 4.75,
"learning_rate": 1.5625e-06,
"logits/chosen": -0.5102043151855469,
"logits/rejected": -0.5178056955337524,
"logps/chosen": -291.02197265625,
"logps/ref_response": -0.5080639123916626,
"logps/rejected": -252.41531372070312,
"loss": 0.6867,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": 0.014292215928435326,
"rewards/margins": 0.014373516663908958,
"rewards/rejected": -8.130413334583864e-05,
"step": 30
},
{
"epoch": 0.04187385501177702,
"grad_norm": 3.921875,
"learning_rate": 2.0833333333333334e-06,
"logits/chosen": -0.48268669843673706,
"logits/rejected": -0.5177565813064575,
"logps/chosen": -305.90875244140625,
"logps/ref_response": -0.47757530212402344,
"logps/rejected": -244.60757446289062,
"loss": 0.6781,
"rewards/accuracies": 0.65625,
"rewards/chosen": 0.039179086685180664,
"rewards/margins": 0.04343840479850769,
"rewards/rejected": -0.004259312059730291,
"step": 40
},
{
"epoch": 0.05234231876472128,
"grad_norm": 3.0625,
"learning_rate": 2.604166666666667e-06,
"logits/chosen": -0.5464528799057007,
"logits/rejected": -0.5745548605918884,
"logps/chosen": -304.85235595703125,
"logps/ref_response": -0.5367640256881714,
"logps/rejected": -282.80804443359375,
"loss": 0.6733,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": 0.07245220243930817,
"rewards/margins": 0.046217553317546844,
"rewards/rejected": 0.026234647259116173,
"step": 50
},
{
"epoch": 0.06281078251766553,
"grad_norm": 3.6875,
"learning_rate": 3.125e-06,
"logits/chosen": -0.5682042837142944,
"logits/rejected": -0.5693326592445374,
"logps/chosen": -290.4607849121094,
"logps/ref_response": -0.5527787804603577,
"logps/rejected": -254.50967407226562,
"loss": 0.6554,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": 0.12507006525993347,
"rewards/margins": 0.05596587061882019,
"rewards/rejected": 0.06910420954227448,
"step": 60
},
{
"epoch": 0.07327924627060979,
"grad_norm": 3.625,
"learning_rate": 3.6458333333333333e-06,
"logits/chosen": -0.5585962533950806,
"logits/rejected": -0.5734174847602844,
"logps/chosen": -286.166748046875,
"logps/ref_response": -0.5369429588317871,
"logps/rejected": -263.13885498046875,
"loss": 0.6366,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": 0.21329982578754425,
"rewards/margins": 0.146693617105484,
"rewards/rejected": 0.06660620868206024,
"step": 70
},
{
"epoch": 0.08374771002355404,
"grad_norm": 3.75,
"learning_rate": 4.166666666666667e-06,
"logits/chosen": -0.4981383681297302,
"logits/rejected": -0.5249155759811401,
"logps/chosen": -287.4258728027344,
"logps/ref_response": -0.46965378522872925,
"logps/rejected": -273.86474609375,
"loss": 0.617,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": 0.31552475690841675,
"rewards/margins": 0.25278010964393616,
"rewards/rejected": 0.06274466216564178,
"step": 80
},
{
"epoch": 0.0942161737764983,
"grad_norm": 3.734375,
"learning_rate": 4.6875000000000004e-06,
"logits/chosen": -0.5283939838409424,
"logits/rejected": -0.5496431589126587,
"logps/chosen": -330.2322692871094,
"logps/ref_response": -0.4922845959663391,
"logps/rejected": -295.63018798828125,
"loss": 0.5998,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": 0.2877245545387268,
"rewards/margins": 0.3177236020565033,
"rewards/rejected": -0.02999904192984104,
"step": 90
},
{
"epoch": 0.10468463752944256,
"grad_norm": 3.8125,
"learning_rate": 4.9997324926814375e-06,
"logits/chosen": -0.5698152184486389,
"logits/rejected": -0.5635516047477722,
"logps/chosen": -275.736328125,
"logps/ref_response": -0.533843994140625,
"logps/rejected": -290.2398376464844,
"loss": 0.6142,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.30928176641464233,
"rewards/margins": 0.3119629919528961,
"rewards/rejected": -0.0026811982970684767,
"step": 100
},
{
"epoch": 0.10468463752944256,
"eval_logits/chosen": -0.5543237924575806,
"eval_logits/rejected": -0.548694908618927,
"eval_logps/chosen": -290.523193359375,
"eval_logps/ref_response": -0.5363935232162476,
"eval_logps/rejected": -277.9860534667969,
"eval_loss": 0.5973454713821411,
"eval_rewards/accuracies": 0.7020000219345093,
"eval_rewards/chosen": 0.2023972123861313,
"eval_rewards/margins": 0.33334478735923767,
"eval_rewards/rejected": -0.13094758987426758,
"eval_runtime": 351.8267,
"eval_samples_per_second": 5.685,
"eval_steps_per_second": 0.355,
"step": 100
},
{
"epoch": 0.11515310128238682,
"grad_norm": 2.96875,
"learning_rate": 4.996723692767927e-06,
"logits/chosen": -0.6081199645996094,
"logits/rejected": -0.6322951912879944,
"logps/chosen": -289.56561279296875,
"logps/ref_response": -0.5667906999588013,
"logps/rejected": -277.76922607421875,
"loss": 0.5906,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": 0.20684650540351868,
"rewards/margins": 0.4277339577674866,
"rewards/rejected": -0.22088749706745148,
"step": 110
},
{
"epoch": 0.12562156503533106,
"grad_norm": 3.34375,
"learning_rate": 4.9903757462135984e-06,
"logits/chosen": -0.5567010641098022,
"logits/rejected": -0.5700705051422119,
"logps/chosen": -262.211181640625,
"logps/ref_response": -0.5169209837913513,
"logps/rejected": -253.9445343017578,
"loss": 0.5753,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": 0.034964192658662796,
"rewards/margins": 0.36375877261161804,
"rewards/rejected": -0.32879456877708435,
"step": 120
},
{
"epoch": 0.1360900287882753,
"grad_norm": 3.5,
"learning_rate": 4.980697142834315e-06,
"logits/chosen": -0.5261090397834778,
"logits/rejected": -0.5425637364387512,
"logps/chosen": -302.4659118652344,
"logps/ref_response": -0.4790240228176117,
"logps/rejected": -338.93597412109375,
"loss": 0.5798,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.04841077700257301,
"rewards/margins": 0.3843201696872711,
"rewards/rejected": -0.335909366607666,
"step": 130
},
{
"epoch": 0.14655849254121958,
"grad_norm": 3.015625,
"learning_rate": 4.967700826904229e-06,
"logits/chosen": -0.5993494391441345,
"logits/rejected": -0.607743501663208,
"logps/chosen": -283.1224670410156,
"logps/ref_response": -0.5482783913612366,
"logps/rejected": -276.7977294921875,
"loss": 0.5616,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.0036001927219331264,
"rewards/margins": 0.5189536213874817,
"rewards/rejected": -0.5225538015365601,
"step": 140
},
{
"epoch": 0.15702695629416383,
"grad_norm": 3.34375,
"learning_rate": 4.951404179843963e-06,
"logits/chosen": -0.6007939577102661,
"logits/rejected": -0.5676769018173218,
"logps/chosen": -308.45916748046875,
"logps/ref_response": -0.5423828363418579,
"logps/rejected": -280.7994689941406,
"loss": 0.5581,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.18967030942440033,
"rewards/margins": 0.5712782740592957,
"rewards/rejected": -0.38160794973373413,
"step": 150
},
{
"epoch": 0.16749542004710807,
"grad_norm": 3.140625,
"learning_rate": 4.931828996974498e-06,
"logits/chosen": -0.5421828031539917,
"logits/rejected": -0.5333597660064697,
"logps/chosen": -297.0065612792969,
"logps/ref_response": -0.4895528256893158,
"logps/rejected": -272.0807189941406,
"loss": 0.5427,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": 0.20104511082172394,
"rewards/margins": 0.61830735206604,
"rewards/rejected": -0.4172622263431549,
"step": 160
},
{
"epoch": 0.17796388380005235,
"grad_norm": 3.46875,
"learning_rate": 4.909001458367867e-06,
"logits/chosen": -0.6238254308700562,
"logits/rejected": -0.6083575487136841,
"logps/chosen": -288.6294860839844,
"logps/ref_response": -0.5753272771835327,
"logps/rejected": -277.1576232910156,
"loss": 0.5646,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -0.02497274801135063,
"rewards/margins": 0.5278475880622864,
"rewards/rejected": -0.5528203248977661,
"step": 170
},
{
"epoch": 0.1884323475529966,
"grad_norm": 3.46875,
"learning_rate": 4.882952093833628e-06,
"logits/chosen": -0.6268518567085266,
"logits/rejected": -0.5990904569625854,
"logps/chosen": -303.6324157714844,
"logps/ref_response": -0.5761692523956299,
"logps/rejected": -267.112060546875,
"loss": 0.5521,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.13170073926448822,
"rewards/margins": 0.5248185396194458,
"rewards/rejected": -0.6565192341804504,
"step": 180
},
{
"epoch": 0.19890081130594087,
"grad_norm": 4.0,
"learning_rate": 4.853715742087947e-06,
"logits/chosen": -0.5630252957344055,
"logits/rejected": -0.5427506566047668,
"logps/chosen": -276.95648193359375,
"logps/ref_response": -0.5028859972953796,
"logps/rejected": -284.7541809082031,
"loss": 0.5535,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": 0.004456658847630024,
"rewards/margins": 0.5479073524475098,
"rewards/rejected": -0.5434507131576538,
"step": 190
},
{
"epoch": 0.2093692750588851,
"grad_norm": 5.0625,
"learning_rate": 4.821331504159906e-06,
"logits/chosen": -0.5674183368682861,
"logits/rejected": -0.5801523327827454,
"logps/chosen": -298.1530456542969,
"logps/ref_response": -0.5163358449935913,
"logps/rejected": -257.4419250488281,
"loss": 0.5579,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.06740345805883408,
"rewards/margins": 0.5899164080619812,
"rewards/rejected": -0.6573198437690735,
"step": 200
},
{
"epoch": 0.2093692750588851,
"eval_logits/chosen": -0.5041880011558533,
"eval_logits/rejected": -0.48473650217056274,
"eval_logps/chosen": -293.2984924316406,
"eval_logps/ref_response": -0.5363935232162476,
"eval_logps/rejected": -283.7411193847656,
"eval_loss": 0.5482621788978577,
"eval_rewards/accuracies": 0.7120000123977661,
"eval_rewards/chosen": -0.07513303309679031,
"eval_rewards/margins": 0.6313197016716003,
"eval_rewards/rejected": -0.7064527869224548,
"eval_runtime": 349.57,
"eval_samples_per_second": 5.721,
"eval_steps_per_second": 0.358,
"step": 200
},
{
"epoch": 0.21983773881182936,
"grad_norm": 4.84375,
"learning_rate": 4.7858426910973435e-06,
"logits/chosen": -0.6099163889884949,
"logits/rejected": -0.6096245050430298,
"logps/chosen": -280.4328918457031,
"logps/ref_response": -0.5563252568244934,
"logps/rejected": -274.54486083984375,
"loss": 0.5453,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.019828548654913902,
"rewards/margins": 0.5767567157745361,
"rewards/rejected": -0.596585214138031,
"step": 210
},
{
"epoch": 0.23030620256477363,
"grad_norm": 3.578125,
"learning_rate": 4.747296766042161e-06,
"logits/chosen": -0.5816672444343567,
"logits/rejected": -0.5607967376708984,
"logps/chosen": -319.9589538574219,
"logps/ref_response": -0.525614857673645,
"logps/rejected": -272.82952880859375,
"loss": 0.5401,
"rewards/accuracies": 0.71875,
"rewards/chosen": 0.03950881212949753,
"rewards/margins": 0.63139808177948,
"rewards/rejected": -0.5918892621994019,
"step": 220
},
{
"epoch": 0.24077466631771788,
"grad_norm": 3.921875,
"learning_rate": 4.705745280752586e-06,
"logits/chosen": -0.6150011420249939,
"logits/rejected": -0.572602391242981,
"logps/chosen": -293.0460205078125,
"logps/ref_response": -0.5675605535507202,
"logps/rejected": -290.7093200683594,
"loss": 0.5514,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.07407680153846741,
"rewards/margins": 0.6289999485015869,
"rewards/rejected": -0.7030767202377319,
"step": 230
},
{
"epoch": 0.2512431300706621,
"grad_norm": 3.015625,
"learning_rate": 4.661243806657256e-06,
"logits/chosen": -0.5870501399040222,
"logits/rejected": -0.546720564365387,
"logps/chosen": -300.5980529785156,
"logps/ref_response": -0.5330287218093872,
"logps/rejected": -264.90716552734375,
"loss": 0.5493,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.01282467134296894,
"rewards/margins": 0.5539323091506958,
"rewards/rejected": -0.5667570233345032,
"step": 240
},
{
"epoch": 0.26171159382360637,
"grad_norm": 3.140625,
"learning_rate": 4.613851860533367e-06,
"logits/chosen": -0.5954620242118835,
"logits/rejected": -0.5543604493141174,
"logps/chosen": -293.9695739746094,
"logps/ref_response": -0.5492520928382874,
"logps/rejected": -260.6333312988281,
"loss": 0.5657,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": 0.10280059278011322,
"rewards/margins": 0.5389178395271301,
"rewards/rejected": -0.43611717224121094,
"step": 250
},
{
"epoch": 0.2721800575765506,
"grad_norm": 3.765625,
"learning_rate": 4.563632824908252e-06,
"logits/chosen": -0.564812183380127,
"logits/rejected": -0.5271375179290771,
"logps/chosen": -293.28594970703125,
"logps/ref_response": -0.5089389085769653,
"logps/rejected": -280.13018798828125,
"loss": 0.5289,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": 0.3052422106266022,
"rewards/margins": 0.8550776243209839,
"rewards/rejected": -0.5498353838920593,
"step": 260
},
{
"epoch": 0.2826485213294949,
"grad_norm": 4.25,
"learning_rate": 4.510653863290871e-06,
"logits/chosen": -0.558210015296936,
"logits/rejected": -0.5351649522781372,
"logps/chosen": -296.7794494628906,
"logps/ref_response": -0.5091123580932617,
"logps/rejected": -305.01654052734375,
"loss": 0.5405,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": 0.008460876531898975,
"rewards/margins": 0.6207507848739624,
"rewards/rejected": -0.6122900247573853,
"step": 270
},
{
"epoch": 0.29311698508243916,
"grad_norm": 3.203125,
"learning_rate": 4.454985830346574e-06,
"logits/chosen": -0.6231056451797485,
"logits/rejected": -0.5877543687820435,
"logps/chosen": -302.84906005859375,
"logps/ref_response": -0.5748014450073242,
"logps/rejected": -286.6356201171875,
"loss": 0.5587,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -0.0532672181725502,
"rewards/margins": 0.5458566546440125,
"rewards/rejected": -0.5991239547729492,
"step": 280
},
{
"epoch": 0.3035854488353834,
"grad_norm": 3.34375,
"learning_rate": 4.396703177135262e-06,
"logits/chosen": -0.582170844078064,
"logits/rejected": -0.5509222149848938,
"logps/chosen": -287.865478515625,
"logps/ref_response": -0.5320878624916077,
"logps/rejected": -259.5517883300781,
"loss": 0.5264,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": 0.1213446855545044,
"rewards/margins": 0.7001182436943054,
"rewards/rejected": -0.5787736177444458,
"step": 290
},
{
"epoch": 0.31405391258832765,
"grad_norm": 4.125,
"learning_rate": 4.335883851539693e-06,
"logits/chosen": -0.6001819372177124,
"logits/rejected": -0.5625559091567993,
"logps/chosen": -297.6165771484375,
"logps/ref_response": -0.5529105067253113,
"logps/rejected": -294.80816650390625,
"loss": 0.5402,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.15017859637737274,
"rewards/margins": 0.683876097202301,
"rewards/rejected": -0.8340547680854797,
"step": 300
},
{
"epoch": 0.31405391258832765,
"eval_logits/chosen": -0.4637250602245331,
"eval_logits/rejected": -0.4386967718601227,
"eval_logps/chosen": -293.8652648925781,
"eval_logps/ref_response": -0.5363935232162476,
"eval_logps/rejected": -285.25445556640625,
"eval_loss": 0.5354303121566772,
"eval_rewards/accuracies": 0.7260000109672546,
"eval_rewards/chosen": -0.1318078190088272,
"eval_rewards/margins": 0.725982666015625,
"eval_rewards/rejected": -0.857790470123291,
"eval_runtime": 349.4289,
"eval_samples_per_second": 5.724,
"eval_steps_per_second": 0.358,
"step": 300
},
{
"epoch": 0.3245223763412719,
"grad_norm": 4.40625,
"learning_rate": 4.2726091940171055e-06,
"logits/chosen": -0.549019992351532,
"logits/rejected": -0.5806938409805298,
"logps/chosen": -296.0178527832031,
"logps/ref_response": -0.5006662607192993,
"logps/rejected": -342.6523742675781,
"loss": 0.5123,
"rewards/accuracies": 0.78125,
"rewards/chosen": 0.08306514471769333,
"rewards/margins": 0.8439720869064331,
"rewards/rejected": -0.760906994342804,
"step": 310
},
{
"epoch": 0.33499084009421615,
"grad_norm": 2.953125,
"learning_rate": 4.206963828813555e-06,
"logits/chosen": -0.6003859043121338,
"logits/rejected": -0.5595699548721313,
"logps/chosen": -297.0275573730469,
"logps/ref_response": -0.5563712120056152,
"logps/rejected": -280.803466796875,
"loss": 0.5233,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.02343413233757019,
"rewards/margins": 0.791476845741272,
"rewards/rejected": -0.8149110078811646,
"step": 320
},
{
"epoch": 0.34545930384716045,
"grad_norm": 3.03125,
"learning_rate": 4.139035550786495e-06,
"logits/chosen": -0.6349459886550903,
"logits/rejected": -0.5698983073234558,
"logps/chosen": -290.1078186035156,
"logps/ref_response": -0.5800708532333374,
"logps/rejected": -261.7127685546875,
"loss": 0.5278,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.08804565668106079,
"rewards/margins": 0.6575459837913513,
"rewards/rejected": -0.7455916404724121,
"step": 330
},
{
"epoch": 0.3559277676001047,
"grad_norm": 3.953125,
"learning_rate": 4.068915207986931e-06,
"logits/chosen": -0.5905895233154297,
"logits/rejected": -0.5241268277168274,
"logps/chosen": -298.50506591796875,
"logps/ref_response": -0.5407181978225708,
"logps/rejected": -259.384765625,
"loss": 0.5256,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -0.2834884226322174,
"rewards/margins": 0.8159183263778687,
"rewards/rejected": -1.0994068384170532,
"step": 340
},
{
"epoch": 0.36639623135304894,
"grad_norm": 4.09375,
"learning_rate": 3.996696580158211e-06,
"logits/chosen": -0.5350117683410645,
"logits/rejected": -0.509468674659729,
"logps/chosen": -337.4930114746094,
"logps/ref_response": -0.486247718334198,
"logps/rejected": -292.4802551269531,
"loss": 0.5304,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -0.055437762290239334,
"rewards/margins": 0.7416700124740601,
"rewards/rejected": -0.7971076965332031,
"step": 350
},
{
"epoch": 0.3768646951059932,
"grad_norm": 3.140625,
"learning_rate": 3.922476253313921e-06,
"logits/chosen": -0.5137478113174438,
"logits/rejected": -0.5199310183525085,
"logps/chosen": -275.74993896484375,
"logps/ref_response": -0.48780474066734314,
"logps/rejected": -299.1233215332031,
"loss": 0.5377,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -0.2543216347694397,
"rewards/margins": 0.7176491618156433,
"rewards/rejected": -0.9719708561897278,
"step": 360
},
{
"epoch": 0.38733315885893743,
"grad_norm": 4.25,
"learning_rate": 3.846353490562664e-06,
"logits/chosen": -0.5372802019119263,
"logits/rejected": -0.539850115776062,
"logps/chosen": -290.45709228515625,
"logps/ref_response": -0.491685152053833,
"logps/rejected": -265.22857666015625,
"loss": 0.5065,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.026863550767302513,
"rewards/margins": 0.8847667574882507,
"rewards/rejected": -0.9116303324699402,
"step": 370
},
{
"epoch": 0.39780162261188173,
"grad_norm": 3.609375,
"learning_rate": 3.768430099352445e-06,
"logits/chosen": -0.5611374378204346,
"logits/rejected": -0.5598313808441162,
"logps/chosen": -306.60662841796875,
"logps/ref_response": -0.5215914845466614,
"logps/rejected": -280.02392578125,
"loss": 0.5133,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": 0.001783865736797452,
"rewards/margins": 1.001586675643921,
"rewards/rejected": -0.9998028874397278,
"step": 380
},
{
"epoch": 0.408270086364826,
"grad_norm": 4.625,
"learning_rate": 3.6888102953122307e-06,
"logits/chosen": -0.6096396446228027,
"logits/rejected": -0.5705077052116394,
"logps/chosen": -263.71826171875,
"logps/ref_response": -0.5661150813102722,
"logps/rejected": -265.1150817871094,
"loss": 0.5434,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.06994754076004028,
"rewards/margins": 0.7673075795173645,
"rewards/rejected": -0.8372551202774048,
"step": 390
},
{
"epoch": 0.4187385501177702,
"grad_norm": 3.875,
"learning_rate": 3.607600562872785e-06,
"logits/chosen": -0.5598580241203308,
"logits/rejected": -0.5253019332885742,
"logps/chosen": -286.2227478027344,
"logps/ref_response": -0.5258094072341919,
"logps/rejected": -277.05267333984375,
"loss": 0.5112,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.047982849180698395,
"rewards/margins": 0.7805670499801636,
"rewards/rejected": -0.8285499811172485,
"step": 400
},
{
"epoch": 0.4187385501177702,
"eval_logits/chosen": -0.4029563367366791,
"eval_logits/rejected": -0.37146955728530884,
"eval_logps/chosen": -294.2449645996094,
"eval_logps/ref_response": -0.5363935232162476,
"eval_logps/rejected": -286.346923828125,
"eval_loss": 0.5277438759803772,
"eval_rewards/accuracies": 0.722000002861023,
"eval_rewards/chosen": -0.16978110373020172,
"eval_rewards/margins": 0.7972525954246521,
"eval_rewards/rejected": -0.9670337438583374,
"eval_runtime": 349.6206,
"eval_samples_per_second": 5.72,
"eval_steps_per_second": 0.358,
"step": 400
},
{
"epoch": 0.42920701387071447,
"grad_norm": 3.703125,
"learning_rate": 3.5249095128531863e-06,
"logits/chosen": -0.5799764394760132,
"logits/rejected": -0.5290526151657104,
"logps/chosen": -279.68115234375,
"logps/ref_response": -0.5564926862716675,
"logps/rejected": -277.60308837890625,
"loss": 0.5124,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -0.1508873999118805,
"rewards/margins": 0.8393732905387878,
"rewards/rejected": -0.9902607798576355,
"step": 410
},
{
"epoch": 0.4396754776236587,
"grad_norm": 4.34375,
"learning_rate": 3.4408477372034743e-06,
"logits/chosen": -0.5675779581069946,
"logits/rejected": -0.5418698191642761,
"logps/chosen": -310.0582275390625,
"logps/ref_response": -0.5361344218254089,
"logps/rejected": -298.15447998046875,
"loss": 0.5504,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.11406157165765762,
"rewards/margins": 0.6680425405502319,
"rewards/rejected": -0.7821041345596313,
"step": 420
},
{
"epoch": 0.45014394137660296,
"grad_norm": 4.1875,
"learning_rate": 3.355527661097728e-06,
"logits/chosen": -0.569171130657196,
"logits/rejected": -0.5680087804794312,
"logps/chosen": -281.50506591796875,
"logps/ref_response": -0.5477866530418396,
"logps/rejected": -282.78619384765625,
"loss": 0.5291,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.24793243408203125,
"rewards/margins": 0.6574320197105408,
"rewards/rejected": -0.9053643941879272,
"step": 430
},
{
"epoch": 0.46061240512954726,
"grad_norm": 3.859375,
"learning_rate": 3.269063392575352e-06,
"logits/chosen": -0.5341087579727173,
"logits/rejected": -0.5249664187431335,
"logps/chosen": -328.975341796875,
"logps/ref_response": -0.5050511360168457,
"logps/rejected": -307.42388916015625,
"loss": 0.5158,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.021743040531873703,
"rewards/margins": 0.7577444314956665,
"rewards/rejected": -0.7794874310493469,
"step": 440
},
{
"epoch": 0.4710808688824915,
"grad_norm": 3.828125,
"learning_rate": 3.181570569931697e-06,
"logits/chosen": -0.5577148199081421,
"logits/rejected": -0.5402424931526184,
"logps/chosen": -287.3887939453125,
"logps/ref_response": -0.5224987864494324,
"logps/rejected": -283.7501525878906,
"loss": 0.5068,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -0.2252788096666336,
"rewards/margins": 0.7505895495414734,
"rewards/rejected": -0.9758683443069458,
"step": 450
},
{
"epoch": 0.48154933263543576,
"grad_norm": 2.765625,
"learning_rate": 3.09316620706208e-06,
"logits/chosen": -0.5113469362258911,
"logits/rejected": -0.5197252631187439,
"logps/chosen": -308.82244873046875,
"logps/ref_response": -0.4874509274959564,
"logps/rejected": -290.3113708496094,
"loss": 0.4968,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -0.09814532101154327,
"rewards/margins": 0.974043071269989,
"rewards/rejected": -1.072188377380371,
"step": 460
},
{
"epoch": 0.49201779638838,
"grad_norm": 3.515625,
"learning_rate": 3.0039685369660785e-06,
"logits/chosen": -0.513633131980896,
"logits/rejected": -0.46772414445877075,
"logps/chosen": -282.93438720703125,
"logps/ref_response": -0.4861488938331604,
"logps/rejected": -267.72796630859375,
"loss": 0.5329,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": 0.05372269079089165,
"rewards/margins": 0.8479040861129761,
"rewards/rejected": -0.7941814661026001,
"step": 470
},
{
"epoch": 0.5024862601413242,
"grad_norm": 3.75,
"learning_rate": 2.91409685362137e-06,
"logits/chosen": -0.5242967009544373,
"logits/rejected": -0.5119736790657043,
"logps/chosen": -280.0267028808594,
"logps/ref_response": -0.5061747431755066,
"logps/rejected": -277.7090759277344,
"loss": 0.5085,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.04931460693478584,
"rewards/margins": 0.8077665567398071,
"rewards/rejected": -0.8570810556411743,
"step": 480
},
{
"epoch": 0.5129547238942685,
"grad_norm": 2.8125,
"learning_rate": 2.8236713524386085e-06,
"logits/chosen": -0.587617039680481,
"logits/rejected": -0.5464242100715637,
"logps/chosen": -280.3719482421875,
"logps/ref_response": -0.5583964586257935,
"logps/rejected": -258.3996276855469,
"loss": 0.5056,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": 0.056199681013822556,
"rewards/margins": 0.8544878959655762,
"rewards/rejected": -0.7982882261276245,
"step": 490
},
{
"epoch": 0.5234231876472127,
"grad_norm": 3.546875,
"learning_rate": 2.7328129695107205e-06,
"logits/chosen": -0.4950012266635895,
"logits/rejected": -0.49973049759864807,
"logps/chosen": -266.21197509765625,
"logps/ref_response": -0.46682921051979065,
"logps/rejected": -276.02667236328125,
"loss": 0.5319,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": 0.011867323890328407,
"rewards/margins": 1.0351099967956543,
"rewards/rejected": -1.023242712020874,
"step": 500
},
{
"epoch": 0.5234231876472127,
"eval_logits/chosen": -0.3727329671382904,
"eval_logits/rejected": -0.3377152681350708,
"eval_logps/chosen": -294.09320068359375,
"eval_logps/ref_response": -0.5363935232162476,
"eval_logps/rejected": -286.45953369140625,
"eval_loss": 0.521223783493042,
"eval_rewards/accuracies": 0.7260000109672546,
"eval_rewards/chosen": -0.1546054631471634,
"eval_rewards/margins": 0.8236899375915527,
"eval_rewards/rejected": -0.9782953858375549,
"eval_runtime": 349.5098,
"eval_samples_per_second": 5.722,
"eval_steps_per_second": 0.358,
"step": 500
},
{
"epoch": 0.533891651400157,
"grad_norm": 3.046875,
"learning_rate": 2.641643219871597e-06,
"logits/chosen": -0.5270382165908813,
"logits/rejected": -0.484760582447052,
"logps/chosen": -314.9234619140625,
"logps/ref_response": -0.5090646743774414,
"logps/rejected": -299.1803894042969,
"loss": 0.5149,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -0.1974947154521942,
"rewards/margins": 0.7501281499862671,
"rewards/rejected": -0.9476228952407837,
"step": 510
},
{
"epoch": 0.5443601151531012,
"grad_norm": 4.53125,
"learning_rate": 2.5502840349805074e-06,
"logits/chosen": -0.5182399749755859,
"logits/rejected": -0.5034407377243042,
"logps/chosen": -310.6617126464844,
"logps/ref_response": -0.5057616829872131,
"logps/rejected": -298.3763427734375,
"loss": 0.538,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.06424389034509659,
"rewards/margins": 0.8863626718521118,
"rewards/rejected": -0.9506064653396606,
"step": 520
},
{
"epoch": 0.5548285789060455,
"grad_norm": 3.6875,
"learning_rate": 2.4588575996495797e-06,
"logits/chosen": -0.47386473417282104,
"logits/rejected": -0.47526755928993225,
"logps/chosen": -272.9559631347656,
"logps/ref_response": -0.45075368881225586,
"logps/rejected": -263.4909973144531,
"loss": 0.5251,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -0.2501363754272461,
"rewards/margins": 0.891460120677948,
"rewards/rejected": -1.1415965557098389,
"step": 530
},
{
"epoch": 0.5652970426589898,
"grad_norm": 4.15625,
"learning_rate": 2.367486188632446e-06,
"logits/chosen": -0.5168323516845703,
"logits/rejected": -0.5130370855331421,
"logps/chosen": -286.60076904296875,
"logps/ref_response": -0.5035119652748108,
"logps/rejected": -326.2984619140625,
"loss": 0.5101,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.09159872680902481,
"rewards/margins": 0.8820840120315552,
"rewards/rejected": -0.9736827611923218,
"step": 540
},
{
"epoch": 0.575765506411934,
"grad_norm": 3.28125,
"learning_rate": 2.276292003092593e-06,
"logits/chosen": -0.538984477519989,
"logits/rejected": -0.5137041807174683,
"logps/chosen": -258.5223083496094,
"logps/ref_response": -0.5067554712295532,
"logps/rejected": -266.6609802246094,
"loss": 0.4938,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.0642676055431366,
"rewards/margins": 0.9660905003547668,
"rewards/rejected": -1.030358076095581,
"step": 550
},
{
"epoch": 0.5862339701648783,
"grad_norm": 3.234375,
"learning_rate": 2.1853970071701415e-06,
"logits/chosen": -0.5292374491691589,
"logits/rejected": -0.49377211928367615,
"logps/chosen": -279.6850891113281,
"logps/ref_response": -0.5059608817100525,
"logps/rejected": -281.2428283691406,
"loss": 0.5137,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.03345666453242302,
"rewards/margins": 0.8658466339111328,
"rewards/rejected": -0.8993034362792969,
"step": 560
},
{
"epoch": 0.5967024339178225,
"grad_norm": 4.0,
"learning_rate": 2.0949227648656194e-06,
"logits/chosen": -0.5543760657310486,
"logits/rejected": -0.5229381918907166,
"logps/chosen": -296.0806579589844,
"logps/ref_response": -0.5283219218254089,
"logps/rejected": -263.48150634765625,
"loss": 0.5265,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.09281423687934875,
"rewards/margins": 0.9194073677062988,
"rewards/rejected": -1.0122215747833252,
"step": 570
},
{
"epoch": 0.6071708976707668,
"grad_norm": 3.5625,
"learning_rate": 2.00499027745888e-06,
"logits/chosen": -0.5253512263298035,
"logits/rejected": -0.5014483332633972,
"logps/chosen": -300.3424377441406,
"logps/ref_response": -0.5130476355552673,
"logps/rejected": -299.5146789550781,
"loss": 0.5382,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.05144411325454712,
"rewards/margins": 0.8135349154472351,
"rewards/rejected": -0.8649789690971375,
"step": 580
},
{
"epoch": 0.6176393614237111,
"grad_norm": 3.46875,
"learning_rate": 1.915719821680624e-06,
"logits/chosen": -0.5522564053535461,
"logits/rejected": -0.4926326870918274,
"logps/chosen": -288.0102844238281,
"logps/ref_response": -0.5210384130477905,
"logps/rejected": -284.1330261230469,
"loss": 0.5143,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": 0.19294333457946777,
"rewards/margins": 0.9110462069511414,
"rewards/rejected": -0.7181028127670288,
"step": 590
},
{
"epoch": 0.6281078251766553,
"grad_norm": 3.5,
"learning_rate": 1.8272307888529276e-06,
"logits/chosen": -0.4761735796928406,
"logits/rejected": -0.4301334321498871,
"logps/chosen": -264.5964660644531,
"logps/ref_response": -0.4653666913509369,
"logps/rejected": -281.8071594238281,
"loss": 0.5155,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.022416314110159874,
"rewards/margins": 0.9364219903945923,
"rewards/rejected": -0.9588383436203003,
"step": 600
},
{
"epoch": 0.6281078251766553,
"eval_logits/chosen": -0.36081573367118835,
"eval_logits/rejected": -0.32467401027679443,
"eval_logps/chosen": -293.3980407714844,
"eval_logps/ref_response": -0.5363935232162476,
"eval_logps/rejected": -285.9612121582031,
"eval_loss": 0.5195037722587585,
"eval_rewards/accuracies": 0.7360000014305115,
"eval_rewards/chosen": -0.0850897878408432,
"eval_rewards/margins": 0.8433744311332703,
"eval_rewards/rejected": -0.9284642338752747,
"eval_runtime": 349.4276,
"eval_samples_per_second": 5.724,
"eval_steps_per_second": 0.358,
"step": 600
},
{
"epoch": 0.6385762889295996,
"grad_norm": 3.0,
"learning_rate": 1.739641525213929e-06,
"logits/chosen": -0.5045549869537354,
"logits/rejected": -0.4906612038612366,
"logps/chosen": -267.28338623046875,
"logps/ref_response": -0.500705897808075,
"logps/rejected": -273.13714599609375,
"loss": 0.4986,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.08845983445644379,
"rewards/margins": 0.9310785531997681,
"rewards/rejected": -1.0195385217666626,
"step": 610
},
{
"epoch": 0.6490447526825438,
"grad_norm": 3.0625,
"learning_rate": 1.6530691736402317e-06,
"logits/chosen": -0.5177065134048462,
"logits/rejected": -0.4805786609649658,
"logps/chosen": -295.038818359375,
"logps/ref_response": -0.502620279788971,
"logps/rejected": -284.8276672363281,
"loss": 0.5092,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.21657638251781464,
"rewards/margins": 0.9273554086685181,
"rewards/rejected": -1.1439317464828491,
"step": 620
},
{
"epoch": 0.6595132164354881,
"grad_norm": 3.625,
"learning_rate": 1.5676295169786864e-06,
"logits/chosen": -0.5329315662384033,
"logits/rejected": -0.4890086054801941,
"logps/chosen": -288.74847412109375,
"logps/ref_response": -0.522256076335907,
"logps/rejected": -274.2325439453125,
"loss": 0.5196,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.2333780825138092,
"rewards/margins": 0.8650426864624023,
"rewards/rejected": -1.0984207391738892,
"step": 630
},
{
"epoch": 0.6699816801884323,
"grad_norm": 3.21875,
"learning_rate": 1.4834368231970922e-06,
"logits/chosen": -0.5609028935432434,
"logits/rejected": -0.5060838460922241,
"logps/chosen": -288.317138671875,
"logps/ref_response": -0.5478745698928833,
"logps/rejected": -274.0264892578125,
"loss": 0.5003,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.1753380298614502,
"rewards/margins": 0.7557013034820557,
"rewards/rejected": -0.9310394525527954,
"step": 640
},
{
"epoch": 0.6804501439413766,
"grad_norm": 3.171875,
"learning_rate": 1.4006036925609245e-06,
"logits/chosen": -0.5304352045059204,
"logits/rejected": -0.48552340269088745,
"logps/chosen": -300.5797424316406,
"logps/ref_response": -0.5103051662445068,
"logps/rejected": -250.87216186523438,
"loss": 0.5263,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -0.19611187279224396,
"rewards/margins": 0.8849571347236633,
"rewards/rejected": -1.0810692310333252,
"step": 650
},
{
"epoch": 0.6909186076943209,
"grad_norm": 3.453125,
"learning_rate": 1.3192409070404582e-06,
"logits/chosen": -0.5464252233505249,
"logits/rejected": -0.5194587111473083,
"logps/chosen": -304.4057312011719,
"logps/ref_response": -0.5286127328872681,
"logps/rejected": -306.74737548828125,
"loss": 0.5106,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": 0.0011399202048778534,
"rewards/margins": 0.8308441042900085,
"rewards/rejected": -0.8297042846679688,
"step": 660
},
{
"epoch": 0.7013870714472651,
"grad_norm": 4.5,
"learning_rate": 1.2394572821496953e-06,
"logits/chosen": -0.5439696311950684,
"logits/rejected": -0.5098680257797241,
"logps/chosen": -277.79193115234375,
"logps/ref_response": -0.5491371154785156,
"logps/rejected": -259.3212890625,
"loss": 0.5184,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.07701723277568817,
"rewards/margins": 0.8310605883598328,
"rewards/rejected": -0.9080777168273926,
"step": 670
},
{
"epoch": 0.7118555352002094,
"grad_norm": 3.109375,
"learning_rate": 1.1613595214152713e-06,
"logits/chosen": -0.5734778642654419,
"logits/rejected": -0.526314377784729,
"logps/chosen": -287.5583801269531,
"logps/ref_response": -0.5694643259048462,
"logps/rejected": -276.66607666015625,
"loss": 0.4993,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.16065016388893127,
"rewards/margins": 0.8544554710388184,
"rewards/rejected": -1.0151057243347168,
"step": 680
},
{
"epoch": 0.7223239989531536,
"grad_norm": 2.578125,
"learning_rate": 1.0850520736699362e-06,
"logits/chosen": -0.5160936117172241,
"logits/rejected": -0.48213791847229004,
"logps/chosen": -341.7447814941406,
"logps/ref_response": -0.4945286810398102,
"logps/rejected": -317.0802001953125,
"loss": 0.5268,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.1648830622434616,
"rewards/margins": 0.9320821762084961,
"rewards/rejected": -1.0969650745391846,
"step": 690
},
{
"epoch": 0.7327924627060979,
"grad_norm": 3.34375,
"learning_rate": 1.0106369933615043e-06,
"logits/chosen": -0.5604196786880493,
"logits/rejected": -0.5098714828491211,
"logps/chosen": -316.64105224609375,
"logps/ref_response": -0.5506774187088013,
"logps/rejected": -264.15411376953125,
"loss": 0.5113,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.22138896584510803,
"rewards/margins": 0.7301830053329468,
"rewards/rejected": -0.9515719413757324,
"step": 700
},
{
"epoch": 0.7327924627060979,
"eval_logits/chosen": -0.34110894799232483,
"eval_logits/rejected": -0.30364856123924255,
"eval_logps/chosen": -294.4884948730469,
"eval_logps/ref_response": -0.5363935232162476,
"eval_logps/rejected": -287.16522216796875,
"eval_loss": 0.5173361301422119,
"eval_rewards/accuracies": 0.734000027179718,
"eval_rewards/chosen": -0.19413326680660248,
"eval_rewards/margins": 0.8547297120094299,
"eval_rewards/rejected": -1.0488630533218384,
"eval_runtime": 349.4515,
"eval_samples_per_second": 5.723,
"eval_steps_per_second": 0.358,
"step": 700
},
{
"epoch": 0.7432609264590422,
"grad_norm": 3.890625,
"learning_rate": 9.382138040640714e-07,
"logits/chosen": -0.5672627687454224,
"logits/rejected": -0.5198173522949219,
"logps/chosen": -265.9830627441406,
"logps/ref_response": -0.5634459257125854,
"logps/rejected": -280.0369873046875,
"loss": 0.532,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.16655750572681427,
"rewards/margins": 0.8667360544204712,
"rewards/rejected": -1.0332934856414795,
"step": 710
},
{
"epoch": 0.7537293902119864,
"grad_norm": 3.25,
"learning_rate": 8.678793653740633e-07,
"logits/chosen": -0.492758572101593,
"logits/rejected": -0.4843314290046692,
"logps/chosen": -264.08270263671875,
"logps/ref_response": -0.49243393540382385,
"logps/rejected": -264.4930114746094,
"loss": 0.5104,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -0.08720338344573975,
"rewards/margins": 0.9140432476997375,
"rewards/rejected": -1.001246690750122,
"step": 720
},
{
"epoch": 0.7641978539649307,
"grad_norm": 2.609375,
"learning_rate": 7.997277433690984e-07,
"logits/chosen": -0.5135891437530518,
"logits/rejected": -0.455097496509552,
"logps/chosen": -302.38580322265625,
"logps/ref_response": -0.4944031834602356,
"logps/rejected": -288.3606872558594,
"loss": 0.5071,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -0.15266835689544678,
"rewards/margins": 0.821180522441864,
"rewards/rejected": -0.973848819732666,
"step": 730
},
{
"epoch": 0.7746663177178749,
"grad_norm": 2.890625,
"learning_rate": 7.338500848029603e-07,
"logits/chosen": -0.4669191241264343,
"logits/rejected": -0.47497326135635376,
"logps/chosen": -292.3653564453125,
"logps/ref_response": -0.4282347559928894,
"logps/rejected": -276.3725280761719,
"loss": 0.4932,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -0.010840972885489464,
"rewards/margins": 0.8605710864067078,
"rewards/rejected": -0.8714120984077454,
"step": 740
},
{
"epoch": 0.7851347814708192,
"grad_norm": 3.234375,
"learning_rate": 6.70334495204884e-07,
"logits/chosen": -0.5066109299659729,
"logits/rejected": -0.4780009388923645,
"logps/chosen": -324.728515625,
"logps/ref_response": -0.49645256996154785,
"logps/rejected": -287.18304443359375,
"loss": 0.5074,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": 0.02283366583287716,
"rewards/margins": 0.854231059551239,
"rewards/rejected": -0.8313972353935242,
"step": 750
},
{
"epoch": 0.7956032452237635,
"grad_norm": 3.4375,
"learning_rate": 6.092659210462232e-07,
"logits/chosen": -0.5308446884155273,
"logits/rejected": -0.5082138776779175,
"logps/chosen": -270.0063781738281,
"logps/ref_response": -0.5222411751747131,
"logps/rejected": -269.8592224121094,
"loss": 0.534,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -0.09487895667552948,
"rewards/margins": 0.7679542303085327,
"rewards/rejected": -0.8628333210945129,
"step": 760
},
{
"epoch": 0.8060717089767077,
"grad_norm": 3.109375,
"learning_rate": 5.507260361320738e-07,
"logits/chosen": -0.5149004459381104,
"logits/rejected": -0.5143811702728271,
"logps/chosen": -285.86566162109375,
"logps/ref_response": -0.50932776927948,
"logps/rejected": -280.4559020996094,
"loss": 0.5136,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": 0.04052960127592087,
"rewards/margins": 0.7234804034233093,
"rewards/rejected": -0.6829507946968079,
"step": 770
},
{
"epoch": 0.816540172729652,
"grad_norm": 2.9375,
"learning_rate": 4.947931323697983e-07,
"logits/chosen": -0.5068045854568481,
"logits/rejected": -0.47291022539138794,
"logps/chosen": -287.3854675292969,
"logps/ref_response": -0.49121037125587463,
"logps/rejected": -281.04669189453125,
"loss": 0.5179,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.09810696542263031,
"rewards/margins": 0.7164817452430725,
"rewards/rejected": -0.8145886659622192,
"step": 780
},
{
"epoch": 0.8270086364825961,
"grad_norm": 3.40625,
"learning_rate": 4.4154201506053985e-07,
"logits/chosen": -0.5332745313644409,
"logits/rejected": -0.5107340812683105,
"logps/chosen": -301.6036071777344,
"logps/ref_response": -0.5042006373405457,
"logps/rejected": -265.80133056640625,
"loss": 0.5114,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.08654189109802246,
"rewards/margins": 0.8664189577102661,
"rewards/rejected": -0.9529608488082886,
"step": 790
},
{
"epoch": 0.8374771002355405,
"grad_norm": 2.828125,
"learning_rate": 3.910439028537638e-07,
"logits/chosen": -0.5402854681015015,
"logits/rejected": -0.48599618673324585,
"logps/chosen": -349.26226806640625,
"logps/ref_response": -0.5149141550064087,
"logps/rejected": -303.9481201171875,
"loss": 0.5268,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": 0.020410016179084778,
"rewards/margins": 0.7255326509475708,
"rewards/rejected": -0.7051225900650024,
"step": 800
},
{
"epoch": 0.8374771002355405,
"eval_logits/chosen": -0.3452620208263397,
"eval_logits/rejected": -0.30824077129364014,
"eval_logps/chosen": -293.0043640136719,
"eval_logps/ref_response": -0.5363935232162476,
"eval_logps/rejected": -285.70001220703125,
"eval_loss": 0.5176524519920349,
"eval_rewards/accuracies": 0.722000002861023,
"eval_rewards/chosen": -0.045722391456365585,
"eval_rewards/margins": 0.8566184043884277,
"eval_rewards/rejected": -0.902340829372406,
"eval_runtime": 349.422,
"eval_samples_per_second": 5.724,
"eval_steps_per_second": 0.358,
"step": 800
},
{
"epoch": 0.8479455639884846,
"grad_norm": 2.78125,
"learning_rate": 3.4336633249862084e-07,
"logits/chosen": -0.564243733882904,
"logits/rejected": -0.48312124609947205,
"logps/chosen": -321.1978759765625,
"logps/ref_response": -0.5519742369651794,
"logps/rejected": -290.4500732421875,
"loss": 0.5015,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.08782587200403214,
"rewards/margins": 0.8487712740898132,
"rewards/rejected": -0.9365970492362976,
"step": 810
},
{
"epoch": 0.8584140277414289,
"grad_norm": 3.203125,
"learning_rate": 2.98573068519539e-07,
"logits/chosen": -0.5403026342391968,
"logits/rejected": -0.5190576910972595,
"logps/chosen": -308.6977844238281,
"logps/ref_response": -0.5307375192642212,
"logps/rejected": -295.208984375,
"loss": 0.5196,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.004915142897516489,
"rewards/margins": 0.8321554064750671,
"rewards/rejected": -0.8370705842971802,
"step": 820
},
{
"epoch": 0.8688824914943732,
"grad_norm": 3.453125,
"learning_rate": 2.5672401793681854e-07,
"logits/chosen": -0.5584547519683838,
"logits/rejected": -0.5359379053115845,
"logps/chosen": -276.21697998046875,
"logps/ref_response": -0.5466696619987488,
"logps/rejected": -271.29046630859375,
"loss": 0.5047,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.08783929795026779,
"rewards/margins": 0.8796290159225464,
"rewards/rejected": -0.9674683809280396,
"step": 830
},
{
"epoch": 0.8793509552473174,
"grad_norm": 3.109375,
"learning_rate": 2.178751501463036e-07,
"logits/chosen": -0.5241914987564087,
"logits/rejected": -0.5036768317222595,
"logps/chosen": -316.0185852050781,
"logps/ref_response": -0.5086795091629028,
"logps/rejected": -309.330810546875,
"loss": 0.5019,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.10169823467731476,
"rewards/margins": 0.8505386114120483,
"rewards/rejected": -0.9522367715835571,
"step": 840
},
{
"epoch": 0.8898194190002617,
"grad_norm": 3.296875,
"learning_rate": 1.820784220652766e-07,
"logits/chosen": -0.5777777433395386,
"logits/rejected": -0.5282770991325378,
"logps/chosen": -347.5041809082031,
"logps/ref_response": -0.5546728372573853,
"logps/rejected": -281.29705810546875,
"loss": 0.5108,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": 0.10623917728662491,
"rewards/margins": 0.9054906964302063,
"rewards/rejected": -0.7992514371871948,
"step": 850
},
{
"epoch": 0.9002878827532059,
"grad_norm": 2.96875,
"learning_rate": 1.4938170864468636e-07,
"logits/chosen": -0.4989868998527527,
"logits/rejected": -0.4683281481266022,
"logps/chosen": -291.2259216308594,
"logps/ref_response": -0.4814940392971039,
"logps/rejected": -270.903564453125,
"loss": 0.4862,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.06208313629031181,
"rewards/margins": 0.9479352235794067,
"rewards/rejected": -1.010018229484558,
"step": 860
},
{
"epoch": 0.9107563465061502,
"grad_norm": 3.828125,
"learning_rate": 1.1982873884064466e-07,
"logits/chosen": -0.4674592614173889,
"logits/rejected": -0.46020442247390747,
"logps/chosen": -288.8802795410156,
"logps/ref_response": -0.463235467672348,
"logps/rejected": -279.07818603515625,
"loss": 0.5202,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.09532221406698227,
"rewards/margins": 0.7154585719108582,
"rewards/rejected": -0.8107808232307434,
"step": 870
},
{
"epoch": 0.9212248102590945,
"grad_norm": 3.640625,
"learning_rate": 9.345903713082305e-08,
"logits/chosen": -0.550748884677887,
"logits/rejected": -0.5384425520896912,
"logps/chosen": -316.01953125,
"logps/ref_response": -0.5406745672225952,
"logps/rejected": -282.6295471191406,
"loss": 0.5288,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.04705243557691574,
"rewards/margins": 0.6842992901802063,
"rewards/rejected": -0.7313517332077026,
"step": 880
},
{
"epoch": 0.9316932740120387,
"grad_norm": 4.21875,
"learning_rate": 7.030787065396866e-08,
"logits/chosen": -0.5192651152610779,
"logits/rejected": -0.47466206550598145,
"logps/chosen": -320.68121337890625,
"logps/ref_response": -0.5117658376693726,
"logps/rejected": -295.110107421875,
"loss": 0.5163,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.031226161867380142,
"rewards/margins": 0.7437794208526611,
"rewards/rejected": -0.7750056385993958,
"step": 890
},
{
"epoch": 0.942161737764983,
"grad_norm": 3.015625,
"learning_rate": 5.0406202043228604e-08,
"logits/chosen": -0.5355256795883179,
"logits/rejected": -0.5220322012901306,
"logps/chosen": -334.91900634765625,
"logps/ref_response": -0.5195636749267578,
"logps/rejected": -276.53973388671875,
"loss": 0.4923,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": 0.11498089134693146,
"rewards/margins": 1.026011347770691,
"rewards/rejected": -0.9110305905342102,
"step": 900
},
{
"epoch": 0.942161737764983,
"eval_logits/chosen": -0.34425824880599976,
"eval_logits/rejected": -0.30724215507507324,
"eval_logps/chosen": -293.0645446777344,
"eval_logps/ref_response": -0.5363935232162476,
"eval_logps/rejected": -285.7690734863281,
"eval_loss": 0.5175051093101501,
"eval_rewards/accuracies": 0.7279999852180481,
"eval_rewards/chosen": -0.05173807963728905,
"eval_rewards/margins": 0.8575091361999512,
"eval_rewards/rejected": -0.9092472791671753,
"eval_runtime": 349.4689,
"eval_samples_per_second": 5.723,
"eval_steps_per_second": 0.358,
"step": 900
},
{
"epoch": 0.9526302015179272,
"grad_norm": 2.953125,
"learning_rate": 3.378064801637687e-08,
"logits/chosen": -0.5745955109596252,
"logits/rejected": -0.5213441848754883,
"logps/chosen": -316.2192077636719,
"logps/ref_response": -0.561827540397644,
"logps/rejected": -315.3379821777344,
"loss": 0.5218,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": 0.04833642765879631,
"rewards/margins": 0.7893211245536804,
"rewards/rejected": -0.7409847378730774,
"step": 910
},
{
"epoch": 0.9630986652708715,
"grad_norm": 2.78125,
"learning_rate": 2.0453443778310766e-08,
"logits/chosen": -0.5022112131118774,
"logits/rejected": -0.45878568291664124,
"logps/chosen": -330.7922668457031,
"logps/ref_response": -0.4732615351676941,
"logps/rejected": -306.924560546875,
"loss": 0.5181,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": 0.02221493050456047,
"rewards/margins": 0.9629158973693848,
"rewards/rejected": -0.9407010078430176,
"step": 920
},
{
"epoch": 0.9735671290238157,
"grad_norm": 2.6875,
"learning_rate": 1.0442413283435759e-08,
"logits/chosen": -0.5029438734054565,
"logits/rejected": -0.45212322473526,
"logps/chosen": -319.1156921386719,
"logps/ref_response": -0.4792579114437103,
"logps/rejected": -280.00390625,
"loss": 0.5078,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": 0.054530493915081024,
"rewards/margins": 1.144339680671692,
"rewards/rejected": -1.0898091793060303,
"step": 930
},
{
"epoch": 0.98403559277676,
"grad_norm": 2.375,
"learning_rate": 3.760945397705828e-09,
"logits/chosen": -0.5288008451461792,
"logits/rejected": -0.4695435166358948,
"logps/chosen": -292.11932373046875,
"logps/ref_response": -0.5234506726264954,
"logps/rejected": -262.4600830078125,
"loss": 0.4922,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": 0.12431895732879639,
"rewards/margins": 1.0587154626846313,
"rewards/rejected": -0.9343963861465454,
"step": 940
},
{
"epoch": 0.9945040565297043,
"grad_norm": 2.8125,
"learning_rate": 4.1797599220405605e-10,
"logits/chosen": -0.5448582172393799,
"logits/rejected": -0.521163821220398,
"logps/chosen": -296.86529541015625,
"logps/ref_response": -0.5367287397384644,
"logps/rejected": -280.74432373046875,
"loss": 0.5133,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": 0.027158012613654137,
"rewards/margins": 0.8814530372619629,
"rewards/rejected": -0.8542949557304382,
"step": 950
},
{
"epoch": 0.9997382884061764,
"step": 955,
"total_flos": 0.0,
"train_loss": 0.5383632463934533,
"train_runtime": 19109.9128,
"train_samples_per_second": 3.199,
"train_steps_per_second": 0.05
}
],
"logging_steps": 10,
"max_steps": 955,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}