Llama3.1-Mamba2-8B-dpo / trainer_state.json
Junxiong Wang
add models
7ae6913
raw
history blame
222 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 5000,
"global_step": 4168,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0002399232245681382,
"grad_norm": 12.830117413274314,
"learning_rate": 1.199040767386091e-09,
"logits/chosen": -0.9333123564720154,
"logits/rejected": -0.9608660936355591,
"logps/chosen": -159.56137084960938,
"logps/rejected": -163.75823974609375,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.0023992322456813818,
"grad_norm": 14.474357775073676,
"learning_rate": 1.199040767386091e-08,
"logits/chosen": -0.7961726188659668,
"logits/rejected": -0.9891128540039062,
"logps/chosen": -372.8497314453125,
"logps/rejected": -307.27880859375,
"loss": 0.6936,
"rewards/accuracies": 0.2222222238779068,
"rewards/chosen": -0.0022438960149884224,
"rewards/margins": -0.0032549728639423847,
"rewards/rejected": 0.001011077081784606,
"step": 10
},
{
"epoch": 0.0047984644913627635,
"grad_norm": 14.81799284714085,
"learning_rate": 2.398081534772182e-08,
"logits/chosen": -0.8415049314498901,
"logits/rejected": -0.8875001072883606,
"logps/chosen": -254.6843719482422,
"logps/rejected": -224.0112762451172,
"loss": 0.6933,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": -0.000493885949254036,
"rewards/margins": -0.0004514312313403934,
"rewards/rejected": -4.245472882757895e-05,
"step": 20
},
{
"epoch": 0.007197696737044146,
"grad_norm": 13.794514003749402,
"learning_rate": 3.597122302158273e-08,
"logits/chosen": -0.9713956117630005,
"logits/rejected": -1.0547858476638794,
"logps/chosen": -246.8055419921875,
"logps/rejected": -250.71664428710938,
"loss": 0.6935,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.001063814153894782,
"rewards/margins": -0.0020584219601005316,
"rewards/rejected": 0.0009946079226210713,
"step": 30
},
{
"epoch": 0.009596928982725527,
"grad_norm": 13.247567844837228,
"learning_rate": 4.796163069544364e-08,
"logits/chosen": -0.9620189666748047,
"logits/rejected": -1.0594929456710815,
"logps/chosen": -245.97018432617188,
"logps/rejected": -238.09866333007812,
"loss": 0.6934,
"rewards/accuracies": 0.5,
"rewards/chosen": 0.0002268419339088723,
"rewards/margins": -0.0007565030828118324,
"rewards/rejected": 0.0009833450894802809,
"step": 40
},
{
"epoch": 0.01199616122840691,
"grad_norm": 14.63019248204087,
"learning_rate": 5.995203836930455e-08,
"logits/chosen": -0.8769725561141968,
"logits/rejected": -0.9388787150382996,
"logps/chosen": -273.33740234375,
"logps/rejected": -236.96035766601562,
"loss": 0.6932,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.00011439235095167533,
"rewards/margins": -0.0013857032172381878,
"rewards/rejected": 0.0012713107280433178,
"step": 50
},
{
"epoch": 0.014395393474088292,
"grad_norm": 14.759835440686068,
"learning_rate": 7.194244604316546e-08,
"logits/chosen": -1.0583831071853638,
"logits/rejected": -0.98320472240448,
"logps/chosen": -289.59100341796875,
"logps/rejected": -263.46539306640625,
"loss": 0.693,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.001381714828312397,
"rewards/margins": -0.001984237926080823,
"rewards/rejected": 0.0006025228649377823,
"step": 60
},
{
"epoch": 0.016794625719769675,
"grad_norm": 13.293061279163139,
"learning_rate": 8.393285371702638e-08,
"logits/chosen": -0.7076249122619629,
"logits/rejected": -0.7553724050521851,
"logps/chosen": -280.7513122558594,
"logps/rejected": -270.6877136230469,
"loss": 0.6927,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.0003030824300367385,
"rewards/margins": 0.0011825991095975041,
"rewards/rejected": -0.0014856813941150904,
"step": 70
},
{
"epoch": 0.019193857965451054,
"grad_norm": 13.898946391946994,
"learning_rate": 9.592326139088728e-08,
"logits/chosen": -1.043713927268982,
"logits/rejected": -0.7779287099838257,
"logps/chosen": -202.8668212890625,
"logps/rejected": -240.8871612548828,
"loss": 0.6931,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.0010693834628909826,
"rewards/margins": 0.0018330765888094902,
"rewards/rejected": -0.000763692834880203,
"step": 80
},
{
"epoch": 0.021593090211132437,
"grad_norm": 13.852772166042456,
"learning_rate": 1.0791366906474819e-07,
"logits/chosen": -1.053758978843689,
"logits/rejected": -1.118728756904602,
"logps/chosen": -346.8046875,
"logps/rejected": -297.9544677734375,
"loss": 0.693,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": 0.0008068332681432366,
"rewards/margins": 0.0018463155720382929,
"rewards/rejected": -0.0010394820710644126,
"step": 90
},
{
"epoch": 0.02399232245681382,
"grad_norm": 14.519229316524262,
"learning_rate": 1.199040767386091e-07,
"logits/chosen": -0.7971643209457397,
"logits/rejected": -0.6808469295501709,
"logps/chosen": -260.2826843261719,
"logps/rejected": -278.73406982421875,
"loss": 0.6924,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": -0.001732430187985301,
"rewards/margins": 0.0005316648748703301,
"rewards/rejected": -0.002264095237478614,
"step": 100
},
{
"epoch": 0.026391554702495202,
"grad_norm": 12.435724553655291,
"learning_rate": 1.3189448441247004e-07,
"logits/chosen": -0.9703518152236938,
"logits/rejected": -1.0042107105255127,
"logps/chosen": -230.6790771484375,
"logps/rejected": -227.7737274169922,
"loss": 0.6922,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.002495633903890848,
"rewards/margins": 0.00144762615673244,
"rewards/rejected": -0.0039432598277926445,
"step": 110
},
{
"epoch": 0.028790786948176585,
"grad_norm": 14.017624859502787,
"learning_rate": 1.4388489208633092e-07,
"logits/chosen": -0.8614851832389832,
"logits/rejected": -0.9825533032417297,
"logps/chosen": -302.4905090332031,
"logps/rejected": -277.0740966796875,
"loss": 0.6918,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.004078847821801901,
"rewards/margins": 0.0013862637570127845,
"rewards/rejected": -0.005465111695230007,
"step": 120
},
{
"epoch": 0.031190019193857964,
"grad_norm": 12.415568509175522,
"learning_rate": 1.5587529976019183e-07,
"logits/chosen": -1.0289809703826904,
"logits/rejected": -0.9567875862121582,
"logps/chosen": -224.55801391601562,
"logps/rejected": -305.19329833984375,
"loss": 0.6912,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.0029300746973603964,
"rewards/margins": 0.0053221117705106735,
"rewards/rejected": -0.008252186700701714,
"step": 130
},
{
"epoch": 0.03358925143953935,
"grad_norm": 13.71977139583337,
"learning_rate": 1.6786570743405277e-07,
"logits/chosen": -0.7473757266998291,
"logits/rejected": -0.7813047170639038,
"logps/chosen": -280.0737609863281,
"logps/rejected": -270.67962646484375,
"loss": 0.6908,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.007163494825363159,
"rewards/margins": 0.0030975989066064358,
"rewards/rejected": -0.010261094197630882,
"step": 140
},
{
"epoch": 0.03598848368522073,
"grad_norm": 13.807869552669564,
"learning_rate": 1.7985611510791365e-07,
"logits/chosen": -0.9591526985168457,
"logits/rejected": -0.973153293132782,
"logps/chosen": -230.39022827148438,
"logps/rejected": -225.483642578125,
"loss": 0.6903,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.00702179130166769,
"rewards/margins": 0.008269025012850761,
"rewards/rejected": -0.015290816314518452,
"step": 150
},
{
"epoch": 0.03838771593090211,
"grad_norm": 14.320312006310687,
"learning_rate": 1.9184652278177456e-07,
"logits/chosen": -0.7980312705039978,
"logits/rejected": -0.8937035799026489,
"logps/chosen": -296.2853088378906,
"logps/rejected": -229.8527374267578,
"loss": 0.6888,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.012824411503970623,
"rewards/margins": 0.0057816035114228725,
"rewards/rejected": -0.018606014549732208,
"step": 160
},
{
"epoch": 0.040786948176583494,
"grad_norm": 13.000634918715434,
"learning_rate": 2.038369304556355e-07,
"logits/chosen": -0.770675778388977,
"logits/rejected": -0.8101946711540222,
"logps/chosen": -343.27947998046875,
"logps/rejected": -335.1900329589844,
"loss": 0.6879,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.01736806333065033,
"rewards/margins": 0.010059249587357044,
"rewards/rejected": -0.0274273119866848,
"step": 170
},
{
"epoch": 0.04318618042226487,
"grad_norm": 14.311756905286574,
"learning_rate": 2.1582733812949638e-07,
"logits/chosen": -1.056549072265625,
"logits/rejected": -1.0499789714813232,
"logps/chosen": -238.1182403564453,
"logps/rejected": -229.7616729736328,
"loss": 0.6891,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.02097422443330288,
"rewards/margins": 0.00995118822902441,
"rewards/rejected": -0.030925417318940163,
"step": 180
},
{
"epoch": 0.04558541266794626,
"grad_norm": 15.983225243139483,
"learning_rate": 2.278177458033573e-07,
"logits/chosen": -0.8500925302505493,
"logits/rejected": -0.9182466268539429,
"logps/chosen": -307.84356689453125,
"logps/rejected": -250.31552124023438,
"loss": 0.6864,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.02229948900640011,
"rewards/margins": 0.012221109122037888,
"rewards/rejected": -0.034520603716373444,
"step": 190
},
{
"epoch": 0.04798464491362764,
"grad_norm": 12.792863291405517,
"learning_rate": 2.398081534772182e-07,
"logits/chosen": -0.9231821894645691,
"logits/rejected": -0.8621791005134583,
"logps/chosen": -315.01055908203125,
"logps/rejected": -300.3835754394531,
"loss": 0.684,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.028801122680306435,
"rewards/margins": 0.020397091284394264,
"rewards/rejected": -0.0491982102394104,
"step": 200
},
{
"epoch": 0.05038387715930902,
"grad_norm": 13.564982268308716,
"learning_rate": 2.517985611510791e-07,
"logits/chosen": -0.8487990498542786,
"logits/rejected": -0.8659976720809937,
"logps/chosen": -232.47500610351562,
"logps/rejected": -256.1665344238281,
"loss": 0.6824,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.036128297448158264,
"rewards/margins": 0.01755443774163723,
"rewards/rejected": -0.053682733327150345,
"step": 210
},
{
"epoch": 0.052783109404990404,
"grad_norm": 13.6123418164062,
"learning_rate": 2.637889688249401e-07,
"logits/chosen": -0.8184248208999634,
"logits/rejected": -0.8951767683029175,
"logps/chosen": -316.2499084472656,
"logps/rejected": -317.16680908203125,
"loss": 0.6826,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.052874885499477386,
"rewards/margins": 0.020790213719010353,
"rewards/rejected": -0.07366509735584259,
"step": 220
},
{
"epoch": 0.05518234165067178,
"grad_norm": 14.40416586096848,
"learning_rate": 2.7577937649880093e-07,
"logits/chosen": -0.8254715204238892,
"logits/rejected": -0.7313406467437744,
"logps/chosen": -243.4302520751953,
"logps/rejected": -282.7564697265625,
"loss": 0.6764,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.05869588255882263,
"rewards/margins": 0.04219502583146095,
"rewards/rejected": -0.10089089721441269,
"step": 230
},
{
"epoch": 0.05758157389635317,
"grad_norm": 15.966117267131516,
"learning_rate": 2.8776978417266184e-07,
"logits/chosen": -0.9691603779792786,
"logits/rejected": -1.0244967937469482,
"logps/chosen": -307.95867919921875,
"logps/rejected": -264.9517517089844,
"loss": 0.6713,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.061439257115125656,
"rewards/margins": 0.04740726947784424,
"rewards/rejected": -0.1088465228676796,
"step": 240
},
{
"epoch": 0.05998080614203455,
"grad_norm": 15.540992469488703,
"learning_rate": 2.997601918465228e-07,
"logits/chosen": -0.9066941142082214,
"logits/rejected": -0.9724334478378296,
"logps/chosen": -245.83828735351562,
"logps/rejected": -241.96401977539062,
"loss": 0.6705,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.08735504001379013,
"rewards/margins": 0.03149568662047386,
"rewards/rejected": -0.1188507229089737,
"step": 250
},
{
"epoch": 0.06238003838771593,
"grad_norm": 14.016457897110723,
"learning_rate": 3.1175059952038366e-07,
"logits/chosen": -0.9671137928962708,
"logits/rejected": -0.8278489112854004,
"logps/chosen": -271.09344482421875,
"logps/rejected": -267.5201110839844,
"loss": 0.6634,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.09744887799024582,
"rewards/margins": 0.07394719123840332,
"rewards/rejected": -0.17139606177806854,
"step": 260
},
{
"epoch": 0.0647792706333973,
"grad_norm": 15.591971270081572,
"learning_rate": 3.2374100719424457e-07,
"logits/chosen": -0.8864096403121948,
"logits/rejected": -1.0893969535827637,
"logps/chosen": -297.97674560546875,
"logps/rejected": -242.01272583007812,
"loss": 0.6597,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.10928692668676376,
"rewards/margins": 0.024153277277946472,
"rewards/rejected": -0.13344022631645203,
"step": 270
},
{
"epoch": 0.0671785028790787,
"grad_norm": 16.24546965787235,
"learning_rate": 3.3573141486810554e-07,
"logits/chosen": -1.0096687078475952,
"logits/rejected": -0.9647713899612427,
"logps/chosen": -311.0169677734375,
"logps/rejected": -303.4920349121094,
"loss": 0.6447,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.15291506052017212,
"rewards/margins": 0.09652292728424072,
"rewards/rejected": -0.24943797290325165,
"step": 280
},
{
"epoch": 0.06957773512476008,
"grad_norm": 13.70885821013161,
"learning_rate": 3.477218225419664e-07,
"logits/chosen": -0.879925549030304,
"logits/rejected": -0.7820504903793335,
"logps/chosen": -306.32293701171875,
"logps/rejected": -284.5321960449219,
"loss": 0.6453,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.19852982461452484,
"rewards/margins": 0.10227125883102417,
"rewards/rejected": -0.3008010983467102,
"step": 290
},
{
"epoch": 0.07197696737044146,
"grad_norm": 16.420599197521923,
"learning_rate": 3.597122302158273e-07,
"logits/chosen": -0.9782785177230835,
"logits/rejected": -1.02054762840271,
"logps/chosen": -278.99755859375,
"logps/rejected": -304.5132141113281,
"loss": 0.648,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.25667688250541687,
"rewards/margins": 0.10419619083404541,
"rewards/rejected": -0.3608730733394623,
"step": 300
},
{
"epoch": 0.07437619961612284,
"grad_norm": 15.857184742509135,
"learning_rate": 3.7170263788968827e-07,
"logits/chosen": -0.8711949586868286,
"logits/rejected": -0.9557937383651733,
"logps/chosen": -297.0644226074219,
"logps/rejected": -263.8592529296875,
"loss": 0.655,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.26040521264076233,
"rewards/margins": 0.16062946617603302,
"rewards/rejected": -0.42103463411331177,
"step": 310
},
{
"epoch": 0.07677543186180422,
"grad_norm": 15.154553129414445,
"learning_rate": 3.836930455635491e-07,
"logits/chosen": -0.9826911687850952,
"logits/rejected": -1.0516026020050049,
"logps/chosen": -302.815673828125,
"logps/rejected": -278.9208984375,
"loss": 0.6482,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.28811559081077576,
"rewards/margins": 0.10437599569559097,
"rewards/rejected": -0.39249157905578613,
"step": 320
},
{
"epoch": 0.07917466410748561,
"grad_norm": 14.842365249432783,
"learning_rate": 3.9568345323741003e-07,
"logits/chosen": -0.8737947344779968,
"logits/rejected": -0.765605092048645,
"logps/chosen": -283.9456787109375,
"logps/rejected": -342.69580078125,
"loss": 0.632,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.3966819941997528,
"rewards/margins": 0.21009349822998047,
"rewards/rejected": -0.6067754030227661,
"step": 330
},
{
"epoch": 0.08157389635316699,
"grad_norm": 16.05033848642518,
"learning_rate": 4.07673860911271e-07,
"logits/chosen": -0.7628117799758911,
"logits/rejected": -0.8412669897079468,
"logps/chosen": -271.9056091308594,
"logps/rejected": -311.70526123046875,
"loss": 0.6336,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.318366676568985,
"rewards/margins": 0.2277708798646927,
"rewards/rejected": -0.5461375117301941,
"step": 340
},
{
"epoch": 0.08397312859884837,
"grad_norm": 17.720120609163384,
"learning_rate": 4.1966426858513185e-07,
"logits/chosen": -1.0602662563323975,
"logits/rejected": -1.0608508586883545,
"logps/chosen": -316.4015197753906,
"logps/rejected": -330.9758605957031,
"loss": 0.6383,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.4658915102481842,
"rewards/margins": 0.16727015376091003,
"rewards/rejected": -0.6331616640090942,
"step": 350
},
{
"epoch": 0.08637236084452975,
"grad_norm": 16.65175354746556,
"learning_rate": 4.3165467625899276e-07,
"logits/chosen": -0.8833236694335938,
"logits/rejected": -1.0470280647277832,
"logps/chosen": -314.62298583984375,
"logps/rejected": -272.09527587890625,
"loss": 0.6334,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.4979444146156311,
"rewards/margins": 0.1287182867527008,
"rewards/rejected": -0.6266626715660095,
"step": 360
},
{
"epoch": 0.08877159309021113,
"grad_norm": 18.783320614959067,
"learning_rate": 4.436450839328537e-07,
"logits/chosen": -0.8926633596420288,
"logits/rejected": -0.820746898651123,
"logps/chosen": -289.3975524902344,
"logps/rejected": -326.2989196777344,
"loss": 0.6239,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.5632190108299255,
"rewards/margins": 0.2842358648777008,
"rewards/rejected": -0.8474549055099487,
"step": 370
},
{
"epoch": 0.09117082533589252,
"grad_norm": 15.455693570330665,
"learning_rate": 4.556354916067146e-07,
"logits/chosen": -1.033244013786316,
"logits/rejected": -0.9363048672676086,
"logps/chosen": -280.1861877441406,
"logps/rejected": -303.18902587890625,
"loss": 0.5982,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.43663984537124634,
"rewards/margins": 0.23225346207618713,
"rewards/rejected": -0.6688933372497559,
"step": 380
},
{
"epoch": 0.0935700575815739,
"grad_norm": 19.882511537780037,
"learning_rate": 4.676258992805755e-07,
"logits/chosen": -0.8549890518188477,
"logits/rejected": -0.8999547958374023,
"logps/chosen": -325.2740783691406,
"logps/rejected": -315.0982666015625,
"loss": 0.6162,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.6211831569671631,
"rewards/margins": 0.15810197591781616,
"rewards/rejected": -0.779285192489624,
"step": 390
},
{
"epoch": 0.09596928982725528,
"grad_norm": 16.372176463297425,
"learning_rate": 4.796163069544364e-07,
"logits/chosen": -0.9059786796569824,
"logits/rejected": -0.9771867990493774,
"logps/chosen": -333.3121032714844,
"logps/rejected": -343.83929443359375,
"loss": 0.6163,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.710607647895813,
"rewards/margins": 0.3263167440891266,
"rewards/rejected": -1.0369244813919067,
"step": 400
},
{
"epoch": 0.09836852207293666,
"grad_norm": 21.155324613616017,
"learning_rate": 4.916067146282974e-07,
"logits/chosen": -1.0448824167251587,
"logits/rejected": -0.9939874410629272,
"logps/chosen": -319.8747253417969,
"logps/rejected": -378.6325378417969,
"loss": 0.5826,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.7368643879890442,
"rewards/margins": 0.3942120373249054,
"rewards/rejected": -1.131076455116272,
"step": 410
},
{
"epoch": 0.10076775431861804,
"grad_norm": 23.801221104048313,
"learning_rate": 4.999992108529978e-07,
"logits/chosen": -0.9094634056091309,
"logits/rejected": -0.9350014925003052,
"logps/chosen": -435.54412841796875,
"logps/rejected": -445.7367248535156,
"loss": 0.6147,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.0090878009796143,
"rewards/margins": 0.46369805932044983,
"rewards/rejected": -1.4727857112884521,
"step": 420
},
{
"epoch": 0.10316698656429943,
"grad_norm": 24.146443568702797,
"learning_rate": 4.999851817115532e-07,
"logits/chosen": -1.1204873323440552,
"logits/rejected": -1.0295510292053223,
"logps/chosen": -337.2879943847656,
"logps/rejected": -372.93011474609375,
"loss": 0.6154,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.8201831579208374,
"rewards/margins": 0.4531213343143463,
"rewards/rejected": -1.2733044624328613,
"step": 430
},
{
"epoch": 0.10556621880998081,
"grad_norm": 17.50567821054797,
"learning_rate": 4.999536171027889e-07,
"logits/chosen": -0.8827461004257202,
"logits/rejected": -0.9801127314567566,
"logps/chosen": -339.089599609375,
"logps/rejected": -357.59283447265625,
"loss": 0.6034,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.8114099502563477,
"rewards/margins": 0.2592558264732361,
"rewards/rejected": -1.0706658363342285,
"step": 440
},
{
"epoch": 0.10796545105566219,
"grad_norm": 20.292040142941033,
"learning_rate": 4.999045192408369e-07,
"logits/chosen": -0.9986470937728882,
"logits/rejected": -1.0170477628707886,
"logps/chosen": -317.05694580078125,
"logps/rejected": -324.7137145996094,
"loss": 0.6153,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.7792980074882507,
"rewards/margins": 0.23761887848377228,
"rewards/rejected": -1.0169168710708618,
"step": 450
},
{
"epoch": 0.11036468330134357,
"grad_norm": 20.30371683148718,
"learning_rate": 4.998378915697171e-07,
"logits/chosen": -0.9217838048934937,
"logits/rejected": -0.9802480936050415,
"logps/chosen": -339.0627136230469,
"logps/rejected": -374.9605407714844,
"loss": 0.5762,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.5629919767379761,
"rewards/margins": 0.43420299887657166,
"rewards/rejected": -0.9971949458122253,
"step": 460
},
{
"epoch": 0.11276391554702495,
"grad_norm": 23.739625609026348,
"learning_rate": 4.997537387630958e-07,
"logits/chosen": -1.0254257917404175,
"logits/rejected": -1.0769364833831787,
"logps/chosen": -299.828125,
"logps/rejected": -346.30621337890625,
"loss": 0.5646,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.757472038269043,
"rewards/margins": 0.4156116843223572,
"rewards/rejected": -1.1730836629867554,
"step": 470
},
{
"epoch": 0.11516314779270634,
"grad_norm": 24.76367716580562,
"learning_rate": 4.996520667239582e-07,
"logits/chosen": -1.2554540634155273,
"logits/rejected": -1.1178642511367798,
"logps/chosen": -323.2625427246094,
"logps/rejected": -438.39569091796875,
"loss": 0.5663,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.7605866193771362,
"rewards/margins": 0.6550606489181519,
"rewards/rejected": -1.415647268295288,
"step": 480
},
{
"epoch": 0.11756238003838772,
"grad_norm": 24.50216433662912,
"learning_rate": 4.995328825841939e-07,
"logits/chosen": -0.967789351940155,
"logits/rejected": -0.9846280813217163,
"logps/chosen": -327.4945373535156,
"logps/rejected": -458.9354553222656,
"loss": 0.5625,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.8878215551376343,
"rewards/margins": 1.3659319877624512,
"rewards/rejected": -2.253753662109375,
"step": 490
},
{
"epoch": 0.1199616122840691,
"grad_norm": 22.44855997761646,
"learning_rate": 4.993961947040967e-07,
"logits/chosen": -0.9332793354988098,
"logits/rejected": -1.0110089778900146,
"logps/chosen": -384.03509521484375,
"logps/rejected": -390.9599914550781,
"loss": 0.5758,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.0161880254745483,
"rewards/margins": 0.4222269654273987,
"rewards/rejected": -1.438415288925171,
"step": 500
},
{
"epoch": 0.12236084452975048,
"grad_norm": 20.668791420309383,
"learning_rate": 4.992420126717784e-07,
"logits/chosen": -1.0082364082336426,
"logits/rejected": -0.9416404962539673,
"logps/chosen": -328.42987060546875,
"logps/rejected": -429.83099365234375,
"loss": 0.5653,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.6905103921890259,
"rewards/margins": 1.0820300579071045,
"rewards/rejected": -1.7725404500961304,
"step": 510
},
{
"epoch": 0.12476007677543186,
"grad_norm": 22.77455852864622,
"learning_rate": 4.990703473024958e-07,
"logits/chosen": -0.8772110939025879,
"logits/rejected": -1.0350819826126099,
"logps/chosen": -414.46978759765625,
"logps/rejected": -478.1474609375,
"loss": 0.5796,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.259319543838501,
"rewards/margins": 0.6991303563117981,
"rewards/rejected": -1.9584500789642334,
"step": 520
},
{
"epoch": 0.12715930902111325,
"grad_norm": 22.984070098434174,
"learning_rate": 4.98881210637893e-07,
"logits/chosen": -1.167495608329773,
"logits/rejected": -1.10280442237854,
"logps/chosen": -298.97845458984375,
"logps/rejected": -406.2267761230469,
"loss": 0.5751,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -0.7922585606575012,
"rewards/margins": 0.7441150546073914,
"rewards/rejected": -1.5363736152648926,
"step": 530
},
{
"epoch": 0.1295585412667946,
"grad_norm": 28.715620814370784,
"learning_rate": 4.986746159451553e-07,
"logits/chosen": -0.9988299608230591,
"logits/rejected": -1.0019475221633911,
"logps/chosen": -349.500732421875,
"logps/rejected": -464.0498046875,
"loss": 0.5733,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.9567633867263794,
"rewards/margins": 1.248631477355957,
"rewards/rejected": -2.205394983291626,
"step": 540
},
{
"epoch": 0.131957773512476,
"grad_norm": 24.64040581669011,
"learning_rate": 4.984505777160795e-07,
"logits/chosen": -0.8519158363342285,
"logits/rejected": -0.8761106729507446,
"logps/chosen": -391.86907958984375,
"logps/rejected": -490.4788513183594,
"loss": 0.5847,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.9429546594619751,
"rewards/margins": 0.9859554171562195,
"rewards/rejected": -1.9289100170135498,
"step": 550
},
{
"epoch": 0.1343570057581574,
"grad_norm": 21.445602932905317,
"learning_rate": 4.982091116660574e-07,
"logits/chosen": -0.984406590461731,
"logits/rejected": -1.1147202253341675,
"logps/chosen": -273.537353515625,
"logps/rejected": -286.6374816894531,
"loss": 0.5874,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.7375530004501343,
"rewards/margins": 0.3278002142906189,
"rewards/rejected": -1.0653531551361084,
"step": 560
},
{
"epoch": 0.13675623800383876,
"grad_norm": 30.19526853243183,
"learning_rate": 4.979502347329732e-07,
"logits/chosen": -0.8063471913337708,
"logits/rejected": -0.8000919222831726,
"logps/chosen": -368.16864013671875,
"logps/rejected": -476.47247314453125,
"loss": 0.556,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.9274753332138062,
"rewards/margins": 0.7706169486045837,
"rewards/rejected": -1.6980924606323242,
"step": 570
},
{
"epoch": 0.13915547024952016,
"grad_norm": 43.10504058256567,
"learning_rate": 4.976739650760151e-07,
"logits/chosen": -1.0534042119979858,
"logits/rejected": -1.0876705646514893,
"logps/chosen": -374.096923828125,
"logps/rejected": -453.3235778808594,
"loss": 0.5548,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.1569541692733765,
"rewards/margins": 0.8895587921142578,
"rewards/rejected": -2.046513080596924,
"step": 580
},
{
"epoch": 0.14155470249520152,
"grad_norm": 33.097831242106516,
"learning_rate": 4.97380322074402e-07,
"logits/chosen": -0.7486315369606018,
"logits/rejected": -0.8265093564987183,
"logps/chosen": -342.4309387207031,
"logps/rejected": -411.20416259765625,
"loss": 0.5917,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.2151663303375244,
"rewards/margins": 0.699668824672699,
"rewards/rejected": -1.9148353338241577,
"step": 590
},
{
"epoch": 0.14395393474088292,
"grad_norm": 26.621827613808147,
"learning_rate": 4.970693263260237e-07,
"logits/chosen": -1.0230684280395508,
"logits/rejected": -1.1072685718536377,
"logps/chosen": -387.28839111328125,
"logps/rejected": -406.2510986328125,
"loss": 0.5536,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.8648967742919922,
"rewards/margins": 0.6640993356704712,
"rewards/rejected": -1.528996229171753,
"step": 600
},
{
"epoch": 0.1463531669865643,
"grad_norm": 36.11327139057057,
"learning_rate": 4.967409996459966e-07,
"logits/chosen": -0.9299138188362122,
"logits/rejected": -0.949521541595459,
"logps/chosen": -357.72381591796875,
"logps/rejected": -395.5397644042969,
"loss": 0.5318,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.9965571165084839,
"rewards/margins": 0.6282019019126892,
"rewards/rejected": -1.6247589588165283,
"step": 610
},
{
"epoch": 0.14875239923224567,
"grad_norm": 110.15447659680028,
"learning_rate": 4.963953650651326e-07,
"logits/chosen": -0.8881407976150513,
"logits/rejected": -0.9683340191841125,
"logps/chosen": -517.2413330078125,
"logps/rejected": -486.80865478515625,
"loss": 0.5738,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.7012176513671875,
"rewards/margins": 0.6705323457717896,
"rewards/rejected": -2.3717498779296875,
"step": 620
},
{
"epoch": 0.15115163147792707,
"grad_norm": 24.7770238152267,
"learning_rate": 4.960324468283248e-07,
"logits/chosen": -1.0681040287017822,
"logits/rejected": -1.102770209312439,
"logps/chosen": -297.15594482421875,
"logps/rejected": -379.40032958984375,
"loss": 0.5235,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.7767683267593384,
"rewards/margins": 0.8115633726119995,
"rewards/rejected": -1.5883318185806274,
"step": 630
},
{
"epoch": 0.15355086372360843,
"grad_norm": 39.277732598538044,
"learning_rate": 4.956522703928451e-07,
"logits/chosen": -1.011266827583313,
"logits/rejected": -0.8994027376174927,
"logps/chosen": -318.6116943359375,
"logps/rejected": -397.0060119628906,
"loss": 0.5233,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.8186876177787781,
"rewards/margins": 0.6957352757453918,
"rewards/rejected": -1.5144227743148804,
"step": 640
},
{
"epoch": 0.15595009596928983,
"grad_norm": 37.25851736588769,
"learning_rate": 4.952548624265606e-07,
"logits/chosen": -0.9296770095825195,
"logits/rejected": -0.9706400632858276,
"logps/chosen": -394.94012451171875,
"logps/rejected": -462.7085876464844,
"loss": 0.5689,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.3027846813201904,
"rewards/margins": 0.7685932517051697,
"rewards/rejected": -2.071377992630005,
"step": 650
},
{
"epoch": 0.15834932821497122,
"grad_norm": 23.054150563951033,
"learning_rate": 4.948402508060607e-07,
"logits/chosen": -1.106856346130371,
"logits/rejected": -1.1293423175811768,
"logps/chosen": -319.3155822753906,
"logps/rejected": -400.53863525390625,
"loss": 0.5562,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.8783910870552063,
"rewards/margins": 0.9650925397872925,
"rewards/rejected": -1.843483567237854,
"step": 660
},
{
"epoch": 0.16074856046065258,
"grad_norm": 42.278776304188725,
"learning_rate": 4.944084646147038e-07,
"logits/chosen": -0.9512613415718079,
"logits/rejected": -1.0062638521194458,
"logps/chosen": -390.70245361328125,
"logps/rejected": -408.19207763671875,
"loss": 0.6226,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.8297697305679321,
"rewards/margins": 0.40133753418922424,
"rewards/rejected": -1.2311073541641235,
"step": 670
},
{
"epoch": 0.16314779270633398,
"grad_norm": 26.571180814300117,
"learning_rate": 4.939595341405754e-07,
"logits/chosen": -0.936383843421936,
"logits/rejected": -0.9673361778259277,
"logps/chosen": -333.87176513671875,
"logps/rejected": -367.17034912109375,
"loss": 0.5324,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.8141332864761353,
"rewards/margins": 0.5334564447402954,
"rewards/rejected": -1.3475896120071411,
"step": 680
},
{
"epoch": 0.16554702495201534,
"grad_norm": 31.203377729047734,
"learning_rate": 4.93493490874365e-07,
"logits/chosen": -0.970528781414032,
"logits/rejected": -0.9868891835212708,
"logps/chosen": -359.50250244140625,
"logps/rejected": -437.6192932128906,
"loss": 0.5371,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.2206305265426636,
"rewards/margins": 0.7284920811653137,
"rewards/rejected": -1.949122667312622,
"step": 690
},
{
"epoch": 0.16794625719769674,
"grad_norm": 26.866432369585116,
"learning_rate": 4.93010367507156e-07,
"logits/chosen": -1.1611895561218262,
"logits/rejected": -1.1491591930389404,
"logps/chosen": -315.22088623046875,
"logps/rejected": -436.4913635253906,
"loss": 0.5069,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.0012879371643066,
"rewards/margins": 1.438588261604309,
"rewards/rejected": -2.439876079559326,
"step": 700
},
{
"epoch": 0.17034548944337813,
"grad_norm": 39.78872918816068,
"learning_rate": 4.925101979281332e-07,
"logits/chosen": -1.0544211864471436,
"logits/rejected": -1.2225271463394165,
"logps/chosen": -457.6055603027344,
"logps/rejected": -532.146240234375,
"loss": 0.5203,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.50386381149292,
"rewards/margins": 1.2967182397842407,
"rewards/rejected": -2.800581693649292,
"step": 710
},
{
"epoch": 0.1727447216890595,
"grad_norm": 51.98312964639157,
"learning_rate": 4.919930172222054e-07,
"logits/chosen": -1.0103614330291748,
"logits/rejected": -1.0998207330703735,
"logps/chosen": -389.05126953125,
"logps/rejected": -467.95892333984375,
"loss": 0.5022,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.418017029762268,
"rewards/margins": 0.8280168771743774,
"rewards/rejected": -2.2460339069366455,
"step": 720
},
{
"epoch": 0.1751439539347409,
"grad_norm": 63.53276334873331,
"learning_rate": 4.914588616675445e-07,
"logits/chosen": -1.100673794746399,
"logits/rejected": -1.091517686843872,
"logps/chosen": -379.39208984375,
"logps/rejected": -458.4183044433594,
"loss": 0.5784,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.3578667640686035,
"rewards/margins": 0.995037853717804,
"rewards/rejected": -2.3529045581817627,
"step": 730
},
{
"epoch": 0.17754318618042225,
"grad_norm": 29.527270739257894,
"learning_rate": 4.909077687330404e-07,
"logits/chosen": -0.8857021331787109,
"logits/rejected": -0.975568413734436,
"logps/chosen": -357.48187255859375,
"logps/rejected": -378.65234375,
"loss": 0.5127,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.9617660641670227,
"rewards/margins": 0.4279107451438904,
"rewards/rejected": -1.3896766901016235,
"step": 740
},
{
"epoch": 0.17994241842610365,
"grad_norm": 48.092463876152614,
"learning_rate": 4.903397770756729e-07,
"logits/chosen": -1.0979223251342773,
"logits/rejected": -1.15996515750885,
"logps/chosen": -342.7486267089844,
"logps/rejected": -429.194580078125,
"loss": 0.5268,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.8847867846488953,
"rewards/margins": 0.835138201713562,
"rewards/rejected": -1.7199251651763916,
"step": 750
},
{
"epoch": 0.18234165067178504,
"grad_norm": 32.41150087194352,
"learning_rate": 4.897549265378004e-07,
"logits/chosen": -1.0312681198120117,
"logits/rejected": -1.1079070568084717,
"logps/chosen": -474.59259033203125,
"logps/rejected": -628.4571533203125,
"loss": 0.5173,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.605443000793457,
"rewards/margins": 1.5567104816436768,
"rewards/rejected": -3.162153720855713,
"step": 760
},
{
"epoch": 0.1847408829174664,
"grad_norm": 43.75189458459123,
"learning_rate": 4.891532581443643e-07,
"logits/chosen": -1.274938941001892,
"logits/rejected": -1.308607816696167,
"logps/chosen": -466.8857421875,
"logps/rejected": -609.5013427734375,
"loss": 0.5271,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.6197988986968994,
"rewards/margins": 1.4712640047073364,
"rewards/rejected": -3.0910630226135254,
"step": 770
},
{
"epoch": 0.1871401151631478,
"grad_norm": 39.5075439819695,
"learning_rate": 4.885348141000122e-07,
"logits/chosen": -1.109499216079712,
"logits/rejected": -1.0709692239761353,
"logps/chosen": -367.5265808105469,
"logps/rejected": -455.7412109375,
"loss": 0.4992,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.2532503604888916,
"rewards/margins": 0.814247727394104,
"rewards/rejected": -2.067498207092285,
"step": 780
},
{
"epoch": 0.18953934740882916,
"grad_norm": 50.004130642833005,
"learning_rate": 4.878996377861367e-07,
"logits/chosen": -1.068798303604126,
"logits/rejected": -1.1314045190811157,
"logps/chosen": -326.4665222167969,
"logps/rejected": -417.7521057128906,
"loss": 0.5083,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.0814282894134521,
"rewards/margins": 0.8757278323173523,
"rewards/rejected": -1.9571564197540283,
"step": 790
},
{
"epoch": 0.19193857965451055,
"grad_norm": 36.000949990598,
"learning_rate": 4.872477737578327e-07,
"logits/chosen": -1.091275930404663,
"logits/rejected": -1.0248304605484009,
"logps/chosen": -439.6976623535156,
"logps/rejected": -656.8413696289062,
"loss": 0.4559,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -1.5745375156402588,
"rewards/margins": 2.239027261734009,
"rewards/rejected": -3.8135643005371094,
"step": 800
},
{
"epoch": 0.19433781190019195,
"grad_norm": 74.95862087165564,
"learning_rate": 4.865792677407718e-07,
"logits/chosen": -1.1572951078414917,
"logits/rejected": -1.2205321788787842,
"logps/chosen": -395.1188659667969,
"logps/rejected": -492.11865234375,
"loss": 0.5769,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.5695520639419556,
"rewards/margins": 1.1664550304412842,
"rewards/rejected": -2.73600697517395,
"step": 810
},
{
"epoch": 0.1967370441458733,
"grad_norm": 33.56632786243978,
"learning_rate": 4.858941666279955e-07,
"logits/chosen": -0.9763522148132324,
"logits/rejected": -1.0531394481658936,
"logps/chosen": -409.54302978515625,
"logps/rejected": -430.5146484375,
"loss": 0.584,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.4252262115478516,
"rewards/margins": 0.3249640464782715,
"rewards/rejected": -1.7501903772354126,
"step": 820
},
{
"epoch": 0.1991362763915547,
"grad_norm": 44.099904884961624,
"learning_rate": 4.851925184766247e-07,
"logits/chosen": -1.0988633632659912,
"logits/rejected": -1.1514496803283691,
"logps/chosen": -356.073974609375,
"logps/rejected": -433.0359802246094,
"loss": 0.5209,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.0130106210708618,
"rewards/margins": 0.9259645342826843,
"rewards/rejected": -1.9389750957489014,
"step": 830
},
{
"epoch": 0.20153550863723607,
"grad_norm": 32.03347708687078,
"learning_rate": 4.844743725044897e-07,
"logits/chosen": -1.031770944595337,
"logits/rejected": -1.2438600063323975,
"logps/chosen": -349.9236755371094,
"logps/rejected": -410.17193603515625,
"loss": 0.5217,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.9646291732788086,
"rewards/margins": 0.8405798077583313,
"rewards/rejected": -1.8052088022232056,
"step": 840
},
{
"epoch": 0.20393474088291746,
"grad_norm": 28.831301311281855,
"learning_rate": 4.837397790866774e-07,
"logits/chosen": -1.206099033355713,
"logits/rejected": -1.1994575262069702,
"logps/chosen": -389.10491943359375,
"logps/rejected": -480.0270080566406,
"loss": 0.5573,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.9688132405281067,
"rewards/margins": 1.1717948913574219,
"rewards/rejected": -2.140608310699463,
"step": 850
},
{
"epoch": 0.20633397312859886,
"grad_norm": 32.87509191390056,
"learning_rate": 4.829887897519974e-07,
"logits/chosen": -1.2817895412445068,
"logits/rejected": -1.247855305671692,
"logps/chosen": -326.98419189453125,
"logps/rejected": -420.003173828125,
"loss": 0.532,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.9683993458747864,
"rewards/margins": 0.7596687078475952,
"rewards/rejected": -1.7280681133270264,
"step": 860
},
{
"epoch": 0.20873320537428022,
"grad_norm": 37.88063549323584,
"learning_rate": 4.82221457179368e-07,
"logits/chosen": -1.261070966720581,
"logits/rejected": -1.2210159301757812,
"logps/chosen": -387.666259765625,
"logps/rejected": -531.7518310546875,
"loss": 0.4613,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -1.138819932937622,
"rewards/margins": 1.6149892807006836,
"rewards/rejected": -2.7538094520568848,
"step": 870
},
{
"epoch": 0.21113243761996162,
"grad_norm": 58.84381198153733,
"learning_rate": 4.814378351941206e-07,
"logits/chosen": -1.1720526218414307,
"logits/rejected": -1.23434579372406,
"logps/chosen": -395.0361328125,
"logps/rejected": -475.3619079589844,
"loss": 0.5302,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.363128900527954,
"rewards/margins": 0.9177983403205872,
"rewards/rejected": -2.2809271812438965,
"step": 880
},
{
"epoch": 0.21353166986564298,
"grad_norm": 49.26204797533434,
"learning_rate": 4.806379787642241e-07,
"logits/chosen": -1.207521915435791,
"logits/rejected": -1.2205421924591064,
"logps/chosen": -425.2489318847656,
"logps/rejected": -575.5451049804688,
"loss": 0.5495,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.7557293176651,
"rewards/margins": 1.5624897480010986,
"rewards/rejected": -3.318218946456909,
"step": 890
},
{
"epoch": 0.21593090211132437,
"grad_norm": 43.44042160829665,
"learning_rate": 4.798219439964293e-07,
"logits/chosen": -1.1142994165420532,
"logits/rejected": -1.1826696395874023,
"logps/chosen": -370.36236572265625,
"logps/rejected": -413.3993225097656,
"loss": 0.4903,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.159775733947754,
"rewards/margins": 0.3043367862701416,
"rewards/rejected": -1.4641125202178955,
"step": 900
},
{
"epoch": 0.21833013435700577,
"grad_norm": 40.8770183610601,
"learning_rate": 4.78989788132333e-07,
"logits/chosen": -1.0691736936569214,
"logits/rejected": -1.0526028871536255,
"logps/chosen": -325.0689392089844,
"logps/rejected": -487.765869140625,
"loss": 0.4695,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.01616632938385,
"rewards/margins": 1.5912829637527466,
"rewards/rejected": -2.6074492931365967,
"step": 910
},
{
"epoch": 0.22072936660268713,
"grad_norm": 42.43700473469738,
"learning_rate": 4.781415695443631e-07,
"logits/chosen": -1.1770126819610596,
"logits/rejected": -1.2120893001556396,
"logps/chosen": -507.10418701171875,
"logps/rejected": -643.808349609375,
"loss": 0.5198,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -2.1411383152008057,
"rewards/margins": 1.3329979181289673,
"rewards/rejected": -3.4741358757019043,
"step": 920
},
{
"epoch": 0.22312859884836853,
"grad_norm": 35.4780805866807,
"learning_rate": 4.772773477316836e-07,
"logits/chosen": -1.09368896484375,
"logits/rejected": -1.1431119441986084,
"logps/chosen": -385.4435729980469,
"logps/rejected": -476.1529846191406,
"loss": 0.5098,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.2141118049621582,
"rewards/margins": 0.879438579082489,
"rewards/rejected": -2.093550205230713,
"step": 930
},
{
"epoch": 0.2255278310940499,
"grad_norm": 62.42297425070803,
"learning_rate": 4.7639718331602117e-07,
"logits/chosen": -1.0556354522705078,
"logits/rejected": -1.0695927143096924,
"logps/chosen": -428.8665466308594,
"logps/rejected": -625.2503662109375,
"loss": 0.5214,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.4926120042800903,
"rewards/margins": 2.130432367324829,
"rewards/rejected": -3.62304425239563,
"step": 940
},
{
"epoch": 0.22792706333973128,
"grad_norm": 72.76307491975679,
"learning_rate": 4.7550113803741275e-07,
"logits/chosen": -1.135399580001831,
"logits/rejected": -1.2878599166870117,
"logps/chosen": -425.1543884277344,
"logps/rejected": -450.4256286621094,
"loss": 0.5152,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.3350064754486084,
"rewards/margins": 0.9679735898971558,
"rewards/rejected": -2.3029801845550537,
"step": 950
},
{
"epoch": 0.23032629558541268,
"grad_norm": 81.86639367536438,
"learning_rate": 4.7458927474987454e-07,
"logits/chosen": -1.1322991847991943,
"logits/rejected": -1.2043081521987915,
"logps/chosen": -430.93438720703125,
"logps/rejected": -453.57568359375,
"loss": 0.5059,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.235338568687439,
"rewards/margins": 0.7968686819076538,
"rewards/rejected": -2.0322070121765137,
"step": 960
},
{
"epoch": 0.23272552783109404,
"grad_norm": 63.23515877313946,
"learning_rate": 4.7366165741699347e-07,
"logits/chosen": -1.0109546184539795,
"logits/rejected": -1.0750302076339722,
"logps/chosen": -470.6412048339844,
"logps/rejected": -530.3613891601562,
"loss": 0.4727,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.46035635471344,
"rewards/margins": 0.9168599247932434,
"rewards/rejected": -2.377216100692749,
"step": 970
},
{
"epoch": 0.23512476007677544,
"grad_norm": 56.92032329813907,
"learning_rate": 4.727183511074401e-07,
"logits/chosen": -1.2379209995269775,
"logits/rejected": -1.258826732635498,
"logps/chosen": -412.4378356933594,
"logps/rejected": -459.80169677734375,
"loss": 0.505,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -1.271667718887329,
"rewards/margins": 0.624854326248169,
"rewards/rejected": -1.8965221643447876,
"step": 980
},
{
"epoch": 0.2375239923224568,
"grad_norm": 50.38298248984227,
"learning_rate": 4.717594219904043e-07,
"logits/chosen": -1.0959298610687256,
"logits/rejected": -1.2155076265335083,
"logps/chosen": -446.4449157714844,
"logps/rejected": -513.0243530273438,
"loss": 0.5422,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.8132413625717163,
"rewards/margins": 1.0427824258804321,
"rewards/rejected": -2.8560237884521484,
"step": 990
},
{
"epoch": 0.2399232245681382,
"grad_norm": 31.549570234936283,
"learning_rate": 4.7078493733095393e-07,
"logits/chosen": -1.0665780305862427,
"logits/rejected": -1.1320844888687134,
"logps/chosen": -422.85858154296875,
"logps/rejected": -537.7945556640625,
"loss": 0.4704,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.6550779342651367,
"rewards/margins": 1.1452900171279907,
"rewards/rejected": -2.800367832183838,
"step": 1000
},
{
"epoch": 0.2423224568138196,
"grad_norm": 69.1151390549962,
"learning_rate": 4.6979496548531614e-07,
"logits/chosen": -1.2547630071640015,
"logits/rejected": -1.210106372833252,
"logps/chosen": -437.75146484375,
"logps/rejected": -624.9695434570312,
"loss": 0.5139,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.724177598953247,
"rewards/margins": 1.4503930807113647,
"rewards/rejected": -3.1745710372924805,
"step": 1010
},
{
"epoch": 0.24472168905950095,
"grad_norm": 54.03780429318879,
"learning_rate": 4.6878957589608293e-07,
"logits/chosen": -1.163216233253479,
"logits/rejected": -1.1093411445617676,
"logps/chosen": -447.0042419433594,
"logps/rejected": -650.7940063476562,
"loss": 0.5585,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.826093077659607,
"rewards/margins": 1.6275144815444946,
"rewards/rejected": -3.4536075592041016,
"step": 1020
},
{
"epoch": 0.24712092130518235,
"grad_norm": 32.185267978697475,
"learning_rate": 4.6776883908733956e-07,
"logits/chosen": -1.2985765933990479,
"logits/rejected": -1.4077537059783936,
"logps/chosen": -435.51824951171875,
"logps/rejected": -492.0585021972656,
"loss": 0.4836,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.3929134607315063,
"rewards/margins": 1.2694865465164185,
"rewards/rejected": -2.6623997688293457,
"step": 1030
},
{
"epoch": 0.2495201535508637,
"grad_norm": 41.371752275653265,
"learning_rate": 4.667328266597178e-07,
"logits/chosen": -1.1596192121505737,
"logits/rejected": -1.1949832439422607,
"logps/chosen": -382.23492431640625,
"logps/rejected": -510.849609375,
"loss": 0.4711,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.294812798500061,
"rewards/margins": 1.3286726474761963,
"rewards/rejected": -2.623485565185547,
"step": 1040
},
{
"epoch": 0.2519193857965451,
"grad_norm": 35.25863038534864,
"learning_rate": 4.6568161128537354e-07,
"logits/chosen": -1.1067166328430176,
"logits/rejected": -1.262487530708313,
"logps/chosen": -414.877685546875,
"logps/rejected": -446.10076904296875,
"loss": 0.5067,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -1.5195152759552002,
"rewards/margins": 0.8884671926498413,
"rewards/rejected": -2.407982349395752,
"step": 1050
},
{
"epoch": 0.2543186180422265,
"grad_norm": 41.117661614634926,
"learning_rate": 4.6461526670288877e-07,
"logits/chosen": -1.1547003984451294,
"logits/rejected": -1.1661105155944824,
"logps/chosen": -398.5390319824219,
"logps/rejected": -465.92010498046875,
"loss": 0.5088,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.312683343887329,
"rewards/margins": 0.9579456448554993,
"rewards/rejected": -2.2706291675567627,
"step": 1060
},
{
"epoch": 0.2567178502879079,
"grad_norm": 43.48076574914832,
"learning_rate": 4.635338677120994e-07,
"logits/chosen": -1.366939902305603,
"logits/rejected": -1.3449715375900269,
"logps/chosen": -382.4663391113281,
"logps/rejected": -528.1848754882812,
"loss": 0.4686,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.1687244176864624,
"rewards/margins": 1.3069040775299072,
"rewards/rejected": -2.475628614425659,
"step": 1070
},
{
"epoch": 0.2591170825335892,
"grad_norm": 44.614924118264355,
"learning_rate": 4.6243749016884835e-07,
"logits/chosen": -1.1942687034606934,
"logits/rejected": -1.2592580318450928,
"logps/chosen": -479.9891662597656,
"logps/rejected": -708.2697143554688,
"loss": 0.5414,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -2.0574417114257812,
"rewards/margins": 1.8553825616836548,
"rewards/rejected": -3.9128241539001465,
"step": 1080
},
{
"epoch": 0.2615163147792706,
"grad_norm": 59.97327717096107,
"learning_rate": 4.613262109796645e-07,
"logits/chosen": -1.2804964780807495,
"logits/rejected": -1.1930066347122192,
"logps/chosen": -415.44415283203125,
"logps/rejected": -597.2564697265625,
"loss": 0.5053,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.665036916732788,
"rewards/margins": 1.5294617414474487,
"rewards/rejected": -3.1944985389709473,
"step": 1090
},
{
"epoch": 0.263915547024952,
"grad_norm": 38.425411352382326,
"learning_rate": 4.602001080963678e-07,
"logits/chosen": -1.2301857471466064,
"logits/rejected": -1.2801592350006104,
"logps/chosen": -401.37860107421875,
"logps/rejected": -571.9041748046875,
"loss": 0.4782,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.3433271646499634,
"rewards/margins": 1.8979564905166626,
"rewards/rejected": -3.241283416748047,
"step": 1100
},
{
"epoch": 0.2663147792706334,
"grad_norm": 64.38781898233145,
"learning_rate": 4.590592605106017e-07,
"logits/chosen": -1.2344551086425781,
"logits/rejected": -1.2683765888214111,
"logps/chosen": -422.1192321777344,
"logps/rejected": -536.6231079101562,
"loss": 0.5268,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.380975604057312,
"rewards/margins": 1.4407894611358643,
"rewards/rejected": -2.821765184402466,
"step": 1110
},
{
"epoch": 0.2687140115163148,
"grad_norm": 37.84082368188261,
"learning_rate": 4.5790374824829165e-07,
"logits/chosen": -1.185731053352356,
"logits/rejected": -1.245924949645996,
"logps/chosen": -321.22283935546875,
"logps/rejected": -476.427001953125,
"loss": 0.5349,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.3014849424362183,
"rewards/margins": 1.4552912712097168,
"rewards/rejected": -2.7567763328552246,
"step": 1120
},
{
"epoch": 0.27111324376199614,
"grad_norm": 69.30402137772205,
"learning_rate": 4.5673365236403216e-07,
"logits/chosen": -1.1933674812316895,
"logits/rejected": -1.273466944694519,
"logps/chosen": -366.64105224609375,
"logps/rejected": -533.8867797851562,
"loss": 0.4867,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.6127936840057373,
"rewards/margins": 1.428100347518921,
"rewards/rejected": -3.040894031524658,
"step": 1130
},
{
"epoch": 0.27351247600767753,
"grad_norm": 42.84417987935753,
"learning_rate": 4.5554905493540075e-07,
"logits/chosen": -1.4118454456329346,
"logits/rejected": -1.4354169368743896,
"logps/chosen": -371.85394287109375,
"logps/rejected": -596.58349609375,
"loss": 0.4489,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -1.5539534091949463,
"rewards/margins": 2.193103313446045,
"rewards/rejected": -3.747056484222412,
"step": 1140
},
{
"epoch": 0.2759117082533589,
"grad_norm": 67.4443408265607,
"learning_rate": 4.5435003905720074e-07,
"logits/chosen": -1.305117130279541,
"logits/rejected": -1.3726755380630493,
"logps/chosen": -571.3667602539062,
"logps/rejected": -721.839111328125,
"loss": 0.5057,
"rewards/accuracies": 0.875,
"rewards/chosen": -2.947200059890747,
"rewards/margins": 1.7557926177978516,
"rewards/rejected": -4.7029924392700195,
"step": 1150
},
{
"epoch": 0.2783109404990403,
"grad_norm": 41.247470313425566,
"learning_rate": 4.531366888356324e-07,
"logits/chosen": -1.2661703824996948,
"logits/rejected": -1.209538459777832,
"logps/chosen": -392.5999450683594,
"logps/rejected": -645.23779296875,
"loss": 0.4602,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -2.0089774131774902,
"rewards/margins": 2.1448001861572266,
"rewards/rejected": -4.153777122497559,
"step": 1160
},
{
"epoch": 0.2807101727447217,
"grad_norm": 44.44476003130934,
"learning_rate": 4.519090893823931e-07,
"logits/chosen": -1.2478493452072144,
"logits/rejected": -1.3221884965896606,
"logps/chosen": -430.3058166503906,
"logps/rejected": -560.8129272460938,
"loss": 0.4847,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.8127937316894531,
"rewards/margins": 1.4228136539459229,
"rewards/rejected": -3.235607624053955,
"step": 1170
},
{
"epoch": 0.28310940499040305,
"grad_norm": 58.90294230523504,
"learning_rate": 4.5066732680870734e-07,
"logits/chosen": -1.2605488300323486,
"logits/rejected": -1.371800422668457,
"logps/chosen": -394.3585205078125,
"logps/rejected": -516.7437744140625,
"loss": 0.4532,
"rewards/accuracies": 0.875,
"rewards/chosen": -1.4103697538375854,
"rewards/margins": 1.6034168004989624,
"rewards/rejected": -3.0137863159179688,
"step": 1180
},
{
"epoch": 0.28550863723608444,
"grad_norm": 63.52046207494392,
"learning_rate": 4.494114882192862e-07,
"logits/chosen": -1.1947085857391357,
"logits/rejected": -1.2123124599456787,
"logps/chosen": -448.3394470214844,
"logps/rejected": -599.2200927734375,
"loss": 0.4769,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.9440696239471436,
"rewards/margins": 1.7676725387573242,
"rewards/rejected": -3.711742877960205,
"step": 1190
},
{
"epoch": 0.28790786948176583,
"grad_norm": 39.36667987271267,
"learning_rate": 4.4814166170621735e-07,
"logits/chosen": -1.3341089487075806,
"logits/rejected": -1.3843356370925903,
"logps/chosen": -467.9960021972656,
"logps/rejected": -570.3829956054688,
"loss": 0.4999,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.193697452545166,
"rewards/margins": 1.253821611404419,
"rewards/rejected": -3.447518825531006,
"step": 1200
},
{
"epoch": 0.2903071017274472,
"grad_norm": 87.46450239350362,
"learning_rate": 4.468579363427858e-07,
"logits/chosen": -1.2197396755218506,
"logits/rejected": -1.2728745937347412,
"logps/chosen": -416.8896484375,
"logps/rejected": -539.3204345703125,
"loss": 0.4654,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.5721436738967896,
"rewards/margins": 1.510520339012146,
"rewards/rejected": -3.0826640129089355,
"step": 1210
},
{
"epoch": 0.2927063339731286,
"grad_norm": 51.78500104445991,
"learning_rate": 4.4556040217722555e-07,
"logits/chosen": -1.3058878183364868,
"logits/rejected": -1.25492262840271,
"logps/chosen": -362.447509765625,
"logps/rejected": -506.99200439453125,
"loss": 0.4799,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.2098300457000732,
"rewards/margins": 1.2542223930358887,
"rewards/rejected": -2.464052200317383,
"step": 1220
},
{
"epoch": 0.29510556621880996,
"grad_norm": 56.76563538419931,
"learning_rate": 4.442491502264033e-07,
"logits/chosen": -1.2897526025772095,
"logits/rejected": -1.2957372665405273,
"logps/chosen": -358.2672424316406,
"logps/rejected": -424.9501953125,
"loss": 0.5064,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -1.3041239976882935,
"rewards/margins": 0.8152852058410645,
"rewards/rejected": -2.1194090843200684,
"step": 1230
},
{
"epoch": 0.29750479846449135,
"grad_norm": 37.17813817407036,
"learning_rate": 4.429242724694338e-07,
"logits/chosen": -1.4048488140106201,
"logits/rejected": -1.3725817203521729,
"logps/chosen": -374.0482482910156,
"logps/rejected": -543.1328125,
"loss": 0.4712,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.2972534894943237,
"rewards/margins": 1.5613895654678345,
"rewards/rejected": -2.858642816543579,
"step": 1240
},
{
"epoch": 0.29990403071017274,
"grad_norm": 121.14546355660654,
"learning_rate": 4.4158586184122817e-07,
"logits/chosen": -1.3434243202209473,
"logits/rejected": -1.4299046993255615,
"logps/chosen": -455.94219970703125,
"logps/rejected": -574.8892822265625,
"loss": 0.4767,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.651075005531311,
"rewards/margins": 1.5524400472640991,
"rewards/rejected": -3.203514814376831,
"step": 1250
},
{
"epoch": 0.30230326295585414,
"grad_norm": 58.51061462804094,
"learning_rate": 4.4023401222597443e-07,
"logits/chosen": -1.2570745944976807,
"logits/rejected": -1.3877769708633423,
"logps/chosen": -464.9574279785156,
"logps/rejected": -576.7808837890625,
"loss": 0.503,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.9455432891845703,
"rewards/margins": 1.3386828899383545,
"rewards/rejected": -3.284226179122925,
"step": 1260
},
{
"epoch": 0.30470249520153553,
"grad_norm": 50.8382185022156,
"learning_rate": 4.3886881845055235e-07,
"logits/chosen": -1.3191502094268799,
"logits/rejected": -1.4105967283248901,
"logps/chosen": -372.0560302734375,
"logps/rejected": -600.4634399414062,
"loss": 0.4746,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.2365124225616455,
"rewards/margins": 2.38405179977417,
"rewards/rejected": -3.6205639839172363,
"step": 1270
},
{
"epoch": 0.30710172744721687,
"grad_norm": 38.93701739946284,
"learning_rate": 4.374903762778814e-07,
"logits/chosen": -1.4359400272369385,
"logits/rejected": -1.4567973613739014,
"logps/chosen": -475.7760314941406,
"logps/rejected": -609.7572021484375,
"loss": 0.5013,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -2.182716131210327,
"rewards/margins": 1.6052064895629883,
"rewards/rejected": -3.7879226207733154,
"step": 1280
},
{
"epoch": 0.30950095969289826,
"grad_norm": 36.62047718195085,
"learning_rate": 4.3609878240020356e-07,
"logits/chosen": -1.2985656261444092,
"logits/rejected": -1.4006609916687012,
"logps/chosen": -532.8348999023438,
"logps/rejected": -629.5772705078125,
"loss": 0.5078,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -2.3774900436401367,
"rewards/margins": 1.5861461162567139,
"rewards/rejected": -3.9636359214782715,
"step": 1290
},
{
"epoch": 0.31190019193857965,
"grad_norm": 44.56202560175983,
"learning_rate": 4.346941344323005e-07,
"logits/chosen": -1.3752353191375732,
"logits/rejected": -1.4714699983596802,
"logps/chosen": -441.88787841796875,
"logps/rejected": -458.97039794921875,
"loss": 0.525,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.8622640371322632,
"rewards/margins": 0.7467669248580933,
"rewards/rejected": -2.6090309619903564,
"step": 1300
},
{
"epoch": 0.31429942418426104,
"grad_norm": 55.01759944637494,
"learning_rate": 4.332765309046467e-07,
"logits/chosen": -1.4191035032272339,
"logits/rejected": -1.42724609375,
"logps/chosen": -423.49676513671875,
"logps/rejected": -555.864013671875,
"loss": 0.4903,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.615816354751587,
"rewards/margins": 1.7208226919174194,
"rewards/rejected": -3.336638927459717,
"step": 1310
},
{
"epoch": 0.31669865642994244,
"grad_norm": 57.373160721067336,
"learning_rate": 4.3184607125649754e-07,
"logits/chosen": -1.3222754001617432,
"logits/rejected": -1.3755613565444946,
"logps/chosen": -435.70098876953125,
"logps/rejected": -632.2313232421875,
"loss": 0.5046,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.4980878829956055,
"rewards/margins": 1.8624553680419922,
"rewards/rejected": -3.3605434894561768,
"step": 1320
},
{
"epoch": 0.3190978886756238,
"grad_norm": 41.42793793737372,
"learning_rate": 4.304028558289141e-07,
"logits/chosen": -1.4451624155044556,
"logits/rejected": -1.4865562915802002,
"logps/chosen": -459.1952209472656,
"logps/rejected": -624.0546875,
"loss": 0.45,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.689760446548462,
"rewards/margins": 1.864870309829712,
"rewards/rejected": -3.5546302795410156,
"step": 1330
},
{
"epoch": 0.32149712092130517,
"grad_norm": 73.49052120855735,
"learning_rate": 4.28946985857725e-07,
"logits/chosen": -1.528925895690918,
"logits/rejected": -1.5248639583587646,
"logps/chosen": -477.422119140625,
"logps/rejected": -631.7359619140625,
"loss": 0.4778,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.0255789756774902,
"rewards/margins": 1.6332505941390991,
"rewards/rejected": -3.658829927444458,
"step": 1340
},
{
"epoch": 0.32389635316698656,
"grad_norm": 40.76124275734906,
"learning_rate": 4.2747856346642445e-07,
"logits/chosen": -1.4490742683410645,
"logits/rejected": -1.4240951538085938,
"logps/chosen": -389.1294250488281,
"logps/rejected": -492.74932861328125,
"loss": 0.4236,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.6151390075683594,
"rewards/margins": 1.2310636043548584,
"rewards/rejected": -2.8462026119232178,
"step": 1350
},
{
"epoch": 0.32629558541266795,
"grad_norm": 56.780023366402574,
"learning_rate": 4.2599769165900933e-07,
"logits/chosen": -1.4480865001678467,
"logits/rejected": -1.4879968166351318,
"logps/chosen": -458.68988037109375,
"logps/rejected": -565.7777099609375,
"loss": 0.4956,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.167175054550171,
"rewards/margins": 1.2777191400527954,
"rewards/rejected": -3.4448940753936768,
"step": 1360
},
{
"epoch": 0.32869481765834935,
"grad_norm": 36.1346588520156,
"learning_rate": 4.245044743127535e-07,
"logits/chosen": -1.5145995616912842,
"logits/rejected": -1.4562435150146484,
"logps/chosen": -433.6683044433594,
"logps/rejected": -546.98193359375,
"loss": 0.4736,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.7565805912017822,
"rewards/margins": 1.206705093383789,
"rewards/rejected": -2.9632861614227295,
"step": 1370
},
{
"epoch": 0.3310940499040307,
"grad_norm": 34.079682299668946,
"learning_rate": 4.229990161709214e-07,
"logits/chosen": -1.3262040615081787,
"logits/rejected": -1.188719630241394,
"logps/chosen": -356.0837097167969,
"logps/rejected": -541.5905151367188,
"loss": 0.4866,
"rewards/accuracies": 0.875,
"rewards/chosen": -1.2393875122070312,
"rewards/margins": 1.6804568767547607,
"rewards/rejected": -2.919844627380371,
"step": 1380
},
{
"epoch": 0.3334932821497121,
"grad_norm": 37.6846699410638,
"learning_rate": 4.214814228354204e-07,
"logits/chosen": -1.337536334991455,
"logits/rejected": -1.3587877750396729,
"logps/chosen": -431.2080078125,
"logps/rejected": -660.2830810546875,
"loss": 0.4482,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -1.5639148950576782,
"rewards/margins": 2.4401581287384033,
"rewards/rejected": -4.004073143005371,
"step": 1390
},
{
"epoch": 0.33589251439539347,
"grad_norm": 44.17494049898159,
"learning_rate": 4.1995180075939375e-07,
"logits/chosen": -1.51606023311615,
"logits/rejected": -1.48744535446167,
"logps/chosen": -444.0301208496094,
"logps/rejected": -591.0535888671875,
"loss": 0.449,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.744881272315979,
"rewards/margins": 1.708705186843872,
"rewards/rejected": -3.4535861015319824,
"step": 1400
},
{
"epoch": 0.33829174664107486,
"grad_norm": 41.73324787992279,
"learning_rate": 4.1841025723975297e-07,
"logits/chosen": -1.3268964290618896,
"logits/rejected": -1.3524177074432373,
"logps/chosen": -445.13232421875,
"logps/rejected": -646.4071044921875,
"loss": 0.4661,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.6669384241104126,
"rewards/margins": 2.2194371223449707,
"rewards/rejected": -3.8863754272460938,
"step": 1410
},
{
"epoch": 0.34069097888675626,
"grad_norm": 80.91305832361327,
"learning_rate": 4.168569004096516e-07,
"logits/chosen": -1.341691017150879,
"logits/rejected": -1.3161810636520386,
"logps/chosen": -409.152099609375,
"logps/rejected": -593.16845703125,
"loss": 0.4624,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.7553116083145142,
"rewards/margins": 1.7082653045654297,
"rewards/rejected": -3.4635767936706543,
"step": 1420
},
{
"epoch": 0.3430902111324376,
"grad_norm": 50.725033498599586,
"learning_rate": 4.152918392308997e-07,
"logits/chosen": -1.546785831451416,
"logits/rejected": -1.5451892614364624,
"logps/chosen": -470.802734375,
"logps/rejected": -639.1434326171875,
"loss": 0.4797,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -2.1750898361206055,
"rewards/margins": 1.7993156909942627,
"rewards/rejected": -3.974405288696289,
"step": 1430
},
{
"epoch": 0.345489443378119,
"grad_norm": 85.40295220916695,
"learning_rate": 4.137151834863213e-07,
"logits/chosen": -1.418149709701538,
"logits/rejected": -1.3302658796310425,
"logps/chosen": -490.89776611328125,
"logps/rejected": -763.4044799804688,
"loss": 0.5091,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -2.5681164264678955,
"rewards/margins": 2.4175021648406982,
"rewards/rejected": -4.985618591308594,
"step": 1440
},
{
"epoch": 0.3478886756238004,
"grad_norm": 49.948399938173715,
"learning_rate": 4.121270437720526e-07,
"logits/chosen": -1.1961567401885986,
"logits/rejected": -1.1831985712051392,
"logps/chosen": -395.45294189453125,
"logps/rejected": -528.1370849609375,
"loss": 0.4953,
"rewards/accuracies": 0.875,
"rewards/chosen": -1.9191783666610718,
"rewards/margins": 0.9305855631828308,
"rewards/rejected": -2.849764108657837,
"step": 1450
},
{
"epoch": 0.3502879078694818,
"grad_norm": 35.580564672474935,
"learning_rate": 4.105275314897852e-07,
"logits/chosen": -1.3679462671279907,
"logits/rejected": -1.3246995210647583,
"logps/chosen": -401.3337707519531,
"logps/rejected": -686.9393920898438,
"loss": 0.469,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.7229747772216797,
"rewards/margins": 2.585207462310791,
"rewards/rejected": -4.308182239532471,
"step": 1460
},
{
"epoch": 0.35268714011516317,
"grad_norm": 43.685991160210406,
"learning_rate": 4.089167588389508e-07,
"logits/chosen": -1.1158957481384277,
"logits/rejected": -1.230369210243225,
"logps/chosen": -547.0029907226562,
"logps/rejected": -697.0067138671875,
"loss": 0.4694,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -2.114428758621216,
"rewards/margins": 1.9506731033325195,
"rewards/rejected": -4.065101146697998,
"step": 1470
},
{
"epoch": 0.3550863723608445,
"grad_norm": 66.37356591019706,
"learning_rate": 4.072948388088515e-07,
"logits/chosen": -1.233522891998291,
"logits/rejected": -1.276780366897583,
"logps/chosen": -492.72845458984375,
"logps/rejected": -658.1353149414062,
"loss": 0.4829,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.2197484970092773,
"rewards/margins": 1.6216094493865967,
"rewards/rejected": -3.841358184814453,
"step": 1480
},
{
"epoch": 0.3574856046065259,
"grad_norm": 69.97901799878963,
"learning_rate": 4.056618851707334e-07,
"logits/chosen": -1.2373313903808594,
"logits/rejected": -1.3132033348083496,
"logps/chosen": -457.95831298828125,
"logps/rejected": -695.0521240234375,
"loss": 0.4173,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -1.7973763942718506,
"rewards/margins": 2.393643379211426,
"rewards/rejected": -4.191019535064697,
"step": 1490
},
{
"epoch": 0.3598848368522073,
"grad_norm": 48.587428711970404,
"learning_rate": 4.0401801246980675e-07,
"logits/chosen": -1.3571937084197998,
"logits/rejected": -1.4006597995758057,
"logps/chosen": -463.6883239746094,
"logps/rejected": -594.7393798828125,
"loss": 0.4892,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -2.4648189544677734,
"rewards/margins": 1.542531132698059,
"rewards/rejected": -4.007349491119385,
"step": 1500
},
{
"epoch": 0.3622840690978887,
"grad_norm": 38.57207095495995,
"learning_rate": 4.0236333601721043e-07,
"logits/chosen": -1.3310314416885376,
"logits/rejected": -1.2778218984603882,
"logps/chosen": -455.7976989746094,
"logps/rejected": -558.8304443359375,
"loss": 0.5111,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.769777536392212,
"rewards/margins": 0.9363303184509277,
"rewards/rejected": -2.7061076164245605,
"step": 1510
},
{
"epoch": 0.3646833013435701,
"grad_norm": 37.96256839231154,
"learning_rate": 4.0069797188192364e-07,
"logits/chosen": -1.203385591506958,
"logits/rejected": -1.1847422122955322,
"logps/chosen": -462.11865234375,
"logps/rejected": -606.5435791015625,
"loss": 0.4851,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.7279574871063232,
"rewards/margins": 1.8359229564666748,
"rewards/rejected": -3.563880443572998,
"step": 1520
},
{
"epoch": 0.3670825335892514,
"grad_norm": 44.348458391464845,
"learning_rate": 3.9902203688262417e-07,
"logits/chosen": -1.205517292022705,
"logits/rejected": -1.2756140232086182,
"logps/chosen": -385.13153076171875,
"logps/rejected": -469.98858642578125,
"loss": 0.4599,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.2476422786712646,
"rewards/margins": 0.9903982877731323,
"rewards/rejected": -2.2380406856536865,
"step": 1530
},
{
"epoch": 0.3694817658349328,
"grad_norm": 69.63434467808052,
"learning_rate": 3.9733564857949365e-07,
"logits/chosen": -1.2490085363388062,
"logits/rejected": -1.3441739082336426,
"logps/chosen": -466.0321350097656,
"logps/rejected": -585.1414794921875,
"loss": 0.4321,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.6616054773330688,
"rewards/margins": 1.6084191799163818,
"rewards/rejected": -3.270024538040161,
"step": 1540
},
{
"epoch": 0.3718809980806142,
"grad_norm": 61.067944902480086,
"learning_rate": 3.9563892526597177e-07,
"logits/chosen": -1.4084941148757935,
"logits/rejected": -1.3332915306091309,
"logps/chosen": -368.38494873046875,
"logps/rejected": -485.837890625,
"loss": 0.4421,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.4880536794662476,
"rewards/margins": 0.7631991505622864,
"rewards/rejected": -2.2512528896331787,
"step": 1550
},
{
"epoch": 0.3742802303262956,
"grad_norm": 42.61893096692006,
"learning_rate": 3.9393198596045795e-07,
"logits/chosen": -1.30875563621521,
"logits/rejected": -1.2471106052398682,
"logps/chosen": -410.9712829589844,
"logps/rejected": -568.45751953125,
"loss": 0.5049,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.7241986989974976,
"rewards/margins": 1.4812783002853394,
"rewards/rejected": -3.205477237701416,
"step": 1560
},
{
"epoch": 0.376679462571977,
"grad_norm": 31.366095609512197,
"learning_rate": 3.922149503979628e-07,
"logits/chosen": -1.1689096689224243,
"logits/rejected": -1.1878143548965454,
"logps/chosen": -481.097900390625,
"logps/rejected": -785.8397827148438,
"loss": 0.4457,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -2.0780715942382812,
"rewards/margins": 3.0227134227752686,
"rewards/rejected": -5.100784778594971,
"step": 1570
},
{
"epoch": 0.3790786948176583,
"grad_norm": 46.590501351524594,
"learning_rate": 3.904879390217095e-07,
"logits/chosen": -1.2587759494781494,
"logits/rejected": -1.3025065660476685,
"logps/chosen": -419.4425354003906,
"logps/rejected": -535.4026489257812,
"loss": 0.458,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.6549510955810547,
"rewards/margins": 1.389463186264038,
"rewards/rejected": -3.0444140434265137,
"step": 1580
},
{
"epoch": 0.3814779270633397,
"grad_norm": 74.73243898734299,
"learning_rate": 3.8875107297468463e-07,
"logits/chosen": -1.2635529041290283,
"logits/rejected": -1.2022044658660889,
"logps/chosen": -391.54791259765625,
"logps/rejected": -712.3580322265625,
"loss": 0.503,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -1.5201681852340698,
"rewards/margins": 2.7511065006256104,
"rewards/rejected": -4.271274566650391,
"step": 1590
},
{
"epoch": 0.3838771593090211,
"grad_norm": 60.47812298444817,
"learning_rate": 3.87004474091141e-07,
"logits/chosen": -1.1937386989593506,
"logits/rejected": -1.223789930343628,
"logps/chosen": -374.3924255371094,
"logps/rejected": -540.607421875,
"loss": 0.4657,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.5437977313995361,
"rewards/margins": 1.513298749923706,
"rewards/rejected": -3.0570967197418213,
"step": 1600
},
{
"epoch": 0.3862763915547025,
"grad_norm": 39.1187908744064,
"learning_rate": 3.8524826488805114e-07,
"logits/chosen": -1.4063940048217773,
"logits/rejected": -1.3802636861801147,
"logps/chosen": -486.917724609375,
"logps/rejected": -567.9946899414062,
"loss": 0.543,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.0151114463806152,
"rewards/margins": 1.2625728845596313,
"rewards/rejected": -3.2776846885681152,
"step": 1610
},
{
"epoch": 0.3886756238003839,
"grad_norm": 53.71098690399445,
"learning_rate": 3.834825685565133e-07,
"logits/chosen": -1.313389778137207,
"logits/rejected": -1.4078295230865479,
"logps/chosen": -371.45654296875,
"logps/rejected": -425.3412170410156,
"loss": 0.4473,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.3473589420318604,
"rewards/margins": 0.8928249478340149,
"rewards/rejected": -2.2401843070983887,
"step": 1620
},
{
"epoch": 0.39107485604606523,
"grad_norm": 107.17695988171393,
"learning_rate": 3.8170750895311007e-07,
"logits/chosen": -1.2148559093475342,
"logits/rejected": -1.1752598285675049,
"logps/chosen": -438.32232666015625,
"logps/rejected": -568.3878173828125,
"loss": 0.4171,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -1.5638713836669922,
"rewards/margins": 1.5277128219604492,
"rewards/rejected": -3.0915839672088623,
"step": 1630
},
{
"epoch": 0.3934740882917466,
"grad_norm": 37.15180787425003,
"learning_rate": 3.7992321059122045e-07,
"logits/chosen": -1.1769952774047852,
"logits/rejected": -1.300578236579895,
"logps/chosen": -472.325439453125,
"logps/rejected": -621.8825073242188,
"loss": 0.4885,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -2.2215113639831543,
"rewards/margins": 1.6850610971450806,
"rewards/rejected": -3.9065728187561035,
"step": 1640
},
{
"epoch": 0.395873320537428,
"grad_norm": 43.17139372813891,
"learning_rate": 3.7812979863228576e-07,
"logits/chosen": -1.3489793539047241,
"logits/rejected": -1.3900834321975708,
"logps/chosen": -443.6715393066406,
"logps/rejected": -586.6777954101562,
"loss": 0.4476,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -2.2688426971435547,
"rewards/margins": 1.326430320739746,
"rewards/rejected": -3.59527325630188,
"step": 1650
},
{
"epoch": 0.3982725527831094,
"grad_norm": 46.141900540489175,
"learning_rate": 3.763273988770296e-07,
"logits/chosen": -1.3089871406555176,
"logits/rejected": -1.3825414180755615,
"logps/chosen": -422.4200744628906,
"logps/rejected": -581.9949340820312,
"loss": 0.4636,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.8401399850845337,
"rewards/margins": 1.5839296579360962,
"rewards/rejected": -3.42406964302063,
"step": 1660
},
{
"epoch": 0.4006717850287908,
"grad_norm": 62.372203609183686,
"learning_rate": 3.7451613775663405e-07,
"logits/chosen": -1.2362799644470215,
"logits/rejected": -1.1603913307189941,
"logps/chosen": -412.76837158203125,
"logps/rejected": -674.3446655273438,
"loss": 0.4667,
"rewards/accuracies": 0.875,
"rewards/chosen": -1.710687279701233,
"rewards/margins": 2.6008946895599365,
"rewards/rejected": -4.311581611633301,
"step": 1670
},
{
"epoch": 0.40307101727447214,
"grad_norm": 64.9643167880585,
"learning_rate": 3.726961423238706e-07,
"logits/chosen": -1.3476378917694092,
"logits/rejected": -1.3207610845565796,
"logps/chosen": -394.1864013671875,
"logps/rejected": -580.0813598632812,
"loss": 0.4791,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.6723476648330688,
"rewards/margins": 1.6961578130722046,
"rewards/rejected": -3.3685054779052734,
"step": 1680
},
{
"epoch": 0.40547024952015354,
"grad_norm": 49.88660013825801,
"learning_rate": 3.708675402441882e-07,
"logits/chosen": -1.1998610496520996,
"logits/rejected": -1.35236394405365,
"logps/chosen": -444.1595764160156,
"logps/rejected": -540.0147705078125,
"loss": 0.5023,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.6736522912979126,
"rewards/margins": 1.3222843408584595,
"rewards/rejected": -2.995936632156372,
"step": 1690
},
{
"epoch": 0.40786948176583493,
"grad_norm": 77.03784806851208,
"learning_rate": 3.6903045978675775e-07,
"logits/chosen": -1.2468807697296143,
"logits/rejected": -1.2648643255233765,
"logps/chosen": -403.4354553222656,
"logps/rejected": -598.949462890625,
"loss": 0.4621,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.6437181234359741,
"rewards/margins": 2.1794071197509766,
"rewards/rejected": -3.823125123977661,
"step": 1700
},
{
"epoch": 0.4102687140115163,
"grad_norm": 53.95037172519313,
"learning_rate": 3.6718502981547474e-07,
"logits/chosen": -1.3636986017227173,
"logits/rejected": -1.326259732246399,
"logps/chosen": -409.52935791015625,
"logps/rejected": -571.069091796875,
"loss": 0.467,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.626726508140564,
"rewards/margins": 1.2190231084823608,
"rewards/rejected": -2.845749616622925,
"step": 1710
},
{
"epoch": 0.4126679462571977,
"grad_norm": 49.15860337517488,
"learning_rate": 3.6533137977991986e-07,
"logits/chosen": -1.2063934803009033,
"logits/rejected": -1.215348243713379,
"logps/chosen": -441.303466796875,
"logps/rejected": -561.9154052734375,
"loss": 0.4958,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -1.6133943796157837,
"rewards/margins": 0.9018635749816895,
"rewards/rejected": -2.5152580738067627,
"step": 1720
},
{
"epoch": 0.41506717850287905,
"grad_norm": 67.07571213677744,
"learning_rate": 3.6346963970627865e-07,
"logits/chosen": -1.2759754657745361,
"logits/rejected": -1.1578261852264404,
"logps/chosen": -399.96307373046875,
"logps/rejected": -558.6627197265625,
"loss": 0.4549,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.611649513244629,
"rewards/margins": 1.4893057346343994,
"rewards/rejected": -3.100955009460449,
"step": 1730
},
{
"epoch": 0.41746641074856045,
"grad_norm": 64.78695085834201,
"learning_rate": 3.615999401882207e-07,
"logits/chosen": -1.5288084745407104,
"logits/rejected": -1.4955217838287354,
"logps/chosen": -473.1421813964844,
"logps/rejected": -715.1536254882812,
"loss": 0.4492,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.391709804534912,
"rewards/margins": 2.3399155139923096,
"rewards/rejected": -4.731625556945801,
"step": 1740
},
{
"epoch": 0.41986564299424184,
"grad_norm": 47.6256598631063,
"learning_rate": 3.597224123777389e-07,
"logits/chosen": -1.4542527198791504,
"logits/rejected": -1.43537175655365,
"logps/chosen": -557.6041870117188,
"logps/rejected": -791.1467895507812,
"loss": 0.4589,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -3.0537285804748535,
"rewards/margins": 2.2443196773529053,
"rewards/rejected": -5.298048973083496,
"step": 1750
},
{
"epoch": 0.42226487523992323,
"grad_norm": 77.93257969251549,
"learning_rate": 3.5783718797595e-07,
"logits/chosen": -1.3806182146072388,
"logits/rejected": -1.4632576704025269,
"logps/chosen": -529.6144409179688,
"logps/rejected": -654.4708862304688,
"loss": 0.4986,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.2454628944396973,
"rewards/margins": 1.7240183353424072,
"rewards/rejected": -3.9694809913635254,
"step": 1760
},
{
"epoch": 0.4246641074856046,
"grad_norm": 47.87060495829484,
"learning_rate": 3.559443992238558e-07,
"logits/chosen": -1.3499047756195068,
"logits/rejected": -1.3741114139556885,
"logps/chosen": -404.3805847167969,
"logps/rejected": -626.6046142578125,
"loss": 0.4617,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.6187816858291626,
"rewards/margins": 1.976681113243103,
"rewards/rejected": -3.5954627990722656,
"step": 1770
},
{
"epoch": 0.42706333973128596,
"grad_norm": 31.34540043802626,
"learning_rate": 3.540441788930673e-07,
"logits/chosen": -1.341737985610962,
"logits/rejected": -1.3971807956695557,
"logps/chosen": -490.4947814941406,
"logps/rejected": -688.3030395507812,
"loss": 0.4217,
"rewards/accuracies": 0.875,
"rewards/chosen": -1.927219033241272,
"rewards/margins": 2.3758645057678223,
"rewards/rejected": -4.303083419799805,
"step": 1780
},
{
"epoch": 0.42946257197696736,
"grad_norm": 52.184614479528065,
"learning_rate": 3.5213666027649123e-07,
"logits/chosen": -1.334465503692627,
"logits/rejected": -1.4451353549957275,
"logps/chosen": -483.6954040527344,
"logps/rejected": -572.4603271484375,
"loss": 0.4556,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.0889644622802734,
"rewards/margins": 1.2906540632247925,
"rewards/rejected": -3.3796188831329346,
"step": 1790
},
{
"epoch": 0.43186180422264875,
"grad_norm": 62.61854115073099,
"learning_rate": 3.5022197717898017e-07,
"logits/chosen": -1.2022984027862549,
"logits/rejected": -1.3184864521026611,
"logps/chosen": -415.38134765625,
"logps/rejected": -565.1746826171875,
"loss": 0.4032,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.9461984634399414,
"rewards/margins": 1.8992345333099365,
"rewards/rejected": -3.845432996749878,
"step": 1800
},
{
"epoch": 0.43426103646833014,
"grad_norm": 61.20243852328461,
"learning_rate": 3.4830026390794633e-07,
"logits/chosen": -1.3132349252700806,
"logits/rejected": -1.3491175174713135,
"logps/chosen": -539.82666015625,
"logps/rejected": -713.0654907226562,
"loss": 0.4442,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.517554521560669,
"rewards/margins": 2.220135450363159,
"rewards/rejected": -4.737689018249512,
"step": 1810
},
{
"epoch": 0.43666026871401153,
"grad_norm": 37.022962730693074,
"learning_rate": 3.4637165526394104e-07,
"logits/chosen": -1.3220465183258057,
"logits/rejected": -1.3310649394989014,
"logps/chosen": -396.91748046875,
"logps/rejected": -562.5906982421875,
"loss": 0.4541,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.6625458002090454,
"rewards/margins": 1.5693401098251343,
"rewards/rejected": -3.2318859100341797,
"step": 1820
},
{
"epoch": 0.43905950095969287,
"grad_norm": 31.533784042262116,
"learning_rate": 3.4443628653119814e-07,
"logits/chosen": -1.2581579685211182,
"logits/rejected": -1.2321877479553223,
"logps/chosen": -457.6893615722656,
"logps/rejected": -755.5025634765625,
"loss": 0.4958,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.8591516017913818,
"rewards/margins": 2.5752387046813965,
"rewards/rejected": -4.434390068054199,
"step": 1830
},
{
"epoch": 0.44145873320537427,
"grad_norm": 74.3602562066919,
"learning_rate": 3.424942934681453e-07,
"logits/chosen": -1.3297505378723145,
"logits/rejected": -1.4661356210708618,
"logps/chosen": -383.6442565917969,
"logps/rejected": -581.6712036132812,
"loss": 0.4591,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.4883172512054443,
"rewards/margins": 2.007455587387085,
"rewards/rejected": -3.4957728385925293,
"step": 1840
},
{
"epoch": 0.44385796545105566,
"grad_norm": 75.37888411441581,
"learning_rate": 3.405458122978804e-07,
"logits/chosen": -1.3553504943847656,
"logits/rejected": -1.3567984104156494,
"logps/chosen": -431.16143798828125,
"logps/rejected": -534.2850341796875,
"loss": 0.4234,
"rewards/accuracies": 0.875,
"rewards/chosen": -1.5984207391738892,
"rewards/margins": 1.381314754486084,
"rewards/rejected": -2.9797348976135254,
"step": 1850
},
{
"epoch": 0.44625719769673705,
"grad_norm": 72.84055950942061,
"learning_rate": 3.3859097969861633e-07,
"logits/chosen": -1.2684893608093262,
"logits/rejected": -1.268117070198059,
"logps/chosen": -473.4444274902344,
"logps/rejected": -596.4513549804688,
"loss": 0.4823,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.8286316394805908,
"rewards/margins": 1.582054853439331,
"rewards/rejected": -3.410686492919922,
"step": 1860
},
{
"epoch": 0.44865642994241844,
"grad_norm": 50.17662967544575,
"learning_rate": 3.366299327940936e-07,
"logits/chosen": -1.3475382328033447,
"logits/rejected": -1.255838394165039,
"logps/chosen": -461.9693298339844,
"logps/rejected": -676.9766845703125,
"loss": 0.4373,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.7682750225067139,
"rewards/margins": 1.9757261276245117,
"rewards/rejected": -3.7440009117126465,
"step": 1870
},
{
"epoch": 0.4510556621880998,
"grad_norm": 41.91903545671121,
"learning_rate": 3.3466280914396117e-07,
"logits/chosen": -1.2820563316345215,
"logits/rejected": -1.2576204538345337,
"logps/chosen": -396.26971435546875,
"logps/rejected": -576.42822265625,
"loss": 0.4575,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.6182832717895508,
"rewards/margins": 1.6133058071136475,
"rewards/rejected": -3.2315890789031982,
"step": 1880
},
{
"epoch": 0.4534548944337812,
"grad_norm": 68.00245043430192,
"learning_rate": 3.326897467341281e-07,
"logits/chosen": -1.286210536956787,
"logits/rejected": -1.3600260019302368,
"logps/chosen": -429.225341796875,
"logps/rejected": -622.280029296875,
"loss": 0.4278,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.2270400524139404,
"rewards/margins": 1.820407509803772,
"rewards/rejected": -4.047447681427002,
"step": 1890
},
{
"epoch": 0.45585412667946257,
"grad_norm": 59.680334488165904,
"learning_rate": 3.3071088396708335e-07,
"logits/chosen": -1.330963373184204,
"logits/rejected": -1.283299207687378,
"logps/chosen": -389.97808837890625,
"logps/rejected": -664.4937744140625,
"loss": 0.4647,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.8736871480941772,
"rewards/margins": 2.509003162384033,
"rewards/rejected": -4.3826904296875,
"step": 1900
},
{
"epoch": 0.45825335892514396,
"grad_norm": 55.13504798919061,
"learning_rate": 3.2872635965218824e-07,
"logits/chosen": -1.2652981281280518,
"logits/rejected": -1.2861101627349854,
"logps/chosen": -509.47467041015625,
"logps/rejected": -697.8663330078125,
"loss": 0.5082,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -2.491791009902954,
"rewards/margins": 1.7968419790267944,
"rewards/rejected": -4.288632869720459,
"step": 1910
},
{
"epoch": 0.46065259117082535,
"grad_norm": 52.124591511811545,
"learning_rate": 3.2673631299593905e-07,
"logits/chosen": -1.2162278890609741,
"logits/rejected": -1.3705085515975952,
"logps/chosen": -480.2521057128906,
"logps/rejected": -630.8917846679688,
"loss": 0.4528,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.0875420570373535,
"rewards/margins": 1.5945124626159668,
"rewards/rejected": -3.6820545196533203,
"step": 1920
},
{
"epoch": 0.4630518234165067,
"grad_norm": 62.707006736242334,
"learning_rate": 3.247408835922024e-07,
"logits/chosen": -1.3850958347320557,
"logits/rejected": -1.3544895648956299,
"logps/chosen": -555.5938720703125,
"logps/rejected": -746.1630859375,
"loss": 0.4605,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.5104541778564453,
"rewards/margins": 1.8147118091583252,
"rewards/rejected": -4.32516622543335,
"step": 1930
},
{
"epoch": 0.4654510556621881,
"grad_norm": 46.08250458603549,
"learning_rate": 3.2274021141242306e-07,
"logits/chosen": -1.296891212463379,
"logits/rejected": -1.3404098749160767,
"logps/chosen": -482.0558166503906,
"logps/rejected": -644.9270629882812,
"loss": 0.4805,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.2868685722351074,
"rewards/margins": 1.5186541080474854,
"rewards/rejected": -3.805522918701172,
"step": 1940
},
{
"epoch": 0.4678502879078695,
"grad_norm": 69.44491393075363,
"learning_rate": 3.2073443679580613e-07,
"logits/chosen": -1.1887061595916748,
"logits/rejected": -1.2752236127853394,
"logps/chosen": -440.823974609375,
"logps/rejected": -540.7047119140625,
"loss": 0.4653,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.7992362976074219,
"rewards/margins": 1.0187088251113892,
"rewards/rejected": -2.8179447650909424,
"step": 1950
},
{
"epoch": 0.47024952015355087,
"grad_norm": 57.153969593846476,
"learning_rate": 3.1872370043947194e-07,
"logits/chosen": -1.402218222618103,
"logits/rejected": -1.4274516105651855,
"logps/chosen": -428.96343994140625,
"logps/rejected": -651.5018310546875,
"loss": 0.3983,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -1.6342216730117798,
"rewards/margins": 2.3417115211486816,
"rewards/rejected": -3.9759325981140137,
"step": 1960
},
{
"epoch": 0.47264875239923226,
"grad_norm": 64.27887622667613,
"learning_rate": 3.167081433885874e-07,
"logits/chosen": -1.082318902015686,
"logits/rejected": -1.1133878231048584,
"logps/chosen": -543.10498046875,
"logps/rejected": -741.9865112304688,
"loss": 0.4258,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -2.397563934326172,
"rewards/margins": 1.6185156106948853,
"rewards/rejected": -4.016079902648926,
"step": 1970
},
{
"epoch": 0.4750479846449136,
"grad_norm": 57.701882074535135,
"learning_rate": 3.14687907026472e-07,
"logits/chosen": -1.1284734010696411,
"logits/rejected": -1.2509753704071045,
"logps/chosen": -404.6647033691406,
"logps/rejected": -615.397705078125,
"loss": 0.43,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -1.8688653707504272,
"rewards/margins": 1.8782680034637451,
"rewards/rejected": -3.747133731842041,
"step": 1980
},
{
"epoch": 0.477447216890595,
"grad_norm": 51.61036006005554,
"learning_rate": 3.126631330646801e-07,
"logits/chosen": -1.193585991859436,
"logits/rejected": -1.2708711624145508,
"logps/chosen": -519.4849853515625,
"logps/rejected": -704.0787963867188,
"loss": 0.4634,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.2691690921783447,
"rewards/margins": 1.7898368835449219,
"rewards/rejected": -4.059006214141846,
"step": 1990
},
{
"epoch": 0.4798464491362764,
"grad_norm": 41.46670714894445,
"learning_rate": 3.1063396353306097e-07,
"logits/chosen": -1.2346079349517822,
"logits/rejected": -1.3723104000091553,
"logps/chosen": -419.3143615722656,
"logps/rejected": -545.783935546875,
"loss": 0.4396,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.60104501247406,
"rewards/margins": 1.641794204711914,
"rewards/rejected": -3.2428393363952637,
"step": 2000
},
{
"epoch": 0.4822456813819578,
"grad_norm": 60.7041482335727,
"learning_rate": 3.0860054076979535e-07,
"logits/chosen": -1.3001885414123535,
"logits/rejected": -1.2961161136627197,
"logps/chosen": -490.758544921875,
"logps/rejected": -629.7095947265625,
"loss": 0.4257,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.158540964126587,
"rewards/margins": 1.7136547565460205,
"rewards/rejected": -3.8721961975097656,
"step": 2010
},
{
"epoch": 0.4846449136276392,
"grad_norm": 51.131960110389144,
"learning_rate": 3.065630074114115e-07,
"logits/chosen": -1.3280750513076782,
"logits/rejected": -1.402672290802002,
"logps/chosen": -487.7989196777344,
"logps/rejected": -672.5093994140625,
"loss": 0.4767,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.0603041648864746,
"rewards/margins": 2.2750654220581055,
"rewards/rejected": -4.33536958694458,
"step": 2020
},
{
"epoch": 0.4870441458733205,
"grad_norm": 39.68429607245266,
"learning_rate": 3.0452150638277947e-07,
"logits/chosen": -1.1976875066757202,
"logits/rejected": -1.1391303539276123,
"logps/chosen": -432.50439453125,
"logps/rejected": -571.4925537109375,
"loss": 0.4554,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -2.130190134048462,
"rewards/margins": 1.2737354040145874,
"rewards/rejected": -3.4039254188537598,
"step": 2030
},
{
"epoch": 0.4894433781190019,
"grad_norm": 32.52999927801439,
"learning_rate": 3.024761808870856e-07,
"logits/chosen": -1.3780122995376587,
"logits/rejected": -1.3512189388275146,
"logps/chosen": -390.6479797363281,
"logps/rejected": -635.6519165039062,
"loss": 0.4069,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -1.4688472747802734,
"rewards/margins": 2.515817403793335,
"rewards/rejected": -3.9846644401550293,
"step": 2040
},
{
"epoch": 0.4918426103646833,
"grad_norm": 95.60445146391889,
"learning_rate": 3.004271743957875e-07,
"logits/chosen": -1.1661708354949951,
"logits/rejected": -1.1566636562347412,
"logps/chosen": -498.49444580078125,
"logps/rejected": -651.3515625,
"loss": 0.4674,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.443251848220825,
"rewards/margins": 1.215421438217163,
"rewards/rejected": -3.6586735248565674,
"step": 2050
},
{
"epoch": 0.4942418426103647,
"grad_norm": 55.47701742913237,
"learning_rate": 2.983746306385499e-07,
"logits/chosen": -1.396315574645996,
"logits/rejected": -1.3816092014312744,
"logps/chosen": -464.2110900878906,
"logps/rejected": -665.644775390625,
"loss": 0.431,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -2.19048810005188,
"rewards/margins": 1.8881957530975342,
"rewards/rejected": -4.078683853149414,
"step": 2060
},
{
"epoch": 0.4966410748560461,
"grad_norm": 59.85231462052711,
"learning_rate": 2.963186935931628e-07,
"logits/chosen": -1.3054215908050537,
"logits/rejected": -1.2787823677062988,
"logps/chosen": -412.70562744140625,
"logps/rejected": -570.3099365234375,
"loss": 0.4321,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.5982519388198853,
"rewards/margins": 1.624089241027832,
"rewards/rejected": -3.2223410606384277,
"step": 2070
},
{
"epoch": 0.4990403071017274,
"grad_norm": 54.133794185753565,
"learning_rate": 2.9425950747544176e-07,
"logits/chosen": -1.2475080490112305,
"logits/rejected": -1.3604786396026611,
"logps/chosen": -532.1666259765625,
"logps/rejected": -762.163818359375,
"loss": 0.4371,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -2.4116852283477783,
"rewards/margins": 2.56913685798645,
"rewards/rejected": -4.9808220863342285,
"step": 2080
},
{
"epoch": 0.5014395393474088,
"grad_norm": 83.6927398487227,
"learning_rate": 2.921972167291119e-07,
"logits/chosen": -1.212083101272583,
"logits/rejected": -1.2536664009094238,
"logps/chosen": -482.11505126953125,
"logps/rejected": -664.6883544921875,
"loss": 0.4412,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -2.0365757942199707,
"rewards/margins": 1.8161745071411133,
"rewards/rejected": -3.852750301361084,
"step": 2090
},
{
"epoch": 0.5038387715930902,
"grad_norm": 50.31802122151575,
"learning_rate": 2.9013196601567567e-07,
"logits/chosen": -1.1923916339874268,
"logits/rejected": -1.1941927671432495,
"logps/chosen": -405.5143127441406,
"logps/rejected": -548.7236328125,
"loss": 0.5319,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.5634148120880127,
"rewards/margins": 1.2738431692123413,
"rewards/rejected": -2.8372581005096436,
"step": 2100
},
{
"epoch": 0.5062380038387716,
"grad_norm": 56.698876160844584,
"learning_rate": 2.8806390020426555e-07,
"logits/chosen": -1.164459228515625,
"logits/rejected": -1.1488382816314697,
"logps/chosen": -419.48785400390625,
"logps/rejected": -572.8009033203125,
"loss": 0.4367,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -1.433246374130249,
"rewards/margins": 1.561497449874878,
"rewards/rejected": -2.994743824005127,
"step": 2110
},
{
"epoch": 0.508637236084453,
"grad_norm": 62.14972092087307,
"learning_rate": 2.8599316436148187e-07,
"logits/chosen": -1.3236867189407349,
"logits/rejected": -1.351775884628296,
"logps/chosen": -434.72540283203125,
"logps/rejected": -557.2949829101562,
"loss": 0.4449,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.8828712701797485,
"rewards/margins": 1.2600587606430054,
"rewards/rejected": -3.142929792404175,
"step": 2120
},
{
"epoch": 0.5110364683301344,
"grad_norm": 48.103685381780075,
"learning_rate": 2.8391990374121723e-07,
"logits/chosen": -1.3166625499725342,
"logits/rejected": -1.3257856369018555,
"logps/chosen": -475.3426818847656,
"logps/rejected": -778.1724243164062,
"loss": 0.421,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -2.332332134246826,
"rewards/margins": 2.738534450531006,
"rewards/rejected": -5.070866584777832,
"step": 2130
},
{
"epoch": 0.5134357005758158,
"grad_norm": 47.162564476256435,
"learning_rate": 2.818442637744669e-07,
"logits/chosen": -1.3116474151611328,
"logits/rejected": -1.3327019214630127,
"logps/chosen": -460.369140625,
"logps/rejected": -660.98388671875,
"loss": 0.4342,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -2.1553499698638916,
"rewards/margins": 1.9608310461044312,
"rewards/rejected": -4.116180896759033,
"step": 2140
},
{
"epoch": 0.5158349328214972,
"grad_norm": 60.64034230881214,
"learning_rate": 2.797663900591284e-07,
"logits/chosen": -1.3147813081741333,
"logits/rejected": -1.386897325515747,
"logps/chosen": -468.63470458984375,
"logps/rejected": -615.6099243164062,
"loss": 0.3904,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -2.097090244293213,
"rewards/margins": 1.7188762426376343,
"rewards/rejected": -3.8159663677215576,
"step": 2150
},
{
"epoch": 0.5182341650671785,
"grad_norm": 66.07767465138748,
"learning_rate": 2.776864283497874e-07,
"logits/chosen": -1.3374742269515991,
"logits/rejected": -1.4205501079559326,
"logps/chosen": -465.52313232421875,
"logps/rejected": -728.1679077148438,
"loss": 0.4107,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.3437561988830566,
"rewards/margins": 2.708853006362915,
"rewards/rejected": -5.052609920501709,
"step": 2160
},
{
"epoch": 0.5206333973128598,
"grad_norm": 62.29334430926315,
"learning_rate": 2.756045245474943e-07,
"logits/chosen": -1.261883020401001,
"logits/rejected": -1.2139081954956055,
"logps/chosen": -485.0502014160156,
"logps/rejected": -680.46875,
"loss": 0.4545,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.26891827583313,
"rewards/margins": 1.8356815576553345,
"rewards/rejected": -4.104599475860596,
"step": 2170
},
{
"epoch": 0.5230326295585412,
"grad_norm": 41.365181681908616,
"learning_rate": 2.7352082468952977e-07,
"logits/chosen": -1.2522320747375488,
"logits/rejected": -1.301537275314331,
"logps/chosen": -509.38494873046875,
"logps/rejected": -802.3201293945312,
"loss": 0.4662,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -2.6157939434051514,
"rewards/margins": 2.8169338703155518,
"rewards/rejected": -5.432727336883545,
"step": 2180
},
{
"epoch": 0.5254318618042226,
"grad_norm": 61.71676072813974,
"learning_rate": 2.7143547493916e-07,
"logits/chosen": -1.3673561811447144,
"logits/rejected": -1.3156074285507202,
"logps/chosen": -452.85595703125,
"logps/rejected": -724.8404541015625,
"loss": 0.4678,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.957660436630249,
"rewards/margins": 2.71724271774292,
"rewards/rejected": -4.674903392791748,
"step": 2190
},
{
"epoch": 0.527831094049904,
"grad_norm": 64.09199641901836,
"learning_rate": 2.693486215753853e-07,
"logits/chosen": -1.3113658428192139,
"logits/rejected": -1.3134915828704834,
"logps/chosen": -487.13507080078125,
"logps/rejected": -752.1974487304688,
"loss": 0.436,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -2.3614611625671387,
"rewards/margins": 2.879239559173584,
"rewards/rejected": -5.240700721740723,
"step": 2200
},
{
"epoch": 0.5302303262955854,
"grad_norm": 62.382486148855605,
"learning_rate": 2.6726041098267805e-07,
"logits/chosen": -1.2454713582992554,
"logits/rejected": -1.2938363552093506,
"logps/chosen": -489.5714416503906,
"logps/rejected": -586.869140625,
"loss": 0.4975,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.014014720916748,
"rewards/margins": 1.3383963108062744,
"rewards/rejected": -3.3524117469787598,
"step": 2210
},
{
"epoch": 0.5326295585412668,
"grad_norm": 77.82285089480301,
"learning_rate": 2.6517098964071507e-07,
"logits/chosen": -1.3246665000915527,
"logits/rejected": -1.3666447401046753,
"logps/chosen": -430.4461975097656,
"logps/rejected": -537.7572021484375,
"loss": 0.4907,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.8055528402328491,
"rewards/margins": 0.8791055679321289,
"rewards/rejected": -2.6846585273742676,
"step": 2220
},
{
"epoch": 0.5350287907869482,
"grad_norm": 68.93201264173544,
"learning_rate": 2.630805041141023e-07,
"logits/chosen": -1.4450138807296753,
"logits/rejected": -1.4398632049560547,
"logps/chosen": -390.3777770996094,
"logps/rejected": -676.9429931640625,
"loss": 0.4491,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.6461594104766846,
"rewards/margins": 2.6611924171447754,
"rewards/rejected": -4.307351589202881,
"step": 2230
},
{
"epoch": 0.5374280230326296,
"grad_norm": 58.5502482141954,
"learning_rate": 2.609891010420941e-07,
"logits/chosen": -1.4054218530654907,
"logits/rejected": -1.4029279947280884,
"logps/chosen": -484.80145263671875,
"logps/rejected": -684.0780029296875,
"loss": 0.4067,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.152812957763672,
"rewards/margins": 1.9849519729614258,
"rewards/rejected": -4.137764930725098,
"step": 2240
},
{
"epoch": 0.539827255278311,
"grad_norm": 49.966474065126825,
"learning_rate": 2.5889692712830674e-07,
"logits/chosen": -1.2064855098724365,
"logits/rejected": -1.2673299312591553,
"logps/chosen": -412.9921875,
"logps/rejected": -598.0367431640625,
"loss": 0.4303,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.8478339910507202,
"rewards/margins": 1.9323060512542725,
"rewards/rejected": -3.7801403999328613,
"step": 2250
},
{
"epoch": 0.5422264875239923,
"grad_norm": 78.42556826114057,
"learning_rate": 2.5680412913042843e-07,
"logits/chosen": -1.4747849702835083,
"logits/rejected": -1.426845669746399,
"logps/chosen": -482.0741271972656,
"logps/rejected": -703.7940673828125,
"loss": 0.4473,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -2.30895733833313,
"rewards/margins": 2.2580885887145996,
"rewards/rejected": -4.567046165466309,
"step": 2260
},
{
"epoch": 0.5446257197696737,
"grad_norm": 62.375368028657434,
"learning_rate": 2.5471085384992404e-07,
"logits/chosen": -1.3756840229034424,
"logits/rejected": -1.31130850315094,
"logps/chosen": -460.5741271972656,
"logps/rejected": -753.0440673828125,
"loss": 0.4342,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.089881181716919,
"rewards/margins": 2.9207491874694824,
"rewards/rejected": -5.010630130767822,
"step": 2270
},
{
"epoch": 0.5470249520153551,
"grad_norm": 59.8486418535735,
"learning_rate": 2.526172481217381e-07,
"logits/chosen": -1.3878482580184937,
"logits/rejected": -1.3218697309494019,
"logps/chosen": -430.990966796875,
"logps/rejected": -592.5296630859375,
"loss": 0.4811,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -2.273965358734131,
"rewards/margins": 1.4596675634384155,
"rewards/rejected": -3.733633041381836,
"step": 2280
},
{
"epoch": 0.5494241842610365,
"grad_norm": 62.76515485107013,
"learning_rate": 2.5052345880399456e-07,
"logits/chosen": -1.4437415599822998,
"logits/rejected": -1.5084867477416992,
"logps/chosen": -436.0667419433594,
"logps/rejected": -592.7450561523438,
"loss": 0.4273,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.0785436630249023,
"rewards/margins": 1.571500539779663,
"rewards/rejected": -3.6500442028045654,
"step": 2290
},
{
"epoch": 0.5518234165067178,
"grad_norm": 58.27430309569641,
"learning_rate": 2.4842963276769555e-07,
"logits/chosen": -1.4130902290344238,
"logits/rejected": -1.3396443128585815,
"logps/chosen": -389.29888916015625,
"logps/rejected": -609.4580078125,
"loss": 0.4542,
"rewards/accuracies": 0.875,
"rewards/chosen": -1.7715704441070557,
"rewards/margins": 1.7805286645889282,
"rewards/rejected": -3.5520987510681152,
"step": 2300
},
{
"epoch": 0.5542226487523992,
"grad_norm": 53.972591355193885,
"learning_rate": 2.463359168864189e-07,
"logits/chosen": -1.2576197385787964,
"logits/rejected": -1.4408254623413086,
"logps/chosen": -481.51318359375,
"logps/rejected": -619.8674926757812,
"loss": 0.4782,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.8940531015396118,
"rewards/margins": 1.729431390762329,
"rewards/rejected": -3.6234841346740723,
"step": 2310
},
{
"epoch": 0.5566218809980806,
"grad_norm": 71.62733220276354,
"learning_rate": 2.4424245802601555e-07,
"logits/chosen": -1.336724042892456,
"logits/rejected": -1.3197355270385742,
"logps/chosen": -375.65155029296875,
"logps/rejected": -558.9928588867188,
"loss": 0.412,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.6036341190338135,
"rewards/margins": 1.2925240993499756,
"rewards/rejected": -2.896157741546631,
"step": 2320
},
{
"epoch": 0.559021113243762,
"grad_norm": 79.75428920809141,
"learning_rate": 2.421494030343072e-07,
"logits/chosen": -1.262160062789917,
"logits/rejected": -1.4333540201187134,
"logps/chosen": -479.90118408203125,
"logps/rejected": -587.5850219726562,
"loss": 0.5374,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.142888069152832,
"rewards/margins": 1.5673768520355225,
"rewards/rejected": -3.7102653980255127,
"step": 2330
},
{
"epoch": 0.5614203454894434,
"grad_norm": 67.46659078588667,
"learning_rate": 2.400568987307861e-07,
"logits/chosen": -1.3090261220932007,
"logits/rejected": -1.3993467092514038,
"logps/chosen": -436.4292907714844,
"logps/rejected": -518.08837890625,
"loss": 0.3933,
"rewards/accuracies": 0.625,
"rewards/chosen": -2.021019458770752,
"rewards/margins": 0.9995520710945129,
"rewards/rejected": -3.0205719470977783,
"step": 2340
},
{
"epoch": 0.5638195777351248,
"grad_norm": 68.75205403037342,
"learning_rate": 2.379650918963156e-07,
"logits/chosen": -1.3889644145965576,
"logits/rejected": -1.376657247543335,
"logps/chosen": -432.72503662109375,
"logps/rejected": -657.9354248046875,
"loss": 0.4304,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -2.385554075241089,
"rewards/margins": 2.185183048248291,
"rewards/rejected": -4.570737361907959,
"step": 2350
},
{
"epoch": 0.5662188099808061,
"grad_norm": 99.03508718649326,
"learning_rate": 2.3587412926283438e-07,
"logits/chosen": -1.3818020820617676,
"logits/rejected": -1.3562251329421997,
"logps/chosen": -541.4228515625,
"logps/rejected": -719.0054931640625,
"loss": 0.4708,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -2.324052333831787,
"rewards/margins": 2.2292587757110596,
"rewards/rejected": -4.553310871124268,
"step": 2360
},
{
"epoch": 0.5686180422264875,
"grad_norm": 51.35517953676917,
"learning_rate": 2.337841575030642e-07,
"logits/chosen": -1.1950929164886475,
"logits/rejected": -1.216797113418579,
"logps/chosen": -471.62579345703125,
"logps/rejected": -675.9489135742188,
"loss": 0.4223,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.9462970495224,
"rewards/margins": 1.9327852725982666,
"rewards/rejected": -3.879082202911377,
"step": 2370
},
{
"epoch": 0.5710172744721689,
"grad_norm": 66.71050035899282,
"learning_rate": 2.316953232202206e-07,
"logits/chosen": -1.3075534105300903,
"logits/rejected": -1.509835958480835,
"logps/chosen": -453.71038818359375,
"logps/rejected": -504.8943786621094,
"loss": 0.4325,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -2.0954437255859375,
"rewards/margins": 1.1868107318878174,
"rewards/rejected": -3.282254457473755,
"step": 2380
},
{
"epoch": 0.5734165067178503,
"grad_norm": 53.41657725702868,
"learning_rate": 2.2960777293772958e-07,
"logits/chosen": -1.2998007535934448,
"logits/rejected": -1.4061708450317383,
"logps/chosen": -413.1690979003906,
"logps/rejected": -623.9481201171875,
"loss": 0.4201,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -2.010094404220581,
"rewards/margins": 2.3299851417541504,
"rewards/rejected": -4.3400797843933105,
"step": 2390
},
{
"epoch": 0.5758157389635317,
"grad_norm": 67.55796862724411,
"learning_rate": 2.2752165308894974e-07,
"logits/chosen": -1.2547554969787598,
"logits/rejected": -1.2700796127319336,
"logps/chosen": -388.22125244140625,
"logps/rejected": -563.0255126953125,
"loss": 0.4317,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.9077174663543701,
"rewards/margins": 1.8513762950897217,
"rewards/rejected": -3.75909423828125,
"step": 2400
},
{
"epoch": 0.5782149712092131,
"grad_norm": 53.06300597839346,
"learning_rate": 2.254371100069005e-07,
"logits/chosen": -1.2128334045410156,
"logits/rejected": -1.1340278387069702,
"logps/chosen": -400.8592529296875,
"logps/rejected": -608.3983764648438,
"loss": 0.3902,
"rewards/accuracies": 0.875,
"rewards/chosen": -1.6949599981307983,
"rewards/margins": 1.897270917892456,
"rewards/rejected": -3.5922305583953857,
"step": 2410
},
{
"epoch": 0.5806142034548945,
"grad_norm": 67.83708549754716,
"learning_rate": 2.2335428991399725e-07,
"logits/chosen": -1.2652689218521118,
"logits/rejected": -1.2790223360061646,
"logps/chosen": -512.0337524414062,
"logps/rejected": -827.4010620117188,
"loss": 0.4194,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.9982750415802,
"rewards/margins": 3.151942729949951,
"rewards/rejected": -6.1502180099487305,
"step": 2420
},
{
"epoch": 0.5830134357005758,
"grad_norm": 45.41387584500605,
"learning_rate": 2.2127333891179458e-07,
"logits/chosen": -1.3656853437423706,
"logits/rejected": -1.3812001943588257,
"logps/chosen": -446.84002685546875,
"logps/rejected": -745.3057861328125,
"loss": 0.4553,
"rewards/accuracies": 0.875,
"rewards/chosen": -2.392338275909424,
"rewards/margins": 2.7484543323516846,
"rewards/rejected": -5.140792369842529,
"step": 2430
},
{
"epoch": 0.5854126679462572,
"grad_norm": 110.5756097353505,
"learning_rate": 2.1919440297073782e-07,
"logits/chosen": -1.2845975160598755,
"logits/rejected": -1.3235517740249634,
"logps/chosen": -438.25811767578125,
"logps/rejected": -696.1348876953125,
"loss": 0.4683,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -2.2012147903442383,
"rewards/margins": 2.572181224822998,
"rewards/rejected": -4.773395538330078,
"step": 2440
},
{
"epoch": 0.5878119001919386,
"grad_norm": 54.1923950374701,
"learning_rate": 2.1711762791992368e-07,
"logits/chosen": -1.3080785274505615,
"logits/rejected": -1.344999074935913,
"logps/chosen": -509.382568359375,
"logps/rejected": -613.4410400390625,
"loss": 0.4518,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -2.2509043216705322,
"rewards/margins": 1.362375020980835,
"rewards/rejected": -3.613279342651367,
"step": 2450
},
{
"epoch": 0.5902111324376199,
"grad_norm": 76.8189855205521,
"learning_rate": 2.1504315943687114e-07,
"logits/chosen": -1.190882921218872,
"logits/rejected": -1.1361911296844482,
"logps/chosen": -406.93231201171875,
"logps/rejected": -693.5443725585938,
"loss": 0.4276,
"rewards/accuracies": 0.875,
"rewards/chosen": -1.7856594324111938,
"rewards/margins": 2.4512863159179688,
"rewards/rejected": -4.236945629119873,
"step": 2460
},
{
"epoch": 0.5926103646833013,
"grad_norm": 85.69322529294834,
"learning_rate": 2.1297114303730248e-07,
"logits/chosen": -1.197404146194458,
"logits/rejected": -1.080083966255188,
"logps/chosen": -424.1693420410156,
"logps/rejected": -695.1914672851562,
"loss": 0.4925,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -2.1172688007354736,
"rewards/margins": 2.2054264545440674,
"rewards/rejected": -4.322694778442383,
"step": 2470
},
{
"epoch": 0.5950095969289827,
"grad_norm": 61.15777752405356,
"learning_rate": 2.1090172406493616e-07,
"logits/chosen": -1.1817920207977295,
"logits/rejected": -1.1163372993469238,
"logps/chosen": -369.00054931640625,
"logps/rejected": -556.2276611328125,
"loss": 0.3871,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -1.420171856880188,
"rewards/margins": 1.80911386013031,
"rewards/rejected": -3.229285478591919,
"step": 2480
},
{
"epoch": 0.5974088291746641,
"grad_norm": 83.03736161560481,
"learning_rate": 2.0883504768129146e-07,
"logits/chosen": -1.3368021249771118,
"logits/rejected": -1.3336868286132812,
"logps/chosen": -524.5817260742188,
"logps/rejected": -736.2766723632812,
"loss": 0.4307,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -2.5531392097473145,
"rewards/margins": 2.1611647605895996,
"rewards/rejected": -4.7143049240112305,
"step": 2490
},
{
"epoch": 0.5998080614203455,
"grad_norm": 106.91723764545198,
"learning_rate": 2.0677125885550571e-07,
"logits/chosen": -1.2371995449066162,
"logits/rejected": -1.4159976243972778,
"logps/chosen": -500.94000244140625,
"logps/rejected": -625.9182739257812,
"loss": 0.4678,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -2.676302433013916,
"rewards/margins": 1.586896300315857,
"rewards/rejected": -4.263198375701904,
"step": 2500
},
{
"epoch": 0.6022072936660269,
"grad_norm": 98.24854916731117,
"learning_rate": 2.0471050235416587e-07,
"logits/chosen": -1.1585936546325684,
"logits/rejected": -1.3378689289093018,
"logps/chosen": -570.26806640625,
"logps/rejected": -740.4846801757812,
"loss": 0.4088,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -3.0679523944854736,
"rewards/margins": 2.1525187492370605,
"rewards/rejected": -5.220470905303955,
"step": 2510
},
{
"epoch": 0.6046065259117083,
"grad_norm": 70.50575931793564,
"learning_rate": 2.026529227311532e-07,
"logits/chosen": -1.3343918323516846,
"logits/rejected": -1.3401401042938232,
"logps/chosen": -479.2476501464844,
"logps/rejected": -671.0690307617188,
"loss": 0.4758,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.6294853687286377,
"rewards/margins": 1.830687165260315,
"rewards/rejected": -4.4601731300354,
"step": 2520
},
{
"epoch": 0.6070057581573897,
"grad_norm": 64.24239379101434,
"learning_rate": 2.005986643175036e-07,
"logits/chosen": -1.2955334186553955,
"logits/rejected": -1.2314105033874512,
"logps/chosen": -471.40313720703125,
"logps/rejected": -755.314697265625,
"loss": 0.3891,
"rewards/accuracies": 0.875,
"rewards/chosen": -2.193781614303589,
"rewards/margins": 2.968440532684326,
"rewards/rejected": -5.162222862243652,
"step": 2530
},
{
"epoch": 0.6094049904030711,
"grad_norm": 80.02360466830004,
"learning_rate": 1.9854787121128328e-07,
"logits/chosen": -1.2517433166503906,
"logits/rejected": -1.4158328771591187,
"logps/chosen": -400.4693908691406,
"logps/rejected": -503.8955993652344,
"loss": 0.4843,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.7598693370819092,
"rewards/margins": 1.488473892211914,
"rewards/rejected": -3.2483432292938232,
"step": 2540
},
{
"epoch": 0.6118042226487524,
"grad_norm": 41.04039113747286,
"learning_rate": 1.9650068726748106e-07,
"logits/chosen": -1.2276278734207153,
"logits/rejected": -1.341862440109253,
"logps/chosen": -467.55224609375,
"logps/rejected": -600.6633911132812,
"loss": 0.4714,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.9688072204589844,
"rewards/margins": 1.3757946491241455,
"rewards/rejected": -3.34460186958313,
"step": 2550
},
{
"epoch": 0.6142034548944337,
"grad_norm": 53.85241430730791,
"learning_rate": 1.9445725608791718e-07,
"logits/chosen": -1.2154093980789185,
"logits/rejected": -1.2377523183822632,
"logps/chosen": -479.86181640625,
"logps/rejected": -831.2470703125,
"loss": 0.4467,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.2339017391204834,
"rewards/margins": 3.4522805213928223,
"rewards/rejected": -5.686182975769043,
"step": 2560
},
{
"epoch": 0.6166026871401151,
"grad_norm": 63.29432990026096,
"learning_rate": 1.924177210111705e-07,
"logits/chosen": -1.302901029586792,
"logits/rejected": -1.3746470212936401,
"logps/chosen": -419.08673095703125,
"logps/rejected": -662.0726928710938,
"loss": 0.4464,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.846747636795044,
"rewards/margins": 2.3124613761901855,
"rewards/rejected": -4.15920877456665,
"step": 2570
},
{
"epoch": 0.6190019193857965,
"grad_norm": 48.25299827375637,
"learning_rate": 1.9038222510252364e-07,
"logits/chosen": -1.290828824043274,
"logits/rejected": -1.2858607769012451,
"logps/chosen": -419.1243591308594,
"logps/rejected": -589.756591796875,
"loss": 0.4235,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -1.7411692142486572,
"rewards/margins": 1.792761206626892,
"rewards/rejected": -3.533930540084839,
"step": 2580
},
{
"epoch": 0.6214011516314779,
"grad_norm": 69.23625636105592,
"learning_rate": 1.883509111439277e-07,
"logits/chosen": -1.2867010831832886,
"logits/rejected": -1.278282880783081,
"logps/chosen": -413.43341064453125,
"logps/rejected": -817.8923950195312,
"loss": 0.4301,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.9501148462295532,
"rewards/margins": 3.3637890815734863,
"rewards/rejected": -5.31390380859375,
"step": 2590
},
{
"epoch": 0.6238003838771593,
"grad_norm": 76.56766681454758,
"learning_rate": 1.8632392162398665e-07,
"logits/chosen": -1.184441328048706,
"logits/rejected": -1.1581648588180542,
"logps/chosen": -494.22955322265625,
"logps/rejected": -747.8279418945312,
"loss": 0.4215,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -2.0423431396484375,
"rewards/margins": 2.603577136993408,
"rewards/rejected": -4.6459197998046875,
"step": 2600
},
{
"epoch": 0.6261996161228407,
"grad_norm": 62.594952599021646,
"learning_rate": 1.84301398727962e-07,
"logits/chosen": -1.3408887386322021,
"logits/rejected": -1.2393563985824585,
"logps/chosen": -374.20220947265625,
"logps/rejected": -702.144287109375,
"loss": 0.4431,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.9351218938827515,
"rewards/margins": 2.950066089630127,
"rewards/rejected": -4.885188102722168,
"step": 2610
},
{
"epoch": 0.6285988483685221,
"grad_norm": 95.54656449411624,
"learning_rate": 1.8228348432779966e-07,
"logits/chosen": -1.30184006690979,
"logits/rejected": -1.301501989364624,
"logps/chosen": -444.2120056152344,
"logps/rejected": -676.330078125,
"loss": 0.454,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.1440858840942383,
"rewards/margins": 2.344844341278076,
"rewards/rejected": -4.488929748535156,
"step": 2620
},
{
"epoch": 0.6309980806142035,
"grad_norm": 50.932751077914006,
"learning_rate": 1.8027031997217773e-07,
"logits/chosen": -1.4296987056732178,
"logits/rejected": -1.3950811624526978,
"logps/chosen": -546.5574340820312,
"logps/rejected": -892.3458862304688,
"loss": 0.3946,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -3.1905577182769775,
"rewards/margins": 3.3431007862091064,
"rewards/rejected": -6.533658504486084,
"step": 2630
},
{
"epoch": 0.6333973128598849,
"grad_norm": 49.84649756553616,
"learning_rate": 1.7826204687657758e-07,
"logits/chosen": -1.171356201171875,
"logits/rejected": -1.1530375480651855,
"logps/chosen": -500.71783447265625,
"logps/rejected": -566.6802978515625,
"loss": 0.4427,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -2.245105266571045,
"rewards/margins": 1.0768263339996338,
"rewards/rejected": -3.321931838989258,
"step": 2640
},
{
"epoch": 0.6357965451055663,
"grad_norm": 60.344915608566154,
"learning_rate": 1.762588059133781e-07,
"logits/chosen": -1.2368324995040894,
"logits/rejected": -1.367072343826294,
"logps/chosen": -481.23284912109375,
"logps/rejected": -660.7156982421875,
"loss": 0.4512,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.8352482318878174,
"rewards/margins": 2.1524555683135986,
"rewards/rejected": -3.987703800201416,
"step": 2650
},
{
"epoch": 0.6381957773512476,
"grad_norm": 50.18340018947516,
"learning_rate": 1.7426073760197406e-07,
"logits/chosen": -1.1454894542694092,
"logits/rejected": -1.0864006280899048,
"logps/chosen": -459.0577087402344,
"logps/rejected": -807.2691650390625,
"loss": 0.4246,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -2.0700535774230957,
"rewards/margins": 3.1858713626861572,
"rewards/rejected": -5.255925178527832,
"step": 2660
},
{
"epoch": 0.6405950095969289,
"grad_norm": 48.64380350571709,
"learning_rate": 1.7226798209891935e-07,
"logits/chosen": -1.1781213283538818,
"logits/rejected": -1.4034216403961182,
"logps/chosen": -472.9505920410156,
"logps/rejected": -646.2312622070312,
"loss": 0.4072,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.156583070755005,
"rewards/margins": 2.2617027759552,
"rewards/rejected": -4.418285369873047,
"step": 2670
},
{
"epoch": 0.6429942418426103,
"grad_norm": 70.13615144233104,
"learning_rate": 1.7028067918809535e-07,
"logits/chosen": -1.256744623184204,
"logits/rejected": -1.2579745054244995,
"logps/chosen": -446.1131286621094,
"logps/rejected": -807.1907348632812,
"loss": 0.4259,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -2.2229795455932617,
"rewards/margins": 3.256349563598633,
"rewards/rejected": -5.4793291091918945,
"step": 2680
},
{
"epoch": 0.6453934740882917,
"grad_norm": 90.53079993576371,
"learning_rate": 1.6829896827090584e-07,
"logits/chosen": -1.3745183944702148,
"logits/rejected": -1.4126373529434204,
"logps/chosen": -463.4715881347656,
"logps/rejected": -547.9916381835938,
"loss": 0.4597,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.9967275857925415,
"rewards/margins": 1.1592689752578735,
"rewards/rejected": -3.155996322631836,
"step": 2690
},
{
"epoch": 0.6477927063339731,
"grad_norm": 67.94576160324466,
"learning_rate": 1.6632298835649844e-07,
"logits/chosen": -1.2770214080810547,
"logits/rejected": -1.2181642055511475,
"logps/chosen": -491.872802734375,
"logps/rejected": -741.3040771484375,
"loss": 0.4183,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.2546989917755127,
"rewards/margins": 2.2701687812805176,
"rewards/rejected": -4.524867534637451,
"step": 2700
},
{
"epoch": 0.6501919385796545,
"grad_norm": 51.51411740739314,
"learning_rate": 1.6435287805201364e-07,
"logits/chosen": -1.42020845413208,
"logits/rejected": -1.3592700958251953,
"logps/chosen": -451.265869140625,
"logps/rejected": -593.9780883789062,
"loss": 0.4433,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.8679488897323608,
"rewards/margins": 1.4463961124420166,
"rewards/rejected": -3.314344882965088,
"step": 2710
},
{
"epoch": 0.6525911708253359,
"grad_norm": 50.81929422501917,
"learning_rate": 1.6238877555286207e-07,
"logits/chosen": -1.3951126337051392,
"logits/rejected": -1.3844718933105469,
"logps/chosen": -433.7630920410156,
"logps/rejected": -667.7963256835938,
"loss": 0.3907,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.691265344619751,
"rewards/margins": 2.2821648120880127,
"rewards/rejected": -3.9734299182891846,
"step": 2720
},
{
"epoch": 0.6549904030710173,
"grad_norm": 67.06059702675684,
"learning_rate": 1.60430818633031e-07,
"logits/chosen": -1.1817668676376343,
"logits/rejected": -1.2036738395690918,
"logps/chosen": -421.98236083984375,
"logps/rejected": -603.4003295898438,
"loss": 0.4012,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.750417709350586,
"rewards/margins": 1.8823362588882446,
"rewards/rejected": -3.63275408744812,
"step": 2730
},
{
"epoch": 0.6573896353166987,
"grad_norm": 66.36257208214307,
"learning_rate": 1.5847914463541939e-07,
"logits/chosen": -1.3497378826141357,
"logits/rejected": -1.4073352813720703,
"logps/chosen": -385.95751953125,
"logps/rejected": -616.4376220703125,
"loss": 0.3841,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.8801229000091553,
"rewards/margins": 2.074592113494873,
"rewards/rejected": -3.9547152519226074,
"step": 2740
},
{
"epoch": 0.6597888675623801,
"grad_norm": 72.43378197120275,
"learning_rate": 1.5653389046220427e-07,
"logits/chosen": -1.3133211135864258,
"logits/rejected": -1.319744348526001,
"logps/chosen": -389.60784912109375,
"logps/rejected": -561.4827880859375,
"loss": 0.489,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.566753625869751,
"rewards/margins": 1.575201392173767,
"rewards/rejected": -3.1419551372528076,
"step": 2750
},
{
"epoch": 0.6621880998080614,
"grad_norm": 72.16620019968276,
"learning_rate": 1.545951925652375e-07,
"logits/chosen": -1.2689671516418457,
"logits/rejected": -1.41306734085083,
"logps/chosen": -517.5333862304688,
"logps/rejected": -688.21337890625,
"loss": 0.4433,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -2.1187641620635986,
"rewards/margins": 2.2521450519561768,
"rewards/rejected": -4.370909214019775,
"step": 2760
},
{
"epoch": 0.6645873320537428,
"grad_norm": 82.81608813170432,
"learning_rate": 1.5266318693647423e-07,
"logits/chosen": -1.3315207958221436,
"logits/rejected": -1.3426904678344727,
"logps/chosen": -471.59814453125,
"logps/rejected": -622.7100219726562,
"loss": 0.4119,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.9626314640045166,
"rewards/margins": 1.5937044620513916,
"rewards/rejected": -3.556335926055908,
"step": 2770
},
{
"epoch": 0.6669865642994242,
"grad_norm": 50.06790457176136,
"learning_rate": 1.5073800909843353e-07,
"logits/chosen": -1.2962977886199951,
"logits/rejected": -1.433579683303833,
"logps/chosen": -469.7752990722656,
"logps/rejected": -641.1842041015625,
"loss": 0.4062,
"rewards/accuracies": 0.875,
"rewards/chosen": -2.060004711151123,
"rewards/margins": 2.2752344608306885,
"rewards/rejected": -4.335238933563232,
"step": 2780
},
{
"epoch": 0.6693857965451055,
"grad_norm": 51.67259422731343,
"learning_rate": 1.488197940946922e-07,
"logits/chosen": -1.1685657501220703,
"logits/rejected": -1.208673357963562,
"logps/chosen": -441.9027404785156,
"logps/rejected": -606.9600830078125,
"loss": 0.4113,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.6688663959503174,
"rewards/margins": 2.1915180683135986,
"rewards/rejected": -3.860384702682495,
"step": 2790
},
{
"epoch": 0.6717850287907869,
"grad_norm": 77.60853331730296,
"learning_rate": 1.4690867648041167e-07,
"logits/chosen": -1.1332224607467651,
"logits/rejected": -1.292792797088623,
"logps/chosen": -453.847412109375,
"logps/rejected": -647.2982177734375,
"loss": 0.4279,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.9809976816177368,
"rewards/margins": 2.1735682487487793,
"rewards/rejected": -4.154566287994385,
"step": 2800
},
{
"epoch": 0.6741842610364683,
"grad_norm": 66.03903500616863,
"learning_rate": 1.4500479031289987e-07,
"logits/chosen": -1.2553117275238037,
"logits/rejected": -1.3797930479049683,
"logps/chosen": -419.65350341796875,
"logps/rejected": -583.0729370117188,
"loss": 0.4735,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.5933611392974854,
"rewards/margins": 1.654476523399353,
"rewards/rejected": -3.247837781906128,
"step": 2810
},
{
"epoch": 0.6765834932821497,
"grad_norm": 42.6539101740705,
"learning_rate": 1.4310826914220747e-07,
"logits/chosen": -1.2346922159194946,
"logits/rejected": -1.3095893859863281,
"logps/chosen": -456.98846435546875,
"logps/rejected": -612.8602294921875,
"loss": 0.459,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.729434609413147,
"rewards/margins": 1.615557074546814,
"rewards/rejected": -3.344991683959961,
"step": 2820
},
{
"epoch": 0.6789827255278311,
"grad_norm": 84.75345605479198,
"learning_rate": 1.412192460017597e-07,
"logits/chosen": -1.3369228839874268,
"logits/rejected": -1.2705490589141846,
"logps/chosen": -474.5061950683594,
"logps/rejected": -645.0938720703125,
"loss": 0.4598,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -2.3626677989959717,
"rewards/margins": 1.6654049158096313,
"rewards/rejected": -4.028072834014893,
"step": 2830
},
{
"epoch": 0.6813819577735125,
"grad_norm": 67.06503369524259,
"learning_rate": 1.3933785339902504e-07,
"logits/chosen": -1.4095046520233154,
"logits/rejected": -1.2895666360855103,
"logps/chosen": -395.1067199707031,
"logps/rejected": -605.366943359375,
"loss": 0.4617,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.8949320316314697,
"rewards/margins": 1.6969797611236572,
"rewards/rejected": -3.591911792755127,
"step": 2840
},
{
"epoch": 0.6837811900191939,
"grad_norm": 39.93390492293781,
"learning_rate": 1.374642233062197e-07,
"logits/chosen": -1.349837064743042,
"logits/rejected": -1.4833858013153076,
"logps/chosen": -487.5069274902344,
"logps/rejected": -665.9603881835938,
"loss": 0.4536,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.0987796783447266,
"rewards/margins": 2.1197714805603027,
"rewards/rejected": -4.218550682067871,
"step": 2850
},
{
"epoch": 0.6861804222648752,
"grad_norm": 37.8109610801416,
"learning_rate": 1.355984871510511e-07,
"logits/chosen": -1.2923767566680908,
"logits/rejected": -1.2817193269729614,
"logps/chosen": -528.0411376953125,
"logps/rejected": -712.6599731445312,
"loss": 0.3873,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -2.3655641078948975,
"rewards/margins": 1.8116811513900757,
"rewards/rejected": -4.177245140075684,
"step": 2860
},
{
"epoch": 0.6885796545105566,
"grad_norm": 69.3259789726919,
"learning_rate": 1.3374077580749783e-07,
"logits/chosen": -1.5063129663467407,
"logits/rejected": -1.4583210945129395,
"logps/chosen": -387.3618469238281,
"logps/rejected": -595.7457275390625,
"loss": 0.4364,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.9980754852294922,
"rewards/margins": 1.9206987619400024,
"rewards/rejected": -3.918774127960205,
"step": 2870
},
{
"epoch": 0.690978886756238,
"grad_norm": 103.02719824327798,
"learning_rate": 1.3189121958663024e-07,
"logits/chosen": -1.2567538022994995,
"logits/rejected": -1.4682070016860962,
"logps/chosen": -512.067626953125,
"logps/rejected": -630.2479248046875,
"loss": 0.4713,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.413303852081299,
"rewards/margins": 1.4284738302230835,
"rewards/rejected": -3.841777801513672,
"step": 2880
},
{
"epoch": 0.6933781190019194,
"grad_norm": 61.466096297911875,
"learning_rate": 1.3004994822746895e-07,
"logits/chosen": -1.379688024520874,
"logits/rejected": -1.4199955463409424,
"logps/chosen": -412.78369140625,
"logps/rejected": -605.1600341796875,
"loss": 0.433,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.6713438034057617,
"rewards/margins": 1.8622289896011353,
"rewards/rejected": -3.5335726737976074,
"step": 2890
},
{
"epoch": 0.6957773512476008,
"grad_norm": 56.56735513727127,
"learning_rate": 1.2821709088788434e-07,
"logits/chosen": -1.1930772066116333,
"logits/rejected": -1.2361228466033936,
"logps/chosen": -373.34027099609375,
"logps/rejected": -582.1116943359375,
"loss": 0.4406,
"rewards/accuracies": 0.875,
"rewards/chosen": -1.7239477634429932,
"rewards/margins": 2.0639774799346924,
"rewards/rejected": -3.7879252433776855,
"step": 2900
},
{
"epoch": 0.6981765834932822,
"grad_norm": 50.93832251487875,
"learning_rate": 1.2639277613553736e-07,
"logits/chosen": -1.447892427444458,
"logits/rejected": -1.4102813005447388,
"logps/chosen": -364.46197509765625,
"logps/rejected": -530.6686401367188,
"loss": 0.446,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.5684380531311035,
"rewards/margins": 1.651158094406128,
"rewards/rejected": -3.2195963859558105,
"step": 2910
},
{
"epoch": 0.7005758157389635,
"grad_norm": 43.2276930156436,
"learning_rate": 1.2457713193885975e-07,
"logits/chosen": -1.2600657939910889,
"logits/rejected": -1.2159188985824585,
"logps/chosen": -355.7850036621094,
"logps/rejected": -597.0232543945312,
"loss": 0.4108,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.7937647104263306,
"rewards/margins": 2.0653727054595947,
"rewards/rejected": -3.859137773513794,
"step": 2920
},
{
"epoch": 0.7029750479846449,
"grad_norm": 72.27785496283792,
"learning_rate": 1.2277028565807838e-07,
"logits/chosen": -1.36660635471344,
"logits/rejected": -1.4359194040298462,
"logps/chosen": -425.2757263183594,
"logps/rejected": -566.6820678710938,
"loss": 0.4388,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.7790648937225342,
"rewards/margins": 1.5231965780258179,
"rewards/rejected": -3.3022613525390625,
"step": 2930
},
{
"epoch": 0.7053742802303263,
"grad_norm": 72.05535743928472,
"learning_rate": 1.209723640362815e-07,
"logits/chosen": -1.3078200817108154,
"logits/rejected": -1.3207073211669922,
"logps/chosen": -490.8136291503906,
"logps/rejected": -722.05712890625,
"loss": 0.4757,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.272034168243408,
"rewards/margins": 2.4088001251220703,
"rewards/rejected": -4.680834770202637,
"step": 2940
},
{
"epoch": 0.7077735124760077,
"grad_norm": 54.476978349000916,
"learning_rate": 1.191834931905277e-07,
"logits/chosen": -1.2736198902130127,
"logits/rejected": -1.28280770778656,
"logps/chosen": -531.6434936523438,
"logps/rejected": -682.0370483398438,
"loss": 0.4566,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.395185947418213,
"rewards/margins": 1.5002753734588623,
"rewards/rejected": -3.895461320877075,
"step": 2950
},
{
"epoch": 0.710172744721689,
"grad_norm": 49.0121212454874,
"learning_rate": 1.1740379860299988e-07,
"logits/chosen": -1.3577029705047607,
"logits/rejected": -1.3682730197906494,
"logps/chosen": -492.86968994140625,
"logps/rejected": -667.6934814453125,
"loss": 0.4496,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.102973461151123,
"rewards/margins": 1.538559913635254,
"rewards/rejected": -3.641533613204956,
"step": 2960
},
{
"epoch": 0.7125719769673704,
"grad_norm": 53.86489014590353,
"learning_rate": 1.1563340511220254e-07,
"logits/chosen": -1.3047479391098022,
"logits/rejected": -1.4146584272384644,
"logps/chosen": -505.7129821777344,
"logps/rejected": -683.0394897460938,
"loss": 0.4351,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.213013172149658,
"rewards/margins": 1.9895169734954834,
"rewards/rejected": -4.202530384063721,
"step": 2970
},
{
"epoch": 0.7149712092130518,
"grad_norm": 65.39616806465553,
"learning_rate": 1.1387243690420556e-07,
"logits/chosen": -1.2899211645126343,
"logits/rejected": -1.3665580749511719,
"logps/chosen": -525.3897705078125,
"logps/rejected": -745.2420043945312,
"loss": 0.4695,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -2.0657598972320557,
"rewards/margins": 2.3017890453338623,
"rewards/rejected": -4.367548942565918,
"step": 2980
},
{
"epoch": 0.7173704414587332,
"grad_norm": 109.17338886735457,
"learning_rate": 1.1212101750393235e-07,
"logits/chosen": -1.4184496402740479,
"logits/rejected": -1.4868519306182861,
"logps/chosen": -459.9681701660156,
"logps/rejected": -679.7008666992188,
"loss": 0.4322,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -2.0537400245666504,
"rewards/margins": 2.383260726928711,
"rewards/rejected": -4.4370012283325195,
"step": 2990
},
{
"epoch": 0.7197696737044146,
"grad_norm": 55.756753852863056,
"learning_rate": 1.1037926976649562e-07,
"logits/chosen": -1.3477023839950562,
"logits/rejected": -1.3794200420379639,
"logps/chosen": -488.9522399902344,
"logps/rejected": -725.9136352539062,
"loss": 0.4548,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -2.273102283477783,
"rewards/margins": 2.1587283611297607,
"rewards/rejected": -4.431830406188965,
"step": 3000
},
{
"epoch": 0.722168905950096,
"grad_norm": 68.80775812348986,
"learning_rate": 1.0864731586857936e-07,
"logits/chosen": -1.3036201000213623,
"logits/rejected": -1.453201174736023,
"logps/chosen": -474.9127502441406,
"logps/rejected": -661.4716186523438,
"loss": 0.4176,
"rewards/accuracies": 0.875,
"rewards/chosen": -1.9301677942276,
"rewards/margins": 2.1636157035827637,
"rewards/rejected": -4.093783855438232,
"step": 3010
},
{
"epoch": 0.7245681381957774,
"grad_norm": 79.25153161336767,
"learning_rate": 1.0692527729986839e-07,
"logits/chosen": -1.2558592557907104,
"logits/rejected": -1.3554977178573608,
"logps/chosen": -466.6205139160156,
"logps/rejected": -644.95654296875,
"loss": 0.3843,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.134385347366333,
"rewards/margins": 2.0006089210510254,
"rewards/rejected": -4.1349945068359375,
"step": 3020
},
{
"epoch": 0.7269673704414588,
"grad_norm": 86.56858666466508,
"learning_rate": 1.0521327485452692e-07,
"logits/chosen": -1.3626697063446045,
"logits/rejected": -1.4285178184509277,
"logps/chosen": -469.54327392578125,
"logps/rejected": -667.6971435546875,
"loss": 0.4287,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -2.349073886871338,
"rewards/margins": 2.126603841781616,
"rewards/rejected": -4.475677967071533,
"step": 3030
},
{
"epoch": 0.7293666026871402,
"grad_norm": 70.38320684395659,
"learning_rate": 1.0351142862272468e-07,
"logits/chosen": -1.273447036743164,
"logits/rejected": -1.430558204650879,
"logps/chosen": -425.697998046875,
"logps/rejected": -738.443603515625,
"loss": 0.4349,
"rewards/accuracies": 0.875,
"rewards/chosen": -2.1780591011047363,
"rewards/margins": 3.248278856277466,
"rewards/rejected": -5.426337718963623,
"step": 3040
},
{
"epoch": 0.7317658349328215,
"grad_norm": 69.1842946105189,
"learning_rate": 1.0181985798221343e-07,
"logits/chosen": -1.324233055114746,
"logits/rejected": -1.3126153945922852,
"logps/chosen": -449.85931396484375,
"logps/rejected": -683.6974487304688,
"loss": 0.4176,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -2.064021110534668,
"rewards/margins": 2.2973549365997314,
"rewards/rejected": -4.3613762855529785,
"step": 3050
},
{
"epoch": 0.7341650671785028,
"grad_norm": 79.20520032825635,
"learning_rate": 1.0013868158995329e-07,
"logits/chosen": -1.3214269876480103,
"logits/rejected": -1.3733516931533813,
"logps/chosen": -499.84619140625,
"logps/rejected": -665.3927001953125,
"loss": 0.4477,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -2.2968623638153076,
"rewards/margins": 1.8752143383026123,
"rewards/rejected": -4.172077178955078,
"step": 3060
},
{
"epoch": 0.7365642994241842,
"grad_norm": 60.91192909422294,
"learning_rate": 9.84680173737887e-08,
"logits/chosen": -1.4117096662521362,
"logits/rejected": -1.4878652095794678,
"logps/chosen": -454.979736328125,
"logps/rejected": -587.895263671875,
"loss": 0.4438,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.8998470306396484,
"rewards/margins": 1.7405235767364502,
"rewards/rejected": -3.6403706073760986,
"step": 3070
},
{
"epoch": 0.7389635316698656,
"grad_norm": 56.810179840146574,
"learning_rate": 9.680798252417713e-08,
"logits/chosen": -1.490818738937378,
"logits/rejected": -1.5360429286956787,
"logps/chosen": -389.6299743652344,
"logps/rejected": -561.4585571289062,
"loss": 0.4186,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.7963836193084717,
"rewards/margins": 1.37783944606781,
"rewards/rejected": -3.1742234230041504,
"step": 3080
},
{
"epoch": 0.741362763915547,
"grad_norm": 64.39208946081128,
"learning_rate": 9.515869348596808e-08,
"logits/chosen": -1.2397037744522095,
"logits/rejected": -1.3714118003845215,
"logps/chosen": -469.8956604003906,
"logps/rejected": -645.2073364257812,
"loss": 0.4252,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.875208854675293,
"rewards/margins": 1.950748085975647,
"rewards/rejected": -3.8259568214416504,
"step": 3090
},
{
"epoch": 0.7437619961612284,
"grad_norm": 69.08537862579493,
"learning_rate": 9.352026595023493e-08,
"logits/chosen": -1.2812139987945557,
"logits/rejected": -1.3342682123184204,
"logps/chosen": -455.315673828125,
"logps/rejected": -566.673095703125,
"loss": 0.4451,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.7951748371124268,
"rewards/margins": 1.3370133638381958,
"rewards/rejected": -3.132188320159912,
"step": 3100
},
{
"epoch": 0.7461612284069098,
"grad_norm": 65.26028448347373,
"learning_rate": 9.189281484616004e-08,
"logits/chosen": -1.281347393989563,
"logits/rejected": -1.2689536809921265,
"logps/chosen": -385.11810302734375,
"logps/rejected": -632.1613159179688,
"loss": 0.4691,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.8461825847625732,
"rewards/margins": 2.0561070442199707,
"rewards/rejected": -3.902289867401123,
"step": 3110
},
{
"epoch": 0.7485604606525912,
"grad_norm": 80.61871004422235,
"learning_rate": 9.027645433297249e-08,
"logits/chosen": -1.1624202728271484,
"logits/rejected": -1.2329314947128296,
"logps/chosen": -566.9415893554688,
"logps/rejected": -743.143798828125,
"loss": 0.5046,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.6038241386413574,
"rewards/margins": 2.1354241371154785,
"rewards/rejected": -4.739248275756836,
"step": 3120
},
{
"epoch": 0.7509596928982726,
"grad_norm": 92.80348247180913,
"learning_rate": 8.867129779194066e-08,
"logits/chosen": -1.304614782333374,
"logits/rejected": -1.4066526889801025,
"logps/chosen": -354.045654296875,
"logps/rejected": -597.7302856445312,
"loss": 0.4367,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.410996437072754,
"rewards/margins": 2.426766872406006,
"rewards/rejected": -3.837763547897339,
"step": 3130
},
{
"epoch": 0.753358925143954,
"grad_norm": 61.53877658813085,
"learning_rate": 8.707745781841866e-08,
"logits/chosen": -1.1687877178192139,
"logits/rejected": -1.2777760028839111,
"logps/chosen": -418.43511962890625,
"logps/rejected": -630.715576171875,
"loss": 0.4253,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.9291305541992188,
"rewards/margins": 2.163900852203369,
"rewards/rejected": -4.093031883239746,
"step": 3140
},
{
"epoch": 0.7557581573896354,
"grad_norm": 35.11204054429555,
"learning_rate": 8.549504621394831e-08,
"logits/chosen": -1.354907751083374,
"logits/rejected": -1.3612796068191528,
"logps/chosen": -402.2507019042969,
"logps/rejected": -620.3946533203125,
"loss": 0.3723,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.7278754711151123,
"rewards/margins": 2.2112643718719482,
"rewards/rejected": -3.9391398429870605,
"step": 3150
},
{
"epoch": 0.7581573896353166,
"grad_norm": 88.00288297264423,
"learning_rate": 8.392417397841703e-08,
"logits/chosen": -1.289905309677124,
"logits/rejected": -1.3830498456954956,
"logps/chosen": -455.31707763671875,
"logps/rejected": -634.426025390625,
"loss": 0.4426,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.930456519126892,
"rewards/margins": 1.6569267511367798,
"rewards/rejected": -3.587383270263672,
"step": 3160
},
{
"epoch": 0.760556621880998,
"grad_norm": 74.17199448695689,
"learning_rate": 8.236495130227083e-08,
"logits/chosen": -1.3127448558807373,
"logits/rejected": -1.4850471019744873,
"logps/chosen": -532.6678466796875,
"logps/rejected": -737.5643310546875,
"loss": 0.4652,
"rewards/accuracies": 0.875,
"rewards/chosen": -2.458889961242676,
"rewards/margins": 2.453096389770508,
"rewards/rejected": -4.911986351013184,
"step": 3170
},
{
"epoch": 0.7629558541266794,
"grad_norm": 67.92460340777723,
"learning_rate": 8.081748755878612e-08,
"logits/chosen": -1.310718297958374,
"logits/rejected": -1.4427484273910522,
"logps/chosen": -485.64007568359375,
"logps/rejected": -605.2127685546875,
"loss": 0.4271,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -2.0952415466308594,
"rewards/margins": 1.715080976486206,
"rewards/rejected": -3.8103225231170654,
"step": 3180
},
{
"epoch": 0.7653550863723608,
"grad_norm": 57.095685251850284,
"learning_rate": 7.928189129639632e-08,
"logits/chosen": -1.2018425464630127,
"logits/rejected": -1.2011052370071411,
"logps/chosen": -412.9466857910156,
"logps/rejected": -631.907958984375,
"loss": 0.393,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -1.932543158531189,
"rewards/margins": 2.0471224784851074,
"rewards/rejected": -3.9796652793884277,
"step": 3190
},
{
"epoch": 0.7677543186180422,
"grad_norm": 72.05124634833409,
"learning_rate": 7.775827023107834e-08,
"logits/chosen": -1.3011525869369507,
"logits/rejected": -1.3565986156463623,
"logps/chosen": -445.1830139160156,
"logps/rejected": -628.5127563476562,
"loss": 0.4404,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -2.1575093269348145,
"rewards/margins": 1.6431667804718018,
"rewards/rejected": -3.800675868988037,
"step": 3200
},
{
"epoch": 0.7701535508637236,
"grad_norm": 67.55127608074025,
"learning_rate": 7.624673123879682e-08,
"logits/chosen": -1.176064372062683,
"logits/rejected": -1.3284848928451538,
"logps/chosen": -422.73468017578125,
"logps/rejected": -569.9011840820312,
"loss": 0.4598,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.8534692525863647,
"rewards/margins": 1.6330394744873047,
"rewards/rejected": -3.486508846282959,
"step": 3210
},
{
"epoch": 0.772552783109405,
"grad_norm": 75.19351516985365,
"learning_rate": 7.474738034800663e-08,
"logits/chosen": -1.327171802520752,
"logits/rejected": -1.2900383472442627,
"logps/chosen": -395.79327392578125,
"logps/rejected": -663.9730834960938,
"loss": 0.4643,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.7923393249511719,
"rewards/margins": 2.9014949798583984,
"rewards/rejected": -4.6938347816467285,
"step": 3220
},
{
"epoch": 0.7749520153550864,
"grad_norm": 47.848721211003586,
"learning_rate": 7.326032273221606e-08,
"logits/chosen": -1.4597444534301758,
"logits/rejected": -1.4309790134429932,
"logps/chosen": -524.79541015625,
"logps/rejected": -688.5581665039062,
"loss": 0.439,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -2.4929757118225098,
"rewards/margins": 1.8584671020507812,
"rewards/rejected": -4.351442337036133,
"step": 3230
},
{
"epoch": 0.7773512476007678,
"grad_norm": 74.01694932477116,
"learning_rate": 7.178566270260872e-08,
"logits/chosen": -1.416358232498169,
"logits/rejected": -1.4566500186920166,
"logps/chosen": -498.6495666503906,
"logps/rejected": -710.8546752929688,
"loss": 0.4484,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.396069049835205,
"rewards/margins": 1.9357521533966064,
"rewards/rejected": -4.331821441650391,
"step": 3240
},
{
"epoch": 0.7797504798464492,
"grad_norm": 60.26009757023119,
"learning_rate": 7.032350370072709e-08,
"logits/chosen": -1.2919445037841797,
"logits/rejected": -1.3735246658325195,
"logps/chosen": -451.0271911621094,
"logps/rejected": -627.9866943359375,
"loss": 0.4287,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.8197091817855835,
"rewards/margins": 1.8777036666870117,
"rewards/rejected": -3.6974129676818848,
"step": 3250
},
{
"epoch": 0.7821497120921305,
"grad_norm": 46.06757696293524,
"learning_rate": 6.887394829121596e-08,
"logits/chosen": -1.3799974918365479,
"logits/rejected": -1.4900586605072021,
"logps/chosen": -511.42242431640625,
"logps/rejected": -814.45458984375,
"loss": 0.4112,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -2.4561266899108887,
"rewards/margins": 3.2779293060302734,
"rewards/rejected": -5.73405647277832,
"step": 3260
},
{
"epoch": 0.7845489443378119,
"grad_norm": 66.08707079226782,
"learning_rate": 6.743709815462833e-08,
"logits/chosen": -1.3536878824234009,
"logits/rejected": -1.424392819404602,
"logps/chosen": -499.06951904296875,
"logps/rejected": -663.4131469726562,
"loss": 0.4438,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.306351900100708,
"rewards/margins": 1.9852672815322876,
"rewards/rejected": -4.291619300842285,
"step": 3270
},
{
"epoch": 0.7869481765834933,
"grad_norm": 56.522101390783696,
"learning_rate": 6.601305408029287e-08,
"logits/chosen": -1.3996652364730835,
"logits/rejected": -1.5223954916000366,
"logps/chosen": -453.1273498535156,
"logps/rejected": -657.0367431640625,
"loss": 0.4095,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -2.1622931957244873,
"rewards/margins": 2.026867151260376,
"rewards/rejected": -4.189160346984863,
"step": 3280
},
{
"epoch": 0.7893474088291746,
"grad_norm": 58.94344273406338,
"learning_rate": 6.460191595924366e-08,
"logits/chosen": -1.3207144737243652,
"logits/rejected": -1.3845504522323608,
"logps/chosen": -452.1018981933594,
"logps/rejected": -652.0777587890625,
"loss": 0.4001,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -2.1246771812438965,
"rewards/margins": 1.9852821826934814,
"rewards/rejected": -4.109959125518799,
"step": 3290
},
{
"epoch": 0.791746641074856,
"grad_norm": 73.72070547985138,
"learning_rate": 6.320378277721342e-08,
"logits/chosen": -1.445039987564087,
"logits/rejected": -1.4531352519989014,
"logps/chosen": -474.85272216796875,
"logps/rejected": -595.89013671875,
"loss": 0.4363,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.2441697120666504,
"rewards/margins": 1.3221468925476074,
"rewards/rejected": -3.566316604614258,
"step": 3300
},
{
"epoch": 0.7941458733205374,
"grad_norm": 99.15702080965917,
"learning_rate": 6.181875260769032e-08,
"logits/chosen": -1.3354700803756714,
"logits/rejected": -1.5110574960708618,
"logps/chosen": -463.51544189453125,
"logps/rejected": -614.9706420898438,
"loss": 0.4487,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.8257434368133545,
"rewards/margins": 2.170253038406372,
"rewards/rejected": -3.9959964752197266,
"step": 3310
},
{
"epoch": 0.7965451055662188,
"grad_norm": 74.68786765448685,
"learning_rate": 6.044692260503797e-08,
"logits/chosen": -1.2728173732757568,
"logits/rejected": -1.4052507877349854,
"logps/chosen": -539.398681640625,
"logps/rejected": -777.3851928710938,
"loss": 0.3819,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -2.54783034324646,
"rewards/margins": 2.600435972213745,
"rewards/rejected": -5.148266792297363,
"step": 3320
},
{
"epoch": 0.7989443378119002,
"grad_norm": 49.2779890787065,
"learning_rate": 5.9088388997680984e-08,
"logits/chosen": -1.2829517126083374,
"logits/rejected": -1.413627028465271,
"logps/chosen": -531.353271484375,
"logps/rejected": -713.0965576171875,
"loss": 0.4236,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -2.072298765182495,
"rewards/margins": 2.5900845527648926,
"rewards/rejected": -4.662383079528809,
"step": 3330
},
{
"epoch": 0.8013435700575816,
"grad_norm": 59.17229679593995,
"learning_rate": 5.774324708135439e-08,
"logits/chosen": -1.450781226158142,
"logits/rejected": -1.532762885093689,
"logps/chosen": -397.7679443359375,
"logps/rejected": -564.6585693359375,
"loss": 0.4507,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.851607322692871,
"rewards/margins": 1.8835060596466064,
"rewards/rejected": -3.7351138591766357,
"step": 3340
},
{
"epoch": 0.803742802303263,
"grad_norm": 34.8860528557667,
"learning_rate": 5.641159121241953e-08,
"logits/chosen": -1.4477955102920532,
"logits/rejected": -1.3717939853668213,
"logps/chosen": -461.99151611328125,
"logps/rejected": -707.8023681640625,
"loss": 0.4269,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.312293529510498,
"rewards/margins": 2.164968729019165,
"rewards/rejected": -4.477262020111084,
"step": 3350
},
{
"epoch": 0.8061420345489443,
"grad_norm": 59.873519110220755,
"learning_rate": 5.5093514801245106e-08,
"logits/chosen": -1.3825056552886963,
"logits/rejected": -1.3870340585708618,
"logps/chosen": -470.130859375,
"logps/rejected": -666.9625244140625,
"loss": 0.4412,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -2.260951042175293,
"rewards/margins": 1.7683954238891602,
"rewards/rejected": -4.029345989227295,
"step": 3360
},
{
"epoch": 0.8085412667946257,
"grad_norm": 49.58444606035955,
"learning_rate": 5.378911030565453e-08,
"logits/chosen": -1.3055133819580078,
"logits/rejected": -1.3414740562438965,
"logps/chosen": -552.4376220703125,
"logps/rejected": -754.7117309570312,
"loss": 0.4357,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.7371158599853516,
"rewards/margins": 1.7635667324066162,
"rewards/rejected": -4.500682830810547,
"step": 3370
},
{
"epoch": 0.8109404990403071,
"grad_norm": 68.89475959244419,
"learning_rate": 5.249846922444101e-08,
"logits/chosen": -1.4738832712173462,
"logits/rejected": -1.525614619255066,
"logps/chosen": -470.60821533203125,
"logps/rejected": -818.2075805664062,
"loss": 0.4114,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -2.500138282775879,
"rewards/margins": 3.7085862159729004,
"rewards/rejected": -6.2087249755859375,
"step": 3380
},
{
"epoch": 0.8133397312859885,
"grad_norm": 79.56262016402897,
"learning_rate": 5.122168209094865e-08,
"logits/chosen": -1.385946273803711,
"logits/rejected": -1.4624695777893066,
"logps/chosen": -425.1856384277344,
"logps/rejected": -532.6458740234375,
"loss": 0.4265,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -2.1394572257995605,
"rewards/margins": 1.0579640865325928,
"rewards/rejected": -3.197421073913574,
"step": 3390
},
{
"epoch": 0.8157389635316699,
"grad_norm": 52.696107408105114,
"learning_rate": 4.995883846672222e-08,
"logits/chosen": -1.2016537189483643,
"logits/rejected": -1.3861643075942993,
"logps/chosen": -596.3150024414062,
"logps/rejected": -683.9937744140625,
"loss": 0.4343,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.35935378074646,
"rewards/margins": 1.6371619701385498,
"rewards/rejected": -3.9965157508850098,
"step": 3400
},
{
"epoch": 0.8181381957773513,
"grad_norm": 49.46829805686536,
"learning_rate": 4.871002693522486e-08,
"logits/chosen": -1.346164345741272,
"logits/rejected": -1.3557155132293701,
"logps/chosen": -499.1998596191406,
"logps/rejected": -619.0379638671875,
"loss": 0.4461,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.2972307205200195,
"rewards/margins": 1.5589020252227783,
"rewards/rejected": -3.856133222579956,
"step": 3410
},
{
"epoch": 0.8205374280230326,
"grad_norm": 49.216158194263706,
"learning_rate": 4.7475335095623956e-08,
"logits/chosen": -1.4238225221633911,
"logits/rejected": -1.3791711330413818,
"logps/chosen": -494.3877868652344,
"logps/rejected": -666.6940307617188,
"loss": 0.4389,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -2.409292221069336,
"rewards/margins": 1.8552162647247314,
"rewards/rejected": -4.264508247375488,
"step": 3420
},
{
"epoch": 0.822936660268714,
"grad_norm": 99.88268703047527,
"learning_rate": 4.6254849556646714e-08,
"logits/chosen": -1.2449895143508911,
"logits/rejected": -1.3020665645599365,
"logps/chosen": -563.0958862304688,
"logps/rejected": -768.5317993164062,
"loss": 0.4356,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.662400007247925,
"rewards/margins": 2.4422526359558105,
"rewards/rejected": -5.104652404785156,
"step": 3430
},
{
"epoch": 0.8253358925143954,
"grad_norm": 62.76497827080244,
"learning_rate": 4.504865593050483e-08,
"logits/chosen": -1.3190138339996338,
"logits/rejected": -1.3208320140838623,
"logps/chosen": -483.7323303222656,
"logps/rejected": -660.0289306640625,
"loss": 0.4451,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -2.17266583442688,
"rewards/margins": 1.7615305185317993,
"rewards/rejected": -3.9341964721679688,
"step": 3440
},
{
"epoch": 0.8277351247600768,
"grad_norm": 59.22990924876174,
"learning_rate": 4.385683882688895e-08,
"logits/chosen": -1.1815550327301025,
"logits/rejected": -1.2879037857055664,
"logps/chosen": -483.38427734375,
"logps/rejected": -521.2240600585938,
"loss": 0.4899,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -2.1154184341430664,
"rewards/margins": 0.7992331981658936,
"rewards/rejected": -2.91465163230896,
"step": 3450
},
{
"epoch": 0.8301343570057581,
"grad_norm": 82.86527709001345,
"learning_rate": 4.2679481847033985e-08,
"logits/chosen": -1.3511393070220947,
"logits/rejected": -1.4024460315704346,
"logps/chosen": -451.1329040527344,
"logps/rejected": -676.7293701171875,
"loss": 0.476,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.8962301015853882,
"rewards/margins": 2.275341749191284,
"rewards/rejected": -4.171571731567383,
"step": 3460
},
{
"epoch": 0.8325335892514395,
"grad_norm": 52.66863159129303,
"learning_rate": 4.151666757785435e-08,
"logits/chosen": -1.2461029291152954,
"logits/rejected": -1.2992231845855713,
"logps/chosen": -400.0786437988281,
"logps/rejected": -668.3745727539062,
"loss": 0.3924,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -1.6190965175628662,
"rewards/margins": 2.7610526084899902,
"rewards/rejected": -4.380148887634277,
"step": 3470
},
{
"epoch": 0.8349328214971209,
"grad_norm": 56.576155344810864,
"learning_rate": 4.036847758615136e-08,
"logits/chosen": -1.1943762302398682,
"logits/rejected": -1.332884430885315,
"logps/chosen": -517.8956298828125,
"logps/rejected": -671.5089111328125,
"loss": 0.4592,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -2.780350685119629,
"rewards/margins": 1.4910480976104736,
"rewards/rejected": -4.271399021148682,
"step": 3480
},
{
"epoch": 0.8373320537428023,
"grad_norm": 46.825486174943826,
"learning_rate": 3.923499241289113e-08,
"logits/chosen": -1.2725188732147217,
"logits/rejected": -1.41265070438385,
"logps/chosen": -526.8565063476562,
"logps/rejected": -632.02880859375,
"loss": 0.4499,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.301103115081787,
"rewards/margins": 1.6291067600250244,
"rewards/rejected": -3.9302101135253906,
"step": 3490
},
{
"epoch": 0.8397312859884837,
"grad_norm": 53.125011909511024,
"learning_rate": 3.811629156755541e-08,
"logits/chosen": -1.28065824508667,
"logits/rejected": -1.2841136455535889,
"logps/chosen": -484.08251953125,
"logps/rejected": -641.4912719726562,
"loss": 0.4517,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -2.014495372772217,
"rewards/margins": 1.684488296508789,
"rewards/rejected": -3.698983669281006,
"step": 3500
},
{
"epoch": 0.8421305182341651,
"grad_norm": 66.99344284990306,
"learning_rate": 3.701245352256391e-08,
"logits/chosen": -1.3107590675354004,
"logits/rejected": -1.4500467777252197,
"logps/chosen": -464.06268310546875,
"logps/rejected": -548.866455078125,
"loss": 0.4163,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.7597360610961914,
"rewards/margins": 1.0881340503692627,
"rewards/rejected": -2.847869873046875,
"step": 3510
},
{
"epoch": 0.8445297504798465,
"grad_norm": 71.85929092000093,
"learning_rate": 3.592355570776984e-08,
"logits/chosen": -1.2988982200622559,
"logits/rejected": -1.3713183403015137,
"logps/chosen": -366.3793029785156,
"logps/rejected": -544.4915771484375,
"loss": 0.4361,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.464356541633606,
"rewards/margins": 1.7605937719345093,
"rewards/rejected": -3.2249503135681152,
"step": 3520
},
{
"epoch": 0.8469289827255279,
"grad_norm": 44.064848069845205,
"learning_rate": 3.484967450502904e-08,
"logits/chosen": -1.257299542427063,
"logits/rejected": -1.3451262712478638,
"logps/chosen": -373.0574645996094,
"logps/rejected": -586.8876953125,
"loss": 0.4248,
"rewards/accuracies": 0.875,
"rewards/chosen": -1.7843666076660156,
"rewards/margins": 1.750450849533081,
"rewards/rejected": -3.534817934036255,
"step": 3530
},
{
"epoch": 0.8493282149712092,
"grad_norm": 84.80584097617765,
"learning_rate": 3.3790885242841296e-08,
"logits/chosen": -1.2018029689788818,
"logits/rejected": -1.2710869312286377,
"logps/chosen": -486.6034240722656,
"logps/rejected": -715.1466064453125,
"loss": 0.4212,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.3987205028533936,
"rewards/margins": 2.365349292755127,
"rewards/rejected": -4.764069080352783,
"step": 3540
},
{
"epoch": 0.8517274472168906,
"grad_norm": 61.61146272483576,
"learning_rate": 3.274726219106677e-08,
"logits/chosen": -1.1959749460220337,
"logits/rejected": -1.265887975692749,
"logps/chosen": -500.03045654296875,
"logps/rejected": -700.082763671875,
"loss": 0.4673,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.255139112472534,
"rewards/margins": 2.0479519367218018,
"rewards/rejected": -4.303091526031494,
"step": 3550
},
{
"epoch": 0.8541266794625719,
"grad_norm": 48.622523103355604,
"learning_rate": 3.171887855571642e-08,
"logits/chosen": -1.3621529340744019,
"logits/rejected": -1.3234202861785889,
"logps/chosen": -397.14666748046875,
"logps/rejected": -537.7269287109375,
"loss": 0.3959,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.7443649768829346,
"rewards/margins": 1.4419325590133667,
"rewards/rejected": -3.186297655105591,
"step": 3560
},
{
"epoch": 0.8565259117082533,
"grad_norm": 79.36925636948864,
"learning_rate": 3.070580647381643e-08,
"logits/chosen": -1.2797110080718994,
"logits/rejected": -1.3679635524749756,
"logps/chosen": -430.72607421875,
"logps/rejected": -691.2688598632812,
"loss": 0.4679,
"rewards/accuracies": 0.875,
"rewards/chosen": -1.9860126972198486,
"rewards/margins": 2.591972827911377,
"rewards/rejected": -4.577984809875488,
"step": 3570
},
{
"epoch": 0.8589251439539347,
"grad_norm": 49.95738385204341,
"learning_rate": 2.9708117008348576e-08,
"logits/chosen": -1.3578163385391235,
"logits/rejected": -1.4846515655517578,
"logps/chosen": -500.74542236328125,
"logps/rejected": -625.261474609375,
"loss": 0.4209,
"rewards/accuracies": 0.875,
"rewards/chosen": -1.9553468227386475,
"rewards/margins": 1.7270301580429077,
"rewards/rejected": -3.6823768615722656,
"step": 3580
},
{
"epoch": 0.8613243761996161,
"grad_norm": 53.31282017405371,
"learning_rate": 2.8725880143264992e-08,
"logits/chosen": -1.3193708658218384,
"logits/rejected": -1.3403939008712769,
"logps/chosen": -466.86053466796875,
"logps/rejected": -651.1651611328125,
"loss": 0.4863,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.282682418823242,
"rewards/margins": 1.4440171718597412,
"rewards/rejected": -3.7266998291015625,
"step": 3590
},
{
"epoch": 0.8637236084452975,
"grad_norm": 79.7283378065534,
"learning_rate": 2.775916477857948e-08,
"logits/chosen": -1.2309401035308838,
"logits/rejected": -1.263295292854309,
"logps/chosen": -408.5320739746094,
"logps/rejected": -557.888916015625,
"loss": 0.3998,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -2.013779640197754,
"rewards/margins": 1.4895678758621216,
"rewards/rejected": -3.503347396850586,
"step": 3600
},
{
"epoch": 0.8661228406909789,
"grad_norm": 70.03228325462196,
"learning_rate": 2.680803872553408e-08,
"logits/chosen": -1.3484076261520386,
"logits/rejected": -1.4340890645980835,
"logps/chosen": -430.97967529296875,
"logps/rejected": -752.5364379882812,
"loss": 0.4204,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -1.7587569952011108,
"rewards/margins": 3.472735643386841,
"rewards/rejected": -5.23149299621582,
"step": 3610
},
{
"epoch": 0.8685220729366603,
"grad_norm": 92.81223070149015,
"learning_rate": 2.5872568701842706e-08,
"logits/chosen": -1.3619986772537231,
"logits/rejected": -1.4464303255081177,
"logps/chosen": -365.88458251953125,
"logps/rejected": -585.7950439453125,
"loss": 0.472,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.5396031141281128,
"rewards/margins": 1.9823001623153687,
"rewards/rejected": -3.5219035148620605,
"step": 3620
},
{
"epoch": 0.8709213051823417,
"grad_norm": 75.34177531865268,
"learning_rate": 2.495282032701096e-08,
"logits/chosen": -1.248120665550232,
"logits/rejected": -1.437888264656067,
"logps/chosen": -346.2172546386719,
"logps/rejected": -514.9561767578125,
"loss": 0.4199,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.4829394817352295,
"rewards/margins": 1.9923450946807861,
"rewards/rejected": -3.4752845764160156,
"step": 3630
},
{
"epoch": 0.8733205374280231,
"grad_norm": 50.2253211425811,
"learning_rate": 2.4048858117733133e-08,
"logits/chosen": -1.3424656391143799,
"logits/rejected": -1.4596822261810303,
"logps/chosen": -480.2220153808594,
"logps/rejected": -694.8543701171875,
"loss": 0.3756,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -2.1350576877593994,
"rewards/margins": 2.600602865219116,
"rewards/rejected": -4.735660552978516,
"step": 3640
},
{
"epoch": 0.8757197696737045,
"grad_norm": 62.6157678185224,
"learning_rate": 2.3160745483366938e-08,
"logits/chosen": -1.315459966659546,
"logits/rejected": -1.327807903289795,
"logps/chosen": -441.5907287597656,
"logps/rejected": -655.9127807617188,
"loss": 0.4272,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -2.082794666290283,
"rewards/margins": 1.818223237991333,
"rewards/rejected": -3.901017665863037,
"step": 3650
},
{
"epoch": 0.8781190019193857,
"grad_norm": 77.24793221419046,
"learning_rate": 2.2288544721485197e-08,
"logits/chosen": -1.2426245212554932,
"logits/rejected": -1.2283251285552979,
"logps/chosen": -385.5768127441406,
"logps/rejected": -649.7178344726562,
"loss": 0.4319,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.7174314260482788,
"rewards/margins": 2.4441914558410645,
"rewards/rejected": -4.161623001098633,
"step": 3660
},
{
"epoch": 0.8805182341650671,
"grad_norm": 69.06274330727302,
"learning_rate": 2.1432317013506117e-08,
"logits/chosen": -1.4047738313674927,
"logits/rejected": -1.5020328760147095,
"logps/chosen": -470.654052734375,
"logps/rejected": -579.29345703125,
"loss": 0.4543,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.1058661937713623,
"rewards/margins": 1.4943852424621582,
"rewards/rejected": -3.6002509593963623,
"step": 3670
},
{
"epoch": 0.8829174664107485,
"grad_norm": 93.57949742759158,
"learning_rate": 2.0592122420401704e-08,
"logits/chosen": -1.205072283744812,
"logits/rejected": -1.3298732042312622,
"logps/chosen": -474.8382873535156,
"logps/rejected": -616.9782104492188,
"loss": 0.4672,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.427994966506958,
"rewards/margins": 1.4084312915802002,
"rewards/rejected": -3.83642578125,
"step": 3680
},
{
"epoch": 0.8853166986564299,
"grad_norm": 64.61385860680265,
"learning_rate": 1.976801987848459e-08,
"logits/chosen": -1.3770363330841064,
"logits/rejected": -1.410556435585022,
"logps/chosen": -488.1275939941406,
"logps/rejected": -752.9757690429688,
"loss": 0.4352,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.2426881790161133,
"rewards/margins": 2.5078392028808594,
"rewards/rejected": -4.750527381896973,
"step": 3690
},
{
"epoch": 0.8877159309021113,
"grad_norm": 71.4887011288993,
"learning_rate": 1.8960067195273987e-08,
"logits/chosen": -1.3647260665893555,
"logits/rejected": -1.4772412776947021,
"logps/chosen": -388.358154296875,
"logps/rejected": -628.3886108398438,
"loss": 0.4365,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.7657787799835205,
"rewards/margins": 2.391746997833252,
"rewards/rejected": -4.157525539398193,
"step": 3700
},
{
"epoch": 0.8901151631477927,
"grad_norm": 63.6446466769933,
"learning_rate": 1.816832104544072e-08,
"logits/chosen": -1.2321730852127075,
"logits/rejected": -1.274123191833496,
"logps/chosen": -496.39849853515625,
"logps/rejected": -621.4663696289062,
"loss": 0.3994,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -2.2261962890625,
"rewards/margins": 1.5047309398651123,
"rewards/rejected": -3.7309272289276123,
"step": 3710
},
{
"epoch": 0.8925143953934741,
"grad_norm": 53.35919816815898,
"learning_rate": 1.7392836966831553e-08,
"logits/chosen": -1.2064793109893799,
"logits/rejected": -1.3003342151641846,
"logps/chosen": -481.5274963378906,
"logps/rejected": -657.7899780273438,
"loss": 0.4145,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -2.1689791679382324,
"rewards/margins": 2.0934038162231445,
"rewards/rejected": -4.262383460998535,
"step": 3720
},
{
"epoch": 0.8949136276391555,
"grad_norm": 77.1830594419351,
"learning_rate": 1.663366935657373e-08,
"logits/chosen": -1.3655575513839722,
"logits/rejected": -1.5064128637313843,
"logps/chosen": -393.8701171875,
"logps/rejected": -586.9609375,
"loss": 0.4555,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.6656783819198608,
"rewards/margins": 1.8161319494247437,
"rewards/rejected": -3.4818103313446045,
"step": 3730
},
{
"epoch": 0.8973128598848369,
"grad_norm": 80.81720382253343,
"learning_rate": 1.5890871467258898e-08,
"logits/chosen": -1.1568529605865479,
"logits/rejected": -1.2427462339401245,
"logps/chosen": -553.2216796875,
"logps/rejected": -702.3026123046875,
"loss": 0.4235,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.3667430877685547,
"rewards/margins": 1.8206110000610352,
"rewards/rejected": -4.18735408782959,
"step": 3740
},
{
"epoch": 0.8997120921305183,
"grad_norm": 71.27801287402498,
"learning_rate": 1.5164495403207967e-08,
"logits/chosen": -1.313631534576416,
"logits/rejected": -1.3183560371398926,
"logps/chosen": -476.8546447753906,
"logps/rejected": -734.6409912109375,
"loss": 0.4195,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.2888331413269043,
"rewards/margins": 2.343087673187256,
"rewards/rejected": -4.631920337677002,
"step": 3750
},
{
"epoch": 0.9021113243761996,
"grad_norm": 60.1603274481907,
"learning_rate": 1.4454592116815962e-08,
"logits/chosen": -1.2539043426513672,
"logits/rejected": -1.2726166248321533,
"logps/chosen": -430.16302490234375,
"logps/rejected": -626.4070434570312,
"loss": 0.369,
"rewards/accuracies": 0.875,
"rewards/chosen": -1.85342538356781,
"rewards/margins": 1.881243109703064,
"rewards/rejected": -3.734668254852295,
"step": 3760
},
{
"epoch": 0.904510556621881,
"grad_norm": 56.052739726722386,
"learning_rate": 1.3761211404977934e-08,
"logits/chosen": -1.3558170795440674,
"logits/rejected": -1.3669774532318115,
"logps/chosen": -496.6163635253906,
"logps/rejected": -738.0737915039062,
"loss": 0.3665,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.6606838703155518,
"rewards/margins": 2.5214426517486572,
"rewards/rejected": -5.182126045227051,
"step": 3770
},
{
"epoch": 0.9069097888675623,
"grad_norm": 77.97728054957587,
"learning_rate": 1.3084401905596177e-08,
"logits/chosen": -1.210475206375122,
"logits/rejected": -1.3625085353851318,
"logps/chosen": -500.43927001953125,
"logps/rejected": -618.0930786132812,
"loss": 0.445,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.0452969074249268,
"rewards/margins": 1.7742798328399658,
"rewards/rejected": -3.8195769786834717,
"step": 3780
},
{
"epoch": 0.9093090211132437,
"grad_norm": 89.65951610082931,
"learning_rate": 1.2424211094168053e-08,
"logits/chosen": -1.2439241409301758,
"logits/rejected": -1.3879896402359009,
"logps/chosen": -515.781005859375,
"logps/rejected": -729.7136840820312,
"loss": 0.4271,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -2.0545547008514404,
"rewards/margins": 2.210728168487549,
"rewards/rejected": -4.265283107757568,
"step": 3790
},
{
"epoch": 0.9117082533589251,
"grad_norm": 122.7685693253469,
"learning_rate": 1.1780685280456143e-08,
"logits/chosen": -1.3709090948104858,
"logits/rejected": -1.4039212465286255,
"logps/chosen": -562.2694091796875,
"logps/rejected": -850.7741088867188,
"loss": 0.4475,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.7632789611816406,
"rewards/margins": 2.866482734680176,
"rewards/rejected": -5.629761695861816,
"step": 3800
},
{
"epoch": 0.9141074856046065,
"grad_norm": 63.82642729554356,
"learning_rate": 1.1153869605239564e-08,
"logits/chosen": -1.3386048078536987,
"logits/rejected": -1.4672105312347412,
"logps/chosen": -443.0409240722656,
"logps/rejected": -555.0245971679688,
"loss": 0.4234,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.8594999313354492,
"rewards/margins": 1.5090968608856201,
"rewards/rejected": -3.3685965538024902,
"step": 3810
},
{
"epoch": 0.9165067178502879,
"grad_norm": 55.265923914478826,
"learning_rate": 1.0543808037147606e-08,
"logits/chosen": -1.3850547075271606,
"logits/rejected": -1.3882572650909424,
"logps/chosen": -501.9463806152344,
"logps/rejected": -765.228515625,
"loss": 0.4109,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -2.4593310356140137,
"rewards/margins": 2.621159315109253,
"rewards/rejected": -5.0804901123046875,
"step": 3820
},
{
"epoch": 0.9189059500959693,
"grad_norm": 41.44845350613424,
"learning_rate": 9.95054336957557e-09,
"logits/chosen": -1.3671766519546509,
"logits/rejected": -1.353990912437439,
"logps/chosen": -438.7281188964844,
"logps/rejected": -604.5745239257812,
"loss": 0.3935,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.9596534967422485,
"rewards/margins": 1.6154146194458008,
"rewards/rejected": -3.575068235397339,
"step": 3830
},
{
"epoch": 0.9213051823416507,
"grad_norm": 83.64590537901655,
"learning_rate": 9.37411721768286e-09,
"logits/chosen": -1.4458494186401367,
"logits/rejected": -1.469053030014038,
"logps/chosen": -509.1304626464844,
"logps/rejected": -748.7863159179688,
"loss": 0.421,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.341409921646118,
"rewards/margins": 2.1367950439453125,
"rewards/rejected": -4.47820520401001,
"step": 3840
},
{
"epoch": 0.9237044145873321,
"grad_norm": 57.94036452379268,
"learning_rate": 8.81457001547392e-09,
"logits/chosen": -1.2755274772644043,
"logits/rejected": -1.2877039909362793,
"logps/chosen": -447.0801696777344,
"logps/rejected": -612.2420043945312,
"loss": 0.4167,
"rewards/accuracies": 0.875,
"rewards/chosen": -2.0958006381988525,
"rewards/margins": 1.5613038539886475,
"rewards/rejected": -3.6571044921875,
"step": 3850
},
{
"epoch": 0.9261036468330134,
"grad_norm": 59.794827391985145,
"learning_rate": 8.271941012961942e-09,
"logits/chosen": -1.30680251121521,
"logits/rejected": -1.2633702754974365,
"logps/chosen": -441.3211364746094,
"logps/rejected": -780.7581787109375,
"loss": 0.3915,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -2.2775466442108154,
"rewards/margins": 2.8263659477233887,
"rewards/rejected": -5.103912353515625,
"step": 3860
},
{
"epoch": 0.9285028790786948,
"grad_norm": 90.04034076724625,
"learning_rate": 7.746268273415568e-09,
"logits/chosen": -1.44291090965271,
"logits/rejected": -1.3620684146881104,
"logps/chosen": -457.104248046875,
"logps/rejected": -608.0975341796875,
"loss": 0.426,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.997802495956421,
"rewards/margins": 1.0049917697906494,
"rewards/rejected": -3.002794027328491,
"step": 3870
},
{
"epoch": 0.9309021113243762,
"grad_norm": 56.08532487513005,
"learning_rate": 7.237588670689076e-09,
"logits/chosen": -1.2944238185882568,
"logits/rejected": -1.4328762292861938,
"logps/chosen": -487.39019775390625,
"logps/rejected": -697.9758911132812,
"loss": 0.4261,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.2080299854278564,
"rewards/margins": 2.519274950027466,
"rewards/rejected": -4.727304458618164,
"step": 3880
},
{
"epoch": 0.9333013435700576,
"grad_norm": 89.11995129718771,
"learning_rate": 6.745937886635606e-09,
"logits/chosen": -1.389968752861023,
"logits/rejected": -1.4157402515411377,
"logps/chosen": -507.00897216796875,
"logps/rejected": -853.0285034179688,
"loss": 0.4189,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -2.2686798572540283,
"rewards/margins": 3.43501615524292,
"rewards/rejected": -5.703696250915527,
"step": 3890
},
{
"epoch": 0.935700575815739,
"grad_norm": 91.52948017854044,
"learning_rate": 6.271350408604409e-09,
"logits/chosen": -1.4022436141967773,
"logits/rejected": -1.406585454940796,
"logps/chosen": -399.2693786621094,
"logps/rejected": -612.744873046875,
"loss": 0.4141,
"rewards/accuracies": 0.875,
"rewards/chosen": -1.8768850564956665,
"rewards/margins": 1.962498664855957,
"rewards/rejected": -3.839383602142334,
"step": 3900
},
{
"epoch": 0.9380998080614203,
"grad_norm": 52.039893504735,
"learning_rate": 5.813859527021487e-09,
"logits/chosen": -1.3069987297058105,
"logits/rejected": -1.3816003799438477,
"logps/chosen": -500.3255920410156,
"logps/rejected": -697.2518310546875,
"loss": 0.3715,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -2.350861072540283,
"rewards/margins": 2.23738956451416,
"rewards/rejected": -4.588251113891602,
"step": 3910
},
{
"epoch": 0.9404990403071017,
"grad_norm": 49.34387167178098,
"learning_rate": 5.373497333054616e-09,
"logits/chosen": -1.3901867866516113,
"logits/rejected": -1.4163486957550049,
"logps/chosen": -488.06903076171875,
"logps/rejected": -603.2286987304688,
"loss": 0.4687,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.2455077171325684,
"rewards/margins": 1.2376153469085693,
"rewards/rejected": -3.4831230640411377,
"step": 3920
},
{
"epoch": 0.9428982725527831,
"grad_norm": 59.69376404826322,
"learning_rate": 4.950294716362213e-09,
"logits/chosen": -1.3077560663223267,
"logits/rejected": -1.4218385219573975,
"logps/chosen": -501.00457763671875,
"logps/rejected": -623.9380493164062,
"loss": 0.4254,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -2.2068111896514893,
"rewards/margins": 1.2557148933410645,
"rewards/rejected": -3.4625256061553955,
"step": 3930
},
{
"epoch": 0.9452975047984645,
"grad_norm": 63.61485606948048,
"learning_rate": 4.544281362926422e-09,
"logits/chosen": -1.2970178127288818,
"logits/rejected": -1.3176815509796143,
"logps/chosen": -494.03515625,
"logps/rejected": -667.9765625,
"loss": 0.4567,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.9948374032974243,
"rewards/margins": 1.8618943691253662,
"rewards/rejected": -3.85673189163208,
"step": 3940
},
{
"epoch": 0.9476967370441459,
"grad_norm": 40.97335214555011,
"learning_rate": 4.15548575297095e-09,
"logits/chosen": -1.234287142753601,
"logits/rejected": -1.3237779140472412,
"logps/chosen": -482.68072509765625,
"logps/rejected": -734.9530639648438,
"loss": 0.3526,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -2.3361096382141113,
"rewards/margins": 2.631274938583374,
"rewards/rejected": -4.967384338378906,
"step": 3950
},
{
"epoch": 0.9500959692898272,
"grad_norm": 43.454652393867875,
"learning_rate": 3.7839351589631366e-09,
"logits/chosen": -1.3561676740646362,
"logits/rejected": -1.1930186748504639,
"logps/chosen": -454.8728942871094,
"logps/rejected": -687.5897827148438,
"loss": 0.4226,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -2.5094499588012695,
"rewards/margins": 1.905800461769104,
"rewards/rejected": -4.415250778198242,
"step": 3960
},
{
"epoch": 0.9524952015355086,
"grad_norm": 81.2377732744721,
"learning_rate": 3.4296556437010405e-09,
"logits/chosen": -1.3651882410049438,
"logits/rejected": -1.383840799331665,
"logps/chosen": -446.6869201660156,
"logps/rejected": -608.2105712890625,
"loss": 0.4052,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.4141621589660645,
"rewards/margins": 1.6848506927490234,
"rewards/rejected": -4.099012851715088,
"step": 3970
},
{
"epoch": 0.95489443378119,
"grad_norm": 54.227114337789175,
"learning_rate": 3.092672058485124e-09,
"logits/chosen": -1.5264198780059814,
"logits/rejected": -1.4988195896148682,
"logps/chosen": -509.2149353027344,
"logps/rejected": -768.14404296875,
"loss": 0.426,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.823775053024292,
"rewards/margins": 2.4652981758117676,
"rewards/rejected": -5.289073944091797,
"step": 3980
},
{
"epoch": 0.9572936660268714,
"grad_norm": 78.81638905697392,
"learning_rate": 2.7730080413750356e-09,
"logits/chosen": -1.2725681066513062,
"logits/rejected": -1.3801984786987305,
"logps/chosen": -467.46343994140625,
"logps/rejected": -614.0919189453125,
"loss": 0.4265,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -2.0063366889953613,
"rewards/margins": 1.4813331365585327,
"rewards/rejected": -3.4876697063446045,
"step": 3990
},
{
"epoch": 0.9596928982725528,
"grad_norm": 59.40509449378525,
"learning_rate": 2.4706860155316033e-09,
"logits/chosen": -1.28085196018219,
"logits/rejected": -1.3779548406600952,
"logps/chosen": -599.7765502929688,
"logps/rejected": -758.8671875,
"loss": 0.4413,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -2.598813533782959,
"rewards/margins": 1.6893723011016846,
"rewards/rejected": -4.288186073303223,
"step": 4000
},
{
"epoch": 0.9620921305182342,
"grad_norm": 73.30482756852622,
"learning_rate": 2.185727187643843e-09,
"logits/chosen": -1.2994678020477295,
"logits/rejected": -1.3173738718032837,
"logps/chosen": -446.7986755371094,
"logps/rejected": -717.6575317382812,
"loss": 0.4822,
"rewards/accuracies": 0.875,
"rewards/chosen": -2.304640531539917,
"rewards/margins": 2.731722593307495,
"rewards/rejected": -5.036363124847412,
"step": 4010
},
{
"epoch": 0.9644913627639156,
"grad_norm": 93.67770845413142,
"learning_rate": 1.9181515464413434e-09,
"logits/chosen": -1.1958959102630615,
"logits/rejected": -1.2263587713241577,
"logps/chosen": -557.4002685546875,
"logps/rejected": -799.4276733398438,
"loss": 0.4172,
"rewards/accuracies": 0.875,
"rewards/chosen": -2.232234477996826,
"rewards/margins": 2.496126890182495,
"rewards/rejected": -4.7283616065979,
"step": 4020
},
{
"epoch": 0.966890595009597,
"grad_norm": 53.26258013949856,
"learning_rate": 1.6679778612923302e-09,
"logits/chosen": -1.3231043815612793,
"logits/rejected": -1.475568413734436,
"logps/chosen": -514.719482421875,
"logps/rejected": -669.9471435546875,
"loss": 0.4052,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.4012928009033203,
"rewards/margins": 1.4460484981536865,
"rewards/rejected": -3.8473410606384277,
"step": 4030
},
{
"epoch": 0.9692898272552783,
"grad_norm": 72.94634856988112,
"learning_rate": 1.43522368088686e-09,
"logits/chosen": -1.3796308040618896,
"logits/rejected": -1.4548887014389038,
"logps/chosen": -524.4054565429688,
"logps/rejected": -795.0631713867188,
"loss": 0.4778,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.7115061283111572,
"rewards/margins": 2.742997646331787,
"rewards/rejected": -5.454503536224365,
"step": 4040
},
{
"epoch": 0.9716890595009597,
"grad_norm": 83.28511968100013,
"learning_rate": 1.2199053320059993e-09,
"logits/chosen": -1.3421388864517212,
"logits/rejected": -1.3489320278167725,
"logps/chosen": -483.73052978515625,
"logps/rejected": -671.6555786132812,
"loss": 0.4179,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.17889666557312,
"rewards/margins": 1.7280422449111938,
"rewards/rejected": -3.9069390296936035,
"step": 4050
},
{
"epoch": 0.974088291746641,
"grad_norm": 65.34944649763949,
"learning_rate": 1.0220379183764338e-09,
"logits/chosen": -1.3666952848434448,
"logits/rejected": -1.358161449432373,
"logps/chosen": -379.8059387207031,
"logps/rejected": -614.4222412109375,
"loss": 0.3993,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.6943544149398804,
"rewards/margins": 2.2639663219451904,
"rewards/rejected": -3.9583206176757812,
"step": 4060
},
{
"epoch": 0.9764875239923224,
"grad_norm": 70.1049047850288,
"learning_rate": 8.416353196111503e-10,
"logits/chosen": -1.375160813331604,
"logits/rejected": -1.340598225593567,
"logps/chosen": -499.16876220703125,
"logps/rejected": -708.440185546875,
"loss": 0.4537,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -2.6097168922424316,
"rewards/margins": 2.180154323577881,
"rewards/rejected": -4.789872169494629,
"step": 4070
},
{
"epoch": 0.9788867562380038,
"grad_norm": 66.82721727412559,
"learning_rate": 6.787101902356873e-10,
"logits/chosen": -1.4533971548080444,
"logits/rejected": -1.4280126094818115,
"logps/chosen": -495.2933654785156,
"logps/rejected": -711.2422485351562,
"loss": 0.4494,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -2.3556466102600098,
"rewards/margins": 1.9516353607177734,
"rewards/rejected": -4.307281970977783,
"step": 4080
},
{
"epoch": 0.9812859884836852,
"grad_norm": 101.18233866719163,
"learning_rate": 5.332739588005953e-10,
"logits/chosen": -1.381775975227356,
"logits/rejected": -1.4199464321136475,
"logps/chosen": -389.28411865234375,
"logps/rejected": -655.2838134765625,
"loss": 0.4249,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.8873682022094727,
"rewards/margins": 2.4491372108459473,
"rewards/rejected": -4.336504936218262,
"step": 4090
},
{
"epoch": 0.9836852207293666,
"grad_norm": 76.0835459397726,
"learning_rate": 4.053368270797164e-10,
"logits/chosen": -1.371535062789917,
"logits/rejected": -1.3988001346588135,
"logps/chosen": -473.8731994628906,
"logps/rejected": -699.0272827148438,
"loss": 0.401,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.4284753799438477,
"rewards/margins": 2.310962677001953,
"rewards/rejected": -4.739438056945801,
"step": 4100
},
{
"epoch": 0.986084452975048,
"grad_norm": 41.658180713138364,
"learning_rate": 2.949077693545354e-10,
"logits/chosen": -1.3081371784210205,
"logits/rejected": -1.4109846353530884,
"logps/chosen": -503.1234436035156,
"logps/rejected": -697.5470581054688,
"loss": 0.4703,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -2.3372557163238525,
"rewards/margins": 1.6467803716659546,
"rewards/rejected": -3.9840362071990967,
"step": 4110
},
{
"epoch": 0.9884836852207294,
"grad_norm": 53.16215503139034,
"learning_rate": 2.0199453178471047e-10,
"logits/chosen": -1.2432799339294434,
"logits/rejected": -1.3958756923675537,
"logps/chosen": -532.030029296875,
"logps/rejected": -602.0155639648438,
"loss": 0.4235,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -2.2111809253692627,
"rewards/margins": 1.1188135147094727,
"rewards/rejected": -3.3299942016601562,
"step": 4120
},
{
"epoch": 0.9908829174664108,
"grad_norm": 62.61946131554966,
"learning_rate": 1.266036318647301e-10,
"logits/chosen": -1.373792052268982,
"logits/rejected": -1.4291802644729614,
"logps/chosen": -555.7026977539062,
"logps/rejected": -774.6337890625,
"loss": 0.4632,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -2.6357736587524414,
"rewards/margins": 2.4022860527038574,
"rewards/rejected": -5.038060188293457,
"step": 4130
},
{
"epoch": 0.9932821497120922,
"grad_norm": 76.41090561563904,
"learning_rate": 6.874035796672339e-11,
"logits/chosen": -1.2975847721099854,
"logits/rejected": -1.378894329071045,
"logps/chosen": -503.6324157714844,
"logps/rejected": -720.40576171875,
"loss": 0.4241,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -2.0853748321533203,
"rewards/margins": 2.7113640308380127,
"rewards/rejected": -4.796738624572754,
"step": 4140
},
{
"epoch": 0.9956813819577736,
"grad_norm": 76.34576359226114,
"learning_rate": 2.8408768969423458e-11,
"logits/chosen": -1.3285682201385498,
"logits/rejected": -1.330243706703186,
"logps/chosen": -468.9698181152344,
"logps/rejected": -653.3128051757812,
"loss": 0.4075,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -2.0265982151031494,
"rewards/margins": 1.8034579753875732,
"rewards/rejected": -3.8300559520721436,
"step": 4150
},
{
"epoch": 0.9980806142034548,
"grad_norm": 66.48623115115576,
"learning_rate": 5.611693973617271e-12,
"logits/chosen": -1.3855040073394775,
"logits/rejected": -1.3751308917999268,
"logps/chosen": -445.00225830078125,
"logps/rejected": -637.5364990234375,
"loss": 0.4574,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -2.166611909866333,
"rewards/margins": 1.7966630458831787,
"rewards/rejected": -3.9632747173309326,
"step": 4160
},
{
"epoch": 1.0,
"step": 4168,
"total_flos": 0.0,
"train_loss": 0.4822349088434523,
"train_runtime": 13347.0673,
"train_samples_per_second": 9.992,
"train_steps_per_second": 0.312
}
],
"logging_steps": 10,
"max_steps": 4168,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}