simonycl's picture
Upload folder using huggingface_hub
8516279 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9984168865435357,
"eval_steps": 400,
"global_step": 473,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0021108179419525065,
"grad_norm": 3.792602400172418,
"learning_rate": 1.0416666666666666e-08,
"logits/chosen": -0.723710298538208,
"logits/rejected": -1.1678439378738403,
"logps/chosen": -266.5860900878906,
"logps/rejected": -246.2262420654297,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.010554089709762533,
"grad_norm": 5.35027261694182,
"learning_rate": 5.208333333333333e-08,
"logits/chosen": -0.6524915099143982,
"logits/rejected": -0.9277956485748291,
"logps/chosen": -282.5875549316406,
"logps/rejected": -269.2027893066406,
"loss": 0.6933,
"rewards/accuracies": 0.3828125,
"rewards/chosen": 0.000355295545887202,
"rewards/margins": -0.00032308147638104856,
"rewards/rejected": 0.000678377109579742,
"step": 5
},
{
"epoch": 0.021108179419525065,
"grad_norm": 5.266933872220353,
"learning_rate": 1.0416666666666667e-07,
"logits/chosen": -0.6941147446632385,
"logits/rejected": -1.03800368309021,
"logps/chosen": -290.0839538574219,
"logps/rejected": -274.08502197265625,
"loss": 0.6931,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": 0.0008805571123957634,
"rewards/margins": -0.0002368297427892685,
"rewards/rejected": 0.00111738673876971,
"step": 10
},
{
"epoch": 0.0316622691292876,
"grad_norm": 4.4222736963146785,
"learning_rate": 1.5624999999999999e-07,
"logits/chosen": -0.6915597319602966,
"logits/rejected": -1.0270450115203857,
"logps/chosen": -286.4000549316406,
"logps/rejected": -268.19305419921875,
"loss": 0.6931,
"rewards/accuracies": 0.46875,
"rewards/chosen": 0.0015847303438931704,
"rewards/margins": -0.00021869130432605743,
"rewards/rejected": 0.0018034216482192278,
"step": 15
},
{
"epoch": 0.04221635883905013,
"grad_norm": 4.370999160332841,
"learning_rate": 2.0833333333333333e-07,
"logits/chosen": -0.6628856658935547,
"logits/rejected": -1.0627143383026123,
"logps/chosen": -281.633056640625,
"logps/rejected": -258.80975341796875,
"loss": 0.6928,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": 0.004043369088321924,
"rewards/margins": 0.0007513560703955591,
"rewards/rejected": 0.0032920129597187042,
"step": 20
},
{
"epoch": 0.052770448548812667,
"grad_norm": 4.295540874340828,
"learning_rate": 2.604166666666667e-07,
"logits/chosen": -0.6402955651283264,
"logits/rejected": -0.9882392883300781,
"logps/chosen": -303.6094055175781,
"logps/rejected": -278.68792724609375,
"loss": 0.6921,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.012096477672457695,
"rewards/margins": 0.002340012462809682,
"rewards/rejected": 0.009756465442478657,
"step": 25
},
{
"epoch": 0.0633245382585752,
"grad_norm": 4.480110631795238,
"learning_rate": 3.1249999999999997e-07,
"logits/chosen": -0.6986342668533325,
"logits/rejected": -1.0124592781066895,
"logps/chosen": -277.3695983886719,
"logps/rejected": -256.33648681640625,
"loss": 0.6908,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": 0.019197864457964897,
"rewards/margins": 0.006392383016645908,
"rewards/rejected": 0.01280547957867384,
"step": 30
},
{
"epoch": 0.07387862796833773,
"grad_norm": 4.572546926633594,
"learning_rate": 3.645833333333333e-07,
"logits/chosen": -0.7217592597007751,
"logits/rejected": -0.9826194047927856,
"logps/chosen": -276.353515625,
"logps/rejected": -269.84747314453125,
"loss": 0.6889,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 0.03310415893793106,
"rewards/margins": 0.008944300934672356,
"rewards/rejected": 0.024159858003258705,
"step": 35
},
{
"epoch": 0.08443271767810026,
"grad_norm": 3.950940685241822,
"learning_rate": 4.1666666666666667e-07,
"logits/chosen": -0.6703137755393982,
"logits/rejected": -1.0556083917617798,
"logps/chosen": -277.72515869140625,
"logps/rejected": -255.3736572265625,
"loss": 0.6856,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.044867198914289474,
"rewards/margins": 0.01742670312523842,
"rewards/rejected": 0.027440497651696205,
"step": 40
},
{
"epoch": 0.09498680738786279,
"grad_norm": 4.408045626085674,
"learning_rate": 4.6874999999999996e-07,
"logits/chosen": -0.7604807615280151,
"logits/rejected": -1.0656068325042725,
"logps/chosen": -283.796142578125,
"logps/rejected": -269.21075439453125,
"loss": 0.6824,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.055293601006269455,
"rewards/margins": 0.017781417816877365,
"rewards/rejected": 0.03751217946410179,
"step": 45
},
{
"epoch": 0.10554089709762533,
"grad_norm": 4.594023555859445,
"learning_rate": 4.999726797933858e-07,
"logits/chosen": -0.7825593948364258,
"logits/rejected": -1.0136535167694092,
"logps/chosen": -268.57232666015625,
"logps/rejected": -254.4635772705078,
"loss": 0.6786,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": 0.04131672903895378,
"rewards/margins": 0.02473551593720913,
"rewards/rejected": 0.016581213101744652,
"step": 50
},
{
"epoch": 0.11609498680738786,
"grad_norm": 4.732128821227025,
"learning_rate": 4.99665396039775e-07,
"logits/chosen": -0.8582944869995117,
"logits/rejected": -1.092308759689331,
"logps/chosen": -272.50872802734375,
"logps/rejected": -269.22015380859375,
"loss": 0.6711,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": 0.015134250745177269,
"rewards/margins": 0.03893275931477547,
"rewards/rejected": -0.02379850670695305,
"step": 55
},
{
"epoch": 0.1266490765171504,
"grad_norm": 5.480552136086532,
"learning_rate": 4.99017099386437e-07,
"logits/chosen": -0.9315390586853027,
"logits/rejected": -1.1771332025527954,
"logps/chosen": -278.89837646484375,
"logps/rejected": -268.14080810546875,
"loss": 0.6679,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.014189760200679302,
"rewards/margins": 0.06192191690206528,
"rewards/rejected": -0.07611168175935745,
"step": 60
},
{
"epoch": 0.13720316622691292,
"grad_norm": 5.176626164434011,
"learning_rate": 4.980286753286194e-07,
"logits/chosen": -0.8333457708358765,
"logits/rejected": -1.3162130117416382,
"logps/chosen": -288.89825439453125,
"logps/rejected": -264.5441589355469,
"loss": 0.6667,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.045755136758089066,
"rewards/margins": 0.08817130327224731,
"rewards/rejected": -0.13392645120620728,
"step": 65
},
{
"epoch": 0.14775725593667546,
"grad_norm": 5.725175266189831,
"learning_rate": 4.967014739346915e-07,
"logits/chosen": -0.9382959604263306,
"logits/rejected": -1.3034207820892334,
"logps/chosen": -273.29193115234375,
"logps/rejected": -274.21929931640625,
"loss": 0.6606,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.05725777894258499,
"rewards/margins": 0.08167224377393723,
"rewards/rejected": -0.13892999291419983,
"step": 70
},
{
"epoch": 0.158311345646438,
"grad_norm": 5.9050273856078395,
"learning_rate": 4.950373080021136e-07,
"logits/chosen": -1.0476350784301758,
"logits/rejected": -1.337590217590332,
"logps/chosen": -292.19378662109375,
"logps/rejected": -282.83001708984375,
"loss": 0.6585,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.07018107920885086,
"rewards/margins": 0.08405766636133194,
"rewards/rejected": -0.1542387306690216,
"step": 75
},
{
"epoch": 0.16886543535620052,
"grad_norm": 5.714632118731764,
"learning_rate": 4.930384505813737e-07,
"logits/chosen": -0.9645854830741882,
"logits/rejected": -1.3480749130249023,
"logps/chosen": -290.5950012207031,
"logps/rejected": -275.71417236328125,
"loss": 0.6617,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.11630520969629288,
"rewards/margins": 0.08103077113628387,
"rewards/rejected": -0.19733598828315735,
"step": 80
},
{
"epoch": 0.17941952506596306,
"grad_norm": 6.048274761863404,
"learning_rate": 4.907076318712738e-07,
"logits/chosen": -1.0770204067230225,
"logits/rejected": -1.342997431755066,
"logps/chosen": -301.7802734375,
"logps/rejected": -287.3224792480469,
"loss": 0.6561,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.13322284817695618,
"rewards/margins": 0.07080608606338501,
"rewards/rejected": -0.2040289342403412,
"step": 85
},
{
"epoch": 0.18997361477572558,
"grad_norm": 5.616972735220456,
"learning_rate": 4.88048035489807e-07,
"logits/chosen": -1.0288609266281128,
"logits/rejected": -1.537954568862915,
"logps/chosen": -303.514892578125,
"logps/rejected": -282.09832763671875,
"loss": 0.6458,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -0.12775097787380219,
"rewards/margins": 0.11901189386844635,
"rewards/rejected": -0.24676287174224854,
"step": 90
},
{
"epoch": 0.20052770448548812,
"grad_norm": 6.041190762428844,
"learning_rate": 4.85063294125718e-07,
"logits/chosen": -1.1466128826141357,
"logits/rejected": -1.4186201095581055,
"logps/chosen": -323.9360046386719,
"logps/rejected": -326.41461181640625,
"loss": 0.6493,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.18756112456321716,
"rewards/margins": 0.12050308287143707,
"rewards/rejected": -0.3080642521381378,
"step": 95
},
{
"epoch": 0.21108179419525067,
"grad_norm": 7.792002911640772,
"learning_rate": 4.817574845766874e-07,
"logits/chosen": -1.1385769844055176,
"logits/rejected": -1.4923776388168335,
"logps/chosen": -314.1307373046875,
"logps/rejected": -307.49102783203125,
"loss": 0.6441,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.26007553935050964,
"rewards/margins": 0.1371382772922516,
"rewards/rejected": -0.397213876247406,
"step": 100
},
{
"epoch": 0.22163588390501318,
"grad_norm": 6.885087311095594,
"learning_rate": 4.781351221809166e-07,
"logits/chosen": -1.1828514337539673,
"logits/rejected": -1.624103307723999,
"logps/chosen": -304.28204345703125,
"logps/rejected": -294.31048583984375,
"loss": 0.6373,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.30514588952064514,
"rewards/margins": 0.1688612401485443,
"rewards/rejected": -0.47400718927383423,
"step": 105
},
{
"epoch": 0.23218997361477572,
"grad_norm": 8.481883842604432,
"learning_rate": 4.742011546497182e-07,
"logits/chosen": -1.212425947189331,
"logits/rejected": -1.3756533861160278,
"logps/chosen": -313.9586486816406,
"logps/rejected": -320.29425048828125,
"loss": 0.6538,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.30393490195274353,
"rewards/margins": 0.1464935690164566,
"rewards/rejected": -0.45042848587036133,
"step": 110
},
{
"epoch": 0.24274406332453827,
"grad_norm": 7.149769163847217,
"learning_rate": 4.6996095530953875e-07,
"logits/chosen": -1.2339892387390137,
"logits/rejected": -1.58319890499115,
"logps/chosen": -315.6721496582031,
"logps/rejected": -308.2062072753906,
"loss": 0.6291,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.32919472455978394,
"rewards/margins": 0.1386784464120865,
"rewards/rejected": -0.4678731858730316,
"step": 115
},
{
"epoch": 0.2532981530343008,
"grad_norm": 7.759815340386084,
"learning_rate": 4.654203157626399e-07,
"logits/chosen": -1.2471096515655518,
"logits/rejected": -1.6236129999160767,
"logps/chosen": -341.6539611816406,
"logps/rejected": -330.80926513671875,
"loss": 0.6335,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -0.4439183175563812,
"rewards/margins": 0.12948934733867645,
"rewards/rejected": -0.5734077095985413,
"step": 120
},
{
"epoch": 0.2638522427440633,
"grad_norm": 8.303750659351337,
"learning_rate": 4.605854379764673e-07,
"logits/chosen": -1.2065553665161133,
"logits/rejected": -1.5575497150421143,
"logps/chosen": -347.19696044921875,
"logps/rejected": -339.4477233886719,
"loss": 0.63,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.4391602873802185,
"rewards/margins": 0.14842209219932556,
"rewards/rejected": -0.5875824093818665,
"step": 125
},
{
"epoch": 0.27440633245382584,
"grad_norm": 7.626112760961139,
"learning_rate": 4.5546292581250857e-07,
"logits/chosen": -1.1812589168548584,
"logits/rejected": -1.513511300086975,
"logps/chosen": -325.56005859375,
"logps/rejected": -315.3307800292969,
"loss": 0.6305,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.5028723478317261,
"rewards/margins": 0.12545283138751984,
"rewards/rejected": -0.6283251643180847,
"step": 130
},
{
"epoch": 0.2849604221635884,
"grad_norm": 8.681810962953072,
"learning_rate": 4.5005977600621275e-07,
"logits/chosen": -1.33579683303833,
"logits/rejected": -1.586660623550415,
"logps/chosen": -343.98089599609375,
"logps/rejected": -351.74066162109375,
"loss": 0.631,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.5469980835914612,
"rewards/margins": 0.19922946393489838,
"rewards/rejected": -0.7462274432182312,
"step": 135
},
{
"epoch": 0.2955145118733509,
"grad_norm": 9.263751197369732,
"learning_rate": 4.443833686102919e-07,
"logits/chosen": -1.4017233848571777,
"logits/rejected": -1.7090505361557007,
"logps/chosen": -355.2716369628906,
"logps/rejected": -371.23492431640625,
"loss": 0.6335,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -0.667505145072937,
"rewards/margins": 0.2195053994655609,
"rewards/rejected": -0.8870105743408203,
"step": 140
},
{
"epoch": 0.30606860158311344,
"grad_norm": 8.944976382840098,
"learning_rate": 4.384414569144561e-07,
"logits/chosen": -1.3571860790252686,
"logits/rejected": -1.624506950378418,
"logps/chosen": -356.50885009765625,
"logps/rejected": -361.44512939453125,
"loss": 0.6242,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.723587691783905,
"rewards/margins": 0.22243139147758484,
"rewards/rejected": -0.9460189938545227,
"step": 145
},
{
"epoch": 0.316622691292876,
"grad_norm": 9.048728108809618,
"learning_rate": 4.3224215685535287e-07,
"logits/chosen": -1.2304835319519043,
"logits/rejected": -1.607114553451538,
"logps/chosen": -340.3996887207031,
"logps/rejected": -343.8750915527344,
"loss": 0.6193,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.5864183902740479,
"rewards/margins": 0.2611897587776184,
"rewards/rejected": -0.8476082682609558,
"step": 150
},
{
"epoch": 0.32717678100263853,
"grad_norm": 10.012310357130646,
"learning_rate": 4.2579393593117364e-07,
"logits/chosen": -1.3340481519699097,
"logits/rejected": -1.707767128944397,
"logps/chosen": -366.13104248046875,
"logps/rejected": -364.83026123046875,
"loss": 0.6204,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.7475859522819519,
"rewards/margins": 0.2101312130689621,
"rewards/rejected": -0.9577171206474304,
"step": 155
},
{
"epoch": 0.33773087071240104,
"grad_norm": 9.68044164663275,
"learning_rate": 4.191056016360699e-07,
"logits/chosen": -1.394718050956726,
"logits/rejected": -1.6881500482559204,
"logps/chosen": -368.72381591796875,
"logps/rejected": -381.956298828125,
"loss": 0.6135,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.8789850473403931,
"rewards/margins": 0.3012150526046753,
"rewards/rejected": -1.1802000999450684,
"step": 160
},
{
"epoch": 0.3482849604221636,
"grad_norm": 10.276456210059177,
"learning_rate": 4.121862894301754e-07,
"logits/chosen": -1.3367292881011963,
"logits/rejected": -1.7920604944229126,
"logps/chosen": -379.0816650390625,
"logps/rejected": -372.62432861328125,
"loss": 0.6186,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.8941882252693176,
"rewards/margins": 0.2552604675292969,
"rewards/rejected": -1.1494486331939697,
"step": 165
},
{
"epoch": 0.35883905013192613,
"grad_norm": 10.349641550261767,
"learning_rate": 4.050454502616667e-07,
"logits/chosen": -1.3888546228408813,
"logits/rejected": -1.7364885807037354,
"logps/chosen": -375.4383239746094,
"logps/rejected": -369.5252685546875,
"loss": 0.6183,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.8307794332504272,
"rewards/margins": 0.19674496352672577,
"rewards/rejected": -1.027524471282959,
"step": 170
},
{
"epoch": 0.36939313984168864,
"grad_norm": 10.29658804390271,
"learning_rate": 3.976928376579047e-07,
"logits/chosen": -1.4784464836120605,
"logits/rejected": -1.8144117593765259,
"logps/chosen": -355.7376708984375,
"logps/rejected": -354.1457824707031,
"loss": 0.6153,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.8430948257446289,
"rewards/margins": 0.21338331699371338,
"rewards/rejected": -1.0564781427383423,
"step": 175
},
{
"epoch": 0.37994722955145116,
"grad_norm": 20.628198563240826,
"learning_rate": 3.9013849440328945e-07,
"logits/chosen": -1.3779172897338867,
"logits/rejected": -1.7602001428604126,
"logps/chosen": -353.769287109375,
"logps/rejected": -358.7577209472656,
"loss": 0.6204,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -0.8876091837882996,
"rewards/margins": 0.22880685329437256,
"rewards/rejected": -1.1164162158966064,
"step": 180
},
{
"epoch": 0.39050131926121373,
"grad_norm": 10.868907026626026,
"learning_rate": 3.8239273882202473e-07,
"logits/chosen": -1.439247488975525,
"logits/rejected": -1.8125137090682983,
"logps/chosen": -412.8868103027344,
"logps/rejected": -431.59063720703125,
"loss": 0.6016,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -1.1018221378326416,
"rewards/margins": 0.4088074564933777,
"rewards/rejected": -1.5106297731399536,
"step": 185
},
{
"epoch": 0.40105540897097625,
"grad_norm": 10.784941413981636,
"learning_rate": 3.7446615068452804e-07,
"logits/chosen": -1.4441838264465332,
"logits/rejected": -1.7783229351043701,
"logps/chosen": -398.41009521484375,
"logps/rejected": -396.8212890625,
"loss": 0.594,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -1.115227222442627,
"rewards/margins": 0.2704046070575714,
"rewards/rejected": -1.3856319189071655,
"step": 190
},
{
"epoch": 0.41160949868073876,
"grad_norm": 10.229960177651233,
"learning_rate": 3.6636955675673743e-07,
"logits/chosen": -1.5908405780792236,
"logits/rejected": -1.9355300664901733,
"logps/chosen": -426.3243713378906,
"logps/rejected": -420.7511291503906,
"loss": 0.5957,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.1847457885742188,
"rewards/margins": 0.32065972685813904,
"rewards/rejected": -1.5054056644439697,
"step": 195
},
{
"epoch": 0.42216358839050133,
"grad_norm": 18.20685869729302,
"learning_rate": 3.5811401601205093e-07,
"logits/chosen": -1.6325582265853882,
"logits/rejected": -1.8879244327545166,
"logps/chosen": -426.10943603515625,
"logps/rejected": -426.29376220703125,
"loss": 0.6339,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -1.3495900630950928,
"rewards/margins": 0.15765051543712616,
"rewards/rejected": -1.507240653038025,
"step": 200
},
{
"epoch": 0.43271767810026385,
"grad_norm": 10.716178488233457,
"learning_rate": 3.497108045260995e-07,
"logits/chosen": -1.6447012424468994,
"logits/rejected": -1.9266440868377686,
"logps/chosen": -422.4698181152344,
"logps/rejected": -423.3296813964844,
"loss": 0.6095,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -1.2483638525009155,
"rewards/margins": 0.21740670502185822,
"rewards/rejected": -1.4657707214355469,
"step": 205
},
{
"epoch": 0.44327176781002636,
"grad_norm": 9.319577970375986,
"learning_rate": 3.411714000749838e-07,
"logits/chosen": -1.5758410692214966,
"logits/rejected": -1.9720706939697266,
"logps/chosen": -413.7496032714844,
"logps/rejected": -432.4217834472656,
"loss": 0.5971,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.2690123319625854,
"rewards/margins": 0.31965065002441406,
"rewards/rejected": -1.58866286277771,
"step": 210
},
{
"epoch": 0.45382585751978893,
"grad_norm": 18.334377917058617,
"learning_rate": 3.3250746645801287e-07,
"logits/chosen": -1.6151403188705444,
"logits/rejected": -1.9621028900146484,
"logps/chosen": -431.717529296875,
"logps/rejected": -438.23095703125,
"loss": 0.5914,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.601030945777893,
"rewards/margins": 0.29736214876174927,
"rewards/rejected": -1.8983930349349976,
"step": 215
},
{
"epoch": 0.46437994722955145,
"grad_norm": 13.987559233928428,
"learning_rate": 3.237308375663571e-07,
"logits/chosen": -1.5672855377197266,
"logits/rejected": -1.8798201084136963,
"logps/chosen": -465.22882080078125,
"logps/rejected": -480.69036865234375,
"loss": 0.5731,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.6366822719573975,
"rewards/margins": 0.33864718675613403,
"rewards/rejected": -1.9753293991088867,
"step": 220
},
{
"epoch": 0.47493403693931396,
"grad_norm": 15.585874610978292,
"learning_rate": 3.148535012193767e-07,
"logits/chosen": -1.4787318706512451,
"logits/rejected": -1.7937052249908447,
"logps/chosen": -463.3704528808594,
"logps/rejected": -513.5693359375,
"loss": 0.5913,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -1.7906410694122314,
"rewards/margins": 0.637313723564148,
"rewards/rejected": -2.42795467376709,
"step": 225
},
{
"epoch": 0.48548812664907653,
"grad_norm": 10.989676492328872,
"learning_rate": 3.0588758279070183e-07,
"logits/chosen": -1.4634826183319092,
"logits/rejected": -1.688738226890564,
"logps/chosen": -402.5445556640625,
"logps/rejected": -404.0518493652344,
"loss": 0.62,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.4222261905670166,
"rewards/margins": 0.1772518903017044,
"rewards/rejected": -1.599478006362915,
"step": 230
},
{
"epoch": 0.49604221635883905,
"grad_norm": 10.557802697469821,
"learning_rate": 2.968453286464312e-07,
"logits/chosen": -1.386103868484497,
"logits/rejected": -1.759375810623169,
"logps/chosen": -398.8132629394531,
"logps/rejected": -399.6328125,
"loss": 0.5904,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.1334482431411743,
"rewards/margins": 0.23164169490337372,
"rewards/rejected": -1.365089774131775,
"step": 235
},
{
"epoch": 0.5065963060686016,
"grad_norm": 13.209672009218341,
"learning_rate": 2.8773908941806877e-07,
"logits/chosen": -1.5705225467681885,
"logits/rejected": -1.753831148147583,
"logps/chosen": -442.28857421875,
"logps/rejected": -449.0203552246094,
"loss": 0.5998,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -1.6151583194732666,
"rewards/margins": 0.23577281832695007,
"rewards/rejected": -1.85093092918396,
"step": 240
},
{
"epoch": 0.5171503957783641,
"grad_norm": 16.396333599315767,
"learning_rate": 2.785813031330473e-07,
"logits/chosen": -1.6287492513656616,
"logits/rejected": -1.9647096395492554,
"logps/chosen": -466.08599853515625,
"logps/rejected": -482.62847900390625,
"loss": 0.6041,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -1.8849313259124756,
"rewards/margins": 0.3867245614528656,
"rewards/rejected": -2.271656036376953,
"step": 245
},
{
"epoch": 0.5277044854881267,
"grad_norm": 10.479150105315131,
"learning_rate": 2.693844782258779e-07,
"logits/chosen": -1.6182796955108643,
"logits/rejected": -1.851154088973999,
"logps/chosen": -442.0950622558594,
"logps/rejected": -452.76416015625,
"loss": 0.6023,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -1.5875468254089355,
"rewards/margins": 0.27402180433273315,
"rewards/rejected": -1.8615686893463135,
"step": 250
},
{
"epoch": 0.5382585751978892,
"grad_norm": 11.245899562560366,
"learning_rate": 2.601611764531342e-07,
"logits/chosen": -1.5520964860916138,
"logits/rejected": -1.8409061431884766,
"logps/chosen": -385.7509765625,
"logps/rejected": -413.82147216796875,
"loss": 0.602,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.3112901449203491,
"rewards/margins": 0.3254047930240631,
"rewards/rejected": -1.6366949081420898,
"step": 255
},
{
"epoch": 0.5488126649076517,
"grad_norm": 10.216434963455866,
"learning_rate": 2.5092399573560323e-07,
"logits/chosen": -1.552223563194275,
"logits/rejected": -1.9581362009048462,
"logps/chosen": -435.2206115722656,
"logps/rejected": -440.0597229003906,
"loss": 0.6024,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -1.5202934741973877,
"rewards/margins": 0.2939620614051819,
"rewards/rejected": -1.8142554759979248,
"step": 260
},
{
"epoch": 0.5593667546174143,
"grad_norm": 15.557028702183048,
"learning_rate": 2.4168555295104124e-07,
"logits/chosen": -1.5453598499298096,
"logits/rejected": -1.900339126586914,
"logps/chosen": -430.10980224609375,
"logps/rejected": -445.18658447265625,
"loss": 0.5844,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.5744996070861816,
"rewards/margins": 0.3325752317905426,
"rewards/rejected": -1.9070749282836914,
"step": 265
},
{
"epoch": 0.5699208443271768,
"grad_norm": 17.943254997397123,
"learning_rate": 2.3245846670103626e-07,
"logits/chosen": -1.604867935180664,
"logits/rejected": -2.0065605640411377,
"logps/chosen": -474.488037109375,
"logps/rejected": -498.0807189941406,
"loss": 0.5789,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.8745540380477905,
"rewards/margins": 0.41972631216049194,
"rewards/rejected": -2.294280529022217,
"step": 270
},
{
"epoch": 0.5804749340369393,
"grad_norm": 24.025134545110568,
"learning_rate": 2.232553400755159e-07,
"logits/chosen": -1.5600621700286865,
"logits/rejected": -1.9929841756820679,
"logps/chosen": -506.9547424316406,
"logps/rejected": -510.70306396484375,
"loss": 0.6081,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -2.104123592376709,
"rewards/margins": 0.3540397882461548,
"rewards/rejected": -2.4581634998321533,
"step": 275
},
{
"epoch": 0.5910290237467019,
"grad_norm": 12.929099239614445,
"learning_rate": 2.1408874343844294e-07,
"logits/chosen": -1.6627086400985718,
"logits/rejected": -1.9773311614990234,
"logps/chosen": -452.6092224121094,
"logps/rejected": -466.3548889160156,
"loss": 0.5697,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -1.777646780014038,
"rewards/margins": 0.39899054169654846,
"rewards/rejected": -2.1766371726989746,
"step": 280
},
{
"epoch": 0.6015831134564644,
"grad_norm": 14.764167900995057,
"learning_rate": 2.049711972582101e-07,
"logits/chosen": -1.4953606128692627,
"logits/rejected": -1.8248519897460938,
"logps/chosen": -454.2190856933594,
"logps/rejected": -484.0538635253906,
"loss": 0.5691,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -1.6517919301986694,
"rewards/margins": 0.40098685026168823,
"rewards/rejected": -2.052778720855713,
"step": 285
},
{
"epoch": 0.6121372031662269,
"grad_norm": 16.272348359396457,
"learning_rate": 1.9591515500618588e-07,
"logits/chosen": -1.5684363842010498,
"logits/rejected": -1.8171417713165283,
"logps/chosen": -463.537841796875,
"logps/rejected": -480.9203186035156,
"loss": 0.5867,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -1.7810500860214233,
"rewards/margins": 0.29418668150901794,
"rewards/rejected": -2.0752367973327637,
"step": 290
},
{
"epoch": 0.6226912928759895,
"grad_norm": 14.742811810031489,
"learning_rate": 1.8693298614677112e-07,
"logits/chosen": -1.466384768486023,
"logits/rejected": -1.8593746423721313,
"logps/chosen": -479.5718688964844,
"logps/rejected": -491.52154541015625,
"loss": 0.5822,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -1.8734004497528076,
"rewards/margins": 0.33124423027038574,
"rewards/rejected": -2.2046444416046143,
"step": 295
},
{
"epoch": 0.633245382585752,
"grad_norm": 17.118353279558573,
"learning_rate": 1.7803695924219814e-07,
"logits/chosen": -1.6126632690429688,
"logits/rejected": -1.906806230545044,
"logps/chosen": -501.42083740234375,
"logps/rejected": -519.7081909179688,
"loss": 0.5917,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -2.099165439605713,
"rewards/margins": 0.307799756526947,
"rewards/rejected": -2.4069650173187256,
"step": 300
},
{
"epoch": 0.6437994722955145,
"grad_norm": 13.624538503432188,
"learning_rate": 1.6923922519515067e-07,
"logits/chosen": -1.6364351511001587,
"logits/rejected": -1.9255473613739014,
"logps/chosen": -485.3211975097656,
"logps/rejected": -504.00701904296875,
"loss": 0.5809,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.9193140268325806,
"rewards/margins": 0.4129720628261566,
"rewards/rejected": -2.3322861194610596,
"step": 305
},
{
"epoch": 0.6543535620052771,
"grad_norm": 17.071661718014518,
"learning_rate": 1.605518006520924e-07,
"logits/chosen": -1.727064847946167,
"logits/rejected": -2.0727763175964355,
"logps/chosen": -501.14495849609375,
"logps/rejected": -513.572509765625,
"loss": 0.5871,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -2.1765451431274414,
"rewards/margins": 0.34206461906433105,
"rewards/rejected": -2.5186100006103516,
"step": 310
},
{
"epoch": 0.6649076517150396,
"grad_norm": 13.617029224965975,
"learning_rate": 1.519865515899731e-07,
"logits/chosen": -1.722412109375,
"logits/rejected": -2.04305362701416,
"logps/chosen": -467.9588928222656,
"logps/rejected": -480.5577087402344,
"loss": 0.5821,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.8842096328735352,
"rewards/margins": 0.34835028648376465,
"rewards/rejected": -2.2325596809387207,
"step": 315
},
{
"epoch": 0.6754617414248021,
"grad_norm": 13.33856540505469,
"learning_rate": 1.4355517710873182e-07,
"logits/chosen": -1.8616483211517334,
"logits/rejected": -2.127676248550415,
"logps/chosen": -491.52545166015625,
"logps/rejected": -527.18212890625,
"loss": 0.5874,
"rewards/accuracies": 0.71875,
"rewards/chosen": -2.0936801433563232,
"rewards/margins": 0.45663338899612427,
"rewards/rejected": -2.5503134727478027,
"step": 320
},
{
"epoch": 0.6860158311345647,
"grad_norm": 17.145800349025656,
"learning_rate": 1.3526919345173318e-07,
"logits/chosen": -1.7799503803253174,
"logits/rejected": -2.053417921066284,
"logps/chosen": -521.0397338867188,
"logps/rejected": -544.9762573242188,
"loss": 0.5769,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -2.4065451622009277,
"rewards/margins": 0.452395498752594,
"rewards/rejected": -2.858940601348877,
"step": 325
},
{
"epoch": 0.6965699208443272,
"grad_norm": 19.087646634462068,
"learning_rate": 1.2713991827596443e-07,
"logits/chosen": -1.8048852682113647,
"logits/rejected": -2.0732533931732178,
"logps/chosen": -538.1304931640625,
"logps/rejected": -579.5018310546875,
"loss": 0.5753,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -2.58000111579895,
"rewards/margins": 0.5617579221725464,
"rewards/rejected": -3.141758680343628,
"step": 330
},
{
"epoch": 0.7071240105540897,
"grad_norm": 16.296965660815633,
"learning_rate": 1.191784551934773e-07,
"logits/chosen": -1.6937000751495361,
"logits/rejected": -2.0096402168273926,
"logps/chosen": -490.8270568847656,
"logps/rejected": -560.6513671875,
"loss": 0.5806,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -2.2391765117645264,
"rewards/margins": 0.8371523022651672,
"rewards/rejected": -3.076328992843628,
"step": 335
},
{
"epoch": 0.7176781002638523,
"grad_norm": 13.84198150957549,
"learning_rate": 1.1139567860518953e-07,
"logits/chosen": -1.6130354404449463,
"logits/rejected": -1.875739336013794,
"logps/chosen": -477.005615234375,
"logps/rejected": -505.4608459472656,
"loss": 0.5914,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.9602851867675781,
"rewards/margins": 0.4700210988521576,
"rewards/rejected": -2.4303066730499268,
"step": 340
},
{
"epoch": 0.7282321899736148,
"grad_norm": 15.316683752394184,
"learning_rate": 1.0380221884776128e-07,
"logits/chosen": -1.671500563621521,
"logits/rejected": -1.958186149597168,
"logps/chosen": -483.4461975097656,
"logps/rejected": -497.53643798828125,
"loss": 0.5842,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.9341436624526978,
"rewards/margins": 0.3594801723957062,
"rewards/rejected": -2.293623924255371,
"step": 345
},
{
"epoch": 0.7387862796833773,
"grad_norm": 11.225540406360041,
"learning_rate": 9.640844767383405e-08,
"logits/chosen": -1.7304404973983765,
"logits/rejected": -2.0152411460876465,
"logps/chosen": -474.5326232910156,
"logps/rejected": -519.5494384765625,
"loss": 0.5663,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -1.8573243618011475,
"rewards/margins": 0.5369530916213989,
"rewards/rejected": -2.394277334213257,
"step": 350
},
{
"epoch": 0.7493403693931399,
"grad_norm": 69.37431303110792,
"learning_rate": 8.922446408546378e-08,
"logits/chosen": -1.636301040649414,
"logits/rejected": -1.9108378887176514,
"logps/chosen": -474.32769775390625,
"logps/rejected": -491.1766052246094,
"loss": 0.5914,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.870996117591858,
"rewards/margins": 0.4108423590660095,
"rewards/rejected": -2.2818384170532227,
"step": 355
},
{
"epoch": 0.7598944591029023,
"grad_norm": 20.752730975509387,
"learning_rate": 8.22600805400994e-08,
"logits/chosen": -1.597144603729248,
"logits/rejected": -1.939162015914917,
"logps/chosen": -516.8674926757812,
"logps/rejected": -526.4575805664062,
"loss": 0.5934,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -2.107037305831909,
"rewards/margins": 0.36362889409065247,
"rewards/rejected": -2.4706661701202393,
"step": 360
},
{
"epoch": 0.7704485488126649,
"grad_norm": 17.42422968220554,
"learning_rate": 7.552480954794558e-08,
"logits/chosen": -1.664350152015686,
"logits/rejected": -1.8763881921768188,
"logps/chosen": -474.96917724609375,
"logps/rejected": -517.1463623046875,
"loss": 0.5755,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.939814567565918,
"rewards/margins": 0.3745439350605011,
"rewards/rejected": -2.3143584728240967,
"step": 365
},
{
"epoch": 0.7810026385224275,
"grad_norm": 14.771602880443869,
"learning_rate": 6.902785067901854e-08,
"logits/chosen": -1.6192362308502197,
"logits/rejected": -1.9148075580596924,
"logps/chosen": -488.96221923828125,
"logps/rejected": -493.0494689941406,
"loss": 0.5705,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -1.987235426902771,
"rewards/margins": 0.29930660128593445,
"rewards/rejected": -2.2865424156188965,
"step": 370
},
{
"epoch": 0.7915567282321899,
"grad_norm": 17.979535692288096,
"learning_rate": 6.277807799763973e-08,
"logits/chosen": -1.739436149597168,
"logits/rejected": -1.9250596761703491,
"logps/chosen": -524.38720703125,
"logps/rejected": -558.7305908203125,
"loss": 0.5799,
"rewards/accuracies": 0.6875,
"rewards/chosen": -2.345944881439209,
"rewards/margins": 0.3936893045902252,
"rewards/rejected": -2.7396342754364014,
"step": 375
},
{
"epoch": 0.8021108179419525,
"grad_norm": 16.020544985708035,
"learning_rate": 5.678402794153145e-08,
"logits/chosen": -1.6335742473602295,
"logits/rejected": -1.9916164875030518,
"logps/chosen": -496.64111328125,
"logps/rejected": -516.6607666015625,
"loss": 0.5759,
"rewards/accuracies": 0.6875,
"rewards/chosen": -2.1185414791107178,
"rewards/margins": 0.3739583492279053,
"rewards/rejected": -2.492499828338623,
"step": 380
},
{
"epoch": 0.8126649076517151,
"grad_norm": 15.483975057559833,
"learning_rate": 5.105388766206969e-08,
"logits/chosen": -1.7242807149887085,
"logits/rejected": -1.9720449447631836,
"logps/chosen": -476.0779724121094,
"logps/rejected": -498.2892150878906,
"loss": 0.5878,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.9155553579330444,
"rewards/margins": 0.33329516649246216,
"rewards/rejected": -2.2488505840301514,
"step": 385
},
{
"epoch": 0.8232189973614775,
"grad_norm": 12.980915706351402,
"learning_rate": 4.5595483841620484e-08,
"logits/chosen": -1.685105562210083,
"logits/rejected": -1.9450676441192627,
"logps/chosen": -459.869384765625,
"logps/rejected": -495.52069091796875,
"loss": 0.5753,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.7653785943984985,
"rewards/margins": 0.45078420639038086,
"rewards/rejected": -2.216163158416748,
"step": 390
},
{
"epoch": 0.8337730870712401,
"grad_norm": 12.943578700815056,
"learning_rate": 4.0416272003232526e-08,
"logits/chosen": -1.5918303728103638,
"logits/rejected": -1.9432264566421509,
"logps/chosen": -461.55078125,
"logps/rejected": -483.1607971191406,
"loss": 0.5828,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -1.7096798419952393,
"rewards/margins": 0.43595314025878906,
"rewards/rejected": -2.1456329822540283,
"step": 395
},
{
"epoch": 0.8443271767810027,
"grad_norm": 13.529250322769109,
"learning_rate": 3.552332632729041e-08,
"logits/chosen": -1.676417350769043,
"logits/rejected": -1.8683099746704102,
"logps/chosen": -448.98809814453125,
"logps/rejected": -474.80450439453125,
"loss": 0.5696,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -1.735640525817871,
"rewards/margins": 0.3609997630119324,
"rewards/rejected": -2.096640110015869,
"step": 400
},
{
"epoch": 0.8443271767810027,
"eval_logits/chosen": -1.8635751008987427,
"eval_logits/rejected": -1.727868914604187,
"eval_logps/chosen": -464.8841857910156,
"eval_logps/rejected": -503.46514892578125,
"eval_loss": 0.6257370710372925,
"eval_rewards/accuracies": 0.6639676094055176,
"eval_rewards/chosen": -1.8789465427398682,
"eval_rewards/margins": 0.299042671918869,
"eval_rewards/rejected": -2.1779892444610596,
"eval_runtime": 316.7001,
"eval_samples_per_second": 6.239,
"eval_steps_per_second": 1.56,
"step": 400
},
{
"epoch": 0.8548812664907651,
"grad_norm": 16.739492605341695,
"learning_rate": 3.092332998903416e-08,
"logits/chosen": -1.7163026332855225,
"logits/rejected": -2.0801901817321777,
"logps/chosen": -481.8212890625,
"logps/rejected": -521.2871704101562,
"loss": 0.5594,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.8860801458358765,
"rewards/margins": 0.5326521992683411,
"rewards/rejected": -2.418732166290283,
"step": 405
},
{
"epoch": 0.8654353562005277,
"grad_norm": 18.511909575910575,
"learning_rate": 2.6622566030146455e-08,
"logits/chosen": -1.7279727458953857,
"logits/rejected": -1.9562079906463623,
"logps/chosen": -501.9583435058594,
"logps/rejected": -521.0777587890625,
"loss": 0.5736,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -2.089940309524536,
"rewards/margins": 0.37453165650367737,
"rewards/rejected": -2.4644720554351807,
"step": 410
},
{
"epoch": 0.8759894459102903,
"grad_norm": 13.262757276399812,
"learning_rate": 2.26269087768734e-08,
"logits/chosen": -1.7813360691070557,
"logits/rejected": -1.99080491065979,
"logps/chosen": -470.19732666015625,
"logps/rejected": -517.9837646484375,
"loss": 0.5669,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.0514144897460938,
"rewards/margins": 0.6286773681640625,
"rewards/rejected": -2.680091619491577,
"step": 415
},
{
"epoch": 0.8865435356200527,
"grad_norm": 16.729852500651287,
"learning_rate": 1.894181581640106e-08,
"logits/chosen": -1.7729663848876953,
"logits/rejected": -2.0622265338897705,
"logps/chosen": -503.3247985839844,
"logps/rejected": -532.9273681640625,
"loss": 0.5733,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -2.2469749450683594,
"rewards/margins": 0.4464968144893646,
"rewards/rejected": -2.6934714317321777,
"step": 420
},
{
"epoch": 0.8970976253298153,
"grad_norm": 15.498959956089978,
"learning_rate": 1.5572320542448143e-08,
"logits/chosen": -1.8235836029052734,
"logits/rejected": -2.0790963172912598,
"logps/chosen": -518.3297119140625,
"logps/rejected": -555.9387817382812,
"loss": 0.5909,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.182375431060791,
"rewards/margins": 0.5672179460525513,
"rewards/rejected": -2.7495932579040527,
"step": 425
},
{
"epoch": 0.9076517150395779,
"grad_norm": 13.029691392427118,
"learning_rate": 1.2523025280255729e-08,
"logits/chosen": -1.7515465021133423,
"logits/rejected": -2.0758919715881348,
"logps/chosen": -505.37646484375,
"logps/rejected": -527.7960815429688,
"loss": 0.5682,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -2.237623691558838,
"rewards/margins": 0.4710654616355896,
"rewards/rejected": -2.7086894512176514,
"step": 430
},
{
"epoch": 0.9182058047493403,
"grad_norm": 16.269526596286124,
"learning_rate": 9.798095000364214e-09,
"logits/chosen": -1.7598968744277954,
"logits/rejected": -1.9988504648208618,
"logps/chosen": -508.0267028808594,
"logps/rejected": -554.0763549804688,
"loss": 0.5581,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -2.217205762863159,
"rewards/margins": 0.5872582197189331,
"rewards/rejected": -2.8044638633728027,
"step": 435
},
{
"epoch": 0.9287598944591029,
"grad_norm": 13.648970556247901,
"learning_rate": 7.401251629764876e-09,
"logits/chosen": -1.830775499343872,
"logits/rejected": -2.0407309532165527,
"logps/chosen": -511.0887145996094,
"logps/rejected": -543.5230712890625,
"loss": 0.5799,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -2.284379482269287,
"rewards/margins": 0.47375327348709106,
"rewards/rejected": -2.7581324577331543,
"step": 440
},
{
"epoch": 0.9393139841688655,
"grad_norm": 17.489158193863855,
"learning_rate": 5.335768968195098e-09,
"logits/chosen": -1.7661769390106201,
"logits/rejected": -2.1901516914367676,
"logps/chosen": -519.0462646484375,
"logps/rejected": -544.9937133789062,
"loss": 0.5703,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -2.308170795440674,
"rewards/margins": 0.4751991331577301,
"rewards/rejected": -2.783369779586792,
"step": 445
},
{
"epoch": 0.9498680738786279,
"grad_norm": 18.472750585474607,
"learning_rate": 3.604468216521883e-09,
"logits/chosen": -1.8184922933578491,
"logits/rejected": -2.069641590118408,
"logps/chosen": -510.5535583496094,
"logps/rejected": -536.5929565429688,
"loss": 0.5651,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -2.3444266319274902,
"rewards/margins": 0.45197463035583496,
"rewards/rejected": -2.7964015007019043,
"step": 450
},
{
"epoch": 0.9604221635883905,
"grad_norm": 19.193548961658735,
"learning_rate": 2.2097141233206884e-09,
"logits/chosen": -1.7842222452163696,
"logits/rejected": -2.0406641960144043,
"logps/chosen": -513.885986328125,
"logps/rejected": -545.530029296875,
"loss": 0.5708,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -2.247333526611328,
"rewards/margins": 0.44551533460617065,
"rewards/rejected": -2.6928489208221436,
"step": 455
},
{
"epoch": 0.9709762532981531,
"grad_norm": 15.684871774317772,
"learning_rate": 1.1534117549133472e-09,
"logits/chosen": -1.8590974807739258,
"logits/rejected": -2.08577036857605,
"logps/chosen": -512.5687866210938,
"logps/rejected": -551.6975708007812,
"loss": 0.5662,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -2.2769620418548584,
"rewards/margins": 0.5433439016342163,
"rewards/rejected": -2.8203060626983643,
"step": 460
},
{
"epoch": 0.9815303430079155,
"grad_norm": 16.324336075352026,
"learning_rate": 4.3700389327672173e-10,
"logits/chosen": -1.74801504611969,
"logits/rejected": -2.0831220149993896,
"logps/chosen": -508.1880798339844,
"logps/rejected": -548.400390625,
"loss": 0.578,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -2.2477779388427734,
"rewards/margins": 0.5980393886566162,
"rewards/rejected": -2.8458173274993896,
"step": 465
},
{
"epoch": 0.9920844327176781,
"grad_norm": 18.434311800327553,
"learning_rate": 6.146906537587982e-11,
"logits/chosen": -1.7675012350082397,
"logits/rejected": -2.0456321239471436,
"logps/chosen": -524.4590454101562,
"logps/rejected": -550.3624877929688,
"loss": 0.5793,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -2.3114473819732666,
"rewards/margins": 0.4332752823829651,
"rewards/rejected": -2.744722366333008,
"step": 470
},
{
"epoch": 0.9984168865435357,
"step": 473,
"total_flos": 0.0,
"train_loss": 0.6103140643736776,
"train_runtime": 23898.8744,
"train_samples_per_second": 2.537,
"train_steps_per_second": 0.02
}
],
"logging_steps": 5,
"max_steps": 473,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 1000000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}