mistral-sft-dpo-v / trainer_state.json
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 200,
"global_step": 1563,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0006397952655150352,
"grad_norm": 60.118304941939414,
"learning_rate": 3.1847133757961784e-09,
"logits/chosen": -2.853665351867676,
"logits/rejected": -2.8379149436950684,
"logps/chosen": -83.49566650390625,
"logps/rejected": -123.54679870605469,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.006397952655150352,
"grad_norm": 57.81423019053645,
"learning_rate": 3.184713375796178e-08,
"logits/chosen": -2.902895927429199,
"logits/rejected": -2.875051259994507,
"logps/chosen": -115.33470153808594,
"logps/rejected": -92.90689086914062,
"loss": 0.6927,
"rewards/accuracies": 0.375,
"rewards/chosen": 0.0009852921357378364,
"rewards/margins": -0.0015405109152197838,
"rewards/rejected": 0.0025258036330342293,
"step": 10
},
{
"epoch": 0.012795905310300703,
"grad_norm": 61.55427334185566,
"learning_rate": 6.369426751592356e-08,
"logits/chosen": -2.903585433959961,
"logits/rejected": -2.8909051418304443,
"logps/chosen": -133.5170440673828,
"logps/rejected": -110.08155822753906,
"loss": 0.6924,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": 0.00143242790363729,
"rewards/margins": -0.0002988163323607296,
"rewards/rejected": 0.0017312444979324937,
"step": 20
},
{
"epoch": 0.019193857965451054,
"grad_norm": 55.02555810267581,
"learning_rate": 9.554140127388536e-08,
"logits/chosen": -2.892906904220581,
"logits/rejected": -2.8784232139587402,
"logps/chosen": -114.21958923339844,
"logps/rejected": -97.69152069091797,
"loss": 0.6913,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": 0.012303625233471394,
"rewards/margins": 0.007176184095442295,
"rewards/rejected": 0.0051274425350129604,
"step": 30
},
{
"epoch": 0.025591810620601407,
"grad_norm": 57.294821146884644,
"learning_rate": 1.2738853503184713e-07,
"logits/chosen": -2.911865711212158,
"logits/rejected": -2.896728992462158,
"logps/chosen": -124.02873229980469,
"logps/rejected": -107.5560073852539,
"loss": 0.6838,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": 0.03221520036458969,
"rewards/margins": 0.018920911476016045,
"rewards/rejected": 0.013294287025928497,
"step": 40
},
{
"epoch": 0.03198976327575176,
"grad_norm": 66.3217272293019,
"learning_rate": 1.592356687898089e-07,
"logits/chosen": -2.908454656600952,
"logits/rejected": -2.884701728820801,
"logps/chosen": -121.34477233886719,
"logps/rejected": -102.1689453125,
"loss": 0.683,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 0.05464054271578789,
"rewards/margins": 0.035086970776319504,
"rewards/rejected": 0.019553570076823235,
"step": 50
},
{
"epoch": 0.03838771593090211,
"grad_norm": 58.62876538394977,
"learning_rate": 1.9108280254777072e-07,
"logits/chosen": -2.9217209815979004,
"logits/rejected": -2.8922061920166016,
"logps/chosen": -124.75311279296875,
"logps/rejected": -81.85745239257812,
"loss": 0.6796,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.08774559199810028,
"rewards/margins": 0.069425567984581,
"rewards/rejected": 0.01832001842558384,
"step": 60
},
{
"epoch": 0.044785668586052464,
"grad_norm": 56.19565905301092,
"learning_rate": 2.2292993630573247e-07,
"logits/chosen": -2.9313912391662598,
"logits/rejected": -2.890962839126587,
"logps/chosen": -114.58404541015625,
"logps/rejected": -89.98262023925781,
"loss": 0.6635,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 0.08911927044391632,
"rewards/margins": 0.06764236837625504,
"rewards/rejected": 0.02147689089179039,
"step": 70
},
{
"epoch": 0.05118362124120281,
"grad_norm": 57.42643526688929,
"learning_rate": 2.5477707006369425e-07,
"logits/chosen": -2.9421534538269043,
"logits/rejected": -2.9394068717956543,
"logps/chosen": -108.8486328125,
"logps/rejected": -106.7107162475586,
"loss": 0.6684,
"rewards/accuracies": 0.5625,
"rewards/chosen": 0.09960101544857025,
"rewards/margins": 0.029676537960767746,
"rewards/rejected": 0.06992447376251221,
"step": 80
},
{
"epoch": 0.05758157389635317,
"grad_norm": 65.93546292866482,
"learning_rate": 2.86624203821656e-07,
"logits/chosen": -2.9601008892059326,
"logits/rejected": -2.928912401199341,
"logps/chosen": -135.4389190673828,
"logps/rejected": -104.19834899902344,
"loss": 0.6681,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": 0.17293739318847656,
"rewards/margins": 0.07813060283660889,
"rewards/rejected": 0.09480679780244827,
"step": 90
},
{
"epoch": 0.06397952655150352,
"grad_norm": 57.01131226596072,
"learning_rate": 3.184713375796178e-07,
"logits/chosen": -2.96305513381958,
"logits/rejected": -2.9574124813079834,
"logps/chosen": -130.2837677001953,
"logps/rejected": -111.2293701171875,
"loss": 0.6608,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 0.18434445559978485,
"rewards/margins": 0.12251557409763336,
"rewards/rejected": 0.06182890012860298,
"step": 100
},
{
"epoch": 0.07037747920665387,
"grad_norm": 69.70550779478351,
"learning_rate": 3.5031847133757957e-07,
"logits/chosen": -2.9655303955078125,
"logits/rejected": -2.951481342315674,
"logps/chosen": -111.79109954833984,
"logps/rejected": -109.75225830078125,
"loss": 0.6721,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 0.1715824156999588,
"rewards/margins": 0.13271735608577728,
"rewards/rejected": 0.03886505961418152,
"step": 110
},
{
"epoch": 0.07677543186180422,
"grad_norm": 52.38866527542888,
"learning_rate": 3.8216560509554143e-07,
"logits/chosen": -2.903899908065796,
"logits/rejected": -2.881450653076172,
"logps/chosen": -97.67478942871094,
"logps/rejected": -84.29422760009766,
"loss": 0.6516,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": 0.12506189942359924,
"rewards/margins": 0.13894985616207123,
"rewards/rejected": -0.013887954875826836,
"step": 120
},
{
"epoch": 0.08317338451695458,
"grad_norm": 64.70820481988301,
"learning_rate": 4.140127388535032e-07,
"logits/chosen": -2.895112991333008,
"logits/rejected": -2.88991641998291,
"logps/chosen": -96.93889617919922,
"logps/rejected": -90.70915222167969,
"loss": 0.6585,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": 0.038154274225234985,
"rewards/margins": 0.07399366050958633,
"rewards/rejected": -0.03583937883377075,
"step": 130
},
{
"epoch": 0.08957133717210493,
"grad_norm": 60.15429861597433,
"learning_rate": 4.4585987261146494e-07,
"logits/chosen": -2.8921263217926025,
"logits/rejected": -2.878185749053955,
"logps/chosen": -98.92501068115234,
"logps/rejected": -91.44964599609375,
"loss": 0.6562,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.04788198322057724,
"rewards/margins": 0.14094644784927368,
"rewards/rejected": -0.18882843852043152,
"step": 140
},
{
"epoch": 0.09596928982725528,
"grad_norm": 61.72005334408183,
"learning_rate": 4.777070063694267e-07,
"logits/chosen": -2.8982043266296387,
"logits/rejected": -2.8801465034484863,
"logps/chosen": -109.93717193603516,
"logps/rejected": -105.29100036621094,
"loss": 0.6436,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.11187714338302612,
"rewards/margins": 0.06109604984521866,
"rewards/rejected": -0.17297318577766418,
"step": 150
},
{
"epoch": 0.10236724248240563,
"grad_norm": 50.464722527852004,
"learning_rate": 4.989331436699858e-07,
"logits/chosen": -2.876605749130249,
"logits/rejected": -2.869777202606201,
"logps/chosen": -114.76756286621094,
"logps/rejected": -95.31561279296875,
"loss": 0.6387,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.06972712278366089,
"rewards/margins": 0.17382851243019104,
"rewards/rejected": -0.24355562031269073,
"step": 160
},
{
"epoch": 0.10876519513755598,
"grad_norm": 52.998328288429626,
"learning_rate": 4.953769559032717e-07,
"logits/chosen": -2.877915859222412,
"logits/rejected": -2.8578855991363525,
"logps/chosen": -127.77665710449219,
"logps/rejected": -94.31978607177734,
"loss": 0.6357,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.042665015906095505,
"rewards/margins": 0.3181106448173523,
"rewards/rejected": -0.3607756495475769,
"step": 170
},
{
"epoch": 0.11516314779270634,
"grad_norm": 53.51042626016048,
"learning_rate": 4.918207681365576e-07,
"logits/chosen": -2.865325450897217,
"logits/rejected": -2.8459763526916504,
"logps/chosen": -107.5126724243164,
"logps/rejected": -95.08032989501953,
"loss": 0.6438,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.12875112891197205,
"rewards/margins": 0.29623284935951233,
"rewards/rejected": -0.4249839782714844,
"step": 180
},
{
"epoch": 0.12156110044785669,
"grad_norm": 61.69592094178514,
"learning_rate": 4.882645803698435e-07,
"logits/chosen": -2.884171724319458,
"logits/rejected": -2.863605499267578,
"logps/chosen": -120.2016372680664,
"logps/rejected": -112.73602294921875,
"loss": 0.661,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.348448246717453,
"rewards/margins": 0.15380175411701202,
"rewards/rejected": -0.502250075340271,
"step": 190
},
{
"epoch": 0.12795905310300704,
"grad_norm": 47.21307657695218,
"learning_rate": 4.847083926031294e-07,
"logits/chosen": -2.8833072185516357,
"logits/rejected": -2.864515542984009,
"logps/chosen": -120.51539611816406,
"logps/rejected": -123.11246490478516,
"loss": 0.6556,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.2995172441005707,
"rewards/margins": 0.23192158341407776,
"rewards/rejected": -0.5314388275146484,
"step": 200
},
{
"epoch": 0.12795905310300704,
"eval_logits/chosen": -2.822009325027466,
"eval_logits/rejected": -2.808537006378174,
"eval_logps/chosen": -114.53223419189453,
"eval_logps/rejected": -98.75383758544922,
"eval_loss": 0.6150196194648743,
"eval_rewards/accuracies": 0.6831210255622864,
"eval_rewards/chosen": -0.34572336077690125,
"eval_rewards/margins": 0.25221893191337585,
"eval_rewards/rejected": -0.5979422926902771,
"eval_runtime": 755.1997,
"eval_samples_per_second": 6.621,
"eval_steps_per_second": 0.208,
"step": 200
},
{
"epoch": 0.1343570057581574,
"grad_norm": 56.52759183086139,
"learning_rate": 4.811522048364154e-07,
"logits/chosen": -2.849454402923584,
"logits/rejected": -2.8368048667907715,
"logps/chosen": -103.74124908447266,
"logps/rejected": -101.4257583618164,
"loss": 0.6547,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.31585240364074707,
"rewards/margins": 0.26868245005607605,
"rewards/rejected": -0.5845348238945007,
"step": 210
},
{
"epoch": 0.14075495841330773,
"grad_norm": 47.14356880425196,
"learning_rate": 4.775960170697012e-07,
"logits/chosen": -2.882763147354126,
"logits/rejected": -2.871001958847046,
"logps/chosen": -118.808349609375,
"logps/rejected": -114.5535888671875,
"loss": 0.6077,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.2729986011981964,
"rewards/margins": 0.38306480646133423,
"rewards/rejected": -0.656063437461853,
"step": 220
},
{
"epoch": 0.1471529110684581,
"grad_norm": 63.22883397819913,
"learning_rate": 4.7403982930298717e-07,
"logits/chosen": -2.855159282684326,
"logits/rejected": -2.837705135345459,
"logps/chosen": -101.80978393554688,
"logps/rejected": -90.93101501464844,
"loss": 0.6211,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.5196329951286316,
"rewards/margins": 0.19388818740844727,
"rewards/rejected": -0.7135211229324341,
"step": 230
},
{
"epoch": 0.15355086372360843,
"grad_norm": 66.64354781660717,
"learning_rate": 4.7048364153627306e-07,
"logits/chosen": -2.851243734359741,
"logits/rejected": -2.8454842567443848,
"logps/chosen": -121.05877685546875,
"logps/rejected": -106.16410827636719,
"loss": 0.6517,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.46725529432296753,
"rewards/margins": 0.14034488797187805,
"rewards/rejected": -0.607600212097168,
"step": 240
},
{
"epoch": 0.1599488163787588,
"grad_norm": 57.16801182208312,
"learning_rate": 4.66927453769559e-07,
"logits/chosen": -2.8734335899353027,
"logits/rejected": -2.847668409347534,
"logps/chosen": -123.86625671386719,
"logps/rejected": -109.57325744628906,
"loss": 0.6281,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.3798033595085144,
"rewards/margins": 0.3836090564727783,
"rewards/rejected": -0.7634124755859375,
"step": 250
},
{
"epoch": 0.16634676903390916,
"grad_norm": 54.517039547772974,
"learning_rate": 4.633712660028449e-07,
"logits/chosen": -2.876328706741333,
"logits/rejected": -2.8625760078430176,
"logps/chosen": -130.20523071289062,
"logps/rejected": -112.2127685546875,
"loss": 0.67,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.47367653250694275,
"rewards/margins": 0.38414520025253296,
"rewards/rejected": -0.8578217625617981,
"step": 260
},
{
"epoch": 0.1727447216890595,
"grad_norm": 54.60395925488424,
"learning_rate": 4.5981507823613085e-07,
"logits/chosen": -2.844329357147217,
"logits/rejected": -2.8402137756347656,
"logps/chosen": -114.29241943359375,
"logps/rejected": -105.92488861083984,
"loss": 0.6194,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.6592072248458862,
"rewards/margins": 0.2703477740287781,
"rewards/rejected": -0.92955482006073,
"step": 270
},
{
"epoch": 0.17914267434420986,
"grad_norm": 56.55213803108218,
"learning_rate": 4.562588904694168e-07,
"logits/chosen": -2.8192648887634277,
"logits/rejected": -2.8061060905456543,
"logps/chosen": -111.86820220947266,
"logps/rejected": -115.575439453125,
"loss": 0.6544,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.5377305746078491,
"rewards/margins": 0.6384528875350952,
"rewards/rejected": -1.1761833429336548,
"step": 280
},
{
"epoch": 0.1855406269993602,
"grad_norm": 69.17771143675574,
"learning_rate": 4.5270270270270264e-07,
"logits/chosen": -2.843867778778076,
"logits/rejected": -2.829209804534912,
"logps/chosen": -110.8249740600586,
"logps/rejected": -103.67692565917969,
"loss": 0.6367,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.5854028463363647,
"rewards/margins": 0.6495189666748047,
"rewards/rejected": -1.2349218130111694,
"step": 290
},
{
"epoch": 0.19193857965451055,
"grad_norm": 47.35052032879965,
"learning_rate": 4.491465149359886e-07,
"logits/chosen": -2.8120689392089844,
"logits/rejected": -2.7911698818206787,
"logps/chosen": -119.08061218261719,
"logps/rejected": -112.31675720214844,
"loss": 0.57,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.45342451333999634,
"rewards/margins": 0.7334454655647278,
"rewards/rejected": -1.1868698596954346,
"step": 300
},
{
"epoch": 0.19833653230966092,
"grad_norm": 49.203663039796716,
"learning_rate": 4.4559032716927454e-07,
"logits/chosen": -2.821526527404785,
"logits/rejected": -2.8225045204162598,
"logps/chosen": -119.77931213378906,
"logps/rejected": -109.47914123535156,
"loss": 0.6554,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.7418456077575684,
"rewards/margins": 0.45222848653793335,
"rewards/rejected": -1.194074273109436,
"step": 310
},
{
"epoch": 0.20473448496481125,
"grad_norm": 49.58718436216588,
"learning_rate": 4.420341394025605e-07,
"logits/chosen": -2.785693883895874,
"logits/rejected": -2.7701306343078613,
"logps/chosen": -107.3865966796875,
"logps/rejected": -106.08250427246094,
"loss": 0.6065,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.7089810371398926,
"rewards/margins": 0.6241555213928223,
"rewards/rejected": -1.3331366777420044,
"step": 320
},
{
"epoch": 0.21113243761996162,
"grad_norm": 60.39033430088021,
"learning_rate": 4.384779516358463e-07,
"logits/chosen": -2.828508138656616,
"logits/rejected": -2.8167166709899902,
"logps/chosen": -122.96031188964844,
"logps/rejected": -115.10658264160156,
"loss": 0.662,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.6257882118225098,
"rewards/margins": 0.5289193391799927,
"rewards/rejected": -1.154707670211792,
"step": 330
},
{
"epoch": 0.21753039027511195,
"grad_norm": 63.14689712905548,
"learning_rate": 4.3492176386913227e-07,
"logits/chosen": -2.8433403968811035,
"logits/rejected": -2.8479769229888916,
"logps/chosen": -121.58296203613281,
"logps/rejected": -115.6580810546875,
"loss": 0.6481,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.6279059648513794,
"rewards/margins": 0.3463929295539856,
"rewards/rejected": -0.9742989540100098,
"step": 340
},
{
"epoch": 0.22392834293026231,
"grad_norm": 53.84331868502145,
"learning_rate": 4.313655761024182e-07,
"logits/chosen": -2.8341145515441895,
"logits/rejected": -2.8294851779937744,
"logps/chosen": -114.5806884765625,
"logps/rejected": -103.96630859375,
"loss": 0.6084,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.6928296089172363,
"rewards/margins": 0.44785672426223755,
"rewards/rejected": -1.140686273574829,
"step": 350
},
{
"epoch": 0.23032629558541268,
"grad_norm": 48.0087088492426,
"learning_rate": 4.278093883357041e-07,
"logits/chosen": -2.853909969329834,
"logits/rejected": -2.8232522010803223,
"logps/chosen": -110.68212890625,
"logps/rejected": -98.71046447753906,
"loss": 0.607,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.7764779925346375,
"rewards/margins": 0.4268825054168701,
"rewards/rejected": -1.2033603191375732,
"step": 360
},
{
"epoch": 0.236724248240563,
"grad_norm": 49.094230475270784,
"learning_rate": 4.2425320056899e-07,
"logits/chosen": -2.831592559814453,
"logits/rejected": -2.821815013885498,
"logps/chosen": -106.56488037109375,
"logps/rejected": -108.5312728881836,
"loss": 0.6504,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.4470910429954529,
"rewards/margins": 0.3594434857368469,
"rewards/rejected": -0.8065345883369446,
"step": 370
},
{
"epoch": 0.24312220089571338,
"grad_norm": 66.63628934256334,
"learning_rate": 4.2069701280227595e-07,
"logits/chosen": -2.824704170227051,
"logits/rejected": -2.8049395084381104,
"logps/chosen": -120.53758239746094,
"logps/rejected": -106.18167877197266,
"loss": 0.6455,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.8338532447814941,
"rewards/margins": 0.6111718416213989,
"rewards/rejected": -1.4450252056121826,
"step": 380
},
{
"epoch": 0.2495201535508637,
"grad_norm": 69.12388513034483,
"learning_rate": 4.1714082503556185e-07,
"logits/chosen": -2.8500986099243164,
"logits/rejected": -2.826770544052124,
"logps/chosen": -131.38690185546875,
"logps/rejected": -106.4390869140625,
"loss": 0.6847,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.8073829412460327,
"rewards/margins": 0.36057430505752563,
"rewards/rejected": -1.167957067489624,
"step": 390
},
{
"epoch": 0.2559181062060141,
"grad_norm": 57.108856581509585,
"learning_rate": 4.135846372688478e-07,
"logits/chosen": -2.832038164138794,
"logits/rejected": -2.8184292316436768,
"logps/chosen": -119.92472839355469,
"logps/rejected": -125.78714752197266,
"loss": 0.6305,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.4277513921260834,
"rewards/margins": 0.7275630235671997,
"rewards/rejected": -1.1553144454956055,
"step": 400
},
{
"epoch": 0.2559181062060141,
"eval_logits/chosen": -2.8320508003234863,
"eval_logits/rejected": -2.8121678829193115,
"eval_logps/chosen": -117.8738021850586,
"eval_logps/rejected": -104.50364685058594,
"eval_loss": 0.5884435772895813,
"eval_rewards/accuracies": 0.6958598494529724,
"eval_rewards/chosen": -0.6798812747001648,
"eval_rewards/margins": 0.4930422306060791,
"eval_rewards/rejected": -1.1729233264923096,
"eval_runtime": 739.6751,
"eval_samples_per_second": 6.76,
"eval_steps_per_second": 0.212,
"step": 400
},
{
"epoch": 0.26231605886116444,
"grad_norm": 59.148448424766435,
"learning_rate": 4.100284495021337e-07,
"logits/chosen": -2.8497817516326904,
"logits/rejected": -2.8282692432403564,
"logps/chosen": -121.6697998046875,
"logps/rejected": -120.14678955078125,
"loss": 0.6345,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.6451729536056519,
"rewards/margins": 0.7675411701202393,
"rewards/rejected": -1.4127142429351807,
"step": 410
},
{
"epoch": 0.2687140115163148,
"grad_norm": 41.06134554396416,
"learning_rate": 4.064722617354196e-07,
"logits/chosen": -2.8894846439361572,
"logits/rejected": -2.861454486846924,
"logps/chosen": -125.02449035644531,
"logps/rejected": -121.65775299072266,
"loss": 0.6217,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.6182137727737427,
"rewards/margins": 0.7963579893112183,
"rewards/rejected": -1.41457200050354,
"step": 420
},
{
"epoch": 0.2751119641714651,
"grad_norm": 52.35881081541796,
"learning_rate": 4.0291607396870553e-07,
"logits/chosen": -2.8551623821258545,
"logits/rejected": -2.815028667449951,
"logps/chosen": -109.940673828125,
"logps/rejected": -97.78315734863281,
"loss": 0.6036,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.6586909890174866,
"rewards/margins": 0.7408519387245178,
"rewards/rejected": -1.3995428085327148,
"step": 430
},
{
"epoch": 0.28150991682661547,
"grad_norm": 67.91564328860713,
"learning_rate": 3.993598862019915e-07,
"logits/chosen": -2.813927412033081,
"logits/rejected": -2.8176522254943848,
"logps/chosen": -99.02717590332031,
"logps/rejected": -111.13435363769531,
"loss": 0.6412,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.6925631761550903,
"rewards/margins": 0.6947155594825745,
"rewards/rejected": -1.38727867603302,
"step": 440
},
{
"epoch": 0.28790786948176583,
"grad_norm": 46.303651808982835,
"learning_rate": 3.9580369843527737e-07,
"logits/chosen": -2.8682820796966553,
"logits/rejected": -2.860414981842041,
"logps/chosen": -119.4373779296875,
"logps/rejected": -110.44520568847656,
"loss": 0.6108,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.5994283556938171,
"rewards/margins": 0.7199314832687378,
"rewards/rejected": -1.3193597793579102,
"step": 450
},
{
"epoch": 0.2943058221369162,
"grad_norm": 56.48779597669934,
"learning_rate": 3.9224751066856327e-07,
"logits/chosen": -2.839963912963867,
"logits/rejected": -2.8135039806365967,
"logps/chosen": -132.95506286621094,
"logps/rejected": -108.5090103149414,
"loss": 0.696,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.6798344254493713,
"rewards/margins": 0.549295961856842,
"rewards/rejected": -1.2291303873062134,
"step": 460
},
{
"epoch": 0.30070377479206656,
"grad_norm": 45.638457628682765,
"learning_rate": 3.886913229018492e-07,
"logits/chosen": -2.8314132690429688,
"logits/rejected": -2.822693109512329,
"logps/chosen": -117.86392974853516,
"logps/rejected": -123.7062759399414,
"loss": 0.6809,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.8428860902786255,
"rewards/margins": 0.7259670495986938,
"rewards/rejected": -1.5688531398773193,
"step": 470
},
{
"epoch": 0.30710172744721687,
"grad_norm": 48.11242331520591,
"learning_rate": 3.851351351351351e-07,
"logits/chosen": -2.8257322311401367,
"logits/rejected": -2.8105177879333496,
"logps/chosen": -112.5777359008789,
"logps/rejected": -107.22886657714844,
"loss": 0.6627,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.941847026348114,
"rewards/margins": 0.46391814947128296,
"rewards/rejected": -1.405765175819397,
"step": 480
},
{
"epoch": 0.31349968010236723,
"grad_norm": 47.98414298526401,
"learning_rate": 3.8157894736842105e-07,
"logits/chosen": -2.854234218597412,
"logits/rejected": -2.8304595947265625,
"logps/chosen": -114.91446685791016,
"logps/rejected": -112.93721008300781,
"loss": 0.6314,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.6863322257995605,
"rewards/margins": 0.7260019183158875,
"rewards/rejected": -1.4123342037200928,
"step": 490
},
{
"epoch": 0.3198976327575176,
"grad_norm": 54.45893948180426,
"learning_rate": 3.7802275960170695e-07,
"logits/chosen": -2.821240186691284,
"logits/rejected": -2.813072443008423,
"logps/chosen": -115.79627990722656,
"logps/rejected": -108.83988952636719,
"loss": 0.6302,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.6848796010017395,
"rewards/margins": 0.7942295670509338,
"rewards/rejected": -1.4791094064712524,
"step": 500
},
{
"epoch": 0.32629558541266795,
"grad_norm": 72.78110036828252,
"learning_rate": 3.7446657183499284e-07,
"logits/chosen": -2.8358187675476074,
"logits/rejected": -2.8056414127349854,
"logps/chosen": -125.2596664428711,
"logps/rejected": -122.6445083618164,
"loss": 0.6721,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.0303077697753906,
"rewards/margins": 0.3377246856689453,
"rewards/rejected": -1.368032693862915,
"step": 510
},
{
"epoch": 0.3326935380678183,
"grad_norm": 51.57380900020349,
"learning_rate": 3.709103840682788e-07,
"logits/chosen": -2.8579823970794678,
"logits/rejected": -2.8210737705230713,
"logps/chosen": -123.88114166259766,
"logps/rejected": -107.84675598144531,
"loss": 0.628,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.7567764520645142,
"rewards/margins": 0.8902345895767212,
"rewards/rejected": -1.647011160850525,
"step": 520
},
{
"epoch": 0.3390914907229686,
"grad_norm": 45.41853425648929,
"learning_rate": 3.6735419630156474e-07,
"logits/chosen": -2.8426403999328613,
"logits/rejected": -2.808004856109619,
"logps/chosen": -115.7242202758789,
"logps/rejected": -105.14561462402344,
"loss": 0.6144,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.744358241558075,
"rewards/margins": 0.7397834658622742,
"rewards/rejected": -1.4841415882110596,
"step": 530
},
{
"epoch": 0.345489443378119,
"grad_norm": 45.6068502491896,
"learning_rate": 3.637980085348506e-07,
"logits/chosen": -2.8041722774505615,
"logits/rejected": -2.7811694145202637,
"logps/chosen": -116.31886291503906,
"logps/rejected": -101.0390625,
"loss": 0.6406,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.816309928894043,
"rewards/margins": 0.7303619384765625,
"rewards/rejected": -1.5466718673706055,
"step": 540
},
{
"epoch": 0.35188739603326935,
"grad_norm": 54.038270911291995,
"learning_rate": 3.602418207681365e-07,
"logits/chosen": -2.8286209106445312,
"logits/rejected": -2.8093667030334473,
"logps/chosen": -125.18721008300781,
"logps/rejected": -119.35871887207031,
"loss": 0.6892,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.797341525554657,
"rewards/margins": 0.7269377708435059,
"rewards/rejected": -1.5242793560028076,
"step": 550
},
{
"epoch": 0.3582853486884197,
"grad_norm": 37.61754758300332,
"learning_rate": 3.5668563300142247e-07,
"logits/chosen": -2.7906124591827393,
"logits/rejected": -2.7900514602661133,
"logps/chosen": -104.17930603027344,
"logps/rejected": -117.13945007324219,
"loss": 0.596,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.8746695518493652,
"rewards/margins": 0.798784613609314,
"rewards/rejected": -1.6734540462493896,
"step": 560
},
{
"epoch": 0.3646833013435701,
"grad_norm": 51.519397153489784,
"learning_rate": 3.5312944523470837e-07,
"logits/chosen": -2.760509490966797,
"logits/rejected": -2.745539665222168,
"logps/chosen": -108.7869644165039,
"logps/rejected": -102.55323791503906,
"loss": 0.6569,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -1.0966747999191284,
"rewards/margins": 0.5053921937942505,
"rewards/rejected": -1.602066993713379,
"step": 570
},
{
"epoch": 0.3710812539987204,
"grad_norm": 54.888184782180275,
"learning_rate": 3.495732574679943e-07,
"logits/chosen": -2.764152765274048,
"logits/rejected": -2.763406991958618,
"logps/chosen": -113.4003677368164,
"logps/rejected": -114.9386215209961,
"loss": 0.6024,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.9524062871932983,
"rewards/margins": 0.6822084188461304,
"rewards/rejected": -1.6346147060394287,
"step": 580
},
{
"epoch": 0.37747920665387075,
"grad_norm": 43.031105911892546,
"learning_rate": 3.460170697012802e-07,
"logits/chosen": -2.7501633167266846,
"logits/rejected": -2.722970485687256,
"logps/chosen": -116.67124938964844,
"logps/rejected": -100.81819152832031,
"loss": 0.567,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.8928259015083313,
"rewards/margins": 0.7573403120040894,
"rewards/rejected": -1.6501661539077759,
"step": 590
},
{
"epoch": 0.3838771593090211,
"grad_norm": 50.40927225064406,
"learning_rate": 3.424608819345661e-07,
"logits/chosen": -2.751669406890869,
"logits/rejected": -2.730020523071289,
"logps/chosen": -112.72367095947266,
"logps/rejected": -104.26304626464844,
"loss": 0.6374,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.5136244297027588,
"rewards/margins": 0.48020678758621216,
"rewards/rejected": -1.9938310384750366,
"step": 600
},
{
"epoch": 0.3838771593090211,
"eval_logits/chosen": -2.7680461406707764,
"eval_logits/rejected": -2.7502799034118652,
"eval_logps/chosen": -122.29470825195312,
"eval_logps/rejected": -110.04557800292969,
"eval_loss": 0.5879228711128235,
"eval_rewards/accuracies": 0.6799362897872925,
"eval_rewards/chosen": -1.1219713687896729,
"eval_rewards/margins": 0.6051455736160278,
"eval_rewards/rejected": -1.7271168231964111,
"eval_runtime": 286.7339,
"eval_samples_per_second": 17.438,
"eval_steps_per_second": 0.548,
"step": 600
},
{
"epoch": 0.3902751119641715,
"grad_norm": 38.22901412601261,
"learning_rate": 3.3890469416785205e-07,
"logits/chosen": -2.7812764644622803,
"logits/rejected": -2.7706708908081055,
"logps/chosen": -121.8727798461914,
"logps/rejected": -115.7394027709961,
"loss": 0.6536,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -1.3321597576141357,
"rewards/margins": 0.34968990087509155,
"rewards/rejected": -1.681849479675293,
"step": 610
},
{
"epoch": 0.39667306461932184,
"grad_norm": 62.8449168285338,
"learning_rate": 3.35348506401138e-07,
"logits/chosen": -2.7748451232910156,
"logits/rejected": -2.7738356590270996,
"logps/chosen": -139.68206787109375,
"logps/rejected": -123.91287994384766,
"loss": 0.6531,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.9537972211837769,
"rewards/margins": 0.5792536735534668,
"rewards/rejected": -1.5330508947372437,
"step": 620
},
{
"epoch": 0.40307101727447214,
"grad_norm": 55.7199501098711,
"learning_rate": 3.3179231863442384e-07,
"logits/chosen": -2.766322612762451,
"logits/rejected": -2.7525458335876465,
"logps/chosen": -115.6636734008789,
"logps/rejected": -115.79632568359375,
"loss": 0.6742,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -1.1982686519622803,
"rewards/margins": 0.32977309823036194,
"rewards/rejected": -1.5280416011810303,
"step": 630
},
{
"epoch": 0.4094689699296225,
"grad_norm": 59.627044623955385,
"learning_rate": 3.282361308677098e-07,
"logits/chosen": -2.752450942993164,
"logits/rejected": -2.7423059940338135,
"logps/chosen": -117.9424057006836,
"logps/rejected": -100.12724304199219,
"loss": 0.6591,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.099172592163086,
"rewards/margins": 0.4547777771949768,
"rewards/rejected": -1.553950548171997,
"step": 640
},
{
"epoch": 0.41586692258477287,
"grad_norm": 39.78897483264296,
"learning_rate": 3.2467994310099573e-07,
"logits/chosen": -2.7846992015838623,
"logits/rejected": -2.7701334953308105,
"logps/chosen": -120.6253890991211,
"logps/rejected": -114.74141693115234,
"loss": 0.6499,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.135921835899353,
"rewards/margins": 0.39883118867874146,
"rewards/rejected": -1.5347530841827393,
"step": 650
},
{
"epoch": 0.42226487523992323,
"grad_norm": 46.01875825095158,
"learning_rate": 3.211237553342817e-07,
"logits/chosen": -2.7716829776763916,
"logits/rejected": -2.7672672271728516,
"logps/chosen": -121.64005279541016,
"logps/rejected": -125.25276947021484,
"loss": 0.6072,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -1.1352884769439697,
"rewards/margins": 0.6041684746742249,
"rewards/rejected": -1.7394568920135498,
"step": 660
},
{
"epoch": 0.4286628278950736,
"grad_norm": 59.01505064312942,
"learning_rate": 3.175675675675675e-07,
"logits/chosen": -2.712557315826416,
"logits/rejected": -2.709873914718628,
"logps/chosen": -107.31956481933594,
"logps/rejected": -107.1248779296875,
"loss": 0.6156,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.3347444534301758,
"rewards/margins": 0.6245480179786682,
"rewards/rejected": -1.9592926502227783,
"step": 670
},
{
"epoch": 0.4350607805502239,
"grad_norm": 46.021209760495935,
"learning_rate": 3.1401137980085347e-07,
"logits/chosen": -2.7640795707702637,
"logits/rejected": -2.759766101837158,
"logps/chosen": -113.60371398925781,
"logps/rejected": -111.9162826538086,
"loss": 0.5924,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.1441354751586914,
"rewards/margins": 0.737005352973938,
"rewards/rejected": -1.881140947341919,
"step": 680
},
{
"epoch": 0.44145873320537427,
"grad_norm": 34.614349860027566,
"learning_rate": 3.104551920341394e-07,
"logits/chosen": -2.7723183631896973,
"logits/rejected": -2.7740321159362793,
"logps/chosen": -133.4069366455078,
"logps/rejected": -120.85469055175781,
"loss": 0.6382,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -1.1162710189819336,
"rewards/margins": 0.729344367980957,
"rewards/rejected": -1.8456153869628906,
"step": 690
},
{
"epoch": 0.44785668586052463,
"grad_norm": 61.67528158068899,
"learning_rate": 3.068990042674253e-07,
"logits/chosen": -2.770324468612671,
"logits/rejected": -2.756533145904541,
"logps/chosen": -116.19869232177734,
"logps/rejected": -124.49647521972656,
"loss": 0.7084,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.2367490530014038,
"rewards/margins": 0.4493141770362854,
"rewards/rejected": -1.6860634088516235,
"step": 700
},
{
"epoch": 0.454254638515675,
"grad_norm": 63.971216074190245,
"learning_rate": 3.033428165007112e-07,
"logits/chosen": -2.7677056789398193,
"logits/rejected": -2.7645699977874756,
"logps/chosen": -121.53895568847656,
"logps/rejected": -111.52986145019531,
"loss": 0.729,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.3768677711486816,
"rewards/margins": 0.27729541063308716,
"rewards/rejected": -1.654163122177124,
"step": 710
},
{
"epoch": 0.46065259117082535,
"grad_norm": 47.22353574035586,
"learning_rate": 2.9978662873399715e-07,
"logits/chosen": -2.8085451126098633,
"logits/rejected": -2.788102388381958,
"logps/chosen": -130.08607482910156,
"logps/rejected": -116.80802917480469,
"loss": 0.6264,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.9673389196395874,
"rewards/margins": 0.9441471099853516,
"rewards/rejected": -1.911486029624939,
"step": 720
},
{
"epoch": 0.46705054382597566,
"grad_norm": 61.77806719375072,
"learning_rate": 2.9623044096728305e-07,
"logits/chosen": -2.7899954319000244,
"logits/rejected": -2.7709438800811768,
"logps/chosen": -124.1104507446289,
"logps/rejected": -125.25813293457031,
"loss": 0.7038,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -1.1099923849105835,
"rewards/margins": 0.7608176469802856,
"rewards/rejected": -1.8708101511001587,
"step": 730
},
{
"epoch": 0.473448496481126,
"grad_norm": 50.86047987086807,
"learning_rate": 2.92674253200569e-07,
"logits/chosen": -2.7680702209472656,
"logits/rejected": -2.7549118995666504,
"logps/chosen": -121.9208755493164,
"logps/rejected": -110.9996566772461,
"loss": 0.5918,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -1.2252216339111328,
"rewards/margins": 0.7127790451049805,
"rewards/rejected": -1.9380006790161133,
"step": 740
},
{
"epoch": 0.4798464491362764,
"grad_norm": 58.045381028548476,
"learning_rate": 2.8911806543385494e-07,
"logits/chosen": -2.819340229034424,
"logits/rejected": -2.803678035736084,
"logps/chosen": -137.2626495361328,
"logps/rejected": -124.6344985961914,
"loss": 0.6366,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -1.0756101608276367,
"rewards/margins": 0.8651407957077026,
"rewards/rejected": -1.940751075744629,
"step": 750
},
{
"epoch": 0.48624440179142675,
"grad_norm": 45.24460591639927,
"learning_rate": 2.855618776671408e-07,
"logits/chosen": -2.787562847137451,
"logits/rejected": -2.766334056854248,
"logps/chosen": -122.2945556640625,
"logps/rejected": -107.06929779052734,
"loss": 0.6259,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.0217363834381104,
"rewards/margins": 0.8248605728149414,
"rewards/rejected": -1.8465969562530518,
"step": 760
},
{
"epoch": 0.4926423544465771,
"grad_norm": 51.01901234924279,
"learning_rate": 2.8200568990042673e-07,
"logits/chosen": -2.773236036300659,
"logits/rejected": -2.7518670558929443,
"logps/chosen": -122.63824462890625,
"logps/rejected": -108.55671691894531,
"loss": 0.5488,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.9058364629745483,
"rewards/margins": 0.8456705212593079,
"rewards/rejected": -1.7515071630477905,
"step": 770
},
{
"epoch": 0.4990403071017274,
"grad_norm": 47.059317214322036,
"learning_rate": 2.784495021337127e-07,
"logits/chosen": -2.7841098308563232,
"logits/rejected": -2.765838146209717,
"logps/chosen": -132.35830688476562,
"logps/rejected": -108.7806396484375,
"loss": 0.6505,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.1347086429595947,
"rewards/margins": 0.5946494340896606,
"rewards/rejected": -1.7293580770492554,
"step": 780
},
{
"epoch": 0.5054382597568778,
"grad_norm": 55.81431596800419,
"learning_rate": 2.7489331436699857e-07,
"logits/chosen": -2.7832138538360596,
"logits/rejected": -2.7748045921325684,
"logps/chosen": -129.96388244628906,
"logps/rejected": -128.643798828125,
"loss": 0.6315,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -1.3096433877944946,
"rewards/margins": 0.460153192281723,
"rewards/rejected": -1.76979660987854,
"step": 790
},
{
"epoch": 0.5118362124120281,
"grad_norm": 47.88527721947066,
"learning_rate": 2.7133712660028446e-07,
"logits/chosen": -2.747264862060547,
"logits/rejected": -2.73887300491333,
"logps/chosen": -118.75581359863281,
"logps/rejected": -112.53265380859375,
"loss": 0.5953,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.1123216152191162,
"rewards/margins": 0.8585169911384583,
"rewards/rejected": -1.9708385467529297,
"step": 800
},
{
"epoch": 0.5118362124120281,
"eval_logits/chosen": -2.7703402042388916,
"eval_logits/rejected": -2.7539730072021484,
"eval_logps/chosen": -123.07151794433594,
"eval_logps/rejected": -110.77460479736328,
"eval_loss": 0.5856689214706421,
"eval_rewards/accuracies": 0.6958598494529724,
"eval_rewards/chosen": -1.199651837348938,
"eval_rewards/margins": 0.6003690361976624,
"eval_rewards/rejected": -1.8000208139419556,
"eval_runtime": 282.6345,
"eval_samples_per_second": 17.691,
"eval_steps_per_second": 0.555,
"step": 800
},
{
"epoch": 0.5182341650671785,
"grad_norm": 53.88804382450492,
"learning_rate": 2.677809388335704e-07,
"logits/chosen": -2.7724366188049316,
"logits/rejected": -2.7628140449523926,
"logps/chosen": -115.2289810180664,
"logps/rejected": -118.9228744506836,
"loss": 0.6289,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.2024407386779785,
"rewards/margins": 0.8105290532112122,
"rewards/rejected": -2.012969970703125,
"step": 810
},
{
"epoch": 0.5246321177223289,
"grad_norm": 64.97852446899729,
"learning_rate": 2.642247510668563e-07,
"logits/chosen": -2.7826988697052,
"logits/rejected": -2.7776927947998047,
"logps/chosen": -128.9366455078125,
"logps/rejected": -126.57633209228516,
"loss": 0.6896,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.0592586994171143,
"rewards/margins": 0.7127590179443359,
"rewards/rejected": -1.7720177173614502,
"step": 820
},
{
"epoch": 0.5310300703774792,
"grad_norm": 44.76851798380462,
"learning_rate": 2.6066856330014225e-07,
"logits/chosen": -2.7971854209899902,
"logits/rejected": -2.7861063480377197,
"logps/chosen": -130.8908233642578,
"logps/rejected": -124.34150695800781,
"loss": 0.6262,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.9852834939956665,
"rewards/margins": 0.6130325198173523,
"rewards/rejected": -1.5983158349990845,
"step": 830
},
{
"epoch": 0.5374280230326296,
"grad_norm": 46.52784154303764,
"learning_rate": 2.5711237553342815e-07,
"logits/chosen": -2.8150525093078613,
"logits/rejected": -2.800579071044922,
"logps/chosen": -136.0585479736328,
"logps/rejected": -105.21810150146484,
"loss": 0.568,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.169297218322754,
"rewards/margins": 0.7267537713050842,
"rewards/rejected": -1.8960508108139038,
"step": 840
},
{
"epoch": 0.5438259756877799,
"grad_norm": 53.63407392324102,
"learning_rate": 2.5355618776671404e-07,
"logits/chosen": -2.833627223968506,
"logits/rejected": -2.8091251850128174,
"logps/chosen": -131.2395477294922,
"logps/rejected": -124.14871978759766,
"loss": 0.6241,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -1.223509430885315,
"rewards/margins": 0.6891213059425354,
"rewards/rejected": -1.9126307964324951,
"step": 850
},
{
"epoch": 0.5502239283429302,
"grad_norm": 40.739420757399046,
"learning_rate": 2.5e-07,
"logits/chosen": -2.804734468460083,
"logits/rejected": -2.7905805110931396,
"logps/chosen": -117.0075454711914,
"logps/rejected": -106.9182357788086,
"loss": 0.609,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -1.2571464776992798,
"rewards/margins": 0.5797263383865356,
"rewards/rejected": -1.8368728160858154,
"step": 860
},
{
"epoch": 0.5566218809980806,
"grad_norm": 52.20343272589441,
"learning_rate": 2.4644381223328594e-07,
"logits/chosen": -2.8013713359832764,
"logits/rejected": -2.7786829471588135,
"logps/chosen": -120.1400375366211,
"logps/rejected": -105.8758773803711,
"loss": 0.6688,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.250685453414917,
"rewards/margins": 0.7675702571868896,
"rewards/rejected": -2.0182557106018066,
"step": 870
},
{
"epoch": 0.5630198336532309,
"grad_norm": 55.78019195317067,
"learning_rate": 2.4288762446657183e-07,
"logits/chosen": -2.824733257293701,
"logits/rejected": -2.8199667930603027,
"logps/chosen": -123.25453186035156,
"logps/rejected": -134.382080078125,
"loss": 0.6199,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.1268641948699951,
"rewards/margins": 0.8908056020736694,
"rewards/rejected": -2.017669916152954,
"step": 880
},
{
"epoch": 0.5694177863083814,
"grad_norm": 46.8913181947373,
"learning_rate": 2.393314366998578e-07,
"logits/chosen": -2.7837607860565186,
"logits/rejected": -2.783323049545288,
"logps/chosen": -118.7066421508789,
"logps/rejected": -119.07723236083984,
"loss": 0.7139,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.0711259841918945,
"rewards/margins": 0.7609738707542419,
"rewards/rejected": -1.8321001529693604,
"step": 890
},
{
"epoch": 0.5758157389635317,
"grad_norm": 61.90420729087464,
"learning_rate": 2.3577524893314365e-07,
"logits/chosen": -2.7695021629333496,
"logits/rejected": -2.761353015899658,
"logps/chosen": -111.3558578491211,
"logps/rejected": -111.15797424316406,
"loss": 0.6603,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -1.1308844089508057,
"rewards/margins": 0.5882295966148376,
"rewards/rejected": -1.7191137075424194,
"step": 900
},
{
"epoch": 0.582213691618682,
"grad_norm": 48.59525480797925,
"learning_rate": 2.322190611664296e-07,
"logits/chosen": -2.789656639099121,
"logits/rejected": -2.781515121459961,
"logps/chosen": -118.76673889160156,
"logps/rejected": -105.30081939697266,
"loss": 0.6481,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -1.1197543144226074,
"rewards/margins": 0.6071382761001587,
"rewards/rejected": -1.7268924713134766,
"step": 910
},
{
"epoch": 0.5886116442738324,
"grad_norm": 43.472580466054765,
"learning_rate": 2.2866287339971549e-07,
"logits/chosen": -2.789944887161255,
"logits/rejected": -2.771533250808716,
"logps/chosen": -120.75154113769531,
"logps/rejected": -119.8617935180664,
"loss": 0.6377,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.0260751247406006,
"rewards/margins": 0.7392138242721558,
"rewards/rejected": -1.7652889490127563,
"step": 920
},
{
"epoch": 0.5950095969289827,
"grad_norm": 55.2411167534148,
"learning_rate": 2.251066856330014e-07,
"logits/chosen": -2.7701098918914795,
"logits/rejected": -2.7492525577545166,
"logps/chosen": -114.12837219238281,
"logps/rejected": -100.77635192871094,
"loss": 0.6102,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.210095763206482,
"rewards/margins": 0.505190372467041,
"rewards/rejected": -1.7152862548828125,
"step": 930
},
{
"epoch": 0.6014075495841331,
"grad_norm": 50.0185028587488,
"learning_rate": 2.2155049786628733e-07,
"logits/chosen": -2.7873411178588867,
"logits/rejected": -2.7700142860412598,
"logps/chosen": -114.91324615478516,
"logps/rejected": -105.86214447021484,
"loss": 0.6749,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.211742639541626,
"rewards/margins": 0.5054360628128052,
"rewards/rejected": -1.7171787023544312,
"step": 940
},
{
"epoch": 0.6078055022392834,
"grad_norm": 42.87415326293721,
"learning_rate": 2.1799431009957325e-07,
"logits/chosen": -2.78757381439209,
"logits/rejected": -2.776582717895508,
"logps/chosen": -128.70608520507812,
"logps/rejected": -124.4533920288086,
"loss": 0.6757,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.146447777748108,
"rewards/margins": 0.6257702112197876,
"rewards/rejected": -1.7722179889678955,
"step": 950
},
{
"epoch": 0.6142034548944337,
"grad_norm": 49.65392226267386,
"learning_rate": 2.1443812233285914e-07,
"logits/chosen": -2.7512404918670654,
"logits/rejected": -2.744755983352661,
"logps/chosen": -110.56729888916016,
"logps/rejected": -121.34492492675781,
"loss": 0.6406,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.2666919231414795,
"rewards/margins": 0.49605482816696167,
"rewards/rejected": -1.762746810913086,
"step": 960
},
{
"epoch": 0.6206014075495841,
"grad_norm": 47.93077072841145,
"learning_rate": 2.108819345661451e-07,
"logits/chosen": -2.764608144760132,
"logits/rejected": -2.764833927154541,
"logps/chosen": -124.5059585571289,
"logps/rejected": -116.89808654785156,
"loss": 0.597,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.0087820291519165,
"rewards/margins": 0.6207214593887329,
"rewards/rejected": -1.6295034885406494,
"step": 970
},
{
"epoch": 0.6269993602047345,
"grad_norm": 52.39545922578806,
"learning_rate": 2.0732574679943098e-07,
"logits/chosen": -2.7816779613494873,
"logits/rejected": -2.761172294616699,
"logps/chosen": -137.40052795410156,
"logps/rejected": -122.988525390625,
"loss": 0.6108,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.0072863101959229,
"rewards/margins": 0.8142153024673462,
"rewards/rejected": -1.8215014934539795,
"step": 980
},
{
"epoch": 0.6333973128598849,
"grad_norm": 44.290985976089594,
"learning_rate": 2.0376955903271693e-07,
"logits/chosen": -2.774445056915283,
"logits/rejected": -2.763420343399048,
"logps/chosen": -126.58064270019531,
"logps/rejected": -120.07356262207031,
"loss": 0.5508,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.9515323638916016,
"rewards/margins": 0.677230179309845,
"rewards/rejected": -1.6287622451782227,
"step": 990
},
{
"epoch": 0.6397952655150352,
"grad_norm": 47.67634608134284,
"learning_rate": 2.0021337126600283e-07,
"logits/chosen": -2.772531747817993,
"logits/rejected": -2.749040365219116,
"logps/chosen": -130.54287719726562,
"logps/rejected": -118.84379577636719,
"loss": 0.5874,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.2028357982635498,
"rewards/margins": 0.8163717985153198,
"rewards/rejected": -2.01920747756958,
"step": 1000
},
{
"epoch": 0.6397952655150352,
"eval_logits/chosen": -2.754683494567871,
"eval_logits/rejected": -2.738701581954956,
"eval_logps/chosen": -123.66197204589844,
"eval_logps/rejected": -111.75140380859375,
"eval_loss": 0.5864265561103821,
"eval_rewards/accuracies": 0.6918789744377136,
"eval_rewards/chosen": -1.2586979866027832,
"eval_rewards/margins": 0.6390010714530945,
"eval_rewards/rejected": -1.897699236869812,
"eval_runtime": 280.4436,
"eval_samples_per_second": 17.829,
"eval_steps_per_second": 0.56,
"step": 1000
},
{
"epoch": 0.6461932181701855,
"grad_norm": 63.64218572486143,
"learning_rate": 1.9665718349928875e-07,
"logits/chosen": -2.737196445465088,
"logits/rejected": -2.734534978866577,
"logps/chosen": -121.40058898925781,
"logps/rejected": -127.97169494628906,
"loss": 0.5982,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.1952520608901978,
"rewards/margins": 0.8720852136611938,
"rewards/rejected": -2.0673370361328125,
"step": 1010
},
{
"epoch": 0.6525911708253359,
"grad_norm": 48.03592117722671,
"learning_rate": 1.931009957325747e-07,
"logits/chosen": -2.7495694160461426,
"logits/rejected": -2.735703229904175,
"logps/chosen": -141.25796508789062,
"logps/rejected": -112.2705078125,
"loss": 0.5639,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.8717496991157532,
"rewards/margins": 1.0097849369049072,
"rewards/rejected": -1.8815345764160156,
"step": 1020
},
{
"epoch": 0.6589891234804862,
"grad_norm": 50.405047652842626,
"learning_rate": 1.895448079658606e-07,
"logits/chosen": -2.769981861114502,
"logits/rejected": -2.7553834915161133,
"logps/chosen": -117.5353012084961,
"logps/rejected": -118.06101989746094,
"loss": 0.6584,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.4604730606079102,
"rewards/margins": 0.6957337856292725,
"rewards/rejected": -2.1562066078186035,
"step": 1030
},
{
"epoch": 0.6653870761356366,
"grad_norm": 42.71526690235821,
"learning_rate": 1.859886201991465e-07,
"logits/chosen": -2.753505229949951,
"logits/rejected": -2.739760637283325,
"logps/chosen": -132.32034301757812,
"logps/rejected": -117.8974838256836,
"loss": 0.6377,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -1.3013321161270142,
"rewards/margins": 0.5227604508399963,
"rewards/rejected": -1.8240925073623657,
"step": 1040
},
{
"epoch": 0.6717850287907869,
"grad_norm": 41.397749630534875,
"learning_rate": 1.8243243243243243e-07,
"logits/chosen": -2.703765869140625,
"logits/rejected": -2.709289789199829,
"logps/chosen": -113.61067199707031,
"logps/rejected": -124.3532943725586,
"loss": 0.6313,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -1.3022973537445068,
"rewards/margins": 0.5214705467224121,
"rewards/rejected": -1.823767900466919,
"step": 1050
},
{
"epoch": 0.6781829814459372,
"grad_norm": 47.2302545472036,
"learning_rate": 1.7887624466571835e-07,
"logits/chosen": -2.7339816093444824,
"logits/rejected": -2.7247262001037598,
"logps/chosen": -113.62435150146484,
"logps/rejected": -124.93162536621094,
"loss": 0.6671,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.3559902906417847,
"rewards/margins": 0.7308332920074463,
"rewards/rejected": -2.0868237018585205,
"step": 1060
},
{
"epoch": 0.6845809341010877,
"grad_norm": 45.97787540053498,
"learning_rate": 1.7532005689900424e-07,
"logits/chosen": -2.726437568664551,
"logits/rejected": -2.7229714393615723,
"logps/chosen": -121.73079681396484,
"logps/rejected": -122.3262939453125,
"loss": 0.6183,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -1.3220767974853516,
"rewards/margins": 0.7562096118927002,
"rewards/rejected": -2.0782861709594727,
"step": 1070
},
{
"epoch": 0.690978886756238,
"grad_norm": 36.28143478337335,
"learning_rate": 1.717638691322902e-07,
"logits/chosen": -2.753014326095581,
"logits/rejected": -2.7368969917297363,
"logps/chosen": -130.1658172607422,
"logps/rejected": -130.8894805908203,
"loss": 0.6223,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -1.1469093561172485,
"rewards/margins": 0.7361122369766235,
"rewards/rejected": -1.883021593093872,
"step": 1080
},
{
"epoch": 0.6973768394113884,
"grad_norm": 43.630489091326666,
"learning_rate": 1.6820768136557609e-07,
"logits/chosen": -2.729057788848877,
"logits/rejected": -2.717261791229248,
"logps/chosen": -125.09024810791016,
"logps/rejected": -125.5413589477539,
"loss": 0.6573,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -1.042244791984558,
"rewards/margins": 0.36305585503578186,
"rewards/rejected": -1.4053006172180176,
"step": 1090
},
{
"epoch": 0.7037747920665387,
"grad_norm": 48.19222828508351,
"learning_rate": 1.64651493598862e-07,
"logits/chosen": -2.7365171909332275,
"logits/rejected": -2.712463140487671,
"logps/chosen": -125.68775939941406,
"logps/rejected": -114.6905746459961,
"loss": 0.5656,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.0582258701324463,
"rewards/margins": 0.8331489562988281,
"rewards/rejected": -1.891374945640564,
"step": 1100
},
{
"epoch": 0.710172744721689,
"grad_norm": 45.80775187080158,
"learning_rate": 1.6109530583214793e-07,
"logits/chosen": -2.7182765007019043,
"logits/rejected": -2.712038040161133,
"logps/chosen": -118.6636962890625,
"logps/rejected": -127.0698471069336,
"loss": 0.5943,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.082397222518921,
"rewards/margins": 1.1772416830062866,
"rewards/rejected": -2.259639263153076,
"step": 1110
},
{
"epoch": 0.7165706973768394,
"grad_norm": 49.2718796270582,
"learning_rate": 1.5753911806543385e-07,
"logits/chosen": -2.7429823875427246,
"logits/rejected": -2.735952854156494,
"logps/chosen": -139.99502563476562,
"logps/rejected": -133.4441680908203,
"loss": 0.6082,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.9839005470275879,
"rewards/margins": 0.8153827786445618,
"rewards/rejected": -1.7992833852767944,
"step": 1120
},
{
"epoch": 0.7229686500319897,
"grad_norm": 53.63575414905778,
"learning_rate": 1.5398293029871974e-07,
"logits/chosen": -2.6896634101867676,
"logits/rejected": -2.6739554405212402,
"logps/chosen": -132.13693237304688,
"logps/rejected": -115.6174087524414,
"loss": 0.615,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.495951771736145,
"rewards/margins": 0.6560246348381042,
"rewards/rejected": -2.1519765853881836,
"step": 1130
},
{
"epoch": 0.7293666026871402,
"grad_norm": 45.459330351613914,
"learning_rate": 1.504267425320057e-07,
"logits/chosen": -2.747023820877075,
"logits/rejected": -2.7279767990112305,
"logps/chosen": -131.20159912109375,
"logps/rejected": -118.8502426147461,
"loss": 0.6432,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.0559256076812744,
"rewards/margins": 0.745880126953125,
"rewards/rejected": -1.801805853843689,
"step": 1140
},
{
"epoch": 0.7357645553422905,
"grad_norm": 50.69242455839191,
"learning_rate": 1.4687055476529158e-07,
"logits/chosen": -2.724719524383545,
"logits/rejected": -2.696193218231201,
"logps/chosen": -132.23805236816406,
"logps/rejected": -116.33622741699219,
"loss": 0.5897,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.1425752639770508,
"rewards/margins": 1.2291043996810913,
"rewards/rejected": -2.3716797828674316,
"step": 1150
},
{
"epoch": 0.7421625079974408,
"grad_norm": 48.260915633915204,
"learning_rate": 1.4331436699857753e-07,
"logits/chosen": -2.7609190940856934,
"logits/rejected": -2.751678466796875,
"logps/chosen": -132.62606811523438,
"logps/rejected": -131.4741668701172,
"loss": 0.6021,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.3398916721343994,
"rewards/margins": 0.5659030079841614,
"rewards/rejected": -1.9057947397232056,
"step": 1160
},
{
"epoch": 0.7485604606525912,
"grad_norm": 51.70354571255556,
"learning_rate": 1.3975817923186345e-07,
"logits/chosen": -2.748525619506836,
"logits/rejected": -2.731635570526123,
"logps/chosen": -148.0665740966797,
"logps/rejected": -112.72142028808594,
"loss": 0.6329,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.066410779953003,
"rewards/margins": 0.9416675567626953,
"rewards/rejected": -2.0080783367156982,
"step": 1170
},
{
"epoch": 0.7549584133077415,
"grad_norm": 46.631060529374324,
"learning_rate": 1.3620199146514935e-07,
"logits/chosen": -2.715355396270752,
"logits/rejected": -2.7042181491851807,
"logps/chosen": -122.81558990478516,
"logps/rejected": -123.2553482055664,
"loss": 0.6433,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -1.463372826576233,
"rewards/margins": 0.7959567308425903,
"rewards/rejected": -2.2593295574188232,
"step": 1180
},
{
"epoch": 0.7613563659628919,
"grad_norm": 48.30565774738731,
"learning_rate": 1.326458036984353e-07,
"logits/chosen": -2.703979253768921,
"logits/rejected": -2.711151599884033,
"logps/chosen": -123.97117614746094,
"logps/rejected": -118.76310729980469,
"loss": 0.662,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.5227792263031006,
"rewards/margins": 0.5576712489128113,
"rewards/rejected": -2.0804507732391357,
"step": 1190
},
{
"epoch": 0.7677543186180422,
"grad_norm": 42.393577486160744,
"learning_rate": 1.290896159317212e-07,
"logits/chosen": -2.7326602935791016,
"logits/rejected": -2.719825267791748,
"logps/chosen": -139.4118194580078,
"logps/rejected": -122.456787109375,
"loss": 0.5937,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.2478922605514526,
"rewards/margins": 0.7356584668159485,
"rewards/rejected": -1.983550786972046,
"step": 1200
},
{
"epoch": 0.7677543186180422,
"eval_logits/chosen": -2.724303960800171,
"eval_logits/rejected": -2.710904359817505,
"eval_logps/chosen": -125.66478729248047,
"eval_logps/rejected": -114.08828735351562,
"eval_loss": 0.5853144526481628,
"eval_rewards/accuracies": 0.6942675113677979,
"eval_rewards/chosen": -1.4589799642562866,
"eval_rewards/margins": 0.672407329082489,
"eval_rewards/rejected": -2.131387233734131,
"eval_runtime": 282.866,
"eval_samples_per_second": 17.676,
"eval_steps_per_second": 0.555,
"step": 1200
},
{
"epoch": 0.7741522712731925,
"grad_norm": 44.50545746661839,
"learning_rate": 1.255334281650071e-07,
"logits/chosen": -2.72662615776062,
"logits/rejected": -2.701281785964966,
"logps/chosen": -126.24546813964844,
"logps/rejected": -121.22220611572266,
"loss": 0.6014,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.2705129384994507,
"rewards/margins": 0.8755936622619629,
"rewards/rejected": -2.146106719970703,
"step": 1210
},
{
"epoch": 0.780550223928343,
"grad_norm": 40.447524019572015,
"learning_rate": 1.2197724039829303e-07,
"logits/chosen": -2.7539114952087402,
"logits/rejected": -2.744138717651367,
"logps/chosen": -126.99958801269531,
"logps/rejected": -123.30317687988281,
"loss": 0.7122,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.4938628673553467,
"rewards/margins": 0.9415397644042969,
"rewards/rejected": -2.4354023933410645,
"step": 1220
},
{
"epoch": 0.7869481765834933,
"grad_norm": 52.15736150053683,
"learning_rate": 1.1842105263157894e-07,
"logits/chosen": -2.709707736968994,
"logits/rejected": -2.7100932598114014,
"logps/chosen": -107.58488464355469,
"logps/rejected": -112.5229263305664,
"loss": 0.6585,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.4826072454452515,
"rewards/margins": 0.853145956993103,
"rewards/rejected": -2.3357534408569336,
"step": 1230
},
{
"epoch": 0.7933461292386437,
"grad_norm": 50.42046207889911,
"learning_rate": 1.1486486486486487e-07,
"logits/chosen": -2.7156357765197754,
"logits/rejected": -2.7060704231262207,
"logps/chosen": -114.21882629394531,
"logps/rejected": -110.34465026855469,
"loss": 0.5955,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.1574304103851318,
"rewards/margins": 0.7427719831466675,
"rewards/rejected": -1.9002023935317993,
"step": 1240
},
{
"epoch": 0.799744081893794,
"grad_norm": 42.01442320837617,
"learning_rate": 1.1130867709815078e-07,
"logits/chosen": -2.7430567741394043,
"logits/rejected": -2.7422475814819336,
"logps/chosen": -128.494140625,
"logps/rejected": -127.20819091796875,
"loss": 0.6143,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.1710187196731567,
"rewards/margins": 0.7917193174362183,
"rewards/rejected": -1.962738037109375,
"step": 1250
},
{
"epoch": 0.8061420345489443,
"grad_norm": 49.68905015741029,
"learning_rate": 1.077524893314367e-07,
"logits/chosen": -2.7214901447296143,
"logits/rejected": -2.7370707988739014,
"logps/chosen": -114.65711975097656,
"logps/rejected": -123.6860580444336,
"loss": 0.5802,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.3035097122192383,
"rewards/margins": 0.888281524181366,
"rewards/rejected": -2.19179105758667,
"step": 1260
},
{
"epoch": 0.8125399872040947,
"grad_norm": 53.09725997704241,
"learning_rate": 1.0419630156472262e-07,
"logits/chosen": -2.741344928741455,
"logits/rejected": -2.726548671722412,
"logps/chosen": -141.3468017578125,
"logps/rejected": -131.9232940673828,
"loss": 0.6122,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.1199138164520264,
"rewards/margins": 1.1132014989852905,
"rewards/rejected": -2.2331154346466064,
"step": 1270
},
{
"epoch": 0.818937939859245,
"grad_norm": 40.62862249158643,
"learning_rate": 1.0064011379800854e-07,
"logits/chosen": -2.7529187202453613,
"logits/rejected": -2.7343528270721436,
"logps/chosen": -133.5608367919922,
"logps/rejected": -124.2785415649414,
"loss": 0.5805,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.192963719367981,
"rewards/margins": 0.6357627511024475,
"rewards/rejected": -1.8287265300750732,
"step": 1280
},
{
"epoch": 0.8253358925143954,
"grad_norm": 41.682064792926646,
"learning_rate": 9.708392603129445e-08,
"logits/chosen": -2.738985538482666,
"logits/rejected": -2.735153913497925,
"logps/chosen": -128.6671600341797,
"logps/rejected": -118.43110656738281,
"loss": 0.5913,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.1868826150894165,
"rewards/margins": 0.8519124984741211,
"rewards/rejected": -2.038794994354248,
"step": 1290
},
{
"epoch": 0.8317338451695457,
"grad_norm": 39.13726550603149,
"learning_rate": 9.352773826458037e-08,
"logits/chosen": -2.7326302528381348,
"logits/rejected": -2.7126924991607666,
"logps/chosen": -116.5313949584961,
"logps/rejected": -110.55419921875,
"loss": 0.6194,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.4761756658554077,
"rewards/margins": 0.7760677933692932,
"rewards/rejected": -2.2522435188293457,
"step": 1300
},
{
"epoch": 0.838131797824696,
"grad_norm": 45.15762786397904,
"learning_rate": 8.997155049786629e-08,
"logits/chosen": -2.722672462463379,
"logits/rejected": -2.709507703781128,
"logps/chosen": -118.71284484863281,
"logps/rejected": -112.42694091796875,
"loss": 0.5624,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.477581262588501,
"rewards/margins": 0.778394341468811,
"rewards/rejected": -2.2559754848480225,
"step": 1310
},
{
"epoch": 0.8445297504798465,
"grad_norm": 53.01149491791956,
"learning_rate": 8.64153627311522e-08,
"logits/chosen": -2.7539539337158203,
"logits/rejected": -2.742306709289551,
"logps/chosen": -140.6981201171875,
"logps/rejected": -123.42921447753906,
"loss": 0.6231,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.2546924352645874,
"rewards/margins": 0.8097953796386719,
"rewards/rejected": -2.064487934112549,
"step": 1320
},
{
"epoch": 0.8509277031349968,
"grad_norm": 50.72776697356608,
"learning_rate": 8.285917496443812e-08,
"logits/chosen": -2.7258553504943848,
"logits/rejected": -2.7142205238342285,
"logps/chosen": -112.5505142211914,
"logps/rejected": -115.3934097290039,
"loss": 0.5864,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.3138844966888428,
"rewards/margins": 0.8471376299858093,
"rewards/rejected": -2.1610217094421387,
"step": 1330
},
{
"epoch": 0.8573256557901472,
"grad_norm": 42.82498871290927,
"learning_rate": 7.930298719772404e-08,
"logits/chosen": -2.7396187782287598,
"logits/rejected": -2.723381757736206,
"logps/chosen": -125.97160339355469,
"logps/rejected": -136.47666931152344,
"loss": 0.6425,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.326712727546692,
"rewards/margins": 0.7852845788002014,
"rewards/rejected": -2.111997365951538,
"step": 1340
},
{
"epoch": 0.8637236084452975,
"grad_norm": 45.714252062937796,
"learning_rate": 7.574679943100994e-08,
"logits/chosen": -2.7322983741760254,
"logits/rejected": -2.7127528190612793,
"logps/chosen": -136.2902374267578,
"logps/rejected": -122.64701843261719,
"loss": 0.6639,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.2270106077194214,
"rewards/margins": 0.9102425575256348,
"rewards/rejected": -2.1372532844543457,
"step": 1350
},
{
"epoch": 0.8701215611004478,
"grad_norm": 38.60709881607122,
"learning_rate": 7.219061166429587e-08,
"logits/chosen": -2.743222713470459,
"logits/rejected": -2.7290613651275635,
"logps/chosen": -124.32318115234375,
"logps/rejected": -121.3166732788086,
"loss": 0.6216,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.4048868417739868,
"rewards/margins": 0.6789718866348267,
"rewards/rejected": -2.0838589668273926,
"step": 1360
},
{
"epoch": 0.8765195137555982,
"grad_norm": 51.70983027706398,
"learning_rate": 6.863442389758179e-08,
"logits/chosen": -2.730867862701416,
"logits/rejected": -2.7255940437316895,
"logps/chosen": -134.34658813476562,
"logps/rejected": -125.96586608886719,
"loss": 0.6424,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.6426973342895508,
"rewards/margins": 0.5250149965286255,
"rewards/rejected": -2.167712450027466,
"step": 1370
},
{
"epoch": 0.8829174664107485,
"grad_norm": 37.61931616704097,
"learning_rate": 6.507823613086771e-08,
"logits/chosen": -2.7112784385681152,
"logits/rejected": -2.694859027862549,
"logps/chosen": -123.34944915771484,
"logps/rejected": -104.432861328125,
"loss": 0.5347,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.2228862047195435,
"rewards/margins": 0.8305751085281372,
"rewards/rejected": -2.0534613132476807,
"step": 1380
},
{
"epoch": 0.889315419065899,
"grad_norm": 54.48021270096627,
"learning_rate": 6.152204836415363e-08,
"logits/chosen": -2.7345829010009766,
"logits/rejected": -2.72003436088562,
"logps/chosen": -136.1989288330078,
"logps/rejected": -122.9295654296875,
"loss": 0.562,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.347959280014038,
"rewards/margins": 1.0466934442520142,
"rewards/rejected": -2.3946526050567627,
"step": 1390
},
{
"epoch": 0.8957133717210493,
"grad_norm": 54.21936478355526,
"learning_rate": 5.796586059743954e-08,
"logits/chosen": -2.7425692081451416,
"logits/rejected": -2.720557928085327,
"logps/chosen": -122.23038482666016,
"logps/rejected": -130.52120971679688,
"loss": 0.6276,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.1752973794937134,
"rewards/margins": 1.030903935432434,
"rewards/rejected": -2.2062013149261475,
"step": 1400
},
{
"epoch": 0.8957133717210493,
"eval_logits/chosen": -2.724804639816284,
"eval_logits/rejected": -2.712906837463379,
"eval_logps/chosen": -125.66414642333984,
"eval_logps/rejected": -114.13500213623047,
"eval_loss": 0.584474503993988,
"eval_rewards/accuracies": 0.699840784072876,
"eval_rewards/chosen": -1.4589147567749023,
"eval_rewards/margins": 0.6771440505981445,
"eval_rewards/rejected": -2.136058807373047,
"eval_runtime": 279.8917,
"eval_samples_per_second": 17.864,
"eval_steps_per_second": 0.561,
"step": 1400
},
{
"epoch": 0.9021113243761996,
"grad_norm": 55.42656529498578,
"learning_rate": 5.4409672830725456e-08,
"logits/chosen": -2.757150888442993,
"logits/rejected": -2.7437245845794678,
"logps/chosen": -148.27255249023438,
"logps/rejected": -129.71078491210938,
"loss": 0.5897,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.3937550783157349,
"rewards/margins": 1.0089712142944336,
"rewards/rejected": -2.402726173400879,
"step": 1410
},
{
"epoch": 0.90850927703135,
"grad_norm": 44.831480768616196,
"learning_rate": 5.0853485064011376e-08,
"logits/chosen": -2.7586827278137207,
"logits/rejected": -2.747013568878174,
"logps/chosen": -141.61642456054688,
"logps/rejected": -140.4078826904297,
"loss": 0.5571,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.5571249723434448,
"rewards/margins": 0.6771610975265503,
"rewards/rejected": -2.234286069869995,
"step": 1420
},
{
"epoch": 0.9149072296865003,
"grad_norm": 45.6706384991516,
"learning_rate": 4.72972972972973e-08,
"logits/chosen": -2.7541661262512207,
"logits/rejected": -2.741849899291992,
"logps/chosen": -131.68832397460938,
"logps/rejected": -112.1330795288086,
"loss": 0.5596,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.3573007583618164,
"rewards/margins": 0.9988776445388794,
"rewards/rejected": -2.356178045272827,
"step": 1430
},
{
"epoch": 0.9213051823416507,
"grad_norm": 32.91552794038842,
"learning_rate": 4.374110953058322e-08,
"logits/chosen": -2.7465932369232178,
"logits/rejected": -2.7333266735076904,
"logps/chosen": -130.47242736816406,
"logps/rejected": -122.72697448730469,
"loss": 0.5869,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.6356592178344727,
"rewards/margins": 1.0344393253326416,
"rewards/rejected": -2.6700987815856934,
"step": 1440
},
{
"epoch": 0.927703134996801,
"grad_norm": 46.88062796504381,
"learning_rate": 4.018492176386913e-08,
"logits/chosen": -2.76434326171875,
"logits/rejected": -2.746072769165039,
"logps/chosen": -130.2028350830078,
"logps/rejected": -122.24436950683594,
"loss": 0.6267,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.112995982170105,
"rewards/margins": 1.0192458629608154,
"rewards/rejected": -2.132241725921631,
"step": 1450
},
{
"epoch": 0.9341010876519513,
"grad_norm": 52.44065789666525,
"learning_rate": 3.6628733997155046e-08,
"logits/chosen": -2.751476287841797,
"logits/rejected": -2.7347888946533203,
"logps/chosen": -133.9870147705078,
"logps/rejected": -123.64642333984375,
"loss": 0.669,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.618487000465393,
"rewards/margins": 0.8043287992477417,
"rewards/rejected": -2.4228157997131348,
"step": 1460
},
{
"epoch": 0.9404990403071017,
"grad_norm": 46.529272074784444,
"learning_rate": 3.3072546230440967e-08,
"logits/chosen": -2.7366256713867188,
"logits/rejected": -2.7187893390655518,
"logps/chosen": -123.87571716308594,
"logps/rejected": -120.4697265625,
"loss": 0.5889,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.5764787197113037,
"rewards/margins": 0.6577258110046387,
"rewards/rejected": -2.2342045307159424,
"step": 1470
},
{
"epoch": 0.946896992962252,
"grad_norm": 43.88442923670593,
"learning_rate": 2.9516358463726884e-08,
"logits/chosen": -2.7498910427093506,
"logits/rejected": -2.748452663421631,
"logps/chosen": -133.49234008789062,
"logps/rejected": -140.14096069335938,
"loss": 0.6418,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -1.693602204322815,
"rewards/margins": 0.790652871131897,
"rewards/rejected": -2.484255075454712,
"step": 1480
},
{
"epoch": 0.9532949456174025,
"grad_norm": 44.15596171124334,
"learning_rate": 2.59601706970128e-08,
"logits/chosen": -2.7611732482910156,
"logits/rejected": -2.7567832469940186,
"logps/chosen": -127.27906799316406,
"logps/rejected": -123.48602294921875,
"loss": 0.6792,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.2605165243148804,
"rewards/margins": 0.9137457013130188,
"rewards/rejected": -2.174262285232544,
"step": 1490
},
{
"epoch": 0.9596928982725528,
"grad_norm": 54.18735827602704,
"learning_rate": 2.240398293029872e-08,
"logits/chosen": -2.7405667304992676,
"logits/rejected": -2.7390694618225098,
"logps/chosen": -130.6249237060547,
"logps/rejected": -117.32889556884766,
"loss": 0.6408,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -1.4110792875289917,
"rewards/margins": 0.7672127485275269,
"rewards/rejected": -2.1782920360565186,
"step": 1500
},
{
"epoch": 0.9660908509277031,
"grad_norm": 36.9566011502463,
"learning_rate": 1.8847795163584636e-08,
"logits/chosen": -2.7048280239105225,
"logits/rejected": -2.7038590908050537,
"logps/chosen": -109.9410400390625,
"logps/rejected": -115.59232330322266,
"loss": 0.5897,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.5398355722427368,
"rewards/margins": 0.9320799112319946,
"rewards/rejected": -2.4719154834747314,
"step": 1510
},
{
"epoch": 0.9724888035828535,
"grad_norm": 34.807994778606435,
"learning_rate": 1.5291607396870554e-08,
"logits/chosen": -2.7435240745544434,
"logits/rejected": -2.730076551437378,
"logps/chosen": -131.5994873046875,
"logps/rejected": -111.15098571777344,
"loss": 0.5825,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.3073338270187378,
"rewards/margins": 0.9064651727676392,
"rewards/rejected": -2.213798999786377,
"step": 1520
},
{
"epoch": 0.9788867562380038,
"grad_norm": 54.31697188600043,
"learning_rate": 1.1735419630156473e-08,
"logits/chosen": -2.7345783710479736,
"logits/rejected": -2.7173001766204834,
"logps/chosen": -122.76127624511719,
"logps/rejected": -112.13035583496094,
"loss": 0.6551,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.4825352430343628,
"rewards/margins": 0.7752220034599304,
"rewards/rejected": -2.2577571868896484,
"step": 1530
},
{
"epoch": 0.9852847088931542,
"grad_norm": 40.57564149859545,
"learning_rate": 8.179231863442388e-09,
"logits/chosen": -2.741443157196045,
"logits/rejected": -2.7301018238067627,
"logps/chosen": -129.3629608154297,
"logps/rejected": -118.81050109863281,
"loss": 0.5773,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.336868405342102,
"rewards/margins": 0.9008957743644714,
"rewards/rejected": -2.237764358520508,
"step": 1540
},
{
"epoch": 0.9916826615483045,
"grad_norm": 52.957380548005816,
"learning_rate": 4.623044096728307e-09,
"logits/chosen": -2.7427573204040527,
"logits/rejected": -2.7327628135681152,
"logps/chosen": -130.18255615234375,
"logps/rejected": -125.54536437988281,
"loss": 0.6057,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.218837857246399,
"rewards/margins": 0.9455093145370483,
"rewards/rejected": -2.1643471717834473,
"step": 1550
},
{
"epoch": 0.9980806142034548,
"grad_norm": 45.45734607577736,
"learning_rate": 1.0668563300142248e-09,
"logits/chosen": -2.7248375415802,
"logits/rejected": -2.705608367919922,
"logps/chosen": -140.51812744140625,
"logps/rejected": -118.7292251586914,
"loss": 0.6287,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.6361656188964844,
"rewards/margins": 0.8488451838493347,
"rewards/rejected": -2.4850106239318848,
"step": 1560
},
{
"epoch": 1.0,
"step": 1563,
"total_flos": 0.0,
"train_loss": 0.424483765719872,
"train_runtime": 7188.1877,
"train_samples_per_second": 6.956,
"train_steps_per_second": 0.217
}
],
"logging_steps": 10,
"max_steps": 1563,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}