{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9997382884061764, "eval_steps": 100, "global_step": 955, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0010468463752944255, "grad_norm": 4.4375, "learning_rate": 5.208333333333333e-08, "logits/chosen": -0.3494967222213745, "logits/rejected": -0.3728627860546112, "logps/chosen": -285.8127136230469, "logps/ref_response": -0.3494967222213745, "logps/rejected": -212.7957000732422, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.010468463752944255, "grad_norm": 4.5, "learning_rate": 5.208333333333334e-07, "logits/chosen": -0.540075421333313, "logits/rejected": -0.54986971616745, "logps/chosen": -315.31512451171875, "logps/ref_response": -0.5399107336997986, "logps/rejected": -278.0267639160156, "loss": 0.6929, "rewards/accuracies": 0.4583333432674408, "rewards/chosen": 0.001649973331950605, "rewards/margins": 0.0034635968040674925, "rewards/rejected": -0.0018136235885322094, "step": 10 }, { "epoch": 0.02093692750588851, "grad_norm": 4.15625, "learning_rate": 1.0416666666666667e-06, "logits/chosen": -0.5037816762924194, "logits/rejected": -0.5245965719223022, "logps/chosen": -306.7390441894531, "logps/ref_response": -0.5032420754432678, "logps/rejected": -271.2138671875, "loss": 0.6934, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 0.003458543913438916, "rewards/margins": 0.0031067535746842623, "rewards/rejected": 0.0003517906297929585, "step": 20 }, { "epoch": 0.031405391258832765, "grad_norm": 4.75, "learning_rate": 1.5625e-06, "logits/chosen": -0.5102043151855469, "logits/rejected": -0.5178056955337524, "logps/chosen": -291.02197265625, "logps/ref_response": -0.5080639123916626, "logps/rejected": -252.41531372070312, "loss": 0.6867, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 0.014292215928435326, "rewards/margins": 0.014373516663908958, "rewards/rejected": -8.130413334583864e-05, "step": 30 }, { "epoch": 0.04187385501177702, "grad_norm": 3.921875, "learning_rate": 2.0833333333333334e-06, "logits/chosen": -0.48268669843673706, "logits/rejected": -0.5177565813064575, "logps/chosen": -305.90875244140625, "logps/ref_response": -0.47757530212402344, "logps/rejected": -244.60757446289062, "loss": 0.6781, "rewards/accuracies": 0.65625, "rewards/chosen": 0.039179086685180664, "rewards/margins": 0.04343840479850769, "rewards/rejected": -0.004259312059730291, "step": 40 }, { "epoch": 0.05234231876472128, "grad_norm": 3.0625, "learning_rate": 2.604166666666667e-06, "logits/chosen": -0.5464528799057007, "logits/rejected": -0.5745548605918884, "logps/chosen": -304.85235595703125, "logps/ref_response": -0.5367640256881714, "logps/rejected": -282.80804443359375, "loss": 0.6733, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.07245220243930817, "rewards/margins": 0.046217553317546844, "rewards/rejected": 0.026234647259116173, "step": 50 }, { "epoch": 0.06281078251766553, "grad_norm": 3.6875, "learning_rate": 3.125e-06, "logits/chosen": -0.5682042837142944, "logits/rejected": -0.5693326592445374, "logps/chosen": -290.4607849121094, "logps/ref_response": -0.5527787804603577, "logps/rejected": -254.50967407226562, "loss": 0.6554, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.12507006525993347, "rewards/margins": 0.05596587061882019, "rewards/rejected": 0.06910420954227448, "step": 60 }, { "epoch": 0.07327924627060979, "grad_norm": 3.625, "learning_rate": 3.6458333333333333e-06, "logits/chosen": -0.5585962533950806, "logits/rejected": -0.5734174847602844, "logps/chosen": -286.166748046875, "logps/ref_response": -0.5369429588317871, "logps/rejected": -263.13885498046875, "loss": 0.6366, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.21329982578754425, "rewards/margins": 0.146693617105484, "rewards/rejected": 0.06660620868206024, "step": 70 }, { "epoch": 0.08374771002355404, "grad_norm": 3.75, "learning_rate": 4.166666666666667e-06, "logits/chosen": -0.4981383681297302, "logits/rejected": -0.5249155759811401, "logps/chosen": -287.4258728027344, "logps/ref_response": -0.46965378522872925, "logps/rejected": -273.86474609375, "loss": 0.617, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.31552475690841675, "rewards/margins": 0.25278010964393616, "rewards/rejected": 0.06274466216564178, "step": 80 }, { "epoch": 0.0942161737764983, "grad_norm": 3.734375, "learning_rate": 4.6875000000000004e-06, "logits/chosen": -0.5283939838409424, "logits/rejected": -0.5496431589126587, "logps/chosen": -330.2322692871094, "logps/ref_response": -0.4922845959663391, "logps/rejected": -295.63018798828125, "loss": 0.5998, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 0.2877245545387268, "rewards/margins": 0.3177236020565033, "rewards/rejected": -0.02999904192984104, "step": 90 }, { "epoch": 0.10468463752944256, "grad_norm": 3.8125, "learning_rate": 4.9997324926814375e-06, "logits/chosen": -0.5698152184486389, "logits/rejected": -0.5635516047477722, "logps/chosen": -275.736328125, "logps/ref_response": -0.533843994140625, "logps/rejected": -290.2398376464844, "loss": 0.6142, "rewards/accuracies": 0.6875, "rewards/chosen": 0.30928176641464233, "rewards/margins": 0.3119629919528961, "rewards/rejected": -0.0026811982970684767, "step": 100 }, { "epoch": 0.10468463752944256, "eval_logits/chosen": -0.5543237924575806, "eval_logits/rejected": -0.548694908618927, "eval_logps/chosen": -290.523193359375, "eval_logps/ref_response": -0.5363935232162476, "eval_logps/rejected": -277.9860534667969, "eval_loss": 0.5973454713821411, "eval_rewards/accuracies": 0.7020000219345093, "eval_rewards/chosen": 0.2023972123861313, "eval_rewards/margins": 0.33334478735923767, "eval_rewards/rejected": -0.13094758987426758, "eval_runtime": 351.8267, "eval_samples_per_second": 5.685, "eval_steps_per_second": 0.355, "step": 100 }, { "epoch": 0.11515310128238682, "grad_norm": 2.96875, "learning_rate": 4.996723692767927e-06, "logits/chosen": -0.6081199645996094, "logits/rejected": -0.6322951912879944, "logps/chosen": -289.56561279296875, "logps/ref_response": -0.5667906999588013, "logps/rejected": -277.76922607421875, "loss": 0.5906, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.20684650540351868, "rewards/margins": 0.4277339577674866, "rewards/rejected": -0.22088749706745148, "step": 110 }, { "epoch": 0.12562156503533106, "grad_norm": 3.34375, "learning_rate": 4.9903757462135984e-06, "logits/chosen": -0.5567010641098022, "logits/rejected": -0.5700705051422119, "logps/chosen": -262.211181640625, "logps/ref_response": -0.5169209837913513, "logps/rejected": -253.9445343017578, "loss": 0.5753, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.034964192658662796, "rewards/margins": 0.36375877261161804, "rewards/rejected": -0.32879456877708435, "step": 120 }, { "epoch": 0.1360900287882753, "grad_norm": 3.5, "learning_rate": 4.980697142834315e-06, "logits/chosen": -0.5261090397834778, "logits/rejected": -0.5425637364387512, "logps/chosen": -302.4659118652344, "logps/ref_response": -0.4790240228176117, "logps/rejected": -338.93597412109375, "loss": 0.5798, "rewards/accuracies": 0.6875, "rewards/chosen": 0.04841077700257301, "rewards/margins": 0.3843201696872711, "rewards/rejected": -0.335909366607666, "step": 130 }, { "epoch": 0.14655849254121958, "grad_norm": 3.015625, "learning_rate": 4.967700826904229e-06, "logits/chosen": -0.5993494391441345, "logits/rejected": -0.607743501663208, "logps/chosen": -283.1224670410156, "logps/ref_response": -0.5482783913612366, "logps/rejected": -276.7977294921875, "loss": 0.5616, "rewards/accuracies": 0.6875, "rewards/chosen": -0.0036001927219331264, "rewards/margins": 0.5189536213874817, "rewards/rejected": -0.5225538015365601, "step": 140 }, { "epoch": 0.15702695629416383, "grad_norm": 3.34375, "learning_rate": 4.951404179843963e-06, "logits/chosen": -0.6007939577102661, "logits/rejected": -0.5676769018173218, "logps/chosen": -308.45916748046875, "logps/ref_response": -0.5423828363418579, "logps/rejected": -280.7994689941406, "loss": 0.5581, "rewards/accuracies": 0.75, "rewards/chosen": 0.18967030942440033, "rewards/margins": 0.5712782740592957, "rewards/rejected": -0.38160794973373413, "step": 150 }, { "epoch": 0.16749542004710807, "grad_norm": 3.140625, "learning_rate": 4.931828996974498e-06, "logits/chosen": -0.5421828031539917, "logits/rejected": -0.5333597660064697, "logps/chosen": -297.0065612792969, "logps/ref_response": -0.4895528256893158, "logps/rejected": -272.0807189941406, "loss": 0.5427, "rewards/accuracies": 0.768750011920929, "rewards/chosen": 0.20104511082172394, "rewards/margins": 0.61830735206604, "rewards/rejected": -0.4172622263431549, "step": 160 }, { "epoch": 0.17796388380005235, "grad_norm": 3.46875, "learning_rate": 4.909001458367867e-06, "logits/chosen": -0.6238254308700562, "logits/rejected": -0.6083575487136841, "logps/chosen": -288.6294860839844, "logps/ref_response": -0.5753272771835327, "logps/rejected": -277.1576232910156, "loss": 0.5646, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.02497274801135063, "rewards/margins": 0.5278475880622864, "rewards/rejected": -0.5528203248977661, "step": 170 }, { "epoch": 0.1884323475529966, "grad_norm": 3.46875, "learning_rate": 4.882952093833628e-06, "logits/chosen": -0.6268518567085266, "logits/rejected": -0.5990904569625854, "logps/chosen": -303.6324157714844, "logps/ref_response": -0.5761692523956299, "logps/rejected": -267.112060546875, "loss": 0.5521, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.13170073926448822, "rewards/margins": 0.5248185396194458, "rewards/rejected": -0.6565192341804504, "step": 180 }, { "epoch": 0.19890081130594087, "grad_norm": 4.0, "learning_rate": 4.853715742087947e-06, "logits/chosen": -0.5630252957344055, "logits/rejected": -0.5427506566047668, "logps/chosen": -276.95648193359375, "logps/ref_response": -0.5028859972953796, "logps/rejected": -284.7541809082031, "loss": 0.5535, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.004456658847630024, "rewards/margins": 0.5479073524475098, "rewards/rejected": -0.5434507131576538, "step": 190 }, { "epoch": 0.2093692750588851, "grad_norm": 5.0625, "learning_rate": 4.821331504159906e-06, "logits/chosen": -0.5674183368682861, "logits/rejected": -0.5801523327827454, "logps/chosen": -298.1530456542969, "logps/ref_response": -0.5163358449935913, "logps/rejected": -257.4419250488281, "loss": 0.5579, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.06740345805883408, "rewards/margins": 0.5899164080619812, "rewards/rejected": -0.6573198437690735, "step": 200 }, { "epoch": 0.2093692750588851, "eval_logits/chosen": -0.5041880011558533, "eval_logits/rejected": -0.48473650217056274, "eval_logps/chosen": -293.2984924316406, "eval_logps/ref_response": -0.5363935232162476, "eval_logps/rejected": -283.7411193847656, "eval_loss": 0.5482621788978577, "eval_rewards/accuracies": 0.7120000123977661, "eval_rewards/chosen": -0.07513303309679031, "eval_rewards/margins": 0.6313197016716003, "eval_rewards/rejected": -0.7064527869224548, "eval_runtime": 349.57, "eval_samples_per_second": 5.721, "eval_steps_per_second": 0.358, "step": 200 }, { "epoch": 0.21983773881182936, "grad_norm": 4.84375, "learning_rate": 4.7858426910973435e-06, "logits/chosen": -0.6099163889884949, "logits/rejected": -0.6096245050430298, "logps/chosen": -280.4328918457031, "logps/ref_response": -0.5563252568244934, "logps/rejected": -274.54486083984375, "loss": 0.5453, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.019828548654913902, "rewards/margins": 0.5767567157745361, "rewards/rejected": -0.596585214138031, "step": 210 }, { "epoch": 0.23030620256477363, "grad_norm": 3.578125, "learning_rate": 4.747296766042161e-06, "logits/chosen": -0.5816672444343567, "logits/rejected": -0.5607967376708984, "logps/chosen": -319.9589538574219, "logps/ref_response": -0.525614857673645, "logps/rejected": -272.82952880859375, "loss": 0.5401, "rewards/accuracies": 0.71875, "rewards/chosen": 0.03950881212949753, "rewards/margins": 0.63139808177948, "rewards/rejected": -0.5918892621994019, "step": 220 }, { "epoch": 0.24077466631771788, "grad_norm": 3.921875, "learning_rate": 4.705745280752586e-06, "logits/chosen": -0.6150011420249939, "logits/rejected": -0.572602391242981, "logps/chosen": -293.0460205078125, "logps/ref_response": -0.5675605535507202, "logps/rejected": -290.7093200683594, "loss": 0.5514, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.07407680153846741, "rewards/margins": 0.6289999485015869, "rewards/rejected": -0.7030767202377319, "step": 230 }, { "epoch": 0.2512431300706621, "grad_norm": 3.015625, "learning_rate": 4.661243806657256e-06, "logits/chosen": -0.5870501399040222, "logits/rejected": -0.546720564365387, "logps/chosen": -300.5980529785156, "logps/ref_response": -0.5330287218093872, "logps/rejected": -264.90716552734375, "loss": 0.5493, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.01282467134296894, "rewards/margins": 0.5539323091506958, "rewards/rejected": -0.5667570233345032, "step": 240 }, { "epoch": 0.26171159382360637, "grad_norm": 3.140625, "learning_rate": 4.613851860533367e-06, "logits/chosen": -0.5954620242118835, "logits/rejected": -0.5543604493141174, "logps/chosen": -293.9695739746094, "logps/ref_response": -0.5492520928382874, "logps/rejected": -260.6333312988281, "loss": 0.5657, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.10280059278011322, "rewards/margins": 0.5389178395271301, "rewards/rejected": -0.43611717224121094, "step": 250 }, { "epoch": 0.2721800575765506, "grad_norm": 3.765625, "learning_rate": 4.563632824908252e-06, "logits/chosen": -0.564812183380127, "logits/rejected": -0.5271375179290771, "logps/chosen": -293.28594970703125, "logps/ref_response": -0.5089389085769653, "logps/rejected": -280.13018798828125, "loss": 0.5289, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.3052422106266022, "rewards/margins": 0.8550776243209839, "rewards/rejected": -0.5498353838920593, "step": 260 }, { "epoch": 0.2826485213294949, "grad_norm": 4.25, "learning_rate": 4.510653863290871e-06, "logits/chosen": -0.558210015296936, "logits/rejected": -0.5351649522781372, "logps/chosen": -296.7794494628906, "logps/ref_response": -0.5091123580932617, "logps/rejected": -305.01654052734375, "loss": 0.5405, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.008460876531898975, "rewards/margins": 0.6207507848739624, "rewards/rejected": -0.6122900247573853, "step": 270 }, { "epoch": 0.29311698508243916, "grad_norm": 3.203125, "learning_rate": 4.454985830346574e-06, "logits/chosen": -0.6231056451797485, "logits/rejected": -0.5877543687820435, "logps/chosen": -302.84906005859375, "logps/ref_response": -0.5748014450073242, "logps/rejected": -286.6356201171875, "loss": 0.5587, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.0532672181725502, "rewards/margins": 0.5458566546440125, "rewards/rejected": -0.5991239547729492, "step": 280 }, { "epoch": 0.3035854488353834, "grad_norm": 3.34375, "learning_rate": 4.396703177135262e-06, "logits/chosen": -0.582170844078064, "logits/rejected": -0.5509222149848938, "logps/chosen": -287.865478515625, "logps/ref_response": -0.5320878624916077, "logps/rejected": -259.5517883300781, "loss": 0.5264, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.1213446855545044, "rewards/margins": 0.7001182436943054, "rewards/rejected": -0.5787736177444458, "step": 290 }, { "epoch": 0.31405391258832765, "grad_norm": 4.125, "learning_rate": 4.335883851539693e-06, "logits/chosen": -0.6001819372177124, "logits/rejected": -0.5625559091567993, "logps/chosen": -297.6165771484375, "logps/ref_response": -0.5529105067253113, "logps/rejected": -294.80816650390625, "loss": 0.5402, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.15017859637737274, "rewards/margins": 0.683876097202301, "rewards/rejected": -0.8340547680854797, "step": 300 }, { "epoch": 0.31405391258832765, "eval_logits/chosen": -0.4637250602245331, "eval_logits/rejected": -0.4386967718601227, "eval_logps/chosen": -293.8652648925781, "eval_logps/ref_response": -0.5363935232162476, "eval_logps/rejected": -285.25445556640625, "eval_loss": 0.5354303121566772, "eval_rewards/accuracies": 0.7260000109672546, "eval_rewards/chosen": -0.1318078190088272, "eval_rewards/margins": 0.725982666015625, "eval_rewards/rejected": -0.857790470123291, "eval_runtime": 349.4289, "eval_samples_per_second": 5.724, "eval_steps_per_second": 0.358, "step": 300 }, { "epoch": 0.3245223763412719, "grad_norm": 4.40625, "learning_rate": 4.2726091940171055e-06, "logits/chosen": -0.549019992351532, "logits/rejected": -0.5806938409805298, "logps/chosen": -296.0178527832031, "logps/ref_response": -0.5006662607192993, "logps/rejected": -342.6523742675781, "loss": 0.5123, "rewards/accuracies": 0.78125, "rewards/chosen": 0.08306514471769333, "rewards/margins": 0.8439720869064331, "rewards/rejected": -0.760906994342804, "step": 310 }, { "epoch": 0.33499084009421615, "grad_norm": 2.953125, "learning_rate": 4.206963828813555e-06, "logits/chosen": -0.6003859043121338, "logits/rejected": -0.5595699548721313, "logps/chosen": -297.0275573730469, "logps/ref_response": -0.5563712120056152, "logps/rejected": -280.803466796875, "loss": 0.5233, "rewards/accuracies": 0.71875, "rewards/chosen": -0.02343413233757019, "rewards/margins": 0.791476845741272, "rewards/rejected": -0.8149110078811646, "step": 320 }, { "epoch": 0.34545930384716045, "grad_norm": 3.03125, "learning_rate": 4.139035550786495e-06, "logits/chosen": -0.6349459886550903, "logits/rejected": -0.5698983073234558, "logps/chosen": -290.1078186035156, "logps/ref_response": -0.5800708532333374, "logps/rejected": -261.7127685546875, "loss": 0.5278, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.08804565668106079, "rewards/margins": 0.6575459837913513, "rewards/rejected": -0.7455916404724121, "step": 330 }, { "epoch": 0.3559277676001047, "grad_norm": 3.953125, "learning_rate": 4.068915207986931e-06, "logits/chosen": -0.5905895233154297, "logits/rejected": -0.5241268277168274, "logps/chosen": -298.50506591796875, "logps/ref_response": -0.5407181978225708, "logps/rejected": -259.384765625, "loss": 0.5256, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.2834884226322174, "rewards/margins": 0.8159183263778687, "rewards/rejected": -1.0994068384170532, "step": 340 }, { "epoch": 0.36639623135304894, "grad_norm": 4.09375, "learning_rate": 3.996696580158211e-06, "logits/chosen": -0.5350117683410645, "logits/rejected": -0.509468674659729, "logps/chosen": -337.4930114746094, "logps/ref_response": -0.486247718334198, "logps/rejected": -292.4802551269531, "loss": 0.5304, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.055437762290239334, "rewards/margins": 0.7416700124740601, "rewards/rejected": -0.7971076965332031, "step": 350 }, { "epoch": 0.3768646951059932, "grad_norm": 3.140625, "learning_rate": 3.922476253313921e-06, "logits/chosen": -0.5137478113174438, "logits/rejected": -0.5199310183525085, "logps/chosen": -275.74993896484375, "logps/ref_response": -0.48780474066734314, "logps/rejected": -299.1233215332031, "loss": 0.5377, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.2543216347694397, "rewards/margins": 0.7176491618156433, "rewards/rejected": -0.9719708561897278, "step": 360 }, { "epoch": 0.38733315885893743, "grad_norm": 4.25, "learning_rate": 3.846353490562664e-06, "logits/chosen": -0.5372802019119263, "logits/rejected": -0.539850115776062, "logps/chosen": -290.45709228515625, "logps/ref_response": -0.491685152053833, "logps/rejected": -265.22857666015625, "loss": 0.5065, "rewards/accuracies": 0.75, "rewards/chosen": -0.026863550767302513, "rewards/margins": 0.8847667574882507, "rewards/rejected": -0.9116303324699402, "step": 370 }, { "epoch": 0.39780162261188173, "grad_norm": 3.609375, "learning_rate": 3.768430099352445e-06, "logits/chosen": -0.5611374378204346, "logits/rejected": -0.5598313808441162, "logps/chosen": -306.60662841796875, "logps/ref_response": -0.5215914845466614, "logps/rejected": -280.02392578125, "loss": 0.5133, "rewards/accuracies": 0.768750011920929, "rewards/chosen": 0.001783865736797452, "rewards/margins": 1.001586675643921, "rewards/rejected": -0.9998028874397278, "step": 380 }, { "epoch": 0.408270086364826, "grad_norm": 4.625, "learning_rate": 3.6888102953122307e-06, "logits/chosen": -0.6096396446228027, "logits/rejected": -0.5705077052116394, "logps/chosen": -263.71826171875, "logps/ref_response": -0.5661150813102722, "logps/rejected": -265.1150817871094, "loss": 0.5434, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.06994754076004028, "rewards/margins": 0.7673075795173645, "rewards/rejected": -0.8372551202774048, "step": 390 }, { "epoch": 0.4187385501177702, "grad_norm": 3.875, "learning_rate": 3.607600562872785e-06, "logits/chosen": -0.5598580241203308, "logits/rejected": -0.5253019332885742, "logps/chosen": -286.2227478027344, "logps/ref_response": -0.5258094072341919, "logps/rejected": -277.05267333984375, "loss": 0.5112, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.047982849180698395, "rewards/margins": 0.7805670499801636, "rewards/rejected": -0.8285499811172485, "step": 400 }, { "epoch": 0.4187385501177702, "eval_logits/chosen": -0.4029563367366791, "eval_logits/rejected": -0.37146955728530884, "eval_logps/chosen": -294.2449645996094, "eval_logps/ref_response": -0.5363935232162476, "eval_logps/rejected": -286.346923828125, "eval_loss": 0.5277438759803772, "eval_rewards/accuracies": 0.722000002861023, "eval_rewards/chosen": -0.16978110373020172, "eval_rewards/margins": 0.7972525954246521, "eval_rewards/rejected": -0.9670337438583374, "eval_runtime": 349.6206, "eval_samples_per_second": 5.72, "eval_steps_per_second": 0.358, "step": 400 }, { "epoch": 0.42920701387071447, "grad_norm": 3.703125, "learning_rate": 3.5249095128531863e-06, "logits/chosen": -0.5799764394760132, "logits/rejected": -0.5290526151657104, "logps/chosen": -279.68115234375, "logps/ref_response": -0.5564926862716675, "logps/rejected": -277.60308837890625, "loss": 0.5124, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.1508873999118805, "rewards/margins": 0.8393732905387878, "rewards/rejected": -0.9902607798576355, "step": 410 }, { "epoch": 0.4396754776236587, "grad_norm": 4.34375, "learning_rate": 3.4408477372034743e-06, "logits/chosen": -0.5675779581069946, "logits/rejected": -0.5418698191642761, "logps/chosen": -310.0582275390625, "logps/ref_response": -0.5361344218254089, "logps/rejected": -298.15447998046875, "loss": 0.5504, "rewards/accuracies": 0.75, "rewards/chosen": -0.11406157165765762, "rewards/margins": 0.6680425405502319, "rewards/rejected": -0.7821041345596313, "step": 420 }, { "epoch": 0.45014394137660296, "grad_norm": 4.1875, "learning_rate": 3.355527661097728e-06, "logits/chosen": -0.569171130657196, "logits/rejected": -0.5680087804794312, "logps/chosen": -281.50506591796875, "logps/ref_response": -0.5477866530418396, "logps/rejected": -282.78619384765625, "loss": 0.5291, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.24793243408203125, "rewards/margins": 0.6574320197105408, "rewards/rejected": -0.9053643941879272, "step": 430 }, { "epoch": 0.46061240512954726, "grad_norm": 3.859375, "learning_rate": 3.269063392575352e-06, "logits/chosen": -0.5341087579727173, "logits/rejected": -0.5249664187431335, "logps/chosen": -328.975341796875, "logps/ref_response": -0.5050511360168457, "logps/rejected": -307.42388916015625, "loss": 0.5158, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.021743040531873703, "rewards/margins": 0.7577444314956665, "rewards/rejected": -0.7794874310493469, "step": 440 }, { "epoch": 0.4710808688824915, "grad_norm": 3.828125, "learning_rate": 3.181570569931697e-06, "logits/chosen": -0.5577148199081421, "logits/rejected": -0.5402424931526184, "logps/chosen": -287.3887939453125, "logps/ref_response": -0.5224987864494324, "logps/rejected": -283.7501525878906, "loss": 0.5068, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.2252788096666336, "rewards/margins": 0.7505895495414734, "rewards/rejected": -0.9758683443069458, "step": 450 }, { "epoch": 0.48154933263543576, "grad_norm": 2.765625, "learning_rate": 3.09316620706208e-06, "logits/chosen": -0.5113469362258911, "logits/rejected": -0.5197252631187439, "logps/chosen": -308.82244873046875, "logps/ref_response": -0.4874509274959564, "logps/rejected": -290.3113708496094, "loss": 0.4968, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.09814532101154327, "rewards/margins": 0.974043071269989, "rewards/rejected": -1.072188377380371, "step": 460 }, { "epoch": 0.49201779638838, "grad_norm": 3.515625, "learning_rate": 3.0039685369660785e-06, "logits/chosen": -0.513633131980896, "logits/rejected": -0.46772414445877075, "logps/chosen": -282.93438720703125, "logps/ref_response": -0.4861488938331604, "logps/rejected": -267.72796630859375, "loss": 0.5329, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.05372269079089165, "rewards/margins": 0.8479040861129761, "rewards/rejected": -0.7941814661026001, "step": 470 }, { "epoch": 0.5024862601413242, "grad_norm": 3.75, "learning_rate": 2.91409685362137e-06, "logits/chosen": -0.5242967009544373, "logits/rejected": -0.5119736790657043, "logps/chosen": -280.0267028808594, "logps/ref_response": -0.5061747431755066, "logps/rejected": -277.7090759277344, "loss": 0.5085, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.04931460693478584, "rewards/margins": 0.8077665567398071, "rewards/rejected": -0.8570810556411743, "step": 480 }, { "epoch": 0.5129547238942685, "grad_norm": 2.8125, "learning_rate": 2.8236713524386085e-06, "logits/chosen": -0.587617039680481, "logits/rejected": -0.5464242100715637, "logps/chosen": -280.3719482421875, "logps/ref_response": -0.5583964586257935, "logps/rejected": -258.3996276855469, "loss": 0.5056, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.056199681013822556, "rewards/margins": 0.8544878959655762, "rewards/rejected": -0.7982882261276245, "step": 490 }, { "epoch": 0.5234231876472127, "grad_norm": 3.546875, "learning_rate": 2.7328129695107205e-06, "logits/chosen": -0.4950012266635895, "logits/rejected": -0.49973049759864807, "logps/chosen": -266.21197509765625, "logps/ref_response": -0.46682921051979065, "logps/rejected": -276.02667236328125, "loss": 0.5319, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.011867323890328407, "rewards/margins": 1.0351099967956543, "rewards/rejected": -1.023242712020874, "step": 500 }, { "epoch": 0.5234231876472127, "eval_logits/chosen": -0.3727329671382904, "eval_logits/rejected": -0.3377152681350708, "eval_logps/chosen": -294.09320068359375, "eval_logps/ref_response": -0.5363935232162476, "eval_logps/rejected": -286.45953369140625, "eval_loss": 0.521223783493042, "eval_rewards/accuracies": 0.7260000109672546, "eval_rewards/chosen": -0.1546054631471634, "eval_rewards/margins": 0.8236899375915527, "eval_rewards/rejected": -0.9782953858375549, "eval_runtime": 349.5098, "eval_samples_per_second": 5.722, "eval_steps_per_second": 0.358, "step": 500 }, { "epoch": 0.533891651400157, "grad_norm": 3.046875, "learning_rate": 2.641643219871597e-06, "logits/chosen": -0.5270382165908813, "logits/rejected": -0.484760582447052, "logps/chosen": -314.9234619140625, "logps/ref_response": -0.5090646743774414, "logps/rejected": -299.1803894042969, "loss": 0.5149, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.1974947154521942, "rewards/margins": 0.7501281499862671, "rewards/rejected": -0.9476228952407837, "step": 510 }, { "epoch": 0.5443601151531012, "grad_norm": 4.53125, "learning_rate": 2.5502840349805074e-06, "logits/chosen": -0.5182399749755859, "logits/rejected": -0.5034407377243042, "logps/chosen": -310.6617126464844, "logps/ref_response": -0.5057616829872131, "logps/rejected": -298.3763427734375, "loss": 0.538, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.06424389034509659, "rewards/margins": 0.8863626718521118, "rewards/rejected": -0.9506064653396606, "step": 520 }, { "epoch": 0.5548285789060455, "grad_norm": 3.6875, "learning_rate": 2.4588575996495797e-06, "logits/chosen": -0.47386473417282104, "logits/rejected": -0.47526755928993225, "logps/chosen": -272.9559631347656, "logps/ref_response": -0.45075368881225586, "logps/rejected": -263.4909973144531, "loss": 0.5251, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.2501363754272461, "rewards/margins": 0.891460120677948, "rewards/rejected": -1.1415965557098389, "step": 530 }, { "epoch": 0.5652970426589898, "grad_norm": 4.15625, "learning_rate": 2.367486188632446e-06, "logits/chosen": -0.5168323516845703, "logits/rejected": -0.5130370855331421, "logps/chosen": -286.60076904296875, "logps/ref_response": -0.5035119652748108, "logps/rejected": -326.2984619140625, "loss": 0.5101, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.09159872680902481, "rewards/margins": 0.8820840120315552, "rewards/rejected": -0.9736827611923218, "step": 540 }, { "epoch": 0.575765506411934, "grad_norm": 3.28125, "learning_rate": 2.276292003092593e-06, "logits/chosen": -0.538984477519989, "logits/rejected": -0.5137041807174683, "logps/chosen": -258.5223083496094, "logps/ref_response": -0.5067554712295532, "logps/rejected": -266.6609802246094, "loss": 0.4938, "rewards/accuracies": 0.78125, "rewards/chosen": -0.0642676055431366, "rewards/margins": 0.9660905003547668, "rewards/rejected": -1.030358076095581, "step": 550 }, { "epoch": 0.5862339701648783, "grad_norm": 3.234375, "learning_rate": 2.1853970071701415e-06, "logits/chosen": -0.5292374491691589, "logits/rejected": -0.49377211928367615, "logps/chosen": -279.6850891113281, "logps/ref_response": -0.5059608817100525, "logps/rejected": -281.2428283691406, "loss": 0.5137, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.03345666453242302, "rewards/margins": 0.8658466339111328, "rewards/rejected": -0.8993034362792969, "step": 560 }, { "epoch": 0.5967024339178225, "grad_norm": 4.0, "learning_rate": 2.0949227648656194e-06, "logits/chosen": -0.5543760657310486, "logits/rejected": -0.5229381918907166, "logps/chosen": -296.0806579589844, "logps/ref_response": -0.5283219218254089, "logps/rejected": -263.48150634765625, "loss": 0.5265, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.09281423687934875, "rewards/margins": 0.9194073677062988, "rewards/rejected": -1.0122215747833252, "step": 570 }, { "epoch": 0.6071708976707668, "grad_norm": 3.5625, "learning_rate": 2.00499027745888e-06, "logits/chosen": -0.5253512263298035, "logits/rejected": -0.5014483332633972, "logps/chosen": -300.3424377441406, "logps/ref_response": -0.5130476355552673, "logps/rejected": -299.5146789550781, "loss": 0.5382, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.05144411325454712, "rewards/margins": 0.8135349154472351, "rewards/rejected": -0.8649789690971375, "step": 580 }, { "epoch": 0.6176393614237111, "grad_norm": 3.46875, "learning_rate": 1.915719821680624e-06, "logits/chosen": -0.5522564053535461, "logits/rejected": -0.4926326870918274, "logps/chosen": -288.0102844238281, "logps/ref_response": -0.5210384130477905, "logps/rejected": -284.1330261230469, "loss": 0.5143, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.19294333457946777, "rewards/margins": 0.9110462069511414, "rewards/rejected": -0.7181028127670288, "step": 590 }, { "epoch": 0.6281078251766553, "grad_norm": 3.5, "learning_rate": 1.8272307888529276e-06, "logits/chosen": -0.4761735796928406, "logits/rejected": -0.4301334321498871, "logps/chosen": -264.5964660644531, "logps/ref_response": -0.4653666913509369, "logps/rejected": -281.8071594238281, "loss": 0.5155, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.022416314110159874, "rewards/margins": 0.9364219903945923, "rewards/rejected": -0.9588383436203003, "step": 600 }, { "epoch": 0.6281078251766553, "eval_logits/chosen": -0.36081573367118835, "eval_logits/rejected": -0.32467401027679443, "eval_logps/chosen": -293.3980407714844, "eval_logps/ref_response": -0.5363935232162476, "eval_logps/rejected": -285.9612121582031, "eval_loss": 0.5195037722587585, "eval_rewards/accuracies": 0.7360000014305115, "eval_rewards/chosen": -0.0850897878408432, "eval_rewards/margins": 0.8433744311332703, "eval_rewards/rejected": -0.9284642338752747, "eval_runtime": 349.4276, "eval_samples_per_second": 5.724, "eval_steps_per_second": 0.358, "step": 600 }, { "epoch": 0.6385762889295996, "grad_norm": 3.0, "learning_rate": 1.739641525213929e-06, "logits/chosen": -0.5045549869537354, "logits/rejected": -0.4906612038612366, "logps/chosen": -267.28338623046875, "logps/ref_response": -0.500705897808075, "logps/rejected": -273.13714599609375, "loss": 0.4986, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.08845983445644379, "rewards/margins": 0.9310785531997681, "rewards/rejected": -1.0195385217666626, "step": 610 }, { "epoch": 0.6490447526825438, "grad_norm": 3.0625, "learning_rate": 1.6530691736402317e-06, "logits/chosen": -0.5177065134048462, "logits/rejected": -0.4805786609649658, "logps/chosen": -295.038818359375, "logps/ref_response": -0.502620279788971, "logps/rejected": -284.8276672363281, "loss": 0.5092, "rewards/accuracies": 0.8125, "rewards/chosen": -0.21657638251781464, "rewards/margins": 0.9273554086685181, "rewards/rejected": -1.1439317464828491, "step": 620 }, { "epoch": 0.6595132164354881, "grad_norm": 3.625, "learning_rate": 1.5676295169786864e-06, "logits/chosen": -0.5329315662384033, "logits/rejected": -0.4890086054801941, "logps/chosen": -288.74847412109375, "logps/ref_response": -0.522256076335907, "logps/rejected": -274.2325439453125, "loss": 0.5196, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.2333780825138092, "rewards/margins": 0.8650426864624023, "rewards/rejected": -1.0984207391738892, "step": 630 }, { "epoch": 0.6699816801884323, "grad_norm": 3.21875, "learning_rate": 1.4834368231970922e-06, "logits/chosen": -0.5609028935432434, "logits/rejected": -0.5060838460922241, "logps/chosen": -288.317138671875, "logps/ref_response": -0.5478745698928833, "logps/rejected": -274.0264892578125, "loss": 0.5003, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.1753380298614502, "rewards/margins": 0.7557013034820557, "rewards/rejected": -0.9310394525527954, "step": 640 }, { "epoch": 0.6804501439413766, "grad_norm": 3.171875, "learning_rate": 1.4006036925609245e-06, "logits/chosen": -0.5304352045059204, "logits/rejected": -0.48552340269088745, "logps/chosen": -300.5797424316406, "logps/ref_response": -0.5103051662445068, "logps/rejected": -250.87216186523438, "loss": 0.5263, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.19611187279224396, "rewards/margins": 0.8849571347236633, "rewards/rejected": -1.0810692310333252, "step": 650 }, { "epoch": 0.6909186076943209, "grad_norm": 3.453125, "learning_rate": 1.3192409070404582e-06, "logits/chosen": -0.5464252233505249, "logits/rejected": -0.5194587111473083, "logps/chosen": -304.4057312011719, "logps/ref_response": -0.5286127328872681, "logps/rejected": -306.74737548828125, "loss": 0.5106, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 0.0011399202048778534, "rewards/margins": 0.8308441042900085, "rewards/rejected": -0.8297042846679688, "step": 660 }, { "epoch": 0.7013870714472651, "grad_norm": 4.5, "learning_rate": 1.2394572821496953e-06, "logits/chosen": -0.5439696311950684, "logits/rejected": -0.5098680257797241, "logps/chosen": -277.79193115234375, "logps/ref_response": -0.5491371154785156, "logps/rejected": -259.3212890625, "loss": 0.5184, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.07701723277568817, "rewards/margins": 0.8310605883598328, "rewards/rejected": -0.9080777168273926, "step": 670 }, { "epoch": 0.7118555352002094, "grad_norm": 3.109375, "learning_rate": 1.1613595214152713e-06, "logits/chosen": -0.5734778642654419, "logits/rejected": -0.526314377784729, "logps/chosen": -287.5583801269531, "logps/ref_response": -0.5694643259048462, "logps/rejected": -276.66607666015625, "loss": 0.4993, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.16065016388893127, "rewards/margins": 0.8544554710388184, "rewards/rejected": -1.0151057243347168, "step": 680 }, { "epoch": 0.7223239989531536, "grad_norm": 2.578125, "learning_rate": 1.0850520736699362e-06, "logits/chosen": -0.5160936117172241, "logits/rejected": -0.48213791847229004, "logps/chosen": -341.7447814941406, "logps/ref_response": -0.4945286810398102, "logps/rejected": -317.0802001953125, "loss": 0.5268, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.1648830622434616, "rewards/margins": 0.9320821762084961, "rewards/rejected": -1.0969650745391846, "step": 690 }, { "epoch": 0.7327924627060979, "grad_norm": 3.34375, "learning_rate": 1.0106369933615043e-06, "logits/chosen": -0.5604196786880493, "logits/rejected": -0.5098714828491211, "logps/chosen": -316.64105224609375, "logps/ref_response": -0.5506774187088013, "logps/rejected": -264.15411376953125, "loss": 0.5113, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.22138896584510803, "rewards/margins": 0.7301830053329468, "rewards/rejected": -0.9515719413757324, "step": 700 }, { "epoch": 0.7327924627060979, "eval_logits/chosen": -0.34110894799232483, "eval_logits/rejected": -0.30364856123924255, "eval_logps/chosen": -294.4884948730469, "eval_logps/ref_response": -0.5363935232162476, "eval_logps/rejected": -287.16522216796875, "eval_loss": 0.5173361301422119, "eval_rewards/accuracies": 0.734000027179718, "eval_rewards/chosen": -0.19413326680660248, "eval_rewards/margins": 0.8547297120094299, "eval_rewards/rejected": -1.0488630533218384, "eval_runtime": 349.4515, "eval_samples_per_second": 5.723, "eval_steps_per_second": 0.358, "step": 700 }, { "epoch": 0.7432609264590422, "grad_norm": 3.890625, "learning_rate": 9.382138040640714e-07, "logits/chosen": -0.5672627687454224, "logits/rejected": -0.5198173522949219, "logps/chosen": -265.9830627441406, "logps/ref_response": -0.5634459257125854, "logps/rejected": -280.0369873046875, "loss": 0.532, "rewards/accuracies": 0.78125, "rewards/chosen": -0.16655750572681427, "rewards/margins": 0.8667360544204712, "rewards/rejected": -1.0332934856414795, "step": 710 }, { "epoch": 0.7537293902119864, "grad_norm": 3.25, "learning_rate": 8.678793653740633e-07, "logits/chosen": -0.492758572101593, "logits/rejected": -0.4843314290046692, "logps/chosen": -264.08270263671875, "logps/ref_response": -0.49243393540382385, "logps/rejected": -264.4930114746094, "loss": 0.5104, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.08720338344573975, "rewards/margins": 0.9140432476997375, "rewards/rejected": -1.001246690750122, "step": 720 }, { "epoch": 0.7641978539649307, "grad_norm": 2.609375, "learning_rate": 7.997277433690984e-07, "logits/chosen": -0.5135891437530518, "logits/rejected": -0.455097496509552, "logps/chosen": -302.38580322265625, "logps/ref_response": -0.4944031834602356, "logps/rejected": -288.3606872558594, "loss": 0.5071, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.15266835689544678, "rewards/margins": 0.821180522441864, "rewards/rejected": -0.973848819732666, "step": 730 }, { "epoch": 0.7746663177178749, "grad_norm": 2.890625, "learning_rate": 7.338500848029603e-07, "logits/chosen": -0.4669191241264343, "logits/rejected": -0.47497326135635376, "logps/chosen": -292.3653564453125, "logps/ref_response": -0.4282347559928894, "logps/rejected": -276.3725280761719, "loss": 0.4932, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.010840972885489464, "rewards/margins": 0.8605710864067078, "rewards/rejected": -0.8714120984077454, "step": 740 }, { "epoch": 0.7851347814708192, "grad_norm": 3.234375, "learning_rate": 6.70334495204884e-07, "logits/chosen": -0.5066109299659729, "logits/rejected": -0.4780009388923645, "logps/chosen": -324.728515625, "logps/ref_response": -0.49645256996154785, "logps/rejected": -287.18304443359375, "loss": 0.5074, "rewards/accuracies": 0.793749988079071, "rewards/chosen": 0.02283366583287716, "rewards/margins": 0.854231059551239, "rewards/rejected": -0.8313972353935242, "step": 750 }, { "epoch": 0.7956032452237635, "grad_norm": 3.4375, "learning_rate": 6.092659210462232e-07, "logits/chosen": -0.5308446884155273, "logits/rejected": -0.5082138776779175, "logps/chosen": -270.0063781738281, "logps/ref_response": -0.5222411751747131, "logps/rejected": -269.8592224121094, "loss": 0.534, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.09487895667552948, "rewards/margins": 0.7679542303085327, "rewards/rejected": -0.8628333210945129, "step": 760 }, { "epoch": 0.8060717089767077, "grad_norm": 3.109375, "learning_rate": 5.507260361320738e-07, "logits/chosen": -0.5149004459381104, "logits/rejected": -0.5143811702728271, "logps/chosen": -285.86566162109375, "logps/ref_response": -0.50932776927948, "logps/rejected": -280.4559020996094, "loss": 0.5136, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.04052960127592087, "rewards/margins": 0.7234804034233093, "rewards/rejected": -0.6829507946968079, "step": 770 }, { "epoch": 0.816540172729652, "grad_norm": 2.9375, "learning_rate": 4.947931323697983e-07, "logits/chosen": -0.5068045854568481, "logits/rejected": -0.47291022539138794, "logps/chosen": -287.3854675292969, "logps/ref_response": -0.49121037125587463, "logps/rejected": -281.04669189453125, "loss": 0.5179, "rewards/accuracies": 0.6875, "rewards/chosen": -0.09810696542263031, "rewards/margins": 0.7164817452430725, "rewards/rejected": -0.8145886659622192, "step": 780 }, { "epoch": 0.8270086364825961, "grad_norm": 3.40625, "learning_rate": 4.4154201506053985e-07, "logits/chosen": -0.5332745313644409, "logits/rejected": -0.5107340812683105, "logps/chosen": -301.6036071777344, "logps/ref_response": -0.5042006373405457, "logps/rejected": -265.80133056640625, "loss": 0.5114, "rewards/accuracies": 0.78125, "rewards/chosen": -0.08654189109802246, "rewards/margins": 0.8664189577102661, "rewards/rejected": -0.9529608488082886, "step": 790 }, { "epoch": 0.8374771002355405, "grad_norm": 2.828125, "learning_rate": 3.910439028537638e-07, "logits/chosen": -0.5402854681015015, "logits/rejected": -0.48599618673324585, "logps/chosen": -349.26226806640625, "logps/ref_response": -0.5149141550064087, "logps/rejected": -303.9481201171875, "loss": 0.5268, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.020410016179084778, "rewards/margins": 0.7255326509475708, "rewards/rejected": -0.7051225900650024, "step": 800 }, { "epoch": 0.8374771002355405, "eval_logits/chosen": -0.3452620208263397, "eval_logits/rejected": -0.30824077129364014, "eval_logps/chosen": -293.0043640136719, "eval_logps/ref_response": -0.5363935232162476, "eval_logps/rejected": -285.70001220703125, "eval_loss": 0.5176524519920349, "eval_rewards/accuracies": 0.722000002861023, "eval_rewards/chosen": -0.045722391456365585, "eval_rewards/margins": 0.8566184043884277, "eval_rewards/rejected": -0.902340829372406, "eval_runtime": 349.422, "eval_samples_per_second": 5.724, "eval_steps_per_second": 0.358, "step": 800 }, { "epoch": 0.8479455639884846, "grad_norm": 2.78125, "learning_rate": 3.4336633249862084e-07, "logits/chosen": -0.564243733882904, "logits/rejected": -0.48312124609947205, "logps/chosen": -321.1978759765625, "logps/ref_response": -0.5519742369651794, "logps/rejected": -290.4500732421875, "loss": 0.5015, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.08782587200403214, "rewards/margins": 0.8487712740898132, "rewards/rejected": -0.9365970492362976, "step": 810 }, { "epoch": 0.8584140277414289, "grad_norm": 3.203125, "learning_rate": 2.98573068519539e-07, "logits/chosen": -0.5403026342391968, "logits/rejected": -0.5190576910972595, "logps/chosen": -308.6977844238281, "logps/ref_response": -0.5307375192642212, "logps/rejected": -295.208984375, "loss": 0.5196, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.004915142897516489, "rewards/margins": 0.8321554064750671, "rewards/rejected": -0.8370705842971802, "step": 820 }, { "epoch": 0.8688824914943732, "grad_norm": 3.453125, "learning_rate": 2.5672401793681854e-07, "logits/chosen": -0.5584547519683838, "logits/rejected": -0.5359379053115845, "logps/chosen": -276.21697998046875, "logps/ref_response": -0.5466696619987488, "logps/rejected": -271.29046630859375, "loss": 0.5047, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.08783929795026779, "rewards/margins": 0.8796290159225464, "rewards/rejected": -0.9674683809280396, "step": 830 }, { "epoch": 0.8793509552473174, "grad_norm": 3.109375, "learning_rate": 2.178751501463036e-07, "logits/chosen": -0.5241914987564087, "logits/rejected": -0.5036768317222595, "logps/chosen": -316.0185852050781, "logps/ref_response": -0.5086795091629028, "logps/rejected": -309.330810546875, "loss": 0.5019, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.10169823467731476, "rewards/margins": 0.8505386114120483, "rewards/rejected": -0.9522367715835571, "step": 840 }, { "epoch": 0.8898194190002617, "grad_norm": 3.296875, "learning_rate": 1.820784220652766e-07, "logits/chosen": -0.5777777433395386, "logits/rejected": -0.5282770991325378, "logps/chosen": -347.5041809082031, "logps/ref_response": -0.5546728372573853, "logps/rejected": -281.29705810546875, "loss": 0.5108, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.10623917728662491, "rewards/margins": 0.9054906964302063, "rewards/rejected": -0.7992514371871948, "step": 850 }, { "epoch": 0.9002878827532059, "grad_norm": 2.96875, "learning_rate": 1.4938170864468636e-07, "logits/chosen": -0.4989868998527527, "logits/rejected": -0.4683281481266022, "logps/chosen": -291.2259216308594, "logps/ref_response": -0.4814940392971039, "logps/rejected": -270.903564453125, "loss": 0.4862, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.06208313629031181, "rewards/margins": 0.9479352235794067, "rewards/rejected": -1.010018229484558, "step": 860 }, { "epoch": 0.9107563465061502, "grad_norm": 3.828125, "learning_rate": 1.1982873884064466e-07, "logits/chosen": -0.4674592614173889, "logits/rejected": -0.46020442247390747, "logps/chosen": -288.8802795410156, "logps/ref_response": -0.463235467672348, "logps/rejected": -279.07818603515625, "loss": 0.5202, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.09532221406698227, "rewards/margins": 0.7154585719108582, "rewards/rejected": -0.8107808232307434, "step": 870 }, { "epoch": 0.9212248102590945, "grad_norm": 3.640625, "learning_rate": 9.345903713082305e-08, "logits/chosen": -0.550748884677887, "logits/rejected": -0.5384425520896912, "logps/chosen": -316.01953125, "logps/ref_response": -0.5406745672225952, "logps/rejected": -282.6295471191406, "loss": 0.5288, "rewards/accuracies": 0.625, "rewards/chosen": -0.04705243557691574, "rewards/margins": 0.6842992901802063, "rewards/rejected": -0.7313517332077026, "step": 880 }, { "epoch": 0.9316932740120387, "grad_norm": 4.21875, "learning_rate": 7.030787065396866e-08, "logits/chosen": -0.5192651152610779, "logits/rejected": -0.47466206550598145, "logps/chosen": -320.68121337890625, "logps/ref_response": -0.5117658376693726, "logps/rejected": -295.110107421875, "loss": 0.5163, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.031226161867380142, "rewards/margins": 0.7437794208526611, "rewards/rejected": -0.7750056385993958, "step": 890 }, { "epoch": 0.942161737764983, "grad_norm": 3.015625, "learning_rate": 5.0406202043228604e-08, "logits/chosen": -0.5355256795883179, "logits/rejected": -0.5220322012901306, "logps/chosen": -334.91900634765625, "logps/ref_response": -0.5195636749267578, "logps/rejected": -276.53973388671875, "loss": 0.4923, "rewards/accuracies": 0.768750011920929, "rewards/chosen": 0.11498089134693146, "rewards/margins": 1.026011347770691, "rewards/rejected": -0.9110305905342102, "step": 900 }, { "epoch": 0.942161737764983, "eval_logits/chosen": -0.34425824880599976, "eval_logits/rejected": -0.30724215507507324, "eval_logps/chosen": -293.0645446777344, "eval_logps/ref_response": -0.5363935232162476, "eval_logps/rejected": -285.7690734863281, "eval_loss": 0.5175051093101501, "eval_rewards/accuracies": 0.7279999852180481, "eval_rewards/chosen": -0.05173807963728905, "eval_rewards/margins": 0.8575091361999512, "eval_rewards/rejected": -0.9092472791671753, "eval_runtime": 349.4689, "eval_samples_per_second": 5.723, "eval_steps_per_second": 0.358, "step": 900 }, { "epoch": 0.9526302015179272, "grad_norm": 2.953125, "learning_rate": 3.378064801637687e-08, "logits/chosen": -0.5745955109596252, "logits/rejected": -0.5213441848754883, "logps/chosen": -316.2192077636719, "logps/ref_response": -0.561827540397644, "logps/rejected": -315.3379821777344, "loss": 0.5218, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.04833642765879631, "rewards/margins": 0.7893211245536804, "rewards/rejected": -0.7409847378730774, "step": 910 }, { "epoch": 0.9630986652708715, "grad_norm": 2.78125, "learning_rate": 2.0453443778310766e-08, "logits/chosen": -0.5022112131118774, "logits/rejected": -0.45878568291664124, "logps/chosen": -330.7922668457031, "logps/ref_response": -0.4732615351676941, "logps/rejected": -306.924560546875, "loss": 0.5181, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.02221493050456047, "rewards/margins": 0.9629158973693848, "rewards/rejected": -0.9407010078430176, "step": 920 }, { "epoch": 0.9735671290238157, "grad_norm": 2.6875, "learning_rate": 1.0442413283435759e-08, "logits/chosen": -0.5029438734054565, "logits/rejected": -0.45212322473526, "logps/chosen": -319.1156921386719, "logps/ref_response": -0.4792579114437103, "logps/rejected": -280.00390625, "loss": 0.5078, "rewards/accuracies": 0.768750011920929, "rewards/chosen": 0.054530493915081024, "rewards/margins": 1.144339680671692, "rewards/rejected": -1.0898091793060303, "step": 930 }, { "epoch": 0.98403559277676, "grad_norm": 2.375, "learning_rate": 3.760945397705828e-09, "logits/chosen": -0.5288008451461792, "logits/rejected": -0.4695435166358948, "logps/chosen": -292.11932373046875, "logps/ref_response": -0.5234506726264954, "logps/rejected": -262.4600830078125, "loss": 0.4922, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.12431895732879639, "rewards/margins": 1.0587154626846313, "rewards/rejected": -0.9343963861465454, "step": 940 }, { "epoch": 0.9945040565297043, "grad_norm": 2.8125, "learning_rate": 4.1797599220405605e-10, "logits/chosen": -0.5448582172393799, "logits/rejected": -0.521163821220398, "logps/chosen": -296.86529541015625, "logps/ref_response": -0.5367287397384644, "logps/rejected": -280.74432373046875, "loss": 0.5133, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.027158012613654137, "rewards/margins": 0.8814530372619629, "rewards/rejected": -0.8542949557304382, "step": 950 }, { "epoch": 0.9997382884061764, "step": 955, "total_flos": 0.0, "train_loss": 0.5383632463934533, "train_runtime": 19109.9128, "train_samples_per_second": 3.199, "train_steps_per_second": 0.05 } ], "logging_steps": 10, "max_steps": 955, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }