{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 64617, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00046427410743302844, "grad_norm": 126.77037048339844, "learning_rate": 4.999535725892567e-07, "logits/chosen": -18.516775131225586, "logits/rejected": -17.81698989868164, "logps/chosen": -445.4917907714844, "logps/rejected": -415.01007080078125, "loss": 0.6947, "rewards/accuracies": 0.0, "rewards/chosen": -0.005378265865147114, "rewards/margins": -0.0030574037227779627, "rewards/rejected": -0.0023208619095385075, "step": 10 }, { "epoch": 0.0009285482148660569, "grad_norm": 107.12116241455078, "learning_rate": 4.998761935713512e-07, "logits/chosen": -18.34568977355957, "logits/rejected": -16.276836395263672, "logps/chosen": -534.8225708007812, "logps/rejected": -272.27764892578125, "loss": 0.6971, "rewards/accuracies": 0.5, "rewards/chosen": -0.0034585571847856045, "rewards/margins": -0.007839241996407509, "rewards/rejected": 0.00438068388029933, "step": 20 }, { "epoch": 0.0013928223222990854, "grad_norm": 38.84819793701172, "learning_rate": 4.997988145534457e-07, "logits/chosen": -18.071977615356445, "logits/rejected": -17.937307357788086, "logps/chosen": -367.5150451660156, "logps/rejected": -338.6476745605469, "loss": 0.6936, "rewards/accuracies": 0.5, "rewards/chosen": 0.004035797901451588, "rewards/margins": -0.0009393498185090721, "rewards/rejected": 0.00497514707967639, "step": 30 }, { "epoch": 0.0018570964297321137, "grad_norm": 109.9681396484375, "learning_rate": 4.997214355355402e-07, "logits/chosen": -18.800640106201172, "logits/rejected": -18.043800354003906, "logps/chosen": -339.106689453125, "logps/rejected": -371.49176025390625, "loss": 0.6904, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.003316841321066022, "rewards/margins": 0.0054497141391038895, "rewards/rejected": -0.0021328735165297985, "step": 40 }, 
{ "epoch": 0.0023213705371651423, "grad_norm": 72.28086853027344, "learning_rate": 4.996440565176346e-07, "logits/chosen": -17.377155303955078, "logits/rejected": -17.02338218688965, "logps/chosen": -374.7172546386719, "logps/rejected": -285.9313049316406, "loss": 0.6947, "rewards/accuracies": 0.5, "rewards/chosen": 0.005187683273106813, "rewards/margins": -0.0030665399972349405, "rewards/rejected": 0.008254223503172398, "step": 50 }, { "epoch": 0.0027856446445981707, "grad_norm": 47.192413330078125, "learning_rate": 4.995666774997291e-07, "logits/chosen": -18.28397560119629, "logits/rejected": -17.141586303710938, "logps/chosen": -457.67974853515625, "logps/rejected": -278.93597412109375, "loss": 0.692, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.006996384356170893, "rewards/margins": 0.0024113082326948643, "rewards/rejected": 0.004585075192153454, "step": 60 }, { "epoch": 0.003249918752031199, "grad_norm": 132.48822021484375, "learning_rate": 4.994892984818237e-07, "logits/chosen": -17.532888412475586, "logits/rejected": -17.09868049621582, "logps/chosen": -470.0636291503906, "logps/rejected": -417.4432067871094, "loss": 0.6933, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.010792998597025871, "rewards/margins": -0.00025329607888124883, "rewards/rejected": 0.01104629598557949, "step": 70 }, { "epoch": 0.0037141928594642275, "grad_norm": 145.8800506591797, "learning_rate": 4.994119194639181e-07, "logits/chosen": -18.053422927856445, "logits/rejected": -17.798110961914062, "logps/chosen": -412.2059020996094, "logps/rejected": -313.39959716796875, "loss": 0.6913, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.008022995665669441, "rewards/margins": 0.0038014601450413465, "rewards/rejected": 0.004221535287797451, "step": 80 }, { "epoch": 0.004178466966897256, "grad_norm": 16.749162673950195, "learning_rate": 4.993345404460127e-07, "logits/chosen": -17.667800903320312, "logits/rejected": -17.260059356689453, 
"logps/chosen": -416.8697814941406, "logps/rejected": -358.50555419921875, "loss": 0.6947, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.01759599894285202, "rewards/margins": -0.0030597872100770473, "rewards/rejected": 0.020655784755945206, "step": 90 }, { "epoch": 0.004642741074330285, "grad_norm": 50.72783660888672, "learning_rate": 4.992571614281071e-07, "logits/chosen": -17.58572006225586, "logits/rejected": -17.182281494140625, "logps/chosen": -419.02386474609375, "logps/rejected": -295.6249084472656, "loss": 0.6928, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.026140213012695312, "rewards/margins": 0.00084352504927665, "rewards/rejected": 0.025296688079833984, "step": 100 }, { "epoch": 0.005107015181763313, "grad_norm": 87.5359878540039, "learning_rate": 4.991797824102016e-07, "logits/chosen": -17.958845138549805, "logits/rejected": -17.23076057434082, "logps/chosen": -352.3335266113281, "logps/rejected": -321.82562255859375, "loss": 0.6919, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.024649199098348618, "rewards/margins": 0.002563286107033491, "rewards/rejected": 0.02208591438829899, "step": 110 }, { "epoch": 0.0055712892891963415, "grad_norm": 124.68074798583984, "learning_rate": 4.991101412940867e-07, "logits/chosen": -17.712963104248047, "logits/rejected": -17.31610107421875, "logps/chosen": -506.82733154296875, "logps/rejected": -416.5594787597656, "loss": 0.6933, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.03355148434638977, "rewards/margins": -0.00016899108595680445, "rewards/rejected": 0.0337204709649086, "step": 120 }, { "epoch": 0.00603556339662937, "grad_norm": 112.43367767333984, "learning_rate": 4.990327622761812e-07, "logits/chosen": -18.753185272216797, "logits/rejected": -17.637178421020508, "logps/chosen": -584.5308837890625, "logps/rejected": -369.9120178222656, "loss": 0.6896, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.03520752117037773, 
"rewards/margins": 0.007163695991039276, "rewards/rejected": 0.028043825179338455, "step": 130 }, { "epoch": 0.006499837504062398, "grad_norm": 126.74703216552734, "learning_rate": 4.989553832582756e-07, "logits/chosen": -18.503995895385742, "logits/rejected": -17.26997184753418, "logps/chosen": -419.85870361328125, "logps/rejected": -327.6223449707031, "loss": 0.6936, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.030198972672224045, "rewards/margins": -0.0006454478716477752, "rewards/rejected": 0.030844420194625854, "step": 140 }, { "epoch": 0.006964111611495427, "grad_norm": 42.68468475341797, "learning_rate": 4.988780042403701e-07, "logits/chosen": -17.77851104736328, "logits/rejected": -17.00107192993164, "logps/chosen": -366.80303955078125, "logps/rejected": -280.18475341796875, "loss": 0.6914, "rewards/accuracies": 0.5, "rewards/chosen": 0.032206423580646515, "rewards/margins": 0.0036139674484729767, "rewards/rejected": 0.028592456132173538, "step": 150 }, { "epoch": 0.007428385718928455, "grad_norm": 125.28565216064453, "learning_rate": 4.988006252224647e-07, "logits/chosen": -18.57535171508789, "logits/rejected": -18.080150604248047, "logps/chosen": -465.6023864746094, "logps/rejected": -413.2869567871094, "loss": 0.6981, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.043610382825136185, "rewards/margins": -0.009667357429862022, "rewards/rejected": 0.053277742117643356, "step": 160 }, { "epoch": 0.007892659826361484, "grad_norm": 68.58483123779297, "learning_rate": 4.987232462045591e-07, "logits/chosen": -17.779403686523438, "logits/rejected": -16.729183197021484, "logps/chosen": -462.41546630859375, "logps/rejected": -360.7679748535156, "loss": 0.6905, "rewards/accuracies": 0.5, "rewards/chosen": 0.043996963649988174, "rewards/margins": 0.005375136621296406, "rewards/rejected": 0.038621824234724045, "step": 170 }, { "epoch": 0.008356933933794513, "grad_norm": 64.64506530761719, "learning_rate": 4.986458671866537e-07, 
"logits/chosen": -17.9097900390625, "logits/rejected": -16.85590171813965, "logps/chosen": -359.67974853515625, "logps/rejected": -265.947998046875, "loss": 0.6903, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.03493202477693558, "rewards/margins": 0.006092376541346312, "rewards/rejected": 0.02883964404463768, "step": 180 }, { "epoch": 0.008821208041227541, "grad_norm": 70.85740661621094, "learning_rate": 4.985684881687481e-07, "logits/chosen": -18.691137313842773, "logits/rejected": -18.640888214111328, "logps/chosen": -395.5958557128906, "logps/rejected": -323.21820068359375, "loss": 0.6916, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.031446732580661774, "rewards/margins": 0.0032889661379158497, "rewards/rejected": 0.02815776690840721, "step": 190 }, { "epoch": 0.00928548214866057, "grad_norm": 117.32345581054688, "learning_rate": 4.984911091508426e-07, "logits/chosen": -18.334009170532227, "logits/rejected": -18.21535873413086, "logps/chosen": -480.1629943847656, "logps/rejected": -465.93890380859375, "loss": 0.6888, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.06106582283973694, "rewards/margins": 0.008852502331137657, "rewards/rejected": 0.05221332982182503, "step": 200 }, { "epoch": 0.009749756256093598, "grad_norm": 88.20452117919922, "learning_rate": 4.984137301329372e-07, "logits/chosen": -17.46588134765625, "logits/rejected": -17.68910789489746, "logps/chosen": -343.86944580078125, "logps/rejected": -363.8896179199219, "loss": 0.6927, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.05330247804522514, "rewards/margins": 0.001011886983178556, "rewards/rejected": 0.052290596067905426, "step": 210 }, { "epoch": 0.010214030363526626, "grad_norm": 17.463394165039062, "learning_rate": 4.983363511150316e-07, "logits/chosen": -17.877674102783203, "logits/rejected": -16.644685745239258, "logps/chosen": -356.67596435546875, "logps/rejected": -295.6488037109375, "loss": 0.6862, "rewards/accuracies": 
0.4000000059604645, "rewards/chosen": 0.05409309267997742, "rewards/margins": 0.01423316914588213, "rewards/rejected": 0.03985992446541786, "step": 220 }, { "epoch": 0.010678304470959655, "grad_norm": 47.97230911254883, "learning_rate": 4.982589720971262e-07, "logits/chosen": -18.54967498779297, "logits/rejected": -17.850101470947266, "logps/chosen": -317.71026611328125, "logps/rejected": -286.8714294433594, "loss": 0.6914, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.03822875767946243, "rewards/margins": 0.003724134061485529, "rewards/rejected": 0.03450462594628334, "step": 230 }, { "epoch": 0.011142578578392683, "grad_norm": 42.19074630737305, "learning_rate": 4.981815930792206e-07, "logits/chosen": -17.337276458740234, "logits/rejected": -16.544525146484375, "logps/chosen": -323.08544921875, "logps/rejected": -280.44390869140625, "loss": 0.6874, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.04832092672586441, "rewards/margins": 0.011687373742461205, "rewards/rejected": 0.03663354739546776, "step": 240 }, { "epoch": 0.011606852685825711, "grad_norm": 65.552734375, "learning_rate": 4.981042140613151e-07, "logits/chosen": -17.709835052490234, "logits/rejected": -16.833106994628906, "logps/chosen": -319.45855712890625, "logps/rejected": -238.2582550048828, "loss": 0.6871, "rewards/accuracies": 0.5, "rewards/chosen": 0.0522405207157135, "rewards/margins": 0.01238144002854824, "rewards/rejected": 0.03985908254981041, "step": 250 }, { "epoch": 0.01207112679325874, "grad_norm": 88.52220916748047, "learning_rate": 4.980268350434096e-07, "logits/chosen": -18.846094131469727, "logits/rejected": -17.96956443786621, "logps/chosen": -375.3572998046875, "logps/rejected": -284.99774169921875, "loss": 0.6859, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.06143417954444885, "rewards/margins": 0.01475982554256916, "rewards/rejected": 0.046674348413944244, "step": 260 }, { "epoch": 0.012535400900691768, "grad_norm": 70.00833892822266, 
"learning_rate": 4.979494560255041e-07, "logits/chosen": -17.77584457397461, "logits/rejected": -16.456830978393555, "logps/chosen": -459.77532958984375, "logps/rejected": -301.1440734863281, "loss": 0.6832, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.07328568398952484, "rewards/margins": 0.02032577618956566, "rewards/rejected": 0.052959900349378586, "step": 270 }, { "epoch": 0.012999675008124796, "grad_norm": 117.75694274902344, "learning_rate": 4.978720770075986e-07, "logits/chosen": -18.094165802001953, "logits/rejected": -17.574451446533203, "logps/chosen": -493.99188232421875, "logps/rejected": -458.62005615234375, "loss": 0.6892, "rewards/accuracies": 0.5, "rewards/chosen": 0.07689239829778671, "rewards/margins": 0.008304975926876068, "rewards/rejected": 0.06858741492033005, "step": 280 }, { "epoch": 0.013463949115557825, "grad_norm": 129.7517852783203, "learning_rate": 4.97794697989693e-07, "logits/chosen": -17.61233139038086, "logits/rejected": -17.110321044921875, "logps/chosen": -431.96502685546875, "logps/rejected": -321.2626037597656, "loss": 0.69, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.06704983115196228, "rewards/margins": 0.006820181850343943, "rewards/rejected": 0.060229647904634476, "step": 290 }, { "epoch": 0.013928223222990853, "grad_norm": 82.80023956298828, "learning_rate": 4.977173189717876e-07, "logits/chosen": -17.72490882873535, "logits/rejected": -17.09225082397461, "logps/chosen": -425.963134765625, "logps/rejected": -335.09210205078125, "loss": 0.6885, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.07977095991373062, "rewards/margins": 0.009929769672453403, "rewards/rejected": 0.06984119117259979, "step": 300 }, { "epoch": 0.014392497330423882, "grad_norm": 80.35481262207031, "learning_rate": 4.976399399538821e-07, "logits/chosen": -18.40236473083496, "logits/rejected": -17.976112365722656, "logps/chosen": -404.6319885253906, "logps/rejected": -348.9981689453125, "loss": 0.6839, 
"rewards/accuracies": 0.5, "rewards/chosen": 0.0850236564874649, "rewards/margins": 0.01895195245742798, "rewards/rejected": 0.06607170403003693, "step": 310 }, { "epoch": 0.01485677143785691, "grad_norm": 65.72504425048828, "learning_rate": 4.975625609359766e-07, "logits/chosen": -16.96441078186035, "logits/rejected": -16.445446014404297, "logps/chosen": -326.19427490234375, "logps/rejected": -304.6036682128906, "loss": 0.6903, "rewards/accuracies": 0.5, "rewards/chosen": 0.07119308412075043, "rewards/margins": 0.006357173435389996, "rewards/rejected": 0.06483592092990875, "step": 320 }, { "epoch": 0.015321045545289938, "grad_norm": 108.95886993408203, "learning_rate": 4.974851819180711e-07, "logits/chosen": -17.162639617919922, "logits/rejected": -17.278350830078125, "logps/chosen": -421.80322265625, "logps/rejected": -417.9771423339844, "loss": 0.6947, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.08393996953964233, "rewards/margins": -0.0024713133461773396, "rewards/rejected": 0.08641128987073898, "step": 330 }, { "epoch": 0.01578531965272297, "grad_norm": 97.03437805175781, "learning_rate": 4.974078029001655e-07, "logits/chosen": -17.1955623626709, "logits/rejected": -17.53921890258789, "logps/chosen": -379.0458679199219, "logps/rejected": -387.4994812011719, "loss": 0.7019, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.07352500408887863, "rewards/margins": -0.016886789351701736, "rewards/rejected": 0.09041179716587067, "step": 340 }, { "epoch": 0.016249593760155997, "grad_norm": 117.67225646972656, "learning_rate": 4.9733042388226e-07, "logits/chosen": -17.758155822753906, "logits/rejected": -18.635412216186523, "logps/chosen": -448.93218994140625, "logps/rejected": -485.569580078125, "loss": 0.6984, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.08532349020242691, "rewards/margins": -0.00916366372257471, "rewards/rejected": 0.09448715299367905, "step": 350 }, { "epoch": 0.016713867867589025, "grad_norm": 
176.6046600341797, "learning_rate": 4.972530448643546e-07, "logits/chosen": -18.779499053955078, "logits/rejected": -17.392906188964844, "logps/chosen": -574.8807373046875, "logps/rejected": -446.85345458984375, "loss": 0.673, "rewards/accuracies": 1.0, "rewards/chosen": 0.1144619807600975, "rewards/margins": 0.040906794369220734, "rewards/rejected": 0.07355518639087677, "step": 360 }, { "epoch": 0.017178141975022054, "grad_norm": 87.94388580322266, "learning_rate": 4.97175665846449e-07, "logits/chosen": -19.23215103149414, "logits/rejected": -18.688756942749023, "logps/chosen": -397.40582275390625, "logps/rejected": -409.56451416015625, "loss": 0.6966, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.07668136805295944, "rewards/margins": -0.006537170149385929, "rewards/rejected": 0.0832185372710228, "step": 370 }, { "epoch": 0.017642416082455082, "grad_norm": 99.13550567626953, "learning_rate": 4.970982868285436e-07, "logits/chosen": -18.353870391845703, "logits/rejected": -17.020305633544922, "logps/chosen": -426.6875, "logps/rejected": -387.4388122558594, "loss": 0.6914, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.0992642194032669, "rewards/margins": 0.004247800447046757, "rewards/rejected": 0.09501641243696213, "step": 380 }, { "epoch": 0.01810669018988811, "grad_norm": 42.27467727661133, "learning_rate": 4.970209078106381e-07, "logits/chosen": -17.379249572753906, "logits/rejected": -17.38232421875, "logps/chosen": -411.13055419921875, "logps/rejected": -400.33465576171875, "loss": 0.6873, "rewards/accuracies": 0.5, "rewards/chosen": 0.0925503596663475, "rewards/margins": 0.012198790907859802, "rewards/rejected": 0.0803515613079071, "step": 390 }, { "epoch": 0.01857096429732114, "grad_norm": 108.2042465209961, "learning_rate": 4.969435287927325e-07, "logits/chosen": -17.539453506469727, "logits/rejected": -17.062538146972656, "logps/chosen": -431.4325256347656, "logps/rejected": -368.62432861328125, "loss": 0.683, 
"rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.08584778010845184, "rewards/margins": 0.021316375583410263, "rewards/rejected": 0.06453140079975128, "step": 400 }, { "epoch": 0.019035238404754167, "grad_norm": 54.28947067260742, "learning_rate": 4.968661497748271e-07, "logits/chosen": -17.21811294555664, "logits/rejected": -17.208616256713867, "logps/chosen": -306.272216796875, "logps/rejected": -252.0657196044922, "loss": 0.6816, "rewards/accuracies": 0.5, "rewards/chosen": 0.07171188294887543, "rewards/margins": 0.024051282554864883, "rewards/rejected": 0.047660600394010544, "step": 410 }, { "epoch": 0.019499512512187196, "grad_norm": 114.88614654541016, "learning_rate": 4.967887707569215e-07, "logits/chosen": -18.569185256958008, "logits/rejected": -17.514169692993164, "logps/chosen": -518.6094970703125, "logps/rejected": -369.9266662597656, "loss": 0.6682, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.13927491009235382, "rewards/margins": 0.051749348640441895, "rewards/rejected": 0.08752556145191193, "step": 420 }, { "epoch": 0.019963786619620224, "grad_norm": 83.80878448486328, "learning_rate": 4.967113917390161e-07, "logits/chosen": -18.19792938232422, "logits/rejected": -16.660785675048828, "logps/chosen": -493.9063415527344, "logps/rejected": -305.0128479003906, "loss": 0.6681, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.13094481825828552, "rewards/margins": 0.051466524600982666, "rewards/rejected": 0.07947830855846405, "step": 430 }, { "epoch": 0.020428060727053252, "grad_norm": 71.47705841064453, "learning_rate": 4.966340127211105e-07, "logits/chosen": -18.23004722595215, "logits/rejected": -18.401042938232422, "logps/chosen": -550.6630249023438, "logps/rejected": -424.2240295410156, "loss": 0.68, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.12120163440704346, "rewards/margins": 0.027766723185777664, "rewards/rejected": 0.09343491494655609, "step": 440 }, { "epoch": 0.02089233483448628, 
"grad_norm": 123.80332946777344, "learning_rate": 4.96556633703205e-07, "logits/chosen": -18.472692489624023, "logits/rejected": -17.519676208496094, "logps/chosen": -430.20245361328125, "logps/rejected": -327.77239990234375, "loss": 0.6643, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.15226905047893524, "rewards/margins": 0.06019951030611992, "rewards/rejected": 0.09206955134868622, "step": 450 }, { "epoch": 0.02135660894191931, "grad_norm": 94.92717742919922, "learning_rate": 4.964792546852995e-07, "logits/chosen": -17.063236236572266, "logits/rejected": -17.283958435058594, "logps/chosen": -395.6430969238281, "logps/rejected": -422.48388671875, "loss": 0.6919, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.12185509502887726, "rewards/margins": 0.003558885306119919, "rewards/rejected": 0.11829620599746704, "step": 460 }, { "epoch": 0.021820883049352337, "grad_norm": 75.02912139892578, "learning_rate": 4.96401875667394e-07, "logits/chosen": -17.513744354248047, "logits/rejected": -16.666202545166016, "logps/chosen": -513.2591552734375, "logps/rejected": -357.27691650390625, "loss": 0.6804, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.15376418828964233, "rewards/margins": 0.028661692515015602, "rewards/rejected": 0.12510250508785248, "step": 470 }, { "epoch": 0.022285157156785366, "grad_norm": 105.78536224365234, "learning_rate": 4.963244966494885e-07, "logits/chosen": -17.555442810058594, "logits/rejected": -16.65084457397461, "logps/chosen": -337.0568542480469, "logps/rejected": -272.2943115234375, "loss": 0.6744, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.09262378513813019, "rewards/margins": 0.0385090671479702, "rewards/rejected": 0.05411472171545029, "step": 480 }, { "epoch": 0.022749431264218394, "grad_norm": 56.76881790161133, "learning_rate": 4.96247117631583e-07, "logits/chosen": -18.758502960205078, "logits/rejected": -17.17776107788086, "logps/chosen": -443.166259765625, "logps/rejected": 
-226.87326049804688, "loss": 0.6497, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.19782620668411255, "rewards/margins": 0.09110309928655624, "rewards/rejected": 0.1067231073975563, "step": 490 }, { "epoch": 0.023213705371651423, "grad_norm": 118.21449279785156, "learning_rate": 4.961697386136775e-07, "logits/chosen": -18.071483612060547, "logits/rejected": -17.3537654876709, "logps/chosen": -531.8092651367188, "logps/rejected": -424.6444396972656, "loss": 0.6764, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.13947024941444397, "rewards/margins": 0.03711536526679993, "rewards/rejected": 0.10235488414764404, "step": 500 }, { "epoch": 0.02367797947908445, "grad_norm": 143.818603515625, "learning_rate": 4.96092359595772e-07, "logits/chosen": -17.950496673583984, "logits/rejected": -17.213773727416992, "logps/chosen": -395.1738586425781, "logps/rejected": -342.3895263671875, "loss": 0.6803, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.13050240278244019, "rewards/margins": 0.0268798116594553, "rewards/rejected": 0.10362259298563004, "step": 510 }, { "epoch": 0.02414225358651748, "grad_norm": 51.067710876464844, "learning_rate": 4.960149805778665e-07, "logits/chosen": -17.945388793945312, "logits/rejected": -17.09832191467285, "logps/chosen": -391.70819091796875, "logps/rejected": -267.0955810546875, "loss": 0.6852, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.12365646660327911, "rewards/margins": 0.016895906999707222, "rewards/rejected": 0.10676056146621704, "step": 520 }, { "epoch": 0.024606527693950508, "grad_norm": 46.67741394042969, "learning_rate": 4.95937601559961e-07, "logits/chosen": -17.801054000854492, "logits/rejected": -17.069021224975586, "logps/chosen": -397.6291809082031, "logps/rejected": -389.2697448730469, "loss": 0.6865, "rewards/accuracies": 0.5, "rewards/chosen": 0.15242035686969757, "rewards/margins": 0.014359131455421448, "rewards/rejected": 0.13806122541427612, "step": 530 }, { 
"epoch": 0.025070801801383536, "grad_norm": 98.83032989501953, "learning_rate": 4.958602225420555e-07, "logits/chosen": -17.82821273803711, "logits/rejected": -17.44839096069336, "logps/chosen": -365.53228759765625, "logps/rejected": -288.8079528808594, "loss": 0.6823, "rewards/accuracies": 0.5, "rewards/chosen": 0.11723946034908295, "rewards/margins": 0.022600602358579636, "rewards/rejected": 0.0946388691663742, "step": 540 }, { "epoch": 0.025535075908816564, "grad_norm": 18.539230346679688, "learning_rate": 4.957828435241499e-07, "logits/chosen": -17.738080978393555, "logits/rejected": -16.493099212646484, "logps/chosen": -484.01348876953125, "logps/rejected": -304.795654296875, "loss": 0.6713, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.17410516738891602, "rewards/margins": 0.047150857746601105, "rewards/rejected": 0.12695430219173431, "step": 550 }, { "epoch": 0.025999350016249593, "grad_norm": 56.14430236816406, "learning_rate": 4.957054645062445e-07, "logits/chosen": -17.549610137939453, "logits/rejected": -17.42966651916504, "logps/chosen": -413.958740234375, "logps/rejected": -391.54449462890625, "loss": 0.6901, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.18189452588558197, "rewards/margins": 0.009464675560593605, "rewards/rejected": 0.1724298745393753, "step": 560 }, { "epoch": 0.02646362412368262, "grad_norm": 54.3647575378418, "learning_rate": 4.956280854883389e-07, "logits/chosen": -18.58170509338379, "logits/rejected": -16.937191009521484, "logps/chosen": -451.0340270996094, "logps/rejected": -392.35302734375, "loss": 0.6778, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.18048004806041718, "rewards/margins": 0.031705550849437714, "rewards/rejected": 0.14877448976039886, "step": 570 }, { "epoch": 0.02692789823111565, "grad_norm": 121.10553741455078, "learning_rate": 4.955507064704335e-07, "logits/chosen": -19.617137908935547, "logits/rejected": -18.598772048950195, "logps/chosen": -423.6399841308594, 
"logps/rejected": -365.18072509765625, "loss": 0.6802, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.17456498742103577, "rewards/margins": 0.027310485020279884, "rewards/rejected": 0.14725448191165924, "step": 580 }, { "epoch": 0.027392172338548678, "grad_norm": 108.2132568359375, "learning_rate": 4.95473327452528e-07, "logits/chosen": -18.811893463134766, "logits/rejected": -17.562179565429688, "logps/chosen": -481.0946350097656, "logps/rejected": -373.09503173828125, "loss": 0.6737, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.1917921006679535, "rewards/margins": 0.04345905780792236, "rewards/rejected": 0.14833302795886993, "step": 590 }, { "epoch": 0.027856446445981706, "grad_norm": 110.93574523925781, "learning_rate": 4.953959484346224e-07, "logits/chosen": -18.03829574584961, "logits/rejected": -17.602359771728516, "logps/chosen": -312.7474670410156, "logps/rejected": -284.3268737792969, "loss": 0.676, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.15058653056621552, "rewards/margins": 0.03676381707191467, "rewards/rejected": 0.11382272094488144, "step": 600 }, { "epoch": 0.028320720553414735, "grad_norm": 110.59113311767578, "learning_rate": 4.95318569416717e-07, "logits/chosen": -18.477428436279297, "logits/rejected": -17.700679779052734, "logps/chosen": -410.2960510253906, "logps/rejected": -403.50274658203125, "loss": 0.7047, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.16679084300994873, "rewards/margins": -0.02118298038840294, "rewards/rejected": 0.18797384202480316, "step": 610 }, { "epoch": 0.028784994660847763, "grad_norm": 77.5689926147461, "learning_rate": 4.952411903988114e-07, "logits/chosen": -19.243616104125977, "logits/rejected": -18.731975555419922, "logps/chosen": -415.1881408691406, "logps/rejected": -367.0069885253906, "loss": 0.6873, "rewards/accuracies": 0.5, "rewards/chosen": 0.139873206615448, "rewards/margins": 0.013218766078352928, "rewards/rejected": 0.12665443122386932, 
"step": 620 }, { "epoch": 0.02924926876828079, "grad_norm": 107.44772338867188, "learning_rate": 4.951638113809059e-07, "logits/chosen": -18.22382926940918, "logits/rejected": -17.995986938476562, "logps/chosen": -522.4598388671875, "logps/rejected": -503.7972717285156, "loss": 0.6846, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.2226608544588089, "rewards/margins": 0.02277100831270218, "rewards/rejected": 0.19988982379436493, "step": 630 }, { "epoch": 0.02971354287571382, "grad_norm": 138.22080993652344, "learning_rate": 4.950864323630004e-07, "logits/chosen": -18.69261360168457, "logits/rejected": -17.746488571166992, "logps/chosen": -613.2137451171875, "logps/rejected": -408.1347961425781, "loss": 0.6572, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.2816910147666931, "rewards/margins": 0.07400993257761002, "rewards/rejected": 0.20768103003501892, "step": 640 }, { "epoch": 0.03017781698314685, "grad_norm": 80.59423065185547, "learning_rate": 4.950090533450949e-07, "logits/chosen": -16.56768035888672, "logits/rejected": -16.516159057617188, "logps/chosen": -336.1548156738281, "logps/rejected": -295.4664001464844, "loss": 0.6912, "rewards/accuracies": 0.5, "rewards/chosen": 0.19782400131225586, "rewards/margins": 0.009788709692656994, "rewards/rejected": 0.1880352646112442, "step": 650 }, { "epoch": 0.030642091090579877, "grad_norm": 149.48768615722656, "learning_rate": 4.949316743271894e-07, "logits/chosen": -18.18301010131836, "logits/rejected": -16.67117691040039, "logps/chosen": -469.415771484375, "logps/rejected": -302.67242431640625, "loss": 0.6479, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.23285861313343048, "rewards/margins": 0.09554831683635712, "rewards/rejected": 0.13731025159358978, "step": 660 }, { "epoch": 0.031106365198012905, "grad_norm": 54.66472625732422, "learning_rate": 4.948542953092839e-07, "logits/chosen": -17.46908187866211, "logits/rejected": -17.178682327270508, "logps/chosen": 
-313.0638427734375, "logps/rejected": -337.2393493652344, "loss": 0.7026, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.16295365989208221, "rewards/margins": -0.015360032208263874, "rewards/rejected": 0.17831368744373322, "step": 670 }, { "epoch": 0.03157063930544594, "grad_norm": 114.5217514038086, "learning_rate": 4.947769162913784e-07, "logits/chosen": -17.638408660888672, "logits/rejected": -16.86486053466797, "logps/chosen": -424.6212463378906, "logps/rejected": -317.3565368652344, "loss": 0.6497, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.24791750311851501, "rewards/margins": 0.09117875248193741, "rewards/rejected": 0.156738743185997, "step": 680 }, { "epoch": 0.032034913412878965, "grad_norm": 90.79376220703125, "learning_rate": 4.946995372734729e-07, "logits/chosen": -18.662534713745117, "logits/rejected": -17.318862915039062, "logps/chosen": -383.79571533203125, "logps/rejected": -329.0240783691406, "loss": 0.6696, "rewards/accuracies": 0.5, "rewards/chosen": 0.24651260673999786, "rewards/margins": 0.0539519377052784, "rewards/rejected": 0.19256064295768738, "step": 690 }, { "epoch": 0.032499187520311994, "grad_norm": 167.56935119628906, "learning_rate": 4.946221582555674e-07, "logits/chosen": -17.397594451904297, "logits/rejected": -16.679346084594727, "logps/chosen": -511.24700927734375, "logps/rejected": -480.5392150878906, "loss": 0.693, "rewards/accuracies": 0.5, "rewards/chosen": 0.17818203568458557, "rewards/margins": 0.0028832629323005676, "rewards/rejected": 0.1752987802028656, "step": 700 }, { "epoch": 0.03296346162774502, "grad_norm": 106.59515380859375, "learning_rate": 4.945447792376619e-07, "logits/chosen": -18.0601749420166, "logits/rejected": -17.068239212036133, "logps/chosen": -451.2420349121094, "logps/rejected": -351.9391174316406, "loss": 0.6851, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.2702702283859253, "rewards/margins": 0.01896057091653347, "rewards/rejected": 
0.25130966305732727, "step": 710 }, { "epoch": 0.03342773573517805, "grad_norm": 81.26216888427734, "learning_rate": 4.944674002197564e-07, "logits/chosen": -18.360576629638672, "logits/rejected": -16.417499542236328, "logps/chosen": -595.0709228515625, "logps/rejected": -349.9635314941406, "loss": 0.6186, "rewards/accuracies": 1.0, "rewards/chosen": 0.3620436191558838, "rewards/margins": 0.15828420221805573, "rewards/rejected": 0.20375943183898926, "step": 720 }, { "epoch": 0.03389200984261108, "grad_norm": 74.09405517578125, "learning_rate": 4.943900212018509e-07, "logits/chosen": -17.93579864501953, "logits/rejected": -17.062358856201172, "logps/chosen": -409.0330810546875, "logps/rejected": -282.5763244628906, "loss": 0.6743, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.23541852831840515, "rewards/margins": 0.04141991212964058, "rewards/rejected": 0.19399864971637726, "step": 730 }, { "epoch": 0.03435628395004411, "grad_norm": 119.47509765625, "learning_rate": 4.943126421839454e-07, "logits/chosen": -17.37395668029785, "logits/rejected": -16.952579498291016, "logps/chosen": -407.88568115234375, "logps/rejected": -291.6640319824219, "loss": 0.6654, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.2861270308494568, "rewards/margins": 0.061374079436063766, "rewards/rejected": 0.22475294768810272, "step": 740 }, { "epoch": 0.034820558057477136, "grad_norm": 72.00702667236328, "learning_rate": 4.942352631660398e-07, "logits/chosen": -17.291683197021484, "logits/rejected": -17.798049926757812, "logps/chosen": -346.2906494140625, "logps/rejected": -370.51116943359375, "loss": 0.6972, "rewards/accuracies": 0.5, "rewards/chosen": 0.2176261842250824, "rewards/margins": -0.005457915365695953, "rewards/rejected": 0.22308412194252014, "step": 750 }, { "epoch": 0.035284832164910164, "grad_norm": 118.10411834716797, "learning_rate": 4.941578841481344e-07, "logits/chosen": -17.79428482055664, "logits/rejected": -18.197956085205078, "logps/chosen": 
-354.5298156738281, "logps/rejected": -430.397705078125, "loss": 0.695, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.26250046491622925, "rewards/margins": 0.0030317590571939945, "rewards/rejected": 0.2594687044620514, "step": 760 }, { "epoch": 0.03574910627234319, "grad_norm": 108.8724365234375, "learning_rate": 4.940805051302288e-07, "logits/chosen": -18.67434310913086, "logits/rejected": -16.853235244750977, "logps/chosen": -506.5414123535156, "logps/rejected": -308.36761474609375, "loss": 0.6562, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.298738569021225, "rewards/margins": 0.08020156621932983, "rewards/rejected": 0.21853700280189514, "step": 770 }, { "epoch": 0.03621338037977622, "grad_norm": 83.46111297607422, "learning_rate": 4.940031261123234e-07, "logits/chosen": -17.61901092529297, "logits/rejected": -18.071285247802734, "logps/chosen": -298.52618408203125, "logps/rejected": -311.76019287109375, "loss": 0.7033, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.21990792453289032, "rewards/margins": -0.017243079841136932, "rewards/rejected": 0.23715099692344666, "step": 780 }, { "epoch": 0.03667765448720925, "grad_norm": 78.98616790771484, "learning_rate": 4.939257470944179e-07, "logits/chosen": -18.22941017150879, "logits/rejected": -17.840290069580078, "logps/chosen": -449.56719970703125, "logps/rejected": -383.66839599609375, "loss": 0.6745, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.21889328956604004, "rewards/margins": 0.039863891899585724, "rewards/rejected": 0.1790294051170349, "step": 790 }, { "epoch": 0.03714192859464228, "grad_norm": 113.42011260986328, "learning_rate": 4.938483680765123e-07, "logits/chosen": -19.06601333618164, "logits/rejected": -17.62656593322754, "logps/chosen": -543.2867431640625, "logps/rejected": -384.7690734863281, "loss": 0.6276, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.31357789039611816, "rewards/margins": 0.13847552239894867, 
"rewards/rejected": 0.1751023530960083, "step": 800 }, { "epoch": 0.037606202702075306, "grad_norm": 78.73660278320312, "learning_rate": 4.937709890586069e-07, "logits/chosen": -17.89183807373047, "logits/rejected": -16.643213272094727, "logps/chosen": -416.55499267578125, "logps/rejected": -282.9198913574219, "loss": 0.6457, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.3068270683288574, "rewards/margins": 0.11325763165950775, "rewards/rejected": 0.19356945157051086, "step": 810 }, { "epoch": 0.038070476809508334, "grad_norm": 67.65589141845703, "learning_rate": 4.936936100407014e-07, "logits/chosen": -18.654006958007812, "logits/rejected": -17.678241729736328, "logps/chosen": -442.9937438964844, "logps/rejected": -338.5890808105469, "loss": 0.6725, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.28907501697540283, "rewards/margins": 0.044030144810676575, "rewards/rejected": 0.24504487216472626, "step": 820 }, { "epoch": 0.03853475091694136, "grad_norm": 105.36833953857422, "learning_rate": 4.936162310227958e-07, "logits/chosen": -17.789791107177734, "logits/rejected": -16.79059600830078, "logps/chosen": -527.3004150390625, "logps/rejected": -401.56768798828125, "loss": 0.6675, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.3229188323020935, "rewards/margins": 0.06193291023373604, "rewards/rejected": 0.26098594069480896, "step": 830 }, { "epoch": 0.03899902502437439, "grad_norm": 145.49142456054688, "learning_rate": 4.935388520048903e-07, "logits/chosen": -17.033315658569336, "logits/rejected": -16.511064529418945, "logps/chosen": -384.91656494140625, "logps/rejected": -342.54083251953125, "loss": 0.6462, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.3277242183685303, "rewards/margins": 0.10571081936359406, "rewards/rejected": 0.22201338410377502, "step": 840 }, { "epoch": 0.03946329913180742, "grad_norm": 182.2275848388672, "learning_rate": 4.934614729869848e-07, "logits/chosen": -17.521684646606445, 
"logits/rejected": -17.195592880249023, "logps/chosen": -381.4451904296875, "logps/rejected": -387.8620910644531, "loss": 0.6917, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.2766951322555542, "rewards/margins": 0.008991125039756298, "rewards/rejected": 0.26770398020744324, "step": 850 }, { "epoch": 0.03992757323924045, "grad_norm": 133.8595428466797, "learning_rate": 4.933840939690793e-07, "logits/chosen": -17.729167938232422, "logits/rejected": -16.657682418823242, "logps/chosen": -480.79583740234375, "logps/rejected": -440.79278564453125, "loss": 0.6861, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.31072601675987244, "rewards/margins": 0.021489102393388748, "rewards/rejected": 0.2892369031906128, "step": 860 }, { "epoch": 0.040391847346673476, "grad_norm": 103.19798278808594, "learning_rate": 4.933067149511739e-07, "logits/chosen": -18.018396377563477, "logits/rejected": -16.95490264892578, "logps/chosen": -470.993408203125, "logps/rejected": -441.0476989746094, "loss": 0.6672, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.3517216742038727, "rewards/margins": 0.055640868842601776, "rewards/rejected": 0.2960807681083679, "step": 870 }, { "epoch": 0.040856121454106505, "grad_norm": 92.0937728881836, "learning_rate": 4.932293359332683e-07, "logits/chosen": -17.683900833129883, "logits/rejected": -17.2557373046875, "logps/chosen": -304.67730712890625, "logps/rejected": -297.2115783691406, "loss": 0.6774, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.21189625561237335, "rewards/margins": 0.03331343084573746, "rewards/rejected": 0.1785828173160553, "step": 880 }, { "epoch": 0.04132039556153953, "grad_norm": 114.25028991699219, "learning_rate": 4.931519569153628e-07, "logits/chosen": -17.936992645263672, "logits/rejected": -17.105213165283203, "logps/chosen": -438.40838623046875, "logps/rejected": -302.2864685058594, "loss": 0.6252, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 
0.36464759707450867, "rewards/margins": 0.14615468680858612, "rewards/rejected": 0.21849295496940613, "step": 890 }, { "epoch": 0.04178466966897256, "grad_norm": 124.34318542480469, "learning_rate": 4.930745778974573e-07, "logits/chosen": -17.924877166748047, "logits/rejected": -17.75459861755371, "logps/chosen": -449.23974609375, "logps/rejected": -410.95135498046875, "loss": 0.7075, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.357687383890152, "rewards/margins": -0.02089187316596508, "rewards/rejected": 0.3785792589187622, "step": 900 }, { "epoch": 0.04224894377640559, "grad_norm": 115.43102264404297, "learning_rate": 4.929971988795518e-07, "logits/chosen": -17.710216522216797, "logits/rejected": -17.892906188964844, "logps/chosen": -445.085205078125, "logps/rejected": -371.37982177734375, "loss": 0.6961, "rewards/accuracies": 0.5, "rewards/chosen": 0.2735653817653656, "rewards/margins": 0.007566812448203564, "rewards/rejected": 0.2659985423088074, "step": 910 }, { "epoch": 0.04271321788383862, "grad_norm": 86.45709228515625, "learning_rate": 4.929198198616463e-07, "logits/chosen": -17.76266098022461, "logits/rejected": -17.569860458374023, "logps/chosen": -490.1277770996094, "logps/rejected": -427.37994384765625, "loss": 0.6871, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.3638918697834015, "rewards/margins": 0.015198404900729656, "rewards/rejected": 0.3486934304237366, "step": 920 }, { "epoch": 0.043177491991271646, "grad_norm": 112.2062759399414, "learning_rate": 4.928424408437408e-07, "logits/chosen": -18.262142181396484, "logits/rejected": -17.464496612548828, "logps/chosen": -444.43695068359375, "logps/rejected": -348.937744140625, "loss": 0.6829, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.30864518880844116, "rewards/margins": 0.02531389333307743, "rewards/rejected": 0.2833313047885895, "step": 930 }, { "epoch": 0.043641766098704675, "grad_norm": 69.55279541015625, "learning_rate": 4.927650618258353e-07, 
"logits/chosen": -18.599628448486328, "logits/rejected": -17.989137649536133, "logps/chosen": -307.2242431640625, "logps/rejected": -236.47610473632812, "loss": 0.6565, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.25611406564712524, "rewards/margins": 0.0787295550107956, "rewards/rejected": 0.17738449573516846, "step": 940 }, { "epoch": 0.0441060402061377, "grad_norm": 137.59939575195312, "learning_rate": 4.926876828079297e-07, "logits/chosen": -17.039384841918945, "logits/rejected": -16.664213180541992, "logps/chosen": -352.9122314453125, "logps/rejected": -285.76727294921875, "loss": 0.6446, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.28589868545532227, "rewards/margins": 0.10732422024011612, "rewards/rejected": 0.17857445776462555, "step": 950 }, { "epoch": 0.04457031431357073, "grad_norm": 68.52989959716797, "learning_rate": 4.926103037900243e-07, "logits/chosen": -18.70744514465332, "logits/rejected": -18.277463912963867, "logps/chosen": -440.57391357421875, "logps/rejected": -335.1253662109375, "loss": 0.6467, "rewards/accuracies": 0.5, "rewards/chosen": 0.40957656502723694, "rewards/margins": 0.10668816417455673, "rewards/rejected": 0.3028883635997772, "step": 960 }, { "epoch": 0.04503458842100376, "grad_norm": 95.70106506347656, "learning_rate": 4.925329247721188e-07, "logits/chosen": -18.265087127685547, "logits/rejected": -17.620882034301758, "logps/chosen": -477.17755126953125, "logps/rejected": -428.62335205078125, "loss": 0.6641, "rewards/accuracies": 0.5, "rewards/chosen": 0.40869951248168945, "rewards/margins": 0.06458435207605362, "rewards/rejected": 0.34411513805389404, "step": 970 }, { "epoch": 0.04549886252843679, "grad_norm": 82.1656723022461, "learning_rate": 4.924555457542133e-07, "logits/chosen": -18.122264862060547, "logits/rejected": -17.176918029785156, "logps/chosen": -367.2530517578125, "logps/rejected": -239.45724487304688, "loss": 0.6516, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 
0.32010334730148315, "rewards/margins": 0.0874536782503128, "rewards/rejected": 0.23264965415000916, "step": 980 }, { "epoch": 0.04596313663586982, "grad_norm": 65.3374252319336, "learning_rate": 4.923781667363078e-07, "logits/chosen": -18.166431427001953, "logits/rejected": -18.702205657958984, "logps/chosen": -441.7447204589844, "logps/rejected": -365.86212158203125, "loss": 0.692, "rewards/accuracies": 0.5, "rewards/chosen": 0.29792243242263794, "rewards/margins": 0.004135437309741974, "rewards/rejected": 0.29378700256347656, "step": 990 }, { "epoch": 0.046427410743302845, "grad_norm": 91.9826889038086, "learning_rate": 4.923007877184022e-07, "logits/chosen": -18.538728713989258, "logits/rejected": -17.83202362060547, "logps/chosen": -344.6723327636719, "logps/rejected": -318.3431091308594, "loss": 0.6794, "rewards/accuracies": 0.5, "rewards/chosen": 0.38693368434906006, "rewards/margins": 0.036382973194122314, "rewards/rejected": 0.35055071115493774, "step": 1000 }, { "epoch": 0.046891684850735874, "grad_norm": 59.56592559814453, "learning_rate": 4.922234087004968e-07, "logits/chosen": -17.152124404907227, "logits/rejected": -15.976046562194824, "logps/chosen": -386.35736083984375, "logps/rejected": -218.22378540039062, "loss": 0.6077, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.4187738001346588, "rewards/margins": 0.18882103264331818, "rewards/rejected": 0.22995276749134064, "step": 1010 }, { "epoch": 0.0473559589581689, "grad_norm": 99.42151641845703, "learning_rate": 4.921460296825913e-07, "logits/chosen": -17.206647872924805, "logits/rejected": -16.606468200683594, "logps/chosen": -341.95831298828125, "logps/rejected": -283.36920166015625, "loss": 0.6465, "rewards/accuracies": 0.5, "rewards/chosen": 0.3537120223045349, "rewards/margins": 0.10534956306219101, "rewards/rejected": 0.2483624517917633, "step": 1020 }, { "epoch": 0.04782023306560193, "grad_norm": 132.75340270996094, "learning_rate": 4.920686506646857e-07, "logits/chosen": 
-16.29859161376953, "logits/rejected": -16.867151260375977, "logps/chosen": -424.60791015625, "logps/rejected": -522.7360229492188, "loss": 0.7653, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.34171295166015625, "rewards/margins": -0.1315465271472931, "rewards/rejected": 0.47325944900512695, "step": 1030 }, { "epoch": 0.04828450717303496, "grad_norm": 103.6189193725586, "learning_rate": 4.919912716467802e-07, "logits/chosen": -18.75347137451172, "logits/rejected": -17.153827667236328, "logps/chosen": -448.5169372558594, "logps/rejected": -346.39691162109375, "loss": 0.6353, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.417307049036026, "rewards/margins": 0.1315617561340332, "rewards/rejected": 0.2857453227043152, "step": 1040 }, { "epoch": 0.04874878128046799, "grad_norm": 57.144229888916016, "learning_rate": 4.919138926288747e-07, "logits/chosen": -16.673954010009766, "logits/rejected": -16.15968894958496, "logps/chosen": -334.490966796875, "logps/rejected": -304.6171569824219, "loss": 0.6456, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.3942972421646118, "rewards/margins": 0.10404409468173981, "rewards/rejected": 0.2902531921863556, "step": 1050 }, { "epoch": 0.049213055387901015, "grad_norm": 61.963748931884766, "learning_rate": 4.918365136109692e-07, "logits/chosen": -18.113725662231445, "logits/rejected": -17.684234619140625, "logps/chosen": -417.98797607421875, "logps/rejected": -395.82183837890625, "loss": 0.6919, "rewards/accuracies": 0.5, "rewards/chosen": 0.4129757881164551, "rewards/margins": 0.010385936126112938, "rewards/rejected": 0.4025898575782776, "step": 1060 }, { "epoch": 0.049677329495334044, "grad_norm": 104.3405990600586, "learning_rate": 4.917591345930638e-07, "logits/chosen": -18.63412857055664, "logits/rejected": -16.833995819091797, "logps/chosen": -461.00537109375, "logps/rejected": -268.53875732421875, "loss": 0.5946, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 
0.43885523080825806, "rewards/margins": 0.21600885689258575, "rewards/rejected": 0.2228463590145111, "step": 1070 }, { "epoch": 0.05014160360276707, "grad_norm": 67.00346374511719, "learning_rate": 4.916817555751582e-07, "logits/chosen": -18.476978302001953, "logits/rejected": -17.505834579467773, "logps/chosen": -517.5316162109375, "logps/rejected": -402.5478210449219, "loss": 0.6603, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.44256195425987244, "rewards/margins": 0.07328544557094574, "rewards/rejected": 0.3692764639854431, "step": 1080 }, { "epoch": 0.0506058777102001, "grad_norm": 82.43666076660156, "learning_rate": 4.916043765572527e-07, "logits/chosen": -19.061681747436523, "logits/rejected": -18.788665771484375, "logps/chosen": -380.69207763671875, "logps/rejected": -320.7586975097656, "loss": 0.7203, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.2908056080341339, "rewards/margins": -0.04608406499028206, "rewards/rejected": 0.3368896543979645, "step": 1090 }, { "epoch": 0.05107015181763313, "grad_norm": 120.90550994873047, "learning_rate": 4.915269975393472e-07, "logits/chosen": -17.366294860839844, "logits/rejected": -17.309703826904297, "logps/chosen": -401.810546875, "logps/rejected": -377.3644714355469, "loss": 0.6731, "rewards/accuracies": 0.5, "rewards/chosen": 0.46470385789871216, "rewards/margins": 0.06673373281955719, "rewards/rejected": 0.3979701101779938, "step": 1100 }, { "epoch": 0.05153442592506616, "grad_norm": 130.58387756347656, "learning_rate": 4.914496185214417e-07, "logits/chosen": -17.0504207611084, "logits/rejected": -17.487438201904297, "logps/chosen": -396.81158447265625, "logps/rejected": -406.22052001953125, "loss": 0.6875, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.3800155520439148, "rewards/margins": 0.0173207875341177, "rewards/rejected": 0.36269479990005493, "step": 1110 }, { "epoch": 0.051998700032499186, "grad_norm": 89.05257415771484, "learning_rate": 4.913722395035362e-07, 
"logits/chosen": -17.413461685180664, "logits/rejected": -17.15909194946289, "logps/chosen": -301.05572509765625, "logps/rejected": -322.0262145996094, "loss": 0.7522, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": 0.24287764728069305, "rewards/margins": -0.11221883445978165, "rewards/rejected": 0.3550964891910553, "step": 1120 }, { "epoch": 0.052462974139932214, "grad_norm": 89.91748809814453, "learning_rate": 4.912948604856307e-07, "logits/chosen": -18.155141830444336, "logits/rejected": -17.963130950927734, "logps/chosen": -409.7181701660156, "logps/rejected": -353.7330017089844, "loss": 0.6388, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.48683223128318787, "rewards/margins": 0.11741790920495987, "rewards/rejected": 0.36941438913345337, "step": 1130 }, { "epoch": 0.05292724824736524, "grad_norm": 111.23920440673828, "learning_rate": 4.912174814677252e-07, "logits/chosen": -17.073396682739258, "logits/rejected": -18.281787872314453, "logps/chosen": -306.103271484375, "logps/rejected": -478.6170349121094, "loss": 0.7886, "rewards/accuracies": 0.10000000149011612, "rewards/chosen": 0.3608976900577545, "rewards/margins": -0.1786128133535385, "rewards/rejected": 0.5395105481147766, "step": 1140 }, { "epoch": 0.05339152235479827, "grad_norm": 75.0438003540039, "learning_rate": 4.911401024498196e-07, "logits/chosen": -17.980337142944336, "logits/rejected": -16.655241012573242, "logps/chosen": -397.44793701171875, "logps/rejected": -260.197021484375, "loss": 0.6466, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.3771774172782898, "rewards/margins": 0.10798157751560211, "rewards/rejected": 0.2691958546638489, "step": 1150 }, { "epoch": 0.0538557964622313, "grad_norm": 109.0711441040039, "learning_rate": 4.910627234319142e-07, "logits/chosen": -17.512142181396484, "logits/rejected": -16.638551712036133, "logps/chosen": -457.82427978515625, "logps/rejected": -361.37054443359375, "loss": 0.6607, "rewards/accuracies": 
0.699999988079071, "rewards/chosen": 0.42101365327835083, "rewards/margins": 0.08852417767047882, "rewards/rejected": 0.3324894607067108, "step": 1160 }, { "epoch": 0.05432007056966433, "grad_norm": 58.74284362792969, "learning_rate": 4.909853444140087e-07, "logits/chosen": -18.194076538085938, "logits/rejected": -17.582767486572266, "logps/chosen": -359.78338623046875, "logps/rejected": -392.67498779296875, "loss": 0.6731, "rewards/accuracies": 0.5, "rewards/chosen": 0.37508073449134827, "rewards/margins": 0.04393189027905464, "rewards/rejected": 0.33114880323410034, "step": 1170 }, { "epoch": 0.054784344677097356, "grad_norm": 101.74040985107422, "learning_rate": 4.909079653961032e-07, "logits/chosen": -17.517284393310547, "logits/rejected": -17.55191421508789, "logps/chosen": -387.30267333984375, "logps/rejected": -379.3492126464844, "loss": 0.6994, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.4512700140476227, "rewards/margins": -0.006128333508968353, "rewards/rejected": 0.45739832520484924, "step": 1180 }, { "epoch": 0.055248618784530384, "grad_norm": 140.97714233398438, "learning_rate": 4.908305863781977e-07, "logits/chosen": -17.583999633789062, "logits/rejected": -17.000410079956055, "logps/chosen": -409.27801513671875, "logps/rejected": -337.48846435546875, "loss": 0.653, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.41182833909988403, "rewards/margins": 0.09772951900959015, "rewards/rejected": 0.3140987753868103, "step": 1190 }, { "epoch": 0.05571289289196341, "grad_norm": 73.79524230957031, "learning_rate": 4.907532073602921e-07, "logits/chosen": -17.003162384033203, "logits/rejected": -17.131732940673828, "logps/chosen": -487.1819763183594, "logps/rejected": -528.292236328125, "loss": 0.7122, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.44859522581100464, "rewards/margins": -0.02450256608426571, "rewards/rejected": 0.47309786081314087, "step": 1200 }, { "epoch": 0.05617716699939644, "grad_norm": 
110.90730285644531, "learning_rate": 4.906758283423866e-07, "logits/chosen": -18.183298110961914, "logits/rejected": -18.04265022277832, "logps/chosen": -449.7488708496094, "logps/rejected": -442.4920959472656, "loss": 0.6792, "rewards/accuracies": 0.5, "rewards/chosen": 0.4387306272983551, "rewards/margins": 0.04366268962621689, "rewards/rejected": 0.3950679302215576, "step": 1210 }, { "epoch": 0.05664144110682947, "grad_norm": 108.99736022949219, "learning_rate": 4.905984493244812e-07, "logits/chosen": -17.516315460205078, "logits/rejected": -17.050884246826172, "logps/chosen": -524.9129638671875, "logps/rejected": -374.81658935546875, "loss": 0.6703, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.47315359115600586, "rewards/margins": 0.049197763204574585, "rewards/rejected": 0.42395585775375366, "step": 1220 }, { "epoch": 0.0571057152142625, "grad_norm": 173.1856689453125, "learning_rate": 4.905210703065756e-07, "logits/chosen": -17.420841217041016, "logits/rejected": -17.398611068725586, "logps/chosen": -385.64306640625, "logps/rejected": -437.613037109375, "loss": 0.7045, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.32371002435684204, "rewards/margins": -0.019313354045152664, "rewards/rejected": 0.3430233597755432, "step": 1230 }, { "epoch": 0.057569989321695526, "grad_norm": 44.42095184326172, "learning_rate": 4.904436912886701e-07, "logits/chosen": -17.71670913696289, "logits/rejected": -17.710689544677734, "logps/chosen": -346.7874450683594, "logps/rejected": -334.97540283203125, "loss": 0.7051, "rewards/accuracies": 0.5, "rewards/chosen": 0.3271781802177429, "rewards/margins": -0.021043844521045685, "rewards/rejected": 0.3482220768928528, "step": 1240 }, { "epoch": 0.058034263429128555, "grad_norm": 82.55894470214844, "learning_rate": 4.903663122707646e-07, "logits/chosen": -18.2600154876709, "logits/rejected": -18.441242218017578, "logps/chosen": -267.7417297363281, "logps/rejected": -209.2593536376953, "loss": 0.6663, 
"rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.2828584313392639, "rewards/margins": 0.05776556208729744, "rewards/rejected": 0.22509288787841797, "step": 1250 }, { "epoch": 0.05849853753656158, "grad_norm": 105.30679321289062, "learning_rate": 4.902889332528591e-07, "logits/chosen": -17.251405715942383, "logits/rejected": -16.310588836669922, "logps/chosen": -377.8070373535156, "logps/rejected": -255.41195678710938, "loss": 0.634, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.40218234062194824, "rewards/margins": 0.13454104959964752, "rewards/rejected": 0.2676412761211395, "step": 1260 }, { "epoch": 0.05896281164399461, "grad_norm": 135.54595947265625, "learning_rate": 4.902115542349537e-07, "logits/chosen": -17.653827667236328, "logits/rejected": -17.26091766357422, "logps/chosen": -347.11737060546875, "logps/rejected": -333.36065673828125, "loss": 0.7131, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.3079063296318054, "rewards/margins": -0.035381317138671875, "rewards/rejected": 0.3432876467704773, "step": 1270 }, { "epoch": 0.05942708575142764, "grad_norm": 59.06295394897461, "learning_rate": 4.901341752170481e-07, "logits/chosen": -17.671253204345703, "logits/rejected": -17.493602752685547, "logps/chosen": -435.2743225097656, "logps/rejected": -378.4300842285156, "loss": 0.6778, "rewards/accuracies": 0.5, "rewards/chosen": 0.43215304613113403, "rewards/margins": 0.04618232697248459, "rewards/rejected": 0.38597071170806885, "step": 1280 }, { "epoch": 0.05989135985886067, "grad_norm": 129.914794921875, "learning_rate": 4.900567961991427e-07, "logits/chosen": -18.411312103271484, "logits/rejected": -17.629985809326172, "logps/chosen": -545.1564331054688, "logps/rejected": -413.40350341796875, "loss": 0.6437, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.4860380291938782, "rewards/margins": 0.11822707951068878, "rewards/rejected": 0.3678109645843506, "step": 1290 }, { "epoch": 0.0603556339662937, 
"grad_norm": 64.55953979492188, "learning_rate": 4.89979417181237e-07, "logits/chosen": -18.201480865478516, "logits/rejected": -18.267099380493164, "logps/chosen": -442.1590881347656, "logps/rejected": -425.6426696777344, "loss": 0.6577, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.4731886386871338, "rewards/margins": 0.09285614639520645, "rewards/rejected": 0.38033246994018555, "step": 1300 }, { "epoch": 0.060819908073726725, "grad_norm": 157.9522247314453, "learning_rate": 4.899020381633316e-07, "logits/chosen": -18.17292594909668, "logits/rejected": -18.114734649658203, "logps/chosen": -421.46942138671875, "logps/rejected": -401.2269287109375, "loss": 0.6828, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.4369990825653076, "rewards/margins": 0.023072976619005203, "rewards/rejected": 0.41392606496810913, "step": 1310 }, { "epoch": 0.06128418218115975, "grad_norm": 111.87149810791016, "learning_rate": 4.898246591454261e-07, "logits/chosen": -18.209009170532227, "logits/rejected": -17.884965896606445, "logps/chosen": -552.7246704101562, "logps/rejected": -482.0240173339844, "loss": 0.6003, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.6866492033004761, "rewards/margins": 0.20658913254737854, "rewards/rejected": 0.48006004095077515, "step": 1320 }, { "epoch": 0.06174845628859278, "grad_norm": 83.57002258300781, "learning_rate": 4.897472801275206e-07, "logits/chosen": -17.27992820739746, "logits/rejected": -16.909650802612305, "logps/chosen": -364.0355224609375, "logps/rejected": -322.6195373535156, "loss": 0.7053, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.32632094621658325, "rewards/margins": -0.008329929783940315, "rewards/rejected": 0.3346509039402008, "step": 1330 }, { "epoch": 0.06221273039602581, "grad_norm": 67.44193267822266, "learning_rate": 4.896699011096151e-07, "logits/chosen": -18.1849365234375, "logits/rejected": -17.163850784301758, "logps/chosen": -541.0252075195312, 
"logps/rejected": -411.5565490722656, "loss": 0.6425, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.6356172561645508, "rewards/margins": 0.13120414316654205, "rewards/rejected": 0.5044130086898804, "step": 1340 }, { "epoch": 0.06267700450345884, "grad_norm": 63.73694610595703, "learning_rate": 4.895925220917095e-07, "logits/chosen": -18.805530548095703, "logits/rejected": -17.61177635192871, "logps/chosen": -330.14935302734375, "logps/rejected": -237.2945556640625, "loss": 0.6462, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.3876936435699463, "rewards/margins": 0.10985855013132095, "rewards/rejected": 0.27783507108688354, "step": 1350 }, { "epoch": 0.06314127861089187, "grad_norm": 113.71638488769531, "learning_rate": 4.895151430738041e-07, "logits/chosen": -17.22312355041504, "logits/rejected": -16.98116111755371, "logps/chosen": -349.89630126953125, "logps/rejected": -321.08428955078125, "loss": 0.6628, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.3899485766887665, "rewards/margins": 0.07471496611833572, "rewards/rejected": 0.31523364782333374, "step": 1360 }, { "epoch": 0.0636055527183249, "grad_norm": 66.9144287109375, "learning_rate": 4.894377640558986e-07, "logits/chosen": -18.1159725189209, "logits/rejected": -17.46994972229004, "logps/chosen": -466.557373046875, "logps/rejected": -367.79779052734375, "loss": 0.6435, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.4978964328765869, "rewards/margins": 0.1067974790930748, "rewards/rejected": 0.3910989463329315, "step": 1370 }, { "epoch": 0.06406982682575793, "grad_norm": 148.48468017578125, "learning_rate": 4.893603850379931e-07, "logits/chosen": -18.37904167175293, "logits/rejected": -17.733957290649414, "logps/chosen": -436.80670166015625, "logps/rejected": -353.7525939941406, "loss": 0.6407, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.4471588134765625, "rewards/margins": 0.12540528178215027, "rewards/rejected": 
0.32175350189208984, "step": 1380 }, { "epoch": 0.06453410093319095, "grad_norm": 88.158935546875, "learning_rate": 4.892830060200876e-07, "logits/chosen": -18.582351684570312, "logits/rejected": -17.92852020263672, "logps/chosen": -387.80072021484375, "logps/rejected": -341.65020751953125, "loss": 0.6365, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.4831295609474182, "rewards/margins": 0.13733717799186707, "rewards/rejected": 0.34579238295555115, "step": 1390 }, { "epoch": 0.06499837504062399, "grad_norm": 125.49064636230469, "learning_rate": 4.892056270021821e-07, "logits/chosen": -18.682537078857422, "logits/rejected": -18.041227340698242, "logps/chosen": -424.1793518066406, "logps/rejected": -367.87664794921875, "loss": 0.6473, "rewards/accuracies": 0.5, "rewards/chosen": 0.6480294466018677, "rewards/margins": 0.11433901637792587, "rewards/rejected": 0.5336905121803284, "step": 1400 }, { "epoch": 0.06546264914805701, "grad_norm": 91.03451538085938, "learning_rate": 4.891282479842765e-07, "logits/chosen": -17.35352325439453, "logits/rejected": -16.972949981689453, "logps/chosen": -452.4403381347656, "logps/rejected": -377.5086364746094, "loss": 0.6749, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.472752183675766, "rewards/margins": 0.06105118244886398, "rewards/rejected": 0.4117010533809662, "step": 1410 }, { "epoch": 0.06592692325549004, "grad_norm": 55.963836669921875, "learning_rate": 4.890508689663711e-07, "logits/chosen": -17.901630401611328, "logits/rejected": -17.467252731323242, "logps/chosen": -392.4836120605469, "logps/rejected": -430.11932373046875, "loss": 0.7283, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.4491400122642517, "rewards/margins": -0.051659077405929565, "rewards/rejected": 0.5007990598678589, "step": 1420 }, { "epoch": 0.06639119736292307, "grad_norm": 178.48507690429688, "learning_rate": 4.889734899484655e-07, "logits/chosen": -18.613340377807617, "logits/rejected": -16.837411880493164, 
"logps/chosen": -428.01025390625, "logps/rejected": -363.427978515625, "loss": 0.6128, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.5220120549201965, "rewards/margins": 0.18582914769649506, "rewards/rejected": 0.3361828923225403, "step": 1430 }, { "epoch": 0.0668554714703561, "grad_norm": 71.94915008544922, "learning_rate": 4.888961109305601e-07, "logits/chosen": -18.892757415771484, "logits/rejected": -18.00899314880371, "logps/chosen": -424.8414611816406, "logps/rejected": -343.976806640625, "loss": 0.7027, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.5297229290008545, "rewards/margins": -0.011091920547187328, "rewards/rejected": 0.5408148169517517, "step": 1440 }, { "epoch": 0.06731974557778912, "grad_norm": 24.35173225402832, "learning_rate": 4.888187319126546e-07, "logits/chosen": -18.655372619628906, "logits/rejected": -17.626567840576172, "logps/chosen": -349.3687438964844, "logps/rejected": -274.55255126953125, "loss": 0.6213, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.4203198552131653, "rewards/margins": 0.16299466788768768, "rewards/rejected": 0.257325142621994, "step": 1450 }, { "epoch": 0.06778401968522216, "grad_norm": 120.3768081665039, "learning_rate": 4.88741352894749e-07, "logits/chosen": -17.61910057067871, "logits/rejected": -16.70780372619629, "logps/chosen": -460.72418212890625, "logps/rejected": -367.028076171875, "loss": 0.6249, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.6056718230247498, "rewards/margins": 0.15554575622081757, "rewards/rejected": 0.4501260817050934, "step": 1460 }, { "epoch": 0.06824829379265518, "grad_norm": 80.36925506591797, "learning_rate": 4.886639738768436e-07, "logits/chosen": -17.9984188079834, "logits/rejected": -17.611173629760742, "logps/chosen": -375.24407958984375, "logps/rejected": -300.92266845703125, "loss": 0.6623, "rewards/accuracies": 0.5, "rewards/chosen": 0.5367821455001831, "rewards/margins": 0.07435762137174606, "rewards/rejected": 
0.46242451667785645, "step": 1470 }, { "epoch": 0.06871256790008821, "grad_norm": 61.28909683227539, "learning_rate": 4.88586594858938e-07, "logits/chosen": -17.401763916015625, "logits/rejected": -16.67361831665039, "logps/chosen": -418.50830078125, "logps/rejected": -326.6481018066406, "loss": 0.6299, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.548477292060852, "rewards/margins": 0.14168167114257812, "rewards/rejected": 0.4067956805229187, "step": 1480 }, { "epoch": 0.06917684200752124, "grad_norm": 78.58367919921875, "learning_rate": 4.885092158410325e-07, "logits/chosen": -17.577842712402344, "logits/rejected": -17.49172019958496, "logps/chosen": -224.306884765625, "logps/rejected": -280.45269775390625, "loss": 0.7148, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.29685765504837036, "rewards/margins": -0.03667961433529854, "rewards/rejected": 0.3335372805595398, "step": 1490 }, { "epoch": 0.06964111611495427, "grad_norm": 144.94937133789062, "learning_rate": 4.88431836823127e-07, "logits/chosen": -18.472240447998047, "logits/rejected": -18.683195114135742, "logps/chosen": -347.87554931640625, "logps/rejected": -422.7310485839844, "loss": 0.7849, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": 0.41793885827064514, "rewards/margins": -0.1595677137374878, "rewards/rejected": 0.5775065422058105, "step": 1500 }, { "epoch": 0.07010539022238729, "grad_norm": 39.57789993286133, "learning_rate": 4.883544578052215e-07, "logits/chosen": -17.29855728149414, "logits/rejected": -17.383882522583008, "logps/chosen": -288.80828857421875, "logps/rejected": -289.4036865234375, "loss": 0.6545, "rewards/accuracies": 0.5, "rewards/chosen": 0.45873337984085083, "rewards/margins": 0.0887976884841919, "rewards/rejected": 0.36993569135665894, "step": 1510 }, { "epoch": 0.07056966432982033, "grad_norm": 160.50921630859375, "learning_rate": 4.88277078787316e-07, "logits/chosen": -18.35383415222168, "logits/rejected": -17.330753326416016, 
"logps/chosen": -452.698974609375, "logps/rejected": -392.9483337402344, "loss": 0.6535, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.5456552505493164, "rewards/margins": 0.10743981599807739, "rewards/rejected": 0.438215434551239, "step": 1520 }, { "epoch": 0.07103393843725335, "grad_norm": 82.5117416381836, "learning_rate": 4.881996997694105e-07, "logits/chosen": -18.26638412475586, "logits/rejected": -16.859527587890625, "logps/chosen": -399.47100830078125, "logps/rejected": -249.86587524414062, "loss": 0.582, "rewards/accuracies": 1.0, "rewards/chosen": 0.5718865394592285, "rewards/margins": 0.2504613399505615, "rewards/rejected": 0.321425199508667, "step": 1530 }, { "epoch": 0.07149821254468638, "grad_norm": 103.46186828613281, "learning_rate": 4.88122320751505e-07, "logits/chosen": -18.383920669555664, "logits/rejected": -17.68990707397461, "logps/chosen": -475.8519592285156, "logps/rejected": -379.42694091796875, "loss": 0.6472, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.6733349561691284, "rewards/margins": 0.10404417663812637, "rewards/rejected": 0.5692907571792603, "step": 1540 }, { "epoch": 0.0719624866521194, "grad_norm": 87.12074279785156, "learning_rate": 4.880449417335995e-07, "logits/chosen": -18.634735107421875, "logits/rejected": -17.535686492919922, "logps/chosen": -444.34844970703125, "logps/rejected": -277.8131103515625, "loss": 0.6253, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.5482608079910278, "rewards/margins": 0.16822117567062378, "rewards/rejected": 0.38003963232040405, "step": 1550 }, { "epoch": 0.07242676075955244, "grad_norm": 116.55585479736328, "learning_rate": 4.87967562715694e-07, "logits/chosen": -18.72454261779785, "logits/rejected": -17.729074478149414, "logps/chosen": -460.79498291015625, "logps/rejected": -427.98876953125, "loss": 0.6727, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.5321301817893982, "rewards/margins": 0.04950423911213875, 
"rewards/rejected": 0.48262590169906616, "step": 1560 }, { "epoch": 0.07289103486698546, "grad_norm": 64.51885986328125, "learning_rate": 4.878901836977885e-07, "logits/chosen": -17.8709659576416, "logits/rejected": -17.424358367919922, "logps/chosen": -384.38836669921875, "logps/rejected": -338.8094482421875, "loss": 0.6324, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.5189118385314941, "rewards/margins": 0.138239786028862, "rewards/rejected": 0.38067203760147095, "step": 1570 }, { "epoch": 0.0733553089744185, "grad_norm": 142.8587188720703, "learning_rate": 4.87812804679883e-07, "logits/chosen": -17.808259963989258, "logits/rejected": -17.04592514038086, "logps/chosen": -452.1905822753906, "logps/rejected": -364.4619140625, "loss": 0.7016, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.45753559470176697, "rewards/margins": -0.0009962513577193022, "rewards/rejected": 0.45853179693222046, "step": 1580 }, { "epoch": 0.07381958308185152, "grad_norm": 48.808101654052734, "learning_rate": 4.877354256619775e-07, "logits/chosen": -17.744670867919922, "logits/rejected": -17.012489318847656, "logps/chosen": -423.5181579589844, "logps/rejected": -340.818603515625, "loss": 0.6422, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.5690452456474304, "rewards/margins": 0.11189093440771103, "rewards/rejected": 0.4571543335914612, "step": 1590 }, { "epoch": 0.07428385718928456, "grad_norm": 49.52970886230469, "learning_rate": 4.87658046644072e-07, "logits/chosen": -18.909317016601562, "logits/rejected": -18.5487060546875, "logps/chosen": -517.1580810546875, "logps/rejected": -484.71044921875, "loss": 0.6834, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.6741967797279358, "rewards/margins": 0.04392697662115097, "rewards/rejected": 0.6302698254585266, "step": 1600 }, { "epoch": 0.07474813129671758, "grad_norm": 67.08556365966797, "learning_rate": 4.875806676261664e-07, "logits/chosen": -18.364171981811523, "logits/rejected": 
-18.4433650970459, "logps/chosen": -293.70684814453125, "logps/rejected": -366.04315185546875, "loss": 0.7683, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": 0.49851781129837036, "rewards/margins": -0.13447466492652893, "rewards/rejected": 0.6329923868179321, "step": 1610 }, { "epoch": 0.07521240540415061, "grad_norm": 61.81727600097656, "learning_rate": 4.87503288608261e-07, "logits/chosen": -17.657337188720703, "logits/rejected": -16.559608459472656, "logps/chosen": -442.38916015625, "logps/rejected": -325.5010681152344, "loss": 0.6105, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.5975698232650757, "rewards/margins": 0.19874519109725952, "rewards/rejected": 0.39882463216781616, "step": 1620 }, { "epoch": 0.07567667951158363, "grad_norm": 79.7874526977539, "learning_rate": 4.874259095903554e-07, "logits/chosen": -18.765840530395508, "logits/rejected": -18.18459701538086, "logps/chosen": -396.16680908203125, "logps/rejected": -320.849609375, "loss": 0.6332, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.5535552501678467, "rewards/margins": 0.1301220804452896, "rewards/rejected": 0.4234333038330078, "step": 1630 }, { "epoch": 0.07614095361901667, "grad_norm": 92.83345794677734, "learning_rate": 4.8734853057245e-07, "logits/chosen": -19.331418991088867, "logits/rejected": -18.3689022064209, "logps/chosen": -448.5738830566406, "logps/rejected": -372.8524169921875, "loss": 0.6769, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.5191835165023804, "rewards/margins": 0.05061521381139755, "rewards/rejected": 0.4685683250427246, "step": 1640 }, { "epoch": 0.07660522772644969, "grad_norm": 108.24457550048828, "learning_rate": 4.872711515545445e-07, "logits/chosen": -18.134723663330078, "logits/rejected": -17.596773147583008, "logps/chosen": -468.06878662109375, "logps/rejected": -357.3435363769531, "loss": 0.6692, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.5601024031639099, "rewards/margins": 
0.06732034683227539, "rewards/rejected": 0.4927820563316345, "step": 1650 }, { "epoch": 0.07706950183388273, "grad_norm": 54.26335144042969, "learning_rate": 4.871937725366389e-07, "logits/chosen": -17.483257293701172, "logits/rejected": -16.881423950195312, "logps/chosen": -422.5616149902344, "logps/rejected": -289.44671630859375, "loss": 0.67, "rewards/accuracies": 0.5, "rewards/chosen": 0.5266963839530945, "rewards/margins": 0.059659767895936966, "rewards/rejected": 0.46703657507896423, "step": 1660 }, { "epoch": 0.07753377594131575, "grad_norm": 58.32967758178711, "learning_rate": 4.871163935187335e-07, "logits/chosen": -17.126556396484375, "logits/rejected": -16.531463623046875, "logps/chosen": -345.9886169433594, "logps/rejected": -252.66696166992188, "loss": 0.6386, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.5617309212684631, "rewards/margins": 0.16213953495025635, "rewards/rejected": 0.3995913565158844, "step": 1670 }, { "epoch": 0.07799805004874878, "grad_norm": 106.8121566772461, "learning_rate": 4.870390145008279e-07, "logits/chosen": -17.60610580444336, "logits/rejected": -17.394969940185547, "logps/chosen": -288.820556640625, "logps/rejected": -301.4974060058594, "loss": 0.6924, "rewards/accuracies": 0.5, "rewards/chosen": 0.39688175916671753, "rewards/margins": 0.024074047803878784, "rewards/rejected": 0.37280768156051636, "step": 1680 }, { "epoch": 0.0784623241561818, "grad_norm": 56.05408477783203, "learning_rate": 4.869616354829224e-07, "logits/chosen": -17.645280838012695, "logits/rejected": -17.574947357177734, "logps/chosen": -318.57330322265625, "logps/rejected": -377.03704833984375, "loss": 0.7442, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.462031751871109, "rewards/margins": -0.08593815565109253, "rewards/rejected": 0.5479698777198792, "step": 1690 }, { "epoch": 0.07892659826361484, "grad_norm": 83.4775619506836, "learning_rate": 4.868842564650169e-07, "logits/chosen": -18.291784286499023, 
"logits/rejected": -17.671274185180664, "logps/chosen": -396.4473571777344, "logps/rejected": -336.1697082519531, "loss": 0.6675, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.48140302300453186, "rewards/margins": 0.06673283874988556, "rewards/rejected": 0.4146701693534851, "step": 1700 }, { "epoch": 0.07939087237104786, "grad_norm": 65.656005859375, "learning_rate": 4.868068774471114e-07, "logits/chosen": -18.378108978271484, "logits/rejected": -16.71099090576172, "logps/chosen": -481.5398864746094, "logps/rejected": -366.8503723144531, "loss": 0.6181, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.7448965311050415, "rewards/margins": 0.1773163229227066, "rewards/rejected": 0.5675802230834961, "step": 1710 }, { "epoch": 0.0798551464784809, "grad_norm": 99.62411499023438, "learning_rate": 4.867294984292059e-07, "logits/chosen": -17.980512619018555, "logits/rejected": -16.694725036621094, "logps/chosen": -409.92083740234375, "logps/rejected": -282.7818298339844, "loss": 0.6223, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.6242854595184326, "rewards/margins": 0.17953576147556305, "rewards/rejected": 0.44474974274635315, "step": 1720 }, { "epoch": 0.08031942058591392, "grad_norm": 75.54143524169922, "learning_rate": 4.866521194113004e-07, "logits/chosen": -18.226497650146484, "logits/rejected": -16.947925567626953, "logps/chosen": -490.7179260253906, "logps/rejected": -336.7438659667969, "loss": 0.6078, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.6947551965713501, "rewards/margins": 0.21259483695030212, "rewards/rejected": 0.48216041922569275, "step": 1730 }, { "epoch": 0.08078369469334695, "grad_norm": 50.590492248535156, "learning_rate": 4.865747403933949e-07, "logits/chosen": -17.2364501953125, "logits/rejected": -16.479930877685547, "logps/chosen": -312.2391052246094, "logps/rejected": -225.59957885742188, "loss": 0.6325, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.5066078901290894, 
"rewards/margins": 0.1485365629196167, "rewards/rejected": 0.35807138681411743, "step": 1740 }, { "epoch": 0.08124796880077997, "grad_norm": 92.8788833618164, "learning_rate": 4.864973613754894e-07, "logits/chosen": -18.694761276245117, "logits/rejected": -17.650665283203125, "logps/chosen": -313.19024658203125, "logps/rejected": -282.82135009765625, "loss": 0.6685, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.43606701493263245, "rewards/margins": 0.06667280197143555, "rewards/rejected": 0.3693942129611969, "step": 1750 }, { "epoch": 0.08171224290821301, "grad_norm": 127.96248626708984, "learning_rate": 4.864199823575839e-07, "logits/chosen": -18.193296432495117, "logits/rejected": -18.174406051635742, "logps/chosen": -483.4544982910156, "logps/rejected": -472.88037109375, "loss": 0.7183, "rewards/accuracies": 0.5, "rewards/chosen": 0.5613983869552612, "rewards/margins": -0.029890218749642372, "rewards/rejected": 0.591288685798645, "step": 1760 }, { "epoch": 0.08217651701564603, "grad_norm": 90.02924346923828, "learning_rate": 4.863426033396784e-07, "logits/chosen": -18.4068660736084, "logits/rejected": -17.178241729736328, "logps/chosen": -401.6182556152344, "logps/rejected": -262.69708251953125, "loss": 0.5923, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.5424171686172485, "rewards/margins": 0.2423754632472992, "rewards/rejected": 0.30004170536994934, "step": 1770 }, { "epoch": 0.08264079112307907, "grad_norm": 76.54768371582031, "learning_rate": 4.862652243217728e-07, "logits/chosen": -18.044178009033203, "logits/rejected": -18.25263214111328, "logps/chosen": -453.02398681640625, "logps/rejected": -442.2369079589844, "loss": 0.7165, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.6511608958244324, "rewards/margins": -0.029546350240707397, "rewards/rejected": 0.6807072758674622, "step": 1780 }, { "epoch": 0.0831050652305121, "grad_norm": 80.32431030273438, "learning_rate": 4.861878453038674e-07, "logits/chosen": 
-17.670766830444336, "logits/rejected": -17.22195053100586, "logps/chosen": -236.57821655273438, "logps/rejected": -203.2546844482422, "loss": 0.6606, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.36911219358444214, "rewards/margins": 0.07924602180719376, "rewards/rejected": 0.289866179227829, "step": 1790 }, { "epoch": 0.08356933933794512, "grad_norm": 73.9656753540039, "learning_rate": 4.861104662859619e-07, "logits/chosen": -19.750873565673828, "logits/rejected": -17.260507583618164, "logps/chosen": -450.1795349121094, "logps/rejected": -295.4723205566406, "loss": 0.6005, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.5914038419723511, "rewards/margins": 0.2087981253862381, "rewards/rejected": 0.3826057016849518, "step": 1800 }, { "epoch": 0.08403361344537816, "grad_norm": 59.693721771240234, "learning_rate": 4.860330872680563e-07, "logits/chosen": -18.173782348632812, "logits/rejected": -16.962162017822266, "logps/chosen": -433.22784423828125, "logps/rejected": -298.115966796875, "loss": 0.6039, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.6209449768066406, "rewards/margins": 0.21697357296943665, "rewards/rejected": 0.40397143363952637, "step": 1810 }, { "epoch": 0.08449788755281118, "grad_norm": 23.566818237304688, "learning_rate": 4.859557082501509e-07, "logits/chosen": -17.608619689941406, "logits/rejected": -16.46822738647461, "logps/chosen": -344.6357727050781, "logps/rejected": -241.37228393554688, "loss": 0.6046, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.6288691163063049, "rewards/margins": 0.2093786746263504, "rewards/rejected": 0.41949042677879333, "step": 1820 }, { "epoch": 0.08496216166024421, "grad_norm": 36.782127380371094, "learning_rate": 4.858783292322454e-07, "logits/chosen": -17.360408782958984, "logits/rejected": -16.96047592163086, "logps/chosen": -341.0753173828125, "logps/rejected": -256.810546875, "loss": 0.6584, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 
0.5007798671722412, "rewards/margins": 0.08237586915493011, "rewards/rejected": 0.4184040129184723, "step": 1830 }, { "epoch": 0.08542643576767724, "grad_norm": 67.95398712158203, "learning_rate": 4.858009502143399e-07, "logits/chosen": -17.795991897583008, "logits/rejected": -16.883739471435547, "logps/chosen": -340.3552551269531, "logps/rejected": -189.96319580078125, "loss": 0.6157, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.5450149178504944, "rewards/margins": 0.19493715465068817, "rewards/rejected": 0.35007768869400024, "step": 1840 }, { "epoch": 0.08589070987511027, "grad_norm": 74.89543914794922, "learning_rate": 4.857235711964344e-07, "logits/chosen": -17.68325424194336, "logits/rejected": -17.145217895507812, "logps/chosen": -380.21319580078125, "logps/rejected": -248.33132934570312, "loss": 0.6209, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.6229391098022461, "rewards/margins": 0.16226279735565186, "rewards/rejected": 0.460676372051239, "step": 1850 }, { "epoch": 0.08635498398254329, "grad_norm": 109.53936767578125, "learning_rate": 4.856461921785288e-07, "logits/chosen": -17.769367218017578, "logits/rejected": -16.856521606445312, "logps/chosen": -410.172119140625, "logps/rejected": -304.9763488769531, "loss": 0.6326, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.6514410376548767, "rewards/margins": 0.16538682579994202, "rewards/rejected": 0.4860542416572571, "step": 1860 }, { "epoch": 0.08681925808997633, "grad_norm": 78.71818542480469, "learning_rate": 4.855688131606234e-07, "logits/chosen": -18.112083435058594, "logits/rejected": -17.14248275756836, "logps/chosen": -452.9837951660156, "logps/rejected": -318.2974853515625, "loss": 0.6129, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.6723317503929138, "rewards/margins": 0.22206158936023712, "rewards/rejected": 0.45027023553848267, "step": 1870 }, { "epoch": 0.08728353219740935, "grad_norm": 45.057552337646484, "learning_rate": 
4.854914341427179e-07, "logits/chosen": -17.457246780395508, "logits/rejected": -16.944238662719727, "logps/chosen": -376.3883361816406, "logps/rejected": -336.275390625, "loss": 0.6564, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.6053230166435242, "rewards/margins": 0.10686051845550537, "rewards/rejected": 0.49846261739730835, "step": 1880 }, { "epoch": 0.08774780630484239, "grad_norm": 95.40846252441406, "learning_rate": 4.854140551248123e-07, "logits/chosen": -16.695148468017578, "logits/rejected": -17.275348663330078, "logps/chosen": -272.65826416015625, "logps/rejected": -320.776611328125, "loss": 0.7269, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.4796414375305176, "rewards/margins": -0.05303662270307541, "rewards/rejected": 0.5326780676841736, "step": 1890 }, { "epoch": 0.0882120804122754, "grad_norm": 50.16180419921875, "learning_rate": 4.853366761069068e-07, "logits/chosen": -18.677167892456055, "logits/rejected": -17.031681060791016, "logps/chosen": -458.7247619628906, "logps/rejected": -301.615234375, "loss": 0.607, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.7406800985336304, "rewards/margins": 0.22383394837379456, "rewards/rejected": 0.5168461799621582, "step": 1900 }, { "epoch": 0.08867635451970844, "grad_norm": 54.33992385864258, "learning_rate": 4.852592970890013e-07, "logits/chosen": -17.78215789794922, "logits/rejected": -17.578716278076172, "logps/chosen": -403.819091796875, "logps/rejected": -407.1756896972656, "loss": 0.6708, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.7227146625518799, "rewards/margins": 0.06925858557224274, "rewards/rejected": 0.6534560918807983, "step": 1910 }, { "epoch": 0.08914062862714146, "grad_norm": 65.805908203125, "learning_rate": 4.851819180710958e-07, "logits/chosen": -17.60355567932129, "logits/rejected": -18.03778839111328, "logps/chosen": -250.0842742919922, "logps/rejected": -294.4638671875, "loss": 0.8016, "rewards/accuracies": 
0.6000000238418579, "rewards/chosen": 0.40966033935546875, "rewards/margins": -0.16366079449653625, "rewards/rejected": 0.5733211636543274, "step": 1920 }, { "epoch": 0.0896049027345745, "grad_norm": 107.53939819335938, "learning_rate": 4.851045390531904e-07, "logits/chosen": -18.134449005126953, "logits/rejected": -16.202146530151367, "logps/chosen": -472.7508239746094, "logps/rejected": -208.8966522216797, "loss": 0.553, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.7111595869064331, "rewards/margins": 0.3357184827327728, "rewards/rejected": 0.3754410445690155, "step": 1930 }, { "epoch": 0.09006917684200752, "grad_norm": 40.819305419921875, "learning_rate": 4.850271600352848e-07, "logits/chosen": -18.5687198638916, "logits/rejected": -17.733400344848633, "logps/chosen": -465.13677978515625, "logps/rejected": -361.72821044921875, "loss": 0.5887, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.7880239486694336, "rewards/margins": 0.24639682471752167, "rewards/rejected": 0.5416271090507507, "step": 1940 }, { "epoch": 0.09053345094944056, "grad_norm": 63.63543701171875, "learning_rate": 4.849497810173793e-07, "logits/chosen": -17.793001174926758, "logits/rejected": -17.407913208007812, "logps/chosen": -473.5541076660156, "logps/rejected": -445.09326171875, "loss": 0.644, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.7864596247673035, "rewards/margins": 0.1213640570640564, "rewards/rejected": 0.6650956273078918, "step": 1950 }, { "epoch": 0.09099772505687358, "grad_norm": 190.38125610351562, "learning_rate": 4.848724019994738e-07, "logits/chosen": -18.898963928222656, "logits/rejected": -17.882858276367188, "logps/chosen": -472.2024841308594, "logps/rejected": -395.65203857421875, "loss": 0.6997, "rewards/accuracies": 0.5, "rewards/chosen": 0.6758028864860535, "rewards/margins": -0.0072074830532073975, "rewards/rejected": 0.6830103993415833, "step": 1960 }, { "epoch": 0.09146199916430661, "grad_norm": 66.22232055664062, 
"learning_rate": 4.847950229815683e-07, "logits/chosen": -18.509891510009766, "logits/rejected": -17.204824447631836, "logps/chosen": -502.33111572265625, "logps/rejected": -335.5827941894531, "loss": 0.5741, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.9399001002311707, "rewards/margins": 0.2890242040157318, "rewards/rejected": 0.6508758664131165, "step": 1970 }, { "epoch": 0.09192627327173963, "grad_norm": 117.98173522949219, "learning_rate": 4.847176439636628e-07, "logits/chosen": -17.664836883544922, "logits/rejected": -17.842824935913086, "logps/chosen": -321.25897216796875, "logps/rejected": -303.2997741699219, "loss": 0.6834, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.5545287132263184, "rewards/margins": 0.035373471677303314, "rewards/rejected": 0.5191552639007568, "step": 1980 }, { "epoch": 0.09239054737917267, "grad_norm": 140.4456329345703, "learning_rate": 4.846402649457573e-07, "logits/chosen": -18.050159454345703, "logits/rejected": -17.933231353759766, "logps/chosen": -414.63751220703125, "logps/rejected": -377.9574890136719, "loss": 0.6592, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.6445776224136353, "rewards/margins": 0.08420532941818237, "rewards/rejected": 0.5603722929954529, "step": 1990 }, { "epoch": 0.09285482148660569, "grad_norm": 44.878719329833984, "learning_rate": 4.845628859278518e-07, "logits/chosen": -16.972450256347656, "logits/rejected": -17.167776107788086, "logps/chosen": -313.7131652832031, "logps/rejected": -321.7476501464844, "loss": 0.6991, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.588722825050354, "rewards/margins": 0.019326109439134598, "rewards/rejected": 0.5693967342376709, "step": 2000 }, { "epoch": 0.09331909559403873, "grad_norm": 54.33668899536133, "learning_rate": 4.844855069099462e-07, "logits/chosen": -17.698705673217773, "logits/rejected": -17.056135177612305, "logps/chosen": -290.8943786621094, "logps/rejected": -290.42529296875, "loss": 
0.6636, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.5628340244293213, "rewards/margins": 0.09100433439016342, "rewards/rejected": 0.4718296527862549, "step": 2010 }, { "epoch": 0.09378336970147175, "grad_norm": 77.43334197998047, "learning_rate": 4.844081278920408e-07, "logits/chosen": -18.017467498779297, "logits/rejected": -16.009937286376953, "logps/chosen": -369.42022705078125, "logps/rejected": -224.2716064453125, "loss": 0.4883, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.8709189295768738, "rewards/margins": 0.5251386761665344, "rewards/rejected": 0.34578031301498413, "step": 2020 }, { "epoch": 0.09424764380890478, "grad_norm": 60.092323303222656, "learning_rate": 4.843307488741353e-07, "logits/chosen": -17.830522537231445, "logits/rejected": -17.07175064086914, "logps/chosen": -301.66192626953125, "logps/rejected": -251.98275756835938, "loss": 0.658, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.680157482624054, "rewards/margins": 0.10717363655567169, "rewards/rejected": 0.5729838609695435, "step": 2030 }, { "epoch": 0.0947119179163378, "grad_norm": 124.5689926147461, "learning_rate": 4.842533698562298e-07, "logits/chosen": -17.617006301879883, "logits/rejected": -16.776235580444336, "logps/chosen": -505.61798095703125, "logps/rejected": -400.3490295410156, "loss": 0.5856, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.8221436738967896, "rewards/margins": 0.24489648640155792, "rewards/rejected": 0.5772470235824585, "step": 2040 }, { "epoch": 0.09517619202377084, "grad_norm": 90.19288635253906, "learning_rate": 4.841759908383243e-07, "logits/chosen": -19.065584182739258, "logits/rejected": -19.091144561767578, "logps/chosen": -409.0559387207031, "logps/rejected": -369.16912841796875, "loss": 0.7123, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.6259023547172546, "rewards/margins": -0.005338144488632679, "rewards/rejected": 0.6312404870986938, "step": 2050 }, { "epoch": 
0.09564046613120386, "grad_norm": 65.63469696044922, "learning_rate": 4.840986118204187e-07, "logits/chosen": -18.2860107421875, "logits/rejected": -17.952938079833984, "logps/chosen": -386.3915100097656, "logps/rejected": -328.47296142578125, "loss": 0.6695, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.7082452774047852, "rewards/margins": 0.07683010399341583, "rewards/rejected": 0.6314151287078857, "step": 2060 }, { "epoch": 0.0961047402386369, "grad_norm": 155.5994110107422, "learning_rate": 4.840212328025132e-07, "logits/chosen": -17.648813247680664, "logits/rejected": -18.203105926513672, "logps/chosen": -396.8884582519531, "logps/rejected": -535.1634521484375, "loss": 0.7985, "rewards/accuracies": 0.5, "rewards/chosen": 0.5916995406150818, "rewards/margins": -0.1718687266111374, "rewards/rejected": 0.7635682821273804, "step": 2070 }, { "epoch": 0.09656901434606992, "grad_norm": 59.42793273925781, "learning_rate": 4.839438537846078e-07, "logits/chosen": -19.53432273864746, "logits/rejected": -18.31902503967285, "logps/chosen": -323.88507080078125, "logps/rejected": -266.720703125, "loss": 0.591, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.7296180725097656, "rewards/margins": 0.2342008650302887, "rewards/rejected": 0.49541720747947693, "step": 2080 }, { "epoch": 0.09703328845350295, "grad_norm": 39.468910217285156, "learning_rate": 4.838664747667022e-07, "logits/chosen": -18.22380828857422, "logits/rejected": -17.17070198059082, "logps/chosen": -442.52764892578125, "logps/rejected": -303.20135498046875, "loss": 0.5984, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.914175808429718, "rewards/margins": 0.26631778478622437, "rewards/rejected": 0.6478579640388489, "step": 2090 }, { "epoch": 0.09749756256093597, "grad_norm": 63.66200637817383, "learning_rate": 4.837890957487967e-07, "logits/chosen": -18.532306671142578, "logits/rejected": -17.354076385498047, "logps/chosen": -468.48199462890625, "logps/rejected": 
-358.21832275390625, "loss": 0.6492, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.9939544796943665, "rewards/margins": 0.15653808414936066, "rewards/rejected": 0.8374164700508118, "step": 2100 }, { "epoch": 0.09796183666836901, "grad_norm": 78.36710357666016, "learning_rate": 4.837117167308912e-07, "logits/chosen": -18.19797134399414, "logits/rejected": -18.072956085205078, "logps/chosen": -435.4071350097656, "logps/rejected": -439.64959716796875, "loss": 0.6751, "rewards/accuracies": 0.5, "rewards/chosen": 0.7301149368286133, "rewards/margins": 0.06131943315267563, "rewards/rejected": 0.6687954664230347, "step": 2110 }, { "epoch": 0.09842611077580203, "grad_norm": 53.99755096435547, "learning_rate": 4.836343377129857e-07, "logits/chosen": -18.04253387451172, "logits/rejected": -16.90200424194336, "logps/chosen": -403.41259765625, "logps/rejected": -312.15447998046875, "loss": 0.6041, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.7076719999313354, "rewards/margins": 0.22160449624061584, "rewards/rejected": 0.4860674738883972, "step": 2120 }, { "epoch": 0.09889038488323507, "grad_norm": 60.67346954345703, "learning_rate": 4.835569586950803e-07, "logits/chosen": -17.69927406311035, "logits/rejected": -17.509014129638672, "logps/chosen": -380.77777099609375, "logps/rejected": -372.3981018066406, "loss": 0.6747, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.7827194333076477, "rewards/margins": 0.0652468204498291, "rewards/rejected": 0.7174726128578186, "step": 2130 }, { "epoch": 0.09935465899066809, "grad_norm": 95.74467468261719, "learning_rate": 4.834795796771747e-07, "logits/chosen": -17.9704647064209, "logits/rejected": -17.339153289794922, "logps/chosen": -397.23712158203125, "logps/rejected": -330.6417541503906, "loss": 0.6245, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.8199732899665833, "rewards/margins": 0.1479063183069229, "rewards/rejected": 0.6720669269561768, "step": 2140 }, { "epoch": 
0.09981893309810112, "grad_norm": 47.030704498291016, "learning_rate": 4.834022006592692e-07, "logits/chosen": -18.030792236328125, "logits/rejected": -17.52434730529785, "logps/chosen": -444.32196044921875, "logps/rejected": -292.970947265625, "loss": 0.5573, "rewards/accuracies": 1.0, "rewards/chosen": 0.854641318321228, "rewards/margins": 0.311251163482666, "rewards/rejected": 0.5433902740478516, "step": 2150 }, { "epoch": 0.10028320720553414, "grad_norm": 140.5346221923828, "learning_rate": 4.833248216413637e-07, "logits/chosen": -17.800134658813477, "logits/rejected": -17.709041595458984, "logps/chosen": -506.1802673339844, "logps/rejected": -494.30999755859375, "loss": 0.6932, "rewards/accuracies": 0.5, "rewards/chosen": 0.8517863154411316, "rewards/margins": 0.023613814264535904, "rewards/rejected": 0.828172504901886, "step": 2160 }, { "epoch": 0.10074748131296718, "grad_norm": 98.68944549560547, "learning_rate": 4.832474426234582e-07, "logits/chosen": -19.304662704467773, "logits/rejected": -18.998607635498047, "logps/chosen": -349.1988525390625, "logps/rejected": -356.017578125, "loss": 0.7487, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.5662924647331238, "rewards/margins": -0.09928234666585922, "rewards/rejected": 0.665574848651886, "step": 2170 }, { "epoch": 0.1012117554204002, "grad_norm": 68.68348693847656, "learning_rate": 4.831700636055527e-07, "logits/chosen": -17.157506942749023, "logits/rejected": -16.219327926635742, "logps/chosen": -421.90447998046875, "logps/rejected": -337.35565185546875, "loss": 0.6541, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.7662236094474792, "rewards/margins": 0.10680526494979858, "rewards/rejected": 0.6594182848930359, "step": 2180 }, { "epoch": 0.10167602952783324, "grad_norm": 105.27812194824219, "learning_rate": 4.830926845876472e-07, "logits/chosen": -18.371929168701172, "logits/rejected": -17.521154403686523, "logps/chosen": -472.67376708984375, "logps/rejected": 
-396.5506591796875, "loss": 0.6644, "rewards/accuracies": 0.5, "rewards/chosen": 0.8455559015274048, "rewards/margins": 0.09095527976751328, "rewards/rejected": 0.7546006441116333, "step": 2190 }, { "epoch": 0.10214030363526626, "grad_norm": 87.99860382080078, "learning_rate": 4.830153055697417e-07, "logits/chosen": -17.66928482055664, "logits/rejected": -17.10256004333496, "logps/chosen": -468.38946533203125, "logps/rejected": -393.57379150390625, "loss": 0.7279, "rewards/accuracies": 0.5, "rewards/chosen": 0.9999386072158813, "rewards/margins": 0.047902870923280716, "rewards/rejected": 0.9520357251167297, "step": 2200 }, { "epoch": 0.1026045777426993, "grad_norm": 125.83492279052734, "learning_rate": 4.829379265518361e-07, "logits/chosen": -17.88067626953125, "logits/rejected": -16.482975006103516, "logps/chosen": -561.5755615234375, "logps/rejected": -403.8124694824219, "loss": 0.5511, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.1470363140106201, "rewards/margins": 0.4363330006599426, "rewards/rejected": 0.7107033729553223, "step": 2210 }, { "epoch": 0.10306885185013231, "grad_norm": 61.61265563964844, "learning_rate": 4.828605475339307e-07, "logits/chosen": -17.77659797668457, "logits/rejected": -17.658945083618164, "logps/chosen": -287.9621276855469, "logps/rejected": -333.7281799316406, "loss": 0.7225, "rewards/accuracies": 0.5, "rewards/chosen": 0.6634413599967957, "rewards/margins": -0.0232247207313776, "rewards/rejected": 0.6866661310195923, "step": 2220 }, { "epoch": 0.10353312595756535, "grad_norm": 125.81132507324219, "learning_rate": 4.827831685160252e-07, "logits/chosen": -18.503496170043945, "logits/rejected": -16.916393280029297, "logps/chosen": -491.935791015625, "logps/rejected": -286.14447021484375, "loss": 0.5172, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.0039379596710205, "rewards/margins": 0.4491646885871887, "rewards/rejected": 0.5547733306884766, "step": 2230 }, { "epoch": 0.10399740006499837, "grad_norm": 
53.58476257324219, "learning_rate": 4.827057894981197e-07, "logits/chosen": -18.15070915222168, "logits/rejected": -16.985034942626953, "logps/chosen": -384.81976318359375, "logps/rejected": -245.6180419921875, "loss": 0.5293, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.8956981897354126, "rewards/margins": 0.4259887635707855, "rewards/rejected": 0.4697093367576599, "step": 2240 }, { "epoch": 0.1044616741724314, "grad_norm": 112.8512191772461, "learning_rate": 4.826284104802142e-07, "logits/chosen": -18.445804595947266, "logits/rejected": -17.737274169921875, "logps/chosen": -363.4771423339844, "logps/rejected": -245.98641967773438, "loss": 0.5979, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.7418502569198608, "rewards/margins": 0.2232627421617508, "rewards/rejected": 0.5185874700546265, "step": 2250 }, { "epoch": 0.10492594827986443, "grad_norm": 74.39509582519531, "learning_rate": 4.825510314623086e-07, "logits/chosen": -18.99879264831543, "logits/rejected": -18.22683334350586, "logps/chosen": -366.16326904296875, "logps/rejected": -335.0374450683594, "loss": 0.6546, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.753414511680603, "rewards/margins": 0.10318901389837265, "rewards/rejected": 0.6502255201339722, "step": 2260 }, { "epoch": 0.10539022238729746, "grad_norm": 70.75374603271484, "learning_rate": 4.824736524444031e-07, "logits/chosen": -17.924072265625, "logits/rejected": -17.872100830078125, "logps/chosen": -341.3895568847656, "logps/rejected": -377.9072265625, "loss": 0.6935, "rewards/accuracies": 0.5, "rewards/chosen": 0.7197753190994263, "rewards/margins": 0.0025176629424095154, "rewards/rejected": 0.7172577977180481, "step": 2270 }, { "epoch": 0.10585449649473049, "grad_norm": 81.46082305908203, "learning_rate": 4.823962734264977e-07, "logits/chosen": -18.614233016967773, "logits/rejected": -17.71453857421875, "logps/chosen": -427.98651123046875, "logps/rejected": -355.6021423339844, "loss": 0.558, 
"rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.8453369140625, "rewards/margins": 0.32714951038360596, "rewards/rejected": 0.518187403678894, "step": 2280 }, { "epoch": 0.10631877060216352, "grad_norm": 116.47297668457031, "learning_rate": 4.823188944085921e-07, "logits/chosen": -18.397390365600586, "logits/rejected": -17.870250701904297, "logps/chosen": -382.6867370605469, "logps/rejected": -309.4893493652344, "loss": 0.6544, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.9727994799613953, "rewards/margins": 0.11257584393024445, "rewards/rejected": 0.860223650932312, "step": 2290 }, { "epoch": 0.10678304470959654, "grad_norm": 82.68609619140625, "learning_rate": 4.822415153906866e-07, "logits/chosen": -17.84885597229004, "logits/rejected": -17.0738468170166, "logps/chosen": -449.9215393066406, "logps/rejected": -325.0715637207031, "loss": 0.5881, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.0406711101531982, "rewards/margins": 0.3033235967159271, "rewards/rejected": 0.7373474836349487, "step": 2300 }, { "epoch": 0.10724731881702958, "grad_norm": 67.40172576904297, "learning_rate": 4.821641363727812e-07, "logits/chosen": -17.761226654052734, "logits/rejected": -16.667800903320312, "logps/chosen": -355.2383728027344, "logps/rejected": -247.6430206298828, "loss": 0.6547, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.5675383806228638, "rewards/margins": 0.10901045799255371, "rewards/rejected": 0.45852789282798767, "step": 2310 }, { "epoch": 0.1077115929244626, "grad_norm": 122.42217254638672, "learning_rate": 4.820867573548756e-07, "logits/chosen": -17.956510543823242, "logits/rejected": -16.905887603759766, "logps/chosen": -529.1190795898438, "logps/rejected": -353.32122802734375, "loss": 0.5126, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.0442266464233398, "rewards/margins": 0.4557298719882965, "rewards/rejected": 0.5884968042373657, "step": 2320 }, { "epoch": 0.10817586703189563, 
"grad_norm": 132.45359802246094, "learning_rate": 4.820093783369702e-07, "logits/chosen": -18.91446304321289, "logits/rejected": -18.26141357421875, "logps/chosen": -502.7080993652344, "logps/rejected": -423.0199279785156, "loss": 0.5668, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.0038896799087524, "rewards/margins": 0.2925027012825012, "rewards/rejected": 0.7113870978355408, "step": 2330 }, { "epoch": 0.10864014113932866, "grad_norm": 79.14881134033203, "learning_rate": 4.819319993190646e-07, "logits/chosen": -18.250789642333984, "logits/rejected": -17.249446868896484, "logps/chosen": -435.1739196777344, "logps/rejected": -389.5520935058594, "loss": 0.6389, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.8666135668754578, "rewards/margins": 0.14137999713420868, "rewards/rejected": 0.7252336144447327, "step": 2340 }, { "epoch": 0.10910441524676169, "grad_norm": 110.39582824707031, "learning_rate": 4.818546203011591e-07, "logits/chosen": -18.548419952392578, "logits/rejected": -17.059154510498047, "logps/chosen": -351.40545654296875, "logps/rejected": -247.23526000976562, "loss": 0.6114, "rewards/accuracies": 0.5, "rewards/chosen": 0.8149937391281128, "rewards/margins": 0.24041399359703064, "rewards/rejected": 0.5745797753334045, "step": 2350 }, { "epoch": 0.10956868935419471, "grad_norm": 100.79107666015625, "learning_rate": 4.817772412832535e-07, "logits/chosen": -18.542999267578125, "logits/rejected": -18.21953582763672, "logps/chosen": -390.933349609375, "logps/rejected": -327.44549560546875, "loss": 0.6453, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.9583314657211304, "rewards/margins": 0.1342737078666687, "rewards/rejected": 0.8240577578544617, "step": 2360 }, { "epoch": 0.11003296346162775, "grad_norm": 139.94354248046875, "learning_rate": 4.816998622653481e-07, "logits/chosen": -19.022884368896484, "logits/rejected": -17.25049591064453, "logps/chosen": -483.370849609375, "logps/rejected": -365.0730895996094, 
"loss": 0.5636, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.1094160079956055, "rewards/margins": 0.37023693323135376, "rewards/rejected": 0.739179253578186, "step": 2370 }, { "epoch": 0.11049723756906077, "grad_norm": 155.689453125, "learning_rate": 4.816224832474426e-07, "logits/chosen": -18.341346740722656, "logits/rejected": -18.491275787353516, "logps/chosen": -497.0986328125, "logps/rejected": -430.9869079589844, "loss": 0.7243, "rewards/accuracies": 0.5, "rewards/chosen": 0.8660691380500793, "rewards/margins": -0.02304094098508358, "rewards/rejected": 0.8891100883483887, "step": 2380 }, { "epoch": 0.1109615116764938, "grad_norm": 148.50357055664062, "learning_rate": 4.815451042295371e-07, "logits/chosen": -18.180395126342773, "logits/rejected": -17.309289932250977, "logps/chosen": -490.78668212890625, "logps/rejected": -400.1272888183594, "loss": 0.6728, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.9305247068405151, "rewards/margins": 0.12130211293697357, "rewards/rejected": 0.8092226982116699, "step": 2390 }, { "epoch": 0.11142578578392683, "grad_norm": 126.406982421875, "learning_rate": 4.814677252116316e-07, "logits/chosen": -19.58155632019043, "logits/rejected": -17.198787689208984, "logps/chosen": -427.54327392578125, "logps/rejected": -307.95367431640625, "loss": 0.5211, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.1728274822235107, "rewards/margins": 0.502934455871582, "rewards/rejected": 0.6698930263519287, "step": 2400 }, { "epoch": 0.11189005989135986, "grad_norm": 114.98320007324219, "learning_rate": 4.81390346193726e-07, "logits/chosen": -17.515888214111328, "logits/rejected": -17.683483123779297, "logps/chosen": -455.2509765625, "logps/rejected": -421.0936584472656, "loss": 0.7485, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.9040152430534363, "rewards/margins": -0.09233381599187851, "rewards/rejected": 0.996349036693573, "step": 2410 }, { "epoch": 0.11235433399879288, 
"grad_norm": 145.29708862304688, "learning_rate": 4.813129671758206e-07, "logits/chosen": -18.971933364868164, "logits/rejected": -18.931621551513672, "logps/chosen": -538.6975708007812, "logps/rejected": -510.9749450683594, "loss": 0.6767, "rewards/accuracies": 0.5, "rewards/chosen": 1.0524933338165283, "rewards/margins": 0.07617338001728058, "rewards/rejected": 0.9763199090957642, "step": 2420 }, { "epoch": 0.11281860810622592, "grad_norm": 74.05197143554688, "learning_rate": 4.812355881579151e-07, "logits/chosen": -18.641586303710938, "logits/rejected": -18.365581512451172, "logps/chosen": -389.34808349609375, "logps/rejected": -351.9463195800781, "loss": 0.6533, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.8913818597793579, "rewards/margins": 0.12553253769874573, "rewards/rejected": 0.7658493518829346, "step": 2430 }, { "epoch": 0.11328288221365894, "grad_norm": 107.52838897705078, "learning_rate": 4.811582091400096e-07, "logits/chosen": -18.713407516479492, "logits/rejected": -17.91246223449707, "logps/chosen": -388.77301025390625, "logps/rejected": -316.64752197265625, "loss": 0.6207, "rewards/accuracies": 0.5, "rewards/chosen": 0.9112740755081177, "rewards/margins": 0.19196514785289764, "rewards/rejected": 0.7193089723587036, "step": 2440 }, { "epoch": 0.11374715632109197, "grad_norm": 71.60374450683594, "learning_rate": 4.81080830122104e-07, "logits/chosen": -18.171142578125, "logits/rejected": -17.022499084472656, "logps/chosen": -339.421875, "logps/rejected": -224.1710662841797, "loss": 0.5517, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.013153314590454, "rewards/margins": 0.3358451724052429, "rewards/rejected": 0.6773080229759216, "step": 2450 }, { "epoch": 0.114211430428525, "grad_norm": 56.041236877441406, "learning_rate": 4.810034511041986e-07, "logits/chosen": -18.627260208129883, "logits/rejected": -18.20598602294922, "logps/chosen": -370.0923767089844, "logps/rejected": -371.24713134765625, "loss": 0.7466, 
"rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.783030092716217, "rewards/margins": -0.0812315121293068, "rewards/rejected": 0.8642617464065552, "step": 2460 }, { "epoch": 0.11467570453595803, "grad_norm": 112.46121978759766, "learning_rate": 4.80926072086293e-07, "logits/chosen": -18.01566505432129, "logits/rejected": -18.729736328125, "logps/chosen": -439.43280029296875, "logps/rejected": -487.18701171875, "loss": 0.7313, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.9882141351699829, "rewards/margins": -0.014603453688323498, "rewards/rejected": 1.0028173923492432, "step": 2470 }, { "epoch": 0.11513997864339105, "grad_norm": 75.54328918457031, "learning_rate": 4.808486930683876e-07, "logits/chosen": -19.685848236083984, "logits/rejected": -18.092805862426758, "logps/chosen": -519.8130493164062, "logps/rejected": -339.397216796875, "loss": 0.5495, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.1650800704956055, "rewards/margins": 0.4103530943393707, "rewards/rejected": 0.7547270059585571, "step": 2480 }, { "epoch": 0.11560425275082409, "grad_norm": 83.9076919555664, "learning_rate": 4.80771314050482e-07, "logits/chosen": -18.674510955810547, "logits/rejected": -17.501386642456055, "logps/chosen": -485.7225646972656, "logps/rejected": -358.1546630859375, "loss": 0.5778, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.2130606174468994, "rewards/margins": 0.3164083659648895, "rewards/rejected": 0.8966522216796875, "step": 2490 }, { "epoch": 0.11606852685825711, "grad_norm": 100.23143005371094, "learning_rate": 4.806939350325765e-07, "logits/chosen": -18.035375595092773, "logits/rejected": -17.87656021118164, "logps/chosen": -330.4158935546875, "logps/rejected": -281.97711181640625, "loss": 0.7024, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.859287440776825, "rewards/margins": 0.04603543132543564, "rewards/rejected": 0.8132519721984863, "step": 2500 }, { "epoch": 0.11653280096569014, 
"grad_norm": 45.48858642578125, "learning_rate": 4.806242939164616e-07, "logits/chosen": -18.045263290405273, "logits/rejected": -17.932270050048828, "logps/chosen": -412.5843811035156, "logps/rejected": -420.39007568359375, "loss": 0.7071, "rewards/accuracies": 0.5, "rewards/chosen": 0.8644174337387085, "rewards/margins": -0.00045988560304977, "rewards/rejected": 0.8648773431777954, "step": 2510 }, { "epoch": 0.11699707507312317, "grad_norm": 144.14578247070312, "learning_rate": 4.805546528003466e-07, "logits/chosen": -19.28897476196289, "logits/rejected": -18.633846282958984, "logps/chosen": -549.1910400390625, "logps/rejected": -416.89178466796875, "loss": 0.6419, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.9371858835220337, "rewards/margins": 0.16854164004325867, "rewards/rejected": 0.7686442136764526, "step": 2520 }, { "epoch": 0.1174613491805562, "grad_norm": 147.4124755859375, "learning_rate": 4.804772737824411e-07, "logits/chosen": -18.64548683166504, "logits/rejected": -17.783855438232422, "logps/chosen": -342.70025634765625, "logps/rejected": -235.15835571289062, "loss": 0.6452, "rewards/accuracies": 0.5, "rewards/chosen": 0.8711463809013367, "rewards/margins": 0.17704284191131592, "rewards/rejected": 0.6941035985946655, "step": 2530 }, { "epoch": 0.11792562328798922, "grad_norm": 75.02429962158203, "learning_rate": 4.803998947645357e-07, "logits/chosen": -19.121173858642578, "logits/rejected": -18.288373947143555, "logps/chosen": -420.7533264160156, "logps/rejected": -304.07861328125, "loss": 0.5797, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.011380910873413, "rewards/margins": 0.30392593145370483, "rewards/rejected": 0.7074549794197083, "step": 2540 }, { "epoch": 0.11838989739542226, "grad_norm": 144.54110717773438, "learning_rate": 4.803225157466301e-07, "logits/chosen": -18.48688507080078, "logits/rejected": -18.07688331604004, "logps/chosen": -356.84014892578125, "logps/rejected": -308.43426513671875, "loss": 
0.6254, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.8099981546401978, "rewards/margins": 0.16664817929267883, "rewards/rejected": 0.6433498859405518, "step": 2550 }, { "epoch": 0.11885417150285528, "grad_norm": 81.31620788574219, "learning_rate": 4.802451367287246e-07, "logits/chosen": -18.396833419799805, "logits/rejected": -17.54888343811035, "logps/chosen": -470.0655212402344, "logps/rejected": -375.47735595703125, "loss": 0.6172, "rewards/accuracies": 0.5, "rewards/chosen": 1.0171281099319458, "rewards/margins": 0.23595204949378967, "rewards/rejected": 0.781175971031189, "step": 2560 }, { "epoch": 0.11931844561028832, "grad_norm": 24.206575393676758, "learning_rate": 4.801677577108192e-07, "logits/chosen": -18.527427673339844, "logits/rejected": -17.89687728881836, "logps/chosen": -425.4140625, "logps/rejected": -321.5638122558594, "loss": 0.6008, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.974685549736023, "rewards/margins": 0.2710338830947876, "rewards/rejected": 0.7036517262458801, "step": 2570 }, { "epoch": 0.11978271971772134, "grad_norm": 89.27106475830078, "learning_rate": 4.800903786929136e-07, "logits/chosen": -18.835731506347656, "logits/rejected": -17.326528549194336, "logps/chosen": -465.90496826171875, "logps/rejected": -339.42340087890625, "loss": 0.6374, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.9148057103157043, "rewards/margins": 0.15657725930213928, "rewards/rejected": 0.7582284808158875, "step": 2580 }, { "epoch": 0.12024699382515437, "grad_norm": 92.97062683105469, "learning_rate": 4.800129996750081e-07, "logits/chosen": -18.787553787231445, "logits/rejected": -17.83934211730957, "logps/chosen": -455.18157958984375, "logps/rejected": -387.33758544921875, "loss": 0.6532, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.9847016334533691, "rewards/margins": 0.09792238473892212, "rewards/rejected": 0.8867793083190918, "step": 2590 }, { "epoch": 0.1207112679325874, "grad_norm": 
39.69942092895508, "learning_rate": 4.799356206571026e-07, "logits/chosen": -18.161819458007812, "logits/rejected": -18.179943084716797, "logps/chosen": -418.99200439453125, "logps/rejected": -333.5869445800781, "loss": 0.6561, "rewards/accuracies": 0.5, "rewards/chosen": 1.0589808225631714, "rewards/margins": 0.1636609584093094, "rewards/rejected": 0.895319938659668, "step": 2600 }, { "epoch": 0.12117554204002043, "grad_norm": 32.538665771484375, "learning_rate": 4.798582416391971e-07, "logits/chosen": -17.56277084350586, "logits/rejected": -16.191646575927734, "logps/chosen": -368.65899658203125, "logps/rejected": -206.58737182617188, "loss": 0.6014, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.8586681485176086, "rewards/margins": 0.3015410900115967, "rewards/rejected": 0.557127058506012, "step": 2610 }, { "epoch": 0.12163981614745345, "grad_norm": 143.63865661621094, "learning_rate": 4.797808626212915e-07, "logits/chosen": -17.756027221679688, "logits/rejected": -17.151424407958984, "logps/chosen": -369.4128723144531, "logps/rejected": -322.63543701171875, "loss": 0.6347, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.9490043520927429, "rewards/margins": 0.19484330713748932, "rewards/rejected": 0.75416100025177, "step": 2620 }, { "epoch": 0.12210409025488649, "grad_norm": 107.5672836303711, "learning_rate": 4.797034836033861e-07, "logits/chosen": -18.190540313720703, "logits/rejected": -18.133188247680664, "logps/chosen": -430.38568115234375, "logps/rejected": -471.20208740234375, "loss": 0.8264, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.008126974105835, "rewards/margins": -0.14130893349647522, "rewards/rejected": 1.1494357585906982, "step": 2630 }, { "epoch": 0.1225683643623195, "grad_norm": 94.03935241699219, "learning_rate": 4.796261045854806e-07, "logits/chosen": -18.892314910888672, "logits/rejected": -18.062007904052734, "logps/chosen": -390.1009216308594, "logps/rejected": -406.29840087890625, "loss": 
0.6546, "rewards/accuracies": 0.5, "rewards/chosen": 1.0433499813079834, "rewards/margins": 0.16543038189411163, "rewards/rejected": 0.8779194951057434, "step": 2640 }, { "epoch": 0.12303263846975254, "grad_norm": 103.59403228759766, "learning_rate": 4.795487255675751e-07, "logits/chosen": -18.42327880859375, "logits/rejected": -18.03025245666504, "logps/chosen": -399.25323486328125, "logps/rejected": -348.754638671875, "loss": 0.6104, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.0470367670059204, "rewards/margins": 0.24715618789196014, "rewards/rejected": 0.7998805642127991, "step": 2650 }, { "epoch": 0.12349691257718556, "grad_norm": 33.175899505615234, "learning_rate": 4.794713465496696e-07, "logits/chosen": -18.103967666625977, "logits/rejected": -16.511768341064453, "logps/chosen": -389.2559509277344, "logps/rejected": -244.212646484375, "loss": 0.5449, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.9273412823677063, "rewards/margins": 0.4197044372558594, "rewards/rejected": 0.5076368451118469, "step": 2660 }, { "epoch": 0.1239611866846186, "grad_norm": 78.49510192871094, "learning_rate": 4.79393967531764e-07, "logits/chosen": -17.764699935913086, "logits/rejected": -17.16490364074707, "logps/chosen": -303.8987121582031, "logps/rejected": -222.52841186523438, "loss": 0.6157, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.7220200300216675, "rewards/margins": 0.20523333549499512, "rewards/rejected": 0.5167866945266724, "step": 2670 }, { "epoch": 0.12442546079205162, "grad_norm": 89.49657440185547, "learning_rate": 4.793165885138585e-07, "logits/chosen": -18.228792190551758, "logits/rejected": -17.445606231689453, "logps/chosen": -382.262451171875, "logps/rejected": -275.93798828125, "loss": 0.5949, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.9658845067024231, "rewards/margins": 0.28541669249534607, "rewards/rejected": 0.6804677248001099, "step": 2680 }, { "epoch": 0.12488973489948466, "grad_norm": 
111.97955322265625, "learning_rate": 4.792392094959531e-07, "logits/chosen": -18.448387145996094, "logits/rejected": -17.103633880615234, "logps/chosen": -455.1310119628906, "logps/rejected": -351.518310546875, "loss": 0.6287, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.0922514200210571, "rewards/margins": 0.15785376727581024, "rewards/rejected": 0.9343975782394409, "step": 2690 }, { "epoch": 0.12535400900691768, "grad_norm": 99.99964141845703, "learning_rate": 4.791618304780475e-07, "logits/chosen": -17.61351776123047, "logits/rejected": -17.505325317382812, "logps/chosen": -491.1439514160156, "logps/rejected": -506.4859313964844, "loss": 0.7428, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.8503020405769348, "rewards/margins": -0.05225713178515434, "rewards/rejected": 0.9025592803955078, "step": 2700 }, { "epoch": 0.12581828311435073, "grad_norm": 71.00311279296875, "learning_rate": 4.79084451460142e-07, "logits/chosen": -17.796348571777344, "logits/rejected": -17.623798370361328, "logps/chosen": -425.1334533691406, "logps/rejected": -419.40325927734375, "loss": 0.7871, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.0806763172149658, "rewards/margins": -0.12065891176462173, "rewards/rejected": 1.2013351917266846, "step": 2710 }, { "epoch": 0.12628255722178375, "grad_norm": 73.83464813232422, "learning_rate": 4.790070724422366e-07, "logits/chosen": -18.072158813476562, "logits/rejected": -16.557716369628906, "logps/chosen": -404.86474609375, "logps/rejected": -200.55416870117188, "loss": 0.5199, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.0022284984588623, "rewards/margins": 0.4818558096885681, "rewards/rejected": 0.520372748374939, "step": 2720 }, { "epoch": 0.12674683132921677, "grad_norm": 112.05176544189453, "learning_rate": 4.78929693424331e-07, "logits/chosen": -18.915069580078125, "logits/rejected": -17.666412353515625, "logps/chosen": -338.2649841308594, "logps/rejected": -274.2216796875, 
"loss": 0.5698, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.9542213678359985, "rewards/margins": 0.3298872113227844, "rewards/rejected": 0.6243340373039246, "step": 2730 }, { "epoch": 0.1272111054366498, "grad_norm": 150.60691833496094, "learning_rate": 4.788523144064256e-07, "logits/chosen": -17.991439819335938, "logits/rejected": -16.939294815063477, "logps/chosen": -487.12030029296875, "logps/rejected": -352.47955322265625, "loss": 0.5474, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.1910934448242188, "rewards/margins": 0.37168747186660767, "rewards/rejected": 0.8194060325622559, "step": 2740 }, { "epoch": 0.12767537954408284, "grad_norm": 122.46995544433594, "learning_rate": 4.7877493538852e-07, "logits/chosen": -17.820659637451172, "logits/rejected": -17.07426643371582, "logps/chosen": -325.42486572265625, "logps/rejected": -244.14877319335938, "loss": 0.6486, "rewards/accuracies": 0.5, "rewards/chosen": 0.9385802149772644, "rewards/margins": 0.15529632568359375, "rewards/rejected": 0.7832838892936707, "step": 2750 }, { "epoch": 0.12813965365151586, "grad_norm": 63.33738327026367, "learning_rate": 4.786975563706145e-07, "logits/chosen": -18.57929039001465, "logits/rejected": -17.338481903076172, "logps/chosen": -509.55255126953125, "logps/rejected": -383.9755859375, "loss": 0.568, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.2437461614608765, "rewards/margins": 0.34196823835372925, "rewards/rejected": 0.9017779231071472, "step": 2760 }, { "epoch": 0.12860392775894888, "grad_norm": 108.4670639038086, "learning_rate": 4.786201773527091e-07, "logits/chosen": -17.847108840942383, "logits/rejected": -17.12856101989746, "logps/chosen": -454.9764709472656, "logps/rejected": -329.26806640625, "loss": 0.6936, "rewards/accuracies": 0.5, "rewards/chosen": 1.2345871925354004, "rewards/margins": 0.09145936369895935, "rewards/rejected": 1.1431277990341187, "step": 2770 }, { "epoch": 0.1290682018663819, "grad_norm": 
27.697479248046875, "learning_rate": 4.785427983348035e-07, "logits/chosen": -18.387371063232422, "logits/rejected": -17.738101959228516, "logps/chosen": -409.33526611328125, "logps/rejected": -275.36883544921875, "loss": 0.5728, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.0776865482330322, "rewards/margins": 0.3973212242126465, "rewards/rejected": 0.680365264415741, "step": 2780 }, { "epoch": 0.12953247597381495, "grad_norm": 113.2016372680664, "learning_rate": 4.78465419316898e-07, "logits/chosen": -18.41721534729004, "logits/rejected": -17.449813842773438, "logps/chosen": -426.43475341796875, "logps/rejected": -355.93487548828125, "loss": 0.5784, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.201322317123413, "rewards/margins": 0.28035449981689453, "rewards/rejected": 0.920967698097229, "step": 2790 }, { "epoch": 0.12999675008124797, "grad_norm": 160.95651245117188, "learning_rate": 4.783880402989925e-07, "logits/chosen": -18.223697662353516, "logits/rejected": -17.755748748779297, "logps/chosen": -544.0360107421875, "logps/rejected": -501.8299865722656, "loss": 0.6248, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.251268982887268, "rewards/margins": 0.2305225431919098, "rewards/rejected": 1.0207463502883911, "step": 2800 }, { "epoch": 0.130461024188681, "grad_norm": 37.52773666381836, "learning_rate": 4.78310661281087e-07, "logits/chosen": -18.006694793701172, "logits/rejected": -17.293601989746094, "logps/chosen": -338.02838134765625, "logps/rejected": -237.2892608642578, "loss": 0.5951, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.8625599145889282, "rewards/margins": 0.23628553748130798, "rewards/rejected": 0.6262744665145874, "step": 2810 }, { "epoch": 0.13092529829611402, "grad_norm": 47.49479293823242, "learning_rate": 4.782332822631815e-07, "logits/chosen": -18.327877044677734, "logits/rejected": -17.88981819152832, "logps/chosen": -456.9610900878906, "logps/rejected": -412.35223388671875, 
"loss": 0.6735, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.223028302192688, "rewards/margins": 0.0765121653676033, "rewards/rejected": 1.1465160846710205, "step": 2820 }, { "epoch": 0.13138957240354707, "grad_norm": 64.13322448730469, "learning_rate": 4.78155903245276e-07, "logits/chosen": -18.517744064331055, "logits/rejected": -18.288114547729492, "logps/chosen": -340.1625671386719, "logps/rejected": -343.2926330566406, "loss": 0.7034, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.8741950988769531, "rewards/margins": 0.040480952709913254, "rewards/rejected": 0.8337141871452332, "step": 2830 }, { "epoch": 0.1318538465109801, "grad_norm": 96.67988586425781, "learning_rate": 4.780785242273705e-07, "logits/chosen": -18.505008697509766, "logits/rejected": -17.452985763549805, "logps/chosen": -568.65478515625, "logps/rejected": -427.1936950683594, "loss": 0.6467, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.6228721141815186, "rewards/margins": 0.27166473865509033, "rewards/rejected": 1.3512071371078491, "step": 2840 }, { "epoch": 0.1323181206184131, "grad_norm": 61.233421325683594, "learning_rate": 4.78001145209465e-07, "logits/chosen": -17.737834930419922, "logits/rejected": -17.576601028442383, "logps/chosen": -368.08807373046875, "logps/rejected": -289.26934814453125, "loss": 0.5954, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.0855427980422974, "rewards/margins": 0.2605626583099365, "rewards/rejected": 0.8249801397323608, "step": 2850 }, { "epoch": 0.13278239472584613, "grad_norm": 19.419971466064453, "learning_rate": 4.779237661915595e-07, "logits/chosen": -18.622425079345703, "logits/rejected": -18.202434539794922, "logps/chosen": -324.3018798828125, "logps/rejected": -369.97821044921875, "loss": 0.7925, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.9228509664535522, "rewards/margins": 0.036067914217710495, "rewards/rejected": 0.8867830038070679, "step": 2860 }, { "epoch": 
0.13324666883327918, "grad_norm": 42.31591033935547, "learning_rate": 4.77846387173654e-07, "logits/chosen": -18.244565963745117, "logits/rejected": -16.90795135498047, "logps/chosen": -515.5853881835938, "logps/rejected": -393.5888671875, "loss": 0.5223, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.2175930738449097, "rewards/margins": 0.46847981214523315, "rewards/rejected": 0.7491132020950317, "step": 2870 }, { "epoch": 0.1337109429407122, "grad_norm": 51.264896392822266, "learning_rate": 4.777690081557484e-07, "logits/chosen": -17.823972702026367, "logits/rejected": -17.878009796142578, "logps/chosen": -289.1578674316406, "logps/rejected": -315.47454833984375, "loss": 0.7106, "rewards/accuracies": 0.5, "rewards/chosen": 0.6967288851737976, "rewards/margins": -0.018893051892518997, "rewards/rejected": 0.7156219482421875, "step": 2880 }, { "epoch": 0.13417521704814522, "grad_norm": 51.8425178527832, "learning_rate": 4.77691629137843e-07, "logits/chosen": -17.654855728149414, "logits/rejected": -17.219287872314453, "logps/chosen": -299.20574951171875, "logps/rejected": -230.62289428710938, "loss": 0.5414, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.9719449281692505, "rewards/margins": 0.42685532569885254, "rewards/rejected": 0.545089602470398, "step": 2890 }, { "epoch": 0.13463949115557824, "grad_norm": 121.2565689086914, "learning_rate": 4.776142501199374e-07, "logits/chosen": -17.815366744995117, "logits/rejected": -16.64712142944336, "logps/chosen": -400.56988525390625, "logps/rejected": -302.72113037109375, "loss": 0.5454, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.0093022584915161, "rewards/margins": 0.3846196234226227, "rewards/rejected": 0.624682605266571, "step": 2900 }, { "epoch": 0.1351037652630113, "grad_norm": 64.2486343383789, "learning_rate": 4.775368711020319e-07, "logits/chosen": -18.764446258544922, "logits/rejected": -18.695743560791016, "logps/chosen": -546.7384033203125, "logps/rejected": 
-399.16448974609375, "loss": 0.6957, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.2953184843063354, "rewards/margins": 0.08159423619508743, "rewards/rejected": 1.2137242555618286, "step": 2910 }, { "epoch": 0.13556803937044432, "grad_norm": 62.626163482666016, "learning_rate": 4.774594920841265e-07, "logits/chosen": -17.195425033569336, "logits/rejected": -16.531652450561523, "logps/chosen": -365.17535400390625, "logps/rejected": -287.04705810546875, "loss": 0.6544, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.0148394107818604, "rewards/margins": 0.14208638668060303, "rewards/rejected": 0.8727529644966125, "step": 2920 }, { "epoch": 0.13603231347787734, "grad_norm": 80.90619659423828, "learning_rate": 4.773821130662209e-07, "logits/chosen": -18.42282485961914, "logits/rejected": -18.49997901916504, "logps/chosen": -404.23638916015625, "logps/rejected": -357.1438293457031, "loss": 0.6247, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.0230821371078491, "rewards/margins": 0.18106882274150848, "rewards/rejected": 0.8420133590698242, "step": 2930 }, { "epoch": 0.13649658758531036, "grad_norm": 121.02154541015625, "learning_rate": 4.773047340483155e-07, "logits/chosen": -18.542648315429688, "logits/rejected": -18.564342498779297, "logps/chosen": -362.4396667480469, "logps/rejected": -340.2750549316406, "loss": 0.6849, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.109518051147461, "rewards/margins": 0.05836213752627373, "rewards/rejected": 1.051155924797058, "step": 2940 }, { "epoch": 0.1369608616927434, "grad_norm": 69.4280014038086, "learning_rate": 4.772273550304099e-07, "logits/chosen": -19.120960235595703, "logits/rejected": -18.398181915283203, "logps/chosen": -496.76763916015625, "logps/rejected": -368.2831726074219, "loss": 0.6026, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.3290783166885376, "rewards/margins": 0.2612004280090332, "rewards/rejected": 1.0678777694702148, "step": 2950 
}, { "epoch": 0.13742513580017643, "grad_norm": 37.546939849853516, "learning_rate": 4.771499760125044e-07, "logits/chosen": -18.26401138305664, "logits/rejected": -17.124765396118164, "logps/chosen": -389.30267333984375, "logps/rejected": -291.0267333984375, "loss": 0.674, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.9146798849105835, "rewards/margins": 0.1298399120569229, "rewards/rejected": 0.7848400473594666, "step": 2960 }, { "epoch": 0.13788940990760945, "grad_norm": 58.51084518432617, "learning_rate": 4.770725969945989e-07, "logits/chosen": -19.159696578979492, "logits/rejected": -17.976791381835938, "logps/chosen": -348.12530517578125, "logps/rejected": -245.1175079345703, "loss": 0.5956, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.9066364169120789, "rewards/margins": 0.22663238644599915, "rewards/rejected": 0.6800039410591125, "step": 2970 }, { "epoch": 0.13835368401504247, "grad_norm": 99.16976928710938, "learning_rate": 4.769952179766934e-07, "logits/chosen": -17.575653076171875, "logits/rejected": -16.990177154541016, "logps/chosen": -326.667236328125, "logps/rejected": -258.1642150878906, "loss": 0.6496, "rewards/accuracies": 0.5, "rewards/chosen": 0.9886246919631958, "rewards/margins": 0.11338530480861664, "rewards/rejected": 0.875239372253418, "step": 2980 }, { "epoch": 0.13881795812247552, "grad_norm": 48.8080940246582, "learning_rate": 4.769178389587879e-07, "logits/chosen": -18.59847640991211, "logits/rejected": -18.017898559570312, "logps/chosen": -457.14056396484375, "logps/rejected": -355.18878173828125, "loss": 0.6688, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.06062912940979, "rewards/margins": 0.09020514786243439, "rewards/rejected": 0.9704238772392273, "step": 2990 }, { "epoch": 0.13928223222990854, "grad_norm": 65.76626586914062, "learning_rate": 4.768404599408824e-07, "logits/chosen": -18.208600997924805, "logits/rejected": -18.120832443237305, "logps/chosen": -498.0550231933594, 
"logps/rejected": -418.4888610839844, "loss": 0.6607, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.2478950023651123, "rewards/margins": 0.20931868255138397, "rewards/rejected": 1.038576364517212, "step": 3000 }, { "epoch": 0.13974650633734156, "grad_norm": 74.53496551513672, "learning_rate": 4.767630809229769e-07, "logits/chosen": -17.581945419311523, "logits/rejected": -17.27098274230957, "logps/chosen": -414.84588623046875, "logps/rejected": -382.20086669921875, "loss": 0.662, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.2346292734146118, "rewards/margins": 0.16239437460899353, "rewards/rejected": 1.072234869003296, "step": 3010 }, { "epoch": 0.14021078044477459, "grad_norm": 123.82723999023438, "learning_rate": 4.766857019050714e-07, "logits/chosen": -18.294269561767578, "logits/rejected": -17.779869079589844, "logps/chosen": -394.8463439941406, "logps/rejected": -341.859619140625, "loss": 0.7081, "rewards/accuracies": 0.5, "rewards/chosen": 0.8435863256454468, "rewards/margins": 0.027750063687562943, "rewards/rejected": 0.8158363103866577, "step": 3020 }, { "epoch": 0.14067505455220763, "grad_norm": 44.255863189697266, "learning_rate": 4.7660832288716593e-07, "logits/chosen": -17.67106819152832, "logits/rejected": -17.318161010742188, "logps/chosen": -392.3521728515625, "logps/rejected": -339.2916564941406, "loss": 0.7794, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.9223707318305969, "rewards/margins": -0.01085598487406969, "rewards/rejected": 0.9332267045974731, "step": 3030 }, { "epoch": 0.14113932865964066, "grad_norm": 89.8868179321289, "learning_rate": 4.765309438692604e-07, "logits/chosen": -17.358205795288086, "logits/rejected": -16.98391342163086, "logps/chosen": -412.1744079589844, "logps/rejected": -296.37603759765625, "loss": 0.621, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.3048135042190552, "rewards/margins": 0.27881985902786255, "rewards/rejected": 1.0259935855865479, "step": 
3040 }, { "epoch": 0.14160360276707368, "grad_norm": 52.25237274169922, "learning_rate": 4.764535648513549e-07, "logits/chosen": -17.658187866210938, "logits/rejected": -16.37228012084961, "logps/chosen": -391.6702575683594, "logps/rejected": -233.6203155517578, "loss": 0.5355, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.1880419254302979, "rewards/margins": 0.5014578104019165, "rewards/rejected": 0.6865842342376709, "step": 3050 }, { "epoch": 0.1420678768745067, "grad_norm": 38.57636642456055, "learning_rate": 4.763761858334494e-07, "logits/chosen": -17.844135284423828, "logits/rejected": -18.421571731567383, "logps/chosen": -419.34014892578125, "logps/rejected": -477.1000061035156, "loss": 0.7756, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.1633193492889404, "rewards/margins": -0.05070173740386963, "rewards/rejected": 1.2140212059020996, "step": 3060 }, { "epoch": 0.14253215098193975, "grad_norm": 40.91852951049805, "learning_rate": 4.7629880681554387e-07, "logits/chosen": -17.256689071655273, "logits/rejected": -16.497159957885742, "logps/chosen": -333.012939453125, "logps/rejected": -335.071533203125, "loss": 0.6675, "rewards/accuracies": 0.5, "rewards/chosen": 1.0434321165084839, "rewards/margins": 0.14410385489463806, "rewards/rejected": 0.899328351020813, "step": 3070 }, { "epoch": 0.14299642508937277, "grad_norm": 176.1711883544922, "learning_rate": 4.7622142779763833e-07, "logits/chosen": -17.882450103759766, "logits/rejected": -17.656877517700195, "logps/chosen": -456.27923583984375, "logps/rejected": -353.55499267578125, "loss": 0.7178, "rewards/accuracies": 0.5, "rewards/chosen": 0.9282422065734863, "rewards/margins": 0.028940260410308838, "rewards/rejected": 0.8993018865585327, "step": 3080 }, { "epoch": 0.1434606991968058, "grad_norm": 36.97212600708008, "learning_rate": 4.7614404877973284e-07, "logits/chosen": -16.88686752319336, "logits/rejected": -17.145578384399414, "logps/chosen": -353.9463806152344, 
"logps/rejected": -332.3037109375, "loss": 0.7825, "rewards/accuracies": 0.5, "rewards/chosen": 0.8861838579177856, "rewards/margins": -0.07412056624889374, "rewards/rejected": 0.9603044390678406, "step": 3090 }, { "epoch": 0.1439249733042388, "grad_norm": 38.65367126464844, "learning_rate": 4.7606666976182735e-07, "logits/chosen": -19.206275939941406, "logits/rejected": -18.669490814208984, "logps/chosen": -392.6864929199219, "logps/rejected": -336.1321105957031, "loss": 0.6654, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.0469048023223877, "rewards/margins": 0.14839430153369904, "rewards/rejected": 0.8985106348991394, "step": 3100 }, { "epoch": 0.14438924741167186, "grad_norm": 83.253662109375, "learning_rate": 4.7598929074392186e-07, "logits/chosen": -18.12677001953125, "logits/rejected": -17.4260196685791, "logps/chosen": -408.326416015625, "logps/rejected": -341.9842834472656, "loss": 0.5696, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.0990862846374512, "rewards/margins": 0.29395797848701477, "rewards/rejected": 0.8051283955574036, "step": 3110 }, { "epoch": 0.14485352151910488, "grad_norm": 38.46770095825195, "learning_rate": 4.7591191172601637e-07, "logits/chosen": -18.525699615478516, "logits/rejected": -17.586994171142578, "logps/chosen": -371.4141845703125, "logps/rejected": -327.80169677734375, "loss": 0.6809, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.982513427734375, "rewards/margins": 0.10286353528499603, "rewards/rejected": 0.8796499371528625, "step": 3120 }, { "epoch": 0.1453177956265379, "grad_norm": 38.63396072387695, "learning_rate": 4.758345327081109e-07, "logits/chosen": -17.946863174438477, "logits/rejected": -16.86536979675293, "logps/chosen": -498.06207275390625, "logps/rejected": -408.9974060058594, "loss": 0.5433, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.6360816955566406, "rewards/margins": 0.4254417419433594, "rewards/rejected": 1.2106397151947021, "step": 3130 }, 
{ "epoch": 0.14578206973397093, "grad_norm": 66.45211029052734, "learning_rate": 4.7575715369020534e-07, "logits/chosen": -17.73423957824707, "logits/rejected": -16.91381072998047, "logps/chosen": -304.119140625, "logps/rejected": -257.3597412109375, "loss": 0.6298, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.018726110458374, "rewards/margins": 0.201194167137146, "rewards/rejected": 0.8175320625305176, "step": 3140 }, { "epoch": 0.14624634384140398, "grad_norm": 51.454566955566406, "learning_rate": 4.7567977467229985e-07, "logits/chosen": -18.397113800048828, "logits/rejected": -17.972055435180664, "logps/chosen": -411.6798400878906, "logps/rejected": -323.58740234375, "loss": 0.5396, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.3476008176803589, "rewards/margins": 0.40918397903442383, "rewards/rejected": 0.9384168386459351, "step": 3150 }, { "epoch": 0.146710617948837, "grad_norm": 74.7562484741211, "learning_rate": 4.756023956543943e-07, "logits/chosen": -18.401281356811523, "logits/rejected": -17.859678268432617, "logps/chosen": -436.1512756347656, "logps/rejected": -373.18243408203125, "loss": 0.6626, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.9896245002746582, "rewards/margins": 0.10377141088247299, "rewards/rejected": 0.885853111743927, "step": 3160 }, { "epoch": 0.14717489205627002, "grad_norm": 158.5449676513672, "learning_rate": 4.755250166364888e-07, "logits/chosen": -18.96906280517578, "logits/rejected": -17.436433792114258, "logps/chosen": -394.2160949707031, "logps/rejected": -310.5142822265625, "loss": 0.6231, "rewards/accuracies": 0.5, "rewards/chosen": 1.1394239664077759, "rewards/margins": 0.2519994080066681, "rewards/rejected": 0.887424647808075, "step": 3170 }, { "epoch": 0.14763916616370304, "grad_norm": 61.568790435791016, "learning_rate": 4.7544763761858333e-07, "logits/chosen": -18.012205123901367, "logits/rejected": -17.97730827331543, "logps/chosen": -485.792236328125, "logps/rejected": 
-492.1036071777344, "loss": 0.6507, "rewards/accuracies": 0.5, "rewards/chosen": 1.3318426609039307, "rewards/margins": 0.13607560098171234, "rewards/rejected": 1.1957669258117676, "step": 3180 }, { "epoch": 0.1481034402711361, "grad_norm": 156.0196075439453, "learning_rate": 4.753702586006778e-07, "logits/chosen": -17.722036361694336, "logits/rejected": -17.772310256958008, "logps/chosen": -307.61273193359375, "logps/rejected": -286.23565673828125, "loss": 0.7753, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.8011369705200195, "rewards/margins": -0.11933152377605438, "rewards/rejected": 0.9204685091972351, "step": 3190 }, { "epoch": 0.1485677143785691, "grad_norm": 137.54347229003906, "learning_rate": 4.752928795827723e-07, "logits/chosen": -18.332883834838867, "logits/rejected": -17.603946685791016, "logps/chosen": -302.83929443359375, "logps/rejected": -297.1937255859375, "loss": 0.8133, "rewards/accuracies": 0.5, "rewards/chosen": 1.0018728971481323, "rewards/margins": -0.0678805559873581, "rewards/rejected": 1.0697535276412964, "step": 3200 }, { "epoch": 0.14903198848600213, "grad_norm": 63.12788772583008, "learning_rate": 4.752155005648668e-07, "logits/chosen": -18.036609649658203, "logits/rejected": -17.393260955810547, "logps/chosen": -353.65155029296875, "logps/rejected": -286.49322509765625, "loss": 0.6182, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.0411015748977661, "rewards/margins": 0.24464266002178192, "rewards/rejected": 0.7964588403701782, "step": 3210 }, { "epoch": 0.14949626259343515, "grad_norm": 142.73529052734375, "learning_rate": 4.751381215469613e-07, "logits/chosen": -18.231281280517578, "logits/rejected": -18.03184700012207, "logps/chosen": -429.8760681152344, "logps/rejected": -406.06475830078125, "loss": 0.7771, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.0712909698486328, "rewards/margins": -0.09842734783887863, "rewards/rejected": 1.1697182655334473, "step": 3220 }, { "epoch": 
0.1499605367008682, "grad_norm": 78.2866439819336, "learning_rate": 4.7506074252905584e-07, "logits/chosen": -18.125728607177734, "logits/rejected": -17.14487075805664, "logps/chosen": -359.01251220703125, "logps/rejected": -254.7957763671875, "loss": 0.5728, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.1199408769607544, "rewards/margins": 0.33073392510414124, "rewards/rejected": 0.7892070412635803, "step": 3230 }, { "epoch": 0.15042481080830122, "grad_norm": 65.23153686523438, "learning_rate": 4.749833635111503e-07, "logits/chosen": -17.938220977783203, "logits/rejected": -17.23306655883789, "logps/chosen": -444.29541015625, "logps/rejected": -339.863037109375, "loss": 0.568, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.326738715171814, "rewards/margins": 0.3343626856803894, "rewards/rejected": 0.9923760294914246, "step": 3240 }, { "epoch": 0.15088908491573425, "grad_norm": 62.34920120239258, "learning_rate": 4.749059844932448e-07, "logits/chosen": -18.190994262695312, "logits/rejected": -17.170255661010742, "logps/chosen": -462.899658203125, "logps/rejected": -303.2639465332031, "loss": 0.6427, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.3253755569458008, "rewards/margins": 0.20476365089416504, "rewards/rejected": 1.1206119060516357, "step": 3250 }, { "epoch": 0.15135335902316727, "grad_norm": 49.53099060058594, "learning_rate": 4.7482860547533927e-07, "logits/chosen": -18.67976951599121, "logits/rejected": -17.102815628051758, "logps/chosen": -427.6234436035156, "logps/rejected": -340.83819580078125, "loss": 0.6105, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.1878364086151123, "rewards/margins": 0.2452850341796875, "rewards/rejected": 0.9425514340400696, "step": 3260 }, { "epoch": 0.15181763313060032, "grad_norm": 61.050254821777344, "learning_rate": 4.747512264574338e-07, "logits/chosen": -18.55900764465332, "logits/rejected": -18.49546241760254, "logps/chosen": -328.0943603515625, 
"logps/rejected": -327.0618591308594, "loss": 0.7236, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.0055391788482666, "rewards/margins": 0.047175101935863495, "rewards/rejected": 0.9583640098571777, "step": 3270 }, { "epoch": 0.15228190723803334, "grad_norm": 84.31214904785156, "learning_rate": 4.746738474395283e-07, "logits/chosen": -18.582271575927734, "logits/rejected": -17.015684127807617, "logps/chosen": -514.177734375, "logps/rejected": -295.74212646484375, "loss": 0.5251, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.3090755939483643, "rewards/margins": 0.46395954489707947, "rewards/rejected": 0.845116138458252, "step": 3280 }, { "epoch": 0.15274618134546636, "grad_norm": 44.77896499633789, "learning_rate": 4.7459646842162275e-07, "logits/chosen": -18.395809173583984, "logits/rejected": -15.90546703338623, "logps/chosen": -467.2216796875, "logps/rejected": -182.52308654785156, "loss": 0.3504, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.6320343017578125, "rewards/margins": 1.0972278118133545, "rewards/rejected": 0.5348065495491028, "step": 3290 }, { "epoch": 0.15321045545289938, "grad_norm": 36.45459747314453, "learning_rate": 4.7451908940371726e-07, "logits/chosen": -17.71921157836914, "logits/rejected": -17.384578704833984, "logps/chosen": -365.4981384277344, "logps/rejected": -303.34906005859375, "loss": 0.6075, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.0109584331512451, "rewards/margins": 0.2511940002441406, "rewards/rejected": 0.7597644925117493, "step": 3300 }, { "epoch": 0.15367472956033243, "grad_norm": 115.35277557373047, "learning_rate": 4.7444171038581177e-07, "logits/chosen": -17.75753402709961, "logits/rejected": -17.169511795043945, "logps/chosen": -526.6651611328125, "logps/rejected": -365.494873046875, "loss": 0.6583, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.3287192583084106, "rewards/margins": 0.2249910831451416, "rewards/rejected": 1.1037280559539795, 
"step": 3310 }, { "epoch": 0.15413900366776545, "grad_norm": 110.17710876464844, "learning_rate": 4.743643313679063e-07, "logits/chosen": -18.66684913635254, "logits/rejected": -17.349285125732422, "logps/chosen": -474.3534240722656, "logps/rejected": -307.084716796875, "loss": 0.5754, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.2031880617141724, "rewards/margins": 0.35153836011886597, "rewards/rejected": 0.8516496419906616, "step": 3320 }, { "epoch": 0.15460327777519847, "grad_norm": 65.07411193847656, "learning_rate": 4.742869523500008e-07, "logits/chosen": -18.37296485900879, "logits/rejected": -17.483306884765625, "logps/chosen": -417.5398864746094, "logps/rejected": -272.64715576171875, "loss": 0.4426, "rewards/accuracies": 1.0, "rewards/chosen": 1.4882794618606567, "rewards/margins": 0.6258381605148315, "rewards/rejected": 0.86244136095047, "step": 3330 }, { "epoch": 0.1550675518826315, "grad_norm": 163.18838500976562, "learning_rate": 4.7420957333209525e-07, "logits/chosen": -18.368667602539062, "logits/rejected": -18.337987899780273, "logps/chosen": -446.28790283203125, "logps/rejected": -474.6214904785156, "loss": 0.8095, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 1.4302051067352295, "rewards/margins": -0.13452811539173126, "rewards/rejected": 1.5647331476211548, "step": 3340 }, { "epoch": 0.15553182599006454, "grad_norm": 63.647525787353516, "learning_rate": 4.741321943141897e-07, "logits/chosen": -19.723201751708984, "logits/rejected": -18.8745059967041, "logps/chosen": -421.9971618652344, "logps/rejected": -389.5171203613281, "loss": 0.5597, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.3391797542572021, "rewards/margins": 0.33800187706947327, "rewards/rejected": 1.0011779069900513, "step": 3350 }, { "epoch": 0.15599610009749756, "grad_norm": 90.56314849853516, "learning_rate": 4.740548152962842e-07, "logits/chosen": -18.37140464782715, "logits/rejected": -17.35097312927246, "logps/chosen": 
-486.3515625, "logps/rejected": -353.62286376953125, "loss": 0.5044, "rewards/accuracies": 1.0, "rewards/chosen": 1.3700768947601318, "rewards/margins": 0.4385414719581604, "rewards/rejected": 0.931535542011261, "step": 3360 }, { "epoch": 0.15646037420493059, "grad_norm": 39.6215934753418, "learning_rate": 4.7397743627837873e-07, "logits/chosen": -16.941917419433594, "logits/rejected": -16.809688568115234, "logps/chosen": -406.7529296875, "logps/rejected": -268.9869689941406, "loss": 0.5, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.4170695543289185, "rewards/margins": 0.5546602010726929, "rewards/rejected": 0.862409234046936, "step": 3370 }, { "epoch": 0.1569246483123636, "grad_norm": 62.88945388793945, "learning_rate": 4.7390005726047324e-07, "logits/chosen": -18.684371948242188, "logits/rejected": -17.745227813720703, "logps/chosen": -428.40380859375, "logps/rejected": -287.74407958984375, "loss": 0.5912, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.1657506227493286, "rewards/margins": 0.27398988604545593, "rewards/rejected": 0.8917607069015503, "step": 3380 }, { "epoch": 0.15738892241979666, "grad_norm": 60.2450065612793, "learning_rate": 4.738226782425677e-07, "logits/chosen": -17.278953552246094, "logits/rejected": -16.83827018737793, "logps/chosen": -366.32806396484375, "logps/rejected": -297.84368896484375, "loss": 0.6432, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.1966571807861328, "rewards/margins": 0.17645004391670227, "rewards/rejected": 1.020207166671753, "step": 3390 }, { "epoch": 0.15785319652722968, "grad_norm": 96.79247283935547, "learning_rate": 4.737452992246622e-07, "logits/chosen": -18.322885513305664, "logits/rejected": -18.192729949951172, "logps/chosen": -457.7144470214844, "logps/rejected": -472.0986328125, "loss": 0.7297, "rewards/accuracies": 0.5, "rewards/chosen": 1.2278960943222046, "rewards/margins": -0.03883712366223335, "rewards/rejected": 1.2667332887649536, "step": 3400 }, { 
"epoch": 0.1583174706346627, "grad_norm": 56.04253005981445, "learning_rate": 4.736679202067567e-07, "logits/chosen": -19.06130599975586, "logits/rejected": -17.850061416625977, "logps/chosen": -371.07769775390625, "logps/rejected": -244.7427978515625, "loss": 0.5566, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.047940969467163, "rewards/margins": 0.3836016058921814, "rewards/rejected": 0.6643394231796265, "step": 3410 }, { "epoch": 0.15878174474209572, "grad_norm": 68.27153015136719, "learning_rate": 4.7359054118885123e-07, "logits/chosen": -18.66318130493164, "logits/rejected": -17.75836181640625, "logps/chosen": -379.7160949707031, "logps/rejected": -260.84722900390625, "loss": 0.6108, "rewards/accuracies": 0.5, "rewards/chosen": 1.3703457117080688, "rewards/margins": 0.38900142908096313, "rewards/rejected": 0.9813443422317505, "step": 3420 }, { "epoch": 0.15924601884952877, "grad_norm": 113.29331970214844, "learning_rate": 4.7351316217094575e-07, "logits/chosen": -18.804954528808594, "logits/rejected": -18.2172908782959, "logps/chosen": -538.2059326171875, "logps/rejected": -367.4610900878906, "loss": 0.5375, "rewards/accuracies": 0.5, "rewards/chosen": 1.7069936990737915, "rewards/margins": 0.6459890007972717, "rewards/rejected": 1.061004400253296, "step": 3430 }, { "epoch": 0.1597102929569618, "grad_norm": 140.88002014160156, "learning_rate": 4.734357831530402e-07, "logits/chosen": -18.63814353942871, "logits/rejected": -18.850622177124023, "logps/chosen": -447.9300231933594, "logps/rejected": -491.568115234375, "loss": 0.7611, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 1.2477375268936157, "rewards/margins": 0.008281001821160316, "rewards/rejected": 1.2394565343856812, "step": 3440 }, { "epoch": 0.1601745670643948, "grad_norm": 95.88314819335938, "learning_rate": 4.7335840413513466e-07, "logits/chosen": -19.06679344177246, "logits/rejected": -17.998449325561523, "logps/chosen": -519.1051635742188, "logps/rejected": 
-397.3230895996094, "loss": 0.5567, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.5726202726364136, "rewards/margins": 0.3935636579990387, "rewards/rejected": 1.1790566444396973, "step": 3450 }, { "epoch": 0.16063884117182783, "grad_norm": 119.5916976928711, "learning_rate": 4.732810251172292e-07, "logits/chosen": -18.796913146972656, "logits/rejected": -18.086578369140625, "logps/chosen": -491.03338623046875, "logps/rejected": -385.4740905761719, "loss": 0.7297, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 1.4046721458435059, "rewards/margins": 0.0025273202918469906, "rewards/rejected": 1.4021450281143188, "step": 3460 }, { "epoch": 0.16110311527926088, "grad_norm": 121.4479751586914, "learning_rate": 4.732036460993237e-07, "logits/chosen": -18.531816482543945, "logits/rejected": -17.90533447265625, "logps/chosen": -418.3417053222656, "logps/rejected": -330.00677490234375, "loss": 0.559, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.4852076768875122, "rewards/margins": 0.46194639801979065, "rewards/rejected": 1.023261308670044, "step": 3470 }, { "epoch": 0.1615673893866939, "grad_norm": 154.7310028076172, "learning_rate": 4.731262670814182e-07, "logits/chosen": -17.976139068603516, "logits/rejected": -17.30695343017578, "logps/chosen": -369.7669982910156, "logps/rejected": -324.6889953613281, "loss": 0.7318, "rewards/accuracies": 0.5, "rewards/chosen": 1.0063108205795288, "rewards/margins": 0.03816263750195503, "rewards/rejected": 0.9681482315063477, "step": 3480 }, { "epoch": 0.16203166349412693, "grad_norm": 59.039066314697266, "learning_rate": 4.7304888806351265e-07, "logits/chosen": -19.204965591430664, "logits/rejected": -18.1903018951416, "logps/chosen": -352.8294982910156, "logps/rejected": -315.45086669921875, "loss": 0.6012, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.2430247068405151, "rewards/margins": 0.25648173689842224, "rewards/rejected": 0.9865428805351257, "step": 3490 }, { "epoch": 
0.16249593760155995, "grad_norm": 88.17279052734375, "learning_rate": 4.7297150904560717e-07, "logits/chosen": -17.955677032470703, "logits/rejected": -17.28508758544922, "logps/chosen": -475.20623779296875, "logps/rejected": -350.312744140625, "loss": 0.6365, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.6265830993652344, "rewards/margins": 0.33988600969314575, "rewards/rejected": 1.2866971492767334, "step": 3500 }, { "epoch": 0.162960211708993, "grad_norm": 40.296104431152344, "learning_rate": 4.728941300277017e-07, "logits/chosen": -18.383197784423828, "logits/rejected": -17.419620513916016, "logps/chosen": -448.94183349609375, "logps/rejected": -364.6534729003906, "loss": 0.6291, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.4317946434020996, "rewards/margins": 0.20373816788196564, "rewards/rejected": 1.2280564308166504, "step": 3510 }, { "epoch": 0.16342448581642602, "grad_norm": 75.84760284423828, "learning_rate": 4.728167510097962e-07, "logits/chosen": -18.258024215698242, "logits/rejected": -17.419023513793945, "logps/chosen": -414.4466247558594, "logps/rejected": -352.355224609375, "loss": 0.5779, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.3234226703643799, "rewards/margins": 0.29198941588401794, "rewards/rejected": 1.03143310546875, "step": 3520 }, { "epoch": 0.16388875992385904, "grad_norm": 58.32664108276367, "learning_rate": 4.727393719918907e-07, "logits/chosen": -18.01173210144043, "logits/rejected": -17.046672821044922, "logps/chosen": -381.7772521972656, "logps/rejected": -278.617919921875, "loss": 0.6166, "rewards/accuracies": 0.5, "rewards/chosen": 1.3618712425231934, "rewards/margins": 0.30676907300949097, "rewards/rejected": 1.0551023483276367, "step": 3530 }, { "epoch": 0.16435303403129206, "grad_norm": 90.76210021972656, "learning_rate": 4.7266199297398516e-07, "logits/chosen": -18.56928825378418, "logits/rejected": -18.143409729003906, "logps/chosen": -350.6519775390625, "logps/rejected": 
-343.38250732421875, "loss": 0.7311, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 1.0234839916229248, "rewards/margins": -0.009389477781951427, "rewards/rejected": 1.0328733921051025, "step": 3540 }, { "epoch": 0.1648173081387251, "grad_norm": 101.94290924072266, "learning_rate": 4.725846139560796e-07, "logits/chosen": -18.2171630859375, "logits/rejected": -17.561616897583008, "logps/chosen": -356.7822265625, "logps/rejected": -247.1784210205078, "loss": 0.5412, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.2282928228378296, "rewards/margins": 0.3765621483325958, "rewards/rejected": 0.8517307043075562, "step": 3550 }, { "epoch": 0.16528158224615813, "grad_norm": 45.13088607788086, "learning_rate": 4.7250723493817413e-07, "logits/chosen": -18.547626495361328, "logits/rejected": -17.289379119873047, "logps/chosen": -372.7638854980469, "logps/rejected": -361.73284912109375, "loss": 0.6465, "rewards/accuracies": 0.5, "rewards/chosen": 1.1859159469604492, "rewards/margins": 0.24700617790222168, "rewards/rejected": 0.9389097094535828, "step": 3560 }, { "epoch": 0.16574585635359115, "grad_norm": 75.06121826171875, "learning_rate": 4.7242985592026864e-07, "logits/chosen": -17.376646041870117, "logits/rejected": -17.26089096069336, "logps/chosen": -323.6582946777344, "logps/rejected": -311.59490966796875, "loss": 0.8176, "rewards/accuracies": 0.5, "rewards/chosen": 1.0789631605148315, "rewards/margins": -0.0940323919057846, "rewards/rejected": 1.1729958057403564, "step": 3570 }, { "epoch": 0.1662101304610242, "grad_norm": 50.76539993286133, "learning_rate": 4.7235247690236315e-07, "logits/chosen": -18.463191986083984, "logits/rejected": -18.101451873779297, "logps/chosen": -363.6144714355469, "logps/rejected": -311.054931640625, "loss": 0.5985, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.4789488315582275, "rewards/margins": 0.316351979970932, "rewards/rejected": 1.1625968217849731, "step": 3580 }, { "epoch": 
0.16667440456845722, "grad_norm": 156.72039794921875, "learning_rate": 4.722750978844576e-07, "logits/chosen": -17.842884063720703, "logits/rejected": -17.186826705932617, "logps/chosen": -417.041748046875, "logps/rejected": -355.4778137207031, "loss": 0.6019, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.4167028665542603, "rewards/margins": 0.3340819776058197, "rewards/rejected": 1.0826208591461182, "step": 3590 }, { "epoch": 0.16713867867589025, "grad_norm": 100.94036102294922, "learning_rate": 4.721977188665521e-07, "logits/chosen": -18.734272003173828, "logits/rejected": -17.652706146240234, "logps/chosen": -445.6814880371094, "logps/rejected": -349.87725830078125, "loss": 0.619, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.4991123676300049, "rewards/margins": 0.26678580045700073, "rewards/rejected": 1.232326626777649, "step": 3600 }, { "epoch": 0.16760295278332327, "grad_norm": 112.96754455566406, "learning_rate": 4.7212033984864663e-07, "logits/chosen": -19.23274803161621, "logits/rejected": -18.105207443237305, "logps/chosen": -391.4847717285156, "logps/rejected": -362.2701110839844, "loss": 0.6095, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.3724091053009033, "rewards/margins": 0.29350346326828003, "rewards/rejected": 1.0789058208465576, "step": 3610 }, { "epoch": 0.16806722689075632, "grad_norm": 79.12889099121094, "learning_rate": 4.7204296083074114e-07, "logits/chosen": -18.006450653076172, "logits/rejected": -17.240644454956055, "logps/chosen": -491.4466247558594, "logps/rejected": -366.849365234375, "loss": 0.6255, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.4385783672332764, "rewards/margins": 0.21123094856739044, "rewards/rejected": 1.2273473739624023, "step": 3620 }, { "epoch": 0.16853150099818934, "grad_norm": 33.429222106933594, "learning_rate": 4.7196558181283565e-07, "logits/chosen": -18.152816772460938, "logits/rejected": -16.420413970947266, "logps/chosen": -418.43792724609375, 
"logps/rejected": -266.4334411621094, "loss": 0.4404, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.6650238037109375, "rewards/margins": 0.6871681213378906, "rewards/rejected": 0.9778558611869812, "step": 3630 }, { "epoch": 0.16899577510562236, "grad_norm": 101.06422424316406, "learning_rate": 4.7188820279493006e-07, "logits/chosen": -18.598196029663086, "logits/rejected": -18.93202781677246, "logps/chosen": -452.8155822753906, "logps/rejected": -463.67474365234375, "loss": 0.8026, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 1.3700408935546875, "rewards/margins": -0.16183212399482727, "rewards/rejected": 1.5318729877471924, "step": 3640 }, { "epoch": 0.16946004921305538, "grad_norm": 56.2971076965332, "learning_rate": 4.7181082377702457e-07, "logits/chosen": -18.303926467895508, "logits/rejected": -17.78927993774414, "logps/chosen": -487.8318786621094, "logps/rejected": -429.05462646484375, "loss": 0.7145, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.496922254562378, "rewards/margins": 0.06232719495892525, "rewards/rejected": 1.4345948696136475, "step": 3650 }, { "epoch": 0.16992432332048843, "grad_norm": 122.09683990478516, "learning_rate": 4.717334447591191e-07, "logits/chosen": -17.139413833618164, "logits/rejected": -17.131465911865234, "logps/chosen": -309.33099365234375, "logps/rejected": -340.4332580566406, "loss": 0.8277, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.2401608228683472, "rewards/margins": -0.058849550783634186, "rewards/rejected": 1.2990102767944336, "step": 3660 }, { "epoch": 0.17038859742792145, "grad_norm": 77.3834228515625, "learning_rate": 4.716560657412136e-07, "logits/chosen": -18.78969955444336, "logits/rejected": -17.815250396728516, "logps/chosen": -316.98822021484375, "logps/rejected": -270.5625915527344, "loss": 0.8088, "rewards/accuracies": 0.5, "rewards/chosen": 0.9840218424797058, "rewards/margins": -0.057988546788692474, "rewards/rejected": 1.0420104265213013, 
"step": 3670 }, { "epoch": 0.17085287153535447, "grad_norm": 11.934870719909668, "learning_rate": 4.715786867233081e-07, "logits/chosen": -18.97926902770996, "logits/rejected": -17.782955169677734, "logps/chosen": -528.5667114257812, "logps/rejected": -299.25, "loss": 0.5854, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.5071057081222534, "rewards/margins": 0.5734012722969055, "rewards/rejected": 0.9337044954299927, "step": 3680 }, { "epoch": 0.1713171456427875, "grad_norm": 145.316162109375, "learning_rate": 4.7150130770540256e-07, "logits/chosen": -18.175418853759766, "logits/rejected": -18.002002716064453, "logps/chosen": -458.04144287109375, "logps/rejected": -395.85711669921875, "loss": 0.7632, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.4804376363754272, "rewards/margins": 0.10357490926980972, "rewards/rejected": 1.376862645149231, "step": 3690 }, { "epoch": 0.17178141975022054, "grad_norm": 163.11061096191406, "learning_rate": 4.714239286874971e-07, "logits/chosen": -18.601947784423828, "logits/rejected": -18.79813575744629, "logps/chosen": -443.53472900390625, "logps/rejected": -421.8038635253906, "loss": 0.7644, "rewards/accuracies": 0.5, "rewards/chosen": 1.148661732673645, "rewards/margins": -0.06322803348302841, "rewards/rejected": 1.2118898630142212, "step": 3700 }, { "epoch": 0.17224569385765356, "grad_norm": 82.12543487548828, "learning_rate": 4.713465496695916e-07, "logits/chosen": -18.55899429321289, "logits/rejected": -18.201871871948242, "logps/chosen": -460.2176818847656, "logps/rejected": -436.0111389160156, "loss": 0.7532, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.4801263809204102, "rewards/margins": 0.018175829201936722, "rewards/rejected": 1.4619505405426025, "step": 3710 }, { "epoch": 0.17270996796508659, "grad_norm": 50.949256896972656, "learning_rate": 4.712691706516861e-07, "logits/chosen": -18.28505516052246, "logits/rejected": -17.460193634033203, "logps/chosen": -389.2152099609375, 
"logps/rejected": -307.1796875, "loss": 0.5258, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.3646202087402344, "rewards/margins": 0.4699631631374359, "rewards/rejected": 0.8946571350097656, "step": 3720 }, { "epoch": 0.1731742420725196, "grad_norm": 35.091739654541016, "learning_rate": 4.711917916337806e-07, "logits/chosen": -18.816974639892578, "logits/rejected": -17.0760440826416, "logps/chosen": -409.4580993652344, "logps/rejected": -290.0149841308594, "loss": 0.5266, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.5202014446258545, "rewards/margins": 0.45256733894348145, "rewards/rejected": 1.067634105682373, "step": 3730 }, { "epoch": 0.17363851617995266, "grad_norm": 61.48729705810547, "learning_rate": 4.71114412615875e-07, "logits/chosen": -19.061738967895508, "logits/rejected": -18.266725540161133, "logps/chosen": -473.51318359375, "logps/rejected": -325.369140625, "loss": 0.5104, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.5525333881378174, "rewards/margins": 0.4674653112888336, "rewards/rejected": 1.0850679874420166, "step": 3740 }, { "epoch": 0.17410279028738568, "grad_norm": 16.528005599975586, "learning_rate": 4.710370335979695e-07, "logits/chosen": -18.924039840698242, "logits/rejected": -17.309890747070312, "logps/chosen": -410.19879150390625, "logps/rejected": -268.69244384765625, "loss": 0.565, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.4079573154449463, "rewards/margins": 0.4403717517852783, "rewards/rejected": 0.967585563659668, "step": 3750 }, { "epoch": 0.1745670643948187, "grad_norm": 67.12065887451172, "learning_rate": 4.7095965458006404e-07, "logits/chosen": -18.43155288696289, "logits/rejected": -16.703495025634766, "logps/chosen": -484.02154541015625, "logps/rejected": -297.45233154296875, "loss": 0.4813, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.6866509914398193, "rewards/margins": 0.6392776370048523, "rewards/rejected": 1.0473734140396118, "step": 3760 
}, { "epoch": 0.17503133850225172, "grad_norm": 46.451820373535156, "learning_rate": 4.7088227556215855e-07, "logits/chosen": -17.841053009033203, "logits/rejected": -17.867185592651367, "logps/chosen": -266.45831298828125, "logps/rejected": -314.88543701171875, "loss": 0.8616, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.8269851803779602, "rewards/margins": -0.24211516976356506, "rewards/rejected": 1.0691003799438477, "step": 3770 }, { "epoch": 0.17549561260968477, "grad_norm": 40.76995086669922, "learning_rate": 4.7080489654425306e-07, "logits/chosen": -18.523805618286133, "logits/rejected": -17.938440322875977, "logps/chosen": -476.97967529296875, "logps/rejected": -308.39288330078125, "loss": 0.6059, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.5588006973266602, "rewards/margins": 0.3373907208442688, "rewards/rejected": 1.2214099168777466, "step": 3780 }, { "epoch": 0.1759598867171178, "grad_norm": 73.38589477539062, "learning_rate": 4.707275175263475e-07, "logits/chosen": -18.766204833984375, "logits/rejected": -17.708988189697266, "logps/chosen": -390.23748779296875, "logps/rejected": -318.8675231933594, "loss": 0.7803, "rewards/accuracies": 0.5, "rewards/chosen": 1.1354906558990479, "rewards/margins": -0.04252760484814644, "rewards/rejected": 1.17801833152771, "step": 3790 }, { "epoch": 0.1764241608245508, "grad_norm": 18.504749298095703, "learning_rate": 4.7065013850844203e-07, "logits/chosen": -18.48386573791504, "logits/rejected": -16.88154411315918, "logps/chosen": -537.3302001953125, "logps/rejected": -327.519775390625, "loss": 0.5624, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.7107486724853516, "rewards/margins": 0.41504788398742676, "rewards/rejected": 1.2957006692886353, "step": 3800 }, { "epoch": 0.17688843493198383, "grad_norm": 34.292598724365234, "learning_rate": 4.7057275949053654e-07, "logits/chosen": -17.801631927490234, "logits/rejected": -16.964441299438477, "logps/chosen": 
-475.3064880371094, "logps/rejected": -384.2489013671875, "loss": 0.5948, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.5051754713058472, "rewards/margins": 0.2807456851005554, "rewards/rejected": 1.2244298458099365, "step": 3810 }, { "epoch": 0.17735270903941688, "grad_norm": 35.738521575927734, "learning_rate": 4.7049538047263105e-07, "logits/chosen": -18.170658111572266, "logits/rejected": -17.01106071472168, "logps/chosen": -446.6048889160156, "logps/rejected": -317.1734313964844, "loss": 0.655, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.4702765941619873, "rewards/margins": 0.19660530984401703, "rewards/rejected": 1.273671269416809, "step": 3820 }, { "epoch": 0.1778169831468499, "grad_norm": 47.708351135253906, "learning_rate": 4.704180014547255e-07, "logits/chosen": -17.800321578979492, "logits/rejected": -17.272720336914062, "logps/chosen": -322.8017578125, "logps/rejected": -245.2980194091797, "loss": 0.5978, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.177402138710022, "rewards/margins": 0.24632401764392853, "rewards/rejected": 0.9310780763626099, "step": 3830 }, { "epoch": 0.17828125725428293, "grad_norm": 141.72607421875, "learning_rate": 4.7034062243681997e-07, "logits/chosen": -18.45751190185547, "logits/rejected": -18.357158660888672, "logps/chosen": -448.28387451171875, "logps/rejected": -454.29180908203125, "loss": 0.7442, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.2003862857818604, "rewards/margins": -0.02971675992012024, "rewards/rejected": 1.2301031351089478, "step": 3840 }, { "epoch": 0.17874553136171595, "grad_norm": 108.47954559326172, "learning_rate": 4.702632434189145e-07, "logits/chosen": -17.925378799438477, "logits/rejected": -17.51259994506836, "logps/chosen": -408.3045959472656, "logps/rejected": -302.29638671875, "loss": 0.5226, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.3848052024841309, "rewards/margins": 0.4346850514411926, "rewards/rejected": 
0.9501202702522278, "step": 3850 }, { "epoch": 0.179209805469149, "grad_norm": 32.44564437866211, "learning_rate": 4.70185864401009e-07, "logits/chosen": -18.177745819091797, "logits/rejected": -16.699777603149414, "logps/chosen": -528.8504638671875, "logps/rejected": -393.7583923339844, "loss": 0.5848, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.9913698434829712, "rewards/margins": 0.5008736848831177, "rewards/rejected": 1.490496039390564, "step": 3860 }, { "epoch": 0.17967407957658202, "grad_norm": 182.93663024902344, "learning_rate": 4.701084853831035e-07, "logits/chosen": -18.1217041015625, "logits/rejected": -17.708759307861328, "logps/chosen": -393.9952697753906, "logps/rejected": -310.07305908203125, "loss": 0.6468, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.5712308883666992, "rewards/margins": 0.38789257407188416, "rewards/rejected": 1.1833381652832031, "step": 3870 }, { "epoch": 0.18013835368401504, "grad_norm": 52.21270751953125, "learning_rate": 4.70031106365198e-07, "logits/chosen": -19.686832427978516, "logits/rejected": -18.610055923461914, "logps/chosen": -479.4810485839844, "logps/rejected": -335.39849853515625, "loss": 0.5765, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.6270498037338257, "rewards/margins": 0.4153978228569031, "rewards/rejected": 1.2116520404815674, "step": 3880 }, { "epoch": 0.18060262779144806, "grad_norm": 82.05316925048828, "learning_rate": 4.6995372734729247e-07, "logits/chosen": -17.442432403564453, "logits/rejected": -17.18829345703125, "logps/chosen": -320.6353454589844, "logps/rejected": -240.5731201171875, "loss": 0.623, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.0667622089385986, "rewards/margins": 0.2123284786939621, "rewards/rejected": 0.8544336557388306, "step": 3890 }, { "epoch": 0.1810669018988811, "grad_norm": 70.45767974853516, "learning_rate": 4.69876348329387e-07, "logits/chosen": -19.487468719482422, "logits/rejected": -18.035133361816406, 
"logps/chosen": -387.60723876953125, "logps/rejected": -230.6012420654297, "loss": 0.4744, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.6345449686050415, "rewards/margins": 0.7184900045394897, "rewards/rejected": 0.916054904460907, "step": 3900 }, { "epoch": 0.18153117600631413, "grad_norm": 49.819847106933594, "learning_rate": 4.697989693114815e-07, "logits/chosen": -17.705181121826172, "logits/rejected": -16.75979995727539, "logps/chosen": -441.6070861816406, "logps/rejected": -331.0271911621094, "loss": 0.5912, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.4418562650680542, "rewards/margins": 0.2834032475948334, "rewards/rejected": 1.1584529876708984, "step": 3910 }, { "epoch": 0.18199545011374715, "grad_norm": 55.76792526245117, "learning_rate": 4.69721590293576e-07, "logits/chosen": -17.59953498840332, "logits/rejected": -17.421607971191406, "logps/chosen": -353.39044189453125, "logps/rejected": -300.2573547363281, "loss": 0.5802, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.3137544393539429, "rewards/margins": 0.30690500140190125, "rewards/rejected": 1.0068494081497192, "step": 3920 }, { "epoch": 0.18245972422118018, "grad_norm": 53.280330657958984, "learning_rate": 4.6964421127567046e-07, "logits/chosen": -18.443037033081055, "logits/rejected": -17.902385711669922, "logps/chosen": -511.3060607910156, "logps/rejected": -378.9458312988281, "loss": 0.5305, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.7401478290557861, "rewards/margins": 0.4443696439266205, "rewards/rejected": 1.2957780361175537, "step": 3930 }, { "epoch": 0.18292399832861322, "grad_norm": 79.89124298095703, "learning_rate": 4.695668322577649e-07, "logits/chosen": -18.020709991455078, "logits/rejected": -17.950023651123047, "logps/chosen": -408.5404357910156, "logps/rejected": -369.91265869140625, "loss": 0.7398, "rewards/accuracies": 0.5, "rewards/chosen": 1.446136236190796, "rewards/margins": 0.13326077163219452, 
"rewards/rejected": 1.312875509262085, "step": 3940 }, { "epoch": 0.18338827243604625, "grad_norm": 58.11922073364258, "learning_rate": 4.6948945323985943e-07, "logits/chosen": -17.47012710571289, "logits/rejected": -17.370447158813477, "logps/chosen": -434.33343505859375, "logps/rejected": -355.34112548828125, "loss": 0.5757, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.6064417362213135, "rewards/margins": 0.3158486485481262, "rewards/rejected": 1.290592908859253, "step": 3950 }, { "epoch": 0.18385254654347927, "grad_norm": 171.87339782714844, "learning_rate": 4.6941207422195394e-07, "logits/chosen": -18.606685638427734, "logits/rejected": -17.99542236328125, "logps/chosen": -427.2875061035156, "logps/rejected": -384.00634765625, "loss": 0.7806, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.2849338054656982, "rewards/margins": -0.015301150269806385, "rewards/rejected": 1.3002350330352783, "step": 3960 }, { "epoch": 0.1843168206509123, "grad_norm": 36.32763671875, "learning_rate": 4.6933469520404846e-07, "logits/chosen": -18.692529678344727, "logits/rejected": -18.33968734741211, "logps/chosen": -370.7108459472656, "logps/rejected": -335.86151123046875, "loss": 0.7479, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.2945637702941895, "rewards/margins": 0.047291528433561325, "rewards/rejected": 1.2472723722457886, "step": 3970 }, { "epoch": 0.18478109475834534, "grad_norm": 30.543228149414062, "learning_rate": 4.6925731618614297e-07, "logits/chosen": -17.94345474243164, "logits/rejected": -16.47049903869629, "logps/chosen": -420.426025390625, "logps/rejected": -259.7183837890625, "loss": 0.4727, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.557975172996521, "rewards/margins": 0.5536224842071533, "rewards/rejected": 1.0043526887893677, "step": 3980 }, { "epoch": 0.18524536886577836, "grad_norm": 49.496063232421875, "learning_rate": 4.691799371682374e-07, "logits/chosen": -18.03294563293457, 
"logits/rejected": -17.35073471069336, "logps/chosen": -311.32464599609375, "logps/rejected": -273.27880859375, "loss": 0.6125, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.1233443021774292, "rewards/margins": 0.1943890005350113, "rewards/rejected": 0.9289552569389343, "step": 3990 }, { "epoch": 0.18570964297321138, "grad_norm": 109.68866729736328, "learning_rate": 4.6910255815033194e-07, "logits/chosen": -18.12917709350586, "logits/rejected": -18.082910537719727, "logps/chosen": -330.3515319824219, "logps/rejected": -331.01055908203125, "loss": 0.6611, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.226334810256958, "rewards/margins": 0.07694827020168304, "rewards/rejected": 1.1493866443634033, "step": 4000 }, { "epoch": 0.1861739170806444, "grad_norm": 108.5430908203125, "learning_rate": 4.6902517913242645e-07, "logits/chosen": -17.777484893798828, "logits/rejected": -17.187599182128906, "logps/chosen": -401.76666259765625, "logps/rejected": -304.9668884277344, "loss": 0.7036, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.2807613611221313, "rewards/margins": 0.1571863442659378, "rewards/rejected": 1.12357497215271, "step": 4010 }, { "epoch": 0.18663819118807745, "grad_norm": 69.83940887451172, "learning_rate": 4.6894780011452096e-07, "logits/chosen": -18.482616424560547, "logits/rejected": -16.94832992553711, "logps/chosen": -416.8028259277344, "logps/rejected": -271.61309814453125, "loss": 0.5546, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.52711820602417, "rewards/margins": 0.4719122052192688, "rewards/rejected": 1.0552059412002563, "step": 4020 }, { "epoch": 0.18710246529551047, "grad_norm": 188.88706970214844, "learning_rate": 4.688704210966154e-07, "logits/chosen": -19.09183692932129, "logits/rejected": -18.210010528564453, "logps/chosen": -405.99505615234375, "logps/rejected": -327.10418701171875, "loss": 0.5911, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.5166136026382446, 
"rewards/margins": 0.426596462726593, "rewards/rejected": 1.0900170803070068, "step": 4030 }, { "epoch": 0.1875667394029435, "grad_norm": 94.57896423339844, "learning_rate": 4.687930420787099e-07, "logits/chosen": -17.843233108520508, "logits/rejected": -16.938217163085938, "logps/chosen": -374.8287658691406, "logps/rejected": -270.11614990234375, "loss": 0.6361, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.3719937801361084, "rewards/margins": 0.3282713294029236, "rewards/rejected": 1.0437225103378296, "step": 4040 }, { "epoch": 0.18803101351037652, "grad_norm": 122.878173828125, "learning_rate": 4.687156630608044e-07, "logits/chosen": -18.183103561401367, "logits/rejected": -17.174972534179688, "logps/chosen": -452.23773193359375, "logps/rejected": -332.24615478515625, "loss": 0.5246, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.6481021642684937, "rewards/margins": 0.4582269787788391, "rewards/rejected": 1.1898752450942993, "step": 4050 }, { "epoch": 0.18849528761780956, "grad_norm": 18.828044891357422, "learning_rate": 4.686382840428989e-07, "logits/chosen": -18.24856185913086, "logits/rejected": -17.67582130432129, "logps/chosen": -363.8558349609375, "logps/rejected": -324.4768981933594, "loss": 0.6208, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.2200254201889038, "rewards/margins": 0.26451390981674194, "rewards/rejected": 0.9555114507675171, "step": 4060 }, { "epoch": 0.1889595617252426, "grad_norm": 155.39329528808594, "learning_rate": 4.685609050249934e-07, "logits/chosen": -18.38623809814453, "logits/rejected": -18.087434768676758, "logps/chosen": -442.97314453125, "logps/rejected": -446.1502380371094, "loss": 0.8939, "rewards/accuracies": 0.5, "rewards/chosen": 1.6875003576278687, "rewards/margins": -0.0659443587064743, "rewards/rejected": 1.7534449100494385, "step": 4070 }, { "epoch": 0.1894238358326756, "grad_norm": 57.28390121459961, "learning_rate": 4.684835260070879e-07, "logits/chosen": 
-18.712486267089844, "logits/rejected": -17.61859703063965, "logps/chosen": -409.83160400390625, "logps/rejected": -269.9794006347656, "loss": 0.4288, "rewards/accuracies": 1.0, "rewards/chosen": 1.8317511081695557, "rewards/margins": 0.7826265692710876, "rewards/rejected": 1.0491243600845337, "step": 4080 }, { "epoch": 0.18988810994010863, "grad_norm": 45.70771789550781, "learning_rate": 4.684061469891824e-07, "logits/chosen": -17.269420623779297, "logits/rejected": -17.37765884399414, "logps/chosen": -347.51593017578125, "logps/rejected": -271.87457275390625, "loss": 0.5789, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.4378468990325928, "rewards/margins": 0.3247109651565552, "rewards/rejected": 1.1131359338760376, "step": 4090 }, { "epoch": 0.19035238404754168, "grad_norm": 131.65121459960938, "learning_rate": 4.683287679712769e-07, "logits/chosen": -18.118322372436523, "logits/rejected": -17.08346939086914, "logps/chosen": -512.0721435546875, "logps/rejected": -303.66632080078125, "loss": 0.4531, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.7531503438949585, "rewards/margins": 0.7061388492584229, "rewards/rejected": 1.0470116138458252, "step": 4100 }, { "epoch": 0.1908166581549747, "grad_norm": 42.855621337890625, "learning_rate": 4.682513889533714e-07, "logits/chosen": -18.629215240478516, "logits/rejected": -18.012832641601562, "logps/chosen": -402.1927490234375, "logps/rejected": -311.99237060546875, "loss": 0.4996, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.5538431406021118, "rewards/margins": 0.5956282615661621, "rewards/rejected": 0.9582147598266602, "step": 4110 }, { "epoch": 0.19128093226240772, "grad_norm": 143.77777099609375, "learning_rate": 4.6817400993546586e-07, "logits/chosen": -18.39157485961914, "logits/rejected": -18.158527374267578, "logps/chosen": -439.42242431640625, "logps/rejected": -441.928466796875, "loss": 0.7559, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 
1.69026780128479, "rewards/margins": -0.03980419784784317, "rewards/rejected": 1.730072021484375, "step": 4120 }, { "epoch": 0.19174520636984074, "grad_norm": 77.46898651123047, "learning_rate": 4.6809663091756037e-07, "logits/chosen": -17.96923065185547, "logits/rejected": -18.216094970703125, "logps/chosen": -432.3955078125, "logps/rejected": -402.014404296875, "loss": 0.7663, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 1.3531064987182617, "rewards/margins": -0.10950107872486115, "rewards/rejected": 1.4626076221466064, "step": 4130 }, { "epoch": 0.1922094804772738, "grad_norm": 167.8472442626953, "learning_rate": 4.6801925189965483e-07, "logits/chosen": -18.440471649169922, "logits/rejected": -17.336549758911133, "logps/chosen": -461.65093994140625, "logps/rejected": -360.48736572265625, "loss": 0.7472, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.5742137432098389, "rewards/margins": 0.10815200954675674, "rewards/rejected": 1.4660618305206299, "step": 4140 }, { "epoch": 0.1926737545847068, "grad_norm": 74.50725555419922, "learning_rate": 4.6794187288174934e-07, "logits/chosen": -17.711349487304688, "logits/rejected": -17.81320571899414, "logps/chosen": -435.49810791015625, "logps/rejected": -466.50347900390625, "loss": 0.7289, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.6867249011993408, "rewards/margins": 0.13453730940818787, "rewards/rejected": 1.552187442779541, "step": 4150 }, { "epoch": 0.19313802869213983, "grad_norm": 79.59708404541016, "learning_rate": 4.6786449386384385e-07, "logits/chosen": -18.435596466064453, "logits/rejected": -17.175678253173828, "logps/chosen": -389.0538024902344, "logps/rejected": -186.49903869628906, "loss": 0.4765, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.4899966716766357, "rewards/margins": 0.6885544061660767, "rewards/rejected": 0.8014421463012695, "step": 4160 }, { "epoch": 0.19360230279957286, "grad_norm": 58.3900260925293, "learning_rate": 
4.6778711484593836e-07, "logits/chosen": -19.110172271728516, "logits/rejected": -17.656600952148438, "logps/chosen": -478.58367919921875, "logps/rejected": -341.66046142578125, "loss": 0.5036, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.6101303100585938, "rewards/margins": 0.593682587146759, "rewards/rejected": 1.0164477825164795, "step": 4170 }, { "epoch": 0.1940665769070059, "grad_norm": 68.9750747680664, "learning_rate": 4.677097358280329e-07, "logits/chosen": -17.731985092163086, "logits/rejected": -16.91730308532715, "logps/chosen": -361.27362060546875, "logps/rejected": -279.7144775390625, "loss": 0.6021, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.3857018947601318, "rewards/margins": 0.5654454231262207, "rewards/rejected": 0.8202563524246216, "step": 4180 }, { "epoch": 0.19453085101443893, "grad_norm": 51.742103576660156, "learning_rate": 4.6763235681012733e-07, "logits/chosen": -18.29513168334961, "logits/rejected": -17.770572662353516, "logps/chosen": -338.5625915527344, "logps/rejected": -318.28790283203125, "loss": 0.672, "rewards/accuracies": 0.5, "rewards/chosen": 1.2242618799209595, "rewards/margins": 0.12406182289123535, "rewards/rejected": 1.1002000570297241, "step": 4190 }, { "epoch": 0.19499512512187195, "grad_norm": 66.64263916015625, "learning_rate": 4.6755497779222185e-07, "logits/chosen": -18.283552169799805, "logits/rejected": -17.089004516601562, "logps/chosen": -393.09893798828125, "logps/rejected": -294.9784851074219, "loss": 0.6128, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.5623815059661865, "rewards/margins": 0.22655534744262695, "rewards/rejected": 1.3358261585235596, "step": 4200 }, { "epoch": 0.19545939922930497, "grad_norm": 26.216205596923828, "learning_rate": 4.6747759877431636e-07, "logits/chosen": -17.876028060913086, "logits/rejected": -17.791404724121094, "logps/chosen": -448.3981018066406, "logps/rejected": -399.08331298828125, "loss": 0.7098, "rewards/accuracies": 
0.699999988079071, "rewards/chosen": 1.8670222759246826, "rewards/margins": 0.0642317682504654, "rewards/rejected": 1.802790641784668, "step": 4210 }, { "epoch": 0.19592367333673802, "grad_norm": 29.34895133972168, "learning_rate": 4.674002197564108e-07, "logits/chosen": -17.663347244262695, "logits/rejected": -16.79910659790039, "logps/chosen": -365.96942138671875, "logps/rejected": -330.1405334472656, "loss": 0.7241, "rewards/accuracies": 0.5, "rewards/chosen": 1.3008511066436768, "rewards/margins": 0.1308627426624298, "rewards/rejected": 1.1699883937835693, "step": 4220 }, { "epoch": 0.19638794744417104, "grad_norm": 60.93562316894531, "learning_rate": 4.673228407385053e-07, "logits/chosen": -17.80573844909668, "logits/rejected": -16.862470626831055, "logps/chosen": -470.5894470214844, "logps/rejected": -363.9247741699219, "loss": 0.617, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.7191417217254639, "rewards/margins": 0.2707247734069824, "rewards/rejected": 1.4484169483184814, "step": 4230 }, { "epoch": 0.19685222155160406, "grad_norm": 35.901893615722656, "learning_rate": 4.672454617205998e-07, "logits/chosen": -18.26525115966797, "logits/rejected": -17.26695442199707, "logps/chosen": -451.2216796875, "logps/rejected": -264.76715087890625, "loss": 0.3488, "rewards/accuracies": 1.0, "rewards/chosen": 1.9533954858779907, "rewards/margins": 0.9449766278266907, "rewards/rejected": 1.0084187984466553, "step": 4240 }, { "epoch": 0.19731649565903708, "grad_norm": 56.943511962890625, "learning_rate": 4.671680827026943e-07, "logits/chosen": -18.614299774169922, "logits/rejected": -17.064090728759766, "logps/chosen": -338.31268310546875, "logps/rejected": -210.78134155273438, "loss": 0.4938, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.3441187143325806, "rewards/margins": 0.565830647945404, "rewards/rejected": 0.7782881259918213, "step": 4250 }, { "epoch": 0.19778076976647013, "grad_norm": 152.8206787109375, "learning_rate": 
4.670907036847888e-07, "logits/chosen": -18.258047103881836, "logits/rejected": -16.594768524169922, "logps/chosen": -508.63726806640625, "logps/rejected": -400.67156982421875, "loss": 0.4916, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.7108020782470703, "rewards/margins": 0.5890750885009766, "rewards/rejected": 1.1217269897460938, "step": 4260 }, { "epoch": 0.19824504387390315, "grad_norm": 59.17912292480469, "learning_rate": 4.670133246668833e-07, "logits/chosen": -18.37420654296875, "logits/rejected": -18.18340492248535, "logps/chosen": -463.2144470214844, "logps/rejected": -418.3353576660156, "loss": 0.7093, "rewards/accuracies": 0.5, "rewards/chosen": 1.382204294204712, "rewards/margins": 0.08840261399745941, "rewards/rejected": 1.2938016653060913, "step": 4270 }, { "epoch": 0.19870931798133618, "grad_norm": 128.36749267578125, "learning_rate": 4.6693594564897783e-07, "logits/chosen": -18.400455474853516, "logits/rejected": -17.367616653442383, "logps/chosen": -391.3468322753906, "logps/rejected": -359.172119140625, "loss": 0.6871, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.364377737045288, "rewards/margins": 0.09811781346797943, "rewards/rejected": 1.2662599086761475, "step": 4280 }, { "epoch": 0.1991735920887692, "grad_norm": 98.89166259765625, "learning_rate": 4.6685856663107234e-07, "logits/chosen": -18.64812660217285, "logits/rejected": -17.744626998901367, "logps/chosen": -471.9615783691406, "logps/rejected": -391.45587158203125, "loss": 0.4873, "rewards/accuracies": 1.0, "rewards/chosen": 1.7022454738616943, "rewards/margins": 0.5009161233901978, "rewards/rejected": 1.2013293504714966, "step": 4290 }, { "epoch": 0.19963786619620225, "grad_norm": 142.7233428955078, "learning_rate": 4.667811876131668e-07, "logits/chosen": -18.009471893310547, "logits/rejected": -17.3121395111084, "logps/chosen": -368.3059997558594, "logps/rejected": -329.2835998535156, "loss": 0.7312, "rewards/accuracies": 0.6000000238418579, 
"rewards/chosen": 1.3776888847351074, "rewards/margins": 0.0070635974407196045, "rewards/rejected": 1.3706252574920654, "step": 4300 }, { "epoch": 0.20010214030363527, "grad_norm": 80.93892669677734, "learning_rate": 4.6670380859526126e-07, "logits/chosen": -18.426921844482422, "logits/rejected": -18.776636123657227, "logps/chosen": -419.10546875, "logps/rejected": -387.20660400390625, "loss": 0.6622, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.5871984958648682, "rewards/margins": 0.12799592316150665, "rewards/rejected": 1.459202527999878, "step": 4310 }, { "epoch": 0.2005664144110683, "grad_norm": 10.206558227539062, "learning_rate": 4.6662642957735577e-07, "logits/chosen": -18.263229370117188, "logits/rejected": -17.504497528076172, "logps/chosen": -403.3656311035156, "logps/rejected": -306.21014404296875, "loss": 0.62, "rewards/accuracies": 0.5, "rewards/chosen": 1.5238804817199707, "rewards/margins": 0.4249155521392822, "rewards/rejected": 1.098965048789978, "step": 4320 }, { "epoch": 0.2010306885185013, "grad_norm": 147.74452209472656, "learning_rate": 4.665490505594503e-07, "logits/chosen": -18.089860916137695, "logits/rejected": -18.081525802612305, "logps/chosen": -390.1252136230469, "logps/rejected": -311.6717834472656, "loss": 0.7064, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.498626947402954, "rewards/margins": 0.19171276688575745, "rewards/rejected": 1.3069143295288086, "step": 4330 }, { "epoch": 0.20149496262593436, "grad_norm": 66.97966766357422, "learning_rate": 4.6647167154154474e-07, "logits/chosen": -18.64388656616211, "logits/rejected": -17.4147891998291, "logps/chosen": -446.9100646972656, "logps/rejected": -292.5475158691406, "loss": 0.4712, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.8892319202423096, "rewards/margins": 0.8832440376281738, "rewards/rejected": 1.0059878826141357, "step": 4340 }, { "epoch": 0.20195923673336738, "grad_norm": 44.133544921875, "learning_rate": 
4.6639429252363925e-07, "logits/chosen": -17.631399154663086, "logits/rejected": -18.104408264160156, "logps/chosen": -353.74798583984375, "logps/rejected": -376.6401672363281, "loss": 0.8496, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.4952573776245117, "rewards/margins": -0.180806964635849, "rewards/rejected": 1.6760642528533936, "step": 4350 }, { "epoch": 0.2024235108408004, "grad_norm": 141.52247619628906, "learning_rate": 4.6631691350573376e-07, "logits/chosen": -19.156036376953125, "logits/rejected": -18.221134185791016, "logps/chosen": -433.46112060546875, "logps/rejected": -374.94097900390625, "loss": 0.545, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.8862760066986084, "rewards/margins": 0.47548753023147583, "rewards/rejected": 1.4107885360717773, "step": 4360 }, { "epoch": 0.20288778494823342, "grad_norm": 107.00098419189453, "learning_rate": 4.6623953448782827e-07, "logits/chosen": -18.282630920410156, "logits/rejected": -17.959121704101562, "logps/chosen": -439.465576171875, "logps/rejected": -359.3810729980469, "loss": 0.6572, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.3904751539230347, "rewards/margins": 0.09784364700317383, "rewards/rejected": 1.2926315069198608, "step": 4370 }, { "epoch": 0.20335205905566647, "grad_norm": 174.7378692626953, "learning_rate": 4.661621554699228e-07, "logits/chosen": -17.777921676635742, "logits/rejected": -18.283905029296875, "logps/chosen": -337.73199462890625, "logps/rejected": -381.161376953125, "loss": 0.7617, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.531301498413086, "rewards/margins": 0.0630510002374649, "rewards/rejected": 1.4682505130767822, "step": 4380 }, { "epoch": 0.2038163331630995, "grad_norm": 120.7710189819336, "learning_rate": 4.660847764520173e-07, "logits/chosen": -17.637592315673828, "logits/rejected": -17.42841911315918, "logps/chosen": -253.58285522460938, "logps/rejected": -299.0921630859375, "loss": 0.7906, 
"rewards/accuracies": 0.5, "rewards/chosen": 1.290184736251831, "rewards/margins": -0.05798138305544853, "rewards/rejected": 1.3481662273406982, "step": 4390 }, { "epoch": 0.20428060727053252, "grad_norm": 49.757102966308594, "learning_rate": 4.6600739743411175e-07, "logits/chosen": -18.325284957885742, "logits/rejected": -16.73040771484375, "logps/chosen": -458.7469177246094, "logps/rejected": -289.9369812011719, "loss": 0.5611, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.5221134424209595, "rewards/margins": 0.5132028460502625, "rewards/rejected": 1.0089106559753418, "step": 4400 }, { "epoch": 0.20474488137796554, "grad_norm": 72.47161865234375, "learning_rate": 4.659300184162062e-07, "logits/chosen": -18.161224365234375, "logits/rejected": -18.321842193603516, "logps/chosen": -437.2955017089844, "logps/rejected": -434.21246337890625, "loss": 0.7364, "rewards/accuracies": 0.5, "rewards/chosen": 1.6898918151855469, "rewards/margins": 0.0007119894144125283, "rewards/rejected": 1.6891796588897705, "step": 4410 }, { "epoch": 0.2052091554853986, "grad_norm": 59.807621002197266, "learning_rate": 4.658526393983007e-07, "logits/chosen": -19.029855728149414, "logits/rejected": -18.397459030151367, "logps/chosen": -384.41888427734375, "logps/rejected": -332.7171325683594, "loss": 0.6475, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.5441699028015137, "rewards/margins": 0.31918853521347046, "rewards/rejected": 1.224981427192688, "step": 4420 }, { "epoch": 0.2056734295928316, "grad_norm": 36.643096923828125, "learning_rate": 4.6577526038039523e-07, "logits/chosen": -16.55176544189453, "logits/rejected": -16.772815704345703, "logps/chosen": -369.1988220214844, "logps/rejected": -335.60479736328125, "loss": 0.7438, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.2799893617630005, "rewards/margins": 0.1495772749185562, "rewards/rejected": 1.1304118633270264, "step": 4430 }, { "epoch": 0.20613770370026463, "grad_norm": 
38.79530715942383, "learning_rate": 4.6569788136248975e-07, "logits/chosen": -18.11446189880371, "logits/rejected": -16.936294555664062, "logps/chosen": -426.8890686035156, "logps/rejected": -325.90478515625, "loss": 0.6675, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.633779525756836, "rewards/margins": 0.2753852605819702, "rewards/rejected": 1.3583943843841553, "step": 4440 }, { "epoch": 0.20660197780769765, "grad_norm": 20.629005432128906, "learning_rate": 4.656205023445842e-07, "logits/chosen": -16.957372665405273, "logits/rejected": -16.835941314697266, "logps/chosen": -348.2710876464844, "logps/rejected": -350.0916442871094, "loss": 0.6764, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.1845160722732544, "rewards/margins": 0.060919396579265594, "rewards/rejected": 1.1235966682434082, "step": 4450 }, { "epoch": 0.2070662519151307, "grad_norm": 68.53092956542969, "learning_rate": 4.655431233266787e-07, "logits/chosen": -18.50528907775879, "logits/rejected": -17.498945236206055, "logps/chosen": -510.90020751953125, "logps/rejected": -408.2789001464844, "loss": 0.6078, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.8055994510650635, "rewards/margins": 0.37133604288101196, "rewards/rejected": 1.4342634677886963, "step": 4460 }, { "epoch": 0.20753052602256372, "grad_norm": 110.2800521850586, "learning_rate": 4.6546574430877323e-07, "logits/chosen": -18.77776336669922, "logits/rejected": -18.44654083251953, "logps/chosen": -377.30621337890625, "logps/rejected": -381.43341064453125, "loss": 0.79, "rewards/accuracies": 0.5, "rewards/chosen": 1.3233325481414795, "rewards/margins": -0.08454938977956772, "rewards/rejected": 1.4078819751739502, "step": 4470 }, { "epoch": 0.20799480012999674, "grad_norm": 98.3133773803711, "learning_rate": 4.6538836529086774e-07, "logits/chosen": -17.429224014282227, "logits/rejected": -16.791698455810547, "logps/chosen": -401.68182373046875, "logps/rejected": -395.9087829589844, "loss": 0.7111, 
"rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.4775222539901733, "rewards/margins": 0.10872182995080948, "rewards/rejected": 1.3688005208969116, "step": 4480 }, { "epoch": 0.2084590742374298, "grad_norm": 43.91984176635742, "learning_rate": 4.6531098627296225e-07, "logits/chosen": -17.683691024780273, "logits/rejected": -17.490135192871094, "logps/chosen": -421.64837646484375, "logps/rejected": -370.30908203125, "loss": 0.6713, "rewards/accuracies": 0.5, "rewards/chosen": 1.6813796758651733, "rewards/margins": 0.273369699716568, "rewards/rejected": 1.4080098867416382, "step": 4490 }, { "epoch": 0.2089233483448628, "grad_norm": 38.39341735839844, "learning_rate": 4.6523360725505665e-07, "logits/chosen": -19.104106903076172, "logits/rejected": -18.182249069213867, "logps/chosen": -428.040771484375, "logps/rejected": -319.9293212890625, "loss": 0.563, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.6220157146453857, "rewards/margins": 0.3603651821613312, "rewards/rejected": 1.2616504430770874, "step": 4500 }, { "epoch": 0.20938762245229584, "grad_norm": 49.26118850708008, "learning_rate": 4.6515622823715117e-07, "logits/chosen": -17.05706024169922, "logits/rejected": -16.304798126220703, "logps/chosen": -409.52386474609375, "logps/rejected": -292.6301574707031, "loss": 0.7243, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.4590984582901, "rewards/margins": 0.2452671080827713, "rewards/rejected": 1.2138314247131348, "step": 4510 }, { "epoch": 0.20985189655972886, "grad_norm": 75.4645767211914, "learning_rate": 4.650788492192457e-07, "logits/chosen": -17.135425567626953, "logits/rejected": -18.43082618713379, "logps/chosen": -308.57452392578125, "logps/rejected": -384.5393981933594, "loss": 0.7672, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.2770746946334839, "rewards/margins": -0.08067499101161957, "rewards/rejected": 1.3577497005462646, "step": 4520 }, { "epoch": 0.2103161706671619, "grad_norm": 
35.76875305175781, "learning_rate": 4.650014702013402e-07, "logits/chosen": -18.947925567626953, "logits/rejected": -17.534204483032227, "logps/chosen": -572.2930297851562, "logps/rejected": -485.43646240234375, "loss": 0.5305, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.8485727310180664, "rewards/margins": 0.41333383321762085, "rewards/rejected": 1.4352388381958008, "step": 4530 }, { "epoch": 0.21078044477459493, "grad_norm": 126.75531768798828, "learning_rate": 4.649240911834347e-07, "logits/chosen": -19.152938842773438, "logits/rejected": -18.039812088012695, "logps/chosen": -454.1531677246094, "logps/rejected": -352.163330078125, "loss": 0.599, "rewards/accuracies": 0.5, "rewards/chosen": 1.5289831161499023, "rewards/margins": 0.3233518600463867, "rewards/rejected": 1.2056313753128052, "step": 4540 }, { "epoch": 0.21124471888202795, "grad_norm": 91.35786437988281, "learning_rate": 4.6484671216552916e-07, "logits/chosen": -18.618452072143555, "logits/rejected": -18.126550674438477, "logps/chosen": -443.95147705078125, "logps/rejected": -355.10357666015625, "loss": 0.6262, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.4653618335723877, "rewards/margins": 0.18137893080711365, "rewards/rejected": 1.2839829921722412, "step": 4550 }, { "epoch": 0.21170899298946097, "grad_norm": 65.30717468261719, "learning_rate": 4.6476933314762367e-07, "logits/chosen": -17.964984893798828, "logits/rejected": -16.32454490661621, "logps/chosen": -389.29803466796875, "logps/rejected": -207.5088653564453, "loss": 0.4272, "rewards/accuracies": 1.0, "rewards/chosen": 1.666637659072876, "rewards/margins": 0.6560200452804565, "rewards/rejected": 1.0106176137924194, "step": 4560 }, { "epoch": 0.21217326709689402, "grad_norm": 54.82585144042969, "learning_rate": 4.646919541297182e-07, "logits/chosen": -18.251659393310547, "logits/rejected": -17.81031036376953, "logps/chosen": -511.16473388671875, "logps/rejected": -471.3912658691406, "loss": 0.8984, 
"rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.5768699645996094, "rewards/margins": -0.25314927101135254, "rewards/rejected": 1.8300193548202515, "step": 4570 }, { "epoch": 0.21263754120432704, "grad_norm": 67.68697357177734, "learning_rate": 4.646145751118127e-07, "logits/chosen": -19.531951904296875, "logits/rejected": -18.026996612548828, "logps/chosen": -418.48095703125, "logps/rejected": -263.0316162109375, "loss": 0.4726, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.5889145135879517, "rewards/margins": 0.6452512145042419, "rewards/rejected": 0.9436632394790649, "step": 4580 }, { "epoch": 0.21310181531176006, "grad_norm": 80.94912719726562, "learning_rate": 4.645371960939072e-07, "logits/chosen": -17.96477699279785, "logits/rejected": -17.425025939941406, "logps/chosen": -416.5452575683594, "logps/rejected": -339.2123107910156, "loss": 0.7274, "rewards/accuracies": 0.5, "rewards/chosen": 1.695600152015686, "rewards/margins": 0.1336624026298523, "rewards/rejected": 1.5619375705718994, "step": 4590 }, { "epoch": 0.21356608941919308, "grad_norm": 49.406219482421875, "learning_rate": 4.644598170760016e-07, "logits/chosen": -18.09554100036621, "logits/rejected": -18.37337303161621, "logps/chosen": -438.7906799316406, "logps/rejected": -332.808837890625, "loss": 0.5474, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.892675757408142, "rewards/margins": 0.4719913601875305, "rewards/rejected": 1.4206844568252563, "step": 4600 }, { "epoch": 0.21403036352662613, "grad_norm": 47.886558532714844, "learning_rate": 4.643824380580961e-07, "logits/chosen": -18.012182235717773, "logits/rejected": -16.680694580078125, "logps/chosen": -415.61553955078125, "logps/rejected": -252.8356475830078, "loss": 0.4845, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.3661479949951172, "rewards/margins": 0.6188223361968994, "rewards/rejected": 0.7473257184028625, "step": 4610 }, { "epoch": 0.21449463763405915, "grad_norm": 
200.889404296875, "learning_rate": 4.6430505904019063e-07, "logits/chosen": -19.512948989868164, "logits/rejected": -19.309587478637695, "logps/chosen": -386.3429870605469, "logps/rejected": -392.42108154296875, "loss": 0.857, "rewards/accuracies": 0.5, "rewards/chosen": 1.3186098337173462, "rewards/margins": -0.07306680828332901, "rewards/rejected": 1.391676664352417, "step": 4620 }, { "epoch": 0.21495891174149218, "grad_norm": 34.7933235168457, "learning_rate": 4.6422768002228514e-07, "logits/chosen": -18.038820266723633, "logits/rejected": -17.799877166748047, "logps/chosen": -428.5391540527344, "logps/rejected": -410.74078369140625, "loss": 0.7508, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.7495784759521484, "rewards/margins": 0.050707004964351654, "rewards/rejected": 1.698871374130249, "step": 4630 }, { "epoch": 0.2154231858489252, "grad_norm": 44.72637176513672, "learning_rate": 4.6415030100437965e-07, "logits/chosen": -17.906494140625, "logits/rejected": -17.366958618164062, "logps/chosen": -417.99853515625, "logps/rejected": -324.0484619140625, "loss": 0.5972, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.722273826599121, "rewards/margins": 0.3665081560611725, "rewards/rejected": 1.3557658195495605, "step": 4640 }, { "epoch": 0.21588745995635825, "grad_norm": 47.05518341064453, "learning_rate": 4.640729219864741e-07, "logits/chosen": -18.04171371459961, "logits/rejected": -17.209638595581055, "logps/chosen": -338.06585693359375, "logps/rejected": -286.32342529296875, "loss": 0.6783, "rewards/accuracies": 0.5, "rewards/chosen": 1.5012547969818115, "rewards/margins": 0.13839925825595856, "rewards/rejected": 1.3628554344177246, "step": 4650 }, { "epoch": 0.21635173406379127, "grad_norm": 50.11227035522461, "learning_rate": 4.639955429685686e-07, "logits/chosen": -17.669326782226562, "logits/rejected": -16.390453338623047, "logps/chosen": -383.3667297363281, "logps/rejected": -225.4928436279297, "loss": 0.4815, 
"rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.8287580013275146, "rewards/margins": 0.6587287783622742, "rewards/rejected": 1.1700292825698853, "step": 4660 }, { "epoch": 0.2168160081712243, "grad_norm": 34.207611083984375, "learning_rate": 4.6391816395066313e-07, "logits/chosen": -18.406047821044922, "logits/rejected": -17.489727020263672, "logps/chosen": -411.7925720214844, "logps/rejected": -349.36602783203125, "loss": 0.6578, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.6445238590240479, "rewards/margins": 0.2786913812160492, "rewards/rejected": 1.3658324480056763, "step": 4670 }, { "epoch": 0.2172802822786573, "grad_norm": 83.94088745117188, "learning_rate": 4.6384078493275765e-07, "logits/chosen": -18.46308135986328, "logits/rejected": -17.378681182861328, "logps/chosen": -542.3109741210938, "logps/rejected": -402.4685363769531, "loss": 0.5162, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.2352442741394043, "rewards/margins": 0.6913009285926819, "rewards/rejected": 1.5439434051513672, "step": 4680 }, { "epoch": 0.21774455638609036, "grad_norm": 47.36737060546875, "learning_rate": 4.6376340591485216e-07, "logits/chosen": -18.417743682861328, "logits/rejected": -17.06045913696289, "logps/chosen": -371.5094909667969, "logps/rejected": -212.34518432617188, "loss": 0.4348, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.6505123376846313, "rewards/margins": 0.7807965278625488, "rewards/rejected": 0.869715690612793, "step": 4690 }, { "epoch": 0.21820883049352338, "grad_norm": 47.10081100463867, "learning_rate": 4.6368602689694656e-07, "logits/chosen": -18.698833465576172, "logits/rejected": -17.502105712890625, "logps/chosen": -433.02032470703125, "logps/rejected": -370.5360412597656, "loss": 0.658, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.5162057876586914, "rewards/margins": 0.12270765006542206, "rewards/rejected": 1.3934983015060425, "step": 4700 }, { "epoch": 0.2186731046009564, 
"grad_norm": 42.69125747680664, "learning_rate": 4.636086478790411e-07, "logits/chosen": -17.583406448364258, "logits/rejected": -16.268632888793945, "logps/chosen": -450.447265625, "logps/rejected": -225.8984832763672, "loss": 0.5662, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.8609578609466553, "rewards/margins": 0.6652365922927856, "rewards/rejected": 1.1957213878631592, "step": 4710 }, { "epoch": 0.21913737870838942, "grad_norm": 70.40335083007812, "learning_rate": 4.635312688611356e-07, "logits/chosen": -18.60409164428711, "logits/rejected": -17.85286521911621, "logps/chosen": -443.1361389160156, "logps/rejected": -341.2130126953125, "loss": 0.5786, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.6902110576629639, "rewards/margins": 0.34265145659446716, "rewards/rejected": 1.3475596904754639, "step": 4720 }, { "epoch": 0.21960165281582247, "grad_norm": 69.84729766845703, "learning_rate": 4.634538898432301e-07, "logits/chosen": -18.419029235839844, "logits/rejected": -17.138538360595703, "logps/chosen": -320.8277587890625, "logps/rejected": -238.0135498046875, "loss": 0.5556, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.4081792831420898, "rewards/margins": 0.32797008752822876, "rewards/rejected": 1.0802090167999268, "step": 4730 }, { "epoch": 0.2200659269232555, "grad_norm": 159.96942138671875, "learning_rate": 4.633765108253246e-07, "logits/chosen": -18.107707977294922, "logits/rejected": -16.987850189208984, "logps/chosen": -435.19500732421875, "logps/rejected": -337.7222595214844, "loss": 0.7235, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.6655333042144775, "rewards/margins": 0.1277841031551361, "rewards/rejected": 1.5377490520477295, "step": 4740 }, { "epoch": 0.22053020103068852, "grad_norm": 53.466346740722656, "learning_rate": 4.6329913180741907e-07, "logits/chosen": -17.215314865112305, "logits/rejected": -16.713064193725586, "logps/chosen": -362.88421630859375, "logps/rejected": 
-292.05072021484375, "loss": 0.6387, "rewards/accuracies": 0.5, "rewards/chosen": 1.5904099941253662, "rewards/margins": 0.21434862911701202, "rewards/rejected": 1.376061201095581, "step": 4750 }, { "epoch": 0.22099447513812154, "grad_norm": 43.00557327270508, "learning_rate": 4.632217527895136e-07, "logits/chosen": -17.833770751953125, "logits/rejected": -17.282989501953125, "logps/chosen": -376.6791076660156, "logps/rejected": -334.18829345703125, "loss": 0.6669, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.4081733226776123, "rewards/margins": 0.2550934851169586, "rewards/rejected": 1.1530797481536865, "step": 4760 }, { "epoch": 0.2214587492455546, "grad_norm": 89.79194641113281, "learning_rate": 4.631443737716081e-07, "logits/chosen": -18.197998046875, "logits/rejected": -17.562734603881836, "logps/chosen": -389.25555419921875, "logps/rejected": -342.40081787109375, "loss": 0.7147, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.6687602996826172, "rewards/margins": 0.2123112678527832, "rewards/rejected": 1.4564487934112549, "step": 4770 }, { "epoch": 0.2219230233529876, "grad_norm": 34.37210464477539, "learning_rate": 4.630669947537026e-07, "logits/chosen": -19.24091911315918, "logits/rejected": -18.333782196044922, "logps/chosen": -441.08642578125, "logps/rejected": -398.6493225097656, "loss": 0.6569, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.8206818103790283, "rewards/margins": 0.2633875012397766, "rewards/rejected": 1.557294249534607, "step": 4780 }, { "epoch": 0.22238729746042063, "grad_norm": 94.26690673828125, "learning_rate": 4.6298961573579706e-07, "logits/chosen": -19.11067008972168, "logits/rejected": -17.782520294189453, "logps/chosen": -441.296875, "logps/rejected": -338.441162109375, "loss": 0.5827, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.9435384273529053, "rewards/margins": 0.4604858458042145, "rewards/rejected": 1.4830528497695923, "step": 4790 }, { "epoch": 
0.22285157156785365, "grad_norm": 85.45944213867188, "learning_rate": 4.629122367178915e-07, "logits/chosen": -18.358617782592773, "logits/rejected": -17.827226638793945, "logps/chosen": -382.77154541015625, "logps/rejected": -306.2645263671875, "loss": 0.6529, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.638606309890747, "rewards/margins": 0.16531096398830414, "rewards/rejected": 1.4732953310012817, "step": 4800 }, { "epoch": 0.2233158456752867, "grad_norm": 91.3778076171875, "learning_rate": 4.6283485769998603e-07, "logits/chosen": -18.149812698364258, "logits/rejected": -18.035375595092773, "logps/chosen": -334.4219055175781, "logps/rejected": -313.67803955078125, "loss": 0.7812, "rewards/accuracies": 0.5, "rewards/chosen": 1.2577602863311768, "rewards/margins": -0.06421063095331192, "rewards/rejected": 1.321970820426941, "step": 4810 }, { "epoch": 0.22378011978271972, "grad_norm": 131.90086364746094, "learning_rate": 4.6275747868208054e-07, "logits/chosen": -18.053808212280273, "logits/rejected": -17.445396423339844, "logps/chosen": -401.19635009765625, "logps/rejected": -294.8194274902344, "loss": 0.6203, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.4412728548049927, "rewards/margins": 0.2972085773944855, "rewards/rejected": 1.1440640687942505, "step": 4820 }, { "epoch": 0.22424439389015274, "grad_norm": 43.06757354736328, "learning_rate": 4.6268009966417505e-07, "logits/chosen": -18.003812789916992, "logits/rejected": -17.64413833618164, "logps/chosen": -358.6597595214844, "logps/rejected": -324.0472717285156, "loss": 0.7647, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.2729651927947998, "rewards/margins": -0.07199005782604218, "rewards/rejected": 1.3449552059173584, "step": 4830 }, { "epoch": 0.22470866799758576, "grad_norm": 109.43077850341797, "learning_rate": 4.6260272064626956e-07, "logits/chosen": -17.991254806518555, "logits/rejected": -18.035533905029297, "logps/chosen": -447.583984375, 
"logps/rejected": -399.63690185546875, "loss": 0.7048, "rewards/accuracies": 0.5, "rewards/chosen": 1.5067285299301147, "rewards/margins": 0.03776029869914055, "rewards/rejected": 1.4689682722091675, "step": 4840 }, { "epoch": 0.22517294210501881, "grad_norm": 71.49117279052734, "learning_rate": 4.62525341628364e-07, "logits/chosen": -17.970111846923828, "logits/rejected": -16.759220123291016, "logps/chosen": -455.04437255859375, "logps/rejected": -289.2658386230469, "loss": 0.5657, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.5078294277191162, "rewards/margins": 0.3426315188407898, "rewards/rejected": 1.1651978492736816, "step": 4850 }, { "epoch": 0.22563721621245184, "grad_norm": 75.2791519165039, "learning_rate": 4.6244796261045853e-07, "logits/chosen": -18.281923294067383, "logits/rejected": -17.3486385345459, "logps/chosen": -364.51983642578125, "logps/rejected": -309.6150817871094, "loss": 0.5763, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.8062067031860352, "rewards/margins": 0.6096499562263489, "rewards/rejected": 1.1965569257736206, "step": 4860 }, { "epoch": 0.22610149031988486, "grad_norm": 92.34265899658203, "learning_rate": 4.6237058359255304e-07, "logits/chosen": -17.54216766357422, "logits/rejected": -17.605144500732422, "logps/chosen": -411.1690368652344, "logps/rejected": -433.23687744140625, "loss": 0.7311, "rewards/accuracies": 0.5, "rewards/chosen": 1.6686131954193115, "rewards/margins": 0.019073331728577614, "rewards/rejected": 1.6495399475097656, "step": 4870 }, { "epoch": 0.22656576442731788, "grad_norm": 41.4050407409668, "learning_rate": 4.6229320457464755e-07, "logits/chosen": -16.922191619873047, "logits/rejected": -16.711938858032227, "logps/chosen": -249.0240478515625, "logps/rejected": -273.1535339355469, "loss": 0.8037, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.0572280883789062, "rewards/margins": -0.07904218882322311, "rewards/rejected": 1.13627028465271, "step": 4880 }, { 
"epoch": 0.22703003853475093, "grad_norm": 18.365074157714844, "learning_rate": 4.62215825556742e-07, "logits/chosen": -17.662670135498047, "logits/rejected": -17.393230438232422, "logps/chosen": -402.0992126464844, "logps/rejected": -380.5296630859375, "loss": 0.6458, "rewards/accuracies": 0.5, "rewards/chosen": 1.7116687297821045, "rewards/margins": 0.3231555223464966, "rewards/rejected": 1.3885133266448975, "step": 4890 }, { "epoch": 0.22749431264218395, "grad_norm": 135.93142700195312, "learning_rate": 4.6213844653883647e-07, "logits/chosen": -18.433719635009766, "logits/rejected": -17.492305755615234, "logps/chosen": -495.2227478027344, "logps/rejected": -396.45294189453125, "loss": 0.5681, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.8340994119644165, "rewards/margins": 0.370126873254776, "rewards/rejected": 1.4639724493026733, "step": 4900 }, { "epoch": 0.22795858674961697, "grad_norm": 41.95094680786133, "learning_rate": 4.62061067520931e-07, "logits/chosen": -17.839160919189453, "logits/rejected": -17.128284454345703, "logps/chosen": -367.287841796875, "logps/rejected": -319.8685302734375, "loss": 0.5877, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.5806255340576172, "rewards/margins": 0.3868781626224518, "rewards/rejected": 1.1937475204467773, "step": 4910 }, { "epoch": 0.22842286085705, "grad_norm": 64.33589935302734, "learning_rate": 4.619836885030255e-07, "logits/chosen": -18.625102996826172, "logits/rejected": -17.41362762451172, "logps/chosen": -452.7268981933594, "logps/rejected": -252.2453155517578, "loss": 0.4787, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.8410365581512451, "rewards/margins": 0.6956895589828491, "rewards/rejected": 1.1453471183776855, "step": 4920 }, { "epoch": 0.22888713496448304, "grad_norm": 57.961708068847656, "learning_rate": 4.6190630948512e-07, "logits/chosen": -17.811262130737305, "logits/rejected": -16.742412567138672, "logps/chosen": -439.02520751953125, 
"logps/rejected": -279.469970703125, "loss": 0.5322, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.6645267009735107, "rewards/margins": 0.5572740435600281, "rewards/rejected": 1.107252597808838, "step": 4930 }, { "epoch": 0.22935140907191606, "grad_norm": 36.300697326660156, "learning_rate": 4.618289304672145e-07, "logits/chosen": -19.479822158813477, "logits/rejected": -18.06924057006836, "logps/chosen": -465.08270263671875, "logps/rejected": -438.59234619140625, "loss": 0.5292, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.1478912830352783, "rewards/margins": 0.45980867743492126, "rewards/rejected": 1.6880826950073242, "step": 4940 }, { "epoch": 0.22981568317934908, "grad_norm": 187.97169494628906, "learning_rate": 4.61751551449309e-07, "logits/chosen": -18.65334701538086, "logits/rejected": -17.103607177734375, "logps/chosen": -390.6619873046875, "logps/rejected": -269.74664306640625, "loss": 0.5615, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.6632373332977295, "rewards/margins": 0.5761681795120239, "rewards/rejected": 1.0870692729949951, "step": 4950 }, { "epoch": 0.2302799572867821, "grad_norm": 51.76433181762695, "learning_rate": 4.616741724314035e-07, "logits/chosen": -18.22795867919922, "logits/rejected": -17.400156021118164, "logps/chosen": -379.7284851074219, "logps/rejected": -285.2223205566406, "loss": 0.5749, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.3897594213485718, "rewards/margins": 0.3628276288509369, "rewards/rejected": 1.0269317626953125, "step": 4960 }, { "epoch": 0.23074423139421515, "grad_norm": 97.6405258178711, "learning_rate": 4.61596793413498e-07, "logits/chosen": -18.043718338012695, "logits/rejected": -17.488924026489258, "logps/chosen": -345.215576171875, "logps/rejected": -339.3681945800781, "loss": 0.6266, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.636560082435608, "rewards/margins": 0.2644900381565094, "rewards/rejected": 1.3720699548721313, "step": 
4970 }, { "epoch": 0.23120850550164818, "grad_norm": 53.63556671142578, "learning_rate": 4.6151941439559246e-07, "logits/chosen": -18.30547332763672, "logits/rejected": -17.683544158935547, "logps/chosen": -376.46343994140625, "logps/rejected": -346.0455322265625, "loss": 0.582, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.6827466487884521, "rewards/margins": 0.2796635329723358, "rewards/rejected": 1.403083086013794, "step": 4980 }, { "epoch": 0.2316727796090812, "grad_norm": 7.1584696769714355, "learning_rate": 4.6144203537768697e-07, "logits/chosen": -17.25524139404297, "logits/rejected": -16.665775299072266, "logps/chosen": -387.9108581542969, "logps/rejected": -330.3763122558594, "loss": 0.858, "rewards/accuracies": 0.5, "rewards/chosen": 1.506828784942627, "rewards/margins": 0.08866159617900848, "rewards/rejected": 1.418167233467102, "step": 4990 }, { "epoch": 0.23213705371651422, "grad_norm": 49.35273361206055, "learning_rate": 4.613646563597814e-07, "logits/chosen": -17.829914093017578, "logits/rejected": -17.23617172241211, "logps/chosen": -430.5835876464844, "logps/rejected": -342.5618896484375, "loss": 0.6127, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.6853244304656982, "rewards/margins": 0.38039758801460266, "rewards/rejected": 1.304926872253418, "step": 5000 }, { "epoch": 0.23260132782394727, "grad_norm": 31.377399444580078, "learning_rate": 4.6128727734187594e-07, "logits/chosen": -18.29194450378418, "logits/rejected": -17.711265563964844, "logps/chosen": -473.79669189453125, "logps/rejected": -409.23919677734375, "loss": 0.6688, "rewards/accuracies": 0.5, "rewards/chosen": 1.8417259454727173, "rewards/margins": 0.18585342168807983, "rewards/rejected": 1.655872106552124, "step": 5010 }, { "epoch": 0.2330656019313803, "grad_norm": 70.63748931884766, "learning_rate": 4.6120989832397045e-07, "logits/chosen": -18.92196273803711, "logits/rejected": -17.733720779418945, "logps/chosen": -411.1630859375, "logps/rejected": 
-325.7013854980469, "loss": 0.7645, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.060551166534424, "rewards/margins": 0.20511528849601746, "rewards/rejected": 1.8554359674453735, "step": 5020 }, { "epoch": 0.2335298760388133, "grad_norm": 8.466423034667969, "learning_rate": 4.6113251930606496e-07, "logits/chosen": -18.542110443115234, "logits/rejected": -17.302776336669922, "logps/chosen": -652.4132690429688, "logps/rejected": -327.85711669921875, "loss": 0.4598, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.6573636531829834, "rewards/margins": 1.0778191089630127, "rewards/rejected": 1.5795445442199707, "step": 5030 }, { "epoch": 0.23399415014624633, "grad_norm": 132.09815979003906, "learning_rate": 4.6105514028815947e-07, "logits/chosen": -18.794361114501953, "logits/rejected": -17.729122161865234, "logps/chosen": -458.796875, "logps/rejected": -345.9837646484375, "loss": 0.5127, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.8229186534881592, "rewards/margins": 0.6376832723617554, "rewards/rejected": 1.1852355003356934, "step": 5040 }, { "epoch": 0.23445842425367938, "grad_norm": 155.2095184326172, "learning_rate": 4.6097776127025393e-07, "logits/chosen": -18.393604278564453, "logits/rejected": -18.158954620361328, "logps/chosen": -487.2774353027344, "logps/rejected": -458.8328552246094, "loss": 0.7993, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.9342409372329712, "rewards/margins": -0.05141974240541458, "rewards/rejected": 1.9856607913970947, "step": 5050 }, { "epoch": 0.2349226983611124, "grad_norm": 41.92989730834961, "learning_rate": 4.6090038225234844e-07, "logits/chosen": -18.319120407104492, "logits/rejected": -18.597068786621094, "logps/chosen": -395.90283203125, "logps/rejected": -424.5376892089844, "loss": 0.7278, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.670373558998108, "rewards/margins": 0.04174666851758957, "rewards/rejected": 1.628626823425293, "step": 5060 }, { 
"epoch": 0.23538697246854542, "grad_norm": 26.95508575439453, "learning_rate": 4.6082300323444295e-07, "logits/chosen": -18.529733657836914, "logits/rejected": -17.185848236083984, "logps/chosen": -602.6931762695312, "logps/rejected": -356.2143859863281, "loss": 0.4713, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.459000587463379, "rewards/margins": 0.8842452764511108, "rewards/rejected": 1.5747551918029785, "step": 5070 }, { "epoch": 0.23585124657597845, "grad_norm": 77.58306884765625, "learning_rate": 4.607456242165374e-07, "logits/chosen": -19.058177947998047, "logits/rejected": -18.565868377685547, "logps/chosen": -407.96185302734375, "logps/rejected": -398.45184326171875, "loss": 0.6302, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.6720693111419678, "rewards/margins": 0.2321385145187378, "rewards/rejected": 1.4399309158325195, "step": 5080 }, { "epoch": 0.2363155206834115, "grad_norm": 45.97553253173828, "learning_rate": 4.606682451986319e-07, "logits/chosen": -17.665925979614258, "logits/rejected": -17.24905014038086, "logps/chosen": -421.89227294921875, "logps/rejected": -347.3736267089844, "loss": 0.5662, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.616586685180664, "rewards/margins": 0.31908971071243286, "rewards/rejected": 1.297497034072876, "step": 5090 }, { "epoch": 0.23677979479084452, "grad_norm": 83.92717742919922, "learning_rate": 4.605908661807264e-07, "logits/chosen": -16.980701446533203, "logits/rejected": -16.51910972595215, "logps/chosen": -287.3922424316406, "logps/rejected": -270.402099609375, "loss": 0.6874, "rewards/accuracies": 0.5, "rewards/chosen": 1.4871060848236084, "rewards/margins": 0.2107737809419632, "rewards/rejected": 1.2763323783874512, "step": 5100 }, { "epoch": 0.23724406889827754, "grad_norm": 55.335880279541016, "learning_rate": 4.605134871628209e-07, "logits/chosen": -19.341571807861328, "logits/rejected": -18.404687881469727, "logps/chosen": -579.332275390625, 
"logps/rejected": -467.009765625, "loss": 0.5718, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.0678019523620605, "rewards/margins": 0.4112076163291931, "rewards/rejected": 1.6565942764282227, "step": 5110 }, { "epoch": 0.23770834300571056, "grad_norm": 66.92863464355469, "learning_rate": 4.604361081449154e-07, "logits/chosen": -17.825420379638672, "logits/rejected": -17.4649658203125, "logps/chosen": -445.52374267578125, "logps/rejected": -403.1216125488281, "loss": 0.8033, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.6024529933929443, "rewards/margins": 0.09135840088129044, "rewards/rejected": 1.5110946893692017, "step": 5120 }, { "epoch": 0.2381726171131436, "grad_norm": 26.824337005615234, "learning_rate": 4.603587291270099e-07, "logits/chosen": -17.194063186645508, "logits/rejected": -16.572643280029297, "logps/chosen": -306.6451721191406, "logps/rejected": -272.56536865234375, "loss": 0.5817, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.808496117591858, "rewards/margins": 0.47242242097854614, "rewards/rejected": 1.3360737562179565, "step": 5130 }, { "epoch": 0.23863689122057663, "grad_norm": 44.2207145690918, "learning_rate": 4.602813501091044e-07, "logits/chosen": -18.318944931030273, "logits/rejected": -17.876888275146484, "logps/chosen": -532.861572265625, "logps/rejected": -435.88153076171875, "loss": 0.6345, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.236044406890869, "rewards/margins": 0.26355013251304626, "rewards/rejected": 1.972494125366211, "step": 5140 }, { "epoch": 0.23910116532800965, "grad_norm": 157.92250061035156, "learning_rate": 4.602039710911989e-07, "logits/chosen": -18.31700325012207, "logits/rejected": -17.67586898803711, "logps/chosen": -460.0818786621094, "logps/rejected": -451.0846252441406, "loss": 0.7988, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 1.6424528360366821, "rewards/margins": 0.010661959648132324, "rewards/rejected": 1.6317908763885498, 
"step": 5150 }, { "epoch": 0.23956543943544267, "grad_norm": 128.57852172851562, "learning_rate": 4.601265920732934e-07, "logits/chosen": -18.85904884338379, "logits/rejected": -19.165491104125977, "logps/chosen": -383.37005615234375, "logps/rejected": -422.1898498535156, "loss": 0.731, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.6327464580535889, "rewards/margins": -0.0296248197555542, "rewards/rejected": 1.662371277809143, "step": 5160 }, { "epoch": 0.24002971354287572, "grad_norm": 53.107215881347656, "learning_rate": 4.6004921305538785e-07, "logits/chosen": -19.381750106811523, "logits/rejected": -18.06561851501465, "logps/chosen": -472.4400329589844, "logps/rejected": -321.1364440917969, "loss": 0.4414, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.1001155376434326, "rewards/margins": 0.7064515948295593, "rewards/rejected": 1.393664002418518, "step": 5170 }, { "epoch": 0.24049398765030874, "grad_norm": 65.06351470947266, "learning_rate": 4.5997183403748236e-07, "logits/chosen": -19.08470344543457, "logits/rejected": -18.14073944091797, "logps/chosen": -426.141357421875, "logps/rejected": -376.5039978027344, "loss": 0.5734, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.800310492515564, "rewards/margins": 0.4390552043914795, "rewards/rejected": 1.361255407333374, "step": 5180 }, { "epoch": 0.24095826175774177, "grad_norm": 95.9295425415039, "learning_rate": 4.598944550195769e-07, "logits/chosen": -18.219682693481445, "logits/rejected": -18.23178482055664, "logps/chosen": -327.7215270996094, "logps/rejected": -352.0786437988281, "loss": 0.6745, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.3870413303375244, "rewards/margins": 0.1454104483127594, "rewards/rejected": 1.2416307926177979, "step": 5190 }, { "epoch": 0.2414225358651748, "grad_norm": 68.29639434814453, "learning_rate": 4.5981707600167133e-07, "logits/chosen": -18.78156089782715, "logits/rejected": -18.736845016479492, "logps/chosen": 
-320.4684753417969, "logps/rejected": -293.19635009765625, "loss": 0.7803, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 1.5013877153396606, "rewards/margins": -0.06534375250339508, "rewards/rejected": 1.5667314529418945, "step": 5200 }, { "epoch": 0.24188680997260784, "grad_norm": 36.71305465698242, "learning_rate": 4.5973969698376584e-07, "logits/chosen": -18.47032356262207, "logits/rejected": -18.053855895996094, "logps/chosen": -474.89654541015625, "logps/rejected": -288.27044677734375, "loss": 0.4252, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.9362545013427734, "rewards/margins": 0.7872135043144226, "rewards/rejected": 1.149040937423706, "step": 5210 }, { "epoch": 0.24235108408004086, "grad_norm": 141.7859649658203, "learning_rate": 4.5966231796586036e-07, "logits/chosen": -18.256214141845703, "logits/rejected": -17.667476654052734, "logps/chosen": -300.55926513671875, "logps/rejected": -264.40447998046875, "loss": 0.7357, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.3393265008926392, "rewards/margins": 0.1303146332502365, "rewards/rejected": 1.2090117931365967, "step": 5220 }, { "epoch": 0.24281535818747388, "grad_norm": 147.17955017089844, "learning_rate": 4.5958493894795487e-07, "logits/chosen": -19.147815704345703, "logits/rejected": -18.39773941040039, "logps/chosen": -535.7763671875, "logps/rejected": -384.50567626953125, "loss": 0.5695, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.134214162826538, "rewards/margins": 0.41892504692077637, "rewards/rejected": 1.7152893543243408, "step": 5230 }, { "epoch": 0.2432796322949069, "grad_norm": 67.7276382446289, "learning_rate": 4.595075599300494e-07, "logits/chosen": -17.825008392333984, "logits/rejected": -16.960315704345703, "logps/chosen": -434.7682189941406, "logps/rejected": -305.3818054199219, "loss": 0.5693, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.224921703338623, "rewards/margins": 0.6133872270584106, 
"rewards/rejected": 1.6115343570709229, "step": 5240 }, { "epoch": 0.24374390640233995, "grad_norm": 250.42628479003906, "learning_rate": 4.5943018091214384e-07, "logits/chosen": -19.414804458618164, "logits/rejected": -17.35222053527832, "logps/chosen": -468.11529541015625, "logps/rejected": -335.578857421875, "loss": 0.524, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.9968407154083252, "rewards/margins": 0.6536046862602234, "rewards/rejected": 1.3432362079620361, "step": 5250 }, { "epoch": 0.24420818050977297, "grad_norm": 134.66014099121094, "learning_rate": 4.5935280189423835e-07, "logits/chosen": -16.39105987548828, "logits/rejected": -16.01523208618164, "logps/chosen": -389.85150146484375, "logps/rejected": -329.6565856933594, "loss": 0.6726, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.6799161434173584, "rewards/margins": 0.3506702780723572, "rewards/rejected": 1.329245924949646, "step": 5260 }, { "epoch": 0.244672454617206, "grad_norm": 92.67816925048828, "learning_rate": 4.592754228763328e-07, "logits/chosen": -18.582748413085938, "logits/rejected": -18.45981788635254, "logps/chosen": -326.4854431152344, "logps/rejected": -310.27740478515625, "loss": 0.7157, "rewards/accuracies": 0.5, "rewards/chosen": 1.5855896472930908, "rewards/margins": 0.015587043948471546, "rewards/rejected": 1.570002794265747, "step": 5270 }, { "epoch": 0.245136728724639, "grad_norm": 128.6566162109375, "learning_rate": 4.591980438584273e-07, "logits/chosen": -18.541202545166016, "logits/rejected": -18.054370880126953, "logps/chosen": -414.19940185546875, "logps/rejected": -373.1535339355469, "loss": 0.6749, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.0031988620758057, "rewards/margins": 0.37183770537376404, "rewards/rejected": 1.6313610076904297, "step": 5280 }, { "epoch": 0.24560100283207206, "grad_norm": 12.139339447021484, "learning_rate": 4.5912066484052183e-07, "logits/chosen": -19.403276443481445, "logits/rejected": 
-18.475507736206055, "logps/chosen": -416.95294189453125, "logps/rejected": -250.8281707763672, "loss": 0.4779, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.978722333908081, "rewards/margins": 0.852127730846405, "rewards/rejected": 1.1265946626663208, "step": 5290 }, { "epoch": 0.24606527693950508, "grad_norm": 177.16839599609375, "learning_rate": 4.590432858226163e-07, "logits/chosen": -18.489530563354492, "logits/rejected": -17.01022720336914, "logps/chosen": -427.6361389160156, "logps/rejected": -280.41217041015625, "loss": 0.5051, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.826395034790039, "rewards/margins": 0.7275359630584717, "rewards/rejected": 1.0988590717315674, "step": 5300 }, { "epoch": 0.2465295510469381, "grad_norm": 13.530343055725098, "learning_rate": 4.589659068047108e-07, "logits/chosen": -17.71719741821289, "logits/rejected": -16.787166595458984, "logps/chosen": -373.43072509765625, "logps/rejected": -240.3665313720703, "loss": 0.5101, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.6440891027450562, "rewards/margins": 0.7218379974365234, "rewards/rejected": 0.9222510457038879, "step": 5310 }, { "epoch": 0.24699382515437113, "grad_norm": 24.457170486450195, "learning_rate": 4.588885277868053e-07, "logits/chosen": -18.483488082885742, "logits/rejected": -17.6385555267334, "logps/chosen": -435.61273193359375, "logps/rejected": -277.0875549316406, "loss": 0.3937, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.2337045669555664, "rewards/margins": 1.1523587703704834, "rewards/rejected": 1.081346035003662, "step": 5320 }, { "epoch": 0.24745809926180418, "grad_norm": 39.575016021728516, "learning_rate": 4.588111487688998e-07, "logits/chosen": -17.866100311279297, "logits/rejected": -16.753480911254883, "logps/chosen": -280.936279296875, "logps/rejected": -177.9333038330078, "loss": 0.5611, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.1279456615447998, "rewards/margins": 
0.4339931905269623, "rewards/rejected": 0.6939525008201599, "step": 5330 }, { "epoch": 0.2479223733692372, "grad_norm": 66.31151580810547, "learning_rate": 4.5873376975099433e-07, "logits/chosen": -18.006540298461914, "logits/rejected": -19.07330894470215, "logps/chosen": -399.1144104003906, "logps/rejected": -383.8354797363281, "loss": 0.7354, "rewards/accuracies": 0.5, "rewards/chosen": 1.6816482543945312, "rewards/margins": 0.06154327467083931, "rewards/rejected": 1.6201050281524658, "step": 5340 }, { "epoch": 0.24838664747667022, "grad_norm": 49.96780776977539, "learning_rate": 4.586563907330888e-07, "logits/chosen": -18.979267120361328, "logits/rejected": -17.657434463500977, "logps/chosen": -500.32855224609375, "logps/rejected": -337.3385925292969, "loss": 0.4124, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.238065481185913, "rewards/margins": 0.8992007374763489, "rewards/rejected": 1.3388645648956299, "step": 5350 }, { "epoch": 0.24885092158410324, "grad_norm": 35.62382507324219, "learning_rate": 4.585790117151833e-07, "logits/chosen": -18.56407928466797, "logits/rejected": -16.851539611816406, "logps/chosen": -403.8921813964844, "logps/rejected": -329.123046875, "loss": 0.4924, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.00240421295166, "rewards/margins": 0.6749046444892883, "rewards/rejected": 1.3274993896484375, "step": 5360 }, { "epoch": 0.2493151956915363, "grad_norm": 78.79173278808594, "learning_rate": 4.5850163269727776e-07, "logits/chosen": -17.94428825378418, "logits/rejected": -17.513290405273438, "logps/chosen": -468.9903259277344, "logps/rejected": -483.549560546875, "loss": 0.6291, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.3836867809295654, "rewards/margins": 0.2777751684188843, "rewards/rejected": 2.1059114933013916, "step": 5370 }, { "epoch": 0.2497794697989693, "grad_norm": 114.82752990722656, "learning_rate": 4.5842425367937227e-07, "logits/chosen": -18.08694076538086, "logits/rejected": 
-18.66812515258789, "logps/chosen": -292.6910400390625, "logps/rejected": -287.5320739746094, "loss": 0.8329, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 1.121155023574829, "rewards/margins": -0.15613491833209991, "rewards/rejected": 1.2772901058197021, "step": 5380 }, { "epoch": 0.25024374390640236, "grad_norm": 95.36380004882812, "learning_rate": 4.583468746614668e-07, "logits/chosen": -19.227603912353516, "logits/rejected": -18.06427764892578, "logps/chosen": -391.47003173828125, "logps/rejected": -296.616943359375, "loss": 0.6759, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.8332271575927734, "rewards/margins": 0.22545751929283142, "rewards/rejected": 1.6077693700790405, "step": 5390 }, { "epoch": 0.25070801801383535, "grad_norm": 32.126373291015625, "learning_rate": 4.5826949564356124e-07, "logits/chosen": -18.97905731201172, "logits/rejected": -16.01903533935547, "logps/chosen": -556.1160278320312, "logps/rejected": -275.1141357421875, "loss": 0.4147, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.3889546394348145, "rewards/margins": 1.0267095565795898, "rewards/rejected": 1.3622448444366455, "step": 5400 }, { "epoch": 0.2511722921212684, "grad_norm": 66.63093566894531, "learning_rate": 4.5819211662565575e-07, "logits/chosen": -17.027057647705078, "logits/rejected": -16.483104705810547, "logps/chosen": -314.52032470703125, "logps/rejected": -231.1183319091797, "loss": 0.6505, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.2910560369491577, "rewards/margins": 0.17328043282032013, "rewards/rejected": 1.1177756786346436, "step": 5410 }, { "epoch": 0.25163656622870145, "grad_norm": 47.29462432861328, "learning_rate": 4.5811473760775026e-07, "logits/chosen": -17.53773307800293, "logits/rejected": -16.862966537475586, "logps/chosen": -429.11846923828125, "logps/rejected": -339.3565368652344, "loss": 0.6527, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.4375462532043457, "rewards/margins": 
0.5036506652832031, "rewards/rejected": 1.9338957071304321, "step": 5420 }, { "epoch": 0.25210084033613445, "grad_norm": 102.82083129882812, "learning_rate": 4.580373585898448e-07, "logits/chosen": -19.04435920715332, "logits/rejected": -18.483043670654297, "logps/chosen": -512.8247680664062, "logps/rejected": -461.5370178222656, "loss": 0.8428, "rewards/accuracies": 0.5, "rewards/chosen": 2.1910343170166016, "rewards/margins": -0.0655183419585228, "rewards/rejected": 2.2565529346466064, "step": 5430 }, { "epoch": 0.2525651144435675, "grad_norm": 113.85502624511719, "learning_rate": 4.579599795719393e-07, "logits/chosen": -17.375106811523438, "logits/rejected": -17.344348907470703, "logps/chosen": -407.15423583984375, "logps/rejected": -347.3279113769531, "loss": 0.8642, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.5383235216140747, "rewards/margins": -0.12273839861154556, "rewards/rejected": 1.6610618829727173, "step": 5440 }, { "epoch": 0.2530293885510005, "grad_norm": 117.51786041259766, "learning_rate": 4.5788260055403375e-07, "logits/chosen": -17.763050079345703, "logits/rejected": -18.46442222595215, "logps/chosen": -381.33740234375, "logps/rejected": -319.7290344238281, "loss": 0.9378, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": 1.3903136253356934, "rewards/margins": -0.4125968813896179, "rewards/rejected": 1.8029104471206665, "step": 5450 }, { "epoch": 0.25349366265843354, "grad_norm": 55.82900619506836, "learning_rate": 4.578052215361282e-07, "logits/chosen": -16.88711166381836, "logits/rejected": -16.6380558013916, "logps/chosen": -368.0621337890625, "logps/rejected": -277.42547607421875, "loss": 0.6287, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.3617274761199951, "rewards/margins": 0.235576793551445, "rewards/rejected": 1.1261506080627441, "step": 5460 }, { "epoch": 0.2539579367658666, "grad_norm": 88.04269409179688, "learning_rate": 4.577278425182227e-07, "logits/chosen": -18.115955352783203, 
"logits/rejected": -17.45532989501953, "logps/chosen": -459.23583984375, "logps/rejected": -422.751708984375, "loss": 0.6595, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.0913639068603516, "rewards/margins": 0.5525367856025696, "rewards/rejected": 1.5388273000717163, "step": 5470 }, { "epoch": 0.2544222108732996, "grad_norm": 162.72854614257812, "learning_rate": 4.576504635003172e-07, "logits/chosen": -18.736225128173828, "logits/rejected": -18.239030838012695, "logps/chosen": -523.6701049804688, "logps/rejected": -447.4248046875, "loss": 0.6217, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.9705537557601929, "rewards/margins": 0.33856400847435, "rewards/rejected": 1.63198983669281, "step": 5480 }, { "epoch": 0.25488648498073263, "grad_norm": 162.31016540527344, "learning_rate": 4.5757308448241174e-07, "logits/chosen": -18.22350311279297, "logits/rejected": -17.35501480102539, "logps/chosen": -465.17529296875, "logps/rejected": -348.678955078125, "loss": 0.8448, "rewards/accuracies": 0.5, "rewards/chosen": 1.7831662893295288, "rewards/margins": -0.10411734879016876, "rewards/rejected": 1.8872836828231812, "step": 5490 }, { "epoch": 0.2553507590881657, "grad_norm": 14.496109962463379, "learning_rate": 4.574957054645062e-07, "logits/chosen": -17.674152374267578, "logits/rejected": -17.721311569213867, "logps/chosen": -420.43634033203125, "logps/rejected": -346.6537170410156, "loss": 0.6364, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.9461896419525146, "rewards/margins": 0.27439284324645996, "rewards/rejected": 1.6717967987060547, "step": 5500 }, { "epoch": 0.2558150331955987, "grad_norm": 132.80113220214844, "learning_rate": 4.574183264466007e-07, "logits/chosen": -18.52621078491211, "logits/rejected": -17.86374282836914, "logps/chosen": -498.23876953125, "logps/rejected": -372.59454345703125, "loss": 0.5592, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.0440542697906494, "rewards/margins": 
0.5107936859130859, "rewards/rejected": 1.533260464668274, "step": 5510 }, { "epoch": 0.2562793073030317, "grad_norm": 47.65372085571289, "learning_rate": 4.573409474286952e-07, "logits/chosen": -18.250598907470703, "logits/rejected": -17.8045711517334, "logps/chosen": -336.32769775390625, "logps/rejected": -259.719482421875, "loss": 0.6224, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.720887541770935, "rewards/margins": 0.5794280767440796, "rewards/rejected": 1.141459584236145, "step": 5520 }, { "epoch": 0.2567435814104647, "grad_norm": 25.046573638916016, "learning_rate": 4.5726356841078973e-07, "logits/chosen": -18.284076690673828, "logits/rejected": -17.933338165283203, "logps/chosen": -364.883544921875, "logps/rejected": -317.5346374511719, "loss": 0.6106, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.4633547067642212, "rewards/margins": 0.32554492354393005, "rewards/rejected": 1.1378097534179688, "step": 5530 }, { "epoch": 0.25720785551789777, "grad_norm": 166.0569305419922, "learning_rate": 4.5718618939288424e-07, "logits/chosen": -18.320167541503906, "logits/rejected": -17.99797821044922, "logps/chosen": -406.65179443359375, "logps/rejected": -287.4167785644531, "loss": 0.6829, "rewards/accuracies": 0.5, "rewards/chosen": 1.8854033946990967, "rewards/margins": 0.6417633295059204, "rewards/rejected": 1.2436401844024658, "step": 5540 }, { "epoch": 0.2576721296253308, "grad_norm": 125.5980453491211, "learning_rate": 4.5710881037497875e-07, "logits/chosen": -17.55284309387207, "logits/rejected": -17.361736297607422, "logps/chosen": -422.95098876953125, "logps/rejected": -351.40826416015625, "loss": 0.6468, "rewards/accuracies": 0.5, "rewards/chosen": 1.8063316345214844, "rewards/margins": 0.29735079407691956, "rewards/rejected": 1.5089808702468872, "step": 5550 }, { "epoch": 0.2581364037327638, "grad_norm": 35.183170318603516, "learning_rate": 4.5703143135707316e-07, "logits/chosen": -18.240543365478516, "logits/rejected": 
-16.892833709716797, "logps/chosen": -488.23516845703125, "logps/rejected": -306.5658874511719, "loss": 0.4371, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.328916072845459, "rewards/margins": 1.1171811819076538, "rewards/rejected": 1.2117348909378052, "step": 5560 }, { "epoch": 0.25860067784019686, "grad_norm": 151.56568908691406, "learning_rate": 4.5695405233916767e-07, "logits/chosen": -18.99161720275879, "logits/rejected": -18.737171173095703, "logps/chosen": -400.88421630859375, "logps/rejected": -374.9466857910156, "loss": 0.8034, "rewards/accuracies": 0.5, "rewards/chosen": 1.9080917835235596, "rewards/margins": 0.014552056789398193, "rewards/rejected": 1.8935400247573853, "step": 5570 }, { "epoch": 0.2590649519476299, "grad_norm": 27.361318588256836, "learning_rate": 4.568766733212622e-07, "logits/chosen": -17.778295516967773, "logits/rejected": -16.926267623901367, "logps/chosen": -343.9571838378906, "logps/rejected": -314.0882568359375, "loss": 0.5606, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.3378746509552002, "rewards/margins": 0.3948762118816376, "rewards/rejected": 0.9429985284805298, "step": 5580 }, { "epoch": 0.2595292260550629, "grad_norm": 17.905502319335938, "learning_rate": 4.567992943033567e-07, "logits/chosen": -19.276355743408203, "logits/rejected": -17.817455291748047, "logps/chosen": -353.34063720703125, "logps/rejected": -315.6452331542969, "loss": 0.839, "rewards/accuracies": 0.5, "rewards/chosen": 1.3374521732330322, "rewards/margins": -0.005172556731849909, "rewards/rejected": 1.342624545097351, "step": 5590 }, { "epoch": 0.25999350016249595, "grad_norm": 35.76215362548828, "learning_rate": 4.5672191528545115e-07, "logits/chosen": -18.669620513916016, "logits/rejected": -16.811595916748047, "logps/chosen": -385.10345458984375, "logps/rejected": -200.4879913330078, "loss": 0.359, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.142702579498291, "rewards/margins": 1.1262942552566528, 
"rewards/rejected": 1.0164083242416382, "step": 5600 }, { "epoch": 0.26045777426992894, "grad_norm": 39.233795166015625, "learning_rate": 4.5664453626754566e-07, "logits/chosen": -18.813289642333984, "logits/rejected": -17.39569091796875, "logps/chosen": -358.8527526855469, "logps/rejected": -179.96780395507812, "loss": 0.5182, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.443791151046753, "rewards/margins": 0.587992787361145, "rewards/rejected": 0.8557982444763184, "step": 5610 }, { "epoch": 0.260922048377362, "grad_norm": 30.780166625976562, "learning_rate": 4.5656715724964017e-07, "logits/chosen": -18.200740814208984, "logits/rejected": -17.746479034423828, "logps/chosen": -276.1100158691406, "logps/rejected": -231.4430389404297, "loss": 0.5321, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.5820308923721313, "rewards/margins": 0.5617234110832214, "rewards/rejected": 1.0203073024749756, "step": 5620 }, { "epoch": 0.26138632248479504, "grad_norm": 46.90047073364258, "learning_rate": 4.564897782317347e-07, "logits/chosen": -18.30187225341797, "logits/rejected": -17.499385833740234, "logps/chosen": -323.2471008300781, "logps/rejected": -258.2961730957031, "loss": 0.6287, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.6338106393814087, "rewards/margins": 0.29923006892204285, "rewards/rejected": 1.334580659866333, "step": 5630 }, { "epoch": 0.26185059659222804, "grad_norm": 43.58262252807617, "learning_rate": 4.564123992138292e-07, "logits/chosen": -18.252193450927734, "logits/rejected": -18.121620178222656, "logps/chosen": -380.7177734375, "logps/rejected": -430.4004821777344, "loss": 0.8185, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.7276976108551025, "rewards/margins": -0.07367072254419327, "rewards/rejected": 1.8013681173324585, "step": 5640 }, { "epoch": 0.2623148706996611, "grad_norm": 19.21004295349121, "learning_rate": 4.563350201959236e-07, "logits/chosen": -17.991703033447266, "logits/rejected": 
-17.639570236206055, "logps/chosen": -381.50848388671875, "logps/rejected": -323.03216552734375, "loss": 0.7749, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.6413676738739014, "rewards/margins": 0.21773657202720642, "rewards/rejected": 1.4236314296722412, "step": 5650 }, { "epoch": 0.26277914480709413, "grad_norm": 97.95091247558594, "learning_rate": 4.562576411780181e-07, "logits/chosen": -17.2097225189209, "logits/rejected": -16.872692108154297, "logps/chosen": -370.50689697265625, "logps/rejected": -270.48223876953125, "loss": 0.5809, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.674833059310913, "rewards/margins": 0.5488398671150208, "rewards/rejected": 1.125993013381958, "step": 5660 }, { "epoch": 0.2632434189145271, "grad_norm": 70.25276184082031, "learning_rate": 4.561802621601126e-07, "logits/chosen": -18.42730712890625, "logits/rejected": -17.627443313598633, "logps/chosen": -419.9920959472656, "logps/rejected": -294.22265625, "loss": 0.7199, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.4579827785491943, "rewards/margins": 0.30593961477279663, "rewards/rejected": 2.152043342590332, "step": 5670 }, { "epoch": 0.2637076930219602, "grad_norm": 62.913185119628906, "learning_rate": 4.5610288314220713e-07, "logits/chosen": -17.94122886657715, "logits/rejected": -17.758914947509766, "logps/chosen": -295.5667419433594, "logps/rejected": -304.4027099609375, "loss": 0.6892, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.1841578483581543, "rewards/margins": 0.10681124776601791, "rewards/rejected": 1.0773465633392334, "step": 5680 }, { "epoch": 0.26417196712939317, "grad_norm": 62.27001190185547, "learning_rate": 4.5602550412430165e-07, "logits/chosen": -18.67572593688965, "logits/rejected": -18.654964447021484, "logps/chosen": -457.66400146484375, "logps/rejected": -405.775390625, "loss": 0.6481, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.045228958129883, "rewards/margins": 
0.27524247765541077, "rewards/rejected": 1.7699865102767944, "step": 5690 }, { "epoch": 0.2646362412368262, "grad_norm": 79.6493911743164, "learning_rate": 4.5594812510639616e-07, "logits/chosen": -17.99126434326172, "logits/rejected": -17.520959854125977, "logps/chosen": -420.36102294921875, "logps/rejected": -392.70135498046875, "loss": 0.6955, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.8426673412322998, "rewards/margins": 0.21259936690330505, "rewards/rejected": 1.630068063735962, "step": 5700 }, { "epoch": 0.26510051534425927, "grad_norm": 95.25336456298828, "learning_rate": 4.558707460884906e-07, "logits/chosen": -19.152881622314453, "logits/rejected": -18.649192810058594, "logps/chosen": -320.33306884765625, "logps/rejected": -301.25897216796875, "loss": 0.6486, "rewards/accuracies": 0.5, "rewards/chosen": 1.6349763870239258, "rewards/margins": 0.30462712049484253, "rewards/rejected": 1.3303492069244385, "step": 5710 }, { "epoch": 0.26556478945169226, "grad_norm": 124.89353942871094, "learning_rate": 4.5579336707058513e-07, "logits/chosen": -18.390064239501953, "logits/rejected": -17.701330184936523, "logps/chosen": -537.0736083984375, "logps/rejected": -453.4440002441406, "loss": 0.6984, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.1067299842834473, "rewards/margins": 0.1119828075170517, "rewards/rejected": 1.9947468042373657, "step": 5720 }, { "epoch": 0.2660290635591253, "grad_norm": 49.45637512207031, "learning_rate": 4.5571598805267964e-07, "logits/chosen": -17.785823822021484, "logits/rejected": -17.414352416992188, "logps/chosen": -467.77276611328125, "logps/rejected": -430.0619201660156, "loss": 0.8007, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.061455488204956, "rewards/margins": -0.020370453596115112, "rewards/rejected": 2.0818259716033936, "step": 5730 }, { "epoch": 0.26649333766655836, "grad_norm": 26.66152572631836, "learning_rate": 4.5563860903477415e-07, "logits/chosen": -18.845548629760742, 
"logits/rejected": -18.65108299255371, "logps/chosen": -361.0250244140625, "logps/rejected": -355.9497375488281, "loss": 0.6914, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.9288854598999023, "rewards/margins": 0.20288923382759094, "rewards/rejected": 1.7259962558746338, "step": 5740 }, { "epoch": 0.26695761177399135, "grad_norm": 113.3931655883789, "learning_rate": 4.5556123001686855e-07, "logits/chosen": -18.70299530029297, "logits/rejected": -18.432331085205078, "logps/chosen": -505.21148681640625, "logps/rejected": -383.4335021972656, "loss": 0.5452, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.094041347503662, "rewards/margins": 0.5191336870193481, "rewards/rejected": 1.5749075412750244, "step": 5750 }, { "epoch": 0.2674218858814244, "grad_norm": 123.38959503173828, "learning_rate": 4.5548385099896307e-07, "logits/chosen": -17.528545379638672, "logits/rejected": -16.59360122680664, "logps/chosen": -457.97430419921875, "logps/rejected": -330.1755676269531, "loss": 0.4987, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.042487144470215, "rewards/margins": 0.5656099319458008, "rewards/rejected": 1.4768774509429932, "step": 5760 }, { "epoch": 0.2678861599888574, "grad_norm": 34.56742858886719, "learning_rate": 4.554064719810576e-07, "logits/chosen": -18.47703742980957, "logits/rejected": -17.229114532470703, "logps/chosen": -466.724365234375, "logps/rejected": -311.0744323730469, "loss": 0.4719, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.0240840911865234, "rewards/margins": 0.7809224724769592, "rewards/rejected": 1.2431615591049194, "step": 5770 }, { "epoch": 0.26835043409629045, "grad_norm": 113.8701400756836, "learning_rate": 4.553290929631521e-07, "logits/chosen": -17.29339027404785, "logits/rejected": -17.471698760986328, "logps/chosen": -238.71728515625, "logps/rejected": -309.1910095214844, "loss": 0.9034, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 1.204341173171997, 
"rewards/margins": -0.15320512652397156, "rewards/rejected": 1.357546329498291, "step": 5780 }, { "epoch": 0.2688147082037235, "grad_norm": 90.67942810058594, "learning_rate": 4.552517139452466e-07, "logits/chosen": -18.467126846313477, "logits/rejected": -18.755367279052734, "logps/chosen": -445.1756286621094, "logps/rejected": -449.41644287109375, "loss": 0.817, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.6682523488998413, "rewards/margins": -0.10659150779247284, "rewards/rejected": 1.7748439311981201, "step": 5790 }, { "epoch": 0.2692789823111565, "grad_norm": 94.24820709228516, "learning_rate": 4.551743349273411e-07, "logits/chosen": -18.585987091064453, "logits/rejected": -16.982959747314453, "logps/chosen": -508.2274475097656, "logps/rejected": -326.92083740234375, "loss": 0.3882, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.7832834720611572, "rewards/margins": 1.0727981328964233, "rewards/rejected": 1.7104852199554443, "step": 5800 }, { "epoch": 0.26974325641858954, "grad_norm": 36.28692626953125, "learning_rate": 4.5510469381122613e-07, "logits/chosen": -17.880916595458984, "logits/rejected": -17.8486270904541, "logps/chosen": -429.07257080078125, "logps/rejected": -298.47528076171875, "loss": 0.6862, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.5382888317108154, "rewards/margins": 0.18614840507507324, "rewards/rejected": 1.3521404266357422, "step": 5810 }, { "epoch": 0.2702075305260226, "grad_norm": 69.6505126953125, "learning_rate": 4.5502731479332065e-07, "logits/chosen": -17.406219482421875, "logits/rejected": -17.051849365234375, "logps/chosen": -325.4237060546875, "logps/rejected": -309.0724182128906, "loss": 0.9099, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.3889691829681396, "rewards/margins": -0.04213287681341171, "rewards/rejected": 1.4311020374298096, "step": 5820 }, { "epoch": 0.2706718046334556, "grad_norm": 32.529640197753906, "learning_rate": 4.549499357754151e-07, 
"logits/chosen": -17.535253524780273, "logits/rejected": -17.81320571899414, "logps/chosen": -368.3615417480469, "logps/rejected": -391.1238098144531, "loss": 0.8452, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.5041453838348389, "rewards/margins": -0.2184935361146927, "rewards/rejected": 1.7226388454437256, "step": 5830 }, { "epoch": 0.27113607874088863, "grad_norm": 41.041954040527344, "learning_rate": 4.548725567575096e-07, "logits/chosen": -18.3017635345459, "logits/rejected": -18.10708236694336, "logps/chosen": -290.47406005859375, "logps/rejected": -278.91241455078125, "loss": 0.6586, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.3261778354644775, "rewards/margins": 0.18834233283996582, "rewards/rejected": 1.1378353834152222, "step": 5840 }, { "epoch": 0.2716003528483216, "grad_norm": 109.46773529052734, "learning_rate": 4.547951777396041e-07, "logits/chosen": -19.15376853942871, "logits/rejected": -18.55089569091797, "logps/chosen": -614.0128173828125, "logps/rejected": -507.4259338378906, "loss": 0.6678, "rewards/accuracies": 0.5, "rewards/chosen": 2.447669744491577, "rewards/margins": 0.34818023443222046, "rewards/rejected": 2.099489688873291, "step": 5850 }, { "epoch": 0.2720646269557547, "grad_norm": 73.76668548583984, "learning_rate": 4.5471779872169864e-07, "logits/chosen": -19.017358779907227, "logits/rejected": -16.93486976623535, "logps/chosen": -437.8279724121094, "logps/rejected": -274.5765380859375, "loss": 0.613, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.0167315006256104, "rewards/margins": 0.3103973865509033, "rewards/rejected": 1.706334114074707, "step": 5860 }, { "epoch": 0.2725289010631877, "grad_norm": 139.8850555419922, "learning_rate": 4.546404197037931e-07, "logits/chosen": -18.050823211669922, "logits/rejected": -17.226110458374023, "logps/chosen": -421.068603515625, "logps/rejected": -339.03887939453125, "loss": 0.682, "rewards/accuracies": 0.5, "rewards/chosen": 1.7309916019439697, 
"rewards/margins": 0.19810564815998077, "rewards/rejected": 1.5328857898712158, "step": 5870 }, { "epoch": 0.2729931751706207, "grad_norm": 88.95913696289062, "learning_rate": 4.5456304068588755e-07, "logits/chosen": -17.778608322143555, "logits/rejected": -16.920162200927734, "logps/chosen": -397.36444091796875, "logps/rejected": -239.5383758544922, "loss": 0.4594, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.8555644750595093, "rewards/margins": 0.7889575958251953, "rewards/rejected": 1.0666067600250244, "step": 5880 }, { "epoch": 0.27345744927805377, "grad_norm": 143.3400421142578, "learning_rate": 4.5448566166798207e-07, "logits/chosen": -18.131391525268555, "logits/rejected": -17.353769302368164, "logps/chosen": -421.2606506347656, "logps/rejected": -399.59271240234375, "loss": 0.6667, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.048635959625244, "rewards/margins": 0.24238350987434387, "rewards/rejected": 1.8062524795532227, "step": 5890 }, { "epoch": 0.2739217233854868, "grad_norm": 41.53369903564453, "learning_rate": 4.544082826500766e-07, "logits/chosen": -17.68295669555664, "logits/rejected": -16.788959503173828, "logps/chosen": -328.80194091796875, "logps/rejected": -254.2687530517578, "loss": 0.6756, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.7848312854766846, "rewards/margins": 0.4393908381462097, "rewards/rejected": 1.34544038772583, "step": 5900 }, { "epoch": 0.2743859974929198, "grad_norm": 163.90576171875, "learning_rate": 4.543309036321711e-07, "logits/chosen": -18.41191864013672, "logits/rejected": -18.72281265258789, "logps/chosen": -451.88714599609375, "logps/rejected": -470.38201904296875, "loss": 0.9601, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.1428706645965576, "rewards/margins": -0.3570721745491028, "rewards/rejected": 2.499943256378174, "step": 5910 }, { "epoch": 0.27485027160035286, "grad_norm": 72.63274383544922, "learning_rate": 4.542535246142656e-07, 
"logits/chosen": -18.909597396850586, "logits/rejected": -18.152605056762695, "logps/chosen": -413.0309143066406, "logps/rejected": -337.09423828125, "loss": 0.4517, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.243720293045044, "rewards/margins": 0.7227182984352112, "rewards/rejected": 1.5210020542144775, "step": 5920 }, { "epoch": 0.27531454570778585, "grad_norm": 192.2655487060547, "learning_rate": 4.5417614559636006e-07, "logits/chosen": -18.269367218017578, "logits/rejected": -18.097919464111328, "logps/chosen": -437.194091796875, "logps/rejected": -389.8848876953125, "loss": 0.9464, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.5493735074996948, "rewards/margins": -0.28881940245628357, "rewards/rejected": 1.8381931781768799, "step": 5930 }, { "epoch": 0.2757788198152189, "grad_norm": 128.99862670898438, "learning_rate": 4.5409876657845457e-07, "logits/chosen": -18.346338272094727, "logits/rejected": -17.655315399169922, "logps/chosen": -251.9494171142578, "logps/rejected": -235.08773803710938, "loss": 0.6601, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.3737714290618896, "rewards/margins": 0.2446552962064743, "rewards/rejected": 1.129116177558899, "step": 5940 }, { "epoch": 0.27624309392265195, "grad_norm": 65.0992660522461, "learning_rate": 4.540213875605491e-07, "logits/chosen": -17.613773345947266, "logits/rejected": -17.1536922454834, "logps/chosen": -423.55224609375, "logps/rejected": -362.911376953125, "loss": 0.7189, "rewards/accuracies": 0.5, "rewards/chosen": 1.6465537548065186, "rewards/margins": 0.1639186143875122, "rewards/rejected": 1.482635259628296, "step": 5950 }, { "epoch": 0.27670736803008494, "grad_norm": 66.84859466552734, "learning_rate": 4.5394400854264354e-07, "logits/chosen": -18.15242576599121, "logits/rejected": -17.520788192749023, "logps/chosen": -508.81036376953125, "logps/rejected": -394.47479248046875, "loss": 0.685, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 
2.2843801975250244, "rewards/margins": 0.315853476524353, "rewards/rejected": 1.9685266017913818, "step": 5960 }, { "epoch": 0.277171642137518, "grad_norm": 8.439753532409668, "learning_rate": 4.5386662952473805e-07, "logits/chosen": -18.488082885742188, "logits/rejected": -17.471988677978516, "logps/chosen": -463.42822265625, "logps/rejected": -308.0130920410156, "loss": 0.4724, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.455310106277466, "rewards/margins": 0.7339407205581665, "rewards/rejected": 1.7213693857192993, "step": 5970 }, { "epoch": 0.27763591624495104, "grad_norm": 37.86416244506836, "learning_rate": 4.537892505068325e-07, "logits/chosen": -18.859691619873047, "logits/rejected": -18.73244285583496, "logps/chosen": -360.14227294921875, "logps/rejected": -421.0271911621094, "loss": 0.6103, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.6815738677978516, "rewards/margins": 0.2458176165819168, "rewards/rejected": 1.435755968093872, "step": 5980 }, { "epoch": 0.27810019035238404, "grad_norm": 12.179152488708496, "learning_rate": 4.53711871488927e-07, "logits/chosen": -19.005224227905273, "logits/rejected": -18.56589126586914, "logps/chosen": -494.74725341796875, "logps/rejected": -430.3741149902344, "loss": 0.7714, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.904561996459961, "rewards/margins": 0.11020094156265259, "rewards/rejected": 1.7943611145019531, "step": 5990 }, { "epoch": 0.2785644644598171, "grad_norm": 42.09888458251953, "learning_rate": 4.5363449247102153e-07, "logits/chosen": -18.81876564025879, "logits/rejected": -17.281938552856445, "logps/chosen": -467.72760009765625, "logps/rejected": -222.602783203125, "loss": 0.3648, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.316467046737671, "rewards/margins": 0.9330042600631714, "rewards/rejected": 1.38346266746521, "step": 6000 }, { "epoch": 0.2790287385672501, "grad_norm": 68.60564422607422, "learning_rate": 4.5355711345311604e-07, 
"logits/chosen": -18.149757385253906, "logits/rejected": -16.51656723022461, "logps/chosen": -346.2413024902344, "logps/rejected": -191.18589782714844, "loss": 0.4179, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.6637544631958008, "rewards/margins": 0.9434159398078918, "rewards/rejected": 0.7203385233879089, "step": 6010 }, { "epoch": 0.27949301267468313, "grad_norm": 105.04948425292969, "learning_rate": 4.5347973443521055e-07, "logits/chosen": -17.692363739013672, "logits/rejected": -17.148822784423828, "logps/chosen": -427.87493896484375, "logps/rejected": -389.494873046875, "loss": 0.6362, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.9280385971069336, "rewards/margins": 0.2879246473312378, "rewards/rejected": 1.6401138305664062, "step": 6020 }, { "epoch": 0.2799572867821162, "grad_norm": 190.18084716796875, "learning_rate": 4.53402355417305e-07, "logits/chosen": -19.542152404785156, "logits/rejected": -18.051780700683594, "logps/chosen": -348.92083740234375, "logps/rejected": -297.9332580566406, "loss": 0.6371, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.5023784637451172, "rewards/margins": 0.3309853971004486, "rewards/rejected": 1.1713931560516357, "step": 6030 }, { "epoch": 0.28042156088954917, "grad_norm": 70.32322692871094, "learning_rate": 4.533249763993995e-07, "logits/chosen": -18.770030975341797, "logits/rejected": -18.341691970825195, "logps/chosen": -399.57354736328125, "logps/rejected": -360.4489440917969, "loss": 0.7096, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.3909504413604736, "rewards/margins": 0.042682189494371414, "rewards/rejected": 1.3482682704925537, "step": 6040 }, { "epoch": 0.2808858349969822, "grad_norm": 122.7791519165039, "learning_rate": 4.5324759738149403e-07, "logits/chosen": -18.247756958007812, "logits/rejected": -17.77731704711914, "logps/chosen": -435.65252685546875, "logps/rejected": -359.72845458984375, "loss": 0.7702, "rewards/accuracies": 
0.6000000238418579, "rewards/chosen": 2.411301612854004, "rewards/margins": 0.2057085931301117, "rewards/rejected": 2.205592632293701, "step": 6050 }, { "epoch": 0.28135010910441527, "grad_norm": 150.3511199951172, "learning_rate": 4.531702183635885e-07, "logits/chosen": -18.862241744995117, "logits/rejected": -17.99545669555664, "logps/chosen": -481.60565185546875, "logps/rejected": -348.16412353515625, "loss": 0.495, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.261317491531372, "rewards/margins": 0.8271207809448242, "rewards/rejected": 1.4341967105865479, "step": 6060 }, { "epoch": 0.28181438321184826, "grad_norm": 156.02549743652344, "learning_rate": 4.53092839345683e-07, "logits/chosen": -19.042236328125, "logits/rejected": -18.60063362121582, "logps/chosen": -415.8291931152344, "logps/rejected": -366.45562744140625, "loss": 0.7378, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.891932725906372, "rewards/margins": 0.16018907725811005, "rewards/rejected": 1.7317434549331665, "step": 6070 }, { "epoch": 0.2822786573192813, "grad_norm": 35.86392593383789, "learning_rate": 4.5301546032777746e-07, "logits/chosen": -18.75605010986328, "logits/rejected": -18.030263900756836, "logps/chosen": -425.58837890625, "logps/rejected": -422.9891662597656, "loss": 0.8485, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.1728663444519043, "rewards/margins": -0.1432909220457077, "rewards/rejected": 2.316157341003418, "step": 6080 }, { "epoch": 0.2827429314267143, "grad_norm": 58.30512619018555, "learning_rate": 4.52938081309872e-07, "logits/chosen": -19.267414093017578, "logits/rejected": -17.972370147705078, "logps/chosen": -574.5643920898438, "logps/rejected": -404.7463684082031, "loss": 0.6077, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.687748432159424, "rewards/margins": 0.6137592792510986, "rewards/rejected": 2.073988914489746, "step": 6090 }, { "epoch": 0.28320720553414735, "grad_norm": 121.59323120117188, 
"learning_rate": 4.528607022919665e-07, "logits/chosen": -17.536169052124023, "logits/rejected": -17.048620223999023, "logps/chosen": -323.904052734375, "logps/rejected": -299.5517883300781, "loss": 0.6086, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.6972366571426392, "rewards/margins": 0.4032507538795471, "rewards/rejected": 1.2939858436584473, "step": 6100 }, { "epoch": 0.2836714796415804, "grad_norm": 70.48020935058594, "learning_rate": 4.52783323274061e-07, "logits/chosen": -17.508512496948242, "logits/rejected": -17.56157684326172, "logps/chosen": -316.13641357421875, "logps/rejected": -262.41180419921875, "loss": 0.8045, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.3811264038085938, "rewards/margins": -0.030363356694579124, "rewards/rejected": 1.411489725112915, "step": 6110 }, { "epoch": 0.2841357537490134, "grad_norm": 93.65605926513672, "learning_rate": 4.527059442561555e-07, "logits/chosen": -19.128355026245117, "logits/rejected": -17.497920989990234, "logps/chosen": -368.60693359375, "logps/rejected": -298.9629211425781, "loss": 0.6403, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.329636573791504, "rewards/margins": 0.5981913805007935, "rewards/rejected": 1.7314453125, "step": 6120 }, { "epoch": 0.28460002785644645, "grad_norm": 12.887008666992188, "learning_rate": 4.5262856523825e-07, "logits/chosen": -17.80221939086914, "logits/rejected": -16.44021224975586, "logps/chosen": -486.4889221191406, "logps/rejected": -270.9580993652344, "loss": 0.4689, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.4932491779327393, "rewards/margins": 0.7352994084358215, "rewards/rejected": 1.7579498291015625, "step": 6130 }, { "epoch": 0.2850643019638795, "grad_norm": 52.46173095703125, "learning_rate": 4.525511862203445e-07, "logits/chosen": -18.800743103027344, "logits/rejected": -18.303531646728516, "logps/chosen": -468.4317932128906, "logps/rejected": -423.7466735839844, "loss": 0.6202, 
"rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.2873988151550293, "rewards/margins": 0.2504400908946991, "rewards/rejected": 2.036958932876587, "step": 6140 }, { "epoch": 0.2855285760713125, "grad_norm": 68.53543853759766, "learning_rate": 4.5247380720243894e-07, "logits/chosen": -18.848390579223633, "logits/rejected": -18.059967041015625, "logps/chosen": -460.7164001464844, "logps/rejected": -353.6011047363281, "loss": 0.6271, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.31201434135437, "rewards/margins": 0.3732275366783142, "rewards/rejected": 1.9387871026992798, "step": 6150 }, { "epoch": 0.28599285017874554, "grad_norm": 56.799930572509766, "learning_rate": 4.5239642818453345e-07, "logits/chosen": -19.660253524780273, "logits/rejected": -17.023984909057617, "logps/chosen": -494.8036193847656, "logps/rejected": -239.41738891601562, "loss": 0.4022, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.492327928543091, "rewards/margins": 1.0476183891296387, "rewards/rejected": 1.4447094202041626, "step": 6160 }, { "epoch": 0.28645712428617853, "grad_norm": 60.775325775146484, "learning_rate": 4.5231904916662796e-07, "logits/chosen": -17.317073822021484, "logits/rejected": -17.78641128540039, "logps/chosen": -365.9269104003906, "logps/rejected": -367.12481689453125, "loss": 0.7966, "rewards/accuracies": 0.5, "rewards/chosen": 1.6460193395614624, "rewards/margins": 0.007277369499206543, "rewards/rejected": 1.6387420892715454, "step": 6170 }, { "epoch": 0.2869213983936116, "grad_norm": 47.99049377441406, "learning_rate": 4.522416701487224e-07, "logits/chosen": -18.40369415283203, "logits/rejected": -18.165428161621094, "logps/chosen": -366.5097351074219, "logps/rejected": -404.3809814453125, "loss": 0.6956, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.8961784839630127, "rewards/margins": 0.16190628707408905, "rewards/rejected": 1.7342722415924072, "step": 6180 }, { "epoch": 0.28738567250104463, "grad_norm": 
66.23859405517578, "learning_rate": 4.5216429113081693e-07, "logits/chosen": -18.1190185546875, "logits/rejected": -17.572429656982422, "logps/chosen": -423.0768127441406, "logps/rejected": -322.16180419921875, "loss": 0.6403, "rewards/accuracies": 0.5, "rewards/chosen": 1.9720147848129272, "rewards/margins": 0.44937628507614136, "rewards/rejected": 1.5226385593414307, "step": 6190 }, { "epoch": 0.2878499466084776, "grad_norm": 148.69371032714844, "learning_rate": 4.5208691211291144e-07, "logits/chosen": -19.043010711669922, "logits/rejected": -18.499462127685547, "logps/chosen": -344.48797607421875, "logps/rejected": -257.09478759765625, "loss": 0.7356, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.8224432468414307, "rewards/margins": 0.20943543314933777, "rewards/rejected": 1.6130081415176392, "step": 6200 }, { "epoch": 0.2883142207159107, "grad_norm": 125.32501220703125, "learning_rate": 4.5200953309500595e-07, "logits/chosen": -17.418384552001953, "logits/rejected": -17.0403995513916, "logps/chosen": -389.3250732421875, "logps/rejected": -336.01318359375, "loss": 0.5146, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.9099044799804688, "rewards/margins": 0.5410204529762268, "rewards/rejected": 1.3688842058181763, "step": 6210 }, { "epoch": 0.2887784948233437, "grad_norm": 34.54653549194336, "learning_rate": 4.5193215407710046e-07, "logits/chosen": -18.805673599243164, "logits/rejected": -17.758310317993164, "logps/chosen": -410.18389892578125, "logps/rejected": -291.35284423828125, "loss": 0.4598, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.2099461555480957, "rewards/margins": 0.7006052732467651, "rewards/rejected": 1.5093410015106201, "step": 6220 }, { "epoch": 0.2892427689307767, "grad_norm": 143.94320678710938, "learning_rate": 4.5185477505919497e-07, "logits/chosen": -20.264129638671875, "logits/rejected": -19.545398712158203, "logps/chosen": -456.19171142578125, "logps/rejected": -372.03546142578125, "loss": 
0.6504, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.0294528007507324, "rewards/margins": 0.2584468126296997, "rewards/rejected": 1.7710059881210327, "step": 6230 }, { "epoch": 0.28970704303820977, "grad_norm": 47.002281188964844, "learning_rate": 4.5177739604128943e-07, "logits/chosen": -18.817440032958984, "logits/rejected": -18.10348892211914, "logps/chosen": -440.8169860839844, "logps/rejected": -398.2666015625, "loss": 0.7248, "rewards/accuracies": 0.5, "rewards/chosen": 2.093902111053467, "rewards/margins": 0.054180167615413666, "rewards/rejected": 2.039721965789795, "step": 6240 }, { "epoch": 0.29017131714564276, "grad_norm": 29.703691482543945, "learning_rate": 4.517000170233839e-07, "logits/chosen": -19.16852378845215, "logits/rejected": -17.737834930419922, "logps/chosen": -523.1118774414062, "logps/rejected": -322.70782470703125, "loss": 0.4349, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.3213160037994385, "rewards/margins": 0.751971423625946, "rewards/rejected": 1.5693446397781372, "step": 6250 }, { "epoch": 0.2906355912530758, "grad_norm": 95.0899658203125, "learning_rate": 4.516226380054784e-07, "logits/chosen": -19.813602447509766, "logits/rejected": -18.334657669067383, "logps/chosen": -417.7804260253906, "logps/rejected": -320.07305908203125, "loss": 0.4795, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.087862491607666, "rewards/margins": 0.7271172404289246, "rewards/rejected": 1.3607450723648071, "step": 6260 }, { "epoch": 0.29109986536050886, "grad_norm": 55.8393440246582, "learning_rate": 4.515452589875729e-07, "logits/chosen": -18.901653289794922, "logits/rejected": -18.533588409423828, "logps/chosen": -445.5143127441406, "logps/rejected": -328.63836669921875, "loss": 0.5827, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.197464942932129, "rewards/margins": 0.4248882234096527, "rewards/rejected": 1.7725765705108643, "step": 6270 }, { "epoch": 0.29156413946794185, "grad_norm": 
99.89533233642578, "learning_rate": 4.514678799696674e-07, "logits/chosen": -18.929325103759766, "logits/rejected": -17.762351989746094, "logps/chosen": -421.258544921875, "logps/rejected": -329.8601989746094, "loss": 0.7867, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.1371936798095703, "rewards/margins": 0.1169707402586937, "rewards/rejected": 2.0202229022979736, "step": 6280 }, { "epoch": 0.2920284135753749, "grad_norm": 113.1850814819336, "learning_rate": 4.513905009517619e-07, "logits/chosen": -17.858850479125977, "logits/rejected": -17.11844825744629, "logps/chosen": -470.1708068847656, "logps/rejected": -368.82861328125, "loss": 0.5138, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.9360719919204712, "rewards/margins": 0.5267310738563538, "rewards/rejected": 1.4093409776687622, "step": 6290 }, { "epoch": 0.29249268768280795, "grad_norm": 85.60856628417969, "learning_rate": 4.513131219338564e-07, "logits/chosen": -19.505199432373047, "logits/rejected": -18.30714225769043, "logps/chosen": -422.7147521972656, "logps/rejected": -282.53192138671875, "loss": 0.5465, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.1934590339660645, "rewards/margins": 0.565224826335907, "rewards/rejected": 1.6282342672348022, "step": 6300 }, { "epoch": 0.29295696179024094, "grad_norm": 37.61767578125, "learning_rate": 4.512357429159509e-07, "logits/chosen": -18.740478515625, "logits/rejected": -17.51407241821289, "logps/chosen": -433.39837646484375, "logps/rejected": -252.7118377685547, "loss": 0.4572, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.8808095455169678, "rewards/margins": 0.8145774006843567, "rewards/rejected": 1.0662322044372559, "step": 6310 }, { "epoch": 0.293421235897674, "grad_norm": 117.89781188964844, "learning_rate": 4.511583638980454e-07, "logits/chosen": -18.267749786376953, "logits/rejected": -17.913572311401367, "logps/chosen": -504.38250732421875, "logps/rejected": -406.39703369140625, "loss": 
0.6137, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.2673556804656982, "rewards/margins": 0.4155868589878082, "rewards/rejected": 1.8517684936523438, "step": 6320 }, { "epoch": 0.29388551000510704, "grad_norm": 53.45427703857422, "learning_rate": 4.5108098488013993e-07, "logits/chosen": -18.83153533935547, "logits/rejected": -17.600502014160156, "logps/chosen": -448.635498046875, "logps/rejected": -261.4093933105469, "loss": 0.4504, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.325895309448242, "rewards/margins": 0.8971165418624878, "rewards/rejected": 1.4287787675857544, "step": 6330 }, { "epoch": 0.29434978411254004, "grad_norm": 70.8793716430664, "learning_rate": 4.5100360586223433e-07, "logits/chosen": -19.342130661010742, "logits/rejected": -18.639625549316406, "logps/chosen": -463.4729919433594, "logps/rejected": -378.36114501953125, "loss": 0.5564, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.204627275466919, "rewards/margins": 0.7608876824378967, "rewards/rejected": 1.4437392950057983, "step": 6340 }, { "epoch": 0.2948140582199731, "grad_norm": 115.48554229736328, "learning_rate": 4.5092622684432884e-07, "logits/chosen": -18.653240203857422, "logits/rejected": -17.472455978393555, "logps/chosen": -443.398193359375, "logps/rejected": -317.9816589355469, "loss": 0.4622, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.250866413116455, "rewards/margins": 0.7258477210998535, "rewards/rejected": 1.5250188112258911, "step": 6350 }, { "epoch": 0.2952783323274061, "grad_norm": 138.86585998535156, "learning_rate": 4.5084884782642336e-07, "logits/chosen": -18.569059371948242, "logits/rejected": -17.572376251220703, "logps/chosen": -383.2657775878906, "logps/rejected": -306.6467590332031, "loss": 0.5818, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.7483444213867188, "rewards/margins": 0.463688462972641, "rewards/rejected": 1.284656047821045, "step": 6360 }, { "epoch": 0.29574260643483913, 
"grad_norm": 110.58497619628906, "learning_rate": 4.5077146880851787e-07, "logits/chosen": -18.30415153503418, "logits/rejected": -18.99339485168457, "logps/chosen": -455.3701171875, "logps/rejected": -455.38671875, "loss": 0.7407, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.852103590965271, "rewards/margins": -0.012845456600189209, "rewards/rejected": 1.8649489879608154, "step": 6370 }, { "epoch": 0.2962068805422722, "grad_norm": 12.135592460632324, "learning_rate": 4.506940897906124e-07, "logits/chosen": -18.44919204711914, "logits/rejected": -16.611703872680664, "logps/chosen": -492.9180603027344, "logps/rejected": -267.412841796875, "loss": 0.3979, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.993689775466919, "rewards/margins": 1.2678112983703613, "rewards/rejected": 1.725878357887268, "step": 6380 }, { "epoch": 0.29667115464970517, "grad_norm": 99.71967315673828, "learning_rate": 4.5061671077270684e-07, "logits/chosen": -19.716541290283203, "logits/rejected": -18.712448120117188, "logps/chosen": -547.3704833984375, "logps/rejected": -412.31866455078125, "loss": 0.4775, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.0057950019836426, "rewards/margins": 0.905698299407959, "rewards/rejected": 2.1000964641571045, "step": 6390 }, { "epoch": 0.2971354287571382, "grad_norm": 122.88839721679688, "learning_rate": 4.5053933175480135e-07, "logits/chosen": -18.583908081054688, "logits/rejected": -17.502893447875977, "logps/chosen": -410.86663818359375, "logps/rejected": -321.6383056640625, "loss": 0.6065, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.141246795654297, "rewards/margins": 0.4611966013908386, "rewards/rejected": 1.680050253868103, "step": 6400 }, { "epoch": 0.29759970286457127, "grad_norm": 58.72220993041992, "learning_rate": 4.5046195273689586e-07, "logits/chosen": -18.63559341430664, "logits/rejected": -18.526308059692383, "logps/chosen": -369.2650146484375, "logps/rejected": 
-374.5126037597656, "loss": 0.8052, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.7662595510482788, "rewards/margins": 0.02883397974073887, "rewards/rejected": 1.737425446510315, "step": 6410 }, { "epoch": 0.29806397697200426, "grad_norm": 41.14951705932617, "learning_rate": 4.5038457371899037e-07, "logits/chosen": -17.908641815185547, "logits/rejected": -17.312259674072266, "logps/chosen": -368.3945617675781, "logps/rejected": -311.26007080078125, "loss": 0.5251, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.314333200454712, "rewards/margins": 0.628964900970459, "rewards/rejected": 1.685368299484253, "step": 6420 }, { "epoch": 0.2985282510794373, "grad_norm": 50.58483123779297, "learning_rate": 4.503071947010849e-07, "logits/chosen": -18.578710556030273, "logits/rejected": -17.712841033935547, "logps/chosen": -390.87591552734375, "logps/rejected": -287.02459716796875, "loss": 0.5427, "rewards/accuracies": 0.5, "rewards/chosen": 2.221832513809204, "rewards/margins": 0.7211825847625732, "rewards/rejected": 1.5006500482559204, "step": 6430 }, { "epoch": 0.2989925251868703, "grad_norm": 44.43006896972656, "learning_rate": 4.502298156831793e-07, "logits/chosen": -18.333593368530273, "logits/rejected": -18.486061096191406, "logps/chosen": -378.93798828125, "logps/rejected": -395.6138610839844, "loss": 0.7304, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.9581104516983032, "rewards/margins": 0.05192537233233452, "rewards/rejected": 1.9061849117279053, "step": 6440 }, { "epoch": 0.29945679929430336, "grad_norm": 141.084716796875, "learning_rate": 4.501524366652738e-07, "logits/chosen": -18.14364242553711, "logits/rejected": -18.01663589477539, "logps/chosen": -371.4427490234375, "logps/rejected": -333.36431884765625, "loss": 0.6134, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.019927740097046, "rewards/margins": 0.3497108817100525, "rewards/rejected": 1.6702167987823486, "step": 6450 }, { "epoch": 
0.2999210734017364, "grad_norm": 19.89225959777832, "learning_rate": 4.500750576473683e-07, "logits/chosen": -18.2762393951416, "logits/rejected": -17.262218475341797, "logps/chosen": -366.0596008300781, "logps/rejected": -229.4912109375, "loss": 0.4541, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.465564489364624, "rewards/margins": 1.1215770244598389, "rewards/rejected": 1.3439874649047852, "step": 6460 }, { "epoch": 0.3003853475091694, "grad_norm": 44.43830108642578, "learning_rate": 4.499976786294628e-07, "logits/chosen": -18.76589584350586, "logits/rejected": -18.33378791809082, "logps/chosen": -302.6170959472656, "logps/rejected": -316.2560119628906, "loss": 0.8354, "rewards/accuracies": 0.5, "rewards/chosen": 1.9527393579483032, "rewards/margins": -0.032046835869550705, "rewards/rejected": 1.984786033630371, "step": 6470 }, { "epoch": 0.30084962161660245, "grad_norm": 98.21278381347656, "learning_rate": 4.4992029961155733e-07, "logits/chosen": -18.39746856689453, "logits/rejected": -17.479244232177734, "logps/chosen": -419.4087829589844, "logps/rejected": -283.6905212402344, "loss": 0.5074, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.9437434673309326, "rewards/margins": 0.5770038366317749, "rewards/rejected": 1.3667396306991577, "step": 6480 }, { "epoch": 0.3013138957240355, "grad_norm": 149.81517028808594, "learning_rate": 4.498429205936518e-07, "logits/chosen": -18.646297454833984, "logits/rejected": -17.388145446777344, "logps/chosen": -461.6273498535156, "logps/rejected": -320.1618957519531, "loss": 0.6438, "rewards/accuracies": 0.5, "rewards/chosen": 2.5014069080352783, "rewards/margins": 0.6433005928993225, "rewards/rejected": 1.858106255531311, "step": 6490 }, { "epoch": 0.3017781698314685, "grad_norm": 94.1058120727539, "learning_rate": 4.497655415757463e-07, "logits/chosen": -18.141117095947266, "logits/rejected": -17.8342227935791, "logps/chosen": -441.9234924316406, "logps/rejected": -314.18768310546875, "loss": 
0.6572, "rewards/accuracies": 0.5, "rewards/chosen": 1.7171844244003296, "rewards/margins": 0.24833659827709198, "rewards/rejected": 1.4688477516174316, "step": 6500 }, { "epoch": 0.30224244393890154, "grad_norm": 136.39271545410156, "learning_rate": 4.496881625578408e-07, "logits/chosen": -18.700820922851562, "logits/rejected": -18.54134750366211, "logps/chosen": -491.50408935546875, "logps/rejected": -474.56201171875, "loss": 0.7509, "rewards/accuracies": 0.5, "rewards/chosen": 2.8264079093933105, "rewards/margins": 0.21201209723949432, "rewards/rejected": 2.6143956184387207, "step": 6510 }, { "epoch": 0.30270671804633453, "grad_norm": 47.45055389404297, "learning_rate": 4.496107835399353e-07, "logits/chosen": -17.775959014892578, "logits/rejected": -17.304292678833008, "logps/chosen": -391.17547607421875, "logps/rejected": -344.17999267578125, "loss": 0.8127, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.2394092082977295, "rewards/margins": 0.6947661638259888, "rewards/rejected": 1.5446430444717407, "step": 6520 }, { "epoch": 0.3031709921537676, "grad_norm": 79.0390396118164, "learning_rate": 4.4953340452202984e-07, "logits/chosen": -18.271711349487305, "logits/rejected": -18.12631607055664, "logps/chosen": -455.8243103027344, "logps/rejected": -343.89935302734375, "loss": 0.5534, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.6029417514801025, "rewards/margins": 0.7211266756057739, "rewards/rejected": 1.8818151950836182, "step": 6530 }, { "epoch": 0.30363526626120063, "grad_norm": 6.48868465423584, "learning_rate": 4.4945602550412424e-07, "logits/chosen": -18.746919631958008, "logits/rejected": -17.028160095214844, "logps/chosen": -435.60650634765625, "logps/rejected": -310.5982360839844, "loss": 0.4415, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.35733699798584, "rewards/margins": 0.8399167060852051, "rewards/rejected": 1.5174204111099243, "step": 6540 }, { "epoch": 0.3040995403686336, "grad_norm": 
64.05901336669922, "learning_rate": 4.4937864648621875e-07, "logits/chosen": -18.210330963134766, "logits/rejected": -17.397857666015625, "logps/chosen": -410.5203552246094, "logps/rejected": -329.1025390625, "loss": 0.5564, "rewards/accuracies": 0.5, "rewards/chosen": 1.8755552768707275, "rewards/margins": 0.4558979570865631, "rewards/rejected": 1.4196574687957764, "step": 6550 }, { "epoch": 0.3045638144760667, "grad_norm": 58.73033905029297, "learning_rate": 4.4930126746831326e-07, "logits/chosen": -17.3070011138916, "logits/rejected": -16.97463607788086, "logps/chosen": -331.6697082519531, "logps/rejected": -317.4020690917969, "loss": 0.7866, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.9782602787017822, "rewards/margins": 0.15670673549175262, "rewards/rejected": 1.8215534687042236, "step": 6560 }, { "epoch": 0.3050280885834997, "grad_norm": 22.502208709716797, "learning_rate": 4.492238884504078e-07, "logits/chosen": -17.30889320373535, "logits/rejected": -16.957820892333984, "logps/chosen": -405.88970947265625, "logps/rejected": -361.12298583984375, "loss": 0.8793, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.9434601068496704, "rewards/margins": 0.11435566842556, "rewards/rejected": 1.8291046619415283, "step": 6570 }, { "epoch": 0.3054923626909327, "grad_norm": 56.74030303955078, "learning_rate": 4.491465094325023e-07, "logits/chosen": -18.397747039794922, "logits/rejected": -17.73749542236328, "logps/chosen": -486.36572265625, "logps/rejected": -358.3392639160156, "loss": 0.4772, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.3694634437561035, "rewards/margins": 0.6432716846466064, "rewards/rejected": 1.726191759109497, "step": 6580 }, { "epoch": 0.30595663679836577, "grad_norm": 44.85960388183594, "learning_rate": 4.4906913041459674e-07, "logits/chosen": -18.796443939208984, "logits/rejected": -18.364452362060547, "logps/chosen": -354.6348571777344, "logps/rejected": -326.247314453125, "loss": 0.7444, 
"rewards/accuracies": 0.5, "rewards/chosen": 1.715813398361206, "rewards/margins": -0.08206482976675034, "rewards/rejected": 1.7978782653808594, "step": 6590 }, { "epoch": 0.30642091090579876, "grad_norm": 117.3509750366211, "learning_rate": 4.4899175139669126e-07, "logits/chosen": -18.421295166015625, "logits/rejected": -17.921833038330078, "logps/chosen": -395.81500244140625, "logps/rejected": -296.3423767089844, "loss": 0.4331, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.6030824184417725, "rewards/margins": 0.9313087463378906, "rewards/rejected": 1.6717736721038818, "step": 6600 }, { "epoch": 0.3068851850132318, "grad_norm": 35.87222671508789, "learning_rate": 4.4891437237878577e-07, "logits/chosen": -19.03404426574707, "logits/rejected": -18.03036880493164, "logps/chosen": -358.5104064941406, "logps/rejected": -271.74188232421875, "loss": 0.5931, "rewards/accuracies": 0.5, "rewards/chosen": 1.8822505474090576, "rewards/margins": 0.4014304280281067, "rewards/rejected": 1.4808204174041748, "step": 6610 }, { "epoch": 0.30734945912066486, "grad_norm": 171.09939575195312, "learning_rate": 4.488369933608803e-07, "logits/chosen": -18.668956756591797, "logits/rejected": -18.447832107543945, "logps/chosen": -365.9139404296875, "logps/rejected": -345.66302490234375, "loss": 0.7255, "rewards/accuracies": 0.5, "rewards/chosen": 1.4940041303634644, "rewards/margins": 0.08938570320606232, "rewards/rejected": 1.404618501663208, "step": 6620 }, { "epoch": 0.30781373322809785, "grad_norm": 8.61028003692627, "learning_rate": 4.4875961434297474e-07, "logits/chosen": -19.082805633544922, "logits/rejected": -17.584239959716797, "logps/chosen": -550.3553466796875, "logps/rejected": -360.8780517578125, "loss": 0.3279, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.723952054977417, "rewards/margins": 1.3747174739837646, "rewards/rejected": 1.3492345809936523, "step": 6630 }, { "epoch": 0.3082780073355309, "grad_norm": 16.963912963867188, 
"learning_rate": 4.486822353250692e-07, "logits/chosen": -18.267934799194336, "logits/rejected": -18.580669403076172, "logps/chosen": -317.28466796875, "logps/rejected": -346.739990234375, "loss": 0.6722, "rewards/accuracies": 0.5, "rewards/chosen": 1.5437891483306885, "rewards/margins": 0.1993931382894516, "rewards/rejected": 1.3443959951400757, "step": 6640 }, { "epoch": 0.30874228144296395, "grad_norm": 67.66730499267578, "learning_rate": 4.486048563071637e-07, "logits/chosen": -18.88607406616211, "logits/rejected": -18.305282592773438, "logps/chosen": -466.75042724609375, "logps/rejected": -381.56201171875, "loss": 0.525, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.9791462421417236, "rewards/margins": 0.6038893461227417, "rewards/rejected": 1.375256896018982, "step": 6650 }, { "epoch": 0.30920655555039694, "grad_norm": 111.03141784667969, "learning_rate": 4.485274772892582e-07, "logits/chosen": -18.19338607788086, "logits/rejected": -18.43362045288086, "logps/chosen": -326.5416259765625, "logps/rejected": -339.3835144042969, "loss": 1.0986, "rewards/accuracies": 0.5, "rewards/chosen": 1.7668558359146118, "rewards/margins": -0.42930251359939575, "rewards/rejected": 2.1961586475372314, "step": 6660 }, { "epoch": 0.30967082965783, "grad_norm": 120.46660614013672, "learning_rate": 4.4845009827135273e-07, "logits/chosen": -19.365062713623047, "logits/rejected": -18.615985870361328, "logps/chosen": -454.3421936035156, "logps/rejected": -312.18304443359375, "loss": 0.5192, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.3235890865325928, "rewards/margins": 0.7776178121566772, "rewards/rejected": 1.545971393585205, "step": 6670 }, { "epoch": 0.310135103765263, "grad_norm": 55.589134216308594, "learning_rate": 4.4837271925344724e-07, "logits/chosen": -18.687875747680664, "logits/rejected": -18.354595184326172, "logps/chosen": -374.38787841796875, "logps/rejected": -267.73089599609375, "loss": 0.6668, "rewards/accuracies": 0.800000011920929, 
"rewards/chosen": 1.868817925453186, "rewards/margins": 0.14814619719982147, "rewards/rejected": 1.7206714153289795, "step": 6680 }, { "epoch": 0.31059937787269604, "grad_norm": 63.909149169921875, "learning_rate": 4.482953402355417e-07, "logits/chosen": -18.475923538208008, "logits/rejected": -17.258134841918945, "logps/chosen": -475.09918212890625, "logps/rejected": -307.59100341796875, "loss": 0.5406, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.9247182607650757, "rewards/margins": 0.5673174858093262, "rewards/rejected": 1.357400894165039, "step": 6690 }, { "epoch": 0.3110636519801291, "grad_norm": 30.974388122558594, "learning_rate": 4.482179612176362e-07, "logits/chosen": -18.13128662109375, "logits/rejected": -16.79903793334961, "logps/chosen": -483.2215881347656, "logps/rejected": -287.77227783203125, "loss": 0.4823, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.5902860164642334, "rewards/margins": 1.058445930480957, "rewards/rejected": 1.5318400859832764, "step": 6700 }, { "epoch": 0.3115279260875621, "grad_norm": 211.21551513671875, "learning_rate": 4.481405821997307e-07, "logits/chosen": -18.316823959350586, "logits/rejected": -18.048221588134766, "logps/chosen": -450.5445861816406, "logps/rejected": -374.86474609375, "loss": 0.6142, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.895125389099121, "rewards/margins": 0.3992791771888733, "rewards/rejected": 2.4958462715148926, "step": 6710 }, { "epoch": 0.31199220019499513, "grad_norm": 14.530380249023438, "learning_rate": 4.4806320318182523e-07, "logits/chosen": -18.652101516723633, "logits/rejected": -18.8072452545166, "logps/chosen": -338.2341003417969, "logps/rejected": -341.45245361328125, "loss": 1.0104, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.6652641296386719, "rewards/margins": -0.24818117916584015, "rewards/rejected": 1.9134454727172852, "step": 6720 }, { "epoch": 0.3124564743024282, "grad_norm": 55.65016555786133, "learning_rate": 
4.479858241639197e-07, "logits/chosen": -18.161645889282227, "logits/rejected": -18.210983276367188, "logps/chosen": -453.9847106933594, "logps/rejected": -461.904541015625, "loss": 0.728, "rewards/accuracies": 0.5, "rewards/chosen": 2.134427070617676, "rewards/margins": 0.1763276755809784, "rewards/rejected": 1.958099603652954, "step": 6730 }, { "epoch": 0.31292074840986117, "grad_norm": 163.37551879882812, "learning_rate": 4.4790844514601415e-07, "logits/chosen": -18.053802490234375, "logits/rejected": -17.61781120300293, "logps/chosen": -330.433349609375, "logps/rejected": -329.93695068359375, "loss": 0.9544, "rewards/accuracies": 0.5, "rewards/chosen": 1.6697015762329102, "rewards/margins": -0.14639052748680115, "rewards/rejected": 1.8160921335220337, "step": 6740 }, { "epoch": 0.3133850225172942, "grad_norm": 38.151790618896484, "learning_rate": 4.4783106612810866e-07, "logits/chosen": -18.771648406982422, "logits/rejected": -18.036476135253906, "logps/chosen": -270.723876953125, "logps/rejected": -214.21743774414062, "loss": 0.5454, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.6399600505828857, "rewards/margins": 0.3627190589904785, "rewards/rejected": 1.2772409915924072, "step": 6750 }, { "epoch": 0.3138492966247272, "grad_norm": 18.536785125732422, "learning_rate": 4.4775368711020317e-07, "logits/chosen": -18.837387084960938, "logits/rejected": -18.705333709716797, "logps/chosen": -446.3666076660156, "logps/rejected": -410.63916015625, "loss": 0.7114, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.422032594680786, "rewards/margins": 0.18420051038265228, "rewards/rejected": 2.2378320693969727, "step": 6760 }, { "epoch": 0.31431357073216026, "grad_norm": 62.01555252075195, "learning_rate": 4.476763080922977e-07, "logits/chosen": -19.695995330810547, "logits/rejected": -19.469385147094727, "logps/chosen": -427.6568908691406, "logps/rejected": -395.2059631347656, "loss": 0.6754, "rewards/accuracies": 0.30000001192092896, 
"rewards/chosen": 2.0826499462127686, "rewards/margins": 0.17601510882377625, "rewards/rejected": 1.9066349267959595, "step": 6770 }, { "epoch": 0.3147778448395933, "grad_norm": 79.43505096435547, "learning_rate": 4.475989290743922e-07, "logits/chosen": -20.038976669311523, "logits/rejected": -18.698915481567383, "logps/chosen": -432.1941833496094, "logps/rejected": -263.67559814453125, "loss": 0.4426, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.4016029834747314, "rewards/margins": 0.826076328754425, "rewards/rejected": 1.575526475906372, "step": 6780 }, { "epoch": 0.3152421189470263, "grad_norm": 17.252277374267578, "learning_rate": 4.4752155005648665e-07, "logits/chosen": -18.391353607177734, "logits/rejected": -17.34246063232422, "logps/chosen": -460.49822998046875, "logps/rejected": -308.363525390625, "loss": 0.5163, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.538698434829712, "rewards/margins": 0.8236356973648071, "rewards/rejected": 1.7150628566741943, "step": 6790 }, { "epoch": 0.31570639305445936, "grad_norm": 54.39863967895508, "learning_rate": 4.4744417103858116e-07, "logits/chosen": -19.081300735473633, "logits/rejected": -18.785728454589844, "logps/chosen": -492.9556579589844, "logps/rejected": -483.49151611328125, "loss": 0.6832, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.7828452587127686, "rewards/margins": 0.26046043634414673, "rewards/rejected": 2.5223848819732666, "step": 6800 }, { "epoch": 0.3161706671618924, "grad_norm": 41.460018157958984, "learning_rate": 4.473667920206757e-07, "logits/chosen": -18.9410457611084, "logits/rejected": -19.004802703857422, "logps/chosen": -431.85906982421875, "logps/rejected": -342.7481689453125, "loss": 0.4206, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.5479736328125, "rewards/margins": 0.8103610277175903, "rewards/rejected": 1.7376127243041992, "step": 6810 }, { "epoch": 0.3166349412693254, "grad_norm": 51.108211517333984, "learning_rate": 
4.4728941300277013e-07, "logits/chosen": -17.81705665588379, "logits/rejected": -17.541423797607422, "logps/chosen": -585.6610107421875, "logps/rejected": -416.52203369140625, "loss": 0.6066, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.6452317237854004, "rewards/margins": 0.7656130790710449, "rewards/rejected": 1.8796184062957764, "step": 6820 }, { "epoch": 0.31709921537675845, "grad_norm": 11.95645523071289, "learning_rate": 4.4721203398486465e-07, "logits/chosen": -19.118371963500977, "logits/rejected": -18.095081329345703, "logps/chosen": -335.18487548828125, "logps/rejected": -254.42514038085938, "loss": 0.4495, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.813009262084961, "rewards/margins": 0.8012641668319702, "rewards/rejected": 1.0117450952529907, "step": 6830 }, { "epoch": 0.31756348948419144, "grad_norm": 74.02598571777344, "learning_rate": 4.471346549669591e-07, "logits/chosen": -19.55004119873047, "logits/rejected": -19.201709747314453, "logps/chosen": -383.6678466796875, "logps/rejected": -387.75384521484375, "loss": 0.8947, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.0556559562683105, "rewards/margins": -0.2863169312477112, "rewards/rejected": 2.341972827911377, "step": 6840 }, { "epoch": 0.3180277635916245, "grad_norm": 70.6263656616211, "learning_rate": 4.470572759490536e-07, "logits/chosen": -18.376880645751953, "logits/rejected": -17.81203842163086, "logps/chosen": -374.3536071777344, "logps/rejected": -333.8357238769531, "loss": 0.6578, "rewards/accuracies": 0.5, "rewards/chosen": 1.786365270614624, "rewards/margins": 0.27496036887168884, "rewards/rejected": 1.511405110359192, "step": 6850 }, { "epoch": 0.31849203769905754, "grad_norm": 226.53240966796875, "learning_rate": 4.469798969311481e-07, "logits/chosen": -20.073543548583984, "logits/rejected": -18.79205322265625, "logps/chosen": -452.65191650390625, "logps/rejected": -290.3382873535156, "loss": 0.7422, "rewards/accuracies": 0.5, 
"rewards/chosen": 1.8852860927581787, "rewards/margins": 0.42132359743118286, "rewards/rejected": 1.4639625549316406, "step": 6860 }, { "epoch": 0.31895631180649053, "grad_norm": 94.68012237548828, "learning_rate": 4.4690251791324264e-07, "logits/chosen": -20.162113189697266, "logits/rejected": -19.857683181762695, "logps/chosen": -478.27972412109375, "logps/rejected": -470.5245666503906, "loss": 0.6549, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.999403953552246, "rewards/margins": 0.19342580437660217, "rewards/rejected": 2.8059778213500977, "step": 6870 }, { "epoch": 0.3194205859139236, "grad_norm": 92.20976257324219, "learning_rate": 4.4682513889533715e-07, "logits/chosen": -18.289936065673828, "logits/rejected": -18.76435661315918, "logps/chosen": -339.72247314453125, "logps/rejected": -336.8013610839844, "loss": 0.7875, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.6710268259048462, "rewards/margins": -0.04700714349746704, "rewards/rejected": 1.718034029006958, "step": 6880 }, { "epoch": 0.31988486002135663, "grad_norm": 70.2497329711914, "learning_rate": 4.467477598774316e-07, "logits/chosen": -18.92901611328125, "logits/rejected": -18.36609649658203, "logps/chosen": -480.0313415527344, "logps/rejected": -396.31146240234375, "loss": 0.733, "rewards/accuracies": 0.5, "rewards/chosen": 2.0103907585144043, "rewards/margins": 0.11758433282375336, "rewards/rejected": 1.8928064107894897, "step": 6890 }, { "epoch": 0.3203491341287896, "grad_norm": 46.499595642089844, "learning_rate": 4.466703808595261e-07, "logits/chosen": -17.762950897216797, "logits/rejected": -17.391386032104492, "logps/chosen": -284.1953125, "logps/rejected": -229.4443817138672, "loss": 0.5464, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.7057307958602905, "rewards/margins": 0.490952730178833, "rewards/rejected": 1.214778184890747, "step": 6900 }, { "epoch": 0.3208134082362227, "grad_norm": 46.37470626831055, "learning_rate": 
4.4659300184162063e-07, "logits/chosen": -18.50128173828125, "logits/rejected": -17.452255249023438, "logps/chosen": -387.7268371582031, "logps/rejected": -203.74685668945312, "loss": 0.2932, "rewards/accuracies": 1.0, "rewards/chosen": 2.372887134552002, "rewards/margins": 1.3737168312072754, "rewards/rejected": 0.9991704821586609, "step": 6910 }, { "epoch": 0.32127768234365567, "grad_norm": 266.962646484375, "learning_rate": 4.465156228237151e-07, "logits/chosen": -17.578210830688477, "logits/rejected": -18.44921875, "logps/chosen": -389.90887451171875, "logps/rejected": -511.81103515625, "loss": 1.4839, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.769346833229065, "rewards/margins": -0.991044819355011, "rewards/rejected": 2.7603919506073, "step": 6920 }, { "epoch": 0.3217419564510887, "grad_norm": 132.55426025390625, "learning_rate": 4.464382438058096e-07, "logits/chosen": -17.839019775390625, "logits/rejected": -17.40726661682129, "logps/chosen": -359.8187561035156, "logps/rejected": -338.21685791015625, "loss": 0.6337, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.2414774894714355, "rewards/margins": 0.3663673400878906, "rewards/rejected": 1.8751102685928345, "step": 6930 }, { "epoch": 0.32220623055852177, "grad_norm": 5.384225845336914, "learning_rate": 4.4636086478790406e-07, "logits/chosen": -18.81876564025879, "logits/rejected": -17.96945571899414, "logps/chosen": -467.8603515625, "logps/rejected": -398.43206787109375, "loss": 0.6623, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.3121840953826904, "rewards/margins": 0.48993539810180664, "rewards/rejected": 1.8222484588623047, "step": 6940 }, { "epoch": 0.32267050466595476, "grad_norm": 187.71153259277344, "learning_rate": 4.4628348576999857e-07, "logits/chosen": -18.518795013427734, "logits/rejected": -18.825761795043945, "logps/chosen": -478.67279052734375, "logps/rejected": -519.3861083984375, "loss": 0.8963, "rewards/accuracies": 0.5, "rewards/chosen": 
2.564729690551758, "rewards/margins": -0.13552838563919067, "rewards/rejected": 2.7002580165863037, "step": 6950 }, { "epoch": 0.3231347787733878, "grad_norm": 264.2018127441406, "learning_rate": 4.462061067520931e-07, "logits/chosen": -18.865915298461914, "logits/rejected": -17.40823745727539, "logps/chosen": -517.6380004882812, "logps/rejected": -340.7430114746094, "loss": 0.4708, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.763240337371826, "rewards/margins": 1.2017762660980225, "rewards/rejected": 1.5614638328552246, "step": 6960 }, { "epoch": 0.32359905288082086, "grad_norm": 106.03022003173828, "learning_rate": 4.461287277341876e-07, "logits/chosen": -18.14141845703125, "logits/rejected": -18.26573944091797, "logps/chosen": -366.97503662109375, "logps/rejected": -270.8533020019531, "loss": 0.6588, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.6381555795669556, "rewards/margins": 0.3027525246143341, "rewards/rejected": 1.3354028463363647, "step": 6970 }, { "epoch": 0.32406332698825385, "grad_norm": 62.19585418701172, "learning_rate": 4.460513487162821e-07, "logits/chosen": -18.34602165222168, "logits/rejected": -18.573444366455078, "logps/chosen": -418.6915588378906, "logps/rejected": -405.12677001953125, "loss": 0.6946, "rewards/accuracies": 0.5, "rewards/chosen": 2.1393024921417236, "rewards/margins": 0.24402804672718048, "rewards/rejected": 1.8952741622924805, "step": 6980 }, { "epoch": 0.3245276010956869, "grad_norm": 94.27056884765625, "learning_rate": 4.4597396969837656e-07, "logits/chosen": -19.278133392333984, "logits/rejected": -18.739582061767578, "logps/chosen": -416.8150939941406, "logps/rejected": -276.16583251953125, "loss": 0.5584, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.2071313858032227, "rewards/margins": 0.6943358778953552, "rewards/rejected": 1.5127958059310913, "step": 6990 }, { "epoch": 0.3249918752031199, "grad_norm": 158.63922119140625, "learning_rate": 4.4589659068047107e-07, 
"logits/chosen": -18.110715866088867, "logits/rejected": -17.202102661132812, "logps/chosen": -503.43194580078125, "logps/rejected": -444.58843994140625, "loss": 0.6591, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.5754387378692627, "rewards/margins": 0.5362502336502075, "rewards/rejected": 2.0391883850097656, "step": 7000 }, { "epoch": 0.32545614931055294, "grad_norm": 176.9473419189453, "learning_rate": 4.4581921166256553e-07, "logits/chosen": -18.805782318115234, "logits/rejected": -18.251039505004883, "logps/chosen": -371.50469970703125, "logps/rejected": -307.33380126953125, "loss": 0.5972, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.1580584049224854, "rewards/margins": 0.34155529737472534, "rewards/rejected": 1.8165031671524048, "step": 7010 }, { "epoch": 0.325920423417986, "grad_norm": 104.02596282958984, "learning_rate": 4.4574183264466004e-07, "logits/chosen": -18.882793426513672, "logits/rejected": -17.85614013671875, "logps/chosen": -448.78125, "logps/rejected": -307.1908874511719, "loss": 0.4998, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.1225504875183105, "rewards/margins": 0.7211644649505615, "rewards/rejected": 1.4013859033584595, "step": 7020 }, { "epoch": 0.326384697525419, "grad_norm": 58.3088264465332, "learning_rate": 4.4566445362675455e-07, "logits/chosen": -18.236988067626953, "logits/rejected": -17.20285415649414, "logps/chosen": -342.75677490234375, "logps/rejected": -181.69357299804688, "loss": 0.5053, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.9771913290023804, "rewards/margins": 0.9211351275444031, "rewards/rejected": 1.0560564994812012, "step": 7030 }, { "epoch": 0.32684897163285204, "grad_norm": 51.73158645629883, "learning_rate": 4.45587074608849e-07, "logits/chosen": -18.860546112060547, "logits/rejected": -17.85982894897461, "logps/chosen": -477.52001953125, "logps/rejected": -371.22540283203125, "loss": 0.6691, "rewards/accuracies": 0.6000000238418579, 
"rewards/chosen": 2.3976833820343018, "rewards/margins": 0.7144172787666321, "rewards/rejected": 1.683266282081604, "step": 7040 }, { "epoch": 0.3273132457402851, "grad_norm": 77.89717864990234, "learning_rate": 4.455096955909435e-07, "logits/chosen": -18.52495765686035, "logits/rejected": -17.623937606811523, "logps/chosen": -437.541259765625, "logps/rejected": -271.589111328125, "loss": 0.4719, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.1711525917053223, "rewards/margins": 0.6580663919448853, "rewards/rejected": 1.5130863189697266, "step": 7050 }, { "epoch": 0.3277775198477181, "grad_norm": 33.980262756347656, "learning_rate": 4.4543231657303803e-07, "logits/chosen": -19.582355499267578, "logits/rejected": -17.4016170501709, "logps/chosen": -436.209716796875, "logps/rejected": -279.65850830078125, "loss": 0.3629, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.3579819202423096, "rewards/margins": 1.2228877544403076, "rewards/rejected": 1.1350940465927124, "step": 7060 }, { "epoch": 0.32824179395515113, "grad_norm": 9.496074676513672, "learning_rate": 4.4535493755513255e-07, "logits/chosen": -19.540369033813477, "logits/rejected": -17.685678482055664, "logps/chosen": -548.6292724609375, "logps/rejected": -270.08001708984375, "loss": 0.417, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.682337522506714, "rewards/margins": 1.0963608026504517, "rewards/rejected": 1.5859766006469727, "step": 7070 }, { "epoch": 0.3287060680625841, "grad_norm": 57.89701461791992, "learning_rate": 4.4527755853722706e-07, "logits/chosen": -18.601314544677734, "logits/rejected": -17.777509689331055, "logps/chosen": -489.5, "logps/rejected": -350.382568359375, "loss": 0.4925, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.4342446327209473, "rewards/margins": 0.6263636350631714, "rewards/rejected": 1.8078807592391968, "step": 7080 }, { "epoch": 0.32917034217001717, "grad_norm": 21.4351806640625, "learning_rate": 
4.452001795193215e-07, "logits/chosen": -18.343090057373047, "logits/rejected": -17.045440673828125, "logps/chosen": -421.8594665527344, "logps/rejected": -276.5380859375, "loss": 0.5285, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.0087649822235107, "rewards/margins": 0.7421534657478333, "rewards/rejected": 1.2666115760803223, "step": 7090 }, { "epoch": 0.3296346162774502, "grad_norm": 37.65753936767578, "learning_rate": 4.4512280050141603e-07, "logits/chosen": -17.979366302490234, "logits/rejected": -18.141822814941406, "logps/chosen": -216.9919891357422, "logps/rejected": -208.9702606201172, "loss": 0.6351, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.204866886138916, "rewards/margins": 0.20743553340435028, "rewards/rejected": 0.9974311590194702, "step": 7100 }, { "epoch": 0.3300988903848832, "grad_norm": 121.82192993164062, "learning_rate": 4.450454214835105e-07, "logits/chosen": -18.495519638061523, "logits/rejected": -18.22584342956543, "logps/chosen": -340.3514404296875, "logps/rejected": -335.8218688964844, "loss": 0.6417, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.1855452060699463, "rewards/margins": 0.27301225066185, "rewards/rejected": 1.912533164024353, "step": 7110 }, { "epoch": 0.33056316449231626, "grad_norm": 300.8388671875, "learning_rate": 4.44968042465605e-07, "logits/chosen": -18.462646484375, "logits/rejected": -18.594120025634766, "logps/chosen": -302.3045654296875, "logps/rejected": -282.7740478515625, "loss": 0.7408, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.8150854110717773, "rewards/margins": 0.14568758010864258, "rewards/rejected": 1.6693979501724243, "step": 7120 }, { "epoch": 0.3310274385997493, "grad_norm": 36.35762405395508, "learning_rate": 4.448906634476995e-07, "logits/chosen": -18.696842193603516, "logits/rejected": -18.195920944213867, "logps/chosen": -405.77886962890625, "logps/rejected": -448.590087890625, "loss": 0.7811, "rewards/accuracies": 
0.6000000238418579, "rewards/chosen": 2.3213980197906494, "rewards/margins": 0.20215411484241486, "rewards/rejected": 2.119243860244751, "step": 7130 }, { "epoch": 0.3314917127071823, "grad_norm": 78.70037841796875, "learning_rate": 4.4481328442979397e-07, "logits/chosen": -18.450468063354492, "logits/rejected": -17.264202117919922, "logps/chosen": -368.079833984375, "logps/rejected": -268.283447265625, "loss": 0.6285, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.6131839752197266, "rewards/margins": 0.47560566663742065, "rewards/rejected": 2.1375784873962402, "step": 7140 }, { "epoch": 0.33195598681461536, "grad_norm": 113.63204193115234, "learning_rate": 4.447359054118885e-07, "logits/chosen": -19.25983238220215, "logits/rejected": -18.94969367980957, "logps/chosen": -462.0921325683594, "logps/rejected": -378.9140625, "loss": 0.82, "rewards/accuracies": 0.5, "rewards/chosen": 2.2514734268188477, "rewards/margins": -0.011166977696120739, "rewards/rejected": 2.2626404762268066, "step": 7150 }, { "epoch": 0.3324202609220484, "grad_norm": 147.04959106445312, "learning_rate": 4.44658526393983e-07, "logits/chosen": -18.940832138061523, "logits/rejected": -18.54412841796875, "logps/chosen": -420.56451416015625, "logps/rejected": -359.66607666015625, "loss": 0.4798, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.5226736068725586, "rewards/margins": 0.6323157548904419, "rewards/rejected": 1.8903582096099854, "step": 7160 }, { "epoch": 0.3328845350294814, "grad_norm": 49.866294860839844, "learning_rate": 4.445811473760775e-07, "logits/chosen": -19.576168060302734, "logits/rejected": -19.158527374267578, "logps/chosen": -316.56298828125, "logps/rejected": -254.56796264648438, "loss": 0.631, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.7912347316741943, "rewards/margins": 0.32842525839805603, "rewards/rejected": 1.4628095626831055, "step": 7170 }, { "epoch": 0.33334880913691445, "grad_norm": 17.227378845214844, "learning_rate": 
4.44503768358172e-07, "logits/chosen": -18.67206382751465, "logits/rejected": -18.15407371520996, "logps/chosen": -397.18157958984375, "logps/rejected": -324.2698669433594, "loss": 0.5481, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.9813966751098633, "rewards/margins": 0.5041242837905884, "rewards/rejected": 1.4772722721099854, "step": 7180 }, { "epoch": 0.33381308324434744, "grad_norm": 9.333395957946777, "learning_rate": 4.4442638934026647e-07, "logits/chosen": -20.850238800048828, "logits/rejected": -19.023700714111328, "logps/chosen": -372.95806884765625, "logps/rejected": -243.9213104248047, "loss": 0.4722, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.326026439666748, "rewards/margins": 0.8853293657302856, "rewards/rejected": 1.440697431564331, "step": 7190 }, { "epoch": 0.3342773573517805, "grad_norm": 156.16302490234375, "learning_rate": 4.44349010322361e-07, "logits/chosen": -18.63553237915039, "logits/rejected": -18.460826873779297, "logps/chosen": -459.3631286621094, "logps/rejected": -465.4939880371094, "loss": 0.8857, "rewards/accuracies": 0.5, "rewards/chosen": 2.4688000679016113, "rewards/margins": 0.02419900894165039, "rewards/rejected": 2.4446005821228027, "step": 7200 }, { "epoch": 0.33474163145921354, "grad_norm": 174.76364135742188, "learning_rate": 4.4427163130445544e-07, "logits/chosen": -18.28581428527832, "logits/rejected": -17.53836441040039, "logps/chosen": -572.4962768554688, "logps/rejected": -421.98681640625, "loss": 0.5772, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.3462953567504883, "rewards/margins": 0.46517428755760193, "rewards/rejected": 1.881121039390564, "step": 7210 }, { "epoch": 0.33520590556664653, "grad_norm": 58.56778335571289, "learning_rate": 4.4419425228654995e-07, "logits/chosen": -18.358104705810547, "logits/rejected": -17.931987762451172, "logps/chosen": -317.4176940917969, "logps/rejected": -239.9696502685547, "loss": 0.8199, "rewards/accuracies": 0.5, 
"rewards/chosen": 1.4766854047775269, "rewards/margins": 0.11185995489358902, "rewards/rejected": 1.3648254871368408, "step": 7220 }, { "epoch": 0.3356701796740796, "grad_norm": 78.95071411132812, "learning_rate": 4.4411687326864446e-07, "logits/chosen": -18.272907257080078, "logits/rejected": -17.950809478759766, "logps/chosen": -482.3133850097656, "logps/rejected": -396.9216613769531, "loss": 0.7933, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.82490611076355, "rewards/margins": 0.3148460388183594, "rewards/rejected": 2.5100603103637695, "step": 7230 }, { "epoch": 0.33613445378151263, "grad_norm": 198.42198181152344, "learning_rate": 4.440394942507389e-07, "logits/chosen": -18.803348541259766, "logits/rejected": -18.845558166503906, "logps/chosen": -429.26654052734375, "logps/rejected": -451.31610107421875, "loss": 0.9421, "rewards/accuracies": 0.5, "rewards/chosen": 2.1106455326080322, "rewards/margins": 0.04934333637356758, "rewards/rejected": 2.0613019466400146, "step": 7240 }, { "epoch": 0.3365987278889456, "grad_norm": 37.83739471435547, "learning_rate": 4.4396211523283343e-07, "logits/chosen": -18.662954330444336, "logits/rejected": -17.305065155029297, "logps/chosen": -472.81268310546875, "logps/rejected": -296.31610107421875, "loss": 0.3522, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.4771552085876465, "rewards/margins": 1.0803935527801514, "rewards/rejected": 1.3967615365982056, "step": 7250 }, { "epoch": 0.3370630019963787, "grad_norm": 53.67218780517578, "learning_rate": 4.4388473621492794e-07, "logits/chosen": -18.19680404663086, "logits/rejected": -16.86634063720703, "logps/chosen": -386.62689208984375, "logps/rejected": -260.20391845703125, "loss": 0.5771, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.8449159860610962, "rewards/margins": 0.5846588015556335, "rewards/rejected": 1.2602571249008179, "step": 7260 }, { "epoch": 0.33752727610381167, "grad_norm": 39.941524505615234, "learning_rate": 
4.4380735719702245e-07, "logits/chosen": -18.29547691345215, "logits/rejected": -17.326475143432617, "logps/chosen": -449.69171142578125, "logps/rejected": -333.7784729003906, "loss": 0.6123, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.3000988960266113, "rewards/margins": 0.5312236547470093, "rewards/rejected": 1.7688754796981812, "step": 7270 }, { "epoch": 0.3379915502112447, "grad_norm": 88.07353973388672, "learning_rate": 4.4372997817911697e-07, "logits/chosen": -18.53974151611328, "logits/rejected": -17.498050689697266, "logps/chosen": -455.74774169921875, "logps/rejected": -316.2856140136719, "loss": 0.4548, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.002458095550537, "rewards/margins": 1.05044424533844, "rewards/rejected": 1.9520137310028076, "step": 7280 }, { "epoch": 0.33845582431867777, "grad_norm": 50.07919692993164, "learning_rate": 4.436525991612114e-07, "logits/chosen": -17.96199607849121, "logits/rejected": -18.4759578704834, "logps/chosen": -252.08419799804688, "logps/rejected": -305.2804260253906, "loss": 0.8957, "rewards/accuracies": 0.5, "rewards/chosen": 1.4997475147247314, "rewards/margins": -0.1585313081741333, "rewards/rejected": 1.6582788228988647, "step": 7290 }, { "epoch": 0.33892009842611076, "grad_norm": 68.16617584228516, "learning_rate": 4.435752201433059e-07, "logits/chosen": -19.729333877563477, "logits/rejected": -19.047351837158203, "logps/chosen": -477.85833740234375, "logps/rejected": -348.50689697265625, "loss": 0.5164, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.2694952487945557, "rewards/margins": 0.8335744738578796, "rewards/rejected": 1.4359207153320312, "step": 7300 }, { "epoch": 0.3393843725335438, "grad_norm": 62.09828186035156, "learning_rate": 4.434978411254004e-07, "logits/chosen": -18.5720272064209, "logits/rejected": -18.157745361328125, "logps/chosen": -509.483642578125, "logps/rejected": -362.15411376953125, "loss": 0.7457, "rewards/accuracies": 0.800000011920929, 
"rewards/chosen": 2.365178108215332, "rewards/margins": 0.19567541778087616, "rewards/rejected": 2.1695027351379395, "step": 7310 }, { "epoch": 0.33984864664097686, "grad_norm": 72.16187286376953, "learning_rate": 4.434204621074949e-07, "logits/chosen": -18.958454132080078, "logits/rejected": -18.82040023803711, "logps/chosen": -235.72476196289062, "logps/rejected": -249.62680053710938, "loss": 0.8365, "rewards/accuracies": 0.5, "rewards/chosen": 1.4890767335891724, "rewards/margins": -0.12797459959983826, "rewards/rejected": 1.617051362991333, "step": 7320 }, { "epoch": 0.34031292074840985, "grad_norm": 11.424872398376465, "learning_rate": 4.433430830895894e-07, "logits/chosen": -18.85674476623535, "logits/rejected": -17.565536499023438, "logps/chosen": -602.4991455078125, "logps/rejected": -408.45361328125, "loss": 0.4087, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.685995101928711, "rewards/margins": 1.118422508239746, "rewards/rejected": 1.567572832107544, "step": 7330 }, { "epoch": 0.3407771948558429, "grad_norm": 311.9502868652344, "learning_rate": 4.432657040716839e-07, "logits/chosen": -18.405773162841797, "logits/rejected": -17.684446334838867, "logps/chosen": -483.98187255859375, "logps/rejected": -392.6160583496094, "loss": 0.8341, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.084449052810669, "rewards/margins": 0.3418126106262207, "rewards/rejected": 1.7426364421844482, "step": 7340 }, { "epoch": 0.3412414689632759, "grad_norm": 50.15864181518555, "learning_rate": 4.431883250537784e-07, "logits/chosen": -18.82845687866211, "logits/rejected": -18.174835205078125, "logps/chosen": -398.4436340332031, "logps/rejected": -325.2773742675781, "loss": 0.6085, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.9094057083129883, "rewards/margins": 0.3421092629432678, "rewards/rejected": 1.5672963857650757, "step": 7350 }, { "epoch": 0.34170574307070894, "grad_norm": 37.552371978759766, "learning_rate": 
4.431109460358729e-07, "logits/chosen": -18.754955291748047, "logits/rejected": -17.846141815185547, "logps/chosen": -449.31646728515625, "logps/rejected": -278.0230407714844, "loss": 0.5078, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.002131700515747, "rewards/margins": 0.8416348695755005, "rewards/rejected": 1.1604968309402466, "step": 7360 }, { "epoch": 0.342170017178142, "grad_norm": 31.031766891479492, "learning_rate": 4.430335670179674e-07, "logits/chosen": -18.94243812561035, "logits/rejected": -17.44761085510254, "logps/chosen": -433.208251953125, "logps/rejected": -225.074462890625, "loss": 0.4248, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.598054885864258, "rewards/margins": 1.1779026985168457, "rewards/rejected": 1.4201524257659912, "step": 7370 }, { "epoch": 0.342634291285575, "grad_norm": 132.97805786132812, "learning_rate": 4.429561880000619e-07, "logits/chosen": -17.907562255859375, "logits/rejected": -17.14227294921875, "logps/chosen": -455.1376953125, "logps/rejected": -313.150146484375, "loss": 0.6678, "rewards/accuracies": 0.5, "rewards/chosen": 2.4601528644561768, "rewards/margins": 0.5354261994361877, "rewards/rejected": 1.9247264862060547, "step": 7380 }, { "epoch": 0.34309856539300804, "grad_norm": 7.268373489379883, "learning_rate": 4.4287880898215643e-07, "logits/chosen": -18.77916717529297, "logits/rejected": -17.610570907592773, "logps/chosen": -485.3016662597656, "logps/rejected": -361.7034606933594, "loss": 0.5243, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.376180648803711, "rewards/margins": 0.6598572134971619, "rewards/rejected": 1.7163234949111938, "step": 7390 }, { "epoch": 0.3435628395004411, "grad_norm": 121.83268737792969, "learning_rate": 4.4280142996425084e-07, "logits/chosen": -18.526874542236328, "logits/rejected": -18.419933319091797, "logps/chosen": -327.9332275390625, "logps/rejected": -361.06298828125, "loss": 0.8832, "rewards/accuracies": 0.5, "rewards/chosen": 
1.5079922676086426, "rewards/margins": -0.1278536468744278, "rewards/rejected": 1.6358457803726196, "step": 7400 }, { "epoch": 0.3440271136078741, "grad_norm": 104.65443420410156, "learning_rate": 4.4272405094634535e-07, "logits/chosen": -17.984813690185547, "logits/rejected": -18.02566146850586, "logps/chosen": -384.81768798828125, "logps/rejected": -408.836669921875, "loss": 0.9081, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.7262799739837646, "rewards/margins": -0.263207346200943, "rewards/rejected": 1.9894874095916748, "step": 7410 }, { "epoch": 0.34449138771530713, "grad_norm": 54.856346130371094, "learning_rate": 4.4264667192843986e-07, "logits/chosen": -18.43636703491211, "logits/rejected": -17.1707763671875, "logps/chosen": -431.66571044921875, "logps/rejected": -328.1695251464844, "loss": 0.7498, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.2788686752319336, "rewards/margins": 0.4615209698677063, "rewards/rejected": 1.8173478841781616, "step": 7420 }, { "epoch": 0.3449556618227401, "grad_norm": 104.29693603515625, "learning_rate": 4.4256929291053437e-07, "logits/chosen": -19.27444839477539, "logits/rejected": -18.40316390991211, "logps/chosen": -530.0064697265625, "logps/rejected": -408.1908874511719, "loss": 0.4601, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.561110734939575, "rewards/margins": 0.7854230999946594, "rewards/rejected": 1.775687575340271, "step": 7430 }, { "epoch": 0.34541993593017317, "grad_norm": 75.28850555419922, "learning_rate": 4.4249191389262883e-07, "logits/chosen": -17.341646194458008, "logits/rejected": -17.080387115478516, "logps/chosen": -415.6512145996094, "logps/rejected": -387.8993225097656, "loss": 0.8233, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.1663918495178223, "rewards/margins": 0.22525537014007568, "rewards/rejected": 1.941136360168457, "step": 7440 }, { "epoch": 0.3458842100376062, "grad_norm": 4.365469455718994, "learning_rate": 
4.4241453487472334e-07, "logits/chosen": -19.35195541381836, "logits/rejected": -18.030628204345703, "logps/chosen": -412.6053771972656, "logps/rejected": -250.78152465820312, "loss": 0.5205, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.402967929840088, "rewards/margins": 1.0057986974716187, "rewards/rejected": 1.3971691131591797, "step": 7450 }, { "epoch": 0.3463484841450392, "grad_norm": 90.10009765625, "learning_rate": 4.4233715585681785e-07, "logits/chosen": -18.47278594970703, "logits/rejected": -17.78046417236328, "logps/chosen": -340.1607666015625, "logps/rejected": -321.7632141113281, "loss": 0.5851, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.9988161325454712, "rewards/margins": 0.3016602098941803, "rewards/rejected": 1.697156310081482, "step": 7460 }, { "epoch": 0.34681275825247226, "grad_norm": 99.39940643310547, "learning_rate": 4.4225977683891236e-07, "logits/chosen": -18.34420394897461, "logits/rejected": -17.551433563232422, "logps/chosen": -383.3814392089844, "logps/rejected": -284.39251708984375, "loss": 0.5427, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.7676522731781006, "rewards/margins": 0.45175638794898987, "rewards/rejected": 1.315895915031433, "step": 7470 }, { "epoch": 0.3472770323599053, "grad_norm": 91.95840454101562, "learning_rate": 4.4218239782100687e-07, "logits/chosen": -18.24622344970703, "logits/rejected": -18.299964904785156, "logps/chosen": -413.8106994628906, "logps/rejected": -368.6926574707031, "loss": 0.8719, "rewards/accuracies": 0.5, "rewards/chosen": 2.001938581466675, "rewards/margins": 0.04731098935008049, "rewards/rejected": 1.9546277523040771, "step": 7480 }, { "epoch": 0.3477413064673383, "grad_norm": 45.93595504760742, "learning_rate": 4.421050188031013e-07, "logits/chosen": -18.63174057006836, "logits/rejected": -19.116384506225586, "logps/chosen": -286.058837890625, "logps/rejected": -300.3756408691406, "loss": 0.7343, "rewards/accuracies": 0.699999988079071, 
"rewards/chosen": 1.5619889497756958, "rewards/margins": 0.2952500581741333, "rewards/rejected": 1.266738772392273, "step": 7490 }, { "epoch": 0.34820558057477136, "grad_norm": 101.14038848876953, "learning_rate": 4.420276397851958e-07, "logits/chosen": -18.213645935058594, "logits/rejected": -17.939083099365234, "logps/chosen": -503.2804260253906, "logps/rejected": -414.4034118652344, "loss": 0.6756, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.065690279006958, "rewards/margins": 0.09590555727481842, "rewards/rejected": 1.9697847366333008, "step": 7500 }, { "epoch": 0.34866985468220435, "grad_norm": 24.82578468322754, "learning_rate": 4.419502607672903e-07, "logits/chosen": -18.98258399963379, "logits/rejected": -18.400440216064453, "logps/chosen": -379.59014892578125, "logps/rejected": -370.462890625, "loss": 0.7922, "rewards/accuracies": 0.5, "rewards/chosen": 1.9335464239120483, "rewards/margins": -0.006937182042747736, "rewards/rejected": 1.9404836893081665, "step": 7510 }, { "epoch": 0.3491341287896374, "grad_norm": 22.548664093017578, "learning_rate": 4.418728817493848e-07, "logits/chosen": -19.413734436035156, "logits/rejected": -17.82822608947754, "logps/chosen": -445.63238525390625, "logps/rejected": -294.09307861328125, "loss": 0.4582, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.5359930992126465, "rewards/margins": 0.8930497169494629, "rewards/rejected": 1.6429436206817627, "step": 7520 }, { "epoch": 0.34959840289707045, "grad_norm": 233.631103515625, "learning_rate": 4.417955027314793e-07, "logits/chosen": -19.451618194580078, "logits/rejected": -19.1417236328125, "logps/chosen": -498.875732421875, "logps/rejected": -437.2578125, "loss": 0.8018, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.2782652378082275, "rewards/margins": 0.21005234122276306, "rewards/rejected": 2.0682129859924316, "step": 7530 }, { "epoch": 0.35006267700450344, "grad_norm": 17.904315948486328, "learning_rate": 
4.4171812371357384e-07, "logits/chosen": -18.164831161499023, "logits/rejected": -17.50390625, "logps/chosen": -287.22869873046875, "logps/rejected": -259.61492919921875, "loss": 0.7268, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.8419811725616455, "rewards/margins": 0.4755345284938812, "rewards/rejected": 1.3664464950561523, "step": 7540 }, { "epoch": 0.3505269511119365, "grad_norm": 70.89480590820312, "learning_rate": 4.416407446956683e-07, "logits/chosen": -17.935054779052734, "logits/rejected": -17.633941650390625, "logps/chosen": -336.81591796875, "logps/rejected": -245.1727752685547, "loss": 0.5868, "rewards/accuracies": 0.5, "rewards/chosen": 2.4941258430480957, "rewards/margins": 0.7957795858383179, "rewards/rejected": 1.6983461380004883, "step": 7550 }, { "epoch": 0.35099122521936954, "grad_norm": 126.73810577392578, "learning_rate": 4.415633656777628e-07, "logits/chosen": -18.363645553588867, "logits/rejected": -17.824420928955078, "logps/chosen": -397.2604675292969, "logps/rejected": -328.34295654296875, "loss": 0.6447, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.106229543685913, "rewards/margins": 0.27609437704086304, "rewards/rejected": 1.8301351070404053, "step": 7560 }, { "epoch": 0.35145549932680253, "grad_norm": 55.518489837646484, "learning_rate": 4.414859866598573e-07, "logits/chosen": -18.546354293823242, "logits/rejected": -17.91912078857422, "logps/chosen": -428.98583984375, "logps/rejected": -367.3075256347656, "loss": 0.4968, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.389775276184082, "rewards/margins": 0.6641942262649536, "rewards/rejected": 1.725581169128418, "step": 7570 }, { "epoch": 0.3519197734342356, "grad_norm": 74.87679290771484, "learning_rate": 4.4140860764195183e-07, "logits/chosen": -18.466367721557617, "logits/rejected": -17.90313720703125, "logps/chosen": -498.16436767578125, "logps/rejected": -391.1544494628906, "loss": 0.792, "rewards/accuracies": 0.5, "rewards/chosen": 
2.4376611709594727, "rewards/margins": 0.28200748562812805, "rewards/rejected": 2.155653476715088, "step": 7580 }, { "epoch": 0.3523840475416686, "grad_norm": 196.12255859375, "learning_rate": 4.4133122862404623e-07, "logits/chosen": -18.655134201049805, "logits/rejected": -17.61794662475586, "logps/chosen": -470.39984130859375, "logps/rejected": -414.37530517578125, "loss": 0.6495, "rewards/accuracies": 0.5, "rewards/chosen": 2.845200538635254, "rewards/margins": 0.6150010824203491, "rewards/rejected": 2.2301993370056152, "step": 7590 }, { "epoch": 0.3528483216491016, "grad_norm": 158.4449462890625, "learning_rate": 4.4125384960614074e-07, "logits/chosen": -17.77334213256836, "logits/rejected": -17.42750358581543, "logps/chosen": -419.18585205078125, "logps/rejected": -371.8038635253906, "loss": 0.6527, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.598456859588623, "rewards/margins": 0.4058869481086731, "rewards/rejected": 2.1925699710845947, "step": 7600 }, { "epoch": 0.3533125957565347, "grad_norm": 81.01176452636719, "learning_rate": 4.4117647058823526e-07, "logits/chosen": -18.748323440551758, "logits/rejected": -17.093219757080078, "logps/chosen": -490.609130859375, "logps/rejected": -324.8926696777344, "loss": 0.4676, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.326502561569214, "rewards/margins": 0.8458234667778015, "rewards/rejected": 1.4806790351867676, "step": 7610 }, { "epoch": 0.35377686986396767, "grad_norm": 50.11641311645508, "learning_rate": 4.4109909157032977e-07, "logits/chosen": -17.965408325195312, "logits/rejected": -16.969356536865234, "logps/chosen": -359.787353515625, "logps/rejected": -265.6035461425781, "loss": 0.4284, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.1164369583129883, "rewards/margins": 0.8187870979309082, "rewards/rejected": 1.2976497411727905, "step": 7620 }, { "epoch": 0.3542411439714007, "grad_norm": 204.16555786132812, "learning_rate": 4.410217125524243e-07, 
"logits/chosen": -18.18877601623535, "logits/rejected": -17.640239715576172, "logps/chosen": -369.6147766113281, "logps/rejected": -279.3432312011719, "loss": 0.6286, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.827910900115967, "rewards/margins": 0.8908732533454895, "rewards/rejected": 1.9370378255844116, "step": 7630 }, { "epoch": 0.35470541807883377, "grad_norm": 31.269811630249023, "learning_rate": 4.409443335345188e-07, "logits/chosen": -18.381532669067383, "logits/rejected": -17.706899642944336, "logps/chosen": -377.74749755859375, "logps/rejected": -315.37030029296875, "loss": 1.1692, "rewards/accuracies": 0.5, "rewards/chosen": 2.13496994972229, "rewards/margins": -0.2152315080165863, "rewards/rejected": 2.350201368331909, "step": 7640 }, { "epoch": 0.35516969218626676, "grad_norm": 123.93592071533203, "learning_rate": 4.4086695451661325e-07, "logits/chosen": -19.355079650878906, "logits/rejected": -19.10706329345703, "logps/chosen": -435.11083984375, "logps/rejected": -358.9033203125, "loss": 0.5856, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.2702529430389404, "rewards/margins": 0.3805457353591919, "rewards/rejected": 1.8897072076797485, "step": 7650 }, { "epoch": 0.3556339662936998, "grad_norm": 22.51661491394043, "learning_rate": 4.4078957549870776e-07, "logits/chosen": -17.89983558654785, "logits/rejected": -17.14998435974121, "logps/chosen": -394.00396728515625, "logps/rejected": -355.93682861328125, "loss": 0.6662, "rewards/accuracies": 0.5, "rewards/chosen": 2.294031858444214, "rewards/margins": 0.2215406894683838, "rewards/rejected": 2.07249116897583, "step": 7660 }, { "epoch": 0.3560982404011328, "grad_norm": 29.666887283325195, "learning_rate": 4.4071219648080227e-07, "logits/chosen": -19.039594650268555, "logits/rejected": -17.000255584716797, "logps/chosen": -460.60858154296875, "logps/rejected": -244.47463989257812, "loss": 0.3857, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 
2.4847705364227295, "rewards/margins": 0.9714359045028687, "rewards/rejected": 1.5133347511291504, "step": 7670 }, { "epoch": 0.35656251450856585, "grad_norm": 39.80714797973633, "learning_rate": 4.406348174628968e-07, "logits/chosen": -19.683435440063477, "logits/rejected": -18.262462615966797, "logps/chosen": -562.02392578125, "logps/rejected": -397.50323486328125, "loss": 0.4104, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.127901077270508, "rewards/margins": 0.9549576640129089, "rewards/rejected": 2.172943115234375, "step": 7680 }, { "epoch": 0.3570267886159989, "grad_norm": 147.79541015625, "learning_rate": 4.4055743844499124e-07, "logits/chosen": -18.208906173706055, "logits/rejected": -18.125532150268555, "logps/chosen": -351.7625427246094, "logps/rejected": -371.2656555175781, "loss": 1.1145, "rewards/accuracies": 0.5, "rewards/chosen": 2.0324759483337402, "rewards/margins": -0.3889661729335785, "rewards/rejected": 2.4214417934417725, "step": 7690 }, { "epoch": 0.3574910627234319, "grad_norm": 53.731910705566406, "learning_rate": 4.404800594270857e-07, "logits/chosen": -19.209280014038086, "logits/rejected": -18.04225730895996, "logps/chosen": -430.169677734375, "logps/rejected": -302.68243408203125, "loss": 0.5793, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.732997179031372, "rewards/margins": 0.39948394894599915, "rewards/rejected": 1.3335132598876953, "step": 7700 }, { "epoch": 0.35795533683086495, "grad_norm": 108.51131439208984, "learning_rate": 4.404026804091802e-07, "logits/chosen": -19.185489654541016, "logits/rejected": -17.48992156982422, "logps/chosen": -495.36151123046875, "logps/rejected": -308.45977783203125, "loss": 0.484, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.6641693115234375, "rewards/margins": 0.8732916712760925, "rewards/rejected": 1.7908775806427002, "step": 7710 }, { "epoch": 0.358419610938298, "grad_norm": 172.57656860351562, "learning_rate": 4.403253013912747e-07, 
"logits/chosen": -17.63399887084961, "logits/rejected": -17.154125213623047, "logps/chosen": -358.1993408203125, "logps/rejected": -368.18035888671875, "loss": 0.8942, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.8463523387908936, "rewards/margins": -0.2033124417066574, "rewards/rejected": 2.0496647357940674, "step": 7720 }, { "epoch": 0.358883885045731, "grad_norm": 185.92051696777344, "learning_rate": 4.4024792237336923e-07, "logits/chosen": -17.935367584228516, "logits/rejected": -17.933712005615234, "logps/chosen": -447.3070373535156, "logps/rejected": -464.5508728027344, "loss": 0.8212, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.4518394470214844, "rewards/margins": 0.06809109449386597, "rewards/rejected": 2.3837485313415527, "step": 7730 }, { "epoch": 0.35934815915316404, "grad_norm": 54.606849670410156, "learning_rate": 4.4017054335546374e-07, "logits/chosen": -18.153886795043945, "logits/rejected": -17.893686294555664, "logps/chosen": -316.059814453125, "logps/rejected": -276.9217224121094, "loss": 0.82, "rewards/accuracies": 0.5, "rewards/chosen": 1.798431634902954, "rewards/margins": -0.04630211740732193, "rewards/rejected": 1.8447335958480835, "step": 7740 }, { "epoch": 0.35981243326059703, "grad_norm": 36.4845085144043, "learning_rate": 4.400931643375582e-07, "logits/chosen": -18.012367248535156, "logits/rejected": -16.994884490966797, "logps/chosen": -436.35015869140625, "logps/rejected": -301.25018310546875, "loss": 0.5323, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.4210450649261475, "rewards/margins": 0.7223156690597534, "rewards/rejected": 1.6987292766571045, "step": 7750 }, { "epoch": 0.3602767073680301, "grad_norm": 144.8778839111328, "learning_rate": 4.400157853196527e-07, "logits/chosen": -19.045949935913086, "logits/rejected": -18.671680450439453, "logps/chosen": -373.18572998046875, "logps/rejected": -397.8526306152344, "loss": 0.7081, "rewards/accuracies": 0.4000000059604645, 
"rewards/chosen": 2.1699957847595215, "rewards/margins": 0.29511576890945435, "rewards/rejected": 1.8748804330825806, "step": 7760 }, { "epoch": 0.36074098147546313, "grad_norm": 48.77644348144531, "learning_rate": 4.399384063017472e-07, "logits/chosen": -17.741546630859375, "logits/rejected": -17.58087921142578, "logps/chosen": -314.32232666015625, "logps/rejected": -322.5751037597656, "loss": 0.6014, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.6990196704864502, "rewards/margins": 0.3085917830467224, "rewards/rejected": 1.390427827835083, "step": 7770 }, { "epoch": 0.3612052555828961, "grad_norm": 61.21846008300781, "learning_rate": 4.398610272838417e-07, "logits/chosen": -19.15780258178711, "logits/rejected": -18.339664459228516, "logps/chosen": -474.93231201171875, "logps/rejected": -380.1520080566406, "loss": 0.6731, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.4633874893188477, "rewards/margins": 0.45550185441970825, "rewards/rejected": 2.007885456085205, "step": 7780 }, { "epoch": 0.3616695296903292, "grad_norm": 144.8834228515625, "learning_rate": 4.397836482659362e-07, "logits/chosen": -18.697765350341797, "logits/rejected": -18.36222267150879, "logps/chosen": -406.90765380859375, "logps/rejected": -400.9080810546875, "loss": 0.566, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.3791677951812744, "rewards/margins": 0.571295440196991, "rewards/rejected": 1.8078721761703491, "step": 7790 }, { "epoch": 0.3621338037977622, "grad_norm": 11.085100173950195, "learning_rate": 4.3970626924803065e-07, "logits/chosen": -17.777334213256836, "logits/rejected": -17.40593719482422, "logps/chosen": -399.03851318359375, "logps/rejected": -352.5263366699219, "loss": 0.5897, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.5784990787506104, "rewards/margins": 0.7474125027656555, "rewards/rejected": 1.83108651638031, "step": 7800 }, { "epoch": 0.3625980779051952, "grad_norm": 155.68898010253906, "learning_rate": 
4.3962889023012516e-07, "logits/chosen": -18.42561912536621, "logits/rejected": -18.380542755126953, "logps/chosen": -454.20355224609375, "logps/rejected": -539.4708251953125, "loss": 0.99, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.7541770935058594, "rewards/margins": -0.3382989764213562, "rewards/rejected": 3.0924763679504395, "step": 7810 }, { "epoch": 0.36306235201262826, "grad_norm": 48.071449279785156, "learning_rate": 4.395515112122197e-07, "logits/chosen": -19.333059310913086, "logits/rejected": -17.62033462524414, "logps/chosen": -396.18450927734375, "logps/rejected": -328.60467529296875, "loss": 0.3799, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.666227340698242, "rewards/margins": 1.1144509315490723, "rewards/rejected": 1.5517762899398804, "step": 7820 }, { "epoch": 0.36352662612006126, "grad_norm": 11.597147941589355, "learning_rate": 4.394741321943142e-07, "logits/chosen": -17.978992462158203, "logits/rejected": -16.657438278198242, "logps/chosen": -387.2022399902344, "logps/rejected": -261.77410888671875, "loss": 0.6147, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.9535030126571655, "rewards/margins": 0.5186528563499451, "rewards/rejected": 1.4348503351211548, "step": 7830 }, { "epoch": 0.3639909002274943, "grad_norm": 33.56752014160156, "learning_rate": 4.393967531764087e-07, "logits/chosen": -18.772720336914062, "logits/rejected": -17.71839141845703, "logps/chosen": -408.32647705078125, "logps/rejected": -311.17010498046875, "loss": 0.5553, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.4275102615356445, "rewards/margins": 0.6564874649047852, "rewards/rejected": 1.7710227966308594, "step": 7840 }, { "epoch": 0.36445517433492736, "grad_norm": 66.7286376953125, "learning_rate": 4.3931937415850316e-07, "logits/chosen": -18.8584041595459, "logits/rejected": -18.051979064941406, "logps/chosen": -436.2875061035156, "logps/rejected": -372.8479309082031, "loss": 0.79, "rewards/accuracies": 
0.4000000059604645, "rewards/chosen": 2.431272029876709, "rewards/margins": 0.17429938912391663, "rewards/rejected": 2.2569727897644043, "step": 7850 }, { "epoch": 0.36491944844236035, "grad_norm": 49.51592254638672, "learning_rate": 4.3924199514059767e-07, "logits/chosen": -19.034656524658203, "logits/rejected": -17.58808135986328, "logps/chosen": -339.05767822265625, "logps/rejected": -240.6901092529297, "loss": 0.4692, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.9012987613677979, "rewards/margins": 0.7326072454452515, "rewards/rejected": 1.168691635131836, "step": 7860 }, { "epoch": 0.3653837225497934, "grad_norm": 74.96820831298828, "learning_rate": 4.391646161226922e-07, "logits/chosen": -18.803136825561523, "logits/rejected": -18.47946548461914, "logps/chosen": -368.94122314453125, "logps/rejected": -297.6676940917969, "loss": 0.7193, "rewards/accuracies": 0.5, "rewards/chosen": 1.8318132162094116, "rewards/margins": 0.08873709291219711, "rewards/rejected": 1.743075966835022, "step": 7870 }, { "epoch": 0.36584799665722645, "grad_norm": 107.99127197265625, "learning_rate": 4.3908723710478664e-07, "logits/chosen": -18.40976333618164, "logits/rejected": -18.30988121032715, "logps/chosen": -455.5902404785156, "logps/rejected": -290.84661865234375, "loss": 0.5909, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.2828030586242676, "rewards/margins": 0.5398851633071899, "rewards/rejected": 1.742917776107788, "step": 7880 }, { "epoch": 0.36631227076465944, "grad_norm": 68.94979858398438, "learning_rate": 4.3900985808688115e-07, "logits/chosen": -18.803010940551758, "logits/rejected": -17.639328002929688, "logps/chosen": -485.5430603027344, "logps/rejected": -290.4263610839844, "loss": 0.352, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.8031578063964844, "rewards/margins": 1.2232003211975098, "rewards/rejected": 1.5799574851989746, "step": 7890 }, { "epoch": 0.3667765448720925, "grad_norm": 65.74776458740234, 
"learning_rate": 4.389324790689756e-07, "logits/chosen": -18.942934036254883, "logits/rejected": -17.60909080505371, "logps/chosen": -335.1029357910156, "logps/rejected": -274.6840515136719, "loss": 0.5474, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.9271695613861084, "rewards/margins": 0.5842806100845337, "rewards/rejected": 1.3428890705108643, "step": 7900 }, { "epoch": 0.3672408189795255, "grad_norm": 98.37738037109375, "learning_rate": 4.388551000510701e-07, "logits/chosen": -19.096097946166992, "logits/rejected": -18.374608993530273, "logps/chosen": -402.4334716796875, "logps/rejected": -350.34906005859375, "loss": 0.4641, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.5352673530578613, "rewards/margins": 0.7725202441215515, "rewards/rejected": 1.762746810913086, "step": 7910 }, { "epoch": 0.36770509308695853, "grad_norm": 120.8771743774414, "learning_rate": 4.3877772103316463e-07, "logits/chosen": -19.228496551513672, "logits/rejected": -18.097929000854492, "logps/chosen": -379.74981689453125, "logps/rejected": -304.4253234863281, "loss": 0.5672, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.5416572093963623, "rewards/margins": 0.7186416387557983, "rewards/rejected": 1.8230154514312744, "step": 7920 }, { "epoch": 0.3681693671943916, "grad_norm": 29.825355529785156, "learning_rate": 4.3870034201525914e-07, "logits/chosen": -18.939817428588867, "logits/rejected": -18.523380279541016, "logps/chosen": -358.26654052734375, "logps/rejected": -355.22686767578125, "loss": 0.6364, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.7018091678619385, "rewards/margins": 0.3861103653907776, "rewards/rejected": 1.3156986236572266, "step": 7930 }, { "epoch": 0.3686336413018246, "grad_norm": 114.21410369873047, "learning_rate": 4.3862296299735365e-07, "logits/chosen": -18.840988159179688, "logits/rejected": -18.561838150024414, "logps/chosen": -422.21173095703125, "logps/rejected": -371.6763916015625, "loss": 0.7077, 
"rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.197754383087158, "rewards/margins": 0.11819534003734589, "rewards/rejected": 2.079559087753296, "step": 7940 }, { "epoch": 0.3690979154092576, "grad_norm": 86.31336975097656, "learning_rate": 4.385455839794481e-07, "logits/chosen": -18.106884002685547, "logits/rejected": -17.35897445678711, "logps/chosen": -414.18994140625, "logps/rejected": -307.57720947265625, "loss": 0.5359, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.8506975173950195, "rewards/margins": 0.7986804246902466, "rewards/rejected": 2.0520172119140625, "step": 7950 }, { "epoch": 0.3695621895166907, "grad_norm": 103.85720825195312, "learning_rate": 4.384682049615426e-07, "logits/chosen": -17.90690803527832, "logits/rejected": -17.069900512695312, "logps/chosen": -347.5458679199219, "logps/rejected": -199.90109252929688, "loss": 0.4637, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.9137260913848877, "rewards/margins": 0.7874447703361511, "rewards/rejected": 1.1262812614440918, "step": 7960 }, { "epoch": 0.37002646362412367, "grad_norm": 80.9216079711914, "learning_rate": 4.383908259436371e-07, "logits/chosen": -18.763805389404297, "logits/rejected": -17.33823013305664, "logps/chosen": -362.15826416015625, "logps/rejected": -208.7837677001953, "loss": 0.5122, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.1638197898864746, "rewards/margins": 0.7693696618080139, "rewards/rejected": 1.394450068473816, "step": 7970 }, { "epoch": 0.3704907377315567, "grad_norm": 61.9300537109375, "learning_rate": 4.383134469257316e-07, "logits/chosen": -18.863964080810547, "logits/rejected": -18.16940689086914, "logps/chosen": -397.76959228515625, "logps/rejected": -359.31170654296875, "loss": 0.6887, "rewards/accuracies": 0.5, "rewards/chosen": 2.2822563648223877, "rewards/margins": 0.30132001638412476, "rewards/rejected": 1.9809362888336182, "step": 7980 }, { "epoch": 0.3709550118389897, "grad_norm": 
120.99319458007812, "learning_rate": 4.382360679078261e-07, "logits/chosen": -18.577762603759766, "logits/rejected": -17.74141502380371, "logps/chosen": -416.748291015625, "logps/rejected": -310.3564453125, "loss": 0.6444, "rewards/accuracies": 0.5, "rewards/chosen": 2.575573444366455, "rewards/margins": 0.6149238348007202, "rewards/rejected": 1.9606494903564453, "step": 7990 }, { "epoch": 0.37141928594642276, "grad_norm": 55.23287582397461, "learning_rate": 4.3815868888992056e-07, "logits/chosen": -18.199914932250977, "logits/rejected": -16.692373275756836, "logps/chosen": -346.5919494628906, "logps/rejected": -217.90341186523438, "loss": 0.4625, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.9745566844940186, "rewards/margins": 0.9678500890731812, "rewards/rejected": 1.0067065954208374, "step": 8000 }, { "epoch": 0.3718835600538558, "grad_norm": 86.56023406982422, "learning_rate": 4.3808130987201507e-07, "logits/chosen": -18.98760223388672, "logits/rejected": -18.55657386779785, "logps/chosen": -345.5134582519531, "logps/rejected": -262.3356018066406, "loss": 0.5649, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.1785571575164795, "rewards/margins": 0.37708061933517456, "rewards/rejected": 1.8014767169952393, "step": 8010 }, { "epoch": 0.3723478341612888, "grad_norm": 208.69522094726562, "learning_rate": 4.380039308541096e-07, "logits/chosen": -18.62519645690918, "logits/rejected": -17.689491271972656, "logps/chosen": -421.5322265625, "logps/rejected": -325.0024108886719, "loss": 0.7196, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.5999066829681396, "rewards/margins": 0.4763781428337097, "rewards/rejected": 2.1235287189483643, "step": 8020 }, { "epoch": 0.37281210826872185, "grad_norm": 181.93704223632812, "learning_rate": 4.379265518362041e-07, "logits/chosen": -19.842952728271484, "logits/rejected": -19.382951736450195, "logps/chosen": -423.8358459472656, "logps/rejected": -403.3796691894531, "loss": 0.4996, 
"rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.4685580730438232, "rewards/margins": 0.6358831524848938, "rewards/rejected": 1.8326747417449951, "step": 8030 }, { "epoch": 0.3732763823761549, "grad_norm": 40.58907699584961, "learning_rate": 4.378491728182986e-07, "logits/chosen": -19.5709285736084, "logits/rejected": -18.968143463134766, "logps/chosen": -523.8931884765625, "logps/rejected": -401.8406066894531, "loss": 0.6337, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.0403103828430176, "rewards/margins": 0.551328182220459, "rewards/rejected": 2.4889822006225586, "step": 8040 }, { "epoch": 0.3737406564835879, "grad_norm": 95.68319702148438, "learning_rate": 4.3777953170218363e-07, "logits/chosen": -19.631744384765625, "logits/rejected": -17.85515022277832, "logps/chosen": -382.1542053222656, "logps/rejected": -295.2671203613281, "loss": 0.5515, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.5461032390594482, "rewards/margins": 0.7720893621444702, "rewards/rejected": 1.7740137577056885, "step": 8050 }, { "epoch": 0.37420493059102095, "grad_norm": 124.30814361572266, "learning_rate": 4.3770215268427814e-07, "logits/chosen": -18.056886672973633, "logits/rejected": -17.21892738342285, "logps/chosen": -358.37603759765625, "logps/rejected": -220.91525268554688, "loss": 0.45, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.385580062866211, "rewards/margins": 0.9455234408378601, "rewards/rejected": 1.4400569200515747, "step": 8060 }, { "epoch": 0.374669204698454, "grad_norm": 86.30696105957031, "learning_rate": 4.3762477366637265e-07, "logits/chosen": -19.270654678344727, "logits/rejected": -18.21381950378418, "logps/chosen": -579.6912841796875, "logps/rejected": -432.4120178222656, "loss": 0.4697, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.7035930156707764, "rewards/margins": 0.8100970983505249, "rewards/rejected": 1.893495798110962, "step": 8070 }, { "epoch": 0.375133478805887, "grad_norm": 
200.71780395507812, "learning_rate": 4.375473946484671e-07, "logits/chosen": -17.622398376464844, "logits/rejected": -17.277572631835938, "logps/chosen": -356.64239501953125, "logps/rejected": -337.57208251953125, "loss": 0.7121, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.3749277591705322, "rewards/margins": 0.42781367897987366, "rewards/rejected": 1.9471139907836914, "step": 8080 }, { "epoch": 0.37559775291332004, "grad_norm": 130.50802612304688, "learning_rate": 4.3747001563056157e-07, "logits/chosen": -17.841114044189453, "logits/rejected": -17.825305938720703, "logps/chosen": -390.250244140625, "logps/rejected": -367.59246826171875, "loss": 0.7963, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.4466190338134766, "rewards/margins": 0.2808993458747864, "rewards/rejected": 2.165719747543335, "step": 8090 }, { "epoch": 0.37606202702075303, "grad_norm": 63.2518310546875, "learning_rate": 4.373926366126561e-07, "logits/chosen": -17.403722763061523, "logits/rejected": -16.343908309936523, "logps/chosen": -271.78021240234375, "logps/rejected": -206.3269805908203, "loss": 0.5569, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.047994613647461, "rewards/margins": 0.9023134112358093, "rewards/rejected": 1.1456810235977173, "step": 8100 }, { "epoch": 0.3765263011281861, "grad_norm": 59.66709518432617, "learning_rate": 4.373152575947506e-07, "logits/chosen": -19.046043395996094, "logits/rejected": -17.279342651367188, "logps/chosen": -413.70928955078125, "logps/rejected": -271.3664855957031, "loss": 0.3792, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.299386978149414, "rewards/margins": 1.1560324430465698, "rewards/rejected": 1.1433544158935547, "step": 8110 }, { "epoch": 0.37699057523561913, "grad_norm": 57.76173782348633, "learning_rate": 4.372378785768451e-07, "logits/chosen": -18.668521881103516, "logits/rejected": -17.15915298461914, "logps/chosen": -491.56927490234375, "logps/rejected": -313.1006774902344, 
"loss": 0.3943, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.3284802436828613, "rewards/margins": 0.8807498812675476, "rewards/rejected": 1.4477301836013794, "step": 8120 }, { "epoch": 0.3774548493430521, "grad_norm": 40.14462661743164, "learning_rate": 4.3716049955893956e-07, "logits/chosen": -18.55467414855957, "logits/rejected": -17.764747619628906, "logps/chosen": -405.510498046875, "logps/rejected": -312.78729248046875, "loss": 0.6134, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.559124231338501, "rewards/margins": 0.5705469250679016, "rewards/rejected": 1.9885774850845337, "step": 8130 }, { "epoch": 0.3779191234504852, "grad_norm": 142.82957458496094, "learning_rate": 4.3708312054103407e-07, "logits/chosen": -18.543649673461914, "logits/rejected": -19.228090286254883, "logps/chosen": -409.2355651855469, "logps/rejected": -441.611083984375, "loss": 0.9093, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.073780059814453, "rewards/margins": -0.2320602387189865, "rewards/rejected": 2.305840015411377, "step": 8140 }, { "epoch": 0.3783833975579182, "grad_norm": 18.21351432800293, "learning_rate": 4.370057415231286e-07, "logits/chosen": -18.391801834106445, "logits/rejected": -17.85051918029785, "logps/chosen": -356.10894775390625, "logps/rejected": -272.40911865234375, "loss": 0.7966, "rewards/accuracies": 0.5, "rewards/chosen": 2.0251150131225586, "rewards/margins": 0.29983678460121155, "rewards/rejected": 1.7252784967422485, "step": 8150 }, { "epoch": 0.3788476716653512, "grad_norm": 30.038755416870117, "learning_rate": 4.369283625052231e-07, "logits/chosen": -19.150976181030273, "logits/rejected": -18.08066177368164, "logps/chosen": -510.73577880859375, "logps/rejected": -327.29681396484375, "loss": 0.387, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.9082722663879395, "rewards/margins": 1.1844236850738525, "rewards/rejected": 1.723848581314087, "step": 8160 }, { "epoch": 0.37931194577278426, 
"grad_norm": 98.38729858398438, "learning_rate": 4.368509834873176e-07, "logits/chosen": -18.09160041809082, "logits/rejected": -17.686321258544922, "logps/chosen": -426.9581604003906, "logps/rejected": -333.29327392578125, "loss": 0.4675, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.062166690826416, "rewards/margins": 0.9010068774223328, "rewards/rejected": 2.1611599922180176, "step": 8170 }, { "epoch": 0.37977621988021726, "grad_norm": 15.737931251525879, "learning_rate": 4.36773604469412e-07, "logits/chosen": -18.479068756103516, "logits/rejected": -17.224702835083008, "logps/chosen": -534.3820190429688, "logps/rejected": -301.91241455078125, "loss": 0.4109, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.898577928543091, "rewards/margins": 1.231950044631958, "rewards/rejected": 1.6666278839111328, "step": 8180 }, { "epoch": 0.3802404939876503, "grad_norm": 21.32693099975586, "learning_rate": 4.366962254515065e-07, "logits/chosen": -19.843761444091797, "logits/rejected": -18.667020797729492, "logps/chosen": -352.2564697265625, "logps/rejected": -229.8259735107422, "loss": 0.4834, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.129270076751709, "rewards/margins": 0.7764769792556763, "rewards/rejected": 1.3527932167053223, "step": 8190 }, { "epoch": 0.38070476809508336, "grad_norm": 220.30047607421875, "learning_rate": 4.3661884643360103e-07, "logits/chosen": -17.995174407958984, "logits/rejected": -17.369815826416016, "logps/chosen": -489.8042907714844, "logps/rejected": -401.616943359375, "loss": 0.5845, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.4467580318450928, "rewards/margins": 0.5097247362136841, "rewards/rejected": 1.9370330572128296, "step": 8200 }, { "epoch": 0.38116904220251635, "grad_norm": 196.86309814453125, "learning_rate": 4.3654146741569554e-07, "logits/chosen": -18.8726863861084, "logits/rejected": -18.594303131103516, "logps/chosen": -410.51971435546875, "logps/rejected": 
-329.0050354003906, "loss": 0.6962, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.356581926345825, "rewards/margins": 0.22673949599266052, "rewards/rejected": 2.1298422813415527, "step": 8210 }, { "epoch": 0.3816333163099494, "grad_norm": 261.8096008300781, "learning_rate": 4.3646408839779006e-07, "logits/chosen": -18.385000228881836, "logits/rejected": -18.49576759338379, "logps/chosen": -413.0772399902344, "logps/rejected": -419.15838623046875, "loss": 0.9256, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.1397416591644287, "rewards/margins": 0.020288657397031784, "rewards/rejected": 2.119453191757202, "step": 8220 }, { "epoch": 0.38209759041738245, "grad_norm": 87.33160400390625, "learning_rate": 4.363867093798845e-07, "logits/chosen": -18.10108184814453, "logits/rejected": -17.52664566040039, "logps/chosen": -435.94232177734375, "logps/rejected": -353.6421203613281, "loss": 0.5031, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.1219773292541504, "rewards/margins": 0.5349053740501404, "rewards/rejected": 1.5870721340179443, "step": 8230 }, { "epoch": 0.38256186452481544, "grad_norm": 15.563859939575195, "learning_rate": 4.36309330361979e-07, "logits/chosen": -18.345172882080078, "logits/rejected": -16.572851181030273, "logps/chosen": -352.48095703125, "logps/rejected": -194.80209350585938, "loss": 0.3696, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.131223678588867, "rewards/margins": 1.1288230419158936, "rewards/rejected": 1.0024006366729736, "step": 8240 }, { "epoch": 0.3830261386322485, "grad_norm": 105.1852035522461, "learning_rate": 4.3623195134407354e-07, "logits/chosen": -18.081111907958984, "logits/rejected": -17.976411819458008, "logps/chosen": -365.9910583496094, "logps/rejected": -359.63897705078125, "loss": 0.6333, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.2976086139678955, "rewards/margins": 0.28695279359817505, "rewards/rejected": 2.0106558799743652, "step": 8250 }, { 
"epoch": 0.3834904127396815, "grad_norm": 46.772239685058594, "learning_rate": 4.3615457232616805e-07, "logits/chosen": -17.783554077148438, "logits/rejected": -18.09115982055664, "logps/chosen": -343.88153076171875, "logps/rejected": -383.06353759765625, "loss": 0.8703, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.1396400928497314, "rewards/margins": 0.14640779793262482, "rewards/rejected": 1.9932323694229126, "step": 8260 }, { "epoch": 0.38395468684711453, "grad_norm": 130.95068359375, "learning_rate": 4.3607719330826256e-07, "logits/chosen": -17.90765380859375, "logits/rejected": -17.609663009643555, "logps/chosen": -281.4378356933594, "logps/rejected": -277.47552490234375, "loss": 0.7692, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.0711445808410645, "rewards/margins": 0.3415902256965637, "rewards/rejected": 1.729554533958435, "step": 8270 }, { "epoch": 0.3844189609545476, "grad_norm": 57.56821823120117, "learning_rate": 4.3599981429035697e-07, "logits/chosen": -18.21405601501465, "logits/rejected": -18.390939712524414, "logps/chosen": -418.315673828125, "logps/rejected": -473.7005310058594, "loss": 1.1081, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.179650068283081, "rewards/margins": -0.49226149916648865, "rewards/rejected": 2.6719117164611816, "step": 8280 }, { "epoch": 0.3848832350619806, "grad_norm": 100.02098083496094, "learning_rate": 4.359224352724515e-07, "logits/chosen": -18.07691764831543, "logits/rejected": -17.98171615600586, "logps/chosen": -405.17840576171875, "logps/rejected": -314.4725341796875, "loss": 0.6095, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.5147552490234375, "rewards/margins": 0.507003664970398, "rewards/rejected": 2.007751703262329, "step": 8290 }, { "epoch": 0.3853475091694136, "grad_norm": 21.820341110229492, "learning_rate": 4.35845056254546e-07, "logits/chosen": -18.75979995727539, "logits/rejected": -18.463912963867188, "logps/chosen": -467.61492919921875, 
"logps/rejected": -389.11846923828125, "loss": 0.8183, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.5628771781921387, "rewards/margins": 0.16226284205913544, "rewards/rejected": 2.4006142616271973, "step": 8300 }, { "epoch": 0.3858117832768467, "grad_norm": 6.537042617797852, "learning_rate": 4.357676772366405e-07, "logits/chosen": -17.92856216430664, "logits/rejected": -16.82114028930664, "logps/chosen": -385.8308410644531, "logps/rejected": -330.2652893066406, "loss": 0.6236, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.1825156211853027, "rewards/margins": 0.7689300179481506, "rewards/rejected": 1.4135857820510864, "step": 8310 }, { "epoch": 0.38627605738427967, "grad_norm": 15.835760116577148, "learning_rate": 4.35690298218735e-07, "logits/chosen": -18.445755004882812, "logits/rejected": -17.218547821044922, "logps/chosen": -418.66351318359375, "logps/rejected": -274.56854248046875, "loss": 0.3746, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.692626953125, "rewards/margins": 1.20807945728302, "rewards/rejected": 1.4845476150512695, "step": 8320 }, { "epoch": 0.3867403314917127, "grad_norm": 97.12309265136719, "learning_rate": 4.3561291920082947e-07, "logits/chosen": -18.300586700439453, "logits/rejected": -18.071788787841797, "logps/chosen": -471.1195373535156, "logps/rejected": -392.89556884765625, "loss": 0.6533, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.5261709690093994, "rewards/margins": 0.27435070276260376, "rewards/rejected": 2.2518200874328613, "step": 8330 }, { "epoch": 0.3872046055991457, "grad_norm": 107.54240417480469, "learning_rate": 4.35535540182924e-07, "logits/chosen": -17.93692970275879, "logits/rejected": -17.730295181274414, "logps/chosen": -366.53497314453125, "logps/rejected": -326.80853271484375, "loss": 0.9324, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.9328076839447021, "rewards/margins": -0.14037056267261505, "rewards/rejected": 2.073178291320801, 
"step": 8340 }, { "epoch": 0.38766887970657876, "grad_norm": 135.931884765625, "learning_rate": 4.354581611650185e-07, "logits/chosen": -18.635236740112305, "logits/rejected": -18.76974868774414, "logps/chosen": -410.205810546875, "logps/rejected": -423.51446533203125, "loss": 0.8311, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.5243563652038574, "rewards/margins": -0.033282436430454254, "rewards/rejected": 2.5576388835906982, "step": 8350 }, { "epoch": 0.3881331538140118, "grad_norm": 92.98621368408203, "learning_rate": 4.35380782147113e-07, "logits/chosen": -18.631324768066406, "logits/rejected": -18.53635597229004, "logps/chosen": -419.886474609375, "logps/rejected": -403.0973815917969, "loss": 0.8827, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.9207117557525635, "rewards/margins": -0.20930662751197815, "rewards/rejected": 2.1300182342529297, "step": 8360 }, { "epoch": 0.3885974279214448, "grad_norm": 81.48821258544922, "learning_rate": 4.353034031292075e-07, "logits/chosen": -17.708887100219727, "logits/rejected": -17.70734214782715, "logps/chosen": -303.12274169921875, "logps/rejected": -280.75250244140625, "loss": 0.9927, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.1638847589492798, "rewards/margins": -0.24968858063220978, "rewards/rejected": 1.4135732650756836, "step": 8370 }, { "epoch": 0.38906170202887785, "grad_norm": 45.05229568481445, "learning_rate": 4.352260241113019e-07, "logits/chosen": -19.32440757751465, "logits/rejected": -19.671905517578125, "logps/chosen": -347.61065673828125, "logps/rejected": -442.5804748535156, "loss": 1.1397, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.9172197580337524, "rewards/margins": -0.4537569582462311, "rewards/rejected": 2.3709769248962402, "step": 8380 }, { "epoch": 0.3895259761363109, "grad_norm": 149.56565856933594, "learning_rate": 4.3514864509339643e-07, "logits/chosen": -18.90053939819336, "logits/rejected": -18.35993194580078, 
"logps/chosen": -382.06524658203125, "logps/rejected": -271.2063903808594, "loss": 0.6534, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.8295013904571533, "rewards/margins": 0.2846774458885193, "rewards/rejected": 1.5448238849639893, "step": 8390 }, { "epoch": 0.3899902502437439, "grad_norm": 35.96686935424805, "learning_rate": 4.3507126607549094e-07, "logits/chosen": -18.730396270751953, "logits/rejected": -17.98038673400879, "logps/chosen": -389.443603515625, "logps/rejected": -297.7140197753906, "loss": 0.7282, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.0266759395599365, "rewards/margins": 0.3619343340396881, "rewards/rejected": 1.6647417545318604, "step": 8400 }, { "epoch": 0.39045452435117695, "grad_norm": 74.3609848022461, "learning_rate": 4.3499388705758545e-07, "logits/chosen": -18.912334442138672, "logits/rejected": -18.07967758178711, "logps/chosen": -345.70794677734375, "logps/rejected": -262.4024353027344, "loss": 0.6142, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.1148741245269775, "rewards/margins": 0.4592192769050598, "rewards/rejected": 1.6556546688079834, "step": 8410 }, { "epoch": 0.39091879845860994, "grad_norm": 28.580821990966797, "learning_rate": 4.3491650803967996e-07, "logits/chosen": -17.669103622436523, "logits/rejected": -17.602163314819336, "logps/chosen": -275.9626770019531, "logps/rejected": -287.18634033203125, "loss": 0.7116, "rewards/accuracies": 0.5, "rewards/chosen": 1.4157168865203857, "rewards/margins": 0.04320179298520088, "rewards/rejected": 1.3725152015686035, "step": 8420 }, { "epoch": 0.391383072566043, "grad_norm": 49.75371551513672, "learning_rate": 4.348391290217744e-07, "logits/chosen": -18.114665985107422, "logits/rejected": -17.86803436279297, "logps/chosen": -494.72845458984375, "logps/rejected": -450.727783203125, "loss": 0.7053, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.1042332649230957, "rewards/margins": 0.5460290312767029, 
"rewards/rejected": 2.558204174041748, "step": 8430 }, { "epoch": 0.39184734667347604, "grad_norm": 226.38084411621094, "learning_rate": 4.3476175000386893e-07, "logits/chosen": -18.4227294921875, "logits/rejected": -18.132946014404297, "logps/chosen": -420.7372131347656, "logps/rejected": -352.13470458984375, "loss": 0.7414, "rewards/accuracies": 0.5, "rewards/chosen": 2.1861438751220703, "rewards/margins": 0.44735708832740784, "rewards/rejected": 1.7387869358062744, "step": 8440 }, { "epoch": 0.39231162078090903, "grad_norm": 2.137648582458496, "learning_rate": 4.3468437098596345e-07, "logits/chosen": -18.504709243774414, "logits/rejected": -18.104990005493164, "logps/chosen": -480.4334411621094, "logps/rejected": -404.9869079589844, "loss": 0.7037, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.8660635948181152, "rewards/margins": 0.40435290336608887, "rewards/rejected": 2.4617106914520264, "step": 8450 }, { "epoch": 0.3927758948883421, "grad_norm": 74.36160278320312, "learning_rate": 4.3460699196805796e-07, "logits/chosen": -18.952239990234375, "logits/rejected": -18.289531707763672, "logps/chosen": -398.97088623046875, "logps/rejected": -276.5536193847656, "loss": 0.5619, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.4684181213378906, "rewards/margins": 0.5029882192611694, "rewards/rejected": 1.9654299020767212, "step": 8460 }, { "epoch": 0.39324016899577513, "grad_norm": 158.3020782470703, "learning_rate": 4.345296129501524e-07, "logits/chosen": -18.02550506591797, "logits/rejected": -17.724397659301758, "logps/chosen": -437.14019775390625, "logps/rejected": -430.12066650390625, "loss": 0.8812, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.8450093269348145, "rewards/margins": 0.026775550097227097, "rewards/rejected": 2.8182339668273926, "step": 8470 }, { "epoch": 0.3937044431032081, "grad_norm": 66.77050018310547, "learning_rate": 4.3445223393224687e-07, "logits/chosen": -18.399272918701172, "logits/rejected": 
-17.739879608154297, "logps/chosen": -362.88616943359375, "logps/rejected": -315.41314697265625, "loss": 0.5286, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.413331985473633, "rewards/margins": 0.5805838108062744, "rewards/rejected": 1.8327480554580688, "step": 8480 }, { "epoch": 0.3941687172106412, "grad_norm": 107.20391082763672, "learning_rate": 4.343748549143414e-07, "logits/chosen": -18.920764923095703, "logits/rejected": -17.982194900512695, "logps/chosen": -331.9598083496094, "logps/rejected": -287.41497802734375, "loss": 0.4487, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.4443576335906982, "rewards/margins": 0.8290813565254211, "rewards/rejected": 1.6152760982513428, "step": 8490 }, { "epoch": 0.39463299131807417, "grad_norm": 75.3575210571289, "learning_rate": 4.342974758964359e-07, "logits/chosen": -18.749086380004883, "logits/rejected": -17.568178176879883, "logps/chosen": -391.30914306640625, "logps/rejected": -275.0852355957031, "loss": 0.5063, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.139266014099121, "rewards/margins": 0.7836214303970337, "rewards/rejected": 1.355644702911377, "step": 8500 }, { "epoch": 0.3950972654255072, "grad_norm": 27.454492568969727, "learning_rate": 4.342200968785304e-07, "logits/chosen": -19.06698226928711, "logits/rejected": -18.63650131225586, "logps/chosen": -486.1702575683594, "logps/rejected": -378.1235046386719, "loss": 0.5542, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.491464853286743, "rewards/margins": 0.5028156042098999, "rewards/rejected": 1.9886493682861328, "step": 8510 }, { "epoch": 0.39556153953294027, "grad_norm": 125.24430847167969, "learning_rate": 4.341427178606249e-07, "logits/chosen": -18.248348236083984, "logits/rejected": -17.50102996826172, "logps/chosen": -310.05218505859375, "logps/rejected": -184.00735473632812, "loss": 0.5907, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.8695131540298462, "rewards/margins": 
0.8569642305374146, "rewards/rejected": 1.0125490427017212, "step": 8520 }, { "epoch": 0.39602581364037326, "grad_norm": 83.1805648803711, "learning_rate": 4.340653388427194e-07, "logits/chosen": -19.835857391357422, "logits/rejected": -19.012496948242188, "logps/chosen": -391.42193603515625, "logps/rejected": -271.5185546875, "loss": 0.654, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.025083065032959, "rewards/margins": 0.3842878043651581, "rewards/rejected": 1.6407954692840576, "step": 8530 }, { "epoch": 0.3964900877478063, "grad_norm": 31.044830322265625, "learning_rate": 4.339879598248139e-07, "logits/chosen": -19.034164428710938, "logits/rejected": -17.512374877929688, "logps/chosen": -454.659912109375, "logps/rejected": -298.2751770019531, "loss": 0.4735, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.505858898162842, "rewards/margins": 0.9809626340866089, "rewards/rejected": 1.5248961448669434, "step": 8540 }, { "epoch": 0.39695436185523936, "grad_norm": 78.86034393310547, "learning_rate": 4.339105808069084e-07, "logits/chosen": -18.42351531982422, "logits/rejected": -18.633424758911133, "logps/chosen": -423.329345703125, "logps/rejected": -435.92559814453125, "loss": 0.8291, "rewards/accuracies": 0.5, "rewards/chosen": 2.9401628971099854, "rewards/margins": -0.003919124603271484, "rewards/rejected": 2.9440817832946777, "step": 8550 }, { "epoch": 0.39741863596267235, "grad_norm": 61.310646057128906, "learning_rate": 4.338332017890029e-07, "logits/chosen": -18.620805740356445, "logits/rejected": -17.28646469116211, "logps/chosen": -467.10931396484375, "logps/rejected": -267.30133056640625, "loss": 0.4529, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.140279769897461, "rewards/margins": 1.3995249271392822, "rewards/rejected": 1.7407547235488892, "step": 8560 }, { "epoch": 0.3978829100701054, "grad_norm": 136.78587341308594, "learning_rate": 4.3375582277109737e-07, "logits/chosen": -19.805648803710938, 
"logits/rejected": -17.045818328857422, "logps/chosen": -390.70135498046875, "logps/rejected": -197.50765991210938, "loss": 0.398, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.8657127618789673, "rewards/margins": 0.9736387133598328, "rewards/rejected": 0.8920738101005554, "step": 8570 }, { "epoch": 0.3983471841775384, "grad_norm": 27.075984954833984, "learning_rate": 4.3367844375319183e-07, "logits/chosen": -17.917583465576172, "logits/rejected": -18.214153289794922, "logps/chosen": -372.33453369140625, "logps/rejected": -374.74407958984375, "loss": 0.8214, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.284348726272583, "rewards/margins": 0.14291152358055115, "rewards/rejected": 2.14143705368042, "step": 8580 }, { "epoch": 0.39881145828497144, "grad_norm": 32.088340759277344, "learning_rate": 4.3360106473528634e-07, "logits/chosen": -17.57065200805664, "logits/rejected": -17.47802734375, "logps/chosen": -328.9525451660156, "logps/rejected": -335.97613525390625, "loss": 1.0033, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.0332963466644287, "rewards/margins": 0.006726694293320179, "rewards/rejected": 2.0265698432922363, "step": 8590 }, { "epoch": 0.3992757323924045, "grad_norm": 75.98365783691406, "learning_rate": 4.3352368571738085e-07, "logits/chosen": -18.855770111083984, "logits/rejected": -18.0142879486084, "logps/chosen": -379.11212158203125, "logps/rejected": -257.91802978515625, "loss": 0.5473, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.7739267349243164, "rewards/margins": 0.5693925619125366, "rewards/rejected": 2.2045340538024902, "step": 8600 }, { "epoch": 0.3997400064998375, "grad_norm": 60.21626281738281, "learning_rate": 4.3344630669947536e-07, "logits/chosen": -18.125835418701172, "logits/rejected": -17.988040924072266, "logps/chosen": -421.4313049316406, "logps/rejected": -354.80010986328125, "loss": 0.5884, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.3771321773529053, 
"rewards/margins": 0.4756179451942444, "rewards/rejected": 1.9015144109725952, "step": 8610 }, { "epoch": 0.40020428060727053, "grad_norm": 62.555274963378906, "learning_rate": 4.3336892768156987e-07, "logits/chosen": -19.2287540435791, "logits/rejected": -19.33238983154297, "logps/chosen": -412.25677490234375, "logps/rejected": -376.31005859375, "loss": 0.9508, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.184654951095581, "rewards/margins": -0.26762357354164124, "rewards/rejected": 2.4522788524627686, "step": 8620 }, { "epoch": 0.4006685547147036, "grad_norm": 4.346099853515625, "learning_rate": 4.3329154866366433e-07, "logits/chosen": -18.453662872314453, "logits/rejected": -18.384693145751953, "logps/chosen": -361.14263916015625, "logps/rejected": -386.3307189941406, "loss": 0.8308, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.342268228530884, "rewards/margins": 0.14651861786842346, "rewards/rejected": 2.195749521255493, "step": 8630 }, { "epoch": 0.4011328288221366, "grad_norm": 85.00645446777344, "learning_rate": 4.3321416964575884e-07, "logits/chosen": -18.178050994873047, "logits/rejected": -17.501495361328125, "logps/chosen": -391.21124267578125, "logps/rejected": -320.3983154296875, "loss": 0.6805, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.14018177986145, "rewards/margins": 0.3616274893283844, "rewards/rejected": 1.7785543203353882, "step": 8640 }, { "epoch": 0.4015971029295696, "grad_norm": 15.362048149108887, "learning_rate": 4.3313679062785335e-07, "logits/chosen": -18.398780822753906, "logits/rejected": -17.982545852661133, "logps/chosen": -351.19281005859375, "logps/rejected": -303.02520751953125, "loss": 0.5008, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.217170000076294, "rewards/margins": 0.6360169649124146, "rewards/rejected": 1.5811527967453003, "step": 8650 }, { "epoch": 0.4020613770370026, "grad_norm": 60.86330795288086, "learning_rate": 4.330594116099478e-07, 
"logits/chosen": -18.571149826049805, "logits/rejected": -16.77938461303711, "logps/chosen": -476.90887451171875, "logps/rejected": -244.70388793945312, "loss": 0.3706, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.4729151725769043, "rewards/margins": 1.0562312602996826, "rewards/rejected": 1.4166842699050903, "step": 8660 }, { "epoch": 0.40252565114443567, "grad_norm": 7.628020763397217, "learning_rate": 4.329820325920423e-07, "logits/chosen": -18.68770408630371, "logits/rejected": -18.298561096191406, "logps/chosen": -449.2361755371094, "logps/rejected": -407.39825439453125, "loss": 0.7219, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.813013792037964, "rewards/margins": 0.5967259407043457, "rewards/rejected": 2.216287612915039, "step": 8670 }, { "epoch": 0.4029899252518687, "grad_norm": 153.3500518798828, "learning_rate": 4.329046535741368e-07, "logits/chosen": -19.282007217407227, "logits/rejected": -18.961498260498047, "logps/chosen": -443.87957763671875, "logps/rejected": -373.2762756347656, "loss": 0.6449, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.170808792114258, "rewards/margins": 0.3361334204673767, "rewards/rejected": 1.8346755504608154, "step": 8680 }, { "epoch": 0.4034541993593017, "grad_norm": 61.368412017822266, "learning_rate": 4.328272745562313e-07, "logits/chosen": -18.72330665588379, "logits/rejected": -19.221513748168945, "logps/chosen": -392.9775085449219, "logps/rejected": -379.885009765625, "loss": 0.7164, "rewards/accuracies": 0.5, "rewards/chosen": 2.4490275382995605, "rewards/margins": 0.27966415882110596, "rewards/rejected": 2.169363498687744, "step": 8690 }, { "epoch": 0.40391847346673476, "grad_norm": 65.5085220336914, "learning_rate": 4.327498955383258e-07, "logits/chosen": -18.332351684570312, "logits/rejected": -17.902515411376953, "logps/chosen": -285.43865966796875, "logps/rejected": -345.9740295410156, "loss": 0.8827, "rewards/accuracies": 0.5, "rewards/chosen": 2.283385753631592, 
"rewards/margins": 0.28999805450439453, "rewards/rejected": 1.9933878183364868, "step": 8700 }, { "epoch": 0.4043827475741678, "grad_norm": 87.82151794433594, "learning_rate": 4.326725165204203e-07, "logits/chosen": -17.872745513916016, "logits/rejected": -17.231630325317383, "logps/chosen": -356.1677551269531, "logps/rejected": -272.78912353515625, "loss": 0.6221, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.2471764087677, "rewards/margins": 0.5022572875022888, "rewards/rejected": 1.7449191808700562, "step": 8710 }, { "epoch": 0.4048470216816008, "grad_norm": 10.313946723937988, "learning_rate": 4.3259513750251483e-07, "logits/chosen": -18.234989166259766, "logits/rejected": -17.034910202026367, "logps/chosen": -477.79949951171875, "logps/rejected": -301.319091796875, "loss": 0.4163, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.6807284355163574, "rewards/margins": 1.1989519596099854, "rewards/rejected": 1.4817768335342407, "step": 8720 }, { "epoch": 0.40531129578903385, "grad_norm": 9.738044738769531, "learning_rate": 4.325177584846093e-07, "logits/chosen": -19.448719024658203, "logits/rejected": -18.968107223510742, "logps/chosen": -434.82977294921875, "logps/rejected": -283.48883056640625, "loss": 0.3665, "rewards/accuracies": 1.0, "rewards/chosen": 2.24710750579834, "rewards/margins": 0.970528781414032, "rewards/rejected": 1.276578664779663, "step": 8730 }, { "epoch": 0.40577556989646685, "grad_norm": 137.78985595703125, "learning_rate": 4.324403794667038e-07, "logits/chosen": -18.516687393188477, "logits/rejected": -18.3621883392334, "logps/chosen": -449.03399658203125, "logps/rejected": -458.59283447265625, "loss": 0.6169, "rewards/accuracies": 0.5, "rewards/chosen": 2.605768918991089, "rewards/margins": 0.35265403985977173, "rewards/rejected": 2.253114938735962, "step": 8740 }, { "epoch": 0.4062398440038999, "grad_norm": 5.552378177642822, "learning_rate": 4.323630004487983e-07, "logits/chosen": -19.59994125366211, 
"logits/rejected": -17.930850982666016, "logps/chosen": -531.2154541015625, "logps/rejected": -378.8332824707031, "loss": 0.468, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.9588840007781982, "rewards/margins": 1.0518162250518799, "rewards/rejected": 1.907067894935608, "step": 8750 }, { "epoch": 0.40670411811133295, "grad_norm": 50.38489532470703, "learning_rate": 4.3228562143089277e-07, "logits/chosen": -18.991296768188477, "logits/rejected": -17.912242889404297, "logps/chosen": -430.5673828125, "logps/rejected": -284.94512939453125, "loss": 0.4134, "rewards/accuracies": 1.0, "rewards/chosen": 2.4235448837280273, "rewards/margins": 0.8543216586112976, "rewards/rejected": 1.569223403930664, "step": 8760 }, { "epoch": 0.40716839221876594, "grad_norm": 125.68573760986328, "learning_rate": 4.322082424129873e-07, "logits/chosen": -18.701091766357422, "logits/rejected": -17.8924560546875, "logps/chosen": -451.2870178222656, "logps/rejected": -363.49395751953125, "loss": 0.4407, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.7025413513183594, "rewards/margins": 1.1239060163497925, "rewards/rejected": 1.5786349773406982, "step": 8770 }, { "epoch": 0.407632666326199, "grad_norm": 40.24443817138672, "learning_rate": 4.3213086339508174e-07, "logits/chosen": -19.607946395874023, "logits/rejected": -18.903459548950195, "logps/chosen": -426.1766662597656, "logps/rejected": -293.1063537597656, "loss": 0.4677, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.6280832290649414, "rewards/margins": 0.8855546116828918, "rewards/rejected": 1.7425286769866943, "step": 8780 }, { "epoch": 0.40809694043363204, "grad_norm": 30.495630264282227, "learning_rate": 4.3205348437717625e-07, "logits/chosen": -18.331405639648438, "logits/rejected": -17.768844604492188, "logps/chosen": -476.3310546875, "logps/rejected": -379.1083679199219, "loss": 0.931, "rewards/accuracies": 0.5, "rewards/chosen": 2.4265644550323486, "rewards/margins": -0.13639847934246063, 
"rewards/rejected": 2.5629630088806152, "step": 8790 }, { "epoch": 0.40856121454106503, "grad_norm": 244.9469757080078, "learning_rate": 4.3197610535927076e-07, "logits/chosen": -19.42144775390625, "logits/rejected": -19.47904396057129, "logps/chosen": -369.42291259765625, "logps/rejected": -462.20269775390625, "loss": 1.1296, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.107144355773926, "rewards/margins": -0.6099246144294739, "rewards/rejected": 2.717069149017334, "step": 8800 }, { "epoch": 0.4090254886484981, "grad_norm": 24.7183837890625, "learning_rate": 4.3189872634136527e-07, "logits/chosen": -18.698389053344727, "logits/rejected": -19.242101669311523, "logps/chosen": -284.29364013671875, "logps/rejected": -309.9344482421875, "loss": 1.0874, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.8683412075042725, "rewards/margins": -0.48365384340286255, "rewards/rejected": 2.3519949913024902, "step": 8810 }, { "epoch": 0.4094897627559311, "grad_norm": 52.888729095458984, "learning_rate": 4.318213473234598e-07, "logits/chosen": -19.640958786010742, "logits/rejected": -17.969600677490234, "logps/chosen": -388.7062072753906, "logps/rejected": -215.790283203125, "loss": 0.3967, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.508612632751465, "rewards/margins": 1.4216654300689697, "rewards/rejected": 1.0869472026824951, "step": 8820 }, { "epoch": 0.4099540368633641, "grad_norm": 88.02239990234375, "learning_rate": 4.3174396830555424e-07, "logits/chosen": -18.913801193237305, "logits/rejected": -18.436784744262695, "logps/chosen": -329.49786376953125, "logps/rejected": -252.06625366210938, "loss": 0.6812, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.7604166269302368, "rewards/margins": 0.12140794098377228, "rewards/rejected": 1.6390082836151123, "step": 8830 }, { "epoch": 0.4104183109707972, "grad_norm": 27.588253021240234, "learning_rate": 4.3166658928764875e-07, "logits/chosen": -19.085594177246094, 
"logits/rejected": -17.86258888244629, "logps/chosen": -376.8266906738281, "logps/rejected": -305.383544921875, "loss": 0.8347, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.5816407203674316, "rewards/margins": 0.22265250980854034, "rewards/rejected": 2.358988046646118, "step": 8840 }, { "epoch": 0.41088258507823017, "grad_norm": 112.26720428466797, "learning_rate": 4.3158921026974326e-07, "logits/chosen": -19.090557098388672, "logits/rejected": -17.299386978149414, "logps/chosen": -468.57379150390625, "logps/rejected": -284.91339111328125, "loss": 0.3612, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.091426372528076, "rewards/margins": 1.5563933849334717, "rewards/rejected": 1.535032868385315, "step": 8850 }, { "epoch": 0.4113468591856632, "grad_norm": 171.07420349121094, "learning_rate": 4.315118312518377e-07, "logits/chosen": -17.856037139892578, "logits/rejected": -17.7416934967041, "logps/chosen": -342.1316223144531, "logps/rejected": -393.8053283691406, "loss": 0.8983, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.8596032857894897, "rewards/margins": -0.27299603819847107, "rewards/rejected": 2.132599353790283, "step": 8860 }, { "epoch": 0.41181113329309627, "grad_norm": 94.84356689453125, "learning_rate": 4.3143445223393223e-07, "logits/chosen": -19.817094802856445, "logits/rejected": -19.232711791992188, "logps/chosen": -388.45343017578125, "logps/rejected": -294.7603454589844, "loss": 0.6342, "rewards/accuracies": 0.5, "rewards/chosen": 2.9865410327911377, "rewards/margins": 0.9383646249771118, "rewards/rejected": 2.0481762886047363, "step": 8870 }, { "epoch": 0.41227540740052926, "grad_norm": 188.4989471435547, "learning_rate": 4.313570732160267e-07, "logits/chosen": -18.098543167114258, "logits/rejected": -17.255184173583984, "logps/chosen": -403.7298583984375, "logps/rejected": -285.2551574707031, "loss": 0.5851, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.1825053691864014, 
"rewards/margins": 0.43872517347335815, "rewards/rejected": 1.7437803745269775, "step": 8880 }, { "epoch": 0.4127396815079623, "grad_norm": 43.7763557434082, "learning_rate": 4.312796941981212e-07, "logits/chosen": -18.303524017333984, "logits/rejected": -17.58306312561035, "logps/chosen": -369.11322021484375, "logps/rejected": -325.87396240234375, "loss": 0.7696, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.075178861618042, "rewards/margins": 0.13682934641838074, "rewards/rejected": 1.938349723815918, "step": 8890 }, { "epoch": 0.4132039556153953, "grad_norm": 78.96481323242188, "learning_rate": 4.312023151802157e-07, "logits/chosen": -18.94282341003418, "logits/rejected": -18.195003509521484, "logps/chosen": -429.7208557128906, "logps/rejected": -374.6463317871094, "loss": 0.4822, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.6692681312561035, "rewards/margins": 0.6467075347900391, "rewards/rejected": 2.0225605964660645, "step": 8900 }, { "epoch": 0.41366822972282835, "grad_norm": 104.41795349121094, "learning_rate": 4.311249361623102e-07, "logits/chosen": -17.8158016204834, "logits/rejected": -17.118099212646484, "logps/chosen": -411.0029296875, "logps/rejected": -294.5140380859375, "loss": 0.5467, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.0889699459075928, "rewards/margins": 0.5316516757011414, "rewards/rejected": 1.5573183298110962, "step": 8910 }, { "epoch": 0.4141325038302614, "grad_norm": 148.58334350585938, "learning_rate": 4.3104755714440474e-07, "logits/chosen": -19.552473068237305, "logits/rejected": -18.908748626708984, "logps/chosen": -558.3900756835938, "logps/rejected": -446.247314453125, "loss": 0.4434, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.1061103343963623, "rewards/margins": 0.7194204926490784, "rewards/rejected": 2.3866896629333496, "step": 8920 }, { "epoch": 0.4145967779376944, "grad_norm": 147.3614501953125, "learning_rate": 4.309701781264992e-07, "logits/chosen": 
-17.71469497680664, "logits/rejected": -17.640432357788086, "logps/chosen": -322.22747802734375, "logps/rejected": -336.35064697265625, "loss": 0.6697, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.8969656229019165, "rewards/margins": 0.4141017496585846, "rewards/rejected": 1.4828637838363647, "step": 8930 }, { "epoch": 0.41506105204512744, "grad_norm": 102.3362808227539, "learning_rate": 4.308927991085937e-07, "logits/chosen": -19.6799259185791, "logits/rejected": -18.487594604492188, "logps/chosen": -416.62384033203125, "logps/rejected": -272.9150390625, "loss": 0.3383, "rewards/accuracies": 1.0, "rewards/chosen": 2.462810754776001, "rewards/margins": 1.2350108623504639, "rewards/rejected": 1.227799892425537, "step": 8940 }, { "epoch": 0.4155253261525605, "grad_norm": 22.456417083740234, "learning_rate": 4.3081542009068816e-07, "logits/chosen": -19.4501953125, "logits/rejected": -18.547073364257812, "logps/chosen": -323.283447265625, "logps/rejected": -255.69406127929688, "loss": 0.5815, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.1455681324005127, "rewards/margins": 0.49458780884742737, "rewards/rejected": 1.6509802341461182, "step": 8950 }, { "epoch": 0.4159896002599935, "grad_norm": 41.30078887939453, "learning_rate": 4.307380410727827e-07, "logits/chosen": -19.29265785217285, "logits/rejected": -18.156742095947266, "logps/chosen": -439.10198974609375, "logps/rejected": -338.97491455078125, "loss": 0.3723, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.1736302375793457, "rewards/margins": 1.272416353225708, "rewards/rejected": 1.9012138843536377, "step": 8960 }, { "epoch": 0.41645387436742654, "grad_norm": 42.85152053833008, "learning_rate": 4.306606620548772e-07, "logits/chosen": -19.015966415405273, "logits/rejected": -17.801647186279297, "logps/chosen": -431.08587646484375, "logps/rejected": -231.0066375732422, "loss": 0.4095, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.5337233543395996, 
"rewards/margins": 0.8025192022323608, "rewards/rejected": 1.7312042713165283, "step": 8970 }, { "epoch": 0.4169181484748596, "grad_norm": 166.33055114746094, "learning_rate": 4.3058328303697164e-07, "logits/chosen": -19.772825241088867, "logits/rejected": -20.191591262817383, "logps/chosen": -460.58740234375, "logps/rejected": -486.8829040527344, "loss": 1.0528, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.684911012649536, "rewards/margins": -0.3806535005569458, "rewards/rejected": 3.0655646324157715, "step": 8980 }, { "epoch": 0.4173824225822926, "grad_norm": 66.74739074707031, "learning_rate": 4.3050590401906616e-07, "logits/chosen": -18.497947692871094, "logits/rejected": -17.2570858001709, "logps/chosen": -440.24896240234375, "logps/rejected": -338.67822265625, "loss": 0.7371, "rewards/accuracies": 0.5, "rewards/chosen": 2.005039691925049, "rewards/margins": 0.07625957578420639, "rewards/rejected": 1.92877995967865, "step": 8990 }, { "epoch": 0.4178466966897256, "grad_norm": 24.61842155456543, "learning_rate": 4.3042852500116067e-07, "logits/chosen": -18.19022560119629, "logits/rejected": -16.840105056762695, "logps/chosen": -356.7620544433594, "logps/rejected": -273.7652893066406, "loss": 0.4479, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.9559216499328613, "rewards/margins": 1.2594958543777466, "rewards/rejected": 1.6964256763458252, "step": 9000 }, { "epoch": 0.4183109707971586, "grad_norm": 151.29208374023438, "learning_rate": 4.303511459832552e-07, "logits/chosen": -19.665002822875977, "logits/rejected": -18.441787719726562, "logps/chosen": -571.4978637695312, "logps/rejected": -415.31854248046875, "loss": 0.7928, "rewards/accuracies": 0.5, "rewards/chosen": 3.179518699645996, "rewards/margins": 0.3661766052246094, "rewards/rejected": 2.8133418560028076, "step": 9010 }, { "epoch": 0.41877524490459167, "grad_norm": 98.58966064453125, "learning_rate": 4.302737669653497e-07, "logits/chosen": -19.311708450317383, 
"logits/rejected": -17.2944278717041, "logps/chosen": -533.6740112304688, "logps/rejected": -368.7688293457031, "loss": 0.3418, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.9929919242858887, "rewards/margins": 1.2821348905563354, "rewards/rejected": 1.7108570337295532, "step": 9020 }, { "epoch": 0.4192395190120247, "grad_norm": 11.31840991973877, "learning_rate": 4.3019638794744415e-07, "logits/chosen": -17.755849838256836, "logits/rejected": -17.656015396118164, "logps/chosen": -355.86383056640625, "logps/rejected": -330.96923828125, "loss": 1.0021, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.2236168384552, "rewards/margins": -0.1215289980173111, "rewards/rejected": 2.3451457023620605, "step": 9030 }, { "epoch": 0.4197037931194577, "grad_norm": 68.93408966064453, "learning_rate": 4.3011900892953866e-07, "logits/chosen": -18.64067268371582, "logits/rejected": -18.05406951904297, "logps/chosen": -446.76025390625, "logps/rejected": -328.02276611328125, "loss": 0.5309, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.4937729835510254, "rewards/margins": 0.6475118398666382, "rewards/rejected": 1.8462610244750977, "step": 9040 }, { "epoch": 0.42016806722689076, "grad_norm": 23.157163619995117, "learning_rate": 4.300416299116331e-07, "logits/chosen": -20.522869110107422, "logits/rejected": -20.107065200805664, "logps/chosen": -419.612060546875, "logps/rejected": -376.06854248046875, "loss": 0.6659, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.4382951259613037, "rewards/margins": 0.23318593204021454, "rewards/rejected": 2.205109119415283, "step": 9050 }, { "epoch": 0.4206323413343238, "grad_norm": 87.88349914550781, "learning_rate": 4.2996425089372763e-07, "logits/chosen": -17.835371017456055, "logits/rejected": -17.307960510253906, "logps/chosen": -377.89447021484375, "logps/rejected": -292.8316650390625, "loss": 0.7007, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.3210365772247314, 
"rewards/margins": 0.3848636746406555, "rewards/rejected": 1.9361728429794312, "step": 9060 }, { "epoch": 0.4210966154417568, "grad_norm": 122.01019287109375, "learning_rate": 4.2988687187582214e-07, "logits/chosen": -17.8700008392334, "logits/rejected": -17.59353256225586, "logps/chosen": -368.17364501953125, "logps/rejected": -368.85748291015625, "loss": 0.644, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.312775135040283, "rewards/margins": 0.2990170121192932, "rewards/rejected": 2.0137581825256348, "step": 9070 }, { "epoch": 0.42156088954918985, "grad_norm": 15.431601524353027, "learning_rate": 4.298094928579166e-07, "logits/chosen": -18.66535758972168, "logits/rejected": -17.3824462890625, "logps/chosen": -351.1194763183594, "logps/rejected": -281.7906799316406, "loss": 0.5651, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.191556215286255, "rewards/margins": 0.6857888102531433, "rewards/rejected": 1.5057674646377563, "step": 9080 }, { "epoch": 0.42202516365662285, "grad_norm": 68.86162567138672, "learning_rate": 4.297321138400111e-07, "logits/chosen": -18.310518264770508, "logits/rejected": -17.30680274963379, "logps/chosen": -514.3916625976562, "logps/rejected": -395.10302734375, "loss": 0.4523, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.0098981857299805, "rewards/margins": 0.8763939142227173, "rewards/rejected": 2.1335041522979736, "step": 9090 }, { "epoch": 0.4224894377640559, "grad_norm": 15.611083984375, "learning_rate": 4.296547348221056e-07, "logits/chosen": -18.126893997192383, "logits/rejected": -17.481454849243164, "logps/chosen": -374.583984375, "logps/rejected": -294.48004150390625, "loss": 0.4393, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.681938409805298, "rewards/margins": 0.8022456169128418, "rewards/rejected": 1.879692792892456, "step": 9100 }, { "epoch": 0.42295371187148895, "grad_norm": 40.20250701904297, "learning_rate": 4.2957735580420013e-07, "logits/chosen": 
-18.467239379882812, "logits/rejected": -17.970157623291016, "logps/chosen": -332.73992919921875, "logps/rejected": -299.9741516113281, "loss": 0.7728, "rewards/accuracies": 0.5, "rewards/chosen": 2.250558853149414, "rewards/margins": 0.19908170402050018, "rewards/rejected": 2.0514771938323975, "step": 9110 }, { "epoch": 0.42341798597892194, "grad_norm": 68.548583984375, "learning_rate": 4.2949997678629464e-07, "logits/chosen": -19.109094619750977, "logits/rejected": -17.919628143310547, "logps/chosen": -440.1363830566406, "logps/rejected": -420.4397888183594, "loss": 0.536, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.47731351852417, "rewards/margins": 0.5911632776260376, "rewards/rejected": 2.886150360107422, "step": 9120 }, { "epoch": 0.423882260086355, "grad_norm": 37.63746643066406, "learning_rate": 4.294225977683891e-07, "logits/chosen": -18.871700286865234, "logits/rejected": -18.823993682861328, "logps/chosen": -481.84722900390625, "logps/rejected": -464.92498779296875, "loss": 0.9002, "rewards/accuracies": 0.5, "rewards/chosen": 2.569056987762451, "rewards/margins": -0.08404110372066498, "rewards/rejected": 2.6530981063842773, "step": 9130 }, { "epoch": 0.42434653419378804, "grad_norm": 28.56256866455078, "learning_rate": 4.2934521875048356e-07, "logits/chosen": -19.07855224609375, "logits/rejected": -17.371753692626953, "logps/chosen": -434.653564453125, "logps/rejected": -271.68572998046875, "loss": 0.4344, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.8529887199401855, "rewards/margins": 1.2147961854934692, "rewards/rejected": 1.6381924152374268, "step": 9140 }, { "epoch": 0.42481080830122103, "grad_norm": 167.33364868164062, "learning_rate": 4.2926783973257807e-07, "logits/chosen": -17.975107192993164, "logits/rejected": -17.39971351623535, "logps/chosen": -384.330322265625, "logps/rejected": -348.1441650390625, "loss": 0.6607, "rewards/accuracies": 0.5, "rewards/chosen": 2.255657196044922, "rewards/margins": 
0.4717058539390564, "rewards/rejected": 1.7839514017105103, "step": 9150 }, { "epoch": 0.4252750824086541, "grad_norm": 227.37586975097656, "learning_rate": 4.291904607146726e-07, "logits/chosen": -18.465356826782227, "logits/rejected": -18.050785064697266, "logps/chosen": -408.69390869140625, "logps/rejected": -349.1010437011719, "loss": 0.9347, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.6354820728302, "rewards/margins": 0.18460090458393097, "rewards/rejected": 2.4508814811706543, "step": 9160 }, { "epoch": 0.4257393565160871, "grad_norm": 33.2879753112793, "learning_rate": 4.291130816967671e-07, "logits/chosen": -18.516735076904297, "logits/rejected": -18.728233337402344, "logps/chosen": -401.68341064453125, "logps/rejected": -356.3049621582031, "loss": 1.0801, "rewards/accuracies": 0.5, "rewards/chosen": 2.2164547443389893, "rewards/margins": -0.34465524554252625, "rewards/rejected": 2.5611095428466797, "step": 9170 }, { "epoch": 0.4262036306235201, "grad_norm": 76.49556732177734, "learning_rate": 4.2903570267886155e-07, "logits/chosen": -19.92426872253418, "logits/rejected": -19.47679901123047, "logps/chosen": -471.28607177734375, "logps/rejected": -348.30377197265625, "loss": 0.6205, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.4910807609558105, "rewards/margins": 0.3663904070854187, "rewards/rejected": 2.124690294265747, "step": 9180 }, { "epoch": 0.4266679047309532, "grad_norm": 66.06904602050781, "learning_rate": 4.2895832366095606e-07, "logits/chosen": -19.285478591918945, "logits/rejected": -18.02049446105957, "logps/chosen": -354.4913024902344, "logps/rejected": -280.075927734375, "loss": 0.651, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.6136951446533203, "rewards/margins": 0.5278648138046265, "rewards/rejected": 2.0858302116394043, "step": 9190 }, { "epoch": 0.42713217883838617, "grad_norm": 44.1909065246582, "learning_rate": 4.288809446430506e-07, "logits/chosen": -17.50806427001953, 
"logits/rejected": -17.06351661682129, "logps/chosen": -339.56463623046875, "logps/rejected": -278.5300598144531, "loss": 0.5358, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.1812283992767334, "rewards/margins": 0.6392897367477417, "rewards/rejected": 1.5419389009475708, "step": 9200 }, { "epoch": 0.4275964529458192, "grad_norm": 5.898585796356201, "learning_rate": 4.288035656251451e-07, "logits/chosen": -17.99923324584961, "logits/rejected": -17.928136825561523, "logps/chosen": -370.4465026855469, "logps/rejected": -311.02862548828125, "loss": 0.5841, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.161864757537842, "rewards/margins": 0.5635883212089539, "rewards/rejected": 1.5982766151428223, "step": 9210 }, { "epoch": 0.42806072705325227, "grad_norm": 268.54705810546875, "learning_rate": 4.287261866072396e-07, "logits/chosen": -18.157556533813477, "logits/rejected": -16.522794723510742, "logps/chosen": -582.8109741210938, "logps/rejected": -354.86309814453125, "loss": 0.6215, "rewards/accuracies": 0.5, "rewards/chosen": 3.1682469844818115, "rewards/margins": 0.8005023002624512, "rewards/rejected": 2.3677444458007812, "step": 9220 }, { "epoch": 0.42852500116068526, "grad_norm": 178.88343811035156, "learning_rate": 4.286488075893341e-07, "logits/chosen": -19.405221939086914, "logits/rejected": -19.456371307373047, "logps/chosen": -316.15911865234375, "logps/rejected": -371.67791748046875, "loss": 1.0909, "rewards/accuracies": 0.5, "rewards/chosen": 2.2372732162475586, "rewards/margins": -0.2303784191608429, "rewards/rejected": 2.467651844024658, "step": 9230 }, { "epoch": 0.4289892752681183, "grad_norm": 114.7517318725586, "learning_rate": 4.285714285714285e-07, "logits/chosen": -18.43270492553711, "logits/rejected": -18.018796920776367, "logps/chosen": -414.15032958984375, "logps/rejected": -345.9160461425781, "loss": 0.4991, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.690078020095825, "rewards/margins": 
0.6496222615242004, "rewards/rejected": 2.0404558181762695, "step": 9240 }, { "epoch": 0.4294535493755513, "grad_norm": 47.361209869384766, "learning_rate": 4.28494049553523e-07, "logits/chosen": -18.169401168823242, "logits/rejected": -17.322397232055664, "logps/chosen": -397.6122131347656, "logps/rejected": -266.1709289550781, "loss": 0.6288, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.7540926933288574, "rewards/margins": 0.7240869402885437, "rewards/rejected": 2.030005931854248, "step": 9250 }, { "epoch": 0.42991782348298435, "grad_norm": 59.173927307128906, "learning_rate": 4.2841667053561754e-07, "logits/chosen": -17.703323364257812, "logits/rejected": -16.892749786376953, "logps/chosen": -447.8304748535156, "logps/rejected": -372.8982849121094, "loss": 0.9348, "rewards/accuracies": 0.5, "rewards/chosen": 2.2360243797302246, "rewards/margins": 0.08818913996219635, "rewards/rejected": 2.1478352546691895, "step": 9260 }, { "epoch": 0.4303820975904174, "grad_norm": 134.7489471435547, "learning_rate": 4.2833929151771205e-07, "logits/chosen": -19.608562469482422, "logits/rejected": -18.285974502563477, "logps/chosen": -546.4058837890625, "logps/rejected": -378.60369873046875, "loss": 0.4478, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.704958438873291, "rewards/margins": 1.4020116329193115, "rewards/rejected": 2.3029470443725586, "step": 9270 }, { "epoch": 0.4308463716978504, "grad_norm": 35.81715393066406, "learning_rate": 4.282619124998065e-07, "logits/chosen": -18.862796783447266, "logits/rejected": -18.71124839782715, "logps/chosen": -381.0443115234375, "logps/rejected": -430.1575622558594, "loss": 0.8585, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.6698508262634277, "rewards/margins": -0.005538141820579767, "rewards/rejected": 2.675389051437378, "step": 9280 }, { "epoch": 0.43131064580528344, "grad_norm": 8.353226661682129, "learning_rate": 4.28184533481901e-07, "logits/chosen": -18.377113342285156, 
"logits/rejected": -17.84332847595215, "logps/chosen": -532.203857421875, "logps/rejected": -389.475341796875, "loss": 0.7533, "rewards/accuracies": 0.5, "rewards/chosen": 2.579019546508789, "rewards/margins": 0.38803571462631226, "rewards/rejected": 2.190983772277832, "step": 9290 }, { "epoch": 0.4317749199127165, "grad_norm": 18.884817123413086, "learning_rate": 4.2810715446399553e-07, "logits/chosen": -17.872634887695312, "logits/rejected": -17.144662857055664, "logps/chosen": -318.1451110839844, "logps/rejected": -225.14013671875, "loss": 0.7429, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.5765835046768188, "rewards/margins": 0.5311678051948547, "rewards/rejected": 1.0454156398773193, "step": 9300 }, { "epoch": 0.4322391940201495, "grad_norm": 41.16584777832031, "learning_rate": 4.2802977544609004e-07, "logits/chosen": -17.986724853515625, "logits/rejected": -18.734760284423828, "logps/chosen": -373.8129577636719, "logps/rejected": -369.7941589355469, "loss": 0.9789, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": 1.7300726175308228, "rewards/margins": -0.3674240708351135, "rewards/rejected": 2.097496509552002, "step": 9310 }, { "epoch": 0.43270346812758254, "grad_norm": 176.07711791992188, "learning_rate": 4.2795239642818455e-07, "logits/chosen": -18.64420509338379, "logits/rejected": -18.411930084228516, "logps/chosen": -300.96099853515625, "logps/rejected": -324.1858825683594, "loss": 0.7176, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.7178754806518555, "rewards/margins": 0.14042377471923828, "rewards/rejected": 2.577451705932617, "step": 9320 }, { "epoch": 0.43316774223501553, "grad_norm": 75.64246368408203, "learning_rate": 4.2787501741027896e-07, "logits/chosen": -18.856657028198242, "logits/rejected": -18.15191650390625, "logps/chosen": -500.65985107421875, "logps/rejected": -416.03143310546875, "loss": 0.4884, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.829873561859131, "rewards/margins": 
0.5564465522766113, "rewards/rejected": 2.2734267711639404, "step": 9330 }, { "epoch": 0.4336320163424486, "grad_norm": 28.210786819458008, "learning_rate": 4.2779763839237347e-07, "logits/chosen": -18.893756866455078, "logits/rejected": -18.021881103515625, "logps/chosen": -355.0465087890625, "logps/rejected": -286.59320068359375, "loss": 0.5664, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.7495923042297363, "rewards/margins": 0.4671650826931, "rewards/rejected": 2.2824273109436035, "step": 9340 }, { "epoch": 0.4340962904498816, "grad_norm": 66.25101470947266, "learning_rate": 4.27720259374468e-07, "logits/chosen": -19.357452392578125, "logits/rejected": -18.622583389282227, "logps/chosen": -416.5184631347656, "logps/rejected": -339.6099548339844, "loss": 0.5074, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.7614428997039795, "rewards/margins": 0.7885881662368774, "rewards/rejected": 1.9728549718856812, "step": 9350 }, { "epoch": 0.4345605645573146, "grad_norm": 90.34652709960938, "learning_rate": 4.276428803565625e-07, "logits/chosen": -17.467437744140625, "logits/rejected": -17.226642608642578, "logps/chosen": -266.84759521484375, "logps/rejected": -228.79183959960938, "loss": 0.6802, "rewards/accuracies": 0.5, "rewards/chosen": 1.6339032649993896, "rewards/margins": 0.2573058009147644, "rewards/rejected": 1.3765974044799805, "step": 9360 }, { "epoch": 0.43502483866474767, "grad_norm": 77.35994720458984, "learning_rate": 4.27565501338657e-07, "logits/chosen": -17.917133331298828, "logits/rejected": -18.201129913330078, "logps/chosen": -453.074951171875, "logps/rejected": -430.964599609375, "loss": 0.8269, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.7979772090911865, "rewards/margins": -0.11483696848154068, "rewards/rejected": 2.9128143787384033, "step": 9370 }, { "epoch": 0.4354891127721807, "grad_norm": 92.23827362060547, "learning_rate": 4.274881223207515e-07, "logits/chosen": -17.959583282470703, 
"logits/rejected": -17.20795249938965, "logps/chosen": -401.11114501953125, "logps/rejected": -337.24462890625, "loss": 0.7381, "rewards/accuracies": 0.5, "rewards/chosen": 2.2264082431793213, "rewards/margins": 0.45743808150291443, "rewards/rejected": 1.768970251083374, "step": 9380 }, { "epoch": 0.4359533868796137, "grad_norm": 130.01072692871094, "learning_rate": 4.2741074330284597e-07, "logits/chosen": -19.51588249206543, "logits/rejected": -18.725749969482422, "logps/chosen": -367.4718017578125, "logps/rejected": -302.8009338378906, "loss": 0.4142, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.4900097846984863, "rewards/margins": 1.0819677114486694, "rewards/rejected": 1.4080417156219482, "step": 9390 }, { "epoch": 0.43641766098704676, "grad_norm": 31.399526596069336, "learning_rate": 4.273333642849405e-07, "logits/chosen": -18.508960723876953, "logits/rejected": -17.2117862701416, "logps/chosen": -391.6648864746094, "logps/rejected": -247.06591796875, "loss": 0.3224, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.422621250152588, "rewards/margins": 1.4382538795471191, "rewards/rejected": 0.9843673706054688, "step": 9400 }, { "epoch": 0.43688193509447976, "grad_norm": 23.258092880249023, "learning_rate": 4.27255985267035e-07, "logits/chosen": -18.621850967407227, "logits/rejected": -18.973939895629883, "logps/chosen": -394.6651916503906, "logps/rejected": -435.34796142578125, "loss": 0.7854, "rewards/accuracies": 0.5, "rewards/chosen": 2.1925082206726074, "rewards/margins": -0.059540439397096634, "rewards/rejected": 2.2520487308502197, "step": 9410 }, { "epoch": 0.4373462092019128, "grad_norm": 60.1518669128418, "learning_rate": 4.271786062491295e-07, "logits/chosen": -18.6895809173584, "logits/rejected": -17.59296226501465, "logps/chosen": -500.48443603515625, "logps/rejected": -320.7559509277344, "loss": 0.4562, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.286569595336914, "rewards/margins": 1.1693055629730225, 
"rewards/rejected": 2.1172642707824707, "step": 9420 }, { "epoch": 0.43781048330934585, "grad_norm": 89.93133544921875, "learning_rate": 4.271012272312239e-07, "logits/chosen": -18.983936309814453, "logits/rejected": -17.923107147216797, "logps/chosen": -424.7470703125, "logps/rejected": -338.52203369140625, "loss": 0.5463, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.8331828117370605, "rewards/margins": 0.8525413274765015, "rewards/rejected": 1.9806417226791382, "step": 9430 }, { "epoch": 0.43827475741677885, "grad_norm": 41.10796356201172, "learning_rate": 4.270238482133184e-07, "logits/chosen": -17.924230575561523, "logits/rejected": -18.160642623901367, "logps/chosen": -258.4634094238281, "logps/rejected": -272.9016418457031, "loss": 0.8866, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.824377417564392, "rewards/margins": -0.11685197055339813, "rewards/rejected": 1.9412295818328857, "step": 9440 }, { "epoch": 0.4387390315242119, "grad_norm": 12.440783500671387, "learning_rate": 4.2694646919541293e-07, "logits/chosen": -17.70513916015625, "logits/rejected": -17.708717346191406, "logps/chosen": -345.6009216308594, "logps/rejected": -340.68438720703125, "loss": 0.8981, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.9707472324371338, "rewards/margins": 0.08915898948907852, "rewards/rejected": 1.881588339805603, "step": 9450 }, { "epoch": 0.43920330563164495, "grad_norm": 23.832509994506836, "learning_rate": 4.2686909017750745e-07, "logits/chosen": -18.65410804748535, "logits/rejected": -18.826290130615234, "logps/chosen": -355.1832275390625, "logps/rejected": -358.18975830078125, "loss": 0.8481, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.074394702911377, "rewards/margins": -0.1348523199558258, "rewards/rejected": 2.20924711227417, "step": 9460 }, { "epoch": 0.43966757973907794, "grad_norm": 141.0133819580078, "learning_rate": 4.2679171115960196e-07, "logits/chosen": -19.36379623413086, 
"logits/rejected": -18.486583709716797, "logps/chosen": -352.18707275390625, "logps/rejected": -335.21441650390625, "loss": 0.4884, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.0876922607421875, "rewards/margins": 0.706997275352478, "rewards/rejected": 1.3806949853897095, "step": 9470 }, { "epoch": 0.440131853846511, "grad_norm": 22.121549606323242, "learning_rate": 4.2671433214169647e-07, "logits/chosen": -18.43043327331543, "logits/rejected": -17.341402053833008, "logps/chosen": -419.34759521484375, "logps/rejected": -279.46563720703125, "loss": 0.4438, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.581801176071167, "rewards/margins": 0.972662091255188, "rewards/rejected": 1.609139084815979, "step": 9480 }, { "epoch": 0.440596127953944, "grad_norm": 129.56959533691406, "learning_rate": 4.266369531237909e-07, "logits/chosen": -19.579044342041016, "logits/rejected": -17.949687957763672, "logps/chosen": -436.8038024902344, "logps/rejected": -348.4715576171875, "loss": 0.7131, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.735586643218994, "rewards/margins": 0.3784921169281006, "rewards/rejected": 2.3570945262908936, "step": 9490 }, { "epoch": 0.44106040206137703, "grad_norm": 91.7582778930664, "learning_rate": 4.2655957410588544e-07, "logits/chosen": -18.3853759765625, "logits/rejected": -18.445682525634766, "logps/chosen": -472.0184020996094, "logps/rejected": -462.62750244140625, "loss": 0.8197, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.5749754905700684, "rewards/margins": -0.050699688494205475, "rewards/rejected": 2.6256754398345947, "step": 9500 }, { "epoch": 0.4415246761688101, "grad_norm": 45.01533126831055, "learning_rate": 4.2648219508797995e-07, "logits/chosen": -18.330007553100586, "logits/rejected": -17.70046043395996, "logps/chosen": -403.1140441894531, "logps/rejected": -347.24114990234375, "loss": 0.5697, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.292851448059082, 
"rewards/margins": 0.459517240524292, "rewards/rejected": 1.83333420753479, "step": 9510 }, { "epoch": 0.4419889502762431, "grad_norm": 71.81398010253906, "learning_rate": 4.2640481607007446e-07, "logits/chosen": -19.270647048950195, "logits/rejected": -17.8904972076416, "logps/chosen": -439.22833251953125, "logps/rejected": -303.0696716308594, "loss": 0.4447, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.7144665718078613, "rewards/margins": 1.0404664278030396, "rewards/rejected": 1.6740000247955322, "step": 9520 }, { "epoch": 0.4424532243836761, "grad_norm": 27.985443115234375, "learning_rate": 4.263274370521689e-07, "logits/chosen": -18.11032485961914, "logits/rejected": -17.444154739379883, "logps/chosen": -383.440185546875, "logps/rejected": -295.0392150878906, "loss": 0.6279, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.625661611557007, "rewards/margins": 0.40076056122779846, "rewards/rejected": 2.224900960922241, "step": 9530 }, { "epoch": 0.4429174984911092, "grad_norm": 75.62664031982422, "learning_rate": 4.262500580342634e-07, "logits/chosen": -18.95498275756836, "logits/rejected": -18.06700897216797, "logps/chosen": -349.82159423828125, "logps/rejected": -219.2198028564453, "loss": 0.4847, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.059389352798462, "rewards/margins": 0.6152870655059814, "rewards/rejected": 1.444102168083191, "step": 9540 }, { "epoch": 0.44338177259854217, "grad_norm": 44.98170471191406, "learning_rate": 4.261726790163579e-07, "logits/chosen": -18.401037216186523, "logits/rejected": -17.652667999267578, "logps/chosen": -384.9527282714844, "logps/rejected": -305.42962646484375, "loss": 0.6619, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.5351779460906982, "rewards/margins": 0.14020447432994843, "rewards/rejected": 1.3949735164642334, "step": 9550 }, { "epoch": 0.4438460467059752, "grad_norm": 178.2257080078125, "learning_rate": 4.260952999984524e-07, "logits/chosen": 
-17.864286422729492, "logits/rejected": -17.6824951171875, "logps/chosen": -391.51751708984375, "logps/rejected": -357.5580139160156, "loss": 0.805, "rewards/accuracies": 0.5, "rewards/chosen": 2.403049945831299, "rewards/margins": 0.34839051961898804, "rewards/rejected": 2.054659366607666, "step": 9560 }, { "epoch": 0.4443103208134082, "grad_norm": 20.913631439208984, "learning_rate": 4.260179209805469e-07, "logits/chosen": -19.15573501586914, "logits/rejected": -18.695924758911133, "logps/chosen": -480.3375549316406, "logps/rejected": -498.88800048828125, "loss": 0.9103, "rewards/accuracies": 0.5, "rewards/chosen": 2.9880290031433105, "rewards/margins": 0.06336279213428497, "rewards/rejected": 2.9246666431427, "step": 9570 }, { "epoch": 0.44477459492084126, "grad_norm": 189.02066040039062, "learning_rate": 4.259405419626414e-07, "logits/chosen": -18.513408660888672, "logits/rejected": -18.331829071044922, "logps/chosen": -308.3188171386719, "logps/rejected": -330.4189758300781, "loss": 1.0675, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.0531208515167236, "rewards/margins": -0.0894264206290245, "rewards/rejected": 2.142547369003296, "step": 9580 }, { "epoch": 0.4452388690282743, "grad_norm": 5.961293697357178, "learning_rate": 4.258631629447359e-07, "logits/chosen": -18.9448299407959, "logits/rejected": -17.759075164794922, "logps/chosen": -460.03192138671875, "logps/rejected": -333.742919921875, "loss": 0.5789, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.4903550148010254, "rewards/margins": 0.7035675644874573, "rewards/rejected": 1.7867872714996338, "step": 9590 }, { "epoch": 0.4457031431357073, "grad_norm": 126.99791717529297, "learning_rate": 4.257857839268304e-07, "logits/chosen": -18.766103744506836, "logits/rejected": -18.403644561767578, "logps/chosen": -403.519287109375, "logps/rejected": -326.542724609375, "loss": 0.7187, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.144448757171631, "rewards/margins": 
0.08473778516054153, "rewards/rejected": 2.059710741043091, "step": 9600 }, { "epoch": 0.44616741724314035, "grad_norm": 16.749235153198242, "learning_rate": 4.257084049089249e-07, "logits/chosen": -18.378292083740234, "logits/rejected": -17.022327423095703, "logps/chosen": -484.882568359375, "logps/rejected": -306.49456787109375, "loss": 0.5883, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.883167028427124, "rewards/margins": 0.8509181141853333, "rewards/rejected": 2.0322489738464355, "step": 9610 }, { "epoch": 0.4466316913505734, "grad_norm": 119.20832824707031, "learning_rate": 4.2563102589101936e-07, "logits/chosen": -17.902090072631836, "logits/rejected": -18.883739471435547, "logps/chosen": -465.85467529296875, "logps/rejected": -414.65283203125, "loss": 0.8776, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.5797934532165527, "rewards/margins": 0.17501960694789886, "rewards/rejected": 2.4047741889953613, "step": 9620 }, { "epoch": 0.4470959654580064, "grad_norm": 3.994352102279663, "learning_rate": 4.2555364687311387e-07, "logits/chosen": -17.89654541015625, "logits/rejected": -16.79621696472168, "logps/chosen": -391.74578857421875, "logps/rejected": -290.814453125, "loss": 0.6538, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.7350573539733887, "rewards/margins": 0.7747532725334167, "rewards/rejected": 1.9603042602539062, "step": 9630 }, { "epoch": 0.44756023956543944, "grad_norm": 90.42350006103516, "learning_rate": 4.2547626785520833e-07, "logits/chosen": -19.889286041259766, "logits/rejected": -19.71940803527832, "logps/chosen": -422.89642333984375, "logps/rejected": -363.8135986328125, "loss": 0.642, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.381743907928467, "rewards/margins": 0.3391498029232025, "rewards/rejected": 2.0425941944122314, "step": 9640 }, { "epoch": 0.44802451367287244, "grad_norm": 196.87158203125, "learning_rate": 4.2539888883730284e-07, "logits/chosen": -19.35165786743164, 
"logits/rejected": -18.690092086791992, "logps/chosen": -420.363037109375, "logps/rejected": -349.7938537597656, "loss": 0.694, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.3349835872650146, "rewards/margins": 0.36361080408096313, "rewards/rejected": 1.9713729619979858, "step": 9650 }, { "epoch": 0.4484887877803055, "grad_norm": 33.83638381958008, "learning_rate": 4.2532150981939735e-07, "logits/chosen": -19.777423858642578, "logits/rejected": -19.074853897094727, "logps/chosen": -365.35076904296875, "logps/rejected": -263.53448486328125, "loss": 0.4676, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0150227546691895, "rewards/margins": 0.978188157081604, "rewards/rejected": 2.036834239959717, "step": 9660 }, { "epoch": 0.44895306188773854, "grad_norm": 1.6492929458618164, "learning_rate": 4.2524413080149186e-07, "logits/chosen": -17.91934585571289, "logits/rejected": -16.80396842956543, "logps/chosen": -395.9037780761719, "logps/rejected": -263.82958984375, "loss": 0.4604, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.537034034729004, "rewards/margins": 1.1804707050323486, "rewards/rejected": 1.3565633296966553, "step": 9670 }, { "epoch": 0.44941733599517153, "grad_norm": 36.84492111206055, "learning_rate": 4.251667517835864e-07, "logits/chosen": -19.139558792114258, "logits/rejected": -17.91436767578125, "logps/chosen": -428.5980529785156, "logps/rejected": -240.2573699951172, "loss": 0.4294, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.5467231273651123, "rewards/margins": 1.369955062866211, "rewards/rejected": 1.1767680644989014, "step": 9680 }, { "epoch": 0.4498816101026046, "grad_norm": 19.390596389770508, "learning_rate": 4.2508937276568083e-07, "logits/chosen": -18.83148193359375, "logits/rejected": -17.824127197265625, "logps/chosen": -365.4973449707031, "logps/rejected": -300.72369384765625, "loss": 0.6729, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.357908010482788, 
"rewards/margins": 0.3530784249305725, "rewards/rejected": 2.0048296451568604, "step": 9690 }, { "epoch": 0.45034588421003763, "grad_norm": 39.5273551940918, "learning_rate": 4.2501199374777535e-07, "logits/chosen": -19.402973175048828, "logits/rejected": -18.087858200073242, "logps/chosen": -386.4050598144531, "logps/rejected": -241.07022094726562, "loss": 0.352, "rewards/accuracies": 1.0, "rewards/chosen": 2.843477487564087, "rewards/margins": 1.0974944829940796, "rewards/rejected": 1.7459831237792969, "step": 9700 }, { "epoch": 0.4508101583174706, "grad_norm": 25.816335678100586, "learning_rate": 4.2493461472986986e-07, "logits/chosen": -19.47129249572754, "logits/rejected": -17.95309829711914, "logps/chosen": -371.0851135253906, "logps/rejected": -182.35708618164062, "loss": 0.3116, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.073315382003784, "rewards/margins": 2.176584482192993, "rewards/rejected": 0.8967310190200806, "step": 9710 }, { "epoch": 0.45127443242490367, "grad_norm": 135.03790283203125, "learning_rate": 4.248572357119643e-07, "logits/chosen": -20.01607322692871, "logits/rejected": -18.768518447875977, "logps/chosen": -404.85089111328125, "logps/rejected": -339.6656188964844, "loss": 0.5468, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.389408588409424, "rewards/margins": 0.5290073156356812, "rewards/rejected": 1.860400915145874, "step": 9720 }, { "epoch": 0.45173870653233666, "grad_norm": 17.289064407348633, "learning_rate": 4.2477985669405883e-07, "logits/chosen": -19.37462043762207, "logits/rejected": -18.581483840942383, "logps/chosen": -300.83880615234375, "logps/rejected": -193.0535125732422, "loss": 0.4986, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.8346138000488281, "rewards/margins": 0.7559260129928589, "rewards/rejected": 1.0786879062652588, "step": 9730 }, { "epoch": 0.4522029806397697, "grad_norm": 171.4629669189453, "learning_rate": 4.247024776761533e-07, "logits/chosen": 
-17.33513641357422, "logits/rejected": -17.152000427246094, "logps/chosen": -464.49749755859375, "logps/rejected": -392.71820068359375, "loss": 1.2619, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.0283055305480957, "rewards/margins": 0.2228488028049469, "rewards/rejected": 2.8054568767547607, "step": 9740 }, { "epoch": 0.45266725474720276, "grad_norm": 68.90274047851562, "learning_rate": 4.246250986582478e-07, "logits/chosen": -18.47939109802246, "logits/rejected": -18.615955352783203, "logps/chosen": -362.1836853027344, "logps/rejected": -347.80401611328125, "loss": 0.8284, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.4389984607696533, "rewards/margins": 0.0018490791553631425, "rewards/rejected": 2.4371492862701416, "step": 9750 }, { "epoch": 0.45313152885463576, "grad_norm": 21.31777572631836, "learning_rate": 4.245477196403423e-07, "logits/chosen": -18.724843978881836, "logits/rejected": -16.787126541137695, "logps/chosen": -390.112060546875, "logps/rejected": -184.88851928710938, "loss": 0.4832, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.5026144981384277, "rewards/margins": 1.2826687097549438, "rewards/rejected": 1.2199457883834839, "step": 9760 }, { "epoch": 0.4535958029620688, "grad_norm": 48.407161712646484, "learning_rate": 4.244703406224368e-07, "logits/chosen": -19.83848762512207, "logits/rejected": -19.0341796875, "logps/chosen": -422.81964111328125, "logps/rejected": -341.28094482421875, "loss": 0.5409, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.6695895195007324, "rewards/margins": 0.5923101305961609, "rewards/rejected": 2.0772793292999268, "step": 9770 }, { "epoch": 0.45406007706950186, "grad_norm": 26.820417404174805, "learning_rate": 4.2439296160453133e-07, "logits/chosen": -18.999462127685547, "logits/rejected": -18.750110626220703, "logps/chosen": -425.97625732421875, "logps/rejected": -383.51104736328125, "loss": 0.8887, "rewards/accuracies": 0.5, "rewards/chosen": 
2.270669460296631, "rewards/margins": -0.015524208545684814, "rewards/rejected": 2.286193370819092, "step": 9780 }, { "epoch": 0.45452435117693485, "grad_norm": 27.920372009277344, "learning_rate": 4.243155825866258e-07, "logits/chosen": -18.83530044555664, "logits/rejected": -17.343727111816406, "logps/chosen": -423.6332092285156, "logps/rejected": -331.16925048828125, "loss": 0.7959, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.518164873123169, "rewards/margins": 0.7104881405830383, "rewards/rejected": 1.8076766729354858, "step": 9790 }, { "epoch": 0.4549886252843679, "grad_norm": 41.77107620239258, "learning_rate": 4.242382035687203e-07, "logits/chosen": -17.852413177490234, "logits/rejected": -18.04952049255371, "logps/chosen": -362.31378173828125, "logps/rejected": -330.4039001464844, "loss": 0.6505, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.308789014816284, "rewards/margins": 0.27941083908081055, "rewards/rejected": 2.0293781757354736, "step": 9800 }, { "epoch": 0.4554528993918009, "grad_norm": 18.2708740234375, "learning_rate": 4.2416082455081476e-07, "logits/chosen": -19.3408145904541, "logits/rejected": -19.169370651245117, "logps/chosen": -444.11553955078125, "logps/rejected": -420.6546936035156, "loss": 0.6356, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0891060829162598, "rewards/margins": 0.5360387563705444, "rewards/rejected": 2.5530669689178467, "step": 9810 }, { "epoch": 0.45591717349923394, "grad_norm": 15.363972663879395, "learning_rate": 4.2408344553290927e-07, "logits/chosen": -18.769094467163086, "logits/rejected": -18.091854095458984, "logps/chosen": -403.7549743652344, "logps/rejected": -302.9184265136719, "loss": 0.5337, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.638471841812134, "rewards/margins": 1.013005018234253, "rewards/rejected": 1.6254665851593018, "step": 9820 }, { "epoch": 0.456381447606667, "grad_norm": 4.247557640075684, "learning_rate": 4.240060665150038e-07, 
"logits/chosen": -18.314533233642578, "logits/rejected": -17.94331932067871, "logps/chosen": -429.14801025390625, "logps/rejected": -274.4066467285156, "loss": 0.5503, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.822012424468994, "rewards/margins": 1.4353983402252197, "rewards/rejected": 1.3866139650344849, "step": 9830 }, { "epoch": 0.4568457217141, "grad_norm": 165.35269165039062, "learning_rate": 4.2392868749709824e-07, "logits/chosen": -18.636932373046875, "logits/rejected": -18.288028717041016, "logps/chosen": -356.71685791015625, "logps/rejected": -313.40948486328125, "loss": 0.8284, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": 2.0645976066589355, "rewards/margins": -0.012589693069458008, "rewards/rejected": 2.0771870613098145, "step": 9840 }, { "epoch": 0.45730999582153303, "grad_norm": 181.46446228027344, "learning_rate": 4.2385130847919275e-07, "logits/chosen": -18.73400115966797, "logits/rejected": -18.100526809692383, "logps/chosen": -398.28076171875, "logps/rejected": -297.2492980957031, "loss": 0.7096, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.0148978233337402, "rewards/margins": 0.26555123925209045, "rewards/rejected": 1.7493464946746826, "step": 9850 }, { "epoch": 0.4577742699289661, "grad_norm": 63.70878219604492, "learning_rate": 4.2377392946128726e-07, "logits/chosen": -19.12032127380371, "logits/rejected": -18.959272384643555, "logps/chosen": -350.69244384765625, "logps/rejected": -343.86749267578125, "loss": 0.9424, "rewards/accuracies": 0.5, "rewards/chosen": 2.272623062133789, "rewards/margins": -0.21415996551513672, "rewards/rejected": 2.486783266067505, "step": 9860 }, { "epoch": 0.4582385440363991, "grad_norm": 137.90560913085938, "learning_rate": 4.2369655044338177e-07, "logits/chosen": -18.248043060302734, "logits/rejected": -17.869396209716797, "logps/chosen": -425.65875244140625, "logps/rejected": -259.3772277832031, "loss": 0.503, "rewards/accuracies": 0.8999999761581421, 
"rewards/chosen": 2.4198248386383057, "rewards/margins": 0.8338054418563843, "rewards/rejected": 1.5860192775726318, "step": 9870 }, { "epoch": 0.4587028181438321, "grad_norm": 27.061187744140625, "learning_rate": 4.236191714254763e-07, "logits/chosen": -18.787242889404297, "logits/rejected": -17.83710479736328, "logps/chosen": -475.00250244140625, "logps/rejected": -356.19281005859375, "loss": 0.5501, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4522643089294434, "rewards/margins": 0.7232051491737366, "rewards/rejected": 2.7290592193603516, "step": 9880 }, { "epoch": 0.4591670922512652, "grad_norm": 52.72733688354492, "learning_rate": 4.2354179240757074e-07, "logits/chosen": -19.036174774169922, "logits/rejected": -18.816871643066406, "logps/chosen": -461.1944885253906, "logps/rejected": -461.2750549316406, "loss": 0.6717, "rewards/accuracies": 0.5, "rewards/chosen": 2.8679254055023193, "rewards/margins": 0.42797571420669556, "rewards/rejected": 2.4399495124816895, "step": 9890 }, { "epoch": 0.45963136635869817, "grad_norm": 3.844766855239868, "learning_rate": 4.2346441338966525e-07, "logits/chosen": -19.40584945678711, "logits/rejected": -18.072246551513672, "logps/chosen": -582.269287109375, "logps/rejected": -347.435546875, "loss": 0.4489, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.251828670501709, "rewards/margins": 1.1890876293182373, "rewards/rejected": 2.0627408027648926, "step": 9900 }, { "epoch": 0.4600956404661312, "grad_norm": 4.572386264801025, "learning_rate": 4.233870343717597e-07, "logits/chosen": -18.61063003540039, "logits/rejected": -18.58859634399414, "logps/chosen": -370.99774169921875, "logps/rejected": -388.1662902832031, "loss": 0.7414, "rewards/accuracies": 0.5, "rewards/chosen": 2.6052277088165283, "rewards/margins": 0.20202693343162537, "rewards/rejected": 2.40320086479187, "step": 9910 }, { "epoch": 0.4605599145735642, "grad_norm": 79.4870376586914, "learning_rate": 4.233096553538542e-07, 
"logits/chosen": -18.272703170776367, "logits/rejected": -17.864492416381836, "logps/chosen": -337.324951171875, "logps/rejected": -359.089111328125, "loss": 0.8703, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.1580991744995117, "rewards/margins": -0.10491099208593369, "rewards/rejected": 2.263010263442993, "step": 9920 }, { "epoch": 0.46102418868099726, "grad_norm": 70.32105255126953, "learning_rate": 4.2323227633594873e-07, "logits/chosen": -19.147689819335938, "logits/rejected": -18.101787567138672, "logps/chosen": -502.5479431152344, "logps/rejected": -402.4156494140625, "loss": 0.4567, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.2539913654327393, "rewards/margins": 0.7743998765945435, "rewards/rejected": 2.4795913696289062, "step": 9930 }, { "epoch": 0.4614884627884303, "grad_norm": 21.84886932373047, "learning_rate": 4.231548973180432e-07, "logits/chosen": -18.06113052368164, "logits/rejected": -17.743988037109375, "logps/chosen": -299.33160400390625, "logps/rejected": -243.49801635742188, "loss": 0.7779, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.921478509902954, "rewards/margins": -0.015876924619078636, "rewards/rejected": 1.937355399131775, "step": 9940 }, { "epoch": 0.4619527368958633, "grad_norm": 99.10043334960938, "learning_rate": 4.230775183001377e-07, "logits/chosen": -17.919395446777344, "logits/rejected": -18.460063934326172, "logps/chosen": -342.9708251953125, "logps/rejected": -388.720703125, "loss": 0.9776, "rewards/accuracies": 0.5, "rewards/chosen": 2.074420213699341, "rewards/margins": -0.2772781252861023, "rewards/rejected": 2.351698398590088, "step": 9950 }, { "epoch": 0.46241701100329635, "grad_norm": 29.281850814819336, "learning_rate": 4.230001392822322e-07, "logits/chosen": -19.037235260009766, "logits/rejected": -18.292049407958984, "logps/chosen": -345.6763610839844, "logps/rejected": -219.8476104736328, "loss": 0.4894, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 
2.6530067920684814, "rewards/margins": 0.8685098886489868, "rewards/rejected": 1.7844966650009155, "step": 9960 }, { "epoch": 0.4628812851107294, "grad_norm": 36.3098030090332, "learning_rate": 4.2292276026432673e-07, "logits/chosen": -18.03162956237793, "logits/rejected": -17.975202560424805, "logps/chosen": -400.6590270996094, "logps/rejected": -450.32159423828125, "loss": 1.0832, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.000884532928467, "rewards/margins": -0.3323191702365875, "rewards/rejected": 3.3332037925720215, "step": 9970 }, { "epoch": 0.4633455592181624, "grad_norm": 9.336925506591797, "learning_rate": 4.2284538124642124e-07, "logits/chosen": -19.403837203979492, "logits/rejected": -18.642847061157227, "logps/chosen": -375.93206787109375, "logps/rejected": -331.4801025390625, "loss": 0.5811, "rewards/accuracies": 0.5, "rewards/chosen": 2.111633777618408, "rewards/margins": 0.48940953612327576, "rewards/rejected": 1.6222244501113892, "step": 9980 }, { "epoch": 0.46380983332559544, "grad_norm": 25.11257553100586, "learning_rate": 4.227680022285157e-07, "logits/chosen": -18.261764526367188, "logits/rejected": -17.066593170166016, "logps/chosen": -458.4911193847656, "logps/rejected": -309.313232421875, "loss": 0.5429, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.8125526905059814, "rewards/margins": 1.0066581964492798, "rewards/rejected": 1.8058944940567017, "step": 9990 }, { "epoch": 0.46427410743302844, "grad_norm": 80.6597900390625, "learning_rate": 4.2269062321061015e-07, "logits/chosen": -18.44236183166504, "logits/rejected": -18.727720260620117, "logps/chosen": -373.3526916503906, "logps/rejected": -336.6241760253906, "loss": 0.8862, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.8208582401275635, "rewards/margins": -0.0547279492020607, "rewards/rejected": 1.875585913658142, "step": 10000 }, { "epoch": 0.4647383815404615, "grad_norm": 98.59441375732422, "learning_rate": 4.2261324419270467e-07, 
"logits/chosen": -18.14454460144043, "logits/rejected": -16.971817016601562, "logps/chosen": -381.85772705078125, "logps/rejected": -236.0784149169922, "loss": 0.4788, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.9187132120132446, "rewards/margins": 0.6138779520988464, "rewards/rejected": 1.3048350811004639, "step": 10010 }, { "epoch": 0.46520265564789454, "grad_norm": 159.0430450439453, "learning_rate": 4.225358651747992e-07, "logits/chosen": -19.294126510620117, "logits/rejected": -18.335460662841797, "logps/chosen": -449.4452209472656, "logps/rejected": -352.55279541015625, "loss": 0.7344, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.8891730308532715, "rewards/margins": 0.23946194350719452, "rewards/rejected": 2.6497113704681396, "step": 10020 }, { "epoch": 0.46566692975532753, "grad_norm": 83.81912994384766, "learning_rate": 4.224584861568937e-07, "logits/chosen": -19.778263092041016, "logits/rejected": -19.304214477539062, "logps/chosen": -345.938232421875, "logps/rejected": -255.41714477539062, "loss": 0.7593, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.1264405250549316, "rewards/margins": 0.0791628509759903, "rewards/rejected": 2.0472774505615234, "step": 10030 }, { "epoch": 0.4661312038627606, "grad_norm": 122.86876678466797, "learning_rate": 4.2238110713898815e-07, "logits/chosen": -18.301082611083984, "logits/rejected": -18.959354400634766, "logps/chosen": -390.31298828125, "logps/rejected": -415.9244689941406, "loss": 0.7433, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.5165281295776367, "rewards/margins": 0.4983484148979187, "rewards/rejected": 2.0181796550750732, "step": 10040 }, { "epoch": 0.46659547797019363, "grad_norm": 91.72901916503906, "learning_rate": 4.2230372812108266e-07, "logits/chosen": -18.44817543029785, "logits/rejected": -17.034343719482422, "logps/chosen": -412.58160400390625, "logps/rejected": -302.51800537109375, "loss": 0.5017, "rewards/accuracies": 
0.800000011920929, "rewards/chosen": 2.2414231300354004, "rewards/margins": 0.6922961473464966, "rewards/rejected": 1.5491269826889038, "step": 10050 }, { "epoch": 0.4670597520776266, "grad_norm": 27.348508834838867, "learning_rate": 4.2222634910317717e-07, "logits/chosen": -18.486515045166016, "logits/rejected": -17.829578399658203, "logps/chosen": -348.78399658203125, "logps/rejected": -375.65032958984375, "loss": 0.858, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.3370375633239746, "rewards/margins": 0.3514174520969391, "rewards/rejected": 1.9856204986572266, "step": 10060 }, { "epoch": 0.46752402618505967, "grad_norm": 11.709704399108887, "learning_rate": 4.221489700852717e-07, "logits/chosen": -20.019548416137695, "logits/rejected": -17.957239151000977, "logps/chosen": -496.79052734375, "logps/rejected": -284.06134033203125, "loss": 0.4656, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.3340041637420654, "rewards/margins": 1.2611984014511108, "rewards/rejected": 2.072805166244507, "step": 10070 }, { "epoch": 0.46798830029249266, "grad_norm": 65.4748306274414, "learning_rate": 4.220715910673662e-07, "logits/chosen": -18.2117977142334, "logits/rejected": -18.480205535888672, "logps/chosen": -409.5503234863281, "logps/rejected": -405.3023376464844, "loss": 0.975, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": 1.9330692291259766, "rewards/margins": -0.2973862290382385, "rewards/rejected": 2.2304556369781494, "step": 10080 }, { "epoch": 0.4684525743999257, "grad_norm": 92.06800079345703, "learning_rate": 4.2199421204946065e-07, "logits/chosen": -19.51019287109375, "logits/rejected": -18.359874725341797, "logps/chosen": -353.8089294433594, "logps/rejected": -280.2147216796875, "loss": 0.6367, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.1301772594451904, "rewards/margins": 0.5217167735099792, "rewards/rejected": 1.6084606647491455, "step": 10090 }, { "epoch": 0.46891684850735876, "grad_norm": 
35.21043014526367, "learning_rate": 4.219168330315551e-07, "logits/chosen": -18.207809448242188, "logits/rejected": -17.642629623413086, "logps/chosen": -382.3450622558594, "logps/rejected": -319.44830322265625, "loss": 0.7289, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.4903995990753174, "rewards/margins": 0.41200917959213257, "rewards/rejected": 2.078390598297119, "step": 10100 }, { "epoch": 0.46938112261479176, "grad_norm": 24.73813819885254, "learning_rate": 4.218394540136496e-07, "logits/chosen": -18.43411636352539, "logits/rejected": -17.36903190612793, "logps/chosen": -464.5849609375, "logps/rejected": -313.33843994140625, "loss": 0.4837, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.81162691116333, "rewards/margins": 0.9591085314750671, "rewards/rejected": 1.8525184392929077, "step": 10110 }, { "epoch": 0.4698453967222248, "grad_norm": 44.65254592895508, "learning_rate": 4.2176207499574413e-07, "logits/chosen": -18.45113754272461, "logits/rejected": -17.422969818115234, "logps/chosen": -470.3121643066406, "logps/rejected": -335.6947937011719, "loss": 0.5159, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0314877033233643, "rewards/margins": 0.8499239683151245, "rewards/rejected": 2.18156361579895, "step": 10120 }, { "epoch": 0.47030967082965786, "grad_norm": 159.4261932373047, "learning_rate": 4.2168469597783864e-07, "logits/chosen": -18.499942779541016, "logits/rejected": -17.80240249633789, "logps/chosen": -380.9029846191406, "logps/rejected": -352.865966796875, "loss": 0.8139, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.9720203876495361, "rewards/margins": 0.011944872327148914, "rewards/rejected": 1.9600751399993896, "step": 10130 }, { "epoch": 0.47077394493709085, "grad_norm": 154.4839630126953, "learning_rate": 4.216073169599331e-07, "logits/chosen": -19.129846572875977, "logits/rejected": -18.784286499023438, "logps/chosen": -395.82666015625, "logps/rejected": -296.89385986328125, "loss": 
0.5908, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.1172773838043213, "rewards/margins": 0.3728736340999603, "rewards/rejected": 1.744403600692749, "step": 10140 }, { "epoch": 0.4712382190445239, "grad_norm": 91.23056030273438, "learning_rate": 4.215299379420276e-07, "logits/chosen": -18.611324310302734, "logits/rejected": -18.21685218811035, "logps/chosen": -493.10418701171875, "logps/rejected": -463.16900634765625, "loss": 0.7735, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.0175716876983643, "rewards/margins": 0.1994897574186325, "rewards/rejected": 2.818082094192505, "step": 10150 }, { "epoch": 0.4717024931519569, "grad_norm": 64.0663833618164, "learning_rate": 4.214525589241221e-07, "logits/chosen": -19.44337272644043, "logits/rejected": -18.551973342895508, "logps/chosen": -347.7425537109375, "logps/rejected": -298.4324035644531, "loss": 0.6738, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.4964089393615723, "rewards/margins": 0.5764725208282471, "rewards/rejected": 1.9199365377426147, "step": 10160 }, { "epoch": 0.47216676725938994, "grad_norm": 50.513221740722656, "learning_rate": 4.2137517990621664e-07, "logits/chosen": -18.90152359008789, "logits/rejected": -18.282215118408203, "logps/chosen": -495.37933349609375, "logps/rejected": -418.2564392089844, "loss": 0.7179, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.6921334266662598, "rewards/margins": 0.2441389560699463, "rewards/rejected": 2.4479944705963135, "step": 10170 }, { "epoch": 0.472631041366823, "grad_norm": 80.87699890136719, "learning_rate": 4.2129780088831115e-07, "logits/chosen": -18.71896743774414, "logits/rejected": -17.56854248046875, "logps/chosen": -318.5310974121094, "logps/rejected": -243.2530517578125, "loss": 0.6641, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.3181586265563965, "rewards/margins": 0.439735472202301, "rewards/rejected": 1.8784229755401611, "step": 10180 }, { "epoch": 0.473095315474256, 
"grad_norm": 72.94709777832031, "learning_rate": 4.212204218704056e-07, "logits/chosen": -18.625972747802734, "logits/rejected": -18.329925537109375, "logps/chosen": -373.57623291015625, "logps/rejected": -293.8069763183594, "loss": 0.7257, "rewards/accuracies": 0.5, "rewards/chosen": 1.4313491582870483, "rewards/margins": 0.1549881398677826, "rewards/rejected": 1.276361107826233, "step": 10190 }, { "epoch": 0.47355958958168903, "grad_norm": 149.09437561035156, "learning_rate": 4.2114304285250006e-07, "logits/chosen": -18.76712989807129, "logits/rejected": -17.80670166015625, "logps/chosen": -360.10943603515625, "logps/rejected": -272.4876403808594, "loss": 0.5695, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.533133029937744, "rewards/margins": 0.46383315324783325, "rewards/rejected": 2.0693001747131348, "step": 10200 }, { "epoch": 0.4740238636891221, "grad_norm": 10.026886940002441, "learning_rate": 4.210656638345946e-07, "logits/chosen": -18.84125328063965, "logits/rejected": -17.238554000854492, "logps/chosen": -375.2713928222656, "logps/rejected": -256.4380798339844, "loss": 0.5088, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.277385711669922, "rewards/margins": 0.7014005184173584, "rewards/rejected": 1.5759851932525635, "step": 10210 }, { "epoch": 0.4744881377965551, "grad_norm": 190.4180145263672, "learning_rate": 4.209882848166891e-07, "logits/chosen": -19.74152183532715, "logits/rejected": -19.030616760253906, "logps/chosen": -373.22210693359375, "logps/rejected": -295.0743713378906, "loss": 0.6854, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.271899938583374, "rewards/margins": 0.2638562321662903, "rewards/rejected": 2.0080437660217285, "step": 10220 }, { "epoch": 0.4749524119039881, "grad_norm": 138.944091796875, "learning_rate": 4.209109057987836e-07, "logits/chosen": -18.54883575439453, "logits/rejected": -18.138385772705078, "logps/chosen": -377.606201171875, "logps/rejected": -342.64788818359375, 
"loss": 0.6139, "rewards/accuracies": 0.5, "rewards/chosen": 2.5305256843566895, "rewards/margins": 0.2665957510471344, "rewards/rejected": 2.263929843902588, "step": 10230 }, { "epoch": 0.4754166860114211, "grad_norm": 189.3568878173828, "learning_rate": 4.2083352678087806e-07, "logits/chosen": -18.74090003967285, "logits/rejected": -17.99307632446289, "logps/chosen": -390.73419189453125, "logps/rejected": -355.50677490234375, "loss": 0.7691, "rewards/accuracies": 0.5, "rewards/chosen": 2.5287866592407227, "rewards/margins": 0.08652502298355103, "rewards/rejected": 2.4422614574432373, "step": 10240 }, { "epoch": 0.47588096011885417, "grad_norm": 80.42032623291016, "learning_rate": 4.2075614776297257e-07, "logits/chosen": -18.715612411499023, "logits/rejected": -18.670063018798828, "logps/chosen": -345.63330078125, "logps/rejected": -301.1695861816406, "loss": 0.639, "rewards/accuracies": 0.5, "rewards/chosen": 2.1788246631622314, "rewards/margins": 0.16278746724128723, "rewards/rejected": 2.0160374641418457, "step": 10250 }, { "epoch": 0.4763452342262872, "grad_norm": 45.151493072509766, "learning_rate": 4.206787687450671e-07, "logits/chosen": -18.641918182373047, "logits/rejected": -18.65945816040039, "logps/chosen": -434.63226318359375, "logps/rejected": -393.4584655761719, "loss": 0.728, "rewards/accuracies": 0.5, "rewards/chosen": 3.053270101547241, "rewards/margins": 0.1535697877407074, "rewards/rejected": 2.8996999263763428, "step": 10260 }, { "epoch": 0.4768095083337202, "grad_norm": 82.13032531738281, "learning_rate": 4.206013897271616e-07, "logits/chosen": -19.026405334472656, "logits/rejected": -18.97066879272461, "logps/chosen": -417.43060302734375, "logps/rejected": -360.7080993652344, "loss": 0.7619, "rewards/accuracies": 0.5, "rewards/chosen": 2.4785516262054443, "rewards/margins": -0.013974678702652454, "rewards/rejected": 2.4925265312194824, "step": 10270 }, { "epoch": 0.47727378244115326, "grad_norm": 81.69195556640625, "learning_rate": 
4.205240107092561e-07, "logits/chosen": -19.327646255493164, "logits/rejected": -18.654582977294922, "logps/chosen": -401.5832824707031, "logps/rejected": -386.5353088378906, "loss": 0.5992, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.577737808227539, "rewards/margins": 0.45902127027511597, "rewards/rejected": 2.1187167167663574, "step": 10280 }, { "epoch": 0.4777380565485863, "grad_norm": 10.900420188903809, "learning_rate": 4.204466316913505e-07, "logits/chosen": -19.096099853515625, "logits/rejected": -17.48311996459961, "logps/chosen": -244.5265655517578, "logps/rejected": -127.97013092041016, "loss": 0.3818, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.6601976156234741, "rewards/margins": 1.0614491701126099, "rewards/rejected": 0.5987483859062195, "step": 10290 }, { "epoch": 0.4782023306560193, "grad_norm": 115.44261169433594, "learning_rate": 4.20369252673445e-07, "logits/chosen": -18.330997467041016, "logits/rejected": -17.745946884155273, "logps/chosen": -462.128173828125, "logps/rejected": -319.75360107421875, "loss": 0.5768, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5964035987854004, "rewards/margins": 0.9927345514297485, "rewards/rejected": 2.6036691665649414, "step": 10300 }, { "epoch": 0.47866660476345235, "grad_norm": 36.28547668457031, "learning_rate": 4.2029187365553953e-07, "logits/chosen": -18.712181091308594, "logits/rejected": -17.392608642578125, "logps/chosen": -377.08538818359375, "logps/rejected": -254.114013671875, "loss": 0.5547, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.122053384780884, "rewards/margins": 0.6516045331954956, "rewards/rejected": 1.4704487323760986, "step": 10310 }, { "epoch": 0.47913087887088535, "grad_norm": 78.21634674072266, "learning_rate": 4.2021449463763404e-07, "logits/chosen": -18.7122859954834, "logits/rejected": -17.429393768310547, "logps/chosen": -412.65106201171875, "logps/rejected": -267.85479736328125, "loss": 0.5103, 
"rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.061795711517334, "rewards/margins": 0.6474285125732422, "rewards/rejected": 1.414367437362671, "step": 10320 }, { "epoch": 0.4795951529783184, "grad_norm": 16.0577335357666, "learning_rate": 4.2013711561972855e-07, "logits/chosen": -18.6199893951416, "logits/rejected": -17.448211669921875, "logps/chosen": -439.65655517578125, "logps/rejected": -371.4031982421875, "loss": 0.417, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.2288379669189453, "rewards/margins": 1.0367648601531982, "rewards/rejected": 2.192072868347168, "step": 10330 }, { "epoch": 0.48005942708575144, "grad_norm": 110.47239685058594, "learning_rate": 4.20059736601823e-07, "logits/chosen": -19.42498207092285, "logits/rejected": -18.19375228881836, "logps/chosen": -513.7723388671875, "logps/rejected": -380.9251708984375, "loss": 0.7174, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.577373504638672, "rewards/margins": 0.791042149066925, "rewards/rejected": 2.7863311767578125, "step": 10340 }, { "epoch": 0.48052370119318444, "grad_norm": 20.378589630126953, "learning_rate": 4.199823575839175e-07, "logits/chosen": -18.7396297454834, "logits/rejected": -17.402904510498047, "logps/chosen": -376.3887634277344, "logps/rejected": -227.29446411132812, "loss": 0.4532, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.593191623687744, "rewards/margins": 1.015734076499939, "rewards/rejected": 1.5774574279785156, "step": 10350 }, { "epoch": 0.4809879753006175, "grad_norm": 99.79157257080078, "learning_rate": 4.1990497856601203e-07, "logits/chosen": -18.883609771728516, "logits/rejected": -17.533832550048828, "logps/chosen": -482.65020751953125, "logps/rejected": -353.99322509765625, "loss": 0.5031, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.7408194541931152, "rewards/margins": 0.8229252099990845, "rewards/rejected": 1.9178941249847412, "step": 10360 }, { "epoch": 0.48145224940805054, "grad_norm": 
19.939926147460938, "learning_rate": 4.1982759954810654e-07, "logits/chosen": -18.465206146240234, "logits/rejected": -17.91570472717285, "logps/chosen": -348.88775634765625, "logps/rejected": -312.48590087890625, "loss": 0.6286, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.4054133892059326, "rewards/margins": 0.6309406161308289, "rewards/rejected": 1.774472951889038, "step": 10370 }, { "epoch": 0.48191652351548353, "grad_norm": 44.507835388183594, "learning_rate": 4.1975022053020105e-07, "logits/chosen": -19.174209594726562, "logits/rejected": -17.93602180480957, "logps/chosen": -423.3116149902344, "logps/rejected": -318.980224609375, "loss": 0.5877, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.742014169692993, "rewards/margins": 1.0580546855926514, "rewards/rejected": 1.6839593648910522, "step": 10380 }, { "epoch": 0.4823807976229166, "grad_norm": 21.39181900024414, "learning_rate": 4.1967284151229546e-07, "logits/chosen": -18.91478729248047, "logits/rejected": -17.726886749267578, "logps/chosen": -466.72918701171875, "logps/rejected": -279.613525390625, "loss": 0.3784, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.122061014175415, "rewards/margins": 1.4476099014282227, "rewards/rejected": 1.6744511127471924, "step": 10390 }, { "epoch": 0.4828450717303496, "grad_norm": 53.67348861694336, "learning_rate": 4.1959546249438997e-07, "logits/chosen": -18.320232391357422, "logits/rejected": -18.322975158691406, "logps/chosen": -372.9831237792969, "logps/rejected": -312.2916564941406, "loss": 0.7041, "rewards/accuracies": 0.5, "rewards/chosen": 2.4512529373168945, "rewards/margins": 0.16940376162528992, "rewards/rejected": 2.2818493843078613, "step": 10400 }, { "epoch": 0.4833093458377826, "grad_norm": 91.71405029296875, "learning_rate": 4.195180834764845e-07, "logits/chosen": -18.819780349731445, "logits/rejected": -18.30055809020996, "logps/chosen": -387.5734558105469, "logps/rejected": -340.80914306640625, "loss": 
0.7295, "rewards/accuracies": 0.5, "rewards/chosen": 2.5961196422576904, "rewards/margins": 0.20573166012763977, "rewards/rejected": 2.390388011932373, "step": 10410 }, { "epoch": 0.48377361994521567, "grad_norm": 244.94985961914062, "learning_rate": 4.19440704458579e-07, "logits/chosen": -17.938247680664062, "logits/rejected": -17.684799194335938, "logps/chosen": -374.24334716796875, "logps/rejected": -323.78192138671875, "loss": 0.7745, "rewards/accuracies": 0.5, "rewards/chosen": 2.365311622619629, "rewards/margins": 0.2665807604789734, "rewards/rejected": 2.0987308025360107, "step": 10420 }, { "epoch": 0.48423789405264867, "grad_norm": 226.34347534179688, "learning_rate": 4.193633254406735e-07, "logits/chosen": -19.545734405517578, "logits/rejected": -19.16347885131836, "logps/chosen": -397.8122253417969, "logps/rejected": -378.6834716796875, "loss": 0.7492, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.5006985664367676, "rewards/margins": 0.06975172460079193, "rewards/rejected": 2.4309468269348145, "step": 10430 }, { "epoch": 0.4847021681600817, "grad_norm": 91.78141784667969, "learning_rate": 4.1928594642276796e-07, "logits/chosen": -18.028493881225586, "logits/rejected": -18.323993682861328, "logps/chosen": -359.2660217285156, "logps/rejected": -394.24853515625, "loss": 1.1238, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.0740952491760254, "rewards/margins": -0.3085194230079651, "rewards/rejected": 2.3826146125793457, "step": 10440 }, { "epoch": 0.48516644226751476, "grad_norm": 170.00787353515625, "learning_rate": 4.192085674048625e-07, "logits/chosen": -18.191003799438477, "logits/rejected": -18.526214599609375, "logps/chosen": -362.4874267578125, "logps/rejected": -342.5009765625, "loss": 0.7549, "rewards/accuracies": 0.5, "rewards/chosen": 2.467276096343994, "rewards/margins": 0.09563864767551422, "rewards/rejected": 2.3716375827789307, "step": 10450 }, { "epoch": 0.48563071637494776, "grad_norm": 45.29712677001953, 
"learning_rate": 4.19131188386957e-07, "logits/chosen": -19.064510345458984, "logits/rejected": -18.447694778442383, "logps/chosen": -434.336181640625, "logps/rejected": -415.1366271972656, "loss": 0.7597, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.964958667755127, "rewards/margins": 0.38446080684661865, "rewards/rejected": 2.580497980117798, "step": 10460 }, { "epoch": 0.4860949904823808, "grad_norm": NaN, "learning_rate": 4.19061547270842e-07, "logits/chosen": -19.189010620117188, "logits/rejected": -19.76576805114746, "logps/chosen": -473.53955078125, "logps/rejected": -408.0447998046875, "loss": 1.1455, "rewards/accuracies": 0.5, "rewards/chosen": 2.4475793838500977, "rewards/margins": -0.48098403215408325, "rewards/rejected": 2.9285635948181152, "step": 10470 }, { "epoch": 0.4865592645898138, "grad_norm": 96.16346740722656, "learning_rate": 4.189841682529365e-07, "logits/chosen": -18.966920852661133, "logits/rejected": -17.59601402282715, "logps/chosen": -540.8240356445312, "logps/rejected": -323.86932373046875, "loss": 0.4214, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.1947007179260254, "rewards/margins": 1.1372385025024414, "rewards/rejected": 2.057462215423584, "step": 10480 }, { "epoch": 0.48702353869724685, "grad_norm": 117.23177337646484, "learning_rate": 4.1890678923503103e-07, "logits/chosen": -19.69192123413086, "logits/rejected": -18.727191925048828, "logps/chosen": -318.1263732910156, "logps/rejected": -302.4027404785156, "loss": 0.7743, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.3448524475097656, "rewards/margins": 0.14096805453300476, "rewards/rejected": 2.2038846015930176, "step": 10490 }, { "epoch": 0.4874878128046799, "grad_norm": 75.29702758789062, "learning_rate": 4.188294102171255e-07, "logits/chosen": -18.35112953186035, "logits/rejected": -17.973438262939453, "logps/chosen": -291.7712097167969, "logps/rejected": -234.93795776367188, "loss": 0.6091, "rewards/accuracies": 
0.800000011920929, "rewards/chosen": 1.7672560214996338, "rewards/margins": 0.2824842929840088, "rewards/rejected": 1.4847718477249146, "step": 10500 }, { "epoch": 0.4879520869121129, "grad_norm": 28.930192947387695, "learning_rate": 4.1875203119922e-07, "logits/chosen": -17.955366134643555, "logits/rejected": -17.9844913482666, "logps/chosen": -249.86465454101562, "logps/rejected": -244.73574829101562, "loss": 0.77, "rewards/accuracies": 0.5, "rewards/chosen": 1.883306860923767, "rewards/margins": 0.09283138066530228, "rewards/rejected": 1.790475606918335, "step": 10510 }, { "epoch": 0.48841636101954594, "grad_norm": 125.26215362548828, "learning_rate": 4.1867465218131446e-07, "logits/chosen": -18.013965606689453, "logits/rejected": -17.894906997680664, "logps/chosen": -391.39910888671875, "logps/rejected": -335.7660217285156, "loss": 0.5162, "rewards/accuracies": 0.5, "rewards/chosen": 3.5836215019226074, "rewards/margins": 1.4129397869110107, "rewards/rejected": 2.170681953430176, "step": 10520 }, { "epoch": 0.488880635126979, "grad_norm": 134.29969787597656, "learning_rate": 4.1859727316340897e-07, "logits/chosen": -18.405458450317383, "logits/rejected": -17.922657012939453, "logps/chosen": -302.0299987792969, "logps/rejected": -259.54144287109375, "loss": 0.7426, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.38684344291687, "rewards/margins": 0.3053794205188751, "rewards/rejected": 2.0814642906188965, "step": 10530 }, { "epoch": 0.489344909234412, "grad_norm": 4.535597324371338, "learning_rate": 4.185198941455035e-07, "logits/chosen": -18.317913055419922, "logits/rejected": -17.263477325439453, "logps/chosen": -470.8909606933594, "logps/rejected": -353.2152099609375, "loss": 1.051, "rewards/accuracies": 0.5, "rewards/chosen": 2.580317974090576, "rewards/margins": 0.4302234649658203, "rewards/rejected": 2.150094509124756, "step": 10540 }, { "epoch": 0.48980918334184503, "grad_norm": 47.41579055786133, "learning_rate": 4.18442515127598e-07, 
"logits/chosen": -18.657665252685547, "logits/rejected": -17.772645950317383, "logps/chosen": -391.7998352050781, "logps/rejected": -319.86553955078125, "loss": 0.4476, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.6964614391326904, "rewards/margins": 0.7717857956886292, "rewards/rejected": 1.9246755838394165, "step": 10550 }, { "epoch": 0.490273457449278, "grad_norm": 31.26554298400879, "learning_rate": 4.183651361096925e-07, "logits/chosen": -18.39344596862793, "logits/rejected": -18.044208526611328, "logps/chosen": -395.7848815917969, "logps/rejected": -385.3276062011719, "loss": 0.6456, "rewards/accuracies": 0.5, "rewards/chosen": 2.373466968536377, "rewards/margins": 0.32064706087112427, "rewards/rejected": 2.0528199672698975, "step": 10560 }, { "epoch": 0.4907377315567111, "grad_norm": 58.12782287597656, "learning_rate": 4.1828775709178696e-07, "logits/chosen": -18.119857788085938, "logits/rejected": -18.21437644958496, "logps/chosen": -367.2657775878906, "logps/rejected": -423.81134033203125, "loss": 1.0355, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.2778983116149902, "rewards/margins": -0.2741711735725403, "rewards/rejected": 2.5520691871643066, "step": 10570 }, { "epoch": 0.4912020056641441, "grad_norm": 79.27729034423828, "learning_rate": 4.182103780738815e-07, "logits/chosen": -18.4444522857666, "logits/rejected": -17.331575393676758, "logps/chosen": -389.20819091796875, "logps/rejected": -244.2300567626953, "loss": 0.345, "rewards/accuracies": 1.0, "rewards/chosen": 2.299105405807495, "rewards/margins": 0.9863923788070679, "rewards/rejected": 1.3127130270004272, "step": 10580 }, { "epoch": 0.4916662797715771, "grad_norm": 3.645463466644287, "learning_rate": 4.18132999055976e-07, "logits/chosen": -17.665096282958984, "logits/rejected": -16.038158416748047, "logps/chosen": -362.0615234375, "logps/rejected": -194.67953491210938, "loss": 0.4588, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.5516436100006104, 
"rewards/margins": 1.3435308933258057, "rewards/rejected": 1.2081125974655151, "step": 10590 }, { "epoch": 0.49213055387901017, "grad_norm": 62.988975524902344, "learning_rate": 4.1805562003807044e-07, "logits/chosen": -18.127933502197266, "logits/rejected": -17.939579010009766, "logps/chosen": -459.57708740234375, "logps/rejected": -421.844482421875, "loss": 0.7222, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.8753652572631836, "rewards/margins": 0.15615782141685486, "rewards/rejected": 2.719207525253296, "step": 10600 }, { "epoch": 0.4925948279864432, "grad_norm": 16.33500862121582, "learning_rate": 4.1797824102016496e-07, "logits/chosen": -19.13985824584961, "logits/rejected": -17.82352638244629, "logps/chosen": -410.3373107910156, "logps/rejected": -268.305908203125, "loss": 0.4835, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.891026735305786, "rewards/margins": 1.067359447479248, "rewards/rejected": 1.8236671686172485, "step": 10610 }, { "epoch": 0.4930591020938762, "grad_norm": 98.29576873779297, "learning_rate": 4.179008620022594e-07, "logits/chosen": -19.272411346435547, "logits/rejected": -19.742889404296875, "logps/chosen": -370.79241943359375, "logps/rejected": -411.6376953125, "loss": 0.7467, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.833765983581543, "rewards/margins": 0.11922057718038559, "rewards/rejected": 2.714545249938965, "step": 10620 }, { "epoch": 0.49352337620130926, "grad_norm": 142.62742614746094, "learning_rate": 4.178234829843539e-07, "logits/chosen": -19.80862045288086, "logits/rejected": -19.694360733032227, "logps/chosen": -349.973388671875, "logps/rejected": -330.4769287109375, "loss": 0.7712, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.365755558013916, "rewards/margins": 0.26710349321365356, "rewards/rejected": 2.0986521244049072, "step": 10630 }, { "epoch": 0.49398765030874225, "grad_norm": 40.28016662597656, "learning_rate": 4.1774610396644844e-07, 
"logits/chosen": -18.521448135375977, "logits/rejected": -17.685199737548828, "logps/chosen": -467.88800048828125, "logps/rejected": -375.6443176269531, "loss": 0.5453, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1131439208984375, "rewards/margins": 0.5484393835067749, "rewards/rejected": 2.564704418182373, "step": 10640 }, { "epoch": 0.4944519244161753, "grad_norm": 149.81069946289062, "learning_rate": 4.1766872494854295e-07, "logits/chosen": -18.67874526977539, "logits/rejected": -17.695335388183594, "logps/chosen": -465.6000061035156, "logps/rejected": -390.4276428222656, "loss": 0.8118, "rewards/accuracies": 0.5, "rewards/chosen": 3.370561122894287, "rewards/margins": 0.6945646405220032, "rewards/rejected": 2.6759965419769287, "step": 10650 }, { "epoch": 0.49491619852360835, "grad_norm": 207.4216766357422, "learning_rate": 4.1759134593063746e-07, "logits/chosen": -19.10275650024414, "logits/rejected": -18.449277877807617, "logps/chosen": -476.1039123535156, "logps/rejected": -457.1234436035156, "loss": 0.6694, "rewards/accuracies": 0.5, "rewards/chosen": 3.132483720779419, "rewards/margins": 0.4046829342842102, "rewards/rejected": 2.7278010845184326, "step": 10660 }, { "epoch": 0.49538047263104135, "grad_norm": 245.37171936035156, "learning_rate": 4.175139669127319e-07, "logits/chosen": -17.86190414428711, "logits/rejected": -18.394712448120117, "logps/chosen": -378.72467041015625, "logps/rejected": -399.7806701660156, "loss": 0.9701, "rewards/accuracies": 0.5, "rewards/chosen": 1.9291213750839233, "rewards/margins": -0.21531462669372559, "rewards/rejected": 2.1444358825683594, "step": 10670 }, { "epoch": 0.4958447467384744, "grad_norm": 251.2721405029297, "learning_rate": 4.1743658789482643e-07, "logits/chosen": -18.46163558959961, "logits/rejected": -17.767744064331055, "logps/chosen": -411.2523498535156, "logps/rejected": -353.2361755371094, "loss": 0.7755, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.034353017807007, 
"rewards/margins": 0.5988150835037231, "rewards/rejected": 2.4355380535125732, "step": 10680 }, { "epoch": 0.49630902084590744, "grad_norm": 74.234375, "learning_rate": 4.1735920887692094e-07, "logits/chosen": -19.323123931884766, "logits/rejected": -19.00613784790039, "logps/chosen": -276.6768798828125, "logps/rejected": -302.52703857421875, "loss": 0.9777, "rewards/accuracies": 0.5, "rewards/chosen": 1.6138238906860352, "rewards/margins": -0.25002580881118774, "rewards/rejected": 1.8638496398925781, "step": 10690 }, { "epoch": 0.49677329495334044, "grad_norm": 83.70132446289062, "learning_rate": 4.172818298590154e-07, "logits/chosen": -18.2195987701416, "logits/rejected": -17.999473571777344, "logps/chosen": -355.3478698730469, "logps/rejected": -252.22119140625, "loss": 0.729, "rewards/accuracies": 0.5, "rewards/chosen": 2.9164938926696777, "rewards/margins": 1.287537693977356, "rewards/rejected": 1.6289564371109009, "step": 10700 }, { "epoch": 0.4972375690607735, "grad_norm": 24.019437789916992, "learning_rate": 4.172044508411099e-07, "logits/chosen": -18.692110061645508, "logits/rejected": -17.651601791381836, "logps/chosen": -460.2994079589844, "logps/rejected": -325.6445007324219, "loss": 0.48, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.918612003326416, "rewards/margins": 0.9879991412162781, "rewards/rejected": 1.9306132793426514, "step": 10710 }, { "epoch": 0.4977018431682065, "grad_norm": 106.95460510253906, "learning_rate": 4.1712707182320437e-07, "logits/chosen": -17.54606056213379, "logits/rejected": -16.494897842407227, "logps/chosen": -322.29864501953125, "logps/rejected": -270.4590148925781, "loss": 0.5599, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.20514178276062, "rewards/margins": 0.8939708471298218, "rewards/rejected": 1.3111708164215088, "step": 10720 }, { "epoch": 0.49816611727563953, "grad_norm": 120.14588928222656, "learning_rate": 4.170496928052989e-07, "logits/chosen": -18.52683448791504, 
"logits/rejected": -17.995540618896484, "logps/chosen": -411.1380920410156, "logps/rejected": -343.7528381347656, "loss": 0.6073, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.53180193901062, "rewards/margins": 0.7868083715438843, "rewards/rejected": 1.7449935674667358, "step": 10730 }, { "epoch": 0.4986303913830726, "grad_norm": 123.79650115966797, "learning_rate": 4.169723137873934e-07, "logits/chosen": -18.17746925354004, "logits/rejected": -17.403324127197266, "logps/chosen": -449.72247314453125, "logps/rejected": -313.87548828125, "loss": 0.7066, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.0297958850860596, "rewards/margins": 0.5636447668075562, "rewards/rejected": 2.466151237487793, "step": 10740 }, { "epoch": 0.4990946654905056, "grad_norm": 64.9669189453125, "learning_rate": 4.168949347694879e-07, "logits/chosen": -18.875398635864258, "logits/rejected": -18.07077980041504, "logps/chosen": -418.14764404296875, "logps/rejected": -348.46466064453125, "loss": 0.521, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0462400913238525, "rewards/margins": 0.7096475958824158, "rewards/rejected": 2.336592674255371, "step": 10750 }, { "epoch": 0.4995589395979386, "grad_norm": 52.244903564453125, "learning_rate": 4.168175557515824e-07, "logits/chosen": -18.673892974853516, "logits/rejected": -18.766935348510742, "logps/chosen": -336.88067626953125, "logps/rejected": -376.8548889160156, "loss": 0.9739, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.2045838832855225, "rewards/margins": -0.33157873153686523, "rewards/rejected": 2.5361623764038086, "step": 10760 }, { "epoch": 0.5000232137053716, "grad_norm": 227.40663146972656, "learning_rate": 4.1674017673367687e-07, "logits/chosen": -19.60310935974121, "logits/rejected": -18.98400115966797, "logps/chosen": -391.21112060546875, "logps/rejected": -364.71270751953125, "loss": 0.7357, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.488299608230591, 
"rewards/margins": 0.306270033121109, "rewards/rejected": 2.1820292472839355, "step": 10770 }, { "epoch": 0.5004874878128047, "grad_norm": 122.25284576416016, "learning_rate": 4.166627977157714e-07, "logits/chosen": -18.35993766784668, "logits/rejected": -18.298633575439453, "logps/chosen": -482.9994201660156, "logps/rejected": -504.2098693847656, "loss": 1.1441, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.679769277572632, "rewards/margins": -0.5804722309112549, "rewards/rejected": 3.2602412700653076, "step": 10780 }, { "epoch": 0.5009517619202377, "grad_norm": 35.809974670410156, "learning_rate": 4.1658541869786584e-07, "logits/chosen": -19.508371353149414, "logits/rejected": -18.68761444091797, "logps/chosen": -497.6693420410156, "logps/rejected": -481.5650329589844, "loss": 0.732, "rewards/accuracies": 0.5, "rewards/chosen": 2.410006046295166, "rewards/margins": 0.22180216014385223, "rewards/rejected": 2.188203811645508, "step": 10790 }, { "epoch": 0.5014160360276707, "grad_norm": 10.931742668151855, "learning_rate": 4.1650803967996035e-07, "logits/chosen": -19.46890640258789, "logits/rejected": -17.157196044921875, "logps/chosen": -389.2095031738281, "logps/rejected": -237.4335479736328, "loss": 0.5, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.7249093055725098, "rewards/margins": 0.6624668836593628, "rewards/rejected": 2.0624423027038574, "step": 10800 }, { "epoch": 0.5018803101351038, "grad_norm": 37.9610481262207, "learning_rate": 4.1643066066205486e-07, "logits/chosen": -18.31631088256836, "logits/rejected": -17.241657257080078, "logps/chosen": -507.8810119628906, "logps/rejected": -361.92254638671875, "loss": 0.6871, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.2266032695770264, "rewards/margins": 0.57284015417099, "rewards/rejected": 2.6537632942199707, "step": 10810 }, { "epoch": 0.5023445842425368, "grad_norm": 65.8823471069336, "learning_rate": 4.163532816441493e-07, "logits/chosen": 
-19.11344337463379, "logits/rejected": -18.629493713378906, "logps/chosen": -485.553955078125, "logps/rejected": -416.0094299316406, "loss": 0.7928, "rewards/accuracies": 0.5, "rewards/chosen": 3.044553279876709, "rewards/margins": 0.07409224659204483, "rewards/rejected": 2.970460891723633, "step": 10820 }, { "epoch": 0.5028088583499698, "grad_norm": 25.02818489074707, "learning_rate": 4.1627590262624383e-07, "logits/chosen": -18.096729278564453, "logits/rejected": -18.52375030517578, "logps/chosen": -297.98712158203125, "logps/rejected": -312.0966796875, "loss": 0.8033, "rewards/accuracies": 0.5, "rewards/chosen": 1.979071855545044, "rewards/margins": 0.12400057166814804, "rewards/rejected": 1.8550713062286377, "step": 10830 }, { "epoch": 0.5032731324574029, "grad_norm": 12.609761238098145, "learning_rate": 4.1619852360833834e-07, "logits/chosen": -18.841699600219727, "logits/rejected": -18.30215835571289, "logps/chosen": -333.88726806640625, "logps/rejected": -268.2702941894531, "loss": 0.7391, "rewards/accuracies": 0.5, "rewards/chosen": 2.2269625663757324, "rewards/margins": 0.5650196075439453, "rewards/rejected": 1.6619428396224976, "step": 10840 }, { "epoch": 0.5037374065648359, "grad_norm": 111.92684936523438, "learning_rate": 4.1612114459043286e-07, "logits/chosen": -19.876415252685547, "logits/rejected": -18.956146240234375, "logps/chosen": -413.2044982910156, "logps/rejected": -314.15032958984375, "loss": 0.5148, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.373251438140869, "rewards/margins": 0.9338116645812988, "rewards/rejected": 2.439439296722412, "step": 10850 }, { "epoch": 0.5042016806722689, "grad_norm": 36.0660285949707, "learning_rate": 4.1604376557252737e-07, "logits/chosen": -19.156463623046875, "logits/rejected": -18.376375198364258, "logps/chosen": -434.287109375, "logps/rejected": -436.789794921875, "loss": 0.3125, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.4811320304870605, "rewards/margins": 
1.1496260166168213, "rewards/rejected": 2.3315062522888184, "step": 10860 }, { "epoch": 0.5046659547797019, "grad_norm": 42.74420928955078, "learning_rate": 4.159663865546218e-07, "logits/chosen": -18.658281326293945, "logits/rejected": -18.925453186035156, "logps/chosen": -362.26654052734375, "logps/rejected": -358.1646423339844, "loss": 0.8884, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.255124092102051, "rewards/margins": -0.15739916265010834, "rewards/rejected": 2.4125235080718994, "step": 10870 }, { "epoch": 0.505130228887135, "grad_norm": 74.86039733886719, "learning_rate": 4.1588900753671634e-07, "logits/chosen": -18.400928497314453, "logits/rejected": -17.49715805053711, "logps/chosen": -492.9717712402344, "logps/rejected": -328.05181884765625, "loss": 0.4837, "rewards/accuracies": 0.5, "rewards/chosen": 2.8744924068450928, "rewards/margins": 0.8459833264350891, "rewards/rejected": 2.0285089015960693, "step": 10880 }, { "epoch": 0.505594502994568, "grad_norm": 28.422197341918945, "learning_rate": 4.158116285188108e-07, "logits/chosen": -18.804584503173828, "logits/rejected": -18.20252799987793, "logps/chosen": -330.06658935546875, "logps/rejected": -207.1376953125, "loss": 0.773, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.085453748703003, "rewards/margins": 0.49207431077957153, "rewards/rejected": 1.593379259109497, "step": 10890 }, { "epoch": 0.506058777102001, "grad_norm": 67.73848724365234, "learning_rate": 4.157342495009053e-07, "logits/chosen": -20.06380271911621, "logits/rejected": -18.685516357421875, "logps/chosen": -447.55010986328125, "logps/rejected": -352.0804748535156, "loss": 0.3072, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.542526960372925, "rewards/margins": 1.431192398071289, "rewards/rejected": 2.111335277557373, "step": 10900 }, { "epoch": 0.5065230512094341, "grad_norm": 10.312501907348633, "learning_rate": 4.156568704829998e-07, "logits/chosen": -18.699777603149414, 
"logits/rejected": -18.395788192749023, "logps/chosen": -291.9361267089844, "logps/rejected": -238.03695678710938, "loss": 0.716, "rewards/accuracies": 0.5, "rewards/chosen": 2.101635456085205, "rewards/margins": 0.22686739265918732, "rewards/rejected": 1.8747678995132446, "step": 10910 }, { "epoch": 0.5069873253168671, "grad_norm": 122.59675598144531, "learning_rate": 4.155794914650943e-07, "logits/chosen": -18.505701065063477, "logits/rejected": -16.899551391601562, "logps/chosen": -284.11773681640625, "logps/rejected": -242.9537811279297, "loss": 0.5437, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.5555171966552734, "rewards/margins": 1.2994120121002197, "rewards/rejected": 1.2561053037643433, "step": 10920 }, { "epoch": 0.5074515994243001, "grad_norm": 11.365466117858887, "learning_rate": 4.155021124471888e-07, "logits/chosen": -19.202503204345703, "logits/rejected": -18.102802276611328, "logps/chosen": -432.3497619628906, "logps/rejected": -339.1197814941406, "loss": 0.7884, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.213280439376831, "rewards/margins": 0.8237277269363403, "rewards/rejected": 2.3895528316497803, "step": 10930 }, { "epoch": 0.5079158735317332, "grad_norm": 51.821292877197266, "learning_rate": 4.154247334292833e-07, "logits/chosen": -19.010229110717773, "logits/rejected": -18.558073043823242, "logps/chosen": -344.33203125, "logps/rejected": -222.92800903320312, "loss": 0.7767, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.3753206729888916, "rewards/margins": 0.8687880635261536, "rewards/rejected": 1.5065325498580933, "step": 10940 }, { "epoch": 0.5083801476391662, "grad_norm": 30.551088333129883, "learning_rate": 4.153473544113778e-07, "logits/chosen": -17.901016235351562, "logits/rejected": -17.897518157958984, "logps/chosen": -304.53057861328125, "logps/rejected": -282.86639404296875, "loss": 0.7708, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.243298292160034, 
"rewards/margins": 0.34605973958969116, "rewards/rejected": 1.8972387313842773, "step": 10950 }, { "epoch": 0.5088444217465992, "grad_norm": 48.520896911621094, "learning_rate": 4.152699753934723e-07, "logits/chosen": -18.9844913482666, "logits/rejected": -16.92994499206543, "logps/chosen": -477.3030700683594, "logps/rejected": -305.47991943359375, "loss": 0.5139, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0796873569488525, "rewards/margins": 1.1973103284835815, "rewards/rejected": 1.88237726688385, "step": 10960 }, { "epoch": 0.5093086958540323, "grad_norm": 99.74702453613281, "learning_rate": 4.151925963755668e-07, "logits/chosen": -19.68631935119629, "logits/rejected": -18.66791534423828, "logps/chosen": -369.717041015625, "logps/rejected": -331.5494079589844, "loss": 0.6266, "rewards/accuracies": 0.5, "rewards/chosen": 2.387338161468506, "rewards/margins": 0.5352721214294434, "rewards/rejected": 1.8520662784576416, "step": 10970 }, { "epoch": 0.5097729699614653, "grad_norm": 45.720882415771484, "learning_rate": 4.1511521735766124e-07, "logits/chosen": -18.589427947998047, "logits/rejected": -17.569225311279297, "logps/chosen": -331.4256286621094, "logps/rejected": -319.8257141113281, "loss": 0.6243, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.7511277198791504, "rewards/margins": 0.8282982707023621, "rewards/rejected": 1.9228293895721436, "step": 10980 }, { "epoch": 0.5102372440688983, "grad_norm": 46.308807373046875, "learning_rate": 4.1503783833975575e-07, "logits/chosen": -18.7721004486084, "logits/rejected": -16.833160400390625, "logps/chosen": -420.03680419921875, "logps/rejected": -217.901123046875, "loss": 0.562, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.48514461517334, "rewards/margins": 1.0878193378448486, "rewards/rejected": 1.3973252773284912, "step": 10990 }, { "epoch": 0.5107015181763314, "grad_norm": 152.3343048095703, "learning_rate": 4.1496045932185026e-07, "logits/chosen": 
-19.037343978881836, "logits/rejected": -17.954975128173828, "logps/chosen": -438.34912109375, "logps/rejected": -288.9764709472656, "loss": 0.5558, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.980957269668579, "rewards/margins": 1.2545713186264038, "rewards/rejected": 1.7263858318328857, "step": 11000 }, { "epoch": 0.5111657922837644, "grad_norm": 120.63370513916016, "learning_rate": 4.1488308030394477e-07, "logits/chosen": -19.177743911743164, "logits/rejected": -19.16226577758789, "logps/chosen": -382.156005859375, "logps/rejected": -414.0226135253906, "loss": 0.7258, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.4679415225982666, "rewards/margins": 0.23697678744792938, "rewards/rejected": 2.2309646606445312, "step": 11010 }, { "epoch": 0.5116300663911973, "grad_norm": 105.81783294677734, "learning_rate": 4.1480570128603923e-07, "logits/chosen": -18.398906707763672, "logits/rejected": -17.73586654663086, "logps/chosen": -406.1459045410156, "logps/rejected": -315.1635437011719, "loss": 0.8033, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.4070544242858887, "rewards/margins": 0.32974472641944885, "rewards/rejected": 2.077310085296631, "step": 11020 }, { "epoch": 0.5120943404986303, "grad_norm": 136.3385467529297, "learning_rate": 4.1472832226813374e-07, "logits/chosen": -19.446903228759766, "logits/rejected": -18.27145004272461, "logps/chosen": -405.6161804199219, "logps/rejected": -251.9740753173828, "loss": 0.538, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.761371612548828, "rewards/margins": 1.092278242111206, "rewards/rejected": 1.669093132019043, "step": 11030 }, { "epoch": 0.5125586146060634, "grad_norm": 118.93773651123047, "learning_rate": 4.1465094325022825e-07, "logits/chosen": -18.25511932373047, "logits/rejected": -17.847492218017578, "logps/chosen": -445.869384765625, "logps/rejected": -340.364501953125, "loss": 0.6886, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 
2.432574510574341, "rewards/margins": 0.1826060712337494, "rewards/rejected": 2.2499682903289795, "step": 11040 }, { "epoch": 0.5130228887134964, "grad_norm": 130.0161590576172, "learning_rate": 4.1457356423232276e-07, "logits/chosen": -18.92438316345215, "logits/rejected": -18.398548126220703, "logps/chosen": -356.90386962890625, "logps/rejected": -276.8003845214844, "loss": 0.5608, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.8165504932403564, "rewards/margins": 0.8394908905029297, "rewards/rejected": 1.9770593643188477, "step": 11050 }, { "epoch": 0.5134871628209294, "grad_norm": 111.22347259521484, "learning_rate": 4.144961852144173e-07, "logits/chosen": -18.787073135375977, "logits/rejected": -18.429378509521484, "logps/chosen": -417.5306701660156, "logps/rejected": -362.1600036621094, "loss": 1.2547, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.7134366035461426, "rewards/margins": -0.17255237698554993, "rewards/rejected": 2.88598895072937, "step": 11060 }, { "epoch": 0.5139514369283625, "grad_norm": 127.54214477539062, "learning_rate": 4.144188061965118e-07, "logits/chosen": -18.428546905517578, "logits/rejected": -18.57535171508789, "logps/chosen": -490.7501525878906, "logps/rejected": -417.17462158203125, "loss": 0.7697, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.38458251953125, "rewards/margins": 0.07793662697076797, "rewards/rejected": 2.3066458702087402, "step": 11070 }, { "epoch": 0.5144157110357955, "grad_norm": 11.584816932678223, "learning_rate": 4.143414271786062e-07, "logits/chosen": -18.41131591796875, "logits/rejected": -18.57901382446289, "logps/chosen": -267.8975830078125, "logps/rejected": -241.4296112060547, "loss": 0.7516, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.9532508850097656, "rewards/margins": 0.35923781991004944, "rewards/rejected": 1.594012975692749, "step": 11080 }, { "epoch": 0.5148799851432285, "grad_norm": 49.33015823364258, "learning_rate": 
4.142640481607007e-07, "logits/chosen": -18.860687255859375, "logits/rejected": -17.11705207824707, "logps/chosen": -335.75860595703125, "logps/rejected": -223.49520874023438, "loss": 0.3837, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.6811885833740234, "rewards/margins": 1.1685937643051147, "rewards/rejected": 1.5125948190689087, "step": 11090 }, { "epoch": 0.5153442592506616, "grad_norm": 89.51563262939453, "learning_rate": 4.141866691427952e-07, "logits/chosen": -17.97296714782715, "logits/rejected": -16.89607048034668, "logps/chosen": -405.5269775390625, "logps/rejected": -296.9250183105469, "loss": 0.566, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.3029448986053467, "rewards/margins": 0.8824999928474426, "rewards/rejected": 1.4204450845718384, "step": 11100 }, { "epoch": 0.5158085333580946, "grad_norm": 99.51542663574219, "learning_rate": 4.141092901248897e-07, "logits/chosen": -18.515588760375977, "logits/rejected": -18.38551902770996, "logps/chosen": -320.094482421875, "logps/rejected": -336.662109375, "loss": 1.0176, "rewards/accuracies": 0.5, "rewards/chosen": 2.0826854705810547, "rewards/margins": -0.22756394743919373, "rewards/rejected": 2.3102493286132812, "step": 11110 }, { "epoch": 0.5162728074655276, "grad_norm": 183.596923828125, "learning_rate": 4.140319111069842e-07, "logits/chosen": -17.895030975341797, "logits/rejected": -16.904447555541992, "logps/chosen": -416.0682067871094, "logps/rejected": -279.3689270019531, "loss": 0.6321, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.5553410053253174, "rewards/margins": 0.7152307033538818, "rewards/rejected": 1.840110421180725, "step": 11120 }, { "epoch": 0.5167370815729607, "grad_norm": 146.7664031982422, "learning_rate": 4.139545320890787e-07, "logits/chosen": -18.983478546142578, "logits/rejected": -18.76004981994629, "logps/chosen": -347.9541320800781, "logps/rejected": -310.19024658203125, "loss": 0.7481, "rewards/accuracies": 0.8999999761581421, 
"rewards/chosen": 2.5868587493896484, "rewards/margins": 0.14003819227218628, "rewards/rejected": 2.4468207359313965, "step": 11130 }, { "epoch": 0.5172013556803937, "grad_norm": 264.9947204589844, "learning_rate": 4.138771530711732e-07, "logits/chosen": -17.99654197692871, "logits/rejected": -17.530668258666992, "logps/chosen": -273.09588623046875, "logps/rejected": -309.364013671875, "loss": 0.8857, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.71494460105896, "rewards/margins": 0.663923978805542, "rewards/rejected": 2.051020622253418, "step": 11140 }, { "epoch": 0.5176656297878267, "grad_norm": 4.969665050506592, "learning_rate": 4.137997740532677e-07, "logits/chosen": -19.113378524780273, "logits/rejected": -17.644628524780273, "logps/chosen": -529.6915893554688, "logps/rejected": -303.84991455078125, "loss": 0.4165, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.678004741668701, "rewards/margins": 1.6840956211090088, "rewards/rejected": 1.993909239768982, "step": 11150 }, { "epoch": 0.5181299038952598, "grad_norm": 5.698428630828857, "learning_rate": 4.1372239503536223e-07, "logits/chosen": -18.959117889404297, "logits/rejected": -17.963966369628906, "logps/chosen": -528.1712036132812, "logps/rejected": -375.58087158203125, "loss": 0.5013, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.178156614303589, "rewards/margins": 1.1646760702133179, "rewards/rejected": 2.0134806632995605, "step": 11160 }, { "epoch": 0.5185941780026928, "grad_norm": 1.6222903728485107, "learning_rate": 4.1364501601745664e-07, "logits/chosen": -18.025936126708984, "logits/rejected": -17.232799530029297, "logps/chosen": -416.0096740722656, "logps/rejected": -300.4957580566406, "loss": 0.7123, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.1073269844055176, "rewards/margins": 0.8185015916824341, "rewards/rejected": 2.288825035095215, "step": 11170 }, { "epoch": 0.5190584521101258, "grad_norm": 1.2915995121002197, "learning_rate": 
4.1356763699955115e-07, "logits/chosen": -18.52796745300293, "logits/rejected": -18.221006393432617, "logps/chosen": -427.6728515625, "logps/rejected": -435.98907470703125, "loss": 1.204, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.0652058124542236, "rewards/margins": -0.1376257687807083, "rewards/rejected": 3.202831745147705, "step": 11180 }, { "epoch": 0.5195227262175588, "grad_norm": 111.36625671386719, "learning_rate": 4.1349025798164566e-07, "logits/chosen": -18.385950088500977, "logits/rejected": -17.196626663208008, "logps/chosen": -415.5489807128906, "logps/rejected": -290.9431457519531, "loss": 0.3484, "rewards/accuracies": 1.0, "rewards/chosen": 3.300579071044922, "rewards/margins": 1.3838216066360474, "rewards/rejected": 1.9167578220367432, "step": 11190 }, { "epoch": 0.5199870003249919, "grad_norm": 31.871990203857422, "learning_rate": 4.1341287896374017e-07, "logits/chosen": -19.376745223999023, "logits/rejected": -18.23033332824707, "logps/chosen": -271.57611083984375, "logps/rejected": -249.25369262695312, "loss": 0.4466, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.619140863418579, "rewards/margins": 1.0803964138031006, "rewards/rejected": 1.5387448072433472, "step": 11200 }, { "epoch": 0.5204512744324249, "grad_norm": 93.14334106445312, "learning_rate": 4.133354999458347e-07, "logits/chosen": -19.190149307250977, "logits/rejected": -17.794679641723633, "logps/chosen": -440.1180725097656, "logps/rejected": -331.0005798339844, "loss": 0.3378, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.7549026012420654, "rewards/margins": 1.411484956741333, "rewards/rejected": 2.3434181213378906, "step": 11210 }, { "epoch": 0.5209155485398579, "grad_norm": 125.24957275390625, "learning_rate": 4.132581209279292e-07, "logits/chosen": -18.45909881591797, "logits/rejected": -18.163097381591797, "logps/chosen": -448.2974548339844, "logps/rejected": -354.53009033203125, "loss": 0.6075, "rewards/accuracies": 
0.800000011920929, "rewards/chosen": 2.325808048248291, "rewards/margins": 0.4125309884548187, "rewards/rejected": 1.91327702999115, "step": 11220 }, { "epoch": 0.521379822647291, "grad_norm": 68.18902587890625, "learning_rate": 4.1318074191002365e-07, "logits/chosen": -17.178091049194336, "logits/rejected": -17.662996292114258, "logps/chosen": -304.8561096191406, "logps/rejected": -369.07196044921875, "loss": 1.2989, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 1.6883796453475952, "rewards/margins": -0.7013779878616333, "rewards/rejected": 2.3897576332092285, "step": 11230 }, { "epoch": 0.521844096754724, "grad_norm": 135.2917938232422, "learning_rate": 4.1310336289211816e-07, "logits/chosen": -19.19527816772461, "logits/rejected": -17.652576446533203, "logps/chosen": -508.2552795410156, "logps/rejected": -301.162353515625, "loss": 0.5423, "rewards/accuracies": 0.5, "rewards/chosen": 2.5706560611724854, "rewards/margins": 0.6925804018974304, "rewards/rejected": 1.878075361251831, "step": 11240 }, { "epoch": 0.522308370862157, "grad_norm": 62.5671501159668, "learning_rate": 4.1302598387421267e-07, "logits/chosen": -19.159595489501953, "logits/rejected": -18.067012786865234, "logps/chosen": -385.4508972167969, "logps/rejected": -373.5204772949219, "loss": 0.7863, "rewards/accuracies": 0.5, "rewards/chosen": 2.2192742824554443, "rewards/margins": 0.06315895169973373, "rewards/rejected": 2.1561150550842285, "step": 11250 }, { "epoch": 0.5227726449695901, "grad_norm": 115.02879333496094, "learning_rate": 4.129486048563072e-07, "logits/chosen": -18.319637298583984, "logits/rejected": -18.25785255432129, "logps/chosen": -390.8892517089844, "logps/rejected": -433.56683349609375, "loss": 0.753, "rewards/accuracies": 0.5, "rewards/chosen": 2.9228131771087646, "rewards/margins": 0.255813866853714, "rewards/rejected": 2.666999340057373, "step": 11260 }, { "epoch": 0.5232369190770231, "grad_norm": 55.42855453491211, "learning_rate": 4.128712258384016e-07, 
"logits/chosen": -18.299549102783203, "logits/rejected": -17.932851791381836, "logps/chosen": -447.107421875, "logps/rejected": -336.80963134765625, "loss": 0.5706, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.088930368423462, "rewards/margins": 0.7159307599067688, "rewards/rejected": 2.372999429702759, "step": 11270 }, { "epoch": 0.5237011931844561, "grad_norm": 2.4885339736938477, "learning_rate": 4.127938468204961e-07, "logits/chosen": -18.80111312866211, "logits/rejected": -17.853313446044922, "logps/chosen": -468.1197204589844, "logps/rejected": -285.2074890136719, "loss": 0.5975, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.189406633377075, "rewards/margins": 1.4444892406463623, "rewards/rejected": 1.7449172735214233, "step": 11280 }, { "epoch": 0.5241654672918892, "grad_norm": 49.97970199584961, "learning_rate": 4.127164678025906e-07, "logits/chosen": -17.901901245117188, "logits/rejected": -17.56576156616211, "logps/chosen": -358.71002197265625, "logps/rejected": -321.99859619140625, "loss": 0.5625, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.4015121459960938, "rewards/margins": 0.7248546481132507, "rewards/rejected": 1.6766573190689087, "step": 11290 }, { "epoch": 0.5246297413993222, "grad_norm": 11.349520683288574, "learning_rate": 4.126390887846851e-07, "logits/chosen": -17.99209213256836, "logits/rejected": -17.04021644592285, "logps/chosen": -411.67694091796875, "logps/rejected": -255.94235229492188, "loss": 0.4224, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.3290951251983643, "rewards/margins": 0.9156131744384766, "rewards/rejected": 1.4134819507598877, "step": 11300 }, { "epoch": 0.5250940155067552, "grad_norm": 63.51798629760742, "learning_rate": 4.1256170976677963e-07, "logits/chosen": -20.0166015625, "logits/rejected": -20.177867889404297, "logps/chosen": -440.0357360839844, "logps/rejected": -393.2278747558594, "loss": 0.9745, "rewards/accuracies": 0.5, "rewards/chosen": 
2.7773828506469727, "rewards/margins": -0.07669631391763687, "rewards/rejected": 2.8540797233581543, "step": 11310 }, { "epoch": 0.5255582896141883, "grad_norm": 66.50054931640625, "learning_rate": 4.1248433074887415e-07, "logits/chosen": -19.6014347076416, "logits/rejected": -19.58159065246582, "logps/chosen": -441.73785400390625, "logps/rejected": -377.52459716796875, "loss": 0.8386, "rewards/accuracies": 0.5, "rewards/chosen": 3.159153461456299, "rewards/margins": 0.24017062783241272, "rewards/rejected": 2.918982744216919, "step": 11320 }, { "epoch": 0.5260225637216213, "grad_norm": 9.305649757385254, "learning_rate": 4.124069517309686e-07, "logits/chosen": -18.267648696899414, "logits/rejected": -16.88417625427246, "logps/chosen": -309.4639892578125, "logps/rejected": -200.32968139648438, "loss": 0.3559, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.587489604949951, "rewards/margins": 1.1771810054779053, "rewards/rejected": 1.410308837890625, "step": 11330 }, { "epoch": 0.5264868378290543, "grad_norm": 83.38809967041016, "learning_rate": 4.123295727130631e-07, "logits/chosen": -19.01030158996582, "logits/rejected": -18.041723251342773, "logps/chosen": -420.5321350097656, "logps/rejected": -261.91827392578125, "loss": 0.5626, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.807802677154541, "rewards/margins": 0.883625328540802, "rewards/rejected": 1.9241775274276733, "step": 11340 }, { "epoch": 0.5269511119364872, "grad_norm": 154.95445251464844, "learning_rate": 4.1225219369515763e-07, "logits/chosen": -19.37055778503418, "logits/rejected": -18.6079158782959, "logps/chosen": -426.08966064453125, "logps/rejected": -412.12841796875, "loss": 0.6724, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.6337201595306396, "rewards/margins": 0.2809022068977356, "rewards/rejected": 2.352818012237549, "step": 11350 }, { "epoch": 0.5274153860439204, "grad_norm": 33.08811950683594, "learning_rate": 4.1217481467725214e-07, 
"logits/chosen": -19.073978424072266, "logits/rejected": -18.762184143066406, "logps/chosen": -410.5377502441406, "logps/rejected": -407.8450622558594, "loss": 0.7579, "rewards/accuracies": 0.5, "rewards/chosen": 2.607987642288208, "rewards/margins": 0.398698627948761, "rewards/rejected": 2.209289073944092, "step": 11360 }, { "epoch": 0.5278796601513533, "grad_norm": 19.38823890686035, "learning_rate": 4.120974356593466e-07, "logits/chosen": -18.45675277709961, "logits/rejected": -17.74100112915039, "logps/chosen": -326.0450134277344, "logps/rejected": -340.39251708984375, "loss": 1.4296, "rewards/accuracies": 0.5, "rewards/chosen": 2.4408421516418457, "rewards/margins": -0.42789554595947266, "rewards/rejected": 2.8687381744384766, "step": 11370 }, { "epoch": 0.5283439342587863, "grad_norm": 7.446170330047607, "learning_rate": 4.1202005664144105e-07, "logits/chosen": -19.01957893371582, "logits/rejected": -17.641490936279297, "logps/chosen": -441.7984924316406, "logps/rejected": -291.18780517578125, "loss": 0.2709, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.593470335006714, "rewards/margins": 1.6765661239624023, "rewards/rejected": 1.9169038534164429, "step": 11380 }, { "epoch": 0.5288082083662194, "grad_norm": 125.38347625732422, "learning_rate": 4.1194267762353557e-07, "logits/chosen": -19.03904914855957, "logits/rejected": -18.465553283691406, "logps/chosen": -395.2799377441406, "logps/rejected": -361.99761962890625, "loss": 0.5911, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.390371322631836, "rewards/margins": 0.29393917322158813, "rewards/rejected": 2.0964322090148926, "step": 11390 }, { "epoch": 0.5292724824736524, "grad_norm": 71.32327270507812, "learning_rate": 4.118652986056301e-07, "logits/chosen": -18.102001190185547, "logits/rejected": -17.46279525756836, "logps/chosen": -382.2516784667969, "logps/rejected": -242.13595581054688, "loss": 0.5681, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 
2.068666458129883, "rewards/margins": 0.6515365839004517, "rewards/rejected": 1.4171299934387207, "step": 11400 }, { "epoch": 0.5297367565810854, "grad_norm": 119.74456787109375, "learning_rate": 4.117879195877246e-07, "logits/chosen": -18.853776931762695, "logits/rejected": -17.62626838684082, "logps/chosen": -354.31256103515625, "logps/rejected": -270.2142028808594, "loss": 0.5752, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.6835107803344727, "rewards/margins": 0.8080180287361145, "rewards/rejected": 1.8754926919937134, "step": 11410 }, { "epoch": 0.5302010306885185, "grad_norm": 23.341991424560547, "learning_rate": 4.117105405698191e-07, "logits/chosen": -18.7297420501709, "logits/rejected": -18.022132873535156, "logps/chosen": -369.5126953125, "logps/rejected": -333.9601135253906, "loss": 0.5453, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.8708205223083496, "rewards/margins": 1.2989261150360107, "rewards/rejected": 1.5718944072723389, "step": 11420 }, { "epoch": 0.5306653047959515, "grad_norm": 1.4066438674926758, "learning_rate": 4.1163316155191356e-07, "logits/chosen": -18.541831970214844, "logits/rejected": -17.193273544311523, "logps/chosen": -384.7099914550781, "logps/rejected": -241.112060546875, "loss": 0.301, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.903782367706299, "rewards/margins": 2.1233346462249756, "rewards/rejected": 1.7804477214813232, "step": 11430 }, { "epoch": 0.5311295789033845, "grad_norm": 101.77486419677734, "learning_rate": 4.1155578253400807e-07, "logits/chosen": -19.380252838134766, "logits/rejected": -18.640060424804688, "logps/chosen": -417.14691162109375, "logps/rejected": -484.1822204589844, "loss": 0.919, "rewards/accuracies": 0.5, "rewards/chosen": 2.5223050117492676, "rewards/margins": 0.12891754508018494, "rewards/rejected": 2.39338755607605, "step": 11440 }, { "epoch": 0.5315938530108176, "grad_norm": 80.69950103759766, "learning_rate": 4.114784035161026e-07, 
"logits/chosen": -18.574932098388672, "logits/rejected": -17.56374168395996, "logps/chosen": -432.4298400878906, "logps/rejected": -326.7444152832031, "loss": 0.5073, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.3821561336517334, "rewards/margins": 0.7100411653518677, "rewards/rejected": 1.6721150875091553, "step": 11450 }, { "epoch": 0.5320581271182506, "grad_norm": 94.95877838134766, "learning_rate": 4.1140102449819704e-07, "logits/chosen": -18.524883270263672, "logits/rejected": -17.663372039794922, "logps/chosen": -479.5892639160156, "logps/rejected": -336.0054931640625, "loss": 0.5879, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.6027843952178955, "rewards/margins": 0.387477844953537, "rewards/rejected": 2.215306282043457, "step": 11460 }, { "epoch": 0.5325224012256836, "grad_norm": 61.211483001708984, "learning_rate": 4.1132364548029155e-07, "logits/chosen": -18.17530632019043, "logits/rejected": -17.841188430786133, "logps/chosen": -327.67218017578125, "logps/rejected": -296.5736389160156, "loss": 0.9691, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 1.995374321937561, "rewards/margins": -0.24762949347496033, "rewards/rejected": 2.2430038452148438, "step": 11470 }, { "epoch": 0.5329866753331167, "grad_norm": 42.638710021972656, "learning_rate": 4.11246266462386e-07, "logits/chosen": -18.636722564697266, "logits/rejected": -18.575599670410156, "logps/chosen": -382.3438415527344, "logps/rejected": -305.29962158203125, "loss": 0.8128, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.9004873037338257, "rewards/margins": 0.2201647311449051, "rewards/rejected": 1.6803226470947266, "step": 11480 }, { "epoch": 0.5334509494405497, "grad_norm": 37.19510269165039, "learning_rate": 4.111688874444805e-07, "logits/chosen": -18.60616683959961, "logits/rejected": -18.770641326904297, "logps/chosen": -371.93133544921875, "logps/rejected": -419.9141540527344, "loss": 1.0922, "rewards/accuracies": 0.30000001192092896, 
"rewards/chosen": 2.3695998191833496, "rewards/margins": -0.24361327290534973, "rewards/rejected": 2.613213300704956, "step": 11490 }, { "epoch": 0.5339152235479827, "grad_norm": 36.41980743408203, "learning_rate": 4.1109150842657503e-07, "logits/chosen": -18.296571731567383, "logits/rejected": -18.11876106262207, "logps/chosen": -389.6346130371094, "logps/rejected": -345.58929443359375, "loss": 0.8638, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.664313793182373, "rewards/margins": 0.33029070496559143, "rewards/rejected": 2.3340232372283936, "step": 11500 }, { "epoch": 0.5343794976554157, "grad_norm": 122.74626922607422, "learning_rate": 4.1101412940866954e-07, "logits/chosen": -18.6583194732666, "logits/rejected": -17.95353126525879, "logps/chosen": -356.9093933105469, "logps/rejected": -282.9866027832031, "loss": 0.4998, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.09989857673645, "rewards/margins": 1.0154974460601807, "rewards/rejected": 2.0844011306762695, "step": 11510 }, { "epoch": 0.5348437717628488, "grad_norm": 7.513485431671143, "learning_rate": 4.1093675039076405e-07, "logits/chosen": -19.444793701171875, "logits/rejected": -18.47435188293457, "logps/chosen": -427.531005859375, "logps/rejected": -345.69830322265625, "loss": 0.5861, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.335829257965088, "rewards/margins": 0.6457635760307312, "rewards/rejected": 1.6900659799575806, "step": 11520 }, { "epoch": 0.5353080458702818, "grad_norm": 49.29265213012695, "learning_rate": 4.108593713728585e-07, "logits/chosen": -18.305368423461914, "logits/rejected": -16.734342575073242, "logps/chosen": -392.7648010253906, "logps/rejected": -203.9643096923828, "loss": 0.4621, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.3228182792663574, "rewards/margins": 0.919547438621521, "rewards/rejected": 1.4032707214355469, "step": 11530 }, { "epoch": 0.5357723199777148, "grad_norm": 66.81322479248047, "learning_rate": 
4.10781992354953e-07, "logits/chosen": -19.50972557067871, "logits/rejected": -18.16499137878418, "logps/chosen": -523.6375732421875, "logps/rejected": -319.7734680175781, "loss": 0.3587, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.297666549682617, "rewards/margins": 1.3397800922393799, "rewards/rejected": 1.9578863382339478, "step": 11540 }, { "epoch": 0.5362365940851479, "grad_norm": 97.6270523071289, "learning_rate": 4.1070461333704754e-07, "logits/chosen": -18.323150634765625, "logits/rejected": -18.130172729492188, "logps/chosen": -497.504150390625, "logps/rejected": -440.62738037109375, "loss": 0.4957, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.019885778427124, "rewards/margins": 0.6231790781021118, "rewards/rejected": 2.396707057952881, "step": 11550 }, { "epoch": 0.5367008681925809, "grad_norm": 15.551088333129883, "learning_rate": 4.10627234319142e-07, "logits/chosen": -19.03672981262207, "logits/rejected": -17.37972068786621, "logps/chosen": -437.8606872558594, "logps/rejected": -254.9680938720703, "loss": 0.4954, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9868271350860596, "rewards/margins": 1.1885391473770142, "rewards/rejected": 1.7982877492904663, "step": 11560 }, { "epoch": 0.5371651423000139, "grad_norm": 26.904754638671875, "learning_rate": 4.105498553012365e-07, "logits/chosen": -18.63511085510254, "logits/rejected": -17.616233825683594, "logps/chosen": -414.353759765625, "logps/rejected": -276.42535400390625, "loss": 0.4663, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.986389636993408, "rewards/margins": 1.484297752380371, "rewards/rejected": 1.502091884613037, "step": 11570 }, { "epoch": 0.537629416407447, "grad_norm": 17.156566619873047, "learning_rate": 4.1047247628333096e-07, "logits/chosen": -18.77109718322754, "logits/rejected": -17.96653175354004, "logps/chosen": -414.8680725097656, "logps/rejected": -340.515380859375, "loss": 0.5357, "rewards/accuracies": 
0.800000011920929, "rewards/chosen": 3.051431179046631, "rewards/margins": 0.9897575378417969, "rewards/rejected": 2.061673641204834, "step": 11580 }, { "epoch": 0.53809369051488, "grad_norm": 54.048309326171875, "learning_rate": 4.103950972654255e-07, "logits/chosen": -18.997098922729492, "logits/rejected": -17.940555572509766, "logps/chosen": -448.4884338378906, "logps/rejected": -322.1102294921875, "loss": 0.4267, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.8170838356018066, "rewards/margins": 1.0205960273742676, "rewards/rejected": 1.7964880466461182, "step": 11590 }, { "epoch": 0.538557964622313, "grad_norm": 70.6485824584961, "learning_rate": 4.1031771824752e-07, "logits/chosen": -18.17441177368164, "logits/rejected": -17.808696746826172, "logps/chosen": -344.85723876953125, "logps/rejected": -310.24945068359375, "loss": 0.6724, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.840857982635498, "rewards/margins": 0.6272540092468262, "rewards/rejected": 2.213603973388672, "step": 11600 }, { "epoch": 0.5390222387297461, "grad_norm": 68.41612243652344, "learning_rate": 4.102403392296145e-07, "logits/chosen": -18.661340713500977, "logits/rejected": -17.39413833618164, "logps/chosen": -360.98358154296875, "logps/rejected": -266.30517578125, "loss": 0.3738, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.753812313079834, "rewards/margins": 1.345668077468872, "rewards/rejected": 1.4081447124481201, "step": 11610 }, { "epoch": 0.5394865128371791, "grad_norm": 53.066043853759766, "learning_rate": 4.10162960211709e-07, "logits/chosen": -19.110788345336914, "logits/rejected": -17.858619689941406, "logps/chosen": -422.3309020996094, "logps/rejected": -285.07550048828125, "loss": 0.5912, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.4384970664978027, "rewards/margins": 0.8289243578910828, "rewards/rejected": 1.6095727682113647, "step": 11620 }, { "epoch": 0.5399507869446121, "grad_norm": 33.60814666748047, 
"learning_rate": 4.1008558119380347e-07, "logits/chosen": -17.943859100341797, "logits/rejected": -18.223981857299805, "logps/chosen": -362.6141357421875, "logps/rejected": -382.4205017089844, "loss": 0.9749, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.4994404315948486, "rewards/margins": -0.25111374258995056, "rewards/rejected": 2.750553846359253, "step": 11630 }, { "epoch": 0.5404150610520452, "grad_norm": 64.8526611328125, "learning_rate": 4.10008202175898e-07, "logits/chosen": -18.628868103027344, "logits/rejected": -17.613969802856445, "logps/chosen": -473.4285583496094, "logps/rejected": -326.71844482421875, "loss": 0.3775, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.780473232269287, "rewards/margins": 1.4439219236373901, "rewards/rejected": 2.3365511894226074, "step": 11640 }, { "epoch": 0.5408793351594782, "grad_norm": 7.879103660583496, "learning_rate": 4.0993082315799244e-07, "logits/chosen": -17.909364700317383, "logits/rejected": -16.71316909790039, "logps/chosen": -433.75689697265625, "logps/rejected": -304.19561767578125, "loss": 0.5455, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.871293306350708, "rewards/margins": 1.2561547756195068, "rewards/rejected": 1.6151384115219116, "step": 11650 }, { "epoch": 0.5413436092669112, "grad_norm": 38.64381790161133, "learning_rate": 4.0985344414008695e-07, "logits/chosen": -18.365720748901367, "logits/rejected": -17.639970779418945, "logps/chosen": -474.19940185546875, "logps/rejected": -374.3406066894531, "loss": 0.6417, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.866194009780884, "rewards/margins": 0.8259710073471069, "rewards/rejected": 2.040222644805908, "step": 11660 }, { "epoch": 0.5418078833743443, "grad_norm": 15.857056617736816, "learning_rate": 4.0977606512218146e-07, "logits/chosen": -18.919652938842773, "logits/rejected": -18.2050724029541, "logps/chosen": -370.98712158203125, "logps/rejected": -258.7777099609375, "loss": 0.58, 
"rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.0190417766571045, "rewards/margins": 0.7206282615661621, "rewards/rejected": 1.2984135150909424, "step": 11670 }, { "epoch": 0.5422721574817773, "grad_norm": 9.114797592163086, "learning_rate": 4.096986861042759e-07, "logits/chosen": -18.698501586914062, "logits/rejected": -18.0648193359375, "logps/chosen": -443.4166564941406, "logps/rejected": -379.42291259765625, "loss": 0.4406, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.6168060302734375, "rewards/margins": 0.9451189041137695, "rewards/rejected": 1.671687364578247, "step": 11680 }, { "epoch": 0.5427364315892103, "grad_norm": 7.338729381561279, "learning_rate": 4.0962130708637043e-07, "logits/chosen": -18.38039779663086, "logits/rejected": -16.726613998413086, "logps/chosen": -476.6210021972656, "logps/rejected": -265.47100830078125, "loss": 0.6979, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.7299599647521973, "rewards/margins": 1.3234663009643555, "rewards/rejected": 1.4064936637878418, "step": 11690 }, { "epoch": 0.5432007056966432, "grad_norm": 27.646759033203125, "learning_rate": 4.0954392806846494e-07, "logits/chosen": -18.014440536499023, "logits/rejected": -17.27093505859375, "logps/chosen": -366.93084716796875, "logps/rejected": -305.4900207519531, "loss": 0.8328, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.4863877296447754, "rewards/margins": 0.3953622877597809, "rewards/rejected": 2.0910253524780273, "step": 11700 }, { "epoch": 0.5436649798040764, "grad_norm": 37.24195861816406, "learning_rate": 4.0946654905055945e-07, "logits/chosen": -18.371109008789062, "logits/rejected": -18.6290225982666, "logps/chosen": -248.713623046875, "logps/rejected": -262.01324462890625, "loss": 1.1716, "rewards/accuracies": 0.5, "rewards/chosen": 2.073117256164551, "rewards/margins": -0.5190724730491638, "rewards/rejected": 2.5921897888183594, "step": 11710 }, { "epoch": 0.5441292539115093, "grad_norm": 
35.079376220703125, "learning_rate": 4.0938917003265396e-07, "logits/chosen": -18.929765701293945, "logits/rejected": -18.647602081298828, "logps/chosen": -416.34722900390625, "logps/rejected": -276.6814880371094, "loss": 0.7765, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.3906524181365967, "rewards/margins": 0.6313168406486511, "rewards/rejected": 1.7593352794647217, "step": 11720 }, { "epoch": 0.5445935280189423, "grad_norm": 82.9154052734375, "learning_rate": 4.093117910147484e-07, "logits/chosen": -18.37375831604004, "logits/rejected": -18.250473022460938, "logps/chosen": -360.5862731933594, "logps/rejected": -402.912353515625, "loss": 0.8735, "rewards/accuracies": 0.5, "rewards/chosen": 2.6753716468811035, "rewards/margins": 0.01667105033993721, "rewards/rejected": 2.658700466156006, "step": 11730 }, { "epoch": 0.5450578021263754, "grad_norm": 92.80967712402344, "learning_rate": 4.0923441199684293e-07, "logits/chosen": -18.617191314697266, "logits/rejected": -17.795246124267578, "logps/chosen": -362.6806335449219, "logps/rejected": -235.13870239257812, "loss": 0.4394, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.714801073074341, "rewards/margins": 1.3321776390075684, "rewards/rejected": 1.3826231956481934, "step": 11740 }, { "epoch": 0.5455220762338084, "grad_norm": 118.5341567993164, "learning_rate": 4.091570329789374e-07, "logits/chosen": -18.260005950927734, "logits/rejected": -17.055662155151367, "logps/chosen": -512.1012573242188, "logps/rejected": -354.01104736328125, "loss": 0.7732, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.034575462341309, "rewards/margins": 0.9433609843254089, "rewards/rejected": 3.091214418411255, "step": 11750 }, { "epoch": 0.5459863503412414, "grad_norm": 133.91555786132812, "learning_rate": 4.090796539610319e-07, "logits/chosen": -18.789485931396484, "logits/rejected": -19.186952590942383, "logps/chosen": -450.33587646484375, "logps/rejected": -451.84771728515625, "loss": 0.864, 
"rewards/accuracies": 0.5, "rewards/chosen": 2.894362688064575, "rewards/margins": -0.20112602412700653, "rewards/rejected": 3.0954887866973877, "step": 11760 }, { "epoch": 0.5464506244486745, "grad_norm": 213.29197692871094, "learning_rate": 4.090022749431264e-07, "logits/chosen": -17.404817581176758, "logits/rejected": -18.872154235839844, "logps/chosen": -327.4304504394531, "logps/rejected": -422.90570068359375, "loss": 1.2562, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.2604169845581055, "rewards/margins": -0.7569466829299927, "rewards/rejected": 3.017364025115967, "step": 11770 }, { "epoch": 0.5469148985561075, "grad_norm": 38.69398498535156, "learning_rate": 4.0892489592522087e-07, "logits/chosen": -18.901634216308594, "logits/rejected": -17.85368537902832, "logps/chosen": -311.19647216796875, "logps/rejected": -235.8166046142578, "loss": 0.4572, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.351090431213379, "rewards/margins": 0.7861238718032837, "rewards/rejected": 1.5649669170379639, "step": 11780 }, { "epoch": 0.5473791726635405, "grad_norm": 44.7194938659668, "learning_rate": 4.088475169073154e-07, "logits/chosen": -19.41409683227539, "logits/rejected": -18.609743118286133, "logps/chosen": -331.45391845703125, "logps/rejected": -263.5159912109375, "loss": 0.5772, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.1198644638061523, "rewards/margins": 0.6362966299057007, "rewards/rejected": 1.4835678339004517, "step": 11790 }, { "epoch": 0.5478434467709736, "grad_norm": 123.9406967163086, "learning_rate": 4.087701378894099e-07, "logits/chosen": -19.736927032470703, "logits/rejected": -18.349109649658203, "logps/chosen": -434.9912109375, "logps/rejected": -312.01397705078125, "loss": 0.6699, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.4431588649749756, "rewards/margins": 0.5644610524177551, "rewards/rejected": 1.8786977529525757, "step": 11800 }, { "epoch": 0.5483077208784066, "grad_norm": 
90.15918731689453, "learning_rate": 4.086927588715044e-07, "logits/chosen": -19.022294998168945, "logits/rejected": -18.58065414428711, "logps/chosen": -478.2177734375, "logps/rejected": -450.0809020996094, "loss": 0.5305, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.759352207183838, "rewards/margins": 0.6288301348686218, "rewards/rejected": 3.1305222511291504, "step": 11810 }, { "epoch": 0.5487719949858396, "grad_norm": 60.6128044128418, "learning_rate": 4.086153798535989e-07, "logits/chosen": -18.62259292602539, "logits/rejected": -17.720470428466797, "logps/chosen": -342.88433837890625, "logps/rejected": -294.3423767089844, "loss": 0.6607, "rewards/accuracies": 0.5, "rewards/chosen": 2.3714632987976074, "rewards/margins": 0.5978862047195435, "rewards/rejected": 1.773577332496643, "step": 11820 }, { "epoch": 0.5492362690932727, "grad_norm": 52.73549270629883, "learning_rate": 4.085380008356934e-07, "logits/chosen": -19.00358772277832, "logits/rejected": -18.43922996520996, "logps/chosen": -463.9794921875, "logps/rejected": -383.21063232421875, "loss": 0.7441, "rewards/accuracies": 0.5, "rewards/chosen": 2.322481870651245, "rewards/margins": -0.031989168375730515, "rewards/rejected": 2.35447096824646, "step": 11830 }, { "epoch": 0.5497005432007057, "grad_norm": 61.9654426574707, "learning_rate": 4.0846062181778783e-07, "logits/chosen": -18.244739532470703, "logits/rejected": -18.0137996673584, "logps/chosen": -376.59234619140625, "logps/rejected": -309.7414855957031, "loss": 0.4276, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.63820219039917, "rewards/margins": 0.9234170913696289, "rewards/rejected": 1.7147852182388306, "step": 11840 }, { "epoch": 0.5501648173081387, "grad_norm": 114.66534423828125, "learning_rate": 4.0838324279988234e-07, "logits/chosen": -18.29695701599121, "logits/rejected": -17.58348846435547, "logps/chosen": -465.6294860839844, "logps/rejected": -417.5697326660156, "loss": 0.4849, "rewards/accuracies": 
0.800000011920929, "rewards/chosen": 3.5097153186798096, "rewards/margins": 1.0276848077774048, "rewards/rejected": 2.4820306301116943, "step": 11850 }, { "epoch": 0.5506290914155717, "grad_norm": 40.75973892211914, "learning_rate": 4.0830586378197686e-07, "logits/chosen": -18.4055118560791, "logits/rejected": -17.944936752319336, "logps/chosen": -268.3580017089844, "logps/rejected": -256.2364807128906, "loss": 0.7333, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.068666934967041, "rewards/margins": 0.14089611172676086, "rewards/rejected": 1.9277708530426025, "step": 11860 }, { "epoch": 0.5510933655230048, "grad_norm": 55.30391311645508, "learning_rate": 4.0822848476407137e-07, "logits/chosen": -17.76372718811035, "logits/rejected": -18.656951904296875, "logps/chosen": -326.37506103515625, "logps/rejected": -396.41290283203125, "loss": 1.433, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.7042319774627686, "rewards/margins": -0.8181840181350708, "rewards/rejected": 3.52241587638855, "step": 11870 }, { "epoch": 0.5515576396304378, "grad_norm": 123.73091125488281, "learning_rate": 4.081511057461658e-07, "logits/chosen": -18.47677993774414, "logits/rejected": -19.004375457763672, "logps/chosen": -478.716796875, "logps/rejected": -463.9867248535156, "loss": 1.4108, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": 2.236563205718994, "rewards/margins": -0.9913946390151978, "rewards/rejected": 3.2279579639434814, "step": 11880 }, { "epoch": 0.5520219137378708, "grad_norm": 278.9765930175781, "learning_rate": 4.0807372672826034e-07, "logits/chosen": -18.717832565307617, "logits/rejected": -18.204914093017578, "logps/chosen": -367.64190673828125, "logps/rejected": -351.98858642578125, "loss": 0.9143, "rewards/accuracies": 0.5, "rewards/chosen": 2.158815860748291, "rewards/margins": -0.10448899120092392, "rewards/rejected": 2.2633047103881836, "step": 11890 }, { "epoch": 0.5524861878453039, "grad_norm": 23.211341857910156, 
"learning_rate": 4.0799634771035485e-07, "logits/chosen": -19.86630630493164, "logits/rejected": -18.29788589477539, "logps/chosen": -492.5774841308594, "logps/rejected": -322.0983581542969, "loss": 0.4841, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.82379150390625, "rewards/margins": 1.3192623853683472, "rewards/rejected": 2.504528760910034, "step": 11900 }, { "epoch": 0.5529504619527369, "grad_norm": 44.21073913574219, "learning_rate": 4.0791896869244936e-07, "logits/chosen": -17.380512237548828, "logits/rejected": -17.190927505493164, "logps/chosen": -245.79299926757812, "logps/rejected": -237.33346557617188, "loss": 1.0464, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.393347144126892, "rewards/margins": -0.22221460938453674, "rewards/rejected": 1.6155617237091064, "step": 11910 }, { "epoch": 0.5534147360601699, "grad_norm": 57.03460693359375, "learning_rate": 4.0784158967454387e-07, "logits/chosen": -18.39377212524414, "logits/rejected": -17.59027862548828, "logps/chosen": -357.69586181640625, "logps/rejected": -242.2801513671875, "loss": 0.5167, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.7978529930114746, "rewards/margins": 1.161527156829834, "rewards/rejected": 1.6363260746002197, "step": 11920 }, { "epoch": 0.553879010167603, "grad_norm": 61.8339729309082, "learning_rate": 4.0776421065663833e-07, "logits/chosen": -18.629718780517578, "logits/rejected": -17.710308074951172, "logps/chosen": -372.88226318359375, "logps/rejected": -261.76519775390625, "loss": 0.6567, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.3625736236572266, "rewards/margins": 0.34609681367874146, "rewards/rejected": 2.0164763927459717, "step": 11930 }, { "epoch": 0.554343284275036, "grad_norm": 64.2043228149414, "learning_rate": 4.076868316387328e-07, "logits/chosen": -19.082042694091797, "logits/rejected": -18.82720947265625, "logps/chosen": -539.2269287109375, "logps/rejected": -381.4810485839844, "loss": 0.9826, 
"rewards/accuracies": 0.5, "rewards/chosen": 2.76275372505188, "rewards/margins": 0.2051277458667755, "rewards/rejected": 2.5576257705688477, "step": 11940 }, { "epoch": 0.554807558382469, "grad_norm": 49.71751403808594, "learning_rate": 4.076094526208273e-07, "logits/chosen": -17.383831024169922, "logits/rejected": -17.754169464111328, "logps/chosen": -269.4260559082031, "logps/rejected": -251.3515625, "loss": 0.9894, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": 1.4456942081451416, "rewards/margins": -0.3971037268638611, "rewards/rejected": 1.842797875404358, "step": 11950 }, { "epoch": 0.5552718324899021, "grad_norm": 46.604156494140625, "learning_rate": 4.075320736029218e-07, "logits/chosen": -20.10489273071289, "logits/rejected": -19.568309783935547, "logps/chosen": -373.9714050292969, "logps/rejected": -361.98480224609375, "loss": 0.7773, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.561149835586548, "rewards/margins": 0.18567529320716858, "rewards/rejected": 2.375474452972412, "step": 11960 }, { "epoch": 0.5557361065973351, "grad_norm": 86.88188934326172, "learning_rate": 4.074546945850163e-07, "logits/chosen": -19.78378677368164, "logits/rejected": -18.97222137451172, "logps/chosen": -366.3090515136719, "logps/rejected": -336.4764099121094, "loss": 1.0402, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.2806107997894287, "rewards/margins": 0.11227748543024063, "rewards/rejected": 2.1683335304260254, "step": 11970 }, { "epoch": 0.5562003807047681, "grad_norm": 180.52944946289062, "learning_rate": 4.073773155671108e-07, "logits/chosen": -18.02338409423828, "logits/rejected": -18.24044418334961, "logps/chosen": -258.39599609375, "logps/rejected": -294.99908447265625, "loss": 0.9739, "rewards/accuracies": 0.5, "rewards/chosen": 1.8199138641357422, "rewards/margins": -0.04129563644528389, "rewards/rejected": 1.861209511756897, "step": 11980 }, { "epoch": 0.5566646548122012, "grad_norm": 39.52962112426758, 
"learning_rate": 4.072999365492053e-07, "logits/chosen": -18.556278228759766, "logits/rejected": -18.366348266601562, "logps/chosen": -340.48760986328125, "logps/rejected": -322.4366760253906, "loss": 0.5834, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.7324602603912354, "rewards/margins": 0.6070472002029419, "rewards/rejected": 2.125412702560425, "step": 11990 }, { "epoch": 0.5571289289196342, "grad_norm": 50.878150939941406, "learning_rate": 4.072225575312998e-07, "logits/chosen": -18.42899513244629, "logits/rejected": -17.690673828125, "logps/chosen": -387.0579528808594, "logps/rejected": -317.0282287597656, "loss": 0.7237, "rewards/accuracies": 0.5, "rewards/chosen": 2.2509713172912598, "rewards/margins": 0.07025624811649323, "rewards/rejected": 2.1807150840759277, "step": 12000 }, { "epoch": 0.5575932030270672, "grad_norm": 27.552350997924805, "learning_rate": 4.071451785133943e-07, "logits/chosen": -17.727914810180664, "logits/rejected": -17.84158706665039, "logps/chosen": -250.35671997070312, "logps/rejected": -312.686279296875, "loss": 1.0359, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 1.3565597534179688, "rewards/margins": -0.3880470395088196, "rewards/rejected": 1.7446067333221436, "step": 12010 }, { "epoch": 0.5580574771345002, "grad_norm": 3.4643545150756836, "learning_rate": 4.070677994954888e-07, "logits/chosen": -19.12010383605957, "logits/rejected": -18.596811294555664, "logps/chosen": -406.423095703125, "logps/rejected": -398.0335998535156, "loss": 0.7583, "rewards/accuracies": 0.5, "rewards/chosen": 2.8690505027770996, "rewards/margins": 0.48334985971450806, "rewards/rejected": 2.3857004642486572, "step": 12020 }, { "epoch": 0.5585217512419333, "grad_norm": 130.3568878173828, "learning_rate": 4.069904204775833e-07, "logits/chosen": -19.15255355834961, "logits/rejected": -18.148517608642578, "logps/chosen": -598.0348510742188, "logps/rejected": -347.396240234375, "loss": 0.3028, "rewards/accuracies": 
0.800000011920929, "rewards/chosen": 3.3401970863342285, "rewards/margins": 1.561253309249878, "rewards/rejected": 1.7789437770843506, "step": 12030 }, { "epoch": 0.5589860253493663, "grad_norm": 33.3819694519043, "learning_rate": 4.0691304145967774e-07, "logits/chosen": -17.827314376831055, "logits/rejected": -16.959131240844727, "logps/chosen": -392.4300842285156, "logps/rejected": -254.6051483154297, "loss": 0.3779, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0823676586151123, "rewards/margins": 1.3535301685333252, "rewards/rejected": 1.728837251663208, "step": 12040 }, { "epoch": 0.5594502994567992, "grad_norm": 80.86640167236328, "learning_rate": 4.0683566244177225e-07, "logits/chosen": -19.166751861572266, "logits/rejected": -18.53162956237793, "logps/chosen": -378.439208984375, "logps/rejected": -295.9183654785156, "loss": 0.5606, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.7934765815734863, "rewards/margins": 0.5655763745307922, "rewards/rejected": 2.2279000282287598, "step": 12050 }, { "epoch": 0.5599145735642324, "grad_norm": 104.5242691040039, "learning_rate": 4.0675828342386676e-07, "logits/chosen": -18.778799057006836, "logits/rejected": -17.33318519592285, "logps/chosen": -324.699951171875, "logps/rejected": -198.06454467773438, "loss": 0.638, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.3177850246429443, "rewards/margins": 0.8150904774665833, "rewards/rejected": 1.5026947259902954, "step": 12060 }, { "epoch": 0.5603788476716653, "grad_norm": 75.0257339477539, "learning_rate": 4.066809044059613e-07, "logits/chosen": -18.3347110748291, "logits/rejected": -17.942319869995117, "logps/chosen": -414.9801330566406, "logps/rejected": -366.37469482421875, "loss": 0.8, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.578936815261841, "rewards/margins": 0.09902455657720566, "rewards/rejected": 2.479912281036377, "step": 12070 }, { "epoch": 0.5608431217790983, "grad_norm": 23.20246124267578, 
"learning_rate": 4.0660352538805573e-07, "logits/chosen": -17.90488624572754, "logits/rejected": -17.154722213745117, "logps/chosen": -320.70196533203125, "logps/rejected": -281.8077697753906, "loss": 1.0667, "rewards/accuracies": 0.5, "rewards/chosen": 2.101768732070923, "rewards/margins": 0.1596667319536209, "rewards/rejected": 1.9421018362045288, "step": 12080 }, { "epoch": 0.5613073958865314, "grad_norm": 38.97388458251953, "learning_rate": 4.0652614637015025e-07, "logits/chosen": -19.364665985107422, "logits/rejected": -18.359539031982422, "logps/chosen": -363.80303955078125, "logps/rejected": -270.29302978515625, "loss": 0.5039, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.5505778789520264, "rewards/margins": 0.6061784029006958, "rewards/rejected": 1.9443995952606201, "step": 12090 }, { "epoch": 0.5617716699939644, "grad_norm": 54.341217041015625, "learning_rate": 4.0644876735224476e-07, "logits/chosen": -18.833881378173828, "logits/rejected": -18.0764102935791, "logps/chosen": -371.343994140625, "logps/rejected": -373.485595703125, "loss": 0.8932, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.042116165161133, "rewards/margins": -0.048004817217588425, "rewards/rejected": 2.090121030807495, "step": 12100 }, { "epoch": 0.5622359441013974, "grad_norm": 75.35798645019531, "learning_rate": 4.0637138833433927e-07, "logits/chosen": -18.382667541503906, "logits/rejected": -18.740829467773438, "logps/chosen": -370.84454345703125, "logps/rejected": -422.6607360839844, "loss": 1.3749, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.944488286972046, "rewards/margins": -0.7072241902351379, "rewards/rejected": 3.651712417602539, "step": 12110 }, { "epoch": 0.5627002182088305, "grad_norm": 82.15587615966797, "learning_rate": 4.062940093164338e-07, "logits/chosen": -18.95342254638672, "logits/rejected": -19.122943878173828, "logps/chosen": -303.72894287109375, "logps/rejected": -242.4219970703125, "loss": 0.5256, 
"rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.442415952682495, "rewards/margins": 0.6597765684127808, "rewards/rejected": 1.782639503479004, "step": 12120 }, { "epoch": 0.5631644923162635, "grad_norm": 28.836605072021484, "learning_rate": 4.062166302985282e-07, "logits/chosen": -18.84714126586914, "logits/rejected": -18.176420211791992, "logps/chosen": -349.98236083984375, "logps/rejected": -250.4954833984375, "loss": 0.3743, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.63247013092041, "rewards/margins": 1.6428844928741455, "rewards/rejected": 0.9895856976509094, "step": 12130 }, { "epoch": 0.5636287664236965, "grad_norm": 25.062788009643555, "learning_rate": 4.061392512806227e-07, "logits/chosen": -19.702117919921875, "logits/rejected": -17.465435028076172, "logps/chosen": -433.1380310058594, "logps/rejected": -160.83779907226562, "loss": 0.4577, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.00223445892334, "rewards/margins": 1.6518512964248657, "rewards/rejected": 1.3503830432891846, "step": 12140 }, { "epoch": 0.5640930405311296, "grad_norm": 25.107521057128906, "learning_rate": 4.060618722627172e-07, "logits/chosen": -18.804523468017578, "logits/rejected": -18.4525203704834, "logps/chosen": -534.6972045898438, "logps/rejected": -461.951904296875, "loss": 0.4534, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.741156816482544, "rewards/margins": 0.9162809252738953, "rewards/rejected": 2.824876546859741, "step": 12150 }, { "epoch": 0.5645573146385626, "grad_norm": 47.403282165527344, "learning_rate": 4.059844932448117e-07, "logits/chosen": -18.574813842773438, "logits/rejected": -17.49618911743164, "logps/chosen": -432.54150390625, "logps/rejected": -328.1786193847656, "loss": 0.3814, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.000434398651123, "rewards/margins": 1.217522382736206, "rewards/rejected": 1.782912254333496, "step": 12160 }, { "epoch": 0.5650215887459956, "grad_norm": 
11.620194435119629, "learning_rate": 4.0590711422690623e-07, "logits/chosen": -19.52345848083496, "logits/rejected": -18.507034301757812, "logps/chosen": -437.02392578125, "logps/rejected": -267.9277038574219, "loss": 0.3352, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.2078487873077393, "rewards/margins": 1.3723760843276978, "rewards/rejected": 1.8354727029800415, "step": 12170 }, { "epoch": 0.5654858628534286, "grad_norm": 8.567330360412598, "learning_rate": 4.058297352090007e-07, "logits/chosen": -17.921016693115234, "logits/rejected": -18.013446807861328, "logps/chosen": -305.47015380859375, "logps/rejected": -398.72259521484375, "loss": 1.3897, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.455664873123169, "rewards/margins": -0.48785001039505005, "rewards/rejected": 2.9435150623321533, "step": 12180 }, { "epoch": 0.5659501369608617, "grad_norm": 75.32521057128906, "learning_rate": 4.057523561910952e-07, "logits/chosen": -19.14889907836914, "logits/rejected": -18.27301025390625, "logps/chosen": -438.8321838378906, "logps/rejected": -406.8865966796875, "loss": 0.7569, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.5419812202453613, "rewards/margins": 0.30891093611717224, "rewards/rejected": 2.233070135116577, "step": 12190 }, { "epoch": 0.5664144110682947, "grad_norm": 81.81602478027344, "learning_rate": 4.056749771731897e-07, "logits/chosen": -18.74629783630371, "logits/rejected": -18.07093048095703, "logps/chosen": -395.69989013671875, "logps/rejected": -271.39239501953125, "loss": 0.5368, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.1330738067626953, "rewards/margins": 0.9038882255554199, "rewards/rejected": 2.2291855812072754, "step": 12200 }, { "epoch": 0.5668786851757277, "grad_norm": 43.16640090942383, "learning_rate": 4.055975981552842e-07, "logits/chosen": -18.54425621032715, "logits/rejected": -18.52517318725586, "logps/chosen": -344.61444091796875, "logps/rejected": -351.70806884765625, 
"loss": 0.5821, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.589003801345825, "rewards/margins": 0.41270875930786133, "rewards/rejected": 2.1762948036193848, "step": 12210 }, { "epoch": 0.5673429592831608, "grad_norm": 1.452133297920227, "learning_rate": 4.0552021913737873e-07, "logits/chosen": -17.796588897705078, "logits/rejected": -17.542367935180664, "logps/chosen": -341.8514404296875, "logps/rejected": -277.334716796875, "loss": 0.7899, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1084694862365723, "rewards/margins": 1.2542672157287598, "rewards/rejected": 1.8542022705078125, "step": 12220 }, { "epoch": 0.5678072333905938, "grad_norm": 52.03776550292969, "learning_rate": 4.0544284011947314e-07, "logits/chosen": -17.982885360717773, "logits/rejected": -16.118192672729492, "logps/chosen": -480.81341552734375, "logps/rejected": -255.95083618164062, "loss": 0.2338, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.192411184310913, "rewards/margins": 1.8559987545013428, "rewards/rejected": 1.3364126682281494, "step": 12230 }, { "epoch": 0.5682715074980268, "grad_norm": 65.53633117675781, "learning_rate": 4.0536546110156765e-07, "logits/chosen": -17.775020599365234, "logits/rejected": -17.364055633544922, "logps/chosen": -349.9712829589844, "logps/rejected": -311.93896484375, "loss": 0.6856, "rewards/accuracies": 0.5, "rewards/chosen": 2.5665171146392822, "rewards/margins": 0.11029700934886932, "rewards/rejected": 2.4562201499938965, "step": 12240 }, { "epoch": 0.5687357816054599, "grad_norm": 71.61917877197266, "learning_rate": 4.0528808208366216e-07, "logits/chosen": -19.083019256591797, "logits/rejected": -18.85955047607422, "logps/chosen": -460.80804443359375, "logps/rejected": -486.41448974609375, "loss": 0.9751, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.813326597213745, "rewards/margins": -0.3552762269973755, "rewards/rejected": 3.16860294342041, "step": 12250 }, { "epoch": 0.5692000557128929, 
"grad_norm": 156.92242431640625, "learning_rate": 4.0521070306575667e-07, "logits/chosen": -18.522525787353516, "logits/rejected": -18.855022430419922, "logps/chosen": -276.73297119140625, "logps/rejected": -299.8730773925781, "loss": 1.1441, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 1.954263687133789, "rewards/margins": -0.48512405157089233, "rewards/rejected": 2.439387798309326, "step": 12260 }, { "epoch": 0.5696643298203259, "grad_norm": 80.46231842041016, "learning_rate": 4.051333240478512e-07, "logits/chosen": -19.495128631591797, "logits/rejected": -18.969070434570312, "logps/chosen": -347.88922119140625, "logps/rejected": -251.57666015625, "loss": 0.5666, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.4313395023345947, "rewards/margins": 0.7220286130905151, "rewards/rejected": 1.7093108892440796, "step": 12270 }, { "epoch": 0.570128603927759, "grad_norm": 242.1325225830078, "learning_rate": 4.0505594502994564e-07, "logits/chosen": -18.559814453125, "logits/rejected": -17.89669418334961, "logps/chosen": -434.079833984375, "logps/rejected": -358.61358642578125, "loss": 0.4739, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.3093504905700684, "rewards/margins": 1.1783746480941772, "rewards/rejected": 2.1309757232666016, "step": 12280 }, { "epoch": 0.570592878035192, "grad_norm": 71.65288543701172, "learning_rate": 4.0497856601204015e-07, "logits/chosen": -20.214479446411133, "logits/rejected": -19.388402938842773, "logps/chosen": -439.22479248046875, "logps/rejected": -374.94171142578125, "loss": 0.531, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.764275312423706, "rewards/margins": 0.5520831346511841, "rewards/rejected": 2.2121922969818115, "step": 12290 }, { "epoch": 0.571057152142625, "grad_norm": 1.9129959344863892, "learning_rate": 4.0490118699413466e-07, "logits/chosen": -19.93065071105957, "logits/rejected": -18.345796585083008, "logps/chosen": -366.2347717285156, "logps/rejected": 
-301.69012451171875, "loss": 0.6429, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.025373935699463, "rewards/margins": 0.8201414346694946, "rewards/rejected": 2.205232620239258, "step": 12300 }, { "epoch": 0.5715214262500581, "grad_norm": 32.229122161865234, "learning_rate": 4.048238079762292e-07, "logits/chosen": -18.623149871826172, "logits/rejected": -17.946781158447266, "logps/chosen": -421.84478759765625, "logps/rejected": -354.5328369140625, "loss": 0.5303, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.7971982955932617, "rewards/margins": 1.0340609550476074, "rewards/rejected": 1.7631372213363647, "step": 12310 }, { "epoch": 0.5719857003574911, "grad_norm": 98.74689483642578, "learning_rate": 4.0474642895832363e-07, "logits/chosen": -18.71029281616211, "logits/rejected": -17.98292350769043, "logps/chosen": -323.39996337890625, "logps/rejected": -304.831787109375, "loss": 0.7195, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.869084596633911, "rewards/margins": 0.6793709993362427, "rewards/rejected": 2.189713954925537, "step": 12320 }, { "epoch": 0.5724499744649241, "grad_norm": 59.637046813964844, "learning_rate": 4.046690499404181e-07, "logits/chosen": -18.941150665283203, "logits/rejected": -18.683916091918945, "logps/chosen": -400.99908447265625, "logps/rejected": -397.3261413574219, "loss": 0.7219, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0491483211517334, "rewards/margins": 0.4028252959251404, "rewards/rejected": 2.646322727203369, "step": 12330 }, { "epoch": 0.5729142485723571, "grad_norm": 106.8670883178711, "learning_rate": 4.045916709225126e-07, "logits/chosen": -19.907140731811523, "logits/rejected": -18.091747283935547, "logps/chosen": -478.29730224609375, "logps/rejected": -317.59503173828125, "loss": 0.3292, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.6763179302215576, "rewards/margins": 1.8242906332015991, "rewards/rejected": 1.852027177810669, "step": 12340 }, { 
"epoch": 0.5733785226797902, "grad_norm": 68.2850341796875, "learning_rate": 4.045142919046071e-07, "logits/chosen": -18.272991180419922, "logits/rejected": -17.96648597717285, "logps/chosen": -502.1324157714844, "logps/rejected": -429.70989990234375, "loss": 1.0719, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.266146421432495, "rewards/margins": -0.08135654777288437, "rewards/rejected": 3.347503185272217, "step": 12350 }, { "epoch": 0.5738427967872232, "grad_norm": 183.02171325683594, "learning_rate": 4.0443691288670163e-07, "logits/chosen": -19.758901596069336, "logits/rejected": -19.80980110168457, "logps/chosen": -466.43682861328125, "logps/rejected": -463.41448974609375, "loss": 0.5165, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.5035452842712402, "rewards/margins": 0.520074725151062, "rewards/rejected": 2.983470916748047, "step": 12360 }, { "epoch": 0.5743070708946562, "grad_norm": 10.126078605651855, "learning_rate": 4.0435953386879614e-07, "logits/chosen": -19.417804718017578, "logits/rejected": -19.06507682800293, "logps/chosen": -422.620361328125, "logps/rejected": -380.0851135253906, "loss": 0.8705, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.780891180038452, "rewards/margins": 0.1624658703804016, "rewards/rejected": 2.6184258460998535, "step": 12370 }, { "epoch": 0.5747713450020893, "grad_norm": 83.39470672607422, "learning_rate": 4.042821548508906e-07, "logits/chosen": -20.19200897216797, "logits/rejected": -19.048053741455078, "logps/chosen": -509.83935546875, "logps/rejected": -371.158447265625, "loss": 0.4848, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1357040405273438, "rewards/margins": 0.80943363904953, "rewards/rejected": 2.326270341873169, "step": 12380 }, { "epoch": 0.5752356191095223, "grad_norm": 47.252113342285156, "learning_rate": 4.042047758329851e-07, "logits/chosen": -18.10653305053711, "logits/rejected": -17.230907440185547, "logps/chosen": -363.3359375, 
"logps/rejected": -260.66552734375, "loss": 0.5581, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.3697264194488525, "rewards/margins": 0.725925087928772, "rewards/rejected": 1.6438013315200806, "step": 12390 }, { "epoch": 0.5756998932169552, "grad_norm": 21.21662712097168, "learning_rate": 4.041273968150796e-07, "logits/chosen": -19.305076599121094, "logits/rejected": -17.496427536010742, "logps/chosen": -407.8745422363281, "logps/rejected": -256.6689758300781, "loss": 0.3922, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.7436070442199707, "rewards/margins": 1.3920701742172241, "rewards/rejected": 1.3515368700027466, "step": 12400 }, { "epoch": 0.5761641673243884, "grad_norm": 15.116291046142578, "learning_rate": 4.0405001779717413e-07, "logits/chosen": -19.326190948486328, "logits/rejected": -18.010150909423828, "logps/chosen": -479.1421813964844, "logps/rejected": -336.10015869140625, "loss": 0.733, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.361628770828247, "rewards/margins": 0.5010361671447754, "rewards/rejected": 1.8605926036834717, "step": 12410 }, { "epoch": 0.5766284414318213, "grad_norm": 187.64346313476562, "learning_rate": 4.039726387792686e-07, "logits/chosen": -18.638751983642578, "logits/rejected": -18.588212966918945, "logps/chosen": -344.05438232421875, "logps/rejected": -310.218017578125, "loss": 0.9446, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.6910269260406494, "rewards/margins": 0.08472730964422226, "rewards/rejected": 2.60629940032959, "step": 12420 }, { "epoch": 0.5770927155392543, "grad_norm": 44.0516242980957, "learning_rate": 4.0389525976136305e-07, "logits/chosen": -18.395061492919922, "logits/rejected": -17.55962371826172, "logps/chosen": -399.96221923828125, "logps/rejected": -361.0752868652344, "loss": 0.6681, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.2785680294036865, "rewards/margins": 0.4525560438632965, "rewards/rejected": 1.8260120153427124, 
"step": 12430 }, { "epoch": 0.5775569896466874, "grad_norm": 166.5885467529297, "learning_rate": 4.0381788074345756e-07, "logits/chosen": -18.59990882873535, "logits/rejected": -18.29373550415039, "logps/chosen": -384.37860107421875, "logps/rejected": -317.1339416503906, "loss": 0.817, "rewards/accuracies": 0.5, "rewards/chosen": 2.297213315963745, "rewards/margins": 0.25184446573257446, "rewards/rejected": 2.0453686714172363, "step": 12440 }, { "epoch": 0.5780212637541204, "grad_norm": 35.98487854003906, "learning_rate": 4.0374050172555207e-07, "logits/chosen": -19.826501846313477, "logits/rejected": -18.772808074951172, "logps/chosen": -444.89910888671875, "logps/rejected": -437.74774169921875, "loss": 0.4629, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.5421223640441895, "rewards/margins": 1.056214451789856, "rewards/rejected": 2.485908031463623, "step": 12450 }, { "epoch": 0.5784855378615534, "grad_norm": 34.130958557128906, "learning_rate": 4.036631227076466e-07, "logits/chosen": -19.227291107177734, "logits/rejected": -17.99704933166504, "logps/chosen": -508.0084533691406, "logps/rejected": -299.8184814453125, "loss": 0.3561, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.875814437866211, "rewards/margins": 1.7634737491607666, "rewards/rejected": 2.1123404502868652, "step": 12460 }, { "epoch": 0.5789498119689865, "grad_norm": 63.83795166015625, "learning_rate": 4.035857436897411e-07, "logits/chosen": -18.391801834106445, "logits/rejected": -18.095949172973633, "logps/chosen": -406.62310791015625, "logps/rejected": -357.1697082519531, "loss": 0.5411, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.536860466003418, "rewards/margins": 0.5715085864067078, "rewards/rejected": 1.9653518199920654, "step": 12470 }, { "epoch": 0.5794140860764195, "grad_norm": 10.511324882507324, "learning_rate": 4.035083646718356e-07, "logits/chosen": -18.66691017150879, "logits/rejected": -17.725418090820312, "logps/chosen": 
-422.8766174316406, "logps/rejected": -313.9360656738281, "loss": 0.4776, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.035128593444824, "rewards/margins": 1.1705870628356934, "rewards/rejected": 1.8645414113998413, "step": 12480 }, { "epoch": 0.5798783601838525, "grad_norm": 15.2152738571167, "learning_rate": 4.0343098565393006e-07, "logits/chosen": -18.6673583984375, "logits/rejected": -17.579273223876953, "logps/chosen": -369.8424987792969, "logps/rejected": -225.95437622070312, "loss": 0.3808, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.364532470703125, "rewards/margins": 1.258920431137085, "rewards/rejected": 1.10561203956604, "step": 12490 }, { "epoch": 0.5803426342912855, "grad_norm": 165.8727264404297, "learning_rate": 4.0335360663602457e-07, "logits/chosen": -18.058521270751953, "logits/rejected": -17.824914932250977, "logps/chosen": -233.3307342529297, "logps/rejected": -289.87945556640625, "loss": 0.6992, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.6445157527923584, "rewards/margins": 0.19215543568134308, "rewards/rejected": 1.4523600339889526, "step": 12500 }, { "epoch": 0.5808069083987186, "grad_norm": 150.93203735351562, "learning_rate": 4.032762276181191e-07, "logits/chosen": -18.328262329101562, "logits/rejected": -18.160900115966797, "logps/chosen": -607.4513549804688, "logps/rejected": -484.075927734375, "loss": 0.6178, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.9399402141571045, "rewards/margins": 0.5159686207771301, "rewards/rejected": 3.423971652984619, "step": 12510 }, { "epoch": 0.5812711825061516, "grad_norm": 109.51398468017578, "learning_rate": 4.0319884860021354e-07, "logits/chosen": -18.64847183227539, "logits/rejected": -18.27420425415039, "logps/chosen": -458.99609375, "logps/rejected": -410.01409912109375, "loss": 0.5201, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.078835964202881, "rewards/margins": 0.8591777682304382, "rewards/rejected": 
2.219658136367798, "step": 12520 }, { "epoch": 0.5817354566135846, "grad_norm": 102.97442626953125, "learning_rate": 4.03121469582308e-07, "logits/chosen": -18.50942611694336, "logits/rejected": -17.647602081298828, "logps/chosen": -461.2190856933594, "logps/rejected": -398.26251220703125, "loss": 0.7551, "rewards/accuracies": 0.5, "rewards/chosen": 3.406886339187622, "rewards/margins": 0.53694748878479, "rewards/rejected": 2.869938611984253, "step": 12530 }, { "epoch": 0.5821997307210177, "grad_norm": 89.5099868774414, "learning_rate": 4.030440905644025e-07, "logits/chosen": -18.4670352935791, "logits/rejected": -18.154537200927734, "logps/chosen": -429.86199951171875, "logps/rejected": -422.35833740234375, "loss": 1.1471, "rewards/accuracies": 0.5, "rewards/chosen": 2.44884991645813, "rewards/margins": -0.42470017075538635, "rewards/rejected": 2.8735501766204834, "step": 12540 }, { "epoch": 0.5826640048284507, "grad_norm": 25.884536743164062, "learning_rate": 4.02966711546497e-07, "logits/chosen": -18.347362518310547, "logits/rejected": -17.778091430664062, "logps/chosen": -472.1031799316406, "logps/rejected": -390.623779296875, "loss": 0.4596, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.817553997039795, "rewards/margins": 0.9439159631729126, "rewards/rejected": 2.873638153076172, "step": 12550 }, { "epoch": 0.5831282789358837, "grad_norm": 111.0077133178711, "learning_rate": 4.0288933252859153e-07, "logits/chosen": -18.906871795654297, "logits/rejected": -17.618844985961914, "logps/chosen": -361.145263671875, "logps/rejected": -262.8110656738281, "loss": 0.4899, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.728347063064575, "rewards/margins": 0.875374436378479, "rewards/rejected": 1.8529726266860962, "step": 12560 }, { "epoch": 0.5835925530433168, "grad_norm": 119.61968994140625, "learning_rate": 4.0281195351068605e-07, "logits/chosen": -18.300085067749023, "logits/rejected": -17.829233169555664, "logps/chosen": 
-357.0736083984375, "logps/rejected": -306.70819091796875, "loss": 0.7987, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.4453985691070557, "rewards/margins": 0.5562955141067505, "rewards/rejected": 1.889102578163147, "step": 12570 }, { "epoch": 0.5840568271507498, "grad_norm": 36.03122329711914, "learning_rate": 4.0273457449278056e-07, "logits/chosen": -19.574954986572266, "logits/rejected": -18.886890411376953, "logps/chosen": -408.9800720214844, "logps/rejected": -319.3555603027344, "loss": 0.4647, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.5707507133483887, "rewards/margins": 0.9958900213241577, "rewards/rejected": 1.5748608112335205, "step": 12580 }, { "epoch": 0.5845211012581828, "grad_norm": 124.3583755493164, "learning_rate": 4.02657195474875e-07, "logits/chosen": -20.175294876098633, "logits/rejected": -19.605159759521484, "logps/chosen": -367.9007263183594, "logps/rejected": -377.09478759765625, "loss": 0.603, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.395848035812378, "rewards/margins": 0.38980430364608765, "rewards/rejected": 2.0060439109802246, "step": 12590 }, { "epoch": 0.5849853753656159, "grad_norm": 65.24016571044922, "learning_rate": 4.0257981645696953e-07, "logits/chosen": -17.60953140258789, "logits/rejected": -17.024703979492188, "logps/chosen": -252.69723510742188, "logps/rejected": -224.81982421875, "loss": 0.6718, "rewards/accuracies": 0.5, "rewards/chosen": 1.7449086904525757, "rewards/margins": 0.33421140909194946, "rewards/rejected": 1.410697340965271, "step": 12600 }, { "epoch": 0.5854496494730489, "grad_norm": 62.470455169677734, "learning_rate": 4.02502437439064e-07, "logits/chosen": -19.582645416259766, "logits/rejected": -19.082860946655273, "logps/chosen": -430.0619201660156, "logps/rejected": -369.5579528808594, "loss": 0.5571, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.388036012649536, "rewards/margins": 0.4659043848514557, "rewards/rejected": 
2.922131299972534, "step": 12610 }, { "epoch": 0.5859139235804819, "grad_norm": 219.55882263183594, "learning_rate": 4.024250584211585e-07, "logits/chosen": -19.432010650634766, "logits/rejected": -18.644367218017578, "logps/chosen": -409.16802978515625, "logps/rejected": -376.70892333984375, "loss": 0.9074, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 3.054231643676758, "rewards/margins": -0.018453503027558327, "rewards/rejected": 3.0726850032806396, "step": 12620 }, { "epoch": 0.586378197687915, "grad_norm": 55.44696807861328, "learning_rate": 4.02347679403253e-07, "logits/chosen": -19.798828125, "logits/rejected": -18.88897132873535, "logps/chosen": -444.22198486328125, "logps/rejected": -364.3595275878906, "loss": 0.7471, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.82277250289917, "rewards/margins": 0.22769060730934143, "rewards/rejected": 2.5950820446014404, "step": 12630 }, { "epoch": 0.586842471795348, "grad_norm": 59.48821258544922, "learning_rate": 4.0227030038534747e-07, "logits/chosen": -18.025163650512695, "logits/rejected": -17.318103790283203, "logps/chosen": -308.17626953125, "logps/rejected": -304.70916748046875, "loss": 0.8948, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.312180995941162, "rewards/margins": 0.33191150426864624, "rewards/rejected": 1.980269432067871, "step": 12640 }, { "epoch": 0.587306745902781, "grad_norm": 68.46221923828125, "learning_rate": 4.02192921367442e-07, "logits/chosen": -18.567407608032227, "logits/rejected": -18.40237045288086, "logps/chosen": -386.35888671875, "logps/rejected": -376.7729797363281, "loss": 0.6584, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.0428662300109863, "rewards/margins": 0.1487552374601364, "rewards/rejected": 1.8941112756729126, "step": 12650 }, { "epoch": 0.5877710200102141, "grad_norm": 26.979963302612305, "learning_rate": 4.021155423495365e-07, "logits/chosen": -18.1066837310791, "logits/rejected": -17.952346801757812, 
"logps/chosen": -362.83404541015625, "logps/rejected": -307.01641845703125, "loss": 0.7243, "rewards/accuracies": 0.5, "rewards/chosen": 2.174379825592041, "rewards/margins": 0.06598442792892456, "rewards/rejected": 2.1083953380584717, "step": 12660 }, { "epoch": 0.5882352941176471, "grad_norm": 33.83673858642578, "learning_rate": 4.02038163331631e-07, "logits/chosen": -18.078866958618164, "logits/rejected": -17.310970306396484, "logps/chosen": -375.47894287109375, "logps/rejected": -266.42730712890625, "loss": 0.5869, "rewards/accuracies": 0.5, "rewards/chosen": 2.28950834274292, "rewards/margins": 0.5323697924613953, "rewards/rejected": 1.7571386098861694, "step": 12670 }, { "epoch": 0.5886995682250801, "grad_norm": 104.86422729492188, "learning_rate": 4.019607843137255e-07, "logits/chosen": -18.430706024169922, "logits/rejected": -17.465160369873047, "logps/chosen": -408.32806396484375, "logps/rejected": -233.6565704345703, "loss": 0.4379, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2942817211151123, "rewards/margins": 1.564337134361267, "rewards/rejected": 1.7299445867538452, "step": 12680 }, { "epoch": 0.5891638423325131, "grad_norm": 282.2391357421875, "learning_rate": 4.0188340529581997e-07, "logits/chosen": -18.70846939086914, "logits/rejected": -19.19687843322754, "logps/chosen": -443.4723205566406, "logps/rejected": -500.6426696777344, "loss": 1.1064, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.898432970046997, "rewards/margins": -0.10975347459316254, "rewards/rejected": 3.0081865787506104, "step": 12690 }, { "epoch": 0.5896281164399462, "grad_norm": 4.724204063415527, "learning_rate": 4.018060262779145e-07, "logits/chosen": -19.54334831237793, "logits/rejected": -17.39069175720215, "logps/chosen": -501.47332763671875, "logps/rejected": -262.58038330078125, "loss": 0.5163, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.188363552093506, "rewards/margins": 1.5479894876480103, "rewards/rejected": 
1.6403745412826538, "step": 12700 }, { "epoch": 0.5900923905473792, "grad_norm": 45.68013381958008, "learning_rate": 4.0172864726000894e-07, "logits/chosen": -18.160594940185547, "logits/rejected": -17.819381713867188, "logps/chosen": -301.1807556152344, "logps/rejected": -210.1912078857422, "loss": 0.4494, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.166794538497925, "rewards/margins": 0.7943150997161865, "rewards/rejected": 1.3724792003631592, "step": 12710 }, { "epoch": 0.5905566646548122, "grad_norm": 70.62512969970703, "learning_rate": 4.0165126824210345e-07, "logits/chosen": -19.13167953491211, "logits/rejected": -17.89142608642578, "logps/chosen": -424.75018310546875, "logps/rejected": -343.69354248046875, "loss": 0.6597, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.126389265060425, "rewards/margins": 0.7989699244499207, "rewards/rejected": 2.3274195194244385, "step": 12720 }, { "epoch": 0.5910209387622453, "grad_norm": 113.57320404052734, "learning_rate": 4.0157388922419796e-07, "logits/chosen": -18.777917861938477, "logits/rejected": -17.503734588623047, "logps/chosen": -510.5281677246094, "logps/rejected": -366.2601013183594, "loss": 0.3984, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.5387790203094482, "rewards/margins": 0.9520440101623535, "rewards/rejected": 2.586735248565674, "step": 12730 }, { "epoch": 0.5914852128696783, "grad_norm": 49.13069152832031, "learning_rate": 4.014965102062924e-07, "logits/chosen": -19.218542098999023, "logits/rejected": -18.244142532348633, "logps/chosen": -506.0386657714844, "logps/rejected": -393.81256103515625, "loss": 0.7534, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.840366840362549, "rewards/margins": 0.7435040473937988, "rewards/rejected": 3.096863031387329, "step": 12740 }, { "epoch": 0.5919494869771112, "grad_norm": 35.37104034423828, "learning_rate": 4.0141913118838693e-07, "logits/chosen": -18.369861602783203, "logits/rejected": 
-18.19583511352539, "logps/chosen": -232.4120635986328, "logps/rejected": -180.36985778808594, "loss": 0.7115, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.9743492603302002, "rewards/margins": 0.49726563692092896, "rewards/rejected": 1.4770839214324951, "step": 12750 }, { "epoch": 0.5924137610845444, "grad_norm": 285.7608947753906, "learning_rate": 4.0134175217048144e-07, "logits/chosen": -19.458446502685547, "logits/rejected": -18.78104019165039, "logps/chosen": -419.99041748046875, "logps/rejected": -394.9994201660156, "loss": 1.2003, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.542102098464966, "rewards/margins": -0.011360740289092064, "rewards/rejected": 2.5534627437591553, "step": 12760 }, { "epoch": 0.5928780351919773, "grad_norm": 247.7766571044922, "learning_rate": 4.0126437315257595e-07, "logits/chosen": -19.54053497314453, "logits/rejected": -19.17823600769043, "logps/chosen": -424.6673889160156, "logps/rejected": -419.23492431640625, "loss": 0.8207, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.20825457572937, "rewards/margins": -0.05721733719110489, "rewards/rejected": 3.265471935272217, "step": 12770 }, { "epoch": 0.5933423092994103, "grad_norm": 26.744144439697266, "learning_rate": 4.0118699413467047e-07, "logits/chosen": -19.214540481567383, "logits/rejected": -18.834373474121094, "logps/chosen": -440.1288146972656, "logps/rejected": -407.4613342285156, "loss": 0.5408, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.2505943775177, "rewards/margins": 0.7722051739692688, "rewards/rejected": 1.4783892631530762, "step": 12780 }, { "epoch": 0.5938065834068434, "grad_norm": 102.19670867919922, "learning_rate": 4.011096151167649e-07, "logits/chosen": -18.422922134399414, "logits/rejected": -18.309289932250977, "logps/chosen": -316.641845703125, "logps/rejected": -331.8627014160156, "loss": 1.2141, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.8794994354248047, "rewards/margins": 
-0.3993537724018097, "rewards/rejected": 2.278853178024292, "step": 12790 }, { "epoch": 0.5942708575142764, "grad_norm": 44.02698516845703, "learning_rate": 4.010322360988594e-07, "logits/chosen": -19.049232482910156, "logits/rejected": -18.973588943481445, "logps/chosen": -432.40704345703125, "logps/rejected": -383.7164611816406, "loss": 0.6805, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.8825132846832275, "rewards/margins": 0.17214176058769226, "rewards/rejected": 2.710371732711792, "step": 12800 }, { "epoch": 0.5947351316217094, "grad_norm": 114.6498794555664, "learning_rate": 4.009548570809539e-07, "logits/chosen": -18.965476989746094, "logits/rejected": -18.20941925048828, "logps/chosen": -460.1477966308594, "logps/rejected": -382.21966552734375, "loss": 0.4984, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.320241928100586, "rewards/margins": 0.735636830329895, "rewards/rejected": 2.5846049785614014, "step": 12810 }, { "epoch": 0.5951994057291425, "grad_norm": 114.71180725097656, "learning_rate": 4.008774780630484e-07, "logits/chosen": -18.747772216796875, "logits/rejected": -17.817520141601562, "logps/chosen": -510.52294921875, "logps/rejected": -398.4593811035156, "loss": 0.5931, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.795412063598633, "rewards/margins": 0.6321941614151001, "rewards/rejected": 2.163217782974243, "step": 12820 }, { "epoch": 0.5956636798365755, "grad_norm": 98.8403549194336, "learning_rate": 4.008000990451429e-07, "logits/chosen": -18.21417999267578, "logits/rejected": -17.717313766479492, "logps/chosen": -334.9298400878906, "logps/rejected": -263.04693603515625, "loss": 0.7539, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.0158350467681885, "rewards/margins": 0.8321477770805359, "rewards/rejected": 2.1836869716644287, "step": 12830 }, { "epoch": 0.5961279539440085, "grad_norm": 17.023792266845703, "learning_rate": 4.007227200272374e-07, "logits/chosen": 
-17.826431274414062, "logits/rejected": -17.227882385253906, "logps/chosen": -269.3332824707031, "logps/rejected": -258.42608642578125, "loss": 0.9023, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.953102469444275, "rewards/margins": 0.5746240019798279, "rewards/rejected": 1.3784784078598022, "step": 12840 }, { "epoch": 0.5965922280514415, "grad_norm": 165.40939331054688, "learning_rate": 4.006453410093319e-07, "logits/chosen": -18.38170623779297, "logits/rejected": -17.544498443603516, "logps/chosen": -494.88214111328125, "logps/rejected": -307.31414794921875, "loss": 0.5325, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9814693927764893, "rewards/margins": 1.0393749475479126, "rewards/rejected": 1.9420945644378662, "step": 12850 }, { "epoch": 0.5970565021588746, "grad_norm": 18.871091842651367, "learning_rate": 4.005679619914264e-07, "logits/chosen": -19.285724639892578, "logits/rejected": -18.84847068786621, "logps/chosen": -318.22662353515625, "logps/rejected": -296.2166748046875, "loss": 0.7168, "rewards/accuracies": 0.5, "rewards/chosen": 2.497368335723877, "rewards/margins": 0.41502705216407776, "rewards/rejected": 2.082341432571411, "step": 12860 }, { "epoch": 0.5975207762663076, "grad_norm": 5.768385410308838, "learning_rate": 4.004905829735209e-07, "logits/chosen": -19.496997833251953, "logits/rejected": -18.844762802124023, "logps/chosen": -410.57574462890625, "logps/rejected": -308.19549560546875, "loss": 0.4385, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.98728084564209, "rewards/margins": 0.8667621612548828, "rewards/rejected": 2.120518684387207, "step": 12870 }, { "epoch": 0.5979850503737406, "grad_norm": 118.68412780761719, "learning_rate": 4.004132039556154e-07, "logits/chosen": -19.86583137512207, "logits/rejected": -20.16493034362793, "logps/chosen": -391.62835693359375, "logps/rejected": -287.3445129394531, "loss": 0.4725, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 
2.9906208515167236, "rewards/margins": 0.8304893374443054, "rewards/rejected": 2.1601309776306152, "step": 12880 }, { "epoch": 0.5984493244811737, "grad_norm": 5.298386573791504, "learning_rate": 4.003358249377099e-07, "logits/chosen": -19.511764526367188, "logits/rejected": -18.64667320251465, "logps/chosen": -255.97061157226562, "logps/rejected": -185.36849975585938, "loss": 0.4369, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.9302539825439453, "rewards/margins": 0.9694031476974487, "rewards/rejected": 0.9608508944511414, "step": 12890 }, { "epoch": 0.5989135985886067, "grad_norm": 53.77451705932617, "learning_rate": 4.0025844591980434e-07, "logits/chosen": -18.173282623291016, "logits/rejected": -16.657955169677734, "logps/chosen": -373.9248962402344, "logps/rejected": -218.8830108642578, "loss": 0.4674, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.6262640953063965, "rewards/margins": 1.3512197732925415, "rewards/rejected": 1.2750440835952759, "step": 12900 }, { "epoch": 0.5993778726960397, "grad_norm": 46.47693634033203, "learning_rate": 4.0018106690189885e-07, "logits/chosen": -19.15729522705078, "logits/rejected": -18.04776382446289, "logps/chosen": -324.217529296875, "logps/rejected": -257.16888427734375, "loss": 0.451, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.970731258392334, "rewards/margins": 0.8980321884155273, "rewards/rejected": 2.0726990699768066, "step": 12910 }, { "epoch": 0.5998421468034728, "grad_norm": 42.1044921875, "learning_rate": 4.0010368788399336e-07, "logits/chosen": -18.527542114257812, "logits/rejected": -17.22568702697754, "logps/chosen": -410.8045349121094, "logps/rejected": -225.60543823242188, "loss": 0.4986, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.015362024307251, "rewards/margins": 1.7902923822402954, "rewards/rejected": 1.225069284439087, "step": 12920 }, { "epoch": 0.6003064209109058, "grad_norm": 47.2899169921875, "learning_rate": 4.0002630886608787e-07, 
"logits/chosen": -18.62038803100586, "logits/rejected": -18.55796241760254, "logps/chosen": -501.30389404296875, "logps/rejected": -440.78887939453125, "loss": 0.5828, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.860435962677002, "rewards/margins": 1.0272823572158813, "rewards/rejected": 2.833153486251831, "step": 12930 }, { "epoch": 0.6007706950183388, "grad_norm": 66.98107147216797, "learning_rate": 3.9994892984818233e-07, "logits/chosen": -19.210384368896484, "logits/rejected": -17.74893569946289, "logps/chosen": -516.25048828125, "logps/rejected": -380.44915771484375, "loss": 0.5919, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.784738063812256, "rewards/margins": 0.7566668391227722, "rewards/rejected": 2.028071165084839, "step": 12940 }, { "epoch": 0.6012349691257719, "grad_norm": 165.894287109375, "learning_rate": 3.9987155083027684e-07, "logits/chosen": -19.329442977905273, "logits/rejected": -18.969404220581055, "logps/chosen": -475.15155029296875, "logps/rejected": -458.05072021484375, "loss": 1.0394, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.2252724170684814, "rewards/margins": -0.26311132311820984, "rewards/rejected": 3.4883835315704346, "step": 12950 }, { "epoch": 0.6016992432332049, "grad_norm": 142.61517333984375, "learning_rate": 3.9979417181237135e-07, "logits/chosen": -19.196535110473633, "logits/rejected": -19.03153419494629, "logps/chosen": -453.1788635253906, "logps/rejected": -400.0419616699219, "loss": 0.5788, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.9813942909240723, "rewards/margins": 0.6709381341934204, "rewards/rejected": 2.3104560375213623, "step": 12960 }, { "epoch": 0.6021635173406379, "grad_norm": 43.497806549072266, "learning_rate": 3.9971679279446586e-07, "logits/chosen": -17.95884895324707, "logits/rejected": -17.379405975341797, "logps/chosen": -333.19757080078125, "logps/rejected": -306.74737548828125, "loss": 1.0226, "rewards/accuracies": 0.5, 
"rewards/chosen": 2.2386934757232666, "rewards/margins": 0.02393503114581108, "rewards/rejected": 2.2147583961486816, "step": 12970 }, { "epoch": 0.602627791448071, "grad_norm": 63.57511901855469, "learning_rate": 3.996394137765604e-07, "logits/chosen": -20.03700065612793, "logits/rejected": -20.25035285949707, "logps/chosen": -345.3280944824219, "logps/rejected": -287.08441162109375, "loss": 0.5935, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.96067476272583, "rewards/margins": 0.7615587711334229, "rewards/rejected": 2.1991162300109863, "step": 12980 }, { "epoch": 0.603092065555504, "grad_norm": 37.42280578613281, "learning_rate": 3.995620347586548e-07, "logits/chosen": -18.207321166992188, "logits/rejected": -18.627628326416016, "logps/chosen": -341.3639831542969, "logps/rejected": -364.76739501953125, "loss": 1.564, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.0914406776428223, "rewards/margins": -0.5132739543914795, "rewards/rejected": 2.6047146320343018, "step": 12990 }, { "epoch": 0.603556339662937, "grad_norm": 95.82074737548828, "learning_rate": 3.994846557407493e-07, "logits/chosen": -19.980661392211914, "logits/rejected": -17.3665771484375, "logps/chosen": -560.2865600585938, "logps/rejected": -288.62408447265625, "loss": 0.429, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.371767044067383, "rewards/margins": 1.5527589321136475, "rewards/rejected": 1.8190078735351562, "step": 13000 }, { "epoch": 0.60402061377037, "grad_norm": 62.5849723815918, "learning_rate": 3.994072767228438e-07, "logits/chosen": -18.928760528564453, "logits/rejected": -17.74265480041504, "logps/chosen": -487.4535217285156, "logps/rejected": -317.90484619140625, "loss": 0.5435, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.159386396408081, "rewards/margins": 0.9289576411247253, "rewards/rejected": 2.230429172515869, "step": 13010 }, { "epoch": 0.6044848878778031, "grad_norm": 40.50516891479492, "learning_rate": 
3.993298977049383e-07, "logits/chosen": -19.5905818939209, "logits/rejected": -17.712078094482422, "logps/chosen": -517.738037109375, "logps/rejected": -310.62481689453125, "loss": 0.3706, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.450927257537842, "rewards/margins": 1.3976017236709595, "rewards/rejected": 2.053325891494751, "step": 13020 }, { "epoch": 0.6049491619852361, "grad_norm": 23.466087341308594, "learning_rate": 3.992525186870328e-07, "logits/chosen": -19.418476104736328, "logits/rejected": -18.003047943115234, "logps/chosen": -418.09405517578125, "logps/rejected": -269.3006896972656, "loss": 0.7607, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.3934757709503174, "rewards/margins": 0.4031447768211365, "rewards/rejected": 1.9903310537338257, "step": 13030 }, { "epoch": 0.6054134360926691, "grad_norm": 84.73445129394531, "learning_rate": 3.991751396691273e-07, "logits/chosen": -19.39322853088379, "logits/rejected": -19.399860382080078, "logps/chosen": -344.96331787109375, "logps/rejected": -333.50653076171875, "loss": 1.1147, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.5273327827453613, "rewards/margins": -0.3006350100040436, "rewards/rejected": 2.827968120574951, "step": 13040 }, { "epoch": 0.6058777102001022, "grad_norm": 65.00552368164062, "learning_rate": 3.990977606512218e-07, "logits/chosen": -19.197343826293945, "logits/rejected": -18.058120727539062, "logps/chosen": -420.97174072265625, "logps/rejected": -311.7048034667969, "loss": 0.3247, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.7028915882110596, "rewards/margins": 1.417191743850708, "rewards/rejected": 2.2856998443603516, "step": 13050 }, { "epoch": 0.6063419843075352, "grad_norm": 219.6822509765625, "learning_rate": 3.990203816333163e-07, "logits/chosen": -19.536758422851562, "logits/rejected": -19.020877838134766, "logps/chosen": -493.79541015625, "logps/rejected": -388.827392578125, "loss": 0.5345, "rewards/accuracies": 
0.699999988079071, "rewards/chosen": 2.9832193851470947, "rewards/margins": 0.620002269744873, "rewards/rejected": 2.363217353820801, "step": 13060 }, { "epoch": 0.6068062584149682, "grad_norm": 29.650680541992188, "learning_rate": 3.989430026154108e-07, "logits/chosen": -19.096445083618164, "logits/rejected": -18.821359634399414, "logps/chosen": -259.0181579589844, "logps/rejected": -272.3525695800781, "loss": 0.8326, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.093039035797119, "rewards/margins": 0.19838449358940125, "rewards/rejected": 1.8946545124053955, "step": 13070 }, { "epoch": 0.6072705325224013, "grad_norm": 105.30889129638672, "learning_rate": 3.9886562359750533e-07, "logits/chosen": -19.201557159423828, "logits/rejected": -18.1434268951416, "logps/chosen": -390.1901550292969, "logps/rejected": -293.07373046875, "loss": 0.5065, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.438194990158081, "rewards/margins": 1.2881834506988525, "rewards/rejected": 2.1500117778778076, "step": 13080 }, { "epoch": 0.6077348066298343, "grad_norm": 0.42047321796417236, "learning_rate": 3.9878824457959973e-07, "logits/chosen": -18.106536865234375, "logits/rejected": -17.730201721191406, "logps/chosen": -418.17633056640625, "logps/rejected": -411.45428466796875, "loss": 1.7061, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": 2.7760770320892334, "rewards/margins": -0.584877073764801, "rewards/rejected": 3.3609542846679688, "step": 13090 }, { "epoch": 0.6081990807372673, "grad_norm": 178.99752807617188, "learning_rate": 3.9871086556169424e-07, "logits/chosen": -19.36041831970215, "logits/rejected": -18.65920639038086, "logps/chosen": -374.0754089355469, "logps/rejected": -216.30508422851562, "loss": 0.6188, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.0077545642852783, "rewards/margins": 1.280517339706421, "rewards/rejected": 1.727237343788147, "step": 13100 }, { "epoch": 0.6086633548447004, "grad_norm": 
10.122864723205566, "learning_rate": 3.9863348654378876e-07, "logits/chosen": -19.501567840576172, "logits/rejected": -18.03731346130371, "logps/chosen": -452.13751220703125, "logps/rejected": -337.5434875488281, "loss": 0.3366, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.0909667015075684, "rewards/margins": 1.2618106603622437, "rewards/rejected": 1.829155683517456, "step": 13110 }, { "epoch": 0.6091276289521333, "grad_norm": 30.472148895263672, "learning_rate": 3.9855610752588327e-07, "logits/chosen": -18.81096649169922, "logits/rejected": -19.166913986206055, "logps/chosen": -440.9300842285156, "logps/rejected": -359.94952392578125, "loss": 0.7829, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.8549318313598633, "rewards/margins": 0.05439937114715576, "rewards/rejected": 2.800532341003418, "step": 13120 }, { "epoch": 0.6095919030595663, "grad_norm": 43.26150131225586, "learning_rate": 3.984787285079778e-07, "logits/chosen": -18.93385887145996, "logits/rejected": -17.57847785949707, "logps/chosen": -619.3340454101562, "logps/rejected": -350.72601318359375, "loss": 0.4258, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6465601921081543, "rewards/margins": 1.3306604623794556, "rewards/rejected": 2.3159000873565674, "step": 13130 }, { "epoch": 0.6100561771669994, "grad_norm": 28.556838989257812, "learning_rate": 3.984090873918628e-07, "logits/chosen": -19.32766342163086, "logits/rejected": -19.63229751586914, "logps/chosen": -464.54718017578125, "logps/rejected": -441.391845703125, "loss": 1.0157, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.282395839691162, "rewards/margins": -0.06936965882778168, "rewards/rejected": 3.3517658710479736, "step": 13140 }, { "epoch": 0.6105204512744324, "grad_norm": 100.14969635009766, "learning_rate": 3.983317083739573e-07, "logits/chosen": -19.87189292907715, "logits/rejected": -19.63239097595215, "logps/chosen": -395.4336853027344, "logps/rejected": -405.06536865234375, 
"loss": 0.5706, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.9938435554504395, "rewards/margins": 0.48246899247169495, "rewards/rejected": 2.5113747119903564, "step": 13150 }, { "epoch": 0.6109847253818654, "grad_norm": 26.86699867248535, "learning_rate": 3.982543293560518e-07, "logits/chosen": -18.50629997253418, "logits/rejected": -18.070581436157227, "logps/chosen": -468.3761291503906, "logps/rejected": -444.57501220703125, "loss": 0.6243, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.133450746536255, "rewards/margins": 0.470905601978302, "rewards/rejected": 2.6625447273254395, "step": 13160 }, { "epoch": 0.6114489994892984, "grad_norm": 51.3377571105957, "learning_rate": 3.981769503381463e-07, "logits/chosen": -20.158523559570312, "logits/rejected": -19.32699966430664, "logps/chosen": -388.962646484375, "logps/rejected": -355.8450012207031, "loss": 0.7282, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.5983998775482178, "rewards/margins": 0.09535326063632965, "rewards/rejected": 2.503046989440918, "step": 13170 }, { "epoch": 0.6119132735967315, "grad_norm": 1.5080385208129883, "learning_rate": 3.980995713202408e-07, "logits/chosen": -20.078716278076172, "logits/rejected": -19.696002960205078, "logps/chosen": -414.06353759765625, "logps/rejected": -408.7853088378906, "loss": 1.1533, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 3.045590877532959, "rewards/margins": -0.11015062034130096, "rewards/rejected": 3.1557414531707764, "step": 13180 }, { "epoch": 0.6123775477041645, "grad_norm": 275.0955810546875, "learning_rate": 3.980221923023353e-07, "logits/chosen": -18.13534927368164, "logits/rejected": -18.341718673706055, "logps/chosen": -313.2261962890625, "logps/rejected": -364.625244140625, "loss": 1.2577, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.4569671154022217, "rewards/margins": -0.4196593165397644, "rewards/rejected": 2.87662672996521, "step": 13190 }, { "epoch": 
0.6128418218115975, "grad_norm": 87.91848754882812, "learning_rate": 3.979448132844298e-07, "logits/chosen": -18.38921546936035, "logits/rejected": -17.715219497680664, "logps/chosen": -405.0972595214844, "logps/rejected": -311.46832275390625, "loss": 0.4342, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.2677664756774902, "rewards/margins": 0.9308661222457886, "rewards/rejected": 2.336900472640991, "step": 13200 }, { "epoch": 0.6133060959190306, "grad_norm": 106.20765686035156, "learning_rate": 3.978674342665243e-07, "logits/chosen": -20.37388038635254, "logits/rejected": -18.627029418945312, "logps/chosen": -524.861328125, "logps/rejected": -333.16119384765625, "loss": 0.3669, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.6875579357147217, "rewards/margins": 1.313652753829956, "rewards/rejected": 2.3739054203033447, "step": 13210 }, { "epoch": 0.6137703700264636, "grad_norm": 12.029024124145508, "learning_rate": 3.9779005524861873e-07, "logits/chosen": -18.290119171142578, "logits/rejected": -16.772340774536133, "logps/chosen": -444.3770446777344, "logps/rejected": -267.9037780761719, "loss": 0.5394, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.626866579055786, "rewards/margins": 1.3987687826156616, "rewards/rejected": 2.228097677230835, "step": 13220 }, { "epoch": 0.6142346441338966, "grad_norm": 85.1280746459961, "learning_rate": 3.9771267623071324e-07, "logits/chosen": -18.501684188842773, "logits/rejected": -18.129148483276367, "logps/chosen": -347.4100036621094, "logps/rejected": -341.54119873046875, "loss": 0.7002, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.9556636810302734, "rewards/margins": 0.5348371267318726, "rewards/rejected": 2.4208266735076904, "step": 13230 }, { "epoch": 0.6146989182413297, "grad_norm": 31.98225975036621, "learning_rate": 3.9763529721280776e-07, "logits/chosen": -19.435604095458984, "logits/rejected": -19.9465389251709, "logps/chosen": -312.09075927734375, 
"logps/rejected": -255.8191680908203, "loss": 0.7723, "rewards/accuracies": 0.5, "rewards/chosen": 2.2380359172821045, "rewards/margins": 0.2521399259567261, "rewards/rejected": 1.9858958721160889, "step": 13240 }, { "epoch": 0.6151631923487627, "grad_norm": 81.2197494506836, "learning_rate": 3.9755791819490227e-07, "logits/chosen": -18.684738159179688, "logits/rejected": -17.332550048828125, "logps/chosen": -426.22064208984375, "logps/rejected": -286.1634521484375, "loss": 0.3433, "rewards/accuracies": 1.0, "rewards/chosen": 2.4651074409484863, "rewards/margins": 1.0988425016403198, "rewards/rejected": 1.366265058517456, "step": 13250 }, { "epoch": 0.6156274664561957, "grad_norm": 21.171546936035156, "learning_rate": 3.974805391769968e-07, "logits/chosen": -21.026782989501953, "logits/rejected": -20.664836883544922, "logps/chosen": -363.89410400390625, "logps/rejected": -337.05035400390625, "loss": 0.6099, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.863410472869873, "rewards/margins": 0.4916132390499115, "rewards/rejected": 2.3717973232269287, "step": 13260 }, { "epoch": 0.6160917405636288, "grad_norm": 76.08307647705078, "learning_rate": 3.9740316015909124e-07, "logits/chosen": -18.31534767150879, "logits/rejected": -18.674510955810547, "logps/chosen": -347.25714111328125, "logps/rejected": -349.8414611816406, "loss": 1.0219, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.1939942836761475, "rewards/margins": -0.2370060235261917, "rewards/rejected": 2.431000232696533, "step": 13270 }, { "epoch": 0.6165560146710618, "grad_norm": 254.79859924316406, "learning_rate": 3.9732578114118575e-07, "logits/chosen": -18.19894790649414, "logits/rejected": -17.838687896728516, "logps/chosen": -344.79205322265625, "logps/rejected": -284.95098876953125, "loss": 0.8316, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.807260751724243, "rewards/margins": 0.7358095049858093, "rewards/rejected": 2.07145094871521, "step": 13280 }, { 
"epoch": 0.6170202887784948, "grad_norm": 129.0673065185547, "learning_rate": 3.9724840212328026e-07, "logits/chosen": -18.35637092590332, "logits/rejected": -18.666948318481445, "logps/chosen": -405.41375732421875, "logps/rejected": -427.40576171875, "loss": 1.0561, "rewards/accuracies": 0.5, "rewards/chosen": 2.3600146770477295, "rewards/margins": -0.09988340735435486, "rewards/rejected": 2.4598982334136963, "step": 13290 }, { "epoch": 0.6174845628859279, "grad_norm": 48.499507904052734, "learning_rate": 3.971710231053747e-07, "logits/chosen": -18.736431121826172, "logits/rejected": -16.849462509155273, "logps/chosen": -402.577392578125, "logps/rejected": -205.0681915283203, "loss": 0.4317, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.8935484886169434, "rewards/margins": 1.454356074333191, "rewards/rejected": 1.4391920566558838, "step": 13300 }, { "epoch": 0.6179488369933609, "grad_norm": 6.088326454162598, "learning_rate": 3.9709364408746923e-07, "logits/chosen": -18.127832412719727, "logits/rejected": -17.68848419189453, "logps/chosen": -557.8090209960938, "logps/rejected": -436.087158203125, "loss": 0.8514, "rewards/accuracies": 0.5, "rewards/chosen": 3.7896125316619873, "rewards/margins": 0.5871545076370239, "rewards/rejected": 3.202457904815674, "step": 13310 }, { "epoch": 0.6184131111007939, "grad_norm": 19.12546157836914, "learning_rate": 3.970162650695637e-07, "logits/chosen": -18.333385467529297, "logits/rejected": -17.873336791992188, "logps/chosen": -484.6839904785156, "logps/rejected": -400.33843994140625, "loss": 0.6625, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5119540691375732, "rewards/margins": 0.3518848419189453, "rewards/rejected": 3.160068988800049, "step": 13320 }, { "epoch": 0.6188773852082269, "grad_norm": 57.96425247192383, "learning_rate": 3.969388860516582e-07, "logits/chosen": -18.522090911865234, "logits/rejected": -18.340572357177734, "logps/chosen": -384.739990234375, "logps/rejected": 
-340.682373046875, "loss": 0.8212, "rewards/accuracies": 0.5, "rewards/chosen": 3.213593006134033, "rewards/margins": 0.10390792042016983, "rewards/rejected": 3.109684705734253, "step": 13330 }, { "epoch": 0.61934165931566, "grad_norm": 1.205481767654419, "learning_rate": 3.968615070337527e-07, "logits/chosen": -19.040477752685547, "logits/rejected": -18.460498809814453, "logps/chosen": -409.7450256347656, "logps/rejected": -328.62554931640625, "loss": 0.9176, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.926920175552368, "rewards/margins": 0.4904797673225403, "rewards/rejected": 2.4364402294158936, "step": 13340 }, { "epoch": 0.619805933423093, "grad_norm": 123.13993072509766, "learning_rate": 3.967841280158472e-07, "logits/chosen": -18.022043228149414, "logits/rejected": -17.985599517822266, "logps/chosen": -327.325927734375, "logps/rejected": -379.70550537109375, "loss": 0.903, "rewards/accuracies": 0.5, "rewards/chosen": 2.3354408740997314, "rewards/margins": -0.11685576289892197, "rewards/rejected": 2.452296257019043, "step": 13350 }, { "epoch": 0.620270207530526, "grad_norm": 120.74076080322266, "learning_rate": 3.9670674899794173e-07, "logits/chosen": -18.12651824951172, "logits/rejected": -17.572528839111328, "logps/chosen": -458.5184020996094, "logps/rejected": -343.4588623046875, "loss": 0.7229, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.292937755584717, "rewards/margins": 0.732090175151825, "rewards/rejected": 2.560847759246826, "step": 13360 }, { "epoch": 0.6207344816379591, "grad_norm": 144.53472900390625, "learning_rate": 3.966293699800362e-07, "logits/chosen": -19.11505889892578, "logits/rejected": -18.383655548095703, "logps/chosen": -408.3934020996094, "logps/rejected": -371.94140625, "loss": 0.6152, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.8922762870788574, "rewards/margins": 0.5938811898231506, "rewards/rejected": 2.2983951568603516, "step": 13370 }, { "epoch": 0.6211987557453921, 
"grad_norm": 118.1746597290039, "learning_rate": 3.965519909621307e-07, "logits/chosen": -18.403514862060547, "logits/rejected": -17.877216339111328, "logps/chosen": -462.33795166015625, "logps/rejected": -455.948486328125, "loss": 0.9494, "rewards/accuracies": 0.5, "rewards/chosen": 3.1991477012634277, "rewards/margins": 0.0034747957251966, "rewards/rejected": 3.1956729888916016, "step": 13380 }, { "epoch": 0.6216630298528251, "grad_norm": 190.90267944335938, "learning_rate": 3.964746119442252e-07, "logits/chosen": -18.37453842163086, "logits/rejected": -18.57246208190918, "logps/chosen": -379.2425231933594, "logps/rejected": -383.7126770019531, "loss": 1.2471, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.498469591140747, "rewards/margins": -0.307245671749115, "rewards/rejected": 2.8057150840759277, "step": 13390 }, { "epoch": 0.6221273039602582, "grad_norm": 129.7191925048828, "learning_rate": 3.9639723292631967e-07, "logits/chosen": -19.23407745361328, "logits/rejected": -18.695552825927734, "logps/chosen": -426.90960693359375, "logps/rejected": -381.10211181640625, "loss": 0.4142, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.008037805557251, "rewards/margins": 0.9321457743644714, "rewards/rejected": 2.0758919715881348, "step": 13400 }, { "epoch": 0.6225915780676912, "grad_norm": 55.683650970458984, "learning_rate": 3.963198539084142e-07, "logits/chosen": -17.83840560913086, "logits/rejected": -17.914899826049805, "logps/chosen": -339.13421630859375, "logps/rejected": -314.3101501464844, "loss": 0.6869, "rewards/accuracies": 0.5, "rewards/chosen": 2.1402881145477295, "rewards/margins": 0.13293901085853577, "rewards/rejected": 2.0073490142822266, "step": 13410 }, { "epoch": 0.6230558521751242, "grad_norm": 85.53939819335938, "learning_rate": 3.9624247489050864e-07, "logits/chosen": -18.405569076538086, "logits/rejected": -17.802703857421875, "logps/chosen": -478.3894958496094, "logps/rejected": -420.568359375, "loss": 0.7145, 
"rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.1141669750213623, "rewards/margins": 0.425835520029068, "rewards/rejected": 2.688331365585327, "step": 13420 }, { "epoch": 0.6235201262825573, "grad_norm": 7.121815204620361, "learning_rate": 3.9616509587260315e-07, "logits/chosen": -18.723445892333984, "logits/rejected": -18.192920684814453, "logps/chosen": -362.9900207519531, "logps/rejected": -286.3084411621094, "loss": 0.5083, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.294304370880127, "rewards/margins": 1.2675713300704956, "rewards/rejected": 2.0267326831817627, "step": 13430 }, { "epoch": 0.6239844003899903, "grad_norm": 80.53764343261719, "learning_rate": 3.9608771685469766e-07, "logits/chosen": -18.418926239013672, "logits/rejected": -17.06195831298828, "logps/chosen": -410.9583435058594, "logps/rejected": -291.7464599609375, "loss": 0.5867, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.741619348526001, "rewards/margins": 0.8019595146179199, "rewards/rejected": 1.9396597146987915, "step": 13440 }, { "epoch": 0.6244486744974233, "grad_norm": 260.5, "learning_rate": 3.960103378367922e-07, "logits/chosen": -18.457275390625, "logits/rejected": -17.864133834838867, "logps/chosen": -259.1231994628906, "logps/rejected": -266.199951171875, "loss": 1.1652, "rewards/accuracies": 0.5, "rewards/chosen": 1.921708106994629, "rewards/margins": -0.09708791971206665, "rewards/rejected": 2.01879620552063, "step": 13450 }, { "epoch": 0.6249129486048564, "grad_norm": 118.76458740234375, "learning_rate": 3.959329588188867e-07, "logits/chosen": -18.053552627563477, "logits/rejected": -18.076465606689453, "logps/chosen": -369.4735107421875, "logps/rejected": -373.05755615234375, "loss": 0.9141, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.760647773742676, "rewards/margins": 0.04778020456433296, "rewards/rejected": 2.712867259979248, "step": 13460 }, { "epoch": 0.6253772227122893, "grad_norm": 64.34746551513672, 
"learning_rate": 3.9585557980098114e-07, "logits/chosen": -18.902572631835938, "logits/rejected": -18.88669204711914, "logps/chosen": -315.1275329589844, "logps/rejected": -329.41693115234375, "loss": 0.8312, "rewards/accuracies": 0.5, "rewards/chosen": 2.0100045204162598, "rewards/margins": 0.059853196144104004, "rewards/rejected": 1.9501516819000244, "step": 13470 }, { "epoch": 0.6258414968197223, "grad_norm": 130.15255737304688, "learning_rate": 3.9577820078307566e-07, "logits/chosen": -19.295265197753906, "logits/rejected": -18.8048152923584, "logps/chosen": -365.49530029296875, "logps/rejected": -352.25360107421875, "loss": 0.7878, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.4560835361480713, "rewards/margins": 0.4547053277492523, "rewards/rejected": 2.001378059387207, "step": 13480 }, { "epoch": 0.6263057709271554, "grad_norm": 108.58647918701172, "learning_rate": 3.957008217651701e-07, "logits/chosen": -18.443265914916992, "logits/rejected": -17.390539169311523, "logps/chosen": -408.14251708984375, "logps/rejected": -301.5553894042969, "loss": 0.4887, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.1264569759368896, "rewards/margins": 0.7684643864631653, "rewards/rejected": 2.357992649078369, "step": 13490 }, { "epoch": 0.6267700450345884, "grad_norm": 29.932680130004883, "learning_rate": 3.956234427472646e-07, "logits/chosen": -18.691280364990234, "logits/rejected": -17.619144439697266, "logps/chosen": -354.2419128417969, "logps/rejected": -267.378173828125, "loss": 0.5408, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.1019973754882812, "rewards/margins": 0.9632282257080078, "rewards/rejected": 2.1387693881988525, "step": 13500 }, { "epoch": 0.6272343191420214, "grad_norm": 164.50640869140625, "learning_rate": 3.9554606372935914e-07, "logits/chosen": -19.991037368774414, "logits/rejected": -19.259479522705078, "logps/chosen": -392.2551574707031, "logps/rejected": -365.72052001953125, "loss": 0.5748, 
"rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.532099962234497, "rewards/margins": 0.5036624670028687, "rewards/rejected": 2.028437376022339, "step": 13510 }, { "epoch": 0.6276985932494544, "grad_norm": 82.17671966552734, "learning_rate": 3.954686847114536e-07, "logits/chosen": -18.0537109375, "logits/rejected": -18.401464462280273, "logps/chosen": -372.9224548339844, "logps/rejected": -356.6858825683594, "loss": 0.5514, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.3067383766174316, "rewards/margins": 0.47907066345214844, "rewards/rejected": 1.8276678323745728, "step": 13520 }, { "epoch": 0.6281628673568875, "grad_norm": 222.72634887695312, "learning_rate": 3.953913056935481e-07, "logits/chosen": -18.933568954467773, "logits/rejected": -18.286334991455078, "logps/chosen": -523.1468505859375, "logps/rejected": -426.71923828125, "loss": 1.105, "rewards/accuracies": 0.5, "rewards/chosen": 4.012204170227051, "rewards/margins": 0.3911166787147522, "rewards/rejected": 3.6210873126983643, "step": 13530 }, { "epoch": 0.6286271414643205, "grad_norm": 63.99734878540039, "learning_rate": 3.953139266756426e-07, "logits/chosen": -19.09639549255371, "logits/rejected": -18.403656005859375, "logps/chosen": -430.98931884765625, "logps/rejected": -445.75933837890625, "loss": 0.5499, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.4188365936279297, "rewards/margins": 0.5027157664299011, "rewards/rejected": 2.916121006011963, "step": 13540 }, { "epoch": 0.6290914155717535, "grad_norm": 75.96955871582031, "learning_rate": 3.9523654765773713e-07, "logits/chosen": -19.495471954345703, "logits/rejected": -19.211132049560547, "logps/chosen": -337.13433837890625, "logps/rejected": -277.68121337890625, "loss": 0.5471, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.4218268394470215, "rewards/margins": 0.7590649724006653, "rewards/rejected": 1.6627616882324219, "step": 13550 }, { "epoch": 0.6295556896791866, "grad_norm": 
198.14097595214844, "learning_rate": 3.9515916863983164e-07, "logits/chosen": -18.948450088500977, "logits/rejected": -17.532169342041016, "logps/chosen": -422.3877868652344, "logps/rejected": -259.2266540527344, "loss": 0.3779, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0881597995758057, "rewards/margins": 1.2890024185180664, "rewards/rejected": 1.7991573810577393, "step": 13560 }, { "epoch": 0.6300199637866196, "grad_norm": 25.19651985168457, "learning_rate": 3.950817896219261e-07, "logits/chosen": -19.59340476989746, "logits/rejected": -18.59876251220703, "logps/chosen": -469.621337890625, "logps/rejected": -321.24346923828125, "loss": 0.7673, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.124912738800049, "rewards/margins": 0.8239358067512512, "rewards/rejected": 2.3009772300720215, "step": 13570 }, { "epoch": 0.6304842378940526, "grad_norm": 212.8321075439453, "learning_rate": 3.950044106040206e-07, "logits/chosen": -18.485734939575195, "logits/rejected": -17.724796295166016, "logps/chosen": -294.788818359375, "logps/rejected": -304.81561279296875, "loss": 0.9503, "rewards/accuracies": 0.5, "rewards/chosen": 2.9551472663879395, "rewards/margins": 0.4840773940086365, "rewards/rejected": 2.471069812774658, "step": 13580 }, { "epoch": 0.6309485120014857, "grad_norm": 9.386617660522461, "learning_rate": 3.9492703158611507e-07, "logits/chosen": -18.838579177856445, "logits/rejected": -17.036479949951172, "logps/chosen": -451.6573181152344, "logps/rejected": -572.908935546875, "loss": 0.3483, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.886526107788086, "rewards/margins": 2.222853183746338, "rewards/rejected": 1.6636730432510376, "step": 13590 }, { "epoch": 0.6314127861089187, "grad_norm": 29.032039642333984, "learning_rate": 3.948496525682096e-07, "logits/chosen": -18.958961486816406, "logits/rejected": -18.28433609008789, "logps/chosen": -444.78509521484375, "logps/rejected": -345.58172607421875, "loss": 0.5246, 
"rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9268081188201904, "rewards/margins": 0.5982614755630493, "rewards/rejected": 2.3285465240478516, "step": 13600 }, { "epoch": 0.6318770602163517, "grad_norm": 78.29206848144531, "learning_rate": 3.947722735503041e-07, "logits/chosen": -18.551244735717773, "logits/rejected": -16.82667350769043, "logps/chosen": -377.0480041503906, "logps/rejected": -185.68606567382812, "loss": 0.341, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.3666458129882812, "rewards/margins": 1.3966634273529053, "rewards/rejected": 0.9699821472167969, "step": 13610 }, { "epoch": 0.6323413343237848, "grad_norm": 66.28477478027344, "learning_rate": 3.9469489453239855e-07, "logits/chosen": -18.84200096130371, "logits/rejected": -17.458446502685547, "logps/chosen": -365.3876953125, "logps/rejected": -246.4418182373047, "loss": 0.599, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0486338138580322, "rewards/margins": 1.0574971437454224, "rewards/rejected": 1.9911365509033203, "step": 13620 }, { "epoch": 0.6328056084312178, "grad_norm": 56.26999282836914, "learning_rate": 3.9461751551449306e-07, "logits/chosen": -19.28518295288086, "logits/rejected": -19.810026168823242, "logps/chosen": -427.8236389160156, "logps/rejected": -450.94903564453125, "loss": 0.7649, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.129960536956787, "rewards/margins": 0.22318914532661438, "rewards/rejected": 2.906771421432495, "step": 13630 }, { "epoch": 0.6332698825386508, "grad_norm": 91.96611022949219, "learning_rate": 3.9454013649658757e-07, "logits/chosen": -18.635604858398438, "logits/rejected": -17.745458602905273, "logps/chosen": -316.43267822265625, "logps/rejected": -267.70782470703125, "loss": 0.5734, "rewards/accuracies": 0.5, "rewards/chosen": 3.016282558441162, "rewards/margins": 1.0540971755981445, "rewards/rejected": 1.9621856212615967, "step": 13640 }, { "epoch": 0.6337341566460839, "grad_norm": 
12.18281078338623, "learning_rate": 3.944627574786821e-07, "logits/chosen": -19.39040184020996, "logits/rejected": -17.88357925415039, "logps/chosen": -406.758056640625, "logps/rejected": -276.41888427734375, "loss": 0.4323, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.1154634952545166, "rewards/margins": 1.1564193964004517, "rewards/rejected": 1.9590438604354858, "step": 13650 }, { "epoch": 0.6341984307535169, "grad_norm": 50.89741897583008, "learning_rate": 3.943853784607766e-07, "logits/chosen": -18.255674362182617, "logits/rejected": -18.048076629638672, "logps/chosen": -371.3178405761719, "logps/rejected": -328.4539794921875, "loss": 0.4591, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.28619647026062, "rewards/margins": 0.6644331812858582, "rewards/rejected": 2.621763229370117, "step": 13660 }, { "epoch": 0.6346627048609499, "grad_norm": 75.1492919921875, "learning_rate": 3.9430799944287105e-07, "logits/chosen": -18.53322410583496, "logits/rejected": -17.776981353759766, "logps/chosen": -427.7547912597656, "logps/rejected": -380.333740234375, "loss": 0.8288, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.849412441253662, "rewards/margins": 0.05990755558013916, "rewards/rejected": 2.7895047664642334, "step": 13670 }, { "epoch": 0.6351269789683829, "grad_norm": 256.5301513671875, "learning_rate": 3.942306204249655e-07, "logits/chosen": -18.222780227661133, "logits/rejected": -17.055076599121094, "logps/chosen": -458.91082763671875, "logps/rejected": -335.4622497558594, "loss": 0.6541, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7768120765686035, "rewards/margins": 1.5008519887924194, "rewards/rejected": 2.2759594917297363, "step": 13680 }, { "epoch": 0.635591253075816, "grad_norm": 174.17543029785156, "learning_rate": 3.9415324140706e-07, "logits/chosen": -19.183521270751953, "logits/rejected": -17.745332717895508, "logps/chosen": -482.17059326171875, "logps/rejected": -303.70086669921875, "loss": 
0.3149, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.102846145629883, "rewards/margins": 1.5328446626663208, "rewards/rejected": 2.5700016021728516, "step": 13690 }, { "epoch": 0.636055527183249, "grad_norm": 102.35387420654297, "learning_rate": 3.9407586238915453e-07, "logits/chosen": -18.278011322021484, "logits/rejected": -17.75088882446289, "logps/chosen": -399.6344299316406, "logps/rejected": -381.77606201171875, "loss": 0.7534, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.057248592376709, "rewards/margins": 0.5914028882980347, "rewards/rejected": 2.4658455848693848, "step": 13700 }, { "epoch": 0.636519801290682, "grad_norm": 109.75949096679688, "learning_rate": 3.9399848337124905e-07, "logits/chosen": -18.86011505126953, "logits/rejected": -17.907983779907227, "logps/chosen": -336.9436950683594, "logps/rejected": -297.16632080078125, "loss": 0.4834, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.107783555984497, "rewards/margins": 0.6859371066093445, "rewards/rejected": 2.421846389770508, "step": 13710 }, { "epoch": 0.6369840753981151, "grad_norm": 12.800745010375977, "learning_rate": 3.939211043533435e-07, "logits/chosen": -18.840946197509766, "logits/rejected": -17.775888442993164, "logps/chosen": -424.32733154296875, "logps/rejected": -339.62347412109375, "loss": 0.5169, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.202805280685425, "rewards/margins": 0.9753028154373169, "rewards/rejected": 2.2275023460388184, "step": 13720 }, { "epoch": 0.6374483495055481, "grad_norm": 154.8257598876953, "learning_rate": 3.93843725335438e-07, "logits/chosen": -18.18422508239746, "logits/rejected": -17.393510818481445, "logps/chosen": -488.78887939453125, "logps/rejected": -340.5085144042969, "loss": 0.7423, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4822208881378174, "rewards/margins": 1.1407482624053955, "rewards/rejected": 2.341472625732422, "step": 13730 }, { "epoch": 0.6379126236129811, 
"grad_norm": 102.21741485595703, "learning_rate": 3.937663463175325e-07, "logits/chosen": -19.156997680664062, "logits/rejected": -18.47244644165039, "logps/chosen": -435.2245178222656, "logps/rejected": -350.4473571777344, "loss": 0.5625, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.791311740875244, "rewards/margins": 0.866084098815918, "rewards/rejected": 1.9252277612686157, "step": 13740 }, { "epoch": 0.6383768977204142, "grad_norm": 27.580549240112305, "learning_rate": 3.9368896729962704e-07, "logits/chosen": -18.447757720947266, "logits/rejected": -17.429214477539062, "logps/chosen": -416.41925048828125, "logps/rejected": -279.392333984375, "loss": 0.3346, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.4412436485290527, "rewards/margins": 1.627671241760254, "rewards/rejected": 1.813572645187378, "step": 13750 }, { "epoch": 0.6388411718278472, "grad_norm": 168.5937042236328, "learning_rate": 3.9361158828172155e-07, "logits/chosen": -18.91408920288086, "logits/rejected": -17.49246597290039, "logps/chosen": -359.49957275390625, "logps/rejected": -221.66043090820312, "loss": 0.5102, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.998551607131958, "rewards/margins": 1.254739761352539, "rewards/rejected": 1.7438122034072876, "step": 13760 }, { "epoch": 0.6393054459352802, "grad_norm": 180.5157470703125, "learning_rate": 3.93534209263816e-07, "logits/chosen": -18.857717514038086, "logits/rejected": -18.95323944091797, "logps/chosen": -469.00079345703125, "logps/rejected": -439.0196228027344, "loss": 1.3281, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.2199912071228027, "rewards/margins": -0.19108419120311737, "rewards/rejected": 3.4110755920410156, "step": 13770 }, { "epoch": 0.6397697200427133, "grad_norm": 134.60934448242188, "learning_rate": 3.9345683024591047e-07, "logits/chosen": -18.089757919311523, "logits/rejected": -17.756986618041992, "logps/chosen": -339.6056213378906, "logps/rejected": 
-306.0830383300781, "loss": 0.9896, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.0933051109313965, "rewards/margins": 0.050145603716373444, "rewards/rejected": 2.0431594848632812, "step": 13780 }, { "epoch": 0.6402339941501463, "grad_norm": 139.9234619140625, "learning_rate": 3.93379451228005e-07, "logits/chosen": -18.410287857055664, "logits/rejected": -17.741390228271484, "logps/chosen": -419.0909118652344, "logps/rejected": -309.7845764160156, "loss": 0.9071, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.6715810298919678, "rewards/margins": 0.024125998839735985, "rewards/rejected": 2.6474549770355225, "step": 13790 }, { "epoch": 0.6406982682575793, "grad_norm": 17.311819076538086, "learning_rate": 3.933020722100995e-07, "logits/chosen": -19.12015724182129, "logits/rejected": -18.970500946044922, "logps/chosen": -291.9275817871094, "logps/rejected": -240.93252563476562, "loss": 0.5622, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.530000686645508, "rewards/margins": 0.5663642287254333, "rewards/rejected": 1.9636363983154297, "step": 13800 }, { "epoch": 0.6411625423650124, "grad_norm": 75.89178466796875, "learning_rate": 3.93224693192194e-07, "logits/chosen": -18.26715087890625, "logits/rejected": -17.245800018310547, "logps/chosen": -287.4000244140625, "logps/rejected": -209.43197631835938, "loss": 0.4561, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.332035779953003, "rewards/margins": 0.9924104809761047, "rewards/rejected": 1.339625358581543, "step": 13810 }, { "epoch": 0.6416268164724453, "grad_norm": 13.572473526000977, "learning_rate": 3.9314731417428846e-07, "logits/chosen": -17.488765716552734, "logits/rejected": -17.331628799438477, "logps/chosen": -321.5609436035156, "logps/rejected": -252.15780639648438, "loss": 0.7549, "rewards/accuracies": 0.5, "rewards/chosen": 1.9717754125595093, "rewards/margins": 0.4142611622810364, "rewards/rejected": 1.5575140714645386, "step": 13820 }, { "epoch": 
0.6420910905798783, "grad_norm": 9.585104942321777, "learning_rate": 3.9306993515638297e-07, "logits/chosen": -18.082275390625, "logits/rejected": -16.800537109375, "logps/chosen": -356.6103515625, "logps/rejected": -193.82350158691406, "loss": 0.4388, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.318372964859009, "rewards/margins": 1.1931110620498657, "rewards/rejected": 1.125261902809143, "step": 13830 }, { "epoch": 0.6425553646873113, "grad_norm": 161.3592071533203, "learning_rate": 3.929925561384775e-07, "logits/chosen": -18.991302490234375, "logits/rejected": -19.188854217529297, "logps/chosen": -381.79876708984375, "logps/rejected": -366.74029541015625, "loss": 0.9117, "rewards/accuracies": 0.5, "rewards/chosen": 3.175142288208008, "rewards/margins": -0.013397634029388428, "rewards/rejected": 3.188539981842041, "step": 13840 }, { "epoch": 0.6430196387947444, "grad_norm": 233.14894104003906, "learning_rate": 3.92915177120572e-07, "logits/chosen": -18.647401809692383, "logits/rejected": -17.84160041809082, "logps/chosen": -393.43310546875, "logps/rejected": -353.43731689453125, "loss": 0.8926, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.036720037460327, "rewards/margins": 0.18790197372436523, "rewards/rejected": 2.848818302154541, "step": 13850 }, { "epoch": 0.6434839129021774, "grad_norm": 58.54358673095703, "learning_rate": 3.928377981026665e-07, "logits/chosen": -18.816844940185547, "logits/rejected": -17.25282859802246, "logps/chosen": -392.7427978515625, "logps/rejected": -271.62127685546875, "loss": 0.5429, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.2726974487304688, "rewards/margins": 1.2069623470306396, "rewards/rejected": 2.065735340118408, "step": 13860 }, { "epoch": 0.6439481870096104, "grad_norm": 79.47481536865234, "learning_rate": 3.9276041908476096e-07, "logits/chosen": -18.495349884033203, "logits/rejected": -16.859386444091797, "logps/chosen": -378.72198486328125, "logps/rejected": 
-248.14266967773438, "loss": 0.4865, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.746786117553711, "rewards/margins": 1.613197922706604, "rewards/rejected": 2.1335883140563965, "step": 13870 }, { "epoch": 0.6444124611170435, "grad_norm": 6.939480781555176, "learning_rate": 3.926830400668554e-07, "logits/chosen": -18.007511138916016, "logits/rejected": -16.62978172302246, "logps/chosen": -382.825927734375, "logps/rejected": -220.8816680908203, "loss": 0.535, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.478144407272339, "rewards/margins": 1.2197613716125488, "rewards/rejected": 1.2583832740783691, "step": 13880 }, { "epoch": 0.6448767352244765, "grad_norm": 199.8386688232422, "learning_rate": 3.9260566104894993e-07, "logits/chosen": -17.56989288330078, "logits/rejected": -16.849952697753906, "logps/chosen": -288.65081787109375, "logps/rejected": -229.7324676513672, "loss": 0.623, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.670620918273926, "rewards/margins": 1.3870686292648315, "rewards/rejected": 1.2835521697998047, "step": 13890 }, { "epoch": 0.6453410093319095, "grad_norm": 55.39955139160156, "learning_rate": 3.9252828203104444e-07, "logits/chosen": -18.65924644470215, "logits/rejected": -18.98556900024414, "logps/chosen": -349.1778564453125, "logps/rejected": -317.11273193359375, "loss": 0.7699, "rewards/accuracies": 0.5, "rewards/chosen": 2.263422966003418, "rewards/margins": 0.005604386329650879, "rewards/rejected": 2.2578186988830566, "step": 13900 }, { "epoch": 0.6458052834393426, "grad_norm": 165.10345458984375, "learning_rate": 3.9245090301313895e-07, "logits/chosen": -18.261735916137695, "logits/rejected": -17.55717658996582, "logps/chosen": -384.22869873046875, "logps/rejected": -324.03887939453125, "loss": 0.704, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.7339224815368652, "rewards/margins": 0.353459894657135, "rewards/rejected": 2.380462646484375, "step": 13910 }, { "epoch": 
0.6462695575467756, "grad_norm": 97.74652099609375, "learning_rate": 3.923735239952334e-07, "logits/chosen": -19.327674865722656, "logits/rejected": -19.232595443725586, "logps/chosen": -450.0547790527344, "logps/rejected": -401.226806640625, "loss": 0.6318, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.9701969623565674, "rewards/margins": 0.47732916474342346, "rewards/rejected": 2.4928677082061768, "step": 13920 }, { "epoch": 0.6467338316542086, "grad_norm": 2.903778076171875, "learning_rate": 3.922961449773279e-07, "logits/chosen": -18.244470596313477, "logits/rejected": -17.7783145904541, "logps/chosen": -358.522705078125, "logps/rejected": -320.5093078613281, "loss": 0.6725, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.212923526763916, "rewards/margins": 0.48869380354881287, "rewards/rejected": 1.7242298126220703, "step": 13930 }, { "epoch": 0.6471981057616417, "grad_norm": 10.315001487731934, "learning_rate": 3.9221876595942243e-07, "logits/chosen": -18.607074737548828, "logits/rejected": -17.97775650024414, "logps/chosen": -422.8106384277344, "logps/rejected": -373.946533203125, "loss": 0.8483, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.541287899017334, "rewards/margins": 0.7965683341026306, "rewards/rejected": 2.7447192668914795, "step": 13940 }, { "epoch": 0.6476623798690747, "grad_norm": 65.50251007080078, "learning_rate": 3.9214138694151695e-07, "logits/chosen": -18.98345184326172, "logits/rejected": -17.674610137939453, "logps/chosen": -398.24151611328125, "logps/rejected": -254.9077911376953, "loss": 0.5365, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.6109657287597656, "rewards/margins": 0.7464290261268616, "rewards/rejected": 1.8645368814468384, "step": 13950 }, { "epoch": 0.6481266539765077, "grad_norm": 106.18667602539062, "learning_rate": 3.9206400792361146e-07, "logits/chosen": -18.8284854888916, "logits/rejected": -18.032848358154297, "logps/chosen": -400.2912902832031, 
"logps/rejected": -354.8171691894531, "loss": 0.6018, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.027650833129883, "rewards/margins": 0.8734968304634094, "rewards/rejected": 3.1541543006896973, "step": 13960 }, { "epoch": 0.6485909280839408, "grad_norm": 113.99651336669922, "learning_rate": 3.9198662890570586e-07, "logits/chosen": -19.334596633911133, "logits/rejected": -18.542165756225586, "logps/chosen": -401.02252197265625, "logps/rejected": -373.2681884765625, "loss": 0.7132, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.516758680343628, "rewards/margins": 0.42207980155944824, "rewards/rejected": 3.0946788787841797, "step": 13970 }, { "epoch": 0.6490552021913738, "grad_norm": 54.002471923828125, "learning_rate": 3.919092498878004e-07, "logits/chosen": -18.65115737915039, "logits/rejected": -17.696664810180664, "logps/chosen": -342.32891845703125, "logps/rejected": -237.85696411132812, "loss": 0.656, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.806318998336792, "rewards/margins": 0.9401172399520874, "rewards/rejected": 1.866201639175415, "step": 13980 }, { "epoch": 0.6495194762988068, "grad_norm": 57.392879486083984, "learning_rate": 3.918318708698949e-07, "logits/chosen": -18.602195739746094, "logits/rejected": -18.360559463500977, "logps/chosen": -452.74053955078125, "logps/rejected": -383.9212646484375, "loss": 1.2421, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.738948345184326, "rewards/margins": -0.47454071044921875, "rewards/rejected": 3.213488817214966, "step": 13990 }, { "epoch": 0.6499837504062398, "grad_norm": 120.2952651977539, "learning_rate": 3.917544918519894e-07, "logits/chosen": -18.4177303314209, "logits/rejected": -18.593246459960938, "logps/chosen": -435.35845947265625, "logps/rejected": -454.8633728027344, "loss": 0.8134, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.2172293663024902, "rewards/margins": 0.09231351315975189, "rewards/rejected": 
3.124915599822998, "step": 14000 }, { "epoch": 0.6504480245136729, "grad_norm": 120.17745208740234, "learning_rate": 3.916771128340839e-07, "logits/chosen": -18.64789581298828, "logits/rejected": -18.4449405670166, "logps/chosen": -374.29730224609375, "logps/rejected": -285.32171630859375, "loss": 0.6824, "rewards/accuracies": 0.5, "rewards/chosen": 3.0132248401641846, "rewards/margins": 0.37789395451545715, "rewards/rejected": 2.6353306770324707, "step": 14010 }, { "epoch": 0.6509122986211059, "grad_norm": 3.73783278465271, "learning_rate": 3.9159973381617837e-07, "logits/chosen": -19.424579620361328, "logits/rejected": -18.018688201904297, "logps/chosen": -362.4786376953125, "logps/rejected": -235.1931915283203, "loss": 0.6002, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.1630849838256836, "rewards/margins": 1.1780731678009033, "rewards/rejected": 1.9850118160247803, "step": 14020 }, { "epoch": 0.6513765727285389, "grad_norm": 69.25247955322266, "learning_rate": 3.915223547982729e-07, "logits/chosen": -19.324111938476562, "logits/rejected": -17.349885940551758, "logps/chosen": -512.0447387695312, "logps/rejected": -308.1715393066406, "loss": 0.5313, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.11530065536499, "rewards/margins": 1.8480783700942993, "rewards/rejected": 2.2672224044799805, "step": 14030 }, { "epoch": 0.651840846835972, "grad_norm": 9.099550247192383, "learning_rate": 3.914449757803674e-07, "logits/chosen": -18.589338302612305, "logits/rejected": -18.24485206604004, "logps/chosen": -430.4104919433594, "logps/rejected": -336.0856628417969, "loss": 0.7793, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.444584846496582, "rewards/margins": 0.34190887212753296, "rewards/rejected": 2.1026759147644043, "step": 14040 }, { "epoch": 0.652305120943405, "grad_norm": 59.424686431884766, "learning_rate": 3.913675967624619e-07, "logits/chosen": -18.533594131469727, "logits/rejected": -17.713539123535156, 
"logps/chosen": -426.16668701171875, "logps/rejected": -360.6338806152344, "loss": 0.6535, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.469377040863037, "rewards/margins": 1.2856074571609497, "rewards/rejected": 2.183769702911377, "step": 14050 }, { "epoch": 0.652769395050838, "grad_norm": 39.14611053466797, "learning_rate": 3.912902177445564e-07, "logits/chosen": -18.599294662475586, "logits/rejected": -17.801456451416016, "logps/chosen": -342.30438232421875, "logps/rejected": -306.09576416015625, "loss": 0.8739, "rewards/accuracies": 0.5, "rewards/chosen": 2.529614210128784, "rewards/margins": 0.10323719680309296, "rewards/rejected": 2.4263768196105957, "step": 14060 }, { "epoch": 0.6532336691582711, "grad_norm": 22.01195526123047, "learning_rate": 3.912128387266508e-07, "logits/chosen": -18.694910049438477, "logits/rejected": -17.534889221191406, "logps/chosen": -416.68524169921875, "logps/rejected": -269.9834899902344, "loss": 0.3557, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0447189807891846, "rewards/margins": 1.3405954837799072, "rewards/rejected": 1.7041234970092773, "step": 14070 }, { "epoch": 0.6536979432657041, "grad_norm": 10.51159954071045, "learning_rate": 3.9113545970874533e-07, "logits/chosen": -18.635944366455078, "logits/rejected": -17.499526977539062, "logps/chosen": -346.2193603515625, "logps/rejected": -241.3975067138672, "loss": 0.8156, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.28598690032959, "rewards/margins": 0.6898025274276733, "rewards/rejected": 1.5961843729019165, "step": 14080 }, { "epoch": 0.6541622173731371, "grad_norm": 56.1866455078125, "learning_rate": 3.9105808069083984e-07, "logits/chosen": -20.053829193115234, "logits/rejected": -18.869958877563477, "logps/chosen": -395.64544677734375, "logps/rejected": -268.6676330566406, "loss": 0.4514, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.9358866214752197, "rewards/margins": 1.0444977283477783, 
"rewards/rejected": 1.8913888931274414, "step": 14090 }, { "epoch": 0.6546264914805702, "grad_norm": 156.90130615234375, "learning_rate": 3.9098070167293435e-07, "logits/chosen": -18.27448081970215, "logits/rejected": -18.02102279663086, "logps/chosen": -396.595458984375, "logps/rejected": -369.6011962890625, "loss": 0.98, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2340095043182373, "rewards/margins": 0.6795564889907837, "rewards/rejected": 2.554452896118164, "step": 14100 }, { "epoch": 0.6550907655880032, "grad_norm": 46.107032775878906, "learning_rate": 3.9090332265502886e-07, "logits/chosen": -19.45836067199707, "logits/rejected": -18.41287612915039, "logps/chosen": -414.802490234375, "logps/rejected": -303.14691162109375, "loss": 0.4708, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.835563898086548, "rewards/margins": 0.9703994989395142, "rewards/rejected": 1.8651645183563232, "step": 14110 }, { "epoch": 0.6555550396954362, "grad_norm": 18.600767135620117, "learning_rate": 3.908259436371233e-07, "logits/chosen": -18.629724502563477, "logits/rejected": -17.97625732421875, "logps/chosen": -234.78945922851562, "logps/rejected": -216.1345672607422, "loss": 0.526, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.6192344427108765, "rewards/margins": 0.5147823095321655, "rewards/rejected": 1.1044522523880005, "step": 14120 }, { "epoch": 0.6560193138028693, "grad_norm": 8.117645263671875, "learning_rate": 3.9074856461921783e-07, "logits/chosen": -19.07547950744629, "logits/rejected": -17.9246883392334, "logps/chosen": -407.79937744140625, "logps/rejected": -306.43658447265625, "loss": 0.5416, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.4779253005981445, "rewards/margins": 0.7272064685821533, "rewards/rejected": 1.7507188320159912, "step": 14130 }, { "epoch": 0.6564835879103023, "grad_norm": 13.446290016174316, "learning_rate": 3.9067118560131234e-07, "logits/chosen": -18.637393951416016, "logits/rejected": 
-17.694381713867188, "logps/chosen": -341.7809143066406, "logps/rejected": -287.2828063964844, "loss": 0.3926, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.043238401412964, "rewards/margins": 1.1331963539123535, "rewards/rejected": 1.9100421667099, "step": 14140 }, { "epoch": 0.6569478620177353, "grad_norm": 82.95381164550781, "learning_rate": 3.9059380658340685e-07, "logits/chosen": -19.039566040039062, "logits/rejected": -17.911760330200195, "logps/chosen": -427.7099609375, "logps/rejected": -354.3671569824219, "loss": 0.5922, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.408280849456787, "rewards/margins": 0.9543056488037109, "rewards/rejected": 2.453975200653076, "step": 14150 }, { "epoch": 0.6574121361251682, "grad_norm": 39.75434494018555, "learning_rate": 3.905164275655013e-07, "logits/chosen": -19.103599548339844, "logits/rejected": -19.08047866821289, "logps/chosen": -360.19403076171875, "logps/rejected": -384.7549743652344, "loss": 0.6751, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.6431961059570312, "rewards/margins": 0.24491611123085022, "rewards/rejected": 2.398280143737793, "step": 14160 }, { "epoch": 0.6578764102326013, "grad_norm": 35.010833740234375, "learning_rate": 3.9043904854759577e-07, "logits/chosen": -19.2221736907959, "logits/rejected": -17.57343101501465, "logps/chosen": -347.94378662109375, "logps/rejected": -213.67965698242188, "loss": 0.5171, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.717639446258545, "rewards/margins": 1.4503285884857178, "rewards/rejected": 1.2673109769821167, "step": 14170 }, { "epoch": 0.6583406843400343, "grad_norm": 2.319288969039917, "learning_rate": 3.903616695296903e-07, "logits/chosen": -18.71321678161621, "logits/rejected": -17.84869384765625, "logps/chosen": -500.36688232421875, "logps/rejected": -404.7320251464844, "loss": 0.6097, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2935237884521484, "rewards/margins": 
0.8183170557022095, "rewards/rejected": 2.4752068519592285, "step": 14180 }, { "epoch": 0.6588049584474673, "grad_norm": 210.03378295898438, "learning_rate": 3.902842905117848e-07, "logits/chosen": -18.54088020324707, "logits/rejected": -17.90873146057129, "logps/chosen": -474.10528564453125, "logps/rejected": -378.3470764160156, "loss": 0.6858, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.570079803466797, "rewards/margins": 0.8396568298339844, "rewards/rejected": 2.7304227352142334, "step": 14190 }, { "epoch": 0.6592692325549004, "grad_norm": 49.00916290283203, "learning_rate": 3.902069114938793e-07, "logits/chosen": -19.07470703125, "logits/rejected": -18.087013244628906, "logps/chosen": -344.64532470703125, "logps/rejected": -247.35940551757812, "loss": 0.5006, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.5809149742126465, "rewards/margins": 0.9427959322929382, "rewards/rejected": 1.638119101524353, "step": 14200 }, { "epoch": 0.6597335066623334, "grad_norm": 249.10877990722656, "learning_rate": 3.901295324759738e-07, "logits/chosen": -19.03274917602539, "logits/rejected": -18.584354400634766, "logps/chosen": -354.7392883300781, "logps/rejected": -357.119140625, "loss": 0.9722, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.2908902168273926, "rewards/margins": 0.02434924803674221, "rewards/rejected": 2.266541004180908, "step": 14210 }, { "epoch": 0.6601977807697664, "grad_norm": 56.83610153198242, "learning_rate": 3.900521534580683e-07, "logits/chosen": -18.259899139404297, "logits/rejected": -18.405900955200195, "logps/chosen": -336.37774658203125, "logps/rejected": -372.6202087402344, "loss": 0.7093, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.313913345336914, "rewards/margins": 0.036242008209228516, "rewards/rejected": 2.2776713371276855, "step": 14220 }, { "epoch": 0.6606620548771995, "grad_norm": 203.08706665039062, "learning_rate": 3.899747744401628e-07, "logits/chosen": -19.08075523376465, 
"logits/rejected": -17.993732452392578, "logps/chosen": -544.9708862304688, "logps/rejected": -409.8355407714844, "loss": 0.5384, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.601425886154175, "rewards/margins": 0.5356882214546204, "rewards/rejected": 3.06573748588562, "step": 14230 }, { "epoch": 0.6611263289846325, "grad_norm": 18.662221908569336, "learning_rate": 3.898973954222573e-07, "logits/chosen": -18.328289031982422, "logits/rejected": -17.332260131835938, "logps/chosen": -362.55255126953125, "logps/rejected": -267.86468505859375, "loss": 0.6169, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.0941290855407715, "rewards/margins": 1.3211684226989746, "rewards/rejected": 1.772960901260376, "step": 14240 }, { "epoch": 0.6615906030920655, "grad_norm": 84.12416076660156, "learning_rate": 3.898200164043518e-07, "logits/chosen": -19.252548217773438, "logits/rejected": -18.78516387939453, "logps/chosen": -454.37030029296875, "logps/rejected": -301.79754638671875, "loss": 0.9323, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.612565279006958, "rewards/margins": 0.1850743591785431, "rewards/rejected": 3.4274909496307373, "step": 14250 }, { "epoch": 0.6620548771994986, "grad_norm": 48.3582649230957, "learning_rate": 3.8974263738644627e-07, "logits/chosen": -19.536724090576172, "logits/rejected": -17.93886947631836, "logps/chosen": -416.19659423828125, "logps/rejected": -259.6972961425781, "loss": 0.3519, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.4346611499786377, "rewards/margins": 1.6354119777679443, "rewards/rejected": 1.7992489337921143, "step": 14260 }, { "epoch": 0.6625191513069316, "grad_norm": 108.69493103027344, "learning_rate": 3.896652583685407e-07, "logits/chosen": -19.184656143188477, "logits/rejected": -18.43331527709961, "logps/chosen": -402.76885986328125, "logps/rejected": -313.25555419921875, "loss": 0.5272, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.005415439605713, 
"rewards/margins": 0.8155522346496582, "rewards/rejected": 2.1898632049560547, "step": 14270 }, { "epoch": 0.6629834254143646, "grad_norm": 36.60527420043945, "learning_rate": 3.8958787935063524e-07, "logits/chosen": -19.774742126464844, "logits/rejected": -19.35538673400879, "logps/chosen": -436.73785400390625, "logps/rejected": -379.12286376953125, "loss": 0.6452, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5363731384277344, "rewards/margins": 0.5897025465965271, "rewards/rejected": 2.9466705322265625, "step": 14280 }, { "epoch": 0.6634476995217977, "grad_norm": 1.947662353515625, "learning_rate": 3.8951050033272975e-07, "logits/chosen": -19.243412017822266, "logits/rejected": -17.669429779052734, "logps/chosen": -418.91009521484375, "logps/rejected": -253.5836639404297, "loss": 0.3706, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.4399070739746094, "rewards/margins": 1.5847089290618896, "rewards/rejected": 1.8551981449127197, "step": 14290 }, { "epoch": 0.6639119736292307, "grad_norm": 11.418537139892578, "learning_rate": 3.8943312131482426e-07, "logits/chosen": -19.49346160888672, "logits/rejected": -18.864356994628906, "logps/chosen": -409.44732666015625, "logps/rejected": -316.14947509765625, "loss": 0.5323, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.143704652786255, "rewards/margins": 0.6114534139633179, "rewards/rejected": 2.5322508811950684, "step": 14300 }, { "epoch": 0.6643762477366637, "grad_norm": 7.273954391479492, "learning_rate": 3.8935574229691877e-07, "logits/chosen": -18.42222023010254, "logits/rejected": -17.047489166259766, "logps/chosen": -521.8794555664062, "logps/rejected": -379.0657653808594, "loss": 0.4876, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.284500598907471, "rewards/margins": 1.438617467880249, "rewards/rejected": 2.8458831310272217, "step": 14310 }, { "epoch": 0.6648405218440968, "grad_norm": 58.06808853149414, "learning_rate": 3.892783632790133e-07, 
"logits/chosen": -19.365665435791016, "logits/rejected": -18.733516693115234, "logps/chosen": -368.4093322753906, "logps/rejected": -292.32342529296875, "loss": 0.8471, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.086207866668701, "rewards/margins": -0.10115157067775726, "rewards/rejected": 2.18735933303833, "step": 14320 }, { "epoch": 0.6653047959515298, "grad_norm": 122.46033477783203, "learning_rate": 3.8920098426110774e-07, "logits/chosen": -18.49701690673828, "logits/rejected": -17.74655532836914, "logps/chosen": -413.81280517578125, "logps/rejected": -323.77703857421875, "loss": 0.5494, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4151806831359863, "rewards/margins": 1.3092536926269531, "rewards/rejected": 2.105926990509033, "step": 14330 }, { "epoch": 0.6657690700589628, "grad_norm": 94.30050659179688, "learning_rate": 3.8912360524320225e-07, "logits/chosen": -18.525482177734375, "logits/rejected": -18.245351791381836, "logps/chosen": -306.91204833984375, "logps/rejected": -295.06329345703125, "loss": 0.5936, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.0860934257507324, "rewards/margins": 0.3260951042175293, "rewards/rejected": 1.7599983215332031, "step": 14340 }, { "epoch": 0.6662333441663958, "grad_norm": 51.988712310791016, "learning_rate": 3.8904622622529676e-07, "logits/chosen": -18.370494842529297, "logits/rejected": -16.816692352294922, "logps/chosen": -394.26226806640625, "logps/rejected": -257.00408935546875, "loss": 0.3297, "rewards/accuracies": 1.0, "rewards/chosen": 3.369424819946289, "rewards/margins": 1.6439669132232666, "rewards/rejected": 1.725457787513733, "step": 14350 }, { "epoch": 0.6666976182738289, "grad_norm": 152.13059997558594, "learning_rate": 3.889688472073912e-07, "logits/chosen": -19.00221061706543, "logits/rejected": -18.131752014160156, "logps/chosen": -402.76910400390625, "logps/rejected": -346.43438720703125, "loss": 0.6882, "rewards/accuracies": 0.5, "rewards/chosen": 
3.0545268058776855, "rewards/margins": 0.6769353151321411, "rewards/rejected": 2.377591609954834, "step": 14360 }, { "epoch": 0.6671618923812619, "grad_norm": 50.76713562011719, "learning_rate": 3.888914681894857e-07, "logits/chosen": -18.072160720825195, "logits/rejected": -17.864240646362305, "logps/chosen": -331.46795654296875, "logps/rejected": -264.1994934082031, "loss": 0.6529, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.9090771675109863, "rewards/margins": 0.7005831599235535, "rewards/rejected": 2.208494186401367, "step": 14370 }, { "epoch": 0.6676261664886949, "grad_norm": 184.00433349609375, "learning_rate": 3.888140891715802e-07, "logits/chosen": -18.06755256652832, "logits/rejected": -18.242061614990234, "logps/chosen": -423.74847412109375, "logps/rejected": -471.8761291503906, "loss": 0.7439, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.960176706314087, "rewards/margins": 0.3164982497692108, "rewards/rejected": 2.6436781883239746, "step": 14380 }, { "epoch": 0.668090440596128, "grad_norm": 56.02376937866211, "learning_rate": 3.887367101536747e-07, "logits/chosen": -17.404756546020508, "logits/rejected": -18.496150970458984, "logps/chosen": -320.24066162109375, "logps/rejected": -366.4953918457031, "loss": 1.1622, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 1.9601402282714844, "rewards/margins": -0.6352654695510864, "rewards/rejected": 2.5954055786132812, "step": 14390 }, { "epoch": 0.668554714703561, "grad_norm": 68.49543762207031, "learning_rate": 3.886593311357692e-07, "logits/chosen": -19.675676345825195, "logits/rejected": -18.46782112121582, "logps/chosen": -394.4286193847656, "logps/rejected": -242.3271942138672, "loss": 0.3456, "rewards/accuracies": 1.0, "rewards/chosen": 3.400296688079834, "rewards/margins": 1.5334783792495728, "rewards/rejected": 1.8668180704116821, "step": 14400 }, { "epoch": 0.669018988810994, "grad_norm": 3.855933904647827, "learning_rate": 3.885819521178637e-07, 
"logits/chosen": -17.75684928894043, "logits/rejected": -17.5105037689209, "logps/chosen": -294.733642578125, "logps/rejected": -254.25146484375, "loss": 0.6662, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9187636375427246, "rewards/margins": 0.927161693572998, "rewards/rejected": 1.9916019439697266, "step": 14410 }, { "epoch": 0.6694832629184271, "grad_norm": 74.16168212890625, "learning_rate": 3.8850457309995824e-07, "logits/chosen": -18.2999210357666, "logits/rejected": -17.61592674255371, "logps/chosen": -277.3499755859375, "logps/rejected": -239.6295166015625, "loss": 0.6275, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.126352548599243, "rewards/margins": 0.46791619062423706, "rewards/rejected": 1.6584361791610718, "step": 14420 }, { "epoch": 0.6699475370258601, "grad_norm": 32.21310806274414, "learning_rate": 3.884271940820527e-07, "logits/chosen": -19.623252868652344, "logits/rejected": -18.266246795654297, "logps/chosen": -401.11077880859375, "logps/rejected": -289.5928649902344, "loss": 0.5144, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.6172828674316406, "rewards/margins": 1.2384355068206787, "rewards/rejected": 2.378847360610962, "step": 14430 }, { "epoch": 0.6704118111332931, "grad_norm": 236.92071533203125, "learning_rate": 3.883498150641472e-07, "logits/chosen": -19.02634620666504, "logits/rejected": -18.409196853637695, "logps/chosen": -466.8031311035156, "logps/rejected": -422.423828125, "loss": 0.8937, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.1678102016448975, "rewards/margins": 0.23625166714191437, "rewards/rejected": 2.93155837059021, "step": 14440 }, { "epoch": 0.6708760852407262, "grad_norm": 143.8878173828125, "learning_rate": 3.8827243604624166e-07, "logits/chosen": -18.315242767333984, "logits/rejected": -18.14221954345703, "logps/chosen": -426.33636474609375, "logps/rejected": -411.5816955566406, "loss": 0.7829, "rewards/accuracies": 0.5, "rewards/chosen": 
2.658450126647949, "rewards/margins": 0.1460142433643341, "rewards/rejected": 2.5124361515045166, "step": 14450 }, { "epoch": 0.6713403593481592, "grad_norm": 107.41585540771484, "learning_rate": 3.881950570283362e-07, "logits/chosen": -18.526287078857422, "logits/rejected": -17.908105850219727, "logps/chosen": -345.1403503417969, "logps/rejected": -247.50082397460938, "loss": 0.4582, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.7861924171447754, "rewards/margins": 0.9869126081466675, "rewards/rejected": 1.7992801666259766, "step": 14460 }, { "epoch": 0.6718046334555922, "grad_norm": 60.17244338989258, "learning_rate": 3.881176780104307e-07, "logits/chosen": -18.615079879760742, "logits/rejected": -17.046436309814453, "logps/chosen": -415.80609130859375, "logps/rejected": -233.67041015625, "loss": 0.5734, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.8062498569488525, "rewards/margins": 1.1485364437103271, "rewards/rejected": 1.6577132940292358, "step": 14470 }, { "epoch": 0.6722689075630253, "grad_norm": 172.9029998779297, "learning_rate": 3.8804029899252514e-07, "logits/chosen": -18.5793514251709, "logits/rejected": -18.938446044921875, "logps/chosen": -382.04376220703125, "logps/rejected": -363.56390380859375, "loss": 0.8698, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.637185573577881, "rewards/margins": -0.0898674726486206, "rewards/rejected": 2.727053165435791, "step": 14480 }, { "epoch": 0.6727331816704583, "grad_norm": 98.22566223144531, "learning_rate": 3.8796291997461966e-07, "logits/chosen": -17.989948272705078, "logits/rejected": -17.617107391357422, "logps/chosen": -412.3821716308594, "logps/rejected": -281.9576110839844, "loss": 0.696, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.822291374206543, "rewards/margins": 0.8399481773376465, "rewards/rejected": 1.982343316078186, "step": 14490 }, { "epoch": 0.6731974557778913, "grad_norm": 282.20977783203125, "learning_rate": 
3.8788554095671417e-07, "logits/chosen": -18.178457260131836, "logits/rejected": -17.711122512817383, "logps/chosen": -390.89874267578125, "logps/rejected": -341.63958740234375, "loss": 0.6596, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.275733470916748, "rewards/margins": 0.9036844372749329, "rewards/rejected": 2.372048854827881, "step": 14500 }, { "epoch": 0.6736617298853242, "grad_norm": 7.919140338897705, "learning_rate": 3.878081619388087e-07, "logits/chosen": -18.610300064086914, "logits/rejected": -17.932161331176758, "logps/chosen": -431.1263732910156, "logps/rejected": -356.746337890625, "loss": 0.5775, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9052536487579346, "rewards/margins": 0.7537002563476562, "rewards/rejected": 2.1515533924102783, "step": 14510 }, { "epoch": 0.6741260039927573, "grad_norm": 55.489158630371094, "learning_rate": 3.877307829209032e-07, "logits/chosen": -19.425434112548828, "logits/rejected": -18.282182693481445, "logps/chosen": -440.090576171875, "logps/rejected": -357.0511169433594, "loss": 0.4872, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4116523265838623, "rewards/margins": 0.9574456214904785, "rewards/rejected": 2.454206943511963, "step": 14520 }, { "epoch": 0.6745902781001903, "grad_norm": 55.857723236083984, "learning_rate": 3.8765340390299765e-07, "logits/chosen": -19.883407592773438, "logits/rejected": -18.765247344970703, "logps/chosen": -338.6407470703125, "logps/rejected": -296.0350341796875, "loss": 0.4386, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.6030666828155518, "rewards/margins": 0.8579143285751343, "rewards/rejected": 1.745152235031128, "step": 14530 }, { "epoch": 0.6750545522076233, "grad_norm": 10.150141716003418, "learning_rate": 3.8757602488509216e-07, "logits/chosen": -18.872882843017578, "logits/rejected": -17.82819366455078, "logps/chosen": -466.3294982910156, "logps/rejected": -265.9437255859375, "loss": 0.4093, "rewards/accuracies": 
0.699999988079071, "rewards/chosen": 3.0910658836364746, "rewards/margins": 1.1659482717514038, "rewards/rejected": 1.9251174926757812, "step": 14540 }, { "epoch": 0.6755188263150564, "grad_norm": 243.80917358398438, "learning_rate": 3.874986458671866e-07, "logits/chosen": -18.803363800048828, "logits/rejected": -18.160802841186523, "logps/chosen": -398.50787353515625, "logps/rejected": -387.1329650878906, "loss": 0.8539, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.451162099838257, "rewards/margins": 0.0775078535079956, "rewards/rejected": 2.373654365539551, "step": 14550 }, { "epoch": 0.6759831004224894, "grad_norm": 171.062255859375, "learning_rate": 3.8742126684928113e-07, "logits/chosen": -18.914264678955078, "logits/rejected": -18.45471954345703, "logps/chosen": -394.2944030761719, "logps/rejected": -332.00103759765625, "loss": 0.6165, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.9936838150024414, "rewards/margins": 0.4507356286048889, "rewards/rejected": 2.5429482460021973, "step": 14560 }, { "epoch": 0.6764473745299224, "grad_norm": 2.9438929557800293, "learning_rate": 3.8734388783137564e-07, "logits/chosen": -18.148508071899414, "logits/rejected": -17.13718605041504, "logps/chosen": -401.7752685546875, "logps/rejected": -259.087890625, "loss": 0.5019, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.526332139968872, "rewards/margins": 1.8305599689483643, "rewards/rejected": 1.6957719326019287, "step": 14570 }, { "epoch": 0.6769116486373555, "grad_norm": 163.62808227539062, "learning_rate": 3.872665088134701e-07, "logits/chosen": -19.287670135498047, "logits/rejected": -18.363710403442383, "logps/chosen": -448.322265625, "logps/rejected": -275.1785583496094, "loss": 0.4729, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.2647643089294434, "rewards/margins": 1.0209343433380127, "rewards/rejected": 2.2438299655914307, "step": 14580 }, { "epoch": 0.6773759227447885, "grad_norm": 52.918052673339844, 
"learning_rate": 3.871891297955646e-07, "logits/chosen": -19.209341049194336, "logits/rejected": -17.61899757385254, "logps/chosen": -426.8525390625, "logps/rejected": -303.43792724609375, "loss": 0.3342, "rewards/accuracies": 1.0, "rewards/chosen": 3.3857288360595703, "rewards/margins": 1.4296596050262451, "rewards/rejected": 1.9560692310333252, "step": 14590 }, { "epoch": 0.6778401968522215, "grad_norm": 33.791751861572266, "learning_rate": 3.871117507776591e-07, "logits/chosen": -18.02608871459961, "logits/rejected": -17.89743423461914, "logps/chosen": -216.2153778076172, "logps/rejected": -201.20782470703125, "loss": 0.7125, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.2449555397033691, "rewards/margins": 0.05212021619081497, "rewards/rejected": 1.1928353309631348, "step": 14600 }, { "epoch": 0.6783044709596546, "grad_norm": 43.584503173828125, "learning_rate": 3.8703437175975363e-07, "logits/chosen": -18.89009666442871, "logits/rejected": -18.548158645629883, "logps/chosen": -302.66729736328125, "logps/rejected": -252.08883666992188, "loss": 0.7159, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.3759121894836426, "rewards/margins": 0.19688136875629425, "rewards/rejected": 2.1790308952331543, "step": 14610 }, { "epoch": 0.6787687450670876, "grad_norm": 6.799918174743652, "learning_rate": 3.8695699274184814e-07, "logits/chosen": -19.09622573852539, "logits/rejected": -18.317886352539062, "logps/chosen": -365.03460693359375, "logps/rejected": -261.30609130859375, "loss": 0.6796, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.4223570823669434, "rewards/margins": 0.9376043081283569, "rewards/rejected": 1.4847527742385864, "step": 14620 }, { "epoch": 0.6792330191745206, "grad_norm": 102.31423950195312, "learning_rate": 3.868796137239426e-07, "logits/chosen": -20.024517059326172, "logits/rejected": -17.919452667236328, "logps/chosen": -398.94970703125, "logps/rejected": -256.0206604003906, "loss": 0.3607, 
"rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.488818407058716, "rewards/margins": 1.5553581714630127, "rewards/rejected": 1.9334604740142822, "step": 14630 }, { "epoch": 0.6796972932819537, "grad_norm": 19.1799373626709, "learning_rate": 3.8680223470603706e-07, "logits/chosen": -18.819793701171875, "logits/rejected": -19.06136703491211, "logps/chosen": -359.185546875, "logps/rejected": -344.2962951660156, "loss": 0.5822, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.389565944671631, "rewards/margins": 0.3673070967197418, "rewards/rejected": 2.022258758544922, "step": 14640 }, { "epoch": 0.6801615673893867, "grad_norm": 67.9830322265625, "learning_rate": 3.8672485568813157e-07, "logits/chosen": -19.01102066040039, "logits/rejected": -18.751108169555664, "logps/chosen": -391.9257507324219, "logps/rejected": -342.8059387207031, "loss": 0.6716, "rewards/accuracies": 0.5, "rewards/chosen": 2.835191249847412, "rewards/margins": 0.6227751970291138, "rewards/rejected": 2.212415933609009, "step": 14650 }, { "epoch": 0.6806258414968197, "grad_norm": 39.10674285888672, "learning_rate": 3.866474766702261e-07, "logits/chosen": -18.267650604248047, "logits/rejected": -17.933963775634766, "logps/chosen": -355.5378112792969, "logps/rejected": -335.9893798828125, "loss": 1.191, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.4290788173675537, "rewards/margins": -0.4693072438240051, "rewards/rejected": 2.898386001586914, "step": 14660 }, { "epoch": 0.6810901156042527, "grad_norm": 7.417203426361084, "learning_rate": 3.865700976523206e-07, "logits/chosen": -18.32045555114746, "logits/rejected": -16.997196197509766, "logps/chosen": -353.3469543457031, "logps/rejected": -273.35980224609375, "loss": 0.3741, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.523736000061035, "rewards/margins": 1.296887755393982, "rewards/rejected": 1.2268480062484741, "step": 14670 }, { "epoch": 0.6815543897116858, "grad_norm": 46.136985778808594, 
"learning_rate": 3.8649271863441505e-07, "logits/chosen": -18.621084213256836, "logits/rejected": -17.948841094970703, "logps/chosen": -428.3271484375, "logps/rejected": -320.63189697265625, "loss": 0.5653, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.7872469425201416, "rewards/margins": 0.5111583471298218, "rewards/rejected": 2.2760884761810303, "step": 14680 }, { "epoch": 0.6820186638191188, "grad_norm": 42.08855438232422, "learning_rate": 3.8641533961650956e-07, "logits/chosen": -18.554906845092773, "logits/rejected": -18.72320556640625, "logps/chosen": -407.06207275390625, "logps/rejected": -382.78033447265625, "loss": 0.8067, "rewards/accuracies": 0.5, "rewards/chosen": 2.8910529613494873, "rewards/margins": 0.1124144196510315, "rewards/rejected": 2.7786383628845215, "step": 14690 }, { "epoch": 0.6824829379265518, "grad_norm": 6.029149055480957, "learning_rate": 3.863379605986041e-07, "logits/chosen": -18.493072509765625, "logits/rejected": -18.541784286499023, "logps/chosen": -402.66851806640625, "logps/rejected": -331.602783203125, "loss": 0.8461, "rewards/accuracies": 0.5, "rewards/chosen": 2.3714680671691895, "rewards/margins": 0.15965504944324493, "rewards/rejected": 2.211812973022461, "step": 14700 }, { "epoch": 0.6829472120339849, "grad_norm": 36.177940368652344, "learning_rate": 3.862605815806986e-07, "logits/chosen": -19.244218826293945, "logits/rejected": -18.625011444091797, "logps/chosen": -401.4500427246094, "logps/rejected": -319.8154296875, "loss": 0.6426, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.3194162845611572, "rewards/margins": 0.38824060559272766, "rewards/rejected": 1.9311755895614624, "step": 14710 }, { "epoch": 0.6834114861414179, "grad_norm": 36.983741760253906, "learning_rate": 3.861832025627931e-07, "logits/chosen": -17.777048110961914, "logits/rejected": -17.980215072631836, "logps/chosen": -327.5159606933594, "logps/rejected": -361.1023254394531, "loss": 0.5924, "rewards/accuracies": 
0.699999988079071, "rewards/chosen": 2.612957715988159, "rewards/margins": 0.43174830079078674, "rewards/rejected": 2.1812095642089844, "step": 14720 }, { "epoch": 0.6838757602488509, "grad_norm": 85.62468719482422, "learning_rate": 3.8610582354488756e-07, "logits/chosen": -19.321537017822266, "logits/rejected": -18.299978256225586, "logps/chosen": -455.23980712890625, "logps/rejected": -384.32489013671875, "loss": 0.694, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.5033905506134033, "rewards/margins": 0.3481998145580292, "rewards/rejected": 3.1551904678344727, "step": 14730 }, { "epoch": 0.684340034356284, "grad_norm": 36.8267707824707, "learning_rate": 3.86028444526982e-07, "logits/chosen": -18.70978546142578, "logits/rejected": -18.70974349975586, "logps/chosen": -443.19378662109375, "logps/rejected": -427.7625427246094, "loss": 0.5976, "rewards/accuracies": 0.5, "rewards/chosen": 2.4938838481903076, "rewards/margins": 0.320853054523468, "rewards/rejected": 2.1730306148529053, "step": 14740 }, { "epoch": 0.684804308463717, "grad_norm": 40.66619873046875, "learning_rate": 3.859510655090765e-07, "logits/chosen": -18.508085250854492, "logits/rejected": -17.961565017700195, "logps/chosen": -407.83001708984375, "logps/rejected": -406.14886474609375, "loss": 0.6351, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.9755635261535645, "rewards/margins": 0.4003371596336365, "rewards/rejected": 2.575226306915283, "step": 14750 }, { "epoch": 0.68526858257115, "grad_norm": 44.39902877807617, "learning_rate": 3.8587368649117104e-07, "logits/chosen": -18.198938369750977, "logits/rejected": -17.181591033935547, "logps/chosen": -454.35321044921875, "logps/rejected": -369.16278076171875, "loss": 0.456, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.683763027191162, "rewards/margins": 1.1718335151672363, "rewards/rejected": 2.511929750442505, "step": 14760 }, { "epoch": 0.6857328566785831, "grad_norm": 170.212158203125, "learning_rate": 
3.8579630747326555e-07, "logits/chosen": -18.333850860595703, "logits/rejected": -17.38311004638672, "logps/chosen": -435.34014892578125, "logps/rejected": -323.164306640625, "loss": 0.4624, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.1933817863464355, "rewards/margins": 1.057344675064087, "rewards/rejected": 2.1360371112823486, "step": 14770 }, { "epoch": 0.6861971307860161, "grad_norm": 85.71669006347656, "learning_rate": 3.8571892845536e-07, "logits/chosen": -19.175180435180664, "logits/rejected": -18.284177780151367, "logps/chosen": -297.88092041015625, "logps/rejected": -301.50103759765625, "loss": 0.7775, "rewards/accuracies": 0.5, "rewards/chosen": 2.957296371459961, "rewards/margins": 0.8051376342773438, "rewards/rejected": 2.152158498764038, "step": 14780 }, { "epoch": 0.6866614048934491, "grad_norm": 85.54666137695312, "learning_rate": 3.856415494374545e-07, "logits/chosen": -17.5469913482666, "logits/rejected": -17.318477630615234, "logps/chosen": -373.5292053222656, "logps/rejected": -304.5715637207031, "loss": 0.8441, "rewards/accuracies": 0.5, "rewards/chosen": 2.212017059326172, "rewards/margins": -0.05011671781539917, "rewards/rejected": 2.262134075164795, "step": 14790 }, { "epoch": 0.6871256790008822, "grad_norm": 96.82583618164062, "learning_rate": 3.8556417041954903e-07, "logits/chosen": -18.501150131225586, "logits/rejected": -18.230241775512695, "logps/chosen": -369.0577087402344, "logps/rejected": -344.5093688964844, "loss": 0.7071, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.4048800468444824, "rewards/margins": 0.6288584470748901, "rewards/rejected": 2.77602219581604, "step": 14800 }, { "epoch": 0.6875899531083152, "grad_norm": 18.70478630065918, "learning_rate": 3.8548679140164354e-07, "logits/chosen": -18.37005043029785, "logits/rejected": -17.1624755859375, "logps/chosen": -389.5430603027344, "logps/rejected": -293.34381103515625, "loss": 0.6623, "rewards/accuracies": 0.6000000238418579, 
"rewards/chosen": 3.2769618034362793, "rewards/margins": 0.4840102195739746, "rewards/rejected": 2.7929513454437256, "step": 14810 }, { "epoch": 0.6880542272157482, "grad_norm": 109.70140075683594, "learning_rate": 3.8540941238373805e-07, "logits/chosen": -17.324296951293945, "logits/rejected": -16.987924575805664, "logps/chosen": -276.44793701171875, "logps/rejected": -243.6503143310547, "loss": 0.8605, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.9109216928482056, "rewards/margins": 0.04265434667468071, "rewards/rejected": 1.8682670593261719, "step": 14820 }, { "epoch": 0.6885185013231812, "grad_norm": 83.44082641601562, "learning_rate": 3.8533203336583246e-07, "logits/chosen": -18.826345443725586, "logits/rejected": -17.743886947631836, "logps/chosen": -275.0542297363281, "logps/rejected": -244.0809326171875, "loss": 0.9605, "rewards/accuracies": 0.5, "rewards/chosen": 1.6376752853393555, "rewards/margins": 0.15987874567508698, "rewards/rejected": 1.4777965545654297, "step": 14830 }, { "epoch": 0.6889827754306143, "grad_norm": 36.615299224853516, "learning_rate": 3.8525465434792697e-07, "logits/chosen": -19.77474594116211, "logits/rejected": -19.632652282714844, "logps/chosen": -429.0115661621094, "logps/rejected": -452.2423400878906, "loss": 0.948, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.596083164215088, "rewards/margins": -0.29705503582954407, "rewards/rejected": 2.8931381702423096, "step": 14840 }, { "epoch": 0.6894470495380473, "grad_norm": 171.31369018554688, "learning_rate": 3.851772753300215e-07, "logits/chosen": -19.216392517089844, "logits/rejected": -18.960186004638672, "logps/chosen": -363.39935302734375, "logps/rejected": -365.86065673828125, "loss": 0.7698, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.3112857341766357, "rewards/margins": 0.5208913087844849, "rewards/rejected": 2.7903950214385986, "step": 14850 }, { "epoch": 0.6899113236454802, "grad_norm": 38.132041931152344, "learning_rate": 
3.85099896312116e-07, "logits/chosen": -19.206315994262695, "logits/rejected": -18.104270935058594, "logps/chosen": -444.68890380859375, "logps/rejected": -324.96002197265625, "loss": 0.4929, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.756413698196411, "rewards/margins": 1.0673431158065796, "rewards/rejected": 2.689070463180542, "step": 14860 }, { "epoch": 0.6903755977529134, "grad_norm": 297.87109375, "learning_rate": 3.850225172942105e-07, "logits/chosen": -18.227394104003906, "logits/rejected": -17.136402130126953, "logps/chosen": -419.22979736328125, "logps/rejected": -305.74835205078125, "loss": 0.5201, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.936602830886841, "rewards/margins": 1.415114402770996, "rewards/rejected": 1.5214886665344238, "step": 14870 }, { "epoch": 0.6908398718603463, "grad_norm": 40.516292572021484, "learning_rate": 3.8494513827630496e-07, "logits/chosen": -17.849506378173828, "logits/rejected": -18.17884635925293, "logps/chosen": -325.2421875, "logps/rejected": -350.38775634765625, "loss": 1.111, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.61427640914917, "rewards/margins": -0.35763245820999146, "rewards/rejected": 2.9719088077545166, "step": 14880 }, { "epoch": 0.6913041459677793, "grad_norm": 9.41049861907959, "learning_rate": 3.8486775925839947e-07, "logits/chosen": -19.425325393676758, "logits/rejected": -17.898155212402344, "logps/chosen": -303.23480224609375, "logps/rejected": -171.20364379882812, "loss": 0.5034, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.4733774662017822, "rewards/margins": 1.214606523513794, "rewards/rejected": 1.2587709426879883, "step": 14890 }, { "epoch": 0.6917684200752124, "grad_norm": 147.81919860839844, "learning_rate": 3.84790380240494e-07, "logits/chosen": -18.273927688598633, "logits/rejected": -18.46432113647461, "logps/chosen": -374.55584716796875, "logps/rejected": -344.0517578125, "loss": 1.3855, "rewards/accuracies": 
0.30000001192092896, "rewards/chosen": 2.1805226802825928, "rewards/margins": -0.8009670376777649, "rewards/rejected": 2.981489658355713, "step": 14900 }, { "epoch": 0.6922326941826454, "grad_norm": 174.38780212402344, "learning_rate": 3.847130012225885e-07, "logits/chosen": -18.187828063964844, "logits/rejected": -18.02532958984375, "logps/chosen": -497.3316345214844, "logps/rejected": -386.81915283203125, "loss": 0.8637, "rewards/accuracies": 0.5, "rewards/chosen": 2.7701449394226074, "rewards/margins": 0.15438631176948547, "rewards/rejected": 2.6157584190368652, "step": 14910 }, { "epoch": 0.6926969682900784, "grad_norm": 19.215961456298828, "learning_rate": 3.84635622204683e-07, "logits/chosen": -18.8281192779541, "logits/rejected": -17.220062255859375, "logps/chosen": -443.95574951171875, "logps/rejected": -363.05865478515625, "loss": 1.0475, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2879319190979004, "rewards/margins": 0.5195330381393433, "rewards/rejected": 2.7683987617492676, "step": 14920 }, { "epoch": 0.6931612423975115, "grad_norm": 29.230344772338867, "learning_rate": 3.845582431867774e-07, "logits/chosen": -19.42509651184082, "logits/rejected": -18.56180191040039, "logps/chosen": -475.925537109375, "logps/rejected": -349.9551696777344, "loss": 0.442, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.438053846359253, "rewards/margins": 0.9359842538833618, "rewards/rejected": 2.5020699501037598, "step": 14930 }, { "epoch": 0.6936255165049445, "grad_norm": 217.89219665527344, "learning_rate": 3.844808641688719e-07, "logits/chosen": -18.22781753540039, "logits/rejected": -17.154308319091797, "logps/chosen": -462.5218811035156, "logps/rejected": -340.53582763671875, "loss": 0.6469, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.964665412902832, "rewards/margins": 0.4645722806453705, "rewards/rejected": 2.5000929832458496, "step": 14940 }, { "epoch": 0.6940897906123775, "grad_norm": 64.85238647460938, 
"learning_rate": 3.8440348515096643e-07, "logits/chosen": -18.26955795288086, "logits/rejected": -18.199352264404297, "logps/chosen": -345.6091613769531, "logps/rejected": -356.3059997558594, "loss": 1.0031, "rewards/accuracies": 0.5, "rewards/chosen": 2.979846954345703, "rewards/margins": 0.20219895243644714, "rewards/rejected": 2.7776477336883545, "step": 14950 }, { "epoch": 0.6945540647198106, "grad_norm": 0.2155926674604416, "learning_rate": 3.8432610613306095e-07, "logits/chosen": -17.89199447631836, "logits/rejected": -16.72355842590332, "logps/chosen": -473.88671875, "logps/rejected": -295.8207702636719, "loss": 0.4968, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.8244433403015137, "rewards/margins": 1.4226038455963135, "rewards/rejected": 2.4018394947052, "step": 14960 }, { "epoch": 0.6950183388272436, "grad_norm": 108.72855377197266, "learning_rate": 3.8424872711515546e-07, "logits/chosen": -19.86187171936035, "logits/rejected": -18.73343849182129, "logps/chosen": -525.5187377929688, "logps/rejected": -381.3360290527344, "loss": 0.6378, "rewards/accuracies": 0.5, "rewards/chosen": 3.997368335723877, "rewards/margins": 0.761896014213562, "rewards/rejected": 3.2354724407196045, "step": 14970 }, { "epoch": 0.6954826129346766, "grad_norm": 93.67649841308594, "learning_rate": 3.841713480972499e-07, "logits/chosen": -18.06313133239746, "logits/rejected": -17.90863609313965, "logps/chosen": -298.8343505859375, "logps/rejected": -321.18377685546875, "loss": 0.6, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.2159078121185303, "rewards/margins": 0.3216254413127899, "rewards/rejected": 1.894282341003418, "step": 14980 }, { "epoch": 0.6959468870421096, "grad_norm": 216.64495849609375, "learning_rate": 3.8409396907934443e-07, "logits/chosen": -19.057031631469727, "logits/rejected": -17.589977264404297, "logps/chosen": -473.7874450683594, "logps/rejected": -358.2821350097656, "loss": 0.3504, "rewards/accuracies": 0.8999999761581421, 
"rewards/chosen": 3.6136081218719482, "rewards/margins": 1.3851497173309326, "rewards/rejected": 2.2284584045410156, "step": 14990 }, { "epoch": 0.6964111611495427, "grad_norm": 25.546070098876953, "learning_rate": 3.8401659006143894e-07, "logits/chosen": -17.50083351135254, "logits/rejected": -18.4482479095459, "logps/chosen": -320.98333740234375, "logps/rejected": -457.50640869140625, "loss": 1.5421, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.3121392726898193, "rewards/margins": -0.7874194383621216, "rewards/rejected": 3.0995588302612305, "step": 15000 }, { "epoch": 0.6968754352569757, "grad_norm": 139.6260528564453, "learning_rate": 3.8393921104353345e-07, "logits/chosen": -18.32290267944336, "logits/rejected": -18.09267807006836, "logps/chosen": -384.72088623046875, "logps/rejected": -350.5597229003906, "loss": 1.0769, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.151947498321533, "rewards/margins": -0.23374810814857483, "rewards/rejected": 2.385695695877075, "step": 15010 }, { "epoch": 0.6973397093644087, "grad_norm": 119.89582824707031, "learning_rate": 3.8386183202562796e-07, "logits/chosen": -19.36794662475586, "logits/rejected": -19.19333267211914, "logps/chosen": -383.27545166015625, "logps/rejected": -356.3053283691406, "loss": 0.8892, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.710315704345703, "rewards/margins": 0.40269097685813904, "rewards/rejected": 2.3076248168945312, "step": 15020 }, { "epoch": 0.6978039834718418, "grad_norm": 18.72576141357422, "learning_rate": 3.8378445300772237e-07, "logits/chosen": -18.318134307861328, "logits/rejected": -17.368839263916016, "logps/chosen": -435.23187255859375, "logps/rejected": -305.3026428222656, "loss": 0.4784, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0999855995178223, "rewards/margins": 0.8346655964851379, "rewards/rejected": 2.265320301055908, "step": 15030 }, { "epoch": 0.6982682575792748, "grad_norm": 13.753005981445312, 
"learning_rate": 3.837070739898169e-07, "logits/chosen": -18.572734832763672, "logits/rejected": -17.28362274169922, "logps/chosen": -322.50927734375, "logps/rejected": -217.56887817382812, "loss": 0.4277, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0605130195617676, "rewards/margins": 1.2057431936264038, "rewards/rejected": 1.8547694683074951, "step": 15040 }, { "epoch": 0.6987325316867078, "grad_norm": 54.12190628051758, "learning_rate": 3.836296949719114e-07, "logits/chosen": -18.84316635131836, "logits/rejected": -17.922121047973633, "logps/chosen": -397.724853515625, "logps/rejected": -284.9718322753906, "loss": 0.4, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.0061490535736084, "rewards/margins": 1.1247156858444214, "rewards/rejected": 1.881433129310608, "step": 15050 }, { "epoch": 0.6991968057941409, "grad_norm": 22.063676834106445, "learning_rate": 3.835523159540059e-07, "logits/chosen": -20.33680534362793, "logits/rejected": -19.277690887451172, "logps/chosen": -430.9071350097656, "logps/rejected": -348.29449462890625, "loss": 0.6102, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2939975261688232, "rewards/margins": 0.46999987959861755, "rewards/rejected": 2.823997735977173, "step": 15060 }, { "epoch": 0.6996610799015739, "grad_norm": 113.23682403564453, "learning_rate": 3.834749369361004e-07, "logits/chosen": -18.57411003112793, "logits/rejected": -18.158695220947266, "logps/chosen": -395.24810791015625, "logps/rejected": -269.50213623046875, "loss": 0.4313, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.723752975463867, "rewards/margins": 1.3766965866088867, "rewards/rejected": 2.3470559120178223, "step": 15070 }, { "epoch": 0.7001253540090069, "grad_norm": 63.11631774902344, "learning_rate": 3.8339755791819487e-07, "logits/chosen": -18.355621337890625, "logits/rejected": -18.0573787689209, "logps/chosen": -507.34857177734375, "logps/rejected": -390.5121154785156, "loss": 0.6777, 
"rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.558238983154297, "rewards/margins": 0.5713142156600952, "rewards/rejected": 2.986924648284912, "step": 15080 }, { "epoch": 0.70058962811644, "grad_norm": 22.7941837310791, "learning_rate": 3.833201789002894e-07, "logits/chosen": -19.15264892578125, "logits/rejected": -19.236059188842773, "logps/chosen": -305.85308837890625, "logps/rejected": -320.681884765625, "loss": 0.7145, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.3972153663635254, "rewards/margins": 0.2182103842496872, "rewards/rejected": 2.1790051460266113, "step": 15090 }, { "epoch": 0.701053902223873, "grad_norm": 137.83177185058594, "learning_rate": 3.832427998823839e-07, "logits/chosen": -19.84296989440918, "logits/rejected": -19.023059844970703, "logps/chosen": -337.9051513671875, "logps/rejected": -265.57012939453125, "loss": 0.601, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.757692575454712, "rewards/margins": 1.2084851264953613, "rewards/rejected": 1.549207329750061, "step": 15100 }, { "epoch": 0.701518176331306, "grad_norm": 187.2134552001953, "learning_rate": 3.831654208644784e-07, "logits/chosen": -18.9451961517334, "logits/rejected": -17.232135772705078, "logps/chosen": -553.9827880859375, "logps/rejected": -334.7936706542969, "loss": 0.4413, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6348443031311035, "rewards/margins": 1.9498180150985718, "rewards/rejected": 1.6850261688232422, "step": 15110 }, { "epoch": 0.7019824504387391, "grad_norm": 43.396610260009766, "learning_rate": 3.8308804184657286e-07, "logits/chosen": -18.59423065185547, "logits/rejected": -18.29800796508789, "logps/chosen": -316.2176513671875, "logps/rejected": -280.16058349609375, "loss": 0.7796, "rewards/accuracies": 0.5, "rewards/chosen": 2.761629581451416, "rewards/margins": 0.18590836226940155, "rewards/rejected": 2.575721263885498, "step": 15120 }, { "epoch": 0.7024467245461721, "grad_norm": 2.0043723583221436, 
"learning_rate": 3.830106628286673e-07, "logits/chosen": -18.98520278930664, "logits/rejected": -18.044679641723633, "logps/chosen": -378.7677917480469, "logps/rejected": -318.99163818359375, "loss": 0.6257, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.1314074993133545, "rewards/margins": 0.9658626317977905, "rewards/rejected": 2.1655449867248535, "step": 15130 }, { "epoch": 0.7029109986536051, "grad_norm": 95.6911392211914, "learning_rate": 3.8293328381076183e-07, "logits/chosen": -19.336580276489258, "logits/rejected": -18.566020965576172, "logps/chosen": -434.220703125, "logps/rejected": -301.39959716796875, "loss": 0.4846, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.143721580505371, "rewards/margins": 1.1289503574371338, "rewards/rejected": 2.0147712230682373, "step": 15140 }, { "epoch": 0.7033752727610381, "grad_norm": 114.9562759399414, "learning_rate": 3.8285590479285634e-07, "logits/chosen": -19.03769302368164, "logits/rejected": -18.22296714782715, "logps/chosen": -416.34417724609375, "logps/rejected": -372.12286376953125, "loss": 0.8483, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.843400716781616, "rewards/margins": 0.06233854219317436, "rewards/rejected": 2.781061887741089, "step": 15150 }, { "epoch": 0.7038395468684712, "grad_norm": 61.01640701293945, "learning_rate": 3.8277852577495085e-07, "logits/chosen": -19.18653106689453, "logits/rejected": -18.480070114135742, "logps/chosen": -428.9725646972656, "logps/rejected": -273.09564208984375, "loss": 0.529, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.2558727264404297, "rewards/margins": 1.1551730632781982, "rewards/rejected": 2.1006999015808105, "step": 15160 }, { "epoch": 0.7043038209759042, "grad_norm": 2.148756980895996, "learning_rate": 3.8270114675704537e-07, "logits/chosen": -18.392566680908203, "logits/rejected": -17.72746467590332, "logps/chosen": -418.98748779296875, "logps/rejected": -318.1756286621094, "loss": 0.687, 
"rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.9469025135040283, "rewards/margins": 1.4938828945159912, "rewards/rejected": 2.453019142150879, "step": 15170 }, { "epoch": 0.7047680950833372, "grad_norm": 37.902679443359375, "learning_rate": 3.826237677391398e-07, "logits/chosen": -17.87240982055664, "logits/rejected": -16.814451217651367, "logps/chosen": -429.95220947265625, "logps/rejected": -268.869384765625, "loss": 0.5578, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0453941822052, "rewards/margins": 0.9496240615844727, "rewards/rejected": 2.0957701206207275, "step": 15180 }, { "epoch": 0.7052323691907703, "grad_norm": 110.68421936035156, "learning_rate": 3.8254638872123433e-07, "logits/chosen": -17.833147048950195, "logits/rejected": -18.095882415771484, "logps/chosen": -419.0719299316406, "logps/rejected": -346.8179626464844, "loss": 1.073, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.224655866622925, "rewards/margins": 0.24410724639892578, "rewards/rejected": 2.98054838180542, "step": 15190 }, { "epoch": 0.7056966432982033, "grad_norm": 237.12979125976562, "learning_rate": 3.8246900970332885e-07, "logits/chosen": -18.64680290222168, "logits/rejected": -18.47536849975586, "logps/chosen": -332.9338684082031, "logps/rejected": -374.5148620605469, "loss": 0.8591, "rewards/accuracies": 0.5, "rewards/chosen": 2.9728524684906006, "rewards/margins": 0.6007281541824341, "rewards/rejected": 2.372124195098877, "step": 15200 }, { "epoch": 0.7061609174056362, "grad_norm": 18.588829040527344, "learning_rate": 3.8239163068542336e-07, "logits/chosen": -18.098398208618164, "logits/rejected": -17.60161018371582, "logps/chosen": -441.3236389160156, "logps/rejected": -382.88165283203125, "loss": 0.6179, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.565631151199341, "rewards/margins": 1.049115777015686, "rewards/rejected": 2.5165152549743652, "step": 15210 }, { "epoch": 0.7066251915130694, "grad_norm": 
20.669925689697266, "learning_rate": 3.823142516675178e-07, "logits/chosen": -17.752370834350586, "logits/rejected": -17.259496688842773, "logps/chosen": -370.99200439453125, "logps/rejected": -292.0617980957031, "loss": 0.6644, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.784939765930176, "rewards/margins": 0.6350150108337402, "rewards/rejected": 2.1499247550964355, "step": 15220 }, { "epoch": 0.7070894656205023, "grad_norm": 70.11698150634766, "learning_rate": 3.822368726496123e-07, "logits/chosen": -18.01483917236328, "logits/rejected": -18.00785255432129, "logps/chosen": -290.14404296875, "logps/rejected": -267.75750732421875, "loss": 1.1113, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.334555149078369, "rewards/margins": -0.2312300205230713, "rewards/rejected": 2.5657849311828613, "step": 15230 }, { "epoch": 0.7075537397279353, "grad_norm": 229.2996063232422, "learning_rate": 3.821594936317068e-07, "logits/chosen": -18.89011573791504, "logits/rejected": -18.79203987121582, "logps/chosen": -404.5976867675781, "logps/rejected": -410.18145751953125, "loss": 1.1666, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.896395206451416, "rewards/margins": -0.536318302154541, "rewards/rejected": 3.432713270187378, "step": 15240 }, { "epoch": 0.7080180138353684, "grad_norm": 133.55201721191406, "learning_rate": 3.820821146138013e-07, "logits/chosen": -18.527198791503906, "logits/rejected": -17.91842269897461, "logps/chosen": -313.2943420410156, "logps/rejected": -315.6831359863281, "loss": 1.0734, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.3868165016174316, "rewards/margins": -0.3263714015483856, "rewards/rejected": 2.7131881713867188, "step": 15250 }, { "epoch": 0.7084822879428014, "grad_norm": 6.110830783843994, "learning_rate": 3.820047355958958e-07, "logits/chosen": -18.62240219116211, "logits/rejected": -17.26299285888672, "logps/chosen": -386.82806396484375, "logps/rejected": -335.0821533203125, 
"loss": 1.1203, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9383203983306885, "rewards/margins": 0.3236871361732483, "rewards/rejected": 2.614633083343506, "step": 15260 }, { "epoch": 0.7089465620502344, "grad_norm": 226.92564392089844, "learning_rate": 3.819273565779903e-07, "logits/chosen": -19.539264678955078, "logits/rejected": -19.161380767822266, "logps/chosen": -413.29058837890625, "logps/rejected": -371.98541259765625, "loss": 0.6861, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4347546100616455, "rewards/margins": 0.8826042413711548, "rewards/rejected": 2.552150249481201, "step": 15270 }, { "epoch": 0.7094108361576675, "grad_norm": 24.33858871459961, "learning_rate": 3.818499775600848e-07, "logits/chosen": -19.23330307006836, "logits/rejected": -17.56705665588379, "logps/chosen": -421.38336181640625, "logps/rejected": -233.5792236328125, "loss": 0.274, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.043112277984619, "rewards/margins": 2.6667919158935547, "rewards/rejected": 1.3763206005096436, "step": 15280 }, { "epoch": 0.7098751102651005, "grad_norm": 92.6724853515625, "learning_rate": 3.817725985421793e-07, "logits/chosen": -18.175832748413086, "logits/rejected": -18.04595947265625, "logps/chosen": -413.67431640625, "logps/rejected": -402.3512878417969, "loss": 0.5717, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1436009407043457, "rewards/margins": 0.510571300983429, "rewards/rejected": 2.6330294609069824, "step": 15290 }, { "epoch": 0.7103393843725335, "grad_norm": 59.863407135009766, "learning_rate": 3.816952195242738e-07, "logits/chosen": -19.37509536743164, "logits/rejected": -18.373891830444336, "logps/chosen": -326.5770568847656, "logps/rejected": -279.729736328125, "loss": 0.4228, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0432064533233643, "rewards/margins": 0.903478741645813, "rewards/rejected": 2.139727830886841, "step": 15300 }, { "epoch": 0.7108036584799666, 
"grad_norm": 20.105329513549805, "learning_rate": 3.8161784050636826e-07, "logits/chosen": -19.042436599731445, "logits/rejected": -17.60515785217285, "logps/chosen": -373.7036437988281, "logps/rejected": -207.0281982421875, "loss": 0.4203, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.224337100982666, "rewards/margins": 1.3266160488128662, "rewards/rejected": 1.8977206945419312, "step": 15310 }, { "epoch": 0.7112679325873996, "grad_norm": 46.031620025634766, "learning_rate": 3.8154046148846277e-07, "logits/chosen": -18.493810653686523, "logits/rejected": -18.093278884887695, "logps/chosen": -338.1983642578125, "logps/rejected": -351.20416259765625, "loss": 1.1732, "rewards/accuracies": 0.5, "rewards/chosen": 1.9872344732284546, "rewards/margins": -0.344765841960907, "rewards/rejected": 2.332000255584717, "step": 15320 }, { "epoch": 0.7117322066948326, "grad_norm": 28.82056999206543, "learning_rate": 3.8146308247055723e-07, "logits/chosen": -18.077518463134766, "logits/rejected": -17.476470947265625, "logps/chosen": -350.63531494140625, "logps/rejected": -291.5760498046875, "loss": 1.524, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.7548184394836426, "rewards/margins": -0.07916216552257538, "rewards/rejected": 2.833980083465576, "step": 15330 }, { "epoch": 0.7121964808022656, "grad_norm": 26.726957321166992, "learning_rate": 3.8138570345265174e-07, "logits/chosen": -17.86441993713379, "logits/rejected": -17.713056564331055, "logps/chosen": -333.75567626953125, "logps/rejected": -247.2724151611328, "loss": 0.7886, "rewards/accuracies": 0.5, "rewards/chosen": 1.7983758449554443, "rewards/margins": 0.18250517547130585, "rewards/rejected": 1.615870714187622, "step": 15340 }, { "epoch": 0.7126607549096987, "grad_norm": 90.1445541381836, "learning_rate": 3.8130832443474625e-07, "logits/chosen": -18.158321380615234, "logits/rejected": -17.352354049682617, "logps/chosen": -456.48822021484375, "logps/rejected": -334.7116394042969, "loss": 
0.4165, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.315747022628784, "rewards/margins": 1.200331449508667, "rewards/rejected": 2.1154160499572754, "step": 15350 }, { "epoch": 0.7131250290171317, "grad_norm": 100.4590072631836, "learning_rate": 3.8123094541684076e-07, "logits/chosen": -18.523136138916016, "logits/rejected": -17.381298065185547, "logps/chosen": -426.7938537597656, "logps/rejected": -255.40463256835938, "loss": 0.5312, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.878669023513794, "rewards/margins": 1.3198320865631104, "rewards/rejected": 2.5588366985321045, "step": 15360 }, { "epoch": 0.7135893031245647, "grad_norm": 55.930763244628906, "learning_rate": 3.8115356639893527e-07, "logits/chosen": -18.75609588623047, "logits/rejected": -18.238811492919922, "logps/chosen": -497.35797119140625, "logps/rejected": -413.43798828125, "loss": 0.5595, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.8420701026916504, "rewards/margins": 0.7427656650543213, "rewards/rejected": 2.09930419921875, "step": 15370 }, { "epoch": 0.7140535772319978, "grad_norm": 113.73405456542969, "learning_rate": 3.8107618738102973e-07, "logits/chosen": -18.8267822265625, "logits/rejected": -18.12777328491211, "logps/chosen": -408.6670227050781, "logps/rejected": -314.7626953125, "loss": 0.5492, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.334609270095825, "rewards/margins": 1.3791701793670654, "rewards/rejected": 1.9554386138916016, "step": 15380 }, { "epoch": 0.7145178513394308, "grad_norm": 166.42971801757812, "learning_rate": 3.8099880836312424e-07, "logits/chosen": -19.142122268676758, "logits/rejected": -18.042240142822266, "logps/chosen": -431.2464904785156, "logps/rejected": -322.4610290527344, "loss": 0.7767, "rewards/accuracies": 0.5, "rewards/chosen": 3.250368118286133, "rewards/margins": 0.07158058881759644, "rewards/rejected": 3.1787877082824707, "step": 15390 }, { "epoch": 0.7149821254468638, "grad_norm": 
0.35114920139312744, "learning_rate": 3.8092142934521875e-07, "logits/chosen": -18.777748107910156, "logits/rejected": -17.091869354248047, "logps/chosen": -456.3570861816406, "logps/rejected": -302.2268371582031, "loss": 0.3294, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.4340641498565674, "rewards/margins": 1.6358143091201782, "rewards/rejected": 1.7982499599456787, "step": 15400 }, { "epoch": 0.7154463995542969, "grad_norm": 163.06996154785156, "learning_rate": 3.808440503273132e-07, "logits/chosen": -18.134353637695312, "logits/rejected": -17.954288482666016, "logps/chosen": -473.37896728515625, "logps/rejected": -441.2408752441406, "loss": 1.1379, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.9851901531219482, "rewards/margins": -0.19270110130310059, "rewards/rejected": 3.177891254425049, "step": 15410 }, { "epoch": 0.7159106736617299, "grad_norm": 30.250995635986328, "learning_rate": 3.807666713094077e-07, "logits/chosen": -18.916658401489258, "logits/rejected": -18.071622848510742, "logps/chosen": -388.06219482421875, "logps/rejected": -337.61773681640625, "loss": 0.4203, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4138591289520264, "rewards/margins": 1.0716463327407837, "rewards/rejected": 2.3422129154205322, "step": 15420 }, { "epoch": 0.7163749477691629, "grad_norm": 223.52894592285156, "learning_rate": 3.806892922915022e-07, "logits/chosen": -18.505403518676758, "logits/rejected": -18.161500930786133, "logps/chosen": -455.66729736328125, "logps/rejected": -370.9677429199219, "loss": 1.0542, "rewards/accuracies": 0.5, "rewards/chosen": 3.0349481105804443, "rewards/margins": 0.12284409999847412, "rewards/rejected": 2.9121036529541016, "step": 15430 }, { "epoch": 0.716839221876596, "grad_norm": 191.14329528808594, "learning_rate": 3.806119132735967e-07, "logits/chosen": -19.50326919555664, "logits/rejected": -18.962360382080078, "logps/chosen": -427.74078369140625, "logps/rejected": -307.6402893066406, 
"loss": 0.8016, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.3231117725372314, "rewards/margins": 0.3511276841163635, "rewards/rejected": 2.9719839096069336, "step": 15440 }, { "epoch": 0.717303495984029, "grad_norm": 21.41054916381836, "learning_rate": 3.805345342556912e-07, "logits/chosen": -19.11142921447754, "logits/rejected": -18.40566635131836, "logps/chosen": -395.5389709472656, "logps/rejected": -311.03314208984375, "loss": 0.6801, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.232750654220581, "rewards/margins": 0.5779012441635132, "rewards/rejected": 1.6548490524291992, "step": 15450 }, { "epoch": 0.717767770091462, "grad_norm": 59.11980438232422, "learning_rate": 3.804571552377857e-07, "logits/chosen": -18.271272659301758, "logits/rejected": -18.01820182800293, "logps/chosen": -372.17681884765625, "logps/rejected": -332.06689453125, "loss": 0.7089, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.7516040802001953, "rewards/margins": 0.2109772264957428, "rewards/rejected": 2.5406270027160645, "step": 15460 }, { "epoch": 0.7182320441988951, "grad_norm": 77.14082336425781, "learning_rate": 3.8037977621988023e-07, "logits/chosen": -18.02445411682129, "logits/rejected": -16.964866638183594, "logps/chosen": -378.5380859375, "logps/rejected": -252.28622436523438, "loss": 0.6127, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.5139575004577637, "rewards/margins": 0.6798931956291199, "rewards/rejected": 1.834064245223999, "step": 15470 }, { "epoch": 0.7186963183063281, "grad_norm": 65.06035614013672, "learning_rate": 3.803023972019747e-07, "logits/chosen": -18.59603500366211, "logits/rejected": -17.440860748291016, "logps/chosen": -311.2225646972656, "logps/rejected": -199.76846313476562, "loss": 0.4227, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.429765462875366, "rewards/margins": 1.3501864671707153, "rewards/rejected": 1.0795791149139404, "step": 15480 }, { "epoch": 0.7191605924137611, 
"grad_norm": 31.302152633666992, "learning_rate": 3.802250181840692e-07, "logits/chosen": -19.037593841552734, "logits/rejected": -17.645597457885742, "logps/chosen": -530.3011474609375, "logps/rejected": -310.44659423828125, "loss": 0.3971, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.5817668437957764, "rewards/margins": 1.3016705513000488, "rewards/rejected": 2.2800965309143066, "step": 15490 }, { "epoch": 0.7196248665211941, "grad_norm": 37.791812896728516, "learning_rate": 3.8014763916616366e-07, "logits/chosen": -18.599111557006836, "logits/rejected": -18.009693145751953, "logps/chosen": -358.61492919921875, "logps/rejected": -324.02032470703125, "loss": 0.485, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.7072231769561768, "rewards/margins": 0.712646484375, "rewards/rejected": 1.9945768117904663, "step": 15500 }, { "epoch": 0.7200891406286272, "grad_norm": 159.94046020507812, "learning_rate": 3.8007026014825817e-07, "logits/chosen": -19.553340911865234, "logits/rejected": -19.327632904052734, "logps/chosen": -433.63037109375, "logps/rejected": -430.62353515625, "loss": 1.0208, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 3.289961338043213, "rewards/margins": -0.2823143005371094, "rewards/rejected": 3.5722756385803223, "step": 15510 }, { "epoch": 0.7205534147360602, "grad_norm": 174.61398315429688, "learning_rate": 3.799928811303527e-07, "logits/chosen": -19.056697845458984, "logits/rejected": -18.605541229248047, "logps/chosen": -491.6859436035156, "logps/rejected": -414.94134521484375, "loss": 0.471, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.3736982345581055, "rewards/margins": 1.5589754581451416, "rewards/rejected": 2.8147220611572266, "step": 15520 }, { "epoch": 0.7210176888434932, "grad_norm": 57.47669219970703, "learning_rate": 3.7991550211244714e-07, "logits/chosen": -19.991941452026367, "logits/rejected": -19.64439582824707, "logps/chosen": -359.252197265625, "logps/rejected": 
-327.1825256347656, "loss": 0.568, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.0801258087158203, "rewards/margins": 0.7411625385284424, "rewards/rejected": 2.338963747024536, "step": 15530 }, { "epoch": 0.7214819629509263, "grad_norm": 29.098487854003906, "learning_rate": 3.7983812309454165e-07, "logits/chosen": -20.024784088134766, "logits/rejected": -19.019725799560547, "logps/chosen": -343.0262145996094, "logps/rejected": -303.0850830078125, "loss": 0.5737, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.005631685256958, "rewards/margins": 0.6926348805427551, "rewards/rejected": 2.3129963874816895, "step": 15540 }, { "epoch": 0.7219462370583593, "grad_norm": 180.93553161621094, "learning_rate": 3.7976074407663616e-07, "logits/chosen": -17.978891372680664, "logits/rejected": -17.46585464477539, "logps/chosen": -368.8166198730469, "logps/rejected": -336.09661865234375, "loss": 0.989, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 3.076547145843506, "rewards/margins": 0.17673572897911072, "rewards/rejected": 2.899811267852783, "step": 15550 }, { "epoch": 0.7224105111657922, "grad_norm": 156.1837158203125, "learning_rate": 3.7968336505873067e-07, "logits/chosen": -18.085220336914062, "logits/rejected": -17.795930862426758, "logps/chosen": -392.3797912597656, "logps/rejected": -373.024169921875, "loss": 0.7931, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.6341307163238525, "rewards/margins": 0.10457205772399902, "rewards/rejected": 2.5295586585998535, "step": 15560 }, { "epoch": 0.7228747852732254, "grad_norm": 38.93193817138672, "learning_rate": 3.796059860408252e-07, "logits/chosen": -19.283740997314453, "logits/rejected": -19.032779693603516, "logps/chosen": -436.99395751953125, "logps/rejected": -384.58160400390625, "loss": 0.3718, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.0832605361938477, "rewards/margins": 1.3030171394348145, "rewards/rejected": 1.7802432775497437, "step": 15570 
}, { "epoch": 0.7233390593806583, "grad_norm": 56.9814453125, "learning_rate": 3.795286070229197e-07, "logits/chosen": -19.05866813659668, "logits/rejected": -18.462512969970703, "logps/chosen": -483.9281311035156, "logps/rejected": -418.49969482421875, "loss": 0.8409, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.5602309703826904, "rewards/margins": 0.132616326212883, "rewards/rejected": 3.42761492729187, "step": 15580 }, { "epoch": 0.7238033334880913, "grad_norm": 85.80620574951172, "learning_rate": 3.7945122800501415e-07, "logits/chosen": -19.1164608001709, "logits/rejected": -18.870393753051758, "logps/chosen": -366.24334716796875, "logps/rejected": -407.2063293457031, "loss": 0.8982, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.062798023223877, "rewards/margins": 0.02130298689007759, "rewards/rejected": 3.041494846343994, "step": 15590 }, { "epoch": 0.7242676075955244, "grad_norm": 7.010103225708008, "learning_rate": 3.793738489871086e-07, "logits/chosen": -18.59571075439453, "logits/rejected": -18.0010929107666, "logps/chosen": -406.4334716796875, "logps/rejected": -308.1172180175781, "loss": 0.5735, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.809758424758911, "rewards/margins": 0.9098894000053406, "rewards/rejected": 1.8998689651489258, "step": 15600 }, { "epoch": 0.7247318817029574, "grad_norm": 73.13128662109375, "learning_rate": 3.792964699692031e-07, "logits/chosen": -18.671924591064453, "logits/rejected": -17.899112701416016, "logps/chosen": -425.4437561035156, "logps/rejected": -324.77850341796875, "loss": 0.4733, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.8546218872070312, "rewards/margins": 1.185361623764038, "rewards/rejected": 2.669260025024414, "step": 15610 }, { "epoch": 0.7251961558103904, "grad_norm": 35.016788482666016, "learning_rate": 3.7921909095129763e-07, "logits/chosen": -19.240402221679688, "logits/rejected": -18.00812339782715, "logps/chosen": -539.6611328125, 
"logps/rejected": -430.99005126953125, "loss": 0.816, "rewards/accuracies": 0.5, "rewards/chosen": 3.9083056449890137, "rewards/margins": 0.5540148019790649, "rewards/rejected": 3.3542912006378174, "step": 15620 }, { "epoch": 0.7256604299178235, "grad_norm": 84.85895538330078, "learning_rate": 3.791417119333921e-07, "logits/chosen": -19.306861877441406, "logits/rejected": -18.998973846435547, "logps/chosen": -380.01153564453125, "logps/rejected": -315.4219665527344, "loss": 0.5541, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.784838914871216, "rewards/margins": 1.0778419971466064, "rewards/rejected": 2.7069971561431885, "step": 15630 }, { "epoch": 0.7261247040252565, "grad_norm": 1.5798799991607666, "learning_rate": 3.790643329154866e-07, "logits/chosen": -18.163854598999023, "logits/rejected": -18.025646209716797, "logps/chosen": -288.8949279785156, "logps/rejected": -230.78732299804688, "loss": 0.6687, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.6171650886535645, "rewards/margins": 0.7564882040023804, "rewards/rejected": 1.8606770038604736, "step": 15640 }, { "epoch": 0.7265889781326895, "grad_norm": 17.8167724609375, "learning_rate": 3.789869538975811e-07, "logits/chosen": -18.886754989624023, "logits/rejected": -18.257976531982422, "logps/chosen": -422.37445068359375, "logps/rejected": -250.70852661132812, "loss": 0.4549, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0151450634002686, "rewards/margins": 1.3500349521636963, "rewards/rejected": 1.6651099920272827, "step": 15650 }, { "epoch": 0.7270532522401225, "grad_norm": 45.53087615966797, "learning_rate": 3.789095748796756e-07, "logits/chosen": -18.264432907104492, "logits/rejected": -17.354061126708984, "logps/chosen": -365.9740295410156, "logps/rejected": -354.0555114746094, "loss": 0.6902, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.824009895324707, "rewards/margins": 0.6412097215652466, "rewards/rejected": 2.182800054550171, "step": 15660 
}, { "epoch": 0.7275175263475556, "grad_norm": 7.325867176055908, "learning_rate": 3.7883219586177014e-07, "logits/chosen": -19.363174438476562, "logits/rejected": -18.492929458618164, "logps/chosen": -452.0398864746094, "logps/rejected": -299.45892333984375, "loss": 0.6112, "rewards/accuracies": 0.5, "rewards/chosen": 3.098611831665039, "rewards/margins": 0.6811444163322449, "rewards/rejected": 2.4174671173095703, "step": 15670 }, { "epoch": 0.7279818004549886, "grad_norm": 202.689453125, "learning_rate": 3.7875481684386465e-07, "logits/chosen": -18.358407974243164, "logits/rejected": -18.120555877685547, "logps/chosen": -409.1048889160156, "logps/rejected": -385.48223876953125, "loss": 0.6588, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2349448204040527, "rewards/margins": 0.6635948419570923, "rewards/rejected": 2.571350574493408, "step": 15680 }, { "epoch": 0.7284460745624216, "grad_norm": 41.72998046875, "learning_rate": 3.786774378259591e-07, "logits/chosen": -19.21707534790039, "logits/rejected": -17.42776107788086, "logps/chosen": -462.0120544433594, "logps/rejected": -232.5105743408203, "loss": 0.3967, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.978982925415039, "rewards/margins": 2.1035237312316895, "rewards/rejected": 1.8754587173461914, "step": 15690 }, { "epoch": 0.7289103486698547, "grad_norm": 46.372013092041016, "learning_rate": 3.7860005880805356e-07, "logits/chosen": -18.506608963012695, "logits/rejected": -17.86497688293457, "logps/chosen": -420.19317626953125, "logps/rejected": -282.43878173828125, "loss": 0.6998, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9064595699310303, "rewards/margins": 0.677953839302063, "rewards/rejected": 2.228506088256836, "step": 15700 }, { "epoch": 0.7293746227772877, "grad_norm": 158.14556884765625, "learning_rate": 3.785226797901481e-07, "logits/chosen": -19.345447540283203, "logits/rejected": -19.04298210144043, "logps/chosen": -294.4633483886719, 
"logps/rejected": -281.62860107421875, "loss": 0.9018, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.278517723083496, "rewards/margins": 0.03157147020101547, "rewards/rejected": 2.246946334838867, "step": 15710 }, { "epoch": 0.7298388968847207, "grad_norm": 197.1282501220703, "learning_rate": 3.784453007722426e-07, "logits/chosen": -18.32532501220703, "logits/rejected": -18.25931739807129, "logps/chosen": -316.7358703613281, "logps/rejected": -301.4835205078125, "loss": 1.468, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.442631721496582, "rewards/margins": -0.006778729148209095, "rewards/rejected": 2.4494106769561768, "step": 15720 }, { "epoch": 0.7303031709921538, "grad_norm": 110.25625610351562, "learning_rate": 3.783679217543371e-07, "logits/chosen": -17.896142959594727, "logits/rejected": -17.95325469970703, "logps/chosen": -347.0028991699219, "logps/rejected": -328.86370849609375, "loss": 1.2188, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.18143892288208, "rewards/margins": -0.19984674453735352, "rewards/rejected": 2.381286144256592, "step": 15730 }, { "epoch": 0.7307674450995868, "grad_norm": 151.8384246826172, "learning_rate": 3.7829054273643156e-07, "logits/chosen": -18.502666473388672, "logits/rejected": -17.40189552307129, "logps/chosen": -363.930908203125, "logps/rejected": -294.000244140625, "loss": 0.7827, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.6661760807037354, "rewards/margins": 0.820249080657959, "rewards/rejected": 1.8459268808364868, "step": 15740 }, { "epoch": 0.7312317192070198, "grad_norm": 107.7968521118164, "learning_rate": 3.7821316371852607e-07, "logits/chosen": -18.928043365478516, "logits/rejected": -18.53351402282715, "logps/chosen": -500.0516052246094, "logps/rejected": -435.69482421875, "loss": 0.6132, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.09666633605957, "rewards/margins": 0.5197068452835083, "rewards/rejected": 3.5769596099853516, 
"step": 15750 }, { "epoch": 0.7316959933144529, "grad_norm": 116.6949462890625, "learning_rate": 3.781357847006206e-07, "logits/chosen": -18.978010177612305, "logits/rejected": -18.609302520751953, "logps/chosen": -475.65887451171875, "logps/rejected": -406.56817626953125, "loss": 0.5492, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.538611650466919, "rewards/margins": 0.6642686128616333, "rewards/rejected": 2.874343156814575, "step": 15760 }, { "epoch": 0.7321602674218859, "grad_norm": 114.13396453857422, "learning_rate": 3.780584056827151e-07, "logits/chosen": -19.877710342407227, "logits/rejected": -18.888872146606445, "logps/chosen": -454.8592224121094, "logps/rejected": -346.21588134765625, "loss": 0.5058, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7446200847625732, "rewards/margins": 0.854966938495636, "rewards/rejected": 2.889653205871582, "step": 15770 }, { "epoch": 0.7326245415293189, "grad_norm": 67.40535736083984, "learning_rate": 3.779810266648096e-07, "logits/chosen": -19.61918830871582, "logits/rejected": -18.30714225769043, "logps/chosen": -369.3470153808594, "logps/rejected": -229.37606811523438, "loss": 0.3677, "rewards/accuracies": 1.0, "rewards/chosen": 3.1488864421844482, "rewards/margins": 1.4773772954940796, "rewards/rejected": 1.6715091466903687, "step": 15780 }, { "epoch": 0.733088815636752, "grad_norm": 72.48299407958984, "learning_rate": 3.77903647646904e-07, "logits/chosen": -18.5184268951416, "logits/rejected": -17.797420501708984, "logps/chosen": -374.082763671875, "logps/rejected": -235.2580108642578, "loss": 0.6052, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9629464149475098, "rewards/margins": 1.0831634998321533, "rewards/rejected": 1.8797826766967773, "step": 15790 }, { "epoch": 0.733553089744185, "grad_norm": 97.78063201904297, "learning_rate": 3.778262686289985e-07, "logits/chosen": -18.75435447692871, "logits/rejected": -18.16638946533203, "logps/chosen": -406.8134460449219, 
"logps/rejected": -321.12908935546875, "loss": 0.6949, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.686123847961426, "rewards/margins": 0.6994611024856567, "rewards/rejected": 1.9866626262664795, "step": 15800 }, { "epoch": 0.734017363851618, "grad_norm": 169.08412170410156, "learning_rate": 3.7774888961109303e-07, "logits/chosen": -18.76648712158203, "logits/rejected": -18.02456283569336, "logps/chosen": -436.3001403808594, "logps/rejected": -367.3168029785156, "loss": 0.7491, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.047255039215088, "rewards/margins": 0.39497116208076477, "rewards/rejected": 2.6522836685180664, "step": 15810 }, { "epoch": 0.734481637959051, "grad_norm": 96.8060531616211, "learning_rate": 3.7767151059318754e-07, "logits/chosen": -18.91775894165039, "logits/rejected": -17.057788848876953, "logps/chosen": -477.9383239746094, "logps/rejected": -281.76031494140625, "loss": 0.5581, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.8481345176696777, "rewards/margins": 1.2153394222259521, "rewards/rejected": 2.6327950954437256, "step": 15820 }, { "epoch": 0.7349459120664841, "grad_norm": 60.26970291137695, "learning_rate": 3.7759413157528205e-07, "logits/chosen": -19.504070281982422, "logits/rejected": -18.58584213256836, "logps/chosen": -410.10821533203125, "logps/rejected": -318.75634765625, "loss": 0.7935, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.8970320224761963, "rewards/margins": 0.6017945408821106, "rewards/rejected": 2.2952375411987305, "step": 15830 }, { "epoch": 0.7354101861739171, "grad_norm": 188.58151245117188, "learning_rate": 3.775167525573765e-07, "logits/chosen": -18.21244239807129, "logits/rejected": -17.414148330688477, "logps/chosen": -357.28057861328125, "logps/rejected": -270.2700500488281, "loss": 0.7033, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.531825304031372, "rewards/margins": 1.130128026008606, "rewards/rejected": 1.401697039604187, 
"step": 15840 }, { "epoch": 0.7358744602813501, "grad_norm": 216.7251434326172, "learning_rate": 3.77439373539471e-07, "logits/chosen": -18.215442657470703, "logits/rejected": -17.877246856689453, "logps/chosen": -376.12042236328125, "logps/rejected": -371.2929992675781, "loss": 0.7963, "rewards/accuracies": 0.5, "rewards/chosen": 2.961622476577759, "rewards/margins": 0.1528773009777069, "rewards/rejected": 2.8087449073791504, "step": 15850 }, { "epoch": 0.7363387343887832, "grad_norm": 111.17586517333984, "learning_rate": 3.7736199452156553e-07, "logits/chosen": -20.21401596069336, "logits/rejected": -18.985721588134766, "logps/chosen": -437.86138916015625, "logps/rejected": -343.469970703125, "loss": 0.8252, "rewards/accuracies": 0.5, "rewards/chosen": 2.984584331512451, "rewards/margins": 0.1692078411579132, "rewards/rejected": 2.8153767585754395, "step": 15860 }, { "epoch": 0.7368030084962162, "grad_norm": 129.52752685546875, "learning_rate": 3.7728461550366004e-07, "logits/chosen": -19.83161735534668, "logits/rejected": -19.600435256958008, "logps/chosen": -332.668212890625, "logps/rejected": -361.33001708984375, "loss": 0.9733, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.618448257446289, "rewards/margins": -0.10393564403057098, "rewards/rejected": 2.722383975982666, "step": 15870 }, { "epoch": 0.7372672826036492, "grad_norm": 64.91480255126953, "learning_rate": 3.7720723648575456e-07, "logits/chosen": -18.87338638305664, "logits/rejected": -18.83696174621582, "logps/chosen": -468.6353454589844, "logps/rejected": -436.78759765625, "loss": 0.646, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.6450493335723877, "rewards/margins": 0.42515286803245544, "rewards/rejected": 3.219895839691162, "step": 15880 }, { "epoch": 0.7377315567110823, "grad_norm": 50.596466064453125, "learning_rate": 3.7712985746784896e-07, "logits/chosen": -20.266197204589844, "logits/rejected": -17.8100643157959, "logps/chosen": -502.4912109375, 
"logps/rejected": -291.24603271484375, "loss": 0.4722, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5868961811065674, "rewards/margins": 1.3356744050979614, "rewards/rejected": 2.2512218952178955, "step": 15890 }, { "epoch": 0.7381958308185153, "grad_norm": 26.85370445251465, "learning_rate": 3.7705247844994347e-07, "logits/chosen": -19.387672424316406, "logits/rejected": -19.30422592163086, "logps/chosen": -378.60980224609375, "logps/rejected": -381.0963439941406, "loss": 1.1816, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.560701370239258, "rewards/margins": -0.24560491740703583, "rewards/rejected": 2.8063063621520996, "step": 15900 }, { "epoch": 0.7386601049259482, "grad_norm": 59.030879974365234, "learning_rate": 3.76975099432038e-07, "logits/chosen": -18.959264755249023, "logits/rejected": -18.738574981689453, "logps/chosen": -357.584716796875, "logps/rejected": -389.8858947753906, "loss": 1.0734, "rewards/accuracies": 0.5, "rewards/chosen": 2.851132869720459, "rewards/margins": -0.2875286340713501, "rewards/rejected": 3.1386618614196777, "step": 15910 }, { "epoch": 0.7391243790333814, "grad_norm": 4.716796398162842, "learning_rate": 3.768977204141325e-07, "logits/chosen": -19.219053268432617, "logits/rejected": -17.680904388427734, "logps/chosen": -488.7005310058594, "logps/rejected": -329.363037109375, "loss": 0.7329, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.062674045562744, "rewards/margins": 0.9866873621940613, "rewards/rejected": 2.075986862182617, "step": 15920 }, { "epoch": 0.7395886531408143, "grad_norm": 95.89491271972656, "learning_rate": 3.76820341396227e-07, "logits/chosen": -19.30388832092285, "logits/rejected": -18.411706924438477, "logps/chosen": -376.52435302734375, "logps/rejected": -291.4986267089844, "loss": 0.4476, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.1183671951293945, "rewards/margins": 0.9943356513977051, "rewards/rejected": 2.1240315437316895, "step": 15930 }, 
{ "epoch": 0.7400529272482473, "grad_norm": 232.52943420410156, "learning_rate": 3.7674296237832146e-07, "logits/chosen": -17.43320083618164, "logits/rejected": -16.311779022216797, "logps/chosen": -484.93359375, "logps/rejected": -381.61041259765625, "loss": 0.5198, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7305328845977783, "rewards/margins": 1.2235389947891235, "rewards/rejected": 2.5069940090179443, "step": 15940 }, { "epoch": 0.7405172013556804, "grad_norm": 256.9583435058594, "learning_rate": 3.76665583360416e-07, "logits/chosen": -18.628103256225586, "logits/rejected": -18.363101959228516, "logps/chosen": -407.4488830566406, "logps/rejected": -394.81634521484375, "loss": 1.0568, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 3.023681402206421, "rewards/margins": -0.14443129301071167, "rewards/rejected": 3.1681129932403564, "step": 15950 }, { "epoch": 0.7409814754631134, "grad_norm": 264.2769470214844, "learning_rate": 3.765882043425105e-07, "logits/chosen": -17.821168899536133, "logits/rejected": -18.085351943969727, "logps/chosen": -413.48297119140625, "logps/rejected": -416.6343688964844, "loss": 1.3663, "rewards/accuracies": 0.5, "rewards/chosen": 2.977856159210205, "rewards/margins": -0.42009544372558594, "rewards/rejected": 3.397951602935791, "step": 15960 }, { "epoch": 0.7414457495705464, "grad_norm": 131.3363494873047, "learning_rate": 3.76510825324605e-07, "logits/chosen": -18.717891693115234, "logits/rejected": -19.014545440673828, "logps/chosen": -461.7554626464844, "logps/rejected": -442.9458923339844, "loss": 0.5541, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.670342206954956, "rewards/margins": 0.8498266339302063, "rewards/rejected": 2.8205156326293945, "step": 15970 }, { "epoch": 0.7419100236779794, "grad_norm": 4.5396246910095215, "learning_rate": 3.7643344630669946e-07, "logits/chosen": -18.819616317749023, "logits/rejected": -17.306808471679688, "logps/chosen": -379.210693359375, 
"logps/rejected": -294.56951904296875, "loss": 0.5352, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1368191242218018, "rewards/margins": 1.181457757949829, "rewards/rejected": 1.9553611278533936, "step": 15980 }, { "epoch": 0.7423742977854125, "grad_norm": 83.91033172607422, "learning_rate": 3.763560672887939e-07, "logits/chosen": -19.24197769165039, "logits/rejected": -18.24625015258789, "logps/chosen": -287.8626708984375, "logps/rejected": -240.12496948242188, "loss": 0.5565, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.7323007583618164, "rewards/margins": 0.6521545052528381, "rewards/rejected": 2.080146312713623, "step": 15990 }, { "epoch": 0.7428385718928455, "grad_norm": 148.67861938476562, "learning_rate": 3.762786882708884e-07, "logits/chosen": -19.22125816345215, "logits/rejected": -18.51628875732422, "logps/chosen": -429.1526794433594, "logps/rejected": -389.28240966796875, "loss": 0.5982, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5950591564178467, "rewards/margins": 1.1267359256744385, "rewards/rejected": 2.468322992324829, "step": 16000 }, { "epoch": 0.7433028460002785, "grad_norm": 20.837736129760742, "learning_rate": 3.7620130925298294e-07, "logits/chosen": -18.476661682128906, "logits/rejected": -17.454004287719727, "logps/chosen": -428.0116271972656, "logps/rejected": -294.70428466796875, "loss": 0.5986, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.332143783569336, "rewards/margins": 0.9150193929672241, "rewards/rejected": 2.4171245098114014, "step": 16010 }, { "epoch": 0.7437671201077116, "grad_norm": 48.117347717285156, "learning_rate": 3.7612393023507745e-07, "logits/chosen": -19.201709747314453, "logits/rejected": -18.028488159179688, "logps/chosen": -362.14874267578125, "logps/rejected": -259.2877197265625, "loss": 0.4295, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5514798164367676, "rewards/margins": 0.9110479354858398, "rewards/rejected": 2.6404318809509277, 
"step": 16020 }, { "epoch": 0.7442313942151446, "grad_norm": 155.15638732910156, "learning_rate": 3.7604655121717196e-07, "logits/chosen": -18.317485809326172, "logits/rejected": -18.23223876953125, "logps/chosen": -303.7671813964844, "logps/rejected": -260.8275451660156, "loss": 0.6217, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.004070281982422, "rewards/margins": 1.139730453491211, "rewards/rejected": 1.86434006690979, "step": 16030 }, { "epoch": 0.7446956683225776, "grad_norm": 0.48632270097732544, "learning_rate": 3.759691721992664e-07, "logits/chosen": -19.09604835510254, "logits/rejected": -18.36777687072754, "logps/chosen": -319.3634033203125, "logps/rejected": -257.2613830566406, "loss": 0.5115, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.7021446228027344, "rewards/margins": 0.895252525806427, "rewards/rejected": 1.8068923950195312, "step": 16040 }, { "epoch": 0.7451599424300107, "grad_norm": 0.10541127622127533, "learning_rate": 3.7589179318136093e-07, "logits/chosen": -18.117618560791016, "logits/rejected": -17.552453994750977, "logps/chosen": -301.795654296875, "logps/rejected": -251.6852569580078, "loss": 0.6631, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.731184482574463, "rewards/margins": 0.8327955007553101, "rewards/rejected": 1.8983891010284424, "step": 16050 }, { "epoch": 0.7456242165374437, "grad_norm": 9.012005805969238, "learning_rate": 3.7581441416345544e-07, "logits/chosen": -18.546884536743164, "logits/rejected": -18.534622192382812, "logps/chosen": -412.04266357421875, "logps/rejected": -437.6070251464844, "loss": 0.6996, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.3484623432159424, "rewards/margins": 0.539939284324646, "rewards/rejected": 2.808523178100586, "step": 16060 }, { "epoch": 0.7460884906448767, "grad_norm": 126.37149047851562, "learning_rate": 3.7573703514554995e-07, "logits/chosen": -18.993663787841797, "logits/rejected": -18.440975189208984, "logps/chosen": 
-381.55657958984375, "logps/rejected": -338.42974853515625, "loss": 0.6671, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.274491786956787, "rewards/margins": 0.6879843473434448, "rewards/rejected": 2.586507797241211, "step": 16070 }, { "epoch": 0.7465527647523098, "grad_norm": 8.345849990844727, "learning_rate": 3.756596561276444e-07, "logits/chosen": -18.326738357543945, "logits/rejected": -18.22298240661621, "logps/chosen": -412.1217346191406, "logps/rejected": -325.56463623046875, "loss": 0.8241, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.042147636413574, "rewards/margins": 0.785075843334198, "rewards/rejected": 2.2570719718933105, "step": 16080 }, { "epoch": 0.7470170388597428, "grad_norm": 37.362945556640625, "learning_rate": 3.7558227710973887e-07, "logits/chosen": -19.08456802368164, "logits/rejected": -17.884220123291016, "logps/chosen": -437.18389892578125, "logps/rejected": -326.00445556640625, "loss": 0.421, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.9338009357452393, "rewards/margins": 1.25716233253479, "rewards/rejected": 2.6766390800476074, "step": 16090 }, { "epoch": 0.7474813129671758, "grad_norm": 142.23287963867188, "learning_rate": 3.755048980918334e-07, "logits/chosen": -18.270709991455078, "logits/rejected": -17.442237854003906, "logps/chosen": -464.64453125, "logps/rejected": -415.466064453125, "loss": 0.8643, "rewards/accuracies": 0.5, "rewards/chosen": 3.3467857837677, "rewards/margins": 0.5777734518051147, "rewards/rejected": 2.769012212753296, "step": 16100 }, { "epoch": 0.7479455870746089, "grad_norm": 30.25118064880371, "learning_rate": 3.754275190739279e-07, "logits/chosen": -19.04925537109375, "logits/rejected": -17.960918426513672, "logps/chosen": -398.1144714355469, "logps/rejected": -374.8863830566406, "loss": 1.1158, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.771125078201294, "rewards/margins": 0.6661617159843445, "rewards/rejected": 3.1049630641937256, "step": 
16110 }, { "epoch": 0.7484098611820419, "grad_norm": 80.97867584228516, "learning_rate": 3.753501400560224e-07, "logits/chosen": -18.08340835571289, "logits/rejected": -17.70756721496582, "logps/chosen": -452.6171875, "logps/rejected": -446.3008728027344, "loss": 1.0107, "rewards/accuracies": 0.5, "rewards/chosen": 2.870302438735962, "rewards/margins": -0.14183633029460907, "rewards/rejected": 3.012138843536377, "step": 16120 }, { "epoch": 0.7488741352894749, "grad_norm": 165.04859924316406, "learning_rate": 3.752727610381169e-07, "logits/chosen": -17.787494659423828, "logits/rejected": -17.497230529785156, "logps/chosen": -402.9199523925781, "logps/rejected": -361.33026123046875, "loss": 1.283, "rewards/accuracies": 0.5, "rewards/chosen": 2.743652105331421, "rewards/margins": 0.16957740485668182, "rewards/rejected": 2.5740742683410645, "step": 16130 }, { "epoch": 0.749338409396908, "grad_norm": 113.2457046508789, "learning_rate": 3.7519538202021137e-07, "logits/chosen": -19.58363914489746, "logits/rejected": -19.340290069580078, "logps/chosen": -392.86700439453125, "logps/rejected": -310.38909912109375, "loss": 0.6878, "rewards/accuracies": 0.5, "rewards/chosen": 3.2089290618896484, "rewards/margins": 0.8337615728378296, "rewards/rejected": 2.3751673698425293, "step": 16140 }, { "epoch": 0.749802683504341, "grad_norm": 145.404052734375, "learning_rate": 3.751180030023059e-07, "logits/chosen": -18.334375381469727, "logits/rejected": -18.217058181762695, "logps/chosen": -489.40185546875, "logps/rejected": -468.66058349609375, "loss": 0.9192, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.148073434829712, "rewards/margins": -0.07470735907554626, "rewards/rejected": 3.22278094291687, "step": 16150 }, { "epoch": 0.750266957611774, "grad_norm": 99.3172378540039, "learning_rate": 3.750406239844004e-07, "logits/chosen": -18.558124542236328, "logits/rejected": -18.175506591796875, "logps/chosen": -522.5951538085938, "logps/rejected": -478.5899353027344, 
"loss": 0.4733, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.3876423835754395, "rewards/margins": 1.0557947158813477, "rewards/rejected": 3.331847667694092, "step": 16160 }, { "epoch": 0.750731231719207, "grad_norm": 31.70897102355957, "learning_rate": 3.749632449664949e-07, "logits/chosen": -19.17701530456543, "logits/rejected": -19.463655471801758, "logps/chosen": -423.36773681640625, "logps/rejected": -419.99884033203125, "loss": 0.5674, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.9061636924743652, "rewards/margins": 0.5849446654319763, "rewards/rejected": 3.321218967437744, "step": 16170 }, { "epoch": 0.7511955058266401, "grad_norm": 12.040492057800293, "learning_rate": 3.7488586594858936e-07, "logits/chosen": -18.765201568603516, "logits/rejected": -18.388320922851562, "logps/chosen": -351.5335388183594, "logps/rejected": -315.7381286621094, "loss": 1.2893, "rewards/accuracies": 0.5, "rewards/chosen": 2.2479190826416016, "rewards/margins": 0.15897436439990997, "rewards/rejected": 2.088944673538208, "step": 16180 }, { "epoch": 0.7516597799340731, "grad_norm": 0.9034532904624939, "learning_rate": 3.748084869306838e-07, "logits/chosen": -18.398807525634766, "logits/rejected": -17.751012802124023, "logps/chosen": -374.2983703613281, "logps/rejected": -315.95428466796875, "loss": 0.984, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": 2.8323652744293213, "rewards/margins": 0.08653593063354492, "rewards/rejected": 2.7458293437957764, "step": 16190 }, { "epoch": 0.7521240540415061, "grad_norm": 87.45581817626953, "learning_rate": 3.7473110791277833e-07, "logits/chosen": -17.37148666381836, "logits/rejected": -17.24142074584961, "logps/chosen": -335.74749755859375, "logps/rejected": -267.6145935058594, "loss": 1.0119, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.4028480052948, "rewards/margins": -0.15966784954071045, "rewards/rejected": 2.5625157356262207, "step": 16200 }, { "epoch": 0.7525883281489392, 
"grad_norm": 146.7230224609375, "learning_rate": 3.7465372889487285e-07, "logits/chosen": -17.997615814208984, "logits/rejected": -17.522706985473633, "logps/chosen": -350.9251403808594, "logps/rejected": -293.63946533203125, "loss": 0.7383, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.148372173309326, "rewards/margins": 0.9445985555648804, "rewards/rejected": 2.2037737369537354, "step": 16210 }, { "epoch": 0.7530526022563722, "grad_norm": 64.09014892578125, "learning_rate": 3.7457634987696736e-07, "logits/chosen": -17.7290096282959, "logits/rejected": -17.803747177124023, "logps/chosen": -345.203857421875, "logps/rejected": -336.8634338378906, "loss": 0.8567, "rewards/accuracies": 0.5, "rewards/chosen": 2.4353890419006348, "rewards/margins": 0.1249847412109375, "rewards/rejected": 2.3104043006896973, "step": 16220 }, { "epoch": 0.7535168763638052, "grad_norm": 3.9457144737243652, "learning_rate": 3.7449897085906187e-07, "logits/chosen": -18.66000747680664, "logits/rejected": -18.1181697845459, "logps/chosen": -421.56500244140625, "logps/rejected": -288.43829345703125, "loss": 0.5666, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.7941501140594482, "rewards/margins": 1.5251449346542358, "rewards/rejected": 2.2690048217773438, "step": 16230 }, { "epoch": 0.7539811504712383, "grad_norm": 128.1388702392578, "learning_rate": 3.7442159184115633e-07, "logits/chosen": -18.014942169189453, "logits/rejected": -17.131502151489258, "logps/chosen": -437.44580078125, "logps/rejected": -316.41546630859375, "loss": 0.6262, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.7539496421813965, "rewards/margins": 0.692961573600769, "rewards/rejected": 2.060987949371338, "step": 16240 }, { "epoch": 0.7544454245786713, "grad_norm": 34.44248962402344, "learning_rate": 3.7434421282325084e-07, "logits/chosen": -18.66328239440918, "logits/rejected": -17.667545318603516, "logps/chosen": -300.6846618652344, "logps/rejected": -229.3472442626953, 
"loss": 0.4737, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.1614856719970703, "rewards/margins": 0.9861801266670227, "rewards/rejected": 1.1753054857254028, "step": 16250 }, { "epoch": 0.7549096986861042, "grad_norm": 18.527503967285156, "learning_rate": 3.7426683380534535e-07, "logits/chosen": -18.53089141845703, "logits/rejected": -18.20538330078125, "logps/chosen": -394.7339782714844, "logps/rejected": -338.1903381347656, "loss": 0.7285, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.3950436115264893, "rewards/margins": 0.6548125147819519, "rewards/rejected": 2.7402305603027344, "step": 16260 }, { "epoch": 0.7553739727935374, "grad_norm": 9.668484687805176, "learning_rate": 3.741894547874398e-07, "logits/chosen": -17.63571548461914, "logits/rejected": -18.414350509643555, "logps/chosen": -390.2013854980469, "logps/rejected": -410.47930908203125, "loss": 1.3004, "rewards/accuracies": 0.5, "rewards/chosen": 3.268378734588623, "rewards/margins": -0.35254842042922974, "rewards/rejected": 3.620927333831787, "step": 16270 }, { "epoch": 0.7558382469009703, "grad_norm": 48.694522857666016, "learning_rate": 3.741120757695343e-07, "logits/chosen": -19.463342666625977, "logits/rejected": -18.5288143157959, "logps/chosen": -398.01519775390625, "logps/rejected": -267.9700622558594, "loss": 0.4557, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.490889310836792, "rewards/margins": 1.3875701427459717, "rewards/rejected": 2.1033191680908203, "step": 16280 }, { "epoch": 0.7563025210084033, "grad_norm": 52.72922897338867, "learning_rate": 3.740346967516288e-07, "logits/chosen": -19.069570541381836, "logits/rejected": -18.951873779296875, "logps/chosen": -416.83123779296875, "logps/rejected": -372.44049072265625, "loss": 0.8304, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.745201826095581, "rewards/margins": 0.08391833305358887, "rewards/rejected": 2.661283493041992, "step": 16290 }, { "epoch": 0.7567667951158364, 
"grad_norm": 109.03861236572266, "learning_rate": 3.739573177337233e-07, "logits/chosen": -18.635623931884766, "logits/rejected": -17.977066040039062, "logps/chosen": -288.05328369140625, "logps/rejected": -271.94805908203125, "loss": 0.7947, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.269644260406494, "rewards/margins": 0.26928621530532837, "rewards/rejected": 2.0003581047058105, "step": 16300 }, { "epoch": 0.7572310692232694, "grad_norm": 18.51483154296875, "learning_rate": 3.738799387158178e-07, "logits/chosen": -18.456241607666016, "logits/rejected": -17.469762802124023, "logps/chosen": -415.68255615234375, "logps/rejected": -254.88882446289062, "loss": 0.2046, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.708599805831909, "rewards/margins": 1.92890202999115, "rewards/rejected": 1.7796977758407593, "step": 16310 }, { "epoch": 0.7576953433307024, "grad_norm": 115.86930847167969, "learning_rate": 3.738025596979123e-07, "logits/chosen": -18.73055648803711, "logits/rejected": -19.182680130004883, "logps/chosen": -414.36981201171875, "logps/rejected": -400.91302490234375, "loss": 1.2538, "rewards/accuracies": 0.5, "rewards/chosen": 3.0067086219787598, "rewards/margins": -0.2905738353729248, "rewards/rejected": 3.2972824573516846, "step": 16320 }, { "epoch": 0.7581596174381354, "grad_norm": 48.03522491455078, "learning_rate": 3.737251806800068e-07, "logits/chosen": -18.194133758544922, "logits/rejected": -17.824098587036133, "logps/chosen": -339.80462646484375, "logps/rejected": -276.7853698730469, "loss": 0.7125, "rewards/accuracies": 0.5, "rewards/chosen": 2.4211416244506836, "rewards/margins": 0.39896249771118164, "rewards/rejected": 2.022179126739502, "step": 16330 }, { "epoch": 0.7586238915455685, "grad_norm": 115.98823547363281, "learning_rate": 3.736478016621013e-07, "logits/chosen": -18.7420597076416, "logits/rejected": -18.257030487060547, "logps/chosen": -347.5916748046875, "logps/rejected": -310.0517883300781, "loss": 
0.7071, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.6679248809814453, "rewards/margins": 0.7107422351837158, "rewards/rejected": 1.95718252658844, "step": 16340 }, { "epoch": 0.7590881656530015, "grad_norm": 45.78954315185547, "learning_rate": 3.7357816054598636e-07, "logits/chosen": -18.560815811157227, "logits/rejected": -19.593135833740234, "logps/chosen": -406.78338623046875, "logps/rejected": -431.830078125, "loss": 1.1902, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.524075984954834, "rewards/margins": 0.07907428592443466, "rewards/rejected": 2.4450018405914307, "step": 16350 }, { "epoch": 0.7595524397604345, "grad_norm": 38.36417770385742, "learning_rate": 3.7350078152808087e-07, "logits/chosen": -19.493398666381836, "logits/rejected": -18.966781616210938, "logps/chosen": -448.320068359375, "logps/rejected": -338.23199462890625, "loss": 0.576, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7749569416046143, "rewards/margins": 0.9330512285232544, "rewards/rejected": 2.8419055938720703, "step": 16360 }, { "epoch": 0.7600167138678676, "grad_norm": 30.740467071533203, "learning_rate": 3.734234025101753e-07, "logits/chosen": -18.694223403930664, "logits/rejected": -18.53398323059082, "logps/chosen": -330.053955078125, "logps/rejected": -365.3149108886719, "loss": 0.5999, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.459050178527832, "rewards/margins": 0.5814303159713745, "rewards/rejected": 1.877619743347168, "step": 16370 }, { "epoch": 0.7604809879753006, "grad_norm": 172.533447265625, "learning_rate": 3.7334602349226984e-07, "logits/chosen": -18.265634536743164, "logits/rejected": -17.895423889160156, "logps/chosen": -312.80841064453125, "logps/rejected": -335.9197082519531, "loss": 0.8483, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.6104702949523926, "rewards/margins": 0.05590682104229927, "rewards/rejected": 2.554563045501709, "step": 16380 }, { "epoch": 0.7609452620827336, 
"grad_norm": 34.38004684448242, "learning_rate": 3.732686444743643e-07, "logits/chosen": -19.11611557006836, "logits/rejected": -18.296756744384766, "logps/chosen": -381.5154113769531, "logps/rejected": -368.1029968261719, "loss": 0.835, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.294785261154175, "rewards/margins": 0.39678916335105896, "rewards/rejected": 2.897996425628662, "step": 16390 }, { "epoch": 0.7614095361901667, "grad_norm": 52.09027099609375, "learning_rate": 3.731912654564588e-07, "logits/chosen": -18.55512237548828, "logits/rejected": -18.4899845123291, "logps/chosen": -334.1646728515625, "logps/rejected": -252.8363037109375, "loss": 0.7013, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.810192108154297, "rewards/margins": 0.7432482242584229, "rewards/rejected": 2.066943883895874, "step": 16400 }, { "epoch": 0.7618738102975997, "grad_norm": 205.30235290527344, "learning_rate": 3.731138864385533e-07, "logits/chosen": -18.001224517822266, "logits/rejected": -17.603042602539062, "logps/chosen": -530.0806274414062, "logps/rejected": -473.5965270996094, "loss": 0.8038, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.5230185985565186, "rewards/margins": 0.2877620756626129, "rewards/rejected": 3.2352569103240967, "step": 16410 }, { "epoch": 0.7623380844050327, "grad_norm": 85.66031646728516, "learning_rate": 3.730365074206478e-07, "logits/chosen": -16.749452590942383, "logits/rejected": -16.86060333251953, "logps/chosen": -249.9086151123047, "logps/rejected": -267.5411071777344, "loss": 1.3595, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.5248291492462158, "rewards/margins": -0.47576481103897095, "rewards/rejected": 2.000593900680542, "step": 16420 }, { "epoch": 0.7628023585124658, "grad_norm": 20.28447151184082, "learning_rate": 3.729591284027423e-07, "logits/chosen": -19.796276092529297, "logits/rejected": -18.751338958740234, "logps/chosen": -484.5428161621094, "logps/rejected": 
-370.83587646484375, "loss": 0.6499, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.7335827350616455, "rewards/margins": 0.4716666340827942, "rewards/rejected": 3.261915922164917, "step": 16430 }, { "epoch": 0.7632666326198988, "grad_norm": 74.37694549560547, "learning_rate": 3.728817493848368e-07, "logits/chosen": -18.673641204833984, "logits/rejected": -17.856828689575195, "logps/chosen": -352.2718200683594, "logps/rejected": -263.5316467285156, "loss": 0.7226, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.8383243083953857, "rewards/margins": 1.0488686561584473, "rewards/rejected": 1.7894556522369385, "step": 16440 }, { "epoch": 0.7637309067273318, "grad_norm": 45.504486083984375, "learning_rate": 3.728043703669313e-07, "logits/chosen": -19.084476470947266, "logits/rejected": -17.563745498657227, "logps/chosen": -435.58258056640625, "logps/rejected": -296.3247985839844, "loss": 0.5298, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4008851051330566, "rewards/margins": 1.2032158374786377, "rewards/rejected": 2.197669506072998, "step": 16450 }, { "epoch": 0.7641951808347649, "grad_norm": 118.94733428955078, "learning_rate": 3.727269913490258e-07, "logits/chosen": -19.403339385986328, "logits/rejected": -19.02237319946289, "logps/chosen": -496.7117614746094, "logps/rejected": -536.9572143554688, "loss": 0.6871, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.8594844341278076, "rewards/margins": 0.5733109712600708, "rewards/rejected": 3.2861735820770264, "step": 16460 }, { "epoch": 0.7646594549421979, "grad_norm": 105.87174224853516, "learning_rate": 3.726496123311203e-07, "logits/chosen": -18.810165405273438, "logits/rejected": -18.297420501708984, "logps/chosen": -396.9971618652344, "logps/rejected": -390.46087646484375, "loss": 0.7136, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.07319974899292, "rewards/margins": 0.32500648498535156, "rewards/rejected": 2.7481932640075684, "step": 16470 }, { 
"epoch": 0.7651237290496309, "grad_norm": 128.2729949951172, "learning_rate": 3.7257223331321474e-07, "logits/chosen": -18.41961097717285, "logits/rejected": -18.332252502441406, "logps/chosen": -430.539794921875, "logps/rejected": -411.1524353027344, "loss": 0.9137, "rewards/accuracies": 0.5, "rewards/chosen": 3.3570289611816406, "rewards/margins": 0.37052565813064575, "rewards/rejected": 2.9865028858184814, "step": 16480 }, { "epoch": 0.7655880031570639, "grad_norm": 57.52654266357422, "learning_rate": 3.7249485429530925e-07, "logits/chosen": -19.280498504638672, "logits/rejected": -19.05074691772461, "logps/chosen": -379.83172607421875, "logps/rejected": -287.8446350097656, "loss": 0.5273, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2766213417053223, "rewards/margins": 1.1241670846939087, "rewards/rejected": 2.152454137802124, "step": 16490 }, { "epoch": 0.766052277264497, "grad_norm": 67.45594024658203, "learning_rate": 3.7241747527740376e-07, "logits/chosen": -17.998634338378906, "logits/rejected": -17.27014923095703, "logps/chosen": -424.46990966796875, "logps/rejected": -331.36865234375, "loss": 1.1158, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.4232935905456543, "rewards/margins": -0.09785406291484833, "rewards/rejected": 2.5211477279663086, "step": 16500 }, { "epoch": 0.76651655137193, "grad_norm": 50.84742736816406, "learning_rate": 3.7234009625949827e-07, "logits/chosen": -18.932437896728516, "logits/rejected": -17.670412063598633, "logps/chosen": -483.0298767089844, "logps/rejected": -415.22930908203125, "loss": 0.5851, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.9706263542175293, "rewards/margins": 0.9618377685546875, "rewards/rejected": 3.0087890625, "step": 16510 }, { "epoch": 0.766980825479363, "grad_norm": 2.8580286502838135, "learning_rate": 3.7226271724159273e-07, "logits/chosen": -18.358158111572266, "logits/rejected": -16.698368072509766, "logps/chosen": -408.6665954589844, "logps/rejected": 
-242.0374298095703, "loss": 0.391, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.094723701477051, "rewards/margins": 1.249753475189209, "rewards/rejected": 1.8449703454971313, "step": 16520 }, { "epoch": 0.7674450995867961, "grad_norm": 37.69233703613281, "learning_rate": 3.7218533822368724e-07, "logits/chosen": -18.398414611816406, "logits/rejected": -18.52005386352539, "logps/chosen": -413.33831787109375, "logps/rejected": -334.2989196777344, "loss": 1.043, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.560492992401123, "rewards/margins": -0.21112790703773499, "rewards/rejected": 2.7716212272644043, "step": 16530 }, { "epoch": 0.7679093736942291, "grad_norm": 19.705076217651367, "learning_rate": 3.7210795920578175e-07, "logits/chosen": -18.656198501586914, "logits/rejected": -17.64181137084961, "logps/chosen": -397.2981872558594, "logps/rejected": -300.9312744140625, "loss": 0.7425, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.615079164505005, "rewards/margins": 1.2314951419830322, "rewards/rejected": 2.3835840225219727, "step": 16540 }, { "epoch": 0.7683736478016621, "grad_norm": 46.35065841674805, "learning_rate": 3.7203058018787626e-07, "logits/chosen": -19.466876983642578, "logits/rejected": -18.013042449951172, "logps/chosen": -433.5679626464844, "logps/rejected": -292.39337158203125, "loss": 0.3718, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.517230987548828, "rewards/margins": 1.3468986749649048, "rewards/rejected": 2.170332431793213, "step": 16550 }, { "epoch": 0.7688379219090952, "grad_norm": 158.73748779296875, "learning_rate": 3.719532011699708e-07, "logits/chosen": -18.569826126098633, "logits/rejected": -17.066041946411133, "logps/chosen": -513.504638671875, "logps/rejected": -314.46380615234375, "loss": 0.4422, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.388081073760986, "rewards/margins": 2.400996446609497, "rewards/rejected": 1.9870846271514893, "step": 16560 }, { 
"epoch": 0.7693021960165282, "grad_norm": 65.77783966064453, "learning_rate": 3.7187582215206523e-07, "logits/chosen": -19.822559356689453, "logits/rejected": -17.9339542388916, "logps/chosen": -467.52508544921875, "logps/rejected": -381.1185607910156, "loss": 0.6028, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.5898730754852295, "rewards/margins": 0.5593188405036926, "rewards/rejected": 3.0305542945861816, "step": 16570 }, { "epoch": 0.7697664701239612, "grad_norm": 97.62476348876953, "learning_rate": 3.717984431341597e-07, "logits/chosen": -19.374759674072266, "logits/rejected": -18.492389678955078, "logps/chosen": -446.77587890625, "logps/rejected": -342.3914794921875, "loss": 0.6134, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.611567974090576, "rewards/margins": 0.570270836353302, "rewards/rejected": 3.041297435760498, "step": 16580 }, { "epoch": 0.7702307442313943, "grad_norm": 48.29018020629883, "learning_rate": 3.717210641162542e-07, "logits/chosen": -18.38627815246582, "logits/rejected": -18.673913955688477, "logps/chosen": -317.7245788574219, "logps/rejected": -333.0750732421875, "loss": 0.6808, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.9808349609375, "rewards/margins": 0.36134734749794006, "rewards/rejected": 2.619488000869751, "step": 16590 }, { "epoch": 0.7706950183388273, "grad_norm": 42.937889099121094, "learning_rate": 3.716436850983487e-07, "logits/chosen": -19.481531143188477, "logits/rejected": -19.3614559173584, "logps/chosen": -400.5350341796875, "logps/rejected": -395.5550842285156, "loss": 1.0407, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.8196756839752197, "rewards/margins": -0.22717027366161346, "rewards/rejected": 3.0468459129333496, "step": 16600 }, { "epoch": 0.7711592924462602, "grad_norm": 2.01161789894104, "learning_rate": 3.7156630608044323e-07, "logits/chosen": -20.267789840698242, "logits/rejected": -19.08050537109375, "logps/chosen": -494.3296813964844, 
"logps/rejected": -326.1000671386719, "loss": 0.3953, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.6977856159210205, "rewards/margins": 1.7475426197052002, "rewards/rejected": 1.9502429962158203, "step": 16610 }, { "epoch": 0.7716235665536934, "grad_norm": 156.49571228027344, "learning_rate": 3.714889270625377e-07, "logits/chosen": -19.62137222290039, "logits/rejected": -18.015155792236328, "logps/chosen": -345.1329650878906, "logps/rejected": -239.9719696044922, "loss": 0.4744, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9291155338287354, "rewards/margins": 1.3476059436798096, "rewards/rejected": 1.5815093517303467, "step": 16620 }, { "epoch": 0.7720878406611263, "grad_norm": 20.94009017944336, "learning_rate": 3.714115480446322e-07, "logits/chosen": -18.595964431762695, "logits/rejected": -17.945053100585938, "logps/chosen": -398.1592712402344, "logps/rejected": -321.50848388671875, "loss": 0.7305, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.8995845317840576, "rewards/margins": 0.39028313755989075, "rewards/rejected": 2.5093014240264893, "step": 16630 }, { "epoch": 0.7725521147685593, "grad_norm": 89.80442810058594, "learning_rate": 3.713341690267267e-07, "logits/chosen": -18.661680221557617, "logits/rejected": -17.69895362854004, "logps/chosen": -504.7498474121094, "logps/rejected": -401.5198974609375, "loss": 0.3078, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.9986958503723145, "rewards/margins": 1.5291752815246582, "rewards/rejected": 2.469520330429077, "step": 16640 }, { "epoch": 0.7730163888759923, "grad_norm": 22.392858505249023, "learning_rate": 3.712567900088212e-07, "logits/chosen": -18.471420288085938, "logits/rejected": -18.120817184448242, "logps/chosen": -416.41748046875, "logps/rejected": -335.18048095703125, "loss": 0.7445, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.4806747436523438, "rewards/margins": 0.8323801755905151, "rewards/rejected": 2.6482949256896973, 
"step": 16650 }, { "epoch": 0.7734806629834254, "grad_norm": 41.90387725830078, "learning_rate": 3.7117941099091573e-07, "logits/chosen": -18.6142520904541, "logits/rejected": -17.893688201904297, "logps/chosen": -413.53082275390625, "logps/rejected": -363.8460693359375, "loss": 0.5551, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0872912406921387, "rewards/margins": 0.8909636735916138, "rewards/rejected": 2.1963276863098145, "step": 16660 }, { "epoch": 0.7739449370908584, "grad_norm": 160.02236938476562, "learning_rate": 3.7110203197301014e-07, "logits/chosen": -18.068580627441406, "logits/rejected": -17.70782470703125, "logps/chosen": -439.99560546875, "logps/rejected": -309.04345703125, "loss": 0.5012, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.7710611820220947, "rewards/margins": 0.7773879766464233, "rewards/rejected": 1.993673324584961, "step": 16670 }, { "epoch": 0.7744092111982914, "grad_norm": 39.89379119873047, "learning_rate": 3.7102465295510465e-07, "logits/chosen": -19.39585304260254, "logits/rejected": -18.2496280670166, "logps/chosen": -500.68890380859375, "logps/rejected": -320.5164489746094, "loss": 0.5369, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6143875122070312, "rewards/margins": 1.03689706325531, "rewards/rejected": 2.5774905681610107, "step": 16680 }, { "epoch": 0.7748734853057245, "grad_norm": 243.71900939941406, "learning_rate": 3.7094727393719916e-07, "logits/chosen": -18.297622680664062, "logits/rejected": -17.391921997070312, "logps/chosen": -379.93255615234375, "logps/rejected": -272.9139099121094, "loss": 0.4975, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.9401261806488037, "rewards/margins": 1.1500234603881836, "rewards/rejected": 1.7901026010513306, "step": 16690 }, { "epoch": 0.7753377594131575, "grad_norm": 237.37542724609375, "learning_rate": 3.7086989491929367e-07, "logits/chosen": -18.44847869873047, "logits/rejected": -18.465003967285156, "logps/chosen": 
-340.99432373046875, "logps/rejected": -373.7743225097656, "loss": 1.6343, "rewards/accuracies": 0.5, "rewards/chosen": 2.211129903793335, "rewards/margins": -0.7111583352088928, "rewards/rejected": 2.922288417816162, "step": 16700 }, { "epoch": 0.7758020335205905, "grad_norm": 78.15621185302734, "learning_rate": 3.707925159013882e-07, "logits/chosen": -19.51565933227539, "logits/rejected": -19.147968292236328, "logps/chosen": -500.47564697265625, "logps/rejected": -503.18328857421875, "loss": 0.8133, "rewards/accuracies": 0.5, "rewards/chosen": 3.8412182331085205, "rewards/margins": -0.05179031938314438, "rewards/rejected": 3.8930084705352783, "step": 16710 }, { "epoch": 0.7762663076280236, "grad_norm": 4.620089054107666, "learning_rate": 3.7071513688348264e-07, "logits/chosen": -18.837095260620117, "logits/rejected": -18.151973724365234, "logps/chosen": -434.87738037109375, "logps/rejected": -348.2838439941406, "loss": 0.6026, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.2519264221191406, "rewards/margins": 0.8684452772140503, "rewards/rejected": 2.38348126411438, "step": 16720 }, { "epoch": 0.7767305817354566, "grad_norm": 279.9046325683594, "learning_rate": 3.7063775786557715e-07, "logits/chosen": -18.67568016052246, "logits/rejected": -17.681570053100586, "logps/chosen": -467.8724670410156, "logps/rejected": -350.3869323730469, "loss": 0.6526, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.6044578552246094, "rewards/margins": 0.930755615234375, "rewards/rejected": 2.6737020015716553, "step": 16730 }, { "epoch": 0.7771948558428896, "grad_norm": 52.01385498046875, "learning_rate": 3.7056037884767166e-07, "logits/chosen": -20.006038665771484, "logits/rejected": -18.018924713134766, "logps/chosen": -383.9704284667969, "logps/rejected": -256.3211364746094, "loss": 0.3616, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.980966567993164, "rewards/margins": 1.7735042572021484, "rewards/rejected": 2.2074618339538574, "step": 
16740 }, { "epoch": 0.7776591299503227, "grad_norm": 124.23320007324219, "learning_rate": 3.7048299982976617e-07, "logits/chosen": -18.995777130126953, "logits/rejected": -18.48480224609375, "logps/chosen": -426.50079345703125, "logps/rejected": -424.592529296875, "loss": 0.551, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.7831313610076904, "rewards/margins": 0.6646147966384888, "rewards/rejected": 2.118516445159912, "step": 16750 }, { "epoch": 0.7781234040577557, "grad_norm": 44.995567321777344, "learning_rate": 3.704056208118607e-07, "logits/chosen": -17.958375930786133, "logits/rejected": -17.77159881591797, "logps/chosen": -311.9774169921875, "logps/rejected": -306.9745178222656, "loss": 0.9858, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.4685521125793457, "rewards/margins": -0.03852654621005058, "rewards/rejected": 2.5070786476135254, "step": 16760 }, { "epoch": 0.7785876781651887, "grad_norm": 207.058837890625, "learning_rate": 3.703282417939551e-07, "logits/chosen": -19.38995933532715, "logits/rejected": -19.017717361450195, "logps/chosen": -501.8827209472656, "logps/rejected": -457.74420166015625, "loss": 0.8201, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2317395210266113, "rewards/margins": -0.04721547290682793, "rewards/rejected": 3.2789547443389893, "step": 16770 }, { "epoch": 0.7790519522726218, "grad_norm": 71.35236358642578, "learning_rate": 3.702508627760496e-07, "logits/chosen": -19.085857391357422, "logits/rejected": -18.145254135131836, "logps/chosen": -328.3251953125, "logps/rejected": -322.56109619140625, "loss": 0.9516, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.491237163543701, "rewards/margins": 0.009445840492844582, "rewards/rejected": 2.4817910194396973, "step": 16780 }, { "epoch": 0.7795162263800548, "grad_norm": 10.759553909301758, "learning_rate": 3.701734837581441e-07, "logits/chosen": -18.623075485229492, "logits/rejected": -18.404964447021484, "logps/chosen": 
-383.82342529296875, "logps/rejected": -349.325927734375, "loss": 0.6101, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.8802528381347656, "rewards/margins": 0.4378082752227783, "rewards/rejected": 2.4424448013305664, "step": 16790 }, { "epoch": 0.7799805004874878, "grad_norm": 4.916511058807373, "learning_rate": 3.700961047402386e-07, "logits/chosen": -18.119482040405273, "logits/rejected": -16.840869903564453, "logps/chosen": -368.33648681640625, "logps/rejected": -275.90350341796875, "loss": 0.6001, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.367966413497925, "rewards/margins": 1.8980919122695923, "rewards/rejected": 1.469874620437622, "step": 16800 }, { "epoch": 0.7804447745949208, "grad_norm": 156.88558959960938, "learning_rate": 3.7001872572233314e-07, "logits/chosen": -18.430530548095703, "logits/rejected": -18.090301513671875, "logps/chosen": -409.1077880859375, "logps/rejected": -416.203125, "loss": 1.1221, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 3.1431117057800293, "rewards/margins": -0.1378827691078186, "rewards/rejected": 3.280994415283203, "step": 16810 }, { "epoch": 0.7809090487023539, "grad_norm": 167.76797485351562, "learning_rate": 3.699413467044276e-07, "logits/chosen": -19.197893142700195, "logits/rejected": -19.09339141845703, "logps/chosen": -392.50201416015625, "logps/rejected": -403.5965881347656, "loss": 0.5304, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7745347023010254, "rewards/margins": 0.7364624738693237, "rewards/rejected": 3.038072347640991, "step": 16820 }, { "epoch": 0.7813733228097869, "grad_norm": 9.570202827453613, "learning_rate": 3.698639676865221e-07, "logits/chosen": -19.473949432373047, "logits/rejected": -18.891328811645508, "logps/chosen": -374.76409912109375, "logps/rejected": -316.19500732421875, "loss": 0.8213, "rewards/accuracies": 0.5, "rewards/chosen": 3.0463764667510986, "rewards/margins": 0.4480072557926178, "rewards/rejected": 
2.5983693599700928, "step": 16830 }, { "epoch": 0.7818375969172199, "grad_norm": 183.7618408203125, "learning_rate": 3.697865886686166e-07, "logits/chosen": -18.983312606811523, "logits/rejected": -17.793975830078125, "logps/chosen": -410.87518310546875, "logps/rejected": -291.926025390625, "loss": 0.426, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.2921268939971924, "rewards/margins": 1.1599605083465576, "rewards/rejected": 2.1321661472320557, "step": 16840 }, { "epoch": 0.782301871024653, "grad_norm": 181.28135681152344, "learning_rate": 3.6970920965071113e-07, "logits/chosen": -19.544530868530273, "logits/rejected": -19.380924224853516, "logps/chosen": -509.0315856933594, "logps/rejected": -454.97235107421875, "loss": 0.8146, "rewards/accuracies": 0.5, "rewards/chosen": 4.28007698059082, "rewards/margins": 0.44348496198654175, "rewards/rejected": 3.836592197418213, "step": 16850 }, { "epoch": 0.782766145132086, "grad_norm": 39.904319763183594, "learning_rate": 3.6963183063280564e-07, "logits/chosen": -17.847881317138672, "logits/rejected": -17.613605499267578, "logps/chosen": -356.2724609375, "logps/rejected": -279.08465576171875, "loss": 0.7178, "rewards/accuracies": 0.5, "rewards/chosen": 2.1444895267486572, "rewards/margins": 0.26978689432144165, "rewards/rejected": 1.8747024536132812, "step": 16860 }, { "epoch": 0.783230419239519, "grad_norm": 179.1421356201172, "learning_rate": 3.6955445161490004e-07, "logits/chosen": -19.42512321472168, "logits/rejected": -18.34636878967285, "logps/chosen": -452.21978759765625, "logps/rejected": -373.7919006347656, "loss": 0.7112, "rewards/accuracies": 0.5, "rewards/chosen": 2.9612033367156982, "rewards/margins": 0.2956983745098114, "rewards/rejected": 2.6655049324035645, "step": 16870 }, { "epoch": 0.7836946933469521, "grad_norm": 8.537676811218262, "learning_rate": 3.6947707259699456e-07, "logits/chosen": -19.138269424438477, "logits/rejected": -18.46921730041504, "logps/chosen": -585.9114990234375, 
"logps/rejected": -501.82489013671875, "loss": 0.9655, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.149900913238525, "rewards/margins": 0.3773755133152008, "rewards/rejected": 3.7725250720977783, "step": 16880 }, { "epoch": 0.7841589674543851, "grad_norm": 111.43295288085938, "learning_rate": 3.6939969357908907e-07, "logits/chosen": -18.709529876708984, "logits/rejected": -18.12649917602539, "logps/chosen": -423.9632263183594, "logps/rejected": -301.8017883300781, "loss": 0.5012, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.3776793479919434, "rewards/margins": 0.6475987434387207, "rewards/rejected": 2.7300806045532227, "step": 16890 }, { "epoch": 0.7846232415618181, "grad_norm": 38.8157958984375, "learning_rate": 3.693223145611836e-07, "logits/chosen": -20.465023040771484, "logits/rejected": -18.969219207763672, "logps/chosen": -362.10174560546875, "logps/rejected": -356.5533752441406, "loss": 0.5443, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.2888646125793457, "rewards/margins": 0.9132474064826965, "rewards/rejected": 2.375617027282715, "step": 16900 }, { "epoch": 0.7850875156692512, "grad_norm": 244.23818969726562, "learning_rate": 3.692449355432781e-07, "logits/chosen": -18.67569923400879, "logits/rejected": -18.82753562927246, "logps/chosen": -449.2188415527344, "logps/rejected": -469.9064025878906, "loss": 1.4181, "rewards/accuracies": 0.5, "rewards/chosen": 3.5217723846435547, "rewards/margins": -0.6303655505180359, "rewards/rejected": 4.1521382331848145, "step": 16910 }, { "epoch": 0.7855517897766842, "grad_norm": 47.240718841552734, "learning_rate": 3.6916755652537255e-07, "logits/chosen": -18.758365631103516, "logits/rejected": -18.424022674560547, "logps/chosen": -305.6860656738281, "logps/rejected": -330.20404052734375, "loss": 0.9682, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.3626880645751953, "rewards/margins": 0.2507881224155426, "rewards/rejected": 2.1119000911712646, "step": 
16920 }, { "epoch": 0.7860160638841172, "grad_norm": 71.75538635253906, "learning_rate": 3.6909017750746706e-07, "logits/chosen": -18.577861785888672, "logits/rejected": -18.550695419311523, "logps/chosen": -332.5016174316406, "logps/rejected": -342.4192810058594, "loss": 0.9428, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.2272791862487793, "rewards/margins": -0.2283935546875, "rewards/rejected": 2.4556727409362793, "step": 16930 }, { "epoch": 0.7864803379915503, "grad_norm": 68.27537536621094, "learning_rate": 3.6901279848956157e-07, "logits/chosen": -19.248241424560547, "logits/rejected": -17.85629653930664, "logps/chosen": -356.51104736328125, "logps/rejected": -242.89492797851562, "loss": 0.6239, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.2866268157958984, "rewards/margins": 0.5418905019760132, "rewards/rejected": 1.7447364330291748, "step": 16940 }, { "epoch": 0.7869446120989833, "grad_norm": 15.73214340209961, "learning_rate": 3.689354194716561e-07, "logits/chosen": -18.800861358642578, "logits/rejected": -17.489994049072266, "logps/chosen": -493.88153076171875, "logps/rejected": -345.62945556640625, "loss": 0.478, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.184195041656494, "rewards/margins": 1.0966498851776123, "rewards/rejected": 2.087545394897461, "step": 16950 }, { "epoch": 0.7874088862064162, "grad_norm": 99.01239013671875, "learning_rate": 3.6885804045375054e-07, "logits/chosen": -18.727832794189453, "logits/rejected": -18.173309326171875, "logps/chosen": -488.4600524902344, "logps/rejected": -387.4916076660156, "loss": 0.6901, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.943324089050293, "rewards/margins": 0.4189864695072174, "rewards/rejected": 2.5243380069732666, "step": 16960 }, { "epoch": 0.7878731603138492, "grad_norm": 8.666607856750488, "learning_rate": 3.68780661435845e-07, "logits/chosen": -18.831775665283203, "logits/rejected": -17.311689376831055, "logps/chosen": 
-440.0282287597656, "logps/rejected": -305.89990234375, "loss": 0.3334, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.2437527179718018, "rewards/margins": 1.4580821990966797, "rewards/rejected": 1.785670280456543, "step": 16970 }, { "epoch": 0.7883374344212823, "grad_norm": 37.684932708740234, "learning_rate": 3.687032824179395e-07, "logits/chosen": -19.569412231445312, "logits/rejected": -19.06698989868164, "logps/chosen": -451.8609313964844, "logps/rejected": -425.35467529296875, "loss": 0.5846, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.8363993167877197, "rewards/margins": 0.5839654207229614, "rewards/rejected": 2.2524337768554688, "step": 16980 }, { "epoch": 0.7888017085287153, "grad_norm": 4.6881208419799805, "learning_rate": 3.68625903400034e-07, "logits/chosen": -18.096431732177734, "logits/rejected": -18.245052337646484, "logps/chosen": -399.2272033691406, "logps/rejected": -459.71405029296875, "loss": 1.2819, "rewards/accuracies": 0.5, "rewards/chosen": 3.388563632965088, "rewards/margins": 0.09837895631790161, "rewards/rejected": 3.290184497833252, "step": 16990 }, { "epoch": 0.7892659826361483, "grad_norm": 51.8668098449707, "learning_rate": 3.6854852438212853e-07, "logits/chosen": -19.14689064025879, "logits/rejected": -18.576335906982422, "logps/chosen": -364.70147705078125, "logps/rejected": -351.69580078125, "loss": 0.5746, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.465599775314331, "rewards/margins": 0.3902338147163391, "rewards/rejected": 2.0753660202026367, "step": 17000 }, { "epoch": 0.7897302567435814, "grad_norm": 22.82953453063965, "learning_rate": 3.6847114536422304e-07, "logits/chosen": -18.430707931518555, "logits/rejected": -18.768306732177734, "logps/chosen": -303.19708251953125, "logps/rejected": -360.85882568359375, "loss": 0.8941, "rewards/accuracies": 0.5, "rewards/chosen": 2.043541193008423, "rewards/margins": -0.23052871227264404, "rewards/rejected": 2.2740697860717773, "step": 
17010 }, { "epoch": 0.7901945308510144, "grad_norm": 168.07330322265625, "learning_rate": 3.683937663463175e-07, "logits/chosen": -18.702083587646484, "logits/rejected": -17.954097747802734, "logps/chosen": -478.97802734375, "logps/rejected": -394.3238830566406, "loss": 0.6109, "rewards/accuracies": 0.5, "rewards/chosen": 3.3471827507019043, "rewards/margins": 0.6395015120506287, "rewards/rejected": 2.707681179046631, "step": 17020 }, { "epoch": 0.7906588049584474, "grad_norm": 15.677577018737793, "learning_rate": 3.68316387328412e-07, "logits/chosen": -19.350906372070312, "logits/rejected": -17.27475929260254, "logps/chosen": -393.9870300292969, "logps/rejected": -254.84628295898438, "loss": 0.5599, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6656558513641357, "rewards/margins": 1.4422112703323364, "rewards/rejected": 2.223444700241089, "step": 17030 }, { "epoch": 0.7911230790658805, "grad_norm": 158.979248046875, "learning_rate": 3.682390083105065e-07, "logits/chosen": -17.374664306640625, "logits/rejected": -18.170066833496094, "logps/chosen": -278.21661376953125, "logps/rejected": -340.3543395996094, "loss": 1.433, "rewards/accuracies": 0.5, "rewards/chosen": 2.5823018550872803, "rewards/margins": -0.7365843057632446, "rewards/rejected": 3.3188858032226562, "step": 17040 }, { "epoch": 0.7915873531733135, "grad_norm": 3.2039377689361572, "learning_rate": 3.6816162929260104e-07, "logits/chosen": -18.765155792236328, "logits/rejected": -18.02987289428711, "logps/chosen": -337.3922119140625, "logps/rejected": -216.75576782226562, "loss": 0.5034, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.1550514698028564, "rewards/margins": 1.1534672975540161, "rewards/rejected": 2.001584053039551, "step": 17050 }, { "epoch": 0.7920516272807465, "grad_norm": 86.20304107666016, "learning_rate": 3.680842502746955e-07, "logits/chosen": -18.898578643798828, "logits/rejected": -17.530372619628906, "logps/chosen": -390.1005554199219, "logps/rejected": 
-285.64599609375, "loss": 0.6578, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.500694513320923, "rewards/margins": 0.4376789629459381, "rewards/rejected": 2.0630156993865967, "step": 17060 }, { "epoch": 0.7925159013881796, "grad_norm": 233.41287231445312, "learning_rate": 3.6800687125678995e-07, "logits/chosen": -17.77606964111328, "logits/rejected": -18.081968307495117, "logps/chosen": -371.72735595703125, "logps/rejected": -353.4335021972656, "loss": 0.9971, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2020270824432373, "rewards/margins": -0.008582914248108864, "rewards/rejected": 3.2106106281280518, "step": 17070 }, { "epoch": 0.7929801754956126, "grad_norm": 76.5995864868164, "learning_rate": 3.6792949223888446e-07, "logits/chosen": -18.381498336791992, "logits/rejected": -17.402679443359375, "logps/chosen": -392.28118896484375, "logps/rejected": -215.5244140625, "loss": 0.6124, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.0794835090637207, "rewards/margins": 0.5570220351219177, "rewards/rejected": 2.522461414337158, "step": 17080 }, { "epoch": 0.7934444496030456, "grad_norm": 68.27207946777344, "learning_rate": 3.67852113220979e-07, "logits/chosen": -18.568832397460938, "logits/rejected": -17.728260040283203, "logps/chosen": -468.2103576660156, "logps/rejected": -342.35882568359375, "loss": 0.824, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.1103708744049072, "rewards/margins": 0.5148126482963562, "rewards/rejected": 2.595557928085327, "step": 17090 }, { "epoch": 0.7939087237104787, "grad_norm": 39.002174377441406, "learning_rate": 3.677747342030735e-07, "logits/chosen": -18.790903091430664, "logits/rejected": -17.76511001586914, "logps/chosen": -324.5384826660156, "logps/rejected": -242.1614532470703, "loss": 0.5386, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.320646286010742, "rewards/margins": 1.1943238973617554, "rewards/rejected": 2.1263222694396973, "step": 17100 }, { 
"epoch": 0.7943729978179117, "grad_norm": 95.08338928222656, "learning_rate": 3.67697355185168e-07, "logits/chosen": -19.11130142211914, "logits/rejected": -18.987462997436523, "logps/chosen": -406.949462890625, "logps/rejected": -279.7423400878906, "loss": 0.9842, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5658836364746094, "rewards/margins": 0.5160263180732727, "rewards/rejected": 3.0498576164245605, "step": 17110 }, { "epoch": 0.7948372719253447, "grad_norm": 147.26553344726562, "learning_rate": 3.6761997616726246e-07, "logits/chosen": -19.223373413085938, "logits/rejected": -17.803768157958984, "logps/chosen": -445.4386291503906, "logps/rejected": -310.55859375, "loss": 0.7158, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.5692524909973145, "rewards/margins": 1.4697659015655518, "rewards/rejected": 2.099486827850342, "step": 17120 }, { "epoch": 0.7953015460327778, "grad_norm": 48.38590621948242, "learning_rate": 3.6754259714935697e-07, "logits/chosen": -19.382661819458008, "logits/rejected": -17.716522216796875, "logps/chosen": -371.76385498046875, "logps/rejected": -229.65966796875, "loss": 0.2954, "rewards/accuracies": 1.0, "rewards/chosen": 2.7209970951080322, "rewards/margins": 1.5194475650787354, "rewards/rejected": 1.2015492916107178, "step": 17130 }, { "epoch": 0.7957658201402108, "grad_norm": 179.3529815673828, "learning_rate": 3.674652181314515e-07, "logits/chosen": -18.62579345703125, "logits/rejected": -18.180816650390625, "logps/chosen": -446.49432373046875, "logps/rejected": -369.7568359375, "loss": 0.7201, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.9197769165039062, "rewards/margins": 0.5331894159317017, "rewards/rejected": 2.386587381362915, "step": 17140 }, { "epoch": 0.7962300942476438, "grad_norm": 255.97946166992188, "learning_rate": 3.6738783911354594e-07, "logits/chosen": -20.276973724365234, "logits/rejected": -19.021678924560547, "logps/chosen": -377.300048828125, "logps/rejected": 
-286.3412780761719, "loss": 0.4942, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.325981855392456, "rewards/margins": 1.0520014762878418, "rewards/rejected": 2.2739808559417725, "step": 17150 }, { "epoch": 0.7966943683550768, "grad_norm": 178.07177734375, "learning_rate": 3.6731046009564045e-07, "logits/chosen": -18.025503158569336, "logits/rejected": -18.209060668945312, "logps/chosen": -434.838623046875, "logps/rejected": -398.506591796875, "loss": 0.9133, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.0483508110046387, "rewards/margins": 0.1277155876159668, "rewards/rejected": 2.920635223388672, "step": 17160 }, { "epoch": 0.7971586424625099, "grad_norm": 170.1132049560547, "learning_rate": 3.672330810777349e-07, "logits/chosen": -20.50143051147461, "logits/rejected": -19.246143341064453, "logps/chosen": -459.39471435546875, "logps/rejected": -328.7828063964844, "loss": 0.7296, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2536442279815674, "rewards/margins": 0.5215862393379211, "rewards/rejected": 2.732058048248291, "step": 17170 }, { "epoch": 0.7976229165699429, "grad_norm": 117.62957000732422, "learning_rate": 3.671557020598294e-07, "logits/chosen": -18.816608428955078, "logits/rejected": -17.419692993164062, "logps/chosen": -418.7305603027344, "logps/rejected": -279.6915588378906, "loss": 0.6776, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.2402358055114746, "rewards/margins": 0.9052095413208008, "rewards/rejected": 2.3350260257720947, "step": 17180 }, { "epoch": 0.7980871906773759, "grad_norm": 41.11665725708008, "learning_rate": 3.6707832304192393e-07, "logits/chosen": -18.365537643432617, "logits/rejected": -17.934959411621094, "logps/chosen": -287.33563232421875, "logps/rejected": -260.0262756347656, "loss": 0.8751, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.0496773719787598, "rewards/margins": -0.12649628520011902, "rewards/rejected": 2.176173686981201, "step": 17190 }, { 
"epoch": 0.798551464784809, "grad_norm": 17.47264862060547, "learning_rate": 3.6700094402401844e-07, "logits/chosen": -19.126628875732422, "logits/rejected": -18.8614559173584, "logps/chosen": -390.63238525390625, "logps/rejected": -315.5010986328125, "loss": 0.7065, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.732713222503662, "rewards/margins": 0.6907591223716736, "rewards/rejected": 3.041954517364502, "step": 17200 }, { "epoch": 0.799015738892242, "grad_norm": 141.02906799316406, "learning_rate": 3.6692356500611295e-07, "logits/chosen": -18.925067901611328, "logits/rejected": -18.02942657470703, "logps/chosen": -560.1561279296875, "logps/rejected": -445.16558837890625, "loss": 0.8192, "rewards/accuracies": 0.5, "rewards/chosen": 3.950568675994873, "rewards/margins": 0.2319965660572052, "rewards/rejected": 3.7185726165771484, "step": 17210 }, { "epoch": 0.799480012999675, "grad_norm": 65.90121459960938, "learning_rate": 3.668461859882074e-07, "logits/chosen": -19.315719604492188, "logits/rejected": -18.803600311279297, "logps/chosen": -292.94146728515625, "logps/rejected": -282.8856201171875, "loss": 0.7221, "rewards/accuracies": 0.5, "rewards/chosen": 2.537010431289673, "rewards/margins": 0.1306535303592682, "rewards/rejected": 2.4063572883605957, "step": 17220 }, { "epoch": 0.7999442871071081, "grad_norm": 150.45571899414062, "learning_rate": 3.667688069703019e-07, "logits/chosen": -19.053150177001953, "logits/rejected": -18.668291091918945, "logps/chosen": -313.6492919921875, "logps/rejected": -272.073974609375, "loss": 1.0599, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.98494553565979, "rewards/margins": 0.3403531312942505, "rewards/rejected": 2.64459228515625, "step": 17230 }, { "epoch": 0.8004085612145411, "grad_norm": 68.96466064453125, "learning_rate": 3.6669142795239643e-07, "logits/chosen": -17.844348907470703, "logits/rejected": -18.16799545288086, "logps/chosen": -325.6741638183594, "logps/rejected": 
-368.15447998046875, "loss": 0.7178, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9233057498931885, "rewards/margins": 0.39292266964912415, "rewards/rejected": 2.530383348464966, "step": 17240 }, { "epoch": 0.8008728353219741, "grad_norm": 123.4859848022461, "learning_rate": 3.666140489344909e-07, "logits/chosen": -19.715824127197266, "logits/rejected": -18.251171112060547, "logps/chosen": -474.87908935546875, "logps/rejected": -366.8836364746094, "loss": 0.7152, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.6455509662628174, "rewards/margins": 0.500095009803772, "rewards/rejected": 3.145455837249756, "step": 17250 }, { "epoch": 0.8013371094294072, "grad_norm": 198.25772094726562, "learning_rate": 3.665366699165854e-07, "logits/chosen": -19.660751342773438, "logits/rejected": -19.228530883789062, "logps/chosen": -394.1851501464844, "logps/rejected": -341.90899658203125, "loss": 0.8372, "rewards/accuracies": 0.5, "rewards/chosen": 2.739755630493164, "rewards/margins": 0.08841691166162491, "rewards/rejected": 2.651339054107666, "step": 17260 }, { "epoch": 0.8018013835368402, "grad_norm": 3.3530936241149902, "learning_rate": 3.6645929089867986e-07, "logits/chosen": -18.652738571166992, "logits/rejected": -17.07553482055664, "logps/chosen": -570.0421142578125, "logps/rejected": -351.53778076171875, "loss": 0.4149, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.600430727005005, "rewards/margins": 1.3235896825790405, "rewards/rejected": 2.276840925216675, "step": 17270 }, { "epoch": 0.8022656576442732, "grad_norm": 210.26646423339844, "learning_rate": 3.6638191188077437e-07, "logits/chosen": -19.770126342773438, "logits/rejected": -18.629648208618164, "logps/chosen": -446.2027282714844, "logps/rejected": -313.1768798828125, "loss": 0.8272, "rewards/accuracies": 0.5, "rewards/chosen": 3.2529845237731934, "rewards/margins": 0.4932378828525543, "rewards/rejected": 2.759746789932251, "step": 17280 }, { "epoch": 
0.8027299317517063, "grad_norm": 263.2604675292969, "learning_rate": 3.663045328628689e-07, "logits/chosen": -19.694747924804688, "logits/rejected": -19.838022232055664, "logps/chosen": -448.69549560546875, "logps/rejected": -509.7550354003906, "loss": 0.8452, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 3.89951753616333, "rewards/margins": 0.1480875313282013, "rewards/rejected": 3.751430034637451, "step": 17290 }, { "epoch": 0.8031942058591393, "grad_norm": 118.40717315673828, "learning_rate": 3.662271538449634e-07, "logits/chosen": -18.794950485229492, "logits/rejected": -17.934553146362305, "logps/chosen": -428.67218017578125, "logps/rejected": -362.6529541015625, "loss": 0.5667, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.298882007598877, "rewards/margins": 1.1935590505599976, "rewards/rejected": 3.1053225994110107, "step": 17300 }, { "epoch": 0.8036584799665722, "grad_norm": 11.406267166137695, "learning_rate": 3.661497748270579e-07, "logits/chosen": -19.415264129638672, "logits/rejected": -17.543712615966797, "logps/chosen": -393.8030090332031, "logps/rejected": -232.62326049804688, "loss": 0.4072, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.467738628387451, "rewards/margins": 1.078179121017456, "rewards/rejected": 1.3895593881607056, "step": 17310 }, { "epoch": 0.8041227540740052, "grad_norm": 79.41277313232422, "learning_rate": 3.6607239580915236e-07, "logits/chosen": -19.150630950927734, "logits/rejected": -19.463315963745117, "logps/chosen": -392.04376220703125, "logps/rejected": -403.35931396484375, "loss": 1.3536, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.642963409423828, "rewards/margins": -0.6696061491966248, "rewards/rejected": 3.3125693798065186, "step": 17320 }, { "epoch": 0.8045870281814383, "grad_norm": 214.45553588867188, "learning_rate": 3.659950167912469e-07, "logits/chosen": -19.100902557373047, "logits/rejected": -18.514404296875, "logps/chosen": -353.0729675292969, 
"logps/rejected": -288.7095642089844, "loss": 0.4616, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.605766773223877, "rewards/margins": 1.2911548614501953, "rewards/rejected": 2.3146114349365234, "step": 17330 }, { "epoch": 0.8050513022888713, "grad_norm": 96.50413513183594, "learning_rate": 3.6591763777334133e-07, "logits/chosen": -18.711458206176758, "logits/rejected": -17.452219009399414, "logps/chosen": -439.9866638183594, "logps/rejected": -285.1309509277344, "loss": 0.5182, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.5118820667266846, "rewards/margins": 1.4940465688705444, "rewards/rejected": 2.0178353786468506, "step": 17340 }, { "epoch": 0.8055155763963043, "grad_norm": 222.50057983398438, "learning_rate": 3.6584025875543585e-07, "logits/chosen": -19.02033805847168, "logits/rejected": -18.16778564453125, "logps/chosen": -431.00250244140625, "logps/rejected": -335.44525146484375, "loss": 0.6294, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.5882489681243896, "rewards/margins": 0.4183166027069092, "rewards/rejected": 2.1699323654174805, "step": 17350 }, { "epoch": 0.8059798505037374, "grad_norm": 108.91600036621094, "learning_rate": 3.6576287973753036e-07, "logits/chosen": -19.766124725341797, "logits/rejected": -19.085596084594727, "logps/chosen": -335.87762451171875, "logps/rejected": -272.04974365234375, "loss": 0.7163, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.8483872413635254, "rewards/margins": 0.581108570098877, "rewards/rejected": 2.2672784328460693, "step": 17360 }, { "epoch": 0.8064441246111704, "grad_norm": 174.41993713378906, "learning_rate": 3.656855007196248e-07, "logits/chosen": -19.458518981933594, "logits/rejected": -20.042232513427734, "logps/chosen": -377.10009765625, "logps/rejected": -406.1586608886719, "loss": 1.034, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": 2.899195909500122, "rewards/margins": -0.44183215498924255, "rewards/rejected": 
3.3410274982452393, "step": 17370 }, { "epoch": 0.8069083987186034, "grad_norm": 143.58737182617188, "learning_rate": 3.656081217017193e-07, "logits/chosen": -18.8912296295166, "logits/rejected": -17.960514068603516, "logps/chosen": -404.5416259765625, "logps/rejected": -294.63226318359375, "loss": 0.59, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1169285774230957, "rewards/margins": 1.2286860942840576, "rewards/rejected": 1.8882423639297485, "step": 17380 }, { "epoch": 0.8073726728260365, "grad_norm": 16.11229705810547, "learning_rate": 3.6553074268381384e-07, "logits/chosen": -19.782588958740234, "logits/rejected": -18.336687088012695, "logps/chosen": -519.6761474609375, "logps/rejected": -384.7137756347656, "loss": 0.4121, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.217297554016113, "rewards/margins": 1.1730667352676392, "rewards/rejected": 3.044231414794922, "step": 17390 }, { "epoch": 0.8078369469334695, "grad_norm": 28.905881881713867, "learning_rate": 3.6545336366590835e-07, "logits/chosen": -18.986698150634766, "logits/rejected": -18.04142951965332, "logps/chosen": -501.6766662597656, "logps/rejected": -395.6440124511719, "loss": 0.5409, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.4552812576293945, "rewards/margins": 1.3816108703613281, "rewards/rejected": 3.0736706256866455, "step": 17400 }, { "epoch": 0.8083012210409025, "grad_norm": 37.91797637939453, "learning_rate": 3.6537598464800286e-07, "logits/chosen": -18.97412109375, "logits/rejected": -17.593154907226562, "logps/chosen": -483.4608459472656, "logps/rejected": -325.4324951171875, "loss": 0.4013, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.056985378265381, "rewards/margins": 1.7377636432647705, "rewards/rejected": 2.3192219734191895, "step": 17410 }, { "epoch": 0.8087654951483356, "grad_norm": 104.1637191772461, "learning_rate": 3.6529860563009737e-07, "logits/chosen": -19.36649513244629, "logits/rejected": -18.896940231323242, 
"logps/chosen": -302.86846923828125, "logps/rejected": -232.6505889892578, "loss": 0.6513, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.3883607387542725, "rewards/margins": 0.8843925595283508, "rewards/rejected": 1.5039681196212769, "step": 17420 }, { "epoch": 0.8092297692557686, "grad_norm": 100.09062957763672, "learning_rate": 3.6522122661219183e-07, "logits/chosen": -19.896615982055664, "logits/rejected": -19.519916534423828, "logps/chosen": -417.8168029785156, "logps/rejected": -404.15106201171875, "loss": 0.709, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.3562235832214355, "rewards/margins": 0.22099769115447998, "rewards/rejected": 3.135225772857666, "step": 17430 }, { "epoch": 0.8096940433632016, "grad_norm": 112.79096221923828, "learning_rate": 3.651438475942863e-07, "logits/chosen": -18.655658721923828, "logits/rejected": -18.157136917114258, "logps/chosen": -320.8924865722656, "logps/rejected": -285.5638732910156, "loss": 0.9371, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.5139737129211426, "rewards/margins": 0.14300136268138885, "rewards/rejected": 2.3709728717803955, "step": 17440 }, { "epoch": 0.8101583174706347, "grad_norm": 36.302547454833984, "learning_rate": 3.650664685763808e-07, "logits/chosen": -19.344968795776367, "logits/rejected": -17.77931022644043, "logps/chosen": -347.4727478027344, "logps/rejected": -163.77029418945312, "loss": 0.5046, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.0242178440093994, "rewards/margins": 0.8640414476394653, "rewards/rejected": 1.1601765155792236, "step": 17450 }, { "epoch": 0.8106225915780677, "grad_norm": 109.0676498413086, "learning_rate": 3.649890895584753e-07, "logits/chosen": -20.099117279052734, "logits/rejected": -19.003009796142578, "logps/chosen": -422.08111572265625, "logps/rejected": -350.85614013671875, "loss": 0.796, "rewards/accuracies": 0.5, "rewards/chosen": 2.9444756507873535, "rewards/margins": 0.5656365156173706, 
"rewards/rejected": 2.3788390159606934, "step": 17460 }, { "epoch": 0.8110868656855007, "grad_norm": 42.18638610839844, "learning_rate": 3.6491171054056977e-07, "logits/chosen": -19.8617000579834, "logits/rejected": -18.893932342529297, "logps/chosen": -458.75640869140625, "logps/rejected": -309.35272216796875, "loss": 0.5757, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.054617404937744, "rewards/margins": 0.9598314166069031, "rewards/rejected": 3.0947859287261963, "step": 17470 }, { "epoch": 0.8115511397929337, "grad_norm": 43.06013107299805, "learning_rate": 3.648343315226643e-07, "logits/chosen": -17.986400604248047, "logits/rejected": -18.078163146972656, "logps/chosen": -268.6333923339844, "logps/rejected": -219.7499237060547, "loss": 0.8051, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.878208875656128, "rewards/margins": 0.6721294522285461, "rewards/rejected": 1.2060792446136475, "step": 17480 }, { "epoch": 0.8120154139003668, "grad_norm": 3.3117988109588623, "learning_rate": 3.647569525047588e-07, "logits/chosen": -18.043842315673828, "logits/rejected": -17.04024887084961, "logps/chosen": -437.5328063964844, "logps/rejected": -319.9841003417969, "loss": 0.8466, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.3040611743927, "rewards/margins": 1.5604193210601807, "rewards/rejected": 1.7436418533325195, "step": 17490 }, { "epoch": 0.8124796880077998, "grad_norm": 0.2530670762062073, "learning_rate": 3.646795734868533e-07, "logits/chosen": -19.11412239074707, "logits/rejected": -17.685535430908203, "logps/chosen": -318.9960632324219, "logps/rejected": -253.80419921875, "loss": 0.3872, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.912698745727539, "rewards/margins": 2.1213836669921875, "rewards/rejected": 1.7913148403167725, "step": 17500 }, { "epoch": 0.8129439621152328, "grad_norm": 3.9510343074798584, "learning_rate": 3.646021944689478e-07, "logits/chosen": -20.083553314208984, "logits/rejected": 
-18.979442596435547, "logps/chosen": -400.34844970703125, "logps/rejected": -295.642333984375, "loss": 0.7286, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.3238091468811035, "rewards/margins": 0.5937275290489197, "rewards/rejected": 1.7300817966461182, "step": 17510 }, { "epoch": 0.8134082362226659, "grad_norm": 71.39707946777344, "learning_rate": 3.645248154510423e-07, "logits/chosen": -19.198810577392578, "logits/rejected": -19.518789291381836, "logps/chosen": -441.963623046875, "logps/rejected": -488.0721740722656, "loss": 0.7909, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.7317147254943848, "rewards/margins": 0.20363518595695496, "rewards/rejected": 2.5280792713165283, "step": 17520 }, { "epoch": 0.8138725103300989, "grad_norm": 130.6284942626953, "learning_rate": 3.644474364331368e-07, "logits/chosen": -19.567913055419922, "logits/rejected": -18.189311981201172, "logps/chosen": -532.8329467773438, "logps/rejected": -302.53582763671875, "loss": 0.5204, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.9824328422546387, "rewards/margins": 1.5016249418258667, "rewards/rejected": 2.4808077812194824, "step": 17530 }, { "epoch": 0.8143367844375319, "grad_norm": 23.388690948486328, "learning_rate": 3.6437005741523124e-07, "logits/chosen": -18.314334869384766, "logits/rejected": -17.696714401245117, "logps/chosen": -250.29452514648438, "logps/rejected": -245.6472625732422, "loss": 0.5486, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.279531478881836, "rewards/margins": 0.5585641264915466, "rewards/rejected": 1.7209672927856445, "step": 17540 }, { "epoch": 0.814801058544965, "grad_norm": 116.29598999023438, "learning_rate": 3.6429267839732575e-07, "logits/chosen": -18.889545440673828, "logits/rejected": -18.282184600830078, "logps/chosen": -409.7106018066406, "logps/rejected": -340.00274658203125, "loss": 1.0359, "rewards/accuracies": 0.5, "rewards/chosen": 3.255528688430786, "rewards/margins": 
0.5487667322158813, "rewards/rejected": 2.7067618370056152, "step": 17550 }, { "epoch": 0.815265332652398, "grad_norm": 9.268139839172363, "learning_rate": 3.6421529937942026e-07, "logits/chosen": -18.364200592041016, "logits/rejected": -17.659564971923828, "logps/chosen": -328.4375, "logps/rejected": -202.23849487304688, "loss": 0.3866, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.9542155265808105, "rewards/margins": 1.5705242156982422, "rewards/rejected": 1.3836910724639893, "step": 17560 }, { "epoch": 0.815729606759831, "grad_norm": 195.085205078125, "learning_rate": 3.641379203615148e-07, "logits/chosen": -19.128604888916016, "logits/rejected": -18.480318069458008, "logps/chosen": -410.22686767578125, "logps/rejected": -353.43988037109375, "loss": 0.892, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.8814566135406494, "rewards/margins": 0.3978709876537323, "rewards/rejected": 2.4835855960845947, "step": 17570 }, { "epoch": 0.8161938808672641, "grad_norm": 28.917333602905273, "learning_rate": 3.6406054134360923e-07, "logits/chosen": -19.262611389160156, "logits/rejected": -18.199846267700195, "logps/chosen": -417.59906005859375, "logps/rejected": -321.11297607421875, "loss": 0.469, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4120452404022217, "rewards/margins": 1.0188887119293213, "rewards/rejected": 2.3931562900543213, "step": 17580 }, { "epoch": 0.8166581549746971, "grad_norm": 24.098623275756836, "learning_rate": 3.6398316232570375e-07, "logits/chosen": -18.769062042236328, "logits/rejected": -17.836055755615234, "logps/chosen": -439.63116455078125, "logps/rejected": -318.4593811035156, "loss": 0.6143, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.715630292892456, "rewards/margins": 0.6784874200820923, "rewards/rejected": 2.0371429920196533, "step": 17590 }, { "epoch": 0.8171224290821301, "grad_norm": 224.9571075439453, "learning_rate": 3.6390578330779826e-07, "logits/chosen": 
-19.522090911865234, "logits/rejected": -18.53895378112793, "logps/chosen": -438.9344177246094, "logps/rejected": -360.3341369628906, "loss": 1.0824, "rewards/accuracies": 0.5, "rewards/chosen": 3.170346975326538, "rewards/margins": 0.2882850766181946, "rewards/rejected": 2.8820619583129883, "step": 17600 }, { "epoch": 0.8175867031895632, "grad_norm": 19.96831512451172, "learning_rate": 3.6382840428989277e-07, "logits/chosen": -19.957256317138672, "logits/rejected": -18.41781234741211, "logps/chosen": -440.8045959472656, "logps/rejected": -282.40081787109375, "loss": 0.3598, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5769412517547607, "rewards/margins": 1.3930141925811768, "rewards/rejected": 2.183927059173584, "step": 17610 }, { "epoch": 0.8180509772969962, "grad_norm": 33.826969146728516, "learning_rate": 3.637510252719873e-07, "logits/chosen": -19.680763244628906, "logits/rejected": -19.394765853881836, "logps/chosen": -418.09161376953125, "logps/rejected": -400.0646057128906, "loss": 0.6285, "rewards/accuracies": 0.5, "rewards/chosen": 3.4700350761413574, "rewards/margins": 0.8055591583251953, "rewards/rejected": 2.664475917816162, "step": 17620 }, { "epoch": 0.8185152514044292, "grad_norm": 8.073883056640625, "learning_rate": 3.636736462540817e-07, "logits/chosen": -18.630216598510742, "logits/rejected": -18.184152603149414, "logps/chosen": -307.4790954589844, "logps/rejected": -314.3095703125, "loss": 0.7914, "rewards/accuracies": 0.5, "rewards/chosen": 2.537806987762451, "rewards/margins": 0.3518563210964203, "rewards/rejected": 2.185950756072998, "step": 17630 }, { "epoch": 0.8189795255118621, "grad_norm": 1.1814149618148804, "learning_rate": 3.635962672361762e-07, "logits/chosen": -19.019084930419922, "logits/rejected": -17.301774978637695, "logps/chosen": -379.7454833984375, "logps/rejected": -297.6884765625, "loss": 0.6065, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.3009560108184814, "rewards/margins": 
0.9434059262275696, "rewards/rejected": 2.3575501441955566, "step": 17640 }, { "epoch": 0.8194437996192953, "grad_norm": 129.76904296875, "learning_rate": 3.635188882182707e-07, "logits/chosen": -19.199928283691406, "logits/rejected": -18.097915649414062, "logps/chosen": -467.98492431640625, "logps/rejected": -310.81646728515625, "loss": 0.3577, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.3487319946289062, "rewards/margins": 1.3360841274261475, "rewards/rejected": 2.0126476287841797, "step": 17650 }, { "epoch": 0.8199080737267282, "grad_norm": 138.80743408203125, "learning_rate": 3.634415092003652e-07, "logits/chosen": -18.839641571044922, "logits/rejected": -18.42037582397461, "logps/chosen": -455.00732421875, "logps/rejected": -403.69140625, "loss": 0.719, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.901496171951294, "rewards/margins": 0.14726600050926208, "rewards/rejected": 3.75423002243042, "step": 17660 }, { "epoch": 0.8203723478341612, "grad_norm": 133.94256591796875, "learning_rate": 3.6336413018245973e-07, "logits/chosen": -18.62178611755371, "logits/rejected": -19.268259048461914, "logps/chosen": -305.72998046875, "logps/rejected": -398.62945556640625, "loss": 1.2798, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": 2.144155979156494, "rewards/margins": -0.8063758015632629, "rewards/rejected": 2.9505317211151123, "step": 17670 }, { "epoch": 0.8208366219415943, "grad_norm": 221.66073608398438, "learning_rate": 3.632867511645542e-07, "logits/chosen": -17.709117889404297, "logits/rejected": -17.316783905029297, "logps/chosen": -366.7735900878906, "logps/rejected": -310.6243896484375, "loss": 0.8036, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.3561956882476807, "rewards/margins": 0.5522516965866089, "rewards/rejected": 1.8039442300796509, "step": 17680 }, { "epoch": 0.8213008960490273, "grad_norm": 68.40084075927734, "learning_rate": 3.632093721466487e-07, "logits/chosen": -18.81900405883789, 
"logits/rejected": -17.956981658935547, "logps/chosen": -570.0345458984375, "logps/rejected": -407.654052734375, "loss": 0.5443, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.012427806854248, "rewards/margins": 1.0236343145370483, "rewards/rejected": 2.9887936115264893, "step": 17690 }, { "epoch": 0.8217651701564603, "grad_norm": 74.77749633789062, "learning_rate": 3.631319931287432e-07, "logits/chosen": -19.089773178100586, "logits/rejected": -19.02133560180664, "logps/chosen": -480.72442626953125, "logps/rejected": -373.67681884765625, "loss": 0.7842, "rewards/accuracies": 0.5, "rewards/chosen": 3.8646748065948486, "rewards/margins": 0.9355214238166809, "rewards/rejected": 2.9291534423828125, "step": 17700 }, { "epoch": 0.8222294442638934, "grad_norm": 226.1307830810547, "learning_rate": 3.630546141108377e-07, "logits/chosen": -19.08209800720215, "logits/rejected": -18.80097007751465, "logps/chosen": -412.1533203125, "logps/rejected": -348.29766845703125, "loss": 1.1712, "rewards/accuracies": 0.5, "rewards/chosen": 3.988299608230591, "rewards/margins": 0.6401683688163757, "rewards/rejected": 3.3481311798095703, "step": 17710 }, { "epoch": 0.8226937183713264, "grad_norm": 118.79356384277344, "learning_rate": 3.6297723509293223e-07, "logits/chosen": -18.55069351196289, "logits/rejected": -17.52127456665039, "logps/chosen": -448.1361389160156, "logps/rejected": -350.36151123046875, "loss": 0.6785, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.3290672302246094, "rewards/margins": 1.2210544347763062, "rewards/rejected": 2.108013153076172, "step": 17720 }, { "epoch": 0.8231579924787594, "grad_norm": 88.51306915283203, "learning_rate": 3.6289985607502664e-07, "logits/chosen": -19.730304718017578, "logits/rejected": -17.983043670654297, "logps/chosen": -471.7063903808594, "logps/rejected": -357.38348388671875, "loss": 0.4545, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.456419944763184, "rewards/margins": 1.703421950340271, 
"rewards/rejected": 2.7529985904693604, "step": 17730 }, { "epoch": 0.8236222665861925, "grad_norm": 68.14958953857422, "learning_rate": 3.6282247705712115e-07, "logits/chosen": -19.265766143798828, "logits/rejected": -18.031883239746094, "logps/chosen": -392.2452087402344, "logps/rejected": -344.84771728515625, "loss": 0.5735, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.7301065921783447, "rewards/margins": 0.5725570917129517, "rewards/rejected": 2.1575496196746826, "step": 17740 }, { "epoch": 0.8240865406936255, "grad_norm": 19.3207950592041, "learning_rate": 3.6274509803921566e-07, "logits/chosen": -19.156970977783203, "logits/rejected": -17.16951560974121, "logps/chosen": -472.3248596191406, "logps/rejected": -283.4795227050781, "loss": 0.4104, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.8693795204162598, "rewards/margins": 1.605176329612732, "rewards/rejected": 2.2642033100128174, "step": 17750 }, { "epoch": 0.8245508148010585, "grad_norm": 62.421958923339844, "learning_rate": 3.6266771902131017e-07, "logits/chosen": -18.81392478942871, "logits/rejected": -18.87396812438965, "logps/chosen": -320.7297058105469, "logps/rejected": -314.3053894042969, "loss": 1.0731, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.121431589126587, "rewards/margins": -0.13119235634803772, "rewards/rejected": 2.252624034881592, "step": 17760 }, { "epoch": 0.8250150889084916, "grad_norm": 100.16215515136719, "learning_rate": 3.625903400034047e-07, "logits/chosen": -18.9029541015625, "logits/rejected": -17.853973388671875, "logps/chosen": -361.9517822265625, "logps/rejected": -223.7719268798828, "loss": 0.4778, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.889030933380127, "rewards/margins": 1.2168233394622803, "rewards/rejected": 1.6722078323364258, "step": 17770 }, { "epoch": 0.8254793630159246, "grad_norm": 6.897837162017822, "learning_rate": 3.6251296098549914e-07, "logits/chosen": -18.7138671875, "logits/rejected": 
-17.942930221557617, "logps/chosen": -459.7940979003906, "logps/rejected": -306.58343505859375, "loss": 0.9009, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.796931266784668, "rewards/margins": 0.5851714611053467, "rewards/rejected": 2.2117598056793213, "step": 17780 }, { "epoch": 0.8259436371233576, "grad_norm": 72.32688903808594, "learning_rate": 3.6243558196759365e-07, "logits/chosen": -19.675312042236328, "logits/rejected": -18.305776596069336, "logps/chosen": -362.0281066894531, "logps/rejected": -262.98651123046875, "loss": 0.402, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.9055356979370117, "rewards/margins": 1.2496435642242432, "rewards/rejected": 1.6558921337127686, "step": 17790 }, { "epoch": 0.8264079112307906, "grad_norm": 90.56293487548828, "learning_rate": 3.6235820294968817e-07, "logits/chosen": -18.5398006439209, "logits/rejected": -17.497289657592773, "logps/chosen": -405.374755859375, "logps/rejected": -325.77838134765625, "loss": 0.4963, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0166749954223633, "rewards/margins": 1.1463395357131958, "rewards/rejected": 1.870335340499878, "step": 17800 }, { "epoch": 0.8268721853382237, "grad_norm": 13.794126510620117, "learning_rate": 3.622808239317827e-07, "logits/chosen": -19.197538375854492, "logits/rejected": -19.29339027404785, "logps/chosen": -311.7457275390625, "logps/rejected": -321.70361328125, "loss": 0.7923, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.61807918548584, "rewards/margins": 0.28710445761680603, "rewards/rejected": 2.330974817276001, "step": 17810 }, { "epoch": 0.8273364594456567, "grad_norm": 115.62446594238281, "learning_rate": 3.6220344491387713e-07, "logits/chosen": -18.433849334716797, "logits/rejected": -18.555856704711914, "logps/chosen": -383.1141052246094, "logps/rejected": -333.19451904296875, "loss": 0.7195, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.163231372833252, "rewards/margins": 
0.3650346100330353, "rewards/rejected": 2.79819655418396, "step": 17820 }, { "epoch": 0.8278007335530897, "grad_norm": 96.44693756103516, "learning_rate": 3.621260658959716e-07, "logits/chosen": -18.59191131591797, "logits/rejected": -18.748859405517578, "logps/chosen": -366.0691833496094, "logps/rejected": -402.1850891113281, "loss": 1.027, "rewards/accuracies": 0.5, "rewards/chosen": 2.527939558029175, "rewards/margins": -0.3680250942707062, "rewards/rejected": 2.8959646224975586, "step": 17830 }, { "epoch": 0.8282650076605228, "grad_norm": 168.44528198242188, "learning_rate": 3.620486868780661e-07, "logits/chosen": -18.91416358947754, "logits/rejected": -17.573884963989258, "logps/chosen": -463.09637451171875, "logps/rejected": -374.81793212890625, "loss": 0.3861, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.235844850540161, "rewards/margins": 1.3630555868148804, "rewards/rejected": 1.8727891445159912, "step": 17840 }, { "epoch": 0.8287292817679558, "grad_norm": 49.92173385620117, "learning_rate": 3.619713078601606e-07, "logits/chosen": -19.317577362060547, "logits/rejected": -19.409011840820312, "logps/chosen": -512.1732177734375, "logps/rejected": -453.313232421875, "loss": 0.6525, "rewards/accuracies": 0.5, "rewards/chosen": 3.8778018951416016, "rewards/margins": 0.23070462048053741, "rewards/rejected": 3.64709734916687, "step": 17850 }, { "epoch": 0.8291935558753888, "grad_norm": 4.147918224334717, "learning_rate": 3.6189392884225513e-07, "logits/chosen": -18.91799545288086, "logits/rejected": -17.79643440246582, "logps/chosen": -357.69281005859375, "logps/rejected": -292.04290771484375, "loss": 0.4666, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.692774534225464, "rewards/margins": 1.3485511541366577, "rewards/rejected": 2.344223737716675, "step": 17860 }, { "epoch": 0.8296578299828219, "grad_norm": 94.85391998291016, "learning_rate": 3.6181654982434964e-07, "logits/chosen": -19.44931411743164, "logits/rejected": 
-17.957866668701172, "logps/chosen": -483.45556640625, "logps/rejected": -321.6707458496094, "loss": 0.3586, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.872515678405762, "rewards/margins": 1.5867582559585571, "rewards/rejected": 3.285757064819336, "step": 17870 }, { "epoch": 0.8301221040902549, "grad_norm": 209.3882293701172, "learning_rate": 3.617391708064441e-07, "logits/chosen": -18.76111602783203, "logits/rejected": -18.41124725341797, "logps/chosen": -368.3700866699219, "logps/rejected": -320.95489501953125, "loss": 0.7072, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1547577381134033, "rewards/margins": 0.7428815960884094, "rewards/rejected": 2.4118762016296387, "step": 17880 }, { "epoch": 0.8305863781976879, "grad_norm": 9.191666603088379, "learning_rate": 3.616617917885386e-07, "logits/chosen": -18.61408042907715, "logits/rejected": -18.113265991210938, "logps/chosen": -310.15289306640625, "logps/rejected": -265.4166564941406, "loss": 0.8142, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.5863046646118164, "rewards/margins": 0.4468647539615631, "rewards/rejected": 2.1394400596618652, "step": 17890 }, { "epoch": 0.831050652305121, "grad_norm": 78.61103820800781, "learning_rate": 3.615844127706331e-07, "logits/chosen": -19.264257431030273, "logits/rejected": -19.470333099365234, "logps/chosen": -513.8077392578125, "logps/rejected": -532.8236083984375, "loss": 0.9542, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.627194881439209, "rewards/margins": 0.016409754753112793, "rewards/rejected": 3.6107852458953857, "step": 17900 }, { "epoch": 0.831514926412554, "grad_norm": 120.03347778320312, "learning_rate": 3.6150703375272763e-07, "logits/chosen": -19.205167770385742, "logits/rejected": -18.674331665039062, "logps/chosen": -353.4488220214844, "logps/rejected": -295.9338684082031, "loss": 0.8874, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.5728237628936768, "rewards/margins": 
0.13741593062877655, "rewards/rejected": 2.4354076385498047, "step": 17910 }, { "epoch": 0.831979200519987, "grad_norm": 64.05634307861328, "learning_rate": 3.614296547348221e-07, "logits/chosen": -18.74441909790039, "logits/rejected": -18.26758575439453, "logps/chosen": -394.6063232421875, "logps/rejected": -306.7464599609375, "loss": 0.6219, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.192220211029053, "rewards/margins": 1.5319479703903198, "rewards/rejected": 2.6602721214294434, "step": 17920 }, { "epoch": 0.8324434746274201, "grad_norm": 21.685693740844727, "learning_rate": 3.6135227571691655e-07, "logits/chosen": -19.04770851135254, "logits/rejected": -19.22740936279297, "logps/chosen": -311.57550048828125, "logps/rejected": -274.7691955566406, "loss": 0.6791, "rewards/accuracies": 0.5, "rewards/chosen": 2.2554898262023926, "rewards/margins": 0.18636909127235413, "rewards/rejected": 2.069120407104492, "step": 17930 }, { "epoch": 0.8329077487348531, "grad_norm": 82.80143737792969, "learning_rate": 3.6127489669901106e-07, "logits/chosen": -18.849430084228516, "logits/rejected": -18.185558319091797, "logps/chosen": -411.2428283691406, "logps/rejected": -375.72998046875, "loss": 0.6447, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.219411849975586, "rewards/margins": 0.9177610278129578, "rewards/rejected": 2.3016507625579834, "step": 17940 }, { "epoch": 0.8333720228422861, "grad_norm": 46.52943801879883, "learning_rate": 3.6119751768110557e-07, "logits/chosen": -20.095319747924805, "logits/rejected": -19.093778610229492, "logps/chosen": -299.10552978515625, "logps/rejected": -273.4186706542969, "loss": 0.4572, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.5792555809020996, "rewards/margins": 0.9121357202529907, "rewards/rejected": 1.6671199798583984, "step": 17950 }, { "epoch": 0.8338362969497192, "grad_norm": 69.26873016357422, "learning_rate": 3.611201386632001e-07, "logits/chosen": -18.904577255249023, 
"logits/rejected": -18.012638092041016, "logps/chosen": -487.1190490722656, "logps/rejected": -306.48455810546875, "loss": 0.7277, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4113681316375732, "rewards/margins": 1.174888253211975, "rewards/rejected": 2.236480236053467, "step": 17960 }, { "epoch": 0.8343005710571522, "grad_norm": 9.593716621398926, "learning_rate": 3.610427596452946e-07, "logits/chosen": -18.96564483642578, "logits/rejected": -19.022008895874023, "logps/chosen": -352.55706787109375, "logps/rejected": -382.2200622558594, "loss": 0.8465, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.085227966308594, "rewards/margins": 1.0332591533660889, "rewards/rejected": 3.0519683361053467, "step": 17970 }, { "epoch": 0.8347648451645852, "grad_norm": 127.4112777709961, "learning_rate": 3.6096538062738905e-07, "logits/chosen": -18.19517707824707, "logits/rejected": -17.236549377441406, "logps/chosen": -346.77777099609375, "logps/rejected": -245.83627319335938, "loss": 0.5725, "rewards/accuracies": 0.5, "rewards/chosen": 2.480055332183838, "rewards/margins": 0.993048369884491, "rewards/rejected": 1.4870067834854126, "step": 17980 }, { "epoch": 0.8352291192720182, "grad_norm": 25.56019401550293, "learning_rate": 3.6088800160948356e-07, "logits/chosen": -19.53061294555664, "logits/rejected": -18.162792205810547, "logps/chosen": -314.23858642578125, "logps/rejected": -196.69265747070312, "loss": 0.406, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.5238983631134033, "rewards/margins": 1.3179231882095337, "rewards/rejected": 1.2059751749038696, "step": 17990 }, { "epoch": 0.8356933933794513, "grad_norm": 35.301151275634766, "learning_rate": 3.6081062259157807e-07, "logits/chosen": -20.02125358581543, "logits/rejected": -18.124223709106445, "logps/chosen": -562.9440307617188, "logps/rejected": -389.1135559082031, "loss": 0.6244, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.062806129455566, "rewards/margins": 
0.8210130929946899, "rewards/rejected": 3.241793394088745, "step": 18000 }, { "epoch": 0.8361576674868842, "grad_norm": 15.728002548217773, "learning_rate": 3.607332435736726e-07, "logits/chosen": -18.53850555419922, "logits/rejected": -17.952770233154297, "logps/chosen": -499.1639709472656, "logps/rejected": -363.79302978515625, "loss": 0.3855, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.6717770099639893, "rewards/margins": 1.6102790832519531, "rewards/rejected": 2.061497688293457, "step": 18010 }, { "epoch": 0.8366219415943172, "grad_norm": 75.45146179199219, "learning_rate": 3.6065586455576704e-07, "logits/chosen": -18.057506561279297, "logits/rejected": -17.71622085571289, "logps/chosen": -308.4530944824219, "logps/rejected": -330.80352783203125, "loss": 0.7678, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.5078539848327637, "rewards/margins": 0.42636018991470337, "rewards/rejected": 2.081493616104126, "step": 18020 }, { "epoch": 0.8370862157017503, "grad_norm": 16.189918518066406, "learning_rate": 3.605784855378615e-07, "logits/chosen": -18.438106536865234, "logits/rejected": -16.97035789489746, "logps/chosen": -396.2374572753906, "logps/rejected": -212.5546417236328, "loss": 0.4124, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.149930000305176, "rewards/margins": 1.2566437721252441, "rewards/rejected": 0.8932863473892212, "step": 18030 }, { "epoch": 0.8375504898091833, "grad_norm": 112.55024719238281, "learning_rate": 3.60501106519956e-07, "logits/chosen": -19.266155242919922, "logits/rejected": -18.87531280517578, "logps/chosen": -379.93597412109375, "logps/rejected": -290.69989013671875, "loss": 0.7833, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.0087966918945312, "rewards/margins": 0.6535595655441284, "rewards/rejected": 2.3552374839782715, "step": 18040 }, { "epoch": 0.8380147639166163, "grad_norm": 34.31195831298828, "learning_rate": 3.604237275020505e-07, "logits/chosen": 
-18.618755340576172, "logits/rejected": -17.835681915283203, "logps/chosen": -449.237060546875, "logps/rejected": -343.0018615722656, "loss": 0.4132, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.1797313690185547, "rewards/margins": 0.9031848907470703, "rewards/rejected": 2.2765464782714844, "step": 18050 }, { "epoch": 0.8384790380240494, "grad_norm": 64.4149398803711, "learning_rate": 3.6034634848414504e-07, "logits/chosen": -18.88717269897461, "logits/rejected": -17.321414947509766, "logps/chosen": -424.77081298828125, "logps/rejected": -277.35784912109375, "loss": 0.4165, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.3598334789276123, "rewards/margins": 1.456963062286377, "rewards/rejected": 1.902870774269104, "step": 18060 }, { "epoch": 0.8389433121314824, "grad_norm": 44.10077667236328, "learning_rate": 3.6026896946623955e-07, "logits/chosen": -19.068965911865234, "logits/rejected": -18.92257308959961, "logps/chosen": -282.59619140625, "logps/rejected": -267.21014404296875, "loss": 0.5858, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.4224395751953125, "rewards/margins": 0.5071395039558411, "rewards/rejected": 1.9153000116348267, "step": 18070 }, { "epoch": 0.8394075862389154, "grad_norm": 18.549285888671875, "learning_rate": 3.60191590448334e-07, "logits/chosen": -19.531984329223633, "logits/rejected": -19.121023178100586, "logps/chosen": -367.8067626953125, "logps/rejected": -276.9659729003906, "loss": 0.493, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.7040417194366455, "rewards/margins": 1.4186551570892334, "rewards/rejected": 2.285386800765991, "step": 18080 }, { "epoch": 0.8398718603463485, "grad_norm": 27.378820419311523, "learning_rate": 3.601142114304285e-07, "logits/chosen": -18.04758644104004, "logits/rejected": -17.63262176513672, "logps/chosen": -332.9239501953125, "logps/rejected": -261.68060302734375, "loss": 0.516, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 
2.3742189407348633, "rewards/margins": 0.6890842318534851, "rewards/rejected": 1.6851345300674438, "step": 18090 }, { "epoch": 0.8403361344537815, "grad_norm": 23.116649627685547, "learning_rate": 3.6003683241252303e-07, "logits/chosen": -18.410297393798828, "logits/rejected": -16.90066146850586, "logps/chosen": -430.2911682128906, "logps/rejected": -339.7228088378906, "loss": 0.7013, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4488682746887207, "rewards/margins": 1.1272952556610107, "rewards/rejected": 2.321572780609131, "step": 18100 }, { "epoch": 0.8408004085612145, "grad_norm": 4.855068683624268, "learning_rate": 3.599594533946175e-07, "logits/chosen": -20.295255661010742, "logits/rejected": -19.412891387939453, "logps/chosen": -475.298828125, "logps/rejected": -249.25863647460938, "loss": 0.4543, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1538233757019043, "rewards/margins": 1.228567361831665, "rewards/rejected": 1.9252557754516602, "step": 18110 }, { "epoch": 0.8412646826686476, "grad_norm": 240.49813842773438, "learning_rate": 3.59882074376712e-07, "logits/chosen": -19.265954971313477, "logits/rejected": -18.407926559448242, "logps/chosen": -446.63287353515625, "logps/rejected": -394.54962158203125, "loss": 0.6031, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.3681368827819824, "rewards/margins": 0.9644439816474915, "rewards/rejected": 2.403693437576294, "step": 18120 }, { "epoch": 0.8417289567760806, "grad_norm": 12.650644302368164, "learning_rate": 3.5980469535880646e-07, "logits/chosen": -18.68663215637207, "logits/rejected": -17.8505916595459, "logps/chosen": -393.4013671875, "logps/rejected": -353.8941955566406, "loss": 0.4523, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.574333667755127, "rewards/margins": 1.3079335689544678, "rewards/rejected": 3.266400098800659, "step": 18130 }, { "epoch": 0.8421932308835136, "grad_norm": 62.96645736694336, "learning_rate": 3.5972731634090097e-07, 
"logits/chosen": -18.708934783935547, "logits/rejected": -18.02097511291504, "logps/chosen": -293.0089416503906, "logps/rejected": -203.61013793945312, "loss": 0.6278, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.2153549194335938, "rewards/margins": 0.6977160573005676, "rewards/rejected": 1.517638921737671, "step": 18140 }, { "epoch": 0.8426575049909466, "grad_norm": 33.790687561035156, "learning_rate": 3.596499373229955e-07, "logits/chosen": -18.115562438964844, "logits/rejected": -17.719064712524414, "logps/chosen": -462.07391357421875, "logps/rejected": -359.7337341308594, "loss": 0.8225, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.0084376335144043, "rewards/margins": 0.02425522729754448, "rewards/rejected": 2.984182834625244, "step": 18150 }, { "epoch": 0.8431217790983797, "grad_norm": 0.7267679572105408, "learning_rate": 3.5957255830509e-07, "logits/chosen": -17.951982498168945, "logits/rejected": -17.358478546142578, "logps/chosen": -441.9183044433594, "logps/rejected": -337.1123962402344, "loss": 0.7918, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.798447847366333, "rewards/margins": 1.3354089260101318, "rewards/rejected": 2.463038682937622, "step": 18160 }, { "epoch": 0.8435860532058127, "grad_norm": 64.23391723632812, "learning_rate": 3.594951792871845e-07, "logits/chosen": -18.351848602294922, "logits/rejected": -17.59843635559082, "logps/chosen": -510.80706787109375, "logps/rejected": -460.21484375, "loss": 0.7665, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.637207508087158, "rewards/margins": 1.0799472332000732, "rewards/rejected": 3.5572593212127686, "step": 18170 }, { "epoch": 0.8440503273132457, "grad_norm": 110.83611297607422, "learning_rate": 3.5941780026927896e-07, "logits/chosen": -17.963619232177734, "logits/rejected": -17.226781845092773, "logps/chosen": -430.7884216308594, "logps/rejected": -336.60943603515625, "loss": 0.5835, "rewards/accuracies": 0.6000000238418579, 
"rewards/chosen": 3.092339038848877, "rewards/margins": 0.8530033230781555, "rewards/rejected": 2.239335536956787, "step": 18180 }, { "epoch": 0.8445146014206788, "grad_norm": 18.096269607543945, "learning_rate": 3.5934042125137347e-07, "logits/chosen": -19.518386840820312, "logits/rejected": -19.082015991210938, "logps/chosen": -483.5362243652344, "logps/rejected": -605.114990234375, "loss": 0.5323, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.039307594299316, "rewards/margins": 1.0284560918807983, "rewards/rejected": 3.0108511447906494, "step": 18190 }, { "epoch": 0.8449788755281118, "grad_norm": 187.11036682128906, "learning_rate": 3.59263042233468e-07, "logits/chosen": -18.690383911132812, "logits/rejected": -18.24158477783203, "logps/chosen": -454.47998046875, "logps/rejected": -373.51910400390625, "loss": 1.0843, "rewards/accuracies": 0.5, "rewards/chosen": 4.059543609619141, "rewards/margins": 0.1685091257095337, "rewards/rejected": 3.8910346031188965, "step": 18200 }, { "epoch": 0.8454431496355448, "grad_norm": 1.3768317699432373, "learning_rate": 3.5918566321556244e-07, "logits/chosen": -19.192638397216797, "logits/rejected": -16.79098892211914, "logps/chosen": -517.0972290039062, "logps/rejected": -333.93634033203125, "loss": 0.2679, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.1923747062683105, "rewards/margins": 1.7539132833480835, "rewards/rejected": 2.4384617805480957, "step": 18210 }, { "epoch": 0.8459074237429779, "grad_norm": 260.71728515625, "learning_rate": 3.5910828419765695e-07, "logits/chosen": -20.46619987487793, "logits/rejected": -18.5942325592041, "logps/chosen": -482.04833984375, "logps/rejected": -425.73455810546875, "loss": 0.722, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.429147243499756, "rewards/margins": 0.41413623094558716, "rewards/rejected": 3.0150108337402344, "step": 18220 }, { "epoch": 0.8463716978504109, "grad_norm": 2.187870979309082, "learning_rate": 
3.590309051797514e-07, "logits/chosen": -18.84051513671875, "logits/rejected": -17.759479522705078, "logps/chosen": -485.123291015625, "logps/rejected": -382.3638610839844, "loss": 0.8705, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.900796890258789, "rewards/margins": 0.9521068334579468, "rewards/rejected": 2.9486899375915527, "step": 18230 }, { "epoch": 0.8468359719578439, "grad_norm": 24.55765151977539, "learning_rate": 3.589535261618459e-07, "logits/chosen": -19.66828727722168, "logits/rejected": -18.059553146362305, "logps/chosen": -487.03173828125, "logps/rejected": -320.0797119140625, "loss": 0.6212, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.472848892211914, "rewards/margins": 0.5029407739639282, "rewards/rejected": 2.9699082374572754, "step": 18240 }, { "epoch": 0.847300246065277, "grad_norm": 106.27790832519531, "learning_rate": 3.5887614714394043e-07, "logits/chosen": -18.400371551513672, "logits/rejected": -17.482580184936523, "logps/chosen": -294.69488525390625, "logps/rejected": -264.81988525390625, "loss": 0.8503, "rewards/accuracies": 0.5, "rewards/chosen": 2.0391108989715576, "rewards/margins": -0.17936013638973236, "rewards/rejected": 2.218471050262451, "step": 18250 }, { "epoch": 0.84776452017271, "grad_norm": 21.842395782470703, "learning_rate": 3.5879876812603494e-07, "logits/chosen": -17.752330780029297, "logits/rejected": -17.433076858520508, "logps/chosen": -311.4915771484375, "logps/rejected": -300.0271301269531, "loss": 0.7825, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.5828166007995605, "rewards/margins": 0.39503562450408936, "rewards/rejected": 2.18778133392334, "step": 18260 }, { "epoch": 0.848228794280143, "grad_norm": 85.78084564208984, "learning_rate": 3.5872138910812945e-07, "logits/chosen": -19.442459106445312, "logits/rejected": -18.219196319580078, "logps/chosen": -460.73492431640625, "logps/rejected": -336.546142578125, "loss": 0.9679, "rewards/accuracies": 
0.6000000238418579, "rewards/chosen": 3.9516937732696533, "rewards/margins": 0.8008419275283813, "rewards/rejected": 3.1508517265319824, "step": 18270 }, { "epoch": 0.8486930683875761, "grad_norm": 38.83229064941406, "learning_rate": 3.586440100902239e-07, "logits/chosen": -17.77920913696289, "logits/rejected": -17.28278160095215, "logps/chosen": -341.0329895019531, "logps/rejected": -258.63909912109375, "loss": 0.827, "rewards/accuracies": 0.5, "rewards/chosen": 2.1320013999938965, "rewards/margins": 0.013000989332795143, "rewards/rejected": 2.119000196456909, "step": 18280 }, { "epoch": 0.8491573424950091, "grad_norm": 112.02484893798828, "learning_rate": 3.585666310723184e-07, "logits/chosen": -18.827777862548828, "logits/rejected": -18.727746963500977, "logps/chosen": -339.6080322265625, "logps/rejected": -326.37518310546875, "loss": 1.0991, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.3025436401367188, "rewards/margins": 0.19670285284519196, "rewards/rejected": 3.1058406829833984, "step": 18290 }, { "epoch": 0.8496216166024421, "grad_norm": 323.8905944824219, "learning_rate": 3.584892520544129e-07, "logits/chosen": -18.851375579833984, "logits/rejected": -17.870098114013672, "logps/chosen": -418.242431640625, "logps/rejected": -289.16436767578125, "loss": 0.8912, "rewards/accuracies": 0.5, "rewards/chosen": 2.601362705230713, "rewards/margins": 0.2396969497203827, "rewards/rejected": 2.361665725708008, "step": 18300 }, { "epoch": 0.8500858907098751, "grad_norm": 96.81612396240234, "learning_rate": 3.584118730365074e-07, "logits/chosen": -18.424577713012695, "logits/rejected": -17.11713218688965, "logps/chosen": -344.2470397949219, "logps/rejected": -242.482666015625, "loss": 0.4819, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.286278009414673, "rewards/margins": 0.7944921255111694, "rewards/rejected": 1.4917857646942139, "step": 18310 }, { "epoch": 0.8505501648173082, "grad_norm": 15.814860343933105, "learning_rate": 
3.583344940186019e-07, "logits/chosen": -18.4273681640625, "logits/rejected": -17.977588653564453, "logps/chosen": -380.2666320800781, "logps/rejected": -321.60418701171875, "loss": 0.6931, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.630859851837158, "rewards/margins": 0.5482596158981323, "rewards/rejected": 2.0826003551483154, "step": 18320 }, { "epoch": 0.8510144389247412, "grad_norm": 250.8057098388672, "learning_rate": 3.5825711500069636e-07, "logits/chosen": -18.716304779052734, "logits/rejected": -18.03296661376953, "logps/chosen": -347.895263671875, "logps/rejected": -321.01177978515625, "loss": 1.2503, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.4904842376708984, "rewards/margins": 0.11478960514068604, "rewards/rejected": 2.375694751739502, "step": 18330 }, { "epoch": 0.8514787130321742, "grad_norm": 32.65241622924805, "learning_rate": 3.581797359827909e-07, "logits/chosen": -18.293996810913086, "logits/rejected": -17.60042381286621, "logps/chosen": -383.25701904296875, "logps/rejected": -273.94085693359375, "loss": 0.5618, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.133296012878418, "rewards/margins": 1.1890392303466797, "rewards/rejected": 1.9442565441131592, "step": 18340 }, { "epoch": 0.8519429871396073, "grad_norm": 3.1867148876190186, "learning_rate": 3.581023569648854e-07, "logits/chosen": -18.455867767333984, "logits/rejected": -17.46854591369629, "logps/chosen": -347.6240234375, "logps/rejected": -322.41082763671875, "loss": 0.828, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.9030160903930664, "rewards/margins": 0.8677738904953003, "rewards/rejected": 2.0352425575256348, "step": 18350 }, { "epoch": 0.8524072612470402, "grad_norm": 126.83486938476562, "learning_rate": 3.580249779469799e-07, "logits/chosen": -18.688934326171875, "logits/rejected": -18.618383407592773, "logps/chosen": -355.4498291015625, "logps/rejected": -367.454833984375, "loss": 0.5486, "rewards/accuracies": 
0.800000011920929, "rewards/chosen": 2.6378910541534424, "rewards/margins": 0.40988191962242126, "rewards/rejected": 2.22800874710083, "step": 18360 }, { "epoch": 0.8528715353544732, "grad_norm": 83.88363647460938, "learning_rate": 3.579475989290744e-07, "logits/chosen": -18.381669998168945, "logits/rejected": -17.19534683227539, "logps/chosen": -464.74444580078125, "logps/rejected": -264.45623779296875, "loss": 0.5798, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0148355960845947, "rewards/margins": 1.0118608474731445, "rewards/rejected": 2.00297474861145, "step": 18370 }, { "epoch": 0.8533358094619063, "grad_norm": 7.393838882446289, "learning_rate": 3.5787021991116887e-07, "logits/chosen": -18.590381622314453, "logits/rejected": -16.97776985168457, "logps/chosen": -495.1734313964844, "logps/rejected": -266.2760314941406, "loss": 0.3473, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4048526287078857, "rewards/margins": 1.4309436082839966, "rewards/rejected": 1.9739090204238892, "step": 18380 }, { "epoch": 0.8538000835693393, "grad_norm": 66.57952117919922, "learning_rate": 3.577928408932634e-07, "logits/chosen": -19.232919692993164, "logits/rejected": -17.996652603149414, "logps/chosen": -432.64617919921875, "logps/rejected": -340.9445495605469, "loss": 0.3696, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.6395645141601562, "rewards/margins": 1.4687745571136475, "rewards/rejected": 2.170790195465088, "step": 18390 }, { "epoch": 0.8542643576767723, "grad_norm": 32.9070930480957, "learning_rate": 3.5771546187535784e-07, "logits/chosen": -18.204565048217773, "logits/rejected": -18.237133026123047, "logps/chosen": -418.9187927246094, "logps/rejected": -377.8928527832031, "loss": 0.6861, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.0007123947143555, "rewards/margins": 1.2891054153442383, "rewards/rejected": 2.7116072177886963, "step": 18400 }, { "epoch": 0.8547286317842054, "grad_norm": 85.57149505615234, 
"learning_rate": 3.5763808285745235e-07, "logits/chosen": -19.26203727722168, "logits/rejected": -18.313201904296875, "logps/chosen": -337.33001708984375, "logps/rejected": -301.90325927734375, "loss": 0.6477, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.218653440475464, "rewards/margins": 0.9228823781013489, "rewards/rejected": 2.2957711219787598, "step": 18410 }, { "epoch": 0.8551929058916384, "grad_norm": 66.95576477050781, "learning_rate": 3.5756070383954686e-07, "logits/chosen": -18.743345260620117, "logits/rejected": -17.89276123046875, "logps/chosen": -473.810546875, "logps/rejected": -382.05706787109375, "loss": 0.4967, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.248897075653076, "rewards/margins": 1.006862998008728, "rewards/rejected": 2.2420341968536377, "step": 18420 }, { "epoch": 0.8556571799990714, "grad_norm": 26.85306167602539, "learning_rate": 3.574833248216413e-07, "logits/chosen": -18.573823928833008, "logits/rejected": -17.859251022338867, "logps/chosen": -353.1506042480469, "logps/rejected": -267.27081298828125, "loss": 0.6496, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2518868446350098, "rewards/margins": 0.6980351209640503, "rewards/rejected": 2.553851842880249, "step": 18430 }, { "epoch": 0.8561214541065045, "grad_norm": 45.49150848388672, "learning_rate": 3.5740594580373583e-07, "logits/chosen": -19.088584899902344, "logits/rejected": -18.422508239746094, "logps/chosen": -356.7771911621094, "logps/rejected": -258.52679443359375, "loss": 0.6949, "rewards/accuracies": 0.5, "rewards/chosen": 3.1311440467834473, "rewards/margins": 1.005953311920166, "rewards/rejected": 2.125190496444702, "step": 18440 }, { "epoch": 0.8565857282139375, "grad_norm": 5.427737712860107, "learning_rate": 3.5732856678583034e-07, "logits/chosen": -19.41468048095703, "logits/rejected": -18.315166473388672, "logps/chosen": -426.1897888183594, "logps/rejected": -310.49810791015625, "loss": 0.405, "rewards/accuracies": 
1.0, "rewards/chosen": 4.05805778503418, "rewards/margins": 1.08757746219635, "rewards/rejected": 2.970480442047119, "step": 18450 }, { "epoch": 0.8570500023213705, "grad_norm": 33.74222946166992, "learning_rate": 3.5725118776792485e-07, "logits/chosen": -19.17758560180664, "logits/rejected": -18.055217742919922, "logps/chosen": -427.26806640625, "logps/rejected": -338.0899353027344, "loss": 0.4813, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.411710262298584, "rewards/margins": 1.4947303533554077, "rewards/rejected": 1.9169803857803345, "step": 18460 }, { "epoch": 0.8575142764288035, "grad_norm": 36.99091720581055, "learning_rate": 3.5717380875001936e-07, "logits/chosen": -20.076404571533203, "logits/rejected": -18.626707077026367, "logps/chosen": -497.29437255859375, "logps/rejected": -278.7588806152344, "loss": 0.473, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.5386810302734375, "rewards/margins": 1.3770067691802979, "rewards/rejected": 2.1616742610931396, "step": 18470 }, { "epoch": 0.8579785505362366, "grad_norm": 13.155009269714355, "learning_rate": 3.570964297321138e-07, "logits/chosen": -17.89934539794922, "logits/rejected": -18.006210327148438, "logps/chosen": -472.88818359375, "logps/rejected": -464.0381774902344, "loss": 0.8344, "rewards/accuracies": 0.5, "rewards/chosen": 3.604220151901245, "rewards/margins": 0.12903502583503723, "rewards/rejected": 3.475184917449951, "step": 18480 }, { "epoch": 0.8584428246436696, "grad_norm": 71.49159240722656, "learning_rate": 3.570190507142083e-07, "logits/chosen": -18.317358016967773, "logits/rejected": -17.486814498901367, "logps/chosen": -420.420654296875, "logps/rejected": -309.6671142578125, "loss": 0.3955, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.2736282348632812, "rewards/margins": 1.033130407333374, "rewards/rejected": 2.2404980659484863, "step": 18490 }, { "epoch": 0.8589070987511026, "grad_norm": 119.96405792236328, "learning_rate": 
3.569416716963028e-07, "logits/chosen": -18.765153884887695, "logits/rejected": -18.05906867980957, "logps/chosen": -406.527587890625, "logps/rejected": -274.62200927734375, "loss": 0.5346, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.6166961193084717, "rewards/margins": 0.7949265837669373, "rewards/rejected": 1.8217693567276, "step": 18500 }, { "epoch": 0.8593713728585357, "grad_norm": 113.26000213623047, "learning_rate": 3.568642926783973e-07, "logits/chosen": -19.020496368408203, "logits/rejected": -18.37020492553711, "logps/chosen": -531.2943725585938, "logps/rejected": -469.4404296875, "loss": 0.72, "rewards/accuracies": 0.5, "rewards/chosen": 3.779097080230713, "rewards/margins": 0.12467575073242188, "rewards/rejected": 3.65442156791687, "step": 18510 }, { "epoch": 0.8598356469659687, "grad_norm": 56.116249084472656, "learning_rate": 3.567869136604918e-07, "logits/chosen": -19.028533935546875, "logits/rejected": -18.44866943359375, "logps/chosen": -315.1605529785156, "logps/rejected": -295.9787902832031, "loss": 0.4521, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.384856700897217, "rewards/margins": 0.9812297821044922, "rewards/rejected": 1.4036271572113037, "step": 18520 }, { "epoch": 0.8602999210734017, "grad_norm": 134.5218048095703, "learning_rate": 3.5670953464258627e-07, "logits/chosen": -18.59109115600586, "logits/rejected": -18.504552841186523, "logps/chosen": -363.81744384765625, "logps/rejected": -342.5052185058594, "loss": 0.7458, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6572532653808594, "rewards/margins": 0.8752700686454773, "rewards/rejected": 2.7819833755493164, "step": 18530 }, { "epoch": 0.8607641951808348, "grad_norm": 101.56111145019531, "learning_rate": 3.566321556246808e-07, "logits/chosen": -18.59584617614746, "logits/rejected": -17.40017318725586, "logps/chosen": -455.1404724121094, "logps/rejected": -332.38299560546875, "loss": 0.5612, "rewards/accuracies": 0.800000011920929, 
"rewards/chosen": 4.431147575378418, "rewards/margins": 1.7903547286987305, "rewards/rejected": 2.6407923698425293, "step": 18540 }, { "epoch": 0.8612284692882678, "grad_norm": 31.637895584106445, "learning_rate": 3.565547766067753e-07, "logits/chosen": -18.715656280517578, "logits/rejected": -17.982038497924805, "logps/chosen": -352.89752197265625, "logps/rejected": -284.83673095703125, "loss": 1.1677, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.90433931350708, "rewards/margins": 0.25072556734085083, "rewards/rejected": 2.653613567352295, "step": 18550 }, { "epoch": 0.8616927433957008, "grad_norm": 148.96661376953125, "learning_rate": 3.564773975888698e-07, "logits/chosen": -19.736143112182617, "logits/rejected": -19.16999626159668, "logps/chosen": -345.6253356933594, "logps/rejected": -291.59112548828125, "loss": 0.599, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.4630684852600098, "rewards/margins": 1.0758450031280518, "rewards/rejected": 2.387223720550537, "step": 18560 }, { "epoch": 0.8621570175031339, "grad_norm": 28.456283569335938, "learning_rate": 3.564000185709643e-07, "logits/chosen": -19.229751586914062, "logits/rejected": -18.948848724365234, "logps/chosen": -346.58905029296875, "logps/rejected": -338.06634521484375, "loss": 0.7628, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.5249485969543457, "rewards/margins": 0.382049024105072, "rewards/rejected": 2.14289927482605, "step": 18570 }, { "epoch": 0.8626212916105669, "grad_norm": 174.68626403808594, "learning_rate": 3.563226395530588e-07, "logits/chosen": -18.301897048950195, "logits/rejected": -18.072050094604492, "logps/chosen": -469.25140380859375, "logps/rejected": -413.9097595214844, "loss": 0.9236, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.2609095573425293, "rewards/margins": 0.0914156436920166, "rewards/rejected": 3.169494152069092, "step": 18580 }, { "epoch": 0.8630855657179999, "grad_norm": 143.56687927246094, 
"learning_rate": 3.5624526053515323e-07, "logits/chosen": -19.342548370361328, "logits/rejected": -18.702890396118164, "logps/chosen": -387.506103515625, "logps/rejected": -321.5992431640625, "loss": 0.754, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.2004828453063965, "rewards/margins": 0.4187353253364563, "rewards/rejected": 2.781747817993164, "step": 18590 }, { "epoch": 0.863549839825433, "grad_norm": 14.645325660705566, "learning_rate": 3.5616788151724775e-07, "logits/chosen": -18.680164337158203, "logits/rejected": -18.331579208374023, "logps/chosen": -383.8744201660156, "logps/rejected": -280.2110595703125, "loss": 0.4052, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5804367065429688, "rewards/margins": 1.559080958366394, "rewards/rejected": 2.021355628967285, "step": 18600 }, { "epoch": 0.864014113932866, "grad_norm": 153.11392211914062, "learning_rate": 3.5609050249934226e-07, "logits/chosen": -17.718677520751953, "logits/rejected": -17.92925453186035, "logps/chosen": -485.3482971191406, "logps/rejected": -450.5420837402344, "loss": 1.3647, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 3.2233099937438965, "rewards/margins": -0.7107759118080139, "rewards/rejected": 3.934086322784424, "step": 18610 }, { "epoch": 0.864478388040299, "grad_norm": 21.041873931884766, "learning_rate": 3.5601312348143677e-07, "logits/chosen": -18.7197265625, "logits/rejected": -18.0635986328125, "logps/chosen": -386.4911804199219, "logps/rejected": -376.3625793457031, "loss": 1.3424, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.15159010887146, "rewards/margins": -0.3044961392879486, "rewards/rejected": 3.4560859203338623, "step": 18620 }, { "epoch": 0.864942662147732, "grad_norm": 50.42795944213867, "learning_rate": 3.559357444635312e-07, "logits/chosen": -18.88884735107422, "logits/rejected": -17.423357009887695, "logps/chosen": -409.3621520996094, "logps/rejected": -342.7311706542969, "loss": 0.4762, 
"rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.8603031635284424, "rewards/margins": 0.8594173192977905, "rewards/rejected": 2.0008857250213623, "step": 18630 }, { "epoch": 0.8654069362551651, "grad_norm": 208.34632873535156, "learning_rate": 3.5585836544562574e-07, "logits/chosen": -19.36800765991211, "logits/rejected": -18.749727249145508, "logps/chosen": -433.7442932128906, "logps/rejected": -484.926513671875, "loss": 1.3435, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.967562675476074, "rewards/margins": -0.35863012075424194, "rewards/rejected": 3.326192855834961, "step": 18640 }, { "epoch": 0.8658712103625981, "grad_norm": 51.25766372680664, "learning_rate": 3.5578098642772025e-07, "logits/chosen": -18.59698486328125, "logits/rejected": -17.57265281677246, "logps/chosen": -408.5579528808594, "logps/rejected": -333.6983947753906, "loss": 0.5224, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.6314873695373535, "rewards/margins": 0.8560803532600403, "rewards/rejected": 1.775406837463379, "step": 18650 }, { "epoch": 0.8663354844700311, "grad_norm": 150.86912536621094, "learning_rate": 3.5570360740981476e-07, "logits/chosen": -19.129451751708984, "logits/rejected": -18.526020050048828, "logps/chosen": -294.69793701171875, "logps/rejected": -246.5978546142578, "loss": 0.9047, "rewards/accuracies": 0.5, "rewards/chosen": 2.543445348739624, "rewards/margins": 0.3193199038505554, "rewards/rejected": 2.224125385284424, "step": 18660 }, { "epoch": 0.8667997585774642, "grad_norm": 6.830921173095703, "learning_rate": 3.5562622839190927e-07, "logits/chosen": -19.059843063354492, "logits/rejected": -17.599422454833984, "logps/chosen": -398.33538818359375, "logps/rejected": -197.7892303466797, "loss": 0.5467, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.752352237701416, "rewards/margins": 1.1781728267669678, "rewards/rejected": 1.5741791725158691, "step": 18670 }, { "epoch": 0.8672640326848972, "grad_norm": 
62.52382278442383, "learning_rate": 3.555488493740038e-07, "logits/chosen": -18.76657485961914, "logits/rejected": -18.48965072631836, "logps/chosen": -406.9223937988281, "logps/rejected": -365.89044189453125, "loss": 0.9165, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.524819850921631, "rewards/margins": 0.7240906953811646, "rewards/rejected": 2.8007290363311768, "step": 18680 }, { "epoch": 0.8677283067923302, "grad_norm": 82.654296875, "learning_rate": 3.554714703560982e-07, "logits/chosen": -18.525114059448242, "logits/rejected": -18.055828094482422, "logps/chosen": -398.15142822265625, "logps/rejected": -388.7929992675781, "loss": 1.101, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.2400307655334473, "rewards/margins": -0.16932377219200134, "rewards/rejected": 2.4093544483184814, "step": 18690 }, { "epoch": 0.8681925808997633, "grad_norm": 79.55963897705078, "learning_rate": 3.553940913381927e-07, "logits/chosen": -19.488018035888672, "logits/rejected": -19.108211517333984, "logps/chosen": -344.09857177734375, "logps/rejected": -275.2520751953125, "loss": 0.8119, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.127157688140869, "rewards/margins": 0.3105069100856781, "rewards/rejected": 2.8166511058807373, "step": 18700 }, { "epoch": 0.8686568550071962, "grad_norm": 7.567681789398193, "learning_rate": 3.553167123202872e-07, "logits/chosen": -19.72068977355957, "logits/rejected": -18.887996673583984, "logps/chosen": -427.23382568359375, "logps/rejected": -177.15138244628906, "loss": 0.378, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.2995314598083496, "rewards/margins": 2.068747043609619, "rewards/rejected": 1.2307844161987305, "step": 18710 }, { "epoch": 0.8691211291146292, "grad_norm": 60.22667694091797, "learning_rate": 3.552393333023817e-07, "logits/chosen": -19.039525985717773, "logits/rejected": -17.650638580322266, "logps/chosen": -437.945068359375, "logps/rejected": -303.0200500488281, "loss": 
0.5251, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.8836448192596436, "rewards/margins": 0.7863650321960449, "rewards/rejected": 2.0972800254821777, "step": 18720 }, { "epoch": 0.8695854032220623, "grad_norm": 29.30729866027832, "learning_rate": 3.551619542844762e-07, "logits/chosen": -19.00127410888672, "logits/rejected": -18.3876953125, "logps/chosen": -292.3443603515625, "logps/rejected": -232.7184295654297, "loss": 0.6647, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.2786927223205566, "rewards/margins": 0.8309839367866516, "rewards/rejected": 2.4477086067199707, "step": 18730 }, { "epoch": 0.8700496773294953, "grad_norm": 32.2157096862793, "learning_rate": 3.550845752665707e-07, "logits/chosen": -19.797958374023438, "logits/rejected": -18.512630462646484, "logps/chosen": -392.4413146972656, "logps/rejected": -281.63470458984375, "loss": 0.4395, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.221949815750122, "rewards/margins": 1.0680902004241943, "rewards/rejected": 2.1538591384887695, "step": 18740 }, { "epoch": 0.8705139514369283, "grad_norm": 1.5988930463790894, "learning_rate": 3.550071962486652e-07, "logits/chosen": -20.145414352416992, "logits/rejected": -18.648265838623047, "logps/chosen": -398.4045104980469, "logps/rejected": -322.29998779296875, "loss": 0.5015, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4076600074768066, "rewards/margins": 1.2130842208862305, "rewards/rejected": 2.194575786590576, "step": 18750 }, { "epoch": 0.8709782255443614, "grad_norm": 74.93816375732422, "learning_rate": 3.549298172307597e-07, "logits/chosen": -18.455364227294922, "logits/rejected": -17.52908706665039, "logps/chosen": -483.3751525878906, "logps/rejected": -390.034912109375, "loss": 0.6338, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.371255874633789, "rewards/margins": 0.6817746758460999, "rewards/rejected": 2.689481258392334, "step": 18760 }, { "epoch": 0.8714424996517944, 
"grad_norm": 213.416748046875, "learning_rate": 3.548524382128542e-07, "logits/chosen": -18.41066551208496, "logits/rejected": -18.097902297973633, "logps/chosen": -480.9501037597656, "logps/rejected": -441.34820556640625, "loss": 1.0099, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.5703213214874268, "rewards/margins": -0.029303550720214844, "rewards/rejected": 3.5996251106262207, "step": 18770 }, { "epoch": 0.8719067737592274, "grad_norm": 2.506946563720703, "learning_rate": 3.5477505919494863e-07, "logits/chosen": -18.809871673583984, "logits/rejected": -18.503833770751953, "logps/chosen": -325.04486083984375, "logps/rejected": -260.01397705078125, "loss": 0.8529, "rewards/accuracies": 0.5, "rewards/chosen": 2.69006085395813, "rewards/margins": 0.5670675039291382, "rewards/rejected": 2.1229934692382812, "step": 18780 }, { "epoch": 0.8723710478666605, "grad_norm": 5.586010932922363, "learning_rate": 3.5469768017704314e-07, "logits/chosen": -18.83242416381836, "logits/rejected": -18.198652267456055, "logps/chosen": -319.57476806640625, "logps/rejected": -274.17620849609375, "loss": 0.7796, "rewards/accuracies": 0.5, "rewards/chosen": 2.329360246658325, "rewards/margins": 0.5744155645370483, "rewards/rejected": 1.7549445629119873, "step": 18790 }, { "epoch": 0.8728353219740935, "grad_norm": 40.642417907714844, "learning_rate": 3.5462030115913765e-07, "logits/chosen": -18.126449584960938, "logits/rejected": -17.617141723632812, "logps/chosen": -324.69134521484375, "logps/rejected": -243.46484375, "loss": 0.689, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.8345553874969482, "rewards/margins": 0.7139149904251099, "rewards/rejected": 2.120640516281128, "step": 18800 }, { "epoch": 0.8732995960815265, "grad_norm": 7.9601311683654785, "learning_rate": 3.5454292214123216e-07, "logits/chosen": -19.594905853271484, "logits/rejected": -18.633895874023438, "logps/chosen": -359.8061828613281, "logps/rejected": -296.44610595703125, "loss": 0.6739, 
"rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2836833000183105, "rewards/margins": 0.952174186706543, "rewards/rejected": 2.3315091133117676, "step": 18810 }, { "epoch": 0.8737638701889595, "grad_norm": 0.2872007191181183, "learning_rate": 3.544655431233267e-07, "logits/chosen": -18.681495666503906, "logits/rejected": -17.75798225402832, "logps/chosen": -450.2552795410156, "logps/rejected": -361.55108642578125, "loss": 0.372, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.513566255569458, "rewards/margins": 1.7997080087661743, "rewards/rejected": 1.7138583660125732, "step": 18820 }, { "epoch": 0.8742281442963926, "grad_norm": 105.9306640625, "learning_rate": 3.543881641054212e-07, "logits/chosen": -17.94902229309082, "logits/rejected": -18.020183563232422, "logps/chosen": -290.18597412109375, "logps/rejected": -318.2257995605469, "loss": 0.8994, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.318486452102661, "rewards/margins": 0.003194451332092285, "rewards/rejected": 2.3152918815612793, "step": 18830 }, { "epoch": 0.8746924184038256, "grad_norm": 2.8961102962493896, "learning_rate": 3.5431078508751565e-07, "logits/chosen": -19.370361328125, "logits/rejected": -17.97835922241211, "logps/chosen": -374.2213439941406, "logps/rejected": -203.05674743652344, "loss": 0.5644, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.9668872356414795, "rewards/margins": 1.6873966455459595, "rewards/rejected": 2.2794899940490723, "step": 18840 }, { "epoch": 0.8751566925112586, "grad_norm": 170.77609252929688, "learning_rate": 3.5423340606961016e-07, "logits/chosen": -19.15239906311035, "logits/rejected": -17.908451080322266, "logps/chosen": -430.46783447265625, "logps/rejected": -280.29949951171875, "loss": 0.4727, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4814789295196533, "rewards/margins": 1.4162794351577759, "rewards/rejected": 2.065199613571167, "step": 18850 }, { "epoch": 0.8756209666186917, 
"grad_norm": 30.17833137512207, "learning_rate": 3.5415602705170467e-07, "logits/chosen": -19.489301681518555, "logits/rejected": -18.498821258544922, "logps/chosen": -445.16827392578125, "logps/rejected": -270.78912353515625, "loss": 0.3042, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.521181583404541, "rewards/margins": 2.1088390350341797, "rewards/rejected": 1.4123423099517822, "step": 18860 }, { "epoch": 0.8760852407261247, "grad_norm": 116.5848388671875, "learning_rate": 3.540786480337992e-07, "logits/chosen": -19.089427947998047, "logits/rejected": -18.528539657592773, "logps/chosen": -352.745361328125, "logps/rejected": -296.03460693359375, "loss": 0.9187, "rewards/accuracies": 0.5, "rewards/chosen": 2.4609222412109375, "rewards/margins": 0.027959752827882767, "rewards/rejected": 2.4329628944396973, "step": 18870 }, { "epoch": 0.8765495148335577, "grad_norm": 103.00807189941406, "learning_rate": 3.540012690158936e-07, "logits/chosen": -18.512943267822266, "logits/rejected": -18.101892471313477, "logps/chosen": -333.75537109375, "logps/rejected": -299.2143859863281, "loss": 1.0169, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.182827949523926, "rewards/margins": -0.18775229156017303, "rewards/rejected": 2.3705801963806152, "step": 18880 }, { "epoch": 0.8770137889409908, "grad_norm": 118.81668853759766, "learning_rate": 3.539238899979881e-07, "logits/chosen": -18.799409866333008, "logits/rejected": -18.955867767333984, "logps/chosen": -331.6712341308594, "logps/rejected": -378.8673400878906, "loss": 1.1081, "rewards/accuracies": 0.5, "rewards/chosen": 2.332430839538574, "rewards/margins": -0.06636856496334076, "rewards/rejected": 2.398799419403076, "step": 18890 }, { "epoch": 0.8774780630484238, "grad_norm": 23.663774490356445, "learning_rate": 3.538465109800826e-07, "logits/chosen": -19.25206184387207, "logits/rejected": -17.523963928222656, "logps/chosen": -452.2362365722656, "logps/rejected": -301.05657958984375, "loss": 
0.3862, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.310934543609619, "rewards/margins": 1.491856336593628, "rewards/rejected": 1.8190784454345703, "step": 18900 }, { "epoch": 0.8779423371558568, "grad_norm": 84.8877182006836, "learning_rate": 3.537691319621771e-07, "logits/chosen": -19.170700073242188, "logits/rejected": -18.55603790283203, "logps/chosen": -458.63909912109375, "logps/rejected": -349.6459045410156, "loss": 0.581, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.790038824081421, "rewards/margins": 0.4717998504638672, "rewards/rejected": 2.3182389736175537, "step": 18910 }, { "epoch": 0.8784066112632899, "grad_norm": 21.81856346130371, "learning_rate": 3.5369175294427163e-07, "logits/chosen": -18.979724884033203, "logits/rejected": -18.032384872436523, "logps/chosen": -383.1526794433594, "logps/rejected": -310.4757995605469, "loss": 0.6219, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.935603618621826, "rewards/margins": 0.9260671734809875, "rewards/rejected": 2.0095367431640625, "step": 18920 }, { "epoch": 0.8788708853707229, "grad_norm": 191.4679412841797, "learning_rate": 3.5361437392636614e-07, "logits/chosen": -18.880163192749023, "logits/rejected": -17.983509063720703, "logps/chosen": -339.0188903808594, "logps/rejected": -377.6166687011719, "loss": 1.0916, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.8016269207000732, "rewards/margins": 0.3650604486465454, "rewards/rejected": 2.4365668296813965, "step": 18930 }, { "epoch": 0.8793351594781559, "grad_norm": 44.72614669799805, "learning_rate": 3.535369949084606e-07, "logits/chosen": -19.38890838623047, "logits/rejected": -18.943317413330078, "logps/chosen": -415.92236328125, "logps/rejected": -342.6141052246094, "loss": 0.5426, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.8348495960235596, "rewards/margins": 1.166830062866211, "rewards/rejected": 2.6680195331573486, "step": 18940 }, { "epoch": 0.879799433585589, 
"grad_norm": 26.15018653869629, "learning_rate": 3.534596158905551e-07, "logits/chosen": -19.565082550048828, "logits/rejected": -18.999866485595703, "logps/chosen": -468.418701171875, "logps/rejected": -339.1783142089844, "loss": 0.3591, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.276895523071289, "rewards/margins": 1.5060603618621826, "rewards/rejected": 2.7708351612091064, "step": 18950 }, { "epoch": 0.880263707693022, "grad_norm": 17.03668975830078, "learning_rate": 3.533822368726496e-07, "logits/chosen": -18.53483009338379, "logits/rejected": -17.452648162841797, "logps/chosen": -469.43560791015625, "logps/rejected": -296.40509033203125, "loss": 0.5287, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6497714519500732, "rewards/margins": 0.935948371887207, "rewards/rejected": 2.7138235569000244, "step": 18960 }, { "epoch": 0.880727981800455, "grad_norm": 82.99005889892578, "learning_rate": 3.533048578547441e-07, "logits/chosen": -18.865530014038086, "logits/rejected": -18.335702896118164, "logps/chosen": -402.0840759277344, "logps/rejected": -363.3589782714844, "loss": 0.7265, "rewards/accuracies": 0.5, "rewards/chosen": 3.3316540718078613, "rewards/margins": 0.42735886573791504, "rewards/rejected": 2.9042952060699463, "step": 18970 }, { "epoch": 0.881192255907888, "grad_norm": 180.09002685546875, "learning_rate": 3.532274788368386e-07, "logits/chosen": -19.050731658935547, "logits/rejected": -18.856910705566406, "logps/chosen": -431.99676513671875, "logps/rejected": -419.31317138671875, "loss": 0.7675, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.9140141010284424, "rewards/margins": 0.28476017713546753, "rewards/rejected": 3.62925386428833, "step": 18980 }, { "epoch": 0.8816565300153211, "grad_norm": 32.00477600097656, "learning_rate": 3.5315009981893305e-07, "logits/chosen": -19.85135841369629, "logits/rejected": -18.58592987060547, "logps/chosen": -457.20123291015625, "logps/rejected": -326.82342529296875, 
"loss": 0.6013, "rewards/accuracies": 0.5, "rewards/chosen": 3.726956844329834, "rewards/margins": 1.478301763534546, "rewards/rejected": 2.248654842376709, "step": 18990 }, { "epoch": 0.8821208041227541, "grad_norm": 162.1902618408203, "learning_rate": 3.5307272080102756e-07, "logits/chosen": -19.54367446899414, "logits/rejected": -20.067501068115234, "logps/chosen": -388.17333984375, "logps/rejected": -398.8707580566406, "loss": 0.8738, "rewards/accuracies": 0.5, "rewards/chosen": 3.200143814086914, "rewards/margins": -0.19109871983528137, "rewards/rejected": 3.391242504119873, "step": 19000 }, { "epoch": 0.8825850782301871, "grad_norm": 52.36098098754883, "learning_rate": 3.5299534178312207e-07, "logits/chosen": -18.862581253051758, "logits/rejected": -19.563020706176758, "logps/chosen": -393.41705322265625, "logps/rejected": -374.9225158691406, "loss": 0.6632, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.426924467086792, "rewards/margins": 0.4689825177192688, "rewards/rejected": 2.957941770553589, "step": 19010 }, { "epoch": 0.8830493523376202, "grad_norm": 101.11170959472656, "learning_rate": 3.529179627652166e-07, "logits/chosen": -19.218477249145508, "logits/rejected": -18.38607406616211, "logps/chosen": -420.918701171875, "logps/rejected": -342.6319580078125, "loss": 0.6255, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.8427352905273438, "rewards/margins": 1.1160093545913696, "rewards/rejected": 2.7267260551452637, "step": 19020 }, { "epoch": 0.8835136264450532, "grad_norm": 37.929351806640625, "learning_rate": 3.528405837473111e-07, "logits/chosen": -18.84162712097168, "logits/rejected": -18.56008529663086, "logps/chosen": -339.16204833984375, "logps/rejected": -329.1005859375, "loss": 0.8823, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.063910484313965, "rewards/margins": 0.01297912560403347, "rewards/rejected": 3.050931215286255, "step": 19030 }, { "epoch": 0.8839779005524862, "grad_norm": 
104.7757568359375, "learning_rate": 3.5276320472940555e-07, "logits/chosen": -18.804737091064453, "logits/rejected": -18.365814208984375, "logps/chosen": -309.759033203125, "logps/rejected": -286.19921875, "loss": 0.5878, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.528844118118286, "rewards/margins": 0.6508417725563049, "rewards/rejected": 1.8780025243759155, "step": 19040 }, { "epoch": 0.8844421746599193, "grad_norm": 18.997766494750977, "learning_rate": 3.5268582571150007e-07, "logits/chosen": -19.653499603271484, "logits/rejected": -18.466848373413086, "logps/chosen": -442.36138916015625, "logps/rejected": -303.1448669433594, "loss": 0.5243, "rewards/accuracies": 0.5, "rewards/chosen": 4.424205303192139, "rewards/margins": 1.085715889930725, "rewards/rejected": 3.338489532470703, "step": 19050 }, { "epoch": 0.8849064487673522, "grad_norm": 52.18072509765625, "learning_rate": 3.526084466935946e-07, "logits/chosen": -19.19731330871582, "logits/rejected": -17.484529495239258, "logps/chosen": -459.63922119140625, "logps/rejected": -270.9580383300781, "loss": 0.3757, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.5577659606933594, "rewards/margins": 1.9286565780639648, "rewards/rejected": 1.6291097402572632, "step": 19060 }, { "epoch": 0.8853707228747852, "grad_norm": 11.22876262664795, "learning_rate": 3.5253106767568903e-07, "logits/chosen": -17.94411277770996, "logits/rejected": -17.782197952270508, "logps/chosen": -315.64447021484375, "logps/rejected": -333.51715087890625, "loss": 1.1181, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.4413199424743652, "rewards/margins": -0.2622644305229187, "rewards/rejected": 2.7035841941833496, "step": 19070 }, { "epoch": 0.8858349969822183, "grad_norm": 49.3265380859375, "learning_rate": 3.524614265595741e-07, "logits/chosen": -18.75570297241211, "logits/rejected": -18.34164047241211, "logps/chosen": -486.9052734375, "logps/rejected": -398.07440185546875, "loss": 1.0308, 
"rewards/accuracies": 0.5, "rewards/chosen": 3.5454342365264893, "rewards/margins": 0.4501457214355469, "rewards/rejected": 3.0952885150909424, "step": 19080 }, { "epoch": 0.8862992710896513, "grad_norm": 85.04219055175781, "learning_rate": 3.5238404754166857e-07, "logits/chosen": -18.54654312133789, "logits/rejected": -17.75581932067871, "logps/chosen": -561.1524658203125, "logps/rejected": -435.8094787597656, "loss": 0.4293, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.827407121658325, "rewards/margins": 1.300217866897583, "rewards/rejected": 2.527189254760742, "step": 19090 }, { "epoch": 0.8867635451970843, "grad_norm": 200.66636657714844, "learning_rate": 3.523066685237631e-07, "logits/chosen": -18.77696418762207, "logits/rejected": -18.189359664916992, "logps/chosen": -471.42327880859375, "logps/rejected": -383.07855224609375, "loss": 0.5663, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.053577423095703, "rewards/margins": 0.722568690776825, "rewards/rejected": 3.3310084342956543, "step": 19100 }, { "epoch": 0.8872278193045174, "grad_norm": 28.49555206298828, "learning_rate": 3.5222928950585754e-07, "logits/chosen": -17.820270538330078, "logits/rejected": -18.49226951599121, "logps/chosen": -236.9005889892578, "logps/rejected": -287.37310791015625, "loss": 0.855, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.5054073333740234, "rewards/margins": 0.5753997564315796, "rewards/rejected": 1.9300073385238647, "step": 19110 }, { "epoch": 0.8876920934119504, "grad_norm": 171.7345428466797, "learning_rate": 3.5215191048795205e-07, "logits/chosen": -18.762601852416992, "logits/rejected": -18.581867218017578, "logps/chosen": -451.75640869140625, "logps/rejected": -365.83984375, "loss": 0.7787, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.4943110942840576, "rewards/margins": 0.4255698323249817, "rewards/rejected": 3.0687410831451416, "step": 19120 }, { "epoch": 0.8881563675193834, "grad_norm": 
145.93284606933594, "learning_rate": 3.5207453147004656e-07, "logits/chosen": -18.31601905822754, "logits/rejected": -17.905502319335938, "logps/chosen": -397.1242370605469, "logps/rejected": -306.08160400390625, "loss": 0.8056, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.100491523742676, "rewards/margins": 1.0687106847763062, "rewards/rejected": 3.03178071975708, "step": 19130 }, { "epoch": 0.8886206416268164, "grad_norm": 59.170894622802734, "learning_rate": 3.5199715245214107e-07, "logits/chosen": -20.321428298950195, "logits/rejected": -19.465402603149414, "logps/chosen": -433.52117919921875, "logps/rejected": -349.84539794921875, "loss": 0.6968, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.3562862873077393, "rewards/margins": 0.29838526248931885, "rewards/rejected": 3.057900905609131, "step": 19140 }, { "epoch": 0.8890849157342495, "grad_norm": 98.15734100341797, "learning_rate": 3.519197734342356e-07, "logits/chosen": -19.016952514648438, "logits/rejected": -18.483572006225586, "logps/chosen": -372.5700378417969, "logps/rejected": -345.4309997558594, "loss": 0.583, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4013543128967285, "rewards/margins": 0.5885177850723267, "rewards/rejected": 2.812836170196533, "step": 19150 }, { "epoch": 0.8895491898416825, "grad_norm": 97.60198211669922, "learning_rate": 3.5184239441633004e-07, "logits/chosen": -18.969812393188477, "logits/rejected": -18.374408721923828, "logps/chosen": -343.55364990234375, "logps/rejected": -269.6422424316406, "loss": 0.5563, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.662696123123169, "rewards/margins": 0.6331900954246521, "rewards/rejected": 2.029505968093872, "step": 19160 }, { "epoch": 0.8900134639491155, "grad_norm": 193.24847412109375, "learning_rate": 3.5176501539842455e-07, "logits/chosen": -17.9390926361084, "logits/rejected": -18.206008911132812, "logps/chosen": -352.39361572265625, "logps/rejected": -371.62042236328125, 
"loss": 1.0828, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.7528178691864014, "rewards/margins": 0.09445442259311676, "rewards/rejected": 2.6583638191223145, "step": 19170 }, { "epoch": 0.8904777380565486, "grad_norm": 0.5558543801307678, "learning_rate": 3.51687636380519e-07, "logits/chosen": -18.647098541259766, "logits/rejected": -17.888853073120117, "logps/chosen": -429.77862548828125, "logps/rejected": -298.86102294921875, "loss": 0.6901, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.203376293182373, "rewards/margins": 0.8466771841049194, "rewards/rejected": 2.356699228286743, "step": 19180 }, { "epoch": 0.8909420121639816, "grad_norm": 95.00314331054688, "learning_rate": 3.516102573626135e-07, "logits/chosen": -19.51007843017578, "logits/rejected": -19.120338439941406, "logps/chosen": -414.60821533203125, "logps/rejected": -412.1620178222656, "loss": 0.7165, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.6428349018096924, "rewards/margins": 0.12960028648376465, "rewards/rejected": 2.5132346153259277, "step": 19190 }, { "epoch": 0.8914062862714146, "grad_norm": 12.688246726989746, "learning_rate": 3.5153287834470803e-07, "logits/chosen": -18.921377182006836, "logits/rejected": -19.224987030029297, "logps/chosen": -327.87591552734375, "logps/rejected": -295.05999755859375, "loss": 0.8576, "rewards/accuracies": 0.5, "rewards/chosen": 3.0974724292755127, "rewards/margins": 0.5898916125297546, "rewards/rejected": 2.5075812339782715, "step": 19200 }, { "epoch": 0.8918705603788477, "grad_norm": 222.738037109375, "learning_rate": 3.514554993268025e-07, "logits/chosen": -19.35207748413086, "logits/rejected": -18.696365356445312, "logps/chosen": -408.6733093261719, "logps/rejected": -329.41217041015625, "loss": 0.6907, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.393474578857422, "rewards/margins": 0.6014610528945923, "rewards/rejected": 2.792013645172119, "step": 19210 }, { "epoch": 0.8923348344862807, 
"grad_norm": 21.88086700439453, "learning_rate": 3.51378120308897e-07, "logits/chosen": -19.2737979888916, "logits/rejected": -17.740217208862305, "logps/chosen": -321.5188903808594, "logps/rejected": -184.23855590820312, "loss": 0.3451, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.7831695079803467, "rewards/margins": 1.5177631378173828, "rewards/rejected": 1.2654063701629639, "step": 19220 }, { "epoch": 0.8927991085937137, "grad_norm": 8.465386390686035, "learning_rate": 3.513007412909915e-07, "logits/chosen": -18.57712173461914, "logits/rejected": -17.86532211303711, "logps/chosen": -317.73248291015625, "logps/rejected": -319.33197021484375, "loss": 0.5755, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0341403484344482, "rewards/margins": 1.2198539972305298, "rewards/rejected": 1.814286470413208, "step": 19230 }, { "epoch": 0.8932633827011468, "grad_norm": 46.04792404174805, "learning_rate": 3.5122336227308603e-07, "logits/chosen": -17.915990829467773, "logits/rejected": -17.45098304748535, "logps/chosen": -382.12445068359375, "logps/rejected": -344.0032043457031, "loss": 0.888, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.863983392715454, "rewards/margins": 0.6591619253158569, "rewards/rejected": 2.2048215866088867, "step": 19240 }, { "epoch": 0.8937276568085798, "grad_norm": 223.01925659179688, "learning_rate": 3.5114598325518054e-07, "logits/chosen": -19.288753509521484, "logits/rejected": -18.491500854492188, "logps/chosen": -459.6092224121094, "logps/rejected": -413.5123596191406, "loss": 0.5026, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4975624084472656, "rewards/margins": 0.9337049722671509, "rewards/rejected": 2.5638577938079834, "step": 19250 }, { "epoch": 0.8941919309160128, "grad_norm": 130.1693572998047, "learning_rate": 3.5106860423727505e-07, "logits/chosen": -19.209447860717773, "logits/rejected": -18.581546783447266, "logps/chosen": -515.83642578125, "logps/rejected": 
-356.7171936035156, "loss": 0.4884, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.3419928550720215, "rewards/margins": 0.6611350774765015, "rewards/rejected": 2.6808581352233887, "step": 19260 }, { "epoch": 0.8946562050234459, "grad_norm": 15.845084190368652, "learning_rate": 3.509912252193695e-07, "logits/chosen": -18.40558433532715, "logits/rejected": -18.009456634521484, "logps/chosen": -463.05609130859375, "logps/rejected": -391.0887451171875, "loss": 0.5893, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6235032081604004, "rewards/margins": 0.9394334554672241, "rewards/rejected": 2.6840696334838867, "step": 19270 }, { "epoch": 0.8951204791308789, "grad_norm": 61.48419952392578, "learning_rate": 3.5091384620146397e-07, "logits/chosen": -18.84433937072754, "logits/rejected": -18.038318634033203, "logps/chosen": -398.3902587890625, "logps/rejected": -347.632080078125, "loss": 1.0133, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6236252784729004, "rewards/margins": 0.5957668423652649, "rewards/rejected": 3.027858257293701, "step": 19280 }, { "epoch": 0.8955847532383119, "grad_norm": 25.618541717529297, "learning_rate": 3.508364671835585e-07, "logits/chosen": -19.399456024169922, "logits/rejected": -17.967363357543945, "logps/chosen": -285.56072998046875, "logps/rejected": -193.16632080078125, "loss": 0.7302, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.2617647647857666, "rewards/margins": 0.5218123197555542, "rewards/rejected": 1.7399523258209229, "step": 19290 }, { "epoch": 0.8960490273457449, "grad_norm": 163.23175048828125, "learning_rate": 3.50759088165653e-07, "logits/chosen": -18.529338836669922, "logits/rejected": -17.64297866821289, "logps/chosen": -330.73809814453125, "logps/rejected": -239.9610137939453, "loss": 0.5784, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.2602503299713135, "rewards/margins": 1.1222741603851318, "rewards/rejected": 1.137976050376892, "step": 19300 }, { 
"epoch": 0.896513301453178, "grad_norm": 44.471702575683594, "learning_rate": 3.5068170914774745e-07, "logits/chosen": -19.94266128540039, "logits/rejected": -18.469844818115234, "logps/chosen": -351.1669006347656, "logps/rejected": -308.0597839355469, "loss": 0.5257, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.204928159713745, "rewards/margins": 0.8167715072631836, "rewards/rejected": 2.3881564140319824, "step": 19310 }, { "epoch": 0.896977575560611, "grad_norm": 54.89548110961914, "learning_rate": 3.5060433012984196e-07, "logits/chosen": -19.146968841552734, "logits/rejected": -18.18211555480957, "logps/chosen": -337.1910400390625, "logps/rejected": -210.8338165283203, "loss": 0.5674, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.549833297729492, "rewards/margins": 0.9083755612373352, "rewards/rejected": 1.6414577960968018, "step": 19320 }, { "epoch": 0.897441849668044, "grad_norm": 6.305757522583008, "learning_rate": 3.5052695111193647e-07, "logits/chosen": -19.51004409790039, "logits/rejected": -18.476308822631836, "logps/chosen": -366.9761657714844, "logps/rejected": -259.7915954589844, "loss": 0.3123, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.6393730640411377, "rewards/margins": 1.7586253881454468, "rewards/rejected": 1.8807475566864014, "step": 19330 }, { "epoch": 0.8979061237754771, "grad_norm": 18.9158992767334, "learning_rate": 3.50449572094031e-07, "logits/chosen": -18.86855125427246, "logits/rejected": -17.79856300354004, "logps/chosen": -437.24755859375, "logps/rejected": -365.68402099609375, "loss": 0.7491, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.1643333435058594, "rewards/margins": 0.751406192779541, "rewards/rejected": 2.4129271507263184, "step": 19340 }, { "epoch": 0.8983703978829101, "grad_norm": 135.85855102539062, "learning_rate": 3.503721930761255e-07, "logits/chosen": -18.63794708251953, "logits/rejected": -19.608943939208984, "logps/chosen": -466.0302734375, 
"logps/rejected": -396.2575988769531, "loss": 0.9826, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.6945018768310547, "rewards/margins": -0.2157450020313263, "rewards/rejected": 3.9102470874786377, "step": 19350 }, { "epoch": 0.8988346719903431, "grad_norm": 90.847412109375, "learning_rate": 3.5029481405822e-07, "logits/chosen": -17.97934341430664, "logits/rejected": -18.259868621826172, "logps/chosen": -300.185302734375, "logps/rejected": -335.77020263671875, "loss": 0.9865, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.8260257244110107, "rewards/margins": 0.21836979687213898, "rewards/rejected": 2.6076560020446777, "step": 19360 }, { "epoch": 0.8992989460977762, "grad_norm": 0.14609216153621674, "learning_rate": 3.5021743504031446e-07, "logits/chosen": -18.672876358032227, "logits/rejected": -18.002918243408203, "logps/chosen": -398.4921875, "logps/rejected": -318.11627197265625, "loss": 0.6064, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.0590553283691406, "rewards/margins": 0.9007951021194458, "rewards/rejected": 2.1582601070404053, "step": 19370 }, { "epoch": 0.8997632202052092, "grad_norm": 43.726417541503906, "learning_rate": 3.501400560224089e-07, "logits/chosen": -17.681543350219727, "logits/rejected": -17.483131408691406, "logps/chosen": -323.20147705078125, "logps/rejected": -296.0618591308594, "loss": 0.919, "rewards/accuracies": 0.5, "rewards/chosen": 2.492924928665161, "rewards/margins": 0.19051596522331238, "rewards/rejected": 2.3024089336395264, "step": 19380 }, { "epoch": 0.9002274943126422, "grad_norm": 41.223533630371094, "learning_rate": 3.5006267700450343e-07, "logits/chosen": -18.187095642089844, "logits/rejected": -17.334857940673828, "logps/chosen": -397.4345703125, "logps/rejected": -240.9793243408203, "loss": 0.4306, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.727428913116455, "rewards/margins": 0.8294336199760437, "rewards/rejected": 1.8979952335357666, "step": 19390 }, { 
"epoch": 0.9006917684200753, "grad_norm": 6.029455661773682, "learning_rate": 3.4998529798659794e-07, "logits/chosen": -19.709993362426758, "logits/rejected": -17.981807708740234, "logps/chosen": -461.6771545410156, "logps/rejected": -260.5771179199219, "loss": 0.4053, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.223330020904541, "rewards/margins": 2.1413159370422363, "rewards/rejected": 2.0820140838623047, "step": 19400 }, { "epoch": 0.9011560425275082, "grad_norm": 0.8968028426170349, "learning_rate": 3.4990791896869245e-07, "logits/chosen": -18.775732040405273, "logits/rejected": -18.70480728149414, "logps/chosen": -345.1891784667969, "logps/rejected": -335.0086669921875, "loss": 0.8959, "rewards/accuracies": 0.5, "rewards/chosen": 3.249770402908325, "rewards/margins": 0.3342505991458893, "rewards/rejected": 2.9155197143554688, "step": 19410 }, { "epoch": 0.9016203166349412, "grad_norm": 157.14869689941406, "learning_rate": 3.498305399507869e-07, "logits/chosen": -18.329914093017578, "logits/rejected": -17.031597137451172, "logps/chosen": -317.3416748046875, "logps/rejected": -221.93319702148438, "loss": 0.5667, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.4681954383850098, "rewards/margins": 0.9345693588256836, "rewards/rejected": 1.5336260795593262, "step": 19420 }, { "epoch": 0.9020845907423743, "grad_norm": 48.86016082763672, "learning_rate": 3.497531609328814e-07, "logits/chosen": -18.649574279785156, "logits/rejected": -18.591943740844727, "logps/chosen": -491.0706481933594, "logps/rejected": -439.1622619628906, "loss": 0.7856, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7823233604431152, "rewards/margins": 0.3669288754463196, "rewards/rejected": 3.4153945446014404, "step": 19430 }, { "epoch": 0.9025488648498073, "grad_norm": 32.21444320678711, "learning_rate": 3.4967578191497594e-07, "logits/chosen": -19.481775283813477, "logits/rejected": -19.028810501098633, "logps/chosen": -392.78515625, 
"logps/rejected": -303.1368713378906, "loss": 0.603, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.591179609298706, "rewards/margins": 1.0560383796691895, "rewards/rejected": 2.5351405143737793, "step": 19440 }, { "epoch": 0.9030131389572403, "grad_norm": 53.87241744995117, "learning_rate": 3.4959840289707045e-07, "logits/chosen": -20.160961151123047, "logits/rejected": -18.8664493560791, "logps/chosen": -411.55889892578125, "logps/rejected": -264.46246337890625, "loss": 0.8965, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.102337598800659, "rewards/margins": 0.582706868648529, "rewards/rejected": 2.5196304321289062, "step": 19450 }, { "epoch": 0.9034774130646733, "grad_norm": 99.0674057006836, "learning_rate": 3.4952102387916496e-07, "logits/chosen": -18.831769943237305, "logits/rejected": -18.31068229675293, "logps/chosen": -338.88287353515625, "logps/rejected": -306.11846923828125, "loss": 0.6132, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.8583357334136963, "rewards/margins": 0.6556724309921265, "rewards/rejected": 2.2026636600494385, "step": 19460 }, { "epoch": 0.9039416871721064, "grad_norm": 63.17644119262695, "learning_rate": 3.4944364486125936e-07, "logits/chosen": -19.163753509521484, "logits/rejected": -17.87203025817871, "logps/chosen": -547.735595703125, "logps/rejected": -391.2482604980469, "loss": 0.5047, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.087292194366455, "rewards/margins": 1.2771751880645752, "rewards/rejected": 2.810116767883301, "step": 19470 }, { "epoch": 0.9044059612795394, "grad_norm": 0.41293439269065857, "learning_rate": 3.493662658433539e-07, "logits/chosen": -19.633493423461914, "logits/rejected": -19.456758499145508, "logps/chosen": -318.05352783203125, "logps/rejected": -380.0077209472656, "loss": 1.103, "rewards/accuracies": 0.5, "rewards/chosen": 3.0253686904907227, "rewards/margins": 0.2671182453632355, "rewards/rejected": 2.7582507133483887, "step": 19480 }, { 
"epoch": 0.9048702353869724, "grad_norm": 36.701316833496094, "learning_rate": 3.492888868254484e-07, "logits/chosen": -18.336952209472656, "logits/rejected": -18.165374755859375, "logps/chosen": -377.9185791015625, "logps/rejected": -310.86968994140625, "loss": 0.6822, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.451927900314331, "rewards/margins": 0.9148207902908325, "rewards/rejected": 2.537107229232788, "step": 19490 }, { "epoch": 0.9053345094944055, "grad_norm": 38.91905212402344, "learning_rate": 3.492115078075429e-07, "logits/chosen": -18.927751541137695, "logits/rejected": -17.978561401367188, "logps/chosen": -328.2293395996094, "logps/rejected": -276.53814697265625, "loss": 0.7969, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.885789394378662, "rewards/margins": 0.41853252053260803, "rewards/rejected": 2.467256546020508, "step": 19500 }, { "epoch": 0.9057987836018385, "grad_norm": 76.82722473144531, "learning_rate": 3.491341287896374e-07, "logits/chosen": -19.339025497436523, "logits/rejected": -19.079238891601562, "logps/chosen": -279.39129638671875, "logps/rejected": -305.55804443359375, "loss": 1.0537, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.3890299797058105, "rewards/margins": -0.22413232922554016, "rewards/rejected": 2.6131622791290283, "step": 19510 }, { "epoch": 0.9062630577092715, "grad_norm": 11.074430465698242, "learning_rate": 3.4905674977173187e-07, "logits/chosen": -19.69381332397461, "logits/rejected": -18.51317024230957, "logps/chosen": -398.76495361328125, "logps/rejected": -314.3983154296875, "loss": 0.5107, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.112343788146973, "rewards/margins": 1.0335958003997803, "rewards/rejected": 3.0787484645843506, "step": 19520 }, { "epoch": 0.9067273318167046, "grad_norm": 81.63055419921875, "learning_rate": 3.489793707538264e-07, "logits/chosen": -18.993114471435547, "logits/rejected": -17.72220230102539, "logps/chosen": 
-312.13677978515625, "logps/rejected": -235.0812530517578, "loss": 0.5926, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.8447153568267822, "rewards/margins": 0.9604110717773438, "rewards/rejected": 1.8843040466308594, "step": 19530 }, { "epoch": 0.9071916059241376, "grad_norm": 94.71862030029297, "learning_rate": 3.489019917359209e-07, "logits/chosen": -19.479265213012695, "logits/rejected": -19.417369842529297, "logps/chosen": -301.42633056640625, "logps/rejected": -260.0965576171875, "loss": 0.8741, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.0941600799560547, "rewards/margins": 0.13944898545742035, "rewards/rejected": 2.9547109603881836, "step": 19540 }, { "epoch": 0.9076558800315706, "grad_norm": 192.4697265625, "learning_rate": 3.488246127180154e-07, "logits/chosen": -19.36605453491211, "logits/rejected": -18.366207122802734, "logps/chosen": -330.5152282714844, "logps/rejected": -290.7113037109375, "loss": 0.917, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.8437466621398926, "rewards/margins": 1.0396435260772705, "rewards/rejected": 1.8041034936904907, "step": 19550 }, { "epoch": 0.9081201541390037, "grad_norm": 17.26537322998047, "learning_rate": 3.487472337001099e-07, "logits/chosen": -19.680341720581055, "logits/rejected": -18.7972412109375, "logps/chosen": -419.96356201171875, "logps/rejected": -327.1643371582031, "loss": 0.7045, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.846031665802002, "rewards/margins": 1.1689294576644897, "rewards/rejected": 2.6771020889282227, "step": 19560 }, { "epoch": 0.9085844282464367, "grad_norm": 109.95220184326172, "learning_rate": 3.486698546822043e-07, "logits/chosen": -18.932165145874023, "logits/rejected": -18.333147048950195, "logps/chosen": -419.043701171875, "logps/rejected": -382.869140625, "loss": 0.6519, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.8339123725891113, "rewards/margins": 0.2647155821323395, "rewards/rejected": 
2.5691967010498047, "step": 19570 }, { "epoch": 0.9090487023538697, "grad_norm": 39.371917724609375, "learning_rate": 3.4859247566429883e-07, "logits/chosen": -18.468639373779297, "logits/rejected": -18.11098861694336, "logps/chosen": -351.26373291015625, "logps/rejected": -296.0756530761719, "loss": 0.444, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.642779588699341, "rewards/margins": 0.9601823687553406, "rewards/rejected": 2.6825973987579346, "step": 19580 }, { "epoch": 0.9095129764613028, "grad_norm": 181.58563232421875, "learning_rate": 3.4851509664639334e-07, "logits/chosen": -19.028793334960938, "logits/rejected": -17.908071517944336, "logps/chosen": -454.41192626953125, "logps/rejected": -356.5639343261719, "loss": 1.3309, "rewards/accuracies": 0.5, "rewards/chosen": 3.006913661956787, "rewards/margins": -0.017505263909697533, "rewards/rejected": 3.024418354034424, "step": 19590 }, { "epoch": 0.9099772505687358, "grad_norm": 13.94489860534668, "learning_rate": 3.4843771762848785e-07, "logits/chosen": -19.14845085144043, "logits/rejected": -18.699565887451172, "logps/chosen": -489.94781494140625, "logps/rejected": -388.79693603515625, "loss": 0.6182, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.2569472789764404, "rewards/margins": 0.7199212908744812, "rewards/rejected": 2.5370259284973145, "step": 19600 }, { "epoch": 0.9104415246761688, "grad_norm": 100.93639373779297, "learning_rate": 3.4836033861058236e-07, "logits/chosen": -19.517934799194336, "logits/rejected": -18.479209899902344, "logps/chosen": -385.176513671875, "logps/rejected": -300.7269287109375, "loss": 0.4493, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5043387413024902, "rewards/margins": 1.4241201877593994, "rewards/rejected": 2.08021879196167, "step": 19610 }, { "epoch": 0.9109057987836018, "grad_norm": 164.20167541503906, "learning_rate": 3.482829595926768e-07, "logits/chosen": -18.31325912475586, "logits/rejected": -16.85953140258789, 
"logps/chosen": -418.30047607421875, "logps/rejected": -260.3171081542969, "loss": 0.5636, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4591174125671387, "rewards/margins": 1.1926581859588623, "rewards/rejected": 2.2664592266082764, "step": 19620 }, { "epoch": 0.9113700728910349, "grad_norm": 21.711864471435547, "learning_rate": 3.4820558057477133e-07, "logits/chosen": -17.793886184692383, "logits/rejected": -17.693984985351562, "logps/chosen": -360.34735107421875, "logps/rejected": -333.4924621582031, "loss": 0.7758, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.1779873371124268, "rewards/margins": 0.5450159311294556, "rewards/rejected": 1.6329715251922607, "step": 19630 }, { "epoch": 0.9118343469984679, "grad_norm": 101.20448303222656, "learning_rate": 3.4812820155686584e-07, "logits/chosen": -18.69869041442871, "logits/rejected": -18.269906997680664, "logps/chosen": -429.3857421875, "logps/rejected": -369.8059997558594, "loss": 0.8967, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.6958518028259277, "rewards/margins": -0.11139827966690063, "rewards/rejected": 2.8072502613067627, "step": 19640 }, { "epoch": 0.9122986211059009, "grad_norm": 6.480894088745117, "learning_rate": 3.4805082253896035e-07, "logits/chosen": -19.50531578063965, "logits/rejected": -18.032426834106445, "logps/chosen": -450.84295654296875, "logps/rejected": -286.3265380859375, "loss": 0.5636, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.457068681716919, "rewards/margins": 1.3896982669830322, "rewards/rejected": 2.0673701763153076, "step": 19650 }, { "epoch": 0.912762895213334, "grad_norm": 54.919761657714844, "learning_rate": 3.479734435210548e-07, "logits/chosen": -18.530517578125, "logits/rejected": -18.078227996826172, "logps/chosen": -347.76434326171875, "logps/rejected": -323.0293273925781, "loss": 0.8729, "rewards/accuracies": 0.5, "rewards/chosen": 2.497729778289795, "rewards/margins": 0.08637209981679916, 
"rewards/rejected": 2.411357879638672, "step": 19660 }, { "epoch": 0.913227169320767, "grad_norm": 25.004722595214844, "learning_rate": 3.4789606450314927e-07, "logits/chosen": -18.893672943115234, "logits/rejected": -18.733081817626953, "logps/chosen": -319.52471923828125, "logps/rejected": -293.2361145019531, "loss": 0.5613, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.6838226318359375, "rewards/margins": 0.45404529571533203, "rewards/rejected": 2.2297770977020264, "step": 19670 }, { "epoch": 0.9136914434282, "grad_norm": 184.23960876464844, "learning_rate": 3.478186854852438e-07, "logits/chosen": -18.874719619750977, "logits/rejected": -17.33154296875, "logps/chosen": -317.34991455078125, "logps/rejected": -233.3376922607422, "loss": 0.3801, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.8646998405456543, "rewards/margins": 1.8318917751312256, "rewards/rejected": 1.0328081846237183, "step": 19680 }, { "epoch": 0.9141557175356331, "grad_norm": 3.109112501144409, "learning_rate": 3.477413064673383e-07, "logits/chosen": -18.882205963134766, "logits/rejected": -18.15183448791504, "logps/chosen": -408.38262939453125, "logps/rejected": -301.39141845703125, "loss": 0.6517, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.3591742515563965, "rewards/margins": 1.261705756187439, "rewards/rejected": 2.097468852996826, "step": 19690 }, { "epoch": 0.9146199916430661, "grad_norm": 50.91168975830078, "learning_rate": 3.476639274494328e-07, "logits/chosen": -20.268695831298828, "logits/rejected": -18.74679946899414, "logps/chosen": -412.1012268066406, "logps/rejected": -341.2098083496094, "loss": 0.4271, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.5577139854431152, "rewards/margins": 1.3729338645935059, "rewards/rejected": 2.1847805976867676, "step": 19700 }, { "epoch": 0.9150842657504991, "grad_norm": 69.80738830566406, "learning_rate": 3.475865484315273e-07, "logits/chosen": -18.282840728759766, "logits/rejected": 
-17.216468811035156, "logps/chosen": -385.3956604003906, "logps/rejected": -325.7275085449219, "loss": 0.6389, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.8058531284332275, "rewards/margins": 0.754103422164917, "rewards/rejected": 2.0517494678497314, "step": 19710 }, { "epoch": 0.9155485398579322, "grad_norm": 47.70536422729492, "learning_rate": 3.475091694136218e-07, "logits/chosen": -18.890256881713867, "logits/rejected": -17.53911018371582, "logps/chosen": -401.88983154296875, "logps/rejected": -347.184814453125, "loss": 0.4284, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.431971311569214, "rewards/margins": 1.2376086711883545, "rewards/rejected": 2.1943628787994385, "step": 19720 }, { "epoch": 0.9160128139653652, "grad_norm": 155.78848266601562, "learning_rate": 3.474317903957163e-07, "logits/chosen": -19.336471557617188, "logits/rejected": -18.621103286743164, "logps/chosen": -396.38275146484375, "logps/rejected": -357.4229736328125, "loss": 0.7416, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.8941876888275146, "rewards/margins": 0.1743377447128296, "rewards/rejected": 2.7198500633239746, "step": 19730 }, { "epoch": 0.9164770880727982, "grad_norm": 70.21986389160156, "learning_rate": 3.473544113778108e-07, "logits/chosen": -19.03970718383789, "logits/rejected": -17.86296844482422, "logps/chosen": -453.86328125, "logps/rejected": -343.0663757324219, "loss": 0.4872, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.9926884174346924, "rewards/margins": 1.2793889045715332, "rewards/rejected": 2.7132999897003174, "step": 19740 }, { "epoch": 0.9169413621802313, "grad_norm": 30.153413772583008, "learning_rate": 3.472770323599053e-07, "logits/chosen": -19.505855560302734, "logits/rejected": -18.982242584228516, "logps/chosen": -368.0855712890625, "logps/rejected": -288.8140563964844, "loss": 0.4817, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.1959917545318604, "rewards/margins": 
0.9400616884231567, "rewards/rejected": 2.255929946899414, "step": 19750 }, { "epoch": 0.9174056362876643, "grad_norm": 0.5395578145980835, "learning_rate": 3.4719965334199977e-07, "logits/chosen": -19.146854400634766, "logits/rejected": -18.200345993041992, "logps/chosen": -446.06243896484375, "logps/rejected": -361.9828186035156, "loss": 0.6246, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.173975467681885, "rewards/margins": 1.3000268936157227, "rewards/rejected": 2.873948335647583, "step": 19760 }, { "epoch": 0.9178699103950972, "grad_norm": 210.9971923828125, "learning_rate": 3.471222743240942e-07, "logits/chosen": -17.837398529052734, "logits/rejected": -17.77560806274414, "logps/chosen": -303.01763916015625, "logps/rejected": -268.9952392578125, "loss": 0.9779, "rewards/accuracies": 0.5, "rewards/chosen": 2.447054386138916, "rewards/margins": 0.0546218641102314, "rewards/rejected": 2.39243221282959, "step": 19770 }, { "epoch": 0.9183341845025303, "grad_norm": 0.9895266890525818, "learning_rate": 3.4704489530618874e-07, "logits/chosen": -18.63674545288086, "logits/rejected": -17.636524200439453, "logps/chosen": -373.16876220703125, "logps/rejected": -290.575439453125, "loss": 0.782, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.8524727821350098, "rewards/margins": 1.4289487600326538, "rewards/rejected": 2.4235239028930664, "step": 19780 }, { "epoch": 0.9187984586099633, "grad_norm": 48.67668914794922, "learning_rate": 3.4696751628828325e-07, "logits/chosen": -18.990474700927734, "logits/rejected": -18.28099822998047, "logps/chosen": -374.3558349609375, "logps/rejected": -324.8853454589844, "loss": 0.6408, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.3844501972198486, "rewards/margins": 0.502139687538147, "rewards/rejected": 2.882310628890991, "step": 19790 }, { "epoch": 0.9192627327173963, "grad_norm": 8.155476570129395, "learning_rate": 3.4689013727037776e-07, "logits/chosen": -19.573434829711914, 
"logits/rejected": -18.507984161376953, "logps/chosen": -458.56781005859375, "logps/rejected": -366.02886962890625, "loss": 0.4783, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.6516551971435547, "rewards/margins": 1.3153588771820068, "rewards/rejected": 2.336296558380127, "step": 19800 }, { "epoch": 0.9197270068248293, "grad_norm": 40.28862380981445, "learning_rate": 3.4681275825247227e-07, "logits/chosen": -17.914134979248047, "logits/rejected": -17.359947204589844, "logps/chosen": -464.64892578125, "logps/rejected": -336.3880615234375, "loss": 0.7004, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.676708936691284, "rewards/margins": 0.758575439453125, "rewards/rejected": 1.9181333780288696, "step": 19810 }, { "epoch": 0.9201912809322624, "grad_norm": 182.67831420898438, "learning_rate": 3.4673537923456673e-07, "logits/chosen": -18.075115203857422, "logits/rejected": -18.051218032836914, "logps/chosen": -351.1114196777344, "logps/rejected": -388.07244873046875, "loss": 1.2211, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.7166635990142822, "rewards/margins": -0.483971506357193, "rewards/rejected": 3.2006351947784424, "step": 19820 }, { "epoch": 0.9206555550396954, "grad_norm": 30.73847007751465, "learning_rate": 3.4665800021666124e-07, "logits/chosen": -18.698402404785156, "logits/rejected": -18.513967514038086, "logps/chosen": -269.63946533203125, "logps/rejected": -266.3391418457031, "loss": 1.0454, "rewards/accuracies": 0.5, "rewards/chosen": 2.3336169719696045, "rewards/margins": -0.16667680442333221, "rewards/rejected": 2.500293731689453, "step": 19830 }, { "epoch": 0.9211198291471284, "grad_norm": 80.87696075439453, "learning_rate": 3.4658062119875575e-07, "logits/chosen": -18.666669845581055, "logits/rejected": -17.800140380859375, "logps/chosen": -468.211181640625, "logps/rejected": -319.71197509765625, "loss": 0.7623, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.042451858520508, 
"rewards/margins": 1.4032018184661865, "rewards/rejected": 2.639249801635742, "step": 19840 }, { "epoch": 0.9215841032545615, "grad_norm": 217.8729248046875, "learning_rate": 3.4650324218085026e-07, "logits/chosen": -17.4241886138916, "logits/rejected": -17.380857467651367, "logps/chosen": -422.57159423828125, "logps/rejected": -476.8330078125, "loss": 1.8189, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.744903326034546, "rewards/margins": -0.6934782266616821, "rewards/rejected": 3.4383816719055176, "step": 19850 }, { "epoch": 0.9220483773619945, "grad_norm": 89.43595123291016, "learning_rate": 3.464258631629447e-07, "logits/chosen": -18.424732208251953, "logits/rejected": -17.386417388916016, "logps/chosen": -501.00616455078125, "logps/rejected": -375.80450439453125, "loss": 0.4275, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.232034206390381, "rewards/margins": 1.5767066478729248, "rewards/rejected": 2.6553280353546143, "step": 19860 }, { "epoch": 0.9225126514694275, "grad_norm": 79.26083374023438, "learning_rate": 3.463484841450392e-07, "logits/chosen": -18.378530502319336, "logits/rejected": -18.87567901611328, "logps/chosen": -309.2638244628906, "logps/rejected": -380.665771484375, "loss": 1.2595, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": 2.5232911109924316, "rewards/margins": -0.7806623578071594, "rewards/rejected": 3.3039536476135254, "step": 19870 }, { "epoch": 0.9229769255768606, "grad_norm": 126.33283233642578, "learning_rate": 3.462711051271337e-07, "logits/chosen": -18.18756103515625, "logits/rejected": -17.583097457885742, "logps/chosen": -444.10540771484375, "logps/rejected": -375.6889343261719, "loss": 0.7086, "rewards/accuracies": 0.5, "rewards/chosen": 3.654336929321289, "rewards/margins": 0.4453276991844177, "rewards/rejected": 3.2090091705322266, "step": 19880 }, { "epoch": 0.9234411996842936, "grad_norm": 24.107437133789062, "learning_rate": 3.461937261092282e-07, "logits/chosen": 
-18.818309783935547, "logits/rejected": -18.513031005859375, "logps/chosen": -328.3565368652344, "logps/rejected": -178.4074249267578, "loss": 0.3882, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.5783066749572754, "rewards/margins": 1.592186689376831, "rewards/rejected": 0.9861198663711548, "step": 19890 }, { "epoch": 0.9239054737917266, "grad_norm": 154.732666015625, "learning_rate": 3.461163470913227e-07, "logits/chosen": -19.22548484802246, "logits/rejected": -18.823091506958008, "logps/chosen": -342.72412109375, "logps/rejected": -297.62994384765625, "loss": 0.6859, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.5052695274353027, "rewards/margins": 0.3214937746524811, "rewards/rejected": 3.1837756633758545, "step": 19900 }, { "epoch": 0.9243697478991597, "grad_norm": 40.30570602416992, "learning_rate": 3.460389680734172e-07, "logits/chosen": -18.76654052734375, "logits/rejected": -17.93143081665039, "logps/chosen": -440.2952575683594, "logps/rejected": -338.4970397949219, "loss": 0.4612, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.1662590503692627, "rewards/margins": 0.8765301704406738, "rewards/rejected": 2.2897286415100098, "step": 19910 }, { "epoch": 0.9248340220065927, "grad_norm": 69.83946990966797, "learning_rate": 3.459615890555117e-07, "logits/chosen": -18.86166763305664, "logits/rejected": -18.155534744262695, "logps/chosen": -399.43939208984375, "logps/rejected": -321.8032531738281, "loss": 0.6351, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.3124194145202637, "rewards/margins": 1.0676202774047852, "rewards/rejected": 2.2447991371154785, "step": 19920 }, { "epoch": 0.9252982961140257, "grad_norm": 132.7205810546875, "learning_rate": 3.458842100376062e-07, "logits/chosen": -18.90573501586914, "logits/rejected": -18.108261108398438, "logps/chosen": -516.1529541015625, "logps/rejected": -376.09210205078125, "loss": 0.559, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 
3.963611602783203, "rewards/margins": 1.1413733959197998, "rewards/rejected": 2.822237730026245, "step": 19930 }, { "epoch": 0.9257625702214588, "grad_norm": 37.9665412902832, "learning_rate": 3.458068310197007e-07, "logits/chosen": -18.76681137084961, "logits/rejected": -18.477218627929688, "logps/chosen": -393.6209411621094, "logps/rejected": -268.4458312988281, "loss": 0.2871, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.6673007011413574, "rewards/margins": 1.6838476657867432, "rewards/rejected": 1.9834537506103516, "step": 19940 }, { "epoch": 0.9262268443288918, "grad_norm": 31.85274887084961, "learning_rate": 3.4572945200179516e-07, "logits/chosen": -18.38429069519043, "logits/rejected": -18.23257064819336, "logps/chosen": -388.175537109375, "logps/rejected": -315.5467224121094, "loss": 0.9057, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2789807319641113, "rewards/margins": 0.7903600931167603, "rewards/rejected": 2.4886205196380615, "step": 19950 }, { "epoch": 0.9266911184363248, "grad_norm": 64.87410736083984, "learning_rate": 3.456520729838897e-07, "logits/chosen": -18.124683380126953, "logits/rejected": -17.76565170288086, "logps/chosen": -329.5273742675781, "logps/rejected": -367.88018798828125, "loss": 0.72, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.3183364868164062, "rewards/margins": 0.4664899706840515, "rewards/rejected": 2.85184645652771, "step": 19960 }, { "epoch": 0.9271553925437578, "grad_norm": 82.59078216552734, "learning_rate": 3.4557469396598413e-07, "logits/chosen": -19.732662200927734, "logits/rejected": -19.545183181762695, "logps/chosen": -404.5399169921875, "logps/rejected": -396.3082580566406, "loss": 0.3893, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.9480438232421875, "rewards/margins": 1.3753716945648193, "rewards/rejected": 2.572672128677368, "step": 19970 }, { "epoch": 0.9276196666511909, "grad_norm": 227.80421447753906, "learning_rate": 3.4549731494807865e-07, 
"logits/chosen": -18.76026725769043, "logits/rejected": -17.917421340942383, "logps/chosen": -430.27301025390625, "logps/rejected": -370.8994140625, "loss": 0.6602, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.135042667388916, "rewards/margins": 1.4398761987686157, "rewards/rejected": 2.69516658782959, "step": 19980 }, { "epoch": 0.9280839407586239, "grad_norm": 119.9244155883789, "learning_rate": 3.4541993593017316e-07, "logits/chosen": -18.560768127441406, "logits/rejected": -18.175912857055664, "logps/chosen": -283.95819091796875, "logps/rejected": -272.0840148925781, "loss": 0.911, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.8996436595916748, "rewards/margins": -0.10654398053884506, "rewards/rejected": 2.006187915802002, "step": 19990 }, { "epoch": 0.9285482148660569, "grad_norm": 88.056884765625, "learning_rate": 3.4534255691226767e-07, "logits/chosen": -19.062776565551758, "logits/rejected": -18.209705352783203, "logps/chosen": -330.6055603027344, "logps/rejected": -278.0043029785156, "loss": 0.5085, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.792956829071045, "rewards/margins": 1.0682045221328735, "rewards/rejected": 1.7247521877288818, "step": 20000 }, { "epoch": 0.92901248897349, "grad_norm": 223.98497009277344, "learning_rate": 3.452651778943622e-07, "logits/chosen": -19.427698135375977, "logits/rejected": -18.86754035949707, "logps/chosen": -458.95269775390625, "logps/rejected": -421.4971618652344, "loss": 0.8762, "rewards/accuracies": 0.5, "rewards/chosen": 3.482771635055542, "rewards/margins": -0.05343541502952576, "rewards/rejected": 3.536207675933838, "step": 20010 }, { "epoch": 0.929476763080923, "grad_norm": 24.35014533996582, "learning_rate": 3.4518779887645664e-07, "logits/chosen": -19.65399742126465, "logits/rejected": -18.996379852294922, "logps/chosen": -437.7862243652344, "logps/rejected": -376.5979919433594, "loss": 0.53, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 
3.7308719158172607, "rewards/margins": 0.6422184109687805, "rewards/rejected": 3.088653564453125, "step": 20020 }, { "epoch": 0.929941037188356, "grad_norm": 60.72016143798828, "learning_rate": 3.4511041985855115e-07, "logits/chosen": -19.39385414123535, "logits/rejected": -19.236228942871094, "logps/chosen": -391.00677490234375, "logps/rejected": -412.0589904785156, "loss": 0.6282, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4342589378356934, "rewards/margins": 0.4834723472595215, "rewards/rejected": 2.950786590576172, "step": 20030 }, { "epoch": 0.9304053112957891, "grad_norm": 91.75064849853516, "learning_rate": 3.4503304084064566e-07, "logits/chosen": -18.437326431274414, "logits/rejected": -17.273487091064453, "logps/chosen": -407.069091796875, "logps/rejected": -215.5881805419922, "loss": 0.3854, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.659262180328369, "rewards/margins": 1.9844989776611328, "rewards/rejected": 1.6747627258300781, "step": 20040 }, { "epoch": 0.9308695854032221, "grad_norm": 77.28791809082031, "learning_rate": 3.449556618227401e-07, "logits/chosen": -18.57268714904785, "logits/rejected": -18.520198822021484, "logps/chosen": -384.4777526855469, "logps/rejected": -408.8514709472656, "loss": 1.3249, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 3.4276492595672607, "rewards/margins": -0.24216365814208984, "rewards/rejected": 3.669813632965088, "step": 20050 }, { "epoch": 0.9313338595106551, "grad_norm": 67.07270812988281, "learning_rate": 3.4487828280483463e-07, "logits/chosen": -18.929027557373047, "logits/rejected": -18.174240112304688, "logps/chosen": -426.72247314453125, "logps/rejected": -351.1371154785156, "loss": 0.5793, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.8405966758728027, "rewards/margins": 0.513359785079956, "rewards/rejected": 3.3272368907928467, "step": 20060 }, { "epoch": 0.9317981336180882, "grad_norm": 24.09621810913086, "learning_rate": 
3.448009037869291e-07, "logits/chosen": -19.052831649780273, "logits/rejected": -18.386831283569336, "logps/chosen": -383.4922790527344, "logps/rejected": -364.950927734375, "loss": 1.1691, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.661492347717285, "rewards/margins": -0.2782323360443115, "rewards/rejected": 2.9397246837615967, "step": 20070 }, { "epoch": 0.9322624077255212, "grad_norm": 99.40797424316406, "learning_rate": 3.447235247690236e-07, "logits/chosen": -18.72879981994629, "logits/rejected": -17.624282836914062, "logps/chosen": -415.0321350097656, "logps/rejected": -264.72784423828125, "loss": 1.0738, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.6081383228302, "rewards/margins": 0.4409011900424957, "rewards/rejected": 2.1672370433807373, "step": 20080 }, { "epoch": 0.9327266818329542, "grad_norm": 27.327844619750977, "learning_rate": 3.446461457511181e-07, "logits/chosen": -18.772403717041016, "logits/rejected": -17.24156379699707, "logps/chosen": -379.22467041015625, "logps/rejected": -239.35556030273438, "loss": 0.3463, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.961304187774658, "rewards/margins": 2.216146230697632, "rewards/rejected": 1.7451578378677368, "step": 20090 }, { "epoch": 0.9331909559403873, "grad_norm": 25.40734100341797, "learning_rate": 3.445687667332126e-07, "logits/chosen": -18.256254196166992, "logits/rejected": -17.59698486328125, "logps/chosen": -435.255615234375, "logps/rejected": -354.01776123046875, "loss": 0.7607, "rewards/accuracies": 0.5, "rewards/chosen": 3.3196043968200684, "rewards/margins": 0.305791437625885, "rewards/rejected": 3.013813018798828, "step": 20100 }, { "epoch": 0.9336552300478203, "grad_norm": 95.11824798583984, "learning_rate": 3.4449138771530713e-07, "logits/chosen": -19.425477981567383, "logits/rejected": -19.075132369995117, "logps/chosen": -442.5234375, "logps/rejected": -342.87762451171875, "loss": 0.6203, "rewards/accuracies": 0.5, "rewards/chosen": 
3.3508009910583496, "rewards/margins": 0.6334498524665833, "rewards/rejected": 2.7173514366149902, "step": 20110 }, { "epoch": 0.9341195041552532, "grad_norm": 13.555120468139648, "learning_rate": 3.444140086974016e-07, "logits/chosen": -19.09042739868164, "logits/rejected": -18.513391494750977, "logps/chosen": -432.9292907714844, "logps/rejected": -353.97998046875, "loss": 0.8801, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.6307544708251953, "rewards/margins": 0.5384446978569031, "rewards/rejected": 3.0923097133636475, "step": 20120 }, { "epoch": 0.9345837782626862, "grad_norm": 183.46636962890625, "learning_rate": 3.443366296794961e-07, "logits/chosen": -19.643978118896484, "logits/rejected": -18.597423553466797, "logps/chosen": -426.94866943359375, "logps/rejected": -310.55950927734375, "loss": 0.4757, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.6418890953063965, "rewards/margins": 0.889116644859314, "rewards/rejected": 1.752772331237793, "step": 20130 }, { "epoch": 0.9350480523701193, "grad_norm": 184.965087890625, "learning_rate": 3.4425925066159056e-07, "logits/chosen": -19.272998809814453, "logits/rejected": -18.946670532226562, "logps/chosen": -409.53411865234375, "logps/rejected": -378.48394775390625, "loss": 0.4982, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.421208143234253, "rewards/margins": 0.6855244636535645, "rewards/rejected": 2.7356836795806885, "step": 20140 }, { "epoch": 0.9355123264775523, "grad_norm": 48.552711486816406, "learning_rate": 3.4418187164368507e-07, "logits/chosen": -18.95549201965332, "logits/rejected": -18.463863372802734, "logps/chosen": -474.2198181152344, "logps/rejected": -410.8292541503906, "loss": 0.63, "rewards/accuracies": 0.5, "rewards/chosen": 3.2025344371795654, "rewards/margins": 0.4014406204223633, "rewards/rejected": 2.8010940551757812, "step": 20150 }, { "epoch": 0.9359766005849853, "grad_norm": 100.17538452148438, "learning_rate": 3.441044926257796e-07, 
"logits/chosen": -19.022777557373047, "logits/rejected": -18.669797897338867, "logps/chosen": -380.541259765625, "logps/rejected": -318.3994140625, "loss": 0.9324, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.3890395164489746, "rewards/margins": -0.10123801231384277, "rewards/rejected": 2.4902777671813965, "step": 20160 }, { "epoch": 0.9364408746924184, "grad_norm": 62.06541442871094, "learning_rate": 3.4402711360787404e-07, "logits/chosen": -18.919103622436523, "logits/rejected": -18.66873550415039, "logps/chosen": -315.572998046875, "logps/rejected": -244.447509765625, "loss": 0.7156, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.922898530960083, "rewards/margins": 0.3601200580596924, "rewards/rejected": 1.5627782344818115, "step": 20170 }, { "epoch": 0.9369051487998514, "grad_norm": 25.389116287231445, "learning_rate": 3.4394973458996855e-07, "logits/chosen": -18.982873916625977, "logits/rejected": -18.14048194885254, "logps/chosen": -410.0224609375, "logps/rejected": -369.6845397949219, "loss": 0.59, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.245434522628784, "rewards/margins": 0.5233614444732666, "rewards/rejected": 2.7220730781555176, "step": 20180 }, { "epoch": 0.9373694229072844, "grad_norm": 33.86468505859375, "learning_rate": 3.4387235557206306e-07, "logits/chosen": -19.557872772216797, "logits/rejected": -17.65326499938965, "logps/chosen": -271.055908203125, "logps/rejected": -160.15939331054688, "loss": 0.3472, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.524311065673828, "rewards/margins": 1.4427142143249512, "rewards/rejected": 1.0815967321395874, "step": 20190 }, { "epoch": 0.9378336970147175, "grad_norm": 69.12953186035156, "learning_rate": 3.437949765541576e-07, "logits/chosen": -18.696502685546875, "logits/rejected": -18.030858993530273, "logps/chosen": -300.4468994140625, "logps/rejected": -240.0150604248047, "loss": 0.4683, "rewards/accuracies": 0.800000011920929, 
"rewards/chosen": 2.514591932296753, "rewards/margins": 0.627228856086731, "rewards/rejected": 1.887363076210022, "step": 20200 }, { "epoch": 0.9382979711221505, "grad_norm": 46.70167922973633, "learning_rate": 3.437175975362521e-07, "logits/chosen": -19.032094955444336, "logits/rejected": -18.36336898803711, "logps/chosen": -449.69696044921875, "logps/rejected": -386.4261474609375, "loss": 0.6486, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1366336345672607, "rewards/margins": 0.4866129755973816, "rewards/rejected": 2.6500210762023926, "step": 20210 }, { "epoch": 0.9387622452295835, "grad_norm": 71.06326293945312, "learning_rate": 3.4364021851834655e-07, "logits/chosen": -18.270397186279297, "logits/rejected": -18.348201751708984, "logps/chosen": -348.01776123046875, "logps/rejected": -382.482177734375, "loss": 1.2109, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.9877755641937256, "rewards/margins": -0.4455896317958832, "rewards/rejected": 3.4333653450012207, "step": 20220 }, { "epoch": 0.9392265193370166, "grad_norm": 120.45796203613281, "learning_rate": 3.4356283950044106e-07, "logits/chosen": -18.215566635131836, "logits/rejected": -18.495405197143555, "logps/chosen": -384.8868713378906, "logps/rejected": -427.89874267578125, "loss": 1.0432, "rewards/accuracies": 0.5, "rewards/chosen": 3.0470082759857178, "rewards/margins": -0.10640501976013184, "rewards/rejected": 3.1534130573272705, "step": 20230 }, { "epoch": 0.9396907934444496, "grad_norm": 52.5286979675293, "learning_rate": 3.434854604825355e-07, "logits/chosen": -19.475479125976562, "logits/rejected": -18.45155143737793, "logps/chosen": -485.8424377441406, "logps/rejected": -367.8677978515625, "loss": 0.6042, "rewards/accuracies": 0.5, "rewards/chosen": 3.371112108230591, "rewards/margins": 0.6528903841972351, "rewards/rejected": 2.718221426010132, "step": 20240 }, { "epoch": 0.9401550675518826, "grad_norm": 24.28602409362793, "learning_rate": 3.4340808146463003e-07, 
"logits/chosen": -19.003860473632812, "logits/rejected": -18.49900245666504, "logps/chosen": -468.4183044433594, "logps/rejected": -366.36212158203125, "loss": 0.4481, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.6351120471954346, "rewards/margins": 1.3383972644805908, "rewards/rejected": 2.296715021133423, "step": 20250 }, { "epoch": 0.9406193416593157, "grad_norm": 52.001461029052734, "learning_rate": 3.4333070244672454e-07, "logits/chosen": -18.663616180419922, "logits/rejected": -19.102014541625977, "logps/chosen": -187.9561309814453, "logps/rejected": -239.0773162841797, "loss": 1.1751, "rewards/accuracies": 0.5, "rewards/chosen": 1.3750693798065186, "rewards/margins": -0.5155487656593323, "rewards/rejected": 1.8906185626983643, "step": 20260 }, { "epoch": 0.9410836157667487, "grad_norm": 42.55584716796875, "learning_rate": 3.43253323428819e-07, "logits/chosen": -18.65060043334961, "logits/rejected": -18.28651237487793, "logps/chosen": -417.87237548828125, "logps/rejected": -287.7526550292969, "loss": 0.5317, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.514477252960205, "rewards/margins": 0.9532960653305054, "rewards/rejected": 2.561180830001831, "step": 20270 }, { "epoch": 0.9415478898741817, "grad_norm": 65.17577362060547, "learning_rate": 3.431759444109135e-07, "logits/chosen": -20.018861770629883, "logits/rejected": -18.81221580505371, "logps/chosen": -388.86444091796875, "logps/rejected": -391.2484130859375, "loss": 0.8024, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.447354793548584, "rewards/margins": 0.36427366733551025, "rewards/rejected": 3.0830817222595215, "step": 20280 }, { "epoch": 0.9420121639816147, "grad_norm": 222.19776916503906, "learning_rate": 3.43098565393008e-07, "logits/chosen": -19.677303314208984, "logits/rejected": -18.454301834106445, "logps/chosen": -533.0641479492188, "logps/rejected": -371.2577819824219, "loss": 0.4153, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 
4.790423393249512, "rewards/margins": 1.5122578144073486, "rewards/rejected": 3.278165340423584, "step": 20290 }, { "epoch": 0.9424764380890478, "grad_norm": 294.2801208496094, "learning_rate": 3.4302118637510253e-07, "logits/chosen": -18.73443603515625, "logits/rejected": -18.557451248168945, "logps/chosen": -348.12872314453125, "logps/rejected": -396.78460693359375, "loss": 1.5724, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.122380495071411, "rewards/margins": -0.21556691825389862, "rewards/rejected": 3.337947368621826, "step": 20300 }, { "epoch": 0.9429407121964808, "grad_norm": 43.961143493652344, "learning_rate": 3.4294380735719704e-07, "logits/chosen": -19.26524543762207, "logits/rejected": -18.213878631591797, "logps/chosen": -436.7548828125, "logps/rejected": -283.4158020019531, "loss": 0.6482, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.644385814666748, "rewards/margins": 1.2293962240219116, "rewards/rejected": 2.414989709854126, "step": 20310 }, { "epoch": 0.9434049863039138, "grad_norm": 140.11883544921875, "learning_rate": 3.428664283392915e-07, "logits/chosen": -18.42532730102539, "logits/rejected": -18.289974212646484, "logps/chosen": -379.7774963378906, "logps/rejected": -339.2480773925781, "loss": 0.8617, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2989017963409424, "rewards/margins": 0.3379422128200531, "rewards/rejected": 2.9609599113464355, "step": 20320 }, { "epoch": 0.9438692604113469, "grad_norm": 38.004215240478516, "learning_rate": 3.4278904932138596e-07, "logits/chosen": -18.8382511138916, "logits/rejected": -18.007755279541016, "logps/chosen": -425.5155334472656, "logps/rejected": -366.9013671875, "loss": 0.7577, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7141594886779785, "rewards/margins": 0.6500507593154907, "rewards/rejected": 3.0641086101531982, "step": 20330 }, { "epoch": 0.9443335345187799, "grad_norm": 168.383056640625, "learning_rate": 3.4271167030348047e-07, 
"logits/chosen": -20.385486602783203, "logits/rejected": -18.37870216369629, "logps/chosen": -401.3854064941406, "logps/rejected": -286.6683654785156, "loss": 0.4739, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.03110933303833, "rewards/margins": 1.8461068868637085, "rewards/rejected": 2.185001850128174, "step": 20340 }, { "epoch": 0.9447978086262129, "grad_norm": 76.49650573730469, "learning_rate": 3.42634291285575e-07, "logits/chosen": -18.486286163330078, "logits/rejected": -17.990673065185547, "logps/chosen": -445.13543701171875, "logps/rejected": -394.27313232421875, "loss": 0.5758, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.635971784591675, "rewards/margins": 0.6926935911178589, "rewards/rejected": 2.9432783126831055, "step": 20350 }, { "epoch": 0.945262082733646, "grad_norm": 91.77403259277344, "learning_rate": 3.425569122676695e-07, "logits/chosen": -17.689022064208984, "logits/rejected": -17.455333709716797, "logps/chosen": -279.0517883300781, "logps/rejected": -271.1351318359375, "loss": 0.9274, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.5766024589538574, "rewards/margins": 0.19435808062553406, "rewards/rejected": 2.38224458694458, "step": 20360 }, { "epoch": 0.945726356841079, "grad_norm": 54.31019973754883, "learning_rate": 3.4247953324976395e-07, "logits/chosen": -18.975299835205078, "logits/rejected": -18.504547119140625, "logps/chosen": -478.04974365234375, "logps/rejected": -272.389892578125, "loss": 0.9245, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6378026008605957, "rewards/margins": 1.238505482673645, "rewards/rejected": 2.3992972373962402, "step": 20370 }, { "epoch": 0.946190630948512, "grad_norm": 182.16522216796875, "learning_rate": 3.4240215423185846e-07, "logits/chosen": -18.669401168823242, "logits/rejected": -18.084514617919922, "logps/chosen": -476.379638671875, "logps/rejected": -429.30712890625, "loss": 0.5292, "rewards/accuracies": 0.800000011920929, 
"rewards/chosen": 3.339691638946533, "rewards/margins": 1.1872841119766235, "rewards/rejected": 2.152407169342041, "step": 20380 }, { "epoch": 0.9466549050559451, "grad_norm": 199.67535400390625, "learning_rate": 3.4232477521395297e-07, "logits/chosen": -19.37830924987793, "logits/rejected": -18.417255401611328, "logps/chosen": -412.8629455566406, "logps/rejected": -371.08514404296875, "loss": 0.5288, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9479153156280518, "rewards/margins": 0.8254526853561401, "rewards/rejected": 2.122462749481201, "step": 20390 }, { "epoch": 0.9471191791633781, "grad_norm": 30.166259765625, "learning_rate": 3.422473961960475e-07, "logits/chosen": -18.971765518188477, "logits/rejected": -18.858278274536133, "logps/chosen": -395.52679443359375, "logps/rejected": -392.086669921875, "loss": 0.6079, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.970867156982422, "rewards/margins": 0.3657166063785553, "rewards/rejected": 2.6051506996154785, "step": 20400 }, { "epoch": 0.9475834532708111, "grad_norm": 241.87435913085938, "learning_rate": 3.42170017178142e-07, "logits/chosen": -18.754230499267578, "logits/rejected": -17.80422592163086, "logps/chosen": -355.3548278808594, "logps/rejected": -283.67083740234375, "loss": 0.9175, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.7617175579071045, "rewards/margins": 0.25940755009651184, "rewards/rejected": 2.502309799194336, "step": 20410 }, { "epoch": 0.9480477273782442, "grad_norm": 28.656553268432617, "learning_rate": 3.4209263816023645e-07, "logits/chosen": -19.35875701904297, "logits/rejected": -17.560604095458984, "logps/chosen": -425.72076416015625, "logps/rejected": -207.47048950195312, "loss": 0.254, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.6014151573181152, "rewards/margins": 2.455193519592285, "rewards/rejected": 1.1462217569351196, "step": 20420 }, { "epoch": 0.9485120014856772, "grad_norm": 61.155784606933594, "learning_rate": 
3.420152591423309e-07, "logits/chosen": -18.979671478271484, "logits/rejected": -19.45656394958496, "logps/chosen": -331.1222839355469, "logps/rejected": -365.0414123535156, "loss": 0.7508, "rewards/accuracies": 0.5, "rewards/chosen": 2.594555616378784, "rewards/margins": 0.28140363097190857, "rewards/rejected": 2.3131518363952637, "step": 20430 }, { "epoch": 0.9489762755931102, "grad_norm": 160.6521453857422, "learning_rate": 3.419378801244254e-07, "logits/chosen": -19.247886657714844, "logits/rejected": -18.21920394897461, "logps/chosen": -470.7340393066406, "logps/rejected": -334.3959045410156, "loss": 0.7091, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.5356686115264893, "rewards/margins": 1.1747682094573975, "rewards/rejected": 2.360900402069092, "step": 20440 }, { "epoch": 0.9494405497005431, "grad_norm": 62.78692626953125, "learning_rate": 3.4186050110651993e-07, "logits/chosen": -18.99919319152832, "logits/rejected": -17.991863250732422, "logps/chosen": -329.100830078125, "logps/rejected": -263.54644775390625, "loss": 0.8318, "rewards/accuracies": 0.5, "rewards/chosen": 2.1439454555511475, "rewards/margins": 0.12809278070926666, "rewards/rejected": 2.015852451324463, "step": 20450 }, { "epoch": 0.9499048238079763, "grad_norm": 12.469947814941406, "learning_rate": 3.4178312208861445e-07, "logits/chosen": -18.90953826904297, "logits/rejected": -18.196170806884766, "logps/chosen": -510.75518798828125, "logps/rejected": -371.91070556640625, "loss": 0.8285, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.709557056427002, "rewards/margins": 0.6232402920722961, "rewards/rejected": 3.0863163471221924, "step": 20460 }, { "epoch": 0.9503690979154092, "grad_norm": 103.24071502685547, "learning_rate": 3.417057430707089e-07, "logits/chosen": -19.73040771484375, "logits/rejected": -18.47123146057129, "logps/chosen": -425.33062744140625, "logps/rejected": -338.7856750488281, "loss": 0.5268, "rewards/accuracies": 0.6000000238418579, 
"rewards/chosen": 3.9948081970214844, "rewards/margins": 1.1198399066925049, "rewards/rejected": 2.8749680519104004, "step": 20470 }, { "epoch": 0.9508333720228422, "grad_norm": 5.832977771759033, "learning_rate": 3.416283640528034e-07, "logits/chosen": -18.932931900024414, "logits/rejected": -17.9395694732666, "logps/chosen": -343.4532775878906, "logps/rejected": -254.86032104492188, "loss": 0.661, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.8189122676849365, "rewards/margins": 0.4173017144203186, "rewards/rejected": 2.401610851287842, "step": 20480 }, { "epoch": 0.9512976461302753, "grad_norm": 301.3721923828125, "learning_rate": 3.4155098503489793e-07, "logits/chosen": -19.43997573852539, "logits/rejected": -18.525203704833984, "logps/chosen": -457.7154846191406, "logps/rejected": -374.12213134765625, "loss": 0.8464, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9927191734313965, "rewards/margins": 0.4523330330848694, "rewards/rejected": 2.5403859615325928, "step": 20490 }, { "epoch": 0.9517619202377083, "grad_norm": 21.092012405395508, "learning_rate": 3.4147360601699244e-07, "logits/chosen": -19.44060516357422, "logits/rejected": -17.757226943969727, "logps/chosen": -366.5814514160156, "logps/rejected": -218.4451141357422, "loss": 0.7978, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.8269355297088623, "rewards/margins": 1.6781381368637085, "rewards/rejected": 1.1487975120544434, "step": 20500 }, { "epoch": 0.9522261943451413, "grad_norm": 51.56580352783203, "learning_rate": 3.4139622699908695e-07, "logits/chosen": -19.291837692260742, "logits/rejected": -18.31856918334961, "logps/chosen": -334.4631042480469, "logps/rejected": -231.39065551757812, "loss": 0.4121, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.8417162895202637, "rewards/margins": 1.3765443563461304, "rewards/rejected": 1.4651719331741333, "step": 20510 }, { "epoch": 0.9526904684525744, "grad_norm": 2.4015145301818848, "learning_rate": 
3.4131884798118146e-07, "logits/chosen": -19.34065055847168, "logits/rejected": -19.2460880279541, "logps/chosen": -416.64385986328125, "logps/rejected": -361.6097412109375, "loss": 0.6957, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.449128150939941, "rewards/margins": 0.9029960632324219, "rewards/rejected": 3.5461318492889404, "step": 20520 }, { "epoch": 0.9531547425600074, "grad_norm": 115.40042877197266, "learning_rate": 3.4124146896327587e-07, "logits/chosen": -19.326990127563477, "logits/rejected": -18.438583374023438, "logps/chosen": -307.486083984375, "logps/rejected": -293.35906982421875, "loss": 0.6797, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.2608482837677, "rewards/margins": 0.3594137132167816, "rewards/rejected": 1.9014346599578857, "step": 20530 }, { "epoch": 0.9536190166674404, "grad_norm": 319.796630859375, "learning_rate": 3.411640899453704e-07, "logits/chosen": -19.179758071899414, "logits/rejected": -18.330259323120117, "logps/chosen": -395.2061767578125, "logps/rejected": -357.9193115234375, "loss": 0.6537, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.581272840499878, "rewards/margins": 0.9438996315002441, "rewards/rejected": 2.637373685836792, "step": 20540 }, { "epoch": 0.9540832907748735, "grad_norm": 24.000560760498047, "learning_rate": 3.410867109274649e-07, "logits/chosen": -18.915090560913086, "logits/rejected": -18.383068084716797, "logps/chosen": -433.76214599609375, "logps/rejected": -424.05780029296875, "loss": 0.8622, "rewards/accuracies": 0.5, "rewards/chosen": 2.9812893867492676, "rewards/margins": 0.24433882534503937, "rewards/rejected": 2.736950397491455, "step": 20550 }, { "epoch": 0.9545475648823065, "grad_norm": 1.0031771659851074, "learning_rate": 3.410093319095594e-07, "logits/chosen": -18.382869720458984, "logits/rejected": -16.87216567993164, "logps/chosen": -319.3770751953125, "logps/rejected": -251.31472778320312, "loss": 0.4262, "rewards/accuracies": 
0.800000011920929, "rewards/chosen": 3.4040958881378174, "rewards/margins": 2.1027817726135254, "rewards/rejected": 1.301314115524292, "step": 20560 }, { "epoch": 0.9550118389897395, "grad_norm": 262.1253967285156, "learning_rate": 3.4093195289165386e-07, "logits/chosen": -18.733707427978516, "logits/rejected": -18.890445709228516, "logps/chosen": -401.3299865722656, "logps/rejected": -441.2523498535156, "loss": 1.1218, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.5403072834014893, "rewards/margins": 0.1610749065876007, "rewards/rejected": 3.379232406616211, "step": 20570 }, { "epoch": 0.9554761130971726, "grad_norm": 174.191162109375, "learning_rate": 3.4085457387374837e-07, "logits/chosen": -19.012638092041016, "logits/rejected": -18.73799705505371, "logps/chosen": -385.1244812011719, "logps/rejected": -343.636474609375, "loss": 0.7593, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.4026503562927246, "rewards/margins": 0.3878961205482483, "rewards/rejected": 3.0147547721862793, "step": 20580 }, { "epoch": 0.9559403872046056, "grad_norm": 10.09605598449707, "learning_rate": 3.407771948558429e-07, "logits/chosen": -19.13067054748535, "logits/rejected": -18.679840087890625, "logps/chosen": -357.3439636230469, "logps/rejected": -301.9216003417969, "loss": 0.9723, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.243189573287964, "rewards/margins": -0.08594582229852676, "rewards/rejected": 2.3291354179382324, "step": 20590 }, { "epoch": 0.9564046613120386, "grad_norm": 13.800424575805664, "learning_rate": 3.406998158379374e-07, "logits/chosen": -18.654550552368164, "logits/rejected": -18.493406295776367, "logps/chosen": -378.32574462890625, "logps/rejected": -338.17559814453125, "loss": 0.6536, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.733309268951416, "rewards/margins": 0.7687485218048096, "rewards/rejected": 2.9645605087280273, "step": 20600 }, { "epoch": 0.9568689354194717, "grad_norm": 
35.2247314453125, "learning_rate": 3.406224368200319e-07, "logits/chosen": -19.2249698638916, "logits/rejected": -18.737287521362305, "logps/chosen": -328.6761169433594, "logps/rejected": -349.17938232421875, "loss": 1.0627, "rewards/accuracies": 0.5, "rewards/chosen": 2.9024863243103027, "rewards/margins": 0.1963614970445633, "rewards/rejected": 2.706124782562256, "step": 20610 }, { "epoch": 0.9573332095269047, "grad_norm": 78.88438415527344, "learning_rate": 3.405450578021263e-07, "logits/chosen": -19.1180419921875, "logits/rejected": -18.548641204833984, "logps/chosen": -376.76171875, "logps/rejected": -291.3579406738281, "loss": 0.6766, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.4240050315856934, "rewards/margins": 1.0752829313278198, "rewards/rejected": 2.348721981048584, "step": 20620 }, { "epoch": 0.9577974836343377, "grad_norm": 117.39793395996094, "learning_rate": 3.404676787842208e-07, "logits/chosen": -20.207046508789062, "logits/rejected": -19.074464797973633, "logps/chosen": -366.9693908691406, "logps/rejected": -312.78997802734375, "loss": 0.7615, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.647578477859497, "rewards/margins": 0.5300424695014954, "rewards/rejected": 3.1175358295440674, "step": 20630 }, { "epoch": 0.9582617577417707, "grad_norm": 1.2592453956604004, "learning_rate": 3.4039029976631533e-07, "logits/chosen": -18.672237396240234, "logits/rejected": -17.50008773803711, "logps/chosen": -367.4784240722656, "logps/rejected": -215.58480834960938, "loss": 0.3674, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4735782146453857, "rewards/margins": 1.5513358116149902, "rewards/rejected": 1.9222424030303955, "step": 20640 }, { "epoch": 0.9587260318492038, "grad_norm": 58.630489349365234, "learning_rate": 3.4031292074840984e-07, "logits/chosen": -19.275638580322266, "logits/rejected": -19.258724212646484, "logps/chosen": -399.00213623046875, "logps/rejected": -278.84967041015625, "loss": 0.9506, 
"rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9942755699157715, "rewards/margins": 0.5031701326370239, "rewards/rejected": 2.491105318069458, "step": 20650 }, { "epoch": 0.9591903059566368, "grad_norm": 5.445524215698242, "learning_rate": 3.4023554173050435e-07, "logits/chosen": -18.97133445739746, "logits/rejected": -18.319438934326172, "logps/chosen": -402.98748779296875, "logps/rejected": -350.42340087890625, "loss": 0.701, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.249277114868164, "rewards/margins": 0.551745593547821, "rewards/rejected": 1.6975314617156982, "step": 20660 }, { "epoch": 0.9596545800640698, "grad_norm": 49.731201171875, "learning_rate": 3.4015816271259887e-07, "logits/chosen": -19.8234920501709, "logits/rejected": -19.007017135620117, "logps/chosen": -307.94195556640625, "logps/rejected": -262.38092041015625, "loss": 0.8516, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.577061176300049, "rewards/margins": 0.27448803186416626, "rewards/rejected": 2.3025732040405273, "step": 20670 }, { "epoch": 0.9601188541715029, "grad_norm": 134.5157928466797, "learning_rate": 3.400807836946933e-07, "logits/chosen": -18.907634735107422, "logits/rejected": -17.464282989501953, "logps/chosen": -342.201416015625, "logps/rejected": -221.56527709960938, "loss": 0.2611, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.013096332550049, "rewards/margins": 2.136131763458252, "rewards/rejected": 1.8769642114639282, "step": 20680 }, { "epoch": 0.9605831282789359, "grad_norm": 7.104738712310791, "learning_rate": 3.4000340467678784e-07, "logits/chosen": -19.30898666381836, "logits/rejected": -18.1748104095459, "logps/chosen": -387.1092224121094, "logps/rejected": -258.6249694824219, "loss": 0.4767, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.338298797607422, "rewards/margins": 0.9312540888786316, "rewards/rejected": 2.4070446491241455, "step": 20690 }, { "epoch": 0.9610474023863689, "grad_norm": 
2.7328410148620605, "learning_rate": 3.3992602565888235e-07, "logits/chosen": -18.762859344482422, "logits/rejected": -18.6436710357666, "logps/chosen": -302.77276611328125, "logps/rejected": -261.43023681640625, "loss": 0.7015, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.92356014251709, "rewards/margins": 1.087144374847412, "rewards/rejected": 1.8364158868789673, "step": 20700 }, { "epoch": 0.961511676493802, "grad_norm": 17.90363883972168, "learning_rate": 3.3984864664097686e-07, "logits/chosen": -19.797685623168945, "logits/rejected": -19.315637588500977, "logps/chosen": -365.46136474609375, "logps/rejected": -302.25701904296875, "loss": 0.4431, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0258681774139404, "rewards/margins": 0.8221960067749023, "rewards/rejected": 2.203672170639038, "step": 20710 }, { "epoch": 0.961975950601235, "grad_norm": 22.440458297729492, "learning_rate": 3.3977126762307126e-07, "logits/chosen": -19.60638427734375, "logits/rejected": -18.785720825195312, "logps/chosen": -425.66424560546875, "logps/rejected": -350.6656494140625, "loss": 0.5371, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.8783411979675293, "rewards/margins": 0.7324649095535278, "rewards/rejected": 3.145876169204712, "step": 20720 }, { "epoch": 0.962440224708668, "grad_norm": 98.30671691894531, "learning_rate": 3.396938886051658e-07, "logits/chosen": -20.211896896362305, "logits/rejected": -18.895875930786133, "logps/chosen": -461.88104248046875, "logps/rejected": -308.7881774902344, "loss": 0.5434, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.1596083641052246, "rewards/margins": 0.9399330019950867, "rewards/rejected": 2.219675302505493, "step": 20730 }, { "epoch": 0.9629044988161011, "grad_norm": 47.64250946044922, "learning_rate": 3.396165095872603e-07, "logits/chosen": -18.381160736083984, "logits/rejected": -17.991703033447266, "logps/chosen": -338.20977783203125, "logps/rejected": -253.3164520263672, 
"loss": 0.801, "rewards/accuracies": 0.5, "rewards/chosen": 2.44964599609375, "rewards/margins": 0.6418757438659668, "rewards/rejected": 1.8077701330184937, "step": 20740 }, { "epoch": 0.9633687729235341, "grad_norm": 152.64476013183594, "learning_rate": 3.395391305693548e-07, "logits/chosen": -19.201099395751953, "logits/rejected": -17.8281307220459, "logps/chosen": -446.83612060546875, "logps/rejected": -363.3946228027344, "loss": 0.6506, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.6524288654327393, "rewards/margins": 0.9520159959793091, "rewards/rejected": 2.7004127502441406, "step": 20750 }, { "epoch": 0.9638330470309671, "grad_norm": 1.476033329963684, "learning_rate": 3.394617515514493e-07, "logits/chosen": -18.687124252319336, "logits/rejected": -17.44136619567871, "logps/chosen": -384.28363037109375, "logps/rejected": -299.24468994140625, "loss": 0.8473, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.2107608318328857, "rewards/margins": 1.340057134628296, "rewards/rejected": 1.8707036972045898, "step": 20760 }, { "epoch": 0.9642973211384002, "grad_norm": 94.28759765625, "learning_rate": 3.393843725335438e-07, "logits/chosen": -18.94489860534668, "logits/rejected": -18.614818572998047, "logps/chosen": -325.23577880859375, "logps/rejected": -261.7340087890625, "loss": 0.7076, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.2551212310791016, "rewards/margins": 0.4061390459537506, "rewards/rejected": 1.8489824533462524, "step": 20770 }, { "epoch": 0.9647615952458332, "grad_norm": 266.455322265625, "learning_rate": 3.393069935156383e-07, "logits/chosen": -19.64349937438965, "logits/rejected": -19.640857696533203, "logps/chosen": -536.1370849609375, "logps/rejected": -512.56298828125, "loss": 0.6513, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.413747310638428, "rewards/margins": 0.8402048349380493, "rewards/rejected": 3.573542833328247, "step": 20780 }, { "epoch": 0.9652258693532662, "grad_norm": 
19.50670051574707, "learning_rate": 3.392296144977328e-07, "logits/chosen": -18.50929069519043, "logits/rejected": -18.49953269958496, "logps/chosen": -321.7735290527344, "logps/rejected": -311.8753662109375, "loss": 0.9036, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.8801331520080566, "rewards/margins": 0.049924325197935104, "rewards/rejected": 2.8302087783813477, "step": 20790 }, { "epoch": 0.9656901434606991, "grad_norm": 7.327343940734863, "learning_rate": 3.391522354798273e-07, "logits/chosen": -18.26043701171875, "logits/rejected": -18.662172317504883, "logps/chosen": -363.5694580078125, "logps/rejected": -388.1617126464844, "loss": 1.726, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.6613411903381348, "rewards/margins": -0.7121375203132629, "rewards/rejected": 3.373479127883911, "step": 20800 }, { "epoch": 0.9661544175681323, "grad_norm": 141.88955688476562, "learning_rate": 3.3907485646192176e-07, "logits/chosen": -19.061803817749023, "logits/rejected": -18.51512908935547, "logps/chosen": -310.49847412109375, "logps/rejected": -233.55593872070312, "loss": 0.5995, "rewards/accuracies": 0.5, "rewards/chosen": 2.5517449378967285, "rewards/margins": 0.7608767747879028, "rewards/rejected": 1.7908680438995361, "step": 20810 }, { "epoch": 0.9666186916755652, "grad_norm": 48.77698516845703, "learning_rate": 3.3899747744401627e-07, "logits/chosen": -19.459007263183594, "logits/rejected": -18.022109985351562, "logps/chosen": -357.74993896484375, "logps/rejected": -233.6908721923828, "loss": 0.5421, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.8079733848571777, "rewards/margins": 1.4259302616119385, "rewards/rejected": 2.3820433616638184, "step": 20820 }, { "epoch": 0.9670829657829982, "grad_norm": 140.1695556640625, "learning_rate": 3.3892009842611073e-07, "logits/chosen": -18.314861297607422, "logits/rejected": -17.930984497070312, "logps/chosen": -416.80810546875, "logps/rejected": -359.71923828125, "loss": 
0.4138, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.91583514213562, "rewards/margins": 1.081362009048462, "rewards/rejected": 2.8344733715057373, "step": 20830 }, { "epoch": 0.9675472398904313, "grad_norm": 141.84075927734375, "learning_rate": 3.3884271940820524e-07, "logits/chosen": -18.025196075439453, "logits/rejected": -17.90276527404785, "logps/chosen": -349.929931640625, "logps/rejected": -307.9576110839844, "loss": 1.2022, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.62469220161438, "rewards/margins": -0.0764111801981926, "rewards/rejected": 2.7011032104492188, "step": 20840 }, { "epoch": 0.9680115139978643, "grad_norm": 249.9705047607422, "learning_rate": 3.3876534039029975e-07, "logits/chosen": -18.423213958740234, "logits/rejected": -18.656185150146484, "logps/chosen": -381.0663757324219, "logps/rejected": -385.0653991699219, "loss": 0.8998, "rewards/accuracies": 0.5, "rewards/chosen": 3.1011717319488525, "rewards/margins": 0.15484164655208588, "rewards/rejected": 2.9463300704956055, "step": 20850 }, { "epoch": 0.9684757881052973, "grad_norm": 185.64791870117188, "learning_rate": 3.3868796137239426e-07, "logits/chosen": -18.070878982543945, "logits/rejected": -18.32396125793457, "logps/chosen": -310.0312805175781, "logps/rejected": -329.7924499511719, "loss": 1.4846, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": 1.7182013988494873, "rewards/margins": -0.8949396014213562, "rewards/rejected": 2.6131410598754883, "step": 20860 }, { "epoch": 0.9689400622127304, "grad_norm": 41.30828857421875, "learning_rate": 3.386105823544888e-07, "logits/chosen": -18.41057777404785, "logits/rejected": -17.753095626831055, "logps/chosen": -355.58355712890625, "logps/rejected": -336.3827209472656, "loss": 0.4969, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4249024391174316, "rewards/margins": 1.2394102811813354, "rewards/rejected": 2.1854920387268066, "step": 20870 }, { "epoch": 0.9694043363201634, "grad_norm": 
177.158203125, "learning_rate": 3.3853320333658323e-07, "logits/chosen": -18.742618560791016, "logits/rejected": -18.424318313598633, "logps/chosen": -450.7862243652344, "logps/rejected": -346.49127197265625, "loss": 0.7199, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.7525241374969482, "rewards/margins": 0.7540749311447144, "rewards/rejected": 2.9984495639801025, "step": 20880 }, { "epoch": 0.9698686104275964, "grad_norm": 12.673079490661621, "learning_rate": 3.3845582431867774e-07, "logits/chosen": -18.88037109375, "logits/rejected": -18.0875301361084, "logps/chosen": -442.90313720703125, "logps/rejected": -286.3559265136719, "loss": 0.8538, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.041060447692871, "rewards/margins": 0.8524154424667358, "rewards/rejected": 2.1886448860168457, "step": 20890 }, { "epoch": 0.9703328845350295, "grad_norm": 273.4514465332031, "learning_rate": 3.3837844530077225e-07, "logits/chosen": -18.008419036865234, "logits/rejected": -17.553316116333008, "logps/chosen": -497.65618896484375, "logps/rejected": -361.3319091796875, "loss": 0.7672, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.1875827312469482, "rewards/margins": 0.6378201842308044, "rewards/rejected": 2.5497629642486572, "step": 20900 }, { "epoch": 0.9707971586424625, "grad_norm": 76.74221801757812, "learning_rate": 3.383010662828667e-07, "logits/chosen": -18.798847198486328, "logits/rejected": -17.798965454101562, "logps/chosen": -370.1462097167969, "logps/rejected": -270.8250732421875, "loss": 0.593, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.8998193740844727, "rewards/margins": 0.8789340853691101, "rewards/rejected": 2.020885467529297, "step": 20910 }, { "epoch": 0.9712614327498955, "grad_norm": 99.46710968017578, "learning_rate": 3.382236872649612e-07, "logits/chosen": -19.349702835083008, "logits/rejected": -17.02606773376465, "logps/chosen": -449.9012756347656, "logps/rejected": -200.93099975585938, "loss": 
0.298, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.885258436203003, "rewards/margins": 2.462247371673584, "rewards/rejected": 1.4230115413665771, "step": 20920 }, { "epoch": 0.9717257068573286, "grad_norm": 50.3852653503418, "learning_rate": 3.381463082470557e-07, "logits/chosen": -18.737987518310547, "logits/rejected": -18.175264358520508, "logps/chosen": -453.68505859375, "logps/rejected": -354.9660339355469, "loss": 0.8665, "rewards/accuracies": 0.5, "rewards/chosen": 3.4899325370788574, "rewards/margins": 0.563295841217041, "rewards/rejected": 2.9266369342803955, "step": 20930 }, { "epoch": 0.9721899809647616, "grad_norm": 25.840702056884766, "learning_rate": 3.380689292291502e-07, "logits/chosen": -17.706958770751953, "logits/rejected": -16.999446868896484, "logps/chosen": -354.3627624511719, "logps/rejected": -285.5173645019531, "loss": 0.3827, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.1242175102233887, "rewards/margins": 1.7678813934326172, "rewards/rejected": 1.3563363552093506, "step": 20940 }, { "epoch": 0.9726542550721946, "grad_norm": 135.3413543701172, "learning_rate": 3.379915502112447e-07, "logits/chosen": -18.820964813232422, "logits/rejected": -18.345781326293945, "logps/chosen": -395.17291259765625, "logps/rejected": -302.63995361328125, "loss": 1.0448, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2496070861816406, "rewards/margins": 0.08362376689910889, "rewards/rejected": 3.1659836769104004, "step": 20950 }, { "epoch": 0.9731185291796276, "grad_norm": 77.98939514160156, "learning_rate": 3.379141711933392e-07, "logits/chosen": -18.077871322631836, "logits/rejected": -17.476192474365234, "logps/chosen": -412.15087890625, "logps/rejected": -353.52154541015625, "loss": 0.6862, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.075331211090088, "rewards/margins": 0.9281843304634094, "rewards/rejected": 2.1471469402313232, "step": 20960 }, { "epoch": 0.9735828032870607, "grad_norm": 
4.598881244659424, "learning_rate": 3.3783679217543373e-07, "logits/chosen": -18.906890869140625, "logits/rejected": -17.706274032592773, "logps/chosen": -345.5501708984375, "logps/rejected": -238.1930389404297, "loss": 0.5819, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4577224254608154, "rewards/margins": 1.6667133569717407, "rewards/rejected": 1.7910093069076538, "step": 20970 }, { "epoch": 0.9740470773944937, "grad_norm": 16.687286376953125, "learning_rate": 3.377594131575282e-07, "logits/chosen": -18.329362869262695, "logits/rejected": -18.516319274902344, "logps/chosen": -333.86846923828125, "logps/rejected": -352.9962158203125, "loss": 1.4955, "rewards/accuracies": 0.5, "rewards/chosen": 3.151766300201416, "rewards/margins": -0.20735399425029755, "rewards/rejected": 3.3591198921203613, "step": 20980 }, { "epoch": 0.9745113515019267, "grad_norm": 88.97269439697266, "learning_rate": 3.376820341396227e-07, "logits/chosen": -19.42860984802246, "logits/rejected": -18.02525520324707, "logps/chosen": -384.08013916015625, "logps/rejected": -244.48904418945312, "loss": 0.3078, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.2982680797576904, "rewards/margins": 1.6788127422332764, "rewards/rejected": 1.6194555759429932, "step": 20990 }, { "epoch": 0.9749756256093598, "grad_norm": 53.60332107543945, "learning_rate": 3.3760465512171716e-07, "logits/chosen": -19.73026466369629, "logits/rejected": -17.936124801635742, "logps/chosen": -355.32379150390625, "logps/rejected": -239.73495483398438, "loss": 0.5994, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.6851086616516113, "rewards/margins": 0.5788070559501648, "rewards/rejected": 2.106301784515381, "step": 21000 }, { "epoch": 0.9754398997167928, "grad_norm": 71.59992980957031, "learning_rate": 3.3752727610381167e-07, "logits/chosen": -18.409143447875977, "logits/rejected": -17.908601760864258, "logps/chosen": -444.1651916503906, "logps/rejected": -366.3702697753906, "loss": 
0.672, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.4389238357543945, "rewards/margins": 0.7355309724807739, "rewards/rejected": 3.703392744064331, "step": 21010 }, { "epoch": 0.9759041738242258, "grad_norm": 101.47463989257812, "learning_rate": 3.374498970859062e-07, "logits/chosen": -18.82383155822754, "logits/rejected": -18.233139038085938, "logps/chosen": -373.5399475097656, "logps/rejected": -328.14776611328125, "loss": 0.6437, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.8413290977478027, "rewards/margins": 0.8379826545715332, "rewards/rejected": 3.0033466815948486, "step": 21020 }, { "epoch": 0.9763684479316589, "grad_norm": 79.75505828857422, "learning_rate": 3.3737251806800064e-07, "logits/chosen": -19.375322341918945, "logits/rejected": -17.542734146118164, "logps/chosen": -485.9148864746094, "logps/rejected": -328.2112731933594, "loss": 0.4848, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.154014587402344, "rewards/margins": 1.6344292163848877, "rewards/rejected": 2.519585132598877, "step": 21030 }, { "epoch": 0.9768327220390919, "grad_norm": 60.33013916015625, "learning_rate": 3.3729513905009515e-07, "logits/chosen": -18.555320739746094, "logits/rejected": -18.030277252197266, "logps/chosen": -285.8518371582031, "logps/rejected": -250.3529052734375, "loss": 0.6606, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.1496262550354004, "rewards/margins": 0.6592919230461121, "rewards/rejected": 1.490334391593933, "step": 21040 }, { "epoch": 0.9772969961465249, "grad_norm": 23.748918533325195, "learning_rate": 3.3721776003218966e-07, "logits/chosen": -18.648487091064453, "logits/rejected": -18.15911293029785, "logps/chosen": -412.3568420410156, "logps/rejected": -317.92242431640625, "loss": 0.4075, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.300920009613037, "rewards/margins": 1.054163932800293, "rewards/rejected": 2.246756076812744, "step": 21050 }, { "epoch": 0.977761270253958, 
"grad_norm": 19.24966049194336, "learning_rate": 3.3714038101428417e-07, "logits/chosen": -18.47842025756836, "logits/rejected": -18.060131072998047, "logps/chosen": -447.44989013671875, "logps/rejected": -397.67425537109375, "loss": 0.773, "rewards/accuracies": 0.5, "rewards/chosen": 3.437504291534424, "rewards/margins": 0.3881945312023163, "rewards/rejected": 3.0493099689483643, "step": 21060 }, { "epoch": 0.978225544361391, "grad_norm": 162.17042541503906, "learning_rate": 3.370630019963787e-07, "logits/chosen": -19.81694221496582, "logits/rejected": -18.805017471313477, "logps/chosen": -473.79833984375, "logps/rejected": -386.1839904785156, "loss": 0.596, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.459586143493652, "rewards/margins": 1.4123433828353882, "rewards/rejected": 3.0472424030303955, "step": 21070 }, { "epoch": 0.978689818468824, "grad_norm": 283.93267822265625, "learning_rate": 3.3698562297847314e-07, "logits/chosen": -17.992664337158203, "logits/rejected": -17.005481719970703, "logps/chosen": -403.2373046875, "logps/rejected": -348.17327880859375, "loss": 0.7939, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.314323902130127, "rewards/margins": 1.0555120706558228, "rewards/rejected": 2.258812427520752, "step": 21080 }, { "epoch": 0.9791540925762571, "grad_norm": 135.37493896484375, "learning_rate": 3.3690824396056765e-07, "logits/chosen": -18.621469497680664, "logits/rejected": -17.846403121948242, "logps/chosen": -371.2274475097656, "logps/rejected": -274.4373474121094, "loss": 0.6624, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.6314234733581543, "rewards/margins": 0.5149673223495483, "rewards/rejected": 2.1164565086364746, "step": 21090 }, { "epoch": 0.9796183666836901, "grad_norm": 32.8687629699707, "learning_rate": 3.368308649426621e-07, "logits/chosen": -18.428287506103516, "logits/rejected": -17.722545623779297, "logps/chosen": -411.03643798828125, "logps/rejected": -325.3716125488281, "loss": 
0.7209, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0974581241607666, "rewards/margins": 0.3713544011116028, "rewards/rejected": 2.7261035442352295, "step": 21100 }, { "epoch": 0.9800826407911231, "grad_norm": 24.04166030883789, "learning_rate": 3.367534859247566e-07, "logits/chosen": -19.29144859313965, "logits/rejected": -18.440279006958008, "logps/chosen": -424.69793701171875, "logps/rejected": -287.7054748535156, "loss": 0.443, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.3703856468200684, "rewards/margins": 1.381523847579956, "rewards/rejected": 1.9888620376586914, "step": 21110 }, { "epoch": 0.980546914898556, "grad_norm": 81.39854431152344, "learning_rate": 3.3667610690685113e-07, "logits/chosen": -20.299814224243164, "logits/rejected": -19.615163803100586, "logps/chosen": -439.8099670410156, "logps/rejected": -384.18011474609375, "loss": 0.5282, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.675879716873169, "rewards/margins": 0.5663537979125977, "rewards/rejected": 3.109525680541992, "step": 21120 }, { "epoch": 0.9810111890059892, "grad_norm": 20.028371810913086, "learning_rate": 3.365987278889456e-07, "logits/chosen": -19.249277114868164, "logits/rejected": -18.81576919555664, "logps/chosen": -562.1212768554688, "logps/rejected": -463.4996643066406, "loss": 0.4266, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.064606666564941, "rewards/margins": 1.117360234260559, "rewards/rejected": 3.94724702835083, "step": 21130 }, { "epoch": 0.9814754631134222, "grad_norm": 31.802165985107422, "learning_rate": 3.365213488710401e-07, "logits/chosen": -18.198732376098633, "logits/rejected": -17.311912536621094, "logps/chosen": -405.9131774902344, "logps/rejected": -302.01739501953125, "loss": 0.256, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.5956568717956543, "rewards/margins": 2.0073797702789307, "rewards/rejected": 1.5882774591445923, "step": 21140 }, { "epoch": 0.9819397372208551, 
"grad_norm": 248.25975036621094, "learning_rate": 3.364439698531346e-07, "logits/chosen": -19.92742919921875, "logits/rejected": -19.058305740356445, "logps/chosen": -505.8245544433594, "logps/rejected": -382.5061950683594, "loss": 0.5881, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.127560615539551, "rewards/margins": 1.4178216457366943, "rewards/rejected": 2.7097387313842773, "step": 21150 }, { "epoch": 0.9824040113282883, "grad_norm": 80.38809204101562, "learning_rate": 3.363665908352291e-07, "logits/chosen": -19.983327865600586, "logits/rejected": -18.920530319213867, "logps/chosen": -489.7645568847656, "logps/rejected": -365.57159423828125, "loss": 0.6875, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.220572471618652, "rewards/margins": 0.9016290903091431, "rewards/rejected": 3.3189430236816406, "step": 21160 }, { "epoch": 0.9828682854357212, "grad_norm": 58.23789596557617, "learning_rate": 3.3628921181732364e-07, "logits/chosen": -18.681047439575195, "logits/rejected": -18.377269744873047, "logps/chosen": -382.3300476074219, "logps/rejected": -355.4211730957031, "loss": 0.6665, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.8807342052459717, "rewards/margins": 0.40220898389816284, "rewards/rejected": 2.478525400161743, "step": 21170 }, { "epoch": 0.9833325595431542, "grad_norm": 95.67570495605469, "learning_rate": 3.362118327994181e-07, "logits/chosen": -19.360605239868164, "logits/rejected": -18.55411720275879, "logps/chosen": -509.08154296875, "logps/rejected": -420.17236328125, "loss": 0.6641, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.8152480125427246, "rewards/margins": 0.20138950645923615, "rewards/rejected": 3.6138579845428467, "step": 21180 }, { "epoch": 0.9837968336505873, "grad_norm": 4.9845499992370605, "learning_rate": 3.361344537815126e-07, "logits/chosen": -17.93228530883789, "logits/rejected": -17.88136100769043, "logps/chosen": -313.0043640136719, "logps/rejected": -237.2939453125, 
"loss": 0.8918, "rewards/accuracies": 0.5, "rewards/chosen": 2.246007204055786, "rewards/margins": 0.5190675258636475, "rewards/rejected": 1.7269395589828491, "step": 21190 }, { "epoch": 0.9842611077580203, "grad_norm": 206.3105010986328, "learning_rate": 3.3605707476360706e-07, "logits/chosen": -19.840364456176758, "logits/rejected": -19.09029197692871, "logps/chosen": -592.5361328125, "logps/rejected": -406.72918701171875, "loss": 0.6355, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.131509304046631, "rewards/margins": 1.361180305480957, "rewards/rejected": 2.770328998565674, "step": 21200 }, { "epoch": 0.9847253818654533, "grad_norm": 95.22007751464844, "learning_rate": 3.359796957457016e-07, "logits/chosen": -17.998580932617188, "logits/rejected": -17.195112228393555, "logps/chosen": -428.0147399902344, "logps/rejected": -280.4771728515625, "loss": 0.595, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.3442349433898926, "rewards/margins": 0.7588092088699341, "rewards/rejected": 1.5854257345199585, "step": 21210 }, { "epoch": 0.9851896559728864, "grad_norm": 204.34132385253906, "learning_rate": 3.359023167277961e-07, "logits/chosen": -18.66815757751465, "logits/rejected": -17.713470458984375, "logps/chosen": -391.2352600097656, "logps/rejected": -269.0963439941406, "loss": 0.6892, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.036836624145508, "rewards/margins": 0.7434443235397339, "rewards/rejected": 2.2933926582336426, "step": 21220 }, { "epoch": 0.9856539300803194, "grad_norm": 5.314167499542236, "learning_rate": 3.3582493770989055e-07, "logits/chosen": -18.151493072509766, "logits/rejected": -17.851947784423828, "logps/chosen": -473.56768798828125, "logps/rejected": -423.1129455566406, "loss": 0.7919, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.8991446495056152, "rewards/margins": 0.3407931923866272, "rewards/rejected": 3.558351516723633, "step": 21230 }, { "epoch": 0.9861182041877524, 
"grad_norm": 16.79804229736328, "learning_rate": 3.3574755869198506e-07, "logits/chosen": -18.572856903076172, "logits/rejected": -17.903467178344727, "logps/chosen": -462.9788513183594, "logps/rejected": -357.6780700683594, "loss": 1.0295, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.970313310623169, "rewards/margins": 0.8417565226554871, "rewards/rejected": 3.128556489944458, "step": 21240 }, { "epoch": 0.9865824782951855, "grad_norm": 13.791409492492676, "learning_rate": 3.3567017967407957e-07, "logits/chosen": -19.673831939697266, "logits/rejected": -17.86898422241211, "logps/chosen": -419.2713928222656, "logps/rejected": -307.22637939453125, "loss": 0.2805, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.040161609649658, "rewards/margins": 1.88595712184906, "rewards/rejected": 2.1542046070098877, "step": 21250 }, { "epoch": 0.9870467524026185, "grad_norm": 112.56810760498047, "learning_rate": 3.355928006561741e-07, "logits/chosen": -18.161100387573242, "logits/rejected": -17.98131561279297, "logps/chosen": -342.7637023925781, "logps/rejected": -339.899169921875, "loss": 1.3815, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.9486637115478516, "rewards/margins": -0.10559077560901642, "rewards/rejected": 3.0542542934417725, "step": 21260 }, { "epoch": 0.9875110265100515, "grad_norm": 13.758783340454102, "learning_rate": 3.355154216382686e-07, "logits/chosen": -18.655658721923828, "logits/rejected": -18.724916458129883, "logps/chosen": -339.772705078125, "logps/rejected": -408.81610107421875, "loss": 0.5048, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.596461534500122, "rewards/margins": 0.6126147508621216, "rewards/rejected": 1.98384690284729, "step": 21270 }, { "epoch": 0.9879753006174845, "grad_norm": 9.015793800354004, "learning_rate": 3.3543804262036305e-07, "logits/chosen": -19.350000381469727, "logits/rejected": -17.57843589782715, "logps/chosen": -432.11761474609375, "logps/rejected": 
-288.11163330078125, "loss": 0.2142, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.1402740478515625, "rewards/margins": 2.255350112915039, "rewards/rejected": 1.8849236965179443, "step": 21280 }, { "epoch": 0.9884395747249176, "grad_norm": 11.609445571899414, "learning_rate": 3.353606636024575e-07, "logits/chosen": -19.69862174987793, "logits/rejected": -18.278337478637695, "logps/chosen": -504.3207092285156, "logps/rejected": -365.6589660644531, "loss": 0.8109, "rewards/accuracies": 0.5, "rewards/chosen": 4.454300880432129, "rewards/margins": 0.7985454797744751, "rewards/rejected": 3.6557552814483643, "step": 21290 }, { "epoch": 0.9889038488323506, "grad_norm": 173.4634552001953, "learning_rate": 3.35283284584552e-07, "logits/chosen": -19.020626068115234, "logits/rejected": -18.45881462097168, "logps/chosen": -478.46337890625, "logps/rejected": -360.53448486328125, "loss": 0.7923, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2135684490203857, "rewards/margins": 0.24563463032245636, "rewards/rejected": 2.9679338932037354, "step": 21300 }, { "epoch": 0.9893681229397836, "grad_norm": 41.85852813720703, "learning_rate": 3.3520590556664653e-07, "logits/chosen": -19.30097770690918, "logits/rejected": -17.85254669189453, "logps/chosen": -401.2878723144531, "logps/rejected": -281.72174072265625, "loss": 0.7602, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.605043411254883, "rewards/margins": 0.9336146116256714, "rewards/rejected": 2.671428918838501, "step": 21310 }, { "epoch": 0.9898323970472167, "grad_norm": 62.58427429199219, "learning_rate": 3.3512852654874104e-07, "logits/chosen": -18.532407760620117, "logits/rejected": -17.106006622314453, "logps/chosen": -364.8433837890625, "logps/rejected": -264.8292541503906, "loss": 0.5069, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.78930401802063, "rewards/margins": 1.5973782539367676, "rewards/rejected": 2.1919257640838623, "step": 21320 }, { "epoch": 
0.9902966711546497, "grad_norm": 15.853261947631836, "learning_rate": 3.350511475308355e-07, "logits/chosen": -17.88717269897461, "logits/rejected": -16.949373245239258, "logps/chosen": -345.54046630859375, "logps/rejected": -230.07791137695312, "loss": 0.3618, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.1203713417053223, "rewards/margins": 1.5328691005706787, "rewards/rejected": 1.587502360343933, "step": 21330 }, { "epoch": 0.9907609452620827, "grad_norm": 81.5516357421875, "learning_rate": 3.3497376851293e-07, "logits/chosen": -19.323348999023438, "logits/rejected": -18.340473175048828, "logps/chosen": -459.55462646484375, "logps/rejected": -293.02117919921875, "loss": 0.4345, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.585137128829956, "rewards/margins": 1.7762126922607422, "rewards/rejected": 1.8089243173599243, "step": 21340 }, { "epoch": 0.9912252193695158, "grad_norm": 13.24804973602295, "learning_rate": 3.348963894950245e-07, "logits/chosen": -18.070371627807617, "logits/rejected": -17.137516021728516, "logps/chosen": -393.64898681640625, "logps/rejected": -213.499755859375, "loss": 0.8819, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.6767947673797607, "rewards/margins": 1.0082838535308838, "rewards/rejected": 1.6685110330581665, "step": 21350 }, { "epoch": 0.9916894934769488, "grad_norm": 105.28241729736328, "learning_rate": 3.3481901047711903e-07, "logits/chosen": -18.44601058959961, "logits/rejected": -17.43002700805664, "logps/chosen": -377.99249267578125, "logps/rejected": -289.8340759277344, "loss": 0.492, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.495004177093506, "rewards/margins": 0.9645723104476929, "rewards/rejected": 1.5304315090179443, "step": 21360 }, { "epoch": 0.9921537675843818, "grad_norm": 38.82086181640625, "learning_rate": 3.3474163145921354e-07, "logits/chosen": -19.210308074951172, "logits/rejected": -19.04503631591797, "logps/chosen": -413.072998046875, 
"logps/rejected": -362.05194091796875, "loss": 0.8327, "rewards/accuracies": 0.5, "rewards/chosen": 2.7647883892059326, "rewards/margins": 0.19734737277030945, "rewards/rejected": 2.567440986633301, "step": 21370 }, { "epoch": 0.9926180416918149, "grad_norm": 78.93623352050781, "learning_rate": 3.34664252441308e-07, "logits/chosen": -18.85857582092285, "logits/rejected": -18.471479415893555, "logps/chosen": -266.25823974609375, "logps/rejected": -253.85498046875, "loss": 0.809, "rewards/accuracies": 0.5, "rewards/chosen": 2.169705629348755, "rewards/margins": 0.6509010791778564, "rewards/rejected": 1.5188040733337402, "step": 21380 }, { "epoch": 0.9930823157992479, "grad_norm": 200.21859741210938, "learning_rate": 3.3458687342340246e-07, "logits/chosen": -19.311290740966797, "logits/rejected": -18.318973541259766, "logps/chosen": -452.9463806152344, "logps/rejected": -331.62451171875, "loss": 0.7843, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.568575382232666, "rewards/margins": 0.9239165186882019, "rewards/rejected": 2.6446590423583984, "step": 21390 }, { "epoch": 0.9935465899066809, "grad_norm": 146.86080932617188, "learning_rate": 3.3450949440549697e-07, "logits/chosen": -19.95008659362793, "logits/rejected": -18.756376266479492, "logps/chosen": -457.08154296875, "logps/rejected": -390.2080993652344, "loss": 0.587, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.653561115264893, "rewards/margins": 1.115900993347168, "rewards/rejected": 3.5376601219177246, "step": 21400 }, { "epoch": 0.994010864014114, "grad_norm": 58.664154052734375, "learning_rate": 3.344321153875915e-07, "logits/chosen": -18.176334381103516, "logits/rejected": -17.229076385498047, "logps/chosen": -315.1601257324219, "logps/rejected": -237.3365478515625, "loss": 0.4556, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.864809513092041, "rewards/margins": 1.3754756450653076, "rewards/rejected": 1.4893337488174438, "step": 21410 }, { "epoch": 
0.994475138121547, "grad_norm": 192.6312255859375, "learning_rate": 3.34354736369686e-07, "logits/chosen": -18.672672271728516, "logits/rejected": -18.794843673706055, "logps/chosen": -348.673583984375, "logps/rejected": -434.1719665527344, "loss": 1.1103, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.647526502609253, "rewards/margins": -0.07613496482372284, "rewards/rejected": 3.7236618995666504, "step": 21420 }, { "epoch": 0.99493941222898, "grad_norm": 22.270458221435547, "learning_rate": 3.3427735735178045e-07, "logits/chosen": -18.63054847717285, "logits/rejected": -18.320606231689453, "logps/chosen": -364.0243225097656, "logps/rejected": -382.6161804199219, "loss": 1.0288, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.305037021636963, "rewards/margins": 0.30539044737815857, "rewards/rejected": 2.999645948410034, "step": 21430 }, { "epoch": 0.995403686336413, "grad_norm": 98.95704650878906, "learning_rate": 3.3419997833387496e-07, "logits/chosen": -17.6990909576416, "logits/rejected": -17.45107650756836, "logps/chosen": -320.29742431640625, "logps/rejected": -266.7890319824219, "loss": 0.6354, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9237446784973145, "rewards/margins": 0.9650216102600098, "rewards/rejected": 1.9587228298187256, "step": 21440 }, { "epoch": 0.9958679604438461, "grad_norm": 11.561963081359863, "learning_rate": 3.341225993159695e-07, "logits/chosen": -19.09783363342285, "logits/rejected": -18.981159210205078, "logps/chosen": -221.8227081298828, "logps/rejected": -261.40130615234375, "loss": 0.6746, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.004211187362671, "rewards/margins": 0.1643342226743698, "rewards/rejected": 1.8398773670196533, "step": 21450 }, { "epoch": 0.9963322345512791, "grad_norm": 111.862060546875, "learning_rate": 3.34045220298064e-07, "logits/chosen": -17.909761428833008, "logits/rejected": -18.22748374938965, "logps/chosen": -265.3137512207031, 
"logps/rejected": -309.00701904296875, "loss": 0.9602, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.5260341167449951, "rewards/margins": -0.3339908719062805, "rewards/rejected": 1.8600250482559204, "step": 21460 }, { "epoch": 0.996796508658712, "grad_norm": 68.42395782470703, "learning_rate": 3.339678412801585e-07, "logits/chosen": -18.499780654907227, "logits/rejected": -18.476572036743164, "logps/chosen": -383.86376953125, "logps/rejected": -394.0766296386719, "loss": 1.0705, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.071864604949951, "rewards/margins": -0.04608628898859024, "rewards/rejected": 3.117950916290283, "step": 21470 }, { "epoch": 0.9972607827661452, "grad_norm": 6.178192615509033, "learning_rate": 3.338904622622529e-07, "logits/chosen": -17.763818740844727, "logits/rejected": -17.805233001708984, "logps/chosen": -355.85235595703125, "logps/rejected": -283.86370849609375, "loss": 0.7911, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.7436330318450928, "rewards/margins": 0.28443101048469543, "rewards/rejected": 2.4592020511627197, "step": 21480 }, { "epoch": 0.9977250568735782, "grad_norm": 198.6486053466797, "learning_rate": 3.338130832443474e-07, "logits/chosen": -19.250356674194336, "logits/rejected": -18.02335548400879, "logps/chosen": -505.0811462402344, "logps/rejected": -365.87615966796875, "loss": 0.4528, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.727525234222412, "rewards/margins": 1.412025809288025, "rewards/rejected": 2.315499782562256, "step": 21490 }, { "epoch": 0.9981893309810111, "grad_norm": 192.4958953857422, "learning_rate": 3.3373570422644193e-07, "logits/chosen": -19.036161422729492, "logits/rejected": -17.990924835205078, "logps/chosen": -606.3881225585938, "logps/rejected": -485.12908935546875, "loss": 0.8514, "rewards/accuracies": 0.5, "rewards/chosen": 4.343449592590332, "rewards/margins": 0.9164409637451172, "rewards/rejected": 3.4270076751708984, "step": 21500 
}, { "epoch": 0.9986536050884443, "grad_norm": 32.63386917114258, "learning_rate": 3.3365832520853644e-07, "logits/chosen": -19.4097957611084, "logits/rejected": -18.272939682006836, "logps/chosen": -401.68939208984375, "logps/rejected": -363.90533447265625, "loss": 0.7038, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.6714892387390137, "rewards/margins": 0.6731595396995544, "rewards/rejected": 1.998329520225525, "step": 21510 }, { "epoch": 0.9991178791958772, "grad_norm": 58.775630950927734, "learning_rate": 3.3358094619063095e-07, "logits/chosen": -17.998979568481445, "logits/rejected": -17.918306350708008, "logps/chosen": -337.07489013671875, "logps/rejected": -331.4573669433594, "loss": 0.854, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.1411356925964355, "rewards/margins": 0.007141613867133856, "rewards/rejected": 2.1339941024780273, "step": 21520 }, { "epoch": 0.9995821533033102, "grad_norm": 79.59160614013672, "learning_rate": 3.335035671727254e-07, "logits/chosen": -19.73360824584961, "logits/rejected": -18.539344787597656, "logps/chosen": -459.92535400390625, "logps/rejected": -329.47796630859375, "loss": 0.5846, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.8078904151916504, "rewards/margins": 1.1492140293121338, "rewards/rejected": 2.6586766242980957, "step": 21530 }, { "epoch": 1.0000464274107432, "grad_norm": 86.38446807861328, "learning_rate": 3.334261881548199e-07, "logits/chosen": -18.380107879638672, "logits/rejected": -17.107418060302734, "logps/chosen": -394.21624755859375, "logps/rejected": -230.438720703125, "loss": 0.5716, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5843474864959717, "rewards/margins": 1.6978145837783813, "rewards/rejected": 1.8865330219268799, "step": 21540 }, { "epoch": 1.0005107015181762, "grad_norm": 3.4686388969421387, "learning_rate": 3.3334880913691443e-07, "logits/chosen": -18.578548431396484, "logits/rejected": -17.594093322753906, "logps/chosen": 
-378.99078369140625, "logps/rejected": -288.582763671875, "loss": 0.4683, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.697803020477295, "rewards/margins": 1.7463171482086182, "rewards/rejected": 1.9514856338500977, "step": 21550 }, { "epoch": 1.0009749756256094, "grad_norm": 4.272004127502441, "learning_rate": 3.3327143011900894e-07, "logits/chosen": -19.929574966430664, "logits/rejected": -20.11396026611328, "logps/chosen": -462.3568420410156, "logps/rejected": -418.1639099121094, "loss": 0.7257, "rewards/accuracies": 0.5, "rewards/chosen": 4.49238395690918, "rewards/margins": 0.554848313331604, "rewards/rejected": 3.9375357627868652, "step": 21560 }, { "epoch": 1.0014392497330424, "grad_norm": 0.04559234157204628, "learning_rate": 3.3320178900289396e-07, "logits/chosen": -19.00809669494629, "logits/rejected": -17.575485229492188, "logps/chosen": -481.7930603027344, "logps/rejected": -356.17987060546875, "loss": 1.114, "rewards/accuracies": 0.5, "rewards/chosen": 4.53801155090332, "rewards/margins": 1.4797842502593994, "rewards/rejected": 3.0582275390625, "step": 21570 }, { "epoch": 1.0019035238404754, "grad_norm": 89.6898422241211, "learning_rate": 3.331244099849885e-07, "logits/chosen": -19.184572219848633, "logits/rejected": -18.585540771484375, "logps/chosen": -378.7278747558594, "logps/rejected": -269.93756103515625, "loss": 0.4567, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.480691909790039, "rewards/margins": 1.3518145084381104, "rewards/rejected": 2.128877639770508, "step": 21580 }, { "epoch": 1.0023677979479084, "grad_norm": 185.7530517578125, "learning_rate": 3.33047030967083e-07, "logits/chosen": -17.43044090270996, "logits/rejected": -17.85121726989746, "logps/chosen": -298.39715576171875, "logps/rejected": -351.4996337890625, "loss": 1.2596, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.6884236335754395, "rewards/margins": -0.3374231457710266, "rewards/rejected": 3.0258467197418213, "step": 21590 }, { 
"epoch": 1.0028320720553414, "grad_norm": 4.516270637512207, "learning_rate": 3.3296965194917745e-07, "logits/chosen": -18.820331573486328, "logits/rejected": -17.702924728393555, "logps/chosen": -363.8675842285156, "logps/rejected": -214.3356475830078, "loss": 0.6656, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.7586116790771484, "rewards/margins": 0.8305326700210571, "rewards/rejected": 1.9280788898468018, "step": 21600 }, { "epoch": 1.0032963461627744, "grad_norm": 2.4656105041503906, "learning_rate": 3.328922729312719e-07, "logits/chosen": -18.574142456054688, "logits/rejected": -16.95039939880371, "logps/chosen": -428.29534912109375, "logps/rejected": -240.08837890625, "loss": 0.2558, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.338027238845825, "rewards/margins": 1.990767478942871, "rewards/rejected": 1.3472596406936646, "step": 21610 }, { "epoch": 1.0037606202702076, "grad_norm": 75.29472351074219, "learning_rate": 3.328148939133664e-07, "logits/chosen": -17.357078552246094, "logits/rejected": -17.7493953704834, "logps/chosen": -294.3003845214844, "logps/rejected": -290.77838134765625, "loss": 0.8406, "rewards/accuracies": 0.5, "rewards/chosen": 1.8056926727294922, "rewards/margins": 0.007074224762618542, "rewards/rejected": 1.7986183166503906, "step": 21620 }, { "epoch": 1.0042248943776406, "grad_norm": 36.88668441772461, "learning_rate": 3.327375148954609e-07, "logits/chosen": -18.532108306884766, "logits/rejected": -18.558364868164062, "logps/chosen": -355.2406311035156, "logps/rejected": -340.09808349609375, "loss": 1.0901, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.5916848182678223, "rewards/margins": -0.06125594303011894, "rewards/rejected": 2.6529407501220703, "step": 21630 }, { "epoch": 1.0046891684850736, "grad_norm": 120.57288360595703, "learning_rate": 3.3266013587755544e-07, "logits/chosen": -19.31424903869629, "logits/rejected": -18.57926368713379, "logps/chosen": -394.7129821777344, 
"logps/rejected": -303.8997802734375, "loss": 0.8241, "rewards/accuracies": 0.5, "rewards/chosen": 3.4406566619873047, "rewards/margins": 0.7728241682052612, "rewards/rejected": 2.667832851409912, "step": 21640 }, { "epoch": 1.0051534425925066, "grad_norm": 2.545928955078125, "learning_rate": 3.3258275685964995e-07, "logits/chosen": -18.750370025634766, "logits/rejected": -18.300537109375, "logps/chosen": -325.01861572265625, "logps/rejected": -259.7380676269531, "loss": 0.7221, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1984646320343018, "rewards/margins": 1.1248087882995605, "rewards/rejected": 2.073655605316162, "step": 21650 }, { "epoch": 1.0056177166999396, "grad_norm": 100.68196105957031, "learning_rate": 3.325053778417444e-07, "logits/chosen": -18.65782928466797, "logits/rejected": -18.4256534576416, "logps/chosen": -353.91717529296875, "logps/rejected": -368.0137023925781, "loss": 1.3725, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.6939780712127686, "rewards/margins": 0.09720466285943985, "rewards/rejected": 2.596773624420166, "step": 21660 }, { "epoch": 1.0060819908073726, "grad_norm": 40.024932861328125, "learning_rate": 3.324279988238389e-07, "logits/chosen": -19.083951950073242, "logits/rejected": -18.089258193969727, "logps/chosen": -514.9102172851562, "logps/rejected": -355.59234619140625, "loss": 0.4409, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.002711296081543, "rewards/margins": 1.7465680837631226, "rewards/rejected": 3.2561423778533936, "step": 21670 }, { "epoch": 1.0065462649148058, "grad_norm": 187.7625732421875, "learning_rate": 3.3235061980593343e-07, "logits/chosen": -18.9139347076416, "logits/rejected": -17.829082489013672, "logps/chosen": -363.56622314453125, "logps/rejected": -277.15350341796875, "loss": 0.3302, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.170372009277344, "rewards/margins": 2.3959712982177734, "rewards/rejected": 1.7744009494781494, "step": 21680 }, { 
"epoch": 1.0070105390222388, "grad_norm": 50.902523040771484, "learning_rate": 3.3227324078802794e-07, "logits/chosen": -18.953123092651367, "logits/rejected": -18.89057731628418, "logps/chosen": -453.4270935058594, "logps/rejected": -427.52789306640625, "loss": 0.6862, "rewards/accuracies": 0.5, "rewards/chosen": 3.404437303543091, "rewards/margins": 0.29994475841522217, "rewards/rejected": 3.104492425918579, "step": 21690 }, { "epoch": 1.0074748131296718, "grad_norm": 62.59230422973633, "learning_rate": 3.321958617701224e-07, "logits/chosen": -19.23525619506836, "logits/rejected": -18.832714080810547, "logps/chosen": -448.06378173828125, "logps/rejected": -392.796630859375, "loss": 0.6096, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.5094845294952393, "rewards/margins": 0.9877287149429321, "rewards/rejected": 2.5217559337615967, "step": 21700 }, { "epoch": 1.0079390872371048, "grad_norm": 3.306694507598877, "learning_rate": 3.3211848275221686e-07, "logits/chosen": -19.456212997436523, "logits/rejected": -19.212141036987305, "logps/chosen": -554.6572265625, "logps/rejected": -465.0923767089844, "loss": 0.6268, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.312325477600098, "rewards/margins": 1.4535229206085205, "rewards/rejected": 3.858802318572998, "step": 21710 }, { "epoch": 1.0084033613445378, "grad_norm": 15.875338554382324, "learning_rate": 3.3204110373431137e-07, "logits/chosen": -18.48373031616211, "logits/rejected": -17.565555572509766, "logps/chosen": -327.1954650878906, "logps/rejected": -244.9009552001953, "loss": 0.8452, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.7703146934509277, "rewards/margins": 0.5348696708679199, "rewards/rejected": 2.235445022583008, "step": 21720 }, { "epoch": 1.0088676354519708, "grad_norm": 98.17103576660156, "learning_rate": 3.319637247164059e-07, "logits/chosen": -18.60634994506836, "logits/rejected": -18.221298217773438, "logps/chosen": -425.89776611328125, 
"logps/rejected": -411.95526123046875, "loss": 0.865, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.891677141189575, "rewards/margins": 0.07302705943584442, "rewards/rejected": 2.818650007247925, "step": 21730 }, { "epoch": 1.0093319095594038, "grad_norm": 64.77328491210938, "learning_rate": 3.318863456985004e-07, "logits/chosen": -18.67471694946289, "logits/rejected": -16.905479431152344, "logps/chosen": -329.32135009765625, "logps/rejected": -165.243408203125, "loss": 0.3521, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.09906268119812, "rewards/margins": 2.1579949855804443, "rewards/rejected": 0.9410678148269653, "step": 21740 }, { "epoch": 1.009796183666837, "grad_norm": 137.0302276611328, "learning_rate": 3.318089666805949e-07, "logits/chosen": -18.914749145507812, "logits/rejected": -18.13299560546875, "logps/chosen": -451.7586975097656, "logps/rejected": -436.85247802734375, "loss": 0.8225, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.9005725383758545, "rewards/margins": 0.9552074670791626, "rewards/rejected": 2.9453654289245605, "step": 21750 }, { "epoch": 1.01026045777427, "grad_norm": 180.2565155029297, "learning_rate": 3.3173158766268936e-07, "logits/chosen": -18.446531295776367, "logits/rejected": -18.279766082763672, "logps/chosen": -327.5072326660156, "logps/rejected": -345.5527038574219, "loss": 1.1095, "rewards/accuracies": 0.5, "rewards/chosen": 3.709881544113159, "rewards/margins": -0.0992836132645607, "rewards/rejected": 3.8091654777526855, "step": 21760 }, { "epoch": 1.010724731881703, "grad_norm": 39.49718475341797, "learning_rate": 3.3165420864478387e-07, "logits/chosen": -18.564828872680664, "logits/rejected": -18.373815536499023, "logps/chosen": -410.612060546875, "logps/rejected": -439.50872802734375, "loss": 0.6385, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.9508395195007324, "rewards/margins": 1.0496432781219482, "rewards/rejected": 2.901196241378784, "step": 21770 }, { 
"epoch": 1.011189005989136, "grad_norm": 99.13623809814453, "learning_rate": 3.315768296268784e-07, "logits/chosen": -19.00510597229004, "logits/rejected": -18.986982345581055, "logps/chosen": -418.57305908203125, "logps/rejected": -388.8902282714844, "loss": 0.6999, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.3422722816467285, "rewards/margins": 0.5169270634651184, "rewards/rejected": 2.825345277786255, "step": 21780 }, { "epoch": 1.011653280096569, "grad_norm": 45.84938049316406, "learning_rate": 3.3149945060897284e-07, "logits/chosen": -19.36676788330078, "logits/rejected": -17.97000503540039, "logps/chosen": -279.7881164550781, "logps/rejected": -205.1074981689453, "loss": 0.6302, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.974632501602173, "rewards/margins": 1.5131418704986572, "rewards/rejected": 1.4614903926849365, "step": 21790 }, { "epoch": 1.012117554204002, "grad_norm": 75.9041748046875, "learning_rate": 3.3142207159106735e-07, "logits/chosen": -19.326753616333008, "logits/rejected": -19.221410751342773, "logps/chosen": -347.99212646484375, "logps/rejected": -328.72076416015625, "loss": 1.0692, "rewards/accuracies": 0.5, "rewards/chosen": 3.1300222873687744, "rewards/margins": 0.24094954133033752, "rewards/rejected": 2.8890724182128906, "step": 21800 }, { "epoch": 1.0125818283114352, "grad_norm": 0.20559611916542053, "learning_rate": 3.313446925731618e-07, "logits/chosen": -19.15431785583496, "logits/rejected": -17.451183319091797, "logps/chosen": -486.0941467285156, "logps/rejected": -290.56561279296875, "loss": 0.4748, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.650571823120117, "rewards/margins": 1.3267451524734497, "rewards/rejected": 2.323827028274536, "step": 21810 }, { "epoch": 1.0130461024188682, "grad_norm": 136.7900390625, "learning_rate": 3.312673135552563e-07, "logits/chosen": -18.47209930419922, "logits/rejected": -17.720003128051758, "logps/chosen": -346.68243408203125, "logps/rejected": 
-316.17718505859375, "loss": 0.9825, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.725585699081421, "rewards/margins": 0.5970416069030762, "rewards/rejected": 2.1285440921783447, "step": 21820 }, { "epoch": 1.0135103765263012, "grad_norm": 14.831700325012207, "learning_rate": 3.3118993453735083e-07, "logits/chosen": -19.626358032226562, "logits/rejected": -19.012554168701172, "logps/chosen": -302.5556335449219, "logps/rejected": -249.886474609375, "loss": 0.3319, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.690950870513916, "rewards/margins": 2.0201001167297363, "rewards/rejected": 1.6708505153656006, "step": 21830 }, { "epoch": 1.0139746506337342, "grad_norm": 8.581067085266113, "learning_rate": 3.3111255551944535e-07, "logits/chosen": -19.318775177001953, "logits/rejected": -18.65105628967285, "logps/chosen": -398.740966796875, "logps/rejected": -263.90582275390625, "loss": 0.3335, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.6527493000030518, "rewards/margins": 1.4637219905853271, "rewards/rejected": 2.1890273094177246, "step": 21840 }, { "epoch": 1.0144389247411671, "grad_norm": 12.53396224975586, "learning_rate": 3.3103517650153986e-07, "logits/chosen": -19.79589080810547, "logits/rejected": -19.23338508605957, "logps/chosen": -442.28485107421875, "logps/rejected": -334.8807067871094, "loss": 0.2609, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.814816951751709, "rewards/margins": 1.5424506664276123, "rewards/rejected": 2.2723662853240967, "step": 21850 }, { "epoch": 1.0149031988486001, "grad_norm": 3.8518621921539307, "learning_rate": 3.309577974836343e-07, "logits/chosen": -19.585588455200195, "logits/rejected": -18.750492095947266, "logps/chosen": -366.28692626953125, "logps/rejected": -339.8381042480469, "loss": 1.0055, "rewards/accuracies": 0.5, "rewards/chosen": 3.505035400390625, "rewards/margins": 0.2871728539466858, "rewards/rejected": 3.217862367630005, "step": 21860 }, { "epoch": 
1.0153674729560331, "grad_norm": 62.081966400146484, "learning_rate": 3.3088041846572883e-07, "logits/chosen": -18.486143112182617, "logits/rejected": -18.449373245239258, "logps/chosen": -411.75604248046875, "logps/rejected": -306.30194091796875, "loss": 0.6544, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.9386680126190186, "rewards/margins": 1.0735487937927246, "rewards/rejected": 2.865119457244873, "step": 21870 }, { "epoch": 1.0158317470634664, "grad_norm": 0.5360575318336487, "learning_rate": 3.3080303944782334e-07, "logits/chosen": -18.626033782958984, "logits/rejected": -17.941650390625, "logps/chosen": -285.9955139160156, "logps/rejected": -233.34365844726562, "loss": 0.4271, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.030409574508667, "rewards/margins": 1.305891752243042, "rewards/rejected": 1.724517822265625, "step": 21880 }, { "epoch": 1.0162960211708993, "grad_norm": 58.21342849731445, "learning_rate": 3.307256604299178e-07, "logits/chosen": -19.40914535522461, "logits/rejected": -18.15114974975586, "logps/chosen": -509.27996826171875, "logps/rejected": -351.1559753417969, "loss": 0.3706, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.2454006671905518, "rewards/margins": 1.3065071105957031, "rewards/rejected": 1.9388936758041382, "step": 21890 }, { "epoch": 1.0167602952783323, "grad_norm": 35.02589797973633, "learning_rate": 3.306482814120123e-07, "logits/chosen": -19.527116775512695, "logits/rejected": -18.830825805664062, "logps/chosen": -350.53594970703125, "logps/rejected": -256.3549499511719, "loss": 0.258, "rewards/accuracies": 1.0, "rewards/chosen": 3.785900115966797, "rewards/margins": 1.8285372257232666, "rewards/rejected": 1.9573628902435303, "step": 21900 }, { "epoch": 1.0172245693857653, "grad_norm": 10.712501525878906, "learning_rate": 3.3057090239410677e-07, "logits/chosen": -19.65225601196289, "logits/rejected": -18.52839469909668, "logps/chosen": -437.0392150878906, "logps/rejected": 
-328.92303466796875, "loss": 0.7488, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.031552314758301, "rewards/margins": 1.5727570056915283, "rewards/rejected": 2.4587953090667725, "step": 21910 }, { "epoch": 1.0176888434931983, "grad_norm": 24.544023513793945, "learning_rate": 3.304935233762013e-07, "logits/chosen": -19.94501304626465, "logits/rejected": -18.609695434570312, "logps/chosen": -341.8628845214844, "logps/rejected": -262.8133239746094, "loss": 0.7621, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.101909875869751, "rewards/margins": 0.6932193636894226, "rewards/rejected": 2.4086902141571045, "step": 21920 }, { "epoch": 1.0181531176006313, "grad_norm": 29.436922073364258, "learning_rate": 3.304161443582958e-07, "logits/chosen": -20.12991714477539, "logits/rejected": -19.884326934814453, "logps/chosen": -351.62689208984375, "logps/rejected": -314.9557800292969, "loss": 0.7222, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.611565113067627, "rewards/margins": 0.7393757104873657, "rewards/rejected": 2.8721890449523926, "step": 21930 }, { "epoch": 1.0186173917080645, "grad_norm": 186.89111328125, "learning_rate": 3.303387653403903e-07, "logits/chosen": -19.871816635131836, "logits/rejected": -19.33285903930664, "logps/chosen": -434.2275390625, "logps/rejected": -398.2742919921875, "loss": 0.994, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.076853275299072, "rewards/margins": 0.46006354689598083, "rewards/rejected": 3.6167895793914795, "step": 21940 }, { "epoch": 1.0190816658154975, "grad_norm": 53.365142822265625, "learning_rate": 3.302613863224848e-07, "logits/chosen": -18.624731063842773, "logits/rejected": -18.750093460083008, "logps/chosen": -404.23638916015625, "logps/rejected": -358.11712646484375, "loss": 0.8742, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.5048298835754395, "rewards/margins": 0.7602912783622742, "rewards/rejected": 2.7445383071899414, "step": 21950 }, { 
"epoch": 1.0195459399229305, "grad_norm": 23.39455223083496, "learning_rate": 3.3018400730457927e-07, "logits/chosen": -18.235286712646484, "logits/rejected": -17.890565872192383, "logps/chosen": -401.99859619140625, "logps/rejected": -335.63885498046875, "loss": 1.2751, "rewards/accuracies": 0.5, "rewards/chosen": 4.029770851135254, "rewards/margins": 0.33275336027145386, "rewards/rejected": 3.697016954421997, "step": 21960 }, { "epoch": 1.0200102140303635, "grad_norm": 148.78407287597656, "learning_rate": 3.301066282866738e-07, "logits/chosen": -19.56055450439453, "logits/rejected": -18.187984466552734, "logps/chosen": -409.154541015625, "logps/rejected": -287.87322998046875, "loss": 0.4311, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4964499473571777, "rewards/margins": 1.5386475324630737, "rewards/rejected": 1.957802414894104, "step": 21970 }, { "epoch": 1.0204744881377965, "grad_norm": 43.67917251586914, "learning_rate": 3.3002924926876824e-07, "logits/chosen": -18.98946189880371, "logits/rejected": -18.03342056274414, "logps/chosen": -351.7813415527344, "logps/rejected": -243.76388549804688, "loss": 0.4466, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.5089454650878906, "rewards/margins": 0.9026327133178711, "rewards/rejected": 2.6063125133514404, "step": 21980 }, { "epoch": 1.0209387622452295, "grad_norm": 55.03089141845703, "learning_rate": 3.2995187025086275e-07, "logits/chosen": -19.349838256835938, "logits/rejected": -18.922697067260742, "logps/chosen": -377.3403015136719, "logps/rejected": -330.88348388671875, "loss": 0.6983, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.511117935180664, "rewards/margins": 1.1241799592971802, "rewards/rejected": 2.3869378566741943, "step": 21990 }, { "epoch": 1.0214030363526627, "grad_norm": 9.229068756103516, "learning_rate": 3.2987449123295726e-07, "logits/chosen": -18.818485260009766, "logits/rejected": -18.184595108032227, "logps/chosen": -382.80914306640625, 
"logps/rejected": -309.16802978515625, "loss": 0.4835, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0222551822662354, "rewards/margins": 0.9078361392021179, "rewards/rejected": 2.1144187450408936, "step": 22000 }, { "epoch": 1.0218673104600957, "grad_norm": 33.783538818359375, "learning_rate": 3.297971122150517e-07, "logits/chosen": -20.190303802490234, "logits/rejected": -19.893890380859375, "logps/chosen": -381.52239990234375, "logps/rejected": -325.2305908203125, "loss": 0.6317, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0617728233337402, "rewards/margins": 0.3778964877128601, "rewards/rejected": 2.6838765144348145, "step": 22010 }, { "epoch": 1.0223315845675287, "grad_norm": 36.126712799072266, "learning_rate": 3.2971973319714623e-07, "logits/chosen": -19.103910446166992, "logits/rejected": -18.279232025146484, "logps/chosen": -378.74114990234375, "logps/rejected": -283.58966064453125, "loss": 0.9969, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.4944393634796143, "rewards/margins": 0.8249086141586304, "rewards/rejected": 2.6695306301116943, "step": 22020 }, { "epoch": 1.0227958586749617, "grad_norm": 1.5776093006134033, "learning_rate": 3.2964235417924074e-07, "logits/chosen": -19.638248443603516, "logits/rejected": -18.2781982421875, "logps/chosen": -434.0313415527344, "logps/rejected": -301.0865783691406, "loss": 0.8131, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.9818477630615234, "rewards/margins": 1.3068761825561523, "rewards/rejected": 2.674971580505371, "step": 22030 }, { "epoch": 1.0232601327823947, "grad_norm": 5.587155818939209, "learning_rate": 3.2956497516133525e-07, "logits/chosen": -18.730365753173828, "logits/rejected": -19.086624145507812, "logps/chosen": -444.8143615722656, "logps/rejected": -438.8544006347656, "loss": 1.2199, "rewards/accuracies": 0.5, "rewards/chosen": 3.126497507095337, "rewards/margins": -0.25532665848731995, "rewards/rejected": 3.381824016571045, "step": 
22040 }, { "epoch": 1.0237244068898277, "grad_norm": 160.82345581054688, "learning_rate": 3.2948759614342977e-07, "logits/chosen": -18.717708587646484, "logits/rejected": -18.612018585205078, "logps/chosen": -399.1616516113281, "logps/rejected": -297.4248046875, "loss": 0.6813, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2796082496643066, "rewards/margins": 1.0651170015335083, "rewards/rejected": 2.214491128921509, "step": 22050 }, { "epoch": 1.0241886809972607, "grad_norm": 18.446260452270508, "learning_rate": 3.294102171255242e-07, "logits/chosen": -18.665443420410156, "logits/rejected": -18.453760147094727, "logps/chosen": -383.93865966796875, "logps/rejected": -309.9309997558594, "loss": 0.7275, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9864609241485596, "rewards/margins": 0.5172560214996338, "rewards/rejected": 2.469204902648926, "step": 22060 }, { "epoch": 1.024652955104694, "grad_norm": 131.34280395507812, "learning_rate": 3.2933283810761874e-07, "logits/chosen": -17.941726684570312, "logits/rejected": -17.753793716430664, "logps/chosen": -230.84378051757812, "logps/rejected": -236.3797149658203, "loss": 0.7479, "rewards/accuracies": 0.5, "rewards/chosen": 2.3216841220855713, "rewards/margins": 0.79206383228302, "rewards/rejected": 1.5296201705932617, "step": 22070 }, { "epoch": 1.025117229212127, "grad_norm": 19.277353286743164, "learning_rate": 3.292554590897132e-07, "logits/chosen": -18.57729721069336, "logits/rejected": -18.03961181640625, "logps/chosen": -421.44085693359375, "logps/rejected": -413.4917907714844, "loss": 0.8122, "rewards/accuracies": 0.5, "rewards/chosen": 3.237741470336914, "rewards/margins": 0.45324817299842834, "rewards/rejected": 2.7844932079315186, "step": 22080 }, { "epoch": 1.0255815033195599, "grad_norm": 188.8779296875, "learning_rate": 3.291780800718077e-07, "logits/chosen": -18.708255767822266, "logits/rejected": -18.464588165283203, "logps/chosen": -362.6529846191406, "logps/rejected": 
-344.757568359375, "loss": 0.7991, "rewards/accuracies": 0.5, "rewards/chosen": 2.527102470397949, "rewards/margins": 0.3147057592868805, "rewards/rejected": 2.2123966217041016, "step": 22090 }, { "epoch": 1.0260457774269929, "grad_norm": 20.198755264282227, "learning_rate": 3.291007010539022e-07, "logits/chosen": -19.557483673095703, "logits/rejected": -18.477066040039062, "logps/chosen": -448.6234436035156, "logps/rejected": -251.2769775390625, "loss": 0.3439, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.7962887287139893, "rewards/margins": 2.060330629348755, "rewards/rejected": 1.7359577417373657, "step": 22100 }, { "epoch": 1.0265100515344259, "grad_norm": 159.125244140625, "learning_rate": 3.290233220359967e-07, "logits/chosen": -19.550838470458984, "logits/rejected": -19.11939811706543, "logps/chosen": -440.08380126953125, "logps/rejected": -423.5953063964844, "loss": 0.9821, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.621608018875122, "rewards/margins": 0.34794381260871887, "rewards/rejected": 3.2736639976501465, "step": 22110 }, { "epoch": 1.0269743256418589, "grad_norm": 1.314971685409546, "learning_rate": 3.289459430180912e-07, "logits/chosen": -18.272701263427734, "logits/rejected": -17.223997116088867, "logps/chosen": -361.47686767578125, "logps/rejected": -272.69256591796875, "loss": 0.8111, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.502997636795044, "rewards/margins": 1.6931178569793701, "rewards/rejected": 1.8098796606063843, "step": 22120 }, { "epoch": 1.027438599749292, "grad_norm": 235.943603515625, "learning_rate": 3.288685640001857e-07, "logits/chosen": -19.291446685791016, "logits/rejected": -19.334732055664062, "logps/chosen": -504.600830078125, "logps/rejected": -494.3277282714844, "loss": 0.5063, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.483652114868164, "rewards/margins": 0.8702041506767273, "rewards/rejected": 3.613447904586792, "step": 22130 }, { "epoch": 
1.027902873856725, "grad_norm": 7.952581405639648, "learning_rate": 3.287911849822802e-07, "logits/chosen": -18.95067024230957, "logits/rejected": -17.864978790283203, "logps/chosen": -426.1168518066406, "logps/rejected": -282.317138671875, "loss": 0.3388, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.007374286651611, "rewards/margins": 1.8860063552856445, "rewards/rejected": 2.121367931365967, "step": 22140 }, { "epoch": 1.028367147964158, "grad_norm": 24.102678298950195, "learning_rate": 3.287138059643747e-07, "logits/chosen": -19.395854949951172, "logits/rejected": -18.426593780517578, "logps/chosen": -380.55096435546875, "logps/rejected": -343.01348876953125, "loss": 1.0228, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.9957799911499023, "rewards/margins": -0.08050241321325302, "rewards/rejected": 3.076282262802124, "step": 22150 }, { "epoch": 1.028831422071591, "grad_norm": 50.26515197753906, "learning_rate": 3.286364269464692e-07, "logits/chosen": -20.702404022216797, "logits/rejected": -18.842519760131836, "logps/chosen": -376.67120361328125, "logps/rejected": -221.8709716796875, "loss": 0.5451, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.4632182121276855, "rewards/margins": 0.8269592523574829, "rewards/rejected": 1.636258840560913, "step": 22160 }, { "epoch": 1.029295696179024, "grad_norm": 23.876827239990234, "learning_rate": 3.2855904792856364e-07, "logits/chosen": -19.07632827758789, "logits/rejected": -17.301637649536133, "logps/chosen": -366.53887939453125, "logps/rejected": -208.68997192382812, "loss": 0.2563, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.071239948272705, "rewards/margins": 2.0836992263793945, "rewards/rejected": 0.9875405430793762, "step": 22170 }, { "epoch": 1.029759970286457, "grad_norm": 1.8346668481826782, "learning_rate": 3.2848166891065815e-07, "logits/chosen": -18.70145034790039, "logits/rejected": -18.607582092285156, "logps/chosen": -336.93157958984375, 
"logps/rejected": -368.651611328125, "loss": 0.7846, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.301300525665283, "rewards/margins": 0.34356197714805603, "rewards/rejected": 2.9577383995056152, "step": 22180 }, { "epoch": 1.03022424439389, "grad_norm": 205.69857788085938, "learning_rate": 3.2840428989275266e-07, "logits/chosen": -18.816375732421875, "logits/rejected": -18.2465877532959, "logps/chosen": -481.5768127441406, "logps/rejected": -407.56878662109375, "loss": 0.6151, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.540139675140381, "rewards/margins": 0.8655936121940613, "rewards/rejected": 2.674546241760254, "step": 22190 }, { "epoch": 1.0306885185013233, "grad_norm": 40.621185302734375, "learning_rate": 3.2832691087484717e-07, "logits/chosen": -18.794103622436523, "logits/rejected": -18.269329071044922, "logps/chosen": -316.79693603515625, "logps/rejected": -275.70233154296875, "loss": 0.6591, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.012899875640869, "rewards/margins": 1.0750404596328735, "rewards/rejected": 1.9378595352172852, "step": 22200 }, { "epoch": 1.0311527926087563, "grad_norm": 41.60443878173828, "learning_rate": 3.2824953185694163e-07, "logits/chosen": -19.21452522277832, "logits/rejected": -18.75090789794922, "logps/chosen": -366.54022216796875, "logps/rejected": -326.9327697753906, "loss": 0.6903, "rewards/accuracies": 0.5, "rewards/chosen": 2.7170250415802, "rewards/margins": 0.3423711955547333, "rewards/rejected": 2.3746535778045654, "step": 22210 }, { "epoch": 1.0316170667161892, "grad_norm": 78.70639038085938, "learning_rate": 3.2817215283903614e-07, "logits/chosen": -19.29010581970215, "logits/rejected": -19.35097885131836, "logps/chosen": -425.8219299316406, "logps/rejected": -411.34783935546875, "loss": 0.6783, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.937004804611206, "rewards/margins": 0.7509945034980774, "rewards/rejected": 3.1860103607177734, "step": 22220 }, { 
"epoch": 1.0320813408236222, "grad_norm": 74.50452423095703, "learning_rate": 3.2809477382113065e-07, "logits/chosen": -18.487266540527344, "logits/rejected": -18.206634521484375, "logps/chosen": -361.4700927734375, "logps/rejected": -306.47467041015625, "loss": 0.6047, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4603073596954346, "rewards/margins": 0.9720569849014282, "rewards/rejected": 2.488250494003296, "step": 22230 }, { "epoch": 1.0325456149310552, "grad_norm": 141.6244354248047, "learning_rate": 3.2801739480322516e-07, "logits/chosen": -18.745580673217773, "logits/rejected": -17.848331451416016, "logps/chosen": -368.0647888183594, "logps/rejected": -290.0821228027344, "loss": 0.5635, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.926849126815796, "rewards/margins": 1.213675618171692, "rewards/rejected": 1.7131736278533936, "step": 22240 }, { "epoch": 1.0330098890384882, "grad_norm": 200.35948181152344, "learning_rate": 3.279400157853197e-07, "logits/chosen": -19.111948013305664, "logits/rejected": -19.143217086791992, "logps/chosen": -326.37457275390625, "logps/rejected": -306.04156494140625, "loss": 0.9519, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 3.272193193435669, "rewards/margins": 0.2886424958705902, "rewards/rejected": 2.983550548553467, "step": 22250 }, { "epoch": 1.0334741631459214, "grad_norm": 203.7347412109375, "learning_rate": 3.2786263676741413e-07, "logits/chosen": -18.363452911376953, "logits/rejected": -17.03982925415039, "logps/chosen": -439.12298583984375, "logps/rejected": -292.2825622558594, "loss": 0.5973, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.802116870880127, "rewards/margins": 0.9379162788391113, "rewards/rejected": 1.8642005920410156, "step": 22260 }, { "epoch": 1.0339384372533544, "grad_norm": 30.190908432006836, "learning_rate": 3.277852577495086e-07, "logits/chosen": -19.23528480529785, "logits/rejected": -17.762161254882812, "logps/chosen": 
-387.7503662109375, "logps/rejected": -299.44580078125, "loss": 0.8048, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.5963501930236816, "rewards/margins": 1.2681020498275757, "rewards/rejected": 2.3282482624053955, "step": 22270 }, { "epoch": 1.0344027113607874, "grad_norm": 75.49212646484375, "learning_rate": 3.277078787316031e-07, "logits/chosen": -18.916454315185547, "logits/rejected": -18.9785213470459, "logps/chosen": -348.13323974609375, "logps/rejected": -345.73675537109375, "loss": 0.8613, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.8218538761138916, "rewards/margins": 0.6340254545211792, "rewards/rejected": 2.187828540802002, "step": 22280 }, { "epoch": 1.0348669854682204, "grad_norm": 161.6466064453125, "learning_rate": 3.276304997136976e-07, "logits/chosen": -19.675457000732422, "logits/rejected": -19.347660064697266, "logps/chosen": -440.1204528808594, "logps/rejected": -413.468017578125, "loss": 0.6302, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.396242618560791, "rewards/margins": 0.3148769736289978, "rewards/rejected": 3.0813655853271484, "step": 22290 }, { "epoch": 1.0353312595756534, "grad_norm": 79.42683410644531, "learning_rate": 3.275531206957921e-07, "logits/chosen": -18.91840934753418, "logits/rejected": -18.259305953979492, "logps/chosen": -379.5601806640625, "logps/rejected": -349.86041259765625, "loss": 0.4756, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5104153156280518, "rewards/margins": 1.0455849170684814, "rewards/rejected": 2.464829921722412, "step": 22300 }, { "epoch": 1.0357955336830864, "grad_norm": 210.99803161621094, "learning_rate": 3.274757416778866e-07, "logits/chosen": -18.3162899017334, "logits/rejected": -17.75984764099121, "logps/chosen": -438.63507080078125, "logps/rejected": -294.6918640136719, "loss": 0.7082, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1538538932800293, "rewards/margins": 0.7562471628189087, "rewards/rejected": 
2.39760684967041, "step": 22310 }, { "epoch": 1.0362598077905196, "grad_norm": 250.64419555664062, "learning_rate": 3.273983626599811e-07, "logits/chosen": -17.949525833129883, "logits/rejected": -17.449472427368164, "logps/chosen": -371.469970703125, "logps/rejected": -296.29388427734375, "loss": 0.6341, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.678065538406372, "rewards/margins": 0.6435738801956177, "rewards/rejected": 2.034491539001465, "step": 22320 }, { "epoch": 1.0367240818979526, "grad_norm": 0.6423009037971497, "learning_rate": 3.273209836420756e-07, "logits/chosen": -18.246150970458984, "logits/rejected": -17.42599105834961, "logps/chosen": -382.07415771484375, "logps/rejected": -313.79034423828125, "loss": 0.6927, "rewards/accuracies": 0.5, "rewards/chosen": 3.19874906539917, "rewards/margins": 1.3114509582519531, "rewards/rejected": 1.8872982263565063, "step": 22330 }, { "epoch": 1.0371883560053856, "grad_norm": 25.21233558654785, "learning_rate": 3.272436046241701e-07, "logits/chosen": -18.328277587890625, "logits/rejected": -17.250944137573242, "logps/chosen": -348.93994140625, "logps/rejected": -248.9931182861328, "loss": 0.5046, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.0778071880340576, "rewards/margins": 1.656795859336853, "rewards/rejected": 1.4210113286972046, "step": 22340 }, { "epoch": 1.0376526301128186, "grad_norm": 10.035090446472168, "learning_rate": 3.2716622560626463e-07, "logits/chosen": -19.117481231689453, "logits/rejected": -18.00261116027832, "logps/chosen": -534.4830322265625, "logps/rejected": -419.56201171875, "loss": 0.3533, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.303807258605957, "rewards/margins": 1.7102744579315186, "rewards/rejected": 3.5935325622558594, "step": 22350 }, { "epoch": 1.0381169042202516, "grad_norm": 83.2650375366211, "learning_rate": 3.2708884658835914e-07, "logits/chosen": -19.73678970336914, "logits/rejected": -19.18157386779785, "logps/chosen": 
-444.6949768066406, "logps/rejected": -362.8485412597656, "loss": 0.3503, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.579193115234375, "rewards/margins": 2.405775547027588, "rewards/rejected": 2.173417568206787, "step": 22360 }, { "epoch": 1.0385811783276846, "grad_norm": 165.26051330566406, "learning_rate": 3.2701146757045354e-07, "logits/chosen": -18.35373306274414, "logits/rejected": -18.459423065185547, "logps/chosen": -345.62005615234375, "logps/rejected": -373.6021728515625, "loss": 0.8423, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.349255323410034, "rewards/margins": 0.13591642677783966, "rewards/rejected": 3.213338851928711, "step": 22370 }, { "epoch": 1.0390454524351176, "grad_norm": 1.730043649673462, "learning_rate": 3.2693408855254806e-07, "logits/chosen": -18.370521545410156, "logits/rejected": -17.507837295532227, "logps/chosen": -349.23785400390625, "logps/rejected": -254.83139038085938, "loss": 0.3346, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.118504047393799, "rewards/margins": 1.639005422592163, "rewards/rejected": 1.4794983863830566, "step": 22380 }, { "epoch": 1.0395097265425508, "grad_norm": 22.705114364624023, "learning_rate": 3.2685670953464257e-07, "logits/chosen": -18.733306884765625, "logits/rejected": -17.095569610595703, "logps/chosen": -361.0763244628906, "logps/rejected": -193.9543914794922, "loss": 0.3905, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.3265812397003174, "rewards/margins": 2.1800715923309326, "rewards/rejected": 1.1465094089508057, "step": 22390 }, { "epoch": 1.0399740006499838, "grad_norm": 277.40814208984375, "learning_rate": 3.267793305167371e-07, "logits/chosen": -18.482629776000977, "logits/rejected": -17.65505599975586, "logps/chosen": -376.0754699707031, "logps/rejected": -375.80224609375, "loss": 1.1115, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.3269717693328857, "rewards/margins": 0.4334356188774109, "rewards/rejected": 
2.89353609085083, "step": 22400 }, { "epoch": 1.0404382747574168, "grad_norm": 3.4652628898620605, "learning_rate": 3.2670195149883154e-07, "logits/chosen": -19.730493545532227, "logits/rejected": -18.438188552856445, "logps/chosen": -460.1199645996094, "logps/rejected": -327.3616027832031, "loss": 0.4198, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.1147661209106445, "rewards/margins": 1.1051127910614014, "rewards/rejected": 3.009653091430664, "step": 22410 }, { "epoch": 1.0409025488648498, "grad_norm": 18.434940338134766, "learning_rate": 3.2662457248092605e-07, "logits/chosen": -19.427356719970703, "logits/rejected": -18.93283462524414, "logps/chosen": -323.9087219238281, "logps/rejected": -304.54150390625, "loss": 0.6279, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.213068723678589, "rewards/margins": 0.8603441119194031, "rewards/rejected": 2.352724313735962, "step": 22420 }, { "epoch": 1.0413668229722828, "grad_norm": 56.17611312866211, "learning_rate": 3.2654719346302056e-07, "logits/chosen": -18.99549674987793, "logits/rejected": -17.86945915222168, "logps/chosen": -433.21514892578125, "logps/rejected": -290.3252258300781, "loss": 0.5319, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.6078999042510986, "rewards/margins": 0.8791608810424805, "rewards/rejected": 1.728738784790039, "step": 22430 }, { "epoch": 1.0418310970797158, "grad_norm": 139.36233520507812, "learning_rate": 3.2646981444511507e-07, "logits/chosen": -19.687580108642578, "logits/rejected": -19.336902618408203, "logps/chosen": -534.0181884765625, "logps/rejected": -476.51123046875, "loss": 0.6494, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.0844879150390625, "rewards/margins": 0.5785337090492249, "rewards/rejected": 3.5059542655944824, "step": 22440 }, { "epoch": 1.042295371187149, "grad_norm": 70.66111755371094, "learning_rate": 3.263924354272096e-07, "logits/chosen": -18.852781295776367, "logits/rejected": -18.513111114501953, 
"logps/chosen": -370.52044677734375, "logps/rejected": -256.2440185546875, "loss": 0.5123, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.7780685424804688, "rewards/margins": 1.1356112957000732, "rewards/rejected": 2.6424567699432373, "step": 22450 }, { "epoch": 1.042759645294582, "grad_norm": 45.548240661621094, "learning_rate": 3.26315056409304e-07, "logits/chosen": -18.59638786315918, "logits/rejected": -18.667354583740234, "logps/chosen": -311.12335205078125, "logps/rejected": -290.7051696777344, "loss": 0.8058, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.51743483543396, "rewards/margins": 0.4973078668117523, "rewards/rejected": 2.020127058029175, "step": 22460 }, { "epoch": 1.043223919402015, "grad_norm": 213.1436309814453, "learning_rate": 3.262376773913985e-07, "logits/chosen": -20.433422088623047, "logits/rejected": -19.45093536376953, "logps/chosen": -624.6716918945312, "logps/rejected": -459.915771484375, "loss": 0.8953, "rewards/accuracies": 0.5, "rewards/chosen": 4.456840515136719, "rewards/margins": 0.4840849041938782, "rewards/rejected": 3.9727559089660645, "step": 22470 }, { "epoch": 1.043688193509448, "grad_norm": 20.802576065063477, "learning_rate": 3.26160298373493e-07, "logits/chosen": -18.71502685546875, "logits/rejected": -17.62525749206543, "logps/chosen": -324.97479248046875, "logps/rejected": -232.77676391601562, "loss": 0.5943, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.2895658016204834, "rewards/margins": 0.6101301312446594, "rewards/rejected": 1.6794357299804688, "step": 22480 }, { "epoch": 1.044152467616881, "grad_norm": 28.165565490722656, "learning_rate": 3.260829193555875e-07, "logits/chosen": -19.872236251831055, "logits/rejected": -19.342960357666016, "logps/chosen": -428.0279846191406, "logps/rejected": -396.5434875488281, "loss": 0.756, "rewards/accuracies": 0.5, "rewards/chosen": 3.6077053546905518, "rewards/margins": 0.47616925835609436, "rewards/rejected": 3.1315360069274902, 
"step": 22490 }, { "epoch": 1.044616741724314, "grad_norm": 130.1699981689453, "learning_rate": 3.2600554033768203e-07, "logits/chosen": -18.64140510559082, "logits/rejected": -18.459308624267578, "logps/chosen": -480.89471435546875, "logps/rejected": -409.1729431152344, "loss": 0.5758, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.236860990524292, "rewards/margins": 1.175594687461853, "rewards/rejected": 2.0612664222717285, "step": 22500 }, { "epoch": 1.0450810158317472, "grad_norm": 40.943260192871094, "learning_rate": 3.2592816131977654e-07, "logits/chosen": -18.887575149536133, "logits/rejected": -17.95509147644043, "logps/chosen": -539.244873046875, "logps/rejected": -473.7110900878906, "loss": 1.0, "rewards/accuracies": 0.5, "rewards/chosen": 3.964759349822998, "rewards/margins": 1.059316873550415, "rewards/rejected": 2.905442714691162, "step": 22510 }, { "epoch": 1.0455452899391802, "grad_norm": 21.17177963256836, "learning_rate": 3.25850782301871e-07, "logits/chosen": -19.039302825927734, "logits/rejected": -18.529592514038086, "logps/chosen": -440.2171325683594, "logps/rejected": -346.3606872558594, "loss": 0.7186, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.5563900470733643, "rewards/margins": 0.6872431635856628, "rewards/rejected": 2.8691468238830566, "step": 22520 }, { "epoch": 1.0460095640466132, "grad_norm": 56.8607177734375, "learning_rate": 3.257734032839655e-07, "logits/chosen": -18.80160903930664, "logits/rejected": -18.204193115234375, "logps/chosen": -303.8439636230469, "logps/rejected": -240.1595001220703, "loss": 0.604, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.253509283065796, "rewards/margins": 0.7159539461135864, "rewards/rejected": 1.5375555753707886, "step": 22530 }, { "epoch": 1.0464738381540462, "grad_norm": 120.243896484375, "learning_rate": 3.2569602426606e-07, "logits/chosen": -18.158443450927734, "logits/rejected": -17.848873138427734, "logps/chosen": -483.953369140625, 
"logps/rejected": -326.55059814453125, "loss": 0.3108, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.863710880279541, "rewards/margins": 2.03486967086792, "rewards/rejected": 1.828841209411621, "step": 22540 }, { "epoch": 1.0469381122614791, "grad_norm": 35.69999313354492, "learning_rate": 3.2561864524815454e-07, "logits/chosen": -18.77942657470703, "logits/rejected": -18.141849517822266, "logps/chosen": -447.5028381347656, "logps/rejected": -373.72418212890625, "loss": 0.8312, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.3284783363342285, "rewards/margins": 0.6128080487251282, "rewards/rejected": 2.715670108795166, "step": 22550 }, { "epoch": 1.0474023863689121, "grad_norm": 67.8578872680664, "learning_rate": 3.2554126623024894e-07, "logits/chosen": -18.361766815185547, "logits/rejected": -18.179874420166016, "logps/chosen": -409.69549560546875, "logps/rejected": -363.7973327636719, "loss": 0.9615, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": 3.13413405418396, "rewards/margins": -0.08179186284542084, "rewards/rejected": 3.215925931930542, "step": 22560 }, { "epoch": 1.0478666604763451, "grad_norm": 34.84209060668945, "learning_rate": 3.2546388721234345e-07, "logits/chosen": -18.677066802978516, "logits/rejected": -17.304813385009766, "logps/chosen": -383.59149169921875, "logps/rejected": -310.17352294921875, "loss": 0.8967, "rewards/accuracies": 0.5, "rewards/chosen": 3.0560593605041504, "rewards/margins": 0.42902684211730957, "rewards/rejected": 2.62703275680542, "step": 22570 }, { "epoch": 1.0483309345837784, "grad_norm": 27.530838012695312, "learning_rate": 3.2538650819443796e-07, "logits/chosen": -18.125347137451172, "logits/rejected": -17.161922454833984, "logps/chosen": -392.341552734375, "logps/rejected": -301.57684326171875, "loss": 0.5082, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.9232425689697266, "rewards/margins": 1.3812263011932373, "rewards/rejected": 2.5420165061950684, "step": 22580 
}, { "epoch": 1.0487952086912113, "grad_norm": 143.3734130859375, "learning_rate": 3.253091291765325e-07, "logits/chosen": -18.84889793395996, "logits/rejected": -17.247203826904297, "logps/chosen": -490.61328125, "logps/rejected": -287.0898742675781, "loss": 0.4732, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.611880779266357, "rewards/margins": 1.881485939025879, "rewards/rejected": 2.7303948402404785, "step": 22590 }, { "epoch": 1.0492594827986443, "grad_norm": 197.7873077392578, "learning_rate": 3.25231750158627e-07, "logits/chosen": -18.13006019592285, "logits/rejected": -18.537933349609375, "logps/chosen": -255.8184814453125, "logps/rejected": -313.58984375, "loss": 1.3875, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.154674768447876, "rewards/margins": -0.446068674325943, "rewards/rejected": 2.600743293762207, "step": 22600 }, { "epoch": 1.0497237569060773, "grad_norm": 193.1431884765625, "learning_rate": 3.251543711407215e-07, "logits/chosen": -18.458189010620117, "logits/rejected": -17.44785499572754, "logps/chosen": -382.85552978515625, "logps/rejected": -240.7218780517578, "loss": 0.8472, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.745934247970581, "rewards/margins": 1.4723427295684814, "rewards/rejected": 2.2735915184020996, "step": 22610 }, { "epoch": 1.0501880310135103, "grad_norm": 188.7971649169922, "learning_rate": 3.2507699212281596e-07, "logits/chosen": -18.58283042907715, "logits/rejected": -18.076187133789062, "logps/chosen": -453.0526428222656, "logps/rejected": -423.4864807128906, "loss": 1.0044, "rewards/accuracies": 0.5, "rewards/chosen": 3.699375867843628, "rewards/margins": 0.25682222843170166, "rewards/rejected": 3.442553997039795, "step": 22620 }, { "epoch": 1.0506523051209433, "grad_norm": 11.573293685913086, "learning_rate": 3.2499961310491047e-07, "logits/chosen": -20.126291275024414, "logits/rejected": -17.795988082885742, "logps/chosen": -389.95635986328125, "logps/rejected": 
-229.76870727539062, "loss": 0.4039, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.461460828781128, "rewards/margins": 1.6120471954345703, "rewards/rejected": 1.849413514137268, "step": 22630 }, { "epoch": 1.0511165792283765, "grad_norm": 31.164216995239258, "learning_rate": 3.24922234087005e-07, "logits/chosen": -19.051876068115234, "logits/rejected": -18.58540916442871, "logps/chosen": -301.930908203125, "logps/rejected": -286.28741455078125, "loss": 0.6512, "rewards/accuracies": 0.5, "rewards/chosen": 2.2101500034332275, "rewards/margins": 0.5201814770698547, "rewards/rejected": 1.6899687051773071, "step": 22640 }, { "epoch": 1.0515808533358095, "grad_norm": 13.64538860321045, "learning_rate": 3.2484485506909944e-07, "logits/chosen": -19.23063850402832, "logits/rejected": -18.207895278930664, "logps/chosen": -409.10760498046875, "logps/rejected": -340.4429016113281, "loss": 0.5715, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.248121738433838, "rewards/margins": 1.0027382373809814, "rewards/rejected": 2.2453835010528564, "step": 22650 }, { "epoch": 1.0520451274432425, "grad_norm": 17.155067443847656, "learning_rate": 3.2476747605119395e-07, "logits/chosen": -19.45952796936035, "logits/rejected": -18.24188804626465, "logps/chosen": -408.2054748535156, "logps/rejected": -272.9046630859375, "loss": 0.5156, "rewards/accuracies": 0.5, "rewards/chosen": 3.4983794689178467, "rewards/margins": 1.8742278814315796, "rewards/rejected": 1.6241514682769775, "step": 22660 }, { "epoch": 1.0525094015506755, "grad_norm": 210.51168823242188, "learning_rate": 3.246900970332884e-07, "logits/chosen": -20.35183334350586, "logits/rejected": -18.750534057617188, "logps/chosen": -525.8820190429688, "logps/rejected": -444.7040100097656, "loss": 0.7676, "rewards/accuracies": 0.5, "rewards/chosen": 3.849870204925537, "rewards/margins": 0.7368731498718262, "rewards/rejected": 3.112996816635132, "step": 22670 }, { "epoch": 1.0529736756581085, "grad_norm": 
35.5096321105957, "learning_rate": 3.246127180153829e-07, "logits/chosen": -19.011592864990234, "logits/rejected": -18.497211456298828, "logps/chosen": -516.7036743164062, "logps/rejected": -450.60565185546875, "loss": 0.7116, "rewards/accuracies": 0.5, "rewards/chosen": 3.922795057296753, "rewards/margins": 0.6221387982368469, "rewards/rejected": 3.3006560802459717, "step": 22680 }, { "epoch": 1.0534379497655415, "grad_norm": 34.2889289855957, "learning_rate": 3.2453533899747743e-07, "logits/chosen": -18.30344009399414, "logits/rejected": -18.345048904418945, "logps/chosen": -355.14520263671875, "logps/rejected": -368.0216064453125, "loss": 0.6615, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.1641342639923096, "rewards/margins": 0.544843316078186, "rewards/rejected": 2.619290828704834, "step": 22690 }, { "epoch": 1.0539022238729745, "grad_norm": 159.70474243164062, "learning_rate": 3.2445795997957194e-07, "logits/chosen": -18.869873046875, "logits/rejected": -18.803600311279297, "logps/chosen": -403.76446533203125, "logps/rejected": -479.45623779296875, "loss": 1.5134, "rewards/accuracies": 0.5, "rewards/chosen": 4.213736534118652, "rewards/margins": -0.09212814271450043, "rewards/rejected": 4.3058648109436035, "step": 22700 }, { "epoch": 1.0543664979804077, "grad_norm": 108.34195709228516, "learning_rate": 3.2438058096166645e-07, "logits/chosen": -19.371538162231445, "logits/rejected": -19.105289459228516, "logps/chosen": -329.9634704589844, "logps/rejected": -284.5223693847656, "loss": 0.7856, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.3687562942504883, "rewards/margins": 0.6689350008964539, "rewards/rejected": 1.6998212337493896, "step": 22710 }, { "epoch": 1.0548307720878407, "grad_norm": 191.2266082763672, "learning_rate": 3.243032019437609e-07, "logits/chosen": -18.521671295166016, "logits/rejected": -17.98859977722168, "logps/chosen": -341.9755859375, "logps/rejected": -305.082763671875, "loss": 0.6644, 
"rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.3826231956481934, "rewards/margins": 0.7439010739326477, "rewards/rejected": 1.6387220621109009, "step": 22720 }, { "epoch": 1.0552950461952737, "grad_norm": 61.98442840576172, "learning_rate": 3.242258229258554e-07, "logits/chosen": -18.505455017089844, "logits/rejected": -18.277603149414062, "logps/chosen": -373.1592102050781, "logps/rejected": -334.94384765625, "loss": 0.825, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.4819865226745605, "rewards/margins": 0.15618391335010529, "rewards/rejected": 2.3258023262023926, "step": 22730 }, { "epoch": 1.0557593203027067, "grad_norm": 108.84259796142578, "learning_rate": 3.2414844390794993e-07, "logits/chosen": -18.70059585571289, "logits/rejected": -18.621938705444336, "logps/chosen": -361.53448486328125, "logps/rejected": -391.8524475097656, "loss": 0.7871, "rewards/accuracies": 0.5, "rewards/chosen": 3.2663371562957764, "rewards/margins": 0.7367883920669556, "rewards/rejected": 2.5295486450195312, "step": 22740 }, { "epoch": 1.0562235944101397, "grad_norm": 62.353179931640625, "learning_rate": 3.240710648900444e-07, "logits/chosen": -18.735397338867188, "logits/rejected": -18.545204162597656, "logps/chosen": -351.28717041015625, "logps/rejected": -300.71630859375, "loss": 0.9388, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.50384783744812, "rewards/margins": 0.3322061598300934, "rewards/rejected": 3.1716418266296387, "step": 22750 }, { "epoch": 1.0566878685175727, "grad_norm": 169.76060485839844, "learning_rate": 3.239936858721389e-07, "logits/chosen": -19.968740463256836, "logits/rejected": -18.726879119873047, "logps/chosen": -391.2210388183594, "logps/rejected": -372.0293884277344, "loss": 0.8867, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.6717529296875, "rewards/margins": 0.7994208335876465, "rewards/rejected": 2.8723320960998535, "step": 22760 }, { "epoch": 1.057152142625006, "grad_norm": 
82.30223846435547, "learning_rate": 3.2391630685423336e-07, "logits/chosen": -19.612323760986328, "logits/rejected": -19.068065643310547, "logps/chosen": -444.38214111328125, "logps/rejected": -360.8807373046875, "loss": 0.7026, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.9757728576660156, "rewards/margins": 0.8907161951065063, "rewards/rejected": 3.085057258605957, "step": 22770 }, { "epoch": 1.057616416732439, "grad_norm": 9.659241676330566, "learning_rate": 3.2383892783632787e-07, "logits/chosen": -19.137617111206055, "logits/rejected": -18.06766128540039, "logps/chosen": -425.46142578125, "logps/rejected": -338.95709228515625, "loss": 0.3057, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.662259578704834, "rewards/margins": 1.7496490478515625, "rewards/rejected": 1.912610411643982, "step": 22780 }, { "epoch": 1.0580806908398719, "grad_norm": 3.3398139476776123, "learning_rate": 3.237615488184224e-07, "logits/chosen": -19.19150161743164, "logits/rejected": -18.238122940063477, "logps/chosen": -448.68450927734375, "logps/rejected": -353.74237060546875, "loss": 0.5448, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.551173448562622, "rewards/margins": 1.45619535446167, "rewards/rejected": 2.094977855682373, "step": 22790 }, { "epoch": 1.0585449649473049, "grad_norm": 37.17695999145508, "learning_rate": 3.236841698005169e-07, "logits/chosen": -18.589155197143555, "logits/rejected": -17.692184448242188, "logps/chosen": -365.5999450683594, "logps/rejected": -294.0623474121094, "loss": 0.5037, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.406393051147461, "rewards/margins": 1.1864089965820312, "rewards/rejected": 2.219984531402588, "step": 22800 }, { "epoch": 1.0590092390547379, "grad_norm": 197.03175354003906, "learning_rate": 3.236067907826114e-07, "logits/chosen": -18.239078521728516, "logits/rejected": -17.193256378173828, "logps/chosen": -327.6639709472656, "logps/rejected": -252.65060424804688, "loss": 
0.6299, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.9011869430541992, "rewards/margins": 0.6544877886772156, "rewards/rejected": 1.2466988563537598, "step": 22810 }, { "epoch": 1.0594735131621709, "grad_norm": 2.1196365356445312, "learning_rate": 3.2352941176470586e-07, "logits/chosen": -17.696767807006836, "logits/rejected": -16.959430694580078, "logps/chosen": -331.5827941894531, "logps/rejected": -213.81192016601562, "loss": 0.5503, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.7960450649261475, "rewards/margins": 1.2776185274124146, "rewards/rejected": 1.5184262990951538, "step": 22820 }, { "epoch": 1.059937787269604, "grad_norm": 97.60128021240234, "learning_rate": 3.234520327468004e-07, "logits/chosen": -19.269437789916992, "logits/rejected": -18.419857025146484, "logps/chosen": -463.9236755371094, "logps/rejected": -365.4163818359375, "loss": 0.4492, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.235814094543457, "rewards/margins": 1.247936487197876, "rewards/rejected": 2.98787784576416, "step": 22830 }, { "epoch": 1.060402061377037, "grad_norm": 11.521659851074219, "learning_rate": 3.2337465372889483e-07, "logits/chosen": -18.358478546142578, "logits/rejected": -17.8443603515625, "logps/chosen": -303.8401184082031, "logps/rejected": -308.77886962890625, "loss": 0.6658, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.816999673843384, "rewards/margins": 0.8411165475845337, "rewards/rejected": 1.97588312625885, "step": 22840 }, { "epoch": 1.06086633548447, "grad_norm": 57.145938873291016, "learning_rate": 3.2329727471098935e-07, "logits/chosen": -19.837017059326172, "logits/rejected": -18.588289260864258, "logps/chosen": -457.5908203125, "logps/rejected": -376.14300537109375, "loss": 0.558, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.738518238067627, "rewards/margins": 1.3768978118896484, "rewards/rejected": 3.3616206645965576, "step": 22850 }, { "epoch": 1.061330609591903, 
"grad_norm": 125.14153289794922, "learning_rate": 3.2321989569308386e-07, "logits/chosen": -19.058849334716797, "logits/rejected": -18.166244506835938, "logps/chosen": -486.8004455566406, "logps/rejected": -395.6397399902344, "loss": 0.6082, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.958207607269287, "rewards/margins": 1.0056462287902832, "rewards/rejected": 2.952561616897583, "step": 22860 }, { "epoch": 1.061794883699336, "grad_norm": 140.81468200683594, "learning_rate": 3.231425166751783e-07, "logits/chosen": -18.891237258911133, "logits/rejected": -18.209152221679688, "logps/chosen": -333.3502502441406, "logps/rejected": -319.60723876953125, "loss": 0.845, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.792370319366455, "rewards/margins": 0.7139889001846313, "rewards/rejected": 2.0783815383911133, "step": 22870 }, { "epoch": 1.062259157806769, "grad_norm": 79.78160095214844, "learning_rate": 3.2306513765727283e-07, "logits/chosen": -18.567176818847656, "logits/rejected": -18.0595645904541, "logps/chosen": -391.4489440917969, "logps/rejected": -319.96282958984375, "loss": 0.4012, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.829075336456299, "rewards/margins": 1.4890865087509155, "rewards/rejected": 2.339988946914673, "step": 22880 }, { "epoch": 1.062723431914202, "grad_norm": 15.006538391113281, "learning_rate": 3.2298775863936734e-07, "logits/chosen": -18.269031524658203, "logits/rejected": -17.024206161499023, "logps/chosen": -293.66461181640625, "logps/rejected": -196.2481231689453, "loss": 0.262, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.1302084922790527, "rewards/margins": 1.968108892440796, "rewards/rejected": 1.1620997190475464, "step": 22890 }, { "epoch": 1.0631877060216353, "grad_norm": 212.94805908203125, "learning_rate": 3.2291037962146185e-07, "logits/chosen": -18.61914825439453, "logits/rejected": -17.528125762939453, "logps/chosen": -464.235595703125, "logps/rejected": 
-271.8114318847656, "loss": 0.3539, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.8220057487487793, "rewards/margins": 1.3108268976211548, "rewards/rejected": 1.5111788511276245, "step": 22900 }, { "epoch": 1.0636519801290683, "grad_norm": 66.53014373779297, "learning_rate": 3.2283300060355636e-07, "logits/chosen": -18.27657127380371, "logits/rejected": -17.544342041015625, "logps/chosen": -437.63018798828125, "logps/rejected": -319.4408264160156, "loss": 0.4801, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.978781223297119, "rewards/margins": 1.3159716129302979, "rewards/rejected": 1.6628096103668213, "step": 22910 }, { "epoch": 1.0641162542365012, "grad_norm": 16.982746124267578, "learning_rate": 3.227556215856508e-07, "logits/chosen": -17.849441528320312, "logits/rejected": -16.813831329345703, "logps/chosen": -418.8321228027344, "logps/rejected": -297.27630615234375, "loss": 0.4063, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.461988925933838, "rewards/margins": 1.5716387033462524, "rewards/rejected": 2.890350341796875, "step": 22920 }, { "epoch": 1.0645805283439342, "grad_norm": 63.74024963378906, "learning_rate": 3.2267824256774533e-07, "logits/chosen": -19.176525115966797, "logits/rejected": -18.382570266723633, "logps/chosen": -487.8866271972656, "logps/rejected": -346.4284973144531, "loss": 1.0448, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1270031929016113, "rewards/margins": 0.4848678708076477, "rewards/rejected": 2.6421351432800293, "step": 22930 }, { "epoch": 1.0650448024513672, "grad_norm": 30.95237922668457, "learning_rate": 3.226008635498398e-07, "logits/chosen": -19.149250030517578, "logits/rejected": -18.897397994995117, "logps/chosen": -319.98883056640625, "logps/rejected": -280.32098388671875, "loss": 0.6561, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.8865420818328857, "rewards/margins": 0.6546808481216431, "rewards/rejected": 2.2318613529205322, "step": 22940 }, 
{ "epoch": 1.0655090765588002, "grad_norm": 52.531917572021484, "learning_rate": 3.225234845319343e-07, "logits/chosen": -19.558910369873047, "logits/rejected": -18.545564651489258, "logps/chosen": -393.2786560058594, "logps/rejected": -252.5296630859375, "loss": 0.4983, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.326190233230591, "rewards/margins": 1.4892394542694092, "rewards/rejected": 1.8369510173797607, "step": 22950 }, { "epoch": 1.0659733506662334, "grad_norm": 11.682816505432129, "learning_rate": 3.224461055140288e-07, "logits/chosen": -18.65239143371582, "logits/rejected": -18.134296417236328, "logps/chosen": -359.4520568847656, "logps/rejected": -366.2200622558594, "loss": 0.5968, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.445467233657837, "rewards/margins": 0.7099035978317261, "rewards/rejected": 1.7355635166168213, "step": 22960 }, { "epoch": 1.0664376247736664, "grad_norm": 193.89523315429688, "learning_rate": 3.2236872649612327e-07, "logits/chosen": -18.53167152404785, "logits/rejected": -18.129430770874023, "logps/chosen": -503.2019958496094, "logps/rejected": -461.1338806152344, "loss": 0.5492, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.477302551269531, "rewards/margins": 0.9648006558418274, "rewards/rejected": 3.5125019550323486, "step": 22970 }, { "epoch": 1.0669018988810994, "grad_norm": 147.180419921875, "learning_rate": 3.222913474782178e-07, "logits/chosen": -19.216217041015625, "logits/rejected": -18.99928855895996, "logps/chosen": -344.54022216796875, "logps/rejected": -302.89569091796875, "loss": 0.4877, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2455475330352783, "rewards/margins": 0.882250189781189, "rewards/rejected": 2.3632972240448, "step": 22980 }, { "epoch": 1.0673661729885324, "grad_norm": 51.51752471923828, "learning_rate": 3.222139684603123e-07, "logits/chosen": -18.916067123413086, "logits/rejected": -18.76664161682129, "logps/chosen": -372.7596740722656, 
"logps/rejected": -349.96319580078125, "loss": 0.689, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2460334300994873, "rewards/margins": 0.44387999176979065, "rewards/rejected": 2.8021538257598877, "step": 22990 }, { "epoch": 1.0678304470959654, "grad_norm": 315.4212646484375, "learning_rate": 3.221365894424068e-07, "logits/chosen": -20.1028995513916, "logits/rejected": -19.04157257080078, "logps/chosen": -517.1243286132812, "logps/rejected": -399.3967590332031, "loss": 0.5675, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.357949256896973, "rewards/margins": 1.4245227575302124, "rewards/rejected": 2.93342661857605, "step": 23000 }, { "epoch": 1.0682947212033984, "grad_norm": 214.92189025878906, "learning_rate": 3.220592104245013e-07, "logits/chosen": -19.634794235229492, "logits/rejected": -18.591583251953125, "logps/chosen": -368.89239501953125, "logps/rejected": -293.4572448730469, "loss": 0.7875, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1112236976623535, "rewards/margins": 1.1388088464736938, "rewards/rejected": 1.9724149703979492, "step": 23010 }, { "epoch": 1.0687589953108314, "grad_norm": 272.14752197265625, "learning_rate": 3.2198183140659577e-07, "logits/chosen": -18.595678329467773, "logits/rejected": -17.27107048034668, "logps/chosen": -459.0633850097656, "logps/rejected": -325.1034851074219, "loss": 0.5812, "rewards/accuracies": 0.5, "rewards/chosen": 3.654115676879883, "rewards/margins": 1.482741117477417, "rewards/rejected": 2.1713738441467285, "step": 23020 }, { "epoch": 1.0692232694182646, "grad_norm": 62.99557876586914, "learning_rate": 3.219044523886903e-07, "logits/chosen": -17.683822631835938, "logits/rejected": -17.855087280273438, "logps/chosen": -297.5009765625, "logps/rejected": -332.44781494140625, "loss": 1.6686, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.4307169914245605, "rewards/margins": -0.6372658610343933, "rewards/rejected": 3.0679826736450195, "step": 23030 }, { 
"epoch": 1.0696875435256976, "grad_norm": 96.94075775146484, "learning_rate": 3.2182707337078474e-07, "logits/chosen": -18.849933624267578, "logits/rejected": -17.82218360900879, "logps/chosen": -322.4747009277344, "logps/rejected": -215.43630981445312, "loss": 0.5422, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.9865331649780273, "rewards/margins": 1.47989022731781, "rewards/rejected": 1.5066428184509277, "step": 23040 }, { "epoch": 1.0701518176331306, "grad_norm": 92.91490936279297, "learning_rate": 3.2174969435287925e-07, "logits/chosen": -19.124065399169922, "logits/rejected": -19.107919692993164, "logps/chosen": -296.423095703125, "logps/rejected": -316.6776428222656, "loss": 0.8434, "rewards/accuracies": 0.5, "rewards/chosen": 2.6265275478363037, "rewards/margins": 0.27933311462402344, "rewards/rejected": 2.3471946716308594, "step": 23050 }, { "epoch": 1.0706160917405636, "grad_norm": 10.08176040649414, "learning_rate": 3.2167231533497377e-07, "logits/chosen": -19.33837890625, "logits/rejected": -18.30634117126465, "logps/chosen": -440.95623779296875, "logps/rejected": -353.5030212402344, "loss": 0.5942, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.8440093994140625, "rewards/margins": 2.026160478591919, "rewards/rejected": 2.8178491592407227, "step": 23060 }, { "epoch": 1.0710803658479966, "grad_norm": 49.53594970703125, "learning_rate": 3.215949363170682e-07, "logits/chosen": -18.67399024963379, "logits/rejected": -17.728605270385742, "logps/chosen": -415.22100830078125, "logps/rejected": -328.4779357910156, "loss": 0.3279, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.197204113006592, "rewards/margins": 1.5184074640274048, "rewards/rejected": 2.6787967681884766, "step": 23070 }, { "epoch": 1.0715446399554296, "grad_norm": 186.16128540039062, "learning_rate": 3.2151755729916273e-07, "logits/chosen": -18.47464370727539, "logits/rejected": -18.069772720336914, "logps/chosen": -282.67962646484375, 
"logps/rejected": -242.5900115966797, "loss": 0.8909, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.4254355430603027, "rewards/margins": 0.2787521779537201, "rewards/rejected": 2.14668345451355, "step": 23080 }, { "epoch": 1.0720089140628628, "grad_norm": 43.18574142456055, "learning_rate": 3.2144017828125725e-07, "logits/chosen": -19.19344711303711, "logits/rejected": -18.524932861328125, "logps/chosen": -407.55975341796875, "logps/rejected": -313.12286376953125, "loss": 0.7217, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.596721649169922, "rewards/margins": 0.9955275654792786, "rewards/rejected": 2.601193904876709, "step": 23090 }, { "epoch": 1.0724731881702958, "grad_norm": 266.44989013671875, "learning_rate": 3.2136279926335176e-07, "logits/chosen": -18.531597137451172, "logits/rejected": -18.484893798828125, "logps/chosen": -311.5288391113281, "logps/rejected": -319.785888671875, "loss": 1.1104, "rewards/accuracies": 0.5, "rewards/chosen": 2.176226854324341, "rewards/margins": -0.048955000936985016, "rewards/rejected": 2.225182056427002, "step": 23100 }, { "epoch": 1.0729374622777288, "grad_norm": 85.3512191772461, "learning_rate": 3.2128542024544627e-07, "logits/chosen": -18.74072265625, "logits/rejected": -18.927715301513672, "logps/chosen": -382.66253662109375, "logps/rejected": -424.0394592285156, "loss": 1.2661, "rewards/accuracies": 0.5, "rewards/chosen": 3.3120830059051514, "rewards/margins": -0.4803132116794586, "rewards/rejected": 3.7923965454101562, "step": 23110 }, { "epoch": 1.0734017363851618, "grad_norm": 180.1964874267578, "learning_rate": 3.2120804122754073e-07, "logits/chosen": -19.231243133544922, "logits/rejected": -17.741687774658203, "logps/chosen": -416.47149658203125, "logps/rejected": -277.16131591796875, "loss": 0.4908, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.931380271911621, "rewards/margins": 2.301848888397217, "rewards/rejected": 2.6295313835144043, "step": 23120 }, { "epoch": 
1.0738660104925948, "grad_norm": 101.58229064941406, "learning_rate": 3.211306622096352e-07, "logits/chosen": -18.440399169921875, "logits/rejected": -17.337162017822266, "logps/chosen": -504.06048583984375, "logps/rejected": -311.938232421875, "loss": 0.4163, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.125975608825684, "rewards/margins": 1.6506452560424805, "rewards/rejected": 2.475330114364624, "step": 23130 }, { "epoch": 1.0743302846000278, "grad_norm": 43.33998107910156, "learning_rate": 3.210532831917297e-07, "logits/chosen": -19.608016967773438, "logits/rejected": -19.27338981628418, "logps/chosen": -472.23516845703125, "logps/rejected": -336.13525390625, "loss": 0.4666, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.8055458068847656, "rewards/margins": 0.857852578163147, "rewards/rejected": 2.9476935863494873, "step": 23140 }, { "epoch": 1.074794558707461, "grad_norm": 28.415298461914062, "learning_rate": 3.209759041738242e-07, "logits/chosen": -19.059236526489258, "logits/rejected": -17.72905731201172, "logps/chosen": -478.581298828125, "logps/rejected": -358.5870666503906, "loss": 0.569, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.7219438552856445, "rewards/margins": 2.154494524002075, "rewards/rejected": 2.5674490928649902, "step": 23150 }, { "epoch": 1.075258832814894, "grad_norm": 50.1676139831543, "learning_rate": 3.208985251559187e-07, "logits/chosen": -19.861190795898438, "logits/rejected": -18.258686065673828, "logps/chosen": -482.3946838378906, "logps/rejected": -400.7640686035156, "loss": 0.2566, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.457715034484863, "rewards/margins": 2.5860610008239746, "rewards/rejected": 2.8716530799865723, "step": 23160 }, { "epoch": 1.075723106922327, "grad_norm": 138.1268310546875, "learning_rate": 3.208211461380132e-07, "logits/chosen": -18.571508407592773, "logits/rejected": -18.527585983276367, "logps/chosen": -322.77752685546875, "logps/rejected": 
-287.3092956542969, "loss": 0.7107, "rewards/accuracies": 0.5, "rewards/chosen": 2.2336132526397705, "rewards/margins": 0.20798537135124207, "rewards/rejected": 2.025627613067627, "step": 23170 }, { "epoch": 1.07618738102976, "grad_norm": 57.86177444458008, "learning_rate": 3.207437671201077e-07, "logits/chosen": -18.32577896118164, "logits/rejected": -17.84450912475586, "logps/chosen": -372.0607604980469, "logps/rejected": -279.66717529296875, "loss": 0.5968, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.863734483718872, "rewards/margins": 1.3867766857147217, "rewards/rejected": 2.4769577980041504, "step": 23180 }, { "epoch": 1.076651655137193, "grad_norm": 135.9197998046875, "learning_rate": 3.206663881022022e-07, "logits/chosen": -18.537822723388672, "logits/rejected": -17.36832618713379, "logps/chosen": -447.32598876953125, "logps/rejected": -297.56268310546875, "loss": 0.4447, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.146626949310303, "rewards/margins": 2.0516908168792725, "rewards/rejected": 2.094935894012451, "step": 23190 }, { "epoch": 1.077115929244626, "grad_norm": 128.9460906982422, "learning_rate": 3.205890090842967e-07, "logits/chosen": -18.573205947875977, "logits/rejected": -18.132516860961914, "logps/chosen": -395.99237060546875, "logps/rejected": -332.3867492675781, "loss": 0.8384, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.3125343322753906, "rewards/margins": 0.5105863809585571, "rewards/rejected": 2.8019473552703857, "step": 23200 }, { "epoch": 1.077580203352059, "grad_norm": 68.35189819335938, "learning_rate": 3.205116300663912e-07, "logits/chosen": -17.590574264526367, "logits/rejected": -18.155067443847656, "logps/chosen": -345.32568359375, "logps/rejected": -411.3525390625, "loss": 1.7613, "rewards/accuracies": 0.5, "rewards/chosen": 2.908360481262207, "rewards/margins": -0.49729281663894653, "rewards/rejected": 3.405653476715088, "step": 23210 }, { "epoch": 1.0780444774594922, "grad_norm": 
141.0111846923828, "learning_rate": 3.204342510484857e-07, "logits/chosen": -18.5425968170166, "logits/rejected": -17.534908294677734, "logps/chosen": -358.3990783691406, "logps/rejected": -265.1903381347656, "loss": 0.3167, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.274590015411377, "rewards/margins": 1.7220863103866577, "rewards/rejected": 1.5525034666061401, "step": 23220 }, { "epoch": 1.0785087515669252, "grad_norm": 101.63026428222656, "learning_rate": 3.2035687203058014e-07, "logits/chosen": -18.230388641357422, "logits/rejected": -18.91440200805664, "logps/chosen": -361.08148193359375, "logps/rejected": -303.1392517089844, "loss": 0.7193, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 3.2380123138427734, "rewards/margins": 0.4976314604282379, "rewards/rejected": 2.7403807640075684, "step": 23230 }, { "epoch": 1.0789730256743582, "grad_norm": 146.81031799316406, "learning_rate": 3.2027949301267465e-07, "logits/chosen": -18.95217514038086, "logits/rejected": -18.77255630493164, "logps/chosen": -425.84375, "logps/rejected": -401.63916015625, "loss": 0.5939, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.022145986557007, "rewards/margins": 0.5089812278747559, "rewards/rejected": 2.513164520263672, "step": 23240 }, { "epoch": 1.0794372997817911, "grad_norm": 5.6083502769470215, "learning_rate": 3.2020211399476916e-07, "logits/chosen": -19.3253231048584, "logits/rejected": -19.225893020629883, "logps/chosen": -352.246337890625, "logps/rejected": -391.3905029296875, "loss": 1.3206, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.6298797130584717, "rewards/margins": -0.39139851927757263, "rewards/rejected": 3.0212783813476562, "step": 23250 }, { "epoch": 1.0799015738892241, "grad_norm": 59.64235305786133, "learning_rate": 3.2012473497686367e-07, "logits/chosen": -20.034116744995117, "logits/rejected": -19.018129348754883, "logps/chosen": -497.94659423828125, "logps/rejected": -344.3608093261719, "loss": 
0.3126, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.446076393127441, "rewards/margins": 1.7590614557266235, "rewards/rejected": 2.687014579772949, "step": 23260 }, { "epoch": 1.0803658479966571, "grad_norm": 74.7237777709961, "learning_rate": 3.2004735595895813e-07, "logits/chosen": -19.12938690185547, "logits/rejected": -18.002605438232422, "logps/chosen": -386.6355285644531, "logps/rejected": -306.1251220703125, "loss": 0.529, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4841110706329346, "rewards/margins": 1.401593804359436, "rewards/rejected": 2.082516670227051, "step": 23270 }, { "epoch": 1.0808301221040904, "grad_norm": 8.482105255126953, "learning_rate": 3.1996997694105264e-07, "logits/chosen": -18.701900482177734, "logits/rejected": -17.782695770263672, "logps/chosen": -399.27777099609375, "logps/rejected": -290.8585510253906, "loss": 0.6328, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.3099682331085205, "rewards/margins": 1.1462371349334717, "rewards/rejected": 2.163731098175049, "step": 23280 }, { "epoch": 1.0812943962115233, "grad_norm": 18.071468353271484, "learning_rate": 3.1989259792314715e-07, "logits/chosen": -19.227745056152344, "logits/rejected": -18.750354766845703, "logps/chosen": -442.41754150390625, "logps/rejected": -463.7293395996094, "loss": 0.6983, "rewards/accuracies": 0.5, "rewards/chosen": 3.9303977489471436, "rewards/margins": 0.900119423866272, "rewards/rejected": 3.030278205871582, "step": 23290 }, { "epoch": 1.0817586703189563, "grad_norm": 166.5045928955078, "learning_rate": 3.1981521890524167e-07, "logits/chosen": -19.23068618774414, "logits/rejected": -18.468238830566406, "logps/chosen": -366.5668640136719, "logps/rejected": -388.57183837890625, "loss": 0.8068, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2420029640197754, "rewards/margins": 0.3364938795566559, "rewards/rejected": 2.9055092334747314, "step": 23300 }, { "epoch": 1.0822229444263893, "grad_norm": 
55.03152084350586, "learning_rate": 3.197378398873362e-07, "logits/chosen": -19.607973098754883, "logits/rejected": -19.168249130249023, "logps/chosen": -373.1304016113281, "logps/rejected": -354.1461486816406, "loss": 0.668, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.5044524669647217, "rewards/margins": 0.709981381893158, "rewards/rejected": 2.794471263885498, "step": 23310 }, { "epoch": 1.0826872185338223, "grad_norm": 0.9811850190162659, "learning_rate": 3.196604608694306e-07, "logits/chosen": -18.147933959960938, "logits/rejected": -16.8913516998291, "logps/chosen": -503.9153747558594, "logps/rejected": -322.84893798828125, "loss": 0.4781, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.782201051712036, "rewards/margins": 1.6862977743148804, "rewards/rejected": 2.0959036350250244, "step": 23320 }, { "epoch": 1.0831514926412553, "grad_norm": 6.116415977478027, "learning_rate": 3.195830818515251e-07, "logits/chosen": -18.724300384521484, "logits/rejected": -17.970844268798828, "logps/chosen": -516.7493896484375, "logps/rejected": -407.01605224609375, "loss": 0.3169, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.7447350025177, "rewards/margins": 1.576549768447876, "rewards/rejected": 2.1681854724884033, "step": 23330 }, { "epoch": 1.0836157667486885, "grad_norm": 44.06062316894531, "learning_rate": 3.195057028336196e-07, "logits/chosen": -18.133996963500977, "logits/rejected": -17.59290313720703, "logps/chosen": -268.0210266113281, "logps/rejected": -235.90817260742188, "loss": 0.6099, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.9951845407485962, "rewards/margins": 0.5045181512832642, "rewards/rejected": 1.4906666278839111, "step": 23340 }, { "epoch": 1.0840800408561215, "grad_norm": 18.20830535888672, "learning_rate": 3.194283238157141e-07, "logits/chosen": -18.71932601928711, "logits/rejected": -18.027576446533203, "logps/chosen": -510.38043212890625, "logps/rejected": -360.9032287597656, "loss": 
0.5652, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.384866237640381, "rewards/margins": 1.5511895418167114, "rewards/rejected": 2.833676815032959, "step": 23350 }, { "epoch": 1.0845443149635545, "grad_norm": 30.547916412353516, "learning_rate": 3.1935094479780863e-07, "logits/chosen": -18.568408966064453, "logits/rejected": -19.307958602905273, "logps/chosen": -400.85504150390625, "logps/rejected": -356.4402770996094, "loss": 0.9095, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.278986692428589, "rewards/margins": 0.2606140077114105, "rewards/rejected": 3.0183730125427246, "step": 23360 }, { "epoch": 1.0850085890709875, "grad_norm": 0.15748952329158783, "learning_rate": 3.192735657799031e-07, "logits/chosen": -20.135873794555664, "logits/rejected": -19.84678840637207, "logps/chosen": -382.71026611328125, "logps/rejected": -327.1499328613281, "loss": 0.8177, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.850623607635498, "rewards/margins": 0.5098382830619812, "rewards/rejected": 2.340785264968872, "step": 23370 }, { "epoch": 1.0854728631784205, "grad_norm": 25.978919982910156, "learning_rate": 3.191961867619976e-07, "logits/chosen": -19.604061126708984, "logits/rejected": -18.05472755432129, "logps/chosen": -375.50787353515625, "logps/rejected": -281.84002685546875, "loss": 0.4012, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.470144271850586, "rewards/margins": 1.2468318939208984, "rewards/rejected": 2.2233121395111084, "step": 23380 }, { "epoch": 1.0859371372858535, "grad_norm": 3.4706673622131348, "learning_rate": 3.191188077440921e-07, "logits/chosen": -19.155864715576172, "logits/rejected": -18.976383209228516, "logps/chosen": -406.013671875, "logps/rejected": -377.92803955078125, "loss": 1.0826, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.3895423412323, "rewards/margins": 0.11601848900318146, "rewards/rejected": 3.2735238075256348, "step": 23390 }, { "epoch": 1.0864014113932865, 
"grad_norm": 62.94377899169922, "learning_rate": 3.190414287261866e-07, "logits/chosen": -18.71853256225586, "logits/rejected": -18.216426849365234, "logps/chosen": -364.1255187988281, "logps/rejected": -331.95379638671875, "loss": 0.8467, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.0982279777526855, "rewards/margins": 0.40643101930618286, "rewards/rejected": 3.6917972564697266, "step": 23400 }, { "epoch": 1.0868656855007197, "grad_norm": 68.89019775390625, "learning_rate": 3.1896404970828113e-07, "logits/chosen": -19.198238372802734, "logits/rejected": -18.20401382446289, "logps/chosen": -362.7730712890625, "logps/rejected": -335.7595520019531, "loss": 0.5471, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.1042046546936035, "rewards/margins": 1.1566364765167236, "rewards/rejected": 1.9475681781768799, "step": 23410 }, { "epoch": 1.0873299596081527, "grad_norm": 265.7239990234375, "learning_rate": 3.1888667069037554e-07, "logits/chosen": -18.050045013427734, "logits/rejected": -18.425281524658203, "logps/chosen": -321.5020446777344, "logps/rejected": -355.5464172363281, "loss": 1.3074, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.3912012577056885, "rewards/margins": -0.4034716486930847, "rewards/rejected": 2.7946724891662598, "step": 23420 }, { "epoch": 1.0877942337155857, "grad_norm": 141.29661560058594, "learning_rate": 3.1880929167247005e-07, "logits/chosen": -19.09810447692871, "logits/rejected": -19.124500274658203, "logps/chosen": -395.36346435546875, "logps/rejected": -309.8217468261719, "loss": 0.7482, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.6365058422088623, "rewards/margins": 0.4398480951786041, "rewards/rejected": 3.196658134460449, "step": 23430 }, { "epoch": 1.0882585078230187, "grad_norm": 26.44422149658203, "learning_rate": 3.1873191265456456e-07, "logits/chosen": -18.028593063354492, "logits/rejected": -17.711536407470703, "logps/chosen": -299.0086669921875, "logps/rejected": 
-194.57313537597656, "loss": 0.6507, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.2252650260925293, "rewards/margins": 0.7472713589668274, "rewards/rejected": 1.4779938459396362, "step": 23440 }, { "epoch": 1.0887227819304517, "grad_norm": 44.33222961425781, "learning_rate": 3.1865453363665907e-07, "logits/chosen": -19.099491119384766, "logits/rejected": -18.07435417175293, "logps/chosen": -474.6730041503906, "logps/rejected": -353.5782470703125, "loss": 1.1877, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.3319382667541504, "rewards/margins": 0.7324779033660889, "rewards/rejected": 2.5994603633880615, "step": 23450 }, { "epoch": 1.0891870560378847, "grad_norm": 121.49495697021484, "learning_rate": 3.185771546187536e-07, "logits/chosen": -18.276264190673828, "logits/rejected": -17.871644973754883, "logps/chosen": -392.5175476074219, "logps/rejected": -386.01263427734375, "loss": 0.7478, "rewards/accuracies": 0.5, "rewards/chosen": 2.813962697982788, "rewards/margins": 0.711079478263855, "rewards/rejected": 2.1028833389282227, "step": 23460 }, { "epoch": 1.089651330145318, "grad_norm": 8.443062782287598, "learning_rate": 3.1849977560084804e-07, "logits/chosen": -20.522214889526367, "logits/rejected": -18.605709075927734, "logps/chosen": -485.0271911621094, "logps/rejected": -245.6271514892578, "loss": 0.2491, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.641866683959961, "rewards/margins": 3.2219557762145996, "rewards/rejected": 1.4199109077453613, "step": 23470 }, { "epoch": 1.090115604252751, "grad_norm": 92.9120101928711, "learning_rate": 3.1842239658294255e-07, "logits/chosen": -19.74573516845703, "logits/rejected": -18.612924575805664, "logps/chosen": -379.0970764160156, "logps/rejected": -247.8166046142578, "loss": 0.6188, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1353554725646973, "rewards/margins": 1.4807686805725098, "rewards/rejected": 1.6545867919921875, "step": 23480 }, { "epoch": 
1.0905798783601839, "grad_norm": 50.604312896728516, "learning_rate": 3.1834501756503706e-07, "logits/chosen": -18.55263328552246, "logits/rejected": -18.20358657836914, "logps/chosen": -359.9716796875, "logps/rejected": -345.99444580078125, "loss": 0.6858, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.025609016418457, "rewards/margins": 0.48512133955955505, "rewards/rejected": 2.540487766265869, "step": 23490 }, { "epoch": 1.0910441524676169, "grad_norm": 10.074548721313477, "learning_rate": 3.182676385471316e-07, "logits/chosen": -18.45149040222168, "logits/rejected": -18.382802963256836, "logps/chosen": -459.26953125, "logps/rejected": -420.6861877441406, "loss": 0.7386, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.383842945098877, "rewards/margins": 0.8572839498519897, "rewards/rejected": 2.5265586376190186, "step": 23500 }, { "epoch": 1.0915084265750499, "grad_norm": 166.96893310546875, "learning_rate": 3.181902595292261e-07, "logits/chosen": -17.90152931213379, "logits/rejected": -18.035430908203125, "logps/chosen": -441.52197265625, "logps/rejected": -401.1947326660156, "loss": 1.0714, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.9295811653137207, "rewards/margins": 0.521466851234436, "rewards/rejected": 3.408114194869995, "step": 23510 }, { "epoch": 1.0919727006824829, "grad_norm": 131.6863250732422, "learning_rate": 3.181128805113205e-07, "logits/chosen": -18.78844451904297, "logits/rejected": -17.634511947631836, "logps/chosen": -405.42791748046875, "logps/rejected": -378.1449279785156, "loss": 0.5149, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4471449851989746, "rewards/margins": 1.4632093906402588, "rewards/rejected": 1.9839363098144531, "step": 23520 }, { "epoch": 1.092436974789916, "grad_norm": 4.155229568481445, "learning_rate": 3.18035501493415e-07, "logits/chosen": -18.808937072753906, "logits/rejected": -18.22692108154297, "logps/chosen": -424.5877380371094, "logps/rejected": 
-352.89764404296875, "loss": 0.8603, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.6075005531311035, "rewards/margins": 0.6748709678649902, "rewards/rejected": 2.932629108428955, "step": 23530 }, { "epoch": 1.092901248897349, "grad_norm": 39.05205535888672, "learning_rate": 3.179581224755095e-07, "logits/chosen": -18.71260643005371, "logits/rejected": -18.208642959594727, "logps/chosen": -399.4681701660156, "logps/rejected": -363.89971923828125, "loss": 0.6766, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0499844551086426, "rewards/margins": 0.8546854257583618, "rewards/rejected": 2.195298910140991, "step": 23540 }, { "epoch": 1.093365523004782, "grad_norm": 129.30946350097656, "learning_rate": 3.17880743457604e-07, "logits/chosen": -18.76333999633789, "logits/rejected": -18.21973991394043, "logps/chosen": -477.84405517578125, "logps/rejected": -389.6449890136719, "loss": 0.6555, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.2993640899658203, "rewards/margins": 0.7483927011489868, "rewards/rejected": 2.550971269607544, "step": 23550 }, { "epoch": 1.093829797112215, "grad_norm": 7.263808727264404, "learning_rate": 3.1780336443969854e-07, "logits/chosen": -20.02225112915039, "logits/rejected": -18.82200050354004, "logps/chosen": -411.01873779296875, "logps/rejected": -363.42608642578125, "loss": 0.4956, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.3109779357910156, "rewards/margins": 1.2022708654403687, "rewards/rejected": 2.1087071895599365, "step": 23560 }, { "epoch": 1.094294071219648, "grad_norm": 1.2579517364501953, "learning_rate": 3.17725985421793e-07, "logits/chosen": -18.858043670654297, "logits/rejected": -17.762147903442383, "logps/chosen": -446.1314392089844, "logps/rejected": -307.8141174316406, "loss": 0.3903, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.312861919403076, "rewards/margins": 1.967411756515503, "rewards/rejected": 2.345449686050415, "step": 23570 }, { "epoch": 
1.094758345327081, "grad_norm": null, "learning_rate": 3.1765634430567807e-07, "logits/chosen": -18.314769744873047, "logits/rejected": -18.222002029418945, "logps/chosen": -337.65643310546875, "logps/rejected": -368.5634765625, "loss": 1.0366, "rewards/accuracies": 0.5, "rewards/chosen": 2.2584927082061768, "rewards/margins": -0.40310782194137573, "rewards/rejected": 2.6616008281707764, "step": 23580 }, { "epoch": 1.095222619434514, "grad_norm": 7.478865146636963, "learning_rate": 3.175789652877726e-07, "logits/chosen": -20.17803192138672, "logits/rejected": -19.611083984375, "logps/chosen": -393.602294921875, "logps/rejected": -307.04248046875, "loss": 0.3649, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.944408416748047, "rewards/margins": 1.5318453311920166, "rewards/rejected": 2.412562847137451, "step": 23590 }, { "epoch": 1.0956868935419473, "grad_norm": 154.66139221191406, "learning_rate": 3.1750158626986704e-07, "logits/chosen": -20.51028823852539, "logits/rejected": -19.101818084716797, "logps/chosen": -351.14215087890625, "logps/rejected": -265.0797119140625, "loss": 0.5033, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.197964668273926, "rewards/margins": 1.3938713073730469, "rewards/rejected": 2.804093360900879, "step": 23600 }, { "epoch": 1.0961511676493803, "grad_norm": 1.6708898544311523, "learning_rate": 3.1742420725196155e-07, "logits/chosen": -19.481901168823242, "logits/rejected": -18.369455337524414, "logps/chosen": -410.7019958496094, "logps/rejected": -308.86956787109375, "loss": 0.5929, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.126907825469971, "rewards/margins": 1.6986534595489502, "rewards/rejected": 2.4282543659210205, "step": 23610 }, { "epoch": 1.0966154417568132, "grad_norm": 70.03323364257812, "learning_rate": 3.1734682823405606e-07, "logits/chosen": -19.84410858154297, "logits/rejected": -17.994083404541016, "logps/chosen": -427.78564453125, "logps/rejected": -247.64028930664062, "loss": 
0.2502, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.926210880279541, "rewards/margins": 2.374718189239502, "rewards/rejected": 1.5514925718307495, "step": 23620 }, { "epoch": 1.0970797158642462, "grad_norm": 16.346277236938477, "learning_rate": 3.172694492161505e-07, "logits/chosen": -19.521833419799805, "logits/rejected": -18.707021713256836, "logps/chosen": -397.72216796875, "logps/rejected": -242.9692840576172, "loss": 0.6746, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0240519046783447, "rewards/margins": 0.7636985778808594, "rewards/rejected": 2.2603530883789062, "step": 23630 }, { "epoch": 1.0975439899716792, "grad_norm": 49.661094665527344, "learning_rate": 3.1719207019824503e-07, "logits/chosen": -19.520936965942383, "logits/rejected": -18.918581008911133, "logps/chosen": -487.83172607421875, "logps/rejected": -387.28912353515625, "loss": 0.588, "rewards/accuracies": 0.5, "rewards/chosen": 4.325158596038818, "rewards/margins": 1.198118805885315, "rewards/rejected": 3.1270394325256348, "step": 23640 }, { "epoch": 1.0980082640791122, "grad_norm": 157.6107635498047, "learning_rate": 3.171146911803395e-07, "logits/chosen": -19.496164321899414, "logits/rejected": -18.41576385498047, "logps/chosen": -425.7347717285156, "logps/rejected": -346.9272766113281, "loss": 0.7713, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.3042044639587402, "rewards/margins": 0.4072825312614441, "rewards/rejected": 2.8969221115112305, "step": 23650 }, { "epoch": 1.0984725381865452, "grad_norm": 93.19847869873047, "learning_rate": 3.17037312162434e-07, "logits/chosen": -18.42072105407715, "logits/rejected": -18.844871520996094, "logps/chosen": -368.765869140625, "logps/rejected": -339.0426330566406, "loss": 0.9258, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.125824451446533, "rewards/margins": 0.09465236961841583, "rewards/rejected": 3.031172513961792, "step": 23660 }, { "epoch": 1.0989368122939784, "grad_norm": 
9.47498607635498, "learning_rate": 3.169599331445285e-07, "logits/chosen": -18.504756927490234, "logits/rejected": -18.48990249633789, "logps/chosen": -262.7359313964844, "logps/rejected": -277.6759948730469, "loss": 0.6465, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.7163097858428955, "rewards/margins": 0.5692270994186401, "rewards/rejected": 2.147082567214966, "step": 23670 }, { "epoch": 1.0994010864014114, "grad_norm": 61.78432083129883, "learning_rate": 3.16882554126623e-07, "logits/chosen": -19.23967933654785, "logits/rejected": -18.565208435058594, "logps/chosen": -480.12042236328125, "logps/rejected": -397.64434814453125, "loss": 0.6865, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.6040821075439453, "rewards/margins": 0.48929286003112793, "rewards/rejected": 3.1147894859313965, "step": 23680 }, { "epoch": 1.0998653605088444, "grad_norm": 77.83318328857422, "learning_rate": 3.1680517510871754e-07, "logits/chosen": -19.529380798339844, "logits/rejected": -19.2913875579834, "logps/chosen": -438.64581298828125, "logps/rejected": -381.0133361816406, "loss": 0.6118, "rewards/accuracies": 0.5, "rewards/chosen": 4.333449363708496, "rewards/margins": 0.49861153960227966, "rewards/rejected": 3.8348374366760254, "step": 23690 }, { "epoch": 1.1003296346162774, "grad_norm": 259.7747497558594, "learning_rate": 3.16727796090812e-07, "logits/chosen": -17.703752517700195, "logits/rejected": -17.736143112182617, "logps/chosen": -349.9427185058594, "logps/rejected": -346.61932373046875, "loss": 1.5351, "rewards/accuracies": 0.5, "rewards/chosen": 2.9990601539611816, "rewards/margins": -0.6291374564170837, "rewards/rejected": 3.628197431564331, "step": 23700 }, { "epoch": 1.1007939087237104, "grad_norm": 143.83108520507812, "learning_rate": 3.166504170729065e-07, "logits/chosen": -19.316007614135742, "logits/rejected": -18.67989158630371, "logps/chosen": -443.92755126953125, "logps/rejected": -315.98175048828125, "loss": 1.1498, 
"rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.9112181663513184, "rewards/margins": 1.4167020320892334, "rewards/rejected": 2.494516372680664, "step": 23710 }, { "epoch": 1.1012581828311434, "grad_norm": 93.99537658691406, "learning_rate": 3.16573038055001e-07, "logits/chosen": -20.119152069091797, "logits/rejected": -19.087299346923828, "logps/chosen": -417.5650939941406, "logps/rejected": -321.4484558105469, "loss": 0.714, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.218691825866699, "rewards/margins": 0.9727867245674133, "rewards/rejected": 3.2459044456481934, "step": 23720 }, { "epoch": 1.1017224569385766, "grad_norm": 77.41704559326172, "learning_rate": 3.164956590370955e-07, "logits/chosen": -18.603557586669922, "logits/rejected": -18.292844772338867, "logps/chosen": -356.69989013671875, "logps/rejected": -347.8616943359375, "loss": 0.6831, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.5095183849334717, "rewards/margins": 0.2656930685043335, "rewards/rejected": 2.2438254356384277, "step": 23730 }, { "epoch": 1.1021867310460096, "grad_norm": 11.945576667785645, "learning_rate": 3.1641828001919e-07, "logits/chosen": -19.479969024658203, "logits/rejected": -18.820581436157227, "logps/chosen": -447.00946044921875, "logps/rejected": -365.9393615722656, "loss": 0.6397, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.265324354171753, "rewards/margins": 0.7597168684005737, "rewards/rejected": 2.5056076049804688, "step": 23740 }, { "epoch": 1.1026510051534426, "grad_norm": 59.258724212646484, "learning_rate": 3.1634090100128444e-07, "logits/chosen": -18.528900146484375, "logits/rejected": -17.80494499206543, "logps/chosen": -332.9281921386719, "logps/rejected": -254.52737426757812, "loss": 0.5069, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9250502586364746, "rewards/margins": 1.4321844577789307, "rewards/rejected": 1.492865800857544, "step": 23750 }, { "epoch": 1.1031152792608756, 
"grad_norm": 3.9506046772003174, "learning_rate": 3.1626352198337896e-07, "logits/chosen": -19.350826263427734, "logits/rejected": -17.397350311279297, "logps/chosen": -552.2689208984375, "logps/rejected": -368.219970703125, "loss": 0.2129, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.818522930145264, "rewards/margins": 2.4285194873809814, "rewards/rejected": 2.3900043964385986, "step": 23760 }, { "epoch": 1.1035795533683086, "grad_norm": 26.517065048217773, "learning_rate": 3.1618614296547347e-07, "logits/chosen": -18.782567977905273, "logits/rejected": -17.42556381225586, "logps/chosen": -392.18817138671875, "logps/rejected": -315.56610107421875, "loss": 0.4506, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.5276153087615967, "rewards/margins": 0.9532384872436523, "rewards/rejected": 2.5743770599365234, "step": 23770 }, { "epoch": 1.1040438274757416, "grad_norm": 44.518619537353516, "learning_rate": 3.16108763947568e-07, "logits/chosen": -19.113378524780273, "logits/rejected": -17.930051803588867, "logps/chosen": -374.2325134277344, "logps/rejected": -262.6407775878906, "loss": 0.3721, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.5484261512756348, "rewards/margins": 1.8540899753570557, "rewards/rejected": 1.6943362951278687, "step": 23780 }, { "epoch": 1.1045081015831748, "grad_norm": 24.607555389404297, "learning_rate": 3.160313849296625e-07, "logits/chosen": -18.488800048828125, "logits/rejected": -17.913503646850586, "logps/chosen": -327.6914978027344, "logps/rejected": -228.9544219970703, "loss": 0.586, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0468037128448486, "rewards/margins": 1.1291753053665161, "rewards/rejected": 1.9176286458969116, "step": 23790 }, { "epoch": 1.1049723756906078, "grad_norm": 120.38643646240234, "learning_rate": 3.1595400591175695e-07, "logits/chosen": -19.111421585083008, "logits/rejected": -18.897415161132812, "logps/chosen": -398.3230895996094, "logps/rejected": 
-383.13983154296875, "loss": 0.9175, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.07303524017334, "rewards/margins": 0.2717706859111786, "rewards/rejected": 3.801265001296997, "step": 23800 }, { "epoch": 1.1054366497980408, "grad_norm": 0.5113467574119568, "learning_rate": 3.1587662689385146e-07, "logits/chosen": -18.702016830444336, "logits/rejected": -18.101713180541992, "logps/chosen": -508.02685546875, "logps/rejected": -395.88568115234375, "loss": 0.5362, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.35380220413208, "rewards/margins": 1.1063517332077026, "rewards/rejected": 2.247450113296509, "step": 23810 }, { "epoch": 1.1059009239054738, "grad_norm": 0.11611594259738922, "learning_rate": 3.157992478759459e-07, "logits/chosen": -19.133710861206055, "logits/rejected": -17.560848236083984, "logps/chosen": -504.81024169921875, "logps/rejected": -360.71136474609375, "loss": 0.3115, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.223826885223389, "rewards/margins": 1.9908195734024048, "rewards/rejected": 2.2330076694488525, "step": 23820 }, { "epoch": 1.1063651980129068, "grad_norm": 4.470834732055664, "learning_rate": 3.1572186885804043e-07, "logits/chosen": -19.04861831665039, "logits/rejected": -17.87224006652832, "logps/chosen": -378.95794677734375, "logps/rejected": -220.9684295654297, "loss": 0.4098, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.321575880050659, "rewards/margins": 1.3783982992172241, "rewards/rejected": 1.9431774616241455, "step": 23830 }, { "epoch": 1.1068294721203398, "grad_norm": 12.782079696655273, "learning_rate": 3.1564448984013494e-07, "logits/chosen": -19.631439208984375, "logits/rejected": -18.34785270690918, "logps/chosen": -308.55633544921875, "logps/rejected": -221.06240844726562, "loss": 0.3952, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.474769115447998, "rewards/margins": 1.6275914907455444, "rewards/rejected": 1.8471777439117432, "step": 23840 }, { 
"epoch": 1.1072937462277728, "grad_norm": 32.481781005859375, "learning_rate": 3.155671108222294e-07, "logits/chosen": -18.53800392150879, "logits/rejected": -17.24138069152832, "logps/chosen": -427.892822265625, "logps/rejected": -273.00872802734375, "loss": 0.373, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.080418348312378, "rewards/margins": 1.172086477279663, "rewards/rejected": 1.9083318710327148, "step": 23850 }, { "epoch": 1.107758020335206, "grad_norm": 32.5103874206543, "learning_rate": 3.154897318043239e-07, "logits/chosen": -18.224143981933594, "logits/rejected": -18.1492919921875, "logps/chosen": -310.20086669921875, "logps/rejected": -257.90234375, "loss": 1.1824, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.6928939819335938, "rewards/margins": 0.2612166702747345, "rewards/rejected": 2.4316773414611816, "step": 23860 }, { "epoch": 1.108222294442639, "grad_norm": 15.2833890914917, "learning_rate": 3.154123527864184e-07, "logits/chosen": -19.538509368896484, "logits/rejected": -18.85896873474121, "logps/chosen": -450.390625, "logps/rejected": -311.8350524902344, "loss": 0.711, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.4957549571990967, "rewards/margins": 1.060779333114624, "rewards/rejected": 2.4349753856658936, "step": 23870 }, { "epoch": 1.108686568550072, "grad_norm": 113.70022583007812, "learning_rate": 3.1533497376851293e-07, "logits/chosen": -17.90043067932129, "logits/rejected": -18.39982032775879, "logps/chosen": -324.5069885253906, "logps/rejected": -287.66363525390625, "loss": 0.8436, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.573930025100708, "rewards/margins": 0.08755433559417725, "rewards/rejected": 2.4863758087158203, "step": 23880 }, { "epoch": 1.109150842657505, "grad_norm": 170.53968811035156, "learning_rate": 3.1525759475060744e-07, "logits/chosen": -18.806903839111328, "logits/rejected": -18.929052352905273, "logps/chosen": -386.17779541015625, "logps/rejected": 
-360.76416015625, "loss": 0.7605, "rewards/accuracies": 0.5, "rewards/chosen": 3.4046547412872314, "rewards/margins": 0.4677780270576477, "rewards/rejected": 2.9368767738342285, "step": 23890 }, { "epoch": 1.109615116764938, "grad_norm": 122.78472137451172, "learning_rate": 3.151802157327019e-07, "logits/chosen": -18.535018920898438, "logits/rejected": -17.552141189575195, "logps/chosen": -458.0567321777344, "logps/rejected": -340.324462890625, "loss": 0.7451, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.440323829650879, "rewards/margins": 1.663273572921753, "rewards/rejected": 2.777050495147705, "step": 23900 }, { "epoch": 1.110079390872371, "grad_norm": 10.411555290222168, "learning_rate": 3.151028367147964e-07, "logits/chosen": -18.182682037353516, "logits/rejected": -17.76852798461914, "logps/chosen": -458.95458984375, "logps/rejected": -396.01739501953125, "loss": 0.8596, "rewards/accuracies": 0.5, "rewards/chosen": 3.5454108715057373, "rewards/margins": 0.773404598236084, "rewards/rejected": 2.7720065116882324, "step": 23910 }, { "epoch": 1.1105436649798042, "grad_norm": 107.36468505859375, "learning_rate": 3.1502545769689087e-07, "logits/chosen": -18.52061653137207, "logits/rejected": -18.488704681396484, "logps/chosen": -327.6576843261719, "logps/rejected": -388.2378234863281, "loss": 0.6025, "rewards/accuracies": 0.5, "rewards/chosen": 3.0461480617523193, "rewards/margins": 0.4071008563041687, "rewards/rejected": 2.639047145843506, "step": 23920 }, { "epoch": 1.1110079390872372, "grad_norm": 5.723079681396484, "learning_rate": 3.149480786789854e-07, "logits/chosen": -18.784212112426758, "logits/rejected": -18.1112117767334, "logps/chosen": -352.2304382324219, "logps/rejected": -333.27825927734375, "loss": 0.766, "rewards/accuracies": 0.5, "rewards/chosen": 3.2971718311309814, "rewards/margins": 0.4457119107246399, "rewards/rejected": 2.8514599800109863, "step": 23930 }, { "epoch": 1.1114722131946702, "grad_norm": 209.30015563964844, 
"learning_rate": 3.148706996610799e-07, "logits/chosen": -18.741682052612305, "logits/rejected": -18.485780715942383, "logps/chosen": -415.4413146972656, "logps/rejected": -422.12548828125, "loss": 1.4793, "rewards/accuracies": 0.5, "rewards/chosen": 3.017015218734741, "rewards/margins": -0.49963122606277466, "rewards/rejected": 3.516646146774292, "step": 23940 }, { "epoch": 1.1119364873021031, "grad_norm": 2.1000845432281494, "learning_rate": 3.1479332064317435e-07, "logits/chosen": -18.079299926757812, "logits/rejected": -17.04787826538086, "logps/chosen": -591.2175903320312, "logps/rejected": -384.8963623046875, "loss": 0.4184, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.5031938552856445, "rewards/margins": 2.365018844604492, "rewards/rejected": 2.1381752490997314, "step": 23950 }, { "epoch": 1.1124007614095361, "grad_norm": 239.04598999023438, "learning_rate": 3.1471594162526886e-07, "logits/chosen": -18.431772232055664, "logits/rejected": -18.08534812927246, "logps/chosen": -459.01031494140625, "logps/rejected": -439.7025451660156, "loss": 1.0826, "rewards/accuracies": 0.5, "rewards/chosen": 3.2924556732177734, "rewards/margins": 0.34887534379959106, "rewards/rejected": 2.943580150604248, "step": 23960 }, { "epoch": 1.1128650355169691, "grad_norm": 74.67398834228516, "learning_rate": 3.146385626073634e-07, "logits/chosen": -18.263051986694336, "logits/rejected": -17.510440826416016, "logps/chosen": -392.2004699707031, "logps/rejected": -257.5137939453125, "loss": 0.2594, "rewards/accuracies": 1.0, "rewards/chosen": 3.2425460815429688, "rewards/margins": 1.4664649963378906, "rewards/rejected": 1.776080846786499, "step": 23970 }, { "epoch": 1.1133293096244024, "grad_norm": 218.18557739257812, "learning_rate": 3.145611835894579e-07, "logits/chosen": -19.15296745300293, "logits/rejected": -18.294017791748047, "logps/chosen": -511.7364196777344, "logps/rejected": -387.1968688964844, "loss": 0.6759, "rewards/accuracies": 0.800000011920929, 
"rewards/chosen": 4.598146915435791, "rewards/margins": 1.514525294303894, "rewards/rejected": 3.0836217403411865, "step": 23980 }, { "epoch": 1.1137935837318353, "grad_norm": 34.42813491821289, "learning_rate": 3.144838045715524e-07, "logits/chosen": -17.931110382080078, "logits/rejected": -16.49444007873535, "logps/chosen": -508.13690185546875, "logps/rejected": -322.06781005859375, "loss": 0.3023, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.386436462402344, "rewards/margins": 2.499068021774292, "rewards/rejected": 1.8873687982559204, "step": 23990 }, { "epoch": 1.1142578578392683, "grad_norm": 63.02931213378906, "learning_rate": 3.1440642555364686e-07, "logits/chosen": -18.53812026977539, "logits/rejected": -19.39073371887207, "logps/chosen": -463.68780517578125, "logps/rejected": -528.6476440429688, "loss": 1.219, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.76491641998291, "rewards/margins": -0.03257008641958237, "rewards/rejected": 4.797487258911133, "step": 24000 }, { "epoch": 1.1147221319467013, "grad_norm": 1.0924381017684937, "learning_rate": 3.143290465357413e-07, "logits/chosen": -18.771039962768555, "logits/rejected": -17.982885360717773, "logps/chosen": -388.1264343261719, "logps/rejected": -229.8395233154297, "loss": 0.7903, "rewards/accuracies": 0.5, "rewards/chosen": 3.1273207664489746, "rewards/margins": 1.459646463394165, "rewards/rejected": 1.6676738262176514, "step": 24010 }, { "epoch": 1.1151864060541343, "grad_norm": 10.599212646484375, "learning_rate": 3.142516675178358e-07, "logits/chosen": -18.552682876586914, "logits/rejected": -18.268497467041016, "logps/chosen": -253.21585083007812, "logps/rejected": -250.7066192626953, "loss": 0.5741, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.9178273677825928, "rewards/margins": 0.643200159072876, "rewards/rejected": 1.2746270895004272, "step": 24020 }, { "epoch": 1.1156506801615673, "grad_norm": 63.87752151489258, "learning_rate": 
3.1417428849993034e-07, "logits/chosen": -19.7392520904541, "logits/rejected": -18.85271453857422, "logps/chosen": -519.6151123046875, "logps/rejected": -392.72918701171875, "loss": 0.6435, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.8490073680877686, "rewards/margins": 0.4305371344089508, "rewards/rejected": 3.4184703826904297, "step": 24030 }, { "epoch": 1.1161149542690003, "grad_norm": 133.50482177734375, "learning_rate": 3.1409690948202485e-07, "logits/chosen": -18.927127838134766, "logits/rejected": -19.463844299316406, "logps/chosen": -431.80413818359375, "logps/rejected": -505.61932373046875, "loss": 0.8271, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.4483375549316406, "rewards/margins": 0.2234613001346588, "rewards/rejected": 3.2248764038085938, "step": 24040 }, { "epoch": 1.1165792283764335, "grad_norm": 26.982746124267578, "learning_rate": 3.140195304641193e-07, "logits/chosen": -19.01226043701172, "logits/rejected": -18.687667846679688, "logps/chosen": -388.5165100097656, "logps/rejected": -247.2863311767578, "loss": 0.4327, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.3280391693115234, "rewards/margins": 1.7340961694717407, "rewards/rejected": 1.5939427614212036, "step": 24050 }, { "epoch": 1.1170435024838665, "grad_norm": 44.737483978271484, "learning_rate": 3.139421514462138e-07, "logits/chosen": -19.733211517333984, "logits/rejected": -18.816381454467773, "logps/chosen": -271.87579345703125, "logps/rejected": -204.76348876953125, "loss": 0.399, "rewards/accuracies": 1.0, "rewards/chosen": 2.4961702823638916, "rewards/margins": 0.8022998571395874, "rewards/rejected": 1.693870186805725, "step": 24060 }, { "epoch": 1.1175077765912995, "grad_norm": 47.831912994384766, "learning_rate": 3.1386477242830833e-07, "logits/chosen": -18.665531158447266, "logits/rejected": -17.80643653869629, "logps/chosen": -320.21142578125, "logps/rejected": -311.19879150390625, "loss": 0.5248, "rewards/accuracies": 
0.800000011920929, "rewards/chosen": 3.1371374130249023, "rewards/margins": 0.9389855265617371, "rewards/rejected": 2.1981520652770996, "step": 24070 }, { "epoch": 1.1179720506987325, "grad_norm": 89.70541381835938, "learning_rate": 3.1378739341040284e-07, "logits/chosen": -18.730430603027344, "logits/rejected": -18.346925735473633, "logps/chosen": -379.0732116699219, "logps/rejected": -306.0775451660156, "loss": 0.6582, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.287797212600708, "rewards/margins": 0.86314457654953, "rewards/rejected": 2.424652576446533, "step": 24080 }, { "epoch": 1.1184363248061655, "grad_norm": 8.186108589172363, "learning_rate": 3.1371001439249735e-07, "logits/chosen": -19.285480499267578, "logits/rejected": -18.143840789794922, "logps/chosen": -391.4630432128906, "logps/rejected": -318.1307067871094, "loss": 0.4276, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.146877288818359, "rewards/margins": 1.7587636709213257, "rewards/rejected": 2.388113498687744, "step": 24090 }, { "epoch": 1.1189005989135985, "grad_norm": 109.4122314453125, "learning_rate": 3.136326353745918e-07, "logits/chosen": -19.31805992126465, "logits/rejected": -17.502662658691406, "logps/chosen": -429.22998046875, "logps/rejected": -258.1659240722656, "loss": 0.3381, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.6716198921203613, "rewards/margins": 2.011875867843628, "rewards/rejected": 1.6597436666488647, "step": 24100 }, { "epoch": 1.1193648730210317, "grad_norm": 269.2696228027344, "learning_rate": 3.1355525635668627e-07, "logits/chosen": -17.65668296813965, "logits/rejected": -17.8774471282959, "logps/chosen": -249.64697265625, "logps/rejected": -269.15289306640625, "loss": 0.9412, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.7392055988311768, "rewards/margins": 0.10881845653057098, "rewards/rejected": 1.6303870677947998, "step": 24110 }, { "epoch": 1.1198291471284647, "grad_norm": 104.22077178955078, 
"learning_rate": 3.134778773387808e-07, "logits/chosen": -18.958057403564453, "logits/rejected": -18.28006935119629, "logps/chosen": -372.79473876953125, "logps/rejected": -335.31317138671875, "loss": 0.5603, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.8882689476013184, "rewards/margins": 0.8939735293388367, "rewards/rejected": 2.994295597076416, "step": 24120 }, { "epoch": 1.1202934212358977, "grad_norm": 36.87474822998047, "learning_rate": 3.134004983208753e-07, "logits/chosen": -18.759977340698242, "logits/rejected": -17.9652156829834, "logps/chosen": -430.52197265625, "logps/rejected": -348.96221923828125, "loss": 0.4104, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.2343146800994873, "rewards/margins": 1.0162322521209717, "rewards/rejected": 2.2180824279785156, "step": 24130 }, { "epoch": 1.1207576953433307, "grad_norm": 8.97262191772461, "learning_rate": 3.133231193029698e-07, "logits/chosen": -18.941747665405273, "logits/rejected": -19.116695404052734, "logps/chosen": -356.7261657714844, "logps/rejected": -318.2901916503906, "loss": 0.7724, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.22575306892395, "rewards/margins": 0.21527652442455292, "rewards/rejected": 3.010476589202881, "step": 24140 }, { "epoch": 1.1212219694507637, "grad_norm": 69.5738296508789, "learning_rate": 3.1324574028506426e-07, "logits/chosen": -19.688085556030273, "logits/rejected": -18.774263381958008, "logps/chosen": -470.6417541503906, "logps/rejected": -424.39971923828125, "loss": 1.0546, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.754999876022339, "rewards/margins": 0.19027535617351532, "rewards/rejected": 3.564725160598755, "step": 24150 }, { "epoch": 1.1216862435581967, "grad_norm": 14.26781177520752, "learning_rate": 3.1316836126715877e-07, "logits/chosen": -18.453433990478516, "logits/rejected": -17.508136749267578, "logps/chosen": -399.88433837890625, "logps/rejected": -282.27044677734375, "loss": 0.4485, 
"rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.571669340133667, "rewards/margins": 1.3635547161102295, "rewards/rejected": 2.2081143856048584, "step": 24160 }, { "epoch": 1.12215051766563, "grad_norm": 58.72050857543945, "learning_rate": 3.130909822492533e-07, "logits/chosen": -18.524730682373047, "logits/rejected": -18.0074520111084, "logps/chosen": -297.51727294921875, "logps/rejected": -225.291015625, "loss": 0.7122, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.0399279594421387, "rewards/margins": 0.3035494387149811, "rewards/rejected": 1.7363786697387695, "step": 24170 }, { "epoch": 1.122614791773063, "grad_norm": 159.02685546875, "learning_rate": 3.130136032313478e-07, "logits/chosen": -18.96017074584961, "logits/rejected": -18.608234405517578, "logps/chosen": -469.87078857421875, "logps/rejected": -332.01165771484375, "loss": 1.0545, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.460420608520508, "rewards/margins": 0.8018089532852173, "rewards/rejected": 3.658612012863159, "step": 24180 }, { "epoch": 1.1230790658804959, "grad_norm": 2.815793037414551, "learning_rate": 3.129362242134423e-07, "logits/chosen": -19.400434494018555, "logits/rejected": -19.092296600341797, "logps/chosen": -347.50567626953125, "logps/rejected": -288.3646240234375, "loss": 0.9127, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.5224931240081787, "rewards/margins": 0.5271433591842651, "rewards/rejected": 1.9953498840332031, "step": 24190 }, { "epoch": 1.1235433399879289, "grad_norm": 133.939453125, "learning_rate": 3.128588451955368e-07, "logits/chosen": -19.310848236083984, "logits/rejected": -18.741865158081055, "logps/chosen": -373.090087890625, "logps/rejected": -321.19366455078125, "loss": 0.6497, "rewards/accuracies": 0.5, "rewards/chosen": 2.6476454734802246, "rewards/margins": 0.5426754355430603, "rewards/rejected": 2.1049699783325195, "step": 24200 }, { "epoch": 1.1240076140953619, "grad_norm": 71.11193084716797, 
"learning_rate": 3.127814661776312e-07, "logits/chosen": -17.918306350708008, "logits/rejected": -16.818798065185547, "logps/chosen": -511.9375915527344, "logps/rejected": -368.39935302734375, "loss": 0.4881, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.6116783618927, "rewards/margins": 1.4425184726715088, "rewards/rejected": 2.1691598892211914, "step": 24210 }, { "epoch": 1.1244718882027949, "grad_norm": 20.144651412963867, "learning_rate": 3.1270408715972573e-07, "logits/chosen": -18.790374755859375, "logits/rejected": -17.014278411865234, "logps/chosen": -352.1968688964844, "logps/rejected": -176.3994903564453, "loss": 0.754, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.197209358215332, "rewards/margins": 1.765655279159546, "rewards/rejected": 1.4315539598464966, "step": 24220 }, { "epoch": 1.1249361623102279, "grad_norm": 0.6897250413894653, "learning_rate": 3.1262670814182025e-07, "logits/chosen": -18.228469848632812, "logits/rejected": -17.456262588500977, "logps/chosen": -414.34429931640625, "logps/rejected": -277.5496826171875, "loss": 0.5326, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.647355794906616, "rewards/margins": 1.260093331336975, "rewards/rejected": 1.3872625827789307, "step": 24230 }, { "epoch": 1.125400436417661, "grad_norm": 140.0797882080078, "learning_rate": 3.1254932912391476e-07, "logits/chosen": -18.263376235961914, "logits/rejected": -17.398771286010742, "logps/chosen": -432.6363830566406, "logps/rejected": -243.7658233642578, "loss": 0.7217, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5615813732147217, "rewards/margins": 1.5172834396362305, "rewards/rejected": 2.044297933578491, "step": 24240 }, { "epoch": 1.125864710525094, "grad_norm": 114.546142578125, "learning_rate": 3.124719501060092e-07, "logits/chosen": -19.788837432861328, "logits/rejected": -18.131351470947266, "logps/chosen": -403.80120849609375, "logps/rejected": -272.9461975097656, "loss": 0.4004, 
"rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1915204524993896, "rewards/margins": 1.9129482507705688, "rewards/rejected": 1.2785720825195312, "step": 24250 }, { "epoch": 1.126328984632527, "grad_norm": 30.702308654785156, "learning_rate": 3.123945710881037e-07, "logits/chosen": -19.947607040405273, "logits/rejected": -18.55327796936035, "logps/chosen": -399.49365234375, "logps/rejected": -292.0592041015625, "loss": 0.62, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.0753540992736816, "rewards/margins": 0.8991939425468445, "rewards/rejected": 2.1761600971221924, "step": 24260 }, { "epoch": 1.12679325873996, "grad_norm": 7.505984783172607, "learning_rate": 3.1231719207019824e-07, "logits/chosen": -20.376543045043945, "logits/rejected": -18.889528274536133, "logps/chosen": -384.43817138671875, "logps/rejected": -314.76275634765625, "loss": 0.4439, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4261245727539062, "rewards/margins": 1.1475306749343872, "rewards/rejected": 2.2785942554473877, "step": 24270 }, { "epoch": 1.127257532847393, "grad_norm": 61.122196197509766, "learning_rate": 3.1223981305229275e-07, "logits/chosen": -18.84298324584961, "logits/rejected": -18.888446807861328, "logps/chosen": -294.54937744140625, "logps/rejected": -394.6133728027344, "loss": 1.448, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.1128973960876465, "rewards/margins": -0.7153667211532593, "rewards/rejected": 2.828263759613037, "step": 24280 }, { "epoch": 1.127721806954826, "grad_norm": 285.3262023925781, "learning_rate": 3.1216243403438726e-07, "logits/chosen": -18.972164154052734, "logits/rejected": -18.61385726928711, "logps/chosen": -459.6900329589844, "logps/rejected": -410.98388671875, "loss": 0.8193, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.7298569679260254, "rewards/margins": 0.29863736033439636, "rewards/rejected": 3.4312195777893066, "step": 24290 }, { "epoch": 1.128186081062259, "grad_norm": 
4.4976420402526855, "learning_rate": 3.1208505501648167e-07, "logits/chosen": -19.194133758544922, "logits/rejected": -18.700212478637695, "logps/chosen": -388.981689453125, "logps/rejected": -349.67340087890625, "loss": 0.6313, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.274534702301025, "rewards/margins": 1.1562683582305908, "rewards/rejected": 3.1182665824890137, "step": 24300 }, { "epoch": 1.1286503551696923, "grad_norm": 0.5129907131195068, "learning_rate": 3.120076759985762e-07, "logits/chosen": -18.634353637695312, "logits/rejected": -18.708459854125977, "logps/chosen": -414.02484130859375, "logps/rejected": -427.0397033691406, "loss": 1.1601, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.509883403778076, "rewards/margins": 0.2051698863506317, "rewards/rejected": 3.304713487625122, "step": 24310 }, { "epoch": 1.1291146292771252, "grad_norm": 45.69752883911133, "learning_rate": 3.119302969806707e-07, "logits/chosen": -19.380273818969727, "logits/rejected": -19.054744720458984, "logps/chosen": -483.433837890625, "logps/rejected": -427.04345703125, "loss": 0.8697, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.221928119659424, "rewards/margins": 0.7276755571365356, "rewards/rejected": 3.4942517280578613, "step": 24320 }, { "epoch": 1.1295789033845582, "grad_norm": 71.96358489990234, "learning_rate": 3.118529179627652e-07, "logits/chosen": -18.184659957885742, "logits/rejected": -17.393238067626953, "logps/chosen": -316.171142578125, "logps/rejected": -265.7117004394531, "loss": 0.6444, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.3261334896087646, "rewards/margins": 1.0184643268585205, "rewards/rejected": 1.3076694011688232, "step": 24330 }, { "epoch": 1.1300431774919912, "grad_norm": 171.60447692871094, "learning_rate": 3.117755389448597e-07, "logits/chosen": -20.050243377685547, "logits/rejected": -19.52678871154785, "logps/chosen": -400.45977783203125, "logps/rejected": -377.4561767578125, 
"loss": 0.7605, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.165140151977539, "rewards/margins": 0.9040935635566711, "rewards/rejected": 2.2610466480255127, "step": 24340 }, { "epoch": 1.1305074515994242, "grad_norm": 0.03810914605855942, "learning_rate": 3.116981599269542e-07, "logits/chosen": -19.511600494384766, "logits/rejected": -18.314529418945312, "logps/chosen": -408.2522888183594, "logps/rejected": -310.64825439453125, "loss": 0.4782, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.33206033706665, "rewards/margins": 2.185014009475708, "rewards/rejected": 2.1470465660095215, "step": 24350 }, { "epoch": 1.1309717257068574, "grad_norm": 6.843425273895264, "learning_rate": 3.116207809090487e-07, "logits/chosen": -18.77463150024414, "logits/rejected": -18.1354923248291, "logps/chosen": -455.20166015625, "logps/rejected": -346.4813537597656, "loss": 0.5157, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.84161376953125, "rewards/margins": 1.1841518878936768, "rewards/rejected": 2.6574621200561523, "step": 24360 }, { "epoch": 1.1314359998142904, "grad_norm": 60.71798324584961, "learning_rate": 3.115434018911432e-07, "logits/chosen": -18.404998779296875, "logits/rejected": -18.413589477539062, "logps/chosen": -383.03582763671875, "logps/rejected": -279.7375183105469, "loss": 0.8777, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.0395240783691406, "rewards/margins": 0.09889280796051025, "rewards/rejected": 1.9406315088272095, "step": 24370 }, { "epoch": 1.1319002739217234, "grad_norm": 9.628904342651367, "learning_rate": 3.114660228732377e-07, "logits/chosen": -17.862625122070312, "logits/rejected": -17.902755737304688, "logps/chosen": -431.09014892578125, "logps/rejected": -417.91961669921875, "loss": 1.4411, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.3294124603271484, "rewards/margins": 0.2239028960466385, "rewards/rejected": 3.1055095195770264, "step": 24380 }, { "epoch": 
1.1323645480291564, "grad_norm": 32.338985443115234, "learning_rate": 3.113886438553322e-07, "logits/chosen": -19.369443893432617, "logits/rejected": -18.982141494750977, "logps/chosen": -483.767578125, "logps/rejected": -443.1947326660156, "loss": 0.4342, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.6303205490112305, "rewards/margins": 1.5079114437103271, "rewards/rejected": 3.122408628463745, "step": 24390 }, { "epoch": 1.1328288221365894, "grad_norm": 63.74126052856445, "learning_rate": 3.113112648374266e-07, "logits/chosen": -18.53420066833496, "logits/rejected": -18.587976455688477, "logps/chosen": -333.5859069824219, "logps/rejected": -385.2942199707031, "loss": 1.313, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.5967049598693848, "rewards/margins": -0.3338729739189148, "rewards/rejected": 2.9305782318115234, "step": 24400 }, { "epoch": 1.1332930962440224, "grad_norm": 27.04631996154785, "learning_rate": 3.1123388581952113e-07, "logits/chosen": -18.624286651611328, "logits/rejected": -17.89998435974121, "logps/chosen": -477.86859130859375, "logps/rejected": -254.62930297851562, "loss": 0.5637, "rewards/accuracies": 0.5, "rewards/chosen": 3.5149714946746826, "rewards/margins": 1.394728422164917, "rewards/rejected": 2.1202433109283447, "step": 24410 }, { "epoch": 1.1337573703514554, "grad_norm": 1.830390214920044, "learning_rate": 3.1115650680161564e-07, "logits/chosen": -18.288570404052734, "logits/rejected": -17.47722816467285, "logps/chosen": -356.5258483886719, "logps/rejected": -302.778076171875, "loss": 0.6126, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.5643038749694824, "rewards/margins": 0.9124223589897156, "rewards/rejected": 1.6518815755844116, "step": 24420 }, { "epoch": 1.1342216444588886, "grad_norm": 195.5532684326172, "learning_rate": 3.1107912778371015e-07, "logits/chosen": -17.654905319213867, "logits/rejected": -18.25214195251465, "logps/chosen": -254.99075317382812, "logps/rejected": 
-310.73687744140625, "loss": 1.2726, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 1.8343963623046875, "rewards/margins": -0.7524839639663696, "rewards/rejected": 2.5868802070617676, "step": 24430 }, { "epoch": 1.1346859185663216, "grad_norm": 134.15036010742188, "learning_rate": 3.1100174876580466e-07, "logits/chosen": -20.037158966064453, "logits/rejected": -18.66564178466797, "logps/chosen": -360.18524169921875, "logps/rejected": -256.54510498046875, "loss": 0.6544, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.0551857948303223, "rewards/margins": 0.7396355867385864, "rewards/rejected": 2.3155500888824463, "step": 24440 }, { "epoch": 1.1351501926737546, "grad_norm": 99.75232696533203, "learning_rate": 3.109243697478992e-07, "logits/chosen": -19.57830238342285, "logits/rejected": -18.729230880737305, "logps/chosen": -424.25762939453125, "logps/rejected": -338.3169860839844, "loss": 0.5795, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.928805351257324, "rewards/margins": 1.5921971797943115, "rewards/rejected": 3.3366081714630127, "step": 24450 }, { "epoch": 1.1356144667811876, "grad_norm": 9.793439865112305, "learning_rate": 3.1084699072999363e-07, "logits/chosen": -20.786653518676758, "logits/rejected": -19.964801788330078, "logps/chosen": -340.80303955078125, "logps/rejected": -368.2825622558594, "loss": 0.6016, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.9774508476257324, "rewards/margins": 0.6618199348449707, "rewards/rejected": 2.315631151199341, "step": 24460 }, { "epoch": 1.1360787408886206, "grad_norm": 40.77851867675781, "learning_rate": 3.1076961171208815e-07, "logits/chosen": -18.923282623291016, "logits/rejected": -17.127586364746094, "logps/chosen": -419.7859802246094, "logps/rejected": -275.1879577636719, "loss": 0.4492, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.892296314239502, "rewards/margins": 1.8515942096710205, "rewards/rejected": 2.0407018661499023, "step": 
24470 }, { "epoch": 1.1365430149960536, "grad_norm": 48.517513275146484, "learning_rate": 3.1069223269418266e-07, "logits/chosen": -19.02880859375, "logits/rejected": -17.519786834716797, "logps/chosen": -338.67047119140625, "logps/rejected": -269.67864990234375, "loss": 0.6548, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.310929775238037, "rewards/margins": 1.674791932106018, "rewards/rejected": 1.6361382007598877, "step": 24480 }, { "epoch": 1.1370072891034866, "grad_norm": 37.349002838134766, "learning_rate": 3.106148536762771e-07, "logits/chosen": -19.286197662353516, "logits/rejected": -18.740924835205078, "logps/chosen": -300.3794250488281, "logps/rejected": -268.8108215332031, "loss": 0.3269, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.6028733253479004, "rewards/margins": 1.5725903511047363, "rewards/rejected": 1.0302832126617432, "step": 24490 }, { "epoch": 1.1374715632109198, "grad_norm": 45.286659240722656, "learning_rate": 3.1053747465837163e-07, "logits/chosen": -18.90161895751953, "logits/rejected": -19.040437698364258, "logps/chosen": -407.66961669921875, "logps/rejected": -391.2325439453125, "loss": 0.6164, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.062755823135376, "rewards/margins": 0.8440993428230286, "rewards/rejected": 2.218656301498413, "step": 24500 }, { "epoch": 1.1379358373183528, "grad_norm": 140.8688201904297, "learning_rate": 3.104600956404661e-07, "logits/chosen": -18.499954223632812, "logits/rejected": -17.939119338989258, "logps/chosen": -393.09344482421875, "logps/rejected": -328.4131774902344, "loss": 1.0305, "rewards/accuracies": 0.5, "rewards/chosen": 4.424117088317871, "rewards/margins": 0.7230623960494995, "rewards/rejected": 3.701054334640503, "step": 24510 }, { "epoch": 1.1384001114257858, "grad_norm": 40.95003890991211, "learning_rate": 3.103827166225606e-07, "logits/chosen": -18.641164779663086, "logits/rejected": -18.48673439025879, "logps/chosen": -387.66204833984375, 
"logps/rejected": -384.876953125, "loss": 0.7826, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1701931953430176, "rewards/margins": 0.2868611216545105, "rewards/rejected": 2.8833320140838623, "step": 24520 }, { "epoch": 1.1388643855332188, "grad_norm": 37.31190490722656, "learning_rate": 3.103053376046551e-07, "logits/chosen": -19.270448684692383, "logits/rejected": -19.043109893798828, "logps/chosen": -429.8648986816406, "logps/rejected": -446.22528076171875, "loss": 0.9213, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.7132046222686768, "rewards/margins": 0.0968535915017128, "rewards/rejected": 3.6163506507873535, "step": 24530 }, { "epoch": 1.1393286596406518, "grad_norm": 32.62922668457031, "learning_rate": 3.102279585867496e-07, "logits/chosen": -20.59941291809082, "logits/rejected": -20.246429443359375, "logps/chosen": -316.3010559082031, "logps/rejected": -286.2996826171875, "loss": 0.4989, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.1609930992126465, "rewards/margins": 0.969015896320343, "rewards/rejected": 2.1919772624969482, "step": 24540 }, { "epoch": 1.139792933748085, "grad_norm": 73.8326644897461, "learning_rate": 3.1015057956884413e-07, "logits/chosen": -18.05329132080078, "logits/rejected": -17.767053604125977, "logps/chosen": -257.02197265625, "logps/rejected": -221.67471313476562, "loss": 0.6412, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.8741285800933838, "rewards/margins": 0.6896181106567383, "rewards/rejected": 1.1845104694366455, "step": 24550 }, { "epoch": 1.140257207855518, "grad_norm": 94.6172103881836, "learning_rate": 3.100732005509386e-07, "logits/chosen": -19.878028869628906, "logits/rejected": -19.248210906982422, "logps/chosen": -446.5581970214844, "logps/rejected": -373.0621337890625, "loss": 0.7969, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.319000244140625, "rewards/margins": 1.4445682764053345, "rewards/rejected": 2.874431848526001, "step": 24560 
}, { "epoch": 1.140721481962951, "grad_norm": 8.988649368286133, "learning_rate": 3.099958215330331e-07, "logits/chosen": -18.322355270385742, "logits/rejected": -17.455413818359375, "logps/chosen": -350.8175354003906, "logps/rejected": -251.02536010742188, "loss": 0.6533, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.4379076957702637, "rewards/margins": 0.8955137133598328, "rewards/rejected": 1.5423941612243652, "step": 24570 }, { "epoch": 1.141185756070384, "grad_norm": 9.603087425231934, "learning_rate": 3.099184425151276e-07, "logits/chosen": -18.026792526245117, "logits/rejected": -16.756317138671875, "logps/chosen": -469.9532165527344, "logps/rejected": -259.75689697265625, "loss": 0.2158, "rewards/accuracies": 1.0, "rewards/chosen": 4.058655261993408, "rewards/margins": 2.2406582832336426, "rewards/rejected": 1.8179969787597656, "step": 24580 }, { "epoch": 1.141650030177817, "grad_norm": 206.9143524169922, "learning_rate": 3.0984106349722207e-07, "logits/chosen": -18.79177474975586, "logits/rejected": -18.146774291992188, "logps/chosen": -393.99444580078125, "logps/rejected": -310.5331115722656, "loss": 0.9279, "rewards/accuracies": 0.5, "rewards/chosen": 3.4743449687957764, "rewards/margins": 0.8206311464309692, "rewards/rejected": 2.6537137031555176, "step": 24590 }, { "epoch": 1.14211430428525, "grad_norm": 239.43807983398438, "learning_rate": 3.097636844793166e-07, "logits/chosen": -19.61035919189453, "logits/rejected": -18.872699737548828, "logps/chosen": -439.13372802734375, "logps/rejected": -281.2743225097656, "loss": 0.7943, "rewards/accuracies": 0.5, "rewards/chosen": 3.2204482555389404, "rewards/margins": 0.7492088079452515, "rewards/rejected": 2.4712393283843994, "step": 24600 }, { "epoch": 1.142578578392683, "grad_norm": 53.36684799194336, "learning_rate": 3.0968630546141104e-07, "logits/chosen": -18.86734962463379, "logits/rejected": -18.095396041870117, "logps/chosen": -330.90081787109375, "logps/rejected": -282.66119384765625, 
"loss": 0.5241, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.012359380722046, "rewards/margins": 0.6152466535568237, "rewards/rejected": 2.3971128463745117, "step": 24610 }, { "epoch": 1.1430428525001162, "grad_norm": 7.758335590362549, "learning_rate": 3.0960892644350555e-07, "logits/chosen": -18.04253578186035, "logits/rejected": -17.88692855834961, "logps/chosen": -351.32464599609375, "logps/rejected": -290.7980651855469, "loss": 0.7387, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.060965061187744, "rewards/margins": 1.1052162647247314, "rewards/rejected": 1.9557489156723022, "step": 24620 }, { "epoch": 1.1435071266075492, "grad_norm": 25.179006576538086, "learning_rate": 3.0953154742560006e-07, "logits/chosen": -19.28447723388672, "logits/rejected": -17.924104690551758, "logps/chosen": -355.977783203125, "logps/rejected": -317.37249755859375, "loss": 0.7879, "rewards/accuracies": 0.5, "rewards/chosen": 2.997772693634033, "rewards/margins": 0.6210967302322388, "rewards/rejected": 2.376675844192505, "step": 24630 }, { "epoch": 1.1439714007149822, "grad_norm": 157.61224365234375, "learning_rate": 3.0945416840769457e-07, "logits/chosen": -19.20417022705078, "logits/rejected": -18.344989776611328, "logps/chosen": -551.7657470703125, "logps/rejected": -416.060546875, "loss": 0.3042, "rewards/accuracies": 1.0, "rewards/chosen": 4.221466064453125, "rewards/margins": 1.6850887537002563, "rewards/rejected": 2.536377429962158, "step": 24640 }, { "epoch": 1.1444356748224151, "grad_norm": 69.36983489990234, "learning_rate": 3.093767893897891e-07, "logits/chosen": -19.698381423950195, "logits/rejected": -18.900543212890625, "logps/chosen": -484.2069396972656, "logps/rejected": -394.01434326171875, "loss": 0.86, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.537766218185425, "rewards/margins": 1.0176234245300293, "rewards/rejected": 2.5201427936553955, "step": 24650 }, { "epoch": 1.1448999489298481, "grad_norm": 
43.58606719970703, "learning_rate": 3.0929941037188354e-07, "logits/chosen": -19.23459243774414, "logits/rejected": -18.994474411010742, "logps/chosen": -452.3128967285156, "logps/rejected": -383.340087890625, "loss": 1.2838, "rewards/accuracies": 0.5, "rewards/chosen": 3.4611237049102783, "rewards/margins": -0.3377370834350586, "rewards/rejected": 3.798860549926758, "step": 24660 }, { "epoch": 1.1453642230372811, "grad_norm": 18.55373191833496, "learning_rate": 3.0922203135397805e-07, "logits/chosen": -19.06411361694336, "logits/rejected": -18.337890625, "logps/chosen": -273.1245422363281, "logps/rejected": -216.7994384765625, "loss": 0.4771, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9292073249816895, "rewards/margins": 0.8469840884208679, "rewards/rejected": 2.082223415374756, "step": 24670 }, { "epoch": 1.1458284971447141, "grad_norm": 22.04636573791504, "learning_rate": 3.091446523360725e-07, "logits/chosen": -20.066532135009766, "logits/rejected": -18.23899269104004, "logps/chosen": -347.8873291015625, "logps/rejected": -236.1519317626953, "loss": 0.7505, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.317578077316284, "rewards/margins": 1.330942153930664, "rewards/rejected": 1.9866359233856201, "step": 24680 }, { "epoch": 1.1462927712521473, "grad_norm": 29.88972282409668, "learning_rate": 3.09067273318167e-07, "logits/chosen": -19.022598266601562, "logits/rejected": -18.908708572387695, "logps/chosen": -438.29559326171875, "logps/rejected": -419.5445251464844, "loss": 1.0436, "rewards/accuracies": 0.5, "rewards/chosen": 3.4513473510742188, "rewards/margins": 0.12181756645441055, "rewards/rejected": 3.329529285430908, "step": 24690 }, { "epoch": 1.1467570453595803, "grad_norm": 37.84848403930664, "learning_rate": 3.0898989430026154e-07, "logits/chosen": -18.5443058013916, "logits/rejected": -17.39474105834961, "logps/chosen": -420.7560119628906, "logps/rejected": -267.69647216796875, "loss": 0.4812, "rewards/accuracies": 
0.800000011920929, "rewards/chosen": 3.469115734100342, "rewards/margins": 1.3097370862960815, "rewards/rejected": 2.1593785285949707, "step": 24700 }, { "epoch": 1.1472213194670133, "grad_norm": 14.496050834655762, "learning_rate": 3.08912515282356e-07, "logits/chosen": -19.38127326965332, "logits/rejected": -18.434494018554688, "logps/chosen": -488.4247131347656, "logps/rejected": -433.4266662597656, "loss": 0.632, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.7745590209960938, "rewards/margins": 0.6347377896308899, "rewards/rejected": 3.139821767807007, "step": 24710 }, { "epoch": 1.1476855935744463, "grad_norm": 16.784868240356445, "learning_rate": 3.088351362644505e-07, "logits/chosen": -17.560672760009766, "logits/rejected": -17.107858657836914, "logps/chosen": -389.9533996582031, "logps/rejected": -273.44647216796875, "loss": 0.8311, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.14031982421875, "rewards/margins": 0.6065540909767151, "rewards/rejected": 1.5337655544281006, "step": 24720 }, { "epoch": 1.1481498676818793, "grad_norm": 163.32188415527344, "learning_rate": 3.08757757246545e-07, "logits/chosen": -19.907634735107422, "logits/rejected": -18.292757034301758, "logps/chosen": -368.0910949707031, "logps/rejected": -303.51715087890625, "loss": 0.3156, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.950098752975464, "rewards/margins": 1.6266279220581055, "rewards/rejected": 2.3234708309173584, "step": 24730 }, { "epoch": 1.1486141417893123, "grad_norm": 49.82249069213867, "learning_rate": 3.0868037822863953e-07, "logits/chosen": -18.795719146728516, "logits/rejected": -17.548599243164062, "logps/chosen": -315.3118896484375, "logps/rejected": -233.37301635742188, "loss": 0.6617, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.663600444793701, "rewards/margins": 0.8553635478019714, "rewards/rejected": 1.808236837387085, "step": 24740 }, { "epoch": 1.1490784158967455, "grad_norm": 93.64647674560547, 
"learning_rate": 3.0860299921073404e-07, "logits/chosen": -19.070375442504883, "logits/rejected": -19.458721160888672, "logps/chosen": -358.06378173828125, "logps/rejected": -395.01739501953125, "loss": 1.5386, "rewards/accuracies": 0.5, "rewards/chosen": 2.9489943981170654, "rewards/margins": -0.9613062739372253, "rewards/rejected": 3.9103007316589355, "step": 24750 }, { "epoch": 1.1495426900041785, "grad_norm": 6.612910270690918, "learning_rate": 3.085256201928285e-07, "logits/chosen": -19.817943572998047, "logits/rejected": -18.817516326904297, "logps/chosen": -481.9209899902344, "logps/rejected": -338.68145751953125, "loss": 0.4149, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.9743576049804688, "rewards/margins": 1.560545563697815, "rewards/rejected": 2.4138121604919434, "step": 24760 }, { "epoch": 1.1500069641116115, "grad_norm": 45.9445915222168, "learning_rate": 3.08448241174923e-07, "logits/chosen": -18.05725860595703, "logits/rejected": -17.617219924926758, "logps/chosen": -339.5686950683594, "logps/rejected": -264.6923828125, "loss": 0.4644, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.6420247554779053, "rewards/margins": 1.3296313285827637, "rewards/rejected": 1.312393307685852, "step": 24770 }, { "epoch": 1.1504712382190445, "grad_norm": 0.5941656827926636, "learning_rate": 3.0837086215701747e-07, "logits/chosen": -18.55386734008789, "logits/rejected": -17.905290603637695, "logps/chosen": -427.1650390625, "logps/rejected": -312.4210510253906, "loss": 0.6888, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.8203086853027344, "rewards/margins": 1.4363170862197876, "rewards/rejected": 2.3839917182922363, "step": 24780 }, { "epoch": 1.1509355123264775, "grad_norm": 23.60670280456543, "learning_rate": 3.08293483139112e-07, "logits/chosen": -18.226987838745117, "logits/rejected": -17.383312225341797, "logps/chosen": -385.7773132324219, "logps/rejected": -329.2974548339844, "loss": 0.5516, "rewards/accuracies": 
0.8999999761581421, "rewards/chosen": 3.0883474349975586, "rewards/margins": 1.2355743646621704, "rewards/rejected": 1.8527733087539673, "step": 24790 }, { "epoch": 1.1513997864339105, "grad_norm": 18.789875030517578, "learning_rate": 3.082161041212065e-07, "logits/chosen": -19.482784271240234, "logits/rejected": -18.802968978881836, "logps/chosen": -266.4732971191406, "logps/rejected": -239.30459594726562, "loss": 0.7239, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.7122788429260254, "rewards/margins": 0.6977864503860474, "rewards/rejected": 2.0144922733306885, "step": 24800 }, { "epoch": 1.1518640605413437, "grad_norm": 32.464088439941406, "learning_rate": 3.0813872510330095e-07, "logits/chosen": -17.81731605529785, "logits/rejected": -17.205427169799805, "logps/chosen": -391.58282470703125, "logps/rejected": -322.89996337890625, "loss": 0.7888, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.078716993331909, "rewards/margins": 1.0315577983856201, "rewards/rejected": 2.047159433364868, "step": 24810 }, { "epoch": 1.1523283346487767, "grad_norm": 87.8274154663086, "learning_rate": 3.0806134608539546e-07, "logits/chosen": -18.553081512451172, "logits/rejected": -17.07524871826172, "logps/chosen": -352.13372802734375, "logps/rejected": -229.2607879638672, "loss": 0.8305, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.491375684738159, "rewards/margins": 0.7629965543746948, "rewards/rejected": 1.728379249572754, "step": 24820 }, { "epoch": 1.1527926087562097, "grad_norm": 84.39122772216797, "learning_rate": 3.0798396706748997e-07, "logits/chosen": -18.079870223999023, "logits/rejected": -18.097915649414062, "logps/chosen": -361.305419921875, "logps/rejected": -399.81658935546875, "loss": 0.7628, "rewards/accuracies": 0.5, "rewards/chosen": 3.2667853832244873, "rewards/margins": 0.5618180632591248, "rewards/rejected": 2.7049672603607178, "step": 24830 }, { "epoch": 1.1532568828636427, "grad_norm": 128.39244079589844, 
"learning_rate": 3.079065880495845e-07, "logits/chosen": -18.430959701538086, "logits/rejected": -17.530479431152344, "logps/chosen": -402.9192810058594, "logps/rejected": -260.15869140625, "loss": 0.439, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.927687406539917, "rewards/margins": 1.6286401748657227, "rewards/rejected": 2.299046754837036, "step": 24840 }, { "epoch": 1.1537211569710757, "grad_norm": 65.38572692871094, "learning_rate": 3.07829209031679e-07, "logits/chosen": -18.099811553955078, "logits/rejected": -17.13656997680664, "logps/chosen": -386.9835205078125, "logps/rejected": -364.58135986328125, "loss": 0.4549, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.6692397594451904, "rewards/margins": 1.1533050537109375, "rewards/rejected": 2.515934467315674, "step": 24850 }, { "epoch": 1.1541854310785087, "grad_norm": 7.5340256690979, "learning_rate": 3.0775183001377345e-07, "logits/chosen": -19.59457778930664, "logits/rejected": -18.839649200439453, "logps/chosen": -412.7320861816406, "logps/rejected": -424.8140563964844, "loss": 0.5056, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.6745407581329346, "rewards/margins": 1.1321651935577393, "rewards/rejected": 2.542375326156616, "step": 24860 }, { "epoch": 1.1546497051859417, "grad_norm": 44.404327392578125, "learning_rate": 3.0767445099586796e-07, "logits/chosen": -19.60042953491211, "logits/rejected": -18.729022979736328, "logps/chosen": -487.23876953125, "logps/rejected": -443.0165100097656, "loss": 0.5592, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.832197666168213, "rewards/margins": 1.1893961429595947, "rewards/rejected": 2.642801284790039, "step": 24870 }, { "epoch": 1.155113979293375, "grad_norm": 15.614289283752441, "learning_rate": 3.075970719779624e-07, "logits/chosen": -18.724159240722656, "logits/rejected": -18.66192626953125, "logps/chosen": -393.06353759765625, "logps/rejected": -401.64788818359375, "loss": 0.8842, 
"rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.6825172901153564, "rewards/margins": 0.7897273302078247, "rewards/rejected": 2.8927900791168213, "step": 24880 }, { "epoch": 1.1555782534008079, "grad_norm": 4.607479572296143, "learning_rate": 3.0751969296005693e-07, "logits/chosen": -18.614553451538086, "logits/rejected": -18.022262573242188, "logps/chosen": -472.7361755371094, "logps/rejected": -396.62322998046875, "loss": 1.1946, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.923168182373047, "rewards/margins": 0.8932567834854126, "rewards/rejected": 3.029911518096924, "step": 24890 }, { "epoch": 1.1560425275082409, "grad_norm": 6.418002605438232, "learning_rate": 3.0744231394215144e-07, "logits/chosen": -19.182138442993164, "logits/rejected": -18.51876449584961, "logps/chosen": -528.4222412109375, "logps/rejected": -346.07562255859375, "loss": 0.3162, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.46046781539917, "rewards/margins": 1.8926700353622437, "rewards/rejected": 2.5677974224090576, "step": 24900 }, { "epoch": 1.1565068016156739, "grad_norm": 49.81993103027344, "learning_rate": 3.073649349242459e-07, "logits/chosen": -19.02436065673828, "logits/rejected": -18.067806243896484, "logps/chosen": -345.8201599121094, "logps/rejected": -217.8888702392578, "loss": 0.46, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.1109519004821777, "rewards/margins": 1.2704713344573975, "rewards/rejected": 1.8404804468154907, "step": 24910 }, { "epoch": 1.1569710757231069, "grad_norm": 15.225174903869629, "learning_rate": 3.072875559063404e-07, "logits/chosen": -18.563222885131836, "logits/rejected": -17.483232498168945, "logps/chosen": -391.50213623046875, "logps/rejected": -281.54925537109375, "loss": 0.465, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0525312423706055, "rewards/margins": 0.9258192777633667, "rewards/rejected": 2.1267123222351074, "step": 24920 }, { "epoch": 1.1574353498305399, 
"grad_norm": 6.119211196899414, "learning_rate": 3.072101768884349e-07, "logits/chosen": -18.65982437133789, "logits/rejected": -18.060672760009766, "logps/chosen": -428.15924072265625, "logps/rejected": -407.11529541015625, "loss": 1.3225, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.555014133453369, "rewards/margins": 0.5373662710189819, "rewards/rejected": 3.0176475048065186, "step": 24930 }, { "epoch": 1.1578996239379729, "grad_norm": 150.30081176757812, "learning_rate": 3.0713279787052944e-07, "logits/chosen": -18.16788101196289, "logits/rejected": -18.068920135498047, "logps/chosen": -296.43389892578125, "logps/rejected": -273.39178466796875, "loss": 0.8012, "rewards/accuracies": 0.5, "rewards/chosen": 2.403677463531494, "rewards/margins": 0.393033504486084, "rewards/rejected": 2.01064395904541, "step": 24940 }, { "epoch": 1.158363898045406, "grad_norm": 75.44208526611328, "learning_rate": 3.0705541885262395e-07, "logits/chosen": -19.210800170898438, "logits/rejected": -18.45511245727539, "logps/chosen": -430.3074645996094, "logps/rejected": -320.0809020996094, "loss": 0.3455, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.063990592956543, "rewards/margins": 1.6536022424697876, "rewards/rejected": 2.410388469696045, "step": 24950 }, { "epoch": 1.158828172152839, "grad_norm": 255.28091430664062, "learning_rate": 3.069780398347184e-07, "logits/chosen": -18.57036781311035, "logits/rejected": -17.783977508544922, "logps/chosen": -482.8949279785156, "logps/rejected": -397.2802734375, "loss": 0.8569, "rewards/accuracies": 0.5, "rewards/chosen": 3.6740269660949707, "rewards/margins": 0.6460489630699158, "rewards/rejected": 3.0279781818389893, "step": 24960 }, { "epoch": 1.159292446260272, "grad_norm": 24.39899253845215, "learning_rate": 3.0690066081681286e-07, "logits/chosen": -18.88703727722168, "logits/rejected": -17.838115692138672, "logps/chosen": -428.3321228027344, "logps/rejected": -256.0451965332031, "loss": 0.3555, 
"rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.139129161834717, "rewards/margins": 1.9470129013061523, "rewards/rejected": 2.1921167373657227, "step": 24970 }, { "epoch": 1.159756720367705, "grad_norm": 23.223976135253906, "learning_rate": 3.068232817989074e-07, "logits/chosen": -18.939626693725586, "logits/rejected": -18.580060958862305, "logps/chosen": -437.6976623535156, "logps/rejected": -396.830322265625, "loss": 0.9814, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.8435521125793457, "rewards/margins": 0.21856336295604706, "rewards/rejected": 2.6249887943267822, "step": 24980 }, { "epoch": 1.160220994475138, "grad_norm": 159.52813720703125, "learning_rate": 3.067459027810019e-07, "logits/chosen": -19.634231567382812, "logits/rejected": -18.705963134765625, "logps/chosen": -376.5760803222656, "logps/rejected": -411.0506286621094, "loss": 0.9225, "rewards/accuracies": 0.5, "rewards/chosen": 3.1513843536376953, "rewards/margins": 0.03036665916442871, "rewards/rejected": 3.1210174560546875, "step": 24990 }, { "epoch": 1.1606852685825713, "grad_norm": 52.609764099121094, "learning_rate": 3.066685237630964e-07, "logits/chosen": -18.39927101135254, "logits/rejected": -17.944963455200195, "logps/chosen": -315.59942626953125, "logps/rejected": -250.365966796875, "loss": 0.4145, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.992974042892456, "rewards/margins": 0.9950039982795715, "rewards/rejected": 0.9979701042175293, "step": 25000 }, { "epoch": 1.1611495426900043, "grad_norm": 198.53079223632812, "learning_rate": 3.0659114474519086e-07, "logits/chosen": -18.221899032592773, "logits/rejected": -17.494789123535156, "logps/chosen": -344.57537841796875, "logps/rejected": -227.20321655273438, "loss": 0.8633, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.339843273162842, "rewards/margins": 1.5219638347625732, "rewards/rejected": 1.8178799152374268, "step": 25010 }, { "epoch": 1.1616138167974372, "grad_norm": 
6.6295695304870605, "learning_rate": 3.0651376572728537e-07, "logits/chosen": -18.651966094970703, "logits/rejected": -18.456932067871094, "logps/chosen": -457.04534912109375, "logps/rejected": -377.08770751953125, "loss": 0.5532, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.751547336578369, "rewards/margins": 1.6783530712127686, "rewards/rejected": 3.0731940269470215, "step": 25020 }, { "epoch": 1.1620780909048702, "grad_norm": 36.370323181152344, "learning_rate": 3.064363867093799e-07, "logits/chosen": -19.110498428344727, "logits/rejected": -17.594205856323242, "logps/chosen": -527.1854858398438, "logps/rejected": -375.9322814941406, "loss": 0.5936, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 5.091368198394775, "rewards/margins": 1.8828216791152954, "rewards/rejected": 3.2085468769073486, "step": 25030 }, { "epoch": 1.1625423650123032, "grad_norm": 153.7802734375, "learning_rate": 3.063590076914744e-07, "logits/chosen": -20.14563751220703, "logits/rejected": -19.303083419799805, "logps/chosen": -540.3405151367188, "logps/rejected": -451.1446228027344, "loss": 0.9978, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 4.151596546173096, "rewards/margins": 0.4546695351600647, "rewards/rejected": 3.6969268321990967, "step": 25040 }, { "epoch": 1.1630066391197362, "grad_norm": 109.86103057861328, "learning_rate": 3.062816286735689e-07, "logits/chosen": -18.492393493652344, "logits/rejected": -17.485803604125977, "logps/chosen": -343.94036865234375, "logps/rejected": -235.52566528320312, "loss": 0.7392, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.135019063949585, "rewards/margins": 0.8120207786560059, "rewards/rejected": 1.322998285293579, "step": 25050 }, { "epoch": 1.1634709132271692, "grad_norm": 115.02781677246094, "learning_rate": 3.0620424965566336e-07, "logits/chosen": -18.328536987304688, "logits/rejected": -18.044063568115234, "logps/chosen": -345.6527404785156, "logps/rejected": -368.1960144042969, 
"loss": 0.7914, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.8517754077911377, "rewards/margins": 0.3221534192562103, "rewards/rejected": 3.5296223163604736, "step": 25060 }, { "epoch": 1.1639351873346024, "grad_norm": 15.415960311889648, "learning_rate": 3.061268706377578e-07, "logits/chosen": -18.123647689819336, "logits/rejected": -17.081645965576172, "logps/chosen": -307.12921142578125, "logps/rejected": -213.8245849609375, "loss": 0.5877, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.882254123687744, "rewards/margins": 1.5510680675506592, "rewards/rejected": 1.331186056137085, "step": 25070 }, { "epoch": 1.1643994614420354, "grad_norm": 115.69042205810547, "learning_rate": 3.0604949161985233e-07, "logits/chosen": -17.655771255493164, "logits/rejected": -17.40310287475586, "logps/chosen": -404.3396911621094, "logps/rejected": -299.88421630859375, "loss": 0.9432, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.7294697761535645, "rewards/margins": 0.7559390664100647, "rewards/rejected": 1.9735307693481445, "step": 25080 }, { "epoch": 1.1648637355494684, "grad_norm": 29.090417861938477, "learning_rate": 3.0597211260194684e-07, "logits/chosen": -19.276874542236328, "logits/rejected": -17.80048179626465, "logps/chosen": -407.2208557128906, "logps/rejected": -344.0690002441406, "loss": 0.4866, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5399887561798096, "rewards/margins": 1.0466108322143555, "rewards/rejected": 2.493378162384033, "step": 25090 }, { "epoch": 1.1653280096569014, "grad_norm": 31.003711700439453, "learning_rate": 3.0589473358404135e-07, "logits/chosen": -18.61983871459961, "logits/rejected": -17.81890296936035, "logps/chosen": -375.346923828125, "logps/rejected": -295.0904541015625, "loss": 0.6276, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1623663902282715, "rewards/margins": 1.3920238018035889, "rewards/rejected": 1.7703425884246826, "step": 25100 }, { "epoch": 
1.1657922837643344, "grad_norm": 12.400678634643555, "learning_rate": 3.058173545661358e-07, "logits/chosen": -19.26125717163086, "logits/rejected": -18.116785049438477, "logps/chosen": -413.5345764160156, "logps/rejected": -355.18170166015625, "loss": 0.4066, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.3558149337768555, "rewards/margins": 1.540466547012329, "rewards/rejected": 2.8153488636016846, "step": 25110 }, { "epoch": 1.1662565578717674, "grad_norm": 3.2978906631469727, "learning_rate": 3.057399755482303e-07, "logits/chosen": -20.162479400634766, "logits/rejected": -18.352136611938477, "logps/chosen": -421.853271484375, "logps/rejected": -254.62277221679688, "loss": 0.6712, "rewards/accuracies": 0.5, "rewards/chosen": 4.069762229919434, "rewards/margins": 1.7214720249176025, "rewards/rejected": 2.3482909202575684, "step": 25120 }, { "epoch": 1.1667208319792004, "grad_norm": 37.50197982788086, "learning_rate": 3.0566259653032483e-07, "logits/chosen": -18.487516403198242, "logits/rejected": -17.712352752685547, "logps/chosen": -412.509765625, "logps/rejected": -326.6039123535156, "loss": 0.5565, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.141676425933838, "rewards/margins": 0.5737687349319458, "rewards/rejected": 2.5679078102111816, "step": 25130 }, { "epoch": 1.1671851060866336, "grad_norm": 49.71088409423828, "learning_rate": 3.0558521751241934e-07, "logits/chosen": -18.871082305908203, "logits/rejected": -17.854568481445312, "logps/chosen": -389.7537536621094, "logps/rejected": -321.39056396484375, "loss": 0.8742, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.8099520206451416, "rewards/margins": 0.02025054767727852, "rewards/rejected": 2.789701461791992, "step": 25140 }, { "epoch": 1.1676493801940666, "grad_norm": 54.209232330322266, "learning_rate": 3.0550783849451386e-07, "logits/chosen": -18.272686004638672, "logits/rejected": -19.07791519165039, "logps/chosen": -376.82025146484375, "logps/rejected": 
-376.2998352050781, "loss": 0.7932, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.3787028789520264, "rewards/margins": 0.2699181139469147, "rewards/rejected": 3.1087844371795654, "step": 25150 }, { "epoch": 1.1681136543014996, "grad_norm": 1.462844967842102, "learning_rate": 3.0543045947660826e-07, "logits/chosen": -18.518327713012695, "logits/rejected": -17.655672073364258, "logps/chosen": -351.3077087402344, "logps/rejected": -291.517578125, "loss": 0.6827, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.331465244293213, "rewards/margins": 1.2114083766937256, "rewards/rejected": 2.120056629180908, "step": 25160 }, { "epoch": 1.1685779284089326, "grad_norm": 40.87935256958008, "learning_rate": 3.0535308045870277e-07, "logits/chosen": -18.426563262939453, "logits/rejected": -17.314462661743164, "logps/chosen": -394.38397216796875, "logps/rejected": -245.97225952148438, "loss": 0.5958, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.449427843093872, "rewards/margins": 1.5726304054260254, "rewards/rejected": 1.876797080039978, "step": 25170 }, { "epoch": 1.1690422025163656, "grad_norm": 67.9564437866211, "learning_rate": 3.052757014407973e-07, "logits/chosen": -18.617311477661133, "logits/rejected": -18.484766006469727, "logps/chosen": -485.05419921875, "logps/rejected": -431.3285217285156, "loss": 0.9256, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.033996820449829, "rewards/margins": -0.09240882098674774, "rewards/rejected": 3.126405715942383, "step": 25180 }, { "epoch": 1.1695064766237988, "grad_norm": 177.05734252929688, "learning_rate": 3.051983224228918e-07, "logits/chosen": -18.346553802490234, "logits/rejected": -17.851825714111328, "logps/chosen": -335.61968994140625, "logps/rejected": -268.646728515625, "loss": 0.8815, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.8835700750350952, "rewards/margins": 0.16158917546272278, "rewards/rejected": 1.7219808101654053, "step": 25190 }, { 
"epoch": 1.1699707507312318, "grad_norm": 14.90299129486084, "learning_rate": 3.051209434049863e-07, "logits/chosen": -19.071216583251953, "logits/rejected": -18.017227172851562, "logps/chosen": -371.4818115234375, "logps/rejected": -275.6458740234375, "loss": 0.5753, "rewards/accuracies": 0.5, "rewards/chosen": 3.0097947120666504, "rewards/margins": 1.0176211595535278, "rewards/rejected": 1.9921735525131226, "step": 25200 }, { "epoch": 1.1704350248386648, "grad_norm": 230.8051300048828, "learning_rate": 3.0504356438708076e-07, "logits/chosen": -18.358943939208984, "logits/rejected": -18.344783782958984, "logps/chosen": -421.0782775878906, "logps/rejected": -323.1424865722656, "loss": 1.5294, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.7920143604278564, "rewards/margins": -0.20133638381958008, "rewards/rejected": 2.9933502674102783, "step": 25210 }, { "epoch": 1.1708992989460978, "grad_norm": 43.51859664916992, "learning_rate": 3.049661853691753e-07, "logits/chosen": -17.505847930908203, "logits/rejected": -17.3222599029541, "logps/chosen": -342.10931396484375, "logps/rejected": -318.4178466796875, "loss": 0.6061, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9300763607025146, "rewards/margins": 0.7228396534919739, "rewards/rejected": 2.2072367668151855, "step": 25220 }, { "epoch": 1.1713635730535308, "grad_norm": 39.093482971191406, "learning_rate": 3.048888063512698e-07, "logits/chosen": -18.443218231201172, "logits/rejected": -17.587663650512695, "logps/chosen": -286.4349670410156, "logps/rejected": -213.8407440185547, "loss": 0.3961, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.8575360774993896, "rewards/margins": 1.5773650407791138, "rewards/rejected": 1.2801711559295654, "step": 25230 }, { "epoch": 1.1718278471609638, "grad_norm": 46.57522964477539, "learning_rate": 3.048114273333643e-07, "logits/chosen": -19.34331512451172, "logits/rejected": -19.161338806152344, "logps/chosen": -381.88323974609375, 
"logps/rejected": -328.19769287109375, "loss": 0.6518, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5921008586883545, "rewards/margins": 1.165915608406067, "rewards/rejected": 2.426184892654419, "step": 25240 }, { "epoch": 1.1722921212683968, "grad_norm": 60.797393798828125, "learning_rate": 3.047340483154588e-07, "logits/chosen": -18.659896850585938, "logits/rejected": -18.43937873840332, "logps/chosen": -346.62469482421875, "logps/rejected": -268.2078857421875, "loss": 0.5395, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.319145679473877, "rewards/margins": 1.2398111820220947, "rewards/rejected": 2.079334259033203, "step": 25250 }, { "epoch": 1.17275639537583, "grad_norm": 51.46422576904297, "learning_rate": 3.046566692975532e-07, "logits/chosen": -18.83843231201172, "logits/rejected": -18.331539154052734, "logps/chosen": -453.489501953125, "logps/rejected": -401.20135498046875, "loss": 0.672, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4674084186553955, "rewards/margins": 0.32796791195869446, "rewards/rejected": 3.1394407749176025, "step": 25260 }, { "epoch": 1.173220669483263, "grad_norm": 128.1038360595703, "learning_rate": 3.045792902796477e-07, "logits/chosen": -18.91939926147461, "logits/rejected": -18.258258819580078, "logps/chosen": -535.4929809570312, "logps/rejected": -425.7318420410156, "loss": 0.6588, "rewards/accuracies": 0.5, "rewards/chosen": 4.001722812652588, "rewards/margins": 0.6124790906906128, "rewards/rejected": 3.3892436027526855, "step": 25270 }, { "epoch": 1.173684943590696, "grad_norm": 79.63016510009766, "learning_rate": 3.0450191126174224e-07, "logits/chosen": -18.09982681274414, "logits/rejected": -17.84674644470215, "logps/chosen": -472.26904296875, "logps/rejected": -435.55731201171875, "loss": 0.5858, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.984848976135254, "rewards/margins": 0.7837188839912415, "rewards/rejected": 4.201129913330078, "step": 25280 }, { "epoch": 
1.174149217698129, "grad_norm": 68.54210662841797, "learning_rate": 3.0442453224383675e-07, "logits/chosen": -18.658042907714844, "logits/rejected": -18.600818634033203, "logps/chosen": -421.37030029296875, "logps/rejected": -323.90283203125, "loss": 0.6363, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.328495740890503, "rewards/margins": 0.8712321519851685, "rewards/rejected": 2.4572641849517822, "step": 25290 }, { "epoch": 1.174613491805562, "grad_norm": 24.853721618652344, "learning_rate": 3.0434715322593126e-07, "logits/chosen": -19.75240135192871, "logits/rejected": -17.89173698425293, "logps/chosen": -468.39605712890625, "logps/rejected": -383.63482666015625, "loss": 0.5483, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.445258378982544, "rewards/margins": 1.0813506841659546, "rewards/rejected": 2.363907814025879, "step": 25300 }, { "epoch": 1.175077765912995, "grad_norm": 149.8687286376953, "learning_rate": 3.042697742080257e-07, "logits/chosen": -18.76113510131836, "logits/rejected": -18.207599639892578, "logps/chosen": -521.5303955078125, "logps/rejected": -436.07269287109375, "loss": 0.4857, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.195471286773682, "rewards/margins": 1.4661109447479248, "rewards/rejected": 2.7293601036071777, "step": 25310 }, { "epoch": 1.175542040020428, "grad_norm": 45.63612747192383, "learning_rate": 3.0419239519012023e-07, "logits/chosen": -19.577831268310547, "logits/rejected": -18.378643035888672, "logps/chosen": -350.35528564453125, "logps/rejected": -327.3486022949219, "loss": 0.4899, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.772076368331909, "rewards/margins": 1.0479453802108765, "rewards/rejected": 2.7241311073303223, "step": 25320 }, { "epoch": 1.1760063141278612, "grad_norm": 75.50975799560547, "learning_rate": 3.0411501617221474e-07, "logits/chosen": -18.791610717773438, "logits/rejected": -18.022518157958984, "logps/chosen": -405.98126220703125, 
"logps/rejected": -299.455810546875, "loss": 0.5385, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4077460765838623, "rewards/margins": 0.8881512880325317, "rewards/rejected": 2.5195953845977783, "step": 25330 }, { "epoch": 1.1764705882352942, "grad_norm": 122.50552368164062, "learning_rate": 3.0403763715430925e-07, "logits/chosen": -19.446487426757812, "logits/rejected": -18.795080184936523, "logps/chosen": -396.1374206542969, "logps/rejected": -335.0435791015625, "loss": 0.9421, "rewards/accuracies": 0.5, "rewards/chosen": 3.894169330596924, "rewards/margins": 0.5443300008773804, "rewards/rejected": 3.349839448928833, "step": 25340 }, { "epoch": 1.1769348623427272, "grad_norm": 6.204151153564453, "learning_rate": 3.0396025813640376e-07, "logits/chosen": -19.326133728027344, "logits/rejected": -18.858335494995117, "logps/chosen": -457.66412353515625, "logps/rejected": -448.422607421875, "loss": 0.7349, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.26991605758667, "rewards/margins": 0.9669147729873657, "rewards/rejected": 3.3030014038085938, "step": 25350 }, { "epoch": 1.1773991364501601, "grad_norm": 70.97268676757812, "learning_rate": 3.0388287911849817e-07, "logits/chosen": -20.077152252197266, "logits/rejected": -18.210981369018555, "logps/chosen": -416.77789306640625, "logps/rejected": -279.3345642089844, "loss": 0.6101, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.274136781692505, "rewards/margins": 1.6749982833862305, "rewards/rejected": 1.5991384983062744, "step": 25360 }, { "epoch": 1.1778634105575931, "grad_norm": 53.26245880126953, "learning_rate": 3.038055001005927e-07, "logits/chosen": -18.765743255615234, "logits/rejected": -18.671451568603516, "logps/chosen": -385.9814453125, "logps/rejected": -340.9190368652344, "loss": 1.0503, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.7413721084594727, "rewards/margins": -0.35403895378112793, "rewards/rejected": 3.0954110622406006, "step": 25370 }, 
{ "epoch": 1.1783276846650264, "grad_norm": 3.5299055576324463, "learning_rate": 3.037281210826872e-07, "logits/chosen": -18.341135025024414, "logits/rejected": -17.721050262451172, "logps/chosen": -433.8148498535156, "logps/rejected": -343.9341735839844, "loss": 0.9824, "rewards/accuracies": 0.5, "rewards/chosen": 4.259739875793457, "rewards/margins": 1.1293938159942627, "rewards/rejected": 3.1303467750549316, "step": 25380 }, { "epoch": 1.1787919587724593, "grad_norm": 9.987923622131348, "learning_rate": 3.036507420647817e-07, "logits/chosen": -19.491952896118164, "logits/rejected": -17.453163146972656, "logps/chosen": -459.53289794921875, "logps/rejected": -270.66973876953125, "loss": 0.2236, "rewards/accuracies": 1.0, "rewards/chosen": 4.742327690124512, "rewards/margins": 2.2886338233947754, "rewards/rejected": 2.4536936283111572, "step": 25390 }, { "epoch": 1.1792562328798923, "grad_norm": 25.683809280395508, "learning_rate": 3.035733630468762e-07, "logits/chosen": -19.449176788330078, "logits/rejected": -17.920602798461914, "logps/chosen": -457.80206298828125, "logps/rejected": -273.8723449707031, "loss": 0.3397, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.3748371601104736, "rewards/margins": 1.8735967874526978, "rewards/rejected": 1.5012402534484863, "step": 25400 }, { "epoch": 1.1797205069873253, "grad_norm": 21.96602439880371, "learning_rate": 3.0349598402897067e-07, "logits/chosen": -19.84390640258789, "logits/rejected": -19.052879333496094, "logps/chosen": -439.03338623046875, "logps/rejected": -385.3146057128906, "loss": 0.8122, "rewards/accuracies": 0.5, "rewards/chosen": 3.6278316974639893, "rewards/margins": 0.598254919052124, "rewards/rejected": 3.0295770168304443, "step": 25410 }, { "epoch": 1.1801847810947583, "grad_norm": 21.682052612304688, "learning_rate": 3.034186050110652e-07, "logits/chosen": -18.180213928222656, "logits/rejected": -17.642803192138672, "logps/chosen": -327.4105529785156, "logps/rejected": -328.64892578125, 
"loss": 0.9812, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.6876139640808105, "rewards/margins": 0.2977997362613678, "rewards/rejected": 2.389814615249634, "step": 25420 }, { "epoch": 1.1806490552021913, "grad_norm": 50.37309265136719, "learning_rate": 3.033412259931597e-07, "logits/chosen": -17.729106903076172, "logits/rejected": -16.759843826293945, "logps/chosen": -333.11773681640625, "logps/rejected": -190.95828247070312, "loss": 0.416, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0802738666534424, "rewards/margins": 2.0614023208618164, "rewards/rejected": 1.018871545791626, "step": 25430 }, { "epoch": 1.1811133293096243, "grad_norm": 30.297786712646484, "learning_rate": 3.032638469752542e-07, "logits/chosen": -19.12320899963379, "logits/rejected": -19.075284957885742, "logps/chosen": -389.7353210449219, "logps/rejected": -346.22869873046875, "loss": 1.1148, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.90659499168396, "rewards/margins": -0.03734240680932999, "rewards/rejected": 2.9439375400543213, "step": 25440 }, { "epoch": 1.1815776034170575, "grad_norm": 59.07012939453125, "learning_rate": 3.0318646795734866e-07, "logits/chosen": -19.110301971435547, "logits/rejected": -19.098058700561523, "logps/chosen": -371.75958251953125, "logps/rejected": -345.81903076171875, "loss": 0.6592, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.2930309772491455, "rewards/margins": 0.33442461490631104, "rewards/rejected": 2.958606719970703, "step": 25450 }, { "epoch": 1.1820418775244905, "grad_norm": 105.5409927368164, "learning_rate": 3.0311682684123374e-07, "logits/chosen": -18.503347396850586, "logits/rejected": -18.584632873535156, "logps/chosen": -474.75457763671875, "logps/rejected": -411.9891052246094, "loss": 0.9908, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.4850831031799316, "rewards/margins": 0.4550386369228363, "rewards/rejected": 3.0300443172454834, "step": 25460 }, { "epoch": 
1.1825061516319235, "grad_norm": 262.11834716796875, "learning_rate": 3.030394478233282e-07, "logits/chosen": -17.368236541748047, "logits/rejected": -18.897077560424805, "logps/chosen": -396.0313415527344, "logps/rejected": -463.2447204589844, "loss": 1.9151, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.8186354637145996, "rewards/margins": -0.516265869140625, "rewards/rejected": 4.334901332855225, "step": 25470 }, { "epoch": 1.1829704257393565, "grad_norm": 46.9211311340332, "learning_rate": 3.029620688054227e-07, "logits/chosen": -19.26204490661621, "logits/rejected": -19.282291412353516, "logps/chosen": -350.9829406738281, "logps/rejected": -292.2918395996094, "loss": 0.8563, "rewards/accuracies": 0.5, "rewards/chosen": 3.543205738067627, "rewards/margins": 0.22739486396312714, "rewards/rejected": 3.3158111572265625, "step": 25480 }, { "epoch": 1.1834346998467895, "grad_norm": 44.410953521728516, "learning_rate": 3.0288468978751717e-07, "logits/chosen": -18.0606689453125, "logits/rejected": -17.780750274658203, "logps/chosen": -338.07830810546875, "logps/rejected": -304.2325134277344, "loss": 0.7294, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.3442769050598145, "rewards/margins": 0.4476719796657562, "rewards/rejected": 2.8966050148010254, "step": 25490 }, { "epoch": 1.1838989739542225, "grad_norm": 207.16268920898438, "learning_rate": 3.028073107696117e-07, "logits/chosen": -19.224546432495117, "logits/rejected": -17.484220504760742, "logps/chosen": -546.391845703125, "logps/rejected": -408.7865295410156, "loss": 0.5956, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.674742221832275, "rewards/margins": 1.906355619430542, "rewards/rejected": 2.7683863639831543, "step": 25500 }, { "epoch": 1.1843632480616555, "grad_norm": 30.189682006835938, "learning_rate": 3.027299317517062e-07, "logits/chosen": -18.053401947021484, "logits/rejected": -18.70326805114746, "logps/chosen": -319.4779968261719, "logps/rejected": 
-395.26861572265625, "loss": 1.5435, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.1399991512298584, "rewards/margins": -0.974532961845398, "rewards/rejected": 3.114532232284546, "step": 25510 }, { "epoch": 1.1848275221690887, "grad_norm": 0.8362933397293091, "learning_rate": 3.026525527338007e-07, "logits/chosen": -18.73992156982422, "logits/rejected": -17.21108627319336, "logps/chosen": -411.21624755859375, "logps/rejected": -252.3016815185547, "loss": 0.5143, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4893717765808105, "rewards/margins": 2.3040170669555664, "rewards/rejected": 1.1853549480438232, "step": 25520 }, { "epoch": 1.1852917962765217, "grad_norm": 30.94882583618164, "learning_rate": 3.025751737158952e-07, "logits/chosen": -19.219697952270508, "logits/rejected": -17.833078384399414, "logps/chosen": -335.37432861328125, "logps/rejected": -235.4477081298828, "loss": 1.0026, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.900256633758545, "rewards/margins": 0.7115973234176636, "rewards/rejected": 2.188659191131592, "step": 25530 }, { "epoch": 1.1857560703839547, "grad_norm": 106.7374038696289, "learning_rate": 3.0249779469798967e-07, "logits/chosen": -19.665851593017578, "logits/rejected": -18.49062156677246, "logps/chosen": -426.14080810546875, "logps/rejected": -304.46661376953125, "loss": 0.3955, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.7743191719055176, "rewards/margins": 1.5261919498443604, "rewards/rejected": 2.2481274604797363, "step": 25540 }, { "epoch": 1.1862203444913877, "grad_norm": 31.572301864624023, "learning_rate": 3.024204156800842e-07, "logits/chosen": -19.4635066986084, "logits/rejected": -18.338747024536133, "logps/chosen": -344.9144592285156, "logps/rejected": -224.93112182617188, "loss": 0.2699, "rewards/accuracies": 1.0, "rewards/chosen": 3.4582104682922363, "rewards/margins": 1.588602066040039, "rewards/rejected": 1.8696081638336182, "step": 25550 }, { "epoch": 
1.1866846185988207, "grad_norm": 246.56680297851562, "learning_rate": 3.023430366621787e-07, "logits/chosen": -18.65308952331543, "logits/rejected": -17.873043060302734, "logps/chosen": -360.94793701171875, "logps/rejected": -323.2242126464844, "loss": 0.655, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1884799003601074, "rewards/margins": 0.7792209386825562, "rewards/rejected": 2.4092586040496826, "step": 25560 }, { "epoch": 1.1871488927062537, "grad_norm": 20.095083236694336, "learning_rate": 3.0226565764427315e-07, "logits/chosen": -17.821311950683594, "logits/rejected": -17.88473129272461, "logps/chosen": -345.0761413574219, "logps/rejected": -272.81549072265625, "loss": 0.861, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.682070255279541, "rewards/margins": 0.6535152196884155, "rewards/rejected": 2.028555393218994, "step": 25570 }, { "epoch": 1.187613166813687, "grad_norm": 15.545079231262207, "learning_rate": 3.0218827862636766e-07, "logits/chosen": -19.230859756469727, "logits/rejected": -18.313508987426758, "logps/chosen": -401.10308837890625, "logps/rejected": -334.7464904785156, "loss": 0.3344, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.40356707572937, "rewards/margins": 1.3025901317596436, "rewards/rejected": 2.1009769439697266, "step": 25580 }, { "epoch": 1.1880774409211199, "grad_norm": 13.331336975097656, "learning_rate": 3.021108996084621e-07, "logits/chosen": -18.518871307373047, "logits/rejected": -17.703514099121094, "logps/chosen": -389.92376708984375, "logps/rejected": -286.18218994140625, "loss": 0.4545, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.019257545471191, "rewards/margins": 2.081571340560913, "rewards/rejected": 1.9376862049102783, "step": 25590 }, { "epoch": 1.1885417150285529, "grad_norm": 132.22659301757812, "learning_rate": 3.0203352059055663e-07, "logits/chosen": -19.58759880065918, "logits/rejected": -18.170808792114258, "logps/chosen": -478.53302001953125, 
"logps/rejected": -364.7622985839844, "loss": 0.5002, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4431374073028564, "rewards/margins": 0.8302336931228638, "rewards/rejected": 2.6129038333892822, "step": 25600 }, { "epoch": 1.1890059891359859, "grad_norm": 120.28585052490234, "learning_rate": 3.0195614157265115e-07, "logits/chosen": -18.766124725341797, "logits/rejected": -17.711042404174805, "logps/chosen": -364.19403076171875, "logps/rejected": -307.8253479003906, "loss": 0.4521, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.107556104660034, "rewards/margins": 1.0471901893615723, "rewards/rejected": 2.060366153717041, "step": 25610 }, { "epoch": 1.1894702632434189, "grad_norm": 97.89989471435547, "learning_rate": 3.0187876255474566e-07, "logits/chosen": -19.092716217041016, "logits/rejected": -17.985475540161133, "logps/chosen": -448.719482421875, "logps/rejected": -330.1909484863281, "loss": 0.647, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.6407406330108643, "rewards/margins": 0.39221692085266113, "rewards/rejected": 3.248523712158203, "step": 25620 }, { "epoch": 1.1899345373508519, "grad_norm": 251.11256408691406, "learning_rate": 3.0180138353684017e-07, "logits/chosen": -18.026952743530273, "logits/rejected": -17.323678970336914, "logps/chosen": -355.8199157714844, "logps/rejected": -351.79998779296875, "loss": 0.926, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6452853679656982, "rewards/margins": 1.1114857196807861, "rewards/rejected": 2.533799409866333, "step": 25630 }, { "epoch": 1.190398811458285, "grad_norm": 78.57220458984375, "learning_rate": 3.017240045189346e-07, "logits/chosen": -19.078113555908203, "logits/rejected": -18.60678482055664, "logps/chosen": -360.74072265625, "logps/rejected": -337.0374755859375, "loss": 0.8561, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.6229357719421387, "rewards/margins": 0.30723366141319275, "rewards/rejected": 2.315702199935913, 
"step": 25640 }, { "epoch": 1.190863085565718, "grad_norm": 162.80979919433594, "learning_rate": 3.0164662550102914e-07, "logits/chosen": -19.169178009033203, "logits/rejected": -17.440723419189453, "logps/chosen": -437.33782958984375, "logps/rejected": -238.2005157470703, "loss": 0.2189, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.720841884613037, "rewards/margins": 2.429931879043579, "rewards/rejected": 1.2909101247787476, "step": 25650 }, { "epoch": 1.191327359673151, "grad_norm": 43.86693572998047, "learning_rate": 3.015692464831236e-07, "logits/chosen": -18.157258987426758, "logits/rejected": -17.153966903686523, "logps/chosen": -253.03976440429688, "logps/rejected": -163.90818786621094, "loss": 0.4989, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.885223388671875, "rewards/margins": 1.648329734802246, "rewards/rejected": 1.236893653869629, "step": 25660 }, { "epoch": 1.191791633780584, "grad_norm": 34.01689529418945, "learning_rate": 3.014918674652181e-07, "logits/chosen": -18.955934524536133, "logits/rejected": -17.589691162109375, "logps/chosen": -368.73004150390625, "logps/rejected": -273.8164367675781, "loss": 0.2974, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.5355892181396484, "rewards/margins": 1.4462531805038452, "rewards/rejected": 1.0893359184265137, "step": 25670 }, { "epoch": 1.192255907888017, "grad_norm": 56.66413879394531, "learning_rate": 3.014144884473126e-07, "logits/chosen": -19.000978469848633, "logits/rejected": -17.763484954833984, "logps/chosen": -505.1929626464844, "logps/rejected": -276.28692626953125, "loss": 0.358, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.989072799682617, "rewards/margins": 2.0462632179260254, "rewards/rejected": 1.9428093433380127, "step": 25680 }, { "epoch": 1.19272018199545, "grad_norm": 198.16787719726562, "learning_rate": 3.013371094294071e-07, "logits/chosen": -18.055063247680664, "logits/rejected": -17.611461639404297, "logps/chosen": 
-433.1211853027344, "logps/rejected": -332.99407958984375, "loss": 0.7789, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1167845726013184, "rewards/margins": 1.2489893436431885, "rewards/rejected": 1.8677953481674194, "step": 25690 }, { "epoch": 1.193184456102883, "grad_norm": 103.45655059814453, "learning_rate": 3.012597304115016e-07, "logits/chosen": -17.61313247680664, "logits/rejected": -17.49232292175293, "logps/chosen": -365.66619873046875, "logps/rejected": -277.6163635253906, "loss": 1.4768, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.8238725662231445, "rewards/margins": -0.044786907732486725, "rewards/rejected": 2.868659257888794, "step": 25700 }, { "epoch": 1.1936487302103163, "grad_norm": 83.45181274414062, "learning_rate": 3.011823513935961e-07, "logits/chosen": -18.098054885864258, "logits/rejected": -17.8836727142334, "logps/chosen": -283.4347839355469, "logps/rejected": -278.8588562011719, "loss": 0.7623, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.4138004779815674, "rewards/margins": 0.346841961145401, "rewards/rejected": 2.066958427429199, "step": 25710 }, { "epoch": 1.1941130043177492, "grad_norm": 117.14299774169922, "learning_rate": 3.011049723756906e-07, "logits/chosen": -18.766773223876953, "logits/rejected": -19.20322608947754, "logps/chosen": -400.22357177734375, "logps/rejected": -415.75592041015625, "loss": 1.0423, "rewards/accuracies": 0.5, "rewards/chosen": 2.8557839393615723, "rewards/margins": -0.17498329281806946, "rewards/rejected": 3.0307674407958984, "step": 25720 }, { "epoch": 1.1945772784251822, "grad_norm": 25.762941360473633, "learning_rate": 3.010275933577851e-07, "logits/chosen": -19.044139862060547, "logits/rejected": -17.259132385253906, "logps/chosen": -354.7337951660156, "logps/rejected": -222.2205352783203, "loss": 0.5478, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.670370578765869, "rewards/margins": 1.9096847772598267, "rewards/rejected": 
1.7606861591339111, "step": 25730 }, { "epoch": 1.1950415525326152, "grad_norm": 57.111263275146484, "learning_rate": 3.009502143398796e-07, "logits/chosen": -19.013385772705078, "logits/rejected": -17.531780242919922, "logps/chosen": -298.3760986328125, "logps/rejected": -193.52859497070312, "loss": 0.3005, "rewards/accuracies": 1.0, "rewards/chosen": 2.0764949321746826, "rewards/margins": 1.6018116474151611, "rewards/rejected": 0.47468358278274536, "step": 25740 }, { "epoch": 1.1955058266400482, "grad_norm": 46.53036117553711, "learning_rate": 3.008728353219741e-07, "logits/chosen": -18.668895721435547, "logits/rejected": -18.077388763427734, "logps/chosen": -450.572265625, "logps/rejected": -365.31854248046875, "loss": 0.6481, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.384619951248169, "rewards/margins": 0.9896658062934875, "rewards/rejected": 2.394954204559326, "step": 25750 }, { "epoch": 1.1959701007474812, "grad_norm": 3.2297885417938232, "learning_rate": 3.0079545630406855e-07, "logits/chosen": -18.6621150970459, "logits/rejected": -17.9398250579834, "logps/chosen": -443.5846252441406, "logps/rejected": -328.77752685546875, "loss": 0.2561, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.7878055572509766, "rewards/margins": 2.3194539546966553, "rewards/rejected": 1.4683512449264526, "step": 25760 }, { "epoch": 1.1964343748549142, "grad_norm": 98.92804718017578, "learning_rate": 3.0071807728616306e-07, "logits/chosen": -18.049007415771484, "logits/rejected": -17.38327407836914, "logps/chosen": -295.63079833984375, "logps/rejected": -227.2437744140625, "loss": 0.6614, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.4316685199737549, "rewards/margins": 0.5091094970703125, "rewards/rejected": 0.9225590825080872, "step": 25770 }, { "epoch": 1.1968986489623474, "grad_norm": 84.50466918945312, "learning_rate": 3.0064069826825757e-07, "logits/chosen": -18.88182830810547, "logits/rejected": -18.156641006469727, 
"logps/chosen": -516.2015380859375, "logps/rejected": -433.82916259765625, "loss": 0.5133, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.285000801086426, "rewards/margins": 1.1065069437026978, "rewards/rejected": 3.1784939765930176, "step": 25780 }, { "epoch": 1.1973629230697804, "grad_norm": 52.1141471862793, "learning_rate": 3.0056331925035203e-07, "logits/chosen": -18.278623580932617, "logits/rejected": -17.85665512084961, "logps/chosen": -414.321533203125, "logps/rejected": -397.25958251953125, "loss": 0.6132, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.0816047191619873, "rewards/margins": 0.9661195874214172, "rewards/rejected": 2.115485191345215, "step": 25790 }, { "epoch": 1.1978271971772134, "grad_norm": 70.3980484008789, "learning_rate": 3.0048594023244654e-07, "logits/chosen": -19.071748733520508, "logits/rejected": -18.716415405273438, "logps/chosen": -392.55908203125, "logps/rejected": -366.3552551269531, "loss": 0.5427, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.056051731109619, "rewards/margins": 1.2183383703231812, "rewards/rejected": 2.8377134799957275, "step": 25800 }, { "epoch": 1.1982914712846464, "grad_norm": 189.10174560546875, "learning_rate": 3.0040856121454105e-07, "logits/chosen": -18.407329559326172, "logits/rejected": -17.71842384338379, "logps/chosen": -430.3382873535156, "logps/rejected": -392.888671875, "loss": 1.1892, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.8973541259765625, "rewards/margins": 0.504518985748291, "rewards/rejected": 3.3928351402282715, "step": 25810 }, { "epoch": 1.1987557453920794, "grad_norm": 153.53623962402344, "learning_rate": 3.0033118219663556e-07, "logits/chosen": -19.555736541748047, "logits/rejected": -18.476886749267578, "logps/chosen": -369.9966735839844, "logps/rejected": -381.42864990234375, "loss": 0.767, "rewards/accuracies": 0.5, "rewards/chosen": 3.777165651321411, "rewards/margins": 0.7328087091445923, "rewards/rejected": 
3.0443568229675293, "step": 25820 }, { "epoch": 1.1992200194995126, "grad_norm": 88.6453628540039, "learning_rate": 3.002538031787301e-07, "logits/chosen": -19.618398666381836, "logits/rejected": -18.873178482055664, "logps/chosen": -532.4685668945312, "logps/rejected": -458.7093200683594, "loss": 0.7617, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.437972068786621, "rewards/margins": 1.022287368774414, "rewards/rejected": 3.4156851768493652, "step": 25830 }, { "epoch": 1.1996842936069456, "grad_norm": 104.92142486572266, "learning_rate": 3.0017642416082453e-07, "logits/chosen": -20.076757431030273, "logits/rejected": -19.098201751708984, "logps/chosen": -457.68634033203125, "logps/rejected": -394.9218444824219, "loss": 0.9925, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 4.058299541473389, "rewards/margins": 0.7061126232147217, "rewards/rejected": 3.352186918258667, "step": 25840 }, { "epoch": 1.2001485677143786, "grad_norm": 92.62405395507812, "learning_rate": 3.00099045142919e-07, "logits/chosen": -18.430259704589844, "logits/rejected": -17.646732330322266, "logps/chosen": -354.7498779296875, "logps/rejected": -290.6287536621094, "loss": 0.8254, "rewards/accuracies": 0.5, "rewards/chosen": 3.949385404586792, "rewards/margins": 0.9346287846565247, "rewards/rejected": 3.014756679534912, "step": 25850 }, { "epoch": 1.2006128418218116, "grad_norm": 56.920936584472656, "learning_rate": 3.000216661250135e-07, "logits/chosen": -19.90094566345215, "logits/rejected": -19.51333999633789, "logps/chosen": -441.15185546875, "logps/rejected": -402.18927001953125, "loss": 0.8843, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 3.6622111797332764, "rewards/margins": -0.13440847396850586, "rewards/rejected": 3.796619415283203, "step": 25860 }, { "epoch": 1.2010771159292446, "grad_norm": 173.89915466308594, "learning_rate": 2.99944287107108e-07, "logits/chosen": -18.90723419189453, "logits/rejected": -17.688148498535156, 
"logps/chosen": -365.9725036621094, "logps/rejected": -301.66326904296875, "loss": 0.76, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.143383026123047, "rewards/margins": 1.6244128942489624, "rewards/rejected": 2.518969774246216, "step": 25870 }, { "epoch": 1.2015413900366776, "grad_norm": 105.42864227294922, "learning_rate": 2.9986690808920253e-07, "logits/chosen": -17.91446304321289, "logits/rejected": -17.550743103027344, "logps/chosen": -396.73211669921875, "logps/rejected": -360.1773681640625, "loss": 0.7358, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.459731340408325, "rewards/margins": 0.849570095539093, "rewards/rejected": 2.610161304473877, "step": 25880 }, { "epoch": 1.2020056641441106, "grad_norm": 29.512853622436523, "learning_rate": 2.99789529071297e-07, "logits/chosen": -18.570411682128906, "logits/rejected": -18.455463409423828, "logps/chosen": -311.7702941894531, "logps/rejected": -341.7831726074219, "loss": 0.9972, "rewards/accuracies": 0.5, "rewards/chosen": 3.3692593574523926, "rewards/margins": 0.0393763892352581, "rewards/rejected": 3.329882860183716, "step": 25890 }, { "epoch": 1.2024699382515438, "grad_norm": 11.309395790100098, "learning_rate": 2.997121500533915e-07, "logits/chosen": -18.91379737854004, "logits/rejected": -18.499134063720703, "logps/chosen": -322.6467590332031, "logps/rejected": -275.81011962890625, "loss": 0.5021, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.3052432537078857, "rewards/margins": 1.477209448814392, "rewards/rejected": 1.828033685684204, "step": 25900 }, { "epoch": 1.2029342123589768, "grad_norm": 21.535825729370117, "learning_rate": 2.99634771035486e-07, "logits/chosen": -18.63501739501953, "logits/rejected": -17.71735954284668, "logps/chosen": -377.1702575683594, "logps/rejected": -255.4610137939453, "loss": 0.9304, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.111379623413086, "rewards/margins": 0.3802323341369629, "rewards/rejected": 
1.7311471700668335, "step": 25910 }, { "epoch": 1.2033984864664098, "grad_norm": 87.19859313964844, "learning_rate": 2.995573920175805e-07, "logits/chosen": -18.31171226501465, "logits/rejected": -17.925533294677734, "logps/chosen": -401.0411071777344, "logps/rejected": -353.1426696777344, "loss": 0.5531, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.7809433937072754, "rewards/margins": 0.6792702674865723, "rewards/rejected": 2.101672887802124, "step": 25920 }, { "epoch": 1.2038627605738428, "grad_norm": 39.08914566040039, "learning_rate": 2.9948001299967503e-07, "logits/chosen": -18.323938369750977, "logits/rejected": -17.770641326904297, "logps/chosen": -487.01983642578125, "logps/rejected": -415.099365234375, "loss": 0.6259, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.0995917320251465, "rewards/margins": 1.1670221090316772, "rewards/rejected": 2.932569742202759, "step": 25930 }, { "epoch": 1.2043270346812758, "grad_norm": 197.52456665039062, "learning_rate": 2.994026339817695e-07, "logits/chosen": -19.142852783203125, "logits/rejected": -18.37045669555664, "logps/chosen": -439.3684997558594, "logps/rejected": -324.4560241699219, "loss": 0.4698, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6047699451446533, "rewards/margins": 1.0027295351028442, "rewards/rejected": 2.6020402908325195, "step": 25940 }, { "epoch": 1.2047913087887088, "grad_norm": 6.993671894073486, "learning_rate": 2.9932525496386395e-07, "logits/chosen": -20.109983444213867, "logits/rejected": -18.4838924407959, "logps/chosen": -409.3557434082031, "logps/rejected": -281.13568115234375, "loss": 0.4219, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.254916191101074, "rewards/margins": 1.6654058694839478, "rewards/rejected": 2.589510440826416, "step": 25950 }, { "epoch": 1.2052555828961418, "grad_norm": 3.2991750240325928, "learning_rate": 2.9924787594595846e-07, "logits/chosen": -18.31928825378418, "logits/rejected": -17.897214889526367, 
"logps/chosen": -485.5868225097656, "logps/rejected": -327.4483947753906, "loss": 0.5256, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.0084292888641357, "rewards/margins": 1.0347028970718384, "rewards/rejected": 1.9737269878387451, "step": 25960 }, { "epoch": 1.205719857003575, "grad_norm": 180.02651977539062, "learning_rate": 2.9917049692805297e-07, "logits/chosen": -18.56576919555664, "logits/rejected": -17.958860397338867, "logps/chosen": -329.8035888671875, "logps/rejected": -250.48361206054688, "loss": 0.4085, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.4813952445983887, "rewards/margins": 1.1748037338256836, "rewards/rejected": 1.3065917491912842, "step": 25970 }, { "epoch": 1.206184131111008, "grad_norm": 259.01116943359375, "learning_rate": 2.990931179101475e-07, "logits/chosen": -18.445846557617188, "logits/rejected": -19.10070037841797, "logps/chosen": -356.17919921875, "logps/rejected": -372.53558349609375, "loss": 1.3664, "rewards/accuracies": 0.5, "rewards/chosen": 2.8468031883239746, "rewards/margins": -0.19172003865242004, "rewards/rejected": 3.0385231971740723, "step": 25980 }, { "epoch": 1.206648405218441, "grad_norm": 11.578805923461914, "learning_rate": 2.9901573889224194e-07, "logits/chosen": -18.669170379638672, "logits/rejected": -17.888023376464844, "logps/chosen": -368.26104736328125, "logps/rejected": -323.03033447265625, "loss": 0.2433, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.761601209640503, "rewards/margins": 1.8930705785751343, "rewards/rejected": 1.8685302734375, "step": 25990 }, { "epoch": 1.207112679325874, "grad_norm": 24.245397567749023, "learning_rate": 2.9893835987433645e-07, "logits/chosen": -18.046268463134766, "logits/rejected": -17.581729888916016, "logps/chosen": -423.5543518066406, "logps/rejected": -397.961669921875, "loss": 1.0085, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.616070508956909, "rewards/margins": 0.8421465158462524, "rewards/rejected": 
2.7739243507385254, "step": 26000 }, { "epoch": 1.207576953433307, "grad_norm": 53.77635192871094, "learning_rate": 2.9886098085643096e-07, "logits/chosen": -18.871055603027344, "logits/rejected": -18.659404754638672, "logps/chosen": -455.5818786621094, "logps/rejected": -380.410888671875, "loss": 0.8586, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.6302390098571777, "rewards/margins": 0.37205153703689575, "rewards/rejected": 3.2581870555877686, "step": 26010 }, { "epoch": 1.2080412275407402, "grad_norm": 304.17889404296875, "learning_rate": 2.9878360183852547e-07, "logits/chosen": -19.16156578063965, "logits/rejected": -18.950328826904297, "logps/chosen": -343.0397033691406, "logps/rejected": -386.2437438964844, "loss": 0.8711, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.212433338165283, "rewards/margins": 0.23027105629444122, "rewards/rejected": 2.9821622371673584, "step": 26020 }, { "epoch": 1.2085055016481732, "grad_norm": 106.30699920654297, "learning_rate": 2.9870622282062e-07, "logits/chosen": -20.424638748168945, "logits/rejected": -19.719539642333984, "logps/chosen": -461.5947265625, "logps/rejected": -372.66839599609375, "loss": 0.316, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.12606143951416, "rewards/margins": 1.8795220851898193, "rewards/rejected": 2.2465391159057617, "step": 26030 }, { "epoch": 1.2089697757556062, "grad_norm": 40.1856803894043, "learning_rate": 2.986288438027145e-07, "logits/chosen": -18.87468910217285, "logits/rejected": -18.154645919799805, "logps/chosen": -394.1764831542969, "logps/rejected": -273.38946533203125, "loss": 0.6475, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.508070468902588, "rewards/margins": 0.9920371770858765, "rewards/rejected": 1.516033411026001, "step": 26040 }, { "epoch": 1.2094340498630392, "grad_norm": 25.492223739624023, "learning_rate": 2.985514647848089e-07, "logits/chosen": -18.83392333984375, "logits/rejected": -18.09343910217285, 
"logps/chosen": -383.3934020996094, "logps/rejected": -319.6675720214844, "loss": 0.5439, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.025067090988159, "rewards/margins": 0.8525524139404297, "rewards/rejected": 2.1725146770477295, "step": 26050 }, { "epoch": 1.2098983239704721, "grad_norm": 5.718894958496094, "learning_rate": 2.984740857669034e-07, "logits/chosen": -18.424945831298828, "logits/rejected": -19.348474502563477, "logps/chosen": -342.56768798828125, "logps/rejected": -433.49908447265625, "loss": 0.8955, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.3108859062194824, "rewards/margins": 0.23451881110668182, "rewards/rejected": 3.076366901397705, "step": 26060 }, { "epoch": 1.2103625980779051, "grad_norm": 89.89395141601562, "learning_rate": 2.983967067489979e-07, "logits/chosen": -19.528736114501953, "logits/rejected": -18.416067123413086, "logps/chosen": -378.7823181152344, "logps/rejected": -291.35699462890625, "loss": 0.7418, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.7548716068267822, "rewards/margins": 0.7973703145980835, "rewards/rejected": 1.9575014114379883, "step": 26070 }, { "epoch": 1.2108268721853381, "grad_norm": 95.60160064697266, "learning_rate": 2.9831932773109244e-07, "logits/chosen": -18.725624084472656, "logits/rejected": -18.593143463134766, "logps/chosen": -329.3515319824219, "logps/rejected": -331.20050048828125, "loss": 0.8193, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.3654723167419434, "rewards/margins": 0.07355208694934845, "rewards/rejected": 2.2919201850891113, "step": 26080 }, { "epoch": 1.2112911462927713, "grad_norm": 10.894244194030762, "learning_rate": 2.982419487131869e-07, "logits/chosen": -19.088350296020508, "logits/rejected": -19.444950103759766, "logps/chosen": -383.13922119140625, "logps/rejected": -381.32672119140625, "loss": 0.9364, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.174769878387451, "rewards/margins": 
0.34121885895729065, "rewards/rejected": 2.8335509300231934, "step": 26090 }, { "epoch": 1.2117554204002043, "grad_norm": 176.37921142578125, "learning_rate": 2.981645696952814e-07, "logits/chosen": -18.652000427246094, "logits/rejected": -20.05501937866211, "logps/chosen": -336.3030090332031, "logps/rejected": -473.5381774902344, "loss": 1.2813, "rewards/accuracies": 0.5, "rewards/chosen": 3.2448089122772217, "rewards/margins": -0.4028933048248291, "rewards/rejected": 3.64770245552063, "step": 26100 }, { "epoch": 1.2122196945076373, "grad_norm": 29.254777908325195, "learning_rate": 2.980871906773759e-07, "logits/chosen": -19.58127784729004, "logits/rejected": -18.03890609741211, "logps/chosen": -409.7137145996094, "logps/rejected": -279.771728515625, "loss": 0.7847, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.176192760467529, "rewards/margins": 1.8576858043670654, "rewards/rejected": 2.3185067176818848, "step": 26110 }, { "epoch": 1.2126839686150703, "grad_norm": 201.42599487304688, "learning_rate": 2.9800981165947043e-07, "logits/chosen": -18.24287986755371, "logits/rejected": -18.11774444580078, "logps/chosen": -260.4484558105469, "logps/rejected": -253.63729858398438, "loss": 0.679, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.0014493465423584, "rewards/margins": 0.3163837492465973, "rewards/rejected": 1.6850656270980835, "step": 26120 }, { "epoch": 1.2131482427225033, "grad_norm": 105.75165557861328, "learning_rate": 2.9793243264156494e-07, "logits/chosen": -18.84623146057129, "logits/rejected": -18.229122161865234, "logps/chosen": -475.78961181640625, "logps/rejected": -424.840576171875, "loss": 0.6807, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.8033523559570312, "rewards/margins": 0.544058084487915, "rewards/rejected": 3.259294033050537, "step": 26130 }, { "epoch": 1.2136125168299363, "grad_norm": 83.1175308227539, "learning_rate": 2.9785505362365934e-07, "logits/chosen": -18.73887825012207, 
"logits/rejected": -17.5310115814209, "logps/chosen": -363.50457763671875, "logps/rejected": -330.4412536621094, "loss": 0.5437, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0381174087524414, "rewards/margins": 0.8731679916381836, "rewards/rejected": 2.164949417114258, "step": 26140 }, { "epoch": 1.2140767909373693, "grad_norm": 110.00984191894531, "learning_rate": 2.9777767460575386e-07, "logits/chosen": -19.379371643066406, "logits/rejected": -18.712749481201172, "logps/chosen": -454.49615478515625, "logps/rejected": -367.54974365234375, "loss": 0.3477, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.7967705726623535, "rewards/margins": 1.3703968524932861, "rewards/rejected": 2.4263737201690674, "step": 26150 }, { "epoch": 1.2145410650448025, "grad_norm": 75.28990173339844, "learning_rate": 2.9770029558784837e-07, "logits/chosen": -18.676788330078125, "logits/rejected": -17.729122161865234, "logps/chosen": -332.43121337890625, "logps/rejected": -252.79446411132812, "loss": 0.4618, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4569523334503174, "rewards/margins": 1.4218884706497192, "rewards/rejected": 2.035064220428467, "step": 26160 }, { "epoch": 1.2150053391522355, "grad_norm": 62.66688537597656, "learning_rate": 2.976229165699429e-07, "logits/chosen": -18.356033325195312, "logits/rejected": -17.397695541381836, "logps/chosen": -390.6080017089844, "logps/rejected": -270.8788146972656, "loss": 0.3752, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.662614345550537, "rewards/margins": 1.5154591798782349, "rewards/rejected": 2.147155284881592, "step": 26170 }, { "epoch": 1.2154696132596685, "grad_norm": 4.317182540893555, "learning_rate": 2.975455375520374e-07, "logits/chosen": -18.879892349243164, "logits/rejected": -17.838092803955078, "logps/chosen": -498.2289123535156, "logps/rejected": -361.38458251953125, "loss": 0.4161, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.409848690032959, 
"rewards/margins": 2.034418821334839, "rewards/rejected": 2.375430107116699, "step": 26180 }, { "epoch": 1.2159338873671015, "grad_norm": 8.429280281066895, "learning_rate": 2.974681585341319e-07, "logits/chosen": -18.3195743560791, "logits/rejected": -17.59373664855957, "logps/chosen": -347.6824951171875, "logps/rejected": -219.6698760986328, "loss": 0.4234, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.215513229370117, "rewards/margins": 1.6684402227401733, "rewards/rejected": 1.5470731258392334, "step": 26190 }, { "epoch": 1.2163981614745345, "grad_norm": 104.74517059326172, "learning_rate": 2.9739077951622636e-07, "logits/chosen": -18.43216323852539, "logits/rejected": -18.228483200073242, "logps/chosen": -342.5714111328125, "logps/rejected": -392.6980895996094, "loss": 0.9461, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.6202375888824463, "rewards/margins": 0.18441712856292725, "rewards/rejected": 2.4358201026916504, "step": 26200 }, { "epoch": 1.2168624355819677, "grad_norm": 26.213586807250977, "learning_rate": 2.9731340049832087e-07, "logits/chosen": -18.36037826538086, "logits/rejected": -18.014270782470703, "logps/chosen": -425.3310546875, "logps/rejected": -346.00311279296875, "loss": 0.5687, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0530848503112793, "rewards/margins": 1.1268939971923828, "rewards/rejected": 1.926190972328186, "step": 26210 }, { "epoch": 1.2173267096894007, "grad_norm": 30.82196807861328, "learning_rate": 2.972360214804154e-07, "logits/chosen": -17.82701301574707, "logits/rejected": -17.915334701538086, "logps/chosen": -396.20843505859375, "logps/rejected": -400.7176818847656, "loss": 0.7785, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2422947883605957, "rewards/margins": 0.8574386835098267, "rewards/rejected": 2.3848559856414795, "step": 26220 }, { "epoch": 1.2177909837968337, "grad_norm": 271.67657470703125, "learning_rate": 2.971586424625099e-07, "logits/chosen": 
-18.74643325805664, "logits/rejected": -17.60645294189453, "logps/chosen": -406.8343505859375, "logps/rejected": -302.80987548828125, "loss": 0.9775, "rewards/accuracies": 0.5, "rewards/chosen": 2.8090837001800537, "rewards/margins": 0.23522226512432098, "rewards/rejected": 2.573861598968506, "step": 26230 }, { "epoch": 1.2182552579042667, "grad_norm": 2.1598639488220215, "learning_rate": 2.970812634446043e-07, "logits/chosen": -18.964107513427734, "logits/rejected": -18.418481826782227, "logps/chosen": -393.16143798828125, "logps/rejected": -322.48480224609375, "loss": 0.704, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.952547073364258, "rewards/margins": 1.5012032985687256, "rewards/rejected": 2.4513440132141113, "step": 26240 }, { "epoch": 1.2187195320116997, "grad_norm": 201.1040496826172, "learning_rate": 2.970038844266988e-07, "logits/chosen": -18.55634880065918, "logits/rejected": -18.16705322265625, "logps/chosen": -444.89654541015625, "logps/rejected": -373.801025390625, "loss": 0.6581, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.310123920440674, "rewards/margins": 0.6936686635017395, "rewards/rejected": 2.616455316543579, "step": 26250 }, { "epoch": 1.2191838061191327, "grad_norm": 88.70382690429688, "learning_rate": 2.969265054087933e-07, "logits/chosen": -18.470890045166016, "logits/rejected": -16.684783935546875, "logps/chosen": -391.71173095703125, "logps/rejected": -214.71243286132812, "loss": 0.2538, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.7579007148742676, "rewards/margins": 2.615990400314331, "rewards/rejected": 1.1419099569320679, "step": 26260 }, { "epoch": 1.2196480802265657, "grad_norm": 60.811317443847656, "learning_rate": 2.9684912639088783e-07, "logits/chosen": -18.44693374633789, "logits/rejected": -17.92074203491211, "logps/chosen": -349.8345947265625, "logps/rejected": -300.6589660644531, "loss": 0.664, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.3857779502868652, 
"rewards/margins": 1.1018555164337158, "rewards/rejected": 2.2839221954345703, "step": 26270 }, { "epoch": 1.220112354333999, "grad_norm": 7.087762832641602, "learning_rate": 2.9677174737298234e-07, "logits/chosen": -19.647579193115234, "logits/rejected": -18.902576446533203, "logps/chosen": -323.8705139160156, "logps/rejected": -283.74322509765625, "loss": 0.6964, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.4756953716278076, "rewards/margins": 0.9193860292434692, "rewards/rejected": 1.5563093423843384, "step": 26280 }, { "epoch": 1.2205766284414319, "grad_norm": 93.3818130493164, "learning_rate": 2.9669436835507685e-07, "logits/chosen": -18.421329498291016, "logits/rejected": -18.10666847229004, "logps/chosen": -383.0702209472656, "logps/rejected": -400.4584045410156, "loss": 0.5003, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0469961166381836, "rewards/margins": 0.9880701303482056, "rewards/rejected": 2.0589258670806885, "step": 26290 }, { "epoch": 1.2210409025488649, "grad_norm": 175.3734588623047, "learning_rate": 2.966169893371713e-07, "logits/chosen": -19.414098739624023, "logits/rejected": -19.716367721557617, "logps/chosen": -405.1053466796875, "logps/rejected": -478.79754638671875, "loss": 0.9824, "rewards/accuracies": 0.5, "rewards/chosen": 4.257358074188232, "rewards/margins": 0.34837326407432556, "rewards/rejected": 3.908984661102295, "step": 26300 }, { "epoch": 1.2215051766562979, "grad_norm": 20.01958465576172, "learning_rate": 2.965396103192658e-07, "logits/chosen": -18.96700096130371, "logits/rejected": -18.277538299560547, "logps/chosen": -332.7284240722656, "logps/rejected": -300.54376220703125, "loss": 0.6098, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.419719696044922, "rewards/margins": 0.7500273585319519, "rewards/rejected": 2.669692277908325, "step": 26310 }, { "epoch": 1.2219694507637309, "grad_norm": 59.81870651245117, "learning_rate": 2.9646223130136034e-07, "logits/chosen": 
-19.66295623779297, "logits/rejected": -18.39776611328125, "logps/chosen": -402.58648681640625, "logps/rejected": -287.1727294921875, "loss": 0.3921, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.237900733947754, "rewards/margins": 2.2291669845581055, "rewards/rejected": 2.0087342262268066, "step": 26320 }, { "epoch": 1.2224337248711639, "grad_norm": 223.1611328125, "learning_rate": 2.963848522834548e-07, "logits/chosen": -19.168277740478516, "logits/rejected": -18.398479461669922, "logps/chosen": -417.35675048828125, "logps/rejected": -358.0741882324219, "loss": 0.6533, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.184234619140625, "rewards/margins": 0.8736001253128052, "rewards/rejected": 3.310634136199951, "step": 26330 }, { "epoch": 1.2228979989785969, "grad_norm": 10.827144622802734, "learning_rate": 2.963074732655493e-07, "logits/chosen": -18.896263122558594, "logits/rejected": -17.458818435668945, "logps/chosen": -464.86334228515625, "logps/rejected": -290.8669738769531, "loss": 0.3455, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.812370777130127, "rewards/margins": 1.9799458980560303, "rewards/rejected": 1.8324253559112549, "step": 26340 }, { "epoch": 1.22336227308603, "grad_norm": 64.3970947265625, "learning_rate": 2.9623009424764376e-07, "logits/chosen": -18.480266571044922, "logits/rejected": -17.073545455932617, "logps/chosen": -475.26458740234375, "logps/rejected": -291.9932556152344, "loss": 0.452, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.161157131195068, "rewards/margins": 2.2376484870910645, "rewards/rejected": 1.923508882522583, "step": 26350 }, { "epoch": 1.223826547193463, "grad_norm": 96.95104217529297, "learning_rate": 2.961527152297383e-07, "logits/chosen": -18.184078216552734, "logits/rejected": -17.966812133789062, "logps/chosen": -404.21832275390625, "logps/rejected": -328.002197265625, "loss": 0.6799, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 
3.533508777618408, "rewards/margins": 0.8324003219604492, "rewards/rejected": 2.70110821723938, "step": 26360 }, { "epoch": 1.224290821300896, "grad_norm": 23.479339599609375, "learning_rate": 2.960753362118328e-07, "logits/chosen": -18.879161834716797, "logits/rejected": -17.860549926757812, "logps/chosen": -426.2293395996094, "logps/rejected": -323.27508544921875, "loss": 0.6852, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.8618290424346924, "rewards/margins": 1.0019891262054443, "rewards/rejected": 1.8598397970199585, "step": 26370 }, { "epoch": 1.224755095408329, "grad_norm": 219.36117553710938, "learning_rate": 2.959979571939273e-07, "logits/chosen": -19.848526000976562, "logits/rejected": -19.262653350830078, "logps/chosen": -413.7743225097656, "logps/rejected": -350.8681640625, "loss": 1.2522, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.8832080364227295, "rewards/margins": 0.4648929536342621, "rewards/rejected": 3.4183151721954346, "step": 26380 }, { "epoch": 1.225219369515762, "grad_norm": 103.34503936767578, "learning_rate": 2.959205781760218e-07, "logits/chosen": -19.20139503479004, "logits/rejected": -18.78831672668457, "logps/chosen": -365.6996765136719, "logps/rejected": -280.00482177734375, "loss": 0.5186, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.5087618827819824, "rewards/margins": 1.3287373781204224, "rewards/rejected": 2.1800246238708496, "step": 26390 }, { "epoch": 1.225683643623195, "grad_norm": 39.22383117675781, "learning_rate": 2.9584319915811627e-07, "logits/chosen": -18.91933250427246, "logits/rejected": -18.55141830444336, "logps/chosen": -361.2900085449219, "logps/rejected": -273.1702880859375, "loss": 0.9108, "rewards/accuracies": 0.5, "rewards/chosen": 2.181196689605713, "rewards/margins": -0.009258603677153587, "rewards/rejected": 2.190455198287964, "step": 26400 }, { "epoch": 1.2261479177306283, "grad_norm": 0.4363902509212494, "learning_rate": 2.957658201402108e-07, 
"logits/chosen": -20.37600326538086, "logits/rejected": -18.610057830810547, "logps/chosen": -308.2738342285156, "logps/rejected": -185.02127075195312, "loss": 0.4735, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.356379747390747, "rewards/margins": 2.5429720878601074, "rewards/rejected": 0.8134074211120605, "step": 26410 }, { "epoch": 1.2266121918380612, "grad_norm": 29.708833694458008, "learning_rate": 2.956884411223053e-07, "logits/chosen": -19.371030807495117, "logits/rejected": -18.574922561645508, "logps/chosen": -436.9883728027344, "logps/rejected": -338.514404296875, "loss": 0.7422, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.953321933746338, "rewards/margins": 0.5610207319259644, "rewards/rejected": 2.392301559448242, "step": 26420 }, { "epoch": 1.2270764659454942, "grad_norm": 54.16996383666992, "learning_rate": 2.9561106210439975e-07, "logits/chosen": -19.789310455322266, "logits/rejected": -18.278644561767578, "logps/chosen": -378.0829162597656, "logps/rejected": -300.5897521972656, "loss": 0.3839, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.5279972553253174, "rewards/margins": 1.5034517049789429, "rewards/rejected": 2.024545431137085, "step": 26430 }, { "epoch": 1.2275407400529272, "grad_norm": 0.13651472330093384, "learning_rate": 2.9553368308649426e-07, "logits/chosen": -19.02873992919922, "logits/rejected": -18.512056350708008, "logps/chosen": -374.69244384765625, "logps/rejected": -306.4844055175781, "loss": 0.9028, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 3.26322865486145, "rewards/margins": 0.45318603515625, "rewards/rejected": 2.8100426197052, "step": 26440 }, { "epoch": 1.2280050141603602, "grad_norm": 266.2236022949219, "learning_rate": 2.954563040685887e-07, "logits/chosen": -18.42917251586914, "logits/rejected": -17.88949203491211, "logps/chosen": -326.52825927734375, "logps/rejected": -272.728271484375, "loss": 0.6699, "rewards/accuracies": 0.800000011920929, 
"rewards/chosen": 2.821523904800415, "rewards/margins": 1.0870254039764404, "rewards/rejected": 1.734498381614685, "step": 26450 }, { "epoch": 1.2284692882677932, "grad_norm": 60.85560607910156, "learning_rate": 2.9537892505068323e-07, "logits/chosen": -18.72588348388672, "logits/rejected": -17.414833068847656, "logps/chosen": -450.49407958984375, "logps/rejected": -310.36187744140625, "loss": 0.4117, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.690677642822266, "rewards/margins": 1.9279266595840454, "rewards/rejected": 2.7627511024475098, "step": 26460 }, { "epoch": 1.2289335623752264, "grad_norm": 123.66567993164062, "learning_rate": 2.9530154603277774e-07, "logits/chosen": -18.073266983032227, "logits/rejected": -19.11327362060547, "logps/chosen": -385.27734375, "logps/rejected": -449.440185546875, "loss": 1.3142, "rewards/accuracies": 0.5, "rewards/chosen": 3.032695770263672, "rewards/margins": -0.09214882552623749, "rewards/rejected": 3.124844789505005, "step": 26470 }, { "epoch": 1.2293978364826594, "grad_norm": 6.667342662811279, "learning_rate": 2.9522416701487225e-07, "logits/chosen": -18.806514739990234, "logits/rejected": -17.977615356445312, "logps/chosen": -454.0306091308594, "logps/rejected": -384.6230163574219, "loss": 0.8163, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.864349842071533, "rewards/margins": 0.8539825677871704, "rewards/rejected": 3.010366916656494, "step": 26480 }, { "epoch": 1.2298621105900924, "grad_norm": 67.33797454833984, "learning_rate": 2.9514678799696676e-07, "logits/chosen": -19.219364166259766, "logits/rejected": -18.795650482177734, "logps/chosen": -474.8363342285156, "logps/rejected": -330.8227844238281, "loss": 0.403, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.5235302448272705, "rewards/margins": 1.0713196992874146, "rewards/rejected": 2.4522101879119873, "step": 26490 }, { "epoch": 1.2303263846975254, "grad_norm": 0.7598028779029846, "learning_rate": 
2.950694089790612e-07, "logits/chosen": -18.71973419189453, "logits/rejected": -17.46987533569336, "logps/chosen": -473.39178466796875, "logps/rejected": -265.6083984375, "loss": 0.2913, "rewards/accuracies": 1.0, "rewards/chosen": 3.6622345447540283, "rewards/margins": 1.8360649347305298, "rewards/rejected": 1.8261692523956299, "step": 26500 }, { "epoch": 1.2307906588049584, "grad_norm": 36.57651138305664, "learning_rate": 2.9499202996115573e-07, "logits/chosen": -18.339723587036133, "logits/rejected": -17.86415672302246, "logps/chosen": -419.0899353027344, "logps/rejected": -402.17193603515625, "loss": 0.7478, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.5561022758483887, "rewards/margins": 0.7437346577644348, "rewards/rejected": 2.8123679161071777, "step": 26510 }, { "epoch": 1.2312549329123914, "grad_norm": 250.5431671142578, "learning_rate": 2.949146509432502e-07, "logits/chosen": -19.116069793701172, "logits/rejected": -19.271217346191406, "logps/chosen": -393.28790283203125, "logps/rejected": -248.699462890625, "loss": 0.5755, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.5373637676239014, "rewards/margins": 1.4623075723648071, "rewards/rejected": 2.0750560760498047, "step": 26520 }, { "epoch": 1.2317192070198244, "grad_norm": 32.48138427734375, "learning_rate": 2.948372719253447e-07, "logits/chosen": -19.18277359008789, "logits/rejected": -17.85216522216797, "logps/chosen": -329.4433898925781, "logps/rejected": -270.94183349609375, "loss": 0.4639, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.254909515380859, "rewards/margins": 2.0384268760681152, "rewards/rejected": 2.2164828777313232, "step": 26530 }, { "epoch": 1.2321834811272576, "grad_norm": 75.42967987060547, "learning_rate": 2.947598929074392e-07, "logits/chosen": -18.899282455444336, "logits/rejected": -18.451953887939453, "logps/chosen": -387.8158874511719, "logps/rejected": -367.1582336425781, "loss": 1.3945, "rewards/accuracies": 0.5, 
"rewards/chosen": 2.980705738067627, "rewards/margins": -0.4737294316291809, "rewards/rejected": 3.454434871673584, "step": 26540 }, { "epoch": 1.2326477552346906, "grad_norm": 9.649212837219238, "learning_rate": 2.9468251388953367e-07, "logits/chosen": -19.246408462524414, "logits/rejected": -17.447988510131836, "logps/chosen": -379.4258728027344, "logps/rejected": -296.2513122558594, "loss": 0.5068, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.6888184547424316, "rewards/margins": 1.3997104167938232, "rewards/rejected": 2.2891077995300293, "step": 26550 }, { "epoch": 1.2331120293421236, "grad_norm": 47.415687561035156, "learning_rate": 2.946051348716282e-07, "logits/chosen": -20.142580032348633, "logits/rejected": -19.410533905029297, "logps/chosen": -325.60986328125, "logps/rejected": -295.1832580566406, "loss": 0.5603, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.8574748039245605, "rewards/margins": 1.071441888809204, "rewards/rejected": 2.7860329151153564, "step": 26560 }, { "epoch": 1.2335763034495566, "grad_norm": 93.96986389160156, "learning_rate": 2.945277558537227e-07, "logits/chosen": -18.614185333251953, "logits/rejected": -17.445287704467773, "logps/chosen": -394.73040771484375, "logps/rejected": -306.54949951171875, "loss": 0.5675, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.5555992126464844, "rewards/margins": 1.476754903793335, "rewards/rejected": 2.0788445472717285, "step": 26570 }, { "epoch": 1.2340405775569896, "grad_norm": 22.749479293823242, "learning_rate": 2.944503768358172e-07, "logits/chosen": -19.290485382080078, "logits/rejected": -17.648197174072266, "logps/chosen": -381.9664001464844, "logps/rejected": -278.7654724121094, "loss": 0.3859, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.627847194671631, "rewards/margins": 2.248492956161499, "rewards/rejected": 2.3793540000915527, "step": 26580 }, { "epoch": 1.2345048516644226, "grad_norm": 161.2133331298828, "learning_rate": 
2.943729978179117e-07, "logits/chosen": -19.020078659057617, "logits/rejected": -18.309429168701172, "logps/chosen": -456.0331115722656, "logps/rejected": -335.688720703125, "loss": 0.5784, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.277390241622925, "rewards/margins": 0.9577542543411255, "rewards/rejected": 2.319636344909668, "step": 26590 }, { "epoch": 1.2349691257718556, "grad_norm": 0.3253045380115509, "learning_rate": 2.942956188000062e-07, "logits/chosen": -20.148792266845703, "logits/rejected": -18.819934844970703, "logps/chosen": -512.9673461914062, "logps/rejected": -366.2793273925781, "loss": 0.4864, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.0687665939331055, "rewards/margins": 1.1994415521621704, "rewards/rejected": 2.8693251609802246, "step": 26600 }, { "epoch": 1.2354333998792888, "grad_norm": 12.001124382019043, "learning_rate": 2.942182397821007e-07, "logits/chosen": -18.637134552001953, "logits/rejected": -18.51296615600586, "logps/chosen": -371.77423095703125, "logps/rejected": -371.4903869628906, "loss": 0.8456, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.6416969299316406, "rewards/margins": 0.24764080345630646, "rewards/rejected": 2.3940560817718506, "step": 26610 }, { "epoch": 1.2358976739867218, "grad_norm": 90.53736877441406, "learning_rate": 2.9414086076419514e-07, "logits/chosen": -19.285274505615234, "logits/rejected": -19.07210922241211, "logps/chosen": -413.04412841796875, "logps/rejected": -370.387939453125, "loss": 0.4914, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.9199752807617188, "rewards/margins": 0.9263588786125183, "rewards/rejected": 2.9936165809631348, "step": 26620 }, { "epoch": 1.2363619480941548, "grad_norm": 7.906403541564941, "learning_rate": 2.9406348174628966e-07, "logits/chosen": -18.583683013916016, "logits/rejected": -18.131696701049805, "logps/chosen": -446.33770751953125, "logps/rejected": -359.14141845703125, "loss": 0.6197, 
"rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.91215443611145, "rewards/margins": 0.8917988538742065, "rewards/rejected": 3.020355701446533, "step": 26630 }, { "epoch": 1.2368262222015878, "grad_norm": 113.75695037841797, "learning_rate": 2.9398610272838417e-07, "logits/chosen": -19.780887603759766, "logits/rejected": -17.566612243652344, "logps/chosen": -487.9481506347656, "logps/rejected": -337.71612548828125, "loss": 0.4703, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.8718273639678955, "rewards/margins": 1.9072577953338623, "rewards/rejected": 1.964569330215454, "step": 26640 }, { "epoch": 1.2372904963090208, "grad_norm": 70.57888793945312, "learning_rate": 2.939087237104786e-07, "logits/chosen": -18.96489715576172, "logits/rejected": -17.99092674255371, "logps/chosen": -358.92413330078125, "logps/rejected": -270.7736511230469, "loss": 0.5079, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.3165316581726074, "rewards/margins": 1.4576239585876465, "rewards/rejected": 1.8589074611663818, "step": 26650 }, { "epoch": 1.237754770416454, "grad_norm": 97.56547546386719, "learning_rate": 2.9383134469257314e-07, "logits/chosen": -18.430118560791016, "logits/rejected": -17.4614200592041, "logps/chosen": -461.8152770996094, "logps/rejected": -352.2490234375, "loss": 0.349, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.171960353851318, "rewards/margins": 1.3369958400726318, "rewards/rejected": 2.8349647521972656, "step": 26660 }, { "epoch": 1.238219044523887, "grad_norm": 92.79601287841797, "learning_rate": 2.9375396567466765e-07, "logits/chosen": -18.80470848083496, "logits/rejected": -18.06681251525879, "logps/chosen": -475.17999267578125, "logps/rejected": -427.6468200683594, "loss": 0.6187, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.511837959289551, "rewards/margins": 1.4967453479766846, "rewards/rejected": 3.0150933265686035, "step": 26670 }, { "epoch": 1.23868331863132, "grad_norm": 
196.20973205566406, "learning_rate": 2.9367658665676216e-07, "logits/chosen": -18.944881439208984, "logits/rejected": -17.6956787109375, "logps/chosen": -397.77801513671875, "logps/rejected": -239.5106964111328, "loss": 0.6466, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.6331868171691895, "rewards/margins": 1.1649492979049683, "rewards/rejected": 2.4682371616363525, "step": 26680 }, { "epoch": 1.239147592738753, "grad_norm": 102.95645141601562, "learning_rate": 2.9359920763885667e-07, "logits/chosen": -18.44439697265625, "logits/rejected": -18.126901626586914, "logps/chosen": -342.4441833496094, "logps/rejected": -315.95330810546875, "loss": 0.6538, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.797801971435547, "rewards/margins": 0.5172263979911804, "rewards/rejected": 2.280575752258301, "step": 26690 }, { "epoch": 1.239611866846186, "grad_norm": 83.0148696899414, "learning_rate": 2.9352182862095113e-07, "logits/chosen": -19.133289337158203, "logits/rejected": -19.184968948364258, "logps/chosen": -359.3753662109375, "logps/rejected": -399.1068115234375, "loss": 0.6986, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.145970582962036, "rewards/margins": 0.6992451548576355, "rewards/rejected": 2.446725368499756, "step": 26700 }, { "epoch": 1.240076140953619, "grad_norm": 11.61478328704834, "learning_rate": 2.9344444960304564e-07, "logits/chosen": -19.567325592041016, "logits/rejected": -18.511646270751953, "logps/chosen": -399.76068115234375, "logps/rejected": -284.91827392578125, "loss": 0.4845, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.183368682861328, "rewards/margins": 1.8526802062988281, "rewards/rejected": 2.3306884765625, "step": 26710 }, { "epoch": 1.240540415061052, "grad_norm": 11.293585777282715, "learning_rate": 2.933670705851401e-07, "logits/chosen": -19.552974700927734, "logits/rejected": -19.116464614868164, "logps/chosen": -352.58148193359375, "logps/rejected": -327.563232421875, "loss": 
0.9447, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.294348955154419, "rewards/margins": 0.6229082942008972, "rewards/rejected": 2.671440601348877, "step": 26720 }, { "epoch": 1.2410046891684852, "grad_norm": 10.215328216552734, "learning_rate": 2.932896915672346e-07, "logits/chosen": -18.896366119384766, "logits/rejected": -18.13715171813965, "logps/chosen": -412.376220703125, "logps/rejected": -342.76239013671875, "loss": 0.5237, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.6967597007751465, "rewards/margins": 0.9187813997268677, "rewards/rejected": 1.7779783010482788, "step": 26730 }, { "epoch": 1.2414689632759182, "grad_norm": 4.2928972244262695, "learning_rate": 2.932123125493291e-07, "logits/chosen": -18.670194625854492, "logits/rejected": -17.409854888916016, "logps/chosen": -434.73309326171875, "logps/rejected": -300.1705627441406, "loss": 0.9428, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.103555679321289, "rewards/margins": 1.197101354598999, "rewards/rejected": 1.906454086303711, "step": 26740 }, { "epoch": 1.2419332373833512, "grad_norm": 77.066162109375, "learning_rate": 2.931349335314236e-07, "logits/chosen": -19.610280990600586, "logits/rejected": -18.43861198425293, "logps/chosen": -350.80096435546875, "logps/rejected": -281.44415283203125, "loss": 0.5376, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.416548490524292, "rewards/margins": 0.7826226353645325, "rewards/rejected": 2.633925437927246, "step": 26750 }, { "epoch": 1.2423975114907841, "grad_norm": 62.931217193603516, "learning_rate": 2.930575545135181e-07, "logits/chosen": -19.42432403564453, "logits/rejected": -19.374439239501953, "logps/chosen": -376.3748474121094, "logps/rejected": -363.3124084472656, "loss": 0.7505, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.836824417114258, "rewards/margins": 0.3437911868095398, "rewards/rejected": 2.4930331707000732, "step": 26760 }, { "epoch": 1.2428617855982171, 
"grad_norm": 68.88325500488281, "learning_rate": 2.929801754956126e-07, "logits/chosen": -18.784460067749023, "logits/rejected": -18.655012130737305, "logps/chosen": -348.3844299316406, "logps/rejected": -300.16058349609375, "loss": 0.619, "rewards/accuracies": 0.5, "rewards/chosen": 2.8397202491760254, "rewards/margins": 0.4261929392814636, "rewards/rejected": 2.413527488708496, "step": 26770 }, { "epoch": 1.2433260597056501, "grad_norm": 43.21292495727539, "learning_rate": 2.929027964777071e-07, "logits/chosen": -18.482519149780273, "logits/rejected": -18.330583572387695, "logps/chosen": -317.206298828125, "logps/rejected": -313.4041442871094, "loss": 0.8617, "rewards/accuracies": 0.5, "rewards/chosen": 2.8695647716522217, "rewards/margins": 0.2506332993507385, "rewards/rejected": 2.618931531906128, "step": 26780 }, { "epoch": 1.2437903338130831, "grad_norm": 33.814964294433594, "learning_rate": 2.928254174598016e-07, "logits/chosen": -18.180248260498047, "logits/rejected": -18.078733444213867, "logps/chosen": -392.44342041015625, "logps/rejected": -397.09246826171875, "loss": 1.2549, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.4998674392700195, "rewards/margins": 0.5109816789627075, "rewards/rejected": 3.9888863563537598, "step": 26790 }, { "epoch": 1.2442546079205163, "grad_norm": 16.320402145385742, "learning_rate": 2.927480384418961e-07, "logits/chosen": -19.460002899169922, "logits/rejected": -17.749488830566406, "logps/chosen": -350.520751953125, "logps/rejected": -231.1751251220703, "loss": 0.6012, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.263822078704834, "rewards/margins": 1.828249216079712, "rewards/rejected": 1.435572862625122, "step": 26800 }, { "epoch": 1.2447188820279493, "grad_norm": 19.858375549316406, "learning_rate": 2.9267065942399054e-07, "logits/chosen": -18.779720306396484, "logits/rejected": -17.52991485595703, "logps/chosen": -320.2150573730469, "logps/rejected": -251.45828247070312, "loss": 0.4385, 
"rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.8936374187469482, "rewards/margins": 1.0741994380950928, "rewards/rejected": 1.8194379806518555, "step": 26810 }, { "epoch": 1.2451831561353823, "grad_norm": 165.25830078125, "learning_rate": 2.9259328040608505e-07, "logits/chosen": -18.95620346069336, "logits/rejected": -18.813838958740234, "logps/chosen": -416.35772705078125, "logps/rejected": -368.969482421875, "loss": 0.7733, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.1188297271728516, "rewards/margins": 0.5496727824211121, "rewards/rejected": 2.569157123565674, "step": 26820 }, { "epoch": 1.2456474302428153, "grad_norm": 20.797990798950195, "learning_rate": 2.9251590138817956e-07, "logits/chosen": -18.626476287841797, "logits/rejected": -18.4246826171875, "logps/chosen": -346.08624267578125, "logps/rejected": -361.99114990234375, "loss": 0.6834, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.720417022705078, "rewards/margins": 0.6405372023582458, "rewards/rejected": 3.0798795223236084, "step": 26830 }, { "epoch": 1.2461117043502483, "grad_norm": 32.20197296142578, "learning_rate": 2.924385223702741e-07, "logits/chosen": -20.297380447387695, "logits/rejected": -19.400798797607422, "logps/chosen": -458.46160888671875, "logps/rejected": -447.473876953125, "loss": 0.4795, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.089094638824463, "rewards/margins": 0.7498382925987244, "rewards/rejected": 3.3392562866210938, "step": 26840 }, { "epoch": 1.2465759784576815, "grad_norm": 199.9432373046875, "learning_rate": 2.9236114335236853e-07, "logits/chosen": -18.68285369873047, "logits/rejected": -18.431194305419922, "logps/chosen": -380.2889099121094, "logps/rejected": -344.0389099121094, "loss": 1.0241, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.534090757369995, "rewards/margins": 0.30411773920059204, "rewards/rejected": 3.229973316192627, "step": 26850 }, { "epoch": 1.2470402525651145, 
"grad_norm": 97.92949676513672, "learning_rate": 2.9228376433446305e-07, "logits/chosen": -19.4412784576416, "logits/rejected": -18.91071319580078, "logps/chosen": -423.06781005859375, "logps/rejected": -320.4271240234375, "loss": 1.271, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.3774495124816895, "rewards/margins": 0.5028923749923706, "rewards/rejected": 2.8745570182800293, "step": 26860 }, { "epoch": 1.2475045266725475, "grad_norm": 25.965572357177734, "learning_rate": 2.9220638531655756e-07, "logits/chosen": -18.7664794921875, "logits/rejected": -18.26342010498047, "logps/chosen": -311.9223327636719, "logps/rejected": -259.58221435546875, "loss": 0.6892, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.7872166633605957, "rewards/margins": 0.5843631625175476, "rewards/rejected": 2.202853202819824, "step": 26870 }, { "epoch": 1.2479688007799805, "grad_norm": 52.19681167602539, "learning_rate": 2.9212900629865207e-07, "logits/chosen": -19.282230377197266, "logits/rejected": -18.345752716064453, "logps/chosen": -413.8069763183594, "logps/rejected": -239.8252410888672, "loss": 0.2402, "rewards/accuracies": 1.0, "rewards/chosen": 3.4452080726623535, "rewards/margins": 1.4739630222320557, "rewards/rejected": 1.9712451696395874, "step": 26880 }, { "epoch": 1.2484330748874135, "grad_norm": 192.96714782714844, "learning_rate": 2.920516272807466e-07, "logits/chosen": -19.3795108795166, "logits/rejected": -18.608386993408203, "logps/chosen": -362.23065185546875, "logps/rejected": -285.716796875, "loss": 0.9946, "rewards/accuracies": 0.5, "rewards/chosen": 3.221104860305786, "rewards/margins": 0.5659370422363281, "rewards/rejected": 2.655168056488037, "step": 26890 }, { "epoch": 1.2488973489948465, "grad_norm": 4.425607204437256, "learning_rate": 2.9197424826284104e-07, "logits/chosen": -18.197378158569336, "logits/rejected": -17.649354934692383, "logps/chosen": -332.7084655761719, "logps/rejected": -205.68643188476562, "loss": 0.6126, 
"rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.8946871757507324, "rewards/margins": 1.4241392612457275, "rewards/rejected": 1.4705479145050049, "step": 26900 }, { "epoch": 1.2493616231022795, "grad_norm": 106.41792297363281, "learning_rate": 2.918968692449355e-07, "logits/chosen": -19.972064971923828, "logits/rejected": -20.128379821777344, "logps/chosen": -441.84130859375, "logps/rejected": -424.909423828125, "loss": 0.6728, "rewards/accuracies": 0.5, "rewards/chosen": 4.573639392852783, "rewards/margins": 0.24906161427497864, "rewards/rejected": 4.324577808380127, "step": 26910 }, { "epoch": 1.2498258972097127, "grad_norm": 9.082710266113281, "learning_rate": 2.9181949022703e-07, "logits/chosen": -19.391386032104492, "logits/rejected": -17.818870544433594, "logps/chosen": -388.38934326171875, "logps/rejected": -259.64764404296875, "loss": 0.1977, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.1204352378845215, "rewards/margins": 2.368680715560913, "rewards/rejected": 0.7517539858818054, "step": 26920 }, { "epoch": 1.2502901713171457, "grad_norm": 111.15959930419922, "learning_rate": 2.917421112091245e-07, "logits/chosen": -18.663558959960938, "logits/rejected": -18.24994468688965, "logps/chosen": -403.0003967285156, "logps/rejected": -339.3382873535156, "loss": 0.5746, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.757146120071411, "rewards/margins": 0.39593955874443054, "rewards/rejected": 2.361206531524658, "step": 26930 }, { "epoch": 1.2507544454245787, "grad_norm": 0.5040281414985657, "learning_rate": 2.9166473219121903e-07, "logits/chosen": -19.28251075744629, "logits/rejected": -18.775833129882812, "logps/chosen": -403.9576416015625, "logps/rejected": -302.1742858886719, "loss": 0.4442, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.5822594165802, "rewards/margins": 1.1205666065216064, "rewards/rejected": 2.4616928100585938, "step": 26940 }, { "epoch": 1.2512187195320117, "grad_norm": 
122.25445556640625, "learning_rate": 2.915873531733135e-07, "logits/chosen": -18.110958099365234, "logits/rejected": -17.705846786499023, "logps/chosen": -264.04937744140625, "logps/rejected": -241.56332397460938, "loss": 0.5021, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.586824417114258, "rewards/margins": 0.7444068789482117, "rewards/rejected": 1.8424174785614014, "step": 26950 }, { "epoch": 1.2516829936394447, "grad_norm": 73.9061508178711, "learning_rate": 2.91509974155408e-07, "logits/chosen": -18.31307601928711, "logits/rejected": -17.710168838500977, "logps/chosen": -461.5755920410156, "logps/rejected": -361.6919860839844, "loss": 0.5208, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.570596694946289, "rewards/margins": 1.7808935642242432, "rewards/rejected": 2.789703369140625, "step": 26960 }, { "epoch": 1.2521472677468777, "grad_norm": 4.831273078918457, "learning_rate": 2.914325951375025e-07, "logits/chosen": -18.04055404663086, "logits/rejected": -17.502178192138672, "logps/chosen": -431.49969482421875, "logps/rejected": -368.2024841308594, "loss": 1.0131, "rewards/accuracies": 0.5, "rewards/chosen": 3.607440233230591, "rewards/margins": 0.5676687955856323, "rewards/rejected": 3.039771318435669, "step": 26970 }, { "epoch": 1.2526115418543107, "grad_norm": 104.83039093017578, "learning_rate": 2.91355216119597e-07, "logits/chosen": -19.304838180541992, "logits/rejected": -18.96822166442871, "logps/chosen": -347.93292236328125, "logps/rejected": -330.3548278808594, "loss": 0.5781, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6260809898376465, "rewards/margins": 0.5958632230758667, "rewards/rejected": 3.0302181243896484, "step": 26980 }, { "epoch": 1.2530758159617439, "grad_norm": 143.0410614013672, "learning_rate": 2.9127783710169153e-07, "logits/chosen": -18.829383850097656, "logits/rejected": -18.208110809326172, "logps/chosen": -414.636962890625, "logps/rejected": -271.4754638671875, "loss": 0.3979, 
"rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.591161012649536, "rewards/margins": 1.470954179763794, "rewards/rejected": 2.1202070713043213, "step": 26990 }, { "epoch": 1.2535400900691769, "grad_norm": 1.4231696128845215, "learning_rate": 2.9120045808378594e-07, "logits/chosen": -18.50485610961914, "logits/rejected": -18.11498260498047, "logps/chosen": -435.092529296875, "logps/rejected": -426.7325134277344, "loss": 0.8388, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.7566208839416504, "rewards/margins": 0.9695930480957031, "rewards/rejected": 2.7870280742645264, "step": 27000 }, { "epoch": 1.2540043641766099, "grad_norm": 0.9006716012954712, "learning_rate": 2.9112307906588045e-07, "logits/chosen": -18.343280792236328, "logits/rejected": -17.7319278717041, "logps/chosen": -379.97100830078125, "logps/rejected": -328.41827392578125, "loss": 0.7474, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.679003953933716, "rewards/margins": 0.7968004941940308, "rewards/rejected": 2.8822033405303955, "step": 27010 }, { "epoch": 1.2544686382840429, "grad_norm": 20.483352661132812, "learning_rate": 2.9104570004797496e-07, "logits/chosen": -17.994882583618164, "logits/rejected": -17.45942497253418, "logps/chosen": -273.3011169433594, "logps/rejected": -257.576171875, "loss": 0.8612, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.5408554077148438, "rewards/margins": 0.7358251810073853, "rewards/rejected": 1.8050302267074585, "step": 27020 }, { "epoch": 1.2549329123914759, "grad_norm": 111.69686126708984, "learning_rate": 2.9096832103006947e-07, "logits/chosen": -18.90863609313965, "logits/rejected": -18.66872215270996, "logps/chosen": -397.58856201171875, "logps/rejected": -354.77020263671875, "loss": 0.5519, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.6812126636505127, "rewards/margins": 0.8715624809265137, "rewards/rejected": 2.80964994430542, "step": 27030 }, { "epoch": 1.255397186498909, "grad_norm": 
87.76336669921875, "learning_rate": 2.90890942012164e-07, "logits/chosen": -18.57949447631836, "logits/rejected": -18.876834869384766, "logps/chosen": -384.32861328125, "logps/rejected": -314.04095458984375, "loss": 1.1145, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.8193323612213135, "rewards/margins": 0.37426668405532837, "rewards/rejected": 2.44506573677063, "step": 27040 }, { "epoch": 1.2558614606063419, "grad_norm": 18.328943252563477, "learning_rate": 2.9081356299425844e-07, "logits/chosen": -18.424840927124023, "logits/rejected": -17.44990348815918, "logps/chosen": -305.5111999511719, "logps/rejected": -250.9722137451172, "loss": 0.4715, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.615351915359497, "rewards/margins": 1.6539669036865234, "rewards/rejected": 1.9613850116729736, "step": 27050 }, { "epoch": 1.256325734713775, "grad_norm": 137.64964294433594, "learning_rate": 2.9073618397635295e-07, "logits/chosen": -18.55792999267578, "logits/rejected": -17.49740219116211, "logps/chosen": -395.20794677734375, "logps/rejected": -292.8785400390625, "loss": 0.5266, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.57189679145813, "rewards/margins": 1.4002310037612915, "rewards/rejected": 2.171665668487549, "step": 27060 }, { "epoch": 1.256790008821208, "grad_norm": 1.2160564661026, "learning_rate": 2.9065880495844746e-07, "logits/chosen": -19.57833480834961, "logits/rejected": -19.37435531616211, "logps/chosen": -486.8187561035156, "logps/rejected": -477.90704345703125, "loss": 0.7397, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 4.303990840911865, "rewards/margins": 0.5072150230407715, "rewards/rejected": 3.7967758178710938, "step": 27070 }, { "epoch": 1.257254282928641, "grad_norm": 90.28077697753906, "learning_rate": 2.90581425940542e-07, "logits/chosen": -19.114330291748047, "logits/rejected": -18.12626075744629, "logps/chosen": -616.4824829101562, "logps/rejected": -456.8228454589844, "loss": 0.6713, 
"rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.801448822021484, "rewards/margins": 0.9219358563423157, "rewards/rejected": 3.8795127868652344, "step": 27080 }, { "epoch": 1.257718557036074, "grad_norm": 189.8494415283203, "learning_rate": 2.905040469226365e-07, "logits/chosen": -18.428340911865234, "logits/rejected": -18.676406860351562, "logps/chosen": -445.7310485839844, "logps/rejected": -384.4435119628906, "loss": 1.0957, "rewards/accuracies": 0.5, "rewards/chosen": 3.275573253631592, "rewards/margins": 0.017176473513245583, "rewards/rejected": 3.258396863937378, "step": 27090 }, { "epoch": 1.258182831143507, "grad_norm": 127.99202728271484, "learning_rate": 2.904266679047309e-07, "logits/chosen": -18.117374420166016, "logits/rejected": -17.395002365112305, "logps/chosen": -366.17877197265625, "logps/rejected": -317.38739013671875, "loss": 0.5433, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.207104444503784, "rewards/margins": 1.6367744207382202, "rewards/rejected": 1.5703301429748535, "step": 27100 }, { "epoch": 1.2586471052509403, "grad_norm": 28.91310691833496, "learning_rate": 2.903492888868254e-07, "logits/chosen": -20.00815200805664, "logits/rejected": -19.639137268066406, "logps/chosen": -367.95123291015625, "logps/rejected": -302.95770263671875, "loss": 0.7834, "rewards/accuracies": 0.5, "rewards/chosen": 2.5580711364746094, "rewards/margins": 0.47604984045028687, "rewards/rejected": 2.0820212364196777, "step": 27110 }, { "epoch": 1.2591113793583733, "grad_norm": 84.26869201660156, "learning_rate": 2.902719098689199e-07, "logits/chosen": -20.300701141357422, "logits/rejected": -19.440692901611328, "logps/chosen": -346.24066162109375, "logps/rejected": -300.41278076171875, "loss": 0.5223, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.507387161254883, "rewards/margins": 0.9860929250717163, "rewards/rejected": 2.521294355392456, "step": 27120 }, { "epoch": 1.2595756534658062, "grad_norm": 11.19141674041748, 
"learning_rate": 2.9019453085101443e-07, "logits/chosen": -19.222225189208984, "logits/rejected": -18.3746280670166, "logps/chosen": -362.6253662109375, "logps/rejected": -292.7347412109375, "loss": 0.6052, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.203113079071045, "rewards/margins": 0.8038334846496582, "rewards/rejected": 2.3992795944213867, "step": 27130 }, { "epoch": 1.2600399275732392, "grad_norm": 39.6735725402832, "learning_rate": 2.9011715183310894e-07, "logits/chosen": -18.21824836730957, "logits/rejected": -17.859729766845703, "logps/chosen": -393.1560974121094, "logps/rejected": -366.65863037109375, "loss": 0.5937, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.839578628540039, "rewards/margins": 1.263628363609314, "rewards/rejected": 2.5759501457214355, "step": 27140 }, { "epoch": 1.2605042016806722, "grad_norm": 3.1167056560516357, "learning_rate": 2.900397728152034e-07, "logits/chosen": -18.64071273803711, "logits/rejected": -18.663009643554688, "logps/chosen": -407.59906005859375, "logps/rejected": -475.8409729003906, "loss": 1.1147, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.4079155921936035, "rewards/margins": 0.30010324716567993, "rewards/rejected": 3.1078126430511475, "step": 27150 }, { "epoch": 1.2609684757881052, "grad_norm": 35.627952575683594, "learning_rate": 2.899623937972979e-07, "logits/chosen": -18.720548629760742, "logits/rejected": -18.694501876831055, "logps/chosen": -343.77459716796875, "logps/rejected": -357.3562927246094, "loss": 1.4531, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.4998373985290527, "rewards/margins": -0.5040944814682007, "rewards/rejected": 3.003931760787964, "step": 27160 }, { "epoch": 1.2614327498955382, "grad_norm": 15.615818977355957, "learning_rate": 2.898850147793924e-07, "logits/chosen": -19.70627212524414, "logits/rejected": -18.235118865966797, "logps/chosen": -354.2669372558594, "logps/rejected": -246.37777709960938, "loss": 0.7577, 
"rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.7181496620178223, "rewards/margins": 1.1999324560165405, "rewards/rejected": 1.5182174444198608, "step": 27170 }, { "epoch": 1.2618970240029714, "grad_norm": 74.86214447021484, "learning_rate": 2.8980763576148693e-07, "logits/chosen": -19.717267990112305, "logits/rejected": -19.03134536743164, "logps/chosen": -368.57550048828125, "logps/rejected": -305.59039306640625, "loss": 0.6519, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.5019431114196777, "rewards/margins": 0.9784120321273804, "rewards/rejected": 2.523530960083008, "step": 27180 }, { "epoch": 1.2623612981104044, "grad_norm": 0.5613347887992859, "learning_rate": 2.8973025674358144e-07, "logits/chosen": -18.72085189819336, "logits/rejected": -18.738689422607422, "logps/chosen": -409.93646240234375, "logps/rejected": -362.25775146484375, "loss": 1.4471, "rewards/accuracies": 0.5, "rewards/chosen": 3.2847423553466797, "rewards/margins": -0.20590989291667938, "rewards/rejected": 3.490652084350586, "step": 27190 }, { "epoch": 1.2628255722178374, "grad_norm": 19.129657745361328, "learning_rate": 2.8965287772567585e-07, "logits/chosen": -19.077314376831055, "logits/rejected": -18.720504760742188, "logps/chosen": -430.5540466308594, "logps/rejected": -345.69671630859375, "loss": 0.8872, "rewards/accuracies": 0.5, "rewards/chosen": 3.780203342437744, "rewards/margins": 0.7391443252563477, "rewards/rejected": 3.0410587787628174, "step": 27200 }, { "epoch": 1.2632898463252704, "grad_norm": 131.13983154296875, "learning_rate": 2.8957549870777036e-07, "logits/chosen": -18.1549072265625, "logits/rejected": -18.08176040649414, "logps/chosen": -428.05322265625, "logps/rejected": -492.53863525390625, "loss": 1.1067, "rewards/accuracies": 0.5, "rewards/chosen": 3.535876512527466, "rewards/margins": -0.22393150627613068, "rewards/rejected": 3.759808301925659, "step": 27210 }, { "epoch": 1.2637541204327034, "grad_norm": 0.5592687129974365, 
"learning_rate": 2.8949811968986487e-07, "logits/chosen": -19.510906219482422, "logits/rejected": -19.107580184936523, "logps/chosen": -479.78973388671875, "logps/rejected": -402.02618408203125, "loss": 0.9124, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.014876365661621, "rewards/margins": 1.248763918876648, "rewards/rejected": 2.7661125659942627, "step": 27220 }, { "epoch": 1.2642183945401366, "grad_norm": 22.194438934326172, "learning_rate": 2.894207406719594e-07, "logits/chosen": -19.655887603759766, "logits/rejected": -18.15325164794922, "logps/chosen": -436.58172607421875, "logps/rejected": -292.3600158691406, "loss": 0.6773, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.4043164253234863, "rewards/margins": 1.5496230125427246, "rewards/rejected": 1.8546931743621826, "step": 27230 }, { "epoch": 1.2646826686475694, "grad_norm": 39.384490966796875, "learning_rate": 2.893433616540539e-07, "logits/chosen": -20.266958236694336, "logits/rejected": -20.14166831970215, "logps/chosen": -472.37530517578125, "logps/rejected": -456.892578125, "loss": 0.7692, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.529188632965088, "rewards/margins": 0.25901827216148376, "rewards/rejected": 3.2701706886291504, "step": 27240 }, { "epoch": 1.2651469427550026, "grad_norm": 82.48239135742188, "learning_rate": 2.8926598263614835e-07, "logits/chosen": -17.622379302978516, "logits/rejected": -17.343542098999023, "logps/chosen": -344.7496032714844, "logps/rejected": -286.50018310546875, "loss": 0.6006, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.479437828063965, "rewards/margins": 1.0425339937210083, "rewards/rejected": 1.4369040727615356, "step": 27250 }, { "epoch": 1.2656112168624356, "grad_norm": 61.635711669921875, "learning_rate": 2.8918860361824286e-07, "logits/chosen": -17.89034652709961, "logits/rejected": -17.050823211669922, "logps/chosen": -358.27752685546875, "logps/rejected": -239.5846710205078, "loss": 0.6233, 
"rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.708080291748047, "rewards/margins": 1.5365997552871704, "rewards/rejected": 1.1714805364608765, "step": 27260 }, { "epoch": 1.2660754909698686, "grad_norm": 43.650081634521484, "learning_rate": 2.8911122460033737e-07, "logits/chosen": -19.118040084838867, "logits/rejected": -18.829914093017578, "logps/chosen": -449.83740234375, "logps/rejected": -363.70050048828125, "loss": 0.7017, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.9429707527160645, "rewards/margins": 0.8846727609634399, "rewards/rejected": 3.058297872543335, "step": 27270 }, { "epoch": 1.2665397650773016, "grad_norm": 42.21326446533203, "learning_rate": 2.890338455824319e-07, "logits/chosen": -19.11640739440918, "logits/rejected": -18.509546279907227, "logps/chosen": -424.07861328125, "logps/rejected": -274.8019714355469, "loss": 0.6181, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.039679527282715, "rewards/margins": 1.1960134506225586, "rewards/rejected": 2.8436665534973145, "step": 27280 }, { "epoch": 1.2670040391847346, "grad_norm": 53.392799377441406, "learning_rate": 2.8895646656452634e-07, "logits/chosen": -18.046314239501953, "logits/rejected": -17.47174072265625, "logps/chosen": -306.47802734375, "logps/rejected": -262.8497619628906, "loss": 0.7137, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0182690620422363, "rewards/margins": 1.4006415605545044, "rewards/rejected": 1.617627501487732, "step": 27290 }, { "epoch": 1.2674683132921678, "grad_norm": 217.48583984375, "learning_rate": 2.888790875466208e-07, "logits/chosen": -18.014726638793945, "logits/rejected": -17.380496978759766, "logps/chosen": -457.46923828125, "logps/rejected": -378.3451232910156, "loss": 0.8075, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.2721357345581055, "rewards/margins": 1.3147432804107666, "rewards/rejected": 2.957392454147339, "step": 27300 }, { "epoch": 1.2679325873996008, "grad_norm": 
19.41179656982422, "learning_rate": 2.888017085287153e-07, "logits/chosen": -19.697439193725586, "logits/rejected": -18.984411239624023, "logps/chosen": -396.6146240234375, "logps/rejected": -294.60125732421875, "loss": 0.4742, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.9699673652648926, "rewards/margins": 0.7376429438591003, "rewards/rejected": 2.2323241233825684, "step": 27310 }, { "epoch": 1.2683968615070338, "grad_norm": 40.91877365112305, "learning_rate": 2.887243295108098e-07, "logits/chosen": -17.96846580505371, "logits/rejected": -17.991024017333984, "logps/chosen": -454.0774841308594, "logps/rejected": -497.90447998046875, "loss": 0.8021, "rewards/accuracies": 0.5, "rewards/chosen": 4.271130561828613, "rewards/margins": 0.6496976613998413, "rewards/rejected": 3.6214332580566406, "step": 27320 }, { "epoch": 1.2688611356144668, "grad_norm": 0.027810778468847275, "learning_rate": 2.8864695049290434e-07, "logits/chosen": -18.410120010375977, "logits/rejected": -17.708436965942383, "logps/chosen": -376.4032287597656, "logps/rejected": -284.6012268066406, "loss": 0.3679, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.9442615509033203, "rewards/margins": 1.7731157541275024, "rewards/rejected": 2.1711459159851074, "step": 27330 }, { "epoch": 1.2693254097218998, "grad_norm": 0.08366917818784714, "learning_rate": 2.8856957147499885e-07, "logits/chosen": -18.464689254760742, "logits/rejected": -17.69268035888672, "logps/chosen": -355.57830810546875, "logps/rejected": -271.79498291015625, "loss": 0.5492, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.3018875122070312, "rewards/margins": 1.9392093420028687, "rewards/rejected": 1.3626782894134521, "step": 27340 }, { "epoch": 1.2697896838293328, "grad_norm": 0.4475499093532562, "learning_rate": 2.884921924570933e-07, "logits/chosen": -17.617311477661133, "logits/rejected": -18.11642837524414, "logps/chosen": -393.30926513671875, "logps/rejected": -412.4369201660156, "loss": 
1.5675, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.3126213550567627, "rewards/margins": 0.0008338451152667403, "rewards/rejected": 3.3117873668670654, "step": 27350 }, { "epoch": 1.2702539579367658, "grad_norm": 4.6496100425720215, "learning_rate": 2.884148134391878e-07, "logits/chosen": -18.587295532226562, "logits/rejected": -17.663480758666992, "logps/chosen": -425.3780212402344, "logps/rejected": -340.0162353515625, "loss": 0.5654, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.825291872024536, "rewards/margins": 1.0119147300720215, "rewards/rejected": 2.8133769035339355, "step": 27360 }, { "epoch": 1.270718232044199, "grad_norm": 131.36416625976562, "learning_rate": 2.8833743442128233e-07, "logits/chosen": -19.454254150390625, "logits/rejected": -18.744144439697266, "logps/chosen": -498.63427734375, "logps/rejected": -441.05169677734375, "loss": 0.9216, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.7829813957214355, "rewards/margins": 0.8403825759887695, "rewards/rejected": 2.942598819732666, "step": 27370 }, { "epoch": 1.271182506151632, "grad_norm": 15.149130821228027, "learning_rate": 2.8826005540337684e-07, "logits/chosen": -18.145614624023438, "logits/rejected": -17.64613914489746, "logps/chosen": -301.2220458984375, "logps/rejected": -210.9737548828125, "loss": 1.0691, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.4833757877349854, "rewards/margins": 0.6930997967720032, "rewards/rejected": 1.7902758121490479, "step": 27380 }, { "epoch": 1.271646780259065, "grad_norm": 35.49338912963867, "learning_rate": 2.881826763854713e-07, "logits/chosen": -19.513877868652344, "logits/rejected": -18.093563079833984, "logps/chosen": -493.3116149902344, "logps/rejected": -308.5742492675781, "loss": 0.315, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.3665874004364014, "rewards/margins": 1.8267379999160767, "rewards/rejected": 1.539849042892456, "step": 27390 }, { "epoch": 1.272111054366498, 
"grad_norm": 138.24636840820312, "learning_rate": 2.8810529736756576e-07, "logits/chosen": -18.31460189819336, "logits/rejected": -17.668609619140625, "logps/chosen": -332.2079162597656, "logps/rejected": -301.63897705078125, "loss": 0.9194, "rewards/accuracies": 0.5, "rewards/chosen": 3.1469247341156006, "rewards/margins": 0.3539625108242035, "rewards/rejected": 2.7929623126983643, "step": 27400 }, { "epoch": 1.272575328473931, "grad_norm": 21.950523376464844, "learning_rate": 2.8802791834966027e-07, "logits/chosen": -18.152048110961914, "logits/rejected": -17.656883239746094, "logps/chosen": -347.7774963378906, "logps/rejected": -288.53094482421875, "loss": 0.5794, "rewards/accuracies": 0.5, "rewards/chosen": 2.6572136878967285, "rewards/margins": 0.9744043350219727, "rewards/rejected": 1.6828092336654663, "step": 27410 }, { "epoch": 1.2730396025813642, "grad_norm": 14.233609199523926, "learning_rate": 2.879505393317548e-07, "logits/chosen": -19.5634822845459, "logits/rejected": -17.962289810180664, "logps/chosen": -412.316162109375, "logps/rejected": -261.678955078125, "loss": 0.4881, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.3745341300964355, "rewards/margins": 2.0043773651123047, "rewards/rejected": 1.3701571226119995, "step": 27420 }, { "epoch": 1.273503876688797, "grad_norm": 20.557403564453125, "learning_rate": 2.878731603138493e-07, "logits/chosen": -18.73179054260254, "logits/rejected": -19.018217086791992, "logps/chosen": -446.71929931640625, "logps/rejected": -453.79730224609375, "loss": 1.131, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 3.7022061347961426, "rewards/margins": 0.13381877541542053, "rewards/rejected": 3.568387269973755, "step": 27430 }, { "epoch": 1.2739681507962302, "grad_norm": 123.94729614257812, "learning_rate": 2.877957812959438e-07, "logits/chosen": -18.97665786743164, "logits/rejected": -18.47859764099121, "logps/chosen": -309.60162353515625, "logps/rejected": -294.36517333984375, "loss": 0.9298, 
"rewards/accuracies": 0.5, "rewards/chosen": 3.040364980697632, "rewards/margins": 0.8092666864395142, "rewards/rejected": 2.2310986518859863, "step": 27440 }, { "epoch": 1.2744324249036632, "grad_norm": 40.607810974121094, "learning_rate": 2.877184022780383e-07, "logits/chosen": -18.367595672607422, "logits/rejected": -18.555559158325195, "logps/chosen": -422.56646728515625, "logps/rejected": -303.3612365722656, "loss": 0.5441, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.831177234649658, "rewards/margins": 1.1471662521362305, "rewards/rejected": 2.6840109825134277, "step": 27450 }, { "epoch": 1.2748966990110961, "grad_norm": 86.82665252685547, "learning_rate": 2.8764102326013277e-07, "logits/chosen": -19.793901443481445, "logits/rejected": -18.735958099365234, "logps/chosen": -405.30096435546875, "logps/rejected": -336.5980529785156, "loss": 0.8986, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.7180514335632324, "rewards/margins": 1.040470838546753, "rewards/rejected": 2.6775805950164795, "step": 27460 }, { "epoch": 1.2753609731185291, "grad_norm": 78.2398452758789, "learning_rate": 2.875636442422273e-07, "logits/chosen": -18.074539184570312, "logits/rejected": -17.794723510742188, "logps/chosen": -299.17315673828125, "logps/rejected": -265.7069396972656, "loss": 0.7873, "rewards/accuracies": 0.5, "rewards/chosen": 1.6925290822982788, "rewards/margins": 0.6969471573829651, "rewards/rejected": 0.995582103729248, "step": 27470 }, { "epoch": 1.2758252472259621, "grad_norm": 58.99146270751953, "learning_rate": 2.8748626522432174e-07, "logits/chosen": -18.14592933654785, "logits/rejected": -18.361141204833984, "logps/chosen": -336.2247009277344, "logps/rejected": -332.71826171875, "loss": 0.8491, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.018572807312012, "rewards/margins": 0.48364749550819397, "rewards/rejected": 3.5349249839782715, "step": 27480 }, { "epoch": 1.2762895213333953, "grad_norm": 205.4014892578125, 
"learning_rate": 2.8740888620641625e-07, "logits/chosen": -19.90970230102539, "logits/rejected": -19.447776794433594, "logps/chosen": -409.56317138671875, "logps/rejected": -368.2301330566406, "loss": 0.9914, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.5887343883514404, "rewards/margins": 0.5806227922439575, "rewards/rejected": 3.0081119537353516, "step": 27490 }, { "epoch": 1.2767537954408283, "grad_norm": 0.47830283641815186, "learning_rate": 2.873315071885107e-07, "logits/chosen": -19.340721130371094, "logits/rejected": -18.370677947998047, "logps/chosen": -415.68963623046875, "logps/rejected": -315.18975830078125, "loss": 0.6881, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.529257297515869, "rewards/margins": 1.3861294984817505, "rewards/rejected": 3.143127679824829, "step": 27500 }, { "epoch": 1.2772180695482613, "grad_norm": 105.44830322265625, "learning_rate": 2.872541281706052e-07, "logits/chosen": -19.307430267333984, "logits/rejected": -18.750011444091797, "logps/chosen": -512.707275390625, "logps/rejected": -463.6895446777344, "loss": 0.3329, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.080880165100098, "rewards/margins": 1.4725556373596191, "rewards/rejected": 2.6083240509033203, "step": 27510 }, { "epoch": 1.2776823436556943, "grad_norm": 258.4903564453125, "learning_rate": 2.8717674915269973e-07, "logits/chosen": -18.785139083862305, "logits/rejected": -19.12668800354004, "logps/chosen": -352.265869140625, "logps/rejected": -320.52105712890625, "loss": 1.1928, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.452646255493164, "rewards/margins": -0.28785762190818787, "rewards/rejected": 3.740504503250122, "step": 27520 }, { "epoch": 1.2781466177631273, "grad_norm": 46.275962829589844, "learning_rate": 2.8709937013479424e-07, "logits/chosen": -17.824460983276367, "logits/rejected": -16.650060653686523, "logps/chosen": -365.42877197265625, "logps/rejected": -223.0095672607422, "loss": 0.2503, 
"rewards/accuracies": 1.0, "rewards/chosen": 2.6900553703308105, "rewards/margins": 2.107135772705078, "rewards/rejected": 0.5829194188117981, "step": 27530 }, { "epoch": 1.2786108918705603, "grad_norm": 2.6715245246887207, "learning_rate": 2.8702199111688875e-07, "logits/chosen": -18.40892791748047, "logits/rejected": -17.52817726135254, "logps/chosen": -561.6234130859375, "logps/rejected": -382.33331298828125, "loss": 0.5958, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.520235538482666, "rewards/margins": 1.5884249210357666, "rewards/rejected": 2.9318106174468994, "step": 27540 }, { "epoch": 1.2790751659779933, "grad_norm": 3.4792287349700928, "learning_rate": 2.8694461209898327e-07, "logits/chosen": -18.428173065185547, "logits/rejected": -17.714582443237305, "logps/chosen": -390.5416259765625, "logps/rejected": -296.3145751953125, "loss": 0.8364, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4507675170898438, "rewards/margins": 1.1269221305847168, "rewards/rejected": 2.3238449096679688, "step": 27550 }, { "epoch": 1.2795394400854265, "grad_norm": 98.17758178710938, "learning_rate": 2.868672330810777e-07, "logits/chosen": -18.651336669921875, "logits/rejected": -18.474401473999023, "logps/chosen": -460.77716064453125, "logps/rejected": -377.32720947265625, "loss": 0.4081, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.696820259094238, "rewards/margins": 1.076480507850647, "rewards/rejected": 3.620339870452881, "step": 27560 }, { "epoch": 1.2800037141928595, "grad_norm": 15.441662788391113, "learning_rate": 2.8678985406317224e-07, "logits/chosen": -20.246938705444336, "logits/rejected": -18.845489501953125, "logps/chosen": -476.48370361328125, "logps/rejected": -363.32666015625, "loss": 0.5789, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.121315002441406, "rewards/margins": 1.1599280834197998, "rewards/rejected": 2.961386203765869, "step": 27570 }, { "epoch": 1.2804679883002925, "grad_norm": 
34.685546875, "learning_rate": 2.867124750452667e-07, "logits/chosen": -19.309158325195312, "logits/rejected": -19.037084579467773, "logps/chosen": -346.07464599609375, "logps/rejected": -390.5585632324219, "loss": 0.3629, "rewards/accuracies": 1.0, "rewards/chosen": 3.469670534133911, "rewards/margins": 0.9878959655761719, "rewards/rejected": 2.4817748069763184, "step": 27580 }, { "epoch": 1.2809322624077255, "grad_norm": 27.462156295776367, "learning_rate": 2.866350960273612e-07, "logits/chosen": -18.641014099121094, "logits/rejected": -17.982580184936523, "logps/chosen": -401.8086853027344, "logps/rejected": -354.64300537109375, "loss": 0.3538, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.368494749069214, "rewards/margins": 1.1906603574752808, "rewards/rejected": 2.1778340339660645, "step": 27590 }, { "epoch": 1.2813965365151585, "grad_norm": 66.81890106201172, "learning_rate": 2.865577170094557e-07, "logits/chosen": -20.257413864135742, "logits/rejected": -19.951904296875, "logps/chosen": -502.96453857421875, "logps/rejected": -462.9393615722656, "loss": 0.5019, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.938786745071411, "rewards/margins": 1.282246470451355, "rewards/rejected": 2.6565401554107666, "step": 27600 }, { "epoch": 1.2818608106225915, "grad_norm": 41.99913024902344, "learning_rate": 2.864803379915502e-07, "logits/chosen": -18.865680694580078, "logits/rejected": -18.260501861572266, "logps/chosen": -426.05487060546875, "logps/rejected": -376.42266845703125, "loss": 0.2311, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.1784796714782715, "rewards/margins": 1.8054583072662354, "rewards/rejected": 2.373021364212036, "step": 27610 }, { "epoch": 1.2823250847300245, "grad_norm": 23.957279205322266, "learning_rate": 2.864029589736447e-07, "logits/chosen": -19.525035858154297, "logits/rejected": -18.88985824584961, "logps/chosen": -327.231689453125, "logps/rejected": -246.72647094726562, "loss": 0.7407, 
"rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.537527322769165, "rewards/margins": 1.0091603994369507, "rewards/rejected": 1.5283668041229248, "step": 27620 }, { "epoch": 1.2827893588374577, "grad_norm": 0.6229042410850525, "learning_rate": 2.863255799557392e-07, "logits/chosen": -18.293201446533203, "logits/rejected": -17.51839828491211, "logps/chosen": -338.2586975097656, "logps/rejected": -289.69818115234375, "loss": 0.8832, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2192893028259277, "rewards/margins": 1.1587164402008057, "rewards/rejected": 2.060572862625122, "step": 27630 }, { "epoch": 1.2832536329448907, "grad_norm": 58.65909194946289, "learning_rate": 2.862482009378337e-07, "logits/chosen": -19.026573181152344, "logits/rejected": -18.135984420776367, "logps/chosen": -363.0325012207031, "logps/rejected": -206.2690887451172, "loss": 0.4533, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.9459564685821533, "rewards/margins": 1.9728155136108398, "rewards/rejected": 1.9731409549713135, "step": 27640 }, { "epoch": 1.2837179070523237, "grad_norm": 73.36380767822266, "learning_rate": 2.861708219199282e-07, "logits/chosen": -18.466218948364258, "logits/rejected": -17.977359771728516, "logps/chosen": -358.38677978515625, "logps/rejected": -354.1697692871094, "loss": 0.7585, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2094573974609375, "rewards/margins": 0.3277300298213959, "rewards/rejected": 2.8817272186279297, "step": 27650 }, { "epoch": 1.2841821811597567, "grad_norm": 131.4878692626953, "learning_rate": 2.860934429020227e-07, "logits/chosen": -18.549753189086914, "logits/rejected": -18.389362335205078, "logps/chosen": -483.2035217285156, "logps/rejected": -469.2431640625, "loss": 1.377, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 3.6794590950012207, "rewards/margins": -0.3956959843635559, "rewards/rejected": 4.075155258178711, "step": 27660 }, { "epoch": 1.2846464552671897, 
"grad_norm": 206.77151489257812, "learning_rate": 2.8601606388411714e-07, "logits/chosen": -19.28249740600586, "logits/rejected": -18.150615692138672, "logps/chosen": -417.9624938964844, "logps/rejected": -315.6953430175781, "loss": 0.4243, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.217609882354736, "rewards/margins": 1.726017951965332, "rewards/rejected": 2.491591453552246, "step": 27670 }, { "epoch": 1.285110729374623, "grad_norm": 84.2789535522461, "learning_rate": 2.8593868486621165e-07, "logits/chosen": -18.108144760131836, "logits/rejected": -16.72930145263672, "logps/chosen": -382.0052795410156, "logps/rejected": -314.1378173828125, "loss": 0.784, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.4816536903381348, "rewards/margins": 0.7232129573822021, "rewards/rejected": 1.7584407329559326, "step": 27680 }, { "epoch": 1.2855750034820557, "grad_norm": 321.9184265136719, "learning_rate": 2.8586130584830616e-07, "logits/chosen": -18.152446746826172, "logits/rejected": -18.18802833557129, "logps/chosen": -322.27081298828125, "logps/rejected": -477.86578369140625, "loss": 1.4674, "rewards/accuracies": 0.5, "rewards/chosen": 3.3595480918884277, "rewards/margins": -0.03984532505273819, "rewards/rejected": 3.3993937969207764, "step": 27690 }, { "epoch": 1.2860392775894889, "grad_norm": 220.33657836914062, "learning_rate": 2.8578392683040067e-07, "logits/chosen": -19.311710357666016, "logits/rejected": -19.203556060791016, "logps/chosen": -440.834716796875, "logps/rejected": -425.5935974121094, "loss": 0.8307, "rewards/accuracies": 0.5, "rewards/chosen": 3.5139613151550293, "rewards/margins": 0.3357817530632019, "rewards/rejected": 3.178179979324341, "step": 27700 }, { "epoch": 1.2865035516969219, "grad_norm": 43.6501350402832, "learning_rate": 2.8570654781249513e-07, "logits/chosen": -19.395971298217773, "logits/rejected": -18.516849517822266, "logps/chosen": -369.22674560546875, "logps/rejected": -286.8101501464844, "loss": 0.6377, 
"rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.385194778442383, "rewards/margins": 1.016358733177185, "rewards/rejected": 2.3688364028930664, "step": 27710 }, { "epoch": 1.2869678258043549, "grad_norm": 194.6470489501953, "learning_rate": 2.8562916879458964e-07, "logits/chosen": -18.462980270385742, "logits/rejected": -18.663894653320312, "logps/chosen": -355.67034912109375, "logps/rejected": -358.59967041015625, "loss": 0.824, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.808171510696411, "rewards/margins": 0.8692243695259094, "rewards/rejected": 2.9389472007751465, "step": 27720 }, { "epoch": 1.2874320999117879, "grad_norm": 140.9870147705078, "learning_rate": 2.8555178977668415e-07, "logits/chosen": -20.114547729492188, "logits/rejected": -17.890750885009766, "logps/chosen": -468.2057189941406, "logps/rejected": -294.10394287109375, "loss": 0.494, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.295923233032227, "rewards/margins": 1.5358999967575073, "rewards/rejected": 2.760023593902588, "step": 27730 }, { "epoch": 1.2878963740192209, "grad_norm": 0.3306210935115814, "learning_rate": 2.8547441075877866e-07, "logits/chosen": -18.631980895996094, "logits/rejected": -17.857501983642578, "logps/chosen": -412.575927734375, "logps/rejected": -330.7405700683594, "loss": 0.6904, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.557556629180908, "rewards/margins": 0.9849990606307983, "rewards/rejected": 2.5725574493408203, "step": 27740 }, { "epoch": 1.288360648126654, "grad_norm": 21.605695724487305, "learning_rate": 2.853970317408732e-07, "logits/chosen": -19.146589279174805, "logits/rejected": -19.295724868774414, "logps/chosen": -371.8199768066406, "logps/rejected": -336.0099182128906, "loss": 0.3409, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.7016048431396484, "rewards/margins": 1.417496681213379, "rewards/rejected": 2.2841086387634277, "step": 27750 }, { "epoch": 1.288824922234087, "grad_norm": 
0.06149211898446083, "learning_rate": 2.8531965272296763e-07, "logits/chosen": -18.722339630126953, "logits/rejected": -17.508617401123047, "logps/chosen": -363.8673095703125, "logps/rejected": -249.0923309326172, "loss": 0.6727, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.394738674163818, "rewards/margins": 2.8805575370788574, "rewards/rejected": 1.5141814947128296, "step": 27760 }, { "epoch": 1.28928919634152, "grad_norm": 6.164584636688232, "learning_rate": 2.852422737050621e-07, "logits/chosen": -19.198753356933594, "logits/rejected": -17.891733169555664, "logps/chosen": -404.3812255859375, "logps/rejected": -348.30255126953125, "loss": 0.6379, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.151464939117432, "rewards/margins": 1.5179475545883179, "rewards/rejected": 2.633516788482666, "step": 27770 }, { "epoch": 1.289753470448953, "grad_norm": 209.2349395751953, "learning_rate": 2.851648946871566e-07, "logits/chosen": -18.527721405029297, "logits/rejected": -17.14215087890625, "logps/chosen": -345.2257995605469, "logps/rejected": -244.3804168701172, "loss": 0.5579, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1036171913146973, "rewards/margins": 2.042457103729248, "rewards/rejected": 1.0611598491668701, "step": 27780 }, { "epoch": 1.290217744556386, "grad_norm": 0.3500976264476776, "learning_rate": 2.850875156692511e-07, "logits/chosen": -19.07610511779785, "logits/rejected": -17.451473236083984, "logps/chosen": -491.73846435546875, "logps/rejected": -281.45172119140625, "loss": 0.45, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.686711311340332, "rewards/margins": 2.338714122772217, "rewards/rejected": 2.3479971885681152, "step": 27790 }, { "epoch": 1.290682018663819, "grad_norm": 67.84487915039062, "learning_rate": 2.850101366513456e-07, "logits/chosen": -18.609933853149414, "logits/rejected": -17.560293197631836, "logps/chosen": -379.10272216796875, "logps/rejected": -282.07196044921875, "loss": 
0.5391, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.0966222286224365, "rewards/margins": 1.1858835220336914, "rewards/rejected": 1.9107387065887451, "step": 27800 }, { "epoch": 1.291146292771252, "grad_norm": 1.2213447093963623, "learning_rate": 2.849327576334401e-07, "logits/chosen": -18.113330841064453, "logits/rejected": -17.0930118560791, "logps/chosen": -446.0018615722656, "logps/rejected": -300.07769775390625, "loss": 0.5394, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.8162567615509033, "rewards/margins": 1.4428329467773438, "rewards/rejected": 2.3734235763549805, "step": 27810 }, { "epoch": 1.2916105668786853, "grad_norm": 132.39341735839844, "learning_rate": 2.848553786155346e-07, "logits/chosen": -18.46766471862793, "logits/rejected": -17.986370086669922, "logps/chosen": -403.5187072753906, "logps/rejected": -293.70721435546875, "loss": 0.9922, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.174057960510254, "rewards/margins": 1.1000274419784546, "rewards/rejected": 3.0740303993225098, "step": 27820 }, { "epoch": 1.2920748409861182, "grad_norm": 86.11268615722656, "learning_rate": 2.847779995976291e-07, "logits/chosen": -18.845760345458984, "logits/rejected": -18.00044059753418, "logps/chosen": -361.3221130371094, "logps/rejected": -351.489990234375, "loss": 1.0108, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.880683660507202, "rewards/margins": 0.05649157613515854, "rewards/rejected": 2.8241920471191406, "step": 27830 }, { "epoch": 1.2925391150935512, "grad_norm": 135.4141082763672, "learning_rate": 2.847006205797236e-07, "logits/chosen": -18.988361358642578, "logits/rejected": -17.69625473022461, "logps/chosen": -375.9157409667969, "logps/rejected": -271.2004699707031, "loss": 0.585, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.26641845703125, "rewards/margins": 1.556462287902832, "rewards/rejected": 2.709956407546997, "step": 27840 }, { "epoch": 1.2930033892009842, 
"grad_norm": 92.18450927734375, "learning_rate": 2.8462324156181813e-07, "logits/chosen": -18.70968246459961, "logits/rejected": -18.403295516967773, "logps/chosen": -320.8094482421875, "logps/rejected": -272.2589111328125, "loss": 1.2379, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.238379955291748, "rewards/margins": -0.3485282361507416, "rewards/rejected": 2.5869081020355225, "step": 27850 }, { "epoch": 1.2934676633084172, "grad_norm": 123.28557586669922, "learning_rate": 2.845458625439126e-07, "logits/chosen": -18.853092193603516, "logits/rejected": -18.738313674926758, "logps/chosen": -334.1675109863281, "logps/rejected": -341.49658203125, "loss": 1.0603, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.488576650619507, "rewards/margins": -0.18915219604969025, "rewards/rejected": 2.6777291297912598, "step": 27860 }, { "epoch": 1.2939319374158504, "grad_norm": 10.52514934539795, "learning_rate": 2.8446848352600705e-07, "logits/chosen": -19.02522087097168, "logits/rejected": -18.419971466064453, "logps/chosen": -431.10345458984375, "logps/rejected": -440.70465087890625, "loss": 0.3787, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.808666706085205, "rewards/margins": 1.1600021123886108, "rewards/rejected": 3.6486644744873047, "step": 27870 }, { "epoch": 1.2943962115232832, "grad_norm": 66.25965118408203, "learning_rate": 2.8439110450810156e-07, "logits/chosen": -19.688657760620117, "logits/rejected": -19.457080841064453, "logps/chosen": -396.66259765625, "logps/rejected": -358.697265625, "loss": 0.6527, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.84582257270813, "rewards/margins": 0.694497287273407, "rewards/rejected": 3.151324987411499, "step": 27880 }, { "epoch": 1.2948604856307164, "grad_norm": 63.602420806884766, "learning_rate": 2.8431372549019607e-07, "logits/chosen": -18.8773136138916, "logits/rejected": -17.916526794433594, "logps/chosen": -374.50042724609375, "logps/rejected": 
-315.761962890625, "loss": 0.4281, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.2162482738494873, "rewards/margins": 1.0506315231323242, "rewards/rejected": 2.165616750717163, "step": 27890 }, { "epoch": 1.2953247597381494, "grad_norm": 2.658470392227173, "learning_rate": 2.842363464722906e-07, "logits/chosen": -18.86148452758789, "logits/rejected": -17.85910415649414, "logps/chosen": -504.5450744628906, "logps/rejected": -353.46221923828125, "loss": 0.4417, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.8302714824676514, "rewards/margins": 1.366597294807434, "rewards/rejected": 2.463674545288086, "step": 27900 }, { "epoch": 1.2957890338455824, "grad_norm": 95.90513610839844, "learning_rate": 2.8415896745438504e-07, "logits/chosen": -17.85250473022461, "logits/rejected": -17.344120025634766, "logps/chosen": -454.28546142578125, "logps/rejected": -356.7271423339844, "loss": 0.5504, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.057995319366455, "rewards/margins": 0.8890687823295593, "rewards/rejected": 3.1689257621765137, "step": 27910 }, { "epoch": 1.2962533079530154, "grad_norm": 10.878715515136719, "learning_rate": 2.8408158843647955e-07, "logits/chosen": -19.207263946533203, "logits/rejected": -18.2003173828125, "logps/chosen": -336.40228271484375, "logps/rejected": -283.4402770996094, "loss": 0.5941, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.379169464111328, "rewards/margins": 0.6669355630874634, "rewards/rejected": 2.712233781814575, "step": 27920 }, { "epoch": 1.2967175820604484, "grad_norm": 51.870628356933594, "learning_rate": 2.8400420941857406e-07, "logits/chosen": -18.03895378112793, "logits/rejected": -18.309799194335938, "logps/chosen": -302.06512451171875, "logps/rejected": -287.27606201171875, "loss": 1.1471, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.577108860015869, "rewards/margins": 0.540061354637146, "rewards/rejected": 2.0370471477508545, "step": 27930 }, { 
"epoch": 1.2971818561678816, "grad_norm": 60.5936393737793, "learning_rate": 2.8392683040066857e-07, "logits/chosen": -18.352489471435547, "logits/rejected": -17.922954559326172, "logps/chosen": -349.6490478515625, "logps/rejected": -362.6252746582031, "loss": 0.8817, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.242053508758545, "rewards/margins": 0.7727109789848328, "rewards/rejected": 2.4693427085876465, "step": 27940 }, { "epoch": 1.2976461302753146, "grad_norm": 30.908411026000977, "learning_rate": 2.838494513827631e-07, "logits/chosen": -19.38510513305664, "logits/rejected": -19.056467056274414, "logps/chosen": -365.9130859375, "logps/rejected": -338.67169189453125, "loss": 0.4989, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.366319179534912, "rewards/margins": 1.689300298690796, "rewards/rejected": 1.677018404006958, "step": 27950 }, { "epoch": 1.2981104043827476, "grad_norm": 154.24490356445312, "learning_rate": 2.837720723648575e-07, "logits/chosen": -19.297626495361328, "logits/rejected": -18.163766860961914, "logps/chosen": -442.5321350097656, "logps/rejected": -336.5743103027344, "loss": 0.4628, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.8638641834259033, "rewards/margins": 1.358889102935791, "rewards/rejected": 2.504974842071533, "step": 27960 }, { "epoch": 1.2985746784901806, "grad_norm": 71.0943832397461, "learning_rate": 2.83694693346952e-07, "logits/chosen": -18.240413665771484, "logits/rejected": -16.869991302490234, "logps/chosen": -465.7793884277344, "logps/rejected": -363.66485595703125, "loss": 0.4581, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.759384870529175, "rewards/margins": 1.3930667638778687, "rewards/rejected": 2.3663182258605957, "step": 27970 }, { "epoch": 1.2990389525976136, "grad_norm": 24.142732620239258, "learning_rate": 2.836173143290465e-07, "logits/chosen": -18.206871032714844, "logits/rejected": -18.677310943603516, "logps/chosen": -461.48663330078125, 
"logps/rejected": -449.46942138671875, "loss": 1.6668, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2342898845672607, "rewards/margins": -0.7652313113212585, "rewards/rejected": 3.9995205402374268, "step": 27980 }, { "epoch": 1.2995032267050466, "grad_norm": 69.69544219970703, "learning_rate": 2.83539935311141e-07, "logits/chosen": -18.874393463134766, "logits/rejected": -18.127187728881836, "logps/chosen": -363.51922607421875, "logps/rejected": -339.43743896484375, "loss": 0.4332, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4230716228485107, "rewards/margins": 0.8846760988235474, "rewards/rejected": 2.538395404815674, "step": 27990 }, { "epoch": 1.2999675008124796, "grad_norm": 0.06335694342851639, "learning_rate": 2.8346255629323553e-07, "logits/chosen": -19.639989852905273, "logits/rejected": -17.545259475708008, "logps/chosen": -331.66534423828125, "logps/rejected": -222.2626190185547, "loss": 0.6787, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.391738176345825, "rewards/margins": 2.2774951457977295, "rewards/rejected": 1.1142429113388062, "step": 28000 }, { "epoch": 1.3004317749199128, "grad_norm": 11.577787399291992, "learning_rate": 2.8338517727533e-07, "logits/chosen": -18.571979522705078, "logits/rejected": -18.22554588317871, "logps/chosen": -328.30804443359375, "logps/rejected": -264.1151123046875, "loss": 0.5576, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.778470516204834, "rewards/margins": 1.441816806793213, "rewards/rejected": 1.336653470993042, "step": 28010 }, { "epoch": 1.3008960490273458, "grad_norm": 4.199653625488281, "learning_rate": 2.833077982574245e-07, "logits/chosen": -20.020931243896484, "logits/rejected": -18.306961059570312, "logps/chosen": -405.3003845214844, "logps/rejected": -236.8839111328125, "loss": 0.2311, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.572165012359619, "rewards/margins": 2.8154492378234863, "rewards/rejected": 1.7567155361175537, 
"step": 28020 }, { "epoch": 1.3013603231347788, "grad_norm": 245.73744201660156, "learning_rate": 2.83230419239519e-07, "logits/chosen": -19.511423110961914, "logits/rejected": -18.83554458618164, "logps/chosen": -427.21893310546875, "logps/rejected": -369.4228515625, "loss": 0.5707, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6747524738311768, "rewards/margins": 0.840765655040741, "rewards/rejected": 2.833986759185791, "step": 28030 }, { "epoch": 1.3018245972422118, "grad_norm": 78.1362075805664, "learning_rate": 2.831530402216135e-07, "logits/chosen": -18.712156295776367, "logits/rejected": -18.10452651977539, "logps/chosen": -425.58795166015625, "logps/rejected": -343.6294860839844, "loss": 0.9318, "rewards/accuracies": 0.5, "rewards/chosen": 4.169427394866943, "rewards/margins": 0.8483413457870483, "rewards/rejected": 3.3210864067077637, "step": 28040 }, { "epoch": 1.3022888713496448, "grad_norm": 2.945244550704956, "learning_rate": 2.8307566120370804e-07, "logits/chosen": -19.091571807861328, "logits/rejected": -18.184432983398438, "logps/chosen": -370.96893310546875, "logps/rejected": -352.7339172363281, "loss": 0.5808, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.9282279014587402, "rewards/margins": 0.7172444462776184, "rewards/rejected": 2.2109832763671875, "step": 28050 }, { "epoch": 1.302753145457078, "grad_norm": 22.144376754760742, "learning_rate": 2.8299828218580244e-07, "logits/chosen": -19.36772918701172, "logits/rejected": -18.032360076904297, "logps/chosen": -478.46514892578125, "logps/rejected": -337.295166015625, "loss": 0.3834, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.183569431304932, "rewards/margins": 2.2734463214874268, "rewards/rejected": 2.910123348236084, "step": 28060 }, { "epoch": 1.3032174195645108, "grad_norm": 47.6575927734375, "learning_rate": 2.8292090316789695e-07, "logits/chosen": -18.48442268371582, "logits/rejected": -17.421710968017578, "logps/chosen": -392.0523986816406, 
"logps/rejected": -299.17572021484375, "loss": 0.3629, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.1643426418304443, "rewards/margins": 1.415083408355713, "rewards/rejected": 1.7492592334747314, "step": 28070 }, { "epoch": 1.303681693671944, "grad_norm": 14.442606925964355, "learning_rate": 2.8284352414999146e-07, "logits/chosen": -18.578868865966797, "logits/rejected": -17.07029151916504, "logps/chosen": -367.6546325683594, "logps/rejected": -220.97610473632812, "loss": 0.2402, "rewards/accuracies": 1.0, "rewards/chosen": 3.235902786254883, "rewards/margins": 1.8896583318710327, "rewards/rejected": 1.3462440967559814, "step": 28080 }, { "epoch": 1.304145967779377, "grad_norm": 0.37099868059158325, "learning_rate": 2.82766145132086e-07, "logits/chosen": -18.91305160522461, "logits/rejected": -18.25756072998047, "logps/chosen": -397.28302001953125, "logps/rejected": -339.01873779296875, "loss": 0.5781, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0240721702575684, "rewards/margins": 0.9451678395271301, "rewards/rejected": 2.078904151916504, "step": 28090 }, { "epoch": 1.30461024188681, "grad_norm": 0.22294610738754272, "learning_rate": 2.826887661141805e-07, "logits/chosen": -19.29570198059082, "logits/rejected": -17.219005584716797, "logps/chosen": -531.9974365234375, "logps/rejected": -306.97576904296875, "loss": 0.3217, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.55100154876709, "rewards/margins": 2.380380392074585, "rewards/rejected": 2.170621156692505, "step": 28100 }, { "epoch": 1.305074515994243, "grad_norm": 2.8431220054626465, "learning_rate": 2.8261138709627495e-07, "logits/chosen": -18.35153579711914, "logits/rejected": -17.507915496826172, "logps/chosen": -354.56732177734375, "logps/rejected": -252.366455078125, "loss": 0.9196, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.9582347869873047, "rewards/margins": 0.6946316957473755, "rewards/rejected": 2.2636029720306396, "step": 28110 }, { 
"epoch": 1.305538790101676, "grad_norm": 8.918224334716797, "learning_rate": 2.8253400807836946e-07, "logits/chosen": -19.087665557861328, "logits/rejected": -17.4090576171875, "logps/chosen": -543.7811889648438, "logps/rejected": -296.24285888671875, "loss": 0.9767, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.673491954803467, "rewards/margins": 1.2357085943222046, "rewards/rejected": 2.4377834796905518, "step": 28120 }, { "epoch": 1.3060030642091092, "grad_norm": 43.22324752807617, "learning_rate": 2.8245662906046397e-07, "logits/chosen": -18.280773162841797, "logits/rejected": -17.13552474975586, "logps/chosen": -459.8091735839844, "logps/rejected": -308.84405517578125, "loss": 0.7762, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.4701309204101562, "rewards/margins": 1.6863765716552734, "rewards/rejected": 1.7837541103363037, "step": 28130 }, { "epoch": 1.3064673383165422, "grad_norm": 38.48942565917969, "learning_rate": 2.823792500425585e-07, "logits/chosen": -19.855825424194336, "logits/rejected": -19.08029556274414, "logps/chosen": -359.5667724609375, "logps/rejected": -362.61114501953125, "loss": 1.0257, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.6402628421783447, "rewards/margins": 0.3609083890914917, "rewards/rejected": 2.2793545722961426, "step": 28140 }, { "epoch": 1.3069316124239752, "grad_norm": 202.305908203125, "learning_rate": 2.8230187102465294e-07, "logits/chosen": -18.5180721282959, "logits/rejected": -17.922245025634766, "logps/chosen": -396.03350830078125, "logps/rejected": -334.51739501953125, "loss": 0.9008, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.4836783409118652, "rewards/margins": 0.5970782041549683, "rewards/rejected": 1.8865998983383179, "step": 28150 }, { "epoch": 1.3073958865314081, "grad_norm": 93.1297378540039, "learning_rate": 2.822244920067474e-07, "logits/chosen": -19.561290740966797, "logits/rejected": -18.32537269592285, "logps/chosen": -406.1944885253906, 
"logps/rejected": -250.80850219726562, "loss": 0.3244, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.8277149200439453, "rewards/margins": 2.5488123893737793, "rewards/rejected": 1.2789021730422974, "step": 28160 }, { "epoch": 1.3078601606388411, "grad_norm": 117.60313415527344, "learning_rate": 2.821471129888419e-07, "logits/chosen": -19.096961975097656, "logits/rejected": -18.97898292541504, "logps/chosen": -395.8476867675781, "logps/rejected": -440.7626037597656, "loss": 1.224, "rewards/accuracies": 0.5, "rewards/chosen": 3.0792102813720703, "rewards/margins": 0.22323846817016602, "rewards/rejected": 2.8559718132019043, "step": 28170 }, { "epoch": 1.3083244347462741, "grad_norm": 18.77232551574707, "learning_rate": 2.820697339709364e-07, "logits/chosen": -18.89922523498535, "logits/rejected": -18.889591217041016, "logps/chosen": -297.4163513183594, "logps/rejected": -261.8628845214844, "loss": 0.9462, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.204284906387329, "rewards/margins": -0.09737689793109894, "rewards/rejected": 2.301661968231201, "step": 28180 }, { "epoch": 1.3087887088537071, "grad_norm": 5.812829971313477, "learning_rate": 2.8199235495303093e-07, "logits/chosen": -18.26717758178711, "logits/rejected": -17.584413528442383, "logps/chosen": -333.83099365234375, "logps/rejected": -208.694091796875, "loss": 0.5619, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.238165378570557, "rewards/margins": 2.3537402153015137, "rewards/rejected": 1.884425163269043, "step": 28190 }, { "epoch": 1.3092529829611403, "grad_norm": 0.0138319730758667, "learning_rate": 2.8191497593512544e-07, "logits/chosen": -19.23174476623535, "logits/rejected": -18.56290054321289, "logps/chosen": -465.36639404296875, "logps/rejected": -437.4278259277344, "loss": 0.884, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.668064594268799, "rewards/margins": 1.2653087377548218, "rewards/rejected": 2.4027559757232666, "step": 28200 }, { 
"epoch": 1.3097172570685733, "grad_norm": 5.510120868682861, "learning_rate": 2.818375969172199e-07, "logits/chosen": -18.274614334106445, "logits/rejected": -17.455385208129883, "logps/chosen": -267.5893249511719, "logps/rejected": -204.41317749023438, "loss": 0.5612, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.7355430126190186, "rewards/margins": 0.8528928756713867, "rewards/rejected": 0.8826497793197632, "step": 28210 }, { "epoch": 1.3101815311760063, "grad_norm": 46.12841033935547, "learning_rate": 2.817602178993144e-07, "logits/chosen": -19.1782283782959, "logits/rejected": -18.330968856811523, "logps/chosen": -357.7518310546875, "logps/rejected": -264.78350830078125, "loss": 0.3252, "rewards/accuracies": 1.0, "rewards/chosen": 3.4315788745880127, "rewards/margins": 1.246100664138794, "rewards/rejected": 2.185478448867798, "step": 28220 }, { "epoch": 1.3106458052834393, "grad_norm": 6.965996265411377, "learning_rate": 2.816828388814089e-07, "logits/chosen": -18.786046981811523, "logits/rejected": -17.235437393188477, "logps/chosen": -401.18328857421875, "logps/rejected": -252.9320068359375, "loss": 0.3313, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.051978588104248, "rewards/margins": 2.437520742416382, "rewards/rejected": 1.614458441734314, "step": 28230 }, { "epoch": 1.3111100793908723, "grad_norm": 1.646965503692627, "learning_rate": 2.8160545986350343e-07, "logits/chosen": -18.91909408569336, "logits/rejected": -17.91703987121582, "logps/chosen": -456.0943298339844, "logps/rejected": -316.22479248046875, "loss": 0.368, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.3603553771972656, "rewards/margins": 1.923649787902832, "rewards/rejected": 1.4367053508758545, "step": 28240 }, { "epoch": 1.3115743534983055, "grad_norm": 2.1785171031951904, "learning_rate": 2.815280808455979e-07, "logits/chosen": -18.622112274169922, "logits/rejected": -17.579204559326172, "logps/chosen": -460.2745056152344, "logps/rejected": 
-299.54547119140625, "loss": 0.7027, "rewards/accuracies": 0.5, "rewards/chosen": 2.783322334289551, "rewards/margins": 0.874356746673584, "rewards/rejected": 1.9089657068252563, "step": 28250 }, { "epoch": 1.3120386276057383, "grad_norm": 173.0576934814453, "learning_rate": 2.8145070182769235e-07, "logits/chosen": -18.34779930114746, "logits/rejected": -17.125629425048828, "logps/chosen": -372.6938781738281, "logps/rejected": -237.37332153320312, "loss": 0.4517, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.085779905319214, "rewards/margins": 1.429979681968689, "rewards/rejected": 1.655800223350525, "step": 28260 }, { "epoch": 1.3125029017131715, "grad_norm": 17.13361930847168, "learning_rate": 2.8137332280978686e-07, "logits/chosen": -18.76606559753418, "logits/rejected": -18.455806732177734, "logps/chosen": -389.69537353515625, "logps/rejected": -370.1393127441406, "loss": 0.6283, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.3817243576049805, "rewards/margins": 1.3150055408477783, "rewards/rejected": 3.066718578338623, "step": 28270 }, { "epoch": 1.3129671758206045, "grad_norm": 35.22394561767578, "learning_rate": 2.8129594379188137e-07, "logits/chosen": -18.539222717285156, "logits/rejected": -17.897695541381836, "logps/chosen": -314.1677551269531, "logps/rejected": -295.86895751953125, "loss": 0.5633, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.6568310260772705, "rewards/margins": 1.4679454565048218, "rewards/rejected": 1.1888854503631592, "step": 28280 }, { "epoch": 1.3134314499280375, "grad_norm": 0.853169858455658, "learning_rate": 2.812185647739759e-07, "logits/chosen": -18.14988899230957, "logits/rejected": -17.186786651611328, "logps/chosen": -377.8233947753906, "logps/rejected": -259.03009033203125, "loss": 0.4865, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.1890058517456055, "rewards/margins": 2.3899452686309814, "rewards/rejected": 1.7990608215332031, "step": 28290 }, { "epoch": 
1.3138957240354705, "grad_norm": 155.09071350097656, "learning_rate": 2.811411857560704e-07, "logits/chosen": -18.69789695739746, "logits/rejected": -18.042980194091797, "logps/chosen": -368.91668701171875, "logps/rejected": -324.24639892578125, "loss": 0.5931, "rewards/accuracies": 0.5, "rewards/chosen": 3.4965968132019043, "rewards/margins": 1.1217797994613647, "rewards/rejected": 2.374817371368408, "step": 28300 }, { "epoch": 1.3143599981429035, "grad_norm": 183.36090087890625, "learning_rate": 2.8106380673816485e-07, "logits/chosen": -18.929540634155273, "logits/rejected": -17.789201736450195, "logps/chosen": -442.74737548828125, "logps/rejected": -379.19512939453125, "loss": 0.5996, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.981564998626709, "rewards/margins": 0.9301276206970215, "rewards/rejected": 3.0514378547668457, "step": 28310 }, { "epoch": 1.3148242722503367, "grad_norm": 51.22715759277344, "learning_rate": 2.8098642772025937e-07, "logits/chosen": -18.311176300048828, "logits/rejected": -17.792455673217773, "logps/chosen": -402.3924865722656, "logps/rejected": -346.77642822265625, "loss": 0.9362, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.902454137802124, "rewards/margins": 0.015981603413820267, "rewards/rejected": 2.886472702026367, "step": 28320 }, { "epoch": 1.3152885463577697, "grad_norm": 164.87196350097656, "learning_rate": 2.809090487023539e-07, "logits/chosen": -19.675899505615234, "logits/rejected": -19.013561248779297, "logps/chosen": -414.1871032714844, "logps/rejected": -328.7298889160156, "loss": 0.4084, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.244918346405029, "rewards/margins": 1.353390097618103, "rewards/rejected": 2.8915278911590576, "step": 28330 }, { "epoch": 1.3157528204652027, "grad_norm": 95.34879302978516, "learning_rate": 2.8083166968444833e-07, "logits/chosen": -18.498804092407227, "logits/rejected": -17.33652687072754, "logps/chosen": -421.0450744628906, 
"logps/rejected": -244.73770141601562, "loss": 0.3481, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.3221373558044434, "rewards/margins": 1.5638071298599243, "rewards/rejected": 1.7583303451538086, "step": 28340 }, { "epoch": 1.3162170945726357, "grad_norm": 44.69028091430664, "learning_rate": 2.8075429066654285e-07, "logits/chosen": -19.331600189208984, "logits/rejected": -20.279085159301758, "logps/chosen": -410.1844177246094, "logps/rejected": -391.3115234375, "loss": 1.0896, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": 3.2790932655334473, "rewards/margins": -0.5446408987045288, "rewards/rejected": 3.8237338066101074, "step": 28350 }, { "epoch": 1.3166813686800687, "grad_norm": 40.7852783203125, "learning_rate": 2.806769116486373e-07, "logits/chosen": -18.814250946044922, "logits/rejected": -18.144424438476562, "logps/chosen": -380.98321533203125, "logps/rejected": -363.0377197265625, "loss": 0.6819, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.5249481201171875, "rewards/margins": 0.7400063872337341, "rewards/rejected": 2.7849411964416504, "step": 28360 }, { "epoch": 1.3171456427875017, "grad_norm": 23.635345458984375, "learning_rate": 2.805995326307318e-07, "logits/chosen": -17.939891815185547, "logits/rejected": -17.093008041381836, "logps/chosen": -398.94989013671875, "logps/rejected": -284.207763671875, "loss": 0.703, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.7462565898895264, "rewards/margins": 1.6840107440948486, "rewards/rejected": 2.0622458457946777, "step": 28370 }, { "epoch": 1.3176099168949347, "grad_norm": 12.66170883178711, "learning_rate": 2.8052215361282633e-07, "logits/chosen": -18.228567123413086, "logits/rejected": -17.98295021057129, "logps/chosen": -399.7398986816406, "logps/rejected": -409.13037109375, "loss": 0.5948, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0713882446289062, "rewards/margins": 0.8558169603347778, "rewards/rejected": 2.215571403503418, 
"step": 28380 }, { "epoch": 1.318074191002368, "grad_norm": 42.6094970703125, "learning_rate": 2.8044477459492084e-07, "logits/chosen": -18.88149070739746, "logits/rejected": -18.79129981994629, "logps/chosen": -405.6610107421875, "logps/rejected": -372.4642333984375, "loss": 0.7881, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.5592122077941895, "rewards/margins": 0.4658431112766266, "rewards/rejected": 3.0933690071105957, "step": 28390 }, { "epoch": 1.3185384651098009, "grad_norm": 115.74100494384766, "learning_rate": 2.8036739557701535e-07, "logits/chosen": -19.27035140991211, "logits/rejected": -18.26877212524414, "logps/chosen": -336.0254211425781, "logps/rejected": -296.64306640625, "loss": 1.0206, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.885932445526123, "rewards/margins": 0.6579561233520508, "rewards/rejected": 2.2279765605926514, "step": 28400 }, { "epoch": 1.3190027392172339, "grad_norm": 25.18412208557129, "learning_rate": 2.802900165591098e-07, "logits/chosen": -19.042133331298828, "logits/rejected": -18.194194793701172, "logps/chosen": -406.25927734375, "logps/rejected": -363.5919494628906, "loss": 0.5006, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.014797210693359, "rewards/margins": 1.7611055374145508, "rewards/rejected": 2.2536914348602295, "step": 28410 }, { "epoch": 1.3194670133246669, "grad_norm": 172.32118225097656, "learning_rate": 2.802126375412043e-07, "logits/chosen": -18.136425018310547, "logits/rejected": -17.59984016418457, "logps/chosen": -414.7188415527344, "logps/rejected": -306.506103515625, "loss": 0.7206, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.014252185821533, "rewards/margins": 1.1705882549285889, "rewards/rejected": 2.8436644077301025, "step": 28420 }, { "epoch": 1.3199312874320999, "grad_norm": 43.105804443359375, "learning_rate": 2.8013525852329883e-07, "logits/chosen": -19.55852508544922, "logits/rejected": -18.695892333984375, "logps/chosen": 
-426.244384765625, "logps/rejected": -305.6221923828125, "loss": 0.7154, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.2263875007629395, "rewards/margins": 1.2544591426849365, "rewards/rejected": 2.971928119659424, "step": 28430 }, { "epoch": 1.3203955615395329, "grad_norm": 55.862281799316406, "learning_rate": 2.800578795053933e-07, "logits/chosen": -18.344924926757812, "logits/rejected": -18.443113327026367, "logps/chosen": -400.1418762207031, "logps/rejected": -389.38763427734375, "loss": 1.0602, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.780176877975464, "rewards/margins": -0.009103590622544289, "rewards/rejected": 2.7892801761627197, "step": 28440 }, { "epoch": 1.3208598356469659, "grad_norm": 228.38043212890625, "learning_rate": 2.799805004874878e-07, "logits/chosen": -19.42674446105957, "logits/rejected": -18.984743118286133, "logps/chosen": -410.826904296875, "logps/rejected": -438.3526916503906, "loss": 0.9355, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.875487804412842, "rewards/margins": 0.5843106508255005, "rewards/rejected": 3.2911770343780518, "step": 28450 }, { "epoch": 1.321324109754399, "grad_norm": 37.03493881225586, "learning_rate": 2.7990312146958226e-07, "logits/chosen": -19.111356735229492, "logits/rejected": -17.4339599609375, "logps/chosen": -358.65899658203125, "logps/rejected": -286.10186767578125, "loss": 0.5625, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.274428606033325, "rewards/margins": 1.5896168947219849, "rewards/rejected": 1.6848113536834717, "step": 28460 }, { "epoch": 1.321788383861832, "grad_norm": 166.00535583496094, "learning_rate": 2.7982574245167677e-07, "logits/chosen": -20.14393424987793, "logits/rejected": -18.31060218811035, "logps/chosen": -508.5000915527344, "logps/rejected": -329.14544677734375, "loss": 0.3798, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.482357978820801, "rewards/margins": 2.0250232219696045, "rewards/rejected": 
2.457334518432617, "step": 28470 }, { "epoch": 1.322252657969265, "grad_norm": 59.165748596191406, "learning_rate": 2.797483634337713e-07, "logits/chosen": -18.986061096191406, "logits/rejected": -18.607261657714844, "logps/chosen": -425.1702575683594, "logps/rejected": -413.85772705078125, "loss": 0.6895, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.8194618225097656, "rewards/margins": 0.6808125376701355, "rewards/rejected": 3.1386497020721436, "step": 28480 }, { "epoch": 1.322716932076698, "grad_norm": 137.7693328857422, "learning_rate": 2.796709844158658e-07, "logits/chosen": -19.652238845825195, "logits/rejected": -18.67399787902832, "logps/chosen": -495.6002502441406, "logps/rejected": -336.3833312988281, "loss": 0.3911, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.1236419677734375, "rewards/margins": 1.6466217041015625, "rewards/rejected": 2.477020740509033, "step": 28490 }, { "epoch": 1.323181206184131, "grad_norm": 38.88869857788086, "learning_rate": 2.795936053979603e-07, "logits/chosen": -18.648357391357422, "logits/rejected": -18.798864364624023, "logps/chosen": -356.098388671875, "logps/rejected": -393.35711669921875, "loss": 1.0294, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.2540230751037598, "rewards/margins": 0.16417188942432404, "rewards/rejected": 3.089850902557373, "step": 28500 }, { "epoch": 1.3236454802915643, "grad_norm": 106.13963317871094, "learning_rate": 2.7951622638005476e-07, "logits/chosen": -18.393341064453125, "logits/rejected": -17.990825653076172, "logps/chosen": -381.3976135253906, "logps/rejected": -331.65936279296875, "loss": 0.8397, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5980496406555176, "rewards/margins": 0.7828210592269897, "rewards/rejected": 2.81522798538208, "step": 28510 }, { "epoch": 1.324109754398997, "grad_norm": 27.24631690979004, "learning_rate": 2.7943884736214927e-07, "logits/chosen": -19.86736488342285, "logits/rejected": -19.507349014282227, 
"logps/chosen": -423.67047119140625, "logps/rejected": -348.82843017578125, "loss": 0.597, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.5508041381835938, "rewards/margins": 0.5266228318214417, "rewards/rejected": 3.024181365966797, "step": 28520 }, { "epoch": 1.3245740285064302, "grad_norm": 24.623855590820312, "learning_rate": 2.793614683442438e-07, "logits/chosen": -18.520397186279297, "logits/rejected": -18.041688919067383, "logps/chosen": -367.89349365234375, "logps/rejected": -291.0815734863281, "loss": 0.5674, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.8009543418884277, "rewards/margins": 1.1053568124771118, "rewards/rejected": 2.6955976486206055, "step": 28530 }, { "epoch": 1.3250383026138632, "grad_norm": 276.9911193847656, "learning_rate": 2.7928408932633824e-07, "logits/chosen": -19.609285354614258, "logits/rejected": -18.231555938720703, "logps/chosen": -360.81298828125, "logps/rejected": -258.0989990234375, "loss": 0.7685, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2235989570617676, "rewards/margins": 1.0317751169204712, "rewards/rejected": 2.1918234825134277, "step": 28540 }, { "epoch": 1.3255025767212962, "grad_norm": 48.22181701660156, "learning_rate": 2.7920671030843275e-07, "logits/chosen": -18.588825225830078, "logits/rejected": -18.907955169677734, "logps/chosen": -407.44537353515625, "logps/rejected": -432.2950134277344, "loss": 1.2206, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.81115460395813, "rewards/margins": -0.24502098560333252, "rewards/rejected": 4.056175231933594, "step": 28550 }, { "epoch": 1.3259668508287292, "grad_norm": 11.47855281829834, "learning_rate": 2.791293312905272e-07, "logits/chosen": -17.860313415527344, "logits/rejected": -18.257850646972656, "logps/chosen": -251.2956085205078, "logps/rejected": -274.9901123046875, "loss": 1.3269, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.031860828399658, "rewards/margins": -0.3279462456703186, 
"rewards/rejected": 2.359807252883911, "step": 28560 }, { "epoch": 1.3264311249361622, "grad_norm": 0.12607170641422272, "learning_rate": 2.790519522726217e-07, "logits/chosen": -19.03989601135254, "logits/rejected": -18.055490493774414, "logps/chosen": -413.4579162597656, "logps/rejected": -316.6772155761719, "loss": 0.9717, "rewards/accuracies": 0.5, "rewards/chosen": 3.704937696456909, "rewards/margins": 1.205258846282959, "rewards/rejected": 2.4996790885925293, "step": 28570 }, { "epoch": 1.3268953990435954, "grad_norm": 37.425846099853516, "learning_rate": 2.7897457325471624e-07, "logits/chosen": -19.212820053100586, "logits/rejected": -19.85076904296875, "logps/chosen": -378.2391052246094, "logps/rejected": -355.70703125, "loss": 0.642, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.393911838531494, "rewards/margins": 0.3317297101020813, "rewards/rejected": 3.0621819496154785, "step": 28580 }, { "epoch": 1.3273596731510284, "grad_norm": 153.46336364746094, "learning_rate": 2.7889719423681075e-07, "logits/chosen": -18.54668426513672, "logits/rejected": -18.25769805908203, "logps/chosen": -384.58746337890625, "logps/rejected": -325.79119873046875, "loss": 0.639, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5263805389404297, "rewards/margins": 0.8563446998596191, "rewards/rejected": 2.6700358390808105, "step": 28590 }, { "epoch": 1.3278239472584614, "grad_norm": 31.75190544128418, "learning_rate": 2.7881981521890526e-07, "logits/chosen": -18.817729949951172, "logits/rejected": -19.37360191345215, "logps/chosen": -394.6839294433594, "logps/rejected": -408.58837890625, "loss": 0.7746, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 4.0864691734313965, "rewards/margins": 0.5251847505569458, "rewards/rejected": 3.561284303665161, "step": 28600 }, { "epoch": 1.3282882213658944, "grad_norm": 115.98297119140625, "learning_rate": 2.787424362009997e-07, "logits/chosen": -18.435054779052734, "logits/rejected": 
-18.020288467407227, "logps/chosen": -461.81793212890625, "logps/rejected": -382.41937255859375, "loss": 0.8306, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.448634386062622, "rewards/margins": 1.1946195363998413, "rewards/rejected": 2.254014492034912, "step": 28610 }, { "epoch": 1.3287524954733274, "grad_norm": 33.062835693359375, "learning_rate": 2.7866505718309423e-07, "logits/chosen": -18.279298782348633, "logits/rejected": -18.283422470092773, "logps/chosen": -335.657958984375, "logps/rejected": -374.62841796875, "loss": 1.2639, "rewards/accuracies": 0.5, "rewards/chosen": 2.2940831184387207, "rewards/margins": -0.4300384521484375, "rewards/rejected": 2.7241218090057373, "step": 28620 }, { "epoch": 1.3292167695807604, "grad_norm": 83.68732452392578, "learning_rate": 2.785876781651887e-07, "logits/chosen": -18.67778968811035, "logits/rejected": -16.801776885986328, "logps/chosen": -410.3897399902344, "logps/rejected": -237.657958984375, "loss": 0.2345, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.7002594470977783, "rewards/margins": 2.949951410293579, "rewards/rejected": 0.7503083944320679, "step": 28630 }, { "epoch": 1.3296810436881934, "grad_norm": 164.53070068359375, "learning_rate": 2.785102991472832e-07, "logits/chosen": -18.386600494384766, "logits/rejected": -18.106542587280273, "logps/chosen": -408.3726501464844, "logps/rejected": -358.35650634765625, "loss": 1.1587, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.7375402450561523, "rewards/margins": -0.0401492603123188, "rewards/rejected": 2.7776894569396973, "step": 28640 }, { "epoch": 1.3301453177956266, "grad_norm": 96.6283950805664, "learning_rate": 2.784329201293777e-07, "logits/chosen": -18.794742584228516, "logits/rejected": -18.616016387939453, "logps/chosen": -357.0118103027344, "logps/rejected": -380.17828369140625, "loss": 1.2714, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.797677993774414, "rewards/margins": 
-0.08767397701740265, "rewards/rejected": 2.8853516578674316, "step": 28650 }, { "epoch": 1.3306095919030596, "grad_norm": 108.74359130859375, "learning_rate": 2.7835554111147217e-07, "logits/chosen": -19.49860954284668, "logits/rejected": -18.58212661743164, "logps/chosen": -376.60845947265625, "logps/rejected": -337.5989074707031, "loss": 0.6388, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.35444712638855, "rewards/margins": 0.9836808443069458, "rewards/rejected": 2.3707664012908936, "step": 28660 }, { "epoch": 1.3310738660104926, "grad_norm": 74.50218963623047, "learning_rate": 2.782781620935667e-07, "logits/chosen": -19.355945587158203, "logits/rejected": -17.28542709350586, "logps/chosen": -564.5745239257812, "logps/rejected": -294.4480895996094, "loss": 0.2432, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.7556962966918945, "rewards/margins": 3.1940975189208984, "rewards/rejected": 1.5615990161895752, "step": 28670 }, { "epoch": 1.3315381401179256, "grad_norm": 38.22278594970703, "learning_rate": 2.782007830756612e-07, "logits/chosen": -17.92926597595215, "logits/rejected": -17.897571563720703, "logps/chosen": -380.1836853027344, "logps/rejected": -371.4799499511719, "loss": 0.7014, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1941123008728027, "rewards/margins": 0.7162817716598511, "rewards/rejected": 2.477830171585083, "step": 28680 }, { "epoch": 1.3320024142253586, "grad_norm": 168.17726135253906, "learning_rate": 2.781234040577557e-07, "logits/chosen": -19.480117797851562, "logits/rejected": -19.258188247680664, "logps/chosen": -387.249267578125, "logps/rejected": -340.5089416503906, "loss": 0.8893, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.433391571044922, "rewards/margins": 0.6280423402786255, "rewards/rejected": 2.805349588394165, "step": 28690 }, { "epoch": 1.3324666883327918, "grad_norm": 11.62511920928955, "learning_rate": 2.780460250398502e-07, "logits/chosen": -19.361095428466797, 
"logits/rejected": -19.746829986572266, "logps/chosen": -399.6197814941406, "logps/rejected": -369.19482421875, "loss": 1.0421, "rewards/accuracies": 0.5, "rewards/chosen": 3.359539031982422, "rewards/margins": 0.2344159632921219, "rewards/rejected": 3.1251230239868164, "step": 28700 }, { "epoch": 1.3329309624402246, "grad_norm": 14.82224178314209, "learning_rate": 2.779686460219447e-07, "logits/chosen": -18.175344467163086, "logits/rejected": -17.292545318603516, "logps/chosen": -351.5764465332031, "logps/rejected": -279.137939453125, "loss": 0.5845, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.476008415222168, "rewards/margins": 0.7233076691627502, "rewards/rejected": 1.7527005672454834, "step": 28710 }, { "epoch": 1.3333952365476578, "grad_norm": 76.61141967773438, "learning_rate": 2.778912670040392e-07, "logits/chosen": -20.021495819091797, "logits/rejected": -18.798702239990234, "logps/chosen": -475.660888671875, "logps/rejected": -309.3350830078125, "loss": 0.4599, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.74070930480957, "rewards/margins": 2.146339178085327, "rewards/rejected": 2.5943706035614014, "step": 28720 }, { "epoch": 1.3338595106550908, "grad_norm": 130.39662170410156, "learning_rate": 2.7781388798613364e-07, "logits/chosen": -18.405071258544922, "logits/rejected": -17.954986572265625, "logps/chosen": -326.40472412109375, "logps/rejected": -330.01959228515625, "loss": 0.9134, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.6991333961486816, "rewards/margins": 0.20798850059509277, "rewards/rejected": 2.4911446571350098, "step": 28730 }, { "epoch": 1.3343237847625238, "grad_norm": 235.97117614746094, "learning_rate": 2.7773650896822815e-07, "logits/chosen": -18.559497833251953, "logits/rejected": -18.24654769897461, "logps/chosen": -340.3546142578125, "logps/rejected": -299.53704833984375, "loss": 0.5999, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.1017913818359375, "rewards/margins": 
1.288583517074585, "rewards/rejected": 1.8132076263427734, "step": 28740 }, { "epoch": 1.3347880588699568, "grad_norm": 89.53162384033203, "learning_rate": 2.7765912995032266e-07, "logits/chosen": -19.557796478271484, "logits/rejected": -18.91229820251465, "logps/chosen": -349.50408935546875, "logps/rejected": -340.39422607421875, "loss": 0.4368, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.341071367263794, "rewards/margins": 1.178165078163147, "rewards/rejected": 2.1629066467285156, "step": 28750 }, { "epoch": 1.3352523329773898, "grad_norm": 159.91799926757812, "learning_rate": 2.775817509324171e-07, "logits/chosen": -19.26090431213379, "logits/rejected": -18.33455467224121, "logps/chosen": -511.00848388671875, "logps/rejected": -355.07659912109375, "loss": 0.3059, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.298064708709717, "rewards/margins": 2.052621603012085, "rewards/rejected": 2.245443344116211, "step": 28760 }, { "epoch": 1.335716607084823, "grad_norm": 47.12178039550781, "learning_rate": 2.7750437191451163e-07, "logits/chosen": -19.34947395324707, "logits/rejected": -18.808881759643555, "logps/chosen": -477.9454040527344, "logps/rejected": -436.251953125, "loss": 1.0787, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.355921745300293, "rewards/margins": 0.1783304512500763, "rewards/rejected": 4.177591800689697, "step": 28770 }, { "epoch": 1.336180881192256, "grad_norm": 39.273719787597656, "learning_rate": 2.7742699289660614e-07, "logits/chosen": -20.015810012817383, "logits/rejected": -19.740814208984375, "logps/chosen": -533.4825439453125, "logps/rejected": -507.4297790527344, "loss": 1.1462, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.967041015625, "rewards/margins": -0.33321407437324524, "rewards/rejected": 4.300255298614502, "step": 28780 }, { "epoch": 1.336645155299689, "grad_norm": 43.82804489135742, "learning_rate": 2.7734961387870065e-07, "logits/chosen": -19.296995162963867, 
"logits/rejected": -18.226848602294922, "logps/chosen": -471.45953369140625, "logps/rejected": -357.927490234375, "loss": 0.5269, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.306967258453369, "rewards/margins": 0.9282398223876953, "rewards/rejected": 2.378727436065674, "step": 28790 }, { "epoch": 1.337109429407122, "grad_norm": 16.183032989501953, "learning_rate": 2.7727223486079517e-07, "logits/chosen": -19.239383697509766, "logits/rejected": -18.371990203857422, "logps/chosen": -490.6412658691406, "logps/rejected": -371.4068298339844, "loss": 0.5041, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.21870756149292, "rewards/margins": 1.6333885192871094, "rewards/rejected": 2.5853195190429688, "step": 28800 }, { "epoch": 1.337573703514555, "grad_norm": 137.9697723388672, "learning_rate": 2.771948558428897e-07, "logits/chosen": -18.97783851623535, "logits/rejected": -18.728586196899414, "logps/chosen": -418.3768615722656, "logps/rejected": -438.82080078125, "loss": 0.7921, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.430475234985352, "rewards/margins": 0.899479866027832, "rewards/rejected": 3.5309951305389404, "step": 28810 }, { "epoch": 1.338037977621988, "grad_norm": 52.5797233581543, "learning_rate": 2.771174768249841e-07, "logits/chosen": -19.649555206298828, "logits/rejected": -18.33904266357422, "logps/chosen": -477.133544921875, "logps/rejected": -319.6141052246094, "loss": 0.3474, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.893970012664795, "rewards/margins": 1.025175929069519, "rewards/rejected": 2.8687942028045654, "step": 28820 }, { "epoch": 1.338502251729421, "grad_norm": 1.41032874584198, "learning_rate": 2.770400978070786e-07, "logits/chosen": -18.961091995239258, "logits/rejected": -18.415935516357422, "logps/chosen": -449.414794921875, "logps/rejected": -431.86627197265625, "loss": 1.0314, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 3.4036788940429688, "rewards/margins": 
-0.0722619891166687, "rewards/rejected": 3.475940704345703, "step": 28830 }, { "epoch": 1.3389665258368542, "grad_norm": 5.533041000366211, "learning_rate": 2.769627187891731e-07, "logits/chosen": -19.875553131103516, "logits/rejected": -19.17608070373535, "logps/chosen": -314.4568786621094, "logps/rejected": -237.65335083007812, "loss": 0.6769, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.516451358795166, "rewards/margins": 1.3411552906036377, "rewards/rejected": 2.1752963066101074, "step": 28840 }, { "epoch": 1.3394307999442872, "grad_norm": 132.17428588867188, "learning_rate": 2.768853397712676e-07, "logits/chosen": -18.288572311401367, "logits/rejected": -17.868133544921875, "logps/chosen": -362.35302734375, "logps/rejected": -293.6504821777344, "loss": 0.3958, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.439676284790039, "rewards/margins": 1.2350974082946777, "rewards/rejected": 2.2045791149139404, "step": 28850 }, { "epoch": 1.3398950740517201, "grad_norm": 61.432350158691406, "learning_rate": 2.7680796075336213e-07, "logits/chosen": -18.619157791137695, "logits/rejected": -17.857229232788086, "logps/chosen": -380.4638366699219, "logps/rejected": -311.372802734375, "loss": 0.876, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.115705490112305, "rewards/margins": 1.0604437589645386, "rewards/rejected": 3.0552618503570557, "step": 28860 }, { "epoch": 1.3403593481591531, "grad_norm": 7.796875953674316, "learning_rate": 2.767305817354566e-07, "logits/chosen": -18.644481658935547, "logits/rejected": -17.824459075927734, "logps/chosen": -254.332763671875, "logps/rejected": -207.03726196289062, "loss": 0.4517, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.571169376373291, "rewards/margins": 1.353816270828247, "rewards/rejected": 1.2173532247543335, "step": 28870 }, { "epoch": 1.3408236222665861, "grad_norm": 293.8385009765625, "learning_rate": 2.766532027175511e-07, "logits/chosen": -19.033552169799805, 
"logits/rejected": -19.165748596191406, "logps/chosen": -361.5027160644531, "logps/rejected": -369.27044677734375, "loss": 1.0834, "rewards/accuracies": 0.5, "rewards/chosen": 3.198723316192627, "rewards/margins": -0.020266318693757057, "rewards/rejected": 3.218989610671997, "step": 28880 }, { "epoch": 1.3412878963740194, "grad_norm": 0.29941415786743164, "learning_rate": 2.765758236996456e-07, "logits/chosen": -19.786205291748047, "logits/rejected": -18.71304702758789, "logps/chosen": -471.42315673828125, "logps/rejected": -302.3658142089844, "loss": 0.462, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.462529182434082, "rewards/margins": 2.3009655475616455, "rewards/rejected": 2.1615636348724365, "step": 28890 }, { "epoch": 1.3417521704814521, "grad_norm": 52.886573791503906, "learning_rate": 2.764984446817401e-07, "logits/chosen": -19.300254821777344, "logits/rejected": -19.132051467895508, "logps/chosen": -479.5884704589844, "logps/rejected": -432.74395751953125, "loss": 0.6739, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.406344890594482, "rewards/margins": 0.6688010096549988, "rewards/rejected": 3.737543821334839, "step": 28900 }, { "epoch": 1.3422164445888853, "grad_norm": 22.225515365600586, "learning_rate": 2.7642106566383463e-07, "logits/chosen": -19.332504272460938, "logits/rejected": -18.37655258178711, "logps/chosen": -306.21124267578125, "logps/rejected": -182.31088256835938, "loss": 0.4008, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0165462493896484, "rewards/margins": 1.895358681678772, "rewards/rejected": 1.121187448501587, "step": 28910 }, { "epoch": 1.3426807186963183, "grad_norm": 6.450200080871582, "learning_rate": 2.7634368664592904e-07, "logits/chosen": -19.203433990478516, "logits/rejected": -18.35512924194336, "logps/chosen": -523.6590576171875, "logps/rejected": -360.5706787109375, "loss": 1.158, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.041686058044434, "rewards/margins": 
1.0109800100326538, "rewards/rejected": 3.030705690383911, "step": 28920 }, { "epoch": 1.3431449928037513, "grad_norm": 58.27489471435547, "learning_rate": 2.7626630762802355e-07, "logits/chosen": -18.64781379699707, "logits/rejected": -18.014820098876953, "logps/chosen": -436.64404296875, "logps/rejected": -384.0528869628906, "loss": 0.5469, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.123070001602173, "rewards/margins": 0.7820002436637878, "rewards/rejected": 2.3410696983337402, "step": 28930 }, { "epoch": 1.3436092669111843, "grad_norm": 57.23807144165039, "learning_rate": 2.7618892861011806e-07, "logits/chosen": -20.607881546020508, "logits/rejected": -20.093046188354492, "logps/chosen": -406.46734619140625, "logps/rejected": -372.38018798828125, "loss": 0.7992, "rewards/accuracies": 0.5, "rewards/chosen": 3.4335155487060547, "rewards/margins": 0.8394922018051147, "rewards/rejected": 2.5940232276916504, "step": 28940 }, { "epoch": 1.3440735410186173, "grad_norm": 120.42842864990234, "learning_rate": 2.7611154959221257e-07, "logits/chosen": -19.957401275634766, "logits/rejected": -18.92177963256836, "logps/chosen": -409.2850646972656, "logps/rejected": -312.8861999511719, "loss": 0.4151, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.9614057540893555, "rewards/margins": 1.14479660987854, "rewards/rejected": 1.8166091442108154, "step": 28950 }, { "epoch": 1.3445378151260505, "grad_norm": 21.079076766967773, "learning_rate": 2.760341705743071e-07, "logits/chosen": -18.759265899658203, "logits/rejected": -18.1694278717041, "logps/chosen": -302.8693542480469, "logps/rejected": -244.1614227294922, "loss": 0.627, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.2892322540283203, "rewards/margins": 0.3454698324203491, "rewards/rejected": 1.9437625408172607, "step": 28960 }, { "epoch": 1.3450020892334835, "grad_norm": 139.73094177246094, "learning_rate": 2.7595679155640154e-07, "logits/chosen": -19.169864654541016, 
"logits/rejected": -18.99677848815918, "logps/chosen": -351.7748718261719, "logps/rejected": -366.52923583984375, "loss": 0.6889, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.264786958694458, "rewards/margins": 1.0032439231872559, "rewards/rejected": 2.261542797088623, "step": 28970 }, { "epoch": 1.3454663633409165, "grad_norm": 19.86754035949707, "learning_rate": 2.7587941253849605e-07, "logits/chosen": -18.32846450805664, "logits/rejected": -17.773679733276367, "logps/chosen": -301.2626647949219, "logps/rejected": -217.0930938720703, "loss": 0.661, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.7118964195251465, "rewards/margins": 1.0305893421173096, "rewards/rejected": 1.6813068389892578, "step": 28980 }, { "epoch": 1.3459306374483495, "grad_norm": 56.58674621582031, "learning_rate": 2.7580203352059056e-07, "logits/chosen": -19.154605865478516, "logits/rejected": -18.246252059936523, "logps/chosen": -374.27313232421875, "logps/rejected": -270.2363586425781, "loss": 0.4305, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.627594470977783, "rewards/margins": 1.0428435802459717, "rewards/rejected": 1.584751009941101, "step": 28990 }, { "epoch": 1.3463949115557825, "grad_norm": 14.74008846282959, "learning_rate": 2.757246545026851e-07, "logits/chosen": -18.338781356811523, "logits/rejected": -18.050737380981445, "logps/chosen": -278.4842529296875, "logps/rejected": -230.52804565429688, "loss": 0.6458, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.4290127754211426, "rewards/margins": 0.8509839177131653, "rewards/rejected": 1.578028917312622, "step": 29000 }, { "epoch": 1.3468591856632155, "grad_norm": 3.701340913772583, "learning_rate": 2.756472754847796e-07, "logits/chosen": -19.170589447021484, "logits/rejected": -18.300304412841797, "logps/chosen": -322.4405517578125, "logps/rejected": -225.01870727539062, "loss": 0.3737, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.953476667404175, 
"rewards/margins": 1.4365756511688232, "rewards/rejected": 1.516900897026062, "step": 29010 }, { "epoch": 1.3473234597706485, "grad_norm": 141.2955780029297, "learning_rate": 2.75569896466874e-07, "logits/chosen": -19.390029907226562, "logits/rejected": -18.895137786865234, "logps/chosen": -409.479736328125, "logps/rejected": -364.48602294921875, "loss": 0.4836, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.24043345451355, "rewards/margins": 1.177963376045227, "rewards/rejected": 2.062469959259033, "step": 29020 }, { "epoch": 1.3477877338780817, "grad_norm": 32.0178337097168, "learning_rate": 2.754925174489685e-07, "logits/chosen": -18.988601684570312, "logits/rejected": -18.300453186035156, "logps/chosen": -414.62347412109375, "logps/rejected": -289.2923889160156, "loss": 0.4682, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.6251022815704346, "rewards/margins": 0.8883734941482544, "rewards/rejected": 1.7367289066314697, "step": 29030 }, { "epoch": 1.3482520079855147, "grad_norm": 48.61091995239258, "learning_rate": 2.75415138431063e-07, "logits/chosen": -17.845073699951172, "logits/rejected": -17.781631469726562, "logps/chosen": -411.68817138671875, "logps/rejected": -386.08233642578125, "loss": 0.713, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.283282518386841, "rewards/margins": 0.3108200430870056, "rewards/rejected": 2.9724621772766113, "step": 29040 }, { "epoch": 1.3487162820929477, "grad_norm": 0.14023572206497192, "learning_rate": 2.753377594131575e-07, "logits/chosen": -20.113948822021484, "logits/rejected": -18.76675796508789, "logps/chosen": -457.64483642578125, "logps/rejected": -369.03668212890625, "loss": 0.2828, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.436631202697754, "rewards/margins": 2.6316657066345215, "rewards/rejected": 2.8049654960632324, "step": 29050 }, { "epoch": 1.3491805562003807, "grad_norm": 12.59914493560791, "learning_rate": 2.7526038039525204e-07, "logits/chosen": 
-18.276123046875, "logits/rejected": -17.556283950805664, "logps/chosen": -416.61505126953125, "logps/rejected": -292.4274597167969, "loss": 0.4784, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.146864891052246, "rewards/margins": 1.3198106288909912, "rewards/rejected": 1.8270543813705444, "step": 29060 }, { "epoch": 1.3496448303078137, "grad_norm": 24.581003189086914, "learning_rate": 2.751830013773465e-07, "logits/chosen": -18.80989646911621, "logits/rejected": -18.340402603149414, "logps/chosen": -236.04696655273438, "logps/rejected": -243.05166625976562, "loss": 0.8907, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.5807547569274902, "rewards/margins": 0.8737780451774597, "rewards/rejected": 1.7069766521453857, "step": 29070 }, { "epoch": 1.350109104415247, "grad_norm": 32.05027389526367, "learning_rate": 2.75105622359441e-07, "logits/chosen": -20.723876953125, "logits/rejected": -18.634002685546875, "logps/chosen": -406.22918701171875, "logps/rejected": -241.7259521484375, "loss": 0.2167, "rewards/accuracies": 1.0, "rewards/chosen": 3.4662728309631348, "rewards/margins": 2.0406060218811035, "rewards/rejected": 1.4256668090820312, "step": 29080 }, { "epoch": 1.3505733785226797, "grad_norm": 42.084999084472656, "learning_rate": 2.750282433415355e-07, "logits/chosen": -18.104473114013672, "logits/rejected": -17.489248275756836, "logps/chosen": -389.54608154296875, "logps/rejected": -429.15997314453125, "loss": 0.8941, "rewards/accuracies": 0.5, "rewards/chosen": 3.5422675609588623, "rewards/margins": 0.2772885262966156, "rewards/rejected": 3.264979124069214, "step": 29090 }, { "epoch": 1.3510376526301129, "grad_norm": 82.79557800292969, "learning_rate": 2.7495086432363003e-07, "logits/chosen": -18.40751838684082, "logits/rejected": -17.98793601989746, "logps/chosen": -398.28814697265625, "logps/rejected": -283.485107421875, "loss": 0.4935, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.3570556640625, "rewards/margins": 
1.474513292312622, "rewards/rejected": 1.882542371749878, "step": 29100 }, { "epoch": 1.3515019267375459, "grad_norm": 58.11591720581055, "learning_rate": 2.748734853057245e-07, "logits/chosen": -18.252805709838867, "logits/rejected": -18.035032272338867, "logps/chosen": -447.25604248046875, "logps/rejected": -354.08172607421875, "loss": 0.8246, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.3288731575012207, "rewards/margins": 0.36823511123657227, "rewards/rejected": 2.9606382846832275, "step": 29110 }, { "epoch": 1.3519662008449789, "grad_norm": 10.777541160583496, "learning_rate": 2.7479610628781895e-07, "logits/chosen": -18.944948196411133, "logits/rejected": -17.600101470947266, "logps/chosen": -315.88299560546875, "logps/rejected": -238.01815795898438, "loss": 0.5997, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.8252651691436768, "rewards/margins": 1.3219190835952759, "rewards/rejected": 1.5033462047576904, "step": 29120 }, { "epoch": 1.3524304749524119, "grad_norm": 8.661033630371094, "learning_rate": 2.7471872726991346e-07, "logits/chosen": -18.120441436767578, "logits/rejected": -18.10569190979004, "logps/chosen": -420.6109313964844, "logps/rejected": -391.44989013671875, "loss": 1.2046, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.4916131496429443, "rewards/margins": 0.12644243240356445, "rewards/rejected": 2.36517071723938, "step": 29130 }, { "epoch": 1.3528947490598449, "grad_norm": 15.4335298538208, "learning_rate": 2.7464134825200797e-07, "logits/chosen": -19.129091262817383, "logits/rejected": -18.959720611572266, "logps/chosen": -405.1817932128906, "logps/rejected": -356.3326721191406, "loss": 0.4165, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.288117408752441, "rewards/margins": 1.2671849727630615, "rewards/rejected": 3.02093243598938, "step": 29140 }, { "epoch": 1.353359023167278, "grad_norm": 154.42572021484375, "learning_rate": 2.745639692341025e-07, "logits/chosen": 
-19.703819274902344, "logits/rejected": -18.771907806396484, "logps/chosen": -393.29345703125, "logps/rejected": -311.4538879394531, "loss": 0.7225, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.486940383911133, "rewards/margins": 0.9195858836174011, "rewards/rejected": 2.567354440689087, "step": 29150 }, { "epoch": 1.353823297274711, "grad_norm": 54.109920501708984, "learning_rate": 2.74486590216197e-07, "logits/chosen": -18.6623592376709, "logits/rejected": -18.60095977783203, "logps/chosen": -431.68170166015625, "logps/rejected": -399.9954528808594, "loss": 1.0632, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.9072370529174805, "rewards/margins": -0.06288997828960419, "rewards/rejected": 2.9701271057128906, "step": 29160 }, { "epoch": 1.354287571382144, "grad_norm": 12.914237022399902, "learning_rate": 2.7440921119829145e-07, "logits/chosen": -19.60659408569336, "logits/rejected": -17.83059310913086, "logps/chosen": -368.95965576171875, "logps/rejected": -234.9336395263672, "loss": 0.3028, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.692322254180908, "rewards/margins": 1.602882742881775, "rewards/rejected": 1.0894393920898438, "step": 29170 }, { "epoch": 1.354751845489577, "grad_norm": 3.940173625946045, "learning_rate": 2.7433183218038596e-07, "logits/chosen": -19.21035385131836, "logits/rejected": -19.063678741455078, "logps/chosen": -469.2608947753906, "logps/rejected": -441.169189453125, "loss": 0.5643, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.511349678039551, "rewards/margins": 1.466827630996704, "rewards/rejected": 3.044522523880005, "step": 29180 }, { "epoch": 1.35521611959701, "grad_norm": 23.774682998657227, "learning_rate": 2.7425445316248047e-07, "logits/chosen": -18.97038459777832, "logits/rejected": -18.329580307006836, "logps/chosen": -465.5599670410156, "logps/rejected": -374.6590270996094, "loss": 0.5266, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 
4.848169326782227, "rewards/margins": 1.2347854375839233, "rewards/rejected": 3.6133837699890137, "step": 29190 }, { "epoch": 1.355680393704443, "grad_norm": 38.940670013427734, "learning_rate": 2.74177074144575e-07, "logits/chosen": -18.486225128173828, "logits/rejected": -18.30075454711914, "logps/chosen": -322.6728820800781, "logps/rejected": -350.30169677734375, "loss": 0.4441, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.3143832683563232, "rewards/margins": 0.7275682687759399, "rewards/rejected": 1.5868151187896729, "step": 29200 }, { "epoch": 1.356144667811876, "grad_norm": 46.02302169799805, "learning_rate": 2.7409969512666944e-07, "logits/chosen": -18.952865600585938, "logits/rejected": -17.248523712158203, "logps/chosen": -422.19525146484375, "logps/rejected": -255.2789306640625, "loss": 0.6073, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.3994297981262207, "rewards/margins": 1.9411849975585938, "rewards/rejected": 1.458245038986206, "step": 29210 }, { "epoch": 1.3566089419193093, "grad_norm": 9.514514923095703, "learning_rate": 2.740223161087639e-07, "logits/chosen": -18.084789276123047, "logits/rejected": -16.748214721679688, "logps/chosen": -449.4970703125, "logps/rejected": -307.46014404296875, "loss": 0.5615, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.901637554168701, "rewards/margins": 1.4703621864318848, "rewards/rejected": 2.4312758445739746, "step": 29220 }, { "epoch": 1.3570732160267422, "grad_norm": 93.17048645019531, "learning_rate": 2.739449370908584e-07, "logits/chosen": -18.493139266967773, "logits/rejected": -17.30780029296875, "logps/chosen": -334.1895751953125, "logps/rejected": -236.2777862548828, "loss": 0.5318, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.5843701362609863, "rewards/margins": 1.2417898178100586, "rewards/rejected": 1.3425804376602173, "step": 29230 }, { "epoch": 1.3575374901341752, "grad_norm": 3.6081886291503906, "learning_rate": 2.738675580729529e-07, 
"logits/chosen": -18.5391845703125, "logits/rejected": -17.312259674072266, "logps/chosen": -386.4758605957031, "logps/rejected": -256.099365234375, "loss": 0.5977, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9472222328186035, "rewards/margins": 1.209989070892334, "rewards/rejected": 1.7372328042984009, "step": 29240 }, { "epoch": 1.3580017642416082, "grad_norm": 26.747974395751953, "learning_rate": 2.7379017905504743e-07, "logits/chosen": -19.93669891357422, "logits/rejected": -19.7744197845459, "logps/chosen": -458.20611572265625, "logps/rejected": -395.54302978515625, "loss": 0.8544, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.1672844886779785, "rewards/margins": 0.21686506271362305, "rewards/rejected": 3.9504191875457764, "step": 29250 }, { "epoch": 1.3584660383490412, "grad_norm": 29.33815574645996, "learning_rate": 2.7371280003714194e-07, "logits/chosen": -19.38479232788086, "logits/rejected": -19.325916290283203, "logps/chosen": -397.3176574707031, "logps/rejected": -434.5223693847656, "loss": 1.1172, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 4.076965808868408, "rewards/margins": -0.1938447207212448, "rewards/rejected": 4.270810127258301, "step": 29260 }, { "epoch": 1.3589303124564742, "grad_norm": 0.7781445980072021, "learning_rate": 2.736354210192364e-07, "logits/chosen": -19.219268798828125, "logits/rejected": -17.534931182861328, "logps/chosen": -442.46038818359375, "logps/rejected": -354.3652648925781, "loss": 0.5747, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.8009636402130127, "rewards/margins": 1.4352762699127197, "rewards/rejected": 2.3656868934631348, "step": 29270 }, { "epoch": 1.3593945865639072, "grad_norm": 16.81730842590332, "learning_rate": 2.735580420013309e-07, "logits/chosen": -19.144542694091797, "logits/rejected": -18.776052474975586, "logps/chosen": -283.55572509765625, "logps/rejected": -308.45269775390625, "loss": 0.8693, "rewards/accuracies": 0.800000011920929, 
"rewards/chosen": 3.080972909927368, "rewards/margins": 0.6765273809432983, "rewards/rejected": 2.4044454097747803, "step": 29280 }, { "epoch": 1.3598588606713404, "grad_norm": 10.455975532531738, "learning_rate": 2.734806629834254e-07, "logits/chosen": -19.074350357055664, "logits/rejected": -18.50198745727539, "logps/chosen": -415.0862731933594, "logps/rejected": -357.6407470703125, "loss": 0.3361, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.66194224357605, "rewards/margins": 1.576147437095642, "rewards/rejected": 2.0857951641082764, "step": 29290 }, { "epoch": 1.3603231347787734, "grad_norm": 4.995025634765625, "learning_rate": 2.734032839655199e-07, "logits/chosen": -18.49462127685547, "logits/rejected": -18.053953170776367, "logps/chosen": -417.630859375, "logps/rejected": -260.546142578125, "loss": 0.4573, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.089589834213257, "rewards/margins": 1.249860405921936, "rewards/rejected": 1.8397290706634521, "step": 29300 }, { "epoch": 1.3607874088862064, "grad_norm": 1.7527300119400024, "learning_rate": 2.733259049476144e-07, "logits/chosen": -18.39590072631836, "logits/rejected": -17.936487197875977, "logps/chosen": -343.7615966796875, "logps/rejected": -236.2691192626953, "loss": 0.8981, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4694931507110596, "rewards/margins": 1.2681758403778076, "rewards/rejected": 2.2013165950775146, "step": 29310 }, { "epoch": 1.3612516829936394, "grad_norm": 59.59162902832031, "learning_rate": 2.7324852592970885e-07, "logits/chosen": -18.993968963623047, "logits/rejected": -17.755401611328125, "logps/chosen": -366.198486328125, "logps/rejected": -259.77606201171875, "loss": 0.5697, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.1475744247436523, "rewards/margins": 1.2794787883758545, "rewards/rejected": 1.8680957555770874, "step": 29320 }, { "epoch": 1.3617159571010724, "grad_norm": 108.2056655883789, "learning_rate": 
2.7317114691180336e-07, "logits/chosen": -19.387990951538086, "logits/rejected": -17.447311401367188, "logps/chosen": -467.40472412109375, "logps/rejected": -268.31640625, "loss": 0.2982, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.3858642578125, "rewards/margins": 2.6808018684387207, "rewards/rejected": 1.7050625085830688, "step": 29330 }, { "epoch": 1.3621802312085056, "grad_norm": 312.90374755859375, "learning_rate": 2.730937678938979e-07, "logits/chosen": -19.033357620239258, "logits/rejected": -18.12788200378418, "logps/chosen": -364.8557434082031, "logps/rejected": -269.25030517578125, "loss": 0.8711, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.7371437549591064, "rewards/margins": 0.6313504576683044, "rewards/rejected": 2.1057934761047363, "step": 29340 }, { "epoch": 1.3626445053159384, "grad_norm": 213.9418182373047, "learning_rate": 2.730163888759924e-07, "logits/chosen": -19.443336486816406, "logits/rejected": -18.499588012695312, "logps/chosen": -431.83941650390625, "logps/rejected": -450.0518493652344, "loss": 0.9891, "rewards/accuracies": 0.5, "rewards/chosen": 3.2512383460998535, "rewards/margins": 0.5901288986206055, "rewards/rejected": 2.661109447479248, "step": 29350 }, { "epoch": 1.3631087794233716, "grad_norm": 53.11000061035156, "learning_rate": 2.729390098580869e-07, "logits/chosen": -19.668292999267578, "logits/rejected": -18.662059783935547, "logps/chosen": -346.796630859375, "logps/rejected": -290.07232666015625, "loss": 0.5241, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.3662381172180176, "rewards/margins": 1.0654890537261963, "rewards/rejected": 2.3007490634918213, "step": 29360 }, { "epoch": 1.3635730535308046, "grad_norm": 76.22904968261719, "learning_rate": 2.7286163084018136e-07, "logits/chosen": -18.96674156188965, "logits/rejected": -19.207366943359375, "logps/chosen": -289.1634826660156, "logps/rejected": -299.71160888671875, "loss": 1.2201, "rewards/accuracies": 0.20000000298023224, 
"rewards/chosen": 2.4233603477478027, "rewards/margins": -0.39959192276000977, "rewards/rejected": 2.8229520320892334, "step": 29370 }, { "epoch": 1.3640373276382376, "grad_norm": 42.759395599365234, "learning_rate": 2.7278425182227587e-07, "logits/chosen": -21.1761417388916, "logits/rejected": -18.758018493652344, "logps/chosen": -459.5486755371094, "logps/rejected": -319.489013671875, "loss": 0.5912, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.488651752471924, "rewards/margins": 2.0696005821228027, "rewards/rejected": 2.419050931930542, "step": 29380 }, { "epoch": 1.3645016017456706, "grad_norm": 101.90167236328125, "learning_rate": 2.727068728043704e-07, "logits/chosen": -18.919153213500977, "logits/rejected": -18.816373825073242, "logps/chosen": -330.91015625, "logps/rejected": -327.16558837890625, "loss": 0.9214, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.3975672721862793, "rewards/margins": 0.6526784300804138, "rewards/rejected": 2.744889259338379, "step": 29390 }, { "epoch": 1.3649658758531036, "grad_norm": 0.4711402952671051, "learning_rate": 2.7262949378646484e-07, "logits/chosen": -18.528173446655273, "logits/rejected": -17.45303726196289, "logps/chosen": -384.7862548828125, "logps/rejected": -280.0523376464844, "loss": 0.4077, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.501131534576416, "rewards/margins": 2.3128726482391357, "rewards/rejected": 2.188258409500122, "step": 29400 }, { "epoch": 1.3654301499605368, "grad_norm": 148.52418518066406, "learning_rate": 2.7255211476855935e-07, "logits/chosen": -19.19792938232422, "logits/rejected": -19.034358978271484, "logps/chosen": -373.0281066894531, "logps/rejected": -398.9834899902344, "loss": 1.2516, "rewards/accuracies": 0.5, "rewards/chosen": 3.4698028564453125, "rewards/margins": -0.1745128333568573, "rewards/rejected": 3.644315719604492, "step": 29410 }, { "epoch": 1.3658944240679698, "grad_norm": 141.4966278076172, "learning_rate": 
2.724747357506538e-07, "logits/chosen": -19.93951416015625, "logits/rejected": -19.63154411315918, "logps/chosen": -350.90167236328125, "logps/rejected": -308.2395324707031, "loss": 0.6994, "rewards/accuracies": 0.5, "rewards/chosen": 3.692333221435547, "rewards/margins": 0.4155580997467041, "rewards/rejected": 3.276775360107422, "step": 29420 }, { "epoch": 1.3663586981754028, "grad_norm": 2.3319032192230225, "learning_rate": 2.723973567327483e-07, "logits/chosen": -19.22754669189453, "logits/rejected": -18.651124954223633, "logps/chosen": -443.76519775390625, "logps/rejected": -374.0753479003906, "loss": 0.758, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.55741810798645, "rewards/margins": 0.7800869345664978, "rewards/rejected": 2.7773311138153076, "step": 29430 }, { "epoch": 1.3668229722828358, "grad_norm": 54.038551330566406, "learning_rate": 2.7231997771484283e-07, "logits/chosen": -19.012540817260742, "logits/rejected": -17.796283721923828, "logps/chosen": -406.5762939453125, "logps/rejected": -297.1059265136719, "loss": 0.8254, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.8626949787139893, "rewards/margins": 1.3754078149795532, "rewards/rejected": 2.4872870445251465, "step": 29440 }, { "epoch": 1.3672872463902688, "grad_norm": 10.879138946533203, "learning_rate": 2.7224259869693734e-07, "logits/chosen": -19.15572166442871, "logits/rejected": -18.441452026367188, "logps/chosen": -497.34686279296875, "logps/rejected": -379.89642333984375, "loss": 0.4736, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.420755386352539, "rewards/margins": 1.854397177696228, "rewards/rejected": 2.5663580894470215, "step": 29450 }, { "epoch": 1.3677515204977018, "grad_norm": 51.46665573120117, "learning_rate": 2.7216521967903185e-07, "logits/chosen": -19.62188148498535, "logits/rejected": -18.647281646728516, "logps/chosen": -349.5755920410156, "logps/rejected": -321.2305908203125, "loss": 0.6694, "rewards/accuracies": 
0.6000000238418579, "rewards/chosen": 2.4704909324645996, "rewards/margins": 0.1713007390499115, "rewards/rejected": 2.299190044403076, "step": 29460 }, { "epoch": 1.3682157946051348, "grad_norm": 4.993635654449463, "learning_rate": 2.720878406611263e-07, "logits/chosen": -18.49370765686035, "logits/rejected": -17.132064819335938, "logps/chosen": -419.2543029785156, "logps/rejected": -251.3346710205078, "loss": 0.3723, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.556668758392334, "rewards/margins": 2.7386603355407715, "rewards/rejected": 1.8180086612701416, "step": 29470 }, { "epoch": 1.368680068712568, "grad_norm": 28.952857971191406, "learning_rate": 2.720104616432208e-07, "logits/chosen": -18.11974334716797, "logits/rejected": -18.26908302307129, "logps/chosen": -289.3258361816406, "logps/rejected": -309.0738830566406, "loss": 0.7764, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.121295928955078, "rewards/margins": 0.2978341281414032, "rewards/rejected": 1.8234617710113525, "step": 29480 }, { "epoch": 1.369144342820001, "grad_norm": 74.22229766845703, "learning_rate": 2.719330826253153e-07, "logits/chosen": -19.284400939941406, "logits/rejected": -18.424205780029297, "logps/chosen": -404.8836975097656, "logps/rejected": -357.6045837402344, "loss": 0.4156, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4143688678741455, "rewards/margins": 0.9799703359603882, "rewards/rejected": 2.434398651123047, "step": 29490 }, { "epoch": 1.369608616927434, "grad_norm": 1.3976186513900757, "learning_rate": 2.718557036074098e-07, "logits/chosen": -18.3182430267334, "logits/rejected": -17.59600067138672, "logps/chosen": -404.3369140625, "logps/rejected": -300.6619873046875, "loss": 0.7949, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6977996826171875, "rewards/margins": 1.1700469255447388, "rewards/rejected": 2.5277531147003174, "step": 29500 }, { "epoch": 1.370072891034867, "grad_norm": 20.690523147583008, 
"learning_rate": 2.717783245895043e-07, "logits/chosen": -18.798965454101562, "logits/rejected": -18.596080780029297, "logps/chosen": -340.1388244628906, "logps/rejected": -431.1653747558594, "loss": 1.966, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.264106273651123, "rewards/margins": -1.3767683506011963, "rewards/rejected": 3.6408743858337402, "step": 29510 }, { "epoch": 1.3705371651423, "grad_norm": 32.22704315185547, "learning_rate": 2.7170094557159876e-07, "logits/chosen": -18.419824600219727, "logits/rejected": -17.196636199951172, "logps/chosen": -327.91845703125, "logps/rejected": -167.2837371826172, "loss": 0.355, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.943099021911621, "rewards/margins": 1.5759963989257812, "rewards/rejected": 0.3671025335788727, "step": 29520 }, { "epoch": 1.3710014392497332, "grad_norm": 54.78057098388672, "learning_rate": 2.7162356655369327e-07, "logits/chosen": -19.169879913330078, "logits/rejected": -19.11370849609375, "logps/chosen": -440.392822265625, "logps/rejected": -494.62762451171875, "loss": 0.5453, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.8952572345733643, "rewards/margins": 0.5370581746101379, "rewards/rejected": 3.358198881149292, "step": 29530 }, { "epoch": 1.371465713357166, "grad_norm": 0.10761183500289917, "learning_rate": 2.715461875357878e-07, "logits/chosen": -18.967288970947266, "logits/rejected": -18.01375961303711, "logps/chosen": -337.80126953125, "logps/rejected": -254.7932586669922, "loss": 0.4467, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.554725170135498, "rewards/margins": 1.4150230884552002, "rewards/rejected": 1.1397018432617188, "step": 29540 }, { "epoch": 1.3719299874645992, "grad_norm": 100.31336975097656, "learning_rate": 2.714688085178823e-07, "logits/chosen": -18.386919021606445, "logits/rejected": -17.442604064941406, "logps/chosen": -342.11822509765625, "logps/rejected": -235.6815948486328, "loss": 0.5648, 
"rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.211703062057495, "rewards/margins": 0.9929329752922058, "rewards/rejected": 2.2187702655792236, "step": 29550 }, { "epoch": 1.3723942615720321, "grad_norm": 2.9752047061920166, "learning_rate": 2.713914294999768e-07, "logits/chosen": -19.07013702392578, "logits/rejected": -17.910869598388672, "logps/chosen": -423.76165771484375, "logps/rejected": -283.2478332519531, "loss": 0.3155, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0751354694366455, "rewards/margins": 1.6639267206192017, "rewards/rejected": 1.4112086296081543, "step": 29560 }, { "epoch": 1.3728585356794651, "grad_norm": 99.78086853027344, "learning_rate": 2.7131405048207127e-07, "logits/chosen": -18.41241455078125, "logits/rejected": -17.71457290649414, "logps/chosen": -379.7122497558594, "logps/rejected": -285.95892333984375, "loss": 0.7645, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.109800338745117, "rewards/margins": 0.41949042677879333, "rewards/rejected": 2.69031023979187, "step": 29570 }, { "epoch": 1.3733228097868981, "grad_norm": null, "learning_rate": 2.7124440936595634e-07, "logits/chosen": -18.842491149902344, "logits/rejected": -17.83218765258789, "logps/chosen": -424.76947021484375, "logps/rejected": -297.872802734375, "loss": 0.6863, "rewards/accuracies": 0.5, "rewards/chosen": 3.2137627601623535, "rewards/margins": 0.6280009150505066, "rewards/rejected": 2.585761785507202, "step": 29580 }, { "epoch": 1.3737870838943311, "grad_norm": 10.564361572265625, "learning_rate": 2.7116703034805085e-07, "logits/chosen": -19.016977310180664, "logits/rejected": -18.36332130432129, "logps/chosen": -323.1806335449219, "logps/rejected": -312.73504638671875, "loss": 1.1666, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.597073793411255, "rewards/margins": -0.043505262583494186, "rewards/rejected": 2.6405789852142334, "step": 29590 }, { "epoch": 1.3742513580017643, "grad_norm": 
22.546669006347656, "learning_rate": 2.710896513301453e-07, "logits/chosen": -19.025585174560547, "logits/rejected": -18.3448486328125, "logps/chosen": -395.8536376953125, "logps/rejected": -310.5702209472656, "loss": 0.5983, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.7091572284698486, "rewards/margins": 0.7367078065872192, "rewards/rejected": 1.9724493026733398, "step": 29600 }, { "epoch": 1.3747156321091973, "grad_norm": 102.53713989257812, "learning_rate": 2.7101227231223977e-07, "logits/chosen": -18.53695297241211, "logits/rejected": -18.08261489868164, "logps/chosen": -284.06671142578125, "logps/rejected": -238.22708129882812, "loss": 1.1664, "rewards/accuracies": 0.5, "rewards/chosen": 1.912308692932129, "rewards/margins": -0.465959370136261, "rewards/rejected": 2.378268003463745, "step": 29610 }, { "epoch": 1.3751799062166303, "grad_norm": 0.41775649785995483, "learning_rate": 2.709348932943343e-07, "logits/chosen": -19.310657501220703, "logits/rejected": -17.925506591796875, "logps/chosen": -371.86260986328125, "logps/rejected": -287.7621154785156, "loss": 0.5456, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4819717407226562, "rewards/margins": 1.535238265991211, "rewards/rejected": 1.9467334747314453, "step": 29620 }, { "epoch": 1.3756441803240633, "grad_norm": 24.180599212646484, "learning_rate": 2.708575142764288e-07, "logits/chosen": -18.85369110107422, "logits/rejected": -18.303874969482422, "logps/chosen": -463.9393005371094, "logps/rejected": -364.49090576171875, "loss": 0.485, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.180089950561523, "rewards/margins": 1.8005962371826172, "rewards/rejected": 2.3794937133789062, "step": 29630 }, { "epoch": 1.3761084544314963, "grad_norm": 2.664546012878418, "learning_rate": 2.707801352585233e-07, "logits/chosen": -18.662593841552734, "logits/rejected": -17.109615325927734, "logps/chosen": -305.45709228515625, "logps/rejected": -226.58837890625, "loss": 0.7002, 
"rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.934539794921875, "rewards/margins": 1.707363486289978, "rewards/rejected": 1.2271761894226074, "step": 29640 }, { "epoch": 1.3765727285389293, "grad_norm": 74.3008041381836, "learning_rate": 2.7070275624061776e-07, "logits/chosen": -19.48743438720703, "logits/rejected": -17.977703094482422, "logps/chosen": -411.71014404296875, "logps/rejected": -241.9388885498047, "loss": 0.5354, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.018134117126465, "rewards/margins": 2.3979554176330566, "rewards/rejected": 1.6201785802841187, "step": 29650 }, { "epoch": 1.3770370026463623, "grad_norm": 178.8349609375, "learning_rate": 2.7062537722271227e-07, "logits/chosen": -19.770160675048828, "logits/rejected": -19.585960388183594, "logps/chosen": -389.42742919921875, "logps/rejected": -408.16204833984375, "loss": 0.9236, "rewards/accuracies": 0.5, "rewards/chosen": 3.719417095184326, "rewards/margins": 0.8780008554458618, "rewards/rejected": 2.841416597366333, "step": 29660 }, { "epoch": 1.3775012767537955, "grad_norm": 82.12003326416016, "learning_rate": 2.705479982048068e-07, "logits/chosen": -18.94090461730957, "logits/rejected": -18.47060775756836, "logps/chosen": -367.7607116699219, "logps/rejected": -362.34130859375, "loss": 1.3174, "rewards/accuracies": 0.5, "rewards/chosen": 2.174253463745117, "rewards/margins": -0.4349920153617859, "rewards/rejected": 2.6092453002929688, "step": 29670 }, { "epoch": 1.3779655508612285, "grad_norm": 11.164817810058594, "learning_rate": 2.704706191869013e-07, "logits/chosen": -18.811269760131836, "logits/rejected": -17.85721778869629, "logps/chosen": -398.5093688964844, "logps/rejected": -298.95648193359375, "loss": 0.8051, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.5273051261901855, "rewards/margins": 1.0994738340377808, "rewards/rejected": 2.4278311729431152, "step": 29680 }, { "epoch": 1.3784298249686615, "grad_norm": 100.52278900146484, 
"learning_rate": 2.703932401689958e-07, "logits/chosen": -19.351388931274414, "logits/rejected": -18.441314697265625, "logps/chosen": -440.3934631347656, "logps/rejected": -367.36163330078125, "loss": 0.4606, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.96970796585083, "rewards/margins": 1.0581557750701904, "rewards/rejected": 2.9115519523620605, "step": 29690 }, { "epoch": 1.3788940990760945, "grad_norm": 7.096358299255371, "learning_rate": 2.7031586115109026e-07, "logits/chosen": -18.97287940979004, "logits/rejected": -18.388561248779297, "logps/chosen": -322.1465148925781, "logps/rejected": -297.61041259765625, "loss": 0.5575, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.693138599395752, "rewards/margins": 0.8602763414382935, "rewards/rejected": 1.8328624963760376, "step": 29700 }, { "epoch": 1.3793583731835275, "grad_norm": 21.256916046142578, "learning_rate": 2.702384821331847e-07, "logits/chosen": -19.123979568481445, "logits/rejected": -18.248477935791016, "logps/chosen": -486.6851501464844, "logps/rejected": -394.15020751953125, "loss": 0.475, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.558632850646973, "rewards/margins": 1.4581682682037354, "rewards/rejected": 3.1004645824432373, "step": 29710 }, { "epoch": 1.3798226472909607, "grad_norm": 80.91207885742188, "learning_rate": 2.7016110311527923e-07, "logits/chosen": -19.06952476501465, "logits/rejected": -18.490036010742188, "logps/chosen": -370.09783935546875, "logps/rejected": -282.4089660644531, "loss": 0.635, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.8550093173980713, "rewards/margins": 0.7313455939292908, "rewards/rejected": 2.1236634254455566, "step": 29720 }, { "epoch": 1.3802869213983935, "grad_norm": 75.93052673339844, "learning_rate": 2.7008372409737375e-07, "logits/chosen": -19.833316802978516, "logits/rejected": -18.34170150756836, "logps/chosen": -530.32568359375, "logps/rejected": -418.31573486328125, "loss": 0.2135, 
"rewards/accuracies": 1.0, "rewards/chosen": 5.090336799621582, "rewards/margins": 2.546710968017578, "rewards/rejected": 2.543625831604004, "step": 29730 }, { "epoch": 1.3807511955058267, "grad_norm": 231.46961975097656, "learning_rate": 2.7000634507946826e-07, "logits/chosen": -19.905040740966797, "logits/rejected": -20.25611114501953, "logps/chosen": -451.0272521972656, "logps/rejected": -461.47198486328125, "loss": 1.0263, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.860340118408203, "rewards/margins": 0.18101665377616882, "rewards/rejected": 3.679323196411133, "step": 29740 }, { "epoch": 1.3812154696132597, "grad_norm": 21.96904945373535, "learning_rate": 2.699289660615627e-07, "logits/chosen": -19.397319793701172, "logits/rejected": -17.861289978027344, "logps/chosen": -324.49334716796875, "logps/rejected": -182.0677947998047, "loss": 0.3659, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.835451602935791, "rewards/margins": 2.106924057006836, "rewards/rejected": 0.7285276651382446, "step": 29750 }, { "epoch": 1.3816797437206927, "grad_norm": 48.08738708496094, "learning_rate": 2.6985158704365723e-07, "logits/chosen": -20.793672561645508, "logits/rejected": -19.128982543945312, "logps/chosen": -355.2630920410156, "logps/rejected": -303.8415222167969, "loss": 0.3839, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.418530225753784, "rewards/margins": 1.6220957040786743, "rewards/rejected": 1.796434998512268, "step": 29760 }, { "epoch": 1.3821440178281257, "grad_norm": 9.280301094055176, "learning_rate": 2.6977420802575174e-07, "logits/chosen": -19.49542808532715, "logits/rejected": -18.19826316833496, "logps/chosen": -402.10430908203125, "logps/rejected": -280.507568359375, "loss": 0.3993, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.487894058227539, "rewards/margins": 2.0808491706848145, "rewards/rejected": 2.4070446491241455, "step": 29770 }, { "epoch": 1.3826082919355587, "grad_norm": 
25.153282165527344, "learning_rate": 2.6969682900784625e-07, "logits/chosen": -18.548839569091797, "logits/rejected": -18.68630599975586, "logps/chosen": -372.513671875, "logps/rejected": -329.30902099609375, "loss": 0.9158, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.759129285812378, "rewards/margins": 0.6472073793411255, "rewards/rejected": 3.111921787261963, "step": 29780 }, { "epoch": 1.383072566042992, "grad_norm": 11.540790557861328, "learning_rate": 2.6961944998994076e-07, "logits/chosen": -18.81570816040039, "logits/rejected": -17.072147369384766, "logps/chosen": -349.0387878417969, "logps/rejected": -202.88534545898438, "loss": 0.4106, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.6743276119232178, "rewards/margins": 1.8302749395370483, "rewards/rejected": 0.8440526723861694, "step": 29790 }, { "epoch": 1.3835368401504249, "grad_norm": 54.82822036743164, "learning_rate": 2.6954207097203517e-07, "logits/chosen": -19.4130916595459, "logits/rejected": -18.852872848510742, "logps/chosen": -357.6288146972656, "logps/rejected": -278.9911193847656, "loss": 0.39, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4010608196258545, "rewards/margins": 1.2041387557983398, "rewards/rejected": 2.1969223022460938, "step": 29800 }, { "epoch": 1.3840011142578579, "grad_norm": 39.18110275268555, "learning_rate": 2.694646919541297e-07, "logits/chosen": -19.104543685913086, "logits/rejected": -18.54709243774414, "logps/chosen": -465.2085876464844, "logps/rejected": -488.01190185546875, "loss": 0.5893, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.360051155090332, "rewards/margins": 1.0842814445495605, "rewards/rejected": 3.2757694721221924, "step": 29810 }, { "epoch": 1.3844653883652909, "grad_norm": 34.363094329833984, "learning_rate": 2.693873129362242e-07, "logits/chosen": -18.915836334228516, "logits/rejected": -18.0568904876709, "logps/chosen": -411.4892578125, "logps/rejected": -292.8518371582031, "loss": 
0.2195, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.134160041809082, "rewards/margins": 2.660745143890381, "rewards/rejected": 1.4734151363372803, "step": 29820 }, { "epoch": 1.3849296624727239, "grad_norm": 158.93898010253906, "learning_rate": 2.693099339183187e-07, "logits/chosen": -19.212055206298828, "logits/rejected": -18.07421875, "logps/chosen": -371.70416259765625, "logps/rejected": -320.6064758300781, "loss": 0.6291, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.882969617843628, "rewards/margins": 1.4732567071914673, "rewards/rejected": 2.4097132682800293, "step": 29830 }, { "epoch": 1.3853939365801569, "grad_norm": 151.99452209472656, "learning_rate": 2.692325549004132e-07, "logits/chosen": -19.330045700073242, "logits/rejected": -19.20553207397461, "logps/chosen": -466.98663330078125, "logps/rejected": -439.4537048339844, "loss": 0.7651, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.996535539627075, "rewards/margins": 1.1765533685684204, "rewards/rejected": 2.8199820518493652, "step": 29840 }, { "epoch": 1.3858582106875899, "grad_norm": 1.1881542205810547, "learning_rate": 2.6915517588250767e-07, "logits/chosen": -18.79068374633789, "logits/rejected": -17.41681480407715, "logps/chosen": -469.9620666503906, "logps/rejected": -222.35269165039062, "loss": 0.2674, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.107847213745117, "rewards/margins": 2.7251524925231934, "rewards/rejected": 1.382694959640503, "step": 29850 }, { "epoch": 1.386322484795023, "grad_norm": 56.15802001953125, "learning_rate": 2.690777968646022e-07, "logits/chosen": -19.779499053955078, "logits/rejected": -18.85445785522461, "logps/chosen": -477.1227111816406, "logps/rejected": -321.773193359375, "loss": 0.5047, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.701457977294922, "rewards/margins": 1.1130915880203247, "rewards/rejected": 2.5883662700653076, "step": 29860 }, { "epoch": 1.386786758902456, "grad_norm": 
19.1243896484375, "learning_rate": 2.690004178466967e-07, "logits/chosen": -19.97690773010254, "logits/rejected": -18.82399559020996, "logps/chosen": -448.8443908691406, "logps/rejected": -269.29998779296875, "loss": 0.3441, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.135921955108643, "rewards/margins": 1.9739866256713867, "rewards/rejected": 2.161935329437256, "step": 29870 }, { "epoch": 1.387251033009889, "grad_norm": 253.9959259033203, "learning_rate": 2.689230388287912e-07, "logits/chosen": -18.691509246826172, "logits/rejected": -18.08732795715332, "logps/chosen": -483.88543701171875, "logps/rejected": -407.9254150390625, "loss": 0.3773, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.358303070068359, "rewards/margins": 1.9227946996688843, "rewards/rejected": 2.4355080127716064, "step": 29880 }, { "epoch": 1.387715307117322, "grad_norm": 7.786584854125977, "learning_rate": 2.688456598108857e-07, "logits/chosen": -18.38039779663086, "logits/rejected": -17.232053756713867, "logps/chosen": -443.6787109375, "logps/rejected": -300.95306396484375, "loss": 0.649, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.02117919921875, "rewards/margins": 1.4358065128326416, "rewards/rejected": 2.5853729248046875, "step": 29890 }, { "epoch": 1.388179581224755, "grad_norm": 3.5969481468200684, "learning_rate": 2.687682807929801e-07, "logits/chosen": -18.095043182373047, "logits/rejected": -17.473400115966797, "logps/chosen": -247.1631317138672, "logps/rejected": -180.8124237060547, "loss": 0.2474, "rewards/accuracies": 1.0, "rewards/chosen": 2.6420280933380127, "rewards/margins": 1.7173560857772827, "rewards/rejected": 0.9246721267700195, "step": 29900 }, { "epoch": 1.3886438553321883, "grad_norm": 118.3885726928711, "learning_rate": 2.6869090177507463e-07, "logits/chosen": -19.105255126953125, "logits/rejected": -18.36219596862793, "logps/chosen": -443.9425354003906, "logps/rejected": -266.3872375488281, "loss": 0.3898, 
"rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.808937072753906, "rewards/margins": 2.1583738327026367, "rewards/rejected": 2.6505627632141113, "step": 29910 }, { "epoch": 1.389108129439621, "grad_norm": 5.164167881011963, "learning_rate": 2.6861352275716914e-07, "logits/chosen": -19.168893814086914, "logits/rejected": -17.882551193237305, "logps/chosen": -418.87615966796875, "logps/rejected": -325.0138244628906, "loss": 0.5656, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.8731961250305176, "rewards/margins": 1.5127238035202026, "rewards/rejected": 2.3604726791381836, "step": 29920 }, { "epoch": 1.3895724035470542, "grad_norm": 1.3350282907485962, "learning_rate": 2.6853614373926365e-07, "logits/chosen": -20.199642181396484, "logits/rejected": -19.386322021484375, "logps/chosen": -381.7919616699219, "logps/rejected": -290.66143798828125, "loss": 0.5646, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.507882595062256, "rewards/margins": 1.211774468421936, "rewards/rejected": 2.2961084842681885, "step": 29930 }, { "epoch": 1.3900366776544872, "grad_norm": 18.393678665161133, "learning_rate": 2.6845876472135817e-07, "logits/chosen": -19.175451278686523, "logits/rejected": -19.52077865600586, "logps/chosen": -282.4747009277344, "logps/rejected": -278.06927490234375, "loss": 1.1687, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 1.9623104333877563, "rewards/margins": -0.4577454626560211, "rewards/rejected": 2.420055866241455, "step": 29940 }, { "epoch": 1.3905009517619202, "grad_norm": 18.56375503540039, "learning_rate": 2.683813857034526e-07, "logits/chosen": -19.042695999145508, "logits/rejected": -18.2913761138916, "logps/chosen": -324.120849609375, "logps/rejected": -250.02334594726562, "loss": 0.5325, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4925339221954346, "rewards/margins": 1.6637992858886719, "rewards/rejected": 1.8287343978881836, "step": 29950 }, { "epoch": 1.3909652258693532, 
"grad_norm": 30.423099517822266, "learning_rate": 2.6830400668554714e-07, "logits/chosen": -19.651905059814453, "logits/rejected": -18.89296531677246, "logps/chosen": -646.8004150390625, "logps/rejected": -445.28863525390625, "loss": 0.5251, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.451825141906738, "rewards/margins": 2.0643837451934814, "rewards/rejected": 3.387442111968994, "step": 29960 }, { "epoch": 1.3914294999767862, "grad_norm": 89.4675521850586, "learning_rate": 2.6822662766764165e-07, "logits/chosen": -19.44582176208496, "logits/rejected": -18.702234268188477, "logps/chosen": -477.38214111328125, "logps/rejected": -384.5355529785156, "loss": 0.6604, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.226563930511475, "rewards/margins": 0.8753805160522461, "rewards/rejected": 3.3511836528778076, "step": 29970 }, { "epoch": 1.3918937740842194, "grad_norm": 64.67066955566406, "learning_rate": 2.6814924864973616e-07, "logits/chosen": -18.399995803833008, "logits/rejected": -17.99362564086914, "logps/chosen": -386.08074951171875, "logps/rejected": -319.14019775390625, "loss": 0.8794, "rewards/accuracies": 0.5, "rewards/chosen": 3.8316924571990967, "rewards/margins": 0.4496378004550934, "rewards/rejected": 3.3820552825927734, "step": 29980 }, { "epoch": 1.3923580481916524, "grad_norm": 127.16104888916016, "learning_rate": 2.680718696318306e-07, "logits/chosen": -19.3513240814209, "logits/rejected": -19.101036071777344, "logps/chosen": -420.497802734375, "logps/rejected": -409.5958557128906, "loss": 1.1487, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 4.042827129364014, "rewards/margins": 0.2550623118877411, "rewards/rejected": 3.78776478767395, "step": 29990 }, { "epoch": 1.3928223222990854, "grad_norm": 8.123077392578125, "learning_rate": 2.679944906139251e-07, "logits/chosen": -19.229326248168945, "logits/rejected": -18.76422691345215, "logps/chosen": -422.33721923828125, "logps/rejected": -354.5446472167969, 
"loss": 0.4982, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.5139851570129395, "rewards/margins": 1.0376001596450806, "rewards/rejected": 2.4763851165771484, "step": 30000 }, { "epoch": 1.3932865964065184, "grad_norm": 244.94900512695312, "learning_rate": 2.679171115960196e-07, "logits/chosen": -18.665233612060547, "logits/rejected": -18.63285255432129, "logps/chosen": -467.0625, "logps/rejected": -442.7093811035156, "loss": 0.6615, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.122467994689941, "rewards/margins": 0.579116702079773, "rewards/rejected": 3.543351411819458, "step": 30010 }, { "epoch": 1.3937508705139514, "grad_norm": 15.267674446105957, "learning_rate": 2.678397325781141e-07, "logits/chosen": -19.481698989868164, "logits/rejected": -18.57172203063965, "logps/chosen": -478.83349609375, "logps/rejected": -348.75030517578125, "loss": 0.6041, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.1315813064575195, "rewards/margins": 0.6878864169120789, "rewards/rejected": 3.443695068359375, "step": 30020 }, { "epoch": 1.3942151446213844, "grad_norm": 7.345300674438477, "learning_rate": 2.677623535602086e-07, "logits/chosen": -19.6595458984375, "logits/rejected": -18.962387084960938, "logps/chosen": -568.4939575195312, "logps/rejected": -492.592529296875, "loss": 0.3071, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.758130073547363, "rewards/margins": 1.6124670505523682, "rewards/rejected": 4.145663261413574, "step": 30030 }, { "epoch": 1.3946794187288174, "grad_norm": 4.748716831207275, "learning_rate": 2.676849745423031e-07, "logits/chosen": -19.29861831665039, "logits/rejected": -18.709901809692383, "logps/chosen": -329.2889099121094, "logps/rejected": -291.5013427734375, "loss": 0.4775, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.6716697216033936, "rewards/margins": 1.1610370874404907, "rewards/rejected": 1.5106326341629028, "step": 30040 }, { "epoch": 1.3951436928362506, 
"grad_norm": 106.10504913330078, "learning_rate": 2.676075955243976e-07, "logits/chosen": -19.694522857666016, "logits/rejected": -19.259366989135742, "logps/chosen": -511.6864318847656, "logps/rejected": -404.71502685546875, "loss": 0.506, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.1018967628479, "rewards/margins": 0.9193326830863953, "rewards/rejected": 3.1825640201568604, "step": 30050 }, { "epoch": 1.3956079669436836, "grad_norm": 96.41278076171875, "learning_rate": 2.675302165064921e-07, "logits/chosen": -18.64749526977539, "logits/rejected": -18.643911361694336, "logps/chosen": -395.483642578125, "logps/rejected": -320.7076721191406, "loss": 0.2818, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.9795806407928467, "rewards/margins": 1.9887927770614624, "rewards/rejected": 1.9907878637313843, "step": 30060 }, { "epoch": 1.3960722410511166, "grad_norm": 33.196746826171875, "learning_rate": 2.674528374885866e-07, "logits/chosen": -19.10794448852539, "logits/rejected": -18.64330291748047, "logps/chosen": -380.81341552734375, "logps/rejected": -337.8086242675781, "loss": 0.4885, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.3239524364471436, "rewards/margins": 1.041244626045227, "rewards/rejected": 2.282708168029785, "step": 30070 }, { "epoch": 1.3965365151585496, "grad_norm": 86.44453430175781, "learning_rate": 2.673754584706811e-07, "logits/chosen": -18.8625431060791, "logits/rejected": -18.420307159423828, "logps/chosen": -472.6455993652344, "logps/rejected": -444.30389404296875, "loss": 0.6757, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.7815842628479004, "rewards/margins": 0.22140610218048096, "rewards/rejected": 3.56017804145813, "step": 30080 }, { "epoch": 1.3970007892659826, "grad_norm": 0.44023099541664124, "learning_rate": 2.6729807945277557e-07, "logits/chosen": -20.297170639038086, "logits/rejected": -19.233503341674805, "logps/chosen": -342.5583801269531, "logps/rejected": 
-260.94488525390625, "loss": 0.6211, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.6761629581451416, "rewards/margins": 0.9492858052253723, "rewards/rejected": 1.7268768548965454, "step": 30090 }, { "epoch": 1.3974650633734156, "grad_norm": 1.1338841915130615, "learning_rate": 2.6722070043487003e-07, "logits/chosen": -19.080997467041016, "logits/rejected": -18.555713653564453, "logps/chosen": -304.51116943359375, "logps/rejected": -214.93881225585938, "loss": 0.5361, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.890632152557373, "rewards/margins": 1.259097695350647, "rewards/rejected": 1.6315343379974365, "step": 30100 }, { "epoch": 1.3979293374808486, "grad_norm": 53.291358947753906, "learning_rate": 2.6714332141696454e-07, "logits/chosen": -19.552186965942383, "logits/rejected": -18.442678451538086, "logps/chosen": -374.0055847167969, "logps/rejected": -281.10736083984375, "loss": 0.2927, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.360391616821289, "rewards/margins": 2.2530465126037598, "rewards/rejected": 2.1073451042175293, "step": 30110 }, { "epoch": 1.3983936115882818, "grad_norm": 42.83880615234375, "learning_rate": 2.6706594239905905e-07, "logits/chosen": -19.566940307617188, "logits/rejected": -19.126983642578125, "logps/chosen": -477.6360778808594, "logps/rejected": -378.6449890136719, "loss": 1.0037, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.633014678955078, "rewards/margins": 0.3883451521396637, "rewards/rejected": 4.244669437408447, "step": 30120 }, { "epoch": 1.3988578856957148, "grad_norm": 6.727845668792725, "learning_rate": 2.6698856338115356e-07, "logits/chosen": -19.15064239501953, "logits/rejected": -18.41216468811035, "logps/chosen": -385.81121826171875, "logps/rejected": -337.64044189453125, "loss": 0.902, "rewards/accuracies": 0.5, "rewards/chosen": 3.0734434127807617, "rewards/margins": 0.21267247200012207, "rewards/rejected": 2.8607707023620605, "step": 30130 }, { "epoch": 
1.3993221598031478, "grad_norm": 69.0899429321289, "learning_rate": 2.669111843632481e-07, "logits/chosen": -19.70020866394043, "logits/rejected": -18.98810386657715, "logps/chosen": -423.541015625, "logps/rejected": -340.86029052734375, "loss": 0.4375, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.75205659866333, "rewards/margins": 1.1935145854949951, "rewards/rejected": 1.558542013168335, "step": 30140 }, { "epoch": 1.3997864339105808, "grad_norm": 160.64756774902344, "learning_rate": 2.6683380534534253e-07, "logits/chosen": -19.39102554321289, "logits/rejected": -18.282005310058594, "logps/chosen": -476.722900390625, "logps/rejected": -400.0902404785156, "loss": 0.4675, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.610347747802734, "rewards/margins": 1.750190019607544, "rewards/rejected": 2.8601577281951904, "step": 30150 }, { "epoch": 1.4002507080180138, "grad_norm": 6.86995267868042, "learning_rate": 2.6675642632743704e-07, "logits/chosen": -19.57327651977539, "logits/rejected": -18.5025691986084, "logps/chosen": -303.43804931640625, "logps/rejected": -258.42718505859375, "loss": 0.4892, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.1456124782562256, "rewards/margins": 1.1491848230361938, "rewards/rejected": 1.9964277744293213, "step": 30160 }, { "epoch": 1.400714982125447, "grad_norm": 47.31193161010742, "learning_rate": 2.6667904730953155e-07, "logits/chosen": -19.00093650817871, "logits/rejected": -18.71076011657715, "logps/chosen": -377.29461669921875, "logps/rejected": -334.2049865722656, "loss": 0.7857, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.3088650703430176, "rewards/margins": 0.22435541450977325, "rewards/rejected": 3.084510087966919, "step": 30170 }, { "epoch": 1.4011792562328798, "grad_norm": 8.272954940795898, "learning_rate": 2.66601668291626e-07, "logits/chosen": -19.71237564086914, "logits/rejected": -19.279117584228516, "logps/chosen": -448.9168395996094, "logps/rejected": 
-377.85064697265625, "loss": 0.4772, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.414761543273926, "rewards/margins": 1.4241212606430054, "rewards/rejected": 2.990640640258789, "step": 30180 }, { "epoch": 1.401643530340313, "grad_norm": 99.98583221435547, "learning_rate": 2.665242892737205e-07, "logits/chosen": -18.74155044555664, "logits/rejected": -17.944360733032227, "logps/chosen": -479.000244140625, "logps/rejected": -355.8583679199219, "loss": 0.6733, "rewards/accuracies": 0.5, "rewards/chosen": 2.9840340614318848, "rewards/margins": 0.7750186324119568, "rewards/rejected": 2.209015369415283, "step": 30190 }, { "epoch": 1.402107804447746, "grad_norm": 281.7245788574219, "learning_rate": 2.66446910255815e-07, "logits/chosen": -18.526674270629883, "logits/rejected": -18.366193771362305, "logps/chosen": -244.53616333007812, "logps/rejected": -328.1459045410156, "loss": 1.178, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.219026803970337, "rewards/margins": 0.6521240472793579, "rewards/rejected": 1.566902756690979, "step": 30200 }, { "epoch": 1.402572078555179, "grad_norm": 66.43827056884766, "learning_rate": 2.663695312379095e-07, "logits/chosen": -19.380680084228516, "logits/rejected": -18.758148193359375, "logps/chosen": -413.29779052734375, "logps/rejected": -349.7719421386719, "loss": 0.7403, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.8050265312194824, "rewards/margins": 0.6853354573249817, "rewards/rejected": 3.1196908950805664, "step": 30210 }, { "epoch": 1.403036352662612, "grad_norm": 26.40030860900879, "learning_rate": 2.66292152220004e-07, "logits/chosen": -20.261356353759766, "logits/rejected": -18.482372283935547, "logps/chosen": -381.17279052734375, "logps/rejected": -270.18572998046875, "loss": 0.4889, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.2971110343933105, "rewards/margins": 1.4406088590621948, "rewards/rejected": 1.8565025329589844, "step": 30220 }, { "epoch": 
1.403500626770045, "grad_norm": 1.593999981880188, "learning_rate": 2.662147732020985e-07, "logits/chosen": -18.135387420654297, "logits/rejected": -17.32399559020996, "logps/chosen": -289.9601745605469, "logps/rejected": -236.7003173828125, "loss": 1.5141, "rewards/accuracies": 0.5, "rewards/chosen": 2.2822139263153076, "rewards/margins": 0.08451547473669052, "rewards/rejected": 2.1976985931396484, "step": 30230 }, { "epoch": 1.4039649008774782, "grad_norm": 1.9331966638565063, "learning_rate": 2.6613739418419303e-07, "logits/chosen": -19.38807487487793, "logits/rejected": -17.973604202270508, "logps/chosen": -468.75628662109375, "logps/rejected": -346.6627197265625, "loss": 0.5747, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.908089637756348, "rewards/margins": 1.500052571296692, "rewards/rejected": 3.408036708831787, "step": 30240 }, { "epoch": 1.4044291749849112, "grad_norm": 44.44602966308594, "learning_rate": 2.660600151662875e-07, "logits/chosen": -18.649272918701172, "logits/rejected": -18.137493133544922, "logps/chosen": -405.67767333984375, "logps/rejected": -314.2052917480469, "loss": 0.9788, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.566210985183716, "rewards/margins": 0.6020091772079468, "rewards/rejected": 2.9642016887664795, "step": 30250 }, { "epoch": 1.4048934490923441, "grad_norm": 175.28500366210938, "learning_rate": 2.65982636148382e-07, "logits/chosen": -18.584627151489258, "logits/rejected": -18.66241455078125, "logps/chosen": -407.0887145996094, "logps/rejected": -387.2139587402344, "loss": 0.7904, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.5074667930603027, "rewards/margins": 0.23125973343849182, "rewards/rejected": 3.276207447052002, "step": 30260 }, { "epoch": 1.4053577231997771, "grad_norm": 131.93658447265625, "learning_rate": 2.659052571304765e-07, "logits/chosen": -18.95808982849121, "logits/rejected": -17.940628051757812, "logps/chosen": -349.72088623046875, "logps/rejected": 
-282.85101318359375, "loss": 0.6566, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.7943668365478516, "rewards/margins": 1.2886269092559814, "rewards/rejected": 2.505739688873291, "step": 30270 }, { "epoch": 1.4058219973072101, "grad_norm": 88.07275390625, "learning_rate": 2.6582787811257097e-07, "logits/chosen": -18.609895706176758, "logits/rejected": -18.849796295166016, "logps/chosen": -420.8965759277344, "logps/rejected": -444.23736572265625, "loss": 0.4992, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.008840560913086, "rewards/margins": 0.6754172444343567, "rewards/rejected": 2.333423137664795, "step": 30280 }, { "epoch": 1.4062862714146431, "grad_norm": 141.0452117919922, "learning_rate": 2.657504990946655e-07, "logits/chosen": -18.580265045166016, "logits/rejected": -18.575563430786133, "logps/chosen": -405.50079345703125, "logps/rejected": -445.9657287597656, "loss": 1.1123, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2174572944641113, "rewards/margins": -0.19698813557624817, "rewards/rejected": 3.414445400238037, "step": 30290 }, { "epoch": 1.4067505455220761, "grad_norm": 112.46511840820312, "learning_rate": 2.6567312007675994e-07, "logits/chosen": -19.427839279174805, "logits/rejected": -18.73731803894043, "logps/chosen": -415.85400390625, "logps/rejected": -335.97601318359375, "loss": 0.5213, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.759981155395508, "rewards/margins": 0.7779648900032043, "rewards/rejected": 2.9820163249969482, "step": 30300 }, { "epoch": 1.4072148196295093, "grad_norm": 79.4952163696289, "learning_rate": 2.6559574105885445e-07, "logits/chosen": -18.65435028076172, "logits/rejected": -17.57175636291504, "logps/chosen": -414.90606689453125, "logps/rejected": -272.2060241699219, "loss": 0.3052, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.0987637042999268, "rewards/margins": 1.6813089847564697, "rewards/rejected": 1.4174548387527466, "step": 30310 }, { 
"epoch": 1.4076790937369423, "grad_norm": 186.3828125, "learning_rate": 2.6551836204094896e-07, "logits/chosen": -20.04433250427246, "logits/rejected": -19.274850845336914, "logps/chosen": -391.4252014160156, "logps/rejected": -415.06707763671875, "loss": 0.627, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.350334882736206, "rewards/margins": 0.6317939758300781, "rewards/rejected": 2.718541383743286, "step": 30320 }, { "epoch": 1.4081433678443753, "grad_norm": 3.4288148880004883, "learning_rate": 2.6544098302304347e-07, "logits/chosen": -19.29068374633789, "logits/rejected": -17.39508056640625, "logps/chosen": -440.76263427734375, "logps/rejected": -327.55694580078125, "loss": 0.2207, "rewards/accuracies": 1.0, "rewards/chosen": 3.8599982261657715, "rewards/margins": 1.786492943763733, "rewards/rejected": 2.07350492477417, "step": 30330 }, { "epoch": 1.4086076419518083, "grad_norm": 132.55908203125, "learning_rate": 2.65363604005138e-07, "logits/chosen": -19.33740234375, "logits/rejected": -18.96634292602539, "logps/chosen": -345.3046569824219, "logps/rejected": -308.7524108886719, "loss": 0.9313, "rewards/accuracies": 0.5, "rewards/chosen": 2.927765369415283, "rewards/margins": 0.23094868659973145, "rewards/rejected": 2.6968164443969727, "step": 30340 }, { "epoch": 1.4090719160592413, "grad_norm": 155.70953369140625, "learning_rate": 2.6528622498723244e-07, "logits/chosen": -19.668804168701172, "logits/rejected": -17.45810317993164, "logps/chosen": -505.65252685546875, "logps/rejected": -244.87277221679688, "loss": 0.2223, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.868314743041992, "rewards/margins": 3.1615960597991943, "rewards/rejected": 1.7067190408706665, "step": 30350 }, { "epoch": 1.4095361901666745, "grad_norm": 0.2799389362335205, "learning_rate": 2.6520884596932695e-07, "logits/chosen": -19.14627456665039, "logits/rejected": -18.537277221679688, "logps/chosen": -510.73931884765625, "logps/rejected": -420.6861877441406, 
"loss": 0.5114, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.503363132476807, "rewards/margins": 1.3475881814956665, "rewards/rejected": 3.155775308609009, "step": 30360 }, { "epoch": 1.4100004642741073, "grad_norm": 114.18955993652344, "learning_rate": 2.6513146695142146e-07, "logits/chosen": -19.900785446166992, "logits/rejected": -20.040756225585938, "logps/chosen": -383.71612548828125, "logps/rejected": -368.8619079589844, "loss": 0.8152, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.9475533962249756, "rewards/margins": -0.025597214698791504, "rewards/rejected": 2.9731507301330566, "step": 30370 }, { "epoch": 1.4104647383815405, "grad_norm": 25.796873092651367, "learning_rate": 2.650540879335159e-07, "logits/chosen": -19.217002868652344, "logits/rejected": -18.919057846069336, "logps/chosen": -355.12457275390625, "logps/rejected": -300.57232666015625, "loss": 0.5654, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.47160267829895, "rewards/margins": 0.982627272605896, "rewards/rejected": 2.4889755249023438, "step": 30380 }, { "epoch": 1.4109290124889735, "grad_norm": 34.41257858276367, "learning_rate": 2.6497670891561043e-07, "logits/chosen": -18.949752807617188, "logits/rejected": -17.404447555541992, "logps/chosen": -449.61822509765625, "logps/rejected": -278.2989501953125, "loss": 0.2634, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.9189751148223877, "rewards/margins": 2.3779823780059814, "rewards/rejected": 1.5409924983978271, "step": 30390 }, { "epoch": 1.4113932865964065, "grad_norm": 1.2368035316467285, "learning_rate": 2.648993298977049e-07, "logits/chosen": -18.864107131958008, "logits/rejected": -18.14285659790039, "logps/chosen": -467.50030517578125, "logps/rejected": -433.9990234375, "loss": 0.9787, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.752719879150391, "rewards/margins": 0.700395941734314, "rewards/rejected": 4.052323341369629, "step": 30400 }, { "epoch": 
1.4118575607038395, "grad_norm": 2.0761570930480957, "learning_rate": 2.648219508797994e-07, "logits/chosen": -18.271270751953125, "logits/rejected": -18.190916061401367, "logps/chosen": -437.75927734375, "logps/rejected": -407.41485595703125, "loss": 1.0123, "rewards/accuracies": 0.5, "rewards/chosen": 3.604588270187378, "rewards/margins": 0.8779088258743286, "rewards/rejected": 2.7266793251037598, "step": 30410 }, { "epoch": 1.4123218348112725, "grad_norm": 29.881563186645508, "learning_rate": 2.647445718618939e-07, "logits/chosen": -19.216081619262695, "logits/rejected": -18.424266815185547, "logps/chosen": -369.4698791503906, "logps/rejected": -331.0631103515625, "loss": 0.81, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.429673671722412, "rewards/margins": 0.7643305659294128, "rewards/rejected": 2.6653435230255127, "step": 30420 }, { "epoch": 1.4127861089187057, "grad_norm": 33.08353805541992, "learning_rate": 2.646671928439884e-07, "logits/chosen": -19.881023406982422, "logits/rejected": -18.752010345458984, "logps/chosen": -490.84063720703125, "logps/rejected": -387.955322265625, "loss": 0.458, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.2759575843811035, "rewards/margins": 0.95317143201828, "rewards/rejected": 3.3227858543395996, "step": 30430 }, { "epoch": 1.4132503830261387, "grad_norm": 20.02408790588379, "learning_rate": 2.6458981382608294e-07, "logits/chosen": -18.334928512573242, "logits/rejected": -16.633026123046875, "logps/chosen": -381.9961242675781, "logps/rejected": -223.57080078125, "loss": 0.2979, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.4146652221679688, "rewards/margins": 2.5241539478302, "rewards/rejected": 0.890511155128479, "step": 30440 }, { "epoch": 1.4137146571335717, "grad_norm": 108.53372955322266, "learning_rate": 2.645124348081774e-07, "logits/chosen": -20.953519821166992, "logits/rejected": -18.184375762939453, "logps/chosen": -558.1746826171875, "logps/rejected": 
-337.3676452636719, "loss": 0.3998, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.87695837020874, "rewards/margins": 2.361281394958496, "rewards/rejected": 2.5156772136688232, "step": 30450 }, { "epoch": 1.4141789312410047, "grad_norm": 80.89480590820312, "learning_rate": 2.644350557902719e-07, "logits/chosen": -18.990724563598633, "logits/rejected": -17.012802124023438, "logps/chosen": -404.83636474609375, "logps/rejected": -344.3077697753906, "loss": 0.8353, "rewards/accuracies": 0.5, "rewards/chosen": 3.6315791606903076, "rewards/margins": 1.3840677738189697, "rewards/rejected": 2.247511148452759, "step": 30460 }, { "epoch": 1.4146432053484377, "grad_norm": 20.49036979675293, "learning_rate": 2.6435767677236636e-07, "logits/chosen": -19.021900177001953, "logits/rejected": -19.292627334594727, "logps/chosen": -347.3138732910156, "logps/rejected": -318.6253356933594, "loss": 0.8849, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.8532886505126953, "rewards/margins": 0.34774237871170044, "rewards/rejected": 3.5055465698242188, "step": 30470 }, { "epoch": 1.4151074794558707, "grad_norm": 59.300323486328125, "learning_rate": 2.642802977544609e-07, "logits/chosen": -19.262813568115234, "logits/rejected": -18.804298400878906, "logps/chosen": -402.41522216796875, "logps/rejected": -393.8800964355469, "loss": 0.6281, "rewards/accuracies": 0.5, "rewards/chosen": 3.7111270427703857, "rewards/margins": 0.3085721433162689, "rewards/rejected": 3.402554988861084, "step": 30480 }, { "epoch": 1.4155717535633037, "grad_norm": 27.668304443359375, "learning_rate": 2.642029187365554e-07, "logits/chosen": -19.11724090576172, "logits/rejected": -19.094064712524414, "logps/chosen": -460.85406494140625, "logps/rejected": -351.48809814453125, "loss": 1.1206, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.5382370948791504, "rewards/margins": 0.41775649785995483, "rewards/rejected": 3.120480537414551, "step": 30490 }, { "epoch": 
1.4160360276707369, "grad_norm": 298.21234130859375, "learning_rate": 2.6412553971864985e-07, "logits/chosen": -19.349567413330078, "logits/rejected": -19.361980438232422, "logps/chosen": -394.49835205078125, "logps/rejected": -380.7083435058594, "loss": 1.3569, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.9304497241973877, "rewards/margins": -0.44704610109329224, "rewards/rejected": 3.3774960041046143, "step": 30500 }, { "epoch": 1.4165003017781699, "grad_norm": 60.44265365600586, "learning_rate": 2.6404816070074436e-07, "logits/chosen": -19.64899253845215, "logits/rejected": -18.865581512451172, "logps/chosen": -433.4408264160156, "logps/rejected": -340.9251403808594, "loss": 0.7716, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.8549041748046875, "rewards/margins": 0.8824704885482788, "rewards/rejected": 2.9724338054656982, "step": 30510 }, { "epoch": 1.4169645758856029, "grad_norm": 38.364532470703125, "learning_rate": 2.6397078168283887e-07, "logits/chosen": -19.021608352661133, "logits/rejected": -18.01274871826172, "logps/chosen": -325.3985900878906, "logps/rejected": -287.44720458984375, "loss": 0.5389, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.7645061016082764, "rewards/margins": 1.8380413055419922, "rewards/rejected": 0.926464855670929, "step": 30520 }, { "epoch": 1.4174288499930359, "grad_norm": 29.232805252075195, "learning_rate": 2.638934026649334e-07, "logits/chosen": -19.22982406616211, "logits/rejected": -18.52565574645996, "logps/chosen": -482.3434143066406, "logps/rejected": -359.23748779296875, "loss": 0.392, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.009950637817383, "rewards/margins": 1.436936616897583, "rewards/rejected": 2.573014259338379, "step": 30530 }, { "epoch": 1.4178931241004689, "grad_norm": 195.8957061767578, "learning_rate": 2.638160236470279e-07, "logits/chosen": -18.544649124145508, "logits/rejected": -18.92089080810547, "logps/chosen": -405.45556640625, 
"logps/rejected": -400.32806396484375, "loss": 1.254, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4288277626037598, "rewards/margins": -0.14344929158687592, "rewards/rejected": 3.572277069091797, "step": 30540 }, { "epoch": 1.418357398207902, "grad_norm": 27.436098098754883, "learning_rate": 2.637386446291224e-07, "logits/chosen": -20.107677459716797, "logits/rejected": -18.464452743530273, "logps/chosen": -443.1514587402344, "logps/rejected": -319.9005432128906, "loss": 0.5002, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.6745123863220215, "rewards/margins": 1.4198030233383179, "rewards/rejected": 2.254709243774414, "step": 30550 }, { "epoch": 1.4188216723153348, "grad_norm": 145.1536865234375, "learning_rate": 2.6366126561121686e-07, "logits/chosen": -18.820261001586914, "logits/rejected": -17.364402770996094, "logps/chosen": -403.87274169921875, "logps/rejected": -295.1039733886719, "loss": 0.5805, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.462038040161133, "rewards/margins": 1.0640708208084106, "rewards/rejected": 2.3979673385620117, "step": 30560 }, { "epoch": 1.419285946422768, "grad_norm": 9.924264907836914, "learning_rate": 2.635838865933113e-07, "logits/chosen": -19.67685890197754, "logits/rejected": -18.505313873291016, "logps/chosen": -331.15435791015625, "logps/rejected": -267.24713134765625, "loss": 0.8856, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.1735360622406006, "rewards/margins": 0.8031096458435059, "rewards/rejected": 2.3704261779785156, "step": 30570 }, { "epoch": 1.419750220530201, "grad_norm": 11.379990577697754, "learning_rate": 2.6350650757540583e-07, "logits/chosen": -18.728313446044922, "logits/rejected": -18.25359535217285, "logps/chosen": -548.2593383789062, "logps/rejected": -397.3778076171875, "loss": 0.6161, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.8147754669189453, "rewards/margins": 0.8924111127853394, "rewards/rejected": 
2.9223642349243164, "step": 30580 }, { "epoch": 1.420214494637634, "grad_norm": 9.88764762878418, "learning_rate": 2.6342912855750034e-07, "logits/chosen": -19.158462524414062, "logits/rejected": -18.3840389251709, "logps/chosen": -397.6355285644531, "logps/rejected": -361.787109375, "loss": 0.514, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.5251777172088623, "rewards/margins": 0.8022791147232056, "rewards/rejected": 2.722898244857788, "step": 30590 }, { "epoch": 1.420678768745067, "grad_norm": 185.6741180419922, "learning_rate": 2.633517495395948e-07, "logits/chosen": -19.81026840209961, "logits/rejected": -18.96446418762207, "logps/chosen": -445.32000732421875, "logps/rejected": -379.89813232421875, "loss": 0.6955, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.026528835296631, "rewards/margins": 1.1951303482055664, "rewards/rejected": 2.8313982486724854, "step": 30600 }, { "epoch": 1.4211430428525, "grad_norm": 39.72905731201172, "learning_rate": 2.632743705216893e-07, "logits/chosen": -19.304019927978516, "logits/rejected": -18.811309814453125, "logps/chosen": -383.11492919921875, "logps/rejected": -289.9903259277344, "loss": 0.7415, "rewards/accuracies": 0.5, "rewards/chosen": 3.6429615020751953, "rewards/margins": 0.4889472424983978, "rewards/rejected": 3.1540141105651855, "step": 30610 }, { "epoch": 1.4216073169599333, "grad_norm": 205.9364776611328, "learning_rate": 2.631969915037838e-07, "logits/chosen": -19.065860748291016, "logits/rejected": -19.0416259765625, "logps/chosen": -389.96783447265625, "logps/rejected": -393.0527038574219, "loss": 1.4823, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.187573194503784, "rewards/margins": -0.31866419315338135, "rewards/rejected": 3.506237506866455, "step": 30620 }, { "epoch": 1.4220715910673662, "grad_norm": 59.774539947509766, "learning_rate": 2.6311961248587833e-07, "logits/chosen": -18.1998291015625, "logits/rejected": -18.307979583740234, "logps/chosen": 
-351.46636962890625, "logps/rejected": -331.421875, "loss": 0.9414, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.449613571166992, "rewards/margins": 0.2287047803401947, "rewards/rejected": 2.2209088802337646, "step": 30630 }, { "epoch": 1.4225358651747992, "grad_norm": 148.24856567382812, "learning_rate": 2.6304223346797284e-07, "logits/chosen": -19.842777252197266, "logits/rejected": -19.10123062133789, "logps/chosen": -561.1685791015625, "logps/rejected": -437.346435546875, "loss": 0.5988, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.650547981262207, "rewards/margins": 0.7254846096038818, "rewards/rejected": 3.9250636100769043, "step": 30640 }, { "epoch": 1.4230001392822322, "grad_norm": 22.65253448486328, "learning_rate": 2.6296485445006736e-07, "logits/chosen": -19.468652725219727, "logits/rejected": -18.150131225585938, "logps/chosen": -500.17803955078125, "logps/rejected": -416.2352600097656, "loss": 0.5618, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.7300877571105957, "rewards/margins": 0.5908921957015991, "rewards/rejected": 3.139194965362549, "step": 30650 }, { "epoch": 1.4234644133896652, "grad_norm": 317.9973449707031, "learning_rate": 2.6288747543216176e-07, "logits/chosen": -18.495437622070312, "logits/rejected": -17.28327178955078, "logps/chosen": -335.2483825683594, "logps/rejected": -284.96246337890625, "loss": 0.9373, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.218177080154419, "rewards/margins": 0.7249306440353394, "rewards/rejected": 2.49324631690979, "step": 30660 }, { "epoch": 1.4239286874970982, "grad_norm": 54.14459228515625, "learning_rate": 2.6281009641425627e-07, "logits/chosen": -18.79935646057129, "logits/rejected": -18.55479621887207, "logps/chosen": -358.7061767578125, "logps/rejected": -323.1320495605469, "loss": 0.5932, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.892547845840454, "rewards/margins": 0.7051253318786621, "rewards/rejected": 
2.187422275543213, "step": 30670 }, { "epoch": 1.4243929616045312, "grad_norm": 1.0456680059432983, "learning_rate": 2.627327173963508e-07, "logits/chosen": -18.877498626708984, "logits/rejected": -18.00556755065918, "logps/chosen": -487.5072326660156, "logps/rejected": -364.949462890625, "loss": 0.7307, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.9098076820373535, "rewards/margins": 1.6392066478729248, "rewards/rejected": 2.2706005573272705, "step": 30680 }, { "epoch": 1.4248572357119644, "grad_norm": 95.70741271972656, "learning_rate": 2.626553383784453e-07, "logits/chosen": -19.590198516845703, "logits/rejected": -18.816944122314453, "logps/chosen": -360.5982360839844, "logps/rejected": -303.30816650390625, "loss": 0.6654, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.6508677005767822, "rewards/margins": 0.35913529992103577, "rewards/rejected": 2.2917323112487793, "step": 30690 }, { "epoch": 1.4253215098193974, "grad_norm": 40.16423034667969, "learning_rate": 2.625779593605398e-07, "logits/chosen": -18.391845703125, "logits/rejected": -17.707231521606445, "logps/chosen": -313.64422607421875, "logps/rejected": -239.41598510742188, "loss": 0.381, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.566323757171631, "rewards/margins": 1.0684291124343872, "rewards/rejected": 1.4978950023651123, "step": 30700 }, { "epoch": 1.4257857839268304, "grad_norm": 8.82968521118164, "learning_rate": 2.6250058034263426e-07, "logits/chosen": -19.704484939575195, "logits/rejected": -18.113780975341797, "logps/chosen": -363.39898681640625, "logps/rejected": -262.47491455078125, "loss": 0.2979, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.560856342315674, "rewards/margins": 1.6925541162490845, "rewards/rejected": 1.8683021068572998, "step": 30710 }, { "epoch": 1.4262500580342634, "grad_norm": 63.500972747802734, "learning_rate": 2.624232013247288e-07, "logits/chosen": -19.0040283203125, "logits/rejected": 
-17.82003402709961, "logps/chosen": -400.3749694824219, "logps/rejected": -239.9290008544922, "loss": 0.4219, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.413962364196777, "rewards/margins": 1.8145233392715454, "rewards/rejected": 2.5994391441345215, "step": 30720 }, { "epoch": 1.4267143321416964, "grad_norm": 3.0054397583007812, "learning_rate": 2.623458223068233e-07, "logits/chosen": -19.620344161987305, "logits/rejected": -19.568727493286133, "logps/chosen": -444.220947265625, "logps/rejected": -335.89691162109375, "loss": 0.6397, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.652316093444824, "rewards/margins": 1.4017996788024902, "rewards/rejected": 3.250516891479492, "step": 30730 }, { "epoch": 1.4271786062491296, "grad_norm": 180.678955078125, "learning_rate": 2.622684432889178e-07, "logits/chosen": -18.397729873657227, "logits/rejected": -17.666805267333984, "logps/chosen": -315.40960693359375, "logps/rejected": -247.10928344726562, "loss": 1.0536, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9902920722961426, "rewards/margins": 0.5300499796867371, "rewards/rejected": 2.4602420330047607, "step": 30740 }, { "epoch": 1.4276428803565624, "grad_norm": 179.66758728027344, "learning_rate": 2.621910642710123e-07, "logits/chosen": -18.694826126098633, "logits/rejected": -18.709508895874023, "logps/chosen": -348.58209228515625, "logps/rejected": -385.5227966308594, "loss": 1.168, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.68015456199646, "rewards/margins": -0.15634115040302277, "rewards/rejected": 2.836495876312256, "step": 30750 }, { "epoch": 1.4281071544639956, "grad_norm": 84.28324127197266, "learning_rate": 2.621136852531067e-07, "logits/chosen": -19.1926212310791, "logits/rejected": -19.123409271240234, "logps/chosen": -372.2451171875, "logps/rejected": -389.82635498046875, "loss": 0.6848, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0683493614196777, "rewards/margins": 
0.4733734726905823, "rewards/rejected": 2.5949759483337402, "step": 30760 }, { "epoch": 1.4285714285714286, "grad_norm": 1.0251238346099854, "learning_rate": 2.6203630623520123e-07, "logits/chosen": -19.683298110961914, "logits/rejected": -18.599002838134766, "logps/chosen": -375.3111572265625, "logps/rejected": -270.5923156738281, "loss": 0.7259, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.848719835281372, "rewards/margins": 1.465786337852478, "rewards/rejected": 2.3829333782196045, "step": 30770 }, { "epoch": 1.4290357026788616, "grad_norm": 9.234173774719238, "learning_rate": 2.6195892721729574e-07, "logits/chosen": -19.139766693115234, "logits/rejected": -18.318405151367188, "logps/chosen": -388.8416748046875, "logps/rejected": -320.988525390625, "loss": 0.7268, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.418811321258545, "rewards/margins": 1.0448955297470093, "rewards/rejected": 2.373915910720825, "step": 30780 }, { "epoch": 1.4294999767862946, "grad_norm": 70.53447723388672, "learning_rate": 2.6188154819939025e-07, "logits/chosen": -17.69647216796875, "logits/rejected": -18.217586517333984, "logps/chosen": -296.99139404296875, "logps/rejected": -346.4700927734375, "loss": 1.2077, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.8050172328948975, "rewards/margins": -0.3825584053993225, "rewards/rejected": 2.187575578689575, "step": 30790 }, { "epoch": 1.4299642508937276, "grad_norm": 23.18819808959961, "learning_rate": 2.6180416918148476e-07, "logits/chosen": -19.530258178710938, "logits/rejected": -19.505563735961914, "logps/chosen": -480.27410888671875, "logps/rejected": -385.8206481933594, "loss": 0.7323, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.498435974121094, "rewards/margins": 1.154301404953003, "rewards/rejected": 3.344134569168091, "step": 30800 }, { "epoch": 1.4304285250011608, "grad_norm": 35.83039474487305, "learning_rate": 2.617267901635792e-07, "logits/chosen": 
-20.342613220214844, "logits/rejected": -20.067707061767578, "logps/chosen": -401.99969482421875, "logps/rejected": -455.1954040527344, "loss": 0.9352, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.804847002029419, "rewards/margins": -0.0019521951908245683, "rewards/rejected": 3.8067989349365234, "step": 30810 }, { "epoch": 1.4308927991085938, "grad_norm": 65.01592254638672, "learning_rate": 2.6164941114567373e-07, "logits/chosen": -19.47114372253418, "logits/rejected": -18.53704071044922, "logps/chosen": -345.2510681152344, "logps/rejected": -324.24322509765625, "loss": 0.6051, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.568802833557129, "rewards/margins": 0.9927509427070618, "rewards/rejected": 1.5760520696640015, "step": 30820 }, { "epoch": 1.4313570732160268, "grad_norm": 67.11474609375, "learning_rate": 2.6157203212776824e-07, "logits/chosen": -19.075206756591797, "logits/rejected": -18.525712966918945, "logps/chosen": -353.1569519042969, "logps/rejected": -339.0763244628906, "loss": 0.5363, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.269638776779175, "rewards/margins": 0.7178158760070801, "rewards/rejected": 2.5518226623535156, "step": 30830 }, { "epoch": 1.4318213473234598, "grad_norm": 0.3528643250465393, "learning_rate": 2.6149465310986275e-07, "logits/chosen": -19.656269073486328, "logits/rejected": -19.031497955322266, "logps/chosen": -438.3343811035156, "logps/rejected": -375.8058166503906, "loss": 0.3648, "rewards/accuracies": 1.0, "rewards/chosen": 4.275971412658691, "rewards/margins": 1.5204193592071533, "rewards/rejected": 2.755551815032959, "step": 30840 }, { "epoch": 1.4322856214308928, "grad_norm": 57.10085678100586, "learning_rate": 2.6141727409195726e-07, "logits/chosen": -19.394054412841797, "logits/rejected": -18.179420471191406, "logps/chosen": -433.52105712890625, "logps/rejected": -306.75823974609375, "loss": 0.4543, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 
3.6778945922851562, "rewards/margins": 1.4895460605621338, "rewards/rejected": 2.1883487701416016, "step": 30850 }, { "epoch": 1.4327498955383258, "grad_norm": 95.45120239257812, "learning_rate": 2.6133989507405167e-07, "logits/chosen": -19.610671997070312, "logits/rejected": -19.12542152404785, "logps/chosen": -442.1405334472656, "logps/rejected": -441.6778869628906, "loss": 1.371, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.578566312789917, "rewards/margins": 0.20238561928272247, "rewards/rejected": 3.376180648803711, "step": 30860 }, { "epoch": 1.4332141696457588, "grad_norm": 13.441828727722168, "learning_rate": 2.612625160561462e-07, "logits/chosen": -19.048542022705078, "logits/rejected": -17.905216217041016, "logps/chosen": -429.45184326171875, "logps/rejected": -303.29925537109375, "loss": 0.2752, "rewards/accuracies": 1.0, "rewards/chosen": 4.097220420837402, "rewards/margins": 1.8645098209381104, "rewards/rejected": 2.23271107673645, "step": 30870 }, { "epoch": 1.433678443753192, "grad_norm": 119.19902038574219, "learning_rate": 2.611851370382407e-07, "logits/chosen": -18.497926712036133, "logits/rejected": -17.941505432128906, "logps/chosen": -444.03997802734375, "logps/rejected": -393.28948974609375, "loss": 0.9916, "rewards/accuracies": 0.5, "rewards/chosen": 3.7787671089172363, "rewards/margins": 0.03158414363861084, "rewards/rejected": 3.747183322906494, "step": 30880 }, { "epoch": 1.434142717860625, "grad_norm": 77.49568939208984, "learning_rate": 2.611077580203352e-07, "logits/chosen": -17.885576248168945, "logits/rejected": -18.062061309814453, "logps/chosen": -278.11712646484375, "logps/rejected": -336.8646545410156, "loss": 0.7576, "rewards/accuracies": 0.5, "rewards/chosen": 2.8219077587127686, "rewards/margins": 0.5746411681175232, "rewards/rejected": 2.2472667694091797, "step": 30890 }, { "epoch": 1.434606991968058, "grad_norm": 159.3446044921875, "learning_rate": 2.610303790024297e-07, "logits/chosen": -19.697507858276367, 
"logits/rejected": -19.23531723022461, "logps/chosen": -445.07696533203125, "logps/rejected": -385.0984191894531, "loss": 0.7608, "rewards/accuracies": 0.5, "rewards/chosen": 3.8344733715057373, "rewards/margins": 0.675520122051239, "rewards/rejected": 3.1589531898498535, "step": 30900 }, { "epoch": 1.435071266075491, "grad_norm": 130.234130859375, "learning_rate": 2.6095299998452417e-07, "logits/chosen": -19.65159034729004, "logits/rejected": -18.799463272094727, "logps/chosen": -382.41595458984375, "logps/rejected": -326.06219482421875, "loss": 0.6201, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2626941204071045, "rewards/margins": 1.0103986263275146, "rewards/rejected": 2.2522952556610107, "step": 30910 }, { "epoch": 1.435535540182924, "grad_norm": 3.683610677719116, "learning_rate": 2.608756209666187e-07, "logits/chosen": -18.277753829956055, "logits/rejected": -17.153337478637695, "logps/chosen": -393.78778076171875, "logps/rejected": -282.9831237792969, "loss": 0.5143, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.106046199798584, "rewards/margins": 1.0427361726760864, "rewards/rejected": 2.063310146331787, "step": 30920 }, { "epoch": 1.435999814290357, "grad_norm": 0.11945797502994537, "learning_rate": 2.607982419487132e-07, "logits/chosen": -19.253589630126953, "logits/rejected": -18.17612648010254, "logps/chosen": -461.03607177734375, "logps/rejected": -361.99188232421875, "loss": 0.3377, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.83530855178833, "rewards/margins": 2.4205498695373535, "rewards/rejected": 2.4147589206695557, "step": 30930 }, { "epoch": 1.43646408839779, "grad_norm": 0.1347111463546753, "learning_rate": 2.607208629308077e-07, "logits/chosen": -19.048118591308594, "logits/rejected": -18.695358276367188, "logps/chosen": -391.1741638183594, "logps/rejected": -291.8661804199219, "loss": 0.7395, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.505549192428589, "rewards/margins": 
1.3530701398849487, "rewards/rejected": 2.1524786949157715, "step": 30940 }, { "epoch": 1.4369283625052232, "grad_norm": 0.3420739471912384, "learning_rate": 2.6064348391290217e-07, "logits/chosen": -18.680320739746094, "logits/rejected": -18.034069061279297, "logps/chosen": -360.58013916015625, "logps/rejected": -342.5704650878906, "loss": 0.8679, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.8872931003570557, "rewards/margins": 0.6542909145355225, "rewards/rejected": 2.233002185821533, "step": 30950 }, { "epoch": 1.4373926366126561, "grad_norm": 43.1353874206543, "learning_rate": 2.605661048949966e-07, "logits/chosen": -19.023794174194336, "logits/rejected": -19.391633987426758, "logps/chosen": -289.663818359375, "logps/rejected": -291.96832275390625, "loss": 0.7385, "rewards/accuracies": 0.5, "rewards/chosen": 2.3688905239105225, "rewards/margins": 0.05592390149831772, "rewards/rejected": 2.312966823577881, "step": 30960 }, { "epoch": 1.4378569107200891, "grad_norm": 15.808199882507324, "learning_rate": 2.6048872587709113e-07, "logits/chosen": -19.16956901550293, "logits/rejected": -18.420106887817383, "logps/chosen": -403.79345703125, "logps/rejected": -334.8526916503906, "loss": 0.8366, "rewards/accuracies": 0.5, "rewards/chosen": 2.605541706085205, "rewards/margins": 0.19576874375343323, "rewards/rejected": 2.4097728729248047, "step": 30970 }, { "epoch": 1.4383211848275221, "grad_norm": 52.922035217285156, "learning_rate": 2.6041134685918565e-07, "logits/chosen": -19.520465850830078, "logits/rejected": -18.726531982421875, "logps/chosen": -377.255859375, "logps/rejected": -298.00653076171875, "loss": 0.4607, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.075141668319702, "rewards/margins": 1.2775685787200928, "rewards/rejected": 1.7975728511810303, "step": 30980 }, { "epoch": 1.4387854589349551, "grad_norm": 59.82831573486328, "learning_rate": 2.6033396784128016e-07, "logits/chosen": -18.697999954223633, "logits/rejected": 
-18.140663146972656, "logps/chosen": -338.50921630859375, "logps/rejected": -253.666015625, "loss": 0.4825, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.606387138366699, "rewards/margins": 1.2543222904205322, "rewards/rejected": 3.352064847946167, "step": 30990 }, { "epoch": 1.4392497330423883, "grad_norm": 28.014610290527344, "learning_rate": 2.6025658882337467e-07, "logits/chosen": -19.31705665588379, "logits/rejected": -18.36350440979004, "logps/chosen": -433.38360595703125, "logps/rejected": -336.72674560546875, "loss": 0.5641, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.809826612472534, "rewards/margins": 1.309099793434143, "rewards/rejected": 2.5007271766662598, "step": 31000 }, { "epoch": 1.4397140071498211, "grad_norm": 0.4483368396759033, "learning_rate": 2.6017920980546913e-07, "logits/chosen": -19.35541343688965, "logits/rejected": -17.815933227539062, "logps/chosen": -468.5668029785156, "logps/rejected": -264.4657287597656, "loss": 0.1267, "rewards/accuracies": 1.0, "rewards/chosen": 4.653237819671631, "rewards/margins": 2.9349777698516846, "rewards/rejected": 1.7182601690292358, "step": 31010 }, { "epoch": 1.4401782812572543, "grad_norm": 0.7627884745597839, "learning_rate": 2.6010183078756364e-07, "logits/chosen": -19.84041976928711, "logits/rejected": -18.375850677490234, "logps/chosen": -490.8949279785156, "logps/rejected": -379.760986328125, "loss": 0.7453, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.348836421966553, "rewards/margins": 1.7621653079986572, "rewards/rejected": 3.5866706371307373, "step": 31020 }, { "epoch": 1.4406425553646873, "grad_norm": 19.763004302978516, "learning_rate": 2.6002445176965815e-07, "logits/chosen": -19.046579360961914, "logits/rejected": -18.74204444885254, "logps/chosen": -458.9740295410156, "logps/rejected": -430.99945068359375, "loss": 0.6653, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.189671993255615, "rewards/margins": 0.835369884967804, 
"rewards/rejected": 3.3543026447296143, "step": 31030 }, { "epoch": 1.4411068294721203, "grad_norm": 113.88078308105469, "learning_rate": 2.5994707275175266e-07, "logits/chosen": -18.571048736572266, "logits/rejected": -18.460254669189453, "logps/chosen": -414.1302795410156, "logps/rejected": -414.7176818847656, "loss": 1.1958, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.393583297729492, "rewards/margins": 0.07109908759593964, "rewards/rejected": 3.3224844932556152, "step": 31040 }, { "epoch": 1.4415711035795533, "grad_norm": 3.995976448059082, "learning_rate": 2.598696937338471e-07, "logits/chosen": -18.758474349975586, "logits/rejected": -18.01527976989746, "logps/chosen": -428.00909423828125, "logps/rejected": -306.8576965332031, "loss": 0.6529, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.100827693939209, "rewards/margins": 1.526453971862793, "rewards/rejected": 2.574373483657837, "step": 31050 }, { "epoch": 1.4420353776869863, "grad_norm": 37.08604431152344, "learning_rate": 2.597923147159416e-07, "logits/chosen": -18.860355377197266, "logits/rejected": -17.702022552490234, "logps/chosen": -303.42352294921875, "logps/rejected": -228.40771484375, "loss": 0.4145, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.457282543182373, "rewards/margins": 1.183168649673462, "rewards/rejected": 1.2741138935089111, "step": 31060 }, { "epoch": 1.4424996517944195, "grad_norm": 109.60082244873047, "learning_rate": 2.597149356980361e-07, "logits/chosen": -18.154870986938477, "logits/rejected": -17.95975112915039, "logps/chosen": -434.33392333984375, "logps/rejected": -417.71295166015625, "loss": 0.9438, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.453585147857666, "rewards/margins": 0.4544767737388611, "rewards/rejected": 2.9991087913513184, "step": 31070 }, { "epoch": 1.4429639259018525, "grad_norm": 21.40761375427246, "learning_rate": 2.596375566801306e-07, "logits/chosen": -18.789775848388672, "logits/rejected": 
-18.26527214050293, "logps/chosen": -276.39825439453125, "logps/rejected": -244.25180053710938, "loss": 0.6856, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.184223175048828, "rewards/margins": 0.8836881518363953, "rewards/rejected": 2.300534725189209, "step": 31080 }, { "epoch": 1.4434282000092855, "grad_norm": 11.031790733337402, "learning_rate": 2.595601776622251e-07, "logits/chosen": -19.147815704345703, "logits/rejected": -17.08415412902832, "logps/chosen": -370.1568908691406, "logps/rejected": -187.96876525878906, "loss": 0.3592, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.041996955871582, "rewards/margins": 1.9137191772460938, "rewards/rejected": 2.128277540206909, "step": 31090 }, { "epoch": 1.4438924741167185, "grad_norm": 59.23523712158203, "learning_rate": 2.594827986443196e-07, "logits/chosen": -19.491836547851562, "logits/rejected": -18.403213500976562, "logps/chosen": -440.85498046875, "logps/rejected": -305.82598876953125, "loss": 0.3711, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.208649635314941, "rewards/margins": 1.9801514148712158, "rewards/rejected": 2.2284984588623047, "step": 31100 }, { "epoch": 1.4443567482241515, "grad_norm": 36.08755874633789, "learning_rate": 2.594054196264141e-07, "logits/chosen": -18.914234161376953, "logits/rejected": -18.02680778503418, "logps/chosen": -345.1065673828125, "logps/rejected": -321.58294677734375, "loss": 0.9682, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.714749813079834, "rewards/margins": 0.5358420610427856, "rewards/rejected": 2.178908109664917, "step": 31110 }, { "epoch": 1.4448210223315845, "grad_norm": 138.24169921875, "learning_rate": 2.593280406085086e-07, "logits/chosen": -18.997882843017578, "logits/rejected": -17.876657485961914, "logps/chosen": -441.4195251464844, "logps/rejected": -320.72052001953125, "loss": 0.4855, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.633666515350342, "rewards/margins": 
0.9644729495048523, "rewards/rejected": 2.669193744659424, "step": 31120 }, { "epoch": 1.4452852964390175, "grad_norm": 0.06578543037176132, "learning_rate": 2.592506615906031e-07, "logits/chosen": -17.978260040283203, "logits/rejected": -17.350509643554688, "logps/chosen": -453.1053161621094, "logps/rejected": -405.8443298339844, "loss": 1.0172, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.461526870727539, "rewards/margins": 0.468151330947876, "rewards/rejected": 2.993375301361084, "step": 31130 }, { "epoch": 1.4457495705464507, "grad_norm": 68.835693359375, "learning_rate": 2.5917328257269756e-07, "logits/chosen": -18.512863159179688, "logits/rejected": -17.680091857910156, "logps/chosen": -435.65606689453125, "logps/rejected": -313.45880126953125, "loss": 0.7959, "rewards/accuracies": 0.5, "rewards/chosen": 3.78132700920105, "rewards/margins": 1.1622321605682373, "rewards/rejected": 2.6190948486328125, "step": 31140 }, { "epoch": 1.4462138446538837, "grad_norm": 36.4284553527832, "learning_rate": 2.5909590355479207e-07, "logits/chosen": -19.348928451538086, "logits/rejected": -17.792804718017578, "logps/chosen": -418.71258544921875, "logps/rejected": -247.5594024658203, "loss": 0.4701, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.329360008239746, "rewards/margins": 2.234549045562744, "rewards/rejected": 2.094810724258423, "step": 31150 }, { "epoch": 1.4466781187613167, "grad_norm": 138.18809509277344, "learning_rate": 2.5901852453688653e-07, "logits/chosen": -18.57906723022461, "logits/rejected": -18.645328521728516, "logps/chosen": -363.27862548828125, "logps/rejected": -412.60345458984375, "loss": 0.7435, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.545813798904419, "rewards/margins": 0.17813199758529663, "rewards/rejected": 3.3676815032958984, "step": 31160 }, { "epoch": 1.4471423928687497, "grad_norm": 47.77169418334961, "learning_rate": 2.5894114551898104e-07, "logits/chosen": -19.54317855834961, 
"logits/rejected": -18.641178131103516, "logps/chosen": -350.29473876953125, "logps/rejected": -239.1210479736328, "loss": 0.5978, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2246041297912598, "rewards/margins": 1.1312583684921265, "rewards/rejected": 2.0933456420898438, "step": 31170 }, { "epoch": 1.4476066669761827, "grad_norm": 7.240759372711182, "learning_rate": 2.5886376650107555e-07, "logits/chosen": -18.770999908447266, "logits/rejected": -17.382722854614258, "logps/chosen": -459.32379150390625, "logps/rejected": -298.98565673828125, "loss": 0.4153, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.461385250091553, "rewards/margins": 2.1262598037719727, "rewards/rejected": 2.33512544631958, "step": 31180 }, { "epoch": 1.448070941083616, "grad_norm": 161.25982666015625, "learning_rate": 2.5878638748317007e-07, "logits/chosen": -18.764188766479492, "logits/rejected": -18.173137664794922, "logps/chosen": -468.06793212890625, "logps/rejected": -425.9214782714844, "loss": 1.0987, "rewards/accuracies": 0.5, "rewards/chosen": 3.354377031326294, "rewards/margins": 0.16320650279521942, "rewards/rejected": 3.1911706924438477, "step": 31190 }, { "epoch": 1.4485352151910487, "grad_norm": 47.50677490234375, "learning_rate": 2.587090084652646e-07, "logits/chosen": -18.346141815185547, "logits/rejected": -17.909162521362305, "logps/chosen": -363.5329284667969, "logps/rejected": -344.9455261230469, "loss": 0.687, "rewards/accuracies": 0.5, "rewards/chosen": 3.1239750385284424, "rewards/margins": 1.1100783348083496, "rewards/rejected": 2.0138964653015137, "step": 31200 }, { "epoch": 1.4489994892984819, "grad_norm": 3.0813705921173096, "learning_rate": 2.5863162944735904e-07, "logits/chosen": -18.539175033569336, "logits/rejected": -17.385211944580078, "logps/chosen": -432.99163818359375, "logps/rejected": -339.8446350097656, "loss": 0.4708, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.8550896644592285, "rewards/margins": 
1.5900238752365112, "rewards/rejected": 2.2650656700134277, "step": 31210 }, { "epoch": 1.4494637634059149, "grad_norm": 71.98981475830078, "learning_rate": 2.5855425042945355e-07, "logits/chosen": -18.798952102661133, "logits/rejected": -18.35761070251465, "logps/chosen": -393.9572448730469, "logps/rejected": -362.2317810058594, "loss": 0.5363, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.972036361694336, "rewards/margins": 0.8179343342781067, "rewards/rejected": 2.154102087020874, "step": 31220 }, { "epoch": 1.4499280375133479, "grad_norm": 19.337383270263672, "learning_rate": 2.5847687141154806e-07, "logits/chosen": -19.465991973876953, "logits/rejected": -17.228404998779297, "logps/chosen": -396.3909606933594, "logps/rejected": -205.2999267578125, "loss": 0.1888, "rewards/accuracies": 1.0, "rewards/chosen": 2.947885036468506, "rewards/margins": 2.033724069595337, "rewards/rejected": 0.9141613245010376, "step": 31230 }, { "epoch": 1.4503923116207809, "grad_norm": 2.5616745948791504, "learning_rate": 2.583994923936425e-07, "logits/chosen": -19.003915786743164, "logits/rejected": -17.675617218017578, "logps/chosen": -434.37188720703125, "logps/rejected": -264.1081237792969, "loss": 0.3202, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.9159343242645264, "rewards/margins": 1.7163509130477905, "rewards/rejected": 1.1995834112167358, "step": 31240 }, { "epoch": 1.4508565857282139, "grad_norm": 5.4454827308654785, "learning_rate": 2.5832211337573703e-07, "logits/chosen": -18.27458953857422, "logits/rejected": -16.872159957885742, "logps/chosen": -481.5343322753906, "logps/rejected": -289.0772399902344, "loss": 0.2785, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.700193405151367, "rewards/margins": 2.594238758087158, "rewards/rejected": 2.105954647064209, "step": 31250 }, { "epoch": 1.451320859835647, "grad_norm": 41.367225646972656, "learning_rate": 2.582447343578315e-07, "logits/chosen": -19.773000717163086, 
"logits/rejected": -18.79515266418457, "logps/chosen": -378.0076599121094, "logps/rejected": -275.9161376953125, "loss": 0.3794, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.872215986251831, "rewards/margins": 1.962001085281372, "rewards/rejected": 1.9102147817611694, "step": 31260 }, { "epoch": 1.45178513394308, "grad_norm": 3.662687301635742, "learning_rate": 2.58167355339926e-07, "logits/chosen": -18.655759811401367, "logits/rejected": -18.37016487121582, "logps/chosen": -369.6048889160156, "logps/rejected": -350.4985656738281, "loss": 0.6774, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2878918647766113, "rewards/margins": 0.5481294393539429, "rewards/rejected": 2.739762783050537, "step": 31270 }, { "epoch": 1.452249408050513, "grad_norm": 72.69499969482422, "learning_rate": 2.580899763220205e-07, "logits/chosen": -17.881053924560547, "logits/rejected": -17.287891387939453, "logps/chosen": -371.4111328125, "logps/rejected": -303.434326171875, "loss": 1.169, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.569829940795898, "rewards/margins": 1.5987050533294678, "rewards/rejected": 2.9711248874664307, "step": 31280 }, { "epoch": 1.452713682157946, "grad_norm": 71.63740539550781, "learning_rate": 2.58012597304115e-07, "logits/chosen": -19.02269172668457, "logits/rejected": -18.227846145629883, "logps/chosen": -357.1372985839844, "logps/rejected": -279.4320983886719, "loss": 1.0004, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.2952537536621094, "rewards/margins": 1.535431146621704, "rewards/rejected": 1.7598228454589844, "step": 31290 }, { "epoch": 1.453177956265379, "grad_norm": 191.71742248535156, "learning_rate": 2.5793521828620953e-07, "logits/chosen": -18.96165657043457, "logits/rejected": -18.214853286743164, "logps/chosen": -467.6097106933594, "logps/rejected": -368.8177795410156, "loss": 0.8662, "rewards/accuracies": 0.5, "rewards/chosen": 3.5653011798858643, "rewards/margins": 
0.4678531289100647, "rewards/rejected": 3.0974481105804443, "step": 31300 }, { "epoch": 1.453642230372812, "grad_norm": 152.60531616210938, "learning_rate": 2.57857839268304e-07, "logits/chosen": -19.129297256469727, "logits/rejected": -18.67593002319336, "logps/chosen": -513.843994140625, "logps/rejected": -451.4684143066406, "loss": 0.7863, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.8686118125915527, "rewards/margins": 0.6845349669456482, "rewards/rejected": 3.184077024459839, "step": 31310 }, { "epoch": 1.454106504480245, "grad_norm": 17.350534439086914, "learning_rate": 2.577804602503985e-07, "logits/chosen": -19.2877254486084, "logits/rejected": -18.33100128173828, "logps/chosen": -299.0141906738281, "logps/rejected": -205.6000213623047, "loss": 0.4631, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.098464012145996, "rewards/margins": 1.978563904762268, "rewards/rejected": 1.1198999881744385, "step": 31320 }, { "epoch": 1.4545707785876782, "grad_norm": 282.275390625, "learning_rate": 2.5770308123249296e-07, "logits/chosen": -18.521678924560547, "logits/rejected": -18.904525756835938, "logps/chosen": -353.5053405761719, "logps/rejected": -336.5818176269531, "loss": 1.025, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.2959561347961426, "rewards/margins": 0.14250436425209045, "rewards/rejected": 3.153451919555664, "step": 31330 }, { "epoch": 1.4550350526951112, "grad_norm": 2.404263973236084, "learning_rate": 2.5762570221458747e-07, "logits/chosen": -19.712139129638672, "logits/rejected": -17.753196716308594, "logps/chosen": -414.8792419433594, "logps/rejected": -239.56680297851562, "loss": 0.4192, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.694798946380615, "rewards/margins": 2.5979197025299072, "rewards/rejected": 2.096879720687866, "step": 31340 }, { "epoch": 1.4554993268025442, "grad_norm": 34.176475524902344, "learning_rate": 2.57548323196682e-07, "logits/chosen": -19.489852905273438, 
"logits/rejected": -18.338802337646484, "logps/chosen": -392.65576171875, "logps/rejected": -317.1016845703125, "loss": 0.4324, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.7893118858337402, "rewards/margins": 1.638766884803772, "rewards/rejected": 2.150545358657837, "step": 31350 }, { "epoch": 1.4559636009099772, "grad_norm": 216.9047393798828, "learning_rate": 2.5747094417877644e-07, "logits/chosen": -18.413990020751953, "logits/rejected": -17.80817985534668, "logps/chosen": -548.3101806640625, "logps/rejected": -422.2090759277344, "loss": 0.6069, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 5.172654151916504, "rewards/margins": 1.2575228214263916, "rewards/rejected": 3.9151320457458496, "step": 31360 }, { "epoch": 1.4564278750174102, "grad_norm": 150.16566467285156, "learning_rate": 2.5739356516087095e-07, "logits/chosen": -19.36712646484375, "logits/rejected": -18.456607818603516, "logps/chosen": -363.37042236328125, "logps/rejected": -269.5733337402344, "loss": 0.9417, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.8855814933776855, "rewards/margins": 1.755561113357544, "rewards/rejected": 2.1300208568573, "step": 31370 }, { "epoch": 1.4568921491248434, "grad_norm": 6.460335731506348, "learning_rate": 2.5731618614296546e-07, "logits/chosen": -19.11361312866211, "logits/rejected": -17.961124420166016, "logps/chosen": -322.6385192871094, "logps/rejected": -220.99716186523438, "loss": 0.4538, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.3512368202209473, "rewards/margins": 1.7069618701934814, "rewards/rejected": 0.6442749500274658, "step": 31380 }, { "epoch": 1.4573564232322762, "grad_norm": 0.42194870114326477, "learning_rate": 2.5723880712506e-07, "logits/chosen": -17.610198974609375, "logits/rejected": -17.80304527282715, "logps/chosen": -361.70233154296875, "logps/rejected": -349.1684875488281, "loss": 1.0955, "rewards/accuracies": 0.5, "rewards/chosen": 2.5651936531066895, "rewards/margins": 
0.44049328565597534, "rewards/rejected": 2.1247000694274902, "step": 31390 }, { "epoch": 1.4578206973397094, "grad_norm": 14.629716873168945, "learning_rate": 2.571614281071545e-07, "logits/chosen": -18.622119903564453, "logits/rejected": -17.449682235717773, "logps/chosen": -467.886474609375, "logps/rejected": -368.9148864746094, "loss": 0.4847, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.9151501655578613, "rewards/margins": 1.2666758298873901, "rewards/rejected": 2.6484739780426025, "step": 31400 }, { "epoch": 1.4582849714471424, "grad_norm": 11.62054443359375, "learning_rate": 2.5708404908924894e-07, "logits/chosen": -19.264358520507812, "logits/rejected": -17.908458709716797, "logps/chosen": -566.5712280273438, "logps/rejected": -445.891357421875, "loss": 0.2894, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.552523612976074, "rewards/margins": 2.3189873695373535, "rewards/rejected": 3.2335362434387207, "step": 31410 }, { "epoch": 1.4587492455545754, "grad_norm": 107.54925537109375, "learning_rate": 2.5700667007134345e-07, "logits/chosen": -18.618209838867188, "logits/rejected": -18.907142639160156, "logps/chosen": -379.9403076171875, "logps/rejected": -428.0555725097656, "loss": 1.0414, "rewards/accuracies": 0.5, "rewards/chosen": 3.4390411376953125, "rewards/margins": -0.0778607428073883, "rewards/rejected": 3.516901731491089, "step": 31420 }, { "epoch": 1.4592135196620084, "grad_norm": 3.391002893447876, "learning_rate": 2.569292910534379e-07, "logits/chosen": -20.414989471435547, "logits/rejected": -19.4952449798584, "logps/chosen": -429.6161193847656, "logps/rejected": -335.59002685546875, "loss": 0.4897, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.5505170822143555, "rewards/margins": 1.0801355838775635, "rewards/rejected": 3.470381498336792, "step": 31430 }, { "epoch": 1.4596777937694414, "grad_norm": 42.0474853515625, "learning_rate": 2.568519120355324e-07, "logits/chosen": -19.76817512512207, 
"logits/rejected": -18.985965728759766, "logps/chosen": -374.4337463378906, "logps/rejected": -313.9032897949219, "loss": 0.4691, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.390472888946533, "rewards/margins": 0.9903701543807983, "rewards/rejected": 2.4001028537750244, "step": 31440 }, { "epoch": 1.4601420678768746, "grad_norm": 45.513999938964844, "learning_rate": 2.5677453301762694e-07, "logits/chosen": -19.326717376708984, "logits/rejected": -18.99028968811035, "logps/chosen": -370.154296875, "logps/rejected": -327.9981689453125, "loss": 0.6745, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.3786377906799316, "rewards/margins": 0.5669097900390625, "rewards/rejected": 2.811728000640869, "step": 31450 }, { "epoch": 1.4606063419843076, "grad_norm": 17.897567749023438, "learning_rate": 2.566971539997214e-07, "logits/chosen": -18.693927764892578, "logits/rejected": -17.955642700195312, "logps/chosen": -337.90576171875, "logps/rejected": -263.4726867675781, "loss": 0.5622, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.397946834564209, "rewards/margins": 1.4573453664779663, "rewards/rejected": 0.9406012296676636, "step": 31460 }, { "epoch": 1.4610706160917406, "grad_norm": 166.7979278564453, "learning_rate": 2.566197749818159e-07, "logits/chosen": -18.907209396362305, "logits/rejected": -17.359333038330078, "logps/chosen": -546.9112548828125, "logps/rejected": -349.38494873046875, "loss": 0.5516, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.144596099853516, "rewards/margins": 2.0010955333709717, "rewards/rejected": 3.143500804901123, "step": 31470 }, { "epoch": 1.4615348901991736, "grad_norm": 110.61722564697266, "learning_rate": 2.565423959639104e-07, "logits/chosen": -19.289081573486328, "logits/rejected": -18.232845306396484, "logps/chosen": -344.8056335449219, "logps/rejected": -274.481201171875, "loss": 0.4242, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0444040298461914, 
"rewards/margins": 1.0899699926376343, "rewards/rejected": 1.9544341564178467, "step": 31480 }, { "epoch": 1.4619991643066066, "grad_norm": 224.5916290283203, "learning_rate": 2.5646501694600493e-07, "logits/chosen": -18.508464813232422, "logits/rejected": -18.41983413696289, "logps/chosen": -306.20355224609375, "logps/rejected": -268.3169250488281, "loss": 1.2078, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.8129808902740479, "rewards/margins": -0.41672641038894653, "rewards/rejected": 2.2297072410583496, "step": 31490 }, { "epoch": 1.4624634384140396, "grad_norm": 32.68265151977539, "learning_rate": 2.5638763792809944e-07, "logits/chosen": -19.250394821166992, "logits/rejected": -19.251846313476562, "logps/chosen": -400.5912170410156, "logps/rejected": -396.784912109375, "loss": 0.8937, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4791998863220215, "rewards/margins": 0.3387686610221863, "rewards/rejected": 3.1404314041137695, "step": 31500 }, { "epoch": 1.4629277125214726, "grad_norm": 181.61680603027344, "learning_rate": 2.563102589101939e-07, "logits/chosen": -18.489261627197266, "logits/rejected": -18.13233757019043, "logps/chosen": -381.26507568359375, "logps/rejected": -303.62847900390625, "loss": 0.6024, "rewards/accuracies": 0.5, "rewards/chosen": 3.827873945236206, "rewards/margins": 1.1635692119598389, "rewards/rejected": 2.664304733276367, "step": 31510 }, { "epoch": 1.4633919866289058, "grad_norm": 6.448492527008057, "learning_rate": 2.562328798922884e-07, "logits/chosen": -18.219863891601562, "logits/rejected": -18.061145782470703, "logps/chosen": -367.7198181152344, "logps/rejected": -348.8224182128906, "loss": 0.9464, "rewards/accuracies": 0.5, "rewards/chosen": 2.6871390342712402, "rewards/margins": 0.5303077697753906, "rewards/rejected": 2.1568310260772705, "step": 31520 }, { "epoch": 1.4638562607363388, "grad_norm": 156.3499298095703, "learning_rate": 2.5615550087438287e-07, "logits/chosen": -18.811420440673828, 
"logits/rejected": -17.38831901550293, "logps/chosen": -516.9284057617188, "logps/rejected": -378.18212890625, "loss": 0.7767, "rewards/accuracies": 0.5, "rewards/chosen": 3.8712680339813232, "rewards/margins": 0.3358544707298279, "rewards/rejected": 3.5354137420654297, "step": 31530 }, { "epoch": 1.4643205348437718, "grad_norm": 2.3175244331359863, "learning_rate": 2.560781218564774e-07, "logits/chosen": -18.622730255126953, "logits/rejected": -16.665576934814453, "logps/chosen": -435.65771484375, "logps/rejected": -222.20068359375, "loss": 0.2988, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.6201159954071045, "rewards/margins": 2.1615023612976074, "rewards/rejected": 1.4586137533187866, "step": 31540 }, { "epoch": 1.4647848089512048, "grad_norm": 0.5727382898330688, "learning_rate": 2.560007428385719e-07, "logits/chosen": -18.831954956054688, "logits/rejected": -18.203445434570312, "logps/chosen": -329.40435791015625, "logps/rejected": -287.32232666015625, "loss": 0.6246, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.061607599258423, "rewards/margins": 0.8777691125869751, "rewards/rejected": 2.183838367462158, "step": 31550 }, { "epoch": 1.4652490830586378, "grad_norm": 140.98435974121094, "learning_rate": 2.5592336382066635e-07, "logits/chosen": -19.085607528686523, "logits/rejected": -18.550342559814453, "logps/chosen": -473.00164794921875, "logps/rejected": -459.2926330566406, "loss": 0.6445, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.334544658660889, "rewards/margins": 0.7102382183074951, "rewards/rejected": 3.6243064403533936, "step": 31560 }, { "epoch": 1.465713357166071, "grad_norm": 93.60553741455078, "learning_rate": 2.5584598480276086e-07, "logits/chosen": -18.52680015563965, "logits/rejected": -17.813777923583984, "logps/chosen": -373.0094299316406, "logps/rejected": -228.1014862060547, "loss": 0.5144, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.2017662525177, "rewards/margins": 
1.873110055923462, "rewards/rejected": 1.3286559581756592, "step": 31570 }, { "epoch": 1.4661776312735038, "grad_norm": 242.67279052734375, "learning_rate": 2.5576860578485537e-07, "logits/chosen": -18.688457489013672, "logits/rejected": -18.12215232849121, "logps/chosen": -351.67755126953125, "logps/rejected": -341.82940673828125, "loss": 0.7323, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.0070106983184814, "rewards/margins": 0.8288556933403015, "rewards/rejected": 2.178154706954956, "step": 31580 }, { "epoch": 1.466641905380937, "grad_norm": 228.03353881835938, "learning_rate": 2.556912267669499e-07, "logits/chosen": -19.161836624145508, "logits/rejected": -18.759342193603516, "logps/chosen": -265.3326416015625, "logps/rejected": -311.7899475097656, "loss": 0.8085, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.296794891357422, "rewards/margins": 0.9909752607345581, "rewards/rejected": 2.3058197498321533, "step": 31590 }, { "epoch": 1.46710617948837, "grad_norm": 21.678503036499023, "learning_rate": 2.556138477490444e-07, "logits/chosen": -18.814899444580078, "logits/rejected": -18.13767433166504, "logps/chosen": -401.8018798828125, "logps/rejected": -321.098388671875, "loss": 0.6327, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.3191871643066406, "rewards/margins": 0.7674524188041687, "rewards/rejected": 2.551734685897827, "step": 31600 }, { "epoch": 1.467570453595803, "grad_norm": 143.20523071289062, "learning_rate": 2.5553646873113885e-07, "logits/chosen": -18.875381469726562, "logits/rejected": -17.820646286010742, "logps/chosen": -350.1214599609375, "logps/rejected": -277.4135437011719, "loss": 0.6672, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.965757369995117, "rewards/margins": 0.5043075680732727, "rewards/rejected": 2.4614498615264893, "step": 31610 }, { "epoch": 1.468034727703236, "grad_norm": 12.06017780303955, "learning_rate": 2.554590897132333e-07, "logits/chosen": -19.337034225463867, 
"logits/rejected": -17.958759307861328, "logps/chosen": -502.1405334472656, "logps/rejected": -366.5383605957031, "loss": 0.7003, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.529952526092529, "rewards/margins": 1.7275199890136719, "rewards/rejected": 2.8024325370788574, "step": 31620 }, { "epoch": 1.468499001810669, "grad_norm": 148.55345153808594, "learning_rate": 2.553817106953278e-07, "logits/chosen": -19.41935157775879, "logits/rejected": -18.279590606689453, "logps/chosen": -316.52239990234375, "logps/rejected": -189.2279815673828, "loss": 0.3947, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.713273525238037, "rewards/margins": 1.4925637245178223, "rewards/rejected": 1.2207096815109253, "step": 31630 }, { "epoch": 1.4689632759181022, "grad_norm": 0.9151383638381958, "learning_rate": 2.5530433167742233e-07, "logits/chosen": -18.297048568725586, "logits/rejected": -17.69740867614746, "logps/chosen": -406.496337890625, "logps/rejected": -346.64495849609375, "loss": 0.5166, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.9803123474121094, "rewards/margins": 1.4268521070480347, "rewards/rejected": 2.553460121154785, "step": 31640 }, { "epoch": 1.4694275500255352, "grad_norm": 8.116951942443848, "learning_rate": 2.5522695265951684e-07, "logits/chosen": -19.04601287841797, "logits/rejected": -17.957183837890625, "logps/chosen": -442.2328186035156, "logps/rejected": -334.1067810058594, "loss": 0.9468, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.935239315032959, "rewards/margins": 1.5394216775894165, "rewards/rejected": 2.395817995071411, "step": 31650 }, { "epoch": 1.4698918241329681, "grad_norm": 28.924161911010742, "learning_rate": 2.551495736416113e-07, "logits/chosen": -20.323657989501953, "logits/rejected": -19.426761627197266, "logps/chosen": -334.1120910644531, "logps/rejected": -331.5091552734375, "loss": 0.9027, "rewards/accuracies": 0.5, "rewards/chosen": 3.6919028759002686, "rewards/margins": 
-0.041504956781864166, "rewards/rejected": 3.733407974243164, "step": 31660 }, { "epoch": 1.4703560982404011, "grad_norm": 175.73074340820312, "learning_rate": 2.550721946237058e-07, "logits/chosen": -19.080556869506836, "logits/rejected": -18.412921905517578, "logps/chosen": -457.82196044921875, "logps/rejected": -352.6781311035156, "loss": 0.7818, "rewards/accuracies": 0.5, "rewards/chosen": 4.148449897766113, "rewards/margins": 0.9400480389595032, "rewards/rejected": 3.208402156829834, "step": 31670 }, { "epoch": 1.4708203723478341, "grad_norm": 211.53831481933594, "learning_rate": 2.549948156058003e-07, "logits/chosen": -18.748310089111328, "logits/rejected": -18.41012191772461, "logps/chosen": -433.55169677734375, "logps/rejected": -402.2461853027344, "loss": 0.7649, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.716034412384033, "rewards/margins": 0.9169772863388062, "rewards/rejected": 2.7990572452545166, "step": 31680 }, { "epoch": 1.4712846464552671, "grad_norm": 42.63877868652344, "learning_rate": 2.5491743658789484e-07, "logits/chosen": -18.260623931884766, "logits/rejected": -17.653377532958984, "logps/chosen": -360.04400634765625, "logps/rejected": -223.695068359375, "loss": 0.3582, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.232470750808716, "rewards/margins": 1.8384946584701538, "rewards/rejected": 1.3939764499664307, "step": 31690 }, { "epoch": 1.4717489205627001, "grad_norm": 38.74581527709961, "learning_rate": 2.5484005756998935e-07, "logits/chosen": -19.228633880615234, "logits/rejected": -18.164852142333984, "logps/chosen": -520.9751586914062, "logps/rejected": -325.79241943359375, "loss": 0.4508, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.8563473224639893, "rewards/margins": 1.7617686986923218, "rewards/rejected": 2.094578742980957, "step": 31700 }, { "epoch": 1.4722131946701333, "grad_norm": 152.40293884277344, "learning_rate": 2.547626785520838e-07, "logits/chosen": -18.876060485839844, 
"logits/rejected": -18.678119659423828, "logps/chosen": -270.3716125488281, "logps/rejected": -235.53677368164062, "loss": 0.8871, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.2435288429260254, "rewards/margins": 0.40390974283218384, "rewards/rejected": 1.8396189212799072, "step": 31710 }, { "epoch": 1.4726774687775663, "grad_norm": 54.15504455566406, "learning_rate": 2.5468529953417826e-07, "logits/chosen": -18.50864028930664, "logits/rejected": -17.998008728027344, "logps/chosen": -362.1022644042969, "logps/rejected": -293.9465637207031, "loss": 0.4401, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.003469467163086, "rewards/margins": 1.727240800857544, "rewards/rejected": 2.276228904724121, "step": 31720 }, { "epoch": 1.4731417428849993, "grad_norm": 149.95703125, "learning_rate": 2.546079205162728e-07, "logits/chosen": -18.87570571899414, "logits/rejected": -18.05465316772461, "logps/chosen": -485.4671936035156, "logps/rejected": -408.1941223144531, "loss": 0.6941, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.6556763648986816, "rewards/margins": 1.191432237625122, "rewards/rejected": 2.4642436504364014, "step": 31730 }, { "epoch": 1.4736060169924323, "grad_norm": 138.684814453125, "learning_rate": 2.545305414983673e-07, "logits/chosen": -18.568334579467773, "logits/rejected": -17.77103042602539, "logps/chosen": -444.68109130859375, "logps/rejected": -338.7110900878906, "loss": 0.5556, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.768531322479248, "rewards/margins": 1.3317241668701172, "rewards/rejected": 2.436807155609131, "step": 31740 }, { "epoch": 1.4740702910998653, "grad_norm": 29.393407821655273, "learning_rate": 2.544531624804618e-07, "logits/chosen": -18.28163719177246, "logits/rejected": -17.948131561279297, "logps/chosen": -432.0146484375, "logps/rejected": -351.0071716308594, "loss": 0.6469, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.4411303997039795, "rewards/margins": 
0.3984202742576599, "rewards/rejected": 2.042710065841675, "step": 31750 }, { "epoch": 1.4745345652072983, "grad_norm": 27.115087509155273, "learning_rate": 2.5437578346255626e-07, "logits/chosen": -18.453603744506836, "logits/rejected": -18.34627342224121, "logps/chosen": -325.96185302734375, "logps/rejected": -283.345947265625, "loss": 0.3657, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.4555020332336426, "rewards/margins": 1.2157331705093384, "rewards/rejected": 2.2397685050964355, "step": 31760 }, { "epoch": 1.4749988393147313, "grad_norm": 10.022770881652832, "learning_rate": 2.5429840444465077e-07, "logits/chosen": -19.157695770263672, "logits/rejected": -19.028696060180664, "logps/chosen": -263.2575378417969, "logps/rejected": -291.55535888671875, "loss": 0.5685, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.9933134317398071, "rewards/margins": 0.8151793479919434, "rewards/rejected": 1.1781343221664429, "step": 31770 }, { "epoch": 1.4754631134221645, "grad_norm": 77.0357437133789, "learning_rate": 2.542210254267453e-07, "logits/chosen": -19.90668296813965, "logits/rejected": -18.796396255493164, "logps/chosen": -369.28558349609375, "logps/rejected": -280.44287109375, "loss": 0.362, "rewards/accuracies": 1.0, "rewards/chosen": 3.8685030937194824, "rewards/margins": 1.2579996585845947, "rewards/rejected": 2.6105034351348877, "step": 31780 }, { "epoch": 1.4759273875295975, "grad_norm": 5.099295139312744, "learning_rate": 2.541436464088398e-07, "logits/chosen": -19.12478256225586, "logits/rejected": -18.11136245727539, "logps/chosen": -355.4524841308594, "logps/rejected": -243.0114288330078, "loss": 0.4475, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.2381794452667236, "rewards/margins": 1.0527451038360596, "rewards/rejected": 1.1854342222213745, "step": 31790 }, { "epoch": 1.4763916616370305, "grad_norm": 14.027173042297363, "learning_rate": 2.540662673909343e-07, "logits/chosen": -19.332263946533203, 
"logits/rejected": -18.623958587646484, "logps/chosen": -488.2098693847656, "logps/rejected": -363.31134033203125, "loss": 0.5161, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.9442806243896484, "rewards/margins": 1.1360723972320557, "rewards/rejected": 2.8082082271575928, "step": 31800 }, { "epoch": 1.4768559357444635, "grad_norm": 1.3811026811599731, "learning_rate": 2.539888883730287e-07, "logits/chosen": -19.038162231445312, "logits/rejected": -18.08582878112793, "logps/chosen": -437.89532470703125, "logps/rejected": -246.94613647460938, "loss": 0.3523, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4898784160614014, "rewards/margins": 1.922875165939331, "rewards/rejected": 1.5670034885406494, "step": 31810 }, { "epoch": 1.4773202098518965, "grad_norm": 43.092445373535156, "learning_rate": 2.539115093551232e-07, "logits/chosen": -19.280080795288086, "logits/rejected": -18.590097427368164, "logps/chosen": -444.6591796875, "logps/rejected": -320.58233642578125, "loss": 0.4038, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.10483980178833, "rewards/margins": 1.5015432834625244, "rewards/rejected": 2.6032962799072266, "step": 31820 }, { "epoch": 1.4777844839593297, "grad_norm": 66.47640228271484, "learning_rate": 2.5383413033721773e-07, "logits/chosen": -18.18763542175293, "logits/rejected": -17.029294967651367, "logps/chosen": -306.94122314453125, "logps/rejected": -202.74285888671875, "loss": 0.5502, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.2655301094055176, "rewards/margins": 1.1557419300079346, "rewards/rejected": 1.109787940979004, "step": 31830 }, { "epoch": 1.4782487580667625, "grad_norm": 53.53861999511719, "learning_rate": 2.5375675131931224e-07, "logits/chosen": -17.93707275390625, "logits/rejected": -18.367572784423828, "logps/chosen": -301.7256774902344, "logps/rejected": -382.4288635253906, "loss": 0.8609, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.7954540252685547, 
"rewards/margins": 0.31071338057518005, "rewards/rejected": 2.4847404956817627, "step": 31840 }, { "epoch": 1.4787130321741957, "grad_norm": 34.875797271728516, "learning_rate": 2.5367937230140675e-07, "logits/chosen": -18.50157356262207, "logits/rejected": -17.393518447875977, "logps/chosen": -280.35699462890625, "logps/rejected": -194.5028839111328, "loss": 0.5459, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0724387168884277, "rewards/margins": 1.5139003992080688, "rewards/rejected": 1.5585384368896484, "step": 31850 }, { "epoch": 1.4791773062816287, "grad_norm": 111.2894058227539, "learning_rate": 2.536019932835012e-07, "logits/chosen": -18.65127182006836, "logits/rejected": -17.683212280273438, "logps/chosen": -328.8037109375, "logps/rejected": -215.103759765625, "loss": 0.6454, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.031360387802124, "rewards/margins": 1.5256694555282593, "rewards/rejected": 1.5056908130645752, "step": 31860 }, { "epoch": 1.4796415803890617, "grad_norm": 0.07541824877262115, "learning_rate": 2.535246142655957e-07, "logits/chosen": -19.459514617919922, "logits/rejected": -18.128576278686523, "logps/chosen": -406.884033203125, "logps/rejected": -308.2783203125, "loss": 0.739, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7246012687683105, "rewards/margins": 1.3067433834075928, "rewards/rejected": 2.4178578853607178, "step": 31870 }, { "epoch": 1.4801058544964947, "grad_norm": 0.40732038021087646, "learning_rate": 2.5344723524769023e-07, "logits/chosen": -18.768762588500977, "logits/rejected": -16.685504913330078, "logps/chosen": -355.6019287109375, "logps/rejected": -142.50009155273438, "loss": 0.3685, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.5192646980285645, "rewards/margins": 2.59871768951416, "rewards/rejected": 0.9205468893051147, "step": 31880 }, { "epoch": 1.4805701286039277, "grad_norm": 81.10489654541016, "learning_rate": 2.5336985622978474e-07, "logits/chosen": 
-18.97650909423828, "logits/rejected": -18.70090675354004, "logps/chosen": -400.2219543457031, "logps/rejected": -326.26678466796875, "loss": 0.4871, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.7328267097473145, "rewards/margins": 0.8164359927177429, "rewards/rejected": 1.9163906574249268, "step": 31890 }, { "epoch": 1.4810344027113609, "grad_norm": 20.92007827758789, "learning_rate": 2.5329247721187926e-07, "logits/chosen": -18.415197372436523, "logits/rejected": -16.946796417236328, "logps/chosen": -404.55670166015625, "logps/rejected": -227.72702026367188, "loss": 0.2862, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.9907584190368652, "rewards/margins": 2.13071870803833, "rewards/rejected": 0.8600398898124695, "step": 31900 }, { "epoch": 1.4814986768187939, "grad_norm": 123.1675796508789, "learning_rate": 2.5321509819397366e-07, "logits/chosen": -20.108280181884766, "logits/rejected": -19.21153450012207, "logps/chosen": -474.4490661621094, "logps/rejected": -286.3226623535156, "loss": 0.431, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.441405773162842, "rewards/margins": 1.6623684167861938, "rewards/rejected": 2.7790374755859375, "step": 31910 }, { "epoch": 1.4819629509262269, "grad_norm": 1.7240946292877197, "learning_rate": 2.5313771917606817e-07, "logits/chosen": -17.711711883544922, "logits/rejected": -17.995546340942383, "logps/chosen": -356.1665344238281, "logps/rejected": -328.4260559082031, "loss": 0.7404, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.1423163414001465, "rewards/margins": 0.8754774928092957, "rewards/rejected": 2.266839027404785, "step": 31920 }, { "epoch": 1.4824272250336599, "grad_norm": 22.525352478027344, "learning_rate": 2.530603401581627e-07, "logits/chosen": -19.01865577697754, "logits/rejected": -18.949710845947266, "logps/chosen": -372.55328369140625, "logps/rejected": -331.8396911621094, "loss": 0.5639, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 
3.5718512535095215, "rewards/margins": 0.8869678378105164, "rewards/rejected": 2.6848835945129395, "step": 31930 }, { "epoch": 1.4828914991410929, "grad_norm": 157.12660217285156, "learning_rate": 2.529829611402572e-07, "logits/chosen": -18.23537254333496, "logits/rejected": -18.09800910949707, "logps/chosen": -297.95086669921875, "logps/rejected": -295.9506530761719, "loss": 0.8552, "rewards/accuracies": 0.5, "rewards/chosen": 2.01932954788208, "rewards/margins": 0.15729530155658722, "rewards/rejected": 1.8620342016220093, "step": 31940 }, { "epoch": 1.4833557732485259, "grad_norm": 55.6037712097168, "learning_rate": 2.529055821223517e-07, "logits/chosen": -18.399763107299805, "logits/rejected": -17.88113021850586, "logps/chosen": -420.9349670410156, "logps/rejected": -334.3558044433594, "loss": 0.6227, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.682401180267334, "rewards/margins": 1.68148934841156, "rewards/rejected": 2.0009119510650635, "step": 31950 }, { "epoch": 1.4838200473559588, "grad_norm": 55.4910774230957, "learning_rate": 2.528282031044462e-07, "logits/chosen": -18.327791213989258, "logits/rejected": -18.082355499267578, "logps/chosen": -372.5360412597656, "logps/rejected": -375.4129333496094, "loss": 0.882, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.689866065979004, "rewards/margins": 0.7499272227287292, "rewards/rejected": 1.9399389028549194, "step": 31960 }, { "epoch": 1.484284321463392, "grad_norm": 21.186609268188477, "learning_rate": 2.527508240865407e-07, "logits/chosen": -18.51915168762207, "logits/rejected": -17.90151023864746, "logps/chosen": -386.42431640625, "logps/rejected": -339.52703857421875, "loss": 0.5567, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.990736722946167, "rewards/margins": 0.7364189028739929, "rewards/rejected": 2.25431752204895, "step": 31970 }, { "epoch": 1.484748595570825, "grad_norm": 1.7392455339431763, "learning_rate": 2.526734450686352e-07, "logits/chosen": 
-19.17525291442871, "logits/rejected": -18.96992301940918, "logps/chosen": -391.7151794433594, "logps/rejected": -330.8062744140625, "loss": 0.6878, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.17560338973999, "rewards/margins": 1.4101706743240356, "rewards/rejected": 2.765432834625244, "step": 31980 }, { "epoch": 1.485212869678258, "grad_norm": 93.9107666015625, "learning_rate": 2.525960660507297e-07, "logits/chosen": -18.735910415649414, "logits/rejected": -18.061237335205078, "logps/chosen": -393.78240966796875, "logps/rejected": -357.92279052734375, "loss": 0.3082, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.954145908355713, "rewards/margins": 1.4489452838897705, "rewards/rejected": 1.5052011013031006, "step": 31990 }, { "epoch": 1.485677143785691, "grad_norm": 2.2766482830047607, "learning_rate": 2.5251868703282416e-07, "logits/chosen": -19.77375030517578, "logits/rejected": -18.466228485107422, "logps/chosen": -418.7250061035156, "logps/rejected": -298.0113220214844, "loss": 0.4236, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.842674255371094, "rewards/margins": 2.456073045730591, "rewards/rejected": 2.386600971221924, "step": 32000 }, { "epoch": 1.486141417893124, "grad_norm": 127.09286499023438, "learning_rate": 2.524413080149186e-07, "logits/chosen": -20.215404510498047, "logits/rejected": -18.53533935546875, "logps/chosen": -499.96844482421875, "logps/rejected": -383.51239013671875, "loss": 0.2221, "rewards/accuracies": 1.0, "rewards/chosen": 5.054689407348633, "rewards/margins": 2.35105562210083, "rewards/rejected": 2.7036337852478027, "step": 32010 }, { "epoch": 1.4866056920005573, "grad_norm": 87.47856140136719, "learning_rate": 2.5236392899701313e-07, "logits/chosen": -18.494678497314453, "logits/rejected": -18.3044376373291, "logps/chosen": -435.505126953125, "logps/rejected": -412.577880859375, "loss": 0.7981, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.040616512298584, 
"rewards/margins": 0.1743013709783554, "rewards/rejected": 2.8663153648376465, "step": 32020 }, { "epoch": 1.48706996610799, "grad_norm": 36.388980865478516, "learning_rate": 2.5228654997910764e-07, "logits/chosen": -17.721614837646484, "logits/rejected": -17.44160270690918, "logps/chosen": -333.7076721191406, "logps/rejected": -300.74822998046875, "loss": 0.5683, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.249687671661377, "rewards/margins": 0.7672444581985474, "rewards/rejected": 1.4824432134628296, "step": 32030 }, { "epoch": 1.4875342402154232, "grad_norm": 26.26605224609375, "learning_rate": 2.5220917096120215e-07, "logits/chosen": -18.250011444091797, "logits/rejected": -17.26247787475586, "logps/chosen": -348.5329895019531, "logps/rejected": -255.5291290283203, "loss": 0.7072, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.836602210998535, "rewards/margins": 1.9000589847564697, "rewards/rejected": 2.9365432262420654, "step": 32040 }, { "epoch": 1.4879985143228562, "grad_norm": 16.77273178100586, "learning_rate": 2.5213179194329666e-07, "logits/chosen": -19.036354064941406, "logits/rejected": -18.083044052124023, "logps/chosen": -442.20465087890625, "logps/rejected": -325.31268310546875, "loss": 0.3174, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.785001277923584, "rewards/margins": 1.6879628896713257, "rewards/rejected": 2.0970380306243896, "step": 32050 }, { "epoch": 1.4884627884302892, "grad_norm": 69.882568359375, "learning_rate": 2.5205441292539117e-07, "logits/chosen": -17.89957618713379, "logits/rejected": -17.128217697143555, "logps/chosen": -351.8827819824219, "logps/rejected": -209.64334106445312, "loss": 0.4932, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.7399635314941406, "rewards/margins": 1.3046671152114868, "rewards/rejected": 1.4352962970733643, "step": 32060 }, { "epoch": 1.4889270625377222, "grad_norm": 4.974944591522217, "learning_rate": 2.519847718092762e-07, "logits/chosen": 
-19.53154945373535, "logits/rejected": -18.519695281982422, "logps/chosen": -442.983642578125, "logps/rejected": -320.86932373046875, "loss": 0.3924, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.821031093597412, "rewards/margins": 1.9477695226669312, "rewards/rejected": 1.8732616901397705, "step": 32070 }, { "epoch": 1.4893913366451552, "grad_norm": 180.15475463867188, "learning_rate": 2.519073927913707e-07, "logits/chosen": -19.28615951538086, "logits/rejected": -19.383365631103516, "logps/chosen": -351.5039367675781, "logps/rejected": -362.8927001953125, "loss": 0.8338, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.86401629447937, "rewards/margins": 0.2613028287887573, "rewards/rejected": 2.6027133464813232, "step": 32080 }, { "epoch": 1.4898556107525884, "grad_norm": 7.5995306968688965, "learning_rate": 2.5183001377346516e-07, "logits/chosen": -19.359939575195312, "logits/rejected": -18.42604637145996, "logps/chosen": -429.69097900390625, "logps/rejected": -307.76898193359375, "loss": 0.4291, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.2151689529418945, "rewards/margins": 1.319858193397522, "rewards/rejected": 2.895310640335083, "step": 32090 }, { "epoch": 1.4903198848600214, "grad_norm": 0.9282143712043762, "learning_rate": 2.517526347555597e-07, "logits/chosen": -19.952791213989258, "logits/rejected": -18.669221878051758, "logps/chosen": -391.1344299316406, "logps/rejected": -275.43682861328125, "loss": 0.18, "rewards/accuracies": 1.0, "rewards/chosen": 4.3797430992126465, "rewards/margins": 2.6980373859405518, "rewards/rejected": 1.6817058324813843, "step": 32100 }, { "epoch": 1.4907841589674544, "grad_norm": 209.5072021484375, "learning_rate": 2.516752557376542e-07, "logits/chosen": -18.113388061523438, "logits/rejected": -18.290517807006836, "logps/chosen": -297.7368469238281, "logps/rejected": -316.24505615234375, "loss": 1.1765, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.301109552383423, 
"rewards/margins": -0.23243243992328644, "rewards/rejected": 2.5335419178009033, "step": 32110 }, { "epoch": 1.4912484330748874, "grad_norm": 6.523828029632568, "learning_rate": 2.5159787671974865e-07, "logits/chosen": -18.897846221923828, "logits/rejected": -17.986522674560547, "logps/chosen": -461.0164489746094, "logps/rejected": -364.1805725097656, "loss": 0.4037, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.2549543380737305, "rewards/margins": 1.409984827041626, "rewards/rejected": 2.8449695110321045, "step": 32120 }, { "epoch": 1.4917127071823204, "grad_norm": 33.642757415771484, "learning_rate": 2.5152049770184316e-07, "logits/chosen": -18.287948608398438, "logits/rejected": -19.121442794799805, "logps/chosen": -455.62841796875, "logps/rejected": -455.8678283691406, "loss": 0.9123, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 4.229054927825928, "rewards/margins": 0.3201357126235962, "rewards/rejected": 3.9089195728302, "step": 32130 }, { "epoch": 1.4921769812897534, "grad_norm": 76.54238891601562, "learning_rate": 2.514431186839376e-07, "logits/chosen": -18.685253143310547, "logits/rejected": -19.08487319946289, "logps/chosen": -323.8215026855469, "logps/rejected": -397.7436218261719, "loss": 1.3509, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.704848051071167, "rewards/margins": -0.4568379819393158, "rewards/rejected": 3.161686420440674, "step": 32140 }, { "epoch": 1.4926412553971864, "grad_norm": 197.9324951171875, "learning_rate": 2.513657396660321e-07, "logits/chosen": -18.677387237548828, "logits/rejected": -17.724008560180664, "logps/chosen": -525.7486572265625, "logps/rejected": -326.44403076171875, "loss": 0.3725, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.734050750732422, "rewards/margins": 1.986315369606018, "rewards/rejected": 2.7477355003356934, "step": 32150 }, { "epoch": 1.4931055295046196, "grad_norm": 34.893795013427734, "learning_rate": 2.5128836064812664e-07, "logits/chosen": 
-18.60519790649414, "logits/rejected": -18.514373779296875, "logps/chosen": -333.6890563964844, "logps/rejected": -285.4182434082031, "loss": 0.6746, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.781848907470703, "rewards/margins": 0.4587852358818054, "rewards/rejected": 2.323064088821411, "step": 32160 }, { "epoch": 1.4935698036120526, "grad_norm": 100.76454162597656, "learning_rate": 2.5121098163022115e-07, "logits/chosen": -20.283681869506836, "logits/rejected": -18.99146842956543, "logps/chosen": -366.67852783203125, "logps/rejected": -282.7543029785156, "loss": 0.5113, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.348414897918701, "rewards/margins": 1.461851954460144, "rewards/rejected": 1.8865629434585571, "step": 32170 }, { "epoch": 1.4940340777194856, "grad_norm": 5.0572357177734375, "learning_rate": 2.5113360261231566e-07, "logits/chosen": -18.701364517211914, "logits/rejected": -17.72995376586914, "logps/chosen": -386.5094909667969, "logps/rejected": -232.95339965820312, "loss": 0.5168, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4487693309783936, "rewards/margins": 1.1136398315429688, "rewards/rejected": 2.335129499435425, "step": 32180 }, { "epoch": 1.4944983518269186, "grad_norm": 27.719650268554688, "learning_rate": 2.510562235944101e-07, "logits/chosen": -19.52985954284668, "logits/rejected": -19.54647445678711, "logps/chosen": -382.24481201171875, "logps/rejected": -380.98760986328125, "loss": 0.6299, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.407688856124878, "rewards/margins": 0.4979340136051178, "rewards/rejected": 2.909754991531372, "step": 32190 }, { "epoch": 1.4949626259343516, "grad_norm": 3.9238829612731934, "learning_rate": 2.5097884457650463e-07, "logits/chosen": -18.404584884643555, "logits/rejected": -17.621511459350586, "logps/chosen": -342.81402587890625, "logps/rejected": -248.65139770507812, "loss": 0.6611, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 
2.8599796295166016, "rewards/margins": 1.2781891822814941, "rewards/rejected": 1.5817906856536865, "step": 32200 }, { "epoch": 1.4954269000417848, "grad_norm": 49.43098831176758, "learning_rate": 2.5090146555859914e-07, "logits/chosen": -18.441936492919922, "logits/rejected": -17.185287475585938, "logps/chosen": -404.4212951660156, "logps/rejected": -286.91644287109375, "loss": 0.642, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.115990161895752, "rewards/margins": 2.011131525039673, "rewards/rejected": 2.104858636856079, "step": 32210 }, { "epoch": 1.4958911741492176, "grad_norm": 60.588809967041016, "learning_rate": 2.508240865406936e-07, "logits/chosen": -18.89243507385254, "logits/rejected": -17.291242599487305, "logps/chosen": -517.3050537109375, "logps/rejected": -342.181640625, "loss": 0.1515, "rewards/accuracies": 1.0, "rewards/chosen": 5.295167922973633, "rewards/margins": 3.196916103363037, "rewards/rejected": 2.0982513427734375, "step": 32220 }, { "epoch": 1.4963554482566508, "grad_norm": 66.40416717529297, "learning_rate": 2.507467075227881e-07, "logits/chosen": -18.91753387451172, "logits/rejected": -17.6981143951416, "logps/chosen": -399.218994140625, "logps/rejected": -309.17474365234375, "loss": 0.3805, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.775417327880859, "rewards/margins": 1.4271520376205444, "rewards/rejected": 3.3482654094696045, "step": 32230 }, { "epoch": 1.4968197223640838, "grad_norm": 30.427892684936523, "learning_rate": 2.5066932850488257e-07, "logits/chosen": -19.062877655029297, "logits/rejected": -19.118223190307617, "logps/chosen": -414.92041015625, "logps/rejected": -454.9967346191406, "loss": 1.3387, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.9705371856689453, "rewards/margins": -0.06509075313806534, "rewards/rejected": 3.035627841949463, "step": 32240 }, { "epoch": 1.4972839964715168, "grad_norm": 8.802712440490723, "learning_rate": 2.505919494869771e-07, "logits/chosen": 
-18.234607696533203, "logits/rejected": -18.30308723449707, "logps/chosen": -435.58807373046875, "logps/rejected": -378.6927185058594, "loss": 1.2817, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.9135985374450684, "rewards/margins": -0.07037319988012314, "rewards/rejected": 2.9839720726013184, "step": 32250 }, { "epoch": 1.4977482705789498, "grad_norm": 58.884376525878906, "learning_rate": 2.505145704690716e-07, "logits/chosen": -18.31589126586914, "logits/rejected": -17.922962188720703, "logps/chosen": -298.785400390625, "logps/rejected": -277.75128173828125, "loss": 0.8364, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.341871738433838, "rewards/margins": 0.5042173862457275, "rewards/rejected": 2.837653875350952, "step": 32260 }, { "epoch": 1.4982125446863828, "grad_norm": 0.14088943600654602, "learning_rate": 2.504371914511661e-07, "logits/chosen": -18.826343536376953, "logits/rejected": -18.01938247680664, "logps/chosen": -311.8204040527344, "logps/rejected": -244.2725830078125, "loss": 0.6159, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.7569503784179688, "rewards/margins": 1.215585708618164, "rewards/rejected": 1.5413646697998047, "step": 32270 }, { "epoch": 1.498676818793816, "grad_norm": 40.76153564453125, "learning_rate": 2.503598124332606e-07, "logits/chosen": -18.55653190612793, "logits/rejected": -17.611608505249023, "logps/chosen": -354.29571533203125, "logps/rejected": -217.33877563476562, "loss": 0.7699, "rewards/accuracies": 0.5, "rewards/chosen": 2.8456249237060547, "rewards/margins": 1.0131242275238037, "rewards/rejected": 1.8325008153915405, "step": 32280 }, { "epoch": 1.499141092901249, "grad_norm": 136.95994567871094, "learning_rate": 2.5028243341535507e-07, "logits/chosen": -19.500965118408203, "logits/rejected": -18.727123260498047, "logps/chosen": -444.92144775390625, "logps/rejected": -311.8194580078125, "loss": 0.4741, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 
4.0865678787231445, "rewards/margins": 1.196103811264038, "rewards/rejected": 2.8904638290405273, "step": 32290 }, { "epoch": 1.499605367008682, "grad_norm": 36.19911193847656, "learning_rate": 2.502050543974496e-07, "logits/chosen": -18.37515640258789, "logits/rejected": -17.677888870239258, "logps/chosen": -498.13037109375, "logps/rejected": -379.83331298828125, "loss": 0.7257, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.9291205406188965, "rewards/margins": 0.9232776761054993, "rewards/rejected": 3.005842685699463, "step": 32300 }, { "epoch": 1.500069641116115, "grad_norm": 15.831507682800293, "learning_rate": 2.5012767537954404e-07, "logits/chosen": -17.511632919311523, "logits/rejected": -17.663311004638672, "logps/chosen": -225.70193481445312, "logps/rejected": -262.42547607421875, "loss": 0.6394, "rewards/accuracies": 0.5, "rewards/chosen": 2.2028300762176514, "rewards/margins": 0.7300660610198975, "rewards/rejected": 1.4727638959884644, "step": 32310 }, { "epoch": 1.500533915223548, "grad_norm": 46.19369888305664, "learning_rate": 2.5005029636163855e-07, "logits/chosen": -19.29408836364746, "logits/rejected": -17.869617462158203, "logps/chosen": -495.520751953125, "logps/rejected": -383.91522216796875, "loss": 0.4482, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.6563382148742676, "rewards/margins": 1.4957218170166016, "rewards/rejected": 2.160616636276245, "step": 32320 }, { "epoch": 1.5009981893309812, "grad_norm": 33.40667724609375, "learning_rate": 2.4997291734373306e-07, "logits/chosen": -19.159488677978516, "logits/rejected": -18.31499481201172, "logps/chosen": -373.96636962890625, "logps/rejected": -256.97314453125, "loss": 0.3383, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.1056113243103027, "rewards/margins": 1.6428873538970947, "rewards/rejected": 1.462723970413208, "step": 32330 }, { "epoch": 1.501462463438414, "grad_norm": 36.027198791503906, "learning_rate": 2.498955383258275e-07, 
"logits/chosen": -19.474777221679688, "logits/rejected": -18.40117073059082, "logps/chosen": -435.33056640625, "logps/rejected": -332.9244079589844, "loss": 0.3505, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.805816650390625, "rewards/margins": 1.5563539266586304, "rewards/rejected": 2.249462604522705, "step": 32340 }, { "epoch": 1.5019267375458472, "grad_norm": 0.2747364342212677, "learning_rate": 2.4981815930792203e-07, "logits/chosen": -19.280014038085938, "logits/rejected": -17.768810272216797, "logps/chosen": -449.7264099121094, "logps/rejected": -288.1427307128906, "loss": 0.2671, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.446118354797363, "rewards/margins": 3.1074554920196533, "rewards/rejected": 1.3386629819869995, "step": 32350 }, { "epoch": 1.5023910116532802, "grad_norm": 57.339908599853516, "learning_rate": 2.4974078029001655e-07, "logits/chosen": -19.61747169494629, "logits/rejected": -19.16225242614746, "logps/chosen": -479.00897216796875, "logps/rejected": -388.9695739746094, "loss": 0.7002, "rewards/accuracies": 0.5, "rewards/chosen": 3.7307562828063965, "rewards/margins": 0.7174172401428223, "rewards/rejected": 3.013338804244995, "step": 32360 }, { "epoch": 1.5028552857607131, "grad_norm": 49.6462516784668, "learning_rate": 2.4966340127211106e-07, "logits/chosen": -18.298084259033203, "logits/rejected": -17.696170806884766, "logps/chosen": -429.63787841796875, "logps/rejected": -351.7286376953125, "loss": 0.7582, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.269073009490967, "rewards/margins": 1.1504971981048584, "rewards/rejected": 2.1185760498046875, "step": 32370 }, { "epoch": 1.5033195598681461, "grad_norm": 65.36762237548828, "learning_rate": 2.495860222542055e-07, "logits/chosen": -19.338577270507812, "logits/rejected": -17.794279098510742, "logps/chosen": -452.4335021972656, "logps/rejected": -294.3934631347656, "loss": 0.395, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 
3.3919005393981934, "rewards/margins": 1.4360102415084839, "rewards/rejected": 1.9558902978897095, "step": 32380 }, { "epoch": 1.5037838339755791, "grad_norm": 126.95121765136719, "learning_rate": 2.4950864323630003e-07, "logits/chosen": -18.30031967163086, "logits/rejected": -17.997161865234375, "logps/chosen": -312.0479736328125, "logps/rejected": -283.134521484375, "loss": 1.4529, "rewards/accuracies": 0.5, "rewards/chosen": 2.884711503982544, "rewards/margins": 0.6867401003837585, "rewards/rejected": 2.1979713439941406, "step": 32390 }, { "epoch": 1.5042481080830123, "grad_norm": 13.97301959991455, "learning_rate": 2.4943126421839454e-07, "logits/chosen": -19.296192169189453, "logits/rejected": -18.49575424194336, "logps/chosen": -317.2593688964844, "logps/rejected": -272.62738037109375, "loss": 0.8372, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.212444305419922, "rewards/margins": 1.1070600748062134, "rewards/rejected": 2.105384349822998, "step": 32400 }, { "epoch": 1.5047123821904451, "grad_norm": 18.140636444091797, "learning_rate": 2.4935388520048905e-07, "logits/chosen": -19.58827781677246, "logits/rejected": -17.808429718017578, "logps/chosen": -495.1185607910156, "logps/rejected": -355.30584716796875, "loss": 0.8573, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.757138729095459, "rewards/margins": 1.324876070022583, "rewards/rejected": 2.432262897491455, "step": 32410 }, { "epoch": 1.5051766562978783, "grad_norm": 21.918943405151367, "learning_rate": 2.492765061825835e-07, "logits/chosen": -20.098247528076172, "logits/rejected": -19.401634216308594, "logps/chosen": -468.8292541503906, "logps/rejected": -406.1036376953125, "loss": 0.8948, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.8502185344696045, "rewards/margins": 0.7681643962860107, "rewards/rejected": 3.0820541381835938, "step": 32420 }, { "epoch": 1.5056409304053113, "grad_norm": 3.2572028636932373, "learning_rate": 2.49199127164678e-07, 
"logits/chosen": -18.45124053955078, "logits/rejected": -17.521066665649414, "logps/chosen": -345.9441223144531, "logps/rejected": -255.9907684326172, "loss": 0.8017, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.1757686138153076, "rewards/margins": 0.8029189109802246, "rewards/rejected": 1.372849702835083, "step": 32430 }, { "epoch": 1.5061052045127443, "grad_norm": 93.67430114746094, "learning_rate": 2.491217481467725e-07, "logits/chosen": -19.90422821044922, "logits/rejected": -18.776887893676758, "logps/chosen": -416.66973876953125, "logps/rejected": -364.8879699707031, "loss": 0.4847, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0010223388671875, "rewards/margins": 0.8258332014083862, "rewards/rejected": 2.175189256668091, "step": 32440 }, { "epoch": 1.5065694786201773, "grad_norm": 26.328874588012695, "learning_rate": 2.49044369128867e-07, "logits/chosen": -18.180667877197266, "logits/rejected": -17.177349090576172, "logps/chosen": -364.9713134765625, "logps/rejected": -269.9712219238281, "loss": 0.2565, "rewards/accuracies": 1.0, "rewards/chosen": 3.6744542121887207, "rewards/margins": 1.9643503427505493, "rewards/rejected": 1.7101032733917236, "step": 32450 }, { "epoch": 1.5070337527276103, "grad_norm": 129.63613891601562, "learning_rate": 2.489669901109615e-07, "logits/chosen": -19.220359802246094, "logits/rejected": -17.065391540527344, "logps/chosen": -368.4744873046875, "logps/rejected": -250.7952117919922, "loss": 0.5557, "rewards/accuracies": 0.5, "rewards/chosen": 3.370344638824463, "rewards/margins": 1.3676623106002808, "rewards/rejected": 2.0026822090148926, "step": 32460 }, { "epoch": 1.5074980268350435, "grad_norm": 68.00651550292969, "learning_rate": 2.48889611093056e-07, "logits/chosen": -18.524904251098633, "logits/rejected": -17.917739868164062, "logps/chosen": -425.84515380859375, "logps/rejected": -407.14996337890625, "loss": 1.0151, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 
3.3444442749023438, "rewards/margins": 0.2529381513595581, "rewards/rejected": 3.0915067195892334, "step": 32470 }, { "epoch": 1.5079623009424763, "grad_norm": 1.5885932445526123, "learning_rate": 2.4881223207515047e-07, "logits/chosen": -19.331571578979492, "logits/rejected": -19.170316696166992, "logps/chosen": -471.58331298828125, "logps/rejected": -415.196533203125, "loss": 0.8295, "rewards/accuracies": 0.5, "rewards/chosen": 4.731874942779541, "rewards/margins": 1.386765718460083, "rewards/rejected": 3.3451087474823, "step": 32480 }, { "epoch": 1.5084265750499095, "grad_norm": 204.1876220703125, "learning_rate": 2.48734853057245e-07, "logits/chosen": -18.12894058227539, "logits/rejected": -18.17285919189453, "logps/chosen": -322.37042236328125, "logps/rejected": -355.09820556640625, "loss": 1.0527, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.920389175415039, "rewards/margins": -0.0797351598739624, "rewards/rejected": 3.000124216079712, "step": 32490 }, { "epoch": 1.5088908491573425, "grad_norm": 20.192861557006836, "learning_rate": 2.486574740393395e-07, "logits/chosen": -18.1879825592041, "logits/rejected": -17.31142807006836, "logps/chosen": -331.5752258300781, "logps/rejected": -276.78564453125, "loss": 0.6597, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.91603684425354, "rewards/margins": 1.0021460056304932, "rewards/rejected": 1.9138904809951782, "step": 32500 }, { "epoch": 1.5093551232647755, "grad_norm": 42.004207611083984, "learning_rate": 2.48580095021434e-07, "logits/chosen": -18.85324478149414, "logits/rejected": -18.00191879272461, "logps/chosen": -325.69976806640625, "logps/rejected": -261.0977478027344, "loss": 0.4346, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.625239610671997, "rewards/margins": 1.5894434452056885, "rewards/rejected": 1.03579580783844, "step": 32510 }, { "epoch": 1.5098193973722087, "grad_norm": 44.86335754394531, "learning_rate": 2.4850271600352846e-07, "logits/chosen": 
-18.996482849121094, "logits/rejected": -16.971616744995117, "logps/chosen": -545.8704833984375, "logps/rejected": -372.05987548828125, "loss": 0.189, "rewards/accuracies": 1.0, "rewards/chosen": 4.255642414093018, "rewards/margins": 2.9414315223693848, "rewards/rejected": 1.3142106533050537, "step": 32520 }, { "epoch": 1.5102836714796415, "grad_norm": 30.773324966430664, "learning_rate": 2.4842533698562297e-07, "logits/chosen": -19.014068603515625, "logits/rejected": -18.076709747314453, "logps/chosen": -365.38629150390625, "logps/rejected": -277.2615661621094, "loss": 0.5382, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.335080623626709, "rewards/margins": 2.3863439559936523, "rewards/rejected": 1.948736548423767, "step": 32530 }, { "epoch": 1.5107479455870747, "grad_norm": 211.66200256347656, "learning_rate": 2.483479579677175e-07, "logits/chosen": -18.33916473388672, "logits/rejected": -17.030261993408203, "logps/chosen": -425.87109375, "logps/rejected": -270.45294189453125, "loss": 0.5065, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.3452365398406982, "rewards/margins": 1.9533218145370483, "rewards/rejected": 1.3919148445129395, "step": 32540 }, { "epoch": 1.5112122196945075, "grad_norm": 53.556270599365234, "learning_rate": 2.4827057894981194e-07, "logits/chosen": -19.492097854614258, "logits/rejected": -17.935922622680664, "logps/chosen": -384.78326416015625, "logps/rejected": -352.42254638671875, "loss": 1.3088, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.7155940532684326, "rewards/margins": -0.3035593032836914, "rewards/rejected": 3.019153594970703, "step": 32550 }, { "epoch": 1.5116764938019407, "grad_norm": 190.14398193359375, "learning_rate": 2.4819319993190645e-07, "logits/chosen": -17.901233673095703, "logits/rejected": -18.44011878967285, "logps/chosen": -435.92156982421875, "logps/rejected": -447.04742431640625, "loss": 1.2768, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 
3.540294647216797, "rewards/margins": -0.07213039696216583, "rewards/rejected": 3.612424850463867, "step": 32560 }, { "epoch": 1.5121407679093737, "grad_norm": 195.69888305664062, "learning_rate": 2.4811582091400097e-07, "logits/chosen": -18.715633392333984, "logits/rejected": -18.595094680786133, "logps/chosen": -323.05462646484375, "logps/rejected": -306.00091552734375, "loss": 0.9261, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.4785211086273193, "rewards/margins": 0.3114170730113983, "rewards/rejected": 2.1671032905578613, "step": 32570 }, { "epoch": 1.5126050420168067, "grad_norm": 30.578882217407227, "learning_rate": 2.480384418960954e-07, "logits/chosen": -18.703876495361328, "logits/rejected": -18.016006469726562, "logps/chosen": -349.90716552734375, "logps/rejected": -340.3926696777344, "loss": 0.3125, "rewards/accuracies": 1.0, "rewards/chosen": 3.126244306564331, "rewards/margins": 1.292909026145935, "rewards/rejected": 1.833335280418396, "step": 32580 }, { "epoch": 1.51306931612424, "grad_norm": 54.438899993896484, "learning_rate": 2.4796106287818994e-07, "logits/chosen": -18.950117111206055, "logits/rejected": -18.068519592285156, "logps/chosen": -355.2620849609375, "logps/rejected": -339.211669921875, "loss": 1.0327, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.193413257598877, "rewards/margins": 0.171125128865242, "rewards/rejected": 3.0222878456115723, "step": 32590 }, { "epoch": 1.5135335902316727, "grad_norm": 58.8890266418457, "learning_rate": 2.4788368386028445e-07, "logits/chosen": -19.33315658569336, "logits/rejected": -18.472599029541016, "logps/chosen": -410.8153381347656, "logps/rejected": -322.43304443359375, "loss": 0.6515, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.190720319747925, "rewards/margins": 0.6716113090515137, "rewards/rejected": 2.5191092491149902, "step": 32600 }, { "epoch": 1.5139978643391059, "grad_norm": 203.09286499023438, "learning_rate": 2.4780630484237896e-07, 
"logits/chosen": -18.037778854370117, "logits/rejected": -16.864315032958984, "logps/chosen": -330.84869384765625, "logps/rejected": -221.31423950195312, "loss": 0.7748, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.134082317352295, "rewards/margins": 0.6094785928726196, "rewards/rejected": 1.5246034860610962, "step": 32610 }, { "epoch": 1.5144621384465389, "grad_norm": 78.13067626953125, "learning_rate": 2.477289258244734e-07, "logits/chosen": -18.51334571838379, "logits/rejected": -17.553043365478516, "logps/chosen": -409.75457763671875, "logps/rejected": -264.4573974609375, "loss": 0.6691, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.24579119682312, "rewards/margins": 1.1993318796157837, "rewards/rejected": 2.046459674835205, "step": 32620 }, { "epoch": 1.5149264125539719, "grad_norm": 31.004148483276367, "learning_rate": 2.4765154680656793e-07, "logits/chosen": -18.6367244720459, "logits/rejected": -17.84613037109375, "logps/chosen": -442.000732421875, "logps/rejected": -396.33636474609375, "loss": 0.5449, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.478497266769409, "rewards/margins": 1.094771385192871, "rewards/rejected": 2.3837263584136963, "step": 32630 }, { "epoch": 1.5153906866614049, "grad_norm": 33.571929931640625, "learning_rate": 2.4757416778866244e-07, "logits/chosen": -18.096851348876953, "logits/rejected": -18.416147232055664, "logps/chosen": -355.2509460449219, "logps/rejected": -355.3235168457031, "loss": 1.2119, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.817403793334961, "rewards/margins": -0.11412955820560455, "rewards/rejected": 2.931533098220825, "step": 32640 }, { "epoch": 1.5158549607688379, "grad_norm": 140.431884765625, "learning_rate": 2.474967887707569e-07, "logits/chosen": -18.68858528137207, "logits/rejected": -18.772733688354492, "logps/chosen": -338.35614013671875, "logps/rejected": -340.90545654296875, "loss": 0.7836, "rewards/accuracies": 0.5, "rewards/chosen": 
2.1771161556243896, "rewards/margins": 0.623127281665802, "rewards/rejected": 1.5539889335632324, "step": 32650 }, { "epoch": 1.516319234876271, "grad_norm": 23.72405242919922, "learning_rate": 2.474194097528514e-07, "logits/chosen": -17.74289894104004, "logits/rejected": -17.092815399169922, "logps/chosen": -465.0291442871094, "logps/rejected": -332.9259338378906, "loss": 0.8102, "rewards/accuracies": 0.5, "rewards/chosen": 3.3212692737579346, "rewards/margins": 1.3645853996276855, "rewards/rejected": 1.9566843509674072, "step": 32660 }, { "epoch": 1.5167835089837038, "grad_norm": 53.05460739135742, "learning_rate": 2.4734203073494587e-07, "logits/chosen": -18.050888061523438, "logits/rejected": -17.647104263305664, "logps/chosen": -324.16021728515625, "logps/rejected": -290.8219909667969, "loss": 0.3574, "rewards/accuracies": 1.0, "rewards/chosen": 3.0210838317871094, "rewards/margins": 1.2185577154159546, "rewards/rejected": 1.8025261163711548, "step": 32670 }, { "epoch": 1.517247783091137, "grad_norm": 154.32260131835938, "learning_rate": 2.472646517170404e-07, "logits/chosen": -18.194744110107422, "logits/rejected": -17.92924690246582, "logps/chosen": -398.85601806640625, "logps/rejected": -341.92376708984375, "loss": 1.0722, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7655136585235596, "rewards/margins": 0.5765547752380371, "rewards/rejected": 3.1889586448669434, "step": 32680 }, { "epoch": 1.51771205719857, "grad_norm": 1.7645602226257324, "learning_rate": 2.471872726991349e-07, "logits/chosen": -18.67702865600586, "logits/rejected": -18.27606773376465, "logps/chosen": -320.97503662109375, "logps/rejected": -277.641845703125, "loss": 0.8699, "rewards/accuracies": 0.5, "rewards/chosen": 2.743173122406006, "rewards/margins": 0.5811417698860168, "rewards/rejected": 2.162031650543213, "step": 32690 }, { "epoch": 1.518176331306003, "grad_norm": 13.790632247924805, "learning_rate": 2.471098936812294e-07, "logits/chosen": -18.69858169555664, 
"logits/rejected": -16.934490203857422, "logps/chosen": -384.93829345703125, "logps/rejected": -186.7236328125, "loss": 0.2318, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.6405575275421143, "rewards/margins": 2.472487449645996, "rewards/rejected": 1.1680700778961182, "step": 32700 }, { "epoch": 1.518640605413436, "grad_norm": 8.258953094482422, "learning_rate": 2.470325146633239e-07, "logits/chosen": -19.4273738861084, "logits/rejected": -19.18332290649414, "logps/chosen": -381.074951171875, "logps/rejected": -363.24542236328125, "loss": 0.5649, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.9222571849823, "rewards/margins": 0.884596049785614, "rewards/rejected": 3.037661552429199, "step": 32710 }, { "epoch": 1.519104879520869, "grad_norm": 60.616458892822266, "learning_rate": 2.4695513564541837e-07, "logits/chosen": -19.533777236938477, "logits/rejected": -18.734994888305664, "logps/chosen": -496.05712890625, "logps/rejected": -411.2355041503906, "loss": 0.3948, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.257203102111816, "rewards/margins": 1.5935020446777344, "rewards/rejected": 2.6637017726898193, "step": 32720 }, { "epoch": 1.5195691536283022, "grad_norm": 0.7021080851554871, "learning_rate": 2.468777566275129e-07, "logits/chosen": -18.531198501586914, "logits/rejected": -18.639511108398438, "logps/chosen": -391.93023681640625, "logps/rejected": -404.3780822753906, "loss": 0.7888, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.155091285705566, "rewards/margins": 0.6192084550857544, "rewards/rejected": 3.5358829498291016, "step": 32730 }, { "epoch": 1.520033427735735, "grad_norm": 92.20764923095703, "learning_rate": 2.468003776096074e-07, "logits/chosen": -19.694080352783203, "logits/rejected": -19.82837677001953, "logps/chosen": -412.89410400390625, "logps/rejected": -428.09906005859375, "loss": 1.0415, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.066877841949463, 
"rewards/margins": 0.42264342308044434, "rewards/rejected": 3.6442344188690186, "step": 32740 }, { "epoch": 1.5204977018431682, "grad_norm": 18.34037208557129, "learning_rate": 2.4672299859170185e-07, "logits/chosen": -18.217178344726562, "logits/rejected": -18.167749404907227, "logps/chosen": -393.5263671875, "logps/rejected": -323.48577880859375, "loss": 0.6946, "rewards/accuracies": 0.5, "rewards/chosen": 2.3254756927490234, "rewards/margins": 0.35764673352241516, "rewards/rejected": 1.9678289890289307, "step": 32750 }, { "epoch": 1.5209619759506012, "grad_norm": 158.1086883544922, "learning_rate": 2.4664561957379636e-07, "logits/chosen": -19.251699447631836, "logits/rejected": -18.411380767822266, "logps/chosen": -481.15966796875, "logps/rejected": -369.0967712402344, "loss": 0.4593, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.606200695037842, "rewards/margins": 1.646437644958496, "rewards/rejected": 2.9597630500793457, "step": 32760 }, { "epoch": 1.5214262500580342, "grad_norm": 27.401029586791992, "learning_rate": 2.465682405558908e-07, "logits/chosen": -18.86629295349121, "logits/rejected": -19.397724151611328, "logps/chosen": -324.11749267578125, "logps/rejected": -277.7242736816406, "loss": 0.7067, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.267192840576172, "rewards/margins": 0.4903014600276947, "rewards/rejected": 1.7768914699554443, "step": 32770 }, { "epoch": 1.5218905241654674, "grad_norm": 49.64231872558594, "learning_rate": 2.4649086153798533e-07, "logits/chosen": -19.318790435791016, "logits/rejected": -17.910429000854492, "logps/chosen": -390.0777587890625, "logps/rejected": -204.42208862304688, "loss": 0.2171, "rewards/accuracies": 1.0, "rewards/chosen": 3.2260711193084717, "rewards/margins": 2.1842551231384277, "rewards/rejected": 1.0418161153793335, "step": 32780 }, { "epoch": 1.5223547982729002, "grad_norm": 27.344097137451172, "learning_rate": 2.4641348252007984e-07, "logits/chosen": -18.728418350219727, 
"logits/rejected": -17.354902267456055, "logps/chosen": -388.35247802734375, "logps/rejected": -336.29132080078125, "loss": 0.8135, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.774862051010132, "rewards/margins": 0.27772781252861023, "rewards/rejected": 2.49713397026062, "step": 32790 }, { "epoch": 1.5228190723803334, "grad_norm": 158.68869018554688, "learning_rate": 2.4633610350217435e-07, "logits/chosen": -19.262908935546875, "logits/rejected": -18.7587890625, "logps/chosen": -404.2657775878906, "logps/rejected": -359.0144958496094, "loss": 0.7389, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9102184772491455, "rewards/margins": 0.6786823272705078, "rewards/rejected": 2.2315361499786377, "step": 32800 }, { "epoch": 1.5232833464877664, "grad_norm": 223.99476623535156, "learning_rate": 2.4625872448426887e-07, "logits/chosen": -18.398189544677734, "logits/rejected": -17.03289222717285, "logps/chosen": -420.171630859375, "logps/rejected": -324.76251220703125, "loss": 0.647, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.802380084991455, "rewards/margins": 1.1597827672958374, "rewards/rejected": 2.64259672164917, "step": 32810 }, { "epoch": 1.5237476205951994, "grad_norm": 72.77935028076172, "learning_rate": 2.461813454663633e-07, "logits/chosen": -19.10144805908203, "logits/rejected": -18.674896240234375, "logps/chosen": -408.69866943359375, "logps/rejected": -436.9776306152344, "loss": 0.7443, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.5651791095733643, "rewards/margins": 0.3356805741786957, "rewards/rejected": 3.2294986248016357, "step": 32820 }, { "epoch": 1.5242118947026324, "grad_norm": 25.973682403564453, "learning_rate": 2.4610396644845784e-07, "logits/chosen": -19.73324966430664, "logits/rejected": -18.376163482666016, "logps/chosen": -409.52508544921875, "logps/rejected": -318.98114013671875, "loss": 0.3651, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.573171138763428, 
"rewards/margins": 1.6690962314605713, "rewards/rejected": 2.9040751457214355, "step": 32830 }, { "epoch": 1.5246761688100654, "grad_norm": 68.51624298095703, "learning_rate": 2.4602658743055235e-07, "logits/chosen": -18.453266143798828, "logits/rejected": -17.27082061767578, "logps/chosen": -423.89471435546875, "logps/rejected": -284.8255615234375, "loss": 0.378, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.7665810585021973, "rewards/margins": 1.6137549877166748, "rewards/rejected": 2.1528265476226807, "step": 32840 }, { "epoch": 1.5251404429174986, "grad_norm": 1.2646474838256836, "learning_rate": 2.459492084126468e-07, "logits/chosen": -18.716327667236328, "logits/rejected": -17.924602508544922, "logps/chosen": -358.65325927734375, "logps/rejected": -351.29095458984375, "loss": 0.5956, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.02093505859375, "rewards/margins": 1.4631153345108032, "rewards/rejected": 2.5578200817108154, "step": 32850 }, { "epoch": 1.5256047170249314, "grad_norm": 208.3641357421875, "learning_rate": 2.458718293947413e-07, "logits/chosen": -18.583789825439453, "logits/rejected": -17.869359970092773, "logps/chosen": -432.04620361328125, "logps/rejected": -375.2635803222656, "loss": 0.9956, "rewards/accuracies": 0.5, "rewards/chosen": 3.9825596809387207, "rewards/margins": 0.4150083661079407, "rewards/rejected": 3.567551374435425, "step": 32860 }, { "epoch": 1.5260689911323646, "grad_norm": 25.74835968017578, "learning_rate": 2.457944503768358e-07, "logits/chosen": -18.095745086669922, "logits/rejected": -17.60272979736328, "logps/chosen": -316.50103759765625, "logps/rejected": -307.8277893066406, "loss": 0.7138, "rewards/accuracies": 0.5, "rewards/chosen": 2.9538347721099854, "rewards/margins": 0.9342229962348938, "rewards/rejected": 2.0196118354797363, "step": 32870 }, { "epoch": 1.5265332652397976, "grad_norm": 207.93492126464844, "learning_rate": 2.457170713589303e-07, "logits/chosen": -18.160505294799805, 
"logits/rejected": -18.240346908569336, "logps/chosen": -417.6201171875, "logps/rejected": -370.295166015625, "loss": 1.0571, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.3422064781188965, "rewards/margins": 0.2662871778011322, "rewards/rejected": 3.0759196281433105, "step": 32880 }, { "epoch": 1.5269975393472306, "grad_norm": 49.45735168457031, "learning_rate": 2.456396923410248e-07, "logits/chosen": -18.318307876586914, "logits/rejected": -17.537395477294922, "logps/chosen": -520.9420166015625, "logps/rejected": -383.5205993652344, "loss": 0.5127, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.8294899463653564, "rewards/margins": 1.4171632528305054, "rewards/rejected": 2.4123268127441406, "step": 32890 }, { "epoch": 1.5274618134546636, "grad_norm": 7.976652145385742, "learning_rate": 2.455623133231193e-07, "logits/chosen": -18.907567977905273, "logits/rejected": -17.98817253112793, "logps/chosen": -344.4837646484375, "logps/rejected": -319.30462646484375, "loss": 0.7473, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4571311473846436, "rewards/margins": 1.151681900024414, "rewards/rejected": 2.3054494857788086, "step": 32900 }, { "epoch": 1.5279260875620966, "grad_norm": 6.340032577514648, "learning_rate": 2.4548493430521377e-07, "logits/chosen": -19.177175521850586, "logits/rejected": -18.405271530151367, "logps/chosen": -384.761962890625, "logps/rejected": -291.62896728515625, "loss": 0.4754, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.876169204711914, "rewards/margins": 1.419865608215332, "rewards/rejected": 2.456303596496582, "step": 32910 }, { "epoch": 1.5283903616695298, "grad_norm": 52.69981384277344, "learning_rate": 2.454075552873083e-07, "logits/chosen": -18.571208953857422, "logits/rejected": -17.924718856811523, "logps/chosen": -249.8047332763672, "logps/rejected": -199.97824096679688, "loss": 0.7374, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.0945727825164795, 
"rewards/margins": 0.6883876323699951, "rewards/rejected": 1.406185269355774, "step": 32920 }, { "epoch": 1.5288546357769626, "grad_norm": 29.35011863708496, "learning_rate": 2.453301762694028e-07, "logits/chosen": -18.61538314819336, "logits/rejected": -18.134204864501953, "logps/chosen": -459.4397888183594, "logps/rejected": -341.96966552734375, "loss": 0.8435, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.7161171436309814, "rewards/margins": 1.3692853450775146, "rewards/rejected": 2.346831798553467, "step": 32930 }, { "epoch": 1.5293189098843958, "grad_norm": 3.265406847000122, "learning_rate": 2.452527972514973e-07, "logits/chosen": -19.777381896972656, "logits/rejected": -18.111379623413086, "logps/chosen": -374.0921630859375, "logps/rejected": -278.71026611328125, "loss": 0.5132, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7917819023132324, "rewards/margins": 2.0207877159118652, "rewards/rejected": 1.7709938287734985, "step": 32940 }, { "epoch": 1.5297831839918288, "grad_norm": 9.381196975708008, "learning_rate": 2.4517541823359176e-07, "logits/chosen": -18.810367584228516, "logits/rejected": -17.790164947509766, "logps/chosen": -384.622802734375, "logps/rejected": -272.41912841796875, "loss": 0.3876, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.859610080718994, "rewards/margins": 0.9389171600341797, "rewards/rejected": 1.9206926822662354, "step": 32950 }, { "epoch": 1.5302474580992618, "grad_norm": 39.70363235473633, "learning_rate": 2.4509803921568627e-07, "logits/chosen": -19.70485496520996, "logits/rejected": -18.13735008239746, "logps/chosen": -313.07354736328125, "logps/rejected": -246.8655242919922, "loss": 0.3976, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.9745407104492188, "rewards/margins": 2.2151665687561035, "rewards/rejected": 1.7593739032745361, "step": 32960 }, { "epoch": 1.530711732206695, "grad_norm": 175.82643127441406, "learning_rate": 2.4502066019778073e-07, 
"logits/chosen": -18.97940444946289, "logits/rejected": -18.66252899169922, "logps/chosen": -511.2333984375, "logps/rejected": -475.873291015625, "loss": 0.5514, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.62734317779541, "rewards/margins": 0.887368381023407, "rewards/rejected": 3.7399744987487793, "step": 32970 }, { "epoch": 1.5311760063141278, "grad_norm": 110.0479507446289, "learning_rate": 2.4494328117987524e-07, "logits/chosen": -19.21417236328125, "logits/rejected": -18.83309555053711, "logps/chosen": -469.57940673828125, "logps/rejected": -502.35125732421875, "loss": 0.4811, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.3599777221679688, "rewards/margins": 1.178423523902893, "rewards/rejected": 2.1815543174743652, "step": 32980 }, { "epoch": 1.531640280421561, "grad_norm": 0.07262936234474182, "learning_rate": 2.4486590216196975e-07, "logits/chosen": -18.628368377685547, "logits/rejected": -17.784059524536133, "logps/chosen": -480.81951904296875, "logps/rejected": -329.30499267578125, "loss": 0.6572, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.422731399536133, "rewards/margins": 0.9921697378158569, "rewards/rejected": 2.4305615425109863, "step": 32990 }, { "epoch": 1.532104554528994, "grad_norm": 121.18531036376953, "learning_rate": 2.4478852314406426e-07, "logits/chosen": -18.708324432373047, "logits/rejected": -18.729745864868164, "logps/chosen": -470.1336975097656, "logps/rejected": -518.6188354492188, "loss": 0.8717, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 4.178849697113037, "rewards/margins": -0.14319221675395966, "rewards/rejected": 4.322041988372803, "step": 33000 }, { "epoch": 1.532568828636427, "grad_norm": 27.563114166259766, "learning_rate": 2.447111441261587e-07, "logits/chosen": -18.850290298461914, "logits/rejected": -18.968130111694336, "logps/chosen": -347.2471618652344, "logps/rejected": -256.68829345703125, "loss": 0.841, "rewards/accuracies": 0.6000000238418579, 
"rewards/chosen": 2.7924747467041016, "rewards/margins": 0.0819144994020462, "rewards/rejected": 2.7105600833892822, "step": 33010 }, { "epoch": 1.53303310274386, "grad_norm": 48.97837448120117, "learning_rate": 2.4463376510825323e-07, "logits/chosen": -19.229307174682617, "logits/rejected": -19.3114013671875, "logps/chosen": -378.920654296875, "logps/rejected": -370.3248291015625, "loss": 0.7197, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.420608043670654, "rewards/margins": 0.4195200800895691, "rewards/rejected": 4.0010881423950195, "step": 33020 }, { "epoch": 1.533497376851293, "grad_norm": 22.226709365844727, "learning_rate": 2.4455638609034774e-07, "logits/chosen": -18.314189910888672, "logits/rejected": -17.62710189819336, "logps/chosen": -386.3211364746094, "logps/rejected": -255.647705078125, "loss": 0.3563, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.6496360301971436, "rewards/margins": 1.673107385635376, "rewards/rejected": 0.9765287637710571, "step": 33030 }, { "epoch": 1.5339616509587262, "grad_norm": 10.419321060180664, "learning_rate": 2.4447900707244226e-07, "logits/chosen": -19.65852928161621, "logits/rejected": -18.58304214477539, "logps/chosen": -279.14617919921875, "logps/rejected": -192.47885131835938, "loss": 0.4136, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.7038052082061768, "rewards/margins": 1.530821681022644, "rewards/rejected": 1.1729835271835327, "step": 33040 }, { "epoch": 1.534425925066159, "grad_norm": 39.681034088134766, "learning_rate": 2.444016280545367e-07, "logits/chosen": -18.291296005249023, "logits/rejected": -17.512935638427734, "logps/chosen": -526.6839599609375, "logps/rejected": -373.33721923828125, "loss": 1.0107, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.232693672180176, "rewards/margins": 1.1476925611495972, "rewards/rejected": 3.085000991821289, "step": 33050 }, { "epoch": 1.5348901991735922, "grad_norm": 119.35895538330078, "learning_rate": 
2.443242490366312e-07, "logits/chosen": -18.5801944732666, "logits/rejected": -18.285858154296875, "logps/chosen": -319.53424072265625, "logps/rejected": -244.87789916992188, "loss": 0.7905, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.340914249420166, "rewards/margins": 0.7931574583053589, "rewards/rejected": 1.5477569103240967, "step": 33060 }, { "epoch": 1.5353544732810251, "grad_norm": 101.66541290283203, "learning_rate": 2.442468700187257e-07, "logits/chosen": -19.10718536376953, "logits/rejected": -18.15057373046875, "logps/chosen": -409.4366149902344, "logps/rejected": -309.4639892578125, "loss": 0.4228, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.754821300506592, "rewards/margins": 2.2153525352478027, "rewards/rejected": 1.53946852684021, "step": 33070 }, { "epoch": 1.5358187473884581, "grad_norm": 261.25439453125, "learning_rate": 2.441694910008202e-07, "logits/chosen": -18.314884185791016, "logits/rejected": -18.565887451171875, "logps/chosen": -344.6893005371094, "logps/rejected": -353.64935302734375, "loss": 0.8802, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0684075355529785, "rewards/margins": 0.4547298550605774, "rewards/rejected": 2.613677978515625, "step": 33080 }, { "epoch": 1.5362830214958911, "grad_norm": 53.19618225097656, "learning_rate": 2.440921119829147e-07, "logits/chosen": -17.91400718688965, "logits/rejected": -17.463970184326172, "logps/chosen": -276.80438232421875, "logps/rejected": -224.1857452392578, "loss": 0.6187, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.56878399848938, "rewards/margins": 1.0609514713287354, "rewards/rejected": 1.5078327655792236, "step": 33090 }, { "epoch": 1.5367472956033241, "grad_norm": 103.1704330444336, "learning_rate": 2.4401473296500916e-07, "logits/chosen": -18.4216251373291, "logits/rejected": -18.171173095703125, "logps/chosen": -372.62908935546875, "logps/rejected": -335.35577392578125, "loss": 1.2125, "rewards/accuracies": 
0.30000001192092896, "rewards/chosen": 3.39233660697937, "rewards/margins": -0.17733235657215118, "rewards/rejected": 3.569669246673584, "step": 33100 }, { "epoch": 1.5372115697107573, "grad_norm": 72.106689453125, "learning_rate": 2.439373539471037e-07, "logits/chosen": -19.771419525146484, "logits/rejected": -19.471168518066406, "logps/chosen": -405.4424743652344, "logps/rejected": -363.911865234375, "loss": 0.6165, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5830185413360596, "rewards/margins": 1.0411773920059204, "rewards/rejected": 2.541841506958008, "step": 33110 }, { "epoch": 1.5376758438181901, "grad_norm": 44.21295928955078, "learning_rate": 2.438599749291982e-07, "logits/chosen": -19.588363647460938, "logits/rejected": -18.303518295288086, "logps/chosen": -441.46484375, "logps/rejected": -298.28802490234375, "loss": 0.303, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.9086270332336426, "rewards/margins": 1.871523141860962, "rewards/rejected": 2.0371041297912598, "step": 33120 }, { "epoch": 1.5381401179256233, "grad_norm": 327.5699157714844, "learning_rate": 2.437825959112927e-07, "logits/chosen": -18.06851577758789, "logits/rejected": -18.194900512695312, "logps/chosen": -383.4903869628906, "logps/rejected": -391.7865295410156, "loss": 1.3147, "rewards/accuracies": 0.5, "rewards/chosen": 3.006798267364502, "rewards/margins": 0.046786118298769, "rewards/rejected": 2.960012197494507, "step": 33130 }, { "epoch": 1.5386043920330563, "grad_norm": 18.52825927734375, "learning_rate": 2.437052168933872e-07, "logits/chosen": -17.873504638671875, "logits/rejected": -17.617998123168945, "logps/chosen": -362.51959228515625, "logps/rejected": -326.5459289550781, "loss": 0.8838, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.1437344551086426, "rewards/margins": 0.35810431838035583, "rewards/rejected": 1.785630464553833, "step": 33140 }, { "epoch": 1.5390686661404893, "grad_norm": 67.77261352539062, "learning_rate": 
2.4362783787548167e-07, "logits/chosen": -18.71549415588379, "logits/rejected": -18.389598846435547, "logps/chosen": -379.2729187011719, "logps/rejected": -318.9885559082031, "loss": 0.7284, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.652977705001831, "rewards/margins": 0.8202753067016602, "rewards/rejected": 2.832702159881592, "step": 33150 }, { "epoch": 1.5395329402479225, "grad_norm": 264.49298095703125, "learning_rate": 2.435504588575762e-07, "logits/chosen": -19.289669036865234, "logits/rejected": -18.687232971191406, "logps/chosen": -315.11395263671875, "logps/rejected": -288.82403564453125, "loss": 0.7753, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.008612632751465, "rewards/margins": 1.2657774686813354, "rewards/rejected": 2.7428348064422607, "step": 33160 }, { "epoch": 1.5399972143553553, "grad_norm": 146.31179809570312, "learning_rate": 2.434730798396707e-07, "logits/chosen": -18.36213493347168, "logits/rejected": -16.763254165649414, "logps/chosen": -446.7506408691406, "logps/rejected": -255.24234008789062, "loss": 0.2915, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.3061745166778564, "rewards/margins": 2.2086522579193115, "rewards/rejected": 1.0975223779678345, "step": 33170 }, { "epoch": 1.5404614884627885, "grad_norm": 89.66636657714844, "learning_rate": 2.4339570082176515e-07, "logits/chosen": -18.26115608215332, "logits/rejected": -17.43033218383789, "logps/chosen": -551.0714111328125, "logps/rejected": -351.9065856933594, "loss": 0.4143, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.715909957885742, "rewards/margins": 1.529577374458313, "rewards/rejected": 2.1863327026367188, "step": 33180 }, { "epoch": 1.5409257625702215, "grad_norm": 27.169851303100586, "learning_rate": 2.4331832180385966e-07, "logits/chosen": -19.553638458251953, "logits/rejected": -18.575786590576172, "logps/chosen": -367.20635986328125, "logps/rejected": -252.4252166748047, "loss": 0.514, "rewards/accuracies": 
0.800000011920929, "rewards/chosen": 3.6066036224365234, "rewards/margins": 1.611946702003479, "rewards/rejected": 1.9946571588516235, "step": 33190 }, { "epoch": 1.5413900366776545, "grad_norm": 113.8071060180664, "learning_rate": 2.432409427859541e-07, "logits/chosen": -18.636539459228516, "logits/rejected": -17.39735984802246, "logps/chosen": -419.10382080078125, "logps/rejected": -304.5079040527344, "loss": 0.9572, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.125863552093506, "rewards/margins": 0.663888692855835, "rewards/rejected": 3.461974620819092, "step": 33200 }, { "epoch": 1.5418543107850875, "grad_norm": 63.89536666870117, "learning_rate": 2.4316356376804863e-07, "logits/chosen": -18.511877059936523, "logits/rejected": -18.217655181884766, "logps/chosen": -418.51727294921875, "logps/rejected": -422.305419921875, "loss": 0.9088, "rewards/accuracies": 0.5, "rewards/chosen": 4.446534156799316, "rewards/margins": 0.6307441592216492, "rewards/rejected": 3.8157896995544434, "step": 33210 }, { "epoch": 1.5423185848925205, "grad_norm": 159.18115234375, "learning_rate": 2.4308618475014314e-07, "logits/chosen": -18.094173431396484, "logits/rejected": -18.288944244384766, "logps/chosen": -385.99462890625, "logps/rejected": -408.18438720703125, "loss": 1.0188, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.073979616165161, "rewards/margins": -0.27450859546661377, "rewards/rejected": 3.3484878540039062, "step": 33220 }, { "epoch": 1.5427828589999537, "grad_norm": 176.63893127441406, "learning_rate": 2.4300880573223765e-07, "logits/chosen": -18.33346176147461, "logits/rejected": -18.115129470825195, "logps/chosen": -356.72796630859375, "logps/rejected": -277.07574462890625, "loss": 0.7953, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.541802406311035, "rewards/margins": 0.7758995890617371, "rewards/rejected": 1.7659028768539429, "step": 33230 }, { "epoch": 1.5432471331073865, "grad_norm": 36.9507942199707, 
"learning_rate": 2.4293142671433216e-07, "logits/chosen": -18.626480102539062, "logits/rejected": -18.37542152404785, "logps/chosen": -427.83453369140625, "logps/rejected": -351.21990966796875, "loss": 0.5013, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.662869453430176, "rewards/margins": 0.9666913151741028, "rewards/rejected": 1.6961778402328491, "step": 33240 }, { "epoch": 1.5437114072148197, "grad_norm": 149.2836456298828, "learning_rate": 2.428540476964266e-07, "logits/chosen": -18.343807220458984, "logits/rejected": -17.948684692382812, "logps/chosen": -329.3955078125, "logps/rejected": -261.27716064453125, "loss": 0.7155, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.8776891231536865, "rewards/margins": 1.0665242671966553, "rewards/rejected": 1.8111652135849, "step": 33250 }, { "epoch": 1.5441756813222527, "grad_norm": 156.50192260742188, "learning_rate": 2.4277666867852113e-07, "logits/chosen": -18.964162826538086, "logits/rejected": -17.637035369873047, "logps/chosen": -405.6051330566406, "logps/rejected": -273.4315490722656, "loss": 0.7772, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.1923155784606934, "rewards/margins": 1.104193925857544, "rewards/rejected": 2.0881216526031494, "step": 33260 }, { "epoch": 1.5446399554296857, "grad_norm": 142.24794006347656, "learning_rate": 2.4269928966061564e-07, "logits/chosen": -19.06098175048828, "logits/rejected": -17.782428741455078, "logps/chosen": -439.3369140625, "logps/rejected": -286.48834228515625, "loss": 0.8946, "rewards/accuracies": 0.5, "rewards/chosen": 3.71809458732605, "rewards/margins": 1.2680120468139648, "rewards/rejected": 2.450082302093506, "step": 33270 }, { "epoch": 1.5451042295371187, "grad_norm": 115.33563232421875, "learning_rate": 2.426219106427101e-07, "logits/chosen": -19.341747283935547, "logits/rejected": -18.958402633666992, "logps/chosen": -422.93841552734375, "logps/rejected": -397.8092346191406, "loss": 0.7882, "rewards/accuracies": 
0.4000000059604645, "rewards/chosen": 2.9749228954315186, "rewards/margins": 0.23828181624412537, "rewards/rejected": 2.7366414070129395, "step": 33280 }, { "epoch": 1.5455685036445517, "grad_norm": 1.0033787488937378, "learning_rate": 2.425445316248046e-07, "logits/chosen": -18.655071258544922, "logits/rejected": -18.54902458190918, "logps/chosen": -469.85516357421875, "logps/rejected": -377.67919921875, "loss": 0.5488, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.409420967102051, "rewards/margins": 1.0500720739364624, "rewards/rejected": 3.3593482971191406, "step": 33290 }, { "epoch": 1.5460327777519849, "grad_norm": 168.18121337890625, "learning_rate": 2.4246715260689907e-07, "logits/chosen": -18.3698673248291, "logits/rejected": -18.532062530517578, "logps/chosen": -327.3055114746094, "logps/rejected": -325.9349060058594, "loss": 1.5566, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.5156846046447754, "rewards/margins": -0.44059839844703674, "rewards/rejected": 2.9562833309173584, "step": 33300 }, { "epoch": 1.5464970518594177, "grad_norm": 23.7297420501709, "learning_rate": 2.423897735889936e-07, "logits/chosen": -18.893085479736328, "logits/rejected": -18.75908088684082, "logps/chosen": -392.3071594238281, "logps/rejected": -357.39776611328125, "loss": 0.4259, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.836620330810547, "rewards/margins": 1.378487229347229, "rewards/rejected": 2.4581332206726074, "step": 33310 }, { "epoch": 1.5469613259668509, "grad_norm": 4.4832305908203125, "learning_rate": 2.423123945710881e-07, "logits/chosen": -18.698993682861328, "logits/rejected": -17.920236587524414, "logps/chosen": -488.77984619140625, "logps/rejected": -451.3609313964844, "loss": 0.8921, "rewards/accuracies": 0.5, "rewards/chosen": 4.334803581237793, "rewards/margins": 0.4100637435913086, "rewards/rejected": 3.9247403144836426, "step": 33320 }, { "epoch": 1.5474256000742839, "grad_norm": 3.0181639194488525, 
"learning_rate": 2.422350155531826e-07, "logits/chosen": -18.815710067749023, "logits/rejected": -18.431629180908203, "logps/chosen": -318.02587890625, "logps/rejected": -291.4188537597656, "loss": 1.1692, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.760387659072876, "rewards/margins": 0.47549161314964294, "rewards/rejected": 2.284895658493042, "step": 33330 }, { "epoch": 1.5478898741817169, "grad_norm": 106.23837280273438, "learning_rate": 2.4215763653527706e-07, "logits/chosen": -18.540668487548828, "logits/rejected": -18.469026565551758, "logps/chosen": -241.9314422607422, "logps/rejected": -223.7248992919922, "loss": 0.5163, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.8349382877349854, "rewards/margins": 0.6496815085411072, "rewards/rejected": 1.1852567195892334, "step": 33340 }, { "epoch": 1.54835414828915, "grad_norm": 43.94362258911133, "learning_rate": 2.420802575173716e-07, "logits/chosen": -19.696691513061523, "logits/rejected": -18.728063583374023, "logps/chosen": -347.93316650390625, "logps/rejected": -307.3643798828125, "loss": 1.1232, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.9722750186920166, "rewards/margins": 0.3323938250541687, "rewards/rejected": 2.639881134033203, "step": 33350 }, { "epoch": 1.5488184223965829, "grad_norm": 9.92565631866455, "learning_rate": 2.420028784994661e-07, "logits/chosen": -17.790790557861328, "logits/rejected": -17.102455139160156, "logps/chosen": -307.1046447753906, "logps/rejected": -225.7345733642578, "loss": 0.8316, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.378667116165161, "rewards/margins": 0.4577321410179138, "rewards/rejected": 1.920935034751892, "step": 33360 }, { "epoch": 1.549282696504016, "grad_norm": 2.413302183151245, "learning_rate": 2.419254994815606e-07, "logits/chosen": -17.879467010498047, "logits/rejected": -17.169336318969727, "logps/chosen": -264.7651062011719, "logps/rejected": -192.6473846435547, "loss": 0.3221, 
"rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.184858560562134, "rewards/margins": 1.4132959842681885, "rewards/rejected": 0.7715623378753662, "step": 33370 }, { "epoch": 1.5497469706114488, "grad_norm": 43.51560974121094, "learning_rate": 2.4184812046365506e-07, "logits/chosen": -19.9926700592041, "logits/rejected": -17.723400115966797, "logps/chosen": -380.6183166503906, "logps/rejected": -290.4002685546875, "loss": 0.6559, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.7672743797302246, "rewards/margins": 1.172696828842163, "rewards/rejected": 1.5945775508880615, "step": 33380 }, { "epoch": 1.550211244718882, "grad_norm": 0.8133816123008728, "learning_rate": 2.4177074144574957e-07, "logits/chosen": -18.96009635925293, "logits/rejected": -18.44998550415039, "logps/chosen": -410.4927673339844, "logps/rejected": -388.35101318359375, "loss": 1.1612, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.321282148361206, "rewards/margins": 0.6237958669662476, "rewards/rejected": 2.697486400604248, "step": 33390 }, { "epoch": 1.550675518826315, "grad_norm": 0.8569897413253784, "learning_rate": 2.4169336242784403e-07, "logits/chosen": -19.703062057495117, "logits/rejected": -18.837045669555664, "logps/chosen": -358.9500427246094, "logps/rejected": -313.6302490234375, "loss": 0.7148, "rewards/accuracies": 0.5, "rewards/chosen": 3.8139331340789795, "rewards/margins": 1.1068753004074097, "rewards/rejected": 2.7070579528808594, "step": 33400 }, { "epoch": 1.551139792933748, "grad_norm": 2.9967849254608154, "learning_rate": 2.4161598340993854e-07, "logits/chosen": -18.358665466308594, "logits/rejected": -17.74191665649414, "logps/chosen": -361.6256103515625, "logps/rejected": -310.3423767089844, "loss": 0.7834, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.7813761234283447, "rewards/margins": 0.7747389078140259, "rewards/rejected": 2.0066370964050293, "step": 33410 }, { "epoch": 1.5516040670411813, "grad_norm": 
0.17509135603904724, "learning_rate": 2.4153860439203305e-07, "logits/chosen": -18.720367431640625, "logits/rejected": -17.58334732055664, "logps/chosen": -538.7440795898438, "logps/rejected": -361.0092468261719, "loss": 0.4363, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.8277366161346436, "rewards/margins": 2.0056488513946533, "rewards/rejected": 1.8220876455307007, "step": 33420 }, { "epoch": 1.552068341148614, "grad_norm": 1.8833688497543335, "learning_rate": 2.4146122537412756e-07, "logits/chosen": -18.358470916748047, "logits/rejected": -17.05486488342285, "logps/chosen": -330.46868896484375, "logps/rejected": -258.1661376953125, "loss": 0.2326, "rewards/accuracies": 1.0, "rewards/chosen": 2.942370653152466, "rewards/margins": 2.6462974548339844, "rewards/rejected": 0.29607343673706055, "step": 33430 }, { "epoch": 1.5525326152560472, "grad_norm": 74.42809295654297, "learning_rate": 2.41383846356222e-07, "logits/chosen": -19.87826156616211, "logits/rejected": -18.976421356201172, "logps/chosen": -461.93487548828125, "logps/rejected": -379.99285888671875, "loss": 0.5502, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.066560745239258, "rewards/margins": 0.8226537704467773, "rewards/rejected": 3.2439067363739014, "step": 33440 }, { "epoch": 1.5529968893634802, "grad_norm": 37.27484893798828, "learning_rate": 2.4130646733831653e-07, "logits/chosen": -19.1053524017334, "logits/rejected": -17.641658782958984, "logps/chosen": -514.0693359375, "logps/rejected": -379.03570556640625, "loss": 0.3111, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.4933271408081055, "rewards/margins": 2.5628724098205566, "rewards/rejected": 1.9304554462432861, "step": 33450 }, { "epoch": 1.5534611634709132, "grad_norm": 19.32392120361328, "learning_rate": 2.4122908832041104e-07, "logits/chosen": -19.405664443969727, "logits/rejected": -18.663074493408203, "logps/chosen": -351.49554443359375, "logps/rejected": -286.009765625, "loss": 0.6152, 
"rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.298651933670044, "rewards/margins": 0.7918513417243958, "rewards/rejected": 2.506800651550293, "step": 33460 }, { "epoch": 1.5539254375783462, "grad_norm": 1.0891892910003662, "learning_rate": 2.4115170930250555e-07, "logits/chosen": -18.663227081298828, "logits/rejected": -17.201461791992188, "logps/chosen": -449.18621826171875, "logps/rejected": -247.38314819335938, "loss": 0.4818, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.9395737648010254, "rewards/margins": 2.998136043548584, "rewards/rejected": 0.9414380192756653, "step": 33470 }, { "epoch": 1.5543897116857792, "grad_norm": 40.26100540161133, "learning_rate": 2.410743302846e-07, "logits/chosen": -19.493555068969727, "logits/rejected": -18.978038787841797, "logps/chosen": -319.1508483886719, "logps/rejected": -262.039306640625, "loss": 0.5095, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.505432605743408, "rewards/margins": 1.398826241493225, "rewards/rejected": 2.1066062450408936, "step": 33480 }, { "epoch": 1.5548539857932124, "grad_norm": 55.81448745727539, "learning_rate": 2.409969512666945e-07, "logits/chosen": -18.487285614013672, "logits/rejected": -18.89971351623535, "logps/chosen": -387.3697509765625, "logps/rejected": -339.6334533691406, "loss": 0.9573, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.576997756958008, "rewards/margins": 0.38519272208213806, "rewards/rejected": 3.191805124282837, "step": 33490 }, { "epoch": 1.5553182599006452, "grad_norm": 10.741705894470215, "learning_rate": 2.40919572248789e-07, "logits/chosen": -18.701112747192383, "logits/rejected": -17.524147033691406, "logps/chosen": -519.6452026367188, "logps/rejected": -362.54010009765625, "loss": 0.5259, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.208430290222168, "rewards/margins": 1.087131381034851, "rewards/rejected": 3.1212990283966064, "step": 33500 }, { "epoch": 1.5557825340080784, "grad_norm": 
20.38425636291504, "learning_rate": 2.408421932308835e-07, "logits/chosen": -18.634185791015625, "logits/rejected": -18.616809844970703, "logps/chosen": -369.1441650390625, "logps/rejected": -377.6538391113281, "loss": 1.2315, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.352590560913086, "rewards/margins": 0.5390704274177551, "rewards/rejected": 2.8135201930999756, "step": 33510 }, { "epoch": 1.5562468081155114, "grad_norm": 5.532230377197266, "learning_rate": 2.40764814212978e-07, "logits/chosen": -18.25310516357422, "logits/rejected": -17.134746551513672, "logps/chosen": -418.4656677246094, "logps/rejected": -258.9044494628906, "loss": 0.4685, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.079094886779785, "rewards/margins": 2.48856258392334, "rewards/rejected": 1.5905327796936035, "step": 33520 }, { "epoch": 1.5567110822229444, "grad_norm": 146.27978515625, "learning_rate": 2.4068743519507246e-07, "logits/chosen": -17.772682189941406, "logits/rejected": -17.485841751098633, "logps/chosen": -329.93194580078125, "logps/rejected": -226.9614715576172, "loss": 1.0367, "rewards/accuracies": 0.5, "rewards/chosen": 1.7692636251449585, "rewards/margins": 0.17522032558918, "rewards/rejected": 1.594043254852295, "step": 33530 }, { "epoch": 1.5571753563303774, "grad_norm": 21.318273544311523, "learning_rate": 2.4061005617716697e-07, "logits/chosen": -19.108680725097656, "logits/rejected": -18.051794052124023, "logps/chosen": -469.1288146972656, "logps/rejected": -358.50848388671875, "loss": 0.3937, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.9895808696746826, "rewards/margins": 1.7962623834609985, "rewards/rejected": 2.1933183670043945, "step": 33540 }, { "epoch": 1.5576396304378104, "grad_norm": 70.69354248046875, "learning_rate": 2.405326771592615e-07, "logits/chosen": -19.64032554626465, "logits/rejected": -17.882238388061523, "logps/chosen": -457.01513671875, "logps/rejected": -303.1407775878906, "loss": 0.3775, 
"rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.207716941833496, "rewards/margins": 2.215967893600464, "rewards/rejected": 1.991748571395874, "step": 33550 }, { "epoch": 1.5581039045452436, "grad_norm": 73.50343322753906, "learning_rate": 2.40455298141356e-07, "logits/chosen": -19.202030181884766, "logits/rejected": -18.126859664916992, "logps/chosen": -501.9789123535156, "logps/rejected": -358.622314453125, "loss": 0.3291, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.015153884887695, "rewards/margins": 1.5577467679977417, "rewards/rejected": 2.457407236099243, "step": 33560 }, { "epoch": 1.5585681786526764, "grad_norm": 149.951171875, "learning_rate": 2.403779191234505e-07, "logits/chosen": -18.918272018432617, "logits/rejected": -17.203731536865234, "logps/chosen": -348.7635498046875, "logps/rejected": -184.15805053710938, "loss": 0.5278, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.9417026042938232, "rewards/margins": 1.578942060470581, "rewards/rejected": 1.3627604246139526, "step": 33570 }, { "epoch": 1.5590324527601096, "grad_norm": 10.81296157836914, "learning_rate": 2.4030054010554497e-07, "logits/chosen": -18.281423568725586, "logits/rejected": -17.250446319580078, "logps/chosen": -412.78985595703125, "logps/rejected": -343.1317443847656, "loss": 0.7199, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4866745471954346, "rewards/margins": 1.326159119606018, "rewards/rejected": 2.160515785217285, "step": 33580 }, { "epoch": 1.5594967268675426, "grad_norm": 60.134361267089844, "learning_rate": 2.402231610876395e-07, "logits/chosen": -18.40131378173828, "logits/rejected": -17.69510841369629, "logps/chosen": -298.27215576171875, "logps/rejected": -234.93017578125, "loss": 0.5078, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.3515400886535645, "rewards/margins": 1.1124162673950195, "rewards/rejected": 1.2391239404678345, "step": 33590 }, { "epoch": 1.5599610009749756, "grad_norm": 
48.43228530883789, "learning_rate": 2.4014578206973393e-07, "logits/chosen": -18.89438819885254, "logits/rejected": -17.523475646972656, "logps/chosen": -412.8824768066406, "logps/rejected": -263.6022033691406, "loss": 0.4638, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.7606747150421143, "rewards/margins": 1.7039833068847656, "rewards/rejected": 2.0566916465759277, "step": 33600 }, { "epoch": 1.5604252750824088, "grad_norm": 31.945358276367188, "learning_rate": 2.4006840305182845e-07, "logits/chosen": -19.248531341552734, "logits/rejected": -18.885364532470703, "logps/chosen": -438.58843994140625, "logps/rejected": -381.56585693359375, "loss": 0.645, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.321025848388672, "rewards/margins": 1.0942559242248535, "rewards/rejected": 3.2267696857452393, "step": 33610 }, { "epoch": 1.5608895491898416, "grad_norm": 83.39540100097656, "learning_rate": 2.3999102403392296e-07, "logits/chosen": -19.277164459228516, "logits/rejected": -19.46677017211914, "logps/chosen": -388.55963134765625, "logps/rejected": -384.3934326171875, "loss": 1.2447, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.6257503032684326, "rewards/margins": -0.18310639262199402, "rewards/rejected": 2.808856725692749, "step": 33620 }, { "epoch": 1.5613538232972748, "grad_norm": 0.9375561475753784, "learning_rate": 2.399136450160174e-07, "logits/chosen": -19.088550567626953, "logits/rejected": -18.185527801513672, "logps/chosen": -333.65283203125, "logps/rejected": -318.56884765625, "loss": 0.6977, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.381662368774414, "rewards/margins": 0.6652154326438904, "rewards/rejected": 2.716446876525879, "step": 33630 }, { "epoch": 1.5618180974047078, "grad_norm": 58.032623291015625, "learning_rate": 2.3983626599811193e-07, "logits/chosen": -18.486000061035156, "logits/rejected": -18.296772003173828, "logps/chosen": -356.3987731933594, "logps/rejected": -328.93829345703125, 
"loss": 1.3225, "rewards/accuracies": 0.5, "rewards/chosen": 2.302715301513672, "rewards/margins": -0.03056330606341362, "rewards/rejected": 2.3332784175872803, "step": 33640 }, { "epoch": 1.5622823715121408, "grad_norm": 198.03884887695312, "learning_rate": 2.3975888698020644e-07, "logits/chosen": -18.590829849243164, "logits/rejected": -17.989070892333984, "logps/chosen": -431.68170166015625, "logps/rejected": -380.48052978515625, "loss": 0.5135, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.9250411987304688, "rewards/margins": 1.2613900899887085, "rewards/rejected": 2.66365122795105, "step": 33650 }, { "epoch": 1.5627466456195738, "grad_norm": 142.66317749023438, "learning_rate": 2.3968150796230095e-07, "logits/chosen": -18.6257381439209, "logits/rejected": -17.519268035888672, "logps/chosen": -375.820556640625, "logps/rejected": -256.6063232421875, "loss": 0.4537, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.7081260681152344, "rewards/margins": 1.6445302963256836, "rewards/rejected": 2.063595771789551, "step": 33660 }, { "epoch": 1.5632109197270068, "grad_norm": 155.6817169189453, "learning_rate": 2.3960412894439546e-07, "logits/chosen": -18.907371520996094, "logits/rejected": -17.715879440307617, "logps/chosen": -426.08734130859375, "logps/rejected": -319.70831298828125, "loss": 0.4614, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.9736640453338623, "rewards/margins": 1.4652512073516846, "rewards/rejected": 2.5084128379821777, "step": 33670 }, { "epoch": 1.56367519383444, "grad_norm": 203.26364135742188, "learning_rate": 2.395267499264899e-07, "logits/chosen": -19.531152725219727, "logits/rejected": -18.152538299560547, "logps/chosen": -351.46728515625, "logps/rejected": -239.20315551757812, "loss": 0.8887, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.4421565532684326, "rewards/margins": 1.2078266143798828, "rewards/rejected": 2.234330177307129, "step": 33680 }, { "epoch": 1.5641394679418728, 
"grad_norm": 48.66617965698242, "learning_rate": 2.3944937090858443e-07, "logits/chosen": -20.586475372314453, "logits/rejected": -19.024465560913086, "logps/chosen": -417.0020446777344, "logps/rejected": -263.0369567871094, "loss": 0.3056, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4293313026428223, "rewards/margins": 1.8474887609481812, "rewards/rejected": 1.5818426609039307, "step": 33690 }, { "epoch": 1.564603742049306, "grad_norm": 5.258441925048828, "learning_rate": 2.393719918906789e-07, "logits/chosen": -19.687870025634766, "logits/rejected": -18.932729721069336, "logps/chosen": -400.73187255859375, "logps/rejected": -429.74951171875, "loss": 0.4871, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.4621658325195312, "rewards/margins": 1.2204595804214478, "rewards/rejected": 2.241705894470215, "step": 33700 }, { "epoch": 1.565068016156739, "grad_norm": 101.01791381835938, "learning_rate": 2.392946128727734e-07, "logits/chosen": -19.224925994873047, "logits/rejected": -18.582738876342773, "logps/chosen": -313.2638244628906, "logps/rejected": -270.51165771484375, "loss": 0.8185, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.8404242992401123, "rewards/margins": 0.8285205960273743, "rewards/rejected": 2.011903762817383, "step": 33710 }, { "epoch": 1.565532290264172, "grad_norm": 103.17813110351562, "learning_rate": 2.392172338548679e-07, "logits/chosen": -18.940021514892578, "logits/rejected": -18.42518424987793, "logps/chosen": -525.353759765625, "logps/rejected": -436.702880859375, "loss": 0.3761, "rewards/accuracies": 1.0, "rewards/chosen": 3.622241973876953, "rewards/margins": 1.136436939239502, "rewards/rejected": 2.485805034637451, "step": 33720 }, { "epoch": 1.565996564371605, "grad_norm": 77.29458618164062, "learning_rate": 2.3913985483696237e-07, "logits/chosen": -18.01726531982422, "logits/rejected": -18.693561553955078, "logps/chosen": -340.76611328125, "logps/rejected": -397.6407775878906, "loss": 
0.9054, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.3152637481689453, "rewards/margins": -0.16178445518016815, "rewards/rejected": 2.477048397064209, "step": 33730 }, { "epoch": 1.566460838479038, "grad_norm": 2.502448320388794, "learning_rate": 2.390624758190569e-07, "logits/chosen": -18.318359375, "logits/rejected": -17.61086654663086, "logps/chosen": -330.59466552734375, "logps/rejected": -245.4698486328125, "loss": 0.8007, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.240773916244507, "rewards/margins": 0.9154060482978821, "rewards/rejected": 1.3253681659698486, "step": 33740 }, { "epoch": 1.5669251125864712, "grad_norm": 51.14931869506836, "learning_rate": 2.389850968011514e-07, "logits/chosen": -19.701889038085938, "logits/rejected": -19.97653579711914, "logps/chosen": -313.00201416015625, "logps/rejected": -330.3045959472656, "loss": 0.9241, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.4159553050994873, "rewards/margins": 0.1630655974149704, "rewards/rejected": 3.252889633178711, "step": 33750 }, { "epoch": 1.567389386693904, "grad_norm": 4.283768177032471, "learning_rate": 2.389077177832459e-07, "logits/chosen": -18.7484188079834, "logits/rejected": -17.945026397705078, "logps/chosen": -439.5369567871094, "logps/rejected": -420.39776611328125, "loss": 0.7624, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.497234344482422, "rewards/margins": 1.0267059803009033, "rewards/rejected": 3.4705283641815186, "step": 33760 }, { "epoch": 1.5678536608013371, "grad_norm": 3.194075345993042, "learning_rate": 2.3883033876534036e-07, "logits/chosen": -19.378292083740234, "logits/rejected": -18.270732879638672, "logps/chosen": -434.37188720703125, "logps/rejected": -290.8425598144531, "loss": 0.3787, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.396601676940918, "rewards/margins": 1.8663885593414307, "rewards/rejected": 2.530212640762329, "step": 33770 }, { "epoch": 1.5683179349087701, 
"grad_norm": 58.20246505737305, "learning_rate": 2.3875295974743487e-07, "logits/chosen": -18.02005386352539, "logits/rejected": -17.6226863861084, "logps/chosen": -347.2506103515625, "logps/rejected": -281.5003967285156, "loss": 1.0391, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.72033953666687, "rewards/margins": 0.48931270837783813, "rewards/rejected": 2.2310268878936768, "step": 33780 }, { "epoch": 1.5687822090162031, "grad_norm": 14.48184871673584, "learning_rate": 2.386755807295294e-07, "logits/chosen": -17.917747497558594, "logits/rejected": -17.470703125, "logps/chosen": -313.6932678222656, "logps/rejected": -287.81793212890625, "loss": 0.6008, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.911867618560791, "rewards/margins": 0.9677192568778992, "rewards/rejected": 1.9441478252410889, "step": 33790 }, { "epoch": 1.5692464831236363, "grad_norm": 39.038944244384766, "learning_rate": 2.385982017116239e-07, "logits/chosen": -18.02106285095215, "logits/rejected": -17.134815216064453, "logps/chosen": -383.57537841796875, "logps/rejected": -266.0039978027344, "loss": 0.4389, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.040769100189209, "rewards/margins": 1.8472048044204712, "rewards/rejected": 1.1935641765594482, "step": 33800 }, { "epoch": 1.5697107572310691, "grad_norm": 35.01860046386719, "learning_rate": 2.3852082269371835e-07, "logits/chosen": -18.93498992919922, "logits/rejected": -18.180185317993164, "logps/chosen": -298.34124755859375, "logps/rejected": -271.58734130859375, "loss": 0.729, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.4977176189422607, "rewards/margins": 0.5742546916007996, "rewards/rejected": 1.9234631061553955, "step": 33810 }, { "epoch": 1.5701750313385023, "grad_norm": 0.09254680573940277, "learning_rate": 2.3844344367581287e-07, "logits/chosen": -18.656469345092773, "logits/rejected": -18.951427459716797, "logps/chosen": -359.18206787109375, "logps/rejected": 
-302.59051513671875, "loss": 1.5823, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.5399372577667236, "rewards/margins": -0.3379085063934326, "rewards/rejected": 3.8778457641601562, "step": 33820 }, { "epoch": 1.5706393054459353, "grad_norm": 75.73117065429688, "learning_rate": 2.3836606465790735e-07, "logits/chosen": -18.514326095581055, "logits/rejected": -17.91465187072754, "logps/chosen": -404.50640869140625, "logps/rejected": -395.3255615234375, "loss": 0.5608, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.013355731964111, "rewards/margins": 1.2164278030395508, "rewards/rejected": 2.7969276905059814, "step": 33830 }, { "epoch": 1.5711035795533683, "grad_norm": 14.890840530395508, "learning_rate": 2.3828868564000184e-07, "logits/chosen": -18.497787475585938, "logits/rejected": -18.53074073791504, "logps/chosen": -377.4437255859375, "logps/rejected": -349.7910461425781, "loss": 0.9509, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.468327760696411, "rewards/margins": 0.8534091711044312, "rewards/rejected": 2.6149184703826904, "step": 33840 }, { "epoch": 1.5715678536608013, "grad_norm": 206.02891540527344, "learning_rate": 2.3821130662209635e-07, "logits/chosen": -18.50018882751465, "logits/rejected": -18.257556915283203, "logps/chosen": -404.78485107421875, "logps/rejected": -342.76220703125, "loss": 1.3009, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.908421039581299, "rewards/margins": 0.22559337317943573, "rewards/rejected": 2.6828277111053467, "step": 33850 }, { "epoch": 1.5720321277682343, "grad_norm": 24.645763397216797, "learning_rate": 2.3813392760419086e-07, "logits/chosen": -19.502117156982422, "logits/rejected": -17.46044921875, "logps/chosen": -490.49090576171875, "logps/rejected": -304.942626953125, "loss": 0.2516, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.769711494445801, "rewards/margins": 2.9184327125549316, "rewards/rejected": 1.8512790203094482, "step": 33860 }, { 
"epoch": 1.5724964018756675, "grad_norm": 4.493155002593994, "learning_rate": 2.3805654858628532e-07, "logits/chosen": -18.325546264648438, "logits/rejected": -17.204397201538086, "logps/chosen": -366.0186767578125, "logps/rejected": -270.0227355957031, "loss": 0.5006, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.909700393676758, "rewards/margins": 1.5555156469345093, "rewards/rejected": 1.3541845083236694, "step": 33870 }, { "epoch": 1.5729606759831003, "grad_norm": 160.5437469482422, "learning_rate": 2.3797916956837983e-07, "logits/chosen": -19.248401641845703, "logits/rejected": -18.411922454833984, "logps/chosen": -404.9128723144531, "logps/rejected": -318.10125732421875, "loss": 0.7961, "rewards/accuracies": 0.5, "rewards/chosen": 3.03916335105896, "rewards/margins": 0.2654855251312256, "rewards/rejected": 2.7736780643463135, "step": 33880 }, { "epoch": 1.5734249500905335, "grad_norm": 88.32743072509766, "learning_rate": 2.379017905504743e-07, "logits/chosen": -19.057174682617188, "logits/rejected": -17.914569854736328, "logps/chosen": -434.81793212890625, "logps/rejected": -336.48809814453125, "loss": 0.2937, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.753838539123535, "rewards/margins": 2.2872321605682373, "rewards/rejected": 2.4666061401367188, "step": 33890 }, { "epoch": 1.5738892241979665, "grad_norm": 205.22023010253906, "learning_rate": 2.3782441153256882e-07, "logits/chosen": -19.236438751220703, "logits/rejected": -18.970794677734375, "logps/chosen": -321.71978759765625, "logps/rejected": -295.9811096191406, "loss": 1.2312, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.5530121326446533, "rewards/margins": 0.008730674162507057, "rewards/rejected": 2.544281482696533, "step": 33900 }, { "epoch": 1.5743534983053995, "grad_norm": 52.88432312011719, "learning_rate": 2.3774703251466333e-07, "logits/chosen": -18.685546875, "logits/rejected": -18.199167251586914, "logps/chosen": -359.0441589355469, 
"logps/rejected": -342.6450500488281, "loss": 0.5862, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.4797871112823486, "rewards/margins": 0.810259997844696, "rewards/rejected": 2.6695268154144287, "step": 33910 }, { "epoch": 1.5748177724128325, "grad_norm": 1.0434916019439697, "learning_rate": 2.376696534967578e-07, "logits/chosen": -18.584360122680664, "logits/rejected": -17.553268432617188, "logps/chosen": -335.12701416015625, "logps/rejected": -249.78463745117188, "loss": 0.594, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2852375507354736, "rewards/margins": 1.1129200458526611, "rewards/rejected": 2.1723172664642334, "step": 33920 }, { "epoch": 1.5752820465202655, "grad_norm": 265.6229553222656, "learning_rate": 2.375922744788523e-07, "logits/chosen": -18.763565063476562, "logits/rejected": -17.827497482299805, "logps/chosen": -433.9288635253906, "logps/rejected": -356.07635498046875, "loss": 0.2914, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.414153099060059, "rewards/margins": 2.411087989807129, "rewards/rejected": 2.0030646324157715, "step": 33930 }, { "epoch": 1.5757463206276987, "grad_norm": 112.98460388183594, "learning_rate": 2.375148954609468e-07, "logits/chosen": -19.42414093017578, "logits/rejected": -18.63054084777832, "logps/chosen": -423.09307861328125, "logps/rejected": -386.3001708984375, "loss": 0.8489, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 4.269438743591309, "rewards/margins": 0.5848151445388794, "rewards/rejected": 3.6846230030059814, "step": 33940 }, { "epoch": 1.5762105947351315, "grad_norm": 41.21625900268555, "learning_rate": 2.374375164430413e-07, "logits/chosen": -19.220890045166016, "logits/rejected": -19.46263885498047, "logps/chosen": -448.53631591796875, "logps/rejected": -507.789306640625, "loss": 1.0378, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.763185501098633, "rewards/margins": -0.40366896986961365, "rewards/rejected": 4.1668548583984375, 
"step": 33950 }, { "epoch": 1.5766748688425647, "grad_norm": 49.03807830810547, "learning_rate": 2.3736013742513579e-07, "logits/chosen": -19.223613739013672, "logits/rejected": -19.203609466552734, "logps/chosen": -278.0558166503906, "logps/rejected": -325.269287109375, "loss": 0.9525, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.315695285797119, "rewards/margins": -0.14452707767486572, "rewards/rejected": 2.460222005844116, "step": 33960 }, { "epoch": 1.5771391429499977, "grad_norm": 90.26675415039062, "learning_rate": 2.3728275840723027e-07, "logits/chosen": -18.564640045166016, "logits/rejected": -18.791963577270508, "logps/chosen": -345.83221435546875, "logps/rejected": -389.266845703125, "loss": 1.4089, "rewards/accuracies": 0.5, "rewards/chosen": 2.539768695831299, "rewards/margins": -0.48019304871559143, "rewards/rejected": 3.0199618339538574, "step": 33970 }, { "epoch": 1.5776034170574307, "grad_norm": 97.45821380615234, "learning_rate": 2.3720537938932478e-07, "logits/chosen": -18.88361167907715, "logits/rejected": -18.59021759033203, "logps/chosen": -346.7833251953125, "logps/rejected": -314.3592529296875, "loss": 0.8131, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.6267073154449463, "rewards/margins": 0.3377896249294281, "rewards/rejected": 2.2889175415039062, "step": 33980 }, { "epoch": 1.578067691164864, "grad_norm": 9.157032012939453, "learning_rate": 2.3712800037141927e-07, "logits/chosen": -19.524412155151367, "logits/rejected": -17.82906150817871, "logps/chosen": -513.7080078125, "logps/rejected": -331.8235778808594, "loss": 0.3956, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.4978179931640625, "rewards/margins": 2.4659276008605957, "rewards/rejected": 2.031890392303467, "step": 33990 }, { "epoch": 1.5785319652722967, "grad_norm": 14.249664306640625, "learning_rate": 2.3705062135351378e-07, "logits/chosen": -19.970033645629883, "logits/rejected": -18.87601661682129, "logps/chosen": 
-364.0186767578125, "logps/rejected": -347.8304138183594, "loss": 0.5212, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.0402321815490723, "rewards/margins": 0.7855672240257263, "rewards/rejected": 2.254664897918701, "step": 34000 }, { "epoch": 1.5789962393797299, "grad_norm": 2.5573935508728027, "learning_rate": 2.3697324233560826e-07, "logits/chosen": -18.578815460205078, "logits/rejected": -17.881914138793945, "logps/chosen": -471.7832946777344, "logps/rejected": -373.2145080566406, "loss": 1.5109, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.933814525604248, "rewards/margins": 0.6998729705810547, "rewards/rejected": 3.2339420318603516, "step": 34010 }, { "epoch": 1.5794605134871629, "grad_norm": 30.872058868408203, "learning_rate": 2.3689586331770275e-07, "logits/chosen": -19.455419540405273, "logits/rejected": -17.89059829711914, "logps/chosen": -329.841552734375, "logps/rejected": -252.9297332763672, "loss": 0.2237, "rewards/accuracies": 1.0, "rewards/chosen": 4.106870174407959, "rewards/margins": 1.7838859558105469, "rewards/rejected": 2.322984457015991, "step": 34020 }, { "epoch": 1.5799247875945959, "grad_norm": 151.52748107910156, "learning_rate": 2.3681848429979726e-07, "logits/chosen": -19.28452491760254, "logits/rejected": -19.41146469116211, "logps/chosen": -374.33441162109375, "logps/rejected": -315.7065124511719, "loss": 0.8514, "rewards/accuracies": 0.5, "rewards/chosen": 2.755460500717163, "rewards/margins": -0.004091096110641956, "rewards/rejected": 2.759551525115967, "step": 34030 }, { "epoch": 1.5803890617020289, "grad_norm": 54.39704895019531, "learning_rate": 2.3674110528189174e-07, "logits/chosen": -19.113637924194336, "logits/rejected": -18.564105987548828, "logps/chosen": -516.6466674804688, "logps/rejected": -399.0313415527344, "loss": 0.5837, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.578439712524414, "rewards/margins": 1.0266802310943604, "rewards/rejected": 3.551759719848633, "step": 
34040 }, { "epoch": 1.5808533358094619, "grad_norm": 18.620594024658203, "learning_rate": 2.3666372626398625e-07, "logits/chosen": -19.35245132446289, "logits/rejected": -18.579669952392578, "logps/chosen": -381.78729248046875, "logps/rejected": -298.41448974609375, "loss": 0.4508, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.488469123840332, "rewards/margins": 2.0847115516662598, "rewards/rejected": 2.4037575721740723, "step": 34050 }, { "epoch": 1.581317609916895, "grad_norm": 88.01316833496094, "learning_rate": 2.3658634724608074e-07, "logits/chosen": -18.989063262939453, "logits/rejected": -18.19814109802246, "logps/chosen": -425.42938232421875, "logps/rejected": -337.810302734375, "loss": 0.3888, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.8894901275634766, "rewards/margins": 1.6101268529891968, "rewards/rejected": 2.2793633937835693, "step": 34060 }, { "epoch": 1.5817818840243278, "grad_norm": 55.14920425415039, "learning_rate": 2.3650896822817522e-07, "logits/chosen": -18.592437744140625, "logits/rejected": -17.84177017211914, "logps/chosen": -356.4779357910156, "logps/rejected": -290.32159423828125, "loss": 0.364, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.7112765312194824, "rewards/margins": 1.3788305521011353, "rewards/rejected": 1.3324458599090576, "step": 34070 }, { "epoch": 1.582246158131761, "grad_norm": 10.514395713806152, "learning_rate": 2.3643158921026974e-07, "logits/chosen": -19.144899368286133, "logits/rejected": -18.244644165039062, "logps/chosen": -364.0157775878906, "logps/rejected": -268.28021240234375, "loss": 0.3487, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.625471591949463, "rewards/margins": 1.591615080833435, "rewards/rejected": 3.0338568687438965, "step": 34080 }, { "epoch": 1.582710432239194, "grad_norm": 2.4284133911132812, "learning_rate": 2.3635421019236425e-07, "logits/chosen": -19.481327056884766, "logits/rejected": -19.4444580078125, "logps/chosen": 
-310.7093811035156, "logps/rejected": -397.39849853515625, "loss": 0.5528, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.269224166870117, "rewards/margins": 0.8271301984786987, "rewards/rejected": 2.442094087600708, "step": 34090 }, { "epoch": 1.583174706346627, "grad_norm": 128.81524658203125, "learning_rate": 2.3627683117445873e-07, "logits/chosen": -18.413015365600586, "logits/rejected": -18.06371307373047, "logps/chosen": -408.1407165527344, "logps/rejected": -565.9461669921875, "loss": 0.7186, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.895909070968628, "rewards/margins": 0.8533647656440735, "rewards/rejected": 3.042544364929199, "step": 34100 }, { "epoch": 1.58363898045406, "grad_norm": 13.662166595458984, "learning_rate": 2.3619945215655322e-07, "logits/chosen": -18.983320236206055, "logits/rejected": -18.277021408081055, "logps/chosen": -424.7118225097656, "logps/rejected": -327.62274169921875, "loss": 0.6705, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.7160158157348633, "rewards/margins": 0.5863307118415833, "rewards/rejected": 2.1296849250793457, "step": 34110 }, { "epoch": 1.584103254561493, "grad_norm": 55.382598876953125, "learning_rate": 2.361220731386477e-07, "logits/chosen": -18.63156509399414, "logits/rejected": -17.746368408203125, "logps/chosen": -495.16571044921875, "logps/rejected": -381.89251708984375, "loss": 0.8421, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.396728038787842, "rewards/margins": 0.8453842997550964, "rewards/rejected": 3.551344394683838, "step": 34120 }, { "epoch": 1.5845675286689263, "grad_norm": 23.72047996520996, "learning_rate": 2.3605243202253275e-07, "logits/chosen": -19.943832397460938, "logits/rejected": -19.42167091369629, "logps/chosen": -395.3802185058594, "logps/rejected": -344.2972717285156, "loss": 1.3986, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.5775444507598877, "rewards/margins": -0.5574938654899597, "rewards/rejected": 
4.135037899017334, "step": 34130 }, { "epoch": 1.585031802776359, "grad_norm": 30.989641189575195, "learning_rate": 2.3597505300462726e-07, "logits/chosen": -19.18987464904785, "logits/rejected": -17.895095825195312, "logps/chosen": -412.9246520996094, "logps/rejected": -329.371337890625, "loss": 0.293, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.440384864807129, "rewards/margins": 1.7678416967391968, "rewards/rejected": 2.6725432872772217, "step": 34140 }, { "epoch": 1.5854960768837922, "grad_norm": 84.11717987060547, "learning_rate": 2.3589767398672175e-07, "logits/chosen": -18.49338722229004, "logits/rejected": -17.36989974975586, "logps/chosen": -412.1554260253906, "logps/rejected": -327.88433837890625, "loss": 0.374, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.8493313789367676, "rewards/margins": 2.2624008655548096, "rewards/rejected": 1.5869309902191162, "step": 34150 }, { "epoch": 1.5859603509912252, "grad_norm": 41.186012268066406, "learning_rate": 2.3582029496881626e-07, "logits/chosen": -19.477428436279297, "logits/rejected": -18.86638641357422, "logps/chosen": -351.90106201171875, "logps/rejected": -318.0116271972656, "loss": 0.844, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.871764659881592, "rewards/margins": 0.35276323556900024, "rewards/rejected": 2.5190012454986572, "step": 34160 }, { "epoch": 1.5864246250986582, "grad_norm": 33.18756866455078, "learning_rate": 2.3574291595091074e-07, "logits/chosen": -19.359399795532227, "logits/rejected": -17.66727638244629, "logps/chosen": -305.223876953125, "logps/rejected": -217.43716430664062, "loss": 1.0038, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.67834210395813, "rewards/margins": 0.7850008010864258, "rewards/rejected": 1.893341064453125, "step": 34170 }, { "epoch": 1.5868888992060914, "grad_norm": 18.569499969482422, "learning_rate": 2.3566553693300523e-07, "logits/chosen": -19.64735221862793, "logits/rejected": -19.143945693969727, 
"logps/chosen": -316.19390869140625, "logps/rejected": -279.58740234375, "loss": 0.7507, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0106067657470703, "rewards/margins": 0.9775503873825073, "rewards/rejected": 2.0330560207366943, "step": 34180 }, { "epoch": 1.5873531733135242, "grad_norm": 67.63286590576172, "learning_rate": 2.3558815791509974e-07, "logits/chosen": -19.299840927124023, "logits/rejected": -17.155269622802734, "logps/chosen": -519.67919921875, "logps/rejected": -264.489013671875, "loss": 0.3166, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.189546585083008, "rewards/margins": 2.898176670074463, "rewards/rejected": 2.291369915008545, "step": 34190 }, { "epoch": 1.5878174474209574, "grad_norm": 11.643331527709961, "learning_rate": 2.3551077889719422e-07, "logits/chosen": -20.2945499420166, "logits/rejected": -18.538455963134766, "logps/chosen": -375.96734619140625, "logps/rejected": -233.95260620117188, "loss": 0.3367, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.9026381969451904, "rewards/margins": 2.0434513092041016, "rewards/rejected": 1.859187364578247, "step": 34200 }, { "epoch": 1.5882817215283902, "grad_norm": 39.73754119873047, "learning_rate": 2.3543339987928874e-07, "logits/chosen": -19.877952575683594, "logits/rejected": -19.212600708007812, "logps/chosen": -474.4427185058594, "logps/rejected": -372.1077880859375, "loss": 0.3751, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.229853630065918, "rewards/margins": 1.173035979270935, "rewards/rejected": 3.056817054748535, "step": 34210 }, { "epoch": 1.5887459956358234, "grad_norm": 182.44895935058594, "learning_rate": 2.353560208613832e-07, "logits/chosen": -18.805030822753906, "logits/rejected": -17.642885208129883, "logps/chosen": -481.57965087890625, "logps/rejected": -380.00634765625, "loss": 0.4356, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.211413383483887, "rewards/margins": 1.8784313201904297, 
"rewards/rejected": 2.3329825401306152, "step": 34220 }, { "epoch": 1.5892102697432564, "grad_norm": 137.7169189453125, "learning_rate": 2.352786418434777e-07, "logits/chosen": -18.933368682861328, "logits/rejected": -18.876676559448242, "logps/chosen": -396.81011962890625, "logps/rejected": -393.78985595703125, "loss": 1.1525, "rewards/accuracies": 0.5, "rewards/chosen": 3.7194037437438965, "rewards/margins": 0.3231351375579834, "rewards/rejected": 3.396268367767334, "step": 34230 }, { "epoch": 1.5896745438506894, "grad_norm": 192.3861541748047, "learning_rate": 2.3520126282557222e-07, "logits/chosen": -19.2762393951416, "logits/rejected": -18.785552978515625, "logps/chosen": -461.34869384765625, "logps/rejected": -459.793701171875, "loss": 0.6637, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.9532463550567627, "rewards/margins": 1.1775636672973633, "rewards/rejected": 2.7756826877593994, "step": 34240 }, { "epoch": 1.5901388179581226, "grad_norm": 3.002636432647705, "learning_rate": 2.351238838076667e-07, "logits/chosen": -19.852657318115234, "logits/rejected": -18.153575897216797, "logps/chosen": -442.1031799316406, "logps/rejected": -305.4439697265625, "loss": 0.9506, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.895780563354492, "rewards/margins": 1.4644564390182495, "rewards/rejected": 2.4313242435455322, "step": 34250 }, { "epoch": 1.5906030920655554, "grad_norm": 111.19248962402344, "learning_rate": 2.350465047897612e-07, "logits/chosen": -18.965299606323242, "logits/rejected": -17.78757667541504, "logps/chosen": -447.7576599121094, "logps/rejected": -329.7833557128906, "loss": 0.5328, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.1727843284606934, "rewards/margins": 1.4693832397460938, "rewards/rejected": 1.7034008502960205, "step": 34260 }, { "epoch": 1.5910673661729886, "grad_norm": 27.00146484375, "learning_rate": 2.3496912577185567e-07, "logits/chosen": -19.6538143157959, "logits/rejected": 
-18.81495475769043, "logps/chosen": -410.2391662597656, "logps/rejected": -333.35003662109375, "loss": 0.4433, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.2644145488739014, "rewards/margins": 1.1911373138427734, "rewards/rejected": 2.073276996612549, "step": 34270 }, { "epoch": 1.5915316402804216, "grad_norm": 58.42687225341797, "learning_rate": 2.3489174675395018e-07, "logits/chosen": -19.069904327392578, "logits/rejected": -18.96820831298828, "logps/chosen": -451.012939453125, "logps/rejected": -384.65606689453125, "loss": 0.8312, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.9379935264587402, "rewards/margins": 0.5697600841522217, "rewards/rejected": 3.3682334423065186, "step": 34280 }, { "epoch": 1.5919959143878546, "grad_norm": 0.4773501753807068, "learning_rate": 2.348143677360447e-07, "logits/chosen": -19.631591796875, "logits/rejected": -18.699583053588867, "logps/chosen": -489.5252380371094, "logps/rejected": -343.3357238769531, "loss": 0.6642, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.447768211364746, "rewards/margins": 1.7516616582870483, "rewards/rejected": 2.696106195449829, "step": 34290 }, { "epoch": 1.5924601884952876, "grad_norm": 197.74600219726562, "learning_rate": 2.3473698871813918e-07, "logits/chosen": -19.699533462524414, "logits/rejected": -17.99171257019043, "logps/chosen": -342.58819580078125, "logps/rejected": -258.18157958984375, "loss": 0.5658, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.7621705532073975, "rewards/margins": 1.1131908893585205, "rewards/rejected": 1.6489797830581665, "step": 34300 }, { "epoch": 1.5929244626027206, "grad_norm": 79.0999526977539, "learning_rate": 2.346596097002337e-07, "logits/chosen": -18.657854080200195, "logits/rejected": -17.454631805419922, "logps/chosen": -472.4302673339844, "logps/rejected": -274.71331787109375, "loss": 0.464, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.255853652954102, "rewards/margins": 
1.9917749166488647, "rewards/rejected": 2.2640788555145264, "step": 34310 }, { "epoch": 1.5933887367101538, "grad_norm": 99.14498901367188, "learning_rate": 2.3458223068232815e-07, "logits/chosen": -19.05020523071289, "logits/rejected": -18.010692596435547, "logps/chosen": -521.6976928710938, "logps/rejected": -367.83245849609375, "loss": 0.4173, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.3534858226776123, "rewards/margins": 1.0255084037780762, "rewards/rejected": 2.3279776573181152, "step": 34320 }, { "epoch": 1.5938530108175866, "grad_norm": 13.072593688964844, "learning_rate": 2.3450485166442266e-07, "logits/chosen": -18.935224533081055, "logits/rejected": -18.075204849243164, "logps/chosen": -415.346923828125, "logps/rejected": -361.7529602050781, "loss": 1.185, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.233290672302246, "rewards/margins": 1.0250083208084106, "rewards/rejected": 3.208282947540283, "step": 34330 }, { "epoch": 1.5943172849250198, "grad_norm": 258.1197204589844, "learning_rate": 2.3442747264651717e-07, "logits/chosen": -19.27985191345215, "logits/rejected": -18.70786476135254, "logps/chosen": -425.7535095214844, "logps/rejected": -371.379150390625, "loss": 0.9705, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.645688772201538, "rewards/margins": -0.11965696513652802, "rewards/rejected": 2.765345811843872, "step": 34340 }, { "epoch": 1.5947815590324528, "grad_norm": 210.1661376953125, "learning_rate": 2.3435009362861166e-07, "logits/chosen": -19.289281845092773, "logits/rejected": -18.929977416992188, "logps/chosen": -368.28802490234375, "logps/rejected": -399.324951171875, "loss": 1.0346, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.5513482093811035, "rewards/margins": 0.4205819070339203, "rewards/rejected": 3.130765676498413, "step": 34350 }, { "epoch": 1.5952458331398858, "grad_norm": 0.32321223616600037, "learning_rate": 2.3427271461070617e-07, "logits/chosen": 
-19.329782485961914, "logits/rejected": -18.512216567993164, "logps/chosen": -366.82568359375, "logps/rejected": -297.1846618652344, "loss": 0.2835, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.7610855102539062, "rewards/margins": 2.059366226196289, "rewards/rejected": 1.7017192840576172, "step": 34360 }, { "epoch": 1.5957101072473188, "grad_norm": 0.6320413947105408, "learning_rate": 2.3419533559280063e-07, "logits/chosen": -18.850589752197266, "logits/rejected": -18.363508224487305, "logps/chosen": -359.1083679199219, "logps/rejected": -293.46893310546875, "loss": 0.7741, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.1391966342926025, "rewards/margins": 0.5467024445533752, "rewards/rejected": 2.592494249343872, "step": 34370 }, { "epoch": 1.5961743813547518, "grad_norm": 211.75404357910156, "learning_rate": 2.3411795657489514e-07, "logits/chosen": -19.042518615722656, "logits/rejected": -18.32938003540039, "logps/chosen": -377.1824645996094, "logps/rejected": -319.4835510253906, "loss": 0.3934, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.6217739582061768, "rewards/margins": 1.4902678728103638, "rewards/rejected": 2.1315059661865234, "step": 34380 }, { "epoch": 1.596638655462185, "grad_norm": 70.60093688964844, "learning_rate": 2.3404057755698965e-07, "logits/chosen": -17.768598556518555, "logits/rejected": -17.4721622467041, "logps/chosen": -270.96673583984375, "logps/rejected": -270.4194030761719, "loss": 0.6472, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.9299695491790771, "rewards/margins": 0.6388168334960938, "rewards/rejected": 1.2911527156829834, "step": 34390 }, { "epoch": 1.5971029295696177, "grad_norm": 6.909090518951416, "learning_rate": 2.3396319853908413e-07, "logits/chosen": -19.968534469604492, "logits/rejected": -18.72894287109375, "logps/chosen": -316.90582275390625, "logps/rejected": -244.6134490966797, "loss": 0.6114, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 
2.940734386444092, "rewards/margins": 0.6890982389450073, "rewards/rejected": 2.251636028289795, "step": 34400 }, { "epoch": 1.597567203677051, "grad_norm": 49.11710739135742, "learning_rate": 2.3388581952117864e-07, "logits/chosen": -18.276716232299805, "logits/rejected": -18.45132827758789, "logps/chosen": -310.0501708984375, "logps/rejected": -254.86532592773438, "loss": 0.5891, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.233030080795288, "rewards/margins": 0.6149342656135559, "rewards/rejected": 1.6180957555770874, "step": 34410 }, { "epoch": 1.598031477784484, "grad_norm": 194.02255249023438, "learning_rate": 2.338084405032731e-07, "logits/chosen": -18.809207916259766, "logits/rejected": -19.076086044311523, "logps/chosen": -411.7488708496094, "logps/rejected": -456.89495849609375, "loss": 1.2392, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.86752986907959, "rewards/margins": -0.7136695981025696, "rewards/rejected": 3.5811991691589355, "step": 34420 }, { "epoch": 1.598495751891917, "grad_norm": 38.97739028930664, "learning_rate": 2.3373106148536761e-07, "logits/chosen": -18.195728302001953, "logits/rejected": -17.817462921142578, "logps/chosen": -418.3349609375, "logps/rejected": -351.1429748535156, "loss": 0.5962, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.033847332000732, "rewards/margins": 0.6882456541061401, "rewards/rejected": 3.345602035522461, "step": 34430 }, { "epoch": 1.5989600259993502, "grad_norm": 2.1466164588928223, "learning_rate": 2.3365368246746212e-07, "logits/chosen": -20.489246368408203, "logits/rejected": -18.004497528076172, "logps/chosen": -394.801513671875, "logps/rejected": -248.4052276611328, "loss": 0.6275, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.554786205291748, "rewards/margins": 2.006298065185547, "rewards/rejected": 1.5484886169433594, "step": 34440 }, { "epoch": 1.599424300106783, "grad_norm": 10.509242057800293, "learning_rate": 2.335763034495566e-07, 
"logits/chosen": -18.56930923461914, "logits/rejected": -18.183916091918945, "logps/chosen": -298.20306396484375, "logps/rejected": -261.41607666015625, "loss": 1.2378, "rewards/accuracies": 0.5, "rewards/chosen": 2.279998779296875, "rewards/margins": -0.16532030701637268, "rewards/rejected": 2.445319175720215, "step": 34450 }, { "epoch": 1.5998885742142162, "grad_norm": 87.3580322265625, "learning_rate": 2.334989244316511e-07, "logits/chosen": -18.112377166748047, "logits/rejected": -18.996082305908203, "logps/chosen": -344.74493408203125, "logps/rejected": -470.42047119140625, "loss": 1.6117, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.981637477874756, "rewards/margins": -0.5220023989677429, "rewards/rejected": 4.503640174865723, "step": 34460 }, { "epoch": 1.6003528483216491, "grad_norm": 16.403274536132812, "learning_rate": 2.3342154541374558e-07, "logits/chosen": -19.452960968017578, "logits/rejected": -18.127159118652344, "logps/chosen": -362.4441223144531, "logps/rejected": -297.71124267578125, "loss": 0.2465, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.211475372314453, "rewards/margins": 2.16593599319458, "rewards/rejected": 2.045539379119873, "step": 34470 }, { "epoch": 1.6008171224290821, "grad_norm": 171.56053161621094, "learning_rate": 2.333441663958401e-07, "logits/chosen": -20.112430572509766, "logits/rejected": -18.927841186523438, "logps/chosen": -326.11431884765625, "logps/rejected": -243.15744018554688, "loss": 0.6566, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.078364133834839, "rewards/margins": 0.7697665691375732, "rewards/rejected": 1.3085976839065552, "step": 34480 }, { "epoch": 1.6012813965365151, "grad_norm": 200.2124786376953, "learning_rate": 2.332667873779346e-07, "logits/chosen": -18.90797233581543, "logits/rejected": -17.852163314819336, "logps/chosen": -342.19000244140625, "logps/rejected": -281.1813659667969, "loss": 0.7465, "rewards/accuracies": 0.800000011920929, 
"rewards/chosen": 3.725339412689209, "rewards/margins": 1.7608410120010376, "rewards/rejected": 1.96449875831604, "step": 34490 }, { "epoch": 1.6017456706439481, "grad_norm": 33.097110748291016, "learning_rate": 2.3318940836002909e-07, "logits/chosen": -19.235280990600586, "logits/rejected": -18.935806274414062, "logps/chosen": -349.72576904296875, "logps/rejected": -364.05548095703125, "loss": 0.8574, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.8273048400878906, "rewards/margins": 0.3288593590259552, "rewards/rejected": 2.498445510864258, "step": 34500 }, { "epoch": 1.6022099447513813, "grad_norm": 27.2225399017334, "learning_rate": 2.3311202934212357e-07, "logits/chosen": -19.979053497314453, "logits/rejected": -18.57416343688965, "logps/chosen": -362.5103454589844, "logps/rejected": -224.50357055664062, "loss": 0.3539, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.30932879447937, "rewards/margins": 1.5730899572372437, "rewards/rejected": 1.7362388372421265, "step": 34510 }, { "epoch": 1.6026742188588141, "grad_norm": 4.362222194671631, "learning_rate": 2.3303465032421806e-07, "logits/chosen": -19.54167938232422, "logits/rejected": -19.432390213012695, "logps/chosen": -421.14776611328125, "logps/rejected": -426.1072692871094, "loss": 0.7909, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4764626026153564, "rewards/margins": 0.9282127618789673, "rewards/rejected": 2.5482497215270996, "step": 34520 }, { "epoch": 1.6031384929662473, "grad_norm": 17.65001106262207, "learning_rate": 2.3295727130631257e-07, "logits/chosen": -19.51544761657715, "logits/rejected": -18.9822940826416, "logps/chosen": -402.56976318359375, "logps/rejected": -332.7185974121094, "loss": 0.6379, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.6432313919067383, "rewards/margins": 0.5407533645629883, "rewards/rejected": 2.10247802734375, "step": 34530 }, { "epoch": 1.6036027670736803, "grad_norm": 101.17971801757812, "learning_rate": 
2.3287989228840708e-07, "logits/chosen": -19.544946670532227, "logits/rejected": -18.9515323638916, "logps/chosen": -395.42791748046875, "logps/rejected": -303.8316345214844, "loss": 0.6369, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.3290131092071533, "rewards/margins": 1.1041823625564575, "rewards/rejected": 2.2248308658599854, "step": 34540 }, { "epoch": 1.6040670411811133, "grad_norm": 38.37931823730469, "learning_rate": 2.3280251327050156e-07, "logits/chosen": -18.280107498168945, "logits/rejected": -18.08331298828125, "logps/chosen": -255.35281372070312, "logps/rejected": -221.7258758544922, "loss": 0.4205, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.7723731994628906, "rewards/margins": 1.145047664642334, "rewards/rejected": 0.6273254156112671, "step": 34550 }, { "epoch": 1.6045313152885463, "grad_norm": 135.15188598632812, "learning_rate": 2.3272513425259605e-07, "logits/chosen": -20.32366180419922, "logits/rejected": -20.029621124267578, "logps/chosen": -534.49951171875, "logps/rejected": -440.59515380859375, "loss": 0.6223, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 5.4598846435546875, "rewards/margins": 1.9456886053085327, "rewards/rejected": 3.5141959190368652, "step": 34560 }, { "epoch": 1.6049955893959793, "grad_norm": 63.923126220703125, "learning_rate": 2.3264775523469053e-07, "logits/chosen": -19.442480087280273, "logits/rejected": -17.936830520629883, "logps/chosen": -499.6455993652344, "logps/rejected": -367.21807861328125, "loss": 0.3076, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.5665388107299805, "rewards/margins": 1.880581259727478, "rewards/rejected": 2.68595814704895, "step": 34570 }, { "epoch": 1.6054598635034125, "grad_norm": 53.871822357177734, "learning_rate": 2.3257037621678504e-07, "logits/chosen": -19.795955657958984, "logits/rejected": -18.606794357299805, "logps/chosen": -361.7015075683594, "logps/rejected": -289.2232666015625, "loss": 0.8442, "rewards/accuracies": 
0.6000000238418579, "rewards/chosen": 2.5426459312438965, "rewards/margins": 0.4056363105773926, "rewards/rejected": 2.137009382247925, "step": 34580 }, { "epoch": 1.6059241376108453, "grad_norm": 24.782251358032227, "learning_rate": 2.3249299719887956e-07, "logits/chosen": -18.744077682495117, "logits/rejected": -17.91002655029297, "logps/chosen": -364.72833251953125, "logps/rejected": -240.5463104248047, "loss": 0.3722, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.5171101093292236, "rewards/margins": 1.7991729974746704, "rewards/rejected": 1.7179371118545532, "step": 34590 }, { "epoch": 1.6063884117182785, "grad_norm": 85.52830505371094, "learning_rate": 2.3241561818097404e-07, "logits/chosen": -19.6756534576416, "logits/rejected": -18.548858642578125, "logps/chosen": -516.0194091796875, "logps/rejected": -418.94964599609375, "loss": 0.5406, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.165663719177246, "rewards/margins": 1.4964721202850342, "rewards/rejected": 2.6691911220550537, "step": 34600 }, { "epoch": 1.6068526858257115, "grad_norm": 73.44007110595703, "learning_rate": 2.3233823916306853e-07, "logits/chosen": -19.132129669189453, "logits/rejected": -18.686914443969727, "logps/chosen": -281.7053527832031, "logps/rejected": -313.73681640625, "loss": 1.0104, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.1574010848999023, "rewards/margins": -0.01903355121612549, "rewards/rejected": 2.1764347553253174, "step": 34610 }, { "epoch": 1.6073169599331445, "grad_norm": 178.79278564453125, "learning_rate": 2.32260860145163e-07, "logits/chosen": -18.74954605102539, "logits/rejected": -17.77602195739746, "logps/chosen": -510.49322509765625, "logps/rejected": -331.4045104980469, "loss": 0.353, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.385201454162598, "rewards/margins": 2.0991134643554688, "rewards/rejected": 2.286088466644287, "step": 34620 }, { "epoch": 1.6077812340405777, "grad_norm": 140.2081756591797, 
"learning_rate": 2.3218348112725752e-07, "logits/chosen": -18.865434646606445, "logits/rejected": -17.88058090209961, "logps/chosen": -394.05987548828125, "logps/rejected": -277.4867858886719, "loss": 0.2109, "rewards/accuracies": 1.0, "rewards/chosen": 3.1335110664367676, "rewards/margins": 1.9029521942138672, "rewards/rejected": 1.23055899143219, "step": 34630 }, { "epoch": 1.6082455081480105, "grad_norm": 119.06648254394531, "learning_rate": 2.3210610210935203e-07, "logits/chosen": -18.438467025756836, "logits/rejected": -17.520435333251953, "logps/chosen": -395.6065979003906, "logps/rejected": -298.56854248046875, "loss": 0.6563, "rewards/accuracies": 0.5, "rewards/chosen": 3.7572410106658936, "rewards/margins": 1.6341310739517212, "rewards/rejected": 2.1231095790863037, "step": 34640 }, { "epoch": 1.6087097822554437, "grad_norm": 169.9130401611328, "learning_rate": 2.3202872309144652e-07, "logits/chosen": -19.213205337524414, "logits/rejected": -19.269210815429688, "logps/chosen": -392.4779052734375, "logps/rejected": -388.13140869140625, "loss": 0.543, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.767606019973755, "rewards/margins": 1.0467265844345093, "rewards/rejected": 2.720879077911377, "step": 34650 }, { "epoch": 1.6091740563628767, "grad_norm": 0.26203668117523193, "learning_rate": 2.31951344073541e-07, "logits/chosen": -19.036060333251953, "logits/rejected": -17.656028747558594, "logps/chosen": -358.04669189453125, "logps/rejected": -242.4529266357422, "loss": 0.5716, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.793931007385254, "rewards/margins": 1.5009000301361084, "rewards/rejected": 1.2930309772491455, "step": 34660 }, { "epoch": 1.6096383304703097, "grad_norm": 3.9547362327575684, "learning_rate": 2.3187396505563551e-07, "logits/chosen": -19.4149169921875, "logits/rejected": -18.667400360107422, "logps/chosen": -453.3541564941406, "logps/rejected": -342.5113220214844, "loss": 0.2615, "rewards/accuracies": 
0.800000011920929, "rewards/chosen": 4.047418117523193, "rewards/margins": 2.110783100128174, "rewards/rejected": 1.9366346597671509, "step": 34670 }, { "epoch": 1.6101026045777427, "grad_norm": 50.302093505859375, "learning_rate": 2.3179658603773e-07, "logits/chosen": -18.194297790527344, "logits/rejected": -18.817861557006836, "logps/chosen": -300.35699462890625, "logps/rejected": -350.216064453125, "loss": 0.7788, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.006865978240967, "rewards/margins": 0.3496198058128357, "rewards/rejected": 2.6572461128234863, "step": 34680 }, { "epoch": 1.6105668786851757, "grad_norm": 54.38840866088867, "learning_rate": 2.317192070198245e-07, "logits/chosen": -20.896684646606445, "logits/rejected": -19.105518341064453, "logps/chosen": -390.7574768066406, "logps/rejected": -238.1048583984375, "loss": 0.1737, "rewards/accuracies": 1.0, "rewards/chosen": 4.803796768188477, "rewards/margins": 3.069727659225464, "rewards/rejected": 1.7340691089630127, "step": 34690 }, { "epoch": 1.611031152792609, "grad_norm": 32.20875930786133, "learning_rate": 2.3164182800191897e-07, "logits/chosen": -19.170841217041016, "logits/rejected": -19.35944366455078, "logps/chosen": -278.41162109375, "logps/rejected": -310.7410888671875, "loss": 1.135, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.7328615188598633, "rewards/margins": -0.044440388679504395, "rewards/rejected": 2.7773022651672363, "step": 34700 }, { "epoch": 1.6114954269000417, "grad_norm": 231.56878662109375, "learning_rate": 2.3156444898401348e-07, "logits/chosen": -20.131595611572266, "logits/rejected": -19.133487701416016, "logps/chosen": -404.40478515625, "logps/rejected": -277.1224060058594, "loss": 0.6383, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.374706268310547, "rewards/margins": 1.7053451538085938, "rewards/rejected": 1.6693613529205322, "step": 34710 }, { "epoch": 1.6119597010074749, "grad_norm": 45.54719161987305, "learning_rate": 
2.31487069966108e-07, "logits/chosen": -19.126134872436523, "logits/rejected": -18.47797393798828, "logps/chosen": -465.72796630859375, "logps/rejected": -382.423095703125, "loss": 0.7209, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.63469934463501, "rewards/margins": 1.5019452571868896, "rewards/rejected": 3.13275408744812, "step": 34720 }, { "epoch": 1.6124239751149079, "grad_norm": 115.87139129638672, "learning_rate": 2.3140969094820248e-07, "logits/chosen": -18.45970916748047, "logits/rejected": -18.483474731445312, "logps/chosen": -489.04095458984375, "logps/rejected": -445.0970764160156, "loss": 1.5225, "rewards/accuracies": 0.5, "rewards/chosen": 4.115154266357422, "rewards/margins": -0.5425357222557068, "rewards/rejected": 4.657690525054932, "step": 34730 }, { "epoch": 1.6128882492223409, "grad_norm": 25.605575561523438, "learning_rate": 2.31332311930297e-07, "logits/chosen": -19.42877769470215, "logits/rejected": -18.110042572021484, "logps/chosen": -419.6827087402344, "logps/rejected": -306.76654052734375, "loss": 0.4441, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.380049705505371, "rewards/margins": 1.8493061065673828, "rewards/rejected": 2.53074312210083, "step": 34740 }, { "epoch": 1.6133525233297739, "grad_norm": 49.15605163574219, "learning_rate": 2.3125493291239145e-07, "logits/chosen": -18.938570022583008, "logits/rejected": -19.25948715209961, "logps/chosen": -350.0743713378906, "logps/rejected": -392.45361328125, "loss": 0.8613, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.4725587368011475, "rewards/margins": -0.049367617815732956, "rewards/rejected": 1.5219262838363647, "step": 34750 }, { "epoch": 1.6138167974372069, "grad_norm": 0.18453703820705414, "learning_rate": 2.3117755389448596e-07, "logits/chosen": -19.609359741210938, "logits/rejected": -18.01857566833496, "logps/chosen": -439.373291015625, "logps/rejected": -313.8484191894531, "loss": 0.7387, "rewards/accuracies": 0.699999988079071, 
"rewards/chosen": 4.017805576324463, "rewards/margins": 1.6729061603546143, "rewards/rejected": 2.3448989391326904, "step": 34760 }, { "epoch": 1.61428107154464, "grad_norm": 49.449790954589844, "learning_rate": 2.3110017487658047e-07, "logits/chosen": -19.24288558959961, "logits/rejected": -18.42726707458496, "logps/chosen": -284.33331298828125, "logps/rejected": -235.8941192626953, "loss": 0.29, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.3620877265930176, "rewards/margins": 1.702755331993103, "rewards/rejected": 1.659332275390625, "step": 34770 }, { "epoch": 1.6147453456520728, "grad_norm": 64.45340728759766, "learning_rate": 2.3102279585867495e-07, "logits/chosen": -18.67601203918457, "logits/rejected": -18.790246963500977, "logps/chosen": -316.94183349609375, "logps/rejected": -324.164306640625, "loss": 1.3355, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.0131778717041016, "rewards/margins": 0.38006511330604553, "rewards/rejected": 2.633112668991089, "step": 34780 }, { "epoch": 1.615209619759506, "grad_norm": 87.15409851074219, "learning_rate": 2.3094541684076946e-07, "logits/chosen": -20.76248550415039, "logits/rejected": -19.324172973632812, "logps/chosen": -459.99755859375, "logps/rejected": -299.244140625, "loss": 0.206, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.9432575702667236, "rewards/margins": 2.1726126670837402, "rewards/rejected": 1.7706451416015625, "step": 34790 }, { "epoch": 1.615673893866939, "grad_norm": 213.07763671875, "learning_rate": 2.3086803782286392e-07, "logits/chosen": -19.615955352783203, "logits/rejected": -18.753442764282227, "logps/chosen": -327.4750061035156, "logps/rejected": -319.1328125, "loss": 1.179, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.392829179763794, "rewards/margins": 0.5623500943183899, "rewards/rejected": 2.8304784297943115, "step": 34800 }, { "epoch": 1.616138167974372, "grad_norm": 96.14041900634766, "learning_rate": 2.3079065880495843e-07, 
"logits/chosen": -19.117080688476562, "logits/rejected": -19.38494873046875, "logps/chosen": -512.0673828125, "logps/rejected": -518.1853637695312, "loss": 1.2183, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.8401269912719727, "rewards/margins": -0.734806478023529, "rewards/rejected": 3.5749332904815674, "step": 34810 }, { "epoch": 1.6166024420818053, "grad_norm": 245.62014770507812, "learning_rate": 2.3071327978705295e-07, "logits/chosen": -18.181468963623047, "logits/rejected": -18.236751556396484, "logps/chosen": -349.2927551269531, "logps/rejected": -342.6699523925781, "loss": 1.1273, "rewards/accuracies": 0.5, "rewards/chosen": 3.479036331176758, "rewards/margins": 0.6455658078193665, "rewards/rejected": 2.8334708213806152, "step": 34820 }, { "epoch": 1.617066716189238, "grad_norm": 20.849803924560547, "learning_rate": 2.3063590076914743e-07, "logits/chosen": -20.31807518005371, "logits/rejected": -19.075286865234375, "logps/chosen": -467.44696044921875, "logps/rejected": -381.05426025390625, "loss": 0.3213, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.261227607727051, "rewards/margins": 2.019730806350708, "rewards/rejected": 3.2414965629577637, "step": 34830 }, { "epoch": 1.6175309902966712, "grad_norm": 9.779383659362793, "learning_rate": 2.3055852175124194e-07, "logits/chosen": -19.018871307373047, "logits/rejected": -18.399768829345703, "logps/chosen": -388.5712585449219, "logps/rejected": -344.4266662597656, "loss": 0.969, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.034730434417725, "rewards/margins": 0.9254870414733887, "rewards/rejected": 3.109243869781494, "step": 34840 }, { "epoch": 1.617995264404104, "grad_norm": 83.71353912353516, "learning_rate": 2.304811427333364e-07, "logits/chosen": -19.298023223876953, "logits/rejected": -18.52786636352539, "logps/chosen": -509.0647888183594, "logps/rejected": -388.4067687988281, "loss": 0.7619, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 
4.043233394622803, "rewards/margins": 1.192455768585205, "rewards/rejected": 2.8507776260375977, "step": 34850 }, { "epoch": 1.6184595385115372, "grad_norm": 132.17469787597656, "learning_rate": 2.304037637154309e-07, "logits/chosen": -19.11179542541504, "logits/rejected": -19.555021286010742, "logps/chosen": -387.6945495605469, "logps/rejected": -378.3319091796875, "loss": 1.1126, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.797367572784424, "rewards/margins": 0.4567849636077881, "rewards/rejected": 3.3405826091766357, "step": 34860 }, { "epoch": 1.6189238126189702, "grad_norm": 0.5388631820678711, "learning_rate": 2.3032638469752542e-07, "logits/chosen": -18.969707489013672, "logits/rejected": -18.56302261352539, "logps/chosen": -387.79461669921875, "logps/rejected": -304.410888671875, "loss": 0.507, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.2190752029418945, "rewards/margins": 1.8477976322174072, "rewards/rejected": 2.371277332305908, "step": 34870 }, { "epoch": 1.6193880867264032, "grad_norm": 105.60640716552734, "learning_rate": 2.302490056796199e-07, "logits/chosen": -19.32849884033203, "logits/rejected": -18.326519012451172, "logps/chosen": -393.40667724609375, "logps/rejected": -266.02642822265625, "loss": 0.3742, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.84167218208313, "rewards/margins": 1.7031335830688477, "rewards/rejected": 2.1385388374328613, "step": 34880 }, { "epoch": 1.6198523608338364, "grad_norm": 35.41490936279297, "learning_rate": 2.301716266617144e-07, "logits/chosen": -18.22573471069336, "logits/rejected": -17.547914505004883, "logps/chosen": -395.3349609375, "logps/rejected": -324.6407470703125, "loss": 0.3748, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.2883384227752686, "rewards/margins": 1.5365240573883057, "rewards/rejected": 1.751814603805542, "step": 34890 }, { "epoch": 1.6203166349412692, "grad_norm": 32.036415100097656, "learning_rate": 2.3009424764380888e-07, 
"logits/chosen": -19.111547470092773, "logits/rejected": -18.30694007873535, "logps/chosen": -365.75482177734375, "logps/rejected": -260.21636962890625, "loss": 0.41, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6475138664245605, "rewards/margins": 1.8681771755218506, "rewards/rejected": 1.779336929321289, "step": 34900 }, { "epoch": 1.6207809090487024, "grad_norm": 103.775390625, "learning_rate": 2.300168686259034e-07, "logits/chosen": -18.92945671081543, "logits/rejected": -17.65256118774414, "logps/chosen": -414.91748046875, "logps/rejected": -311.92608642578125, "loss": 0.516, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.5200676918029785, "rewards/margins": 0.9141233563423157, "rewards/rejected": 1.605944275856018, "step": 34910 }, { "epoch": 1.6212451831561354, "grad_norm": 15.156954765319824, "learning_rate": 2.299394896079979e-07, "logits/chosen": -19.992963790893555, "logits/rejected": -19.292753219604492, "logps/chosen": -412.64813232421875, "logps/rejected": -380.3089599609375, "loss": 0.7296, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.1135430335998535, "rewards/margins": 0.6778503656387329, "rewards/rejected": 2.4356932640075684, "step": 34920 }, { "epoch": 1.6217094572635684, "grad_norm": 47.75934982299805, "learning_rate": 2.2986211059009238e-07, "logits/chosen": -18.200868606567383, "logits/rejected": -18.767047882080078, "logps/chosen": -309.904541015625, "logps/rejected": -366.2353210449219, "loss": 1.512, "rewards/accuracies": 0.5, "rewards/chosen": 2.2190189361572266, "rewards/margins": -0.5770694017410278, "rewards/rejected": 2.796088218688965, "step": 34930 }, { "epoch": 1.6221737313710014, "grad_norm": 52.622596740722656, "learning_rate": 2.2978473157218687e-07, "logits/chosen": -19.816675186157227, "logits/rejected": -18.234521865844727, "logps/chosen": -389.2904968261719, "logps/rejected": -287.3320007324219, "loss": 0.7991, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 
3.4239895343780518, "rewards/margins": 1.4221198558807373, "rewards/rejected": 2.0018696784973145, "step": 34940 }, { "epoch": 1.6226380054784344, "grad_norm": 51.82897186279297, "learning_rate": 2.2970735255428135e-07, "logits/chosen": -18.732948303222656, "logits/rejected": -17.94440269470215, "logps/chosen": -330.79888916015625, "logps/rejected": -266.7808532714844, "loss": 0.5975, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.8366613388061523, "rewards/margins": 1.0184465646743774, "rewards/rejected": 1.818214774131775, "step": 34950 }, { "epoch": 1.6231022795858676, "grad_norm": 62.796382904052734, "learning_rate": 2.2962997353637586e-07, "logits/chosen": -19.25497817993164, "logits/rejected": -19.009859085083008, "logps/chosen": -364.67877197265625, "logps/rejected": -318.32928466796875, "loss": 0.4507, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.999145746231079, "rewards/margins": 1.1756852865219116, "rewards/rejected": 1.8234599828720093, "step": 34960 }, { "epoch": 1.6235665536933004, "grad_norm": 188.841796875, "learning_rate": 2.2955259451847038e-07, "logits/chosen": -18.820865631103516, "logits/rejected": -18.388479232788086, "logps/chosen": -371.5177917480469, "logps/rejected": -305.7752380371094, "loss": 1.1989, "rewards/accuracies": 0.5, "rewards/chosen": 3.2038235664367676, "rewards/margins": 0.9494507908821106, "rewards/rejected": 2.2543725967407227, "step": 34970 }, { "epoch": 1.6240308278007336, "grad_norm": 2.482044219970703, "learning_rate": 2.2947521550056486e-07, "logits/chosen": -19.636865615844727, "logits/rejected": -18.042322158813477, "logps/chosen": -339.8609619140625, "logps/rejected": -219.83334350585938, "loss": 0.4945, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1356985569000244, "rewards/margins": 2.0831151008605957, "rewards/rejected": 1.0525829792022705, "step": 34980 }, { "epoch": 1.6244951019081666, "grad_norm": 127.37923431396484, "learning_rate": 2.2939783648265935e-07, 
"logits/chosen": -19.762775421142578, "logits/rejected": -19.001834869384766, "logps/chosen": -425.29498291015625, "logps/rejected": -345.3815002441406, "loss": 0.703, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.0763423442840576, "rewards/margins": 0.8851093053817749, "rewards/rejected": 2.1912331581115723, "step": 34990 }, { "epoch": 1.6249593760155996, "grad_norm": 18.49428367614746, "learning_rate": 2.2932045746475383e-07, "logits/chosen": -19.652191162109375, "logits/rejected": -19.132936477661133, "logps/chosen": -470.6942443847656, "logps/rejected": -397.70648193359375, "loss": 0.398, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.731298446655273, "rewards/margins": 2.1974949836730957, "rewards/rejected": 2.5338034629821777, "step": 35000 }, { "epoch": 1.6254236501230328, "grad_norm": 147.142333984375, "learning_rate": 2.2924307844684834e-07, "logits/chosen": -19.19632339477539, "logits/rejected": -18.38974380493164, "logps/chosen": -393.81610107421875, "logps/rejected": -302.9716796875, "loss": 0.7729, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.38299560546875, "rewards/margins": 0.899908185005188, "rewards/rejected": 2.4830873012542725, "step": 35010 }, { "epoch": 1.6258879242304656, "grad_norm": 45.309608459472656, "learning_rate": 2.2916569942894285e-07, "logits/chosen": -20.266666412353516, "logits/rejected": -19.215940475463867, "logps/chosen": -371.89141845703125, "logps/rejected": -268.27679443359375, "loss": 0.2905, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.103708744049072, "rewards/margins": 1.6930471658706665, "rewards/rejected": 2.410661220550537, "step": 35020 }, { "epoch": 1.6263521983378988, "grad_norm": 15.428861618041992, "learning_rate": 2.2908832041103734e-07, "logits/chosen": -19.14399528503418, "logits/rejected": -18.505619049072266, "logps/chosen": -421.8312072753906, "logps/rejected": -472.68170166015625, "loss": 1.0293, "rewards/accuracies": 0.6000000238418579, 
"rewards/chosen": 3.847795009613037, "rewards/margins": 0.568246066570282, "rewards/rejected": 3.2795491218566895, "step": 35030 }, { "epoch": 1.6268164724453316, "grad_norm": 0.8547976016998291, "learning_rate": 2.2901094139313182e-07, "logits/chosen": -18.316211700439453, "logits/rejected": -17.6021785736084, "logps/chosen": -436.9525451660156, "logps/rejected": -314.1694030761719, "loss": 0.4736, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.630884885787964, "rewards/margins": 1.7432832717895508, "rewards/rejected": 1.887601613998413, "step": 35040 }, { "epoch": 1.6272807465527648, "grad_norm": 52.96958541870117, "learning_rate": 2.289335623752263e-07, "logits/chosen": -19.30831527709961, "logits/rejected": -19.054792404174805, "logps/chosen": -404.1130065917969, "logps/rejected": -383.92669677734375, "loss": 0.6631, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.074965953826904, "rewards/margins": 0.43787717819213867, "rewards/rejected": 3.6370887756347656, "step": 35050 }, { "epoch": 1.6277450206601978, "grad_norm": 100.17134094238281, "learning_rate": 2.2885618335732082e-07, "logits/chosen": -20.377717971801758, "logits/rejected": -19.13614273071289, "logps/chosen": -547.4603271484375, "logps/rejected": -441.26239013671875, "loss": 0.3471, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.035723686218262, "rewards/margins": 1.6772855520248413, "rewards/rejected": 3.358438014984131, "step": 35060 }, { "epoch": 1.6282092947676308, "grad_norm": 65.22318267822266, "learning_rate": 2.2877880433941533e-07, "logits/chosen": -20.771629333496094, "logits/rejected": -19.7435245513916, "logps/chosen": -476.595703125, "logps/rejected": -394.8609313964844, "loss": 0.6411, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.155073642730713, "rewards/margins": 1.3670194149017334, "rewards/rejected": 2.7880542278289795, "step": 35070 }, { "epoch": 1.628673568875064, "grad_norm": 182.02708435058594, "learning_rate": 
2.2870142532150982e-07, "logits/chosen": -18.734134674072266, "logits/rejected": -17.2960205078125, "logps/chosen": -339.8819274902344, "logps/rejected": -261.79998779296875, "loss": 0.5916, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.5857014656066895, "rewards/margins": 1.0409033298492432, "rewards/rejected": 1.5447981357574463, "step": 35080 }, { "epoch": 1.6291378429824968, "grad_norm": 107.83368682861328, "learning_rate": 2.286240463036043e-07, "logits/chosen": -18.811140060424805, "logits/rejected": -18.739410400390625, "logps/chosen": -392.93243408203125, "logps/rejected": -376.26690673828125, "loss": 1.0824, "rewards/accuracies": 0.5, "rewards/chosen": 3.675260066986084, "rewards/margins": 0.35647183656692505, "rewards/rejected": 3.3187880516052246, "step": 35090 }, { "epoch": 1.62960211708993, "grad_norm": 148.59140014648438, "learning_rate": 2.2854666728569878e-07, "logits/chosen": -18.841814041137695, "logits/rejected": -18.97385025024414, "logps/chosen": -370.243896484375, "logps/rejected": -393.77752685546875, "loss": 0.9368, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.3787593841552734, "rewards/margins": 0.2785455286502838, "rewards/rejected": 2.1002135276794434, "step": 35100 }, { "epoch": 1.630066391197363, "grad_norm": 24.631282806396484, "learning_rate": 2.284692882677933e-07, "logits/chosen": -18.085542678833008, "logits/rejected": -17.211835861206055, "logps/chosen": -413.6932678222656, "logps/rejected": -245.4801025390625, "loss": 0.6475, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.5209860801696777, "rewards/margins": 1.1403303146362305, "rewards/rejected": 1.3806558847427368, "step": 35110 }, { "epoch": 1.630530665304796, "grad_norm": 17.4443359375, "learning_rate": 2.283919092498878e-07, "logits/chosen": -19.527132034301758, "logits/rejected": -18.50381088256836, "logps/chosen": -328.4378356933594, "logps/rejected": -298.3360595703125, "loss": 0.3546, "rewards/accuracies": 0.8999999761581421, 
"rewards/chosen": 3.446901798248291, "rewards/margins": 1.3874608278274536, "rewards/rejected": 2.059440851211548, "step": 35120 }, { "epoch": 1.630994939412229, "grad_norm": 73.4533462524414, "learning_rate": 2.2831453023198227e-07, "logits/chosen": -19.940292358398438, "logits/rejected": -19.386140823364258, "logps/chosen": -360.3889465332031, "logps/rejected": -369.22900390625, "loss": 0.5563, "rewards/accuracies": 0.5, "rewards/chosen": 3.33263897895813, "rewards/margins": 0.8991206884384155, "rewards/rejected": 2.433518409729004, "step": 35130 }, { "epoch": 1.631459213519662, "grad_norm": 3.5207221508026123, "learning_rate": 2.2823715121407678e-07, "logits/chosen": -18.536468505859375, "logits/rejected": -19.292316436767578, "logps/chosen": -254.99313354492188, "logps/rejected": -329.68670654296875, "loss": 1.3372, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": 2.0913772583007812, "rewards/margins": -0.5194705128669739, "rewards/rejected": 2.6108477115631104, "step": 35140 }, { "epoch": 1.6319234876270952, "grad_norm": 19.636276245117188, "learning_rate": 2.2815977219617126e-07, "logits/chosen": -19.329240798950195, "logits/rejected": -18.450786590576172, "logps/chosen": -351.35369873046875, "logps/rejected": -195.11166381835938, "loss": 1.0562, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.2710914611816406, "rewards/margins": 1.0521172285079956, "rewards/rejected": 1.218974232673645, "step": 35150 }, { "epoch": 1.632387761734528, "grad_norm": 101.46916198730469, "learning_rate": 2.2808239317826577e-07, "logits/chosen": -19.66413688659668, "logits/rejected": -19.80288314819336, "logps/chosen": -301.5518798828125, "logps/rejected": -319.3148498535156, "loss": 1.196, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.3330740928649902, "rewards/margins": -0.11577565968036652, "rewards/rejected": 2.4488494396209717, "step": 35160 }, { "epoch": 1.6328520358419611, "grad_norm": 34.34040069580078, "learning_rate": 
2.2800501416036028e-07, "logits/chosen": -17.6994686126709, "logits/rejected": -17.413318634033203, "logps/chosen": -282.3418884277344, "logps/rejected": -210.66427612304688, "loss": 0.3573, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.3453874588012695, "rewards/margins": 1.3207290172576904, "rewards/rejected": 1.0246585607528687, "step": 35170 }, { "epoch": 1.6333163099493941, "grad_norm": 21.131206512451172, "learning_rate": 2.2792763514245474e-07, "logits/chosen": -18.26150131225586, "logits/rejected": -18.272356033325195, "logps/chosen": -343.9833984375, "logps/rejected": -286.79754638671875, "loss": 1.0115, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.959181547164917, "rewards/margins": 0.0746951550245285, "rewards/rejected": 1.8844865560531616, "step": 35180 }, { "epoch": 1.6337805840568271, "grad_norm": 89.61807250976562, "learning_rate": 2.2785025612454925e-07, "logits/chosen": -18.439544677734375, "logits/rejected": -18.76750946044922, "logps/chosen": -387.2398376464844, "logps/rejected": -493.0941467285156, "loss": 0.8206, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.748523235321045, "rewards/margins": 0.5441159009933472, "rewards/rejected": 3.204407215118408, "step": 35190 }, { "epoch": 1.6342448581642601, "grad_norm": 55.5389289855957, "learning_rate": 2.2777287710664374e-07, "logits/chosen": -18.67257308959961, "logits/rejected": -18.192190170288086, "logps/chosen": -336.92535400390625, "logps/rejected": -243.6123046875, "loss": 0.8746, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.599426746368408, "rewards/margins": 0.6907540559768677, "rewards/rejected": 1.908672571182251, "step": 35200 }, { "epoch": 1.6347091322716931, "grad_norm": 3.981740713119507, "learning_rate": 2.2769549808873825e-07, "logits/chosen": -19.217605590820312, "logits/rejected": -18.0358943939209, "logps/chosen": -432.7909240722656, "logps/rejected": -289.5056457519531, "loss": 0.7248, "rewards/accuracies": 
0.699999988079071, "rewards/chosen": 3.0857434272766113, "rewards/margins": 1.3985811471939087, "rewards/rejected": 1.6871620416641235, "step": 35210 }, { "epoch": 1.6351734063791263, "grad_norm": 32.984291076660156, "learning_rate": 2.2761811907083276e-07, "logits/chosen": -19.666807174682617, "logits/rejected": -18.604101181030273, "logps/chosen": -512.6805419921875, "logps/rejected": -418.53466796875, "loss": 0.5426, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.051945686340332, "rewards/margins": 1.3524235486984253, "rewards/rejected": 2.699521780014038, "step": 35220 }, { "epoch": 1.635637680486559, "grad_norm": 55.40806579589844, "learning_rate": 2.2754074005292722e-07, "logits/chosen": -19.436914443969727, "logits/rejected": -18.155475616455078, "logps/chosen": -462.92449951171875, "logps/rejected": -366.1409606933594, "loss": 0.8769, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.8167595863342285, "rewards/margins": 1.1463634967803955, "rewards/rejected": 2.670396327972412, "step": 35230 }, { "epoch": 1.6361019545939923, "grad_norm": 22.38697052001953, "learning_rate": 2.2746336103502173e-07, "logits/chosen": -19.221439361572266, "logits/rejected": -18.703933715820312, "logps/chosen": -302.0284118652344, "logps/rejected": -250.5421142578125, "loss": 1.0199, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.9558823108673096, "rewards/margins": 0.23163798451423645, "rewards/rejected": 1.7242443561553955, "step": 35240 }, { "epoch": 1.6365662287014253, "grad_norm": 104.23566436767578, "learning_rate": 2.2738598201711622e-07, "logits/chosen": -19.65040397644043, "logits/rejected": -19.265026092529297, "logps/chosen": -276.53363037109375, "logps/rejected": -231.53573608398438, "loss": 0.9707, "rewards/accuracies": 0.5, "rewards/chosen": 1.6488231420516968, "rewards/margins": 0.29544150829315186, "rewards/rejected": 1.3533817529678345, "step": 35250 }, { "epoch": 1.6370305028088583, "grad_norm": 0.8534368276596069, 
"learning_rate": 2.2730860299921073e-07, "logits/chosen": -18.696109771728516, "logits/rejected": -17.80714988708496, "logps/chosen": -336.8827819824219, "logps/rejected": -253.1851348876953, "loss": 0.2877, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.691410779953003, "rewards/margins": 1.9358608722686768, "rewards/rejected": 1.7555497884750366, "step": 35260 }, { "epoch": 1.6374947769162915, "grad_norm": 260.58148193359375, "learning_rate": 2.2723122398130524e-07, "logits/chosen": -19.094274520874023, "logits/rejected": -19.95616912841797, "logps/chosen": -467.64190673828125, "logps/rejected": -485.22808837890625, "loss": 1.1096, "rewards/accuracies": 0.5, "rewards/chosen": 4.289233684539795, "rewards/margins": 0.23661425709724426, "rewards/rejected": 4.052619457244873, "step": 35270 }, { "epoch": 1.6379590510237243, "grad_norm": 2.7742884159088135, "learning_rate": 2.271538449633997e-07, "logits/chosen": -18.757831573486328, "logits/rejected": -17.62132453918457, "logps/chosen": -387.5865783691406, "logps/rejected": -258.4063720703125, "loss": 0.466, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.759511709213257, "rewards/margins": 1.285131812095642, "rewards/rejected": 1.4743797779083252, "step": 35280 }, { "epoch": 1.6384233251311575, "grad_norm": 17.79293441772461, "learning_rate": 2.270764659454942e-07, "logits/chosen": -18.197872161865234, "logits/rejected": -18.4234676361084, "logps/chosen": -234.79443359375, "logps/rejected": -292.1028747558594, "loss": 0.9822, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.9929144382476807, "rewards/margins": 0.5130147933959961, "rewards/rejected": 1.4798994064331055, "step": 35290 }, { "epoch": 1.6388875992385905, "grad_norm": 40.60113525390625, "learning_rate": 2.2699908692758872e-07, "logits/chosen": -18.764293670654297, "logits/rejected": -18.921627044677734, "logps/chosen": -367.140869140625, "logps/rejected": -364.48870849609375, "loss": 0.8069, "rewards/accuracies": 
0.6000000238418579, "rewards/chosen": 2.2618837356567383, "rewards/margins": -0.04688648134469986, "rewards/rejected": 2.3087704181671143, "step": 35300 }, { "epoch": 1.6393518733460235, "grad_norm": 1.3044931888580322, "learning_rate": 2.269217079096832e-07, "logits/chosen": -18.51070785522461, "logits/rejected": -17.668664932250977, "logps/chosen": -314.71136474609375, "logps/rejected": -252.6059112548828, "loss": 0.6209, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.340178966522217, "rewards/margins": 1.8303794860839844, "rewards/rejected": 1.5097999572753906, "step": 35310 }, { "epoch": 1.6398161474534565, "grad_norm": 46.250030517578125, "learning_rate": 2.2684432889177772e-07, "logits/chosen": -18.964391708374023, "logits/rejected": -17.223878860473633, "logps/chosen": -399.2516174316406, "logps/rejected": -283.21453857421875, "loss": 0.7367, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.981459617614746, "rewards/margins": 1.4383612871170044, "rewards/rejected": 1.5430984497070312, "step": 35320 }, { "epoch": 1.6402804215608895, "grad_norm": 7.680467128753662, "learning_rate": 2.2676694987387217e-07, "logits/chosen": -18.83933448791504, "logits/rejected": -17.583663940429688, "logps/chosen": -487.1044921875, "logps/rejected": -335.70831298828125, "loss": 0.5483, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.308088302612305, "rewards/margins": 1.8268101215362549, "rewards/rejected": 2.4812779426574707, "step": 35330 }, { "epoch": 1.6407446956683227, "grad_norm": 161.0398712158203, "learning_rate": 2.2668957085596669e-07, "logits/chosen": -19.07855224609375, "logits/rejected": -19.17035484313965, "logps/chosen": -523.2598876953125, "logps/rejected": -437.8212890625, "loss": 0.9054, "rewards/accuracies": 0.5, "rewards/chosen": 3.5391860008239746, "rewards/margins": 0.16749151051044464, "rewards/rejected": 3.371695041656494, "step": 35340 }, { "epoch": 1.6412089697757555, "grad_norm": 20.9194278717041, "learning_rate": 
2.266121918380612e-07, "logits/chosen": -19.0539608001709, "logits/rejected": -18.332332611083984, "logps/chosen": -480.85400390625, "logps/rejected": -372.5166931152344, "loss": 0.3944, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.102086067199707, "rewards/margins": 1.7700462341308594, "rewards/rejected": 2.3320393562316895, "step": 35350 }, { "epoch": 1.6416732438831887, "grad_norm": 8.85448932647705, "learning_rate": 2.2653481282015568e-07, "logits/chosen": -19.77711296081543, "logits/rejected": -18.922529220581055, "logps/chosen": -351.5064697265625, "logps/rejected": -341.6529541015625, "loss": 0.688, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.3002090454101562, "rewards/margins": 0.829388439655304, "rewards/rejected": 2.470820426940918, "step": 35360 }, { "epoch": 1.6421375179906217, "grad_norm": 11.499796867370605, "learning_rate": 2.2645743380225017e-07, "logits/chosen": -19.635168075561523, "logits/rejected": -19.410985946655273, "logps/chosen": -440.7662048339844, "logps/rejected": -383.8070068359375, "loss": 0.8881, "rewards/accuracies": 0.5, "rewards/chosen": 3.8671059608459473, "rewards/margins": 0.2172672003507614, "rewards/rejected": 3.649838924407959, "step": 35370 }, { "epoch": 1.6426017920980547, "grad_norm": 19.698205947875977, "learning_rate": 2.2638005478434465e-07, "logits/chosen": -18.119680404663086, "logits/rejected": -17.778549194335938, "logps/chosen": -312.71124267578125, "logps/rejected": -262.1099853515625, "loss": 0.6701, "rewards/accuracies": 0.5, "rewards/chosen": 2.262897253036499, "rewards/margins": 0.7947046160697937, "rewards/rejected": 1.46819269657135, "step": 35380 }, { "epoch": 1.6430660662054877, "grad_norm": 2.253690004348755, "learning_rate": 2.2630267576643916e-07, "logits/chosen": -20.021949768066406, "logits/rejected": -18.904800415039062, "logps/chosen": -476.3831481933594, "logps/rejected": -289.9039306640625, "loss": 0.7224, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 
3.658625364303589, "rewards/margins": 1.641108751296997, "rewards/rejected": 2.017516851425171, "step": 35390 }, { "epoch": 1.6435303403129207, "grad_norm": 132.8892364501953, "learning_rate": 2.2622529674853367e-07, "logits/chosen": -19.621898651123047, "logits/rejected": -19.04384994506836, "logps/chosen": -480.1058654785156, "logps/rejected": -385.04327392578125, "loss": 0.4615, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0876989364624023, "rewards/margins": 0.9921863675117493, "rewards/rejected": 2.0955123901367188, "step": 35400 }, { "epoch": 1.6439946144203539, "grad_norm": 44.523216247558594, "learning_rate": 2.2614791773062816e-07, "logits/chosen": -17.97014045715332, "logits/rejected": -17.599084854125977, "logps/chosen": -316.6424255371094, "logps/rejected": -292.4051818847656, "loss": 0.7838, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.3666439056396484, "rewards/margins": 0.4022413194179535, "rewards/rejected": 1.964402437210083, "step": 35410 }, { "epoch": 1.6444588885277867, "grad_norm": 116.14939880371094, "learning_rate": 2.2607053871272264e-07, "logits/chosen": -19.076946258544922, "logits/rejected": -18.501140594482422, "logps/chosen": -467.114013671875, "logps/rejected": -400.35821533203125, "loss": 0.6695, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4158637523651123, "rewards/margins": 1.06623113155365, "rewards/rejected": 2.349632501602173, "step": 35420 }, { "epoch": 1.6449231626352199, "grad_norm": 95.78638458251953, "learning_rate": 2.2599315969481713e-07, "logits/chosen": -18.9550838470459, "logits/rejected": -18.272804260253906, "logps/chosen": -395.2988586425781, "logps/rejected": -266.93780517578125, "loss": 0.4708, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.164198637008667, "rewards/margins": 0.9461292028427124, "rewards/rejected": 1.218069314956665, "step": 35430 }, { "epoch": 1.6453874367426529, "grad_norm": 98.95767211914062, "learning_rate": 
2.2591578067691164e-07, "logits/chosen": -20.833925247192383, "logits/rejected": -19.772842407226562, "logps/chosen": -428.595703125, "logps/rejected": -314.6382751464844, "loss": 0.4739, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0283830165863037, "rewards/margins": 1.3549431562423706, "rewards/rejected": 1.673439621925354, "step": 35440 }, { "epoch": 1.6458517108500859, "grad_norm": 7.918117523193359, "learning_rate": 2.2583840165900615e-07, "logits/chosen": -19.144399642944336, "logits/rejected": -17.797842025756836, "logps/chosen": -505.42120361328125, "logps/rejected": -316.2107238769531, "loss": 0.3239, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.163902759552002, "rewards/margins": 1.9268242120742798, "rewards/rejected": 2.2370784282684326, "step": 35450 }, { "epoch": 1.646315984957519, "grad_norm": 118.7330093383789, "learning_rate": 2.2576102264110064e-07, "logits/chosen": -18.43500328063965, "logits/rejected": -18.225601196289062, "logps/chosen": -320.5276794433594, "logps/rejected": -343.3287658691406, "loss": 1.2849, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.959812641143799, "rewards/margins": 0.1572170853614807, "rewards/rejected": 2.802595615386963, "step": 35460 }, { "epoch": 1.6467802590649518, "grad_norm": 212.969482421875, "learning_rate": 2.2568364362319512e-07, "logits/chosen": -19.969745635986328, "logits/rejected": -18.74479866027832, "logps/chosen": -324.648681640625, "logps/rejected": -299.4382629394531, "loss": 0.646, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.34499454498291, "rewards/margins": 1.4426757097244263, "rewards/rejected": 2.9023189544677734, "step": 35470 }, { "epoch": 1.647244533172385, "grad_norm": 112.1402816772461, "learning_rate": 2.256062646052896e-07, "logits/chosen": -18.384845733642578, "logits/rejected": -18.73432731628418, "logps/chosen": -330.79193115234375, "logps/rejected": -323.4656677246094, "loss": 0.5799, "rewards/accuracies": 
0.699999988079071, "rewards/chosen": 3.0901877880096436, "rewards/margins": 0.7025728225708008, "rewards/rejected": 2.3876149654388428, "step": 35480 }, { "epoch": 1.647708807279818, "grad_norm": 41.62416076660156, "learning_rate": 2.2552888558738412e-07, "logits/chosen": -18.956430435180664, "logits/rejected": -17.89784812927246, "logps/chosen": -440.51959228515625, "logps/rejected": -352.51861572265625, "loss": 0.6983, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6166090965270996, "rewards/margins": 1.2112382650375366, "rewards/rejected": 2.4053704738616943, "step": 35490 }, { "epoch": 1.648173081387251, "grad_norm": 143.8971405029297, "learning_rate": 2.2545150656947863e-07, "logits/chosen": -20.682348251342773, "logits/rejected": -20.01587677001953, "logps/chosen": -479.585205078125, "logps/rejected": -382.4889221191406, "loss": 0.3846, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.238629341125488, "rewards/margins": 1.4006789922714233, "rewards/rejected": 3.8379504680633545, "step": 35500 }, { "epoch": 1.648637355494684, "grad_norm": 72.77560424804688, "learning_rate": 2.253741275515731e-07, "logits/chosen": -19.781171798706055, "logits/rejected": -19.300697326660156, "logps/chosen": -486.9000549316406, "logps/rejected": -459.263916015625, "loss": 0.6511, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.1252546310424805, "rewards/margins": 0.49426302313804626, "rewards/rejected": 3.6309916973114014, "step": 35510 }, { "epoch": 1.649101629602117, "grad_norm": 177.9620819091797, "learning_rate": 2.252967485336676e-07, "logits/chosen": -18.603425979614258, "logits/rejected": -19.073152542114258, "logps/chosen": -387.524169921875, "logps/rejected": -378.72442626953125, "loss": 1.6537, "rewards/accuracies": 0.5, "rewards/chosen": 3.024061679840088, "rewards/margins": -0.5862616896629333, "rewards/rejected": 3.610323429107666, "step": 35520 }, { "epoch": 1.6495659037095503, "grad_norm": 94.85643005371094, "learning_rate": 
2.2521936951576208e-07, "logits/chosen": -18.341184616088867, "logits/rejected": -19.169422149658203, "logps/chosen": -404.4195861816406, "logps/rejected": -484.1979064941406, "loss": 1.0194, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.551832675933838, "rewards/margins": -0.19027094542980194, "rewards/rejected": 3.7421035766601562, "step": 35530 }, { "epoch": 1.650030177816983, "grad_norm": 15.387199401855469, "learning_rate": 2.251419904978566e-07, "logits/chosen": -19.731250762939453, "logits/rejected": -19.194149017333984, "logps/chosen": -383.3247375488281, "logps/rejected": -313.6481018066406, "loss": 1.0908, "rewards/accuracies": 0.5, "rewards/chosen": 3.6777725219726562, "rewards/margins": 0.3653005063533783, "rewards/rejected": 3.312472105026245, "step": 35540 }, { "epoch": 1.6504944519244162, "grad_norm": 58.29808044433594, "learning_rate": 2.250646114799511e-07, "logits/chosen": -18.70408058166504, "logits/rejected": -18.186370849609375, "logps/chosen": -342.43804931640625, "logps/rejected": -261.01080322265625, "loss": 0.5205, "rewards/accuracies": 0.5, "rewards/chosen": 2.3041188716888428, "rewards/margins": 1.2206593751907349, "rewards/rejected": 1.0834596157073975, "step": 35550 }, { "epoch": 1.6509587260318492, "grad_norm": 51.68521499633789, "learning_rate": 2.249872324620456e-07, "logits/chosen": -19.80186653137207, "logits/rejected": -17.876556396484375, "logps/chosen": -385.0787048339844, "logps/rejected": -230.6625213623047, "loss": 0.3568, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.87446665763855, "rewards/margins": 1.9820572137832642, "rewards/rejected": 1.8924089670181274, "step": 35560 }, { "epoch": 1.6514230001392822, "grad_norm": 70.55534362792969, "learning_rate": 2.2490985344414007e-07, "logits/chosen": -18.45816421508789, "logits/rejected": -17.555070877075195, "logps/chosen": -373.3912048339844, "logps/rejected": -299.16583251953125, "loss": 0.5835, "rewards/accuracies": 0.699999988079071, 
"rewards/chosen": 2.8341450691223145, "rewards/margins": 1.1056134700775146, "rewards/rejected": 1.7285315990447998, "step": 35570 }, { "epoch": 1.6518872742467152, "grad_norm": 8.314337730407715, "learning_rate": 2.2483247442623456e-07, "logits/chosen": -18.96385383605957, "logits/rejected": -17.65917205810547, "logps/chosen": -456.8041076660156, "logps/rejected": -310.89080810546875, "loss": 1.3385, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.8276755809783936, "rewards/margins": 0.7435418963432312, "rewards/rejected": 3.0841336250305176, "step": 35580 }, { "epoch": 1.6523515483541482, "grad_norm": 11.042426109313965, "learning_rate": 2.2475509540832907e-07, "logits/chosen": -19.78693389892578, "logits/rejected": -18.66042137145996, "logps/chosen": -319.7552490234375, "logps/rejected": -325.46905517578125, "loss": 0.5678, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.041696548461914, "rewards/margins": 1.6090186834335327, "rewards/rejected": 1.432677984237671, "step": 35590 }, { "epoch": 1.6528158224615814, "grad_norm": 42.69918441772461, "learning_rate": 2.2467771639042358e-07, "logits/chosen": -18.606199264526367, "logits/rejected": -18.330575942993164, "logps/chosen": -311.1918029785156, "logps/rejected": -289.21337890625, "loss": 0.7097, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.675351858139038, "rewards/margins": 0.6091238856315613, "rewards/rejected": 2.066228151321411, "step": 35600 }, { "epoch": 1.6532800965690142, "grad_norm": 0.6801556944847107, "learning_rate": 2.2460033737251804e-07, "logits/chosen": -18.15247917175293, "logits/rejected": -17.635501861572266, "logps/chosen": -361.7425842285156, "logps/rejected": -286.0306091308594, "loss": 0.6671, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.989849090576172, "rewards/margins": 1.720149278640747, "rewards/rejected": 2.269699811935425, "step": 35610 }, { "epoch": 1.6537443706764474, "grad_norm": 4.295332431793213, "learning_rate": 
2.2452295835461255e-07, "logits/chosen": -19.163265228271484, "logits/rejected": -17.55672264099121, "logps/chosen": -511.81854248046875, "logps/rejected": -275.4886779785156, "loss": 0.1999, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.362513065338135, "rewards/margins": 2.7409796714782715, "rewards/rejected": 1.621533751487732, "step": 35620 }, { "epoch": 1.6542086447838804, "grad_norm": 122.06627655029297, "learning_rate": 2.2444557933670704e-07, "logits/chosen": -19.5440616607666, "logits/rejected": -18.58333969116211, "logps/chosen": -288.4764404296875, "logps/rejected": -198.76437377929688, "loss": 0.2972, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.7688345909118652, "rewards/margins": 1.898202896118164, "rewards/rejected": 0.8706321716308594, "step": 35630 }, { "epoch": 1.6546729188913134, "grad_norm": 13.547987937927246, "learning_rate": 2.2436820031880155e-07, "logits/chosen": -19.428159713745117, "logits/rejected": -18.610509872436523, "logps/chosen": -421.6756286621094, "logps/rejected": -336.18682861328125, "loss": 0.4734, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.628304958343506, "rewards/margins": 1.7017011642456055, "rewards/rejected": 1.92660391330719, "step": 35640 }, { "epoch": 1.6551371929987466, "grad_norm": 1.3898710012435913, "learning_rate": 2.2429082130089606e-07, "logits/chosen": -18.49527359008789, "logits/rejected": -17.66230583190918, "logps/chosen": -310.23736572265625, "logps/rejected": -264.9208679199219, "loss": 0.7919, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.979959726333618, "rewards/margins": 1.3562511205673218, "rewards/rejected": 1.6237084865570068, "step": 35650 }, { "epoch": 1.6556014671061794, "grad_norm": 13.033318519592285, "learning_rate": 2.2421344228299052e-07, "logits/chosen": -18.92361068725586, "logits/rejected": -18.98202896118164, "logps/chosen": -418.67529296875, "logps/rejected": -414.45751953125, "loss": 1.1579, "rewards/accuracies": 
0.6000000238418579, "rewards/chosen": 3.314319133758545, "rewards/margins": 0.03867485374212265, "rewards/rejected": 3.275643825531006, "step": 35660 }, { "epoch": 1.6560657412136126, "grad_norm": 1.2645982503890991, "learning_rate": 2.2413606326508503e-07, "logits/chosen": -19.093387603759766, "logits/rejected": -17.551624298095703, "logps/chosen": -372.8814697265625, "logps/rejected": -231.4445343017578, "loss": 0.553, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4752726554870605, "rewards/margins": 2.19588041305542, "rewards/rejected": 1.2793922424316406, "step": 35670 }, { "epoch": 1.6565300153210454, "grad_norm": 0.20873960852622986, "learning_rate": 2.2405868424717951e-07, "logits/chosen": -18.7869930267334, "logits/rejected": -19.00583839416504, "logps/chosen": -262.2859802246094, "logps/rejected": -271.6728210449219, "loss": 1.7149, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": 2.3496453762054443, "rewards/margins": -0.6817768216133118, "rewards/rejected": 3.0314221382141113, "step": 35680 }, { "epoch": 1.6569942894284786, "grad_norm": 52.8120231628418, "learning_rate": 2.2398130522927402e-07, "logits/chosen": -18.276914596557617, "logits/rejected": -18.487266540527344, "logps/chosen": -393.69525146484375, "logps/rejected": -344.2067565917969, "loss": 0.9588, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.9125332832336426, "rewards/margins": 0.3767419457435608, "rewards/rejected": 2.5357913970947266, "step": 35690 }, { "epoch": 1.6574585635359116, "grad_norm": 61.57025146484375, "learning_rate": 2.2390392621136854e-07, "logits/chosen": -18.334970474243164, "logits/rejected": -18.128549575805664, "logps/chosen": -362.9594421386719, "logps/rejected": -368.07672119140625, "loss": 0.5583, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.3372159004211426, "rewards/margins": 1.061071515083313, "rewards/rejected": 2.276144504547119, "step": 35700 }, { "epoch": 1.6579228376433446, "grad_norm": 
128.17979431152344, "learning_rate": 2.23826547193463e-07, "logits/chosen": -18.552303314208984, "logits/rejected": -17.103425979614258, "logps/chosen": -407.7348327636719, "logps/rejected": -329.865966796875, "loss": 0.4662, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.276696681976318, "rewards/margins": 2.1893343925476074, "rewards/rejected": 2.087362051010132, "step": 35710 }, { "epoch": 1.6583871117507778, "grad_norm": 38.71144485473633, "learning_rate": 2.237491681755575e-07, "logits/chosen": -19.631668090820312, "logits/rejected": -17.476457595825195, "logps/chosen": -356.49383544921875, "logps/rejected": -250.8581085205078, "loss": 0.4696, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.2686820030212402, "rewards/margins": 1.8611164093017578, "rewards/rejected": 1.4075652360916138, "step": 35720 }, { "epoch": 1.6588513858582106, "grad_norm": 41.02695083618164, "learning_rate": 2.23671789157652e-07, "logits/chosen": -17.83274269104004, "logits/rejected": -17.72574806213379, "logps/chosen": -363.9157409667969, "logps/rejected": -341.00689697265625, "loss": 1.0397, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.304513931274414, "rewards/margins": 0.6678140759468079, "rewards/rejected": 2.636699914932251, "step": 35730 }, { "epoch": 1.6593156599656438, "grad_norm": 106.39231872558594, "learning_rate": 2.235944101397465e-07, "logits/chosen": -18.88413429260254, "logits/rejected": -18.361669540405273, "logps/chosen": -380.1009826660156, "logps/rejected": -363.72943115234375, "loss": 0.409, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.728010892868042, "rewards/margins": 1.3619827032089233, "rewards/rejected": 2.366028070449829, "step": 35740 }, { "epoch": 1.6597799340730768, "grad_norm": 263.1391906738281, "learning_rate": 2.23517031121841e-07, "logits/chosen": -19.196735382080078, "logits/rejected": -19.31057357788086, "logps/chosen": -430.9955139160156, "logps/rejected": -355.08795166015625, "loss": 
1.2036, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.349701881408691, "rewards/margins": 0.7591695785522461, "rewards/rejected": 3.5905327796936035, "step": 35750 }, { "epoch": 1.6602442081805098, "grad_norm": 68.36357879638672, "learning_rate": 2.2343965210393547e-07, "logits/chosen": -18.75082778930664, "logits/rejected": -18.731517791748047, "logps/chosen": -395.5445556640625, "logps/rejected": -363.16912841796875, "loss": 0.5567, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.736647844314575, "rewards/margins": 0.6582664251327515, "rewards/rejected": 2.0783815383911133, "step": 35760 }, { "epoch": 1.6607084822879428, "grad_norm": 108.09986114501953, "learning_rate": 2.2336227308602998e-07, "logits/chosen": -18.164175033569336, "logits/rejected": -17.92380714416504, "logps/chosen": -369.8001708984375, "logps/rejected": -654.2005004882812, "loss": 0.6684, "rewards/accuracies": 0.5, "rewards/chosen": 3.563793659210205, "rewards/margins": 1.3901660442352295, "rewards/rejected": 2.1736276149749756, "step": 35770 }, { "epoch": 1.6611727563953758, "grad_norm": 34.37152099609375, "learning_rate": 2.2328489406812447e-07, "logits/chosen": -18.893917083740234, "logits/rejected": -17.706912994384766, "logps/chosen": -420.5562438964844, "logps/rejected": -296.2657775878906, "loss": 0.233, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.7407383918762207, "rewards/margins": 2.299696207046509, "rewards/rejected": 1.4410423040390015, "step": 35780 }, { "epoch": 1.661637030502809, "grad_norm": 1.4755266904830933, "learning_rate": 2.2320751505021898e-07, "logits/chosen": -20.343965530395508, "logits/rejected": -19.539409637451172, "logps/chosen": -374.7672119140625, "logps/rejected": -337.6604309082031, "loss": 0.696, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.420548439025879, "rewards/margins": 1.5299098491668701, "rewards/rejected": 2.890639305114746, "step": 35790 }, { "epoch": 1.6621013046102417, "grad_norm": 
189.7156219482422, "learning_rate": 2.2313013603231346e-07, "logits/chosen": -20.058456420898438, "logits/rejected": -19.878223419189453, "logps/chosen": -361.1369323730469, "logps/rejected": -282.76531982421875, "loss": 0.609, "rewards/accuracies": 0.5, "rewards/chosen": 3.7242209911346436, "rewards/margins": 0.995477557182312, "rewards/rejected": 2.728743076324463, "step": 35800 }, { "epoch": 1.662565578717675, "grad_norm": 54.883460998535156, "learning_rate": 2.2305275701440795e-07, "logits/chosen": -18.104389190673828, "logits/rejected": -18.41962242126465, "logps/chosen": -312.7201843261719, "logps/rejected": -290.50457763671875, "loss": 0.6444, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.204535961151123, "rewards/margins": 0.6654725670814514, "rewards/rejected": 2.5390639305114746, "step": 35810 }, { "epoch": 1.663029852825108, "grad_norm": 111.69068145751953, "learning_rate": 2.2297537799650246e-07, "logits/chosen": -18.27562141418457, "logits/rejected": -17.95676040649414, "logps/chosen": -291.35943603515625, "logps/rejected": -257.5028381347656, "loss": 0.7532, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.8524346351623535, "rewards/margins": 0.8399432301521301, "rewards/rejected": 2.01249098777771, "step": 35820 }, { "epoch": 1.663494126932541, "grad_norm": 15.410225868225098, "learning_rate": 2.2289799897859694e-07, "logits/chosen": -19.582901000976562, "logits/rejected": -18.012407302856445, "logps/chosen": -394.9803161621094, "logps/rejected": -260.61944580078125, "loss": 0.4259, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.936642646789551, "rewards/margins": 1.8269100189208984, "rewards/rejected": 1.1097323894500732, "step": 35830 }, { "epoch": 1.6639584010399742, "grad_norm": 79.80445861816406, "learning_rate": 2.2282061996069146e-07, "logits/chosen": -19.012670516967773, "logits/rejected": -18.52928924560547, "logps/chosen": -469.35821533203125, "logps/rejected": -396.56317138671875, "loss": 0.3968, 
"rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.514843702316284, "rewards/margins": 1.0903669595718384, "rewards/rejected": 2.4244766235351562, "step": 35840 }, { "epoch": 1.664422675147407, "grad_norm": 4.793145656585693, "learning_rate": 2.2274324094278594e-07, "logits/chosen": -18.48678970336914, "logits/rejected": -18.086179733276367, "logps/chosen": -359.5848083496094, "logps/rejected": -249.45028686523438, "loss": 0.7726, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.8063058853149414, "rewards/margins": 0.8307052850723267, "rewards/rejected": 1.9756004810333252, "step": 35850 }, { "epoch": 1.6648869492548402, "grad_norm": 0.13327914476394653, "learning_rate": 2.2266586192488043e-07, "logits/chosen": -19.666751861572266, "logits/rejected": -18.15913200378418, "logps/chosen": -409.3546447753906, "logps/rejected": -362.5995788574219, "loss": 0.8009, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.7517032623291016, "rewards/margins": 1.1165697574615479, "rewards/rejected": 2.6351332664489746, "step": 35860 }, { "epoch": 1.665351223362273, "grad_norm": 178.3239288330078, "learning_rate": 2.2258848290697494e-07, "logits/chosen": -18.516170501708984, "logits/rejected": -17.330368041992188, "logps/chosen": -276.9435729980469, "logps/rejected": -188.40625, "loss": 0.5496, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.324021816253662, "rewards/margins": 1.2052943706512451, "rewards/rejected": 1.1187270879745483, "step": 35870 }, { "epoch": 1.6658154974697061, "grad_norm": 274.5578918457031, "learning_rate": 2.2251110388906942e-07, "logits/chosen": -18.499082565307617, "logits/rejected": -19.094192504882812, "logps/chosen": -304.23651123046875, "logps/rejected": -403.1973571777344, "loss": 2.0495, "rewards/accuracies": 0.5, "rewards/chosen": 2.5629801750183105, "rewards/margins": -1.1417111158370972, "rewards/rejected": 3.7046914100646973, "step": 35880 }, { "epoch": 1.6662797715771391, "grad_norm": 
2.7272486686706543, "learning_rate": 2.2243372487116393e-07, "logits/chosen": -19.182865142822266, "logits/rejected": -18.796628952026367, "logps/chosen": -306.55670166015625, "logps/rejected": -286.95098876953125, "loss": 0.7358, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.566741466522217, "rewards/margins": 1.0319318771362305, "rewards/rejected": 2.5348098278045654, "step": 35890 }, { "epoch": 1.6667440456845721, "grad_norm": 10.92218017578125, "learning_rate": 2.2235634585325842e-07, "logits/chosen": -19.124267578125, "logits/rejected": -17.615697860717773, "logps/chosen": -508.09356689453125, "logps/rejected": -295.749267578125, "loss": 1.131, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.174056053161621, "rewards/margins": 1.7173097133636475, "rewards/rejected": 2.4567461013793945, "step": 35900 }, { "epoch": 1.6672083197920053, "grad_norm": 0.5072470903396606, "learning_rate": 2.222789668353529e-07, "logits/chosen": -18.904550552368164, "logits/rejected": -18.409534454345703, "logps/chosen": -409.626220703125, "logps/rejected": -307.71881103515625, "loss": 0.7599, "rewards/accuracies": 0.5, "rewards/chosen": 4.048394203186035, "rewards/margins": 1.2005460262298584, "rewards/rejected": 2.8478481769561768, "step": 35910 }, { "epoch": 1.6676725938994381, "grad_norm": 36.204795837402344, "learning_rate": 2.2220158781744741e-07, "logits/chosen": -19.63149642944336, "logits/rejected": -17.567630767822266, "logps/chosen": -362.11968994140625, "logps/rejected": -222.90011596679688, "loss": 0.4586, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.8508849143981934, "rewards/margins": 2.119372844696045, "rewards/rejected": 0.731512188911438, "step": 35920 }, { "epoch": 1.6681368680068713, "grad_norm": 20.698413848876953, "learning_rate": 2.2212420879954193e-07, "logits/chosen": -18.6871280670166, "logits/rejected": -18.004913330078125, "logps/chosen": -364.384033203125, "logps/rejected": -231.818359375, "loss": 0.5367, 
"rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.208395004272461, "rewards/margins": 1.3730758428573608, "rewards/rejected": 1.8353191614151, "step": 35930 }, { "epoch": 1.6686011421143043, "grad_norm": 40.65029525756836, "learning_rate": 2.220468297816364e-07, "logits/chosen": -18.24199867248535, "logits/rejected": -17.148540496826172, "logps/chosen": -474.35980224609375, "logps/rejected": -249.6059112548828, "loss": 0.3571, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5530383586883545, "rewards/margins": 2.2287793159484863, "rewards/rejected": 1.3242594003677368, "step": 35940 }, { "epoch": 1.6690654162217373, "grad_norm": 21.83173370361328, "learning_rate": 2.219694507637309e-07, "logits/chosen": -18.97062873840332, "logits/rejected": -18.58562660217285, "logps/chosen": -404.3943176269531, "logps/rejected": -371.1211242675781, "loss": 1.1156, "rewards/accuracies": 0.5, "rewards/chosen": 2.8423848152160645, "rewards/margins": -0.14329442381858826, "rewards/rejected": 2.9856793880462646, "step": 35950 }, { "epoch": 1.6695296903291703, "grad_norm": 2.9561212062835693, "learning_rate": 2.2189207174582538e-07, "logits/chosen": -20.110797882080078, "logits/rejected": -18.98492431640625, "logps/chosen": -321.2512512207031, "logps/rejected": -245.00076293945312, "loss": 0.4226, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.378357410430908, "rewards/margins": 1.4015979766845703, "rewards/rejected": 1.9767593145370483, "step": 35960 }, { "epoch": 1.6699939644366033, "grad_norm": 42.89305114746094, "learning_rate": 2.218146927279199e-07, "logits/chosen": -18.5271053314209, "logits/rejected": -17.600765228271484, "logps/chosen": -404.58502197265625, "logps/rejected": -323.10455322265625, "loss": 0.6736, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.654515027999878, "rewards/margins": 0.8723565936088562, "rewards/rejected": 2.7821590900421143, "step": 35970 }, { "epoch": 1.6704582385440365, "grad_norm": 
111.39105224609375, "learning_rate": 2.217373137100144e-07, "logits/chosen": -18.805810928344727, "logits/rejected": -18.728885650634766, "logps/chosen": -410.3196716308594, "logps/rejected": -434.53118896484375, "loss": 0.7365, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.462609767913818, "rewards/margins": 0.3476162552833557, "rewards/rejected": 4.114994049072266, "step": 35980 }, { "epoch": 1.6709225126514693, "grad_norm": 19.925294876098633, "learning_rate": 2.216599346921089e-07, "logits/chosen": -19.881999969482422, "logits/rejected": -18.250553131103516, "logps/chosen": -274.80328369140625, "logps/rejected": -193.95852661132812, "loss": 0.6747, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.226850986480713, "rewards/margins": 1.5709433555603027, "rewards/rejected": 0.6559076309204102, "step": 35990 }, { "epoch": 1.6713867867589025, "grad_norm": 15.92073917388916, "learning_rate": 2.2158255567420337e-07, "logits/chosen": -19.193828582763672, "logits/rejected": -18.070323944091797, "logps/chosen": -418.23052978515625, "logps/rejected": -316.61962890625, "loss": 0.5852, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1819984912872314, "rewards/margins": 1.2559324502944946, "rewards/rejected": 1.9260660409927368, "step": 36000 }, { "epoch": 1.6718510608663355, "grad_norm": 80.02195739746094, "learning_rate": 2.2150517665629786e-07, "logits/chosen": -18.890270233154297, "logits/rejected": -18.247695922851562, "logps/chosen": -321.2904357910156, "logps/rejected": -266.89166259765625, "loss": 0.7002, "rewards/accuracies": 0.5, "rewards/chosen": 2.4386985301971436, "rewards/margins": 0.5629578232765198, "rewards/rejected": 1.8757407665252686, "step": 36010 }, { "epoch": 1.6723153349737685, "grad_norm": 75.16024017333984, "learning_rate": 2.2142779763839237e-07, "logits/chosen": -19.52925682067871, "logits/rejected": -18.381635665893555, "logps/chosen": -290.95196533203125, "logps/rejected": -267.6800231933594, "loss": 
0.9023, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.16365909576416, "rewards/margins": 0.3155529201030731, "rewards/rejected": 1.8481061458587646, "step": 36020 }, { "epoch": 1.6727796090812015, "grad_norm": 237.7160186767578, "learning_rate": 2.2135041862048688e-07, "logits/chosen": -18.974328994750977, "logits/rejected": -19.13658905029297, "logps/chosen": -285.15521240234375, "logps/rejected": -329.63006591796875, "loss": 1.3315, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.383338212966919, "rewards/margins": -0.3451636731624603, "rewards/rejected": 2.728501796722412, "step": 36030 }, { "epoch": 1.6732438831886345, "grad_norm": 113.0481185913086, "learning_rate": 2.2127303960258134e-07, "logits/chosen": -19.25143814086914, "logits/rejected": -17.884601593017578, "logps/chosen": -313.82525634765625, "logps/rejected": -268.9071960449219, "loss": 0.5582, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.066448450088501, "rewards/margins": 1.6768566370010376, "rewards/rejected": 1.3895916938781738, "step": 36040 }, { "epoch": 1.6737081572960677, "grad_norm": 1.881131887435913, "learning_rate": 2.2119566058467585e-07, "logits/chosen": -18.45448875427246, "logits/rejected": -18.428363800048828, "logps/chosen": -328.19525146484375, "logps/rejected": -258.1986389160156, "loss": 0.63, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.569525957107544, "rewards/margins": 0.9009210467338562, "rewards/rejected": 2.668605089187622, "step": 36050 }, { "epoch": 1.6741724314035005, "grad_norm": 0.8564107418060303, "learning_rate": 2.2111828156677033e-07, "logits/chosen": -18.91403579711914, "logits/rejected": -19.445772171020508, "logps/chosen": -396.4788818359375, "logps/rejected": -422.3995056152344, "loss": 1.0307, "rewards/accuracies": 0.5, "rewards/chosen": 4.2828803062438965, "rewards/margins": 0.21728309988975525, "rewards/rejected": 4.0655975341796875, "step": 36060 }, { "epoch": 1.6746367055109337, "grad_norm": 
18.388700485229492, "learning_rate": 2.2104090254886485e-07, "logits/chosen": -18.093799591064453, "logits/rejected": -18.378643035888672, "logps/chosen": -384.6834716796875, "logps/rejected": -359.9080505371094, "loss": 0.9947, "rewards/accuracies": 0.5, "rewards/chosen": 3.18306040763855, "rewards/margins": 0.02448444999754429, "rewards/rejected": 3.158576250076294, "step": 36070 }, { "epoch": 1.6751009796183667, "grad_norm": 216.53578186035156, "learning_rate": 2.2096352353095936e-07, "logits/chosen": -18.8685245513916, "logits/rejected": -17.511960983276367, "logps/chosen": -418.8409118652344, "logps/rejected": -259.7084045410156, "loss": 0.5351, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.800793170928955, "rewards/margins": 2.2635440826416016, "rewards/rejected": 1.537248969078064, "step": 36080 }, { "epoch": 1.6755652537257997, "grad_norm": 2.0963850021362305, "learning_rate": 2.2088614451305381e-07, "logits/chosen": -18.972686767578125, "logits/rejected": -17.24507713317871, "logps/chosen": -428.89013671875, "logps/rejected": -276.1340637207031, "loss": 0.3413, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.418280124664307, "rewards/margins": 2.35251784324646, "rewards/rejected": 2.0657620429992676, "step": 36090 }, { "epoch": 1.676029527833233, "grad_norm": 197.7886505126953, "learning_rate": 2.2080876549514833e-07, "logits/chosen": -18.566463470458984, "logits/rejected": -19.3458309173584, "logps/chosen": -380.6644592285156, "logps/rejected": -391.0460510253906, "loss": 1.2268, "rewards/accuracies": 0.5, "rewards/chosen": 3.5054564476013184, "rewards/margins": -0.2883976101875305, "rewards/rejected": 3.793853759765625, "step": 36100 }, { "epoch": 1.6764938019406657, "grad_norm": 26.244401931762695, "learning_rate": 2.207313864772428e-07, "logits/chosen": -18.225522994995117, "logits/rejected": -18.201879501342773, "logps/chosen": -391.3061828613281, "logps/rejected": -365.77325439453125, "loss": 1.1604, "rewards/accuracies": 
0.5, "rewards/chosen": 3.279542922973633, "rewards/margins": -0.16434195637702942, "rewards/rejected": 3.443885087966919, "step": 36110 }, { "epoch": 1.6769580760480989, "grad_norm": 91.29773712158203, "learning_rate": 2.2065400745933732e-07, "logits/chosen": -18.568140029907227, "logits/rejected": -17.767616271972656, "logps/chosen": -370.51129150390625, "logps/rejected": -289.6629638671875, "loss": 0.9776, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2354235649108887, "rewards/margins": 1.2643617391586304, "rewards/rejected": 1.9710619449615479, "step": 36120 }, { "epoch": 1.6774223501555319, "grad_norm": 68.04399108886719, "learning_rate": 2.2057662844143183e-07, "logits/chosen": -18.514122009277344, "logits/rejected": -18.004709243774414, "logps/chosen": -323.2174072265625, "logps/rejected": -278.0365295410156, "loss": 0.9781, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.3921759128570557, "rewards/margins": 0.52118319272995, "rewards/rejected": 1.870992660522461, "step": 36130 }, { "epoch": 1.6778866242629649, "grad_norm": 0.5175144076347351, "learning_rate": 2.204992494235263e-07, "logits/chosen": -19.624208450317383, "logits/rejected": -18.47539710998535, "logps/chosen": -398.8788146972656, "logps/rejected": -207.2701873779297, "loss": 0.1258, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.26393985748291, "rewards/margins": 3.540104389190674, "rewards/rejected": 1.7238353490829468, "step": 36140 }, { "epoch": 1.6783508983703979, "grad_norm": 175.84913635253906, "learning_rate": 2.204218704056208e-07, "logits/chosen": -19.006410598754883, "logits/rejected": -18.783884048461914, "logps/chosen": -315.17535400390625, "logps/rejected": -269.6747741699219, "loss": 0.8694, "rewards/accuracies": 0.5, "rewards/chosen": 2.3176417350769043, "rewards/margins": 0.30903592705726624, "rewards/rejected": 2.008605718612671, "step": 36150 }, { "epoch": 1.6788151724778309, "grad_norm": 181.12429809570312, "learning_rate": 
2.203444913877153e-07, "logits/chosen": -18.789812088012695, "logits/rejected": -17.701000213623047, "logps/chosen": -314.1393127441406, "logps/rejected": -243.2525634765625, "loss": 0.9684, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.8016037940979004, "rewards/margins": 1.7800407409667969, "rewards/rejected": 1.0215630531311035, "step": 36160 }, { "epoch": 1.679279446585264, "grad_norm": 110.55928802490234, "learning_rate": 2.202671123698098e-07, "logits/chosen": -18.24970054626465, "logits/rejected": -17.529460906982422, "logps/chosen": -402.2933349609375, "logps/rejected": -356.95733642578125, "loss": 0.8918, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.80863881111145, "rewards/margins": 0.2364199161529541, "rewards/rejected": 2.572218894958496, "step": 36170 }, { "epoch": 1.6797437206926968, "grad_norm": 70.69068145751953, "learning_rate": 2.201897333519043e-07, "logits/chosen": -19.388214111328125, "logits/rejected": -18.373191833496094, "logps/chosen": -360.8524475097656, "logps/rejected": -277.66754150390625, "loss": 0.6999, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.1926052570343018, "rewards/margins": 0.8091157674789429, "rewards/rejected": 2.3834891319274902, "step": 36180 }, { "epoch": 1.68020799480013, "grad_norm": 36.2708740234375, "learning_rate": 2.2011235433399877e-07, "logits/chosen": -19.42156219482422, "logits/rejected": -18.72736167907715, "logps/chosen": -470.0643005371094, "logps/rejected": -345.23980712890625, "loss": 0.2739, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.01392126083374, "rewards/margins": 1.8332258462905884, "rewards/rejected": 3.1806957721710205, "step": 36190 }, { "epoch": 1.680672268907563, "grad_norm": 2.0396010875701904, "learning_rate": 2.2003497531609328e-07, "logits/chosen": -19.127094268798828, "logits/rejected": -18.438873291015625, "logps/chosen": -421.120849609375, "logps/rejected": -314.0697937011719, "loss": 0.38, "rewards/accuracies": 
0.8999999761581421, "rewards/chosen": 3.590376615524292, "rewards/margins": 1.6341310739517212, "rewards/rejected": 1.95624577999115, "step": 36200 }, { "epoch": 1.681136543014996, "grad_norm": 29.99132537841797, "learning_rate": 2.1995759629818777e-07, "logits/chosen": -19.121633529663086, "logits/rejected": -18.043292999267578, "logps/chosen": -394.067138671875, "logps/rejected": -259.5943908691406, "loss": 0.4182, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9894468784332275, "rewards/margins": 1.7639567852020264, "rewards/rejected": 1.225489854812622, "step": 36210 }, { "epoch": 1.681600817122429, "grad_norm": 60.809932708740234, "learning_rate": 2.1988021728028228e-07, "logits/chosen": -18.564769744873047, "logits/rejected": -17.700572967529297, "logps/chosen": -239.35977172851562, "logps/rejected": -194.05130004882812, "loss": 0.6934, "rewards/accuracies": 0.5, "rewards/chosen": 2.00185227394104, "rewards/margins": 0.9568998217582703, "rewards/rejected": 1.044952392578125, "step": 36220 }, { "epoch": 1.682065091229862, "grad_norm": 101.34345245361328, "learning_rate": 2.198028382623768e-07, "logits/chosen": -18.008647918701172, "logits/rejected": -17.079288482666016, "logps/chosen": -341.2054138183594, "logps/rejected": -210.13211059570312, "loss": 0.477, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.033700704574585, "rewards/margins": 1.4730968475341797, "rewards/rejected": 0.56060391664505, "step": 36230 }, { "epoch": 1.6825293653372952, "grad_norm": 146.1301727294922, "learning_rate": 2.1972545924447125e-07, "logits/chosen": -18.012439727783203, "logits/rejected": -17.21531105041504, "logps/chosen": -344.3720397949219, "logps/rejected": -265.011962890625, "loss": 0.6585, "rewards/accuracies": 0.5, "rewards/chosen": 2.7721550464630127, "rewards/margins": 1.8499635457992554, "rewards/rejected": 0.922191321849823, "step": 36240 }, { "epoch": 1.682993639444728, "grad_norm": 3.4784553050994873, "learning_rate": 
2.1964808022656576e-07, "logits/chosen": -19.31728744506836, "logits/rejected": -17.436140060424805, "logps/chosen": -592.2990112304688, "logps/rejected": -354.5982971191406, "loss": 0.316, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 6.147280216217041, "rewards/margins": 2.777449369430542, "rewards/rejected": 3.36983060836792, "step": 36250 }, { "epoch": 1.6834579135521612, "grad_norm": 81.45601654052734, "learning_rate": 2.1957070120866024e-07, "logits/chosen": -18.146347045898438, "logits/rejected": -17.24337387084961, "logps/chosen": -457.2940979003906, "logps/rejected": -309.6209716796875, "loss": 0.5633, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.3980183601379395, "rewards/margins": 0.9630220532417297, "rewards/rejected": 1.434996485710144, "step": 36260 }, { "epoch": 1.6839221876595942, "grad_norm": 151.01663208007812, "learning_rate": 2.1949332219075475e-07, "logits/chosen": -19.012874603271484, "logits/rejected": -17.752025604248047, "logps/chosen": -457.400390625, "logps/rejected": -356.733154296875, "loss": 0.554, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5060315132141113, "rewards/margins": 1.236040472984314, "rewards/rejected": 2.269990921020508, "step": 36270 }, { "epoch": 1.6843864617670272, "grad_norm": 33.68182373046875, "learning_rate": 2.1941594317284924e-07, "logits/chosen": -19.020322799682617, "logits/rejected": -18.15425682067871, "logps/chosen": -423.8545837402344, "logps/rejected": -459.5022888183594, "loss": 0.8122, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.3023924827575684, "rewards/margins": 0.484898179769516, "rewards/rejected": 2.8174941539764404, "step": 36280 }, { "epoch": 1.6848507358744604, "grad_norm": 48.02205276489258, "learning_rate": 2.1933856415494372e-07, "logits/chosen": -19.153953552246094, "logits/rejected": -18.367652893066406, "logps/chosen": -236.4469757080078, "logps/rejected": -221.83547973632812, "loss": 0.6003, "rewards/accuracies": 
0.6000000238418579, "rewards/chosen": 1.6131677627563477, "rewards/margins": 0.2985168397426605, "rewards/rejected": 1.3146508932113647, "step": 36290 }, { "epoch": 1.6853150099818932, "grad_norm": 11.930521011352539, "learning_rate": 2.1926118513703823e-07, "logits/chosen": -20.08514976501465, "logits/rejected": -19.047208786010742, "logps/chosen": -307.5945739746094, "logps/rejected": -240.1221923828125, "loss": 0.3804, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4173378944396973, "rewards/margins": 1.639552116394043, "rewards/rejected": 1.7777860164642334, "step": 36300 }, { "epoch": 1.6857792840893264, "grad_norm": 16.192575454711914, "learning_rate": 2.1918380611913272e-07, "logits/chosen": -19.829349517822266, "logits/rejected": -18.426029205322266, "logps/chosen": -451.72003173828125, "logps/rejected": -263.7168884277344, "loss": 0.2476, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.113685131072998, "rewards/margins": 2.5923690795898438, "rewards/rejected": 1.5213158130645752, "step": 36310 }, { "epoch": 1.6862435581967594, "grad_norm": 53.41829299926758, "learning_rate": 2.1910642710122723e-07, "logits/chosen": -19.36319351196289, "logits/rejected": -18.956090927124023, "logps/chosen": -337.9734802246094, "logps/rejected": -348.02783203125, "loss": 0.8534, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.6965954303741455, "rewards/margins": 0.18574145436286926, "rewards/rejected": 2.5108542442321777, "step": 36320 }, { "epoch": 1.6867078323041924, "grad_norm": 77.67460632324219, "learning_rate": 2.1902904808332172e-07, "logits/chosen": -19.261266708374023, "logits/rejected": -18.890613555908203, "logps/chosen": -277.77911376953125, "logps/rejected": -238.1951904296875, "loss": 0.4552, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.6131081581115723, "rewards/margins": 0.888460636138916, "rewards/rejected": 1.7246477603912354, "step": 36330 }, { "epoch": 1.6871721064116254, "grad_norm": 
0.28258374333381653, "learning_rate": 2.189516690654162e-07, "logits/chosen": -18.648365020751953, "logits/rejected": -17.772340774536133, "logps/chosen": -345.3391418457031, "logps/rejected": -260.7793884277344, "loss": 0.3065, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.501244068145752, "rewards/margins": 2.033992290496826, "rewards/rejected": 1.4672517776489258, "step": 36340 }, { "epoch": 1.6876363805190584, "grad_norm": 69.83714294433594, "learning_rate": 2.188742900475107e-07, "logits/chosen": -19.554790496826172, "logits/rejected": -18.657649993896484, "logps/chosen": -447.863525390625, "logps/rejected": -331.66278076171875, "loss": 0.4389, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.142783164978027, "rewards/margins": 1.445921540260315, "rewards/rejected": 2.6968610286712646, "step": 36350 }, { "epoch": 1.6881006546264916, "grad_norm": 62.221378326416016, "learning_rate": 2.187969110296052e-07, "logits/chosen": -18.51473617553711, "logits/rejected": -17.76094627380371, "logps/chosen": -380.21563720703125, "logps/rejected": -322.6216735839844, "loss": 0.6576, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1195600032806396, "rewards/margins": 1.1323206424713135, "rewards/rejected": 1.9872392416000366, "step": 36360 }, { "epoch": 1.6885649287339244, "grad_norm": 107.8364486694336, "learning_rate": 2.187195320116997e-07, "logits/chosen": -18.256778717041016, "logits/rejected": -17.870426177978516, "logps/chosen": -372.85272216796875, "logps/rejected": -309.90673828125, "loss": 0.8264, "rewards/accuracies": 0.5, "rewards/chosen": 2.498655319213867, "rewards/margins": 0.48958006501197815, "rewards/rejected": 2.009075164794922, "step": 36370 }, { "epoch": 1.6890292028413576, "grad_norm": 37.71297836303711, "learning_rate": 2.186421529937942e-07, "logits/chosen": -18.7213077545166, "logits/rejected": -17.539382934570312, "logps/chosen": -511.78997802734375, "logps/rejected": -363.7550354003906, "loss": 0.6073, 
"rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.0965776443481445, "rewards/margins": 0.9654959440231323, "rewards/rejected": 3.1310813426971436, "step": 36380 }, { "epoch": 1.6894934769487906, "grad_norm": 118.55502319335938, "learning_rate": 2.1856477397588868e-07, "logits/chosen": -18.996105194091797, "logits/rejected": -18.571611404418945, "logps/chosen": -394.5328369140625, "logps/rejected": -391.8128356933594, "loss": 0.9617, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.331775188446045, "rewards/margins": 0.9157913327217102, "rewards/rejected": 3.4159839153289795, "step": 36390 }, { "epoch": 1.6899577510562236, "grad_norm": 0.14644333720207214, "learning_rate": 2.184873949579832e-07, "logits/chosen": -19.406810760498047, "logits/rejected": -18.231178283691406, "logps/chosen": -515.7284545898438, "logps/rejected": -337.81536865234375, "loss": 0.6998, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.740680694580078, "rewards/margins": 1.6018726825714111, "rewards/rejected": 3.138808488845825, "step": 36400 }, { "epoch": 1.6904220251636566, "grad_norm": 130.2056121826172, "learning_rate": 2.1841001594007767e-07, "logits/chosen": -18.83920669555664, "logits/rejected": -18.27310562133789, "logps/chosen": -395.48699951171875, "logps/rejected": -443.6995544433594, "loss": 1.3415, "rewards/accuracies": 0.5, "rewards/chosen": 3.582845687866211, "rewards/margins": -0.025937747210264206, "rewards/rejected": 3.60878324508667, "step": 36410 }, { "epoch": 1.6908862992710896, "grad_norm": 43.43215560913086, "learning_rate": 2.1833263692217218e-07, "logits/chosen": -18.87087631225586, "logits/rejected": -18.941648483276367, "logps/chosen": -455.78753662109375, "logps/rejected": -367.2975158691406, "loss": 0.3855, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.2645950317382812, "rewards/margins": 0.9651288986206055, "rewards/rejected": 2.2994658946990967, "step": 36420 }, { "epoch": 1.6913505733785228, "grad_norm": 
68.62543487548828, "learning_rate": 2.1825525790426667e-07, "logits/chosen": -19.95587158203125, "logits/rejected": -18.866018295288086, "logps/chosen": -479.306884765625, "logps/rejected": -368.0107421875, "loss": 0.559, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.14583158493042, "rewards/margins": 0.9148551821708679, "rewards/rejected": 3.230976104736328, "step": 36430 }, { "epoch": 1.6918148474859556, "grad_norm": 17.738224029541016, "learning_rate": 2.1817787888636115e-07, "logits/chosen": -18.785045623779297, "logits/rejected": -18.798404693603516, "logps/chosen": -475.61614990234375, "logps/rejected": -407.382568359375, "loss": 0.5179, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7602362632751465, "rewards/margins": 0.7946160435676575, "rewards/rejected": 2.965620517730713, "step": 36440 }, { "epoch": 1.6922791215933888, "grad_norm": 127.65998840332031, "learning_rate": 2.1810049986845567e-07, "logits/chosen": -20.043132781982422, "logits/rejected": -18.752849578857422, "logps/chosen": -427.86163330078125, "logps/rejected": -273.52374267578125, "loss": 0.6389, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.067858695983887, "rewards/margins": 1.8302379846572876, "rewards/rejected": 2.2376205921173096, "step": 36450 }, { "epoch": 1.6927433957008218, "grad_norm": 136.781005859375, "learning_rate": 2.1802312085055015e-07, "logits/chosen": -20.01278305053711, "logits/rejected": -19.7381649017334, "logps/chosen": -517.001220703125, "logps/rejected": -414.9835510253906, "loss": 0.5081, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6771693229675293, "rewards/margins": 1.4095698595046997, "rewards/rejected": 2.2675998210906982, "step": 36460 }, { "epoch": 1.6932076698082548, "grad_norm": 5.487959384918213, "learning_rate": 2.1794574183264466e-07, "logits/chosen": -18.420242309570312, "logits/rejected": -16.85150146484375, "logps/chosen": -427.39288330078125, "logps/rejected": -342.9590148925781, "loss": 
0.6649, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.6035943031311035, "rewards/margins": 2.014439105987549, "rewards/rejected": 1.5891549587249756, "step": 36470 }, { "epoch": 1.693671943915688, "grad_norm": 51.72494888305664, "learning_rate": 2.1786836281473915e-07, "logits/chosen": -18.585155487060547, "logits/rejected": -17.406269073486328, "logps/chosen": -350.9826965332031, "logps/rejected": -220.5882568359375, "loss": 0.4557, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.331446409225464, "rewards/margins": 1.4072072505950928, "rewards/rejected": 1.9242393970489502, "step": 36480 }, { "epoch": 1.6941362180231208, "grad_norm": 5.912140846252441, "learning_rate": 2.1779098379683363e-07, "logits/chosen": -18.65810775756836, "logits/rejected": -16.75541114807129, "logps/chosen": -362.50360107421875, "logps/rejected": -222.2044219970703, "loss": 0.2749, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.500789165496826, "rewards/margins": 2.356365442276001, "rewards/rejected": 1.144423484802246, "step": 36490 }, { "epoch": 1.694600492130554, "grad_norm": 3.00632381439209, "learning_rate": 2.1771360477892814e-07, "logits/chosen": -18.9049129486084, "logits/rejected": -18.544017791748047, "logps/chosen": -342.9620056152344, "logps/rejected": -333.55792236328125, "loss": 1.1207, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.5810840129852295, "rewards/margins": 0.31342199444770813, "rewards/rejected": 3.2676615715026855, "step": 36500 }, { "epoch": 1.6950647662379867, "grad_norm": 51.58922576904297, "learning_rate": 2.1763622576102263e-07, "logits/chosen": -18.928373336791992, "logits/rejected": -18.53456687927246, "logps/chosen": -284.2857360839844, "logps/rejected": -225.2522735595703, "loss": 1.161, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.9164209365844727, "rewards/margins": -0.03295425325632095, "rewards/rejected": 1.9493749141693115, "step": 36510 }, { "epoch": 1.69552904034542, 
"grad_norm": 14.17834186553955, "learning_rate": 2.175588467431171e-07, "logits/chosen": -18.109655380249023, "logits/rejected": -18.047100067138672, "logps/chosen": -394.08978271484375, "logps/rejected": -322.93072509765625, "loss": 0.6733, "rewards/accuracies": 0.5, "rewards/chosen": 2.8537399768829346, "rewards/margins": 0.9418808221817017, "rewards/rejected": 1.911859154701233, "step": 36520 }, { "epoch": 1.695993314452853, "grad_norm": 7.175756454467773, "learning_rate": 2.1748146772521162e-07, "logits/chosen": -18.100017547607422, "logits/rejected": -17.78913116455078, "logps/chosen": -277.72698974609375, "logps/rejected": -276.142578125, "loss": 0.9517, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.629372000694275, "rewards/margins": 0.5350664854049683, "rewards/rejected": 1.0943057537078857, "step": 36530 }, { "epoch": 1.696457588560286, "grad_norm": 162.5, "learning_rate": 2.174040887073061e-07, "logits/chosen": -19.661949157714844, "logits/rejected": -20.244359970092773, "logps/chosen": -381.4830017089844, "logps/rejected": -363.7844543457031, "loss": 0.7865, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.411240339279175, "rewards/margins": 0.4479626715183258, "rewards/rejected": 2.9632773399353027, "step": 36540 }, { "epoch": 1.6969218626677192, "grad_norm": 121.01492309570312, "learning_rate": 2.1732670968940062e-07, "logits/chosen": -19.22224235534668, "logits/rejected": -19.344013214111328, "logps/chosen": -290.2191467285156, "logps/rejected": -268.85491943359375, "loss": 1.1371, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.123800277709961, "rewards/margins": -0.20101025700569153, "rewards/rejected": 2.32481050491333, "step": 36550 }, { "epoch": 1.697386136775152, "grad_norm": 114.88462829589844, "learning_rate": 2.1724933067149513e-07, "logits/chosen": -18.73659896850586, "logits/rejected": -18.464092254638672, "logps/chosen": -358.51043701171875, "logps/rejected": -290.36376953125, "loss": 0.7781, 
"rewards/accuracies": 0.5, "rewards/chosen": 3.5855133533477783, "rewards/margins": 0.4106075167655945, "rewards/rejected": 3.174906015396118, "step": 36560 }, { "epoch": 1.6978504108825851, "grad_norm": 19.592445373535156, "learning_rate": 2.171719516535896e-07, "logits/chosen": -19.144023895263672, "logits/rejected": -17.758472442626953, "logps/chosen": -426.75213623046875, "logps/rejected": -314.8552551269531, "loss": 0.3742, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.201505184173584, "rewards/margins": 2.1360201835632324, "rewards/rejected": 2.0654845237731934, "step": 36570 }, { "epoch": 1.6983146849900181, "grad_norm": 166.0657196044922, "learning_rate": 2.170945726356841e-07, "logits/chosen": -18.73434829711914, "logits/rejected": -18.211570739746094, "logps/chosen": -362.92816162109375, "logps/rejected": -345.7518310546875, "loss": 0.9678, "rewards/accuracies": 0.5, "rewards/chosen": 2.510556221008301, "rewards/margins": 0.20257070660591125, "rewards/rejected": 2.307985544204712, "step": 36580 }, { "epoch": 1.6987789590974511, "grad_norm": 29.747053146362305, "learning_rate": 2.1701719361777859e-07, "logits/chosen": -19.092487335205078, "logits/rejected": -18.889205932617188, "logps/chosen": -320.2555847167969, "logps/rejected": -305.76531982421875, "loss": 0.8941, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.318297863006592, "rewards/margins": 0.2903274893760681, "rewards/rejected": 2.027970314025879, "step": 36590 }, { "epoch": 1.6992432332048841, "grad_norm": 247.61354064941406, "learning_rate": 2.169398145998731e-07, "logits/chosen": -18.15231704711914, "logits/rejected": -17.808748245239258, "logps/chosen": -472.02069091796875, "logps/rejected": -450.46722412109375, "loss": 0.9573, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.8194096088409424, "rewards/margins": 0.2656955122947693, "rewards/rejected": 3.55371356010437, "step": 36600 }, { "epoch": 1.6997075073123171, "grad_norm": 89.11669921875, 
"learning_rate": 2.168624355819676e-07, "logits/chosen": -19.093225479125977, "logits/rejected": -18.98592185974121, "logps/chosen": -473.3915100097656, "logps/rejected": -446.8843688964844, "loss": 0.8256, "rewards/accuracies": 0.5, "rewards/chosen": 4.1750078201293945, "rewards/margins": 0.09586572647094727, "rewards/rejected": 4.079142093658447, "step": 36610 }, { "epoch": 1.7001717814197503, "grad_norm": 6.007997035980225, "learning_rate": 2.1678505656406207e-07, "logits/chosen": -19.25269317626953, "logits/rejected": -18.419071197509766, "logps/chosen": -396.82598876953125, "logps/rejected": -359.84661865234375, "loss": 0.7647, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.896210193634033, "rewards/margins": 1.1743824481964111, "rewards/rejected": 1.721827745437622, "step": 36620 }, { "epoch": 1.700636055527183, "grad_norm": 67.5944595336914, "learning_rate": 2.1670767754615658e-07, "logits/chosen": -19.169322967529297, "logits/rejected": -18.47942352294922, "logps/chosen": -411.269287109375, "logps/rejected": -350.2200622558594, "loss": 0.5176, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.689814805984497, "rewards/margins": 1.2009700536727905, "rewards/rejected": 2.488844871520996, "step": 36630 }, { "epoch": 1.7011003296346163, "grad_norm": 4.153077125549316, "learning_rate": 2.1663029852825106e-07, "logits/chosen": -19.691837310791016, "logits/rejected": -17.914112091064453, "logps/chosen": -513.1710205078125, "logps/rejected": -338.47454833984375, "loss": 0.8922, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 5.139065265655518, "rewards/margins": 1.6208022832870483, "rewards/rejected": 3.518263339996338, "step": 36640 }, { "epoch": 1.7015646037420493, "grad_norm": 26.36417579650879, "learning_rate": 2.1655291951034557e-07, "logits/chosen": -18.405237197875977, "logits/rejected": -19.019447326660156, "logps/chosen": -312.865478515625, "logps/rejected": -379.4557800292969, "loss": 0.829, "rewards/accuracies": 0.5, 
"rewards/chosen": 2.720578670501709, "rewards/margins": 0.17942367494106293, "rewards/rejected": 2.5411548614501953, "step": 36650 }, { "epoch": 1.7020288778494823, "grad_norm": 1.8974651098251343, "learning_rate": 2.1647554049244009e-07, "logits/chosen": -19.637489318847656, "logits/rejected": -19.1712703704834, "logps/chosen": -446.028564453125, "logps/rejected": -401.3240966796875, "loss": 0.4604, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.9921722412109375, "rewards/margins": 1.7629032135009766, "rewards/rejected": 2.229269027709961, "step": 36660 }, { "epoch": 1.7024931519569155, "grad_norm": 20.470611572265625, "learning_rate": 2.1639816147453454e-07, "logits/chosen": -17.512859344482422, "logits/rejected": -16.831933975219727, "logps/chosen": -397.25872802734375, "logps/rejected": -285.0179138183594, "loss": 0.5603, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.017199993133545, "rewards/margins": 1.6986627578735352, "rewards/rejected": 1.3185371160507202, "step": 36670 }, { "epoch": 1.7029574260643483, "grad_norm": 228.26165771484375, "learning_rate": 2.1632078245662905e-07, "logits/chosen": -18.9586238861084, "logits/rejected": -18.424152374267578, "logps/chosen": -429.6322326660156, "logps/rejected": -366.2449645996094, "loss": 0.9792, "rewards/accuracies": 0.5, "rewards/chosen": 3.8009719848632812, "rewards/margins": 0.872562050819397, "rewards/rejected": 2.9284098148345947, "step": 36680 }, { "epoch": 1.7034217001717815, "grad_norm": 5.422741413116455, "learning_rate": 2.1624340343872354e-07, "logits/chosen": -19.815683364868164, "logits/rejected": -18.814455032348633, "logps/chosen": -394.45819091796875, "logps/rejected": -250.92739868164062, "loss": 0.3996, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.5794970989227295, "rewards/margins": 1.3176698684692383, "rewards/rejected": 2.261827230453491, "step": 36690 }, { "epoch": 1.7038859742792143, "grad_norm": 7.161382675170898, "learning_rate": 
2.1616602442081805e-07, "logits/chosen": -18.375640869140625, "logits/rejected": -17.892778396606445, "logps/chosen": -399.95709228515625, "logps/rejected": -321.8836975097656, "loss": 0.6267, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4838099479675293, "rewards/margins": 0.8951497077941895, "rewards/rejected": 2.588660478591919, "step": 36700 }, { "epoch": 1.7043502483866475, "grad_norm": 30.590187072753906, "learning_rate": 2.1608864540291254e-07, "logits/chosen": -18.853466033935547, "logits/rejected": -17.969575881958008, "logps/chosen": -390.5350036621094, "logps/rejected": -365.2593994140625, "loss": 0.5292, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.175538063049316, "rewards/margins": 1.2840933799743652, "rewards/rejected": 2.891444683074951, "step": 36710 }, { "epoch": 1.7048145224940805, "grad_norm": 20.84995460510254, "learning_rate": 2.1601126638500702e-07, "logits/chosen": -19.026439666748047, "logits/rejected": -18.783069610595703, "logps/chosen": -376.7286376953125, "logps/rejected": -352.7847900390625, "loss": 0.7867, "rewards/accuracies": 0.5, "rewards/chosen": 3.537351131439209, "rewards/margins": 0.6469793319702148, "rewards/rejected": 2.890371799468994, "step": 36720 }, { "epoch": 1.7052787966015135, "grad_norm": 56.94937515258789, "learning_rate": 2.1593388736710153e-07, "logits/chosen": -17.941974639892578, "logits/rejected": -17.520599365234375, "logps/chosen": -243.3471221923828, "logps/rejected": -194.9434356689453, "loss": 1.2996, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.8138763904571533, "rewards/margins": 0.11198077350854874, "rewards/rejected": 1.7018954753875732, "step": 36730 }, { "epoch": 1.7057430707089467, "grad_norm": 0.033643726259469986, "learning_rate": 2.1585650834919602e-07, "logits/chosen": -19.27825355529785, "logits/rejected": -18.380807876586914, "logps/chosen": -336.4454650878906, "logps/rejected": -260.9297790527344, "loss": 0.4418, "rewards/accuracies": 
0.800000011920929, "rewards/chosen": 4.163210868835449, "rewards/margins": 2.2656519412994385, "rewards/rejected": 1.897559404373169, "step": 36740 }, { "epoch": 1.7062073448163795, "grad_norm": 0.7862631678581238, "learning_rate": 2.1577912933129053e-07, "logits/chosen": -19.040740966796875, "logits/rejected": -18.58426284790039, "logps/chosen": -496.5771484375, "logps/rejected": -372.6551818847656, "loss": 0.6351, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.148965358734131, "rewards/margins": 0.9871259927749634, "rewards/rejected": 3.161839008331299, "step": 36750 }, { "epoch": 1.7066716189238127, "grad_norm": 89.06334686279297, "learning_rate": 2.15701750313385e-07, "logits/chosen": -20.47823143005371, "logits/rejected": -20.01723289489746, "logps/chosen": -358.57061767578125, "logps/rejected": -298.2204895019531, "loss": 0.7087, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.198636770248413, "rewards/margins": 0.5516250729560852, "rewards/rejected": 2.6470119953155518, "step": 36760 }, { "epoch": 1.7071358930312457, "grad_norm": 164.2744140625, "learning_rate": 2.156243712954795e-07, "logits/chosen": -19.330982208251953, "logits/rejected": -18.79717445373535, "logps/chosen": -482.3871154785156, "logps/rejected": -416.1319274902344, "loss": 1.9193, "rewards/accuracies": 0.5, "rewards/chosen": 3.2320423126220703, "rewards/margins": -0.6145657300949097, "rewards/rejected": 3.8466084003448486, "step": 36770 }, { "epoch": 1.7076001671386787, "grad_norm": 103.09037017822266, "learning_rate": 2.15546992277574e-07, "logits/chosen": -19.062152862548828, "logits/rejected": -18.476633071899414, "logps/chosen": -430.15582275390625, "logps/rejected": -354.13592529296875, "loss": 0.5638, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2987704277038574, "rewards/margins": 0.558763861656189, "rewards/rejected": 2.740006685256958, "step": 36780 }, { "epoch": 1.7080644412461117, "grad_norm": 34.55465316772461, "learning_rate": 
2.154696132596685e-07, "logits/chosen": -18.717187881469727, "logits/rejected": -17.43096923828125, "logps/chosen": -354.73797607421875, "logps/rejected": -267.6020812988281, "loss": 0.667, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.8234922885894775, "rewards/margins": 1.398367166519165, "rewards/rejected": 1.425125002861023, "step": 36790 }, { "epoch": 1.7085287153535447, "grad_norm": 257.7079772949219, "learning_rate": 2.15392234241763e-07, "logits/chosen": -18.92568588256836, "logits/rejected": -17.877044677734375, "logps/chosen": -359.0839538574219, "logps/rejected": -323.2008056640625, "loss": 0.5311, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.902538537979126, "rewards/margins": 1.2416012287139893, "rewards/rejected": 1.660936951637268, "step": 36800 }, { "epoch": 1.7089929894609779, "grad_norm": 32.765106201171875, "learning_rate": 2.153148552238575e-07, "logits/chosen": -18.749757766723633, "logits/rejected": -17.88119125366211, "logps/chosen": -503.3731384277344, "logps/rejected": -416.708740234375, "loss": 0.3979, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.579141616821289, "rewards/margins": 1.9403835535049438, "rewards/rejected": 2.6387581825256348, "step": 36810 }, { "epoch": 1.7094572635684107, "grad_norm": 37.47510528564453, "learning_rate": 2.1523747620595197e-07, "logits/chosen": -19.590524673461914, "logits/rejected": -18.958351135253906, "logps/chosen": -394.60150146484375, "logps/rejected": -381.7503356933594, "loss": 0.7182, "rewards/accuracies": 0.5, "rewards/chosen": 4.56776237487793, "rewards/margins": 0.9582312703132629, "rewards/rejected": 3.6095309257507324, "step": 36820 }, { "epoch": 1.7099215376758439, "grad_norm": 1.6396912336349487, "learning_rate": 2.1516009718804649e-07, "logits/chosen": -19.016681671142578, "logits/rejected": -17.620756149291992, "logps/chosen": -389.9831237792969, "logps/rejected": -280.6188049316406, "loss": 0.8094, "rewards/accuracies": 0.800000011920929, 
"rewards/chosen": 3.727483034133911, "rewards/margins": 1.8795263767242432, "rewards/rejected": 1.847956657409668, "step": 36830 }, { "epoch": 1.7103858117832769, "grad_norm": 67.70686340332031, "learning_rate": 2.1508271817014097e-07, "logits/chosen": -18.698841094970703, "logits/rejected": -18.868534088134766, "logps/chosen": -390.8171691894531, "logps/rejected": -382.91082763671875, "loss": 0.7751, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.0765128135681152, "rewards/margins": 0.6525183916091919, "rewards/rejected": 2.423994302749634, "step": 36840 }, { "epoch": 1.7108500858907099, "grad_norm": 74.38780975341797, "learning_rate": 2.1500533915223548e-07, "logits/chosen": -18.02829360961914, "logits/rejected": -17.627750396728516, "logps/chosen": -442.380859375, "logps/rejected": -330.04718017578125, "loss": 0.8507, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.803366184234619, "rewards/margins": 1.8020693063735962, "rewards/rejected": 2.0012972354888916, "step": 36850 }, { "epoch": 1.7113143599981429, "grad_norm": 22.63422966003418, "learning_rate": 2.1492796013432997e-07, "logits/chosen": -18.5543155670166, "logits/rejected": -18.40499496459961, "logps/chosen": -484.41168212890625, "logps/rejected": -521.1669921875, "loss": 1.0531, "rewards/accuracies": 0.5, "rewards/chosen": 4.362288951873779, "rewards/margins": 0.47619181871414185, "rewards/rejected": 3.8860976696014404, "step": 36860 }, { "epoch": 1.7117786341055758, "grad_norm": 165.487060546875, "learning_rate": 2.1485058111642445e-07, "logits/chosen": -18.31215476989746, "logits/rejected": -17.997053146362305, "logps/chosen": -325.83233642578125, "logps/rejected": -307.8849182128906, "loss": 1.5025, "rewards/accuracies": 0.5, "rewards/chosen": 3.1755175590515137, "rewards/margins": -0.2637370228767395, "rewards/rejected": 3.4392547607421875, "step": 36870 }, { "epoch": 1.712242908213009, "grad_norm": 85.03759765625, "learning_rate": 2.1477320209851896e-07, 
"logits/chosen": -19.43541145324707, "logits/rejected": -19.11455726623535, "logps/chosen": -367.72698974609375, "logps/rejected": -367.7350769042969, "loss": 0.9902, "rewards/accuracies": 0.5, "rewards/chosen": 3.28669810295105, "rewards/margins": 0.27301543951034546, "rewards/rejected": 3.0136826038360596, "step": 36880 }, { "epoch": 1.7127071823204418, "grad_norm": 65.64322662353516, "learning_rate": 2.1469582308061345e-07, "logits/chosen": -18.680335998535156, "logits/rejected": -17.234628677368164, "logps/chosen": -376.4737548828125, "logps/rejected": -248.5883026123047, "loss": 0.2533, "rewards/accuracies": 1.0, "rewards/chosen": 2.6902337074279785, "rewards/margins": 1.7473045587539673, "rewards/rejected": 0.9429292678833008, "step": 36890 }, { "epoch": 1.713171456427875, "grad_norm": 167.3829803466797, "learning_rate": 2.1461844406270796e-07, "logits/chosen": -18.027631759643555, "logits/rejected": -18.265010833740234, "logps/chosen": -374.85357666015625, "logps/rejected": -389.2226257324219, "loss": 0.6809, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.615548610687256, "rewards/margins": 0.2243233174085617, "rewards/rejected": 2.3912250995635986, "step": 36900 }, { "epoch": 1.713635730535308, "grad_norm": 19.05622100830078, "learning_rate": 2.1454106504480244e-07, "logits/chosen": -19.140636444091797, "logits/rejected": -18.94776153564453, "logps/chosen": -413.21240234375, "logps/rejected": -436.908203125, "loss": 1.1704, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.8976759910583496, "rewards/margins": 0.28430476784706116, "rewards/rejected": 3.6133713722229004, "step": 36910 }, { "epoch": 1.714100004642741, "grad_norm": 70.21186065673828, "learning_rate": 2.1446368602689693e-07, "logits/chosen": -18.913227081298828, "logits/rejected": -17.706544876098633, "logps/chosen": -409.7281188964844, "logps/rejected": -249.4738006591797, "loss": 0.3241, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.2794830799102783, 
"rewards/margins": 1.6885267496109009, "rewards/rejected": 1.5909565687179565, "step": 36920 }, { "epoch": 1.7145642787501743, "grad_norm": 143.12176513671875, "learning_rate": 2.1438630700899144e-07, "logits/chosen": -19.362085342407227, "logits/rejected": -17.99165916442871, "logps/chosen": -414.61578369140625, "logps/rejected": -338.8934631347656, "loss": 0.2973, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.475529670715332, "rewards/margins": 2.116616725921631, "rewards/rejected": 2.358912944793701, "step": 36930 }, { "epoch": 1.715028552857607, "grad_norm": 53.149593353271484, "learning_rate": 2.1430892799108592e-07, "logits/chosen": -19.112964630126953, "logits/rejected": -18.487056732177734, "logps/chosen": -583.6006469726562, "logps/rejected": -517.9869384765625, "loss": 1.2249, "rewards/accuracies": 0.5, "rewards/chosen": 4.446264266967773, "rewards/margins": 0.1314171850681305, "rewards/rejected": 4.314846992492676, "step": 36940 }, { "epoch": 1.7154928269650402, "grad_norm": 3.8505101203918457, "learning_rate": 2.142315489731804e-07, "logits/chosen": -19.091655731201172, "logits/rejected": -17.519996643066406, "logps/chosen": -488.82958984375, "logps/rejected": -328.5228271484375, "loss": 0.3445, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.651177406311035, "rewards/margins": 2.2870028018951416, "rewards/rejected": 2.3641743659973145, "step": 36950 }, { "epoch": 1.7159571010724732, "grad_norm": 8.67835521697998, "learning_rate": 2.1415416995527492e-07, "logits/chosen": -19.683300018310547, "logits/rejected": -19.258621215820312, "logps/chosen": -463.9615173339844, "logps/rejected": -390.51470947265625, "loss": 0.5923, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.783975124359131, "rewards/margins": 0.9293124079704285, "rewards/rejected": 3.8546626567840576, "step": 36960 }, { "epoch": 1.7164213751799062, "grad_norm": 6.461153030395508, "learning_rate": 2.140767909373694e-07, "logits/chosen": 
-19.30033302307129, "logits/rejected": -18.467472076416016, "logps/chosen": -366.3478698730469, "logps/rejected": -273.9032287597656, "loss": 0.6833, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.913759708404541, "rewards/margins": 0.5517138242721558, "rewards/rejected": 2.3620457649230957, "step": 36970 }, { "epoch": 1.7168856492873392, "grad_norm": 75.26233673095703, "learning_rate": 2.1399941191946392e-07, "logits/chosen": -19.390840530395508, "logits/rejected": -18.070451736450195, "logps/chosen": -409.1629943847656, "logps/rejected": -243.0378875732422, "loss": 0.2325, "rewards/accuracies": 1.0, "rewards/chosen": 4.034801006317139, "rewards/margins": 2.2318625450134277, "rewards/rejected": 1.8029382228851318, "step": 36980 }, { "epoch": 1.7173499233947722, "grad_norm": 204.3409423828125, "learning_rate": 2.139220329015584e-07, "logits/chosen": -18.674057006835938, "logits/rejected": -17.882177352905273, "logps/chosen": -406.0505676269531, "logps/rejected": -371.43267822265625, "loss": 1.5475, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.314204454421997, "rewards/margins": 0.28795790672302246, "rewards/rejected": 3.026247024536133, "step": 36990 }, { "epoch": 1.7178141975022054, "grad_norm": 27.960847854614258, "learning_rate": 2.138446538836529e-07, "logits/chosen": -18.289966583251953, "logits/rejected": -17.72786521911621, "logps/chosen": -247.68539428710938, "logps/rejected": -194.19754028320312, "loss": 0.3031, "rewards/accuracies": 1.0, "rewards/chosen": 1.8040955066680908, "rewards/margins": 1.4487786293029785, "rewards/rejected": 0.35531672835350037, "step": 37000 }, { "epoch": 1.7182784716096382, "grad_norm": 1.7203046083450317, "learning_rate": 2.137672748657474e-07, "logits/chosen": -18.21988296508789, "logits/rejected": -17.547935485839844, "logps/chosen": -434.40179443359375, "logps/rejected": -296.6982727050781, "loss": 1.0453, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.615757703781128, 
"rewards/margins": 1.0404595136642456, "rewards/rejected": 2.575298309326172, "step": 37010 }, { "epoch": 1.7187427457170714, "grad_norm": 106.27831268310547, "learning_rate": 2.1368989584784188e-07, "logits/chosen": -19.549644470214844, "logits/rejected": -17.547985076904297, "logps/chosen": -368.45458984375, "logps/rejected": -265.81561279296875, "loss": 1.1163, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.024659156799316, "rewards/margins": 1.9903484582901, "rewards/rejected": 2.034310817718506, "step": 37020 }, { "epoch": 1.7192070198245044, "grad_norm": 0.32579725980758667, "learning_rate": 2.136125168299364e-07, "logits/chosen": -19.4143009185791, "logits/rejected": -18.768291473388672, "logps/chosen": -442.52606201171875, "logps/rejected": -348.7960510253906, "loss": 0.5124, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.3249168395996094, "rewards/margins": 1.5011718273162842, "rewards/rejected": 1.823744773864746, "step": 37030 }, { "epoch": 1.7196712939319374, "grad_norm": 62.134925842285156, "learning_rate": 2.1353513781203088e-07, "logits/chosen": -18.783260345458984, "logits/rejected": -18.238222122192383, "logps/chosen": -484.07403564453125, "logps/rejected": -396.92120361328125, "loss": 0.5661, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.887906074523926, "rewards/margins": 1.2785388231277466, "rewards/rejected": 3.609367847442627, "step": 37040 }, { "epoch": 1.7201355680393704, "grad_norm": 26.992694854736328, "learning_rate": 2.1345775879412536e-07, "logits/chosen": -20.485502243041992, "logits/rejected": -19.506061553955078, "logps/chosen": -415.7943420410156, "logps/rejected": -326.8559265136719, "loss": 0.7721, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.55391788482666, "rewards/margins": 1.3498508930206299, "rewards/rejected": 3.204066753387451, "step": 37050 }, { "epoch": 1.7205998421468034, "grad_norm": 159.5838623046875, "learning_rate": 2.1338037977621988e-07, "logits/chosen": 
-19.81119155883789, "logits/rejected": -18.423580169677734, "logps/chosen": -462.83160400390625, "logps/rejected": -295.3925476074219, "loss": 0.672, "rewards/accuracies": 0.5, "rewards/chosen": 4.773592948913574, "rewards/margins": 1.1626248359680176, "rewards/rejected": 3.6109683513641357, "step": 37060 }, { "epoch": 1.7210641162542366, "grad_norm": 76.27303314208984, "learning_rate": 2.1330300075831436e-07, "logits/chosen": -18.976215362548828, "logits/rejected": -17.572046279907227, "logps/chosen": -354.8653869628906, "logps/rejected": -234.88204956054688, "loss": 0.5411, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.7618050575256348, "rewards/margins": 1.6687650680541992, "rewards/rejected": 1.0930402278900146, "step": 37070 }, { "epoch": 1.7215283903616694, "grad_norm": 2.6419870853424072, "learning_rate": 2.1322562174040887e-07, "logits/chosen": -19.193050384521484, "logits/rejected": -17.685287475585938, "logps/chosen": -472.852294921875, "logps/rejected": -333.0932922363281, "loss": 0.6297, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.086296081542969, "rewards/margins": 2.233454942703247, "rewards/rejected": 1.8528410196304321, "step": 37080 }, { "epoch": 1.7219926644691026, "grad_norm": 282.9467468261719, "learning_rate": 2.1314824272250336e-07, "logits/chosen": -19.608196258544922, "logits/rejected": -18.222097396850586, "logps/chosen": -373.7020568847656, "logps/rejected": -378.66558837890625, "loss": 1.0635, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.2821478843688965, "rewards/margins": 0.9304448962211609, "rewards/rejected": 2.351702928543091, "step": 37090 }, { "epoch": 1.7224569385765356, "grad_norm": 94.78925323486328, "learning_rate": 2.1307086370459784e-07, "logits/chosen": -19.96051597595215, "logits/rejected": -19.29084014892578, "logps/chosen": -311.655029296875, "logps/rejected": -242.7296905517578, "loss": 0.6855, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.2093799114227295, 
"rewards/margins": 1.2871659994125366, "rewards/rejected": 1.9222137928009033, "step": 37100 }, { "epoch": 1.7229212126839686, "grad_norm": 22.644994735717773, "learning_rate": 2.1299348468669235e-07, "logits/chosen": -18.356365203857422, "logits/rejected": -17.592670440673828, "logps/chosen": -400.307861328125, "logps/rejected": -326.0292053222656, "loss": 0.4386, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4667916297912598, "rewards/margins": 1.0488336086273193, "rewards/rejected": 2.4179580211639404, "step": 37110 }, { "epoch": 1.7233854867914018, "grad_norm": 0.2508104741573334, "learning_rate": 2.1291610566878684e-07, "logits/chosen": -18.645763397216797, "logits/rejected": -19.058645248413086, "logps/chosen": -388.36376953125, "logps/rejected": -404.575439453125, "loss": 1.6704, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.2964961528778076, "rewards/margins": -0.32725656032562256, "rewards/rejected": 3.623753070831299, "step": 37120 }, { "epoch": 1.7238497608988346, "grad_norm": 8.859411239624023, "learning_rate": 2.1283872665088135e-07, "logits/chosen": -18.3619384765625, "logits/rejected": -17.90546417236328, "logps/chosen": -326.8319091796875, "logps/rejected": -247.08438110351562, "loss": 0.7409, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.706670045852661, "rewards/margins": 1.3611751794815063, "rewards/rejected": 2.3454947471618652, "step": 37130 }, { "epoch": 1.7243140350062678, "grad_norm": 124.85717010498047, "learning_rate": 2.1276134763297583e-07, "logits/chosen": -18.77469253540039, "logits/rejected": -18.610431671142578, "logps/chosen": -404.2788391113281, "logps/rejected": -391.24542236328125, "loss": 1.0037, "rewards/accuracies": 0.5, "rewards/chosen": 2.0060527324676514, "rewards/margins": -0.014958620071411133, "rewards/rejected": 2.0210113525390625, "step": 37140 }, { "epoch": 1.7247783091137008, "grad_norm": 116.78489685058594, "learning_rate": 2.1268396861507032e-07, "logits/chosen": 
-18.879274368286133, "logits/rejected": -18.37262535095215, "logps/chosen": -394.3815612792969, "logps/rejected": -308.0113830566406, "loss": 1.0427, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.384822368621826, "rewards/margins": 0.8947356343269348, "rewards/rejected": 2.4900870323181152, "step": 37150 }, { "epoch": 1.7252425832211338, "grad_norm": 18.082191467285156, "learning_rate": 2.1260658959716483e-07, "logits/chosen": -19.318775177001953, "logits/rejected": -18.852787017822266, "logps/chosen": -343.734375, "logps/rejected": -252.715576171875, "loss": 0.414, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.219510316848755, "rewards/margins": 1.4318883419036865, "rewards/rejected": 1.7876218557357788, "step": 37160 }, { "epoch": 1.7257068573285668, "grad_norm": 190.26136779785156, "learning_rate": 2.1252921057925931e-07, "logits/chosen": -18.186132431030273, "logits/rejected": -18.053852081298828, "logps/chosen": -392.3628845214844, "logps/rejected": -331.3575134277344, "loss": 0.8274, "rewards/accuracies": 0.5, "rewards/chosen": 3.4438507556915283, "rewards/margins": 0.9531410336494446, "rewards/rejected": 2.4907095432281494, "step": 37170 }, { "epoch": 1.7261711314359998, "grad_norm": 207.02877807617188, "learning_rate": 2.1245183156135383e-07, "logits/chosen": -18.92496109008789, "logits/rejected": -18.952472686767578, "logps/chosen": -446.5216369628906, "logps/rejected": -422.10858154296875, "loss": 0.7328, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.123141288757324, "rewards/margins": 0.2299329787492752, "rewards/rejected": 3.8932087421417236, "step": 37180 }, { "epoch": 1.726635405543433, "grad_norm": 308.9150390625, "learning_rate": 2.1237445254344828e-07, "logits/chosen": -18.980052947998047, "logits/rejected": -18.8741397857666, "logps/chosen": -422.447021484375, "logps/rejected": -400.7697448730469, "loss": 1.0542, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.895702362060547, 
"rewards/margins": 1.1496179103851318, "rewards/rejected": 2.746084690093994, "step": 37190 }, { "epoch": 1.7270996796508657, "grad_norm": 74.6732406616211, "learning_rate": 2.122970735255428e-07, "logits/chosen": -19.103984832763672, "logits/rejected": -17.650609970092773, "logps/chosen": -489.260986328125, "logps/rejected": -294.8757629394531, "loss": 0.4798, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.129423141479492, "rewards/margins": 2.429676055908203, "rewards/rejected": 1.69974684715271, "step": 37200 }, { "epoch": 1.727563953758299, "grad_norm": 152.71983337402344, "learning_rate": 2.122196945076373e-07, "logits/chosen": -19.650882720947266, "logits/rejected": -18.807811737060547, "logps/chosen": -372.27545166015625, "logps/rejected": -360.5745544433594, "loss": 0.4482, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.8643927574157715, "rewards/margins": 1.2544130086898804, "rewards/rejected": 2.6099801063537598, "step": 37210 }, { "epoch": 1.728028227865732, "grad_norm": 43.30728530883789, "learning_rate": 2.121423154897318e-07, "logits/chosen": -18.366498947143555, "logits/rejected": -16.890338897705078, "logps/chosen": -415.6421813964844, "logps/rejected": -286.7775573730469, "loss": 0.4844, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1226768493652344, "rewards/margins": 1.4175206422805786, "rewards/rejected": 1.7051563262939453, "step": 37220 }, { "epoch": 1.728492501973165, "grad_norm": 78.61561584472656, "learning_rate": 2.120649364718263e-07, "logits/chosen": -18.72500228881836, "logits/rejected": -18.35687828063965, "logps/chosen": -458.5293884277344, "logps/rejected": -406.4479064941406, "loss": 0.5398, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.842461109161377, "rewards/margins": 1.0902131795883179, "rewards/rejected": 2.7522480487823486, "step": 37230 }, { "epoch": 1.728956776080598, "grad_norm": 84.64399719238281, "learning_rate": 2.1198755745392076e-07, "logits/chosen": 
-19.050777435302734, "logits/rejected": -18.511035919189453, "logps/chosen": -341.66448974609375, "logps/rejected": -262.52850341796875, "loss": 0.5049, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.6704792976379395, "rewards/margins": 1.052132248878479, "rewards/rejected": 1.618346929550171, "step": 37240 }, { "epoch": 1.729421050188031, "grad_norm": 42.524112701416016, "learning_rate": 2.1191017843601527e-07, "logits/chosen": -19.959449768066406, "logits/rejected": -17.810155868530273, "logps/chosen": -458.92437744140625, "logps/rejected": -298.6179504394531, "loss": 0.5112, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.391620635986328, "rewards/margins": 2.1871955394744873, "rewards/rejected": 2.2044248580932617, "step": 37250 }, { "epoch": 1.7298853242954642, "grad_norm": 52.59172821044922, "learning_rate": 2.1183279941810978e-07, "logits/chosen": -19.335281372070312, "logits/rejected": -16.922317504882812, "logps/chosen": -455.13079833984375, "logps/rejected": -283.0319519042969, "loss": 0.4435, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0444629192352295, "rewards/margins": 1.5394480228424072, "rewards/rejected": 1.5050147771835327, "step": 37260 }, { "epoch": 1.730349598402897, "grad_norm": 124.946044921875, "learning_rate": 2.1175542040020427e-07, "logits/chosen": -19.511362075805664, "logits/rejected": -18.802677154541016, "logps/chosen": -407.7912902832031, "logps/rejected": -369.3935852050781, "loss": 0.6107, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.8054728507995605, "rewards/margins": 1.4162201881408691, "rewards/rejected": 2.3892524242401123, "step": 37270 }, { "epoch": 1.7308138725103301, "grad_norm": 164.8895263671875, "learning_rate": 2.1167804138229878e-07, "logits/chosen": -17.92124366760254, "logits/rejected": -18.977359771728516, "logps/chosen": -278.4205017089844, "logps/rejected": -359.4671936035156, "loss": 1.533, "rewards/accuracies": 0.5, "rewards/chosen": 
2.7953410148620605, "rewards/margins": 0.12816444039344788, "rewards/rejected": 2.6671767234802246, "step": 37280 }, { "epoch": 1.7312781466177631, "grad_norm": 84.95022583007812, "learning_rate": 2.1160066236439324e-07, "logits/chosen": -19.208539962768555, "logits/rejected": -19.726276397705078, "logps/chosen": -409.51568603515625, "logps/rejected": -434.17352294921875, "loss": 1.0377, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.9852561950683594, "rewards/margins": 0.11400894820690155, "rewards/rejected": 3.8712470531463623, "step": 37290 }, { "epoch": 1.7317424207251961, "grad_norm": 6.446043014526367, "learning_rate": 2.1152328334648775e-07, "logits/chosen": -18.460031509399414, "logits/rejected": -18.715625762939453, "logps/chosen": -402.9241943359375, "logps/rejected": -315.48089599609375, "loss": 0.7711, "rewards/accuracies": 0.5, "rewards/chosen": 3.542062282562256, "rewards/margins": 1.0264835357666016, "rewards/rejected": 2.5155789852142334, "step": 37300 }, { "epoch": 1.7322066948326293, "grad_norm": 35.629188537597656, "learning_rate": 2.1144590432858226e-07, "logits/chosen": -19.438610076904297, "logits/rejected": -17.355316162109375, "logps/chosen": -421.4140625, "logps/rejected": -209.83544921875, "loss": 0.3813, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.8182125091552734, "rewards/margins": 2.279297113418579, "rewards/rejected": 1.5389155149459839, "step": 37310 }, { "epoch": 1.7326709689400621, "grad_norm": 95.26296997070312, "learning_rate": 2.1136852531067675e-07, "logits/chosen": -18.73419761657715, "logits/rejected": -18.462684631347656, "logps/chosen": -369.8423767089844, "logps/rejected": -366.83892822265625, "loss": 1.0039, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.594092607498169, "rewards/margins": -0.25842729210853577, "rewards/rejected": 2.8525195121765137, "step": 37320 }, { "epoch": 1.7331352430474953, "grad_norm": 35.396060943603516, "learning_rate": 2.1129114629277126e-07, 
"logits/chosen": -18.628679275512695, "logits/rejected": -17.432079315185547, "logps/chosen": -458.64862060546875, "logps/rejected": -318.6935119628906, "loss": 0.4929, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.7826895713806152, "rewards/margins": 2.1648221015930176, "rewards/rejected": 1.6178674697875977, "step": 37330 }, { "epoch": 1.733599517154928, "grad_norm": 9.375755310058594, "learning_rate": 2.1121376727486574e-07, "logits/chosen": -19.163715362548828, "logits/rejected": -17.654430389404297, "logps/chosen": -469.3279724121094, "logps/rejected": -338.75592041015625, "loss": 0.3702, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.5124053955078125, "rewards/margins": 2.504180431365967, "rewards/rejected": 2.0082249641418457, "step": 37340 }, { "epoch": 1.7340637912623613, "grad_norm": 190.724609375, "learning_rate": 2.1113638825696023e-07, "logits/chosen": -18.872142791748047, "logits/rejected": -17.640544891357422, "logps/chosen": -385.84613037109375, "logps/rejected": -275.5655822753906, "loss": 0.6333, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.343735933303833, "rewards/margins": 0.9165762066841125, "rewards/rejected": 2.4271597862243652, "step": 37350 }, { "epoch": 1.7345280653697943, "grad_norm": 80.2020492553711, "learning_rate": 2.1105900923905474e-07, "logits/chosen": -19.91586685180664, "logits/rejected": -17.873416900634766, "logps/chosen": -379.46600341796875, "logps/rejected": -253.54464721679688, "loss": 0.3259, "rewards/accuracies": 1.0, "rewards/chosen": 3.7731692790985107, "rewards/margins": 2.2069060802459717, "rewards/rejected": 1.566263198852539, "step": 37360 }, { "epoch": 1.7349923394772273, "grad_norm": 14.357043266296387, "learning_rate": 2.1098163022114922e-07, "logits/chosen": -19.45153045654297, "logits/rejected": -18.85259246826172, "logps/chosen": -326.79364013671875, "logps/rejected": -315.3156433105469, "loss": 1.0993, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 
2.4991652965545654, "rewards/margins": 0.24597148597240448, "rewards/rejected": 2.2531936168670654, "step": 37370 }, { "epoch": 1.7354566135846605, "grad_norm": 2.4298856258392334, "learning_rate": 2.1090425120324373e-07, "logits/chosen": -19.62998390197754, "logits/rejected": -18.33489990234375, "logps/chosen": -361.42242431640625, "logps/rejected": -293.5273132324219, "loss": 0.9059, "rewards/accuracies": 0.5, "rewards/chosen": 3.468294620513916, "rewards/margins": 1.2373406887054443, "rewards/rejected": 2.2309536933898926, "step": 37380 }, { "epoch": 1.7359208876920933, "grad_norm": 17.33690071105957, "learning_rate": 2.1082687218533822e-07, "logits/chosen": -19.25430679321289, "logits/rejected": -18.398967742919922, "logps/chosen": -350.1191101074219, "logps/rejected": -310.0328369140625, "loss": 0.8898, "rewards/accuracies": 0.5, "rewards/chosen": 2.8641998767852783, "rewards/margins": 0.4774976670742035, "rewards/rejected": 2.386702299118042, "step": 37390 }, { "epoch": 1.7363851617995265, "grad_norm": 28.222576141357422, "learning_rate": 2.107494931674327e-07, "logits/chosen": -19.042644500732422, "logits/rejected": -17.937442779541016, "logps/chosen": -356.9857177734375, "logps/rejected": -240.60073852539062, "loss": 0.2529, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.025566577911377, "rewards/margins": 2.2688090801239014, "rewards/rejected": 2.7567572593688965, "step": 37400 }, { "epoch": 1.7368494359069595, "grad_norm": 0.06511776894330978, "learning_rate": 2.1067211414952721e-07, "logits/chosen": -19.270870208740234, "logits/rejected": -17.60776138305664, "logps/chosen": -368.26422119140625, "logps/rejected": -256.08990478515625, "loss": 0.3534, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.083901405334473, "rewards/margins": 2.3054847717285156, "rewards/rejected": 1.778416395187378, "step": 37410 }, { "epoch": 1.7373137100143925, "grad_norm": 47.97732925415039, "learning_rate": 2.105947351316217e-07, "logits/chosen": 
-18.923242568969727, "logits/rejected": -18.609180450439453, "logps/chosen": -490.45025634765625, "logps/rejected": -491.03436279296875, "loss": 0.7826, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.353328227996826, "rewards/margins": 0.5175867080688477, "rewards/rejected": 3.8357417583465576, "step": 37420 }, { "epoch": 1.7377779841218255, "grad_norm": 24.711055755615234, "learning_rate": 2.1051735611371618e-07, "logits/chosen": -18.459318161010742, "logits/rejected": -17.917572021484375, "logps/chosen": -320.141845703125, "logps/rejected": -317.8948059082031, "loss": 0.8276, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.46418833732605, "rewards/margins": 0.7603057026863098, "rewards/rejected": 1.7038828134536743, "step": 37430 }, { "epoch": 1.7382422582292585, "grad_norm": 46.08196258544922, "learning_rate": 2.104399770958107e-07, "logits/chosen": -18.77798080444336, "logits/rejected": -18.328657150268555, "logps/chosen": -268.9912109375, "logps/rejected": -229.1240997314453, "loss": 0.6785, "rewards/accuracies": 0.5, "rewards/chosen": 2.5564181804656982, "rewards/margins": 1.0736335515975952, "rewards/rejected": 1.4827845096588135, "step": 37440 }, { "epoch": 1.7387065323366917, "grad_norm": 148.72572326660156, "learning_rate": 2.1036259807790518e-07, "logits/chosen": -19.310409545898438, "logits/rejected": -18.126293182373047, "logps/chosen": -448.6065979003906, "logps/rejected": -384.5547790527344, "loss": 0.667, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.333543062210083, "rewards/margins": 0.9573896527290344, "rewards/rejected": 1.3761533498764038, "step": 37450 }, { "epoch": 1.7391708064441245, "grad_norm": 18.566612243652344, "learning_rate": 2.102852190599997e-07, "logits/chosen": -18.64055824279785, "logits/rejected": -17.725013732910156, "logps/chosen": -557.689208984375, "logps/rejected": -470.3465881347656, "loss": 0.5188, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.251956462860107, 
"rewards/margins": 1.4949697256088257, "rewards/rejected": 2.7569868564605713, "step": 37460 }, { "epoch": 1.7396350805515577, "grad_norm": 103.49378967285156, "learning_rate": 2.1020784004209418e-07, "logits/chosen": -18.448013305664062, "logits/rejected": -18.085174560546875, "logps/chosen": -419.7818298339844, "logps/rejected": -458.73211669921875, "loss": 1.1514, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.9049553871154785, "rewards/margins": -0.004370951559394598, "rewards/rejected": 2.9093263149261475, "step": 37470 }, { "epoch": 1.7400993546589907, "grad_norm": 282.4964599609375, "learning_rate": 2.1013046102418866e-07, "logits/chosen": -18.521221160888672, "logits/rejected": -18.694740295410156, "logps/chosen": -572.3138427734375, "logps/rejected": -523.7191772460938, "loss": 1.5011, "rewards/accuracies": 0.5, "rewards/chosen": 4.317467212677002, "rewards/margins": 0.00899724941700697, "rewards/rejected": 4.308469772338867, "step": 37480 }, { "epoch": 1.7405636287664237, "grad_norm": 126.34719848632812, "learning_rate": 2.1005308200628317e-07, "logits/chosen": -20.138978958129883, "logits/rejected": -19.024700164794922, "logps/chosen": -563.0717163085938, "logps/rejected": -421.86126708984375, "loss": 0.3197, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.107358455657959, "rewards/margins": 1.7635208368301392, "rewards/rejected": 3.3438377380371094, "step": 37490 }, { "epoch": 1.7410279028738567, "grad_norm": 17.193628311157227, "learning_rate": 2.0997570298837766e-07, "logits/chosen": -18.824861526489258, "logits/rejected": -19.0135555267334, "logps/chosen": -259.1589660644531, "logps/rejected": -333.9880676269531, "loss": 1.1128, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.2799136638641357, "rewards/margins": 0.06139373779296875, "rewards/rejected": 2.218519926071167, "step": 37500 }, { "epoch": 1.7414921769812897, "grad_norm": 173.18922424316406, "learning_rate": 2.0989832397047217e-07, "logits/chosen": 
-18.036861419677734, "logits/rejected": -17.526912689208984, "logps/chosen": -460.8453063964844, "logps/rejected": -389.3782653808594, "loss": 1.5443, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 3.144066095352173, "rewards/margins": -0.021651053801178932, "rewards/rejected": 3.1657166481018066, "step": 37510 }, { "epoch": 1.7419564510887229, "grad_norm": 17.16690444946289, "learning_rate": 2.0982094495256665e-07, "logits/chosen": -19.06365966796875, "logits/rejected": -18.676311492919922, "logps/chosen": -375.8605651855469, "logps/rejected": -404.8424377441406, "loss": 1.1991, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9818975925445557, "rewards/margins": -0.11178513616323471, "rewards/rejected": 3.0936827659606934, "step": 37520 }, { "epoch": 1.7424207251961557, "grad_norm": 0.04008615016937256, "learning_rate": 2.0974356593466114e-07, "logits/chosen": -18.40169906616211, "logits/rejected": -17.428733825683594, "logps/chosen": -352.1160888671875, "logps/rejected": -238.48922729492188, "loss": 0.476, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.7454237937927246, "rewards/margins": 1.8939300775527954, "rewards/rejected": 0.8514933586120605, "step": 37530 }, { "epoch": 1.7428849993035889, "grad_norm": 164.0284423828125, "learning_rate": 2.0966618691675565e-07, "logits/chosen": -18.56894302368164, "logits/rejected": -18.564388275146484, "logps/chosen": -333.4556579589844, "logps/rejected": -328.5575866699219, "loss": 0.8458, "rewards/accuracies": 0.5, "rewards/chosen": 2.9041247367858887, "rewards/margins": 0.27600982785224915, "rewards/rejected": 2.628115177154541, "step": 37540 }, { "epoch": 1.7433492734110219, "grad_norm": 73.45747375488281, "learning_rate": 2.0958880789885013e-07, "logits/chosen": -18.71501922607422, "logits/rejected": -18.913251876831055, "logps/chosen": -373.0672302246094, "logps/rejected": -486.5210876464844, "loss": 1.4339, "rewards/accuracies": 0.5, "rewards/chosen": 2.870244264602661, 
"rewards/margins": -0.6320688128471375, "rewards/rejected": 3.5023128986358643, "step": 37550 }, { "epoch": 1.7438135475184549, "grad_norm": 0.6195703148841858, "learning_rate": 2.0951142888094465e-07, "logits/chosen": -18.896183013916016, "logits/rejected": -17.75090217590332, "logps/chosen": -412.6888732910156, "logps/rejected": -330.7779235839844, "loss": 0.6165, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.311337947845459, "rewards/margins": 1.6112877130508423, "rewards/rejected": 2.7000505924224854, "step": 37560 }, { "epoch": 1.744277821625888, "grad_norm": 104.4993667602539, "learning_rate": 2.0943404986303913e-07, "logits/chosen": -18.17720603942871, "logits/rejected": -17.0325984954834, "logps/chosen": -386.2756652832031, "logps/rejected": -230.2197723388672, "loss": 0.8527, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.9335403442382812, "rewards/margins": 1.667738914489746, "rewards/rejected": 1.265802025794983, "step": 37570 }, { "epoch": 1.7447420957333208, "grad_norm": 21.171520233154297, "learning_rate": 2.0935667084513362e-07, "logits/chosen": -18.932218551635742, "logits/rejected": -17.514978408813477, "logps/chosen": -430.12451171875, "logps/rejected": -281.10101318359375, "loss": 0.4256, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.92497181892395, "rewards/margins": 1.621335744857788, "rewards/rejected": 2.303635835647583, "step": 37580 }, { "epoch": 1.745206369840754, "grad_norm": 87.11384582519531, "learning_rate": 2.0927929182722813e-07, "logits/chosen": -19.79942512512207, "logits/rejected": -19.495494842529297, "logps/chosen": -368.45794677734375, "logps/rejected": -352.6942443847656, "loss": 1.3843, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.748575210571289, "rewards/margins": 0.10646400600671768, "rewards/rejected": 3.6421115398406982, "step": 37590 }, { "epoch": 1.745670643948187, "grad_norm": 62.293922424316406, "learning_rate": 2.092019128093226e-07, "logits/chosen": 
-19.50873374938965, "logits/rejected": -18.35093116760254, "logps/chosen": -542.7542724609375, "logps/rejected": -358.0865173339844, "loss": 0.363, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.243614673614502, "rewards/margins": 2.7071468830108643, "rewards/rejected": 2.5364675521850586, "step": 37600 }, { "epoch": 1.74613491805562, "grad_norm": 30.506200790405273, "learning_rate": 2.0912453379141712e-07, "logits/chosen": -18.734477996826172, "logits/rejected": -17.788705825805664, "logps/chosen": -404.743896484375, "logps/rejected": -354.9695739746094, "loss": 0.3744, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.884230375289917, "rewards/margins": 1.6550906896591187, "rewards/rejected": 2.2291393280029297, "step": 37610 }, { "epoch": 1.746599192163053, "grad_norm": 203.8961639404297, "learning_rate": 2.0904715477351158e-07, "logits/chosen": -18.75650405883789, "logits/rejected": -18.545021057128906, "logps/chosen": -469.890380859375, "logps/rejected": -405.6210632324219, "loss": 0.9486, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.404752731323242, "rewards/margins": 0.2518579363822937, "rewards/rejected": 3.1528944969177246, "step": 37620 }, { "epoch": 1.747063466270486, "grad_norm": 4.4086198806762695, "learning_rate": 2.089697757556061e-07, "logits/chosen": -19.157432556152344, "logits/rejected": -19.150985717773438, "logps/chosen": -404.5616455078125, "logps/rejected": -336.40325927734375, "loss": 0.7735, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.1717417240142822, "rewards/margins": 0.5501238703727722, "rewards/rejected": 2.6216177940368652, "step": 37630 }, { "epoch": 1.7475277403779192, "grad_norm": 37.68844985961914, "learning_rate": 2.088923967377006e-07, "logits/chosen": -19.770875930786133, "logits/rejected": -18.743349075317383, "logps/chosen": -444.373046875, "logps/rejected": -363.8746337890625, "loss": 0.5097, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 
4.267777442932129, "rewards/margins": 1.4196933507919312, "rewards/rejected": 2.848083972930908, "step": 37640 }, { "epoch": 1.747992014485352, "grad_norm": 8.578107833862305, "learning_rate": 2.088150177197951e-07, "logits/chosen": -18.572038650512695, "logits/rejected": -17.712629318237305, "logps/chosen": -350.49993896484375, "logps/rejected": -278.71539306640625, "loss": 0.5113, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.792724847793579, "rewards/margins": 1.0394926071166992, "rewards/rejected": 1.7532317638397217, "step": 37650 }, { "epoch": 1.7484562885927852, "grad_norm": 53.2607421875, "learning_rate": 2.087376387018896e-07, "logits/chosen": -19.526601791381836, "logits/rejected": -19.595409393310547, "logps/chosen": -337.5574035644531, "logps/rejected": -346.720458984375, "loss": 1.3423, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.8129401206970215, "rewards/margins": 0.21247467398643494, "rewards/rejected": 2.6004652976989746, "step": 37660 }, { "epoch": 1.7489205627002182, "grad_norm": 201.6190185546875, "learning_rate": 2.0866025968398406e-07, "logits/chosen": -19.90593910217285, "logits/rejected": -18.63787841796875, "logps/chosen": -335.7894287109375, "logps/rejected": -243.9749755859375, "loss": 0.2929, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.693892002105713, "rewards/margins": 1.8741579055786133, "rewards/rejected": 1.8197338581085205, "step": 37670 }, { "epoch": 1.7493848368076512, "grad_norm": 21.834218978881836, "learning_rate": 2.0858288066607857e-07, "logits/chosen": -18.90606117248535, "logits/rejected": -18.862491607666016, "logps/chosen": -384.2298889160156, "logps/rejected": -390.84051513671875, "loss": 0.5313, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.229414463043213, "rewards/margins": 0.5529485940933228, "rewards/rejected": 1.6764657497406006, "step": 37680 }, { "epoch": 1.7498491109150842, "grad_norm": 59.08091354370117, "learning_rate": 2.0850550164817308e-07, 
"logits/chosen": -19.634538650512695, "logits/rejected": -18.95229148864746, "logps/chosen": -439.75201416015625, "logps/rejected": -307.5457458496094, "loss": 0.4903, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.030610084533691, "rewards/margins": 1.5816059112548828, "rewards/rejected": 2.4490044116973877, "step": 37690 }, { "epoch": 1.7503133850225172, "grad_norm": 26.033174514770508, "learning_rate": 2.0842812263026757e-07, "logits/chosen": -18.874679565429688, "logits/rejected": -17.935714721679688, "logps/chosen": -393.1689453125, "logps/rejected": -289.1618347167969, "loss": 0.5858, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.6221511363983154, "rewards/margins": 1.7066848278045654, "rewards/rejected": 1.915466070175171, "step": 37700 }, { "epoch": 1.7507776591299504, "grad_norm": 287.3941955566406, "learning_rate": 2.0835074361236208e-07, "logits/chosen": -18.6653995513916, "logits/rejected": -19.214662551879883, "logps/chosen": -443.34112548828125, "logps/rejected": -444.04315185546875, "loss": 1.2053, "rewards/accuracies": 0.5, "rewards/chosen": 3.8442935943603516, "rewards/margins": -0.050058625638484955, "rewards/rejected": 3.8943519592285156, "step": 37710 }, { "epoch": 1.7512419332373832, "grad_norm": 151.799072265625, "learning_rate": 2.0827336459445654e-07, "logits/chosen": -19.146596908569336, "logits/rejected": -18.90952491760254, "logps/chosen": -260.00689697265625, "logps/rejected": -253.43283081054688, "loss": 0.8263, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.9124104976654053, "rewards/margins": 0.11267454922199249, "rewards/rejected": 1.7997362613677979, "step": 37720 }, { "epoch": 1.7517062073448164, "grad_norm": 131.55897521972656, "learning_rate": 2.0819598557655105e-07, "logits/chosen": -18.26826286315918, "logits/rejected": -18.200748443603516, "logps/chosen": -287.8888854980469, "logps/rejected": -263.1617736816406, "loss": 1.0398, "rewards/accuracies": 0.6000000238418579, 
"rewards/chosen": 2.4813437461853027, "rewards/margins": 0.4768550992012024, "rewards/rejected": 2.004488468170166, "step": 37730 }, { "epoch": 1.7521704814522494, "grad_norm": 158.08499145507812, "learning_rate": 2.0811860655864556e-07, "logits/chosen": -18.85816192626953, "logits/rejected": -18.574188232421875, "logps/chosen": -391.60321044921875, "logps/rejected": -339.34429931640625, "loss": 1.8062, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.4683902263641357, "rewards/margins": -0.07449746131896973, "rewards/rejected": 3.5428874492645264, "step": 37740 }, { "epoch": 1.7526347555596824, "grad_norm": 6.855625629425049, "learning_rate": 2.0804122754074004e-07, "logits/chosen": -19.54962730407715, "logits/rejected": -19.33487319946289, "logps/chosen": -428.72222900390625, "logps/rejected": -406.3287048339844, "loss": 0.8357, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.4229750633239746, "rewards/margins": 0.2112623006105423, "rewards/rejected": 3.2117130756378174, "step": 37750 }, { "epoch": 1.7530990296671156, "grad_norm": 193.5024871826172, "learning_rate": 2.0796384852283455e-07, "logits/chosen": -18.407634735107422, "logits/rejected": -17.88027000427246, "logps/chosen": -330.49127197265625, "logps/rejected": -289.67681884765625, "loss": 0.6557, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.0361857414245605, "rewards/margins": 0.4992343485355377, "rewards/rejected": 1.5369514226913452, "step": 37760 }, { "epoch": 1.7535633037745484, "grad_norm": 173.82066345214844, "learning_rate": 2.07886469504929e-07, "logits/chosen": -19.943445205688477, "logits/rejected": -19.15176010131836, "logps/chosen": -421.04010009765625, "logps/rejected": -345.49969482421875, "loss": 0.4074, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.069678783416748, "rewards/margins": 1.7424129247665405, "rewards/rejected": 2.327265977859497, "step": 37770 }, { "epoch": 1.7540275778819816, "grad_norm": 39.56620407104492, 
"learning_rate": 2.0780909048702352e-07, "logits/chosen": -19.382068634033203, "logits/rejected": -18.2496395111084, "logps/chosen": -508.17352294921875, "logps/rejected": -399.45025634765625, "loss": 0.242, "rewards/accuracies": 1.0, "rewards/chosen": 4.308411121368408, "rewards/margins": 2.0187721252441406, "rewards/rejected": 2.2896392345428467, "step": 37780 }, { "epoch": 1.7544918519894146, "grad_norm": 43.964141845703125, "learning_rate": 2.0773171146911803e-07, "logits/chosen": -20.056522369384766, "logits/rejected": -19.68222999572754, "logps/chosen": -324.08087158203125, "logps/rejected": -301.64678955078125, "loss": 0.7944, "rewards/accuracies": 0.5, "rewards/chosen": 3.6158015727996826, "rewards/margins": 0.37433719635009766, "rewards/rejected": 3.241464138031006, "step": 37790 }, { "epoch": 1.7549561260968476, "grad_norm": 69.18574523925781, "learning_rate": 2.0765433245121252e-07, "logits/chosen": -19.253664016723633, "logits/rejected": -19.231849670410156, "logps/chosen": -378.7952575683594, "logps/rejected": -352.1202697753906, "loss": 0.7942, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.4664604663848877, "rewards/margins": 1.1802767515182495, "rewards/rejected": 2.2861835956573486, "step": 37800 }, { "epoch": 1.7554204002042806, "grad_norm": 74.86248779296875, "learning_rate": 2.0757695343330703e-07, "logits/chosen": -19.55742835998535, "logits/rejected": -18.752723693847656, "logps/chosen": -517.0465698242188, "logps/rejected": -413.63037109375, "loss": 0.7777, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 5.086173057556152, "rewards/margins": 0.7240846753120422, "rewards/rejected": 4.362088203430176, "step": 37810 }, { "epoch": 1.7558846743117136, "grad_norm": 53.11309814453125, "learning_rate": 2.074995744154015e-07, "logits/chosen": -18.313953399658203, "logits/rejected": -17.831722259521484, "logps/chosen": -274.76690673828125, "logps/rejected": -224.50582885742188, "loss": 1.2032, "rewards/accuracies": 
0.699999988079071, "rewards/chosen": 2.772678852081299, "rewards/margins": 0.9674604535102844, "rewards/rejected": 1.8052184581756592, "step": 37820 }, { "epoch": 1.7563489484191468, "grad_norm": 119.5842056274414, "learning_rate": 2.07422195397496e-07, "logits/chosen": -17.855792999267578, "logits/rejected": -17.598663330078125, "logps/chosen": -322.7893371582031, "logps/rejected": -355.38421630859375, "loss": 1.3924, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.6953775882720947, "rewards/margins": 0.21571533381938934, "rewards/rejected": 2.4796621799468994, "step": 37830 }, { "epoch": 1.7568132225265796, "grad_norm": 203.5782928466797, "learning_rate": 2.073448163795905e-07, "logits/chosen": -18.31607437133789, "logits/rejected": -17.26864242553711, "logps/chosen": -311.1189270019531, "logps/rejected": -236.0132598876953, "loss": 1.0363, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0866575241088867, "rewards/margins": 1.0657896995544434, "rewards/rejected": 2.0208678245544434, "step": 37840 }, { "epoch": 1.7572774966340128, "grad_norm": 99.82759857177734, "learning_rate": 2.07267437361685e-07, "logits/chosen": -18.92304039001465, "logits/rejected": -17.91167640686035, "logps/chosen": -437.795654296875, "logps/rejected": -412.58477783203125, "loss": 0.6979, "rewards/accuracies": 0.5, "rewards/chosen": 4.245448589324951, "rewards/margins": 1.560821294784546, "rewards/rejected": 2.6846280097961426, "step": 37850 }, { "epoch": 1.7577417707414458, "grad_norm": 24.074804306030273, "learning_rate": 2.0719005834377948e-07, "logits/chosen": -19.479110717773438, "logits/rejected": -17.32010841369629, "logps/chosen": -460.0487365722656, "logps/rejected": -238.14865112304688, "loss": 0.4476, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.964024782180786, "rewards/margins": 2.5951929092407227, "rewards/rejected": 1.3688321113586426, "step": 37860 }, { "epoch": 1.7582060448488788, "grad_norm": 76.19294738769531, "learning_rate": 
2.0711267932587397e-07, "logits/chosen": -18.85213279724121, "logits/rejected": -18.658451080322266, "logps/chosen": -407.11590576171875, "logps/rejected": -317.44586181640625, "loss": 1.5513, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.3170535564422607, "rewards/margins": 0.6183274984359741, "rewards/rejected": 2.698725700378418, "step": 37870 }, { "epoch": 1.7586703189563118, "grad_norm": 199.13031005859375, "learning_rate": 2.0703530030796848e-07, "logits/chosen": -19.1280574798584, "logits/rejected": -18.53670310974121, "logps/chosen": -265.0286560058594, "logps/rejected": -243.8231964111328, "loss": 0.9698, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.341911792755127, "rewards/margins": 0.5358208417892456, "rewards/rejected": 1.8060909509658813, "step": 37880 }, { "epoch": 1.7591345930637448, "grad_norm": 57.71940612792969, "learning_rate": 2.06957921290063e-07, "logits/chosen": -18.243234634399414, "logits/rejected": -17.643085479736328, "logps/chosen": -428.4041442871094, "logps/rejected": -300.95672607421875, "loss": 0.1956, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.471827030181885, "rewards/margins": 2.67380952835083, "rewards/rejected": 1.798017144203186, "step": 37890 }, { "epoch": 1.759598867171178, "grad_norm": 229.52438354492188, "learning_rate": 2.0688054227215747e-07, "logits/chosen": -19.384342193603516, "logits/rejected": -18.80238914489746, "logps/chosen": -428.696044921875, "logps/rejected": -366.92718505859375, "loss": 0.9157, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.3774962425231934, "rewards/margins": 0.32189464569091797, "rewards/rejected": 3.0556018352508545, "step": 37900 }, { "epoch": 1.7600631412786107, "grad_norm": 20.518951416015625, "learning_rate": 2.0680316325425196e-07, "logits/chosen": -19.247509002685547, "logits/rejected": -19.98319435119629, "logps/chosen": -289.308349609375, "logps/rejected": -334.2695007324219, "loss": 1.1292, "rewards/accuracies": 
0.4000000059604645, "rewards/chosen": 3.1753876209259033, "rewards/margins": -0.15809965133666992, "rewards/rejected": 3.3334872722625732, "step": 37910 }, { "epoch": 1.760527415386044, "grad_norm": 25.470111846923828, "learning_rate": 2.0672578423634644e-07, "logits/chosen": -18.931293487548828, "logits/rejected": -18.430038452148438, "logps/chosen": -347.404541015625, "logps/rejected": -261.13055419921875, "loss": 0.5367, "rewards/accuracies": 0.5, "rewards/chosen": 2.7504007816314697, "rewards/margins": 1.2400065660476685, "rewards/rejected": 1.5103940963745117, "step": 37920 }, { "epoch": 1.760991689493477, "grad_norm": 223.1666259765625, "learning_rate": 2.0664840521844095e-07, "logits/chosen": -17.86359214782715, "logits/rejected": -18.211284637451172, "logps/chosen": -307.435546875, "logps/rejected": -336.0517272949219, "loss": 0.9542, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.826734781265259, "rewards/margins": 0.362014502286911, "rewards/rejected": 2.4647202491760254, "step": 37930 }, { "epoch": 1.76145596360091, "grad_norm": 11.115640640258789, "learning_rate": 2.0657102620053547e-07, "logits/chosen": -19.036766052246094, "logits/rejected": -18.58123779296875, "logps/chosen": -434.2434997558594, "logps/rejected": -375.2203369140625, "loss": 0.2329, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.888867139816284, "rewards/margins": 1.841963768005371, "rewards/rejected": 2.046903610229492, "step": 37940 }, { "epoch": 1.7619202377083432, "grad_norm": 4.628462314605713, "learning_rate": 2.0649364718262995e-07, "logits/chosen": -19.150814056396484, "logits/rejected": -18.33791160583496, "logps/chosen": -361.21368408203125, "logps/rejected": -292.9229431152344, "loss": 0.6931, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.7227158546447754, "rewards/margins": 1.4491955041885376, "rewards/rejected": 2.2735202312469482, "step": 37950 }, { "epoch": 1.762384511815776, "grad_norm": 84.84703826904297, "learning_rate": 
2.0641626816472444e-07, "logits/chosen": -20.48573875427246, "logits/rejected": -18.657644271850586, "logps/chosen": -344.55438232421875, "logps/rejected": -265.2488098144531, "loss": 0.3566, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.4546051025390625, "rewards/margins": 1.510675311088562, "rewards/rejected": 1.943929672241211, "step": 37960 }, { "epoch": 1.7628487859232091, "grad_norm": 5.474244117736816, "learning_rate": 2.0633888914681895e-07, "logits/chosen": -19.307268142700195, "logits/rejected": -18.685409545898438, "logps/chosen": -372.9040222167969, "logps/rejected": -319.06439208984375, "loss": 0.5423, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.6846413612365723, "rewards/margins": 2.2320499420166016, "rewards/rejected": 1.4525911808013916, "step": 37970 }, { "epoch": 1.7633130600306421, "grad_norm": 7.384393215179443, "learning_rate": 2.0626151012891343e-07, "logits/chosen": -18.387348175048828, "logits/rejected": -17.7783203125, "logps/chosen": -379.13763427734375, "logps/rejected": -321.88031005859375, "loss": 0.5711, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0985281467437744, "rewards/margins": 1.0573049783706665, "rewards/rejected": 2.0412230491638184, "step": 37980 }, { "epoch": 1.7637773341380751, "grad_norm": 116.44029235839844, "learning_rate": 2.0618413111100794e-07, "logits/chosen": -18.409502029418945, "logits/rejected": -19.252477645874023, "logps/chosen": -368.68499755859375, "logps/rejected": -456.9693298339844, "loss": 1.2112, "rewards/accuracies": 0.10000000149011612, "rewards/chosen": 3.289271593093872, "rewards/margins": -0.7494107484817505, "rewards/rejected": 4.03868293762207, "step": 37990 }, { "epoch": 1.7642416082455081, "grad_norm": 22.329051971435547, "learning_rate": 2.0610675209310243e-07, "logits/chosen": -18.30140495300293, "logits/rejected": -18.112884521484375, "logps/chosen": -368.7573547363281, "logps/rejected": -341.4781494140625, "loss": 1.1806, 
"rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.5856893062591553, "rewards/margins": -0.057517100125551224, "rewards/rejected": 2.643206834793091, "step": 38000 }, { "epoch": 1.7647058823529411, "grad_norm": 207.3396759033203, "learning_rate": 2.060293730751969e-07, "logits/chosen": -18.75204086303711, "logits/rejected": -18.267484664916992, "logps/chosen": -330.5824279785156, "logps/rejected": -313.26287841796875, "loss": 0.7841, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.437462568283081, "rewards/margins": 0.9885059595108032, "rewards/rejected": 2.448956251144409, "step": 38010 }, { "epoch": 1.7651701564603743, "grad_norm": 21.30625343322754, "learning_rate": 2.0595199405729142e-07, "logits/chosen": -19.116613388061523, "logits/rejected": -18.015544891357422, "logps/chosen": -389.5267333984375, "logps/rejected": -242.3553009033203, "loss": 0.2584, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.2839951515197754, "rewards/margins": 2.2339189052581787, "rewards/rejected": 1.0500757694244385, "step": 38020 }, { "epoch": 1.7656344305678071, "grad_norm": 37.61703872680664, "learning_rate": 2.058746150393859e-07, "logits/chosen": -20.7560977935791, "logits/rejected": -20.10686683654785, "logps/chosen": -452.2276306152344, "logps/rejected": -465.91943359375, "loss": 0.7352, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.827206134796143, "rewards/margins": 0.6588584184646606, "rewards/rejected": 4.168347358703613, "step": 38030 }, { "epoch": 1.7660987046752403, "grad_norm": 85.68672180175781, "learning_rate": 2.0579723602148042e-07, "logits/chosen": -18.741838455200195, "logits/rejected": -18.21881103515625, "logps/chosen": -278.34637451171875, "logps/rejected": -246.02627563476562, "loss": 0.7123, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.411255359649658, "rewards/margins": 1.091806173324585, "rewards/rejected": 1.3194491863250732, "step": 38040 }, { "epoch": 1.7665629787826733, "grad_norm": 
56.13552474975586, "learning_rate": 2.057198570035749e-07, "logits/chosen": -18.903263092041016, "logits/rejected": -18.117794036865234, "logps/chosen": -445.12017822265625, "logps/rejected": -312.9195861816406, "loss": 0.496, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.727930784225464, "rewards/margins": 0.9906622171401978, "rewards/rejected": 2.7372686862945557, "step": 38050 }, { "epoch": 1.7670272528901063, "grad_norm": 117.87336730957031, "learning_rate": 2.056424779856694e-07, "logits/chosen": -19.557682037353516, "logits/rejected": -18.444000244140625, "logps/chosen": -348.140380859375, "logps/rejected": -307.58978271484375, "loss": 0.4441, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0656023025512695, "rewards/margins": 0.944214940071106, "rewards/rejected": 2.121387481689453, "step": 38060 }, { "epoch": 1.7674915269975393, "grad_norm": 49.574974060058594, "learning_rate": 2.055650989677639e-07, "logits/chosen": -18.882007598876953, "logits/rejected": -18.62546157836914, "logps/chosen": -397.63592529296875, "logps/rejected": -324.46435546875, "loss": 0.4695, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4533743858337402, "rewards/margins": 0.9704380035400391, "rewards/rejected": 2.4829366207122803, "step": 38070 }, { "epoch": 1.7679558011049723, "grad_norm": 190.62783813476562, "learning_rate": 2.0548771994985839e-07, "logits/chosen": -19.016653060913086, "logits/rejected": -18.138826370239258, "logps/chosen": -396.04681396484375, "logps/rejected": -337.39752197265625, "loss": 0.706, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.435601234436035, "rewards/margins": 0.9083470106124878, "rewards/rejected": 3.527254819869995, "step": 38080 }, { "epoch": 1.7684200752124055, "grad_norm": 180.64193725585938, "learning_rate": 2.054103409319529e-07, "logits/chosen": -19.28117561340332, "logits/rejected": -17.64571762084961, "logps/chosen": -441.02960205078125, "logps/rejected": -353.5116882324219, 
"loss": 0.7668, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.523454189300537, "rewards/margins": 1.2775230407714844, "rewards/rejected": 3.2459311485290527, "step": 38090 }, { "epoch": 1.7688843493198383, "grad_norm": 113.19271087646484, "learning_rate": 2.0533296191404736e-07, "logits/chosen": -19.802927017211914, "logits/rejected": -18.358722686767578, "logps/chosen": -428.22052001953125, "logps/rejected": -301.2130432128906, "loss": 0.428, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.219503402709961, "rewards/margins": 2.0932843685150146, "rewards/rejected": 2.1262190341949463, "step": 38100 }, { "epoch": 1.7693486234272715, "grad_norm": 5.372422218322754, "learning_rate": 2.0525558289614187e-07, "logits/chosen": -18.669811248779297, "logits/rejected": -17.749900817871094, "logps/chosen": -382.31610107421875, "logps/rejected": -291.5544128417969, "loss": 1.1332, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0469272136688232, "rewards/margins": 0.5619236826896667, "rewards/rejected": 2.4850034713745117, "step": 38110 }, { "epoch": 1.7698128975347045, "grad_norm": 173.54638671875, "learning_rate": 2.0518594178002692e-07, "logits/chosen": -18.824853897094727, "logits/rejected": -18.94536590576172, "logps/chosen": -451.36883544921875, "logps/rejected": -444.41998291015625, "loss": 1.6649, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": 2.203704357147217, "rewards/margins": -1.016643762588501, "rewards/rejected": 3.220348358154297, "step": 38120 }, { "epoch": 1.7702771716421375, "grad_norm": 9.635600090026855, "learning_rate": 2.051085627621214e-07, "logits/chosen": -18.43622398376465, "logits/rejected": -18.487688064575195, "logps/chosen": -356.2784729003906, "logps/rejected": -336.80108642578125, "loss": 1.3436, "rewards/accuracies": 0.5, "rewards/chosen": 2.4996306896209717, "rewards/margins": -0.10816202312707901, "rewards/rejected": 2.607792377471924, "step": 38130 }, { "epoch": 1.7707414457495707, 
"grad_norm": 114.14290618896484, "learning_rate": 2.050311837442159e-07, "logits/chosen": -20.25130844116211, "logits/rejected": -20.28887367248535, "logps/chosen": -439.326171875, "logps/rejected": -421.28826904296875, "loss": 0.6792, "rewards/accuracies": 0.5, "rewards/chosen": 5.493134021759033, "rewards/margins": 0.55095374584198, "rewards/rejected": 4.942180156707764, "step": 38140 }, { "epoch": 1.7712057198570035, "grad_norm": 20.623659133911133, "learning_rate": 2.049538047263104e-07, "logits/chosen": -18.449146270751953, "logits/rejected": -17.70958709716797, "logps/chosen": -348.0647277832031, "logps/rejected": -229.5446319580078, "loss": 0.7044, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.515714406967163, "rewards/margins": 1.3095976114273071, "rewards/rejected": 1.2061166763305664, "step": 38150 }, { "epoch": 1.7716699939644367, "grad_norm": 182.70462036132812, "learning_rate": 2.048764257084049e-07, "logits/chosen": -19.40150260925293, "logits/rejected": -19.012434005737305, "logps/chosen": -560.6380004882812, "logps/rejected": -493.3968200683594, "loss": 0.5525, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.507032871246338, "rewards/margins": 1.0848125219345093, "rewards/rejected": 3.4222207069396973, "step": 38160 }, { "epoch": 1.7721342680718695, "grad_norm": 5.672117233276367, "learning_rate": 2.047990466904994e-07, "logits/chosen": -19.279998779296875, "logits/rejected": -18.773588180541992, "logps/chosen": -379.4125061035156, "logps/rejected": -293.8102111816406, "loss": 0.5381, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.951040744781494, "rewards/margins": 1.6523892879486084, "rewards/rejected": 2.2986512184143066, "step": 38170 }, { "epoch": 1.7725985421793027, "grad_norm": 195.04293823242188, "learning_rate": 2.0472166767259388e-07, "logits/chosen": -19.90140151977539, "logits/rejected": -19.43124008178711, "logps/chosen": -415.00732421875, "logps/rejected": -328.2076721191406, "loss": 0.9136, 
"rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.088928699493408, "rewards/margins": 0.899976372718811, "rewards/rejected": 3.1889524459838867, "step": 38180 }, { "epoch": 1.7730628162867357, "grad_norm": 59.076541900634766, "learning_rate": 2.046442886546884e-07, "logits/chosen": -18.15921974182129, "logits/rejected": -17.574970245361328, "logps/chosen": -359.1155700683594, "logps/rejected": -294.677734375, "loss": 0.4209, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.8391780853271484, "rewards/margins": 1.4728076457977295, "rewards/rejected": 2.366370439529419, "step": 38190 }, { "epoch": 1.7735270903941687, "grad_norm": 292.789306640625, "learning_rate": 2.0456690963678287e-07, "logits/chosen": -19.331972122192383, "logits/rejected": -18.764158248901367, "logps/chosen": -539.523193359375, "logps/rejected": -452.91845703125, "loss": 1.0838, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.636933326721191, "rewards/margins": 0.028692150488495827, "rewards/rejected": 4.608241081237793, "step": 38200 }, { "epoch": 1.7739913645016019, "grad_norm": 17.93087387084961, "learning_rate": 2.0448953061887739e-07, "logits/chosen": -18.581804275512695, "logits/rejected": -17.84625816345215, "logps/chosen": -371.2036437988281, "logps/rejected": -342.9272766113281, "loss": 0.6813, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.6772098541259766, "rewards/margins": 0.9842535257339478, "rewards/rejected": 1.6929563283920288, "step": 38210 }, { "epoch": 1.7744556386090347, "grad_norm": 0.06820342689752579, "learning_rate": 2.0441215160097187e-07, "logits/chosen": -18.573577880859375, "logits/rejected": -17.116674423217773, "logps/chosen": -375.2980651855469, "logps/rejected": -290.19866943359375, "loss": 1.0287, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.738466262817383, "rewards/margins": 1.0865777730941772, "rewards/rejected": 2.651888370513916, "step": 38220 }, { "epoch": 1.7749199127164679, "grad_norm": 
198.35641479492188, "learning_rate": 2.0433477258306636e-07, "logits/chosen": -19.21920394897461, "logits/rejected": -19.495258331298828, "logps/chosen": -282.16192626953125, "logps/rejected": -253.38394165039062, "loss": 1.2566, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.303612232208252, "rewards/margins": -0.08365680277347565, "rewards/rejected": 2.3872687816619873, "step": 38230 }, { "epoch": 1.7753841868239009, "grad_norm": 45.95980453491211, "learning_rate": 2.0425739356516087e-07, "logits/chosen": -18.672399520874023, "logits/rejected": -18.159223556518555, "logps/chosen": -332.163330078125, "logps/rejected": -352.7796325683594, "loss": 0.7395, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9979684352874756, "rewards/margins": 0.8174102902412415, "rewards/rejected": 2.180558443069458, "step": 38240 }, { "epoch": 1.7758484609313339, "grad_norm": 17.27371597290039, "learning_rate": 2.0418001454725535e-07, "logits/chosen": -19.39834213256836, "logits/rejected": -18.020139694213867, "logps/chosen": -379.03814697265625, "logps/rejected": -258.6978759765625, "loss": 0.2256, "rewards/accuracies": 1.0, "rewards/chosen": 4.435862064361572, "rewards/margins": 2.014939069747925, "rewards/rejected": 2.4209225177764893, "step": 38250 }, { "epoch": 1.7763127350387669, "grad_norm": 11.215555191040039, "learning_rate": 2.0410263552934986e-07, "logits/chosen": -18.550029754638672, "logits/rejected": -17.841129302978516, "logps/chosen": -325.2254638671875, "logps/rejected": -293.386474609375, "loss": 1.019, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.7447154521942139, "rewards/margins": 0.08241093158721924, "rewards/rejected": 1.6623045206069946, "step": 38260 }, { "epoch": 1.7767770091461998, "grad_norm": 92.93366241455078, "learning_rate": 2.0402525651144435e-07, "logits/chosen": -19.42653465270996, "logits/rejected": -18.097637176513672, "logps/chosen": -398.89959716796875, "logps/rejected": -316.10748291015625, "loss": 
0.2063, "rewards/accuracies": 1.0, "rewards/chosen": 4.285168170928955, "rewards/margins": 1.9392725229263306, "rewards/rejected": 2.345895767211914, "step": 38270 }, { "epoch": 1.777241283253633, "grad_norm": 1.8651448488235474, "learning_rate": 2.0394787749353883e-07, "logits/chosen": -19.765670776367188, "logits/rejected": -17.70887565612793, "logps/chosen": -483.4180603027344, "logps/rejected": -317.66156005859375, "loss": 0.2029, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.844848155975342, "rewards/margins": 2.8218719959259033, "rewards/rejected": 2.0229761600494385, "step": 38280 }, { "epoch": 1.7777055573610658, "grad_norm": 22.38959503173828, "learning_rate": 2.0387049847563334e-07, "logits/chosen": -19.137897491455078, "logits/rejected": -17.607410430908203, "logps/chosen": -407.17059326171875, "logps/rejected": -293.286865234375, "loss": 0.3426, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.6270461082458496, "rewards/margins": 2.2838425636291504, "rewards/rejected": 1.3432035446166992, "step": 38290 }, { "epoch": 1.778169831468499, "grad_norm": 303.40362548828125, "learning_rate": 2.0379311945772783e-07, "logits/chosen": -18.471582412719727, "logits/rejected": -18.55230140686035, "logps/chosen": -374.71087646484375, "logps/rejected": -395.4823303222656, "loss": 1.0904, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.568483591079712, "rewards/margins": 0.13917537033557892, "rewards/rejected": 2.4293081760406494, "step": 38300 }, { "epoch": 1.778634105575932, "grad_norm": 54.807212829589844, "learning_rate": 2.0371574043982234e-07, "logits/chosen": -19.406953811645508, "logits/rejected": -18.549596786499023, "logps/chosen": -384.6416015625, "logps/rejected": -280.60052490234375, "loss": 0.3243, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.731549263000488, "rewards/margins": 2.167001485824585, "rewards/rejected": 2.564547300338745, "step": 38310 }, { "epoch": 1.779098379683365, "grad_norm": 
57.32366943359375, "learning_rate": 2.0363836142191682e-07, "logits/chosen": -17.918954849243164, "logits/rejected": -17.724843978881836, "logps/chosen": -367.7953186035156, "logps/rejected": -399.41949462890625, "loss": 1.1187, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.8284590244293213, "rewards/margins": 0.19259171187877655, "rewards/rejected": 2.6358673572540283, "step": 38320 }, { "epoch": 1.779562653790798, "grad_norm": 21.947290420532227, "learning_rate": 2.035609824040113e-07, "logits/chosen": -19.342439651489258, "logits/rejected": -17.55227279663086, "logps/chosen": -511.4542541503906, "logps/rejected": -331.144775390625, "loss": 0.1999, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.693923473358154, "rewards/margins": 2.6923558712005615, "rewards/rejected": 2.001568078994751, "step": 38330 }, { "epoch": 1.780026927898231, "grad_norm": 65.61348724365234, "learning_rate": 2.0348360338610582e-07, "logits/chosen": -18.490859985351562, "logits/rejected": -17.545059204101562, "logps/chosen": -323.6346130371094, "logps/rejected": -209.26687622070312, "loss": 0.5008, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.7404818534851074, "rewards/margins": 1.7499992847442627, "rewards/rejected": 1.9904823303222656, "step": 38340 }, { "epoch": 1.7804912020056642, "grad_norm": 153.11636352539062, "learning_rate": 2.034062243682003e-07, "logits/chosen": -20.013601303100586, "logits/rejected": -18.947853088378906, "logps/chosen": -500.5025329589844, "logps/rejected": -410.8103942871094, "loss": 0.5345, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 5.078940391540527, "rewards/margins": 1.1132726669311523, "rewards/rejected": 3.965668201446533, "step": 38350 }, { "epoch": 1.780955476113097, "grad_norm": 14.532683372497559, "learning_rate": 2.033288453502948e-07, "logits/chosen": -19.529483795166016, "logits/rejected": -18.337467193603516, "logps/chosen": -379.2301330566406, "logps/rejected": -301.143310546875, 
"loss": 0.6191, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.652604579925537, "rewards/margins": 1.3942296504974365, "rewards/rejected": 2.2583751678466797, "step": 38360 }, { "epoch": 1.7814197502205302, "grad_norm": 3.7759268283843994, "learning_rate": 2.032514663323893e-07, "logits/chosen": -18.95234489440918, "logits/rejected": -18.077590942382812, "logps/chosen": -439.30389404296875, "logps/rejected": -405.04718017578125, "loss": 0.4596, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.9293606281280518, "rewards/margins": 1.59043550491333, "rewards/rejected": 2.33892560005188, "step": 38370 }, { "epoch": 1.7818840243279632, "grad_norm": 36.27803039550781, "learning_rate": 2.0317408731448379e-07, "logits/chosen": -18.324222564697266, "logits/rejected": -18.079967498779297, "logps/chosen": -346.4898986816406, "logps/rejected": -379.67913818359375, "loss": 1.2169, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.2974231243133545, "rewards/margins": 0.01557002030313015, "rewards/rejected": 2.281853199005127, "step": 38380 }, { "epoch": 1.7823482984353962, "grad_norm": 135.70701599121094, "learning_rate": 2.030967082965783e-07, "logits/chosen": -19.46451187133789, "logits/rejected": -18.867061614990234, "logps/chosen": -416.60736083984375, "logps/rejected": -304.0669250488281, "loss": 0.8655, "rewards/accuracies": 0.5, "rewards/chosen": 3.210932970046997, "rewards/margins": 1.0899070501327515, "rewards/rejected": 2.121026039123535, "step": 38390 }, { "epoch": 1.7828125725428294, "grad_norm": 73.87947082519531, "learning_rate": 2.030193292786728e-07, "logits/chosen": -20.557018280029297, "logits/rejected": -19.18590545654297, "logps/chosen": -359.2771301269531, "logps/rejected": -278.81634521484375, "loss": 0.4562, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.362010955810547, "rewards/margins": 1.4059879779815674, "rewards/rejected": 2.9560234546661377, "step": 38400 }, { "epoch": 1.7832768466502622, 
"grad_norm": 0.37353312969207764, "learning_rate": 2.0294195026076727e-07, "logits/chosen": -19.286649703979492, "logits/rejected": -17.74978256225586, "logps/chosen": -363.19342041015625, "logps/rejected": -220.012939453125, "loss": 0.1917, "rewards/accuracies": 1.0, "rewards/chosen": 4.042052745819092, "rewards/margins": 2.237666606903076, "rewards/rejected": 1.8043861389160156, "step": 38410 }, { "epoch": 1.7837411207576954, "grad_norm": 19.263042449951172, "learning_rate": 2.0286457124286178e-07, "logits/chosen": -18.384428024291992, "logits/rejected": -18.569543838500977, "logps/chosen": -240.5528106689453, "logps/rejected": -321.5003662109375, "loss": 1.54, "rewards/accuracies": 0.5, "rewards/chosen": 2.2479803562164307, "rewards/margins": -0.24252894520759583, "rewards/rejected": 2.490509510040283, "step": 38420 }, { "epoch": 1.7842053948651284, "grad_norm": 48.781368255615234, "learning_rate": 2.0278719222495626e-07, "logits/chosen": -19.29886245727539, "logits/rejected": -18.677425384521484, "logps/chosen": -419.9747009277344, "logps/rejected": -355.43463134765625, "loss": 0.9204, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.507497310638428, "rewards/margins": 0.67180335521698, "rewards/rejected": 3.8356940746307373, "step": 38430 }, { "epoch": 1.7846696689725614, "grad_norm": 175.9037628173828, "learning_rate": 2.0270981320705078e-07, "logits/chosen": -18.772489547729492, "logits/rejected": -18.616924285888672, "logps/chosen": -327.61724853515625, "logps/rejected": -312.4500427246094, "loss": 1.0057, "rewards/accuracies": 0.5, "rewards/chosen": 2.8188421726226807, "rewards/margins": 0.12642788887023926, "rewards/rejected": 2.6924142837524414, "step": 38440 }, { "epoch": 1.7851339430799944, "grad_norm": 207.08673095703125, "learning_rate": 2.0263243418914529e-07, "logits/chosen": -18.354799270629883, "logits/rejected": -18.005586624145508, "logps/chosen": -358.28338623046875, "logps/rejected": -352.1625061035156, "loss": 0.9355, 
"rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.650836944580078, "rewards/margins": -0.005662429146468639, "rewards/rejected": 3.6564993858337402, "step": 38450 }, { "epoch": 1.7855982171874274, "grad_norm": 30.10774803161621, "learning_rate": 2.0255505517123974e-07, "logits/chosen": -19.127134323120117, "logits/rejected": -18.84768295288086, "logps/chosen": -341.66412353515625, "logps/rejected": -295.89080810546875, "loss": 1.1882, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.7670698165893555, "rewards/margins": 0.12422160804271698, "rewards/rejected": 2.642848253250122, "step": 38460 }, { "epoch": 1.7860624912948606, "grad_norm": 99.00920104980469, "learning_rate": 2.0247767615333426e-07, "logits/chosen": -18.58677101135254, "logits/rejected": -18.510372161865234, "logps/chosen": -467.50714111328125, "logps/rejected": -424.532470703125, "loss": 0.7368, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.3850550651550293, "rewards/margins": 0.4351624846458435, "rewards/rejected": 2.949892520904541, "step": 38470 }, { "epoch": 1.7865267654022934, "grad_norm": 132.34439086914062, "learning_rate": 2.0240029713542874e-07, "logits/chosen": -18.937223434448242, "logits/rejected": -18.41851806640625, "logps/chosen": -400.59637451171875, "logps/rejected": -322.50384521484375, "loss": 0.7524, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.940746545791626, "rewards/margins": 0.2577349543571472, "rewards/rejected": 2.683011293411255, "step": 38480 }, { "epoch": 1.7869910395097266, "grad_norm": 1.168752670288086, "learning_rate": 2.0232291811752325e-07, "logits/chosen": -19.07823944091797, "logits/rejected": -17.782997131347656, "logps/chosen": -421.7450256347656, "logps/rejected": -283.984619140625, "loss": 0.3793, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.6961982250213623, "rewards/margins": 1.873888373374939, "rewards/rejected": 1.8223097324371338, "step": 38490 }, { "epoch": 1.7874553136171596, 
"grad_norm": 85.877685546875, "learning_rate": 2.0224553909961776e-07, "logits/chosen": -19.594173431396484, "logits/rejected": -19.276302337646484, "logps/chosen": -362.4216613769531, "logps/rejected": -273.77008056640625, "loss": 0.4306, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.898136854171753, "rewards/margins": 0.9618889093399048, "rewards/rejected": 2.936248302459717, "step": 38500 }, { "epoch": 1.7879195877245926, "grad_norm": 0.17808575928211212, "learning_rate": 2.0216816008171222e-07, "logits/chosen": -18.648487091064453, "logits/rejected": -18.581600189208984, "logps/chosen": -396.50323486328125, "logps/rejected": -398.0650634765625, "loss": 1.106, "rewards/accuracies": 0.5, "rewards/chosen": 3.5803592205047607, "rewards/margins": 0.3064785599708557, "rewards/rejected": 3.27388072013855, "step": 38510 }, { "epoch": 1.7883838618320256, "grad_norm": 137.29244995117188, "learning_rate": 2.0209078106380673e-07, "logits/chosen": -19.54570198059082, "logits/rejected": -19.42352294921875, "logps/chosen": -355.71270751953125, "logps/rejected": -265.85772705078125, "loss": 0.6881, "rewards/accuracies": 0.5, "rewards/chosen": 2.921321392059326, "rewards/margins": 0.6028732061386108, "rewards/rejected": 2.318448066711426, "step": 38520 }, { "epoch": 1.7888481359394586, "grad_norm": 77.779052734375, "learning_rate": 2.0201340204590122e-07, "logits/chosen": -19.044574737548828, "logits/rejected": -18.189529418945312, "logps/chosen": -352.91510009765625, "logps/rejected": -277.568115234375, "loss": 0.8624, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.588392734527588, "rewards/margins": 0.9230436086654663, "rewards/rejected": 2.665349006652832, "step": 38530 }, { "epoch": 1.7893124100468918, "grad_norm": 155.31246948242188, "learning_rate": 2.0193602302799573e-07, "logits/chosen": -20.429336547851562, "logits/rejected": -20.152910232543945, "logps/chosen": -368.9073486328125, "logps/rejected": -327.49603271484375, "loss": 0.9736, 
"rewards/accuracies": 0.5, "rewards/chosen": 3.0508460998535156, "rewards/margins": 0.26003846526145935, "rewards/rejected": 2.7908074855804443, "step": 38540 }, { "epoch": 1.7897766841543246, "grad_norm": 11.52861213684082, "learning_rate": 2.0185864401009021e-07, "logits/chosen": -18.23111343383789, "logits/rejected": -17.726642608642578, "logps/chosen": -364.81304931640625, "logps/rejected": -320.9477844238281, "loss": 0.6595, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.021472454071045, "rewards/margins": 1.55782151222229, "rewards/rejected": 1.4636505842208862, "step": 38550 }, { "epoch": 1.7902409582617578, "grad_norm": 0.12070293724536896, "learning_rate": 2.017812649921847e-07, "logits/chosen": -19.08674430847168, "logits/rejected": -18.343402862548828, "logps/chosen": -352.9628601074219, "logps/rejected": -294.3601379394531, "loss": 0.8644, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.1679844856262207, "rewards/margins": 1.2584803104400635, "rewards/rejected": 1.9095041751861572, "step": 38560 }, { "epoch": 1.7907052323691908, "grad_norm": 56.25167465209961, "learning_rate": 2.017038859742792e-07, "logits/chosen": -20.52395248413086, "logits/rejected": -19.406936645507812, "logps/chosen": -359.0385437011719, "logps/rejected": -299.29949951171875, "loss": 0.4358, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.8718090057373047, "rewards/margins": 1.4191581010818481, "rewards/rejected": 2.452651023864746, "step": 38570 }, { "epoch": 1.7911695064766238, "grad_norm": 127.97477722167969, "learning_rate": 2.016265069563737e-07, "logits/chosen": -19.241313934326172, "logits/rejected": -18.763492584228516, "logps/chosen": -482.7789611816406, "logps/rejected": -414.993896484375, "loss": 0.6822, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.6248064041137695, "rewards/margins": 0.7329580783843994, "rewards/rejected": 3.89184832572937, "step": 38580 }, { "epoch": 1.791633780584057, "grad_norm": 
32.88682174682617, "learning_rate": 2.015491279384682e-07, "logits/chosen": -19.152250289916992, "logits/rejected": -18.656269073486328, "logps/chosen": -410.64788818359375, "logps/rejected": -360.834716796875, "loss": 0.6843, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5510621070861816, "rewards/margins": 0.7492156624794006, "rewards/rejected": 2.8018462657928467, "step": 38590 }, { "epoch": 1.7920980546914898, "grad_norm": 107.10880279541016, "learning_rate": 2.014717489205627e-07, "logits/chosen": -18.874069213867188, "logits/rejected": -17.89988899230957, "logps/chosen": -412.85235595703125, "logps/rejected": -281.4095764160156, "loss": 0.6478, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.184046745300293, "rewards/margins": 1.2859694957733154, "rewards/rejected": 1.898077368736267, "step": 38600 }, { "epoch": 1.792562328798923, "grad_norm": 80.35179138183594, "learning_rate": 2.0139436990265718e-07, "logits/chosen": -19.02704429626465, "logits/rejected": -19.03426170349121, "logps/chosen": -341.75323486328125, "logps/rejected": -383.64892578125, "loss": 0.801, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.185652494430542, "rewards/margins": 0.6011330485343933, "rewards/rejected": 2.584519624710083, "step": 38610 }, { "epoch": 1.793026602906356, "grad_norm": 4.449008941650391, "learning_rate": 2.013169908847517e-07, "logits/chosen": -19.724178314208984, "logits/rejected": -18.339954376220703, "logps/chosen": -287.5868835449219, "logps/rejected": -261.187255859375, "loss": 0.5525, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.3973374366760254, "rewards/margins": 1.543021321296692, "rewards/rejected": 1.854315996170044, "step": 38620 }, { "epoch": 1.793490877013789, "grad_norm": 199.94033813476562, "learning_rate": 2.0123961186684617e-07, "logits/chosen": -17.723703384399414, "logits/rejected": -18.36686134338379, "logps/chosen": -389.95623779296875, "logps/rejected": -410.63897705078125, "loss": 
1.271, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.7179043292999268, "rewards/margins": -0.23438873887062073, "rewards/rejected": 3.9522929191589355, "step": 38630 }, { "epoch": 1.793955151121222, "grad_norm": 18.279502868652344, "learning_rate": 2.0116223284894068e-07, "logits/chosen": -18.44837760925293, "logits/rejected": -17.305706024169922, "logps/chosen": -399.25604248046875, "logps/rejected": -281.22869873046875, "loss": 0.581, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7098841667175293, "rewards/margins": 1.2001712322235107, "rewards/rejected": 2.5097134113311768, "step": 38640 }, { "epoch": 1.794419425228655, "grad_norm": 75.37955474853516, "learning_rate": 2.0108485383103517e-07, "logits/chosen": -19.472492218017578, "logits/rejected": -18.59141731262207, "logps/chosen": -265.71856689453125, "logps/rejected": -275.00433349609375, "loss": 0.5666, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.812216281890869, "rewards/margins": 1.3734747171401978, "rewards/rejected": 1.4387414455413818, "step": 38650 }, { "epoch": 1.7948836993360882, "grad_norm": 0.6057223081588745, "learning_rate": 2.0100747481312965e-07, "logits/chosen": -19.349706649780273, "logits/rejected": -18.429691314697266, "logps/chosen": -415.2662048339844, "logps/rejected": -287.32672119140625, "loss": 0.4636, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.759183883666992, "rewards/margins": 1.6015586853027344, "rewards/rejected": 2.157625198364258, "step": 38660 }, { "epoch": 1.795347973443521, "grad_norm": 0.2540885806083679, "learning_rate": 2.0093009579522416e-07, "logits/chosen": -18.715335845947266, "logits/rejected": -17.95545196533203, "logps/chosen": -395.88055419921875, "logps/rejected": -276.20330810546875, "loss": 0.8267, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.44148063659668, "rewards/margins": 2.5158112049102783, "rewards/rejected": 1.9256699085235596, "step": 38670 }, { "epoch": 
1.7958122475509541, "grad_norm": 11.018974304199219, "learning_rate": 2.0085271677731865e-07, "logits/chosen": -19.76954460144043, "logits/rejected": -18.640161514282227, "logps/chosen": -284.65814208984375, "logps/rejected": -276.5276794433594, "loss": 0.6469, "rewards/accuracies": 0.5, "rewards/chosen": 2.6071958541870117, "rewards/margins": 0.6159030199050903, "rewards/rejected": 1.9912925958633423, "step": 38680 }, { "epoch": 1.7962765216583871, "grad_norm": 191.39291381835938, "learning_rate": 2.0077533775941316e-07, "logits/chosen": -19.005714416503906, "logits/rejected": -17.957855224609375, "logps/chosen": -369.97344970703125, "logps/rejected": -237.0208740234375, "loss": 0.3871, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.391407012939453, "rewards/margins": 1.892085313796997, "rewards/rejected": 1.499321699142456, "step": 38690 }, { "epoch": 1.7967407957658201, "grad_norm": 79.9142074584961, "learning_rate": 2.0069795874150765e-07, "logits/chosen": -18.752229690551758, "logits/rejected": -17.742671966552734, "logps/chosen": -351.955322265625, "logps/rejected": -316.56024169921875, "loss": 0.7714, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6816680431365967, "rewards/margins": 1.4350311756134033, "rewards/rejected": 2.2466368675231934, "step": 38700 }, { "epoch": 1.7972050698732531, "grad_norm": 268.1784973144531, "learning_rate": 2.0062057972360213e-07, "logits/chosen": -18.876691818237305, "logits/rejected": -17.883901596069336, "logps/chosen": -373.3984375, "logps/rejected": -307.8428955078125, "loss": 1.0434, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.515810489654541, "rewards/margins": 0.47803568840026855, "rewards/rejected": 3.0377750396728516, "step": 38710 }, { "epoch": 1.7976693439806861, "grad_norm": 8.496176719665527, "learning_rate": 2.0054320070569664e-07, "logits/chosen": -19.475858688354492, "logits/rejected": -17.521230697631836, "logps/chosen": -351.87933349609375, "logps/rejected": 
-196.1007843017578, "loss": 0.5046, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.907733678817749, "rewards/margins": 2.006810426712036, "rewards/rejected": 0.9009234309196472, "step": 38720 }, { "epoch": 1.7981336180881193, "grad_norm": 29.491134643554688, "learning_rate": 2.0046582168779113e-07, "logits/chosen": -19.274700164794922, "logits/rejected": -18.628124237060547, "logps/chosen": -314.81134033203125, "logps/rejected": -324.31365966796875, "loss": 0.7352, "rewards/accuracies": 0.5, "rewards/chosen": 3.512676239013672, "rewards/margins": 0.3799781799316406, "rewards/rejected": 3.132697582244873, "step": 38730 }, { "epoch": 1.798597892195552, "grad_norm": 114.3528060913086, "learning_rate": 2.0038844266988564e-07, "logits/chosen": -19.395469665527344, "logits/rejected": -19.075279235839844, "logps/chosen": -362.34918212890625, "logps/rejected": -307.2751159667969, "loss": 0.7946, "rewards/accuracies": 0.5, "rewards/chosen": 2.960019111633301, "rewards/margins": 0.4748761057853699, "rewards/rejected": 2.4851431846618652, "step": 38740 }, { "epoch": 1.7990621663029853, "grad_norm": 129.32577514648438, "learning_rate": 2.0031106365198012e-07, "logits/chosen": -18.69876480102539, "logits/rejected": -17.744121551513672, "logps/chosen": -506.9571228027344, "logps/rejected": -348.34454345703125, "loss": 0.5541, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.0760083198547363, "rewards/margins": 0.8277075886726379, "rewards/rejected": 2.248300790786743, "step": 38750 }, { "epoch": 1.7995264404104183, "grad_norm": 255.70535278320312, "learning_rate": 2.002336846340746e-07, "logits/chosen": -19.624923706054688, "logits/rejected": -18.045970916748047, "logps/chosen": -479.52984619140625, "logps/rejected": -338.70294189453125, "loss": 0.5045, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.8413987159729, "rewards/margins": 2.7074432373046875, "rewards/rejected": 2.133955478668213, "step": 38760 }, { "epoch": 1.7999907145178513, 
"grad_norm": 67.58258819580078, "learning_rate": 2.0015630561616912e-07, "logits/chosen": -19.716218948364258, "logits/rejected": -17.553647994995117, "logps/chosen": -401.4753112792969, "logps/rejected": -222.7960968017578, "loss": 0.1344, "rewards/accuracies": 1.0, "rewards/chosen": 4.1705217361450195, "rewards/margins": 2.5838165283203125, "rewards/rejected": 1.5867050886154175, "step": 38770 }, { "epoch": 1.8004549886252845, "grad_norm": 197.37660217285156, "learning_rate": 2.000789265982636e-07, "logits/chosen": -19.295299530029297, "logits/rejected": -19.7413330078125, "logps/chosen": -403.621337890625, "logps/rejected": -368.11529541015625, "loss": 1.0553, "rewards/accuracies": 0.5, "rewards/chosen": 3.2540297508239746, "rewards/margins": 0.12696535885334015, "rewards/rejected": 3.1270642280578613, "step": 38780 }, { "epoch": 1.8009192627327173, "grad_norm": 1.2450523376464844, "learning_rate": 2.000015475803581e-07, "logits/chosen": -19.18217658996582, "logits/rejected": -19.19996452331543, "logps/chosen": -330.0047302246094, "logps/rejected": -319.7760009765625, "loss": 0.8526, "rewards/accuracies": 0.5, "rewards/chosen": 2.003913640975952, "rewards/margins": 0.2186312973499298, "rewards/rejected": 1.7852823734283447, "step": 38790 }, { "epoch": 1.8013835368401505, "grad_norm": 53.870880126953125, "learning_rate": 1.999241685624526e-07, "logits/chosen": -19.813602447509766, "logits/rejected": -18.81062126159668, "logps/chosen": -289.547119140625, "logps/rejected": -236.97109985351562, "loss": 0.5235, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.9993705749511719, "rewards/margins": 1.131888508796692, "rewards/rejected": 0.8674820065498352, "step": 38800 }, { "epoch": 1.8018478109475835, "grad_norm": 25.929105758666992, "learning_rate": 1.9984678954454708e-07, "logits/chosen": -19.21023178100586, "logits/rejected": -18.640546798706055, "logps/chosen": -355.7806091308594, "logps/rejected": -295.31158447265625, "loss": 0.6779, 
"rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.99017333984375, "rewards/margins": 0.6724130511283875, "rewards/rejected": 2.3177599906921387, "step": 38810 }, { "epoch": 1.8023120850550165, "grad_norm": 12.560155868530273, "learning_rate": 1.997694105266416e-07, "logits/chosen": -19.350744247436523, "logits/rejected": -18.30569839477539, "logps/chosen": -436.3818359375, "logps/rejected": -324.3484802246094, "loss": 0.5208, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.2499918937683105, "rewards/margins": 1.6573009490966797, "rewards/rejected": 2.592690944671631, "step": 38820 }, { "epoch": 1.8027763591624495, "grad_norm": 1.3603887557983398, "learning_rate": 1.9969203150873608e-07, "logits/chosen": -18.445648193359375, "logits/rejected": -17.24856185913086, "logps/chosen": -345.0345458984375, "logps/rejected": -282.3761291503906, "loss": 0.7699, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.8265063762664795, "rewards/margins": 1.486889362335205, "rewards/rejected": 2.3396172523498535, "step": 38830 }, { "epoch": 1.8032406332698825, "grad_norm": 27.87067985534668, "learning_rate": 1.9961465249083057e-07, "logits/chosen": -18.99393653869629, "logits/rejected": -18.637842178344727, "logps/chosen": -321.2824401855469, "logps/rejected": -300.50494384765625, "loss": 0.7403, "rewards/accuracies": 0.5, "rewards/chosen": 3.3848178386688232, "rewards/margins": 0.2982962727546692, "rewards/rejected": 3.086521625518799, "step": 38840 }, { "epoch": 1.8037049073773157, "grad_norm": 225.27511596679688, "learning_rate": 1.9953727347292508e-07, "logits/chosen": -18.747398376464844, "logits/rejected": -19.42104721069336, "logps/chosen": -414.36700439453125, "logps/rejected": -535.0435791015625, "loss": 1.2945, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.740117311477661, "rewards/margins": -0.28730568289756775, "rewards/rejected": 4.027422904968262, "step": 38850 }, { "epoch": 1.8041691814847485, "grad_norm": 
9.806913375854492, "learning_rate": 1.9945989445501956e-07, "logits/chosen": -19.7357120513916, "logits/rejected": -17.934463500976562, "logps/chosen": -441.260986328125, "logps/rejected": -329.2828063964844, "loss": 0.3597, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.3577322959899902, "rewards/margins": 2.0117573738098145, "rewards/rejected": 1.3459751605987549, "step": 38860 }, { "epoch": 1.8046334555921817, "grad_norm": 184.14390563964844, "learning_rate": 1.9938251543711407e-07, "logits/chosen": -18.298969268798828, "logits/rejected": -17.765670776367188, "logps/chosen": -395.230224609375, "logps/rejected": -307.06781005859375, "loss": 0.7047, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.01352858543396, "rewards/margins": 0.7412692308425903, "rewards/rejected": 2.272259473800659, "step": 38870 }, { "epoch": 1.8050977296996147, "grad_norm": 54.65314865112305, "learning_rate": 1.9930513641920856e-07, "logits/chosen": -19.49837875366211, "logits/rejected": -18.42726707458496, "logps/chosen": -312.98858642578125, "logps/rejected": -287.86749267578125, "loss": 0.8491, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0866920948028564, "rewards/margins": 0.6444166898727417, "rewards/rejected": 2.4422755241394043, "step": 38880 }, { "epoch": 1.8055620038070477, "grad_norm": 25.249740600585938, "learning_rate": 1.9922775740130304e-07, "logits/chosen": -20.296979904174805, "logits/rejected": -18.9278564453125, "logps/chosen": -347.24859619140625, "logps/rejected": -317.0222473144531, "loss": 0.7565, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.941619634628296, "rewards/margins": 0.7602211236953735, "rewards/rejected": 2.181398391723633, "step": 38890 }, { "epoch": 1.8060262779144807, "grad_norm": 10.383135795593262, "learning_rate": 1.9915037838339755e-07, "logits/chosen": -19.010204315185547, "logits/rejected": -17.284250259399414, "logps/chosen": -445.54632568359375, "logps/rejected": -201.6156005859375, 
"loss": 0.428, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.946281909942627, "rewards/margins": 2.9509778022766113, "rewards/rejected": 0.9953038096427917, "step": 38900 }, { "epoch": 1.8064905520219137, "grad_norm": 0.7711642980575562, "learning_rate": 1.9907299936549204e-07, "logits/chosen": -19.334177017211914, "logits/rejected": -18.194095611572266, "logps/chosen": -400.4447937011719, "logps/rejected": -282.6236877441406, "loss": 0.3946, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.3276588916778564, "rewards/margins": 1.5876481533050537, "rewards/rejected": 1.7400108575820923, "step": 38910 }, { "epoch": 1.8069548261293469, "grad_norm": 15.504659652709961, "learning_rate": 1.9899562034758655e-07, "logits/chosen": -19.264446258544922, "logits/rejected": -18.926143646240234, "logps/chosen": -346.0090026855469, "logps/rejected": -325.8766174316406, "loss": 0.627, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.2355093955993652, "rewards/margins": 1.1252859830856323, "rewards/rejected": 2.1102232933044434, "step": 38920 }, { "epoch": 1.8074191002367797, "grad_norm": 84.08727264404297, "learning_rate": 1.9891824132968103e-07, "logits/chosen": -18.127002716064453, "logits/rejected": -16.910297393798828, "logps/chosen": -394.20062255859375, "logps/rejected": -247.62069702148438, "loss": 0.4015, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.485610246658325, "rewards/margins": 2.306971311569214, "rewards/rejected": 1.1786386966705322, "step": 38930 }, { "epoch": 1.8078833743442129, "grad_norm": 105.26302337646484, "learning_rate": 1.9884086231177552e-07, "logits/chosen": -19.46180534362793, "logits/rejected": -18.128185272216797, "logps/chosen": -325.7748107910156, "logps/rejected": -268.24896240234375, "loss": 0.6504, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.5480079650878906, "rewards/margins": 1.1779420375823975, "rewards/rejected": 2.370065689086914, "step": 38940 }, { "epoch": 
1.8083476484516459, "grad_norm": 128.2820587158203, "learning_rate": 1.9876348329387003e-07, "logits/chosen": -20.576091766357422, "logits/rejected": -20.16073226928711, "logps/chosen": -440.3689880371094, "logps/rejected": -389.2537536621094, "loss": 0.5686, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.6323611736297607, "rewards/margins": 1.0150049924850464, "rewards/rejected": 2.617356061935425, "step": 38950 }, { "epoch": 1.8088119225590789, "grad_norm": 14.289934158325195, "learning_rate": 1.9868610427596452e-07, "logits/chosen": -18.996999740600586, "logits/rejected": -18.746479034423828, "logps/chosen": -361.81585693359375, "logps/rejected": -285.64984130859375, "loss": 1.3696, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.789882183074951, "rewards/margins": 0.5116232633590698, "rewards/rejected": 2.278259038925171, "step": 38960 }, { "epoch": 1.809276196666512, "grad_norm": 0.17265266180038452, "learning_rate": 1.9860872525805903e-07, "logits/chosen": -19.45061683654785, "logits/rejected": -18.54629898071289, "logps/chosen": -406.50164794921875, "logps/rejected": -292.2026672363281, "loss": 0.4188, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.105387210845947, "rewards/margins": 2.0657992362976074, "rewards/rejected": 2.039588451385498, "step": 38970 }, { "epoch": 1.8097404707739448, "grad_norm": 121.30317687988281, "learning_rate": 1.985313462401535e-07, "logits/chosen": -20.06963539123535, "logits/rejected": -19.91858673095703, "logps/chosen": -496.99737548828125, "logps/rejected": -451.78271484375, "loss": 0.5086, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.43371057510376, "rewards/margins": 0.9023491144180298, "rewards/rejected": 3.5313613414764404, "step": 38980 }, { "epoch": 1.810204744881378, "grad_norm": 34.158966064453125, "learning_rate": 1.98453967222248e-07, "logits/chosen": -18.967540740966797, "logits/rejected": -18.37820816040039, "logps/chosen": -410.9419860839844, 
"logps/rejected": -303.3167724609375, "loss": 0.6311, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5237488746643066, "rewards/margins": 1.3135652542114258, "rewards/rejected": 2.210183620452881, "step": 38990 }, { "epoch": 1.8106690189888108, "grad_norm": 281.8072509765625, "learning_rate": 1.983765882043425e-07, "logits/chosen": -19.361177444458008, "logits/rejected": -19.110095977783203, "logps/chosen": -373.5771484375, "logps/rejected": -374.32354736328125, "loss": 0.738, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.5560085773468018, "rewards/margins": 0.5605858564376831, "rewards/rejected": 2.995422601699829, "step": 39000 }, { "epoch": 1.811133293096244, "grad_norm": 6.625002384185791, "learning_rate": 1.98299209186437e-07, "logits/chosen": -19.761585235595703, "logits/rejected": -19.05327796936035, "logps/chosen": -415.0400390625, "logps/rejected": -367.99945068359375, "loss": 0.8063, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.508614540100098, "rewards/margins": 1.3047324419021606, "rewards/rejected": 3.2038822174072266, "step": 39010 }, { "epoch": 1.811597567203677, "grad_norm": 169.05934143066406, "learning_rate": 1.982218301685315e-07, "logits/chosen": -19.169281005859375, "logits/rejected": -18.945993423461914, "logps/chosen": -427.06610107421875, "logps/rejected": -397.0052795410156, "loss": 0.7431, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.069418907165527, "rewards/margins": 0.759023904800415, "rewards/rejected": 3.3103954792022705, "step": 39020 }, { "epoch": 1.81206184131111, "grad_norm": 16.49217987060547, "learning_rate": 1.9814445115062596e-07, "logits/chosen": -18.1529598236084, "logits/rejected": -17.462902069091797, "logps/chosen": -261.371337890625, "logps/rejected": -216.994873046875, "loss": 0.767, "rewards/accuracies": 0.5, "rewards/chosen": 1.6815779209136963, "rewards/margins": 1.122528314590454, "rewards/rejected": 0.5590494871139526, "step": 39030 }, { "epoch": 
1.8125261154185432, "grad_norm": 29.519611358642578, "learning_rate": 1.9806707213272047e-07, "logits/chosen": -19.375080108642578, "logits/rejected": -17.869077682495117, "logps/chosen": -497.2645568847656, "logps/rejected": -319.07806396484375, "loss": 0.2323, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.473635673522949, "rewards/margins": 2.386054515838623, "rewards/rejected": 2.087580442428589, "step": 39040 }, { "epoch": 1.812990389525976, "grad_norm": 74.10272216796875, "learning_rate": 1.9798969311481498e-07, "logits/chosen": -19.839975357055664, "logits/rejected": -18.984050750732422, "logps/chosen": -453.00762939453125, "logps/rejected": -442.7735900878906, "loss": 0.7218, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.135983943939209, "rewards/margins": 1.0865449905395508, "rewards/rejected": 3.049438953399658, "step": 39050 }, { "epoch": 1.8134546636334092, "grad_norm": 4.708135604858398, "learning_rate": 1.9791231409690947e-07, "logits/chosen": -19.42144012451172, "logits/rejected": -19.212249755859375, "logps/chosen": -479.5760803222656, "logps/rejected": -423.0091247558594, "loss": 1.1393, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.7628302574157715, "rewards/margins": 0.5463758707046509, "rewards/rejected": 4.21645450592041, "step": 39060 }, { "epoch": 1.8139189377408422, "grad_norm": 23.115327835083008, "learning_rate": 1.9783493507900398e-07, "logits/chosen": -18.585891723632812, "logits/rejected": -18.314105987548828, "logps/chosen": -460.936279296875, "logps/rejected": -442.0828552246094, "loss": 0.9303, "rewards/accuracies": 0.5, "rewards/chosen": 4.933842658996582, "rewards/margins": 0.45015010237693787, "rewards/rejected": 4.483692646026611, "step": 39070 }, { "epoch": 1.8143832118482752, "grad_norm": 89.98526000976562, "learning_rate": 1.9775755606109844e-07, "logits/chosen": -18.063819885253906, "logits/rejected": -18.5395450592041, "logps/chosen": -358.18267822265625, "logps/rejected": 
-357.6058044433594, "loss": 0.943, "rewards/accuracies": 0.5, "rewards/chosen": 3.221078872680664, "rewards/margins": 0.4700256288051605, "rewards/rejected": 2.7510533332824707, "step": 39080 }, { "epoch": 1.8148474859557082, "grad_norm": 189.2230682373047, "learning_rate": 1.9768017704319295e-07, "logits/chosen": -18.969772338867188, "logits/rejected": -18.093097686767578, "logps/chosen": -481.19451904296875, "logps/rejected": -381.0621643066406, "loss": 0.4779, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.949351787567139, "rewards/margins": 2.069805860519409, "rewards/rejected": 2.8795456886291504, "step": 39090 }, { "epoch": 1.8153117600631412, "grad_norm": 41.770286560058594, "learning_rate": 1.9760279802528746e-07, "logits/chosen": -18.190677642822266, "logits/rejected": -16.598487854003906, "logps/chosen": -339.954833984375, "logps/rejected": -195.31036376953125, "loss": 0.5051, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.8815054893493652, "rewards/margins": 2.4698617458343506, "rewards/rejected": 0.41164344549179077, "step": 39100 }, { "epoch": 1.8157760341705744, "grad_norm": 125.82703399658203, "learning_rate": 1.9752541900738195e-07, "logits/chosen": -18.79970932006836, "logits/rejected": -17.99990463256836, "logps/chosen": -440.4580993652344, "logps/rejected": -359.1248474121094, "loss": 0.6121, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.7061991691589355, "rewards/margins": 1.6258728504180908, "rewards/rejected": 2.080326557159424, "step": 39110 }, { "epoch": 1.8162403082780072, "grad_norm": 11.638083457946777, "learning_rate": 1.9744803998947646e-07, "logits/chosen": -18.490488052368164, "logits/rejected": -18.158679962158203, "logps/chosen": -412.5597229003906, "logps/rejected": -383.84271240234375, "loss": 0.3825, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.5224595069885254, "rewards/margins": 1.1768441200256348, "rewards/rejected": 2.3456153869628906, "step": 39120 }, { "epoch": 
1.8167045823854404, "grad_norm": 62.80474853515625, "learning_rate": 1.9737066097157092e-07, "logits/chosen": -18.22832679748535, "logits/rejected": -18.420869827270508, "logps/chosen": -430.54656982421875, "logps/rejected": -413.4935607910156, "loss": 1.0787, "rewards/accuracies": 0.5, "rewards/chosen": 3.340669631958008, "rewards/margins": 0.19940085709095, "rewards/rejected": 3.1412692070007324, "step": 39130 }, { "epoch": 1.8171688564928734, "grad_norm": 110.57643127441406, "learning_rate": 1.9729328195366543e-07, "logits/chosen": -18.627206802368164, "logits/rejected": -18.318614959716797, "logps/chosen": -317.14263916015625, "logps/rejected": -333.5918273925781, "loss": 1.0461, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.3223042488098145, "rewards/margins": 0.040697626769542694, "rewards/rejected": 2.281606435775757, "step": 39140 }, { "epoch": 1.8176331306003064, "grad_norm": 151.88125610351562, "learning_rate": 1.9721590293575994e-07, "logits/chosen": -18.29349708557129, "logits/rejected": -18.421443939208984, "logps/chosen": -390.85992431640625, "logps/rejected": -444.40869140625, "loss": 1.3021, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.0570902824401855, "rewards/margins": 0.0017242431640625, "rewards/rejected": 3.055366039276123, "step": 39150 }, { "epoch": 1.8180974047077394, "grad_norm": 74.83715057373047, "learning_rate": 1.9713852391785442e-07, "logits/chosen": -18.462242126464844, "logits/rejected": -17.598190307617188, "logps/chosen": -320.19122314453125, "logps/rejected": -243.5350799560547, "loss": 0.4257, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.6327567100524902, "rewards/margins": 1.717890977859497, "rewards/rejected": 0.9148657917976379, "step": 39160 }, { "epoch": 1.8185616788151724, "grad_norm": 0.07851200550794601, "learning_rate": 1.9706114489994893e-07, "logits/chosen": -19.524370193481445, "logits/rejected": -18.634220123291016, "logps/chosen": -363.1265869140625, 
"logps/rejected": -250.19473266601562, "loss": 0.7023, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9298274517059326, "rewards/margins": 1.0798306465148926, "rewards/rejected": 1.849996566772461, "step": 39170 }, { "epoch": 1.8190259529226056, "grad_norm": 18.832780838012695, "learning_rate": 1.9698376588204342e-07, "logits/chosen": -18.799251556396484, "logits/rejected": -18.398754119873047, "logps/chosen": -400.06121826171875, "logps/rejected": -384.3586120605469, "loss": 1.004, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.873793363571167, "rewards/margins": 0.6656016111373901, "rewards/rejected": 3.2081921100616455, "step": 39180 }, { "epoch": 1.8194902270300384, "grad_norm": 210.71347045898438, "learning_rate": 1.969063868641379e-07, "logits/chosen": -19.598041534423828, "logits/rejected": -19.15995216369629, "logps/chosen": -373.3858337402344, "logps/rejected": -407.12713623046875, "loss": 0.9386, "rewards/accuracies": 0.5, "rewards/chosen": 2.9922611713409424, "rewards/margins": 0.7523289918899536, "rewards/rejected": 2.2399322986602783, "step": 39190 }, { "epoch": 1.8199545011374716, "grad_norm": 83.92217254638672, "learning_rate": 1.9682900784623242e-07, "logits/chosen": -19.088146209716797, "logits/rejected": -18.210063934326172, "logps/chosen": -373.4699401855469, "logps/rejected": -329.82012939453125, "loss": 0.7037, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.995353937149048, "rewards/margins": 1.250354528427124, "rewards/rejected": 1.7449989318847656, "step": 39200 }, { "epoch": 1.8204187752449046, "grad_norm": 197.25625610351562, "learning_rate": 1.967516288283269e-07, "logits/chosen": -18.305011749267578, "logits/rejected": -18.124984741210938, "logps/chosen": -376.39227294921875, "logps/rejected": -394.3177185058594, "loss": 1.5087, "rewards/accuracies": 0.5, "rewards/chosen": 3.0009305477142334, "rewards/margins": -0.5619708299636841, "rewards/rejected": 3.562901735305786, "step": 39210 }, { "epoch": 
1.8208830493523376, "grad_norm": 25.188751220703125, "learning_rate": 1.966742498104214e-07, "logits/chosen": -19.041488647460938, "logits/rejected": -18.360015869140625, "logps/chosen": -319.07269287109375, "logps/rejected": -256.0443115234375, "loss": 0.6286, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.1050076484680176, "rewards/margins": 1.37897789478302, "rewards/rejected": 1.7260301113128662, "step": 39220 }, { "epoch": 1.8213473234597708, "grad_norm": 14.283523559570312, "learning_rate": 1.965968707925159e-07, "logits/chosen": -18.888235092163086, "logits/rejected": -17.505111694335938, "logps/chosen": -410.4764099121094, "logps/rejected": -262.56500244140625, "loss": 0.4224, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.575817108154297, "rewards/margins": 1.6623748540878296, "rewards/rejected": 1.9134423732757568, "step": 39230 }, { "epoch": 1.8218115975672036, "grad_norm": 17.20000648498535, "learning_rate": 1.9651949177461038e-07, "logits/chosen": -18.612323760986328, "logits/rejected": -17.586782455444336, "logps/chosen": -420.60211181640625, "logps/rejected": -296.86602783203125, "loss": 0.564, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.3204474449157715, "rewards/margins": 2.1154227256774902, "rewards/rejected": 2.2050251960754395, "step": 39240 }, { "epoch": 1.8222758716746368, "grad_norm": 107.24335479736328, "learning_rate": 1.964421127567049e-07, "logits/chosen": -19.23590850830078, "logits/rejected": -19.082538604736328, "logps/chosen": -366.088623046875, "logps/rejected": -349.8421936035156, "loss": 0.9521, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.6352035999298096, "rewards/margins": 0.3316377103328705, "rewards/rejected": 2.3035659790039062, "step": 39250 }, { "epoch": 1.8227401457820698, "grad_norm": 92.8779525756836, "learning_rate": 1.9636473373879938e-07, "logits/chosen": -17.979055404663086, "logits/rejected": -17.794282913208008, "logps/chosen": -337.21484375, 
"logps/rejected": -329.49359130859375, "loss": 0.9923, "rewards/accuracies": 0.5, "rewards/chosen": 2.159522294998169, "rewards/margins": 0.15414538979530334, "rewards/rejected": 2.0053768157958984, "step": 39260 }, { "epoch": 1.8232044198895028, "grad_norm": 0.10127391666173935, "learning_rate": 1.9628735472089386e-07, "logits/chosen": -19.612863540649414, "logits/rejected": -18.3787841796875, "logps/chosen": -385.42144775390625, "logps/rejected": -358.18170166015625, "loss": 0.4191, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.029102802276611, "rewards/margins": 1.9445264339447021, "rewards/rejected": 2.0845766067504883, "step": 39270 }, { "epoch": 1.8236686939969358, "grad_norm": 43.72026443481445, "learning_rate": 1.9620997570298837e-07, "logits/chosen": -19.094892501831055, "logits/rejected": -18.16330337524414, "logps/chosen": -438.18048095703125, "logps/rejected": -404.32635498046875, "loss": 0.4377, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.470432758331299, "rewards/margins": 0.93768310546875, "rewards/rejected": 2.532749652862549, "step": 39280 }, { "epoch": 1.8241329681043688, "grad_norm": 35.53123092651367, "learning_rate": 1.9613259668508286e-07, "logits/chosen": -18.626911163330078, "logits/rejected": -17.360538482666016, "logps/chosen": -387.97625732421875, "logps/rejected": -321.23291015625, "loss": 0.3833, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.9661953449249268, "rewards/margins": 2.083103895187378, "rewards/rejected": 1.8830912113189697, "step": 39290 }, { "epoch": 1.824597242211802, "grad_norm": 120.56192016601562, "learning_rate": 1.9605521766717737e-07, "logits/chosen": -19.812721252441406, "logits/rejected": -19.511247634887695, "logps/chosen": -449.54779052734375, "logps/rejected": -425.40057373046875, "loss": 1.3727, "rewards/accuracies": 0.5, "rewards/chosen": 3.4172911643981934, "rewards/margins": -0.3419502377510071, "rewards/rejected": 3.7592415809631348, "step": 39300 }, { "epoch": 
1.8250615163192347, "grad_norm": 2.025200128555298, "learning_rate": 1.9597783864927185e-07, "logits/chosen": -19.078067779541016, "logits/rejected": -17.136661529541016, "logps/chosen": -416.9359436035156, "logps/rejected": -287.3676452636719, "loss": 0.809, "rewards/accuracies": 0.5, "rewards/chosen": 3.396620988845825, "rewards/margins": 1.7257041931152344, "rewards/rejected": 1.6709169149398804, "step": 39310 }, { "epoch": 1.825525790426668, "grad_norm": 49.69078826904297, "learning_rate": 1.9590045963136634e-07, "logits/chosen": -19.323244094848633, "logits/rejected": -18.02664566040039, "logps/chosen": -305.19732666015625, "logps/rejected": -212.2201690673828, "loss": 0.3963, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.637180805206299, "rewards/margins": 1.806707739830017, "rewards/rejected": 0.8304735422134399, "step": 39320 }, { "epoch": 1.825990064534101, "grad_norm": 78.15991973876953, "learning_rate": 1.9582308061346085e-07, "logits/chosen": -19.8106689453125, "logits/rejected": -18.56199073791504, "logps/chosen": -454.78302001953125, "logps/rejected": -402.7136535644531, "loss": 0.4568, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5967555046081543, "rewards/margins": 1.3838551044464111, "rewards/rejected": 2.2129006385803223, "step": 39330 }, { "epoch": 1.826454338641534, "grad_norm": 136.10450744628906, "learning_rate": 1.9574570159555534e-07, "logits/chosen": -19.547800064086914, "logits/rejected": -19.177539825439453, "logps/chosen": -364.76092529296875, "logps/rejected": -345.9748840332031, "loss": 1.0834, "rewards/accuracies": 0.5, "rewards/chosen": 3.40376615524292, "rewards/margins": 0.35338252782821655, "rewards/rejected": 3.050384044647217, "step": 39340 }, { "epoch": 1.826918612748967, "grad_norm": 42.17537307739258, "learning_rate": 1.9566832257764985e-07, "logits/chosen": -18.498241424560547, "logits/rejected": -17.92828941345215, "logps/chosen": -478.7593688964844, "logps/rejected": -422.1058654785156, 
"loss": 0.535, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.2289018630981445, "rewards/margins": 1.2409281730651855, "rewards/rejected": 2.98797345161438, "step": 39350 }, { "epoch": 1.8273828868564, "grad_norm": 58.92430877685547, "learning_rate": 1.9559094355974433e-07, "logits/chosen": -18.103578567504883, "logits/rejected": -18.408662796020508, "logps/chosen": -304.06048583984375, "logps/rejected": -304.6578063964844, "loss": 0.7446, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.5707685947418213, "rewards/margins": 0.414249986410141, "rewards/rejected": 2.1565184593200684, "step": 39360 }, { "epoch": 1.8278471609638332, "grad_norm": 53.35645294189453, "learning_rate": 1.9551356454183882e-07, "logits/chosen": -19.606477737426758, "logits/rejected": -19.1661319732666, "logps/chosen": -356.61151123046875, "logps/rejected": -333.87109375, "loss": 0.9809, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.953697681427002, "rewards/margins": 0.530540406703949, "rewards/rejected": 2.4231576919555664, "step": 39370 }, { "epoch": 1.828311435071266, "grad_norm": 252.19915771484375, "learning_rate": 1.9543618552393333e-07, "logits/chosen": -18.775386810302734, "logits/rejected": -18.60964584350586, "logps/chosen": -431.05035400390625, "logps/rejected": -411.94696044921875, "loss": 1.2616, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.6254208087921143, "rewards/margins": 0.38096243143081665, "rewards/rejected": 3.244457960128784, "step": 39380 }, { "epoch": 1.8287757091786991, "grad_norm": 40.50927734375, "learning_rate": 1.953588065060278e-07, "logits/chosen": -18.631261825561523, "logits/rejected": -18.726987838745117, "logps/chosen": -367.91046142578125, "logps/rejected": -328.4602355957031, "loss": 0.7541, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.893110752105713, "rewards/margins": 0.45362624526023865, "rewards/rejected": 2.4394845962524414, "step": 39390 }, { "epoch": 1.8292399832861321, 
"grad_norm": 149.77413940429688, "learning_rate": 1.9528142748812232e-07, "logits/chosen": -18.870349884033203, "logits/rejected": -18.819366455078125, "logps/chosen": -461.2935485839844, "logps/rejected": -435.0909118652344, "loss": 1.0562, "rewards/accuracies": 0.5, "rewards/chosen": 3.115385055541992, "rewards/margins": 0.003814196679741144, "rewards/rejected": 3.1115708351135254, "step": 39400 }, { "epoch": 1.8297042573935651, "grad_norm": 40.436805725097656, "learning_rate": 1.952040484702168e-07, "logits/chosen": -19.531007766723633, "logits/rejected": -18.312541961669922, "logps/chosen": -495.30908203125, "logps/rejected": -359.2719421386719, "loss": 0.721, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.201609134674072, "rewards/margins": 1.571886420249939, "rewards/rejected": 2.6297223567962646, "step": 39410 }, { "epoch": 1.8301685315009983, "grad_norm": 98.69661712646484, "learning_rate": 1.951266694523113e-07, "logits/chosen": -18.794315338134766, "logits/rejected": -18.464935302734375, "logps/chosen": -461.20654296875, "logps/rejected": -374.7944030761719, "loss": 0.8462, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.39931058883667, "rewards/margins": 0.47946038842201233, "rewards/rejected": 2.9198505878448486, "step": 39420 }, { "epoch": 1.8306328056084311, "grad_norm": 129.2644805908203, "learning_rate": 1.950492904344058e-07, "logits/chosen": -20.113426208496094, "logits/rejected": -19.11031150817871, "logps/chosen": -385.62835693359375, "logps/rejected": -329.5216064453125, "loss": 0.7815, "rewards/accuracies": 0.5, "rewards/chosen": 4.175444602966309, "rewards/margins": 1.3615429401397705, "rewards/rejected": 2.813901424407959, "step": 39430 }, { "epoch": 1.8310970797158643, "grad_norm": 50.43525695800781, "learning_rate": 1.949719114165003e-07, "logits/chosen": -19.67064094543457, "logits/rejected": -18.813261032104492, "logps/chosen": -436.4203186035156, "logps/rejected": -349.5705261230469, "loss": 0.7021, 
"rewards/accuracies": 0.5, "rewards/chosen": 3.917630672454834, "rewards/margins": 0.44181355834007263, "rewards/rejected": 3.4758172035217285, "step": 39440 }, { "epoch": 1.8315613538232973, "grad_norm": 58.01021194458008, "learning_rate": 1.948945323985948e-07, "logits/chosen": -18.22032356262207, "logits/rejected": -17.721572875976562, "logps/chosen": -335.60455322265625, "logps/rejected": -269.1905822753906, "loss": 1.0694, "rewards/accuracies": 0.5, "rewards/chosen": 2.6316826343536377, "rewards/margins": 0.6018205881118774, "rewards/rejected": 2.0298619270324707, "step": 39450 }, { "epoch": 1.8320256279307303, "grad_norm": 41.470237731933594, "learning_rate": 1.9481715338068926e-07, "logits/chosen": -18.82015609741211, "logits/rejected": -17.616600036621094, "logps/chosen": -448.1543884277344, "logps/rejected": -304.8208312988281, "loss": 0.6432, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.5656280517578125, "rewards/margins": 1.4511793851852417, "rewards/rejected": 2.1144490242004395, "step": 39460 }, { "epoch": 1.8324899020381633, "grad_norm": 151.96722412109375, "learning_rate": 1.9473977436278377e-07, "logits/chosen": -18.00925636291504, "logits/rejected": -17.9050350189209, "logps/chosen": -328.3096618652344, "logps/rejected": -319.2892761230469, "loss": 1.2471, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.691755771636963, "rewards/margins": 0.12477853149175644, "rewards/rejected": 2.5669772624969482, "step": 39470 }, { "epoch": 1.8329541761455963, "grad_norm": 81.47173309326172, "learning_rate": 1.9466239534487828e-07, "logits/chosen": -18.71769142150879, "logits/rejected": -18.288040161132812, "logps/chosen": -420.87762451171875, "logps/rejected": -374.32086181640625, "loss": 0.9671, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.263737916946411, "rewards/margins": -0.034041427075862885, "rewards/rejected": 3.2977795600891113, "step": 39480 }, { "epoch": 1.8334184502530295, "grad_norm": 
0.13266734778881073, "learning_rate": 1.9458501632697277e-07, "logits/chosen": -19.0817813873291, "logits/rejected": -18.30011749267578, "logps/chosen": -532.6953735351562, "logps/rejected": -366.50421142578125, "loss": 1.0039, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.128368377685547, "rewards/margins": 1.3895059823989868, "rewards/rejected": 2.7388625144958496, "step": 39490 }, { "epoch": 1.8338827243604623, "grad_norm": 43.61323928833008, "learning_rate": 1.9450763730906728e-07, "logits/chosen": -19.044002532958984, "logits/rejected": -19.46303367614746, "logps/chosen": -396.9001159667969, "logps/rejected": -351.975830078125, "loss": 0.6341, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.352626323699951, "rewards/margins": 0.6125925779342651, "rewards/rejected": 2.7400336265563965, "step": 39500 }, { "epoch": 1.8343469984678955, "grad_norm": 0.14244182407855988, "learning_rate": 1.9443025829116174e-07, "logits/chosen": -19.064876556396484, "logits/rejected": -18.20855140686035, "logps/chosen": -375.7029724121094, "logps/rejected": -340.3040466308594, "loss": 0.6104, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.060957908630371, "rewards/margins": 1.7222023010253906, "rewards/rejected": 2.3387563228607178, "step": 39510 }, { "epoch": 1.8348112725753285, "grad_norm": 60.6135139465332, "learning_rate": 1.9435287927325625e-07, "logits/chosen": -19.683307647705078, "logits/rejected": -18.201995849609375, "logps/chosen": -432.2291564941406, "logps/rejected": -307.89324951171875, "loss": 0.4366, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.361728668212891, "rewards/margins": 2.112633228302002, "rewards/rejected": 2.2490954399108887, "step": 39520 }, { "epoch": 1.8352755466827615, "grad_norm": 248.48097229003906, "learning_rate": 1.9427550025535076e-07, "logits/chosen": -19.12930679321289, "logits/rejected": -18.37232780456543, "logps/chosen": -450.80487060546875, "logps/rejected": -327.8692932128906, 
"loss": 0.5437, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.820314407348633, "rewards/margins": 2.0902135372161865, "rewards/rejected": 2.730100631713867, "step": 39530 }, { "epoch": 1.8357398207901945, "grad_norm": 126.58012390136719, "learning_rate": 1.9419812123744524e-07, "logits/chosen": -18.90846824645996, "logits/rejected": -18.8470458984375, "logps/chosen": -464.67486572265625, "logps/rejected": -421.15106201171875, "loss": 1.4795, "rewards/accuracies": 0.5, "rewards/chosen": 3.193467140197754, "rewards/margins": 0.07362174987792969, "rewards/rejected": 3.119845151901245, "step": 39540 }, { "epoch": 1.8362040948976275, "grad_norm": 35.20054244995117, "learning_rate": 1.9412074221953976e-07, "logits/chosen": -18.797298431396484, "logits/rejected": -19.03953742980957, "logps/chosen": -425.0852966308594, "logps/rejected": -416.044189453125, "loss": 0.891, "rewards/accuracies": 0.5, "rewards/chosen": 3.6003546714782715, "rewards/margins": 0.271159827709198, "rewards/rejected": 3.329195022583008, "step": 39550 }, { "epoch": 1.8366683690050607, "grad_norm": 40.098175048828125, "learning_rate": 1.9404336320163421e-07, "logits/chosen": -19.16802406311035, "logits/rejected": -19.502796173095703, "logps/chosen": -320.33074951171875, "logps/rejected": -316.42156982421875, "loss": 0.8579, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.493349075317383, "rewards/margins": 1.2996017932891846, "rewards/rejected": 2.1937472820281982, "step": 39560 }, { "epoch": 1.8371326431124935, "grad_norm": 37.49817657470703, "learning_rate": 1.9396598418372872e-07, "logits/chosen": -18.989452362060547, "logits/rejected": -17.670679092407227, "logps/chosen": -416.9974670410156, "logps/rejected": -277.3008728027344, "loss": 0.2362, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.095821380615234, "rewards/margins": 2.3189585208892822, "rewards/rejected": 1.7768628597259521, "step": 39570 }, { "epoch": 1.8375969172199267, "grad_norm": 
25.040287017822266, "learning_rate": 1.9388860516582324e-07, "logits/chosen": -19.974306106567383, "logits/rejected": -19.513763427734375, "logps/chosen": -447.35638427734375, "logps/rejected": -386.504638671875, "loss": 0.4513, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.683507204055786, "rewards/margins": 0.8934186697006226, "rewards/rejected": 2.790088415145874, "step": 39580 }, { "epoch": 1.8380611913273597, "grad_norm": 111.54806518554688, "learning_rate": 1.9381122614791772e-07, "logits/chosen": -18.60314178466797, "logits/rejected": -18.26731300354004, "logps/chosen": -440.61322021484375, "logps/rejected": -363.59136962890625, "loss": 0.466, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.450551986694336, "rewards/margins": 1.2171499729156494, "rewards/rejected": 2.2334022521972656, "step": 39590 }, { "epoch": 1.8385254654347927, "grad_norm": 20.77696990966797, "learning_rate": 1.9373384713001223e-07, "logits/chosen": -19.34474754333496, "logits/rejected": -18.943912506103516, "logps/chosen": -457.22406005859375, "logps/rejected": -396.05419921875, "loss": 0.5668, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.9765751361846924, "rewards/margins": 1.5795382261276245, "rewards/rejected": 2.397036075592041, "step": 39600 }, { "epoch": 1.8389897395422259, "grad_norm": 309.9005432128906, "learning_rate": 1.936564681121067e-07, "logits/chosen": -18.025604248046875, "logits/rejected": -17.913061141967773, "logps/chosen": -317.4486999511719, "logps/rejected": -336.77166748046875, "loss": 0.9496, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.188373327255249, "rewards/margins": 0.342505544424057, "rewards/rejected": 1.8458677530288696, "step": 39610 }, { "epoch": 1.8394540136496587, "grad_norm": 38.5223274230957, "learning_rate": 1.935790890942012e-07, "logits/chosen": -19.535686492919922, "logits/rejected": -18.748382568359375, "logps/chosen": -337.1523742675781, "logps/rejected": -205.57974243164062, 
"loss": 0.5719, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.847970485687256, "rewards/margins": 1.1522183418273926, "rewards/rejected": 1.6957519054412842, "step": 39620 }, { "epoch": 1.8399182877570919, "grad_norm": 49.546810150146484, "learning_rate": 1.9350171007629571e-07, "logits/chosen": -19.380229949951172, "logits/rejected": -18.62322998046875, "logps/chosen": -489.1354064941406, "logps/rejected": -386.93603515625, "loss": 0.5842, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6539947986602783, "rewards/margins": 0.8455937504768372, "rewards/rejected": 2.808401107788086, "step": 39630 }, { "epoch": 1.8403825618645249, "grad_norm": 4.083296298980713, "learning_rate": 1.934243310583902e-07, "logits/chosen": -19.302898406982422, "logits/rejected": -17.930553436279297, "logps/chosen": -500.59375, "logps/rejected": -366.26007080078125, "loss": 0.4174, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.425240993499756, "rewards/margins": 1.6951000690460205, "rewards/rejected": 2.7301406860351562, "step": 39640 }, { "epoch": 1.8408468359719579, "grad_norm": 110.41399383544922, "learning_rate": 1.933469520404847e-07, "logits/chosen": -19.66768455505371, "logits/rejected": -19.20285987854004, "logps/chosen": -446.164794921875, "logps/rejected": -425.6826171875, "loss": 0.976, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.556124448776245, "rewards/margins": -0.20071890950202942, "rewards/rejected": 3.7568435668945312, "step": 39650 }, { "epoch": 1.8413111100793909, "grad_norm": 176.99102783203125, "learning_rate": 1.9326957302257917e-07, "logits/chosen": -18.59183692932129, "logits/rejected": -17.957599639892578, "logps/chosen": -392.17474365234375, "logps/rejected": -383.6782531738281, "loss": 0.7416, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.362270832061768, "rewards/margins": 1.4546709060668945, "rewards/rejected": 2.907599925994873, "step": 39660 }, { "epoch": 1.8417753841868238, 
"grad_norm": 34.72404479980469, "learning_rate": 1.9319219400467368e-07, "logits/chosen": -19.024593353271484, "logits/rejected": -19.02438735961914, "logps/chosen": -405.57464599609375, "logps/rejected": -466.47943115234375, "loss": 0.7071, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.4254069328308105, "rewards/margins": 0.8439775705337524, "rewards/rejected": 2.5814297199249268, "step": 39670 }, { "epoch": 1.842239658294257, "grad_norm": 8.039313316345215, "learning_rate": 1.931148149867682e-07, "logits/chosen": -18.598115921020508, "logits/rejected": -17.243738174438477, "logps/chosen": -384.7269592285156, "logps/rejected": -284.09552001953125, "loss": 0.5847, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.057851791381836, "rewards/margins": 2.619494915008545, "rewards/rejected": 1.4383571147918701, "step": 39680 }, { "epoch": 1.8427039324016898, "grad_norm": 17.599411010742188, "learning_rate": 1.9303743596886268e-07, "logits/chosen": -19.187583923339844, "logits/rejected": -18.5001220703125, "logps/chosen": -448.10546875, "logps/rejected": -343.2649230957031, "loss": 0.4901, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5170719623565674, "rewards/margins": 1.321816325187683, "rewards/rejected": 2.1952555179595947, "step": 39690 }, { "epoch": 1.843168206509123, "grad_norm": 7.7217698097229, "learning_rate": 1.9296005695095716e-07, "logits/chosen": -20.084341049194336, "logits/rejected": -19.03279685974121, "logps/chosen": -470.5340270996094, "logps/rejected": -433.85089111328125, "loss": 0.5325, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.95449161529541, "rewards/margins": 1.5775864124298096, "rewards/rejected": 3.3769049644470215, "step": 39700 }, { "epoch": 1.843632480616556, "grad_norm": 57.36016845703125, "learning_rate": 1.9288267793305164e-07, "logits/chosen": -19.425067901611328, "logits/rejected": -18.429481506347656, "logps/chosen": -398.68548583984375, "logps/rejected": -269.7914123535156, 
"loss": 0.3606, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.7135796546936035, "rewards/margins": 1.9763809442520142, "rewards/rejected": 2.737198829650879, "step": 39710 }, { "epoch": 1.844096754723989, "grad_norm": 16.037817001342773, "learning_rate": 1.9280529891514616e-07, "logits/chosen": -19.494647979736328, "logits/rejected": -18.215452194213867, "logps/chosen": -414.6734313964844, "logps/rejected": -401.79425048828125, "loss": 0.6566, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.2223122119903564, "rewards/margins": 0.9842061996459961, "rewards/rejected": 2.2381057739257812, "step": 39720 }, { "epoch": 1.844561028831422, "grad_norm": 17.76410675048828, "learning_rate": 1.9272791989724067e-07, "logits/chosen": -19.266109466552734, "logits/rejected": -18.523080825805664, "logps/chosen": -483.90875244140625, "logps/rejected": -350.9840087890625, "loss": 0.795, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.4468207359313965, "rewards/margins": 1.1230201721191406, "rewards/rejected": 2.323800563812256, "step": 39730 }, { "epoch": 1.845025302938855, "grad_norm": 194.1837615966797, "learning_rate": 1.9265054087933515e-07, "logits/chosen": -18.37583351135254, "logits/rejected": -18.07534408569336, "logps/chosen": -301.29986572265625, "logps/rejected": -306.8439025878906, "loss": 1.2235, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.952327251434326, "rewards/margins": 0.363348126411438, "rewards/rejected": 2.5889792442321777, "step": 39740 }, { "epoch": 1.8454895770462882, "grad_norm": 174.1656494140625, "learning_rate": 1.9257316186142964e-07, "logits/chosen": -19.64273452758789, "logits/rejected": -18.66707992553711, "logps/chosen": -407.9732971191406, "logps/rejected": -327.41131591796875, "loss": 0.6148, "rewards/accuracies": 0.5, "rewards/chosen": 4.305050849914551, "rewards/margins": 1.560469388961792, "rewards/rejected": 2.744581699371338, "step": 39750 }, { "epoch": 1.845953851153721, "grad_norm": 
2.1639082431793213, "learning_rate": 1.9249578284352412e-07, "logits/chosen": -19.0317325592041, "logits/rejected": -17.353229522705078, "logps/chosen": -327.0777282714844, "logps/rejected": -224.53988647460938, "loss": 0.5185, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5733141899108887, "rewards/margins": 2.3136324882507324, "rewards/rejected": 1.2596814632415771, "step": 39760 }, { "epoch": 1.8464181252611542, "grad_norm": 27.26808738708496, "learning_rate": 1.9241840382561863e-07, "logits/chosen": -19.660343170166016, "logits/rejected": -19.47948455810547, "logps/chosen": -491.5213317871094, "logps/rejected": -371.7482604980469, "loss": 0.5315, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.091152191162109, "rewards/margins": 0.9948946833610535, "rewards/rejected": 3.096257209777832, "step": 39770 }, { "epoch": 1.8468823993685872, "grad_norm": 26.91049575805664, "learning_rate": 1.9234102480771314e-07, "logits/chosen": -18.054407119750977, "logits/rejected": -18.333290100097656, "logps/chosen": -341.1024169921875, "logps/rejected": -356.31005859375, "loss": 1.6651, "rewards/accuracies": 0.5, "rewards/chosen": 2.1303648948669434, "rewards/margins": -0.6192935705184937, "rewards/rejected": 2.7496585845947266, "step": 39780 }, { "epoch": 1.8473466734760202, "grad_norm": 0.606452226638794, "learning_rate": 1.9226364578980763e-07, "logits/chosen": -18.947742462158203, "logits/rejected": -17.828596115112305, "logps/chosen": -349.2163391113281, "logps/rejected": -267.0475769042969, "loss": 0.5224, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.482487201690674, "rewards/margins": 1.83380925655365, "rewards/rejected": 1.6486778259277344, "step": 39790 }, { "epoch": 1.8478109475834534, "grad_norm": 13.756586074829102, "learning_rate": 1.9218626677190211e-07, "logits/chosen": -19.39286994934082, "logits/rejected": -19.01641082763672, "logps/chosen": -327.7982177734375, "logps/rejected": -308.437255859375, "loss": 0.5868, 
"rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.827674388885498, "rewards/margins": 0.9684125781059265, "rewards/rejected": 1.8592618703842163, "step": 39800 }, { "epoch": 1.8482752216908862, "grad_norm": 27.810659408569336, "learning_rate": 1.9210888775399663e-07, "logits/chosen": -18.183237075805664, "logits/rejected": -17.41811752319336, "logps/chosen": -429.5592346191406, "logps/rejected": -330.6542663574219, "loss": 0.4713, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.117412090301514, "rewards/margins": 1.5682086944580078, "rewards/rejected": 2.549203395843506, "step": 39810 }, { "epoch": 1.8487394957983194, "grad_norm": 12.670648574829102, "learning_rate": 1.920315087360911e-07, "logits/chosen": -19.8075008392334, "logits/rejected": -18.464582443237305, "logps/chosen": -461.8365173339844, "logps/rejected": -344.6409606933594, "loss": 0.5473, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.637617111206055, "rewards/margins": 2.027590751647949, "rewards/rejected": 2.6100265979766846, "step": 39820 }, { "epoch": 1.8492037699057522, "grad_norm": 43.812774658203125, "learning_rate": 1.9195412971818562e-07, "logits/chosen": -18.49399757385254, "logits/rejected": -18.582225799560547, "logps/chosen": -410.5950622558594, "logps/rejected": -374.39495849609375, "loss": 1.006, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.4285571575164795, "rewards/margins": 0.26550906896591187, "rewards/rejected": 3.163048028945923, "step": 39830 }, { "epoch": 1.8496680440131854, "grad_norm": 1.567320466041565, "learning_rate": 1.918767507002801e-07, "logits/chosen": -19.392215728759766, "logits/rejected": -17.309070587158203, "logps/chosen": -476.8065490722656, "logps/rejected": -233.09423828125, "loss": 0.3914, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.719426155090332, "rewards/margins": 2.6522881984710693, "rewards/rejected": 2.067138433456421, "step": 39840 }, { "epoch": 1.8501323181206184, "grad_norm": 
135.16485595703125, "learning_rate": 1.917993716823746e-07, "logits/chosen": -20.31899642944336, "logits/rejected": -19.68796157836914, "logps/chosen": -403.3077087402344, "logps/rejected": -328.4266662597656, "loss": 0.7656, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.772087574005127, "rewards/margins": 0.8446846008300781, "rewards/rejected": 2.927402973175049, "step": 39850 }, { "epoch": 1.8505965922280514, "grad_norm": 73.58692932128906, "learning_rate": 1.917219926644691e-07, "logits/chosen": -18.54693603515625, "logits/rejected": -17.709537506103516, "logps/chosen": -434.2875061035156, "logps/rejected": -356.3235778808594, "loss": 0.71, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.9544174671173096, "rewards/margins": 1.5457254648208618, "rewards/rejected": 2.4086921215057373, "step": 39860 }, { "epoch": 1.8510608663354846, "grad_norm": 19.932294845581055, "learning_rate": 1.916446136465636e-07, "logits/chosen": -20.108333587646484, "logits/rejected": -19.813507080078125, "logps/chosen": -409.8507385253906, "logps/rejected": -415.67193603515625, "loss": 1.1393, "rewards/accuracies": 0.5, "rewards/chosen": 3.1515417098999023, "rewards/margins": 0.1629790961742401, "rewards/rejected": 2.988562822341919, "step": 39870 }, { "epoch": 1.8515251404429174, "grad_norm": 132.47486877441406, "learning_rate": 1.915672346286581e-07, "logits/chosen": -18.803245544433594, "logits/rejected": -19.04897117614746, "logps/chosen": -384.61968994140625, "logps/rejected": -339.10784912109375, "loss": 0.6603, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.5958662033081055, "rewards/margins": 1.506211280822754, "rewards/rejected": 3.0896544456481934, "step": 39880 }, { "epoch": 1.8519894145503506, "grad_norm": 15.577553749084473, "learning_rate": 1.9148985561075258e-07, "logits/chosen": -19.209978103637695, "logits/rejected": -18.526012420654297, "logps/chosen": -421.76239013671875, "logps/rejected": -338.5984191894531, "loss": 0.538, 
"rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.308614730834961, "rewards/margins": 0.9662928581237793, "rewards/rejected": 2.3423218727111816, "step": 39890 }, { "epoch": 1.8524536886577836, "grad_norm": 173.46176147460938, "learning_rate": 1.9141247659284707e-07, "logits/chosen": -17.54836082458496, "logits/rejected": -18.300325393676758, "logps/chosen": -246.85073852539062, "logps/rejected": -279.99639892578125, "loss": 1.6948, "rewards/accuracies": 0.5, "rewards/chosen": 1.1740094423294067, "rewards/margins": -1.021145224571228, "rewards/rejected": 2.1951546669006348, "step": 39900 }, { "epoch": 1.8529179627652166, "grad_norm": 62.47008514404297, "learning_rate": 1.9133509757494158e-07, "logits/chosen": -19.242206573486328, "logits/rejected": -18.498985290527344, "logps/chosen": -294.6220703125, "logps/rejected": -245.99215698242188, "loss": 0.5636, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4275639057159424, "rewards/margins": 1.7107582092285156, "rewards/rejected": 1.7168054580688477, "step": 39910 }, { "epoch": 1.8533822368726496, "grad_norm": 249.59713745117188, "learning_rate": 1.9125771855703606e-07, "logits/chosen": -18.808475494384766, "logits/rejected": -18.493831634521484, "logps/chosen": -356.3470764160156, "logps/rejected": -360.91412353515625, "loss": 1.1749, "rewards/accuracies": 0.5, "rewards/chosen": 2.3688786029815674, "rewards/margins": 0.15993627905845642, "rewards/rejected": 2.208942413330078, "step": 39920 }, { "epoch": 1.8538465109800826, "grad_norm": 156.2954864501953, "learning_rate": 1.9118033953913058e-07, "logits/chosen": -18.465303421020508, "logits/rejected": -18.366132736206055, "logps/chosen": -254.4927978515625, "logps/rejected": -239.6279296875, "loss": 0.7867, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.6202566623687744, "rewards/margins": 0.5766303539276123, "rewards/rejected": 1.0436261892318726, "step": 39930 }, { "epoch": 1.8543107850875158, "grad_norm": 26.213285446166992, 
"learning_rate": 1.9110296052122503e-07, "logits/chosen": -18.530567169189453, "logits/rejected": -18.142704010009766, "logps/chosen": -353.343505859375, "logps/rejected": -350.8495788574219, "loss": 0.8001, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.9554686546325684, "rewards/margins": 0.6902548670768738, "rewards/rejected": 2.26521372795105, "step": 39940 }, { "epoch": 1.8547750591949486, "grad_norm": 11.918547630310059, "learning_rate": 1.9102558150331955e-07, "logits/chosen": -17.974409103393555, "logits/rejected": -18.143352508544922, "logps/chosen": -336.3720397949219, "logps/rejected": -319.54144287109375, "loss": 1.2548, "rewards/accuracies": 0.5, "rewards/chosen": 2.2593979835510254, "rewards/margins": 0.07574164122343063, "rewards/rejected": 2.1836562156677246, "step": 39950 }, { "epoch": 1.8552393333023818, "grad_norm": 90.58761596679688, "learning_rate": 1.9094820248541406e-07, "logits/chosen": -18.997745513916016, "logits/rejected": -17.76950454711914, "logps/chosen": -369.45770263671875, "logps/rejected": -238.8311309814453, "loss": 0.4655, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.3989996910095215, "rewards/margins": 1.7751991748809814, "rewards/rejected": 1.6238006353378296, "step": 39960 }, { "epoch": 1.8557036074098148, "grad_norm": 3.673590660095215, "learning_rate": 1.9087082346750854e-07, "logits/chosen": -19.51689338684082, "logits/rejected": -19.5197696685791, "logps/chosen": -384.22833251953125, "logps/rejected": -415.9388732910156, "loss": 1.4385, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.105297803878784, "rewards/margins": 0.009661078453063965, "rewards/rejected": 3.0956368446350098, "step": 39970 }, { "epoch": 1.8561678815172478, "grad_norm": 56.812171936035156, "learning_rate": 1.9079344444960305e-07, "logits/chosen": -18.371599197387695, "logits/rejected": -18.66646957397461, "logps/chosen": -292.03814697265625, "logps/rejected": -367.2453918457031, "loss": 0.7239, 
"rewards/accuracies": 0.5, "rewards/chosen": 3.315223217010498, "rewards/margins": 0.7188326120376587, "rewards/rejected": 2.596390724182129, "step": 39980 }, { "epoch": 1.8566321556246808, "grad_norm": 76.19621276855469, "learning_rate": 1.907160654316975e-07, "logits/chosen": -18.923524856567383, "logits/rejected": -18.63567543029785, "logps/chosen": -391.1281433105469, "logps/rejected": -361.2611999511719, "loss": 0.8302, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.033425807952881, "rewards/margins": 0.19141419231891632, "rewards/rejected": 3.8420119285583496, "step": 39990 }, { "epoch": 1.8570964297321138, "grad_norm": 0.22866973280906677, "learning_rate": 1.9063868641379202e-07, "logits/chosen": -18.604141235351562, "logits/rejected": -18.332441329956055, "logps/chosen": -356.2154541015625, "logps/rejected": -262.43206787109375, "loss": 1.6009, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.6075491905212402, "rewards/margins": 0.06098232418298721, "rewards/rejected": 2.54656720161438, "step": 40000 }, { "epoch": 1.857560703839547, "grad_norm": 85.28321075439453, "learning_rate": 1.9056130739588653e-07, "logits/chosen": -18.95699691772461, "logits/rejected": -19.00442886352539, "logps/chosen": -433.92041015625, "logps/rejected": -449.33428955078125, "loss": 0.9815, "rewards/accuracies": 0.5, "rewards/chosen": 4.024288177490234, "rewards/margins": 0.31341326236724854, "rewards/rejected": 3.710874080657959, "step": 40010 }, { "epoch": 1.8580249779469797, "grad_norm": 29.16326141357422, "learning_rate": 1.9048392837798102e-07, "logits/chosen": -19.867816925048828, "logits/rejected": -18.997873306274414, "logps/chosen": -358.85821533203125, "logps/rejected": -307.7769775390625, "loss": 0.4853, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.8999228477478027, "rewards/margins": 1.0392253398895264, "rewards/rejected": 1.8606975078582764, "step": 40020 }, { "epoch": 1.858489252054413, "grad_norm": 12.054706573486328, 
"learning_rate": 1.9040654936007553e-07, "logits/chosen": -19.413789749145508, "logits/rejected": -17.791423797607422, "logps/chosen": -466.02239990234375, "logps/rejected": -337.6459045410156, "loss": 0.3144, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.7302746772766113, "rewards/margins": 1.5383288860321045, "rewards/rejected": 2.1919455528259277, "step": 40030 }, { "epoch": 1.858953526161846, "grad_norm": 41.73338317871094, "learning_rate": 1.9032917034217e-07, "logits/chosen": -19.53717041015625, "logits/rejected": -18.90962791442871, "logps/chosen": -375.62335205078125, "logps/rejected": -400.63507080078125, "loss": 0.7944, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.2100749015808105, "rewards/margins": 0.5433355569839478, "rewards/rejected": 2.6667392253875732, "step": 40040 }, { "epoch": 1.859417800269279, "grad_norm": 75.2046127319336, "learning_rate": 1.902517913242645e-07, "logits/chosen": -19.29924774169922, "logits/rejected": -17.971050262451172, "logps/chosen": -494.9769592285156, "logps/rejected": -316.48919677734375, "loss": 0.6369, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.934853553771973, "rewards/margins": 2.6292545795440674, "rewards/rejected": 2.305598497390747, "step": 40050 }, { "epoch": 1.8598820743767122, "grad_norm": 89.73223876953125, "learning_rate": 1.90174412306359e-07, "logits/chosen": -19.26486587524414, "logits/rejected": -18.984405517578125, "logps/chosen": -387.3379211425781, "logps/rejected": -378.5634765625, "loss": 0.9335, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.069347381591797, "rewards/margins": 0.2865050733089447, "rewards/rejected": 3.782841920852661, "step": 40060 }, { "epoch": 1.860346348484145, "grad_norm": 0.2160346508026123, "learning_rate": 1.900970332884535e-07, "logits/chosen": -19.13672637939453, "logits/rejected": -17.952571868896484, "logps/chosen": -399.70562744140625, "logps/rejected": -272.9770812988281, "loss": 0.7625, 
"rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7594547271728516, "rewards/margins": 1.7667334079742432, "rewards/rejected": 1.9927209615707397, "step": 40070 }, { "epoch": 1.8608106225915781, "grad_norm": 75.19761657714844, "learning_rate": 1.90019654270548e-07, "logits/chosen": -19.164609909057617, "logits/rejected": -19.140949249267578, "logps/chosen": -402.1859130859375, "logps/rejected": -377.9799499511719, "loss": 0.3505, "rewards/accuracies": 1.0, "rewards/chosen": 2.8610548973083496, "rewards/margins": 1.0861537456512451, "rewards/rejected": 1.7749007940292358, "step": 40080 }, { "epoch": 1.8612748966990111, "grad_norm": 144.0029754638672, "learning_rate": 1.8994227525264247e-07, "logits/chosen": -19.96424102783203, "logits/rejected": -19.097105026245117, "logps/chosen": -436.9200134277344, "logps/rejected": -394.095458984375, "loss": 0.4585, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.9941704273223877, "rewards/margins": 1.1605031490325928, "rewards/rejected": 2.833667516708374, "step": 40090 }, { "epoch": 1.8617391708064441, "grad_norm": 52.34900665283203, "learning_rate": 1.8986489623473698e-07, "logits/chosen": -18.74271011352539, "logits/rejected": -17.616771697998047, "logps/chosen": -360.9837951660156, "logps/rejected": -293.0205993652344, "loss": 0.3026, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.044791221618652, "rewards/margins": 2.2778830528259277, "rewards/rejected": 1.766908049583435, "step": 40100 }, { "epoch": 1.8622034449138771, "grad_norm": 247.01123046875, "learning_rate": 1.897875172168315e-07, "logits/chosen": -18.421619415283203, "logits/rejected": -18.062332153320312, "logps/chosen": -404.5105895996094, "logps/rejected": -402.13604736328125, "loss": 1.3763, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.078404188156128, "rewards/margins": 0.16148391366004944, "rewards/rejected": 2.9169206619262695, "step": 40110 }, { "epoch": 1.8626677190213101, "grad_norm": 
10.69644832611084, "learning_rate": 1.8971013819892597e-07, "logits/chosen": -18.92939567565918, "logits/rejected": -18.782745361328125, "logps/chosen": -321.5298767089844, "logps/rejected": -286.4219055175781, "loss": 0.6969, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9396674633026123, "rewards/margins": 0.33016178011894226, "rewards/rejected": 2.6095058917999268, "step": 40120 }, { "epoch": 1.8631319931287433, "grad_norm": 46.06112289428711, "learning_rate": 1.8963275918102048e-07, "logits/chosen": -19.429920196533203, "logits/rejected": -19.45488166809082, "logps/chosen": -305.3562316894531, "logps/rejected": -322.5699157714844, "loss": 0.6832, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.049118995666504, "rewards/margins": 0.8022197484970093, "rewards/rejected": 2.246899127960205, "step": 40130 }, { "epoch": 1.863596267236176, "grad_norm": 234.4733123779297, "learning_rate": 1.8955538016311494e-07, "logits/chosen": -18.922740936279297, "logits/rejected": -18.672176361083984, "logps/chosen": -432.61016845703125, "logps/rejected": -424.6289978027344, "loss": 1.0685, "rewards/accuracies": 0.5, "rewards/chosen": 3.3799712657928467, "rewards/margins": 0.23159393668174744, "rewards/rejected": 3.1483778953552246, "step": 40140 }, { "epoch": 1.8640605413436093, "grad_norm": 35.04645538330078, "learning_rate": 1.8947800114520945e-07, "logits/chosen": -18.776012420654297, "logits/rejected": -18.266002655029297, "logps/chosen": -411.2418518066406, "logps/rejected": -352.72906494140625, "loss": 0.706, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.767357349395752, "rewards/margins": 1.3747440576553345, "rewards/rejected": 2.392613410949707, "step": 40150 }, { "epoch": 1.8645248154510423, "grad_norm": 20.624841690063477, "learning_rate": 1.8940062212730396e-07, "logits/chosen": -18.37827491760254, "logits/rejected": -18.090465545654297, "logps/chosen": -261.93280029296875, "logps/rejected": -235.8500518798828, "loss": 0.964, 
"rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.5272486209869385, "rewards/margins": 0.49882835149765015, "rewards/rejected": 1.0284202098846436, "step": 40160 }, { "epoch": 1.8649890895584753, "grad_norm": 27.542930603027344, "learning_rate": 1.8932324310939845e-07, "logits/chosen": -19.848073959350586, "logits/rejected": -19.604015350341797, "logps/chosen": -417.1449279785156, "logps/rejected": -408.2287902832031, "loss": 0.5044, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.2415571212768555, "rewards/margins": 0.826004683971405, "rewards/rejected": 3.4155526161193848, "step": 40170 }, { "epoch": 1.8654533636659083, "grad_norm": 149.95089721679688, "learning_rate": 1.8924586409149293e-07, "logits/chosen": -19.091646194458008, "logits/rejected": -18.11416244506836, "logps/chosen": -304.0044860839844, "logps/rejected": -232.77914428710938, "loss": 0.5723, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1449646949768066, "rewards/margins": 0.9833580255508423, "rewards/rejected": 2.161606550216675, "step": 40180 }, { "epoch": 1.8659176377733413, "grad_norm": 26.800657272338867, "learning_rate": 1.8916848507358742e-07, "logits/chosen": -18.33648681640625, "logits/rejected": -18.756820678710938, "logps/chosen": -331.8870544433594, "logps/rejected": -325.31646728515625, "loss": 1.0547, "rewards/accuracies": 0.5, "rewards/chosen": 3.252356767654419, "rewards/margins": 0.15201106667518616, "rewards/rejected": 3.1003453731536865, "step": 40190 }, { "epoch": 1.8663819118807745, "grad_norm": 164.8122100830078, "learning_rate": 1.8909110605568193e-07, "logits/chosen": -19.072139739990234, "logits/rejected": -18.481950759887695, "logps/chosen": -313.6358947753906, "logps/rejected": -290.66192626953125, "loss": 0.7829, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.837550640106201, "rewards/margins": 0.4331381916999817, "rewards/rejected": 2.4044125080108643, "step": 40200 }, { "epoch": 1.8668461859882073, "grad_norm": 
22.08422088623047, "learning_rate": 1.8901372703777644e-07, "logits/chosen": -18.59200668334961, "logits/rejected": -17.83524513244629, "logps/chosen": -300.2095642089844, "logps/rejected": -222.3847198486328, "loss": 0.7719, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.87286639213562, "rewards/margins": 1.4184482097625732, "rewards/rejected": 1.4544183015823364, "step": 40210 }, { "epoch": 1.8673104600956405, "grad_norm": 72.5052261352539, "learning_rate": 1.8893634801987093e-07, "logits/chosen": -19.712236404418945, "logits/rejected": -19.056079864501953, "logps/chosen": -304.3998718261719, "logps/rejected": -248.71676635742188, "loss": 0.4069, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.7495386600494385, "rewards/margins": 1.6395399570465088, "rewards/rejected": 1.1099985837936401, "step": 40220 }, { "epoch": 1.8677747342030735, "grad_norm": 76.12739562988281, "learning_rate": 1.888589690019654e-07, "logits/chosen": -18.22835922241211, "logits/rejected": -17.122880935668945, "logps/chosen": -410.48638916015625, "logps/rejected": -248.03793334960938, "loss": 0.5931, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.028218984603882, "rewards/margins": 1.6072019338607788, "rewards/rejected": 1.421017050743103, "step": 40230 }, { "epoch": 1.8682390083105065, "grad_norm": 75.3517837524414, "learning_rate": 1.887815899840599e-07, "logits/chosen": -19.71355628967285, "logits/rejected": -18.55075454711914, "logps/chosen": -466.3553161621094, "logps/rejected": -258.05621337890625, "loss": 0.3216, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.489354610443115, "rewards/margins": 2.3494906425476074, "rewards/rejected": 2.1398637294769287, "step": 40240 }, { "epoch": 1.8687032824179397, "grad_norm": 52.36347579956055, "learning_rate": 1.887042109661544e-07, "logits/chosen": -18.6549072265625, "logits/rejected": -18.68666648864746, "logps/chosen": -311.11883544921875, "logps/rejected": -335.4409484863281, "loss": 
0.9755, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.8787254095077515, "rewards/margins": -0.20721617341041565, "rewards/rejected": 2.085941791534424, "step": 40250 }, { "epoch": 1.8691675565253725, "grad_norm": 241.08404541015625, "learning_rate": 1.8862683194824892e-07, "logits/chosen": -18.254642486572266, "logits/rejected": -18.855144500732422, "logps/chosen": -305.45843505859375, "logps/rejected": -337.2403869628906, "loss": 1.9325, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.930540084838867, "rewards/margins": -0.5355023145675659, "rewards/rejected": 3.4660427570343018, "step": 40260 }, { "epoch": 1.8696318306328057, "grad_norm": 58.83500289916992, "learning_rate": 1.885494529303434e-07, "logits/chosen": -19.84487533569336, "logits/rejected": -19.83444595336914, "logps/chosen": -518.7432861328125, "logps/rejected": -434.05731201171875, "loss": 0.636, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.21201229095459, "rewards/margins": 0.36386004090309143, "rewards/rejected": 3.8481521606445312, "step": 40270 }, { "epoch": 1.8700961047402387, "grad_norm": 107.38700103759766, "learning_rate": 1.884720739124379e-07, "logits/chosen": -18.616283416748047, "logits/rejected": -19.552221298217773, "logps/chosen": -320.27020263671875, "logps/rejected": -389.55865478515625, "loss": 1.27, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 1.8691612482070923, "rewards/margins": -0.7680917978286743, "rewards/rejected": 2.6372530460357666, "step": 40280 }, { "epoch": 1.8705603788476717, "grad_norm": 7.066073894500732, "learning_rate": 1.8839469489453237e-07, "logits/chosen": -19.321542739868164, "logits/rejected": -18.543161392211914, "logps/chosen": -462.1659240722656, "logps/rejected": -413.29840087890625, "loss": 0.3956, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.580192565917969, "rewards/margins": 1.864831566810608, "rewards/rejected": 2.7153611183166504, "step": 40290 }, { "epoch": 
1.8710246529551047, "grad_norm": 78.31278991699219, "learning_rate": 1.8831731587662688e-07, "logits/chosen": -18.044950485229492, "logits/rejected": -17.99823760986328, "logps/chosen": -389.23675537109375, "logps/rejected": -405.6103210449219, "loss": 1.4635, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.36334490776062, "rewards/margins": -0.15047943592071533, "rewards/rejected": 3.513823986053467, "step": 40300 }, { "epoch": 1.8714889270625377, "grad_norm": 210.17654418945312, "learning_rate": 1.882399368587214e-07, "logits/chosen": -19.043590545654297, "logits/rejected": -18.37945556640625, "logps/chosen": -396.4657897949219, "logps/rejected": -323.1004638671875, "loss": 0.662, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.5648553371429443, "rewards/margins": 0.635120689868927, "rewards/rejected": 1.9297351837158203, "step": 40310 }, { "epoch": 1.8719532011699709, "grad_norm": 23.019237518310547, "learning_rate": 1.8816255784081588e-07, "logits/chosen": -18.46396255493164, "logits/rejected": -17.762367248535156, "logps/chosen": -395.95697021484375, "logps/rejected": -317.19317626953125, "loss": 0.8368, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.2789409160614014, "rewards/margins": 0.4310341477394104, "rewards/rejected": 2.8479065895080566, "step": 40320 }, { "epoch": 1.8724174752774037, "grad_norm": 100.57796478271484, "learning_rate": 1.8808517882291037e-07, "logits/chosen": -19.68527603149414, "logits/rejected": -19.433795928955078, "logps/chosen": -438.989501953125, "logps/rejected": -328.64947509765625, "loss": 0.5356, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.574202060699463, "rewards/margins": 1.5661439895629883, "rewards/rejected": 3.0080575942993164, "step": 40330 }, { "epoch": 1.8728817493848369, "grad_norm": 27.353063583374023, "learning_rate": 1.8800779980500485e-07, "logits/chosen": -18.960002899169922, "logits/rejected": -18.98506736755371, "logps/chosen": -381.94866943359375, 
"logps/rejected": -445.8228454589844, "loss": 0.6708, "rewards/accuracies": 0.5, "rewards/chosen": 3.28389310836792, "rewards/margins": 0.5970484614372253, "rewards/rejected": 2.686844825744629, "step": 40340 }, { "epoch": 1.8733460234922699, "grad_norm": 89.79469299316406, "learning_rate": 1.8793042078709936e-07, "logits/chosen": -18.934406280517578, "logits/rejected": -17.968713760375977, "logps/chosen": -342.45098876953125, "logps/rejected": -245.9196319580078, "loss": 0.4635, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5069756507873535, "rewards/margins": 2.1762053966522217, "rewards/rejected": 1.3307700157165527, "step": 40350 }, { "epoch": 1.8738102975997029, "grad_norm": 0.06427884101867676, "learning_rate": 1.8785304176919387e-07, "logits/chosen": -18.852270126342773, "logits/rejected": -18.003162384033203, "logps/chosen": -433.65460205078125, "logps/rejected": -265.2071228027344, "loss": 0.3959, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.143467903137207, "rewards/margins": 2.6543984413146973, "rewards/rejected": 1.489069938659668, "step": 40360 }, { "epoch": 1.8742745717071359, "grad_norm": 1.1759384870529175, "learning_rate": 1.8777566275128833e-07, "logits/chosen": -18.127872467041016, "logits/rejected": -17.649320602416992, "logps/chosen": -361.63336181640625, "logps/rejected": -258.7246398925781, "loss": 0.7752, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.5575637817382812, "rewards/margins": 2.097430467605591, "rewards/rejected": 1.4601330757141113, "step": 40370 }, { "epoch": 1.8747388458145688, "grad_norm": 192.1327362060547, "learning_rate": 1.8769828373338284e-07, "logits/chosen": -20.361225128173828, "logits/rejected": -18.647716522216797, "logps/chosen": -405.7767028808594, "logps/rejected": -289.0561828613281, "loss": 0.5027, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.56609582901001, "rewards/margins": 1.6580331325531006, "rewards/rejected": 2.90806245803833, "step": 40380 
}, { "epoch": 1.875203119922002, "grad_norm": 112.66151428222656, "learning_rate": 1.8762090471547733e-07, "logits/chosen": -19.518583297729492, "logits/rejected": -19.29755973815918, "logps/chosen": -436.2041015625, "logps/rejected": -380.19439697265625, "loss": 0.8095, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 4.431378364562988, "rewards/margins": 0.41263896226882935, "rewards/rejected": 4.018739223480225, "step": 40390 }, { "epoch": 1.8756673940294348, "grad_norm": 145.4334259033203, "learning_rate": 1.8754352569757184e-07, "logits/chosen": -19.442115783691406, "logits/rejected": -18.981157302856445, "logps/chosen": -478.0984802246094, "logps/rejected": -409.22357177734375, "loss": 0.9215, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.0541493892669678, "rewards/margins": -0.13061898946762085, "rewards/rejected": 3.1847681999206543, "step": 40400 }, { "epoch": 1.876131668136868, "grad_norm": 99.74574279785156, "learning_rate": 1.8746614667966635e-07, "logits/chosen": -18.710485458374023, "logits/rejected": -18.693622589111328, "logps/chosen": -464.53216552734375, "logps/rejected": -427.13214111328125, "loss": 1.2375, "rewards/accuracies": 0.5, "rewards/chosen": 3.873953342437744, "rewards/margins": 0.36572128534317017, "rewards/rejected": 3.5082321166992188, "step": 40410 }, { "epoch": 1.876595942244301, "grad_norm": 11.688509941101074, "learning_rate": 1.873887676617608e-07, "logits/chosen": -18.532573699951172, "logits/rejected": -17.823654174804688, "logps/chosen": -332.8304748535156, "logps/rejected": -245.19393920898438, "loss": 0.8972, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.632479429244995, "rewards/margins": 0.9667293429374695, "rewards/rejected": 1.6657501459121704, "step": 40420 }, { "epoch": 1.877060216351734, "grad_norm": 91.30522918701172, "learning_rate": 1.8731138864385532e-07, "logits/chosen": -19.682676315307617, "logits/rejected": -18.854829788208008, "logps/chosen": -453.65875244140625, 
"logps/rejected": -341.88507080078125, "loss": 0.3704, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.8635261058807373, "rewards/margins": 1.9519884586334229, "rewards/rejected": 1.9115374088287354, "step": 40430 }, { "epoch": 1.8775244904591673, "grad_norm": 153.2179718017578, "learning_rate": 1.8723400962594983e-07, "logits/chosen": -18.75215721130371, "logits/rejected": -17.70847511291504, "logps/chosen": -476.2431640625, "logps/rejected": -294.5590515136719, "loss": 0.4091, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.277919054031372, "rewards/margins": 1.457996129989624, "rewards/rejected": 1.8199228048324585, "step": 40440 }, { "epoch": 1.8779887645666, "grad_norm": 66.93238830566406, "learning_rate": 1.8715663060804432e-07, "logits/chosen": -17.8912410736084, "logits/rejected": -17.009199142456055, "logps/chosen": -292.82562255859375, "logps/rejected": -185.72857666015625, "loss": 0.5455, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.507798433303833, "rewards/margins": 1.2132723331451416, "rewards/rejected": 1.2945257425308228, "step": 40450 }, { "epoch": 1.8784530386740332, "grad_norm": 1.8539031744003296, "learning_rate": 1.8707925159013883e-07, "logits/chosen": -19.174535751342773, "logits/rejected": -18.510982513427734, "logps/chosen": -362.98583984375, "logps/rejected": -301.3485412597656, "loss": 0.451, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.358860492706299, "rewards/margins": 1.5706151723861694, "rewards/rejected": 1.7882452011108398, "step": 40460 }, { "epoch": 1.8789173127814662, "grad_norm": 1.187587022781372, "learning_rate": 1.8700187257223329e-07, "logits/chosen": -18.875347137451172, "logits/rejected": -17.395282745361328, "logps/chosen": -410.628173828125, "logps/rejected": -275.31842041015625, "loss": 0.4315, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7915661334991455, "rewards/margins": 2.498955726623535, "rewards/rejected": 1.2926104068756104, "step": 
40470 }, { "epoch": 1.8793815868888992, "grad_norm": 21.638492584228516, "learning_rate": 1.869244935543278e-07, "logits/chosen": -19.414108276367188, "logits/rejected": -18.50223731994629, "logps/chosen": -353.59857177734375, "logps/rejected": -243.43948364257812, "loss": 0.5418, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.823622226715088, "rewards/margins": 1.063804030418396, "rewards/rejected": 2.7598183155059814, "step": 40480 }, { "epoch": 1.8798458609963322, "grad_norm": 3.957786798477173, "learning_rate": 1.868471145364223e-07, "logits/chosen": -17.896665573120117, "logits/rejected": -18.601993560791016, "logps/chosen": -325.55255126953125, "logps/rejected": -355.7304992675781, "loss": 1.6197, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.6533303260803223, "rewards/margins": -0.21862153708934784, "rewards/rejected": 2.8719518184661865, "step": 40490 }, { "epoch": 1.8803101351037652, "grad_norm": 10.206693649291992, "learning_rate": 1.867697355185168e-07, "logits/chosen": -18.172842025756836, "logits/rejected": -17.726688385009766, "logps/chosen": -316.6790771484375, "logps/rejected": -267.4171142578125, "loss": 0.8778, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.2586824893951416, "rewards/margins": 0.7134190201759338, "rewards/rejected": 1.5452635288238525, "step": 40500 }, { "epoch": 1.8807744092111984, "grad_norm": 30.34911346435547, "learning_rate": 1.866923565006113e-07, "logits/chosen": -19.18317413330078, "logits/rejected": -17.79473114013672, "logps/chosen": -383.3890075683594, "logps/rejected": -227.0292510986328, "loss": 0.6895, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.050886631011963, "rewards/margins": 2.1935722827911377, "rewards/rejected": 1.8573143482208252, "step": 40510 }, { "epoch": 1.8812386833186312, "grad_norm": 168.3956756591797, "learning_rate": 1.8661497748270576e-07, "logits/chosen": -19.479799270629883, "logits/rejected": -19.145750045776367, "logps/chosen": 
-472.72845458984375, "logps/rejected": -483.6075134277344, "loss": 0.9731, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.3067402839660645, "rewards/margins": 0.41237372159957886, "rewards/rejected": 2.894366502761841, "step": 40520 }, { "epoch": 1.8817029574260644, "grad_norm": 184.07308959960938, "learning_rate": 1.8653759846480027e-07, "logits/chosen": -17.68899917602539, "logits/rejected": -17.938724517822266, "logps/chosen": -239.24453735351562, "logps/rejected": -238.2720184326172, "loss": 1.0106, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.0047436952590942, "rewards/margins": -0.009700489230453968, "rewards/rejected": 1.01444411277771, "step": 40530 }, { "epoch": 1.8821672315334974, "grad_norm": 38.85188674926758, "learning_rate": 1.8646021944689479e-07, "logits/chosen": -18.803909301757812, "logits/rejected": -18.604764938354492, "logps/chosen": -471.9732971191406, "logps/rejected": -391.73602294921875, "loss": 0.4487, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.0843400955200195, "rewards/margins": 1.1347349882125854, "rewards/rejected": 2.9496047496795654, "step": 40540 }, { "epoch": 1.8826315056409304, "grad_norm": 39.897727966308594, "learning_rate": 1.8638284042898927e-07, "logits/chosen": -18.583873748779297, "logits/rejected": -18.247085571289062, "logps/chosen": -435.2509765625, "logps/rejected": -358.788330078125, "loss": 0.7696, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.465510606765747, "rewards/margins": 1.3196903467178345, "rewards/rejected": 2.1458206176757812, "step": 40550 }, { "epoch": 1.8830957797483634, "grad_norm": 94.97000885009766, "learning_rate": 1.8630546141108378e-07, "logits/chosen": -18.37955093383789, "logits/rejected": -18.138065338134766, "logps/chosen": -369.64678955078125, "logps/rejected": -305.7745361328125, "loss": 0.9736, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.5886387825012207, "rewards/margins": 0.946487307548523, 
"rewards/rejected": 1.6421515941619873, "step": 40560 }, { "epoch": 1.8835600538557964, "grad_norm": 148.9486846923828, "learning_rate": 1.8622808239317824e-07, "logits/chosen": -18.833070755004883, "logits/rejected": -17.952360153198242, "logps/chosen": -437.63323974609375, "logps/rejected": -341.84918212890625, "loss": 0.5327, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.75844144821167, "rewards/margins": 1.6169102191925049, "rewards/rejected": 2.1415317058563232, "step": 40570 }, { "epoch": 1.8840243279632296, "grad_norm": 99.89226531982422, "learning_rate": 1.8615070337527275e-07, "logits/chosen": -18.7867374420166, "logits/rejected": -18.180749893188477, "logps/chosen": -404.0206298828125, "logps/rejected": -329.23724365234375, "loss": 0.485, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.053552627563477, "rewards/margins": 1.2876145839691162, "rewards/rejected": 2.7659380435943604, "step": 40580 }, { "epoch": 1.8844886020706624, "grad_norm": 7.885277271270752, "learning_rate": 1.8607332435736726e-07, "logits/chosen": -18.915164947509766, "logits/rejected": -18.8170223236084, "logps/chosen": -443.2262268066406, "logps/rejected": -412.5616760253906, "loss": 0.4783, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.792201042175293, "rewards/margins": 1.6555273532867432, "rewards/rejected": 3.136673927307129, "step": 40590 }, { "epoch": 1.8849528761780956, "grad_norm": 174.6820068359375, "learning_rate": 1.8599594533946175e-07, "logits/chosen": -18.95735740661621, "logits/rejected": -17.843393325805664, "logps/chosen": -429.8355407714844, "logps/rejected": -263.38140869140625, "loss": 0.6479, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.7862162590026855, "rewards/margins": 2.9874351024627686, "rewards/rejected": 1.798781394958496, "step": 40600 }, { "epoch": 1.8854171502855286, "grad_norm": 56.53252029418945, "learning_rate": 1.8591856632155623e-07, "logits/chosen": -18.596355438232422, "logits/rejected": 
-18.65095329284668, "logps/chosen": -368.7806701660156, "logps/rejected": -419.1356506347656, "loss": 1.1412, "rewards/accuracies": 0.5, "rewards/chosen": 3.0660853385925293, "rewards/margins": -0.27945414185523987, "rewards/rejected": 3.3455395698547363, "step": 40610 }, { "epoch": 1.8858814243929616, "grad_norm": 55.618370056152344, "learning_rate": 1.8584118730365072e-07, "logits/chosen": -19.138595581054688, "logits/rejected": -17.719257354736328, "logps/chosen": -335.4477233886719, "logps/rejected": -187.20985412597656, "loss": 0.4039, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.5646908283233643, "rewards/margins": 1.7891082763671875, "rewards/rejected": 0.7755825519561768, "step": 40620 }, { "epoch": 1.8863456985003948, "grad_norm": 1.7805720567703247, "learning_rate": 1.8576380828574523e-07, "logits/chosen": -19.193660736083984, "logits/rejected": -18.276355743408203, "logps/chosen": -313.17950439453125, "logps/rejected": -298.66943359375, "loss": 0.6536, "rewards/accuracies": 0.5, "rewards/chosen": 3.4838452339172363, "rewards/margins": 1.6450923681259155, "rewards/rejected": 1.8387525081634521, "step": 40630 }, { "epoch": 1.8868099726078276, "grad_norm": 276.7765808105469, "learning_rate": 1.8568642926783974e-07, "logits/chosen": -20.130949020385742, "logits/rejected": -18.757747650146484, "logps/chosen": -351.0929870605469, "logps/rejected": -236.09701538085938, "loss": 0.7701, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.590195894241333, "rewards/margins": 0.4868040084838867, "rewards/rejected": 2.1033921241760254, "step": 40640 }, { "epoch": 1.8872742467152608, "grad_norm": 52.085567474365234, "learning_rate": 1.8560905024993422e-07, "logits/chosen": -19.353313446044922, "logits/rejected": -19.430946350097656, "logps/chosen": -389.17657470703125, "logps/rejected": -338.4685974121094, "loss": 1.0636, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.877653121948242, "rewards/margins": -0.10621336847543716, 
"rewards/rejected": 2.9838662147521973, "step": 40650 }, { "epoch": 1.8877385208226936, "grad_norm": 187.24331665039062, "learning_rate": 1.855316712320287e-07, "logits/chosen": -18.96470069885254, "logits/rejected": -19.33328628540039, "logps/chosen": -300.1463623046875, "logps/rejected": -281.57550048828125, "loss": 1.6402, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 1.7082469463348389, "rewards/margins": -0.8328968286514282, "rewards/rejected": 2.5411438941955566, "step": 40660 }, { "epoch": 1.8882027949301268, "grad_norm": 258.4871520996094, "learning_rate": 1.854542922141232e-07, "logits/chosen": -17.90116310119629, "logits/rejected": -17.698265075683594, "logps/chosen": -382.01348876953125, "logps/rejected": -338.85650634765625, "loss": 1.1074, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.4024040699005127, "rewards/margins": 0.44246283173561096, "rewards/rejected": 2.9599411487579346, "step": 40670 }, { "epoch": 1.8886670690375598, "grad_norm": 7.227470397949219, "learning_rate": 1.853769131962177e-07, "logits/chosen": -18.175586700439453, "logits/rejected": -17.982654571533203, "logps/chosen": -367.5165710449219, "logps/rejected": -311.3934631347656, "loss": 1.3555, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.7433860301971436, "rewards/margins": 0.1298607885837555, "rewards/rejected": 2.613525390625, "step": 40680 }, { "epoch": 1.8891313431449928, "grad_norm": 4.927196502685547, "learning_rate": 1.8529953417831222e-07, "logits/chosen": -18.984682083129883, "logits/rejected": -18.2802734375, "logps/chosen": -339.4980773925781, "logps/rejected": -302.1470642089844, "loss": 0.3119, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.308568239212036, "rewards/margins": 1.3955638408660889, "rewards/rejected": 1.9130042791366577, "step": 40690 }, { "epoch": 1.889595617252426, "grad_norm": 12.256628036499023, "learning_rate": 1.852221551604067e-07, "logits/chosen": -19.279552459716797, "logits/rejected": 
-19.056854248046875, "logps/chosen": -448.6436462402344, "logps/rejected": -322.6982727050781, "loss": 0.6359, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4399962425231934, "rewards/margins": 0.8464176058769226, "rewards/rejected": 2.593578577041626, "step": 40700 }, { "epoch": 1.8900598913598587, "grad_norm": 0.034550830721855164, "learning_rate": 1.8514477614250119e-07, "logits/chosen": -18.415504455566406, "logits/rejected": -16.872547149658203, "logps/chosen": -366.4283752441406, "logps/rejected": -229.65591430664062, "loss": 0.3476, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.504246473312378, "rewards/margins": 2.65630841255188, "rewards/rejected": 0.8479383587837219, "step": 40710 }, { "epoch": 1.890524165467292, "grad_norm": 12.399161338806152, "learning_rate": 1.8506739712459567e-07, "logits/chosen": -18.638614654541016, "logits/rejected": -17.123069763183594, "logps/chosen": -441.4169921875, "logps/rejected": -301.65911865234375, "loss": 0.4798, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.137638568878174, "rewards/margins": 1.4140293598175049, "rewards/rejected": 1.7236089706420898, "step": 40720 }, { "epoch": 1.890988439574725, "grad_norm": 1.0384868383407593, "learning_rate": 1.8499001810669018e-07, "logits/chosen": -18.814064025878906, "logits/rejected": -18.052871704101562, "logps/chosen": -367.43023681640625, "logps/rejected": -262.71563720703125, "loss": 0.7858, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.616945743560791, "rewards/margins": 1.399172067642212, "rewards/rejected": 2.2177734375, "step": 40730 }, { "epoch": 1.891452713682158, "grad_norm": 98.29150390625, "learning_rate": 1.849126390887847e-07, "logits/chosen": -18.098018646240234, "logits/rejected": -18.127544403076172, "logps/chosen": -310.4649963378906, "logps/rejected": -286.5006408691406, "loss": 0.7298, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.5998165607452393, "rewards/margins": 
0.7569957971572876, "rewards/rejected": 1.8428207635879517, "step": 40740 }, { "epoch": 1.891916987789591, "grad_norm": 183.9365234375, "learning_rate": 1.8483526007087918e-07, "logits/chosen": -19.12827491760254, "logits/rejected": -18.570636749267578, "logps/chosen": -403.41943359375, "logps/rejected": -341.60357666015625, "loss": 0.5567, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.6096298694610596, "rewards/margins": 1.4017972946166992, "rewards/rejected": 2.2078328132629395, "step": 40750 }, { "epoch": 1.892381261897024, "grad_norm": 45.7246208190918, "learning_rate": 1.8476561895476423e-07, "logits/chosen": -19.292455673217773, "logits/rejected": -17.367944717407227, "logps/chosen": -427.79925537109375, "logps/rejected": -287.15985107421875, "loss": 0.3798, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.792876243591309, "rewards/margins": 2.186594247817993, "rewards/rejected": 2.6062819957733154, "step": 40760 }, { "epoch": 1.8928455360044572, "grad_norm": 1.8522547483444214, "learning_rate": 1.846882399368587e-07, "logits/chosen": -18.871707916259766, "logits/rejected": -17.995351791381836, "logps/chosen": -400.61163330078125, "logps/rejected": -313.06744384765625, "loss": 0.7735, "rewards/accuracies": 0.5, "rewards/chosen": 3.543468952178955, "rewards/margins": 0.7011629939079285, "rewards/rejected": 2.842306137084961, "step": 40770 }, { "epoch": 1.89330981011189, "grad_norm": 0.3304930031299591, "learning_rate": 1.846108609189532e-07, "logits/chosen": -18.575775146484375, "logits/rejected": -17.590669631958008, "logps/chosen": -534.3975219726562, "logps/rejected": -370.03607177734375, "loss": 0.4394, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 5.085509300231934, "rewards/margins": 2.281132459640503, "rewards/rejected": 2.804375648498535, "step": 40780 }, { "epoch": 1.8937740842193231, "grad_norm": 68.23910522460938, "learning_rate": 1.845334819010477e-07, "logits/chosen": -18.058032989501953, "logits/rejected": 
-17.8812313079834, "logps/chosen": -326.10955810546875, "logps/rejected": -274.1885070800781, "loss": 0.3928, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.432934045791626, "rewards/margins": 1.1075530052185059, "rewards/rejected": 1.3253811597824097, "step": 40790 }, { "epoch": 1.8942383583267561, "grad_norm": 95.68431091308594, "learning_rate": 1.844561028831422e-07, "logits/chosen": -18.678979873657227, "logits/rejected": -17.95983123779297, "logps/chosen": -412.10321044921875, "logps/rejected": -373.56488037109375, "loss": 0.8599, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.835416078567505, "rewards/margins": 0.9124525785446167, "rewards/rejected": 2.9229633808135986, "step": 40800 }, { "epoch": 1.8947026324341891, "grad_norm": 0.19176402688026428, "learning_rate": 1.843787238652367e-07, "logits/chosen": -18.88726043701172, "logits/rejected": -18.194826126098633, "logps/chosen": -373.16339111328125, "logps/rejected": -271.0094909667969, "loss": 0.8083, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7326273918151855, "rewards/margins": 1.4181041717529297, "rewards/rejected": 2.314523458480835, "step": 40810 }, { "epoch": 1.8951669065416221, "grad_norm": 261.4866943359375, "learning_rate": 1.843013448473312e-07, "logits/chosen": -18.787790298461914, "logits/rejected": -18.5304012298584, "logps/chosen": -393.77435302734375, "logps/rejected": -284.008544921875, "loss": 0.4625, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.486787796020508, "rewards/margins": 2.20589542388916, "rewards/rejected": 1.2808921337127686, "step": 40820 }, { "epoch": 1.8956311806490551, "grad_norm": 48.867340087890625, "learning_rate": 1.8422396582942567e-07, "logits/chosen": -19.659496307373047, "logits/rejected": -19.64276123046875, "logps/chosen": -363.9090576171875, "logps/rejected": -347.51068115234375, "loss": 0.8007, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.04218864440918, "rewards/margins": 
1.0598936080932617, "rewards/rejected": 2.982295274734497, "step": 40830 }, { "epoch": 1.8960954547564883, "grad_norm": 2.0385451316833496, "learning_rate": 1.8414658681152019e-07, "logits/chosen": -18.690799713134766, "logits/rejected": -17.83199119567871, "logps/chosen": -418.1485290527344, "logps/rejected": -356.51043701171875, "loss": 0.642, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.9435062408447266, "rewards/margins": 0.8576622009277344, "rewards/rejected": 3.085843563079834, "step": 40840 }, { "epoch": 1.896559728863921, "grad_norm": 23.070755004882812, "learning_rate": 1.8406920779361467e-07, "logits/chosen": -19.501033782958984, "logits/rejected": -18.58663558959961, "logps/chosen": -444.8811950683594, "logps/rejected": -391.85693359375, "loss": 0.6542, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.533553123474121, "rewards/margins": 1.9606841802597046, "rewards/rejected": 2.572868824005127, "step": 40850 }, { "epoch": 1.8970240029713543, "grad_norm": 20.973712921142578, "learning_rate": 1.8399182877570918e-07, "logits/chosen": -19.632633209228516, "logits/rejected": -18.88309097290039, "logps/chosen": -375.1727600097656, "logps/rejected": -299.9381103515625, "loss": 0.4069, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.7879250049591064, "rewards/margins": 1.6836011409759521, "rewards/rejected": 2.104323625564575, "step": 40860 }, { "epoch": 1.8974882770787873, "grad_norm": 0.01633533462882042, "learning_rate": 1.8391444975780364e-07, "logits/chosen": -18.714967727661133, "logits/rejected": -17.740976333618164, "logps/chosen": -435.78973388671875, "logps/rejected": -333.8602294921875, "loss": 0.4459, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.20736026763916, "rewards/margins": 1.9320348501205444, "rewards/rejected": 2.275324821472168, "step": 40870 }, { "epoch": 1.8979525511862203, "grad_norm": 28.908069610595703, "learning_rate": 1.8383707073989815e-07, "logits/chosen": 
-19.186321258544922, "logits/rejected": -19.09630012512207, "logps/chosen": -276.37359619140625, "logps/rejected": -307.68359375, "loss": 1.7454, "rewards/accuracies": 0.5, "rewards/chosen": 1.8092620372772217, "rewards/margins": -0.7131694555282593, "rewards/rejected": 2.5224316120147705, "step": 40880 }, { "epoch": 1.8984168252936535, "grad_norm": 42.05842590332031, "learning_rate": 1.8375969172199266e-07, "logits/chosen": -18.771900177001953, "logits/rejected": -18.673999786376953, "logps/chosen": -411.79620361328125, "logps/rejected": -359.14111328125, "loss": 1.003, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.3816967010498047, "rewards/margins": 0.5229699015617371, "rewards/rejected": 2.858726739883423, "step": 40890 }, { "epoch": 1.8988810994010863, "grad_norm": 70.88987731933594, "learning_rate": 1.8368231270408715e-07, "logits/chosen": -19.032917022705078, "logits/rejected": -18.726558685302734, "logps/chosen": -399.57452392578125, "logps/rejected": -371.0084533691406, "loss": 1.5413, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.351408004760742, "rewards/margins": -0.23557224869728088, "rewards/rejected": 3.586979627609253, "step": 40900 }, { "epoch": 1.8993453735085195, "grad_norm": 65.14541625976562, "learning_rate": 1.8360493368618166e-07, "logits/chosen": -18.104084014892578, "logits/rejected": -17.717561721801758, "logps/chosen": -353.3931579589844, "logps/rejected": -290.53558349609375, "loss": 1.1073, "rewards/accuracies": 0.5, "rewards/chosen": 2.5579562187194824, "rewards/margins": 0.42897024750709534, "rewards/rejected": 2.12898588180542, "step": 40910 }, { "epoch": 1.8998096476159525, "grad_norm": 67.79031372070312, "learning_rate": 1.8352755466827612e-07, "logits/chosen": -19.773088455200195, "logits/rejected": -18.685453414916992, "logps/chosen": -403.7156677246094, "logps/rejected": -369.9937744140625, "loss": 0.8337, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.1486966609954834, 
"rewards/margins": 0.39755016565322876, "rewards/rejected": 2.7511467933654785, "step": 40920 }, { "epoch": 1.9002739217233855, "grad_norm": 43.67757797241211, "learning_rate": 1.8345017565037063e-07, "logits/chosen": -19.156953811645508, "logits/rejected": -18.209571838378906, "logps/chosen": -360.5333251953125, "logps/rejected": -338.584228515625, "loss": 0.347, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.931082010269165, "rewards/margins": 1.6419737339019775, "rewards/rejected": 1.2891085147857666, "step": 40930 }, { "epoch": 1.9007381958308185, "grad_norm": 39.746665954589844, "learning_rate": 1.8337279663246514e-07, "logits/chosen": -19.015480041503906, "logits/rejected": -18.590904235839844, "logps/chosen": -466.47027587890625, "logps/rejected": -369.59930419921875, "loss": 0.2811, "rewards/accuracies": 1.0, "rewards/chosen": 4.445495128631592, "rewards/margins": 1.6521371603012085, "rewards/rejected": 2.7933576107025146, "step": 40940 }, { "epoch": 1.9012024699382515, "grad_norm": 116.78453826904297, "learning_rate": 1.8329541761455962e-07, "logits/chosen": -18.133729934692383, "logits/rejected": -17.287206649780273, "logps/chosen": -474.6222229003906, "logps/rejected": -341.72332763671875, "loss": 0.4375, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.5426933765411377, "rewards/margins": 1.3838669061660767, "rewards/rejected": 2.1588263511657715, "step": 40950 }, { "epoch": 1.9016667440456847, "grad_norm": 92.65613555908203, "learning_rate": 1.8321803859665414e-07, "logits/chosen": -18.382526397705078, "logits/rejected": -17.566570281982422, "logps/chosen": -358.9984130859375, "logps/rejected": -270.60528564453125, "loss": 0.6507, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.518308162689209, "rewards/margins": 1.0650572776794434, "rewards/rejected": 1.4532510042190552, "step": 40960 }, { "epoch": 1.9021310181531175, "grad_norm": 134.65548706054688, "learning_rate": 1.831406595787486e-07, "logits/chosen": 
-19.764204025268555, "logits/rejected": -18.821842193603516, "logps/chosen": -435.282958984375, "logps/rejected": -386.12591552734375, "loss": 0.7066, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.3262152671813965, "rewards/margins": 0.8379920125007629, "rewards/rejected": 2.4882233142852783, "step": 40970 }, { "epoch": 1.9025952922605507, "grad_norm": 37.398197174072266, "learning_rate": 1.830632805608431e-07, "logits/chosen": -19.419269561767578, "logits/rejected": -19.212894439697266, "logps/chosen": -393.00225830078125, "logps/rejected": -404.74761962890625, "loss": 1.027, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.9050402641296387, "rewards/margins": -0.3843788504600525, "rewards/rejected": 4.289419174194336, "step": 40980 }, { "epoch": 1.9030595663679837, "grad_norm": 41.819583892822266, "learning_rate": 1.8298590154293762e-07, "logits/chosen": -18.951885223388672, "logits/rejected": -18.719385147094727, "logps/chosen": -352.2027587890625, "logps/rejected": -254.42861938476562, "loss": 0.8299, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.795685052871704, "rewards/margins": 0.9076236486434937, "rewards/rejected": 1.8880611658096313, "step": 40990 }, { "epoch": 1.9035238404754167, "grad_norm": 22.88490867614746, "learning_rate": 1.829085225250321e-07, "logits/chosen": -18.376140594482422, "logits/rejected": -17.67051887512207, "logps/chosen": -411.5204162597656, "logps/rejected": -241.49435424804688, "loss": 0.9153, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.0976388454437256, "rewards/margins": 1.0291359424591064, "rewards/rejected": 2.068502902984619, "step": 41000 }, { "epoch": 1.9039881145828497, "grad_norm": 129.94418334960938, "learning_rate": 1.828311435071266e-07, "logits/chosen": -18.43577003479004, "logits/rejected": -18.62654685974121, "logps/chosen": -402.90142822265625, "logps/rejected": -399.99853515625, "loss": 1.0999, "rewards/accuracies": 0.5, "rewards/chosen": 
3.097762107849121, "rewards/margins": -0.18729862570762634, "rewards/rejected": 3.285060405731201, "step": 41010 }, { "epoch": 1.9044523886902827, "grad_norm": 22.506433486938477, "learning_rate": 1.827537644892211e-07, "logits/chosen": -19.28252410888672, "logits/rejected": -17.727115631103516, "logps/chosen": -372.7185974121094, "logps/rejected": -235.7750701904297, "loss": 0.2665, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.5446133613586426, "rewards/margins": 2.617171049118042, "rewards/rejected": 0.9274418950080872, "step": 41020 }, { "epoch": 1.9049166627977159, "grad_norm": 177.3533935546875, "learning_rate": 1.8267638547131558e-07, "logits/chosen": -18.35942268371582, "logits/rejected": -18.089981079101562, "logps/chosen": -384.0521240234375, "logps/rejected": -310.36505126953125, "loss": 0.6295, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.620774030685425, "rewards/margins": 1.267488718032837, "rewards/rejected": 2.353285551071167, "step": 41030 }, { "epoch": 1.9053809369051486, "grad_norm": 6.230844497680664, "learning_rate": 1.825990064534101e-07, "logits/chosen": -19.860126495361328, "logits/rejected": -18.6687068939209, "logps/chosen": -462.1104431152344, "logps/rejected": -330.18414306640625, "loss": 0.672, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.631371021270752, "rewards/margins": 1.6263911724090576, "rewards/rejected": 3.0049800872802734, "step": 41040 }, { "epoch": 1.9058452110125819, "grad_norm": 244.93370056152344, "learning_rate": 1.8252162743550458e-07, "logits/chosen": -18.718475341796875, "logits/rejected": -18.814064025878906, "logps/chosen": -409.2254638671875, "logps/rejected": -479.428955078125, "loss": 0.8702, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.9639663696289062, "rewards/margins": 0.177938312292099, "rewards/rejected": 3.7860283851623535, "step": 41050 }, { "epoch": 1.9063094851200149, "grad_norm": 9.100346565246582, "learning_rate": 1.824442484175991e-07, 
"logits/chosen": -18.532243728637695, "logits/rejected": -18.179601669311523, "logps/chosen": -312.1676025390625, "logps/rejected": -288.5329895019531, "loss": 0.6658, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.5754029750823975, "rewards/margins": 0.7326535582542419, "rewards/rejected": 1.8427493572235107, "step": 41060 }, { "epoch": 1.9067737592274479, "grad_norm": 63.923614501953125, "learning_rate": 1.8236686939969358e-07, "logits/chosen": -18.80615234375, "logits/rejected": -18.155475616455078, "logps/chosen": -462.8282775878906, "logps/rejected": -373.9666748046875, "loss": 0.4254, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.9285056591033936, "rewards/margins": 1.428397536277771, "rewards/rejected": 2.500108242034912, "step": 41070 }, { "epoch": 1.907238033334881, "grad_norm": 256.437744140625, "learning_rate": 1.8228949038178806e-07, "logits/chosen": -20.279216766357422, "logits/rejected": -19.72043800354004, "logps/chosen": -432.16119384765625, "logps/rejected": -369.33489990234375, "loss": 0.8274, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.4487290382385254, "rewards/margins": 0.743918776512146, "rewards/rejected": 2.704810380935669, "step": 41080 }, { "epoch": 1.9077023074423138, "grad_norm": 48.895450592041016, "learning_rate": 1.8221211136388257e-07, "logits/chosen": -19.044109344482422, "logits/rejected": -18.511707305908203, "logps/chosen": -379.8089904785156, "logps/rejected": -274.98419189453125, "loss": 0.3464, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.5569889545440674, "rewards/margins": 1.3682540655136108, "rewards/rejected": 2.188735008239746, "step": 41090 }, { "epoch": 1.908166581549747, "grad_norm": 2.577849864959717, "learning_rate": 1.8213473234597706e-07, "logits/chosen": -19.184478759765625, "logits/rejected": -18.921539306640625, "logps/chosen": -318.99603271484375, "logps/rejected": -296.7742614746094, "loss": 0.7629, "rewards/accuracies": 0.6000000238418579, 
"rewards/chosen": 2.9517409801483154, "rewards/margins": 0.5946419835090637, "rewards/rejected": 2.3570990562438965, "step": 41100 }, { "epoch": 1.90863085565718, "grad_norm": 1.464411735534668, "learning_rate": 1.8205735332807154e-07, "logits/chosen": -20.21404457092285, "logits/rejected": -18.148147583007812, "logps/chosen": -432.75946044921875, "logps/rejected": -260.7413635253906, "loss": 0.3467, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.221149444580078, "rewards/margins": 2.7466702461242676, "rewards/rejected": 1.4744789600372314, "step": 41110 }, { "epoch": 1.909095129764613, "grad_norm": 102.66902923583984, "learning_rate": 1.8197997431016605e-07, "logits/chosen": -20.126129150390625, "logits/rejected": -19.520177841186523, "logps/chosen": -486.02581787109375, "logps/rejected": -431.66302490234375, "loss": 1.0182, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 5.1256232261657715, "rewards/margins": 0.4342457354068756, "rewards/rejected": 4.69137716293335, "step": 41120 }, { "epoch": 1.909559403872046, "grad_norm": 9.5647554397583, "learning_rate": 1.8190259529226054e-07, "logits/chosen": -19.114225387573242, "logits/rejected": -18.432880401611328, "logps/chosen": -303.9975280761719, "logps/rejected": -239.7257843017578, "loss": 0.5935, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.279665231704712, "rewards/margins": 1.0138226747512817, "rewards/rejected": 1.2658425569534302, "step": 41130 }, { "epoch": 1.910023677979479, "grad_norm": 2.8881027698516846, "learning_rate": 1.8182521627435505e-07, "logits/chosen": -18.123565673828125, "logits/rejected": -17.368228912353516, "logps/chosen": -427.3846740722656, "logps/rejected": -313.02239990234375, "loss": 0.893, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.9849720001220703, "rewards/margins": 0.9291328191757202, "rewards/rejected": 2.0558390617370605, "step": 41140 }, { "epoch": 1.9104879520869122, "grad_norm": 32.98337936401367, "learning_rate": 
1.8174783725644953e-07, "logits/chosen": -19.475887298583984, "logits/rejected": -17.83868408203125, "logps/chosen": -351.90185546875, "logps/rejected": -266.11297607421875, "loss": 0.2571, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.2212491035461426, "rewards/margins": 2.0390915870666504, "rewards/rejected": 1.182157278060913, "step": 41150 }, { "epoch": 1.910952226194345, "grad_norm": 21.9955997467041, "learning_rate": 1.8167045823854402e-07, "logits/chosen": -18.72780990600586, "logits/rejected": -17.49479866027832, "logps/chosen": -400.1333312988281, "logps/rejected": -263.8070983886719, "loss": 0.4066, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9661478996276855, "rewards/margins": 1.4879791736602783, "rewards/rejected": 1.4781686067581177, "step": 41160 }, { "epoch": 1.9114165003017782, "grad_norm": 0.3824159801006317, "learning_rate": 1.8159307922063853e-07, "logits/chosen": -19.361913681030273, "logits/rejected": -18.968143463134766, "logps/chosen": -332.6644287109375, "logps/rejected": -269.45904541015625, "loss": 0.6454, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.287668228149414, "rewards/margins": 1.9103593826293945, "rewards/rejected": 2.3773088455200195, "step": 41170 }, { "epoch": 1.9118807744092112, "grad_norm": 10.576610565185547, "learning_rate": 1.8151570020273301e-07, "logits/chosen": -19.810741424560547, "logits/rejected": -18.427541732788086, "logps/chosen": -389.87811279296875, "logps/rejected": -298.9972839355469, "loss": 0.8685, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.009499549865723, "rewards/margins": 1.440193772315979, "rewards/rejected": 2.569305419921875, "step": 41180 }, { "epoch": 1.9123450485166442, "grad_norm": 4.576326847076416, "learning_rate": 1.8143832118482753e-07, "logits/chosen": -18.887300491333008, "logits/rejected": -17.76617431640625, "logps/chosen": -524.1650390625, "logps/rejected": -342.2740478515625, "loss": 0.7554, "rewards/accuracies": 
0.800000011920929, "rewards/chosen": 5.23687744140625, "rewards/margins": 1.6448452472686768, "rewards/rejected": 3.5920326709747314, "step": 41190 }, { "epoch": 1.9128093226240772, "grad_norm": 46.95045852661133, "learning_rate": 1.81360942166922e-07, "logits/chosen": -19.164207458496094, "logits/rejected": -19.017602920532227, "logps/chosen": -338.8641662597656, "logps/rejected": -279.83245849609375, "loss": 0.6018, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.2747020721435547, "rewards/margins": 1.1454848051071167, "rewards/rejected": 2.1292176246643066, "step": 41200 }, { "epoch": 1.9132735967315102, "grad_norm": 12.156571388244629, "learning_rate": 1.812835631490165e-07, "logits/chosen": -19.60324478149414, "logits/rejected": -18.863811492919922, "logps/chosen": -368.9756164550781, "logps/rejected": -283.54339599609375, "loss": 0.8295, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.6390247344970703, "rewards/margins": 0.5469807386398315, "rewards/rejected": 2.092043876647949, "step": 41210 }, { "epoch": 1.9137378708389434, "grad_norm": 61.65111541748047, "learning_rate": 1.81206184131111e-07, "logits/chosen": -17.771697998046875, "logits/rejected": -17.342145919799805, "logps/chosen": -393.8671875, "logps/rejected": -296.0937194824219, "loss": 0.7492, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.920696496963501, "rewards/margins": 0.8836283683776855, "rewards/rejected": 2.0370678901672363, "step": 41220 }, { "epoch": 1.9142021449463762, "grad_norm": 32.998382568359375, "learning_rate": 1.811288051132055e-07, "logits/chosen": -18.107133865356445, "logits/rejected": -16.798112869262695, "logps/chosen": -486.4071350097656, "logps/rejected": -263.64605712890625, "loss": 0.2443, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.9842464923858643, "rewards/margins": 2.2991814613342285, "rewards/rejected": 1.6850652694702148, "step": 41230 }, { "epoch": 1.9146664190538094, "grad_norm": 4.432919025421143, 
"learning_rate": 1.810514260953e-07, "logits/chosen": -19.503787994384766, "logits/rejected": -17.725339889526367, "logps/chosen": -389.113037109375, "logps/rejected": -261.64703369140625, "loss": 0.5978, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.6070055961608887, "rewards/margins": 2.4401557445526123, "rewards/rejected": 1.1668498516082764, "step": 41240 }, { "epoch": 1.9151306931612424, "grad_norm": 39.564659118652344, "learning_rate": 1.809740470773945e-07, "logits/chosen": -17.743444442749023, "logits/rejected": -18.053163528442383, "logps/chosen": -297.1942138671875, "logps/rejected": -316.8614807128906, "loss": 1.2778, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.850811243057251, "rewards/margins": 0.12322940677404404, "rewards/rejected": 2.727581739425659, "step": 41250 }, { "epoch": 1.9155949672686754, "grad_norm": 2.0644474029541016, "learning_rate": 1.8089666805948897e-07, "logits/chosen": -19.122526168823242, "logits/rejected": -17.477428436279297, "logps/chosen": -457.29644775390625, "logps/rejected": -252.72628784179688, "loss": 0.1652, "rewards/accuracies": 1.0, "rewards/chosen": 4.483406066894531, "rewards/margins": 2.938478946685791, "rewards/rejected": 1.5449278354644775, "step": 41260 }, { "epoch": 1.9160592413761086, "grad_norm": 30.731046676635742, "learning_rate": 1.8081928904158348e-07, "logits/chosen": -19.402572631835938, "logits/rejected": -18.854978561401367, "logps/chosen": -330.1829833984375, "logps/rejected": -326.887451171875, "loss": 0.9805, "rewards/accuracies": 0.5, "rewards/chosen": 3.0286691188812256, "rewards/margins": 0.29117313027381897, "rewards/rejected": 2.7374958992004395, "step": 41270 }, { "epoch": 1.9165235154835414, "grad_norm": 43.53532028198242, "learning_rate": 1.8074191002367797e-07, "logits/chosen": -18.989730834960938, "logits/rejected": -18.977081298828125, "logps/chosen": -357.8609313964844, "logps/rejected": -372.24609375, "loss": 0.8209, "rewards/accuracies": 
0.6000000238418579, "rewards/chosen": 2.5041894912719727, "rewards/margins": 0.04392596334218979, "rewards/rejected": 2.460263252258301, "step": 41280 }, { "epoch": 1.9169877895909746, "grad_norm": 194.04855346679688, "learning_rate": 1.8066453100577248e-07, "logits/chosen": -19.729021072387695, "logits/rejected": -19.860309600830078, "logps/chosen": -405.2397155761719, "logps/rejected": -427.91400146484375, "loss": 0.9231, "rewards/accuracies": 0.5, "rewards/chosen": 2.701730251312256, "rewards/margins": -0.12767748534679413, "rewards/rejected": 2.8294076919555664, "step": 41290 }, { "epoch": 1.9174520636984076, "grad_norm": 38.033226013183594, "learning_rate": 1.8058715198786694e-07, "logits/chosen": -19.842586517333984, "logits/rejected": -19.446985244750977, "logps/chosen": -478.446533203125, "logps/rejected": -390.8656921386719, "loss": 0.7358, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.8017377853393555, "rewards/margins": 1.2392890453338623, "rewards/rejected": 3.5624496936798096, "step": 41300 }, { "epoch": 1.9179163378058406, "grad_norm": 123.93733215332031, "learning_rate": 1.8050977296996145e-07, "logits/chosen": -19.282989501953125, "logits/rejected": -18.286245346069336, "logps/chosen": -453.7418518066406, "logps/rejected": -345.72283935546875, "loss": 0.2735, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.340610980987549, "rewards/margins": 1.6210386753082275, "rewards/rejected": 2.7195725440979004, "step": 41310 }, { "epoch": 1.9183806119132736, "grad_norm": 207.6820526123047, "learning_rate": 1.8043239395205596e-07, "logits/chosen": -19.71732521057129, "logits/rejected": -19.29093360900879, "logps/chosen": -456.5902404785156, "logps/rejected": -442.3194274902344, "loss": 0.5251, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.915905952453613, "rewards/margins": 1.3444631099700928, "rewards/rejected": 3.5714430809020996, "step": 41320 }, { "epoch": 1.9188448860207066, "grad_norm": 44.27511978149414, 
"learning_rate": 1.8035501493415045e-07, "logits/chosen": -18.07632827758789, "logits/rejected": -17.652362823486328, "logps/chosen": -228.7597198486328, "logps/rejected": -258.95477294921875, "loss": 0.9993, "rewards/accuracies": 0.5, "rewards/chosen": 2.140394449234009, "rewards/margins": 0.7259049415588379, "rewards/rejected": 1.4144893884658813, "step": 41330 }, { "epoch": 1.9193091601281398, "grad_norm": 173.1204833984375, "learning_rate": 1.8027763591624496e-07, "logits/chosen": -18.85347557067871, "logits/rejected": -18.992530822753906, "logps/chosen": -260.1573791503906, "logps/rejected": -291.87286376953125, "loss": 1.3659, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 1.9385654926300049, "rewards/margins": -0.6390262842178345, "rewards/rejected": 2.577591896057129, "step": 41340 }, { "epoch": 1.9197734342355726, "grad_norm": 152.2804412841797, "learning_rate": 1.8020025689833941e-07, "logits/chosen": -19.206787109375, "logits/rejected": -19.20973777770996, "logps/chosen": -485.16204833984375, "logps/rejected": -461.3768615722656, "loss": 1.1834, "rewards/accuracies": 0.5, "rewards/chosen": 3.814988613128662, "rewards/margins": -0.2860338091850281, "rewards/rejected": 4.101022720336914, "step": 41350 }, { "epoch": 1.9202377083430058, "grad_norm": 7.399023056030273, "learning_rate": 1.8012287788043393e-07, "logits/chosen": -19.38693618774414, "logits/rejected": -19.496244430541992, "logps/chosen": -329.14544677734375, "logps/rejected": -317.2917175292969, "loss": 1.2509, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.7539031505584717, "rewards/margins": -0.37244611978530884, "rewards/rejected": 3.1263492107391357, "step": 41360 }, { "epoch": 1.9207019824504388, "grad_norm": 87.54349517822266, "learning_rate": 1.8004549886252844e-07, "logits/chosen": -19.0723819732666, "logits/rejected": -18.438953399658203, "logps/chosen": -336.4319763183594, "logps/rejected": -291.5293884277344, "loss": 0.3259, "rewards/accuracies": 
0.800000011920929, "rewards/chosen": 3.4370675086975098, "rewards/margins": 1.5233824253082275, "rewards/rejected": 1.9136848449707031, "step": 41370 }, { "epoch": 1.9211662565578718, "grad_norm": 22.666231155395508, "learning_rate": 1.7996811984462292e-07, "logits/chosen": -18.778759002685547, "logits/rejected": -18.788219451904297, "logps/chosen": -346.14129638671875, "logps/rejected": -304.9402770996094, "loss": 1.5704, "rewards/accuracies": 0.5, "rewards/chosen": 3.8379549980163574, "rewards/margins": -0.020588446408510208, "rewards/rejected": 3.858543872833252, "step": 41380 }, { "epoch": 1.9216305306653048, "grad_norm": 19.29134750366211, "learning_rate": 1.7989074082671743e-07, "logits/chosen": -18.47934341430664, "logits/rejected": -17.995140075683594, "logps/chosen": -298.13525390625, "logps/rejected": -244.8441162109375, "loss": 0.5823, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.568817615509033, "rewards/margins": 1.3166487216949463, "rewards/rejected": 1.2521690130233765, "step": 41390 }, { "epoch": 1.9220948047727378, "grad_norm": 73.15249633789062, "learning_rate": 1.798133618088119e-07, "logits/chosen": -18.18905258178711, "logits/rejected": -18.13542938232422, "logps/chosen": -195.73489379882812, "logps/rejected": -200.7975616455078, "loss": 1.0422, "rewards/accuracies": 0.5, "rewards/chosen": 0.39616623520851135, "rewards/margins": -0.20458516478538513, "rewards/rejected": 0.6007513999938965, "step": 41400 }, { "epoch": 1.922559078880171, "grad_norm": 22.743322372436523, "learning_rate": 1.797359827909064e-07, "logits/chosen": -19.179584503173828, "logits/rejected": -17.97021484375, "logps/chosen": -321.6781005859375, "logps/rejected": -235.9026641845703, "loss": 0.5738, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.973453998565674, "rewards/margins": 1.5774251222610474, "rewards/rejected": 1.396028995513916, "step": 41410 }, { "epoch": 1.9230233529876037, "grad_norm": 68.14392852783203, "learning_rate": 
1.7965860377300091e-07, "logits/chosen": -18.686607360839844, "logits/rejected": -17.596378326416016, "logps/chosen": -381.20196533203125, "logps/rejected": -234.3768768310547, "loss": 0.301, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6770267486572266, "rewards/margins": 2.5700221061706543, "rewards/rejected": 1.1070042848587036, "step": 41420 }, { "epoch": 1.923487627095037, "grad_norm": 101.7695083618164, "learning_rate": 1.795812247550954e-07, "logits/chosen": -20.655628204345703, "logits/rejected": -19.15611457824707, "logps/chosen": -462.215576171875, "logps/rejected": -398.72064208984375, "loss": 0.5699, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.521302700042725, "rewards/margins": 1.2992303371429443, "rewards/rejected": 3.2220730781555176, "step": 41430 }, { "epoch": 1.92395190120247, "grad_norm": 38.7008171081543, "learning_rate": 1.795038457371899e-07, "logits/chosen": -19.56140899658203, "logits/rejected": -19.14974021911621, "logps/chosen": -464.1019592285156, "logps/rejected": -446.5426330566406, "loss": 0.7849, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.189399242401123, "rewards/margins": 0.15627673268318176, "rewards/rejected": 3.0331225395202637, "step": 41440 }, { "epoch": 1.924416175309903, "grad_norm": 3.513511896133423, "learning_rate": 1.7942646671928437e-07, "logits/chosen": -18.264142990112305, "logits/rejected": -17.597482681274414, "logps/chosen": -342.40972900390625, "logps/rejected": -254.33352661132812, "loss": 0.7491, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.575197458267212, "rewards/margins": 0.8850648999214172, "rewards/rejected": 1.69013249874115, "step": 41450 }, { "epoch": 1.9248804494173362, "grad_norm": 93.8425064086914, "learning_rate": 1.7934908770137888e-07, "logits/chosen": -19.137588500976562, "logits/rejected": -17.546913146972656, "logps/chosen": -368.1025695800781, "logps/rejected": -248.54013061523438, "loss": 0.5133, "rewards/accuracies": 
0.699999988079071, "rewards/chosen": 3.418679714202881, "rewards/margins": 2.1087162494659424, "rewards/rejected": 1.309963345527649, "step": 41460 }, { "epoch": 1.925344723524769, "grad_norm": 131.63221740722656, "learning_rate": 1.792717086834734e-07, "logits/chosen": -20.1192569732666, "logits/rejected": -18.56850814819336, "logps/chosen": -393.3107604980469, "logps/rejected": -280.2889709472656, "loss": 0.6377, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.165201187133789, "rewards/margins": 1.8331725597381592, "rewards/rejected": 2.332028865814209, "step": 41470 }, { "epoch": 1.9258089976322021, "grad_norm": 29.371238708496094, "learning_rate": 1.7919432966556788e-07, "logits/chosen": -19.27712631225586, "logits/rejected": -19.297332763671875, "logps/chosen": -408.1995544433594, "logps/rejected": -503.59075927734375, "loss": 0.9745, "rewards/accuracies": 0.5, "rewards/chosen": 3.6536636352539062, "rewards/margins": 0.32572370767593384, "rewards/rejected": 3.327939987182617, "step": 41480 }, { "epoch": 1.926273271739635, "grad_norm": 20.34711456298828, "learning_rate": 1.791169506476624e-07, "logits/chosen": -19.315235137939453, "logits/rejected": -17.751888275146484, "logps/chosen": -474.09222412109375, "logps/rejected": -300.96527099609375, "loss": 0.4476, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.8125803470611572, "rewards/margins": 1.1568934917449951, "rewards/rejected": 1.6556867361068726, "step": 41490 }, { "epoch": 1.9267375458470681, "grad_norm": 21.085979461669922, "learning_rate": 1.7903957162975685e-07, "logits/chosen": -19.387577056884766, "logits/rejected": -18.543180465698242, "logps/chosen": -467.0254821777344, "logps/rejected": -332.66729736328125, "loss": 0.3856, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.020853519439697, "rewards/margins": 1.3434669971466064, "rewards/rejected": 2.677386522293091, "step": 41500 }, { "epoch": 1.9272018199545011, "grad_norm": 66.6666030883789, "learning_rate": 
1.7896219261185136e-07, "logits/chosen": -20.12454605102539, "logits/rejected": -17.84772300720215, "logps/chosen": -448.3226623535156, "logps/rejected": -277.3570251464844, "loss": 0.3529, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.439051151275635, "rewards/margins": 1.596919059753418, "rewards/rejected": 2.8421316146850586, "step": 41510 }, { "epoch": 1.9276660940619341, "grad_norm": 61.137969970703125, "learning_rate": 1.7888481359394587e-07, "logits/chosen": -19.151113510131836, "logits/rejected": -18.83866310119629, "logps/chosen": -301.2470397949219, "logps/rejected": -327.4206237792969, "loss": 1.3997, "rewards/accuracies": 0.5, "rewards/chosen": 1.5324774980545044, "rewards/margins": -0.6488133072853088, "rewards/rejected": 2.181290626525879, "step": 41520 }, { "epoch": 1.9281303681693673, "grad_norm": 71.12199401855469, "learning_rate": 1.7880743457604035e-07, "logits/chosen": -19.115116119384766, "logits/rejected": -18.585981369018555, "logps/chosen": -411.3056640625, "logps/rejected": -295.3291015625, "loss": 0.4555, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.365138530731201, "rewards/margins": 1.2744814157485962, "rewards/rejected": 2.0906574726104736, "step": 41530 }, { "epoch": 1.9285946422768, "grad_norm": 288.6244812011719, "learning_rate": 1.7873005555813484e-07, "logits/chosen": -17.590986251831055, "logits/rejected": -18.301939010620117, "logps/chosen": -268.8487243652344, "logps/rejected": -364.1654968261719, "loss": 1.3112, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.8635573387145996, "rewards/margins": -0.4871065020561218, "rewards/rejected": 3.350663661956787, "step": 41540 }, { "epoch": 1.9290589163842333, "grad_norm": 20.083446502685547, "learning_rate": 1.7865267654022932e-07, "logits/chosen": -19.025821685791016, "logits/rejected": -17.665048599243164, "logps/chosen": -386.8284606933594, "logps/rejected": -258.5574035644531, "loss": 0.3532, "rewards/accuracies": 0.8999999761581421, 
"rewards/chosen": 2.647792339324951, "rewards/margins": 1.9664939641952515, "rewards/rejected": 0.6812983751296997, "step": 41550 }, { "epoch": 1.9295231904916663, "grad_norm": 39.70377731323242, "learning_rate": 1.7857529752232383e-07, "logits/chosen": -17.966814041137695, "logits/rejected": -17.44887924194336, "logps/chosen": -441.811767578125, "logps/rejected": -375.6711730957031, "loss": 0.3111, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.313555717468262, "rewards/margins": 1.966379165649414, "rewards/rejected": 2.3471765518188477, "step": 41560 }, { "epoch": 1.9299874645990993, "grad_norm": 151.30210876464844, "learning_rate": 1.7849791850441835e-07, "logits/chosen": -18.95009994506836, "logits/rejected": -18.04052734375, "logps/chosen": -537.3592529296875, "logps/rejected": -425.36474609375, "loss": 1.1325, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.5892839431762695, "rewards/margins": 1.4896371364593506, "rewards/rejected": 3.0996463298797607, "step": 41570 }, { "epoch": 1.9304517387065323, "grad_norm": 1.3598805665969849, "learning_rate": 1.7842053948651283e-07, "logits/chosen": -19.51272201538086, "logits/rejected": -18.618398666381836, "logps/chosen": -335.2660217285156, "logps/rejected": -355.0876770019531, "loss": 0.8383, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.283097743988037, "rewards/margins": 0.8376711010932922, "rewards/rejected": 2.4454264640808105, "step": 41580 }, { "epoch": 1.9309160128139653, "grad_norm": 0.7070646286010742, "learning_rate": 1.7834316046860732e-07, "logits/chosen": -20.165729522705078, "logits/rejected": -17.811216354370117, "logps/chosen": -381.70428466796875, "logps/rejected": -213.7940673828125, "loss": 0.3725, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.690023899078369, "rewards/margins": 2.455986499786377, "rewards/rejected": 1.2340377569198608, "step": 41590 }, { "epoch": 1.9313802869213985, "grad_norm": 7.011229038238525, "learning_rate": 
1.782657814507018e-07, "logits/chosen": -18.870372772216797, "logits/rejected": -17.665077209472656, "logps/chosen": -339.5191345214844, "logps/rejected": -272.8272705078125, "loss": 0.3143, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.1194307804107666, "rewards/margins": 1.7500903606414795, "rewards/rejected": 1.369340181350708, "step": 41600 }, { "epoch": 1.9318445610288313, "grad_norm": 40.968143463134766, "learning_rate": 1.781884024327963e-07, "logits/chosen": -19.610055923461914, "logits/rejected": -18.5240535736084, "logps/chosen": -412.05548095703125, "logps/rejected": -257.2679748535156, "loss": 0.2912, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.065248489379883, "rewards/margins": 2.37137508392334, "rewards/rejected": 1.693873643875122, "step": 41610 }, { "epoch": 1.9323088351362645, "grad_norm": 94.80332946777344, "learning_rate": 1.7811102341489082e-07, "logits/chosen": -19.496898651123047, "logits/rejected": -19.075986862182617, "logps/chosen": -427.82147216796875, "logps/rejected": -345.037109375, "loss": 0.4729, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.187195777893066, "rewards/margins": 1.0224028825759888, "rewards/rejected": 3.1647934913635254, "step": 41620 }, { "epoch": 1.9327731092436975, "grad_norm": 48.55424880981445, "learning_rate": 1.780336443969853e-07, "logits/chosen": -17.96558952331543, "logits/rejected": -17.925809860229492, "logps/chosen": -334.8277282714844, "logps/rejected": -385.27386474609375, "loss": 1.0011, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.4522998332977295, "rewards/margins": 0.07391352951526642, "rewards/rejected": 3.3783867359161377, "step": 41630 }, { "epoch": 1.9332373833511305, "grad_norm": 2.511265277862549, "learning_rate": 1.779562653790798e-07, "logits/chosen": -18.628543853759766, "logits/rejected": -18.109148025512695, "logps/chosen": -376.51470947265625, "logps/rejected": -297.7054748535156, "loss": 0.4682, "rewards/accuracies": 
0.699999988079071, "rewards/chosen": 3.386866807937622, "rewards/margins": 1.3381105661392212, "rewards/rejected": 2.0487565994262695, "step": 41640 }, { "epoch": 1.9337016574585635, "grad_norm": 29.457271575927734, "learning_rate": 1.778788863611743e-07, "logits/chosen": -18.73847007751465, "logits/rejected": -18.146732330322266, "logps/chosen": -368.03594970703125, "logps/rejected": -373.2772521972656, "loss": 1.3244, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.588923931121826, "rewards/margins": 0.28520888090133667, "rewards/rejected": 3.303715467453003, "step": 41650 }, { "epoch": 1.9341659315659965, "grad_norm": 1.0349782705307007, "learning_rate": 1.778015073432688e-07, "logits/chosen": -18.000198364257812, "logits/rejected": -17.22623062133789, "logps/chosen": -390.12701416015625, "logps/rejected": -348.7459411621094, "loss": 0.321, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.0016138553619385, "rewards/margins": 1.9130277633666992, "rewards/rejected": 1.0885860919952393, "step": 41660 }, { "epoch": 1.9346302056734297, "grad_norm": 61.819583892822266, "learning_rate": 1.777241283253633e-07, "logits/chosen": -18.752695083618164, "logits/rejected": -19.286828994750977, "logps/chosen": -433.8597106933594, "logps/rejected": -381.32061767578125, "loss": 1.2254, "rewards/accuracies": 0.5, "rewards/chosen": 3.0242722034454346, "rewards/margins": -0.12673866748809814, "rewards/rejected": 3.1510109901428223, "step": 41670 }, { "epoch": 1.9350944797808625, "grad_norm": 446.17596435546875, "learning_rate": 1.7764674930745778e-07, "logits/chosen": -19.5504207611084, "logits/rejected": -19.056148529052734, "logps/chosen": -451.426513671875, "logps/rejected": -333.6573181152344, "loss": 0.6432, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6688332557678223, "rewards/margins": 1.2713350057601929, "rewards/rejected": 2.397498607635498, "step": 41680 }, { "epoch": 1.9355587538882957, "grad_norm": 30.4458065032959, 
"learning_rate": 1.7756937028955227e-07, "logits/chosen": -19.485780715942383, "logits/rejected": -17.61471939086914, "logps/chosen": -488.89068603515625, "logps/rejected": -265.2599182128906, "loss": 0.1949, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.640312194824219, "rewards/margins": 3.447831392288208, "rewards/rejected": 1.1924803256988525, "step": 41690 }, { "epoch": 1.9360230279957287, "grad_norm": 10.151421546936035, "learning_rate": 1.7749199127164678e-07, "logits/chosen": -19.0444278717041, "logits/rejected": -18.467647552490234, "logps/chosen": -393.76239013671875, "logps/rejected": -358.8406677246094, "loss": 0.6308, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.9197545051574707, "rewards/margins": 1.7250759601593018, "rewards/rejected": 2.19467830657959, "step": 41700 }, { "epoch": 1.9364873021031617, "grad_norm": 0.1124793291091919, "learning_rate": 1.7741461225374127e-07, "logits/chosen": -19.390804290771484, "logits/rejected": -18.49492073059082, "logps/chosen": -343.18316650390625, "logps/rejected": -281.08428955078125, "loss": 0.7903, "rewards/accuracies": 0.5, "rewards/chosen": 3.280679225921631, "rewards/margins": 0.935787558555603, "rewards/rejected": 2.3448917865753174, "step": 41710 }, { "epoch": 1.9369515762105949, "grad_norm": 40.92009353637695, "learning_rate": 1.7733723323583578e-07, "logits/chosen": -19.211307525634766, "logits/rejected": -16.931949615478516, "logps/chosen": -409.3004455566406, "logps/rejected": -263.5023498535156, "loss": 0.2769, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.9088966846466064, "rewards/margins": 2.4898033142089844, "rewards/rejected": 1.4190934896469116, "step": 41720 }, { "epoch": 1.9374158503180277, "grad_norm": 3.275773048400879, "learning_rate": 1.7725985421793026e-07, "logits/chosen": -19.31033706665039, "logits/rejected": -18.317583084106445, "logps/chosen": -444.7098083496094, "logps/rejected": -398.18768310546875, "loss": 1.1703, 
"rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.258541107177734, "rewards/margins": 0.49574798345565796, "rewards/rejected": 3.7627930641174316, "step": 41730 }, { "epoch": 1.9378801244254609, "grad_norm": 65.69766235351562, "learning_rate": 1.7718247520002475e-07, "logits/chosen": -18.82407569885254, "logits/rejected": -18.901548385620117, "logps/chosen": -406.52252197265625, "logps/rejected": -383.88299560546875, "loss": 1.0376, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.8257594108581543, "rewards/margins": -0.20953664183616638, "rewards/rejected": 3.0352962017059326, "step": 41740 }, { "epoch": 1.9383443985328939, "grad_norm": 5.922774314880371, "learning_rate": 1.7710509618211926e-07, "logits/chosen": -19.332046508789062, "logits/rejected": -19.312122344970703, "logps/chosen": -329.5233154296875, "logps/rejected": -341.7746276855469, "loss": 1.3826, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.186479330062866, "rewards/margins": -0.34032008051872253, "rewards/rejected": 2.526799440383911, "step": 41750 }, { "epoch": 1.9388086726403269, "grad_norm": 46.448307037353516, "learning_rate": 1.7702771716421374e-07, "logits/chosen": -18.82052993774414, "logits/rejected": -17.574501037597656, "logps/chosen": -509.11456298828125, "logps/rejected": -408.8150939941406, "loss": 0.5987, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.486770153045654, "rewards/margins": 1.292130947113037, "rewards/rejected": 3.1946394443511963, "step": 41760 }, { "epoch": 1.9392729467477599, "grad_norm": 227.18348693847656, "learning_rate": 1.7695033814630825e-07, "logits/chosen": -19.003841400146484, "logits/rejected": -18.79587173461914, "logps/chosen": -398.9657287597656, "logps/rejected": -358.89874267578125, "loss": 0.6234, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.681180477142334, "rewards/margins": 0.7695644497871399, "rewards/rejected": 2.9116158485412598, "step": 41770 }, { "epoch": 1.9397372208551928, 
"grad_norm": 1.8183854818344116, "learning_rate": 1.768729591284027e-07, "logits/chosen": -18.747526168823242, "logits/rejected": -16.964237213134766, "logps/chosen": -349.74786376953125, "logps/rejected": -186.1649932861328, "loss": 0.6405, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.0539917945861816, "rewards/margins": 2.0212202072143555, "rewards/rejected": 1.032771348953247, "step": 41780 }, { "epoch": 1.940201494962626, "grad_norm": 43.96123123168945, "learning_rate": 1.7679558011049722e-07, "logits/chosen": -18.650047302246094, "logits/rejected": -18.40591812133789, "logps/chosen": -272.0751647949219, "logps/rejected": -284.0599670410156, "loss": 1.3763, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.980844497680664, "rewards/margins": -0.5649893879890442, "rewards/rejected": 2.5458338260650635, "step": 41790 }, { "epoch": 1.9406657690700588, "grad_norm": 141.19386291503906, "learning_rate": 1.7671820109259173e-07, "logits/chosen": -18.056598663330078, "logits/rejected": -18.657302856445312, "logps/chosen": -301.15203857421875, "logps/rejected": -388.14263916015625, "loss": 1.6529, "rewards/accuracies": 0.5, "rewards/chosen": 1.8576443195343018, "rewards/margins": -0.8505823016166687, "rewards/rejected": 2.7082266807556152, "step": 41800 }, { "epoch": 1.941130043177492, "grad_norm": 0.053918082267045975, "learning_rate": 1.7664082207468622e-07, "logits/chosen": -18.042926788330078, "logits/rejected": -17.048425674438477, "logps/chosen": -355.3617858886719, "logps/rejected": -257.91876220703125, "loss": 0.3272, "rewards/accuracies": 1.0, "rewards/chosen": 2.737178087234497, "rewards/margins": 1.7999519109725952, "rewards/rejected": 0.9372262954711914, "step": 41810 }, { "epoch": 1.941594317284925, "grad_norm": 46.710811614990234, "learning_rate": 1.7656344305678073e-07, "logits/chosen": -18.69739532470703, "logits/rejected": -17.882610321044922, "logps/chosen": -415.2960510253906, "logps/rejected": -319.9702453613281, "loss": 
0.6819, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.3474433422088623, "rewards/margins": 1.1147563457489014, "rewards/rejected": 2.2326865196228027, "step": 41820 }, { "epoch": 1.942058591392358, "grad_norm": 0.06180523335933685, "learning_rate": 1.764860640388752e-07, "logits/chosen": -19.755949020385742, "logits/rejected": -18.46652603149414, "logps/chosen": -486.7671813964844, "logps/rejected": -274.83172607421875, "loss": 0.505, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.8508830070495605, "rewards/margins": 2.5559451580047607, "rewards/rejected": 2.2949378490448, "step": 41830 }, { "epoch": 1.942522865499791, "grad_norm": 120.60487365722656, "learning_rate": 1.764086850209697e-07, "logits/chosen": -19.766525268554688, "logits/rejected": -19.525386810302734, "logps/chosen": -421.42608642578125, "logps/rejected": -414.13470458984375, "loss": 0.8938, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.8900420665740967, "rewards/margins": 0.5964422821998596, "rewards/rejected": 3.293600559234619, "step": 41840 }, { "epoch": 1.942987139607224, "grad_norm": 1.1291451454162598, "learning_rate": 1.763313060030642e-07, "logits/chosen": -18.798389434814453, "logits/rejected": -18.306495666503906, "logps/chosen": -341.0750732421875, "logps/rejected": -255.960205078125, "loss": 0.4753, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.4351589679718018, "rewards/margins": 1.2617855072021484, "rewards/rejected": 1.1733735799789429, "step": 41850 }, { "epoch": 1.9434514137146572, "grad_norm": 19.077478408813477, "learning_rate": 1.762539269851587e-07, "logits/chosen": -18.93320655822754, "logits/rejected": -18.157819747924805, "logps/chosen": -293.173828125, "logps/rejected": -188.07913208007812, "loss": 0.594, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.349526882171631, "rewards/margins": 1.1165224313735962, "rewards/rejected": 1.2330043315887451, "step": 41860 }, { "epoch": 1.94391568782209, 
"grad_norm": 61.20672607421875, "learning_rate": 1.761765479672532e-07, "logits/chosen": -18.19778823852539, "logits/rejected": -17.965435028076172, "logps/chosen": -247.052978515625, "logps/rejected": -185.07943725585938, "loss": 0.5618, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.2635270357131958, "rewards/margins": 0.9307971000671387, "rewards/rejected": 0.3327297270298004, "step": 41870 }, { "epoch": 1.9443799619295232, "grad_norm": 0.29020074009895325, "learning_rate": 1.7609916894934767e-07, "logits/chosen": -18.76310157775879, "logits/rejected": -17.7977352142334, "logps/chosen": -496.8155822753906, "logps/rejected": -414.122314453125, "loss": 0.4092, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.029356956481934, "rewards/margins": 2.7671782970428467, "rewards/rejected": 2.262178659439087, "step": 41880 }, { "epoch": 1.9448442360369562, "grad_norm": 0.1752893030643463, "learning_rate": 1.7602178993144218e-07, "logits/chosen": -19.513376235961914, "logits/rejected": -17.678661346435547, "logps/chosen": -407.78887939453125, "logps/rejected": -253.1625518798828, "loss": 0.181, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.848244667053223, "rewards/margins": 3.369410991668701, "rewards/rejected": 2.4788334369659424, "step": 41890 }, { "epoch": 1.9453085101443892, "grad_norm": 12.651647567749023, "learning_rate": 1.759444109135367e-07, "logits/chosen": -20.347614288330078, "logits/rejected": -19.067096710205078, "logps/chosen": -370.77801513671875, "logps/rejected": -309.6104431152344, "loss": 0.4512, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.965493679046631, "rewards/margins": 1.7088521718978882, "rewards/rejected": 3.256641387939453, "step": 41900 }, { "epoch": 1.9457727842518224, "grad_norm": 21.712677001953125, "learning_rate": 1.7586703189563117e-07, "logits/chosen": -18.701894760131836, "logits/rejected": -18.1309814453125, "logps/chosen": -438.88507080078125, "logps/rejected": 
-405.5716857910156, "loss": 0.9986, "rewards/accuracies": 0.5, "rewards/chosen": 3.803556442260742, "rewards/margins": 0.7997538447380066, "rewards/rejected": 3.00380277633667, "step": 41910 }, { "epoch": 1.9462370583592552, "grad_norm": 53.923091888427734, "learning_rate": 1.7578965287772569e-07, "logits/chosen": -19.48004722595215, "logits/rejected": -18.226320266723633, "logps/chosen": -425.39154052734375, "logps/rejected": -404.22161865234375, "loss": 1.4606, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.6481640338897705, "rewards/margins": 0.18473751842975616, "rewards/rejected": 3.4634265899658203, "step": 41920 }, { "epoch": 1.9467013324666884, "grad_norm": 13.286797523498535, "learning_rate": 1.7571227385982014e-07, "logits/chosen": -18.844654083251953, "logits/rejected": -17.718616485595703, "logps/chosen": -368.0936279296875, "logps/rejected": -320.0578918457031, "loss": 1.032, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.4703354835510254, "rewards/margins": 0.369737446308136, "rewards/rejected": 2.1005983352661133, "step": 41930 }, { "epoch": 1.9471656065741214, "grad_norm": 139.34661865234375, "learning_rate": 1.7563489484191465e-07, "logits/chosen": -20.14186668395996, "logits/rejected": -20.175561904907227, "logps/chosen": -391.60205078125, "logps/rejected": -395.7559814453125, "loss": 0.6907, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.315645217895508, "rewards/margins": 0.5559312105178833, "rewards/rejected": 3.759713649749756, "step": 41940 }, { "epoch": 1.9476298806815544, "grad_norm": 32.459712982177734, "learning_rate": 1.7555751582400917e-07, "logits/chosen": -19.485506057739258, "logits/rejected": -19.77484130859375, "logps/chosen": -322.4972229003906, "logps/rejected": -381.1570129394531, "loss": 1.2361, "rewards/accuracies": 0.5, "rewards/chosen": 2.6029510498046875, "rewards/margins": -0.013689994812011719, "rewards/rejected": 2.61664080619812, "step": 41950 }, { "epoch": 
1.9480941547889874, "grad_norm": 57.159332275390625, "learning_rate": 1.7548013680610365e-07, "logits/chosen": -20.37662124633789, "logits/rejected": -19.19899559020996, "logps/chosen": -404.94122314453125, "logps/rejected": -330.80279541015625, "loss": 0.7529, "rewards/accuracies": 0.5, "rewards/chosen": 3.1296286582946777, "rewards/margins": 0.7040113806724548, "rewards/rejected": 2.425616979598999, "step": 41960 }, { "epoch": 1.9485584288964204, "grad_norm": 3.4707765579223633, "learning_rate": 1.7540275778819816e-07, "logits/chosen": -18.759984970092773, "logits/rejected": -18.31571388244629, "logps/chosen": -362.45654296875, "logps/rejected": -312.62847900390625, "loss": 1.564, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.883327007293701, "rewards/margins": 0.3821950852870941, "rewards/rejected": 2.501131772994995, "step": 41970 }, { "epoch": 1.9490227030038536, "grad_norm": 19.501375198364258, "learning_rate": 1.7532537877029262e-07, "logits/chosen": -19.43834686279297, "logits/rejected": -17.836729049682617, "logps/chosen": -384.9659423828125, "logps/rejected": -254.29483032226562, "loss": 0.1785, "rewards/accuracies": 1.0, "rewards/chosen": 3.2742221355438232, "rewards/margins": 2.1720080375671387, "rewards/rejected": 1.1022140979766846, "step": 41980 }, { "epoch": 1.9494869771112864, "grad_norm": 85.76067352294922, "learning_rate": 1.7524799975238713e-07, "logits/chosen": -19.64295768737793, "logits/rejected": -19.01586151123047, "logps/chosen": -404.44586181640625, "logps/rejected": -300.06561279296875, "loss": 0.5525, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.9515862464904785, "rewards/margins": 1.3644146919250488, "rewards/rejected": 1.5871714353561401, "step": 41990 }, { "epoch": 1.9499512512187196, "grad_norm": 271.0880126953125, "learning_rate": 1.7517062073448164e-07, "logits/chosen": -18.95743179321289, "logits/rejected": -18.5494441986084, "logps/chosen": -447.91839599609375, "logps/rejected": 
-420.1226501464844, "loss": 1.3707, "rewards/accuracies": 0.5, "rewards/chosen": 4.082684516906738, "rewards/margins": 0.2624328136444092, "rewards/rejected": 3.820251941680908, "step": 42000 }, { "epoch": 1.9504155253261526, "grad_norm": 3.7664313316345215, "learning_rate": 1.7509324171657613e-07, "logits/chosen": -18.971004486083984, "logits/rejected": -17.265583038330078, "logps/chosen": -502.1371154785156, "logps/rejected": -314.7098693847656, "loss": 0.1521, "rewards/accuracies": 1.0, "rewards/chosen": 4.7346978187561035, "rewards/margins": 2.6189889907836914, "rewards/rejected": 2.115708827972412, "step": 42010 }, { "epoch": 1.9508797994335856, "grad_norm": 141.7394561767578, "learning_rate": 1.750158626986706e-07, "logits/chosen": -19.12458038330078, "logits/rejected": -19.260456085205078, "logps/chosen": -413.03692626953125, "logps/rejected": -386.90802001953125, "loss": 1.1102, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.5417098999023438, "rewards/margins": 0.2871529161930084, "rewards/rejected": 3.2545571327209473, "step": 42020 }, { "epoch": 1.9513440735410186, "grad_norm": 108.7498550415039, "learning_rate": 1.749384836807651e-07, "logits/chosen": -18.939237594604492, "logits/rejected": -18.296955108642578, "logps/chosen": -312.61785888671875, "logps/rejected": -290.549560546875, "loss": 0.7875, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0016984939575195, "rewards/margins": 1.1267858743667603, "rewards/rejected": 1.8749125003814697, "step": 42030 }, { "epoch": 1.9518083476484516, "grad_norm": 0.22191722691059113, "learning_rate": 1.748611046628596e-07, "logits/chosen": -18.070674896240234, "logits/rejected": -16.946252822875977, "logps/chosen": -369.52374267578125, "logps/rejected": -252.5428009033203, "loss": 0.382, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.01005220413208, "rewards/margins": 1.727684736251831, "rewards/rejected": 1.2823673486709595, "step": 42040 }, { "epoch": 1.9522726217558848, 
"grad_norm": 4.670910358428955, "learning_rate": 1.7478372564495412e-07, "logits/chosen": -19.46985626220703, "logits/rejected": -17.912425994873047, "logps/chosen": -349.1601257324219, "logps/rejected": -251.79006958007812, "loss": 0.5347, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.626579761505127, "rewards/margins": 2.0373384952545166, "rewards/rejected": 1.5892415046691895, "step": 42050 }, { "epoch": 1.9527368958633176, "grad_norm": 106.16140747070312, "learning_rate": 1.747063466270486e-07, "logits/chosen": -19.57090187072754, "logits/rejected": -18.313106536865234, "logps/chosen": -415.9850158691406, "logps/rejected": -280.5947265625, "loss": 0.7307, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.0492026805877686, "rewards/margins": 0.9129678606987, "rewards/rejected": 2.136234760284424, "step": 42060 }, { "epoch": 1.9532011699707508, "grad_norm": 17.913908004760742, "learning_rate": 1.746289676091431e-07, "logits/chosen": -17.587223052978516, "logits/rejected": -16.91715431213379, "logps/chosen": -248.13778686523438, "logps/rejected": -167.19606018066406, "loss": 0.4034, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.3113977909088135, "rewards/margins": 1.446899175643921, "rewards/rejected": -0.13550150394439697, "step": 42070 }, { "epoch": 1.9536654440781838, "grad_norm": 114.52156066894531, "learning_rate": 1.7455158859123757e-07, "logits/chosen": -19.61550521850586, "logits/rejected": -18.412710189819336, "logps/chosen": -407.50909423828125, "logps/rejected": -324.54327392578125, "loss": 0.7637, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6545403003692627, "rewards/margins": 0.7784953117370605, "rewards/rejected": 2.876044988632202, "step": 42080 }, { "epoch": 1.9541297181856168, "grad_norm": 120.78046417236328, "learning_rate": 1.7447420957333209e-07, "logits/chosen": -19.233076095581055, "logits/rejected": -18.9160099029541, "logps/chosen": -382.4099426269531, "logps/rejected": 
-323.91815185546875, "loss": 1.2078, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.9522130489349365, "rewards/margins": -0.06416983902454376, "rewards/rejected": 2.016383171081543, "step": 42090 }, { "epoch": 1.95459399229305, "grad_norm": 53.8026008605957, "learning_rate": 1.743968305554266e-07, "logits/chosen": -19.62746810913086, "logits/rejected": -19.688920974731445, "logps/chosen": -469.4755859375, "logps/rejected": -418.8456115722656, "loss": 0.6597, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.576084613800049, "rewards/margins": 0.24481935799121857, "rewards/rejected": 3.3312652111053467, "step": 42100 }, { "epoch": 1.9550582664004827, "grad_norm": 39.498043060302734, "learning_rate": 1.7431945153752108e-07, "logits/chosen": -19.72922134399414, "logits/rejected": -20.190519332885742, "logps/chosen": -476.51580810546875, "logps/rejected": -411.5259704589844, "loss": 0.815, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.135726451873779, "rewards/margins": 0.19731232523918152, "rewards/rejected": 3.9384140968322754, "step": 42110 }, { "epoch": 1.955522540507916, "grad_norm": 28.563037872314453, "learning_rate": 1.7424207251961557e-07, "logits/chosen": -19.191001892089844, "logits/rejected": -18.889362335205078, "logps/chosen": -353.21478271484375, "logps/rejected": -379.08331298828125, "loss": 1.1402, "rewards/accuracies": 0.5, "rewards/chosen": 2.7773680686950684, "rewards/margins": 0.12673453986644745, "rewards/rejected": 2.6506335735321045, "step": 42120 }, { "epoch": 1.955986814615349, "grad_norm": 61.502925872802734, "learning_rate": 1.7416469350171005e-07, "logits/chosen": -18.764379501342773, "logits/rejected": -18.945253372192383, "logps/chosen": -447.41668701171875, "logps/rejected": -454.8837890625, "loss": 0.6754, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.547882556915283, "rewards/margins": 0.5679380297660828, "rewards/rejected": 3.9799447059631348, "step": 42130 }, { "epoch": 
1.956451088722782, "grad_norm": 8.604982376098633, "learning_rate": 1.7408731448380456e-07, "logits/chosen": -19.997474670410156, "logits/rejected": -19.27553939819336, "logps/chosen": -378.7911071777344, "logps/rejected": -319.1512451171875, "loss": 0.6446, "rewards/accuracies": 0.5, "rewards/chosen": 3.7609028816223145, "rewards/margins": 1.2946207523345947, "rewards/rejected": 2.466282367706299, "step": 42140 }, { "epoch": 1.956915362830215, "grad_norm": 1.4211798906326294, "learning_rate": 1.7400993546589907e-07, "logits/chosen": -19.02813720703125, "logits/rejected": -18.488588333129883, "logps/chosen": -494.1678161621094, "logps/rejected": -374.5550231933594, "loss": 0.7195, "rewards/accuracies": 0.5, "rewards/chosen": 3.6108462810516357, "rewards/margins": 1.3974583148956299, "rewards/rejected": 2.213388442993164, "step": 42150 }, { "epoch": 1.957379636937648, "grad_norm": 63.73282241821289, "learning_rate": 1.7393255644799356e-07, "logits/chosen": -19.343429565429688, "logits/rejected": -18.797142028808594, "logps/chosen": -362.95697021484375, "logps/rejected": -321.71136474609375, "loss": 0.6872, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1578094959259033, "rewards/margins": 1.4275996685028076, "rewards/rejected": 1.7302097082138062, "step": 42160 }, { "epoch": 1.9578439110450812, "grad_norm": 57.09855270385742, "learning_rate": 1.7385517743008804e-07, "logits/chosen": -18.54822540283203, "logits/rejected": -18.24931526184082, "logps/chosen": -401.02313232421875, "logps/rejected": -373.26177978515625, "loss": 0.6761, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0234463214874268, "rewards/margins": 0.4791272282600403, "rewards/rejected": 2.5443191528320312, "step": 42170 }, { "epoch": 1.958308185152514, "grad_norm": 154.11880493164062, "learning_rate": 1.7377779841218253e-07, "logits/chosen": -18.970375061035156, "logits/rejected": -18.5712947845459, "logps/chosen": -457.22943115234375, "logps/rejected": -377.8633117675781, 
"loss": 0.3835, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.370490550994873, "rewards/margins": 1.9367644786834717, "rewards/rejected": 2.4337260723114014, "step": 42180 }, { "epoch": 1.9587724592599471, "grad_norm": 20.386016845703125, "learning_rate": 1.7370041939427704e-07, "logits/chosen": -19.0762882232666, "logits/rejected": -18.760961532592773, "logps/chosen": -369.5842590332031, "logps/rejected": -344.5333557128906, "loss": 0.5631, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.380389451980591, "rewards/margins": 0.8900656700134277, "rewards/rejected": 2.490324020385742, "step": 42190 }, { "epoch": 1.9592367333673801, "grad_norm": 167.5911407470703, "learning_rate": 1.7362304037637155e-07, "logits/chosen": -18.319168090820312, "logits/rejected": -18.324697494506836, "logps/chosen": -406.43572998046875, "logps/rejected": -338.23980712890625, "loss": 0.9718, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.6893131732940674, "rewards/margins": -0.19626213610172272, "rewards/rejected": 2.88557505607605, "step": 42200 }, { "epoch": 1.9597010074748131, "grad_norm": 0.041381556540727615, "learning_rate": 1.73545661358466e-07, "logits/chosen": -18.631145477294922, "logits/rejected": -17.21847152709961, "logps/chosen": -364.7686767578125, "logps/rejected": -272.8631286621094, "loss": 1.2314, "rewards/accuracies": 0.5, "rewards/chosen": 3.1896023750305176, "rewards/margins": 1.1325193643569946, "rewards/rejected": 2.0570831298828125, "step": 42210 }, { "epoch": 1.9601652815822461, "grad_norm": 106.2726058959961, "learning_rate": 1.7346828234056052e-07, "logits/chosen": -18.820327758789062, "logits/rejected": -18.67880630493164, "logps/chosen": -376.2862854003906, "logps/rejected": -352.9109191894531, "loss": 0.8475, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.41369366645813, "rewards/margins": 0.6035075187683105, "rewards/rejected": 2.8101859092712402, "step": 42220 }, { "epoch": 1.9606295556896791, 
"grad_norm": 64.3809585571289, "learning_rate": 1.73390903322655e-07, "logits/chosen": -17.52334213256836, "logits/rejected": -17.51614761352539, "logps/chosen": -406.5859069824219, "logps/rejected": -339.2823181152344, "loss": 1.3087, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.539121150970459, "rewards/margins": 0.19376564025878906, "rewards/rejected": 3.345355272293091, "step": 42230 }, { "epoch": 1.9610938297971123, "grad_norm": 17.580942153930664, "learning_rate": 1.7331352430474952e-07, "logits/chosen": -18.486419677734375, "logits/rejected": -17.371826171875, "logps/chosen": -478.7789001464844, "logps/rejected": -295.0704345703125, "loss": 0.2677, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.398895740509033, "rewards/margins": 1.65194571018219, "rewards/rejected": 1.7469501495361328, "step": 42240 }, { "epoch": 1.961558103904545, "grad_norm": 120.74161529541016, "learning_rate": 1.7323614528684403e-07, "logits/chosen": -19.03679656982422, "logits/rejected": -18.763671875, "logps/chosen": -357.3506164550781, "logps/rejected": -314.8415222167969, "loss": 0.9793, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.439145803451538, "rewards/margins": -0.17062067985534668, "rewards/rejected": 2.6097664833068848, "step": 42250 }, { "epoch": 1.9620223780119783, "grad_norm": 104.36107635498047, "learning_rate": 1.731587662689385e-07, "logits/chosen": -19.00360107421875, "logits/rejected": -17.651092529296875, "logps/chosen": -278.1584777832031, "logps/rejected": -220.54696655273438, "loss": 0.5168, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.619351625442505, "rewards/margins": 1.3226652145385742, "rewards/rejected": 1.2966862916946411, "step": 42260 }, { "epoch": 1.9624866521194113, "grad_norm": 78.03960418701172, "learning_rate": 1.7308912515282356e-07, "logits/chosen": -18.324214935302734, "logits/rejected": -18.541522979736328, "logps/chosen": -425.69329833984375, "logps/rejected": -413.7953186035156, 
"loss": 1.2754, "rewards/accuracies": 0.5, "rewards/chosen": 3.064189910888672, "rewards/margins": -0.4041732847690582, "rewards/rejected": 3.4683632850646973, "step": 42270 }, { "epoch": 1.9629509262268443, "grad_norm": 95.96202850341797, "learning_rate": 1.7301174613491805e-07, "logits/chosen": -19.303274154663086, "logits/rejected": -18.2388858795166, "logps/chosen": -507.72845458984375, "logps/rejected": -402.0015869140625, "loss": 1.1331, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.327877044677734, "rewards/margins": 1.1774505376815796, "rewards/rejected": 3.1504263877868652, "step": 42280 }, { "epoch": 1.9634152003342775, "grad_norm": 49.47986602783203, "learning_rate": 1.7293436711701253e-07, "logits/chosen": -18.335693359375, "logits/rejected": -18.102415084838867, "logps/chosen": -315.3827209472656, "logps/rejected": -341.8612060546875, "loss": 0.8995, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.8328607082366943, "rewards/margins": 0.5845869779586792, "rewards/rejected": 2.2482738494873047, "step": 42290 }, { "epoch": 1.9638794744417103, "grad_norm": 19.729928970336914, "learning_rate": 1.7285698809910704e-07, "logits/chosen": -19.530757904052734, "logits/rejected": -19.318777084350586, "logps/chosen": -506.05364990234375, "logps/rejected": -428.2742614746094, "loss": 0.4966, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.099289417266846, "rewards/margins": 1.7289918661117554, "rewards/rejected": 3.37029767036438, "step": 42300 }, { "epoch": 1.9643437485491435, "grad_norm": 233.36663818359375, "learning_rate": 1.7277960908120153e-07, "logits/chosen": -19.280376434326172, "logits/rejected": -17.88804817199707, "logps/chosen": -497.549072265625, "logps/rejected": -333.58148193359375, "loss": 0.5551, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.568202257156372, "rewards/margins": 1.6397731304168701, "rewards/rejected": 1.9284292459487915, "step": 42310 }, { "epoch": 1.9648080226565763, 
"grad_norm": 55.04568099975586, "learning_rate": 1.7270223006329604e-07, "logits/chosen": -19.838180541992188, "logits/rejected": -17.942241668701172, "logps/chosen": -384.97637939453125, "logps/rejected": -266.31256103515625, "loss": 0.304, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.927342176437378, "rewards/margins": 2.308371067047119, "rewards/rejected": 1.6189712285995483, "step": 42320 }, { "epoch": 1.9652722967640095, "grad_norm": 3.105358123779297, "learning_rate": 1.7262485104539052e-07, "logits/chosen": -18.9783935546875, "logits/rejected": -18.380611419677734, "logps/chosen": -344.51885986328125, "logps/rejected": -293.336181640625, "loss": 0.7574, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.993746042251587, "rewards/margins": 1.1231553554534912, "rewards/rejected": 1.8705905675888062, "step": 42330 }, { "epoch": 1.9657365708714425, "grad_norm": 0.2469439059495926, "learning_rate": 1.72547472027485e-07, "logits/chosen": -19.166337966918945, "logits/rejected": -17.634471893310547, "logps/chosen": -380.99786376953125, "logps/rejected": -255.995361328125, "loss": 0.2631, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.9294326305389404, "rewards/margins": 2.516986608505249, "rewards/rejected": 1.4124460220336914, "step": 42340 }, { "epoch": 1.9662008449788755, "grad_norm": 90.91157531738281, "learning_rate": 1.7247009300957952e-07, "logits/chosen": -18.086551666259766, "logits/rejected": -17.6080265045166, "logps/chosen": -271.7884521484375, "logps/rejected": -223.40194702148438, "loss": 0.6521, "rewards/accuracies": 0.5, "rewards/chosen": 1.3489747047424316, "rewards/margins": 0.437513530254364, "rewards/rejected": 0.9114610552787781, "step": 42350 }, { "epoch": 1.9666651190863087, "grad_norm": 0.2069089412689209, "learning_rate": 1.72392713991674e-07, "logits/chosen": -20.35519790649414, "logits/rejected": -18.44029426574707, "logps/chosen": -492.00787353515625, "logps/rejected": -305.2986145019531, "loss": 
0.4022, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.576564788818359, "rewards/margins": 3.0931737422943115, "rewards/rejected": 1.4833909273147583, "step": 42360 }, { "epoch": 1.9671293931937415, "grad_norm": 115.63726806640625, "learning_rate": 1.7231533497376852e-07, "logits/chosen": -19.955718994140625, "logits/rejected": -18.944116592407227, "logps/chosen": -436.16607666015625, "logps/rejected": -371.13262939453125, "loss": 0.3441, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.201809406280518, "rewards/margins": 1.3245360851287842, "rewards/rejected": 2.8772735595703125, "step": 42370 }, { "epoch": 1.9675936673011747, "grad_norm": 0.6980170011520386, "learning_rate": 1.72237955955863e-07, "logits/chosen": -20.969961166381836, "logits/rejected": -19.634765625, "logps/chosen": -367.7952880859375, "logps/rejected": -297.43792724609375, "loss": 0.6777, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.5845882892608643, "rewards/margins": 1.2687103748321533, "rewards/rejected": 2.31587815284729, "step": 42380 }, { "epoch": 1.9680579414086077, "grad_norm": 0.1970328688621521, "learning_rate": 1.7216057693795749e-07, "logits/chosen": -19.058860778808594, "logits/rejected": -18.54595184326172, "logps/chosen": -308.5770568847656, "logps/rejected": -263.58892822265625, "loss": 0.8817, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.317561149597168, "rewards/margins": 2.328768253326416, "rewards/rejected": 1.9887924194335938, "step": 42390 }, { "epoch": 1.9685222155160407, "grad_norm": 21.978666305541992, "learning_rate": 1.72083197920052e-07, "logits/chosen": -18.69919204711914, "logits/rejected": -17.50396728515625, "logps/chosen": -442.910400390625, "logps/rejected": -278.4095764160156, "loss": 0.3047, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.871068954467773, "rewards/margins": 2.4654316902160645, "rewards/rejected": 2.405637264251709, "step": 42400 }, { "epoch": 1.9689864896234737, 
"grad_norm": 168.72740173339844, "learning_rate": 1.7200581890214648e-07, "logits/chosen": -18.81775665283203, "logits/rejected": -18.572526931762695, "logps/chosen": -286.0657043457031, "logps/rejected": -300.25128173828125, "loss": 0.7951, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.4391562938690186, "rewards/margins": 0.3987088203430176, "rewards/rejected": 2.040447235107422, "step": 42410 }, { "epoch": 1.9694507637309067, "grad_norm": 30.97897720336914, "learning_rate": 1.71928439884241e-07, "logits/chosen": -18.365507125854492, "logits/rejected": -18.53192138671875, "logps/chosen": -406.27691650390625, "logps/rejected": -391.1355285644531, "loss": 0.8741, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.4263834953308105, "rewards/margins": 0.8443703651428223, "rewards/rejected": 2.5820131301879883, "step": 42420 }, { "epoch": 1.9699150378383399, "grad_norm": 112.03108215332031, "learning_rate": 1.7185106086633548e-07, "logits/chosen": -19.452381134033203, "logits/rejected": -18.704242706298828, "logps/chosen": -356.79193115234375, "logps/rejected": -301.1029968261719, "loss": 0.6091, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.123600482940674, "rewards/margins": 0.8347448110580444, "rewards/rejected": 2.288856029510498, "step": 42430 }, { "epoch": 1.9703793119457726, "grad_norm": 6.516988754272461, "learning_rate": 1.7177368184842996e-07, "logits/chosen": -19.364627838134766, "logits/rejected": -17.9683837890625, "logps/chosen": -348.7947692871094, "logps/rejected": -328.6549377441406, "loss": 1.2305, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.6716971397399902, "rewards/margins": 0.936453640460968, "rewards/rejected": 2.735243320465088, "step": 42440 }, { "epoch": 1.9708435860532059, "grad_norm": 196.36648559570312, "learning_rate": 1.7169630283052447e-07, "logits/chosen": -18.778095245361328, "logits/rejected": -18.27724266052246, "logps/chosen": -324.69573974609375, "logps/rejected": 
-282.93377685546875, "loss": 0.9581, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.1156582832336426, "rewards/margins": 0.017542142421007156, "rewards/rejected": 2.098116397857666, "step": 42450 }, { "epoch": 1.9713078601606389, "grad_norm": 122.5443115234375, "learning_rate": 1.7161892381261896e-07, "logits/chosen": -18.78476333618164, "logits/rejected": -17.390079498291016, "logps/chosen": -493.9261779785156, "logps/rejected": -351.29132080078125, "loss": 1.2858, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.9862494468688965, "rewards/margins": 1.3071436882019043, "rewards/rejected": 2.679105281829834, "step": 42460 }, { "epoch": 1.9717721342680719, "grad_norm": 88.9327163696289, "learning_rate": 1.7154154479471344e-07, "logits/chosen": -18.976919174194336, "logits/rejected": -18.64806365966797, "logps/chosen": -377.7452697753906, "logps/rejected": -267.22039794921875, "loss": 0.4985, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4082839488983154, "rewards/margins": 1.5712618827819824, "rewards/rejected": 1.8370224237442017, "step": 42470 }, { "epoch": 1.9722364083755048, "grad_norm": 112.36805725097656, "learning_rate": 1.7146416577680796e-07, "logits/chosen": -19.06185531616211, "logits/rejected": -17.247304916381836, "logps/chosen": -396.55511474609375, "logps/rejected": -253.927490234375, "loss": 0.4084, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.3138492107391357, "rewards/margins": 1.839725136756897, "rewards/rejected": 1.4741243124008179, "step": 42480 }, { "epoch": 1.9727006824829378, "grad_norm": 6.171538352966309, "learning_rate": 1.7138678675890244e-07, "logits/chosen": -20.070781707763672, "logits/rejected": -19.609960556030273, "logps/chosen": -575.5579833984375, "logps/rejected": -431.22723388671875, "loss": 0.5654, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.123932838439941, "rewards/margins": 0.9642589688301086, "rewards/rejected": 3.1596741676330566, "step": 42490 }, 
{ "epoch": 1.973164956590371, "grad_norm": 25.12061309814453, "learning_rate": 1.7130940774099695e-07, "logits/chosen": -18.254898071289062, "logits/rejected": -17.578622817993164, "logps/chosen": -294.4826354980469, "logps/rejected": -245.9952850341797, "loss": 0.3627, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.5949156284332275, "rewards/margins": 1.0982025861740112, "rewards/rejected": 1.496713399887085, "step": 42500 }, { "epoch": 1.9736292306978038, "grad_norm": 26.953392028808594, "learning_rate": 1.7123202872309144e-07, "logits/chosen": -18.431467056274414, "logits/rejected": -17.445722579956055, "logps/chosen": -505.75244140625, "logps/rejected": -365.3111572265625, "loss": 0.7272, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.047900676727295, "rewards/margins": 1.3147169351577759, "rewards/rejected": 2.7331833839416504, "step": 42510 }, { "epoch": 1.974093504805237, "grad_norm": 199.93197631835938, "learning_rate": 1.7115464970518592e-07, "logits/chosen": -19.42055892944336, "logits/rejected": -19.280376434326172, "logps/chosen": -386.69732666015625, "logps/rejected": -356.3951416015625, "loss": 0.776, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.2471301555633545, "rewards/margins": 0.7367416620254517, "rewards/rejected": 2.510388135910034, "step": 42520 }, { "epoch": 1.97455777891267, "grad_norm": 39.65922927856445, "learning_rate": 1.7107727068728043e-07, "logits/chosen": -19.84463882446289, "logits/rejected": -19.218265533447266, "logps/chosen": -376.66973876953125, "logps/rejected": -398.109130859375, "loss": 0.7524, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.252582550048828, "rewards/margins": 0.4045191705226898, "rewards/rejected": 2.8480632305145264, "step": 42530 }, { "epoch": 1.975022053020103, "grad_norm": 22.29667854309082, "learning_rate": 1.7099989166937492e-07, "logits/chosen": -20.41984748840332, "logits/rejected": -17.403892517089844, "logps/chosen": -562.3885498046875, 
"logps/rejected": -275.98052978515625, "loss": 0.1757, "rewards/accuracies": 1.0, "rewards/chosen": 4.491280555725098, "rewards/margins": 3.6037726402282715, "rewards/rejected": 0.8875080943107605, "step": 42540 }, { "epoch": 1.9754863271275362, "grad_norm": 114.50531005859375, "learning_rate": 1.7092251265146943e-07, "logits/chosen": -19.015518188476562, "logits/rejected": -18.52820587158203, "logps/chosen": -363.5414123535156, "logps/rejected": -283.53692626953125, "loss": 0.8421, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.6402125358581543, "rewards/margins": 0.7576706409454346, "rewards/rejected": 1.8825420141220093, "step": 42550 }, { "epoch": 1.975950601234969, "grad_norm": 190.35089111328125, "learning_rate": 1.7084513363356391e-07, "logits/chosen": -19.55683135986328, "logits/rejected": -20.228729248046875, "logps/chosen": -416.7555236816406, "logps/rejected": -503.43426513671875, "loss": 1.2595, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.311269283294678, "rewards/margins": -0.22126264870166779, "rewards/rejected": 4.532531261444092, "step": 42560 }, { "epoch": 1.9764148753424022, "grad_norm": 9.415533065795898, "learning_rate": 1.707677546156584e-07, "logits/chosen": -18.35112762451172, "logits/rejected": -17.6039981842041, "logps/chosen": -351.7950744628906, "logps/rejected": -269.64398193359375, "loss": 0.8557, "rewards/accuracies": 0.5, "rewards/chosen": 2.909578800201416, "rewards/margins": 0.5406249165534973, "rewards/rejected": 2.3689537048339844, "step": 42570 }, { "epoch": 1.9768791494498352, "grad_norm": 241.12596130371094, "learning_rate": 1.706903755977529e-07, "logits/chosen": -18.47273826599121, "logits/rejected": -17.428218841552734, "logps/chosen": -475.93310546875, "logps/rejected": -321.4116516113281, "loss": 0.967, "rewards/accuracies": 0.5, "rewards/chosen": 4.048563003540039, "rewards/margins": 1.4395086765289307, "rewards/rejected": 2.6090540885925293, "step": 42580 }, { "epoch": 1.9773434235572682, 
"grad_norm": 70.10381317138672, "learning_rate": 1.706129965798474e-07, "logits/chosen": -19.110797882080078, "logits/rejected": -18.034690856933594, "logps/chosen": -297.1219177246094, "logps/rejected": -182.69216918945312, "loss": 0.564, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.687706708908081, "rewards/margins": 1.438546895980835, "rewards/rejected": 1.249159812927246, "step": 42590 }, { "epoch": 1.9778076976647012, "grad_norm": 39.83516311645508, "learning_rate": 1.705356175619419e-07, "logits/chosen": -19.323955535888672, "logits/rejected": -19.2705135345459, "logps/chosen": -356.835205078125, "logps/rejected": -387.76678466796875, "loss": 1.22, "rewards/accuracies": 0.5, "rewards/chosen": 4.079049587249756, "rewards/margins": 0.4265001714229584, "rewards/rejected": 3.6525497436523438, "step": 42600 }, { "epoch": 1.9782719717721342, "grad_norm": 41.27239990234375, "learning_rate": 1.704582385440364e-07, "logits/chosen": -18.159753799438477, "logits/rejected": -17.796751022338867, "logps/chosen": -313.92864990234375, "logps/rejected": -280.8551330566406, "loss": 0.5192, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.001502513885498, "rewards/margins": 1.218759298324585, "rewards/rejected": 1.782743215560913, "step": 42610 }, { "epoch": 1.9787362458795674, "grad_norm": 2.8262076377868652, "learning_rate": 1.7038085952613088e-07, "logits/chosen": -19.307636260986328, "logits/rejected": -18.342281341552734, "logps/chosen": -513.97705078125, "logps/rejected": -330.73944091796875, "loss": 0.4663, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.189436912536621, "rewards/margins": 2.154869556427002, "rewards/rejected": 2.034567356109619, "step": 42620 }, { "epoch": 1.9792005199870002, "grad_norm": 36.45701599121094, "learning_rate": 1.703034805082254e-07, "logits/chosen": -18.334264755249023, "logits/rejected": -17.598661422729492, "logps/chosen": -331.8418273925781, "logps/rejected": -262.47119140625, "loss": 0.6933, 
"rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6086113452911377, "rewards/margins": 1.8923423290252686, "rewards/rejected": 1.71626877784729, "step": 42630 }, { "epoch": 1.9796647940944334, "grad_norm": 185.81417846679688, "learning_rate": 1.7022610149031987e-07, "logits/chosen": -19.275575637817383, "logits/rejected": -18.101253509521484, "logps/chosen": -396.3589172363281, "logps/rejected": -368.9793395996094, "loss": 0.6831, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.9753308296203613, "rewards/margins": 1.324432134628296, "rewards/rejected": 2.6508989334106445, "step": 42640 }, { "epoch": 1.9801290682018664, "grad_norm": 0.884814441204071, "learning_rate": 1.7014872247241438e-07, "logits/chosen": -19.740718841552734, "logits/rejected": -19.010665893554688, "logps/chosen": -278.6749572753906, "logps/rejected": -257.20208740234375, "loss": 0.7648, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.730346441268921, "rewards/margins": 0.8982359766960144, "rewards/rejected": 1.8321106433868408, "step": 42650 }, { "epoch": 1.9805933423092994, "grad_norm": 28.75130844116211, "learning_rate": 1.7007134345450887e-07, "logits/chosen": -19.417293548583984, "logits/rejected": -18.831127166748047, "logps/chosen": -260.8888854980469, "logps/rejected": -222.6279754638672, "loss": 1.0961, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.4949498176574707, "rewards/margins": 0.6482788324356079, "rewards/rejected": 1.8466708660125732, "step": 42660 }, { "epoch": 1.9810576164167324, "grad_norm": 90.32318115234375, "learning_rate": 1.6999396443660335e-07, "logits/chosen": -18.464885711669922, "logits/rejected": -17.67670440673828, "logps/chosen": -268.4358215332031, "logps/rejected": -204.2285614013672, "loss": 0.6796, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.172250270843506, "rewards/margins": 0.9524529576301575, "rewards/rejected": 1.2197976112365723, "step": 42670 }, { "epoch": 1.9815218905241654, 
"grad_norm": 1.5498073101043701, "learning_rate": 1.6991658541869786e-07, "logits/chosen": -18.61171531677246, "logits/rejected": -18.93238639831543, "logps/chosen": -395.5128173828125, "logps/rejected": -364.78240966796875, "loss": 0.5702, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.8692097663879395, "rewards/margins": 1.785431146621704, "rewards/rejected": 2.0837786197662354, "step": 42680 }, { "epoch": 1.9819861646315986, "grad_norm": 42.68657302856445, "learning_rate": 1.6983920640079235e-07, "logits/chosen": -19.532024383544922, "logits/rejected": -18.326946258544922, "logps/chosen": -317.0226745605469, "logps/rejected": -265.54168701171875, "loss": 0.4093, "rewards/accuracies": 1.0, "rewards/chosen": 3.063898801803589, "rewards/margins": 0.8517156839370728, "rewards/rejected": 2.2121834754943848, "step": 42690 }, { "epoch": 1.9824504387390314, "grad_norm": 105.30624389648438, "learning_rate": 1.6976182738288686e-07, "logits/chosen": -18.526315689086914, "logits/rejected": -18.04207992553711, "logps/chosen": -409.75030517578125, "logps/rejected": -340.6923828125, "loss": 0.7406, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.221479892730713, "rewards/margins": 1.3119150400161743, "rewards/rejected": 2.909564971923828, "step": 42700 }, { "epoch": 1.9829147128464646, "grad_norm": 52.59961700439453, "learning_rate": 1.6968444836498132e-07, "logits/chosen": -17.917322158813477, "logits/rejected": -17.402441024780273, "logps/chosen": -393.5122985839844, "logps/rejected": -325.802978515625, "loss": 0.4195, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.8346714973449707, "rewards/margins": 1.0956814289093018, "rewards/rejected": 1.7389898300170898, "step": 42710 }, { "epoch": 1.9833789869538976, "grad_norm": 82.57179260253906, "learning_rate": 1.6960706934707583e-07, "logits/chosen": -19.192888259887695, "logits/rejected": -18.96384048461914, "logps/chosen": -361.84014892578125, "logps/rejected": -342.47760009765625, 
"loss": 0.7303, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.049080848693848, "rewards/margins": 0.5698219537734985, "rewards/rejected": 3.479259490966797, "step": 42720 }, { "epoch": 1.9838432610613306, "grad_norm": 63.12668228149414, "learning_rate": 1.6952969032917034e-07, "logits/chosen": -19.577016830444336, "logits/rejected": -18.201107025146484, "logps/chosen": -377.1256408691406, "logps/rejected": -284.0633544921875, "loss": 0.3392, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.2209668159484863, "rewards/margins": 1.468400001525879, "rewards/rejected": 1.752566933631897, "step": 42730 }, { "epoch": 1.9843075351687638, "grad_norm": 208.67628479003906, "learning_rate": 1.6945231131126483e-07, "logits/chosen": -17.723651885986328, "logits/rejected": -17.835262298583984, "logps/chosen": -338.6669006347656, "logps/rejected": -377.8327941894531, "loss": 1.6214, "rewards/accuracies": 0.5, "rewards/chosen": 1.6934478282928467, "rewards/margins": -0.6750274896621704, "rewards/rejected": 2.3684754371643066, "step": 42740 }, { "epoch": 1.9847718092761966, "grad_norm": 298.027099609375, "learning_rate": 1.6937493229335934e-07, "logits/chosen": -19.360750198364258, "logits/rejected": -18.239891052246094, "logps/chosen": -400.114501953125, "logps/rejected": -298.5391540527344, "loss": 0.7942, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.3141417503356934, "rewards/margins": 1.0794624090194702, "rewards/rejected": 2.234679698944092, "step": 42750 }, { "epoch": 1.9852360833836298, "grad_norm": 93.66767120361328, "learning_rate": 1.692975532754538e-07, "logits/chosen": -19.63385009765625, "logits/rejected": -18.66754722595215, "logps/chosen": -383.0490417480469, "logps/rejected": -359.1695556640625, "loss": 0.3203, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.7915425300598145, "rewards/margins": 1.3849729299545288, "rewards/rejected": 2.406569719314575, "step": 42760 }, { "epoch": 1.9857003574910628, 
"grad_norm": 0.7686967849731445, "learning_rate": 1.692201742575483e-07, "logits/chosen": -18.925064086914062, "logits/rejected": -18.354278564453125, "logps/chosen": -324.7124938964844, "logps/rejected": -293.29132080078125, "loss": 0.6281, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.7479004859924316, "rewards/margins": 1.4516489505767822, "rewards/rejected": 2.2962512969970703, "step": 42770 }, { "epoch": 1.9861646315984958, "grad_norm": 234.9791259765625, "learning_rate": 1.6914279523964282e-07, "logits/chosen": -18.90162467956543, "logits/rejected": -18.004005432128906, "logps/chosen": -519.3844604492188, "logps/rejected": -356.42315673828125, "loss": 0.6007, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.032400608062744, "rewards/margins": 2.425293445587158, "rewards/rejected": 1.607107400894165, "step": 42780 }, { "epoch": 1.9866289057059288, "grad_norm": 295.9817199707031, "learning_rate": 1.690654162217373e-07, "logits/chosen": -19.73373031616211, "logits/rejected": -18.894338607788086, "logps/chosen": -472.12109375, "logps/rejected": -378.72900390625, "loss": 0.5409, "rewards/accuracies": 0.5, "rewards/chosen": 2.8070292472839355, "rewards/margins": 0.8468372225761414, "rewards/rejected": 1.9601919651031494, "step": 42790 }, { "epoch": 1.9870931798133618, "grad_norm": 191.9786376953125, "learning_rate": 1.6898803720383181e-07, "logits/chosen": -19.231739044189453, "logits/rejected": -18.649127960205078, "logps/chosen": -384.6370544433594, "logps/rejected": -323.24896240234375, "loss": 0.8198, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.428586483001709, "rewards/margins": 0.6469709873199463, "rewards/rejected": 2.781615734100342, "step": 42800 }, { "epoch": 1.987557453920795, "grad_norm": 92.60738372802734, "learning_rate": 1.6891065818592627e-07, "logits/chosen": -19.108272552490234, "logits/rejected": -18.506776809692383, "logps/chosen": -314.53167724609375, "logps/rejected": -269.2769775390625, "loss": 
0.6843, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.0195119380950928, "rewards/margins": 0.9318313598632812, "rewards/rejected": 1.087680459022522, "step": 42810 }, { "epoch": 1.9880217280282277, "grad_norm": 7.471775531768799, "learning_rate": 1.6883327916802078e-07, "logits/chosen": -18.925113677978516, "logits/rejected": -18.147937774658203, "logps/chosen": -288.45050048828125, "logps/rejected": -271.7399597167969, "loss": 0.7438, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.6556789875030518, "rewards/margins": 0.8050069808959961, "rewards/rejected": 0.8506719470024109, "step": 42820 }, { "epoch": 1.988486002135661, "grad_norm": 176.4009246826172, "learning_rate": 1.687559001501153e-07, "logits/chosen": -19.64316177368164, "logits/rejected": -18.016353607177734, "logps/chosen": -487.11541748046875, "logps/rejected": -429.878662109375, "loss": 0.6773, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.659114360809326, "rewards/margins": 1.600217580795288, "rewards/rejected": 3.058896780014038, "step": 42830 }, { "epoch": 1.988950276243094, "grad_norm": 44.559410095214844, "learning_rate": 1.6867852113220978e-07, "logits/chosen": -19.353778839111328, "logits/rejected": -18.72319221496582, "logps/chosen": -525.3070068359375, "logps/rejected": -418.0338439941406, "loss": 0.8936, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.144699573516846, "rewards/margins": 0.7925774455070496, "rewards/rejected": 3.3521218299865723, "step": 42840 }, { "epoch": 1.989414550350527, "grad_norm": 31.815500259399414, "learning_rate": 1.686011421143043e-07, "logits/chosen": -17.82034683227539, "logits/rejected": -18.046810150146484, "logps/chosen": -379.72857666015625, "logps/rejected": -401.9202880859375, "loss": 1.3464, "rewards/accuracies": 0.5, "rewards/chosen": 2.3854479789733887, "rewards/margins": -0.37016764283180237, "rewards/rejected": 2.755615711212158, "step": 42850 }, { "epoch": 1.98987882445796, "grad_norm": 
31.376440048217773, "learning_rate": 1.6852376309639878e-07, "logits/chosen": -19.199100494384766, "logits/rejected": -18.699865341186523, "logps/chosen": -356.4948425292969, "logps/rejected": -307.21124267578125, "loss": 0.6414, "rewards/accuracies": 0.5, "rewards/chosen": 2.7660183906555176, "rewards/margins": 0.7433110475540161, "rewards/rejected": 2.022707462310791, "step": 42860 }, { "epoch": 1.990343098565393, "grad_norm": 211.86959838867188, "learning_rate": 1.6844638407849326e-07, "logits/chosen": -18.3198184967041, "logits/rejected": -18.04985809326172, "logps/chosen": -295.4805603027344, "logps/rejected": -255.5246124267578, "loss": 0.8186, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.563551425933838, "rewards/margins": 0.8810864686965942, "rewards/rejected": 1.682464838027954, "step": 42870 }, { "epoch": 1.9908073726728261, "grad_norm": 222.26268005371094, "learning_rate": 1.6836900506058777e-07, "logits/chosen": -18.705482482910156, "logits/rejected": -18.15180778503418, "logps/chosen": -429.8333435058594, "logps/rejected": -313.24127197265625, "loss": 0.9578, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.9249725341796875, "rewards/margins": 0.6074541807174683, "rewards/rejected": 3.3175177574157715, "step": 42880 }, { "epoch": 1.991271646780259, "grad_norm": 15.935266494750977, "learning_rate": 1.6829162604268226e-07, "logits/chosen": -20.24281883239746, "logits/rejected": -18.747304916381836, "logps/chosen": -346.5256652832031, "logps/rejected": -277.51776123046875, "loss": 0.8299, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.6183717250823975, "rewards/margins": 1.1468099355697632, "rewards/rejected": 1.4715616703033447, "step": 42890 }, { "epoch": 1.9917359208876921, "grad_norm": 187.1658477783203, "learning_rate": 1.6821424702477677e-07, "logits/chosen": -18.951927185058594, "logits/rejected": -18.708765029907227, "logps/chosen": -380.2528076171875, "logps/rejected": -383.0405578613281, "loss": 0.9009, 
"rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.1705567836761475, "rewards/margins": 0.04806254059076309, "rewards/rejected": 3.1224942207336426, "step": 42900 }, { "epoch": 1.9922001949951251, "grad_norm": 57.209938049316406, "learning_rate": 1.6813686800687125e-07, "logits/chosen": -19.102767944335938, "logits/rejected": -19.1539249420166, "logps/chosen": -410.4493103027344, "logps/rejected": -384.46929931640625, "loss": 1.1194, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.6201114654541016, "rewards/margins": 0.027861928567290306, "rewards/rejected": 3.592249631881714, "step": 42910 }, { "epoch": 1.9926644691025581, "grad_norm": 25.02916717529297, "learning_rate": 1.6805948898896574e-07, "logits/chosen": -19.72092056274414, "logits/rejected": -19.22296905517578, "logps/chosen": -394.1410217285156, "logps/rejected": -444.5887145996094, "loss": 1.2542, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.3631865978240967, "rewards/margins": -0.5155624151229858, "rewards/rejected": 3.878749370574951, "step": 42920 }, { "epoch": 1.9931287432099913, "grad_norm": 75.94929504394531, "learning_rate": 1.6798210997106025e-07, "logits/chosen": -18.89240264892578, "logits/rejected": -18.30921173095703, "logps/chosen": -306.2771301269531, "logps/rejected": -243.04324340820312, "loss": 0.4248, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.2161993980407715, "rewards/margins": 1.1972811222076416, "rewards/rejected": 1.0189180374145508, "step": 42930 }, { "epoch": 1.993593017317424, "grad_norm": 0.9297624230384827, "learning_rate": 1.6790473095315473e-07, "logits/chosen": -18.44124984741211, "logits/rejected": -18.21430015563965, "logps/chosen": -351.44891357421875, "logps/rejected": -279.3031921386719, "loss": 0.4138, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.307816982269287, "rewards/margins": 1.7808036804199219, "rewards/rejected": 1.5270133018493652, "step": 42940 }, { "epoch": 1.9940572914248573, 
"grad_norm": 160.25442504882812, "learning_rate": 1.6782735193524922e-07, "logits/chosen": -18.766618728637695, "logits/rejected": -18.379539489746094, "logps/chosen": -337.2171325683594, "logps/rejected": -246.2911834716797, "loss": 0.4898, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.9381449222564697, "rewards/margins": 1.4340803623199463, "rewards/rejected": 1.5040645599365234, "step": 42950 }, { "epoch": 1.9945215655322903, "grad_norm": 0.6847970485687256, "learning_rate": 1.6774997291734373e-07, "logits/chosen": -19.151453018188477, "logits/rejected": -18.407350540161133, "logps/chosen": -410.5625915527344, "logps/rejected": -291.9853820800781, "loss": 0.3544, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.181870937347412, "rewards/margins": 1.9143197536468506, "rewards/rejected": 2.2675516605377197, "step": 42960 }, { "epoch": 1.9949858396397233, "grad_norm": 38.92863845825195, "learning_rate": 1.6767259389943822e-07, "logits/chosen": -19.115190505981445, "logits/rejected": -19.00725746154785, "logps/chosen": -351.235595703125, "logps/rejected": -350.4767761230469, "loss": 0.6112, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0446596145629883, "rewards/margins": 0.8389652967453003, "rewards/rejected": 2.2056946754455566, "step": 42970 }, { "epoch": 1.9954501137471563, "grad_norm": 126.25018310546875, "learning_rate": 1.6759521488153273e-07, "logits/chosen": -18.937074661254883, "logits/rejected": -17.931909561157227, "logps/chosen": -356.26873779296875, "logps/rejected": -264.59466552734375, "loss": 0.936, "rewards/accuracies": 0.5, "rewards/chosen": 2.301536798477173, "rewards/margins": 0.11090286076068878, "rewards/rejected": 2.190634250640869, "step": 42980 }, { "epoch": 1.9959143878545893, "grad_norm": 7.264323711395264, "learning_rate": 1.675178358636272e-07, "logits/chosen": -19.80937957763672, "logits/rejected": -19.24510383605957, "logps/chosen": -421.4012756347656, "logps/rejected": -355.1914978027344, 
"loss": 0.5199, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6565098762512207, "rewards/margins": 1.218263030052185, "rewards/rejected": 2.438246965408325, "step": 42990 }, { "epoch": 1.9963786619620225, "grad_norm": 202.5471649169922, "learning_rate": 1.674404568457217e-07, "logits/chosen": -18.91192626953125, "logits/rejected": -18.335174560546875, "logps/chosen": -337.45416259765625, "logps/rejected": -272.4815673828125, "loss": 0.595, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.6160149574279785, "rewards/margins": 1.4410595893859863, "rewards/rejected": 1.1749554872512817, "step": 43000 }, { "epoch": 1.9968429360694553, "grad_norm": 34.556182861328125, "learning_rate": 1.673630778278162e-07, "logits/chosen": -18.788414001464844, "logits/rejected": -18.901456832885742, "logps/chosen": -365.40728759765625, "logps/rejected": -348.24822998046875, "loss": 1.2559, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.964473009109497, "rewards/margins": 0.8091947436332703, "rewards/rejected": 3.155278444290161, "step": 43010 }, { "epoch": 1.9973072101768885, "grad_norm": 225.86732482910156, "learning_rate": 1.672856988099107e-07, "logits/chosen": -20.244436264038086, "logits/rejected": -18.82942771911621, "logps/chosen": -437.033203125, "logps/rejected": -288.118896484375, "loss": 0.7952, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.4224801063537598, "rewards/margins": 0.958354651927948, "rewards/rejected": 2.464125156402588, "step": 43020 }, { "epoch": 1.9977714842843215, "grad_norm": 331.02606201171875, "learning_rate": 1.672083197920052e-07, "logits/chosen": -19.6708927154541, "logits/rejected": -17.918664932250977, "logps/chosen": -442.3092346191406, "logps/rejected": -375.9132080078125, "loss": 0.4967, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.312874794006348, "rewards/margins": 2.2291321754455566, "rewards/rejected": 2.083742380142212, "step": 43030 }, { "epoch": 1.9982357583917545, 
"grad_norm": 43.973506927490234, "learning_rate": 1.671309407740997e-07, "logits/chosen": -19.210285186767578, "logits/rejected": -18.426347732543945, "logps/chosen": -378.27581787109375, "logps/rejected": -249.3658447265625, "loss": 0.3682, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.431980609893799, "rewards/margins": 2.1460816860198975, "rewards/rejected": 1.2858985662460327, "step": 43040 }, { "epoch": 1.9987000324991875, "grad_norm": 174.64822387695312, "learning_rate": 1.6705356175619417e-07, "logits/chosen": -18.49502944946289, "logits/rejected": -18.216371536254883, "logps/chosen": -326.7998046875, "logps/rejected": -278.6296691894531, "loss": 1.2139, "rewards/accuracies": 0.5, "rewards/chosen": 2.5384140014648438, "rewards/margins": 0.41517090797424316, "rewards/rejected": 2.1232430934906006, "step": 43050 }, { "epoch": 1.9991643066066205, "grad_norm": 12.09790325164795, "learning_rate": 1.6697618273828868e-07, "logits/chosen": -19.462692260742188, "logits/rejected": -18.917911529541016, "logps/chosen": -389.8304748535156, "logps/rejected": -309.2650146484375, "loss": 0.3707, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.026179313659668, "rewards/margins": 1.8254146575927734, "rewards/rejected": 2.2007644176483154, "step": 43060 }, { "epoch": 1.9996285807140537, "grad_norm": 0.10581179708242416, "learning_rate": 1.6689880372038317e-07, "logits/chosen": -18.82790756225586, "logits/rejected": -18.014204025268555, "logps/chosen": -425.1649475097656, "logps/rejected": -328.4253234863281, "loss": 0.6441, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.234313011169434, "rewards/margins": 2.0573599338531494, "rewards/rejected": 2.1769533157348633, "step": 43070 }, { "epoch": 2.0000928548214865, "grad_norm": 196.59950256347656, "learning_rate": 1.6682142470247768e-07, "logits/chosen": -18.15105628967285, "logits/rejected": -17.588058471679688, "logps/chosen": -372.8609924316406, "logps/rejected": -268.00592041015625, 
"loss": 0.9866, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.6962924003601074, "rewards/margins": 0.6773148775100708, "rewards/rejected": 2.018977642059326, "step": 43080 }, { "epoch": 2.0005571289289197, "grad_norm": 5.629337787628174, "learning_rate": 1.6674404568457217e-07, "logits/chosen": -17.97984504699707, "logits/rejected": -17.917356491088867, "logps/chosen": -280.91522216796875, "logps/rejected": -263.6933288574219, "loss": 1.0387, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.7133979797363281, "rewards/margins": -0.03548111394047737, "rewards/rejected": 1.748879075050354, "step": 43090 }, { "epoch": 2.0010214030363525, "grad_norm": 153.08119201660156, "learning_rate": 1.6666666666666665e-07, "logits/chosen": -19.49029541015625, "logits/rejected": -18.942785263061523, "logps/chosen": -362.5904235839844, "logps/rejected": -314.66094970703125, "loss": 0.7144, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.2144951820373535, "rewards/margins": 0.5452371835708618, "rewards/rejected": 3.6692581176757812, "step": 43100 }, { "epoch": 2.0014856771437857, "grad_norm": 60.19292068481445, "learning_rate": 1.6658928764876116e-07, "logits/chosen": -18.70186996459961, "logits/rejected": -18.678224563598633, "logps/chosen": -278.6621398925781, "logps/rejected": -292.4549865722656, "loss": 0.8975, "rewards/accuracies": 0.5, "rewards/chosen": 2.4095401763916016, "rewards/margins": 0.49304962158203125, "rewards/rejected": 1.9164905548095703, "step": 43110 }, { "epoch": 2.001949951251219, "grad_norm": 126.28675842285156, "learning_rate": 1.6651190863085565e-07, "logits/chosen": -19.060653686523438, "logits/rejected": -18.283344268798828, "logps/chosen": -469.97021484375, "logps/rejected": -440.99664306640625, "loss": 0.6171, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.123930931091309, "rewards/margins": 1.2447168827056885, "rewards/rejected": 2.879214286804199, "step": 43120 }, { "epoch": 2.0024142253586517, 
"grad_norm": 0.7834712862968445, "learning_rate": 1.6643452961295016e-07, "logits/chosen": -19.819007873535156, "logits/rejected": -18.486324310302734, "logps/chosen": -457.813720703125, "logps/rejected": -302.3688049316406, "loss": 0.3836, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.918257236480713, "rewards/margins": 2.402801036834717, "rewards/rejected": 2.515456438064575, "step": 43130 }, { "epoch": 2.002878499466085, "grad_norm": 102.16133880615234, "learning_rate": 1.6635715059504462e-07, "logits/chosen": -19.034509658813477, "logits/rejected": -17.93874168395996, "logps/chosen": -392.40228271484375, "logps/rejected": -336.386474609375, "loss": 0.6889, "rewards/accuracies": 0.5, "rewards/chosen": 4.531684398651123, "rewards/margins": 1.7759332656860352, "rewards/rejected": 2.755751609802246, "step": 43140 }, { "epoch": 2.0033427735735176, "grad_norm": 299.1759033203125, "learning_rate": 1.6627977157713913e-07, "logits/chosen": -18.368276596069336, "logits/rejected": -18.33245849609375, "logps/chosen": -394.71148681640625, "logps/rejected": -410.65057373046875, "loss": 0.8554, "rewards/accuracies": 0.5, "rewards/chosen": 4.065228462219238, "rewards/margins": 1.2061169147491455, "rewards/rejected": 2.859111785888672, "step": 43150 }, { "epoch": 2.003807047680951, "grad_norm": 16.383806228637695, "learning_rate": 1.6620239255923364e-07, "logits/chosen": -19.625120162963867, "logits/rejected": -19.164653778076172, "logps/chosen": -383.2410888671875, "logps/rejected": -305.3226013183594, "loss": 0.8505, "rewards/accuracies": 0.5, "rewards/chosen": 3.362818479537964, "rewards/margins": 1.1389594078063965, "rewards/rejected": 2.2238590717315674, "step": 43160 }, { "epoch": 2.004271321788384, "grad_norm": 23.881324768066406, "learning_rate": 1.6612501354132812e-07, "logits/chosen": -19.31256675720215, "logits/rejected": -17.87353515625, "logps/chosen": -339.88409423828125, "logps/rejected": -160.4473419189453, "loss": 0.4212, "rewards/accuracies": 
0.800000011920929, "rewards/chosen": 3.283626079559326, "rewards/margins": 2.1901144981384277, "rewards/rejected": 1.0935122966766357, "step": 43170 }, { "epoch": 2.004735595895817, "grad_norm": 1.1791579723358154, "learning_rate": 1.6604763452342263e-07, "logits/chosen": -19.561779022216797, "logits/rejected": -18.41356086730957, "logps/chosen": -434.28961181640625, "logps/rejected": -312.0922546386719, "loss": 0.1705, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.583281993865967, "rewards/margins": 2.8520777225494385, "rewards/rejected": 1.7312047481536865, "step": 43180 }, { "epoch": 2.00519987000325, "grad_norm": 24.892610549926758, "learning_rate": 1.659702555055171e-07, "logits/chosen": -18.712100982666016, "logits/rejected": -19.038166046142578, "logps/chosen": -345.15374755859375, "logps/rejected": -370.1224670410156, "loss": 0.9084, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.1887831687927246, "rewards/margins": 0.5295813679695129, "rewards/rejected": 2.6592016220092773, "step": 43190 }, { "epoch": 2.005664144110683, "grad_norm": 52.94512176513672, "learning_rate": 1.658928764876116e-07, "logits/chosen": -19.494197845458984, "logits/rejected": -17.954666137695312, "logps/chosen": -461.47271728515625, "logps/rejected": -365.1761169433594, "loss": 0.3071, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.624210357666016, "rewards/margins": 1.867997407913208, "rewards/rejected": 2.7562131881713867, "step": 43200 }, { "epoch": 2.006128418218116, "grad_norm": 0.6979642510414124, "learning_rate": 1.6581549746970612e-07, "logits/chosen": -19.2427921295166, "logits/rejected": -18.720773696899414, "logps/chosen": -350.0297546386719, "logps/rejected": -300.64251708984375, "loss": 0.4933, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.650286912918091, "rewards/margins": 1.5257889032363892, "rewards/rejected": 2.124497890472412, "step": 43210 }, { "epoch": 2.006592692325549, "grad_norm": 89.31816101074219, 
"learning_rate": 1.657381184518006e-07, "logits/chosen": -19.27754783630371, "logits/rejected": -18.093053817749023, "logps/chosen": -333.8737487792969, "logps/rejected": -244.72610473632812, "loss": 0.2803, "rewards/accuracies": 1.0, "rewards/chosen": 2.9887852668762207, "rewards/margins": 1.8675464391708374, "rewards/rejected": 1.1212389469146729, "step": 43220 }, { "epoch": 2.007056966432982, "grad_norm": 94.44107818603516, "learning_rate": 1.656607394338951e-07, "logits/chosen": -19.69171142578125, "logits/rejected": -18.22351837158203, "logps/chosen": -372.19921875, "logps/rejected": -269.7379455566406, "loss": 0.2568, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.9306366443634033, "rewards/margins": 2.553914785385132, "rewards/rejected": 1.3767216205596924, "step": 43230 }, { "epoch": 2.0075212405404153, "grad_norm": 184.98452758789062, "learning_rate": 1.6558336041598957e-07, "logits/chosen": -18.220556259155273, "logits/rejected": -17.819141387939453, "logps/chosen": -375.0429382324219, "logps/rejected": -309.07208251953125, "loss": 0.6693, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.3158023357391357, "rewards/margins": 1.2654963731765747, "rewards/rejected": 2.0503058433532715, "step": 43240 }, { "epoch": 2.007985514647848, "grad_norm": 125.20943450927734, "learning_rate": 1.6550598139808408e-07, "logits/chosen": -19.109270095825195, "logits/rejected": -18.470256805419922, "logps/chosen": -471.4767150878906, "logps/rejected": -359.2432861328125, "loss": 0.8989, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.3600172996521, "rewards/margins": 1.0874782800674438, "rewards/rejected": 3.2725396156311035, "step": 43250 }, { "epoch": 2.0084497887552812, "grad_norm": 1.8835896253585815, "learning_rate": 1.654286023801786e-07, "logits/chosen": -19.12906265258789, "logits/rejected": -18.1219539642334, "logps/chosen": -370.87640380859375, "logps/rejected": -246.88302612304688, "loss": 0.7036, "rewards/accuracies": 
0.699999988079071, "rewards/chosen": 3.076744794845581, "rewards/margins": 1.4645298719406128, "rewards/rejected": 1.6122150421142578, "step": 43260 }, { "epoch": 2.008914062862714, "grad_norm": 174.23760986328125, "learning_rate": 1.6535122336227308e-07, "logits/chosen": -18.920347213745117, "logits/rejected": -19.143802642822266, "logps/chosen": -353.2509460449219, "logps/rejected": -321.3330993652344, "loss": 1.3477, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.574202060699463, "rewards/margins": -0.34162455797195435, "rewards/rejected": 2.9158267974853516, "step": 43270 }, { "epoch": 2.0093783369701472, "grad_norm": 98.78032684326172, "learning_rate": 1.652738443443676e-07, "logits/chosen": -19.10628890991211, "logits/rejected": -18.183948516845703, "logps/chosen": -274.35107421875, "logps/rejected": -223.37875366210938, "loss": 0.506, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.8454498052597046, "rewards/margins": 1.0866425037384033, "rewards/rejected": 0.7588075399398804, "step": 43280 }, { "epoch": 2.00984261107758, "grad_norm": 54.362449645996094, "learning_rate": 1.6519646532646205e-07, "logits/chosen": -19.129899978637695, "logits/rejected": -18.530967712402344, "logps/chosen": -393.87152099609375, "logps/rejected": -313.4814147949219, "loss": 0.4626, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.464874267578125, "rewards/margins": 1.6845369338989258, "rewards/rejected": 1.7803375720977783, "step": 43290 }, { "epoch": 2.010306885185013, "grad_norm": 4.285497665405273, "learning_rate": 1.6511908630855656e-07, "logits/chosen": -19.479536056518555, "logits/rejected": -18.243515014648438, "logps/chosen": -449.741455078125, "logps/rejected": -330.35369873046875, "loss": 0.1172, "rewards/accuracies": 1.0, "rewards/chosen": 5.641381740570068, "rewards/margins": 3.012434482574463, "rewards/rejected": 2.6289474964141846, "step": 43300 }, { "epoch": 2.0107711592924464, "grad_norm": 52.466312408447266, 
"learning_rate": 1.6504170729065107e-07, "logits/chosen": -18.625591278076172, "logits/rejected": -18.897043228149414, "logps/chosen": -439.64630126953125, "logps/rejected": -444.41778564453125, "loss": 1.9872, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2454707622528076, "rewards/margins": -0.5677399635314941, "rewards/rejected": 3.8132107257843018, "step": 43310 }, { "epoch": 2.011235433399879, "grad_norm": 52.19387435913086, "learning_rate": 1.6496432827274555e-07, "logits/chosen": -19.54080581665039, "logits/rejected": -18.73016929626465, "logps/chosen": -423.10467529296875, "logps/rejected": -297.642333984375, "loss": 0.4717, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7652077674865723, "rewards/margins": 1.7052485942840576, "rewards/rejected": 2.0599589347839355, "step": 43320 }, { "epoch": 2.0116997075073124, "grad_norm": 37.77513122558594, "learning_rate": 1.6488694925484007e-07, "logits/chosen": -18.976369857788086, "logits/rejected": -18.0948486328125, "logps/chosen": -492.64642333984375, "logps/rejected": -347.78472900390625, "loss": 0.3602, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.262795925140381, "rewards/margins": 1.9565372467041016, "rewards/rejected": 2.3062586784362793, "step": 43330 }, { "epoch": 2.012163981614745, "grad_norm": 129.7635040283203, "learning_rate": 1.6480957023693452e-07, "logits/chosen": -18.843578338623047, "logits/rejected": -18.54237174987793, "logps/chosen": -408.96002197265625, "logps/rejected": -307.0633544921875, "loss": 0.9662, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.434007167816162, "rewards/margins": 0.36063337326049805, "rewards/rejected": 3.073373556137085, "step": 43340 }, { "epoch": 2.0126282557221784, "grad_norm": 231.36241149902344, "learning_rate": 1.6473219121902904e-07, "logits/chosen": -19.145980834960938, "logits/rejected": -17.992612838745117, "logps/chosen": -427.03973388671875, "logps/rejected": -316.7688903808594, "loss": 0.6452, 
"rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.9245829582214355, "rewards/margins": 1.445739984512329, "rewards/rejected": 3.4788429737091064, "step": 43350 }, { "epoch": 2.0130925298296116, "grad_norm": 84.72361755371094, "learning_rate": 1.6465481220112355e-07, "logits/chosen": -19.911497116088867, "logits/rejected": -19.563182830810547, "logps/chosen": -470.85693359375, "logps/rejected": -503.48907470703125, "loss": 0.8038, "rewards/accuracies": 0.5, "rewards/chosen": 4.427835941314697, "rewards/margins": 0.1665504425764084, "rewards/rejected": 4.261284828186035, "step": 43360 }, { "epoch": 2.0135568039370444, "grad_norm": 0.12910997867584229, "learning_rate": 1.6457743318321803e-07, "logits/chosen": -18.19911003112793, "logits/rejected": -18.445659637451172, "logps/chosen": -374.4989929199219, "logps/rejected": -383.4710693359375, "loss": 1.7535, "rewards/accuracies": 0.5, "rewards/chosen": 2.7777721881866455, "rewards/margins": -0.24811454117298126, "rewards/rejected": 3.0258865356445312, "step": 43370 }, { "epoch": 2.0140210780444776, "grad_norm": 10.809680938720703, "learning_rate": 1.6450005416531252e-07, "logits/chosen": -19.540218353271484, "logits/rejected": -19.116914749145508, "logps/chosen": -549.0435791015625, "logps/rejected": -409.51727294921875, "loss": 0.3278, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.960385322570801, "rewards/margins": 1.749969482421875, "rewards/rejected": 3.210416316986084, "step": 43380 }, { "epoch": 2.0144853521519104, "grad_norm": 256.6015930175781, "learning_rate": 1.64422675147407e-07, "logits/chosen": -18.32815170288086, "logits/rejected": -18.13675308227539, "logps/chosen": -337.45916748046875, "logps/rejected": -317.2220153808594, "loss": 1.3411, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.6423275470733643, "rewards/margins": 0.7900530099868774, "rewards/rejected": 1.8522746562957764, "step": 43390 }, { "epoch": 2.0149496262593436, "grad_norm": 15.153953552246094, 
"learning_rate": 1.643452961295015e-07, "logits/chosen": -19.228832244873047, "logits/rejected": -18.247806549072266, "logps/chosen": -365.6541748046875, "logps/rejected": -276.1657409667969, "loss": 0.3726, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.929943561553955, "rewards/margins": 1.3056867122650146, "rewards/rejected": 1.6242568492889404, "step": 43400 }, { "epoch": 2.0154139003667764, "grad_norm": 127.91546630859375, "learning_rate": 1.6426791711159602e-07, "logits/chosen": -19.208545684814453, "logits/rejected": -18.360340118408203, "logps/chosen": -338.85638427734375, "logps/rejected": -315.1033935546875, "loss": 1.1695, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.12562894821167, "rewards/margins": 1.4354385137557983, "rewards/rejected": 2.690190553665161, "step": 43410 }, { "epoch": 2.0158781744742096, "grad_norm": 30.9373836517334, "learning_rate": 1.641905380936905e-07, "logits/chosen": -19.818750381469727, "logits/rejected": -18.795869827270508, "logps/chosen": -460.62579345703125, "logps/rejected": -328.4997253417969, "loss": 0.7991, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.6027610301971436, "rewards/margins": 1.179136037826538, "rewards/rejected": 2.4236252307891846, "step": 43420 }, { "epoch": 2.016342448581643, "grad_norm": 207.925048828125, "learning_rate": 1.64113159075785e-07, "logits/chosen": -18.367687225341797, "logits/rejected": -17.593856811523438, "logps/chosen": -508.607177734375, "logps/rejected": -345.14349365234375, "loss": 0.4106, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.8801937103271484, "rewards/margins": 1.5825397968292236, "rewards/rejected": 2.2976536750793457, "step": 43430 }, { "epoch": 2.0168067226890756, "grad_norm": 1.3444825410842896, "learning_rate": 1.6403578005787948e-07, "logits/chosen": -18.20176124572754, "logits/rejected": -17.731388092041016, "logps/chosen": -413.3147888183594, "logps/rejected": -375.74932861328125, "loss": 0.4456, 
"rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.7701873779296875, "rewards/margins": 1.6007217168807983, "rewards/rejected": 3.1694655418395996, "step": 43440 }, { "epoch": 2.017270996796509, "grad_norm": 200.1314239501953, "learning_rate": 1.63958401039974e-07, "logits/chosen": -19.052967071533203, "logits/rejected": -18.639366149902344, "logps/chosen": -490.77203369140625, "logps/rejected": -477.26373291015625, "loss": 1.3852, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.144983291625977, "rewards/margins": -0.34244582056999207, "rewards/rejected": 4.487429618835449, "step": 43450 }, { "epoch": 2.0177352709039416, "grad_norm": 95.68463897705078, "learning_rate": 1.638810220220685e-07, "logits/chosen": -19.145633697509766, "logits/rejected": -18.418607711791992, "logps/chosen": -459.372314453125, "logps/rejected": -378.1930236816406, "loss": 0.7157, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.3522446155548096, "rewards/margins": 0.9347822070121765, "rewards/rejected": 2.4174625873565674, "step": 43460 }, { "epoch": 2.0181995450113748, "grad_norm": 128.4488983154297, "learning_rate": 1.6380364300416299e-07, "logits/chosen": -18.955612182617188, "logits/rejected": -18.836965560913086, "logps/chosen": -332.9996643066406, "logps/rejected": -314.9754333496094, "loss": 0.9699, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.0543596744537354, "rewards/margins": -0.035017289221286774, "rewards/rejected": 3.089376926422119, "step": 43470 }, { "epoch": 2.0186638191188075, "grad_norm": 113.73319244384766, "learning_rate": 1.6372626398625747e-07, "logits/chosen": -19.993698120117188, "logits/rejected": -18.98788833618164, "logps/chosen": -409.5205383300781, "logps/rejected": -362.7447509765625, "loss": 0.7397, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.115355968475342, "rewards/margins": 0.8551725149154663, "rewards/rejected": 2.260183811187744, "step": 43480 }, { "epoch": 2.0191280932262408, 
"grad_norm": 100.70484161376953, "learning_rate": 1.6364888496835198e-07, "logits/chosen": -18.987300872802734, "logits/rejected": -18.18629264831543, "logps/chosen": -446.8900451660156, "logps/rejected": -353.4452819824219, "loss": 0.6759, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.436002016067505, "rewards/margins": 1.0187513828277588, "rewards/rejected": 2.417250633239746, "step": 43490 }, { "epoch": 2.019592367333674, "grad_norm": 68.18161010742188, "learning_rate": 1.6357150595044647e-07, "logits/chosen": -20.390901565551758, "logits/rejected": -19.152132034301758, "logps/chosen": -506.33050537109375, "logps/rejected": -399.13055419921875, "loss": 0.2496, "rewards/accuracies": 1.0, "rewards/chosen": 4.064632892608643, "rewards/margins": 1.5794216394424438, "rewards/rejected": 2.4852113723754883, "step": 43500 }, { "epoch": 2.0200566414411067, "grad_norm": 72.62743377685547, "learning_rate": 1.6349412693254098e-07, "logits/chosen": -19.990978240966797, "logits/rejected": -18.65668487548828, "logps/chosen": -411.095703125, "logps/rejected": -321.6908264160156, "loss": 0.5905, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.5721817016601562, "rewards/margins": 0.9437254071235657, "rewards/rejected": 2.6284565925598145, "step": 43510 }, { "epoch": 2.02052091554854, "grad_norm": 11.021870613098145, "learning_rate": 1.6341674791463546e-07, "logits/chosen": -19.22216796875, "logits/rejected": -18.56123924255371, "logps/chosen": -272.6209411621094, "logps/rejected": -229.1640625, "loss": 0.5878, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.719132900238037, "rewards/margins": 1.2709224224090576, "rewards/rejected": 1.4482104778289795, "step": 43520 }, { "epoch": 2.0209851896559727, "grad_norm": 43.50322723388672, "learning_rate": 1.6333936889672995e-07, "logits/chosen": -19.795278549194336, "logits/rejected": -19.237253189086914, "logps/chosen": -350.72821044921875, "logps/rejected": -278.3267822265625, "loss": 0.598, 
"rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0587143898010254, "rewards/margins": 0.8003881573677063, "rewards/rejected": 2.258326292037964, "step": 43530 }, { "epoch": 2.021449463763406, "grad_norm": 10.947724342346191, "learning_rate": 1.6326198987882446e-07, "logits/chosen": -18.664409637451172, "logits/rejected": -18.917339324951172, "logps/chosen": -387.85821533203125, "logps/rejected": -334.3606872558594, "loss": 0.43, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.514301300048828, "rewards/margins": 1.3655037879943848, "rewards/rejected": 2.1487972736358643, "step": 43540 }, { "epoch": 2.0219137378708387, "grad_norm": 17.65291976928711, "learning_rate": 1.6318461086091894e-07, "logits/chosen": -18.779767990112305, "logits/rejected": -17.488143920898438, "logps/chosen": -328.36297607421875, "logps/rejected": -210.98941040039062, "loss": 0.8265, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.051403522491455, "rewards/margins": 1.5530471801757812, "rewards/rejected": 1.498356580734253, "step": 43550 }, { "epoch": 2.022378011978272, "grad_norm": 154.88526916503906, "learning_rate": 1.6310723184301346e-07, "logits/chosen": -17.334434509277344, "logits/rejected": -17.4581298828125, "logps/chosen": -362.09967041015625, "logps/rejected": -382.16802978515625, "loss": 0.8614, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.367093086242676, "rewards/margins": 0.2766546607017517, "rewards/rejected": 2.0904388427734375, "step": 43560 }, { "epoch": 2.022842286085705, "grad_norm": 140.72264099121094, "learning_rate": 1.6302985282510794e-07, "logits/chosen": -18.510391235351562, "logits/rejected": -19.401817321777344, "logps/chosen": -341.4190368652344, "logps/rejected": -417.8285217285156, "loss": 1.2109, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.1986615657806396, "rewards/margins": -0.2602941691875458, "rewards/rejected": 2.458955764770508, "step": 43570 }, { "epoch": 2.023306560193138, 
"grad_norm": 3.5174925327301025, "learning_rate": 1.6295247380720242e-07, "logits/chosen": -18.597198486328125, "logits/rejected": -18.025732040405273, "logps/chosen": -389.1134338378906, "logps/rejected": -301.93731689453125, "loss": 0.7547, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.8744053840637207, "rewards/margins": 1.6647151708602905, "rewards/rejected": 2.2096898555755615, "step": 43580 }, { "epoch": 2.023770834300571, "grad_norm": 82.90972137451172, "learning_rate": 1.6287509478929694e-07, "logits/chosen": -19.56165885925293, "logits/rejected": -18.857311248779297, "logps/chosen": -395.3017883300781, "logps/rejected": -372.6545715332031, "loss": 1.0222, "rewards/accuracies": 0.5, "rewards/chosen": 3.6854968070983887, "rewards/margins": 0.5504552125930786, "rewards/rejected": 3.1350417137145996, "step": 43590 }, { "epoch": 2.024235108408004, "grad_norm": 37.97420883178711, "learning_rate": 1.6279771577139142e-07, "logits/chosen": -18.97224235534668, "logits/rejected": -18.871570587158203, "logps/chosen": -448.43768310546875, "logps/rejected": -385.4935607910156, "loss": 0.4871, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.003065586090088, "rewards/margins": 0.954226016998291, "rewards/rejected": 3.048839569091797, "step": 43600 }, { "epoch": 2.024699382515437, "grad_norm": 79.61986541748047, "learning_rate": 1.6272033675348593e-07, "logits/chosen": -19.17380142211914, "logits/rejected": -18.13258171081543, "logps/chosen": -275.38128662109375, "logps/rejected": -197.1827392578125, "loss": 0.5026, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.7258186340332031, "rewards/margins": 0.9393804669380188, "rewards/rejected": 0.7864383459091187, "step": 43610 }, { "epoch": 2.0251636566228703, "grad_norm": 226.12339782714844, "learning_rate": 1.626429577355804e-07, "logits/chosen": -18.864641189575195, "logits/rejected": -18.579387664794922, "logps/chosen": -459.118408203125, "logps/rejected": -467.79034423828125, 
"loss": 1.1193, "rewards/accuracies": 0.5, "rewards/chosen": 2.9328808784484863, "rewards/margins": 0.02364959754049778, "rewards/rejected": 2.909231185913086, "step": 43620 }, { "epoch": 2.025627930730303, "grad_norm": 77.38624572753906, "learning_rate": 1.625655787176749e-07, "logits/chosen": -19.771636962890625, "logits/rejected": -17.93136215209961, "logps/chosen": -433.06536865234375, "logps/rejected": -310.97882080078125, "loss": 0.4279, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.640956878662109, "rewards/margins": 2.311305522918701, "rewards/rejected": 2.329651117324829, "step": 43630 }, { "epoch": 2.0260922048377363, "grad_norm": 38.647396087646484, "learning_rate": 1.624881996997694e-07, "logits/chosen": -19.14293098449707, "logits/rejected": -17.962221145629883, "logps/chosen": -431.10247802734375, "logps/rejected": -386.97479248046875, "loss": 0.509, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.776058673858643, "rewards/margins": 1.5913214683532715, "rewards/rejected": 3.184737205505371, "step": 43640 }, { "epoch": 2.026556478945169, "grad_norm": 2.1781747341156006, "learning_rate": 1.624108206818639e-07, "logits/chosen": -18.545259475708008, "logits/rejected": -17.958526611328125, "logps/chosen": -367.34246826171875, "logps/rejected": -222.89163208007812, "loss": 0.4255, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7273879051208496, "rewards/margins": 2.257629632949829, "rewards/rejected": 1.46975839138031, "step": 43650 }, { "epoch": 2.0270207530526023, "grad_norm": 37.88496398925781, "learning_rate": 1.623334416639584e-07, "logits/chosen": -19.5010929107666, "logits/rejected": -18.65804100036621, "logps/chosen": -394.9974365234375, "logps/rejected": -351.6631164550781, "loss": 0.442, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.9384078979492188, "rewards/margins": 1.6727358102798462, "rewards/rejected": 2.265672206878662, "step": 43660 }, { "epoch": 2.027485027160035, "grad_norm": 
133.22152709960938, "learning_rate": 1.6225606264605287e-07, "logits/chosen": -18.983417510986328, "logits/rejected": -18.09908676147461, "logps/chosen": -338.9640808105469, "logps/rejected": -279.2247314453125, "loss": 0.6235, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.1999611854553223, "rewards/margins": 1.2225511074066162, "rewards/rejected": 1.9774103164672852, "step": 43670 }, { "epoch": 2.0279493012674683, "grad_norm": 60.66046905517578, "learning_rate": 1.6217868362814738e-07, "logits/chosen": -18.06865119934082, "logits/rejected": -18.678081512451172, "logps/chosen": -269.568359375, "logps/rejected": -322.0293273925781, "loss": 1.9946, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.1902217864990234, "rewards/margins": -1.109871506690979, "rewards/rejected": 2.300093650817871, "step": 43680 }, { "epoch": 2.0284135753749015, "grad_norm": 127.6933364868164, "learning_rate": 1.621013046102419e-07, "logits/chosen": -18.648521423339844, "logits/rejected": -18.932979583740234, "logps/chosen": -312.5652770996094, "logps/rejected": -362.4858093261719, "loss": 1.51, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 3.3946731090545654, "rewards/margins": -0.5470027923583984, "rewards/rejected": 3.941675901412964, "step": 43690 }, { "epoch": 2.0288778494823343, "grad_norm": 1.2733112573623657, "learning_rate": 1.6202392559233638e-07, "logits/chosen": -18.570919036865234, "logits/rejected": -17.798246383666992, "logps/chosen": -381.9183044433594, "logps/rejected": -292.0871887207031, "loss": 0.5013, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.96755313873291, "rewards/margins": 1.5141521692276, "rewards/rejected": 1.45340096950531, "step": 43700 }, { "epoch": 2.0293421235897675, "grad_norm": 9.642762184143066, "learning_rate": 1.6194654657443089e-07, "logits/chosen": -18.20855712890625, "logits/rejected": -17.238845825195312, "logps/chosen": -365.27996826171875, "logps/rejected": -312.60650634765625, "loss": 
1.0888, "rewards/accuracies": 0.5, "rewards/chosen": 2.6802725791931152, "rewards/margins": 0.28227177262306213, "rewards/rejected": 2.398000955581665, "step": 43710 }, { "epoch": 2.0298063976972003, "grad_norm": 26.874799728393555, "learning_rate": 1.6186916755652534e-07, "logits/chosen": -18.545015335083008, "logits/rejected": -18.554492950439453, "logps/chosen": -271.12078857421875, "logps/rejected": -303.906982421875, "loss": 0.9195, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.512308359146118, "rewards/margins": 0.5862661004066467, "rewards/rejected": 1.9260423183441162, "step": 43720 }, { "epoch": 2.0302706718046335, "grad_norm": 17.98691749572754, "learning_rate": 1.6179178853861986e-07, "logits/chosen": -18.902097702026367, "logits/rejected": -18.146846771240234, "logps/chosen": -348.888671875, "logps/rejected": -252.23403930664062, "loss": 0.5166, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2231597900390625, "rewards/margins": 1.7168986797332764, "rewards/rejected": 1.5062614679336548, "step": 43730 }, { "epoch": 2.0307349459120663, "grad_norm": 2.72689151763916, "learning_rate": 1.6171440952071437e-07, "logits/chosen": -19.966550827026367, "logits/rejected": -19.004230499267578, "logps/chosen": -418.5528259277344, "logps/rejected": -315.55755615234375, "loss": 0.7299, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.002584934234619, "rewards/margins": 0.9508438110351562, "rewards/rejected": 3.051740884780884, "step": 43740 }, { "epoch": 2.0311992200194995, "grad_norm": 39.957427978515625, "learning_rate": 1.6163703050280885e-07, "logits/chosen": -19.064905166625977, "logits/rejected": -18.153112411499023, "logps/chosen": -357.832763671875, "logps/rejected": -256.33050537109375, "loss": 0.847, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.30790114402771, "rewards/margins": 0.6984097361564636, "rewards/rejected": 1.6094913482666016, "step": 43750 }, { "epoch": 2.0316634941269327, "grad_norm": 
13.265410423278809, "learning_rate": 1.6155965148490336e-07, "logits/chosen": -19.059368133544922, "logits/rejected": -18.483728408813477, "logps/chosen": -496.65106201171875, "logps/rejected": -414.60870361328125, "loss": 0.4828, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.575697898864746, "rewards/margins": 1.8754189014434814, "rewards/rejected": 2.7002787590026855, "step": 43760 }, { "epoch": 2.0321277682343655, "grad_norm": 141.94305419921875, "learning_rate": 1.6148227246699782e-07, "logits/chosen": -19.14352798461914, "logits/rejected": -19.061721801757812, "logps/chosen": -422.20574951171875, "logps/rejected": -306.556640625, "loss": 0.704, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.84051775932312, "rewards/margins": 1.4302527904510498, "rewards/rejected": 2.410264730453491, "step": 43770 }, { "epoch": 2.0325920423417987, "grad_norm": 11.805013656616211, "learning_rate": 1.6140489344909233e-07, "logits/chosen": -19.158063888549805, "logits/rejected": -18.264911651611328, "logps/chosen": -405.3963623046875, "logps/rejected": -275.7939453125, "loss": 0.6919, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.279573440551758, "rewards/margins": 1.0840046405792236, "rewards/rejected": 2.195568799972534, "step": 43780 }, { "epoch": 2.0330563164492315, "grad_norm": 10.026327133178711, "learning_rate": 1.6132751443118684e-07, "logits/chosen": -19.177818298339844, "logits/rejected": -18.18737030029297, "logps/chosen": -453.17730712890625, "logps/rejected": -317.3675231933594, "loss": 0.2767, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.371773719787598, "rewards/margins": 1.946245551109314, "rewards/rejected": 2.4255282878875732, "step": 43790 }, { "epoch": 2.0335205905566647, "grad_norm": 141.96612548828125, "learning_rate": 1.6125013541328133e-07, "logits/chosen": -20.04958152770996, "logits/rejected": -19.321950912475586, "logps/chosen": -465.13385009765625, "logps/rejected": -339.201416015625, "loss": 
0.4175, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.581883907318115, "rewards/margins": 2.1955323219299316, "rewards/rejected": 2.386352062225342, "step": 43800 }, { "epoch": 2.033984864664098, "grad_norm": 245.5798797607422, "learning_rate": 1.6117275639537584e-07, "logits/chosen": -18.56355094909668, "logits/rejected": -19.258073806762695, "logps/chosen": -372.07122802734375, "logps/rejected": -423.2225036621094, "loss": 1.588, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 3.116115093231201, "rewards/margins": -0.7817573547363281, "rewards/rejected": 3.8978724479675293, "step": 43810 }, { "epoch": 2.0344491387715307, "grad_norm": 0.1923658698797226, "learning_rate": 1.610953773774703e-07, "logits/chosen": -18.202762603759766, "logits/rejected": -17.023448944091797, "logps/chosen": -319.2657775878906, "logps/rejected": -180.75521850585938, "loss": 0.3841, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.395606517791748, "rewards/margins": 2.41418194770813, "rewards/rejected": 0.9814242124557495, "step": 43820 }, { "epoch": 2.034913412878964, "grad_norm": 8.344409942626953, "learning_rate": 1.610179983595648e-07, "logits/chosen": -18.859933853149414, "logits/rejected": -17.185901641845703, "logps/chosen": -393.3977355957031, "logps/rejected": -223.70156860351562, "loss": 0.3735, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.4744887351989746, "rewards/margins": 2.6117613315582275, "rewards/rejected": 0.8627277612686157, "step": 43830 }, { "epoch": 2.0353776869863967, "grad_norm": 11.955114364624023, "learning_rate": 1.6094061934165932e-07, "logits/chosen": -18.686609268188477, "logits/rejected": -18.03091812133789, "logps/chosen": -300.4002990722656, "logps/rejected": -233.27029418945312, "loss": 0.4311, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.255800247192383, "rewards/margins": 1.4558886289596558, "rewards/rejected": 0.7999114394187927, "step": 43840 }, { "epoch": 2.03584196109383, 
"grad_norm": 21.719459533691406, "learning_rate": 1.608632403237538e-07, "logits/chosen": -19.567066192626953, "logits/rejected": -18.70491600036621, "logps/chosen": -440.683837890625, "logps/rejected": -321.9432678222656, "loss": 0.5239, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.137805461883545, "rewards/margins": 1.6076061725616455, "rewards/rejected": 2.5301995277404785, "step": 43850 }, { "epoch": 2.0363062352012626, "grad_norm": 111.7563705444336, "learning_rate": 1.607858613058483e-07, "logits/chosen": -18.365694046020508, "logits/rejected": -17.813051223754883, "logps/chosen": -405.37469482421875, "logps/rejected": -412.855712890625, "loss": 0.84, "rewards/accuracies": 0.5, "rewards/chosen": 3.9356472492218018, "rewards/margins": 0.43713611364364624, "rewards/rejected": 3.498511552810669, "step": 43860 }, { "epoch": 2.036770509308696, "grad_norm": 2.5439224243164062, "learning_rate": 1.6070848228794278e-07, "logits/chosen": -19.887149810791016, "logits/rejected": -18.576770782470703, "logps/chosen": -511.3113708496094, "logps/rejected": -329.764404296875, "loss": 0.4619, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.040506839752197, "rewards/margins": 1.2240098714828491, "rewards/rejected": 2.8164963722229004, "step": 43870 }, { "epoch": 2.037234783416129, "grad_norm": 6.079258441925049, "learning_rate": 1.606311032700373e-07, "logits/chosen": -19.216644287109375, "logits/rejected": -17.840925216674805, "logps/chosen": -338.12506103515625, "logps/rejected": -257.6817932128906, "loss": 0.2437, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.0550386905670166, "rewards/margins": 2.1686978340148926, "rewards/rejected": 0.8863407969474792, "step": 43880 }, { "epoch": 2.037699057523562, "grad_norm": 24.911121368408203, "learning_rate": 1.605537242521318e-07, "logits/chosen": -19.397817611694336, "logits/rejected": -18.438823699951172, "logps/chosen": -393.8869323730469, "logps/rejected": -307.88848876953125, "loss": 
0.4007, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.3288826942443848, "rewards/margins": 1.441016674041748, "rewards/rejected": 0.8878658413887024, "step": 43890 }, { "epoch": 2.038163331630995, "grad_norm": 111.09973907470703, "learning_rate": 1.6047634523422628e-07, "logits/chosen": -18.627452850341797, "logits/rejected": -17.7810115814209, "logps/chosen": -415.2084045410156, "logps/rejected": -336.29986572265625, "loss": 0.3238, "rewards/accuracies": 1.0, "rewards/chosen": 3.663546085357666, "rewards/margins": 1.4879400730133057, "rewards/rejected": 2.1756060123443604, "step": 43900 }, { "epoch": 2.038627605738428, "grad_norm": 3.3122622966766357, "learning_rate": 1.6039896621632077e-07, "logits/chosen": -18.89092445373535, "logits/rejected": -18.692304611206055, "logps/chosen": -291.49139404296875, "logps/rejected": -309.41546630859375, "loss": 1.2988, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.579637050628662, "rewards/margins": 0.37119120359420776, "rewards/rejected": 2.2084460258483887, "step": 43910 }, { "epoch": 2.039091879845861, "grad_norm": 71.29049682617188, "learning_rate": 1.6032158719841525e-07, "logits/chosen": -18.39908218383789, "logits/rejected": -17.68307113647461, "logps/chosen": -405.9462585449219, "logps/rejected": -321.8298645019531, "loss": 0.7442, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.5806193351745605, "rewards/margins": 0.39546331763267517, "rewards/rejected": 2.1851563453674316, "step": 43920 }, { "epoch": 2.039556153953294, "grad_norm": 78.1388168334961, "learning_rate": 1.6024420818050976e-07, "logits/chosen": -19.314279556274414, "logits/rejected": -18.00027847290039, "logps/chosen": -500.8694763183594, "logps/rejected": -284.3395080566406, "loss": 0.2167, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.058642864227295, "rewards/margins": 3.53456449508667, "rewards/rejected": 1.5240782499313354, "step": 43930 }, { "epoch": 2.040020428060727, "grad_norm": 
31.562170028686523, "learning_rate": 1.6016682916260428e-07, "logits/chosen": -19.240480422973633, "logits/rejected": -18.040454864501953, "logps/chosen": -331.22882080078125, "logps/rejected": -229.0900115966797, "loss": 0.2201, "rewards/accuracies": 1.0, "rewards/chosen": 3.4194672107696533, "rewards/margins": 2.3092525005340576, "rewards/rejected": 1.1102144718170166, "step": 43940 }, { "epoch": 2.0404847021681602, "grad_norm": 4.48773717880249, "learning_rate": 1.6008945014469876e-07, "logits/chosen": -20.367015838623047, "logits/rejected": -19.634126663208008, "logps/chosen": -387.16302490234375, "logps/rejected": -328.8512878417969, "loss": 0.6997, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.853621244430542, "rewards/margins": 0.8190258741378784, "rewards/rejected": 3.034595251083374, "step": 43950 }, { "epoch": 2.040948976275593, "grad_norm": 145.2864990234375, "learning_rate": 1.6001207112679325e-07, "logits/chosen": -20.236507415771484, "logits/rejected": -18.462791442871094, "logps/chosen": -525.7477416992188, "logps/rejected": -335.683837890625, "loss": 0.7344, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.579693794250488, "rewards/margins": 1.5949147939682007, "rewards/rejected": 2.984778881072998, "step": 43960 }, { "epoch": 2.0414132503830262, "grad_norm": 6.615602016448975, "learning_rate": 1.5993469210888773e-07, "logits/chosen": -19.03082847595215, "logits/rejected": -17.718093872070312, "logps/chosen": -440.9115295410156, "logps/rejected": -277.76385498046875, "loss": 0.1688, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.54274320602417, "rewards/margins": 2.4061923027038574, "rewards/rejected": 2.1365513801574707, "step": 43970 }, { "epoch": 2.041877524490459, "grad_norm": 30.71063995361328, "learning_rate": 1.5985731309098224e-07, "logits/chosen": -19.10533905029297, "logits/rejected": -19.01184844970703, "logps/chosen": -287.79376220703125, "logps/rejected": -343.8359680175781, "loss": 1.0032, 
"rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.157589912414551, "rewards/margins": -0.1942053586244583, "rewards/rejected": 3.351795196533203, "step": 43980 }, { "epoch": 2.042341798597892, "grad_norm": 128.26942443847656, "learning_rate": 1.5977993407307675e-07, "logits/chosen": -18.73978042602539, "logits/rejected": -19.015321731567383, "logps/chosen": -295.1971130371094, "logps/rejected": -308.900146484375, "loss": 0.6936, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.6218032836914062, "rewards/margins": 0.4249234199523926, "rewards/rejected": 2.1968798637390137, "step": 43990 }, { "epoch": 2.0428060727053254, "grad_norm": 45.81336212158203, "learning_rate": 1.5970255505517124e-07, "logits/chosen": -19.26645278930664, "logits/rejected": -18.466594696044922, "logps/chosen": -432.95550537109375, "logps/rejected": -371.661376953125, "loss": 0.4375, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.242400169372559, "rewards/margins": 1.8031498193740845, "rewards/rejected": 2.4392504692077637, "step": 44000 }, { "epoch": 2.043270346812758, "grad_norm": 48.51417541503906, "learning_rate": 1.5962517603726572e-07, "logits/chosen": -19.358095169067383, "logits/rejected": -19.025449752807617, "logps/chosen": -343.2065734863281, "logps/rejected": -341.8878479003906, "loss": 0.9732, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.2041919231414795, "rewards/margins": 1.0093789100646973, "rewards/rejected": 2.1948132514953613, "step": 44010 }, { "epoch": 2.0437346209201914, "grad_norm": 0.920308530330658, "learning_rate": 1.595477970193602e-07, "logits/chosen": -19.321151733398438, "logits/rejected": -17.630245208740234, "logps/chosen": -361.30548095703125, "logps/rejected": -194.04214477539062, "loss": 0.3232, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.3485913276672363, "rewards/margins": 2.8846030235290527, "rewards/rejected": 0.46398788690567017, "step": 44020 }, { "epoch": 2.044198895027624, 
"grad_norm": 0.8200368881225586, "learning_rate": 1.5947041800145472e-07, "logits/chosen": -19.309860229492188, "logits/rejected": -18.110265731811523, "logps/chosen": -499.68927001953125, "logps/rejected": -398.56903076171875, "loss": 0.5287, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.556606769561768, "rewards/margins": 1.8816314935684204, "rewards/rejected": 2.6749753952026367, "step": 44030 }, { "epoch": 2.0446631691350574, "grad_norm": 48.76943588256836, "learning_rate": 1.5939303898354923e-07, "logits/chosen": -19.444366455078125, "logits/rejected": -18.149459838867188, "logps/chosen": -357.961181640625, "logps/rejected": -326.1849060058594, "loss": 0.434, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.2354302406311035, "rewards/margins": 0.8374460339546204, "rewards/rejected": 2.397984504699707, "step": 44040 }, { "epoch": 2.04512744324249, "grad_norm": 25.732938766479492, "learning_rate": 1.593156599656437e-07, "logits/chosen": -19.887292861938477, "logits/rejected": -19.766918182373047, "logps/chosen": -374.4064025878906, "logps/rejected": -390.79217529296875, "loss": 0.9658, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.7511885166168213, "rewards/margins": 0.07631361484527588, "rewards/rejected": 2.674874782562256, "step": 44050 }, { "epoch": 2.0455917173499234, "grad_norm": 150.5370330810547, "learning_rate": 1.592382809477382e-07, "logits/chosen": -18.816097259521484, "logits/rejected": -18.460187911987305, "logps/chosen": -354.4054870605469, "logps/rejected": -373.19952392578125, "loss": 0.9955, "rewards/accuracies": 0.5, "rewards/chosen": 3.128528118133545, "rewards/margins": 0.4721924662590027, "rewards/rejected": 2.6563353538513184, "step": 44060 }, { "epoch": 2.0460559914573566, "grad_norm": 24.90580177307129, "learning_rate": 1.5916090192983268e-07, "logits/chosen": -19.199357986450195, "logits/rejected": -18.723215103149414, "logps/chosen": -381.5589294433594, "logps/rejected": -335.72576904296875, 
"loss": 0.4612, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.9871089458465576, "rewards/margins": 1.7876001596450806, "rewards/rejected": 2.1995089054107666, "step": 44070 }, { "epoch": 2.0465202655647894, "grad_norm": 19.450353622436523, "learning_rate": 1.590835229119272e-07, "logits/chosen": -19.149276733398438, "logits/rejected": -18.404420852661133, "logps/chosen": -352.33721923828125, "logps/rejected": -278.6883850097656, "loss": 0.8485, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.339384078979492, "rewards/margins": 1.403841257095337, "rewards/rejected": 1.9355424642562866, "step": 44080 }, { "epoch": 2.0469845396722226, "grad_norm": 298.6210632324219, "learning_rate": 1.590061438940217e-07, "logits/chosen": -18.758028030395508, "logits/rejected": -18.24876594543457, "logps/chosen": -541.47900390625, "logps/rejected": -484.01837158203125, "loss": 1.1917, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.486216068267822, "rewards/margins": 0.9743067622184753, "rewards/rejected": 3.5119094848632812, "step": 44090 }, { "epoch": 2.0474488137796554, "grad_norm": 6.528420925140381, "learning_rate": 1.5892876487611617e-07, "logits/chosen": -18.579631805419922, "logits/rejected": -18.514265060424805, "logps/chosen": -476.4835510253906, "logps/rejected": -406.49017333984375, "loss": 0.5206, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.297604560852051, "rewards/margins": 1.865557312965393, "rewards/rejected": 2.4320473670959473, "step": 44100 }, { "epoch": 2.0479130878870886, "grad_norm": 81.32535552978516, "learning_rate": 1.5885138585821068e-07, "logits/chosen": -19.564712524414062, "logits/rejected": -19.556758880615234, "logps/chosen": -457.8355407714844, "logps/rejected": -452.9439392089844, "loss": 0.8587, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2237274646759033, "rewards/margins": 0.020428037270903587, "rewards/rejected": 3.2032992839813232, "step": 44110 }, { "epoch": 
2.0483773619945214, "grad_norm": 29.79289436340332, "learning_rate": 1.587740068403052e-07, "logits/chosen": -19.28916358947754, "logits/rejected": -17.865707397460938, "logps/chosen": -422.12005615234375, "logps/rejected": -280.4598083496094, "loss": 0.6121, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.544414520263672, "rewards/margins": 1.6438162326812744, "rewards/rejected": 1.9005985260009766, "step": 44120 }, { "epoch": 2.0488416361019546, "grad_norm": 40.82925796508789, "learning_rate": 1.5869662782239967e-07, "logits/chosen": -19.178258895874023, "logits/rejected": -18.428855895996094, "logps/chosen": -458.1837463378906, "logps/rejected": -331.52935791015625, "loss": 0.5013, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.528571128845215, "rewards/margins": 2.3579394817352295, "rewards/rejected": 2.170631170272827, "step": 44130 }, { "epoch": 2.049305910209388, "grad_norm": 10.986750602722168, "learning_rate": 1.5861924880449418e-07, "logits/chosen": -19.56899642944336, "logits/rejected": -19.241008758544922, "logps/chosen": -373.0215148925781, "logps/rejected": -368.80072021484375, "loss": 1.2274, "rewards/accuracies": 0.5, "rewards/chosen": 2.2810356616973877, "rewards/margins": -0.4385271668434143, "rewards/rejected": 2.719562292098999, "step": 44140 }, { "epoch": 2.0497701843168206, "grad_norm": 40.409244537353516, "learning_rate": 1.5854186978658864e-07, "logits/chosen": -19.062564849853516, "logits/rejected": -18.682857513427734, "logps/chosen": -377.9269104003906, "logps/rejected": -313.3818664550781, "loss": 0.7141, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.2056031227111816, "rewards/margins": 0.7371256351470947, "rewards/rejected": 2.468477725982666, "step": 44150 }, { "epoch": 2.050234458424254, "grad_norm": 1.6588857173919678, "learning_rate": 1.5846449076868315e-07, "logits/chosen": -18.927005767822266, "logits/rejected": -18.10716438293457, "logps/chosen": -279.32586669921875, "logps/rejected": 
-233.3151092529297, "loss": 0.3766, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.7860264778137207, "rewards/margins": 1.7894439697265625, "rewards/rejected": 0.9965823292732239, "step": 44160 }, { "epoch": 2.0506987325316866, "grad_norm": 0.0419570617377758, "learning_rate": 1.5838711175077766e-07, "logits/chosen": -18.735380172729492, "logits/rejected": -18.244300842285156, "logps/chosen": -457.140869140625, "logps/rejected": -369.33721923828125, "loss": 1.0671, "rewards/accuracies": 0.5, "rewards/chosen": 4.854659080505371, "rewards/margins": 0.7588618993759155, "rewards/rejected": 4.095797061920166, "step": 44170 }, { "epoch": 2.0511630066391198, "grad_norm": 16.305788040161133, "learning_rate": 1.5830973273287215e-07, "logits/chosen": -18.794776916503906, "logits/rejected": -18.736316680908203, "logps/chosen": -372.3314514160156, "logps/rejected": -409.85711669921875, "loss": 1.0297, "rewards/accuracies": 0.5, "rewards/chosen": 2.720329999923706, "rewards/margins": 0.13239583373069763, "rewards/rejected": 2.5879340171813965, "step": 44180 }, { "epoch": 2.051627280746553, "grad_norm": 65.8563232421875, "learning_rate": 1.5823235371496666e-07, "logits/chosen": -18.15009880065918, "logits/rejected": -18.154386520385742, "logps/chosen": -385.48773193359375, "logps/rejected": -343.5293884277344, "loss": 0.5839, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.198035478591919, "rewards/margins": 0.5917134284973145, "rewards/rejected": 2.6063218116760254, "step": 44190 }, { "epoch": 2.0520915548539858, "grad_norm": 9.795013427734375, "learning_rate": 1.5815497469706112e-07, "logits/chosen": -19.635520935058594, "logits/rejected": -18.20004653930664, "logps/chosen": -476.3470153808594, "logps/rejected": -318.243408203125, "loss": 0.1269, "rewards/accuracies": 1.0, "rewards/chosen": 4.670693874359131, "rewards/margins": 2.567357301712036, "rewards/rejected": 2.1033363342285156, "step": 44200 }, { "epoch": 2.052555828961419, "grad_norm": 
153.6326141357422, "learning_rate": 1.5807759567915563e-07, "logits/chosen": -19.26543617248535, "logits/rejected": -19.027393341064453, "logps/chosen": -332.979248046875, "logps/rejected": -290.26239013671875, "loss": 1.5249, "rewards/accuracies": 0.5, "rewards/chosen": 2.3405113220214844, "rewards/margins": -0.6180634498596191, "rewards/rejected": 2.9585747718811035, "step": 44210 }, { "epoch": 2.0530201030688517, "grad_norm": 67.72727966308594, "learning_rate": 1.5800021666125014e-07, "logits/chosen": -18.17770004272461, "logits/rejected": -17.333410263061523, "logps/chosen": -395.3578186035156, "logps/rejected": -281.63262939453125, "loss": 0.308, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.1824278831481934, "rewards/margins": 1.773411750793457, "rewards/rejected": 1.4090160131454468, "step": 44220 }, { "epoch": 2.053484377176285, "grad_norm": 28.735933303833008, "learning_rate": 1.5792283764334463e-07, "logits/chosen": -20.217052459716797, "logits/rejected": -18.61902618408203, "logps/chosen": -424.498779296875, "logps/rejected": -298.47637939453125, "loss": 0.3721, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7183735370635986, "rewards/margins": 1.9310696125030518, "rewards/rejected": 1.7873036861419678, "step": 44230 }, { "epoch": 2.0539486512837177, "grad_norm": 63.196624755859375, "learning_rate": 1.5784545862543914e-07, "logits/chosen": -18.774202346801758, "logits/rejected": -18.478557586669922, "logps/chosen": -401.06072998046875, "logps/rejected": -407.11956787109375, "loss": 1.2396, "rewards/accuracies": 0.5, "rewards/chosen": 3.187532901763916, "rewards/margins": 0.5057083368301392, "rewards/rejected": 2.6818246841430664, "step": 44240 }, { "epoch": 2.054412925391151, "grad_norm": 55.188472747802734, "learning_rate": 1.577680796075336e-07, "logits/chosen": -18.473966598510742, "logits/rejected": -17.877832412719727, "logps/chosen": -362.7775573730469, "logps/rejected": -313.56646728515625, "loss": 0.7065, 
"rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.248692750930786, "rewards/margins": 1.4221822023391724, "rewards/rejected": 1.8265101909637451, "step": 44250 }, { "epoch": 2.054877199498584, "grad_norm": 117.20250701904297, "learning_rate": 1.576907005896281e-07, "logits/chosen": -19.678329467773438, "logits/rejected": -19.42686653137207, "logps/chosen": -406.5075988769531, "logps/rejected": -352.2984313964844, "loss": 0.8657, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.467780113220215, "rewards/margins": 1.4322572946548462, "rewards/rejected": 3.0355234146118164, "step": 44260 }, { "epoch": 2.055341473606017, "grad_norm": 6.372093677520752, "learning_rate": 1.5761332157172262e-07, "logits/chosen": -19.08062744140625, "logits/rejected": -18.44280433654785, "logps/chosen": -329.0363464355469, "logps/rejected": -312.66400146484375, "loss": 0.5016, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5831847190856934, "rewards/margins": 0.9234328269958496, "rewards/rejected": 2.6597514152526855, "step": 44270 }, { "epoch": 2.05580574771345, "grad_norm": 33.298492431640625, "learning_rate": 1.575359425538171e-07, "logits/chosen": -18.94599723815918, "logits/rejected": -18.031963348388672, "logps/chosen": -588.2603759765625, "logps/rejected": -417.0831604003906, "loss": 0.3059, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.453891277313232, "rewards/margins": 1.6289907693862915, "rewards/rejected": 2.8249011039733887, "step": 44280 }, { "epoch": 2.056270021820883, "grad_norm": 158.2041778564453, "learning_rate": 1.574585635359116e-07, "logits/chosen": -18.990346908569336, "logits/rejected": -18.570669174194336, "logps/chosen": -352.65460205078125, "logps/rejected": -282.65570068359375, "loss": 0.713, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.700759172439575, "rewards/margins": 1.0280177593231201, "rewards/rejected": 1.672741174697876, "step": 44290 }, { "epoch": 2.056734295928316, "grad_norm": 
50.46996307373047, "learning_rate": 1.5738118451800607e-07, "logits/chosen": -19.58876609802246, "logits/rejected": -18.87890625, "logps/chosen": -347.52313232421875, "logps/rejected": -276.07281494140625, "loss": 0.5522, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.807985782623291, "rewards/margins": 1.1834065914154053, "rewards/rejected": 1.6245791912078857, "step": 44300 }, { "epoch": 2.057198570035749, "grad_norm": 94.53269958496094, "learning_rate": 1.5730380550010058e-07, "logits/chosen": -19.28606605529785, "logits/rejected": -18.7186336517334, "logps/chosen": -388.80706787109375, "logps/rejected": -311.6744689941406, "loss": 0.2942, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.094686508178711, "rewards/margins": 1.7633997201919556, "rewards/rejected": 2.331286907196045, "step": 44310 }, { "epoch": 2.057662844143182, "grad_norm": 7.058959484100342, "learning_rate": 1.572264264821951e-07, "logits/chosen": -19.174413681030273, "logits/rejected": -18.245223999023438, "logps/chosen": -329.1571044921875, "logps/rejected": -256.21917724609375, "loss": 0.6124, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2039692401885986, "rewards/margins": 1.115386962890625, "rewards/rejected": 2.0885818004608154, "step": 44320 }, { "epoch": 2.0581271182506153, "grad_norm": 57.89052963256836, "learning_rate": 1.5714904746428958e-07, "logits/chosen": -19.721508026123047, "logits/rejected": -18.865671157836914, "logps/chosen": -384.0709228515625, "logps/rejected": -340.97283935546875, "loss": 0.6727, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0722291469573975, "rewards/margins": 1.1179646253585815, "rewards/rejected": 1.9542644023895264, "step": 44330 }, { "epoch": 2.058591392358048, "grad_norm": 5.765979766845703, "learning_rate": 1.5707166844638407e-07, "logits/chosen": -18.298748016357422, "logits/rejected": -16.972387313842773, "logps/chosen": -464.2315368652344, "logps/rejected": -392.3150329589844, "loss": 
0.4652, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.773266553878784, "rewards/margins": 2.1013169288635254, "rewards/rejected": 1.6719497442245483, "step": 44340 }, { "epoch": 2.0590556664654813, "grad_norm": 86.01512145996094, "learning_rate": 1.5699428942847855e-07, "logits/chosen": -19.13800811767578, "logits/rejected": -18.33344268798828, "logps/chosen": -348.0096740722656, "logps/rejected": -278.3216857910156, "loss": 0.4744, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.636997699737549, "rewards/margins": 1.4167158603668213, "rewards/rejected": 2.2202820777893066, "step": 44350 }, { "epoch": 2.059519940572914, "grad_norm": 157.5078125, "learning_rate": 1.5691691041057306e-07, "logits/chosen": -18.81991958618164, "logits/rejected": -18.552677154541016, "logps/chosen": -334.08575439453125, "logps/rejected": -287.5174255371094, "loss": 1.4429, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.7571234703063965, "rewards/margins": 0.6093018651008606, "rewards/rejected": 2.1478216648101807, "step": 44360 }, { "epoch": 2.0599842146803473, "grad_norm": 9.59049129486084, "learning_rate": 1.5683953139266757e-07, "logits/chosen": -18.519460678100586, "logits/rejected": -17.84267234802246, "logps/chosen": -271.30780029296875, "logps/rejected": -239.92269897460938, "loss": 0.8455, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.1881988048553467, "rewards/margins": 0.957220733165741, "rewards/rejected": 1.23097825050354, "step": 44370 }, { "epoch": 2.06044848878778, "grad_norm": 85.14077758789062, "learning_rate": 1.5676215237476206e-07, "logits/chosen": -18.89286231994629, "logits/rejected": -18.074350357055664, "logps/chosen": -453.2315368652344, "logps/rejected": -327.555419921875, "loss": 0.5796, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.938816547393799, "rewards/margins": 0.868455708026886, "rewards/rejected": 3.0703606605529785, "step": 44380 }, { "epoch": 2.0609127628952133, "grad_norm": 
69.93873596191406, "learning_rate": 1.5668477335685654e-07, "logits/chosen": -18.789554595947266, "logits/rejected": -17.574169158935547, "logps/chosen": -343.353271484375, "logps/rejected": -248.10574340820312, "loss": 0.4049, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.8367419242858887, "rewards/margins": 1.6065994501113892, "rewards/rejected": 1.2301424741744995, "step": 44390 }, { "epoch": 2.0613770370026465, "grad_norm": 78.31140899658203, "learning_rate": 1.5660739433895103e-07, "logits/chosen": -19.340723037719727, "logits/rejected": -18.754045486450195, "logps/chosen": -389.92059326171875, "logps/rejected": -337.51123046875, "loss": 0.5336, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.89068341255188, "rewards/margins": 1.0802587270736694, "rewards/rejected": 2.810424566268921, "step": 44400 }, { "epoch": 2.0618413111100793, "grad_norm": 79.36211395263672, "learning_rate": 1.5653001532104554e-07, "logits/chosen": -19.470287322998047, "logits/rejected": -19.28995704650879, "logps/chosen": -484.4955139160156, "logps/rejected": -411.2801208496094, "loss": 0.8356, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.666630983352661, "rewards/margins": 1.135451316833496, "rewards/rejected": 2.531179428100586, "step": 44410 }, { "epoch": 2.0623055852175125, "grad_norm": 100.20289611816406, "learning_rate": 1.5645263630314005e-07, "logits/chosen": -19.675758361816406, "logits/rejected": -18.138776779174805, "logps/chosen": -450.307373046875, "logps/rejected": -393.76751708984375, "loss": 0.458, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.553898334503174, "rewards/margins": 1.6418384313583374, "rewards/rejected": 2.912059783935547, "step": 44420 }, { "epoch": 2.0627698593249453, "grad_norm": 59.90205383300781, "learning_rate": 1.5637525728523453e-07, "logits/chosen": -19.112478256225586, "logits/rejected": -18.60858726501465, "logps/chosen": -397.4090881347656, "logps/rejected": -387.5352478027344, "loss": 
1.0373, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1643776893615723, "rewards/margins": 0.2492973357439041, "rewards/rejected": 2.9150805473327637, "step": 44430 }, { "epoch": 2.0632341334323785, "grad_norm": 193.4189910888672, "learning_rate": 1.5629787826732902e-07, "logits/chosen": -18.920482635498047, "logits/rejected": -18.354095458984375, "logps/chosen": -405.94415283203125, "logps/rejected": -354.48211669921875, "loss": 0.9488, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.1857926845550537, "rewards/margins": 0.40878763794898987, "rewards/rejected": 2.777005195617676, "step": 44440 }, { "epoch": 2.0636984075398117, "grad_norm": 74.60523223876953, "learning_rate": 1.562204992494235e-07, "logits/chosen": -18.821918487548828, "logits/rejected": -18.002178192138672, "logps/chosen": -483.18927001953125, "logps/rejected": -439.09918212890625, "loss": 0.471, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.72836446762085, "rewards/margins": 1.449876070022583, "rewards/rejected": 3.2784886360168457, "step": 44450 }, { "epoch": 2.0641626816472445, "grad_norm": 0.6560462713241577, "learning_rate": 1.5614312023151802e-07, "logits/chosen": -18.632600784301758, "logits/rejected": -17.968666076660156, "logps/chosen": -367.23712158203125, "logps/rejected": -296.384521484375, "loss": 0.8607, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.6661858558654785, "rewards/margins": 1.2361140251159668, "rewards/rejected": 2.430072069168091, "step": 44460 }, { "epoch": 2.0646269557546777, "grad_norm": 67.61932373046875, "learning_rate": 1.5606574121361253e-07, "logits/chosen": -19.009380340576172, "logits/rejected": -18.982728958129883, "logps/chosen": -348.11474609375, "logps/rejected": -325.9267272949219, "loss": 1.0386, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.5774929523468018, "rewards/margins": 0.4865763783454895, "rewards/rejected": 3.090916395187378, "step": 44470 }, { "epoch": 2.0650912298621105, 
"grad_norm": 115.96571350097656, "learning_rate": 1.55988362195707e-07, "logits/chosen": -19.517122268676758, "logits/rejected": -18.664791107177734, "logps/chosen": -369.2749938964844, "logps/rejected": -232.596435546875, "loss": 0.3583, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.2317912578582764, "rewards/margins": 2.489309787750244, "rewards/rejected": 0.7424818873405457, "step": 44480 }, { "epoch": 2.0655555039695437, "grad_norm": 179.7848358154297, "learning_rate": 1.559109831778015e-07, "logits/chosen": -19.10613441467285, "logits/rejected": -18.625492095947266, "logps/chosen": -373.42901611328125, "logps/rejected": -386.9351806640625, "loss": 0.8864, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.8011326789855957, "rewards/margins": 0.3476064205169678, "rewards/rejected": 3.453526258468628, "step": 44490 }, { "epoch": 2.0660197780769765, "grad_norm": 0.26628023386001587, "learning_rate": 1.5583360415989598e-07, "logits/chosen": -19.58695411682129, "logits/rejected": -18.68540382385254, "logps/chosen": -354.6360778808594, "logps/rejected": -328.16973876953125, "loss": 0.3819, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.200944900512695, "rewards/margins": 2.8158605098724365, "rewards/rejected": 1.3850847482681274, "step": 44500 }, { "epoch": 2.0664840521844097, "grad_norm": 112.9385757446289, "learning_rate": 1.557562251419905e-07, "logits/chosen": -19.097036361694336, "logits/rejected": -18.48335075378418, "logps/chosen": -439.935546875, "logps/rejected": -237.85693359375, "loss": 0.2101, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.346457004547119, "rewards/margins": 3.2241909503936768, "rewards/rejected": 1.1222659349441528, "step": 44510 }, { "epoch": 2.066948326291843, "grad_norm": 52.70500183105469, "learning_rate": 1.55678846124085e-07, "logits/chosen": -19.285892486572266, "logits/rejected": -19.163789749145508, "logps/chosen": -510.0409240722656, "logps/rejected": -548.5515747070312, 
"loss": 1.1734, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.359889030456543, "rewards/margins": -0.019287467002868652, "rewards/rejected": 4.379177093505859, "step": 44520 }, { "epoch": 2.0674126003992757, "grad_norm": 195.40704345703125, "learning_rate": 1.5560146710617946e-07, "logits/chosen": -18.824474334716797, "logits/rejected": -18.032516479492188, "logps/chosen": -425.57733154296875, "logps/rejected": -334.8840026855469, "loss": 1.917, "rewards/accuracies": 0.5, "rewards/chosen": 2.1512019634246826, "rewards/margins": -0.713032066822052, "rewards/rejected": 2.864234209060669, "step": 44530 }, { "epoch": 2.067876874506709, "grad_norm": 5.247282981872559, "learning_rate": 1.5552408808827397e-07, "logits/chosen": -19.56167221069336, "logits/rejected": -18.858774185180664, "logps/chosen": -467.67205810546875, "logps/rejected": -389.0614318847656, "loss": 0.6969, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.761174440383911, "rewards/margins": 1.3832765817642212, "rewards/rejected": 2.3778977394104004, "step": 44540 }, { "epoch": 2.0683411486141416, "grad_norm": 165.46095275878906, "learning_rate": 1.5544670907036846e-07, "logits/chosen": -19.737634658813477, "logits/rejected": -18.440664291381836, "logps/chosen": -392.67779541015625, "logps/rejected": -284.57769775390625, "loss": 0.5091, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5543339252471924, "rewards/margins": 1.8704372644424438, "rewards/rejected": 1.683896780014038, "step": 44550 }, { "epoch": 2.068805422721575, "grad_norm": 193.2729034423828, "learning_rate": 1.5536933005246297e-07, "logits/chosen": -18.944124221801758, "logits/rejected": -18.858524322509766, "logps/chosen": -297.901611328125, "logps/rejected": -248.04037475585938, "loss": 1.2185, "rewards/accuracies": 0.5, "rewards/chosen": 2.756855010986328, "rewards/margins": -0.05688854306936264, "rewards/rejected": 2.8137433528900146, "step": 44560 }, { "epoch": 2.0692696968290076, "grad_norm": 
267.96942138671875, "learning_rate": 1.5529195103455748e-07, "logits/chosen": -19.412233352661133, "logits/rejected": -18.571136474609375, "logps/chosen": -515.9132690429688, "logps/rejected": -349.26898193359375, "loss": 0.4394, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.1356513500213623, "rewards/margins": 1.665045142173767, "rewards/rejected": 1.4706063270568848, "step": 44570 }, { "epoch": 2.069733970936441, "grad_norm": 12.91019344329834, "learning_rate": 1.5521457201665194e-07, "logits/chosen": -19.832006454467773, "logits/rejected": -18.36894416809082, "logps/chosen": -416.406982421875, "logps/rejected": -268.38336181640625, "loss": 0.3586, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.869227647781372, "rewards/margins": 2.151733160018921, "rewards/rejected": 1.717494249343872, "step": 44580 }, { "epoch": 2.070198245043874, "grad_norm": 34.2572021484375, "learning_rate": 1.5513719299874645e-07, "logits/chosen": -19.677776336669922, "logits/rejected": -19.24652099609375, "logps/chosen": -485.135009765625, "logps/rejected": -305.5161437988281, "loss": 0.7924, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0371365547180176, "rewards/margins": 0.9920118451118469, "rewards/rejected": 2.0451245307922363, "step": 44590 }, { "epoch": 2.070662519151307, "grad_norm": 32.76735305786133, "learning_rate": 1.5505981398084094e-07, "logits/chosen": -18.50727653503418, "logits/rejected": -18.149263381958008, "logps/chosen": -347.27349853515625, "logps/rejected": -388.0011901855469, "loss": 0.8123, "rewards/accuracies": 0.5, "rewards/chosen": 3.2799713611602783, "rewards/margins": 0.29346269369125366, "rewards/rejected": 2.986508846282959, "step": 44600 }, { "epoch": 2.07112679325874, "grad_norm": 1.3079837560653687, "learning_rate": 1.5498243496293545e-07, "logits/chosen": -19.45037078857422, "logits/rejected": -18.139217376708984, "logps/chosen": -470.4851989746094, "logps/rejected": -337.39715576171875, "loss": 0.4152, 
"rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.753957748413086, "rewards/margins": 1.4321000576019287, "rewards/rejected": 2.3218579292297363, "step": 44610 }, { "epoch": 2.071591067366173, "grad_norm": 5.101710796356201, "learning_rate": 1.5490505594502996e-07, "logits/chosen": -18.718704223632812, "logits/rejected": -17.70883560180664, "logps/chosen": -378.7703857421875, "logps/rejected": -336.6570739746094, "loss": 0.5778, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.472081422805786, "rewards/margins": 1.7540756464004517, "rewards/rejected": 1.7180061340332031, "step": 44620 }, { "epoch": 2.072055341473606, "grad_norm": 136.7376251220703, "learning_rate": 1.5482767692712442e-07, "logits/chosen": -19.452014923095703, "logits/rejected": -18.305801391601562, "logps/chosen": -483.70526123046875, "logps/rejected": -431.68560791015625, "loss": 0.6002, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.9311859607696533, "rewards/margins": 1.219226598739624, "rewards/rejected": 2.7119593620300293, "step": 44630 }, { "epoch": 2.0725196155810393, "grad_norm": 49.48411560058594, "learning_rate": 1.5475029790921893e-07, "logits/chosen": -19.782066345214844, "logits/rejected": -19.51841926574707, "logps/chosen": -476.48046875, "logps/rejected": -436.33892822265625, "loss": 0.7458, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.742284059524536, "rewards/margins": 0.5503314137458801, "rewards/rejected": 3.1919522285461426, "step": 44640 }, { "epoch": 2.072983889688472, "grad_norm": 116.50507354736328, "learning_rate": 1.546729188913134e-07, "logits/chosen": -20.205215454101562, "logits/rejected": -18.83177947998047, "logps/chosen": -490.5948181152344, "logps/rejected": -393.7406311035156, "loss": 0.3732, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.130326271057129, "rewards/margins": 1.9395253658294678, "rewards/rejected": 2.190800666809082, "step": 44650 }, { "epoch": 2.0734481637959052, "grad_norm": 
66.5692367553711, "learning_rate": 1.5459553987340792e-07, "logits/chosen": -20.083585739135742, "logits/rejected": -18.529651641845703, "logps/chosen": -441.6969299316406, "logps/rejected": -308.4511413574219, "loss": 0.438, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.5260956287384033, "rewards/margins": 1.333756685256958, "rewards/rejected": 2.192338705062866, "step": 44660 }, { "epoch": 2.073912437903338, "grad_norm": 240.1263427734375, "learning_rate": 1.5451816085550244e-07, "logits/chosen": -19.32722282409668, "logits/rejected": -18.82768440246582, "logps/chosen": -431.28228759765625, "logps/rejected": -313.4977722167969, "loss": 0.7349, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.446563720703125, "rewards/margins": 1.3393833637237549, "rewards/rejected": 2.10718035697937, "step": 44670 }, { "epoch": 2.0743767120107712, "grad_norm": 48.975730895996094, "learning_rate": 1.544407818375969e-07, "logits/chosen": -18.934314727783203, "logits/rejected": -18.48580551147461, "logps/chosen": -384.22918701171875, "logps/rejected": -331.8057556152344, "loss": 1.0717, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0090065002441406, "rewards/margins": 0.39434972405433655, "rewards/rejected": 2.614656925201416, "step": 44680 }, { "epoch": 2.074840986118204, "grad_norm": 1.1579678058624268, "learning_rate": 1.543634028196914e-07, "logits/chosen": -18.847976684570312, "logits/rejected": -18.1159725189209, "logps/chosen": -345.47509765625, "logps/rejected": -243.6181640625, "loss": 0.8567, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9371297359466553, "rewards/margins": 1.1521724462509155, "rewards/rejected": 1.7849572896957397, "step": 44690 }, { "epoch": 2.075305260225637, "grad_norm": 114.41967010498047, "learning_rate": 1.542860238017859e-07, "logits/chosen": -19.491466522216797, "logits/rejected": -19.876117706298828, "logps/chosen": -354.96478271484375, "logps/rejected": -391.1564636230469, "loss": 
1.5619, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.017874002456665, "rewards/margins": -0.8148336410522461, "rewards/rejected": 3.832707643508911, "step": 44700 }, { "epoch": 2.0757695343330704, "grad_norm": 2.64723539352417, "learning_rate": 1.542086447838804e-07, "logits/chosen": -19.921585083007812, "logits/rejected": -18.954853057861328, "logps/chosen": -444.648681640625, "logps/rejected": -377.21929931640625, "loss": 0.721, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.075814723968506, "rewards/margins": 1.4631417989730835, "rewards/rejected": 1.6126725673675537, "step": 44710 }, { "epoch": 2.076233808440503, "grad_norm": 35.21903610229492, "learning_rate": 1.541312657659749e-07, "logits/chosen": -19.372974395751953, "logits/rejected": -18.751537322998047, "logps/chosen": -438.8077697753906, "logps/rejected": -337.8717041015625, "loss": 0.3991, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.9340667724609375, "rewards/margins": 1.5252491235733032, "rewards/rejected": 2.408817768096924, "step": 44720 }, { "epoch": 2.0766980825479364, "grad_norm": 46.684852600097656, "learning_rate": 1.5405388674806937e-07, "logits/chosen": -18.180057525634766, "logits/rejected": -17.51225471496582, "logps/chosen": -261.06561279296875, "logps/rejected": -200.37710571289062, "loss": 0.6591, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.4370217323303223, "rewards/margins": 1.319008231163025, "rewards/rejected": 1.118013620376587, "step": 44730 }, { "epoch": 2.077162356655369, "grad_norm": 98.92718505859375, "learning_rate": 1.5397650773016388e-07, "logits/chosen": -19.558822631835938, "logits/rejected": -19.28559684753418, "logps/chosen": -455.42718505859375, "logps/rejected": -366.34844970703125, "loss": 0.7153, "rewards/accuracies": 0.5, "rewards/chosen": 3.3725712299346924, "rewards/margins": 0.8933928608894348, "rewards/rejected": 2.4791784286499023, "step": 44740 }, { "epoch": 2.0776266307628024, "grad_norm": 
141.51461791992188, "learning_rate": 1.538991287122584e-07, "logits/chosen": -19.236621856689453, "logits/rejected": -18.22053337097168, "logps/chosen": -355.9595642089844, "logps/rejected": -231.7849884033203, "loss": 0.4823, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.6211066246032715, "rewards/margins": 1.5690487623214722, "rewards/rejected": 1.0520575046539307, "step": 44750 }, { "epoch": 2.078090904870235, "grad_norm": 15.902229309082031, "learning_rate": 1.5382174969435288e-07, "logits/chosen": -19.011545181274414, "logits/rejected": -18.81007194519043, "logps/chosen": -375.1598205566406, "logps/rejected": -392.5360107421875, "loss": 0.753, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.1602158546447754, "rewards/margins": 0.9278785586357117, "rewards/rejected": 2.232337474822998, "step": 44760 }, { "epoch": 2.0785551789776684, "grad_norm": 136.81736755371094, "learning_rate": 1.5374437067644736e-07, "logits/chosen": -18.580434799194336, "logits/rejected": -17.59258460998535, "logps/chosen": -494.500244140625, "logps/rejected": -289.48663330078125, "loss": 0.8391, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.8754138946533203, "rewards/margins": 2.133944511413574, "rewards/rejected": 1.741469144821167, "step": 44770 }, { "epoch": 2.0790194530851016, "grad_norm": 12.789924621582031, "learning_rate": 1.5366699165854185e-07, "logits/chosen": -18.816936492919922, "logits/rejected": -19.092426300048828, "logps/chosen": -306.80975341796875, "logps/rejected": -366.166015625, "loss": 1.008, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.6693050861358643, "rewards/margins": -0.1687186062335968, "rewards/rejected": 2.8380236625671387, "step": 44780 }, { "epoch": 2.0794837271925344, "grad_norm": 13.685957908630371, "learning_rate": 1.5358961264063636e-07, "logits/chosen": -19.458669662475586, "logits/rejected": -18.467491149902344, "logps/chosen": -487.74566650390625, "logps/rejected": -337.0449523925781, 
"loss": 0.4392, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5624899864196777, "rewards/margins": 1.9518648386001587, "rewards/rejected": 1.6106250286102295, "step": 44790 }, { "epoch": 2.0799480012999676, "grad_norm": 6.705014705657959, "learning_rate": 1.5351223362273087e-07, "logits/chosen": -18.390390396118164, "logits/rejected": -18.70661163330078, "logps/chosen": -261.6879577636719, "logps/rejected": -285.94488525390625, "loss": 0.5445, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.59527587890625, "rewards/margins": 0.8716599345207214, "rewards/rejected": 1.7236160039901733, "step": 44800 }, { "epoch": 2.0804122754074004, "grad_norm": 40.727901458740234, "learning_rate": 1.5343485460482536e-07, "logits/chosen": -19.048263549804688, "logits/rejected": -18.728668212890625, "logps/chosen": -387.4823303222656, "logps/rejected": -350.05621337890625, "loss": 0.3408, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.8266243934631348, "rewards/margins": 1.5057293176651, "rewards/rejected": 2.320894718170166, "step": 44810 }, { "epoch": 2.0808765495148336, "grad_norm": 44.32724380493164, "learning_rate": 1.5335747558691984e-07, "logits/chosen": -18.984302520751953, "logits/rejected": -17.448305130004883, "logps/chosen": -397.53167724609375, "logps/rejected": -279.88250732421875, "loss": 0.4522, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.6501364707946777, "rewards/margins": 2.0759291648864746, "rewards/rejected": 1.5742073059082031, "step": 44820 }, { "epoch": 2.081340823622267, "grad_norm": 89.90520477294922, "learning_rate": 1.5328009656901432e-07, "logits/chosen": -19.902637481689453, "logits/rejected": -18.187963485717773, "logps/chosen": -381.3639831542969, "logps/rejected": -303.32952880859375, "loss": 0.6749, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.793117046356201, "rewards/margins": 1.62177312374115, "rewards/rejected": 2.17134428024292, "step": 44830 }, { "epoch": 
2.0818050977296996, "grad_norm": 266.37371826171875, "learning_rate": 1.5320271755110884e-07, "logits/chosen": -19.603775024414062, "logits/rejected": -18.300565719604492, "logps/chosen": -539.5211181640625, "logps/rejected": -448.8544921875, "loss": 0.6118, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.543011665344238, "rewards/margins": 1.6418869495391846, "rewards/rejected": 2.9011244773864746, "step": 44840 }, { "epoch": 2.082269371837133, "grad_norm": 62.36045455932617, "learning_rate": 1.5312533853320335e-07, "logits/chosen": -19.112163543701172, "logits/rejected": -18.4324951171875, "logps/chosen": -353.4870300292969, "logps/rejected": -291.5110778808594, "loss": 0.3087, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.9961109161376953, "rewards/margins": 1.3480262756347656, "rewards/rejected": 1.6480846405029297, "step": 44850 }, { "epoch": 2.0827336459445656, "grad_norm": 176.66253662109375, "learning_rate": 1.5304795951529783e-07, "logits/chosen": -18.153915405273438, "logits/rejected": -17.6276912689209, "logps/chosen": -410.14727783203125, "logps/rejected": -337.40631103515625, "loss": 1.0008, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.55088472366333, "rewards/margins": 0.8198311924934387, "rewards/rejected": 2.731053113937378, "step": 44860 }, { "epoch": 2.0831979200519988, "grad_norm": 107.2819595336914, "learning_rate": 1.5297058049739232e-07, "logits/chosen": -18.591386795043945, "logits/rejected": -18.691097259521484, "logps/chosen": -346.83526611328125, "logps/rejected": -311.26092529296875, "loss": 0.6381, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.608039140701294, "rewards/margins": 0.31663280725479126, "rewards/rejected": 2.2914061546325684, "step": 44870 }, { "epoch": 2.0836621941594315, "grad_norm": 8.586682319641113, "learning_rate": 1.528932014794868e-07, "logits/chosen": -18.985469818115234, "logits/rejected": -19.053422927856445, "logps/chosen": -356.63922119140625, 
"logps/rejected": -298.3666076660156, "loss": 0.5374, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.531804323196411, "rewards/margins": 0.6976671814918518, "rewards/rejected": 1.834136962890625, "step": 44880 }, { "epoch": 2.0841264682668648, "grad_norm": 1.7188185453414917, "learning_rate": 1.5281582246158131e-07, "logits/chosen": -18.437580108642578, "logits/rejected": -17.723846435546875, "logps/chosen": -364.7179260253906, "logps/rejected": -238.69570922851562, "loss": 0.5167, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.4897446632385254, "rewards/margins": 1.428932547569275, "rewards/rejected": 1.06081223487854, "step": 44890 }, { "epoch": 2.084590742374298, "grad_norm": 5.204619884490967, "learning_rate": 1.5273844344367582e-07, "logits/chosen": -19.57561683654785, "logits/rejected": -18.046283721923828, "logps/chosen": -362.03167724609375, "logps/rejected": -285.2923278808594, "loss": 0.8832, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.101430892944336, "rewards/margins": 1.1799817085266113, "rewards/rejected": 1.9214494228363037, "step": 44900 }, { "epoch": 2.0850550164817307, "grad_norm": 0.05879409983754158, "learning_rate": 1.526610644257703e-07, "logits/chosen": -19.489810943603516, "logits/rejected": -18.67053985595703, "logps/chosen": -509.56158447265625, "logps/rejected": -350.6594543457031, "loss": 0.3347, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.534327030181885, "rewards/margins": 2.2295520305633545, "rewards/rejected": 2.3047749996185303, "step": 44910 }, { "epoch": 2.085519290589164, "grad_norm": 145.40155029296875, "learning_rate": 1.525836854078648e-07, "logits/chosen": -19.688127517700195, "logits/rejected": -19.265369415283203, "logps/chosen": -332.36053466796875, "logps/rejected": -438.13958740234375, "loss": 1.0766, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7570061683654785, "rewards/margins": 0.6724632978439331, "rewards/rejected": 3.084542751312256, 
"step": 44920 }, { "epoch": 2.0859835646965967, "grad_norm": 3.5338311195373535, "learning_rate": 1.5250630638995928e-07, "logits/chosen": -19.205978393554688, "logits/rejected": -18.226787567138672, "logps/chosen": -334.7752990722656, "logps/rejected": -220.8482208251953, "loss": 0.5127, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0076262950897217, "rewards/margins": 1.5608279705047607, "rewards/rejected": 1.446798324584961, "step": 44930 }, { "epoch": 2.08644783880403, "grad_norm": 21.041643142700195, "learning_rate": 1.524289273720538e-07, "logits/chosen": -19.531784057617188, "logits/rejected": -18.480937957763672, "logps/chosen": -418.521240234375, "logps/rejected": -274.2870788574219, "loss": 0.3927, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.1557536125183105, "rewards/margins": 2.2453434467315674, "rewards/rejected": 1.9104101657867432, "step": 44940 }, { "epoch": 2.0869121129114627, "grad_norm": 3.3910417556762695, "learning_rate": 1.523515483541483e-07, "logits/chosen": -19.528966903686523, "logits/rejected": -18.294841766357422, "logps/chosen": -380.619140625, "logps/rejected": -261.19024658203125, "loss": 0.361, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4071125984191895, "rewards/margins": 2.6885509490966797, "rewards/rejected": 0.7185612916946411, "step": 44950 }, { "epoch": 2.087376387018896, "grad_norm": 188.7362823486328, "learning_rate": 1.5227416933624276e-07, "logits/chosen": -19.14694595336914, "logits/rejected": -18.930450439453125, "logps/chosen": -401.1681213378906, "logps/rejected": -328.0955505371094, "loss": 0.2717, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.195674896240234, "rewards/margins": 1.8357759714126587, "rewards/rejected": 2.359898567199707, "step": 44960 }, { "epoch": 2.087840661126329, "grad_norm": 49.45674514770508, "learning_rate": 1.5219679031833727e-07, "logits/chosen": -19.271554946899414, "logits/rejected": -18.96303939819336, "logps/chosen": 
-382.7579650878906, "logps/rejected": -375.456298828125, "loss": 1.3374, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5660014152526855, "rewards/margins": 0.0964343324303627, "rewards/rejected": 3.469566822052002, "step": 44970 }, { "epoch": 2.088304935233762, "grad_norm": 12.51368522644043, "learning_rate": 1.5211941130043176e-07, "logits/chosen": -19.949298858642578, "logits/rejected": -18.973468780517578, "logps/chosen": -282.46368408203125, "logps/rejected": -260.35650634765625, "loss": 0.8308, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.173658847808838, "rewards/margins": 1.0023791790008545, "rewards/rejected": 2.1712794303894043, "step": 44980 }, { "epoch": 2.088769209341195, "grad_norm": 1.1916536092758179, "learning_rate": 1.5204203228252627e-07, "logits/chosen": -19.224681854248047, "logits/rejected": -18.05304527282715, "logps/chosen": -493.09283447265625, "logps/rejected": -341.11956787109375, "loss": 0.8298, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.241663932800293, "rewards/margins": 1.8372522592544556, "rewards/rejected": 2.404411792755127, "step": 44990 }, { "epoch": 2.089233483448628, "grad_norm": 84.24034881591797, "learning_rate": 1.5196465326462078e-07, "logits/chosen": -18.073463439941406, "logits/rejected": -18.712291717529297, "logps/chosen": -313.4960021972656, "logps/rejected": -322.5466003417969, "loss": 1.1259, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.5191702842712402, "rewards/margins": -0.029331039637327194, "rewards/rejected": 2.5485012531280518, "step": 45000 }, { "epoch": 2.089697757556061, "grad_norm": 148.9741668701172, "learning_rate": 1.5188727424671524e-07, "logits/chosen": -19.61789321899414, "logits/rejected": -18.542644500732422, "logps/chosen": -488.98760986328125, "logps/rejected": -387.24871826171875, "loss": 0.3945, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.940083980560303, "rewards/margins": 2.0200047492980957, "rewards/rejected": 
2.920078754425049, "step": 45010 }, { "epoch": 2.0901620316634943, "grad_norm": 134.73477172851562, "learning_rate": 1.5180989522880975e-07, "logits/chosen": -19.100711822509766, "logits/rejected": -19.90873146057129, "logps/chosen": -320.61724853515625, "logps/rejected": -316.57330322265625, "loss": 0.9902, "rewards/accuracies": 0.5, "rewards/chosen": 1.8212785720825195, "rewards/margins": -0.08917972445487976, "rewards/rejected": 1.9104582071304321, "step": 45020 }, { "epoch": 2.090626305770927, "grad_norm": 105.68289184570312, "learning_rate": 1.5173251621090423e-07, "logits/chosen": -19.155067443847656, "logits/rejected": -19.334524154663086, "logps/chosen": -449.2694396972656, "logps/rejected": -411.30194091796875, "loss": 0.5277, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.671066761016846, "rewards/margins": 1.4008156061172485, "rewards/rejected": 3.2702510356903076, "step": 45030 }, { "epoch": 2.0910905798783603, "grad_norm": 0.26638978719711304, "learning_rate": 1.5165513719299874e-07, "logits/chosen": -20.221220016479492, "logits/rejected": -19.149578094482422, "logps/chosen": -383.0754089355469, "logps/rejected": -304.159423828125, "loss": 0.4997, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.100732326507568, "rewards/margins": 1.4070591926574707, "rewards/rejected": 2.6936726570129395, "step": 45040 }, { "epoch": 2.091554853985793, "grad_norm": 82.94407653808594, "learning_rate": 1.5157775817509326e-07, "logits/chosen": -18.84616470336914, "logits/rejected": -18.77886962890625, "logps/chosen": -382.6479187011719, "logps/rejected": -359.8411560058594, "loss": 1.1568, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.6491899490356445, "rewards/margins": 0.7279529571533203, "rewards/rejected": 3.921236515045166, "step": 45050 }, { "epoch": 2.0920191280932263, "grad_norm": 266.08758544921875, "learning_rate": 1.5150037915718771e-07, "logits/chosen": -17.985355377197266, "logits/rejected": -18.003040313720703, 
"logps/chosen": -326.25201416015625, "logps/rejected": -354.97015380859375, "loss": 0.5212, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4359543323516846, "rewards/margins": 1.922320008277893, "rewards/rejected": 1.513634443283081, "step": 45060 }, { "epoch": 2.092483402200659, "grad_norm": 90.24127960205078, "learning_rate": 1.5142300013928223e-07, "logits/chosen": -19.497379302978516, "logits/rejected": -19.35367202758789, "logps/chosen": -360.31939697265625, "logps/rejected": -352.2226867675781, "loss": 0.7878, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.907611846923828, "rewards/margins": 1.1165663003921509, "rewards/rejected": 2.7910451889038086, "step": 45070 }, { "epoch": 2.0929476763080923, "grad_norm": 5.849235534667969, "learning_rate": 1.513456211213767e-07, "logits/chosen": -19.402368545532227, "logits/rejected": -18.122337341308594, "logps/chosen": -400.33172607421875, "logps/rejected": -298.3547668457031, "loss": 0.2726, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.446208953857422, "rewards/margins": 2.991973400115967, "rewards/rejected": 1.454235315322876, "step": 45080 }, { "epoch": 2.0934119504155255, "grad_norm": 9.512832641601562, "learning_rate": 1.5126824210347122e-07, "logits/chosen": -19.56393051147461, "logits/rejected": -18.501739501953125, "logps/chosen": -415.47076416015625, "logps/rejected": -314.7469177246094, "loss": 0.3599, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.3242530822753906, "rewards/margins": 1.2201499938964844, "rewards/rejected": 2.104102849960327, "step": 45090 }, { "epoch": 2.0938762245229583, "grad_norm": 48.36989212036133, "learning_rate": 1.5119086308556573e-07, "logits/chosen": -18.695335388183594, "logits/rejected": -18.654796600341797, "logps/chosen": -412.4236755371094, "logps/rejected": -432.0747985839844, "loss": 1.1421, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.462900161743164, "rewards/margins": 0.40347081422805786, 
"rewards/rejected": 3.059429407119751, "step": 45100 }, { "epoch": 2.0943404986303915, "grad_norm": 254.8941192626953, "learning_rate": 1.511134840676602e-07, "logits/chosen": -20.360172271728516, "logits/rejected": -18.90369987487793, "logps/chosen": -492.6693420410156, "logps/rejected": -360.4525451660156, "loss": 0.5964, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.5724029541015625, "rewards/margins": 1.8280563354492188, "rewards/rejected": 2.7443463802337646, "step": 45110 }, { "epoch": 2.0948047727378243, "grad_norm": 1.0807827711105347, "learning_rate": 1.510361050497547e-07, "logits/chosen": -20.720455169677734, "logits/rejected": -19.599098205566406, "logps/chosen": -366.15740966796875, "logps/rejected": -297.78131103515625, "loss": 0.5899, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.9102635383605957, "rewards/margins": 1.7880828380584717, "rewards/rejected": 2.122180461883545, "step": 45120 }, { "epoch": 2.0952690468452575, "grad_norm": 85.23966979980469, "learning_rate": 1.509587260318492e-07, "logits/chosen": -18.241802215576172, "logits/rejected": -18.236957550048828, "logps/chosen": -396.7330322265625, "logps/rejected": -386.99609375, "loss": 1.3649, "rewards/accuracies": 0.5, "rewards/chosen": 2.9337940216064453, "rewards/margins": 0.29482218623161316, "rewards/rejected": 2.638972043991089, "step": 45130 }, { "epoch": 2.0957333209526903, "grad_norm": 58.87978744506836, "learning_rate": 1.508813470139437e-07, "logits/chosen": -19.205402374267578, "logits/rejected": -18.175996780395508, "logps/chosen": -376.69647216796875, "logps/rejected": -242.62533569335938, "loss": 0.2875, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.6546244621276855, "rewards/margins": 1.6387220621109009, "rewards/rejected": 1.0159022808074951, "step": 45140 }, { "epoch": 2.0961975950601235, "grad_norm": 44.6015739440918, "learning_rate": 1.508039679960382e-07, "logits/chosen": -20.136075973510742, "logits/rejected": 
-18.442611694335938, "logps/chosen": -440.1355895996094, "logps/rejected": -284.85321044921875, "loss": 0.4807, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.202517032623291, "rewards/margins": 2.4283337593078613, "rewards/rejected": 2.7741832733154297, "step": 45150 }, { "epoch": 2.0966618691675567, "grad_norm": 40.560726165771484, "learning_rate": 1.5072658897813267e-07, "logits/chosen": -18.883588790893555, "logits/rejected": -18.640045166015625, "logps/chosen": -414.4923400878906, "logps/rejected": -362.1506652832031, "loss": 1.5008, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.8393218517303467, "rewards/margins": 0.19698183238506317, "rewards/rejected": 3.6423401832580566, "step": 45160 }, { "epoch": 2.0971261432749895, "grad_norm": 86.56692504882812, "learning_rate": 1.5064920996022718e-07, "logits/chosen": -20.335880279541016, "logits/rejected": -20.31234359741211, "logps/chosen": -419.35198974609375, "logps/rejected": -354.1116638183594, "loss": 0.3319, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.059573173522949, "rewards/margins": 1.3620750904083252, "rewards/rejected": 2.697497844696045, "step": 45170 }, { "epoch": 2.0975904173824227, "grad_norm": 112.36517333984375, "learning_rate": 1.5057183094232166e-07, "logits/chosen": -18.38050651550293, "logits/rejected": -17.11606216430664, "logps/chosen": -444.1830139160156, "logps/rejected": -361.2713317871094, "loss": 0.2267, "rewards/accuracies": 1.0, "rewards/chosen": 3.454625368118286, "rewards/margins": 2.116023540496826, "rewards/rejected": 1.3386017084121704, "step": 45180 }, { "epoch": 2.0980546914898555, "grad_norm": 66.77265167236328, "learning_rate": 1.5049445192441618e-07, "logits/chosen": -18.95250129699707, "logits/rejected": -18.449811935424805, "logps/chosen": -367.0890197753906, "logps/rejected": -332.2874755859375, "loss": 0.7451, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.1613101959228516, "rewards/margins": 
0.37974125146865845, "rewards/rejected": 2.781569004058838, "step": 45190 }, { "epoch": 2.0985189655972887, "grad_norm": 53.81304168701172, "learning_rate": 1.5041707290651066e-07, "logits/chosen": -19.09137725830078, "logits/rejected": -17.64394760131836, "logps/chosen": -302.92913818359375, "logps/rejected": -203.33828735351562, "loss": 0.3361, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.4675002098083496, "rewards/margins": 1.591975450515747, "rewards/rejected": 0.8755248785018921, "step": 45200 }, { "epoch": 2.0989832397047214, "grad_norm": 11.47939682006836, "learning_rate": 1.5033969388860515e-07, "logits/chosen": -18.758323669433594, "logits/rejected": -18.257305145263672, "logps/chosen": -355.12005615234375, "logps/rejected": -299.9915771484375, "loss": 1.4638, "rewards/accuracies": 0.5, "rewards/chosen": 3.2265357971191406, "rewards/margins": -0.02704448625445366, "rewards/rejected": 3.253580093383789, "step": 45210 }, { "epoch": 2.0994475138121547, "grad_norm": 43.408973693847656, "learning_rate": 1.5026231487069966e-07, "logits/chosen": -19.248687744140625, "logits/rejected": -18.72295570373535, "logps/chosen": -347.4604187011719, "logps/rejected": -333.5646057128906, "loss": 1.3085, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.233790636062622, "rewards/margins": 0.31514573097229004, "rewards/rejected": 2.918644666671753, "step": 45220 }, { "epoch": 2.099911787919588, "grad_norm": 79.29955291748047, "learning_rate": 1.5018493585279414e-07, "logits/chosen": -18.79244613647461, "logits/rejected": -18.190366744995117, "logps/chosen": -460.97430419921875, "logps/rejected": -356.10809326171875, "loss": 0.6681, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.8368277549743652, "rewards/margins": 1.385024905204773, "rewards/rejected": 2.451802968978882, "step": 45230 }, { "epoch": 2.1003760620270207, "grad_norm": 78.03582763671875, "learning_rate": 1.5010755683488865e-07, "logits/chosen": -18.203937530517578, 
"logits/rejected": -17.757123947143555, "logps/chosen": -340.56585693359375, "logps/rejected": -302.839111328125, "loss": 0.9626, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9430084228515625, "rewards/margins": 0.6847160458564758, "rewards/rejected": 2.2582924365997314, "step": 45240 }, { "epoch": 2.100840336134454, "grad_norm": 57.315773010253906, "learning_rate": 1.5003017781698314e-07, "logits/chosen": -19.360437393188477, "logits/rejected": -18.296104431152344, "logps/chosen": -415.08197021484375, "logps/rejected": -302.8036804199219, "loss": 0.5754, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9533441066741943, "rewards/margins": 0.9859046936035156, "rewards/rejected": 1.96743905544281, "step": 45250 }, { "epoch": 2.1013046102418866, "grad_norm": 247.88641357421875, "learning_rate": 1.4995279879907762e-07, "logits/chosen": -19.42205238342285, "logits/rejected": -18.38230323791504, "logps/chosen": -356.2047424316406, "logps/rejected": -374.4033203125, "loss": 1.0154, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.0186190605163574, "rewards/margins": 0.2888185977935791, "rewards/rejected": 2.7298007011413574, "step": 45260 }, { "epoch": 2.10176888434932, "grad_norm": 0.5633729100227356, "learning_rate": 1.4987541978117213e-07, "logits/chosen": -20.515310287475586, "logits/rejected": -19.919889450073242, "logps/chosen": -392.8396911621094, "logps/rejected": -324.8146667480469, "loss": 0.5441, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.8761565685272217, "rewards/margins": 1.1322224140167236, "rewards/rejected": 2.743934154510498, "step": 45270 }, { "epoch": 2.102233158456753, "grad_norm": 43.61875534057617, "learning_rate": 1.4979804076326662e-07, "logits/chosen": -18.96971893310547, "logits/rejected": -18.130151748657227, "logps/chosen": -483.9049377441406, "logps/rejected": -386.4327087402344, "loss": 0.5481, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.9081244468688965, 
"rewards/margins": 1.4694035053253174, "rewards/rejected": 3.438720703125, "step": 45280 }, { "epoch": 2.102697432564186, "grad_norm": 32.14234161376953, "learning_rate": 1.4972066174536113e-07, "logits/chosen": -19.981332778930664, "logits/rejected": -18.243698120117188, "logps/chosen": -394.67364501953125, "logps/rejected": -250.8757781982422, "loss": 0.3054, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.179922103881836, "rewards/margins": 2.1787397861480713, "rewards/rejected": 2.0011823177337646, "step": 45290 }, { "epoch": 2.103161706671619, "grad_norm": 257.7904968261719, "learning_rate": 1.4964328272745561e-07, "logits/chosen": -18.933513641357422, "logits/rejected": -18.52969741821289, "logps/chosen": -353.24578857421875, "logps/rejected": -323.78643798828125, "loss": 0.561, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.446402072906494, "rewards/margins": 0.7093623876571655, "rewards/rejected": 1.7370399236679077, "step": 45300 }, { "epoch": 2.103625980779052, "grad_norm": 68.83973693847656, "learning_rate": 1.495659037095501e-07, "logits/chosen": -20.143648147583008, "logits/rejected": -19.202919006347656, "logps/chosen": -417.0787048339844, "logps/rejected": -340.6068115234375, "loss": 0.2716, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.4167895317077637, "rewards/margins": 2.2987053394317627, "rewards/rejected": 1.1180849075317383, "step": 45310 }, { "epoch": 2.104090254886485, "grad_norm": 4.190691947937012, "learning_rate": 1.494885246916446e-07, "logits/chosen": -19.48028564453125, "logits/rejected": -18.665679931640625, "logps/chosen": -370.1878967285156, "logps/rejected": -330.1237487792969, "loss": 0.6778, "rewards/accuracies": 0.5, "rewards/chosen": 3.320725202560425, "rewards/margins": 0.8547745943069458, "rewards/rejected": 2.4659504890441895, "step": 45320 }, { "epoch": 2.104554528993918, "grad_norm": 8.272676467895508, "learning_rate": 1.494111456737391e-07, "logits/chosen": -20.5948486328125, 
"logits/rejected": -20.30840492248535, "logps/chosen": -653.0349731445312, "logps/rejected": -349.7455749511719, "loss": 0.3479, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.798460483551025, "rewards/margins": 1.540855050086975, "rewards/rejected": 3.2576053142547607, "step": 45330 }, { "epoch": 2.105018803101351, "grad_norm": 225.3589630126953, "learning_rate": 1.493337666558336e-07, "logits/chosen": -19.620447158813477, "logits/rejected": -19.396678924560547, "logps/chosen": -400.8050231933594, "logps/rejected": -397.35772705078125, "loss": 0.9497, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.085641384124756, "rewards/margins": 0.5680271983146667, "rewards/rejected": 2.5176146030426025, "step": 45340 }, { "epoch": 2.1054830772087842, "grad_norm": 8.199030876159668, "learning_rate": 1.492563876379281e-07, "logits/chosen": -18.410202026367188, "logits/rejected": -17.543947219848633, "logps/chosen": -429.01837158203125, "logps/rejected": -314.1185607910156, "loss": 0.4411, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.966902256011963, "rewards/margins": 2.0943970680236816, "rewards/rejected": 1.8725054264068604, "step": 45350 }, { "epoch": 2.105947351316217, "grad_norm": 234.05113220214844, "learning_rate": 1.4917900862002258e-07, "logits/chosen": -18.546916961669922, "logits/rejected": -17.61776351928711, "logps/chosen": -379.7079772949219, "logps/rejected": -320.7392578125, "loss": 0.7078, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0689263343811035, "rewards/margins": 1.2859939336776733, "rewards/rejected": 1.7829320430755615, "step": 45360 }, { "epoch": 2.1064116254236502, "grad_norm": 4.873209476470947, "learning_rate": 1.491016296021171e-07, "logits/chosen": -18.486255645751953, "logits/rejected": -17.530187606811523, "logps/chosen": -484.2682189941406, "logps/rejected": -343.543701171875, "loss": 0.7739, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.8794925212860107, 
"rewards/margins": 1.7518399953842163, "rewards/rejected": 2.127652645111084, "step": 45370 }, { "epoch": 2.106875899531083, "grad_norm": 1.3810207843780518, "learning_rate": 1.490242505842116e-07, "logits/chosen": -19.27651596069336, "logits/rejected": -17.773386001586914, "logps/chosen": -426.721435546875, "logps/rejected": -276.54461669921875, "loss": 0.6238, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.093203544616699, "rewards/margins": 1.554495930671692, "rewards/rejected": 2.5387074947357178, "step": 45380 }, { "epoch": 2.1073401736385162, "grad_norm": 163.24269104003906, "learning_rate": 1.4894687156630608e-07, "logits/chosen": -18.54347801208496, "logits/rejected": -19.009254455566406, "logps/chosen": -352.3524169921875, "logps/rejected": -360.72369384765625, "loss": 1.3306, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 4.018022537231445, "rewards/margins": 0.09004902839660645, "rewards/rejected": 3.927974224090576, "step": 45390 }, { "epoch": 2.107804447745949, "grad_norm": 9.086923599243164, "learning_rate": 1.4886949254840057e-07, "logits/chosen": -19.395084381103516, "logits/rejected": -18.599369049072266, "logps/chosen": -319.3992614746094, "logps/rejected": -277.78375244140625, "loss": 0.3837, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.369595766067505, "rewards/margins": 1.0929040908813477, "rewards/rejected": 1.2766914367675781, "step": 45400 }, { "epoch": 2.108268721853382, "grad_norm": 89.19853210449219, "learning_rate": 1.4879211353049505e-07, "logits/chosen": -19.403228759765625, "logits/rejected": -18.48747444152832, "logps/chosen": -438.78338623046875, "logps/rejected": -338.02557373046875, "loss": 0.3099, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4996161460876465, "rewards/margins": 2.5931267738342285, "rewards/rejected": 0.9064895510673523, "step": 45410 }, { "epoch": 2.1087329959608154, "grad_norm": 34.60182571411133, "learning_rate": 1.4871473451258956e-07, "logits/chosen": 
-19.920169830322266, "logits/rejected": -19.212453842163086, "logps/chosen": -493.21514892578125, "logps/rejected": -405.5977783203125, "loss": 0.8281, "rewards/accuracies": 0.5, "rewards/chosen": 4.200211524963379, "rewards/margins": 0.6228700876235962, "rewards/rejected": 3.5773415565490723, "step": 45420 }, { "epoch": 2.109197270068248, "grad_norm": 250.937255859375, "learning_rate": 1.4863735549468408e-07, "logits/chosen": -18.984134674072266, "logits/rejected": -18.462194442749023, "logps/chosen": -448.6748962402344, "logps/rejected": -473.8605041503906, "loss": 1.0157, "rewards/accuracies": 0.5, "rewards/chosen": 3.58756947517395, "rewards/margins": 0.5170302987098694, "rewards/rejected": 3.0705389976501465, "step": 45430 }, { "epoch": 2.1096615441756814, "grad_norm": 100.24185180664062, "learning_rate": 1.4855997647677853e-07, "logits/chosen": -19.235136032104492, "logits/rejected": -18.63705825805664, "logps/chosen": -421.21453857421875, "logps/rejected": -365.34307861328125, "loss": 0.5587, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.564629554748535, "rewards/margins": 1.5480730533599854, "rewards/rejected": 3.0165562629699707, "step": 45440 }, { "epoch": 2.110125818283114, "grad_norm": 23.929393768310547, "learning_rate": 1.4848259745887305e-07, "logits/chosen": -19.566011428833008, "logits/rejected": -18.45886993408203, "logps/chosen": -379.7837829589844, "logps/rejected": -222.48019409179688, "loss": 0.4434, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7025845050811768, "rewards/margins": 2.0173935890197754, "rewards/rejected": 1.6851905584335327, "step": 45450 }, { "epoch": 2.1105900923905474, "grad_norm": 316.12994384765625, "learning_rate": 1.4840521844096753e-07, "logits/chosen": -19.218202590942383, "logits/rejected": -18.854183197021484, "logps/chosen": -424.20843505859375, "logps/rejected": -329.4411315917969, "loss": 0.8666, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.7924742698669434, 
"rewards/margins": 0.8491240739822388, "rewards/rejected": 1.9433501958847046, "step": 45460 }, { "epoch": 2.1110543664979806, "grad_norm": 30.811281204223633, "learning_rate": 1.4832783942306204e-07, "logits/chosen": -18.169898986816406, "logits/rejected": -17.561378479003906, "logps/chosen": -425.6954040527344, "logps/rejected": -332.54583740234375, "loss": 0.8506, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.1448869705200195, "rewards/margins": 1.661799669265747, "rewards/rejected": 2.4830873012542725, "step": 45470 }, { "epoch": 2.1115186406054134, "grad_norm": 41.2418327331543, "learning_rate": 1.4825046040515655e-07, "logits/chosen": -19.115886688232422, "logits/rejected": -18.1595458984375, "logps/chosen": -353.22503662109375, "logps/rejected": -292.6691589355469, "loss": 0.4548, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.9328927993774414, "rewards/margins": 1.105331540107727, "rewards/rejected": 1.8275611400604248, "step": 45480 }, { "epoch": 2.1119829147128466, "grad_norm": 0.19862137734889984, "learning_rate": 1.48173081387251e-07, "logits/chosen": -18.94651985168457, "logits/rejected": -17.629873275756836, "logps/chosen": -485.37225341796875, "logps/rejected": -353.8472595214844, "loss": 0.766, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.188170433044434, "rewards/margins": 1.8378016948699951, "rewards/rejected": 2.3503684997558594, "step": 45490 }, { "epoch": 2.1124471888202794, "grad_norm": 0.09840814769268036, "learning_rate": 1.4809570236934552e-07, "logits/chosen": -19.589099884033203, "logits/rejected": -18.535587310791016, "logps/chosen": -353.27960205078125, "logps/rejected": -286.84039306640625, "loss": 1.2851, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.6743240356445312, "rewards/margins": 0.9192558526992798, "rewards/rejected": 1.755068063735962, "step": 45500 }, { "epoch": 2.1129114629277126, "grad_norm": 110.05516052246094, "learning_rate": 1.4801832335144e-07, 
"logits/chosen": -18.73106575012207, "logits/rejected": -18.403837203979492, "logps/chosen": -427.6396484375, "logps/rejected": -389.33331298828125, "loss": 0.8155, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.924495220184326, "rewards/margins": 1.1503760814666748, "rewards/rejected": 2.7741189002990723, "step": 45510 }, { "epoch": 2.1133757370351454, "grad_norm": 218.16119384765625, "learning_rate": 1.4794094433353452e-07, "logits/chosen": -19.330142974853516, "logits/rejected": -18.701786041259766, "logps/chosen": -522.7275390625, "logps/rejected": -399.300537109375, "loss": 0.7286, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.223978519439697, "rewards/margins": 0.9992515444755554, "rewards/rejected": 3.224726915359497, "step": 45520 }, { "epoch": 2.1138400111425786, "grad_norm": 26.906557083129883, "learning_rate": 1.4786356531562903e-07, "logits/chosen": -19.969585418701172, "logits/rejected": -19.358707427978516, "logps/chosen": -357.7971496582031, "logps/rejected": -304.9964904785156, "loss": 0.5274, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.336155414581299, "rewards/margins": 1.0383999347686768, "rewards/rejected": 2.297755002975464, "step": 45530 }, { "epoch": 2.114304285250012, "grad_norm": 16.83333969116211, "learning_rate": 1.477861862977235e-07, "logits/chosen": -19.48737907409668, "logits/rejected": -18.574522018432617, "logps/chosen": -434.77392578125, "logps/rejected": -348.08245849609375, "loss": 0.6984, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.9450485706329346, "rewards/margins": 1.440300703048706, "rewards/rejected": 2.5047478675842285, "step": 45540 }, { "epoch": 2.1147685593574446, "grad_norm": 1.6780346632003784, "learning_rate": 1.47708807279818e-07, "logits/chosen": -19.082481384277344, "logits/rejected": -18.566858291625977, "logps/chosen": -397.6392517089844, "logps/rejected": -396.10321044921875, "loss": 0.5209, "rewards/accuracies": 0.6000000238418579, 
"rewards/chosen": 3.687645435333252, "rewards/margins": 1.0816242694854736, "rewards/rejected": 2.6060211658477783, "step": 45550 }, { "epoch": 2.115232833464878, "grad_norm": 207.0624542236328, "learning_rate": 1.4763142826191248e-07, "logits/chosen": -19.75168800354004, "logits/rejected": -18.566879272460938, "logps/chosen": -361.3135681152344, "logps/rejected": -276.03759765625, "loss": 0.7098, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5532009601593018, "rewards/margins": 0.8472212553024292, "rewards/rejected": 2.705979585647583, "step": 45560 }, { "epoch": 2.1156971075723106, "grad_norm": 148.48739624023438, "learning_rate": 1.47554049244007e-07, "logits/chosen": -19.376062393188477, "logits/rejected": -18.36057472229004, "logps/chosen": -400.7433166503906, "logps/rejected": -358.2461853027344, "loss": 0.7276, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.646907329559326, "rewards/margins": 0.7837805151939392, "rewards/rejected": 2.863126277923584, "step": 45570 }, { "epoch": 2.1161613816797438, "grad_norm": 55.60082244873047, "learning_rate": 1.474766702261015e-07, "logits/chosen": -19.574962615966797, "logits/rejected": -18.468591690063477, "logps/chosen": -456.1444396972656, "logps/rejected": -304.1674499511719, "loss": 0.3563, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.145951271057129, "rewards/margins": 1.968868613243103, "rewards/rejected": 2.1770830154418945, "step": 45580 }, { "epoch": 2.1166256557871765, "grad_norm": 151.2653045654297, "learning_rate": 1.4739929120819597e-07, "logits/chosen": -18.778989791870117, "logits/rejected": -18.71072769165039, "logps/chosen": -331.59783935546875, "logps/rejected": -337.2554626464844, "loss": 1.0888, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.9624416828155518, "rewards/margins": -0.28842461109161377, "rewards/rejected": 3.250866413116455, "step": 45590 }, { "epoch": 2.1170899298946098, "grad_norm": 10.77286434173584, "learning_rate": 
1.4732191219029048e-07, "logits/chosen": -19.263957977294922, "logits/rejected": -18.0086727142334, "logps/chosen": -412.5250549316406, "logps/rejected": -281.41680908203125, "loss": 0.3195, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.593341112136841, "rewards/margins": 1.5904285907745361, "rewards/rejected": 2.0029122829437256, "step": 45600 }, { "epoch": 2.117554204002043, "grad_norm": 57.899044036865234, "learning_rate": 1.4724453317238496e-07, "logits/chosen": -19.02182960510254, "logits/rejected": -18.738147735595703, "logps/chosen": -280.2665710449219, "logps/rejected": -262.2132568359375, "loss": 0.6064, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.346359968185425, "rewards/margins": 1.0777137279510498, "rewards/rejected": 1.268646001815796, "step": 45610 }, { "epoch": 2.1180184781094757, "grad_norm": 174.64093017578125, "learning_rate": 1.4716715415447947e-07, "logits/chosen": -18.939268112182617, "logits/rejected": -18.02277183532715, "logps/chosen": -458.9313049316406, "logps/rejected": -399.6234436035156, "loss": 1.0142, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.525969505310059, "rewards/margins": 1.5615915060043335, "rewards/rejected": 2.964378833770752, "step": 45620 }, { "epoch": 2.118482752216909, "grad_norm": 18.777847290039062, "learning_rate": 1.4708977513657398e-07, "logits/chosen": -18.574234008789062, "logits/rejected": -18.292766571044922, "logps/chosen": -291.21136474609375, "logps/rejected": -332.9618225097656, "loss": 0.7186, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.6475350856781006, "rewards/margins": 1.4647767543792725, "rewards/rejected": 1.182758092880249, "step": 45630 }, { "epoch": 2.1189470263243417, "grad_norm": 22.747289657592773, "learning_rate": 1.4701239611866844e-07, "logits/chosen": -19.358474731445312, "logits/rejected": -18.828866958618164, "logps/chosen": -440.78900146484375, "logps/rejected": -372.2601013183594, "loss": 0.9808, "rewards/accuracies": 
0.6000000238418579, "rewards/chosen": 3.4532291889190674, "rewards/margins": 0.8151838183403015, "rewards/rejected": 2.638045072555542, "step": 45640 }, { "epoch": 2.119411300431775, "grad_norm": 132.12522888183594, "learning_rate": 1.4693501710076295e-07, "logits/chosen": -19.858585357666016, "logits/rejected": -18.984683990478516, "logps/chosen": -384.2977600097656, "logps/rejected": -346.44757080078125, "loss": 0.5889, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6264853477478027, "rewards/margins": 1.0712181329727173, "rewards/rejected": 2.555267095565796, "step": 45650 }, { "epoch": 2.119875574539208, "grad_norm": 0.13005810976028442, "learning_rate": 1.4685763808285744e-07, "logits/chosen": -19.222745895385742, "logits/rejected": -18.571908950805664, "logps/chosen": -317.3880615234375, "logps/rejected": -289.90167236328125, "loss": 1.1668, "rewards/accuracies": 0.5, "rewards/chosen": 2.815256118774414, "rewards/margins": 0.6701111197471619, "rewards/rejected": 2.1451449394226074, "step": 45660 }, { "epoch": 2.120339848646641, "grad_norm": 181.24057006835938, "learning_rate": 1.4678025906495195e-07, "logits/chosen": -19.447542190551758, "logits/rejected": -19.51317024230957, "logps/chosen": -349.205810546875, "logps/rejected": -389.14837646484375, "loss": 1.0943, "rewards/accuracies": 0.5, "rewards/chosen": 3.971414089202881, "rewards/margins": 0.33557072281837463, "rewards/rejected": 3.6358437538146973, "step": 45670 }, { "epoch": 2.120804122754074, "grad_norm": 52.67782211303711, "learning_rate": 1.4670288004704643e-07, "logits/chosen": -18.767772674560547, "logits/rejected": -17.87015151977539, "logps/chosen": -474.20574951171875, "logps/rejected": -339.88751220703125, "loss": 0.6502, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.75878643989563, "rewards/margins": 1.2418795824050903, "rewards/rejected": 2.516907215118408, "step": 45680 }, { "epoch": 2.121268396861507, "grad_norm": 143.98214721679688, "learning_rate": 
1.4662550102914092e-07, "logits/chosen": -19.53288459777832, "logits/rejected": -18.627490997314453, "logps/chosen": -379.08465576171875, "logps/rejected": -247.0988006591797, "loss": 0.5716, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7145018577575684, "rewards/margins": 2.0995116233825684, "rewards/rejected": 1.614990234375, "step": 45690 }, { "epoch": 2.12173267096894, "grad_norm": 22.968942642211914, "learning_rate": 1.4654812201123543e-07, "logits/chosen": -19.27614974975586, "logits/rejected": -18.030704498291016, "logps/chosen": -453.8174743652344, "logps/rejected": -336.9143371582031, "loss": 0.3121, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.340322971343994, "rewards/margins": 2.102125406265259, "rewards/rejected": 2.2381973266601562, "step": 45700 }, { "epoch": 2.122196945076373, "grad_norm": 8.047600746154785, "learning_rate": 1.4647074299332992e-07, "logits/chosen": -19.556537628173828, "logits/rejected": -18.75009536743164, "logps/chosen": -416.51556396484375, "logps/rejected": -279.7408752441406, "loss": 0.6392, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.067835569381714, "rewards/margins": 0.9919745326042175, "rewards/rejected": 2.0758609771728516, "step": 45710 }, { "epoch": 2.122661219183806, "grad_norm": 54.480648040771484, "learning_rate": 1.4639336397542443e-07, "logits/chosen": -19.157798767089844, "logits/rejected": -20.170574188232422, "logps/chosen": -288.78985595703125, "logps/rejected": -380.3854064941406, "loss": 1.7521, "rewards/accuracies": 0.5, "rewards/chosen": 2.8530144691467285, "rewards/margins": -1.0643607378005981, "rewards/rejected": 3.9173755645751953, "step": 45720 }, { "epoch": 2.1231254932912393, "grad_norm": 44.85091781616211, "learning_rate": 1.463159849575189e-07, "logits/chosen": -19.0462589263916, "logits/rejected": -18.032407760620117, "logps/chosen": -427.7596130371094, "logps/rejected": -382.273193359375, "loss": 0.4981, "rewards/accuracies": 0.8999999761581421, 
"rewards/chosen": 4.1209306716918945, "rewards/margins": 1.9331775903701782, "rewards/rejected": 2.1877529621124268, "step": 45730 }, { "epoch": 2.123589767398672, "grad_norm": 23.858121871948242, "learning_rate": 1.462386059396134e-07, "logits/chosen": -20.761816024780273, "logits/rejected": -19.065401077270508, "logps/chosen": -374.62518310546875, "logps/rejected": -316.3326110839844, "loss": 0.6584, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.8558125495910645, "rewards/margins": 1.09256911277771, "rewards/rejected": 1.763243317604065, "step": 45740 }, { "epoch": 2.1240540415061053, "grad_norm": 5.721181392669678, "learning_rate": 1.461612269217079e-07, "logits/chosen": -19.68789291381836, "logits/rejected": -19.09699058532715, "logps/chosen": -485.4871520996094, "logps/rejected": -355.6490478515625, "loss": 0.2764, "rewards/accuracies": 1.0, "rewards/chosen": 3.9377589225769043, "rewards/margins": 1.5305718183517456, "rewards/rejected": 2.407186985015869, "step": 45750 }, { "epoch": 2.124518315613538, "grad_norm": 45.084197998046875, "learning_rate": 1.460838479038024e-07, "logits/chosen": -19.034530639648438, "logits/rejected": -18.30868148803711, "logps/chosen": -362.0330505371094, "logps/rejected": -318.7282409667969, "loss": 0.7691, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7393460273742676, "rewards/margins": 1.439965844154358, "rewards/rejected": 2.29938006401062, "step": 45760 }, { "epoch": 2.1249825897209713, "grad_norm": 67.98906707763672, "learning_rate": 1.460064688858969e-07, "logits/chosen": -18.5196533203125, "logits/rejected": -17.929241180419922, "logps/chosen": -414.0989685058594, "logps/rejected": -326.8999938964844, "loss": 0.7068, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.7992067337036133, "rewards/margins": 0.9508149027824402, "rewards/rejected": 1.8483917713165283, "step": 45770 }, { "epoch": 2.125446863828404, "grad_norm": 7.51886510848999, "learning_rate": 1.459290898679914e-07, 
"logits/chosen": -19.625608444213867, "logits/rejected": -17.595478057861328, "logps/chosen": -436.90093994140625, "logps/rejected": -241.9094696044922, "loss": 0.5468, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.291024208068848, "rewards/margins": 2.651808261871338, "rewards/rejected": 1.6392158269882202, "step": 45780 }, { "epoch": 2.1259111379358373, "grad_norm": 78.68721771240234, "learning_rate": 1.4585171085008587e-07, "logits/chosen": -19.064023971557617, "logits/rejected": -18.88934898376465, "logps/chosen": -437.88201904296875, "logps/rejected": -347.906982421875, "loss": 0.6331, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5181221961975098, "rewards/margins": 1.6059882640838623, "rewards/rejected": 1.912133812904358, "step": 45790 }, { "epoch": 2.1263754120432705, "grad_norm": 41.94697189331055, "learning_rate": 1.4577433183218039e-07, "logits/chosen": -18.256977081298828, "logits/rejected": -17.22694206237793, "logps/chosen": -379.84423828125, "logps/rejected": -309.315185546875, "loss": 0.2045, "rewards/accuracies": 1.0, "rewards/chosen": 3.5212504863739014, "rewards/margins": 2.432112455368042, "rewards/rejected": 1.0891374349594116, "step": 45800 }, { "epoch": 2.1268396861507033, "grad_norm": 0.19444365799427032, "learning_rate": 1.4569695281427487e-07, "logits/chosen": -20.01448631286621, "logits/rejected": -19.0224552154541, "logps/chosen": -317.36077880859375, "logps/rejected": -223.61563110351562, "loss": 0.6004, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0774574279785156, "rewards/margins": 1.1240428686141968, "rewards/rejected": 1.9534145593643188, "step": 45810 }, { "epoch": 2.1273039602581365, "grad_norm": 1.5576865673065186, "learning_rate": 1.4561957379636938e-07, "logits/chosen": -18.13243293762207, "logits/rejected": -18.11103057861328, "logps/chosen": -238.19345092773438, "logps/rejected": -236.8037872314453, "loss": 0.6203, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 
1.728494644165039, "rewards/margins": 0.8372660875320435, "rewards/rejected": 0.8912284970283508, "step": 45820 }, { "epoch": 2.1277682343655693, "grad_norm": 23.05275535583496, "learning_rate": 1.4554219477846387e-07, "logits/chosen": -19.3726749420166, "logits/rejected": -19.347810745239258, "logps/chosen": -344.5293884277344, "logps/rejected": -370.60614013671875, "loss": 0.9554, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.5058932304382324, "rewards/margins": 0.09960637241601944, "rewards/rejected": 2.4062867164611816, "step": 45830 }, { "epoch": 2.1282325084730025, "grad_norm": 14.664737701416016, "learning_rate": 1.4546481576055835e-07, "logits/chosen": -19.728397369384766, "logits/rejected": -19.153226852416992, "logps/chosen": -388.055908203125, "logps/rejected": -403.0387878417969, "loss": 0.4208, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.144440174102783, "rewards/margins": 1.7317430973052979, "rewards/rejected": 2.4126968383789062, "step": 45840 }, { "epoch": 2.1286967825804357, "grad_norm": 50.87214660644531, "learning_rate": 1.4538743674265286e-07, "logits/chosen": -18.681095123291016, "logits/rejected": -17.268497467041016, "logps/chosen": -389.6741638183594, "logps/rejected": -194.50125122070312, "loss": 0.5253, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.788412570953369, "rewards/margins": 2.5903677940368652, "rewards/rejected": 1.1980445384979248, "step": 45850 }, { "epoch": 2.1291610566878685, "grad_norm": 83.65766143798828, "learning_rate": 1.4531005772474735e-07, "logits/chosen": -18.778045654296875, "logits/rejected": -18.274545669555664, "logps/chosen": -323.40057373046875, "logps/rejected": -250.99887084960938, "loss": 0.6586, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.3364455699920654, "rewards/margins": 1.0862629413604736, "rewards/rejected": 1.2501826286315918, "step": 45860 }, { "epoch": 2.1296253307953017, "grad_norm": 50.74257278442383, "learning_rate": 
1.4523267870684183e-07, "logits/chosen": -18.552059173583984, "logits/rejected": -18.13469886779785, "logps/chosen": -386.0132751464844, "logps/rejected": -339.72100830078125, "loss": 0.8032, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.914055347442627, "rewards/margins": 0.7278011441230774, "rewards/rejected": 2.1862542629241943, "step": 45870 }, { "epoch": 2.1300896049027345, "grad_norm": 5.771464824676514, "learning_rate": 1.4515529968893634e-07, "logits/chosen": -19.35615348815918, "logits/rejected": -18.816375732421875, "logps/chosen": -406.649658203125, "logps/rejected": -471.33087158203125, "loss": 0.8495, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 4.412426471710205, "rewards/margins": 0.5748969316482544, "rewards/rejected": 3.8375296592712402, "step": 45880 }, { "epoch": 2.1305538790101677, "grad_norm": 1.673797369003296, "learning_rate": 1.4507792067103083e-07, "logits/chosen": -18.975025177001953, "logits/rejected": -18.301807403564453, "logps/chosen": -374.162353515625, "logps/rejected": -282.10711669921875, "loss": 0.6598, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.2891926765441895, "rewards/margins": 2.0180625915527344, "rewards/rejected": 1.2711297273635864, "step": 45890 }, { "epoch": 2.1310181531176005, "grad_norm": 190.55271911621094, "learning_rate": 1.4500054165312534e-07, "logits/chosen": -19.31876564025879, "logits/rejected": -18.360523223876953, "logps/chosen": -361.522705078125, "logps/rejected": -314.5412292480469, "loss": 0.8255, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6166083812713623, "rewards/margins": 1.270673394203186, "rewards/rejected": 2.3459341526031494, "step": 45900 }, { "epoch": 2.1314824272250337, "grad_norm": 89.61202239990234, "learning_rate": 1.4492316263521982e-07, "logits/chosen": -19.658714294433594, "logits/rejected": -18.81875991821289, "logps/chosen": -396.2729187011719, "logps/rejected": -377.994873046875, "loss": 0.5052, "rewards/accuracies": 
0.699999988079071, "rewards/chosen": 3.434131145477295, "rewards/margins": 0.9119476079940796, "rewards/rejected": 2.5221831798553467, "step": 45910 }, { "epoch": 2.131946701332467, "grad_norm": 15.616262435913086, "learning_rate": 1.448457836173143e-07, "logits/chosen": -19.534893035888672, "logits/rejected": -19.004287719726562, "logps/chosen": -437.76641845703125, "logps/rejected": -376.9309997558594, "loss": 1.0891, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.498631715774536, "rewards/margins": 0.594323992729187, "rewards/rejected": 2.9043078422546387, "step": 45920 }, { "epoch": 2.1324109754398997, "grad_norm": 233.69210815429688, "learning_rate": 1.4476840459940882e-07, "logits/chosen": -19.493549346923828, "logits/rejected": -19.393489837646484, "logps/chosen": -472.14862060546875, "logps/rejected": -470.48175048828125, "loss": 1.0736, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.861510753631592, "rewards/margins": 0.4167497754096985, "rewards/rejected": 3.444761276245117, "step": 45930 }, { "epoch": 2.132875249547333, "grad_norm": 137.33505249023438, "learning_rate": 1.446910255815033e-07, "logits/chosen": -19.726520538330078, "logits/rejected": -19.084749221801758, "logps/chosen": -424.61944580078125, "logps/rejected": -382.005126953125, "loss": 0.8685, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.714421272277832, "rewards/margins": 1.2282021045684814, "rewards/rejected": 3.4862194061279297, "step": 45940 }, { "epoch": 2.1333395236547656, "grad_norm": 12.088990211486816, "learning_rate": 1.4461364656359782e-07, "logits/chosen": -19.370981216430664, "logits/rejected": -17.955230712890625, "logps/chosen": -498.74176025390625, "logps/rejected": -401.080322265625, "loss": 0.6713, "rewards/accuracies": 0.5, "rewards/chosen": 4.590513229370117, "rewards/margins": 1.3827979564666748, "rewards/rejected": 3.2077155113220215, "step": 45950 }, { "epoch": 2.133803797762199, "grad_norm": 25.36074447631836, 
"learning_rate": 1.445362675456923e-07, "logits/chosen": -19.61597442626953, "logits/rejected": -18.724689483642578, "logps/chosen": -335.0513610839844, "logps/rejected": -279.57696533203125, "loss": 0.5263, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.801062822341919, "rewards/margins": 1.6476056575775146, "rewards/rejected": 1.1534572839736938, "step": 45960 }, { "epoch": 2.1342680718696316, "grad_norm": 45.19428253173828, "learning_rate": 1.4445888852778679e-07, "logits/chosen": -19.58597755432129, "logits/rejected": -18.31032943725586, "logps/chosen": -478.3109436035156, "logps/rejected": -350.41705322265625, "loss": 0.599, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.601428508758545, "rewards/margins": 1.0786844491958618, "rewards/rejected": 2.5227444171905518, "step": 45970 }, { "epoch": 2.134732345977065, "grad_norm": 10.17039680480957, "learning_rate": 1.443815095098813e-07, "logits/chosen": -19.625526428222656, "logits/rejected": -18.66333770751953, "logps/chosen": -353.83184814453125, "logps/rejected": -271.14251708984375, "loss": 0.6854, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5044445991516113, "rewards/margins": 1.4328961372375488, "rewards/rejected": 2.0715479850769043, "step": 45980 }, { "epoch": 2.135196620084498, "grad_norm": 0.050891779363155365, "learning_rate": 1.4430413049197578e-07, "logits/chosen": -20.158405303955078, "logits/rejected": -18.37545394897461, "logps/chosen": -490.531982421875, "logps/rejected": -343.3045654296875, "loss": 0.2733, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.706017971038818, "rewards/margins": 3.4448986053466797, "rewards/rejected": 2.2611193656921387, "step": 45990 }, { "epoch": 2.135660894191931, "grad_norm": 0.7387182116508484, "learning_rate": 1.442267514740703e-07, "logits/chosen": -19.596210479736328, "logits/rejected": -18.31707191467285, "logps/chosen": -400.0845642089844, "logps/rejected": -307.501708984375, "loss": 0.3917, 
"rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.07666015625, "rewards/margins": 1.8576968908309937, "rewards/rejected": 1.218963384628296, "step": 46000 }, { "epoch": 2.136125168299364, "grad_norm": 238.3279571533203, "learning_rate": 1.441493724561648e-07, "logits/chosen": -18.775291442871094, "logits/rejected": -18.752233505249023, "logps/chosen": -369.33978271484375, "logps/rejected": -345.5730285644531, "loss": 0.735, "rewards/accuracies": 0.5, "rewards/chosen": 2.8973281383514404, "rewards/margins": 0.12472172826528549, "rewards/rejected": 2.772606611251831, "step": 46010 }, { "epoch": 2.136589442406797, "grad_norm": 154.08580017089844, "learning_rate": 1.4407199343825926e-07, "logits/chosen": -19.317197799682617, "logits/rejected": -18.205493927001953, "logps/chosen": -295.56890869140625, "logps/rejected": -262.7839050292969, "loss": 0.5571, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.5634822845458984, "rewards/margins": 0.7269500494003296, "rewards/rejected": 1.8365322351455688, "step": 46020 }, { "epoch": 2.13705371651423, "grad_norm": 20.278921127319336, "learning_rate": 1.4399461442035377e-07, "logits/chosen": -20.03090476989746, "logits/rejected": -18.741674423217773, "logps/chosen": -443.12841796875, "logps/rejected": -354.30718994140625, "loss": 0.3488, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.138667583465576, "rewards/margins": 1.8519542217254639, "rewards/rejected": 2.2867133617401123, "step": 46030 }, { "epoch": 2.137517990621663, "grad_norm": 212.0060577392578, "learning_rate": 1.4391723540244826e-07, "logits/chosen": -19.967823028564453, "logits/rejected": -19.219038009643555, "logps/chosen": -410.92572021484375, "logps/rejected": -365.17987060546875, "loss": 0.7228, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.0740580558776855, "rewards/margins": 0.9004462957382202, "rewards/rejected": 3.173612117767334, "step": 46040 }, { "epoch": 2.137982264729096, "grad_norm": 
25.60675811767578, "learning_rate": 1.4383985638454277e-07, "logits/chosen": -19.14558982849121, "logits/rejected": -18.37433624267578, "logps/chosen": -383.6414489746094, "logps/rejected": -303.66046142578125, "loss": 0.4212, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.834991931915283, "rewards/margins": 1.9512698650360107, "rewards/rejected": 1.8837215900421143, "step": 46050 }, { "epoch": 2.1384465388365292, "grad_norm": 0.16936741769313812, "learning_rate": 1.4376247736663728e-07, "logits/chosen": -18.974193572998047, "logits/rejected": -18.145002365112305, "logps/chosen": -405.23455810546875, "logps/rejected": -370.4970703125, "loss": 1.0163, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.3011627197265625, "rewards/margins": 1.4595305919647217, "rewards/rejected": 2.841632843017578, "step": 46060 }, { "epoch": 2.138910812943962, "grad_norm": 25.223054885864258, "learning_rate": 1.4368509834873174e-07, "logits/chosen": -20.520227432250977, "logits/rejected": -19.829761505126953, "logps/chosen": -363.6697998046875, "logps/rejected": -269.86761474609375, "loss": 0.6408, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.2922768592834473, "rewards/margins": 0.7673166990280151, "rewards/rejected": 2.5249600410461426, "step": 46070 }, { "epoch": 2.1393750870513952, "grad_norm": 19.504297256469727, "learning_rate": 1.4360771933082625e-07, "logits/chosen": -18.455204010009766, "logits/rejected": -18.57023048400879, "logps/chosen": -395.8484802246094, "logps/rejected": -407.9023132324219, "loss": 1.4999, "rewards/accuracies": 0.5, "rewards/chosen": 3.3623898029327393, "rewards/margins": 0.10046157985925674, "rewards/rejected": 3.2619285583496094, "step": 46080 }, { "epoch": 2.139839361158828, "grad_norm": 310.18328857421875, "learning_rate": 1.4353034031292074e-07, "logits/chosen": -18.215978622436523, "logits/rejected": -17.906856536865234, "logps/chosen": -446.27313232421875, "logps/rejected": -429.88055419921875, "loss": 
0.8679, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.380087852478027, "rewards/margins": 1.240225076675415, "rewards/rejected": 3.1398630142211914, "step": 46090 }, { "epoch": 2.140303635266261, "grad_norm": 95.29527282714844, "learning_rate": 1.4345296129501525e-07, "logits/chosen": -18.770374298095703, "logits/rejected": -18.370647430419922, "logps/chosen": -434.42059326171875, "logps/rejected": -529.0655517578125, "loss": 0.5444, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.6590569019317627, "rewards/margins": 1.150445580482483, "rewards/rejected": 2.5086112022399902, "step": 46100 }, { "epoch": 2.1407679093736944, "grad_norm": 44.332759857177734, "learning_rate": 1.433755822771097e-07, "logits/chosen": -19.6805477142334, "logits/rejected": -18.34903335571289, "logps/chosen": -442.2062072753906, "logps/rejected": -347.956787109375, "loss": 0.2789, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.791243076324463, "rewards/margins": 1.844622254371643, "rewards/rejected": 2.9466211795806885, "step": 46110 }, { "epoch": 2.141232183481127, "grad_norm": 21.336198806762695, "learning_rate": 1.4329820325920422e-07, "logits/chosen": -19.688846588134766, "logits/rejected": -18.266727447509766, "logps/chosen": -341.95587158203125, "logps/rejected": -256.4734802246094, "loss": 0.6155, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.004363775253296, "rewards/margins": 1.042407751083374, "rewards/rejected": 1.9619560241699219, "step": 46120 }, { "epoch": 2.1416964575885604, "grad_norm": 46.21236801147461, "learning_rate": 1.4322082424129873e-07, "logits/chosen": -18.85411834716797, "logits/rejected": -17.859745025634766, "logps/chosen": -386.2826843261719, "logps/rejected": -270.4510803222656, "loss": 0.6125, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9190433025360107, "rewards/margins": 1.2697029113769531, "rewards/rejected": 1.6493403911590576, "step": 46130 }, { "epoch": 2.142160731695993, 
"grad_norm": 17.140987396240234, "learning_rate": 1.4314344522339321e-07, "logits/chosen": -19.36490821838379, "logits/rejected": -18.24747085571289, "logps/chosen": -391.01922607421875, "logps/rejected": -305.67584228515625, "loss": 0.4132, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.4338784217834473, "rewards/margins": 1.952455759048462, "rewards/rejected": 0.4814225137233734, "step": 46140 }, { "epoch": 2.1426250058034264, "grad_norm": 9.686246871948242, "learning_rate": 1.4306606620548772e-07, "logits/chosen": -19.38071060180664, "logits/rejected": -17.47662925720215, "logps/chosen": -390.5254821777344, "logps/rejected": -234.2592315673828, "loss": 0.4201, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.321837902069092, "rewards/margins": 2.0958340167999268, "rewards/rejected": 1.2260041236877441, "step": 46150 }, { "epoch": 2.143089279910859, "grad_norm": 77.95231628417969, "learning_rate": 1.429886871875822e-07, "logits/chosen": -19.78970718383789, "logits/rejected": -18.917064666748047, "logps/chosen": -404.77142333984375, "logps/rejected": -295.75811767578125, "loss": 0.539, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.075303554534912, "rewards/margins": 2.1974034309387207, "rewards/rejected": 1.8779001235961914, "step": 46160 }, { "epoch": 2.1435535540182924, "grad_norm": 38.30781936645508, "learning_rate": 1.429113081696767e-07, "logits/chosen": -19.656890869140625, "logits/rejected": -18.538944244384766, "logps/chosen": -381.10589599609375, "logps/rejected": -317.6774597167969, "loss": 0.2975, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.7390332221984863, "rewards/margins": 1.8494513034820557, "rewards/rejected": 1.8895819187164307, "step": 46170 }, { "epoch": 2.1440178281257256, "grad_norm": 2.171563148498535, "learning_rate": 1.428339291517712e-07, "logits/chosen": -19.062782287597656, "logits/rejected": -18.348865509033203, "logps/chosen": -415.855224609375, "logps/rejected": 
-377.50958251953125, "loss": 0.6994, "rewards/accuracies": 0.5, "rewards/chosen": 4.177062034606934, "rewards/margins": 0.6670879125595093, "rewards/rejected": 3.5099735260009766, "step": 46180 }, { "epoch": 2.1444821022331584, "grad_norm": 47.63816833496094, "learning_rate": 1.427565501338657e-07, "logits/chosen": -18.996196746826172, "logits/rejected": -18.45761489868164, "logps/chosen": -323.7903747558594, "logps/rejected": -359.76165771484375, "loss": 0.9146, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.826514959335327, "rewards/margins": 0.20474401116371155, "rewards/rejected": 2.6217708587646484, "step": 46190 }, { "epoch": 2.1449463763405916, "grad_norm": 118.43742370605469, "learning_rate": 1.426791711159602e-07, "logits/chosen": -18.812284469604492, "logits/rejected": -17.96322250366211, "logps/chosen": -267.2968444824219, "logps/rejected": -256.8349609375, "loss": 0.6316, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.209796905517578, "rewards/margins": 0.7322725653648376, "rewards/rejected": 1.4775245189666748, "step": 46200 }, { "epoch": 2.1454106504480244, "grad_norm": 26.30538558959961, "learning_rate": 1.4260179209805469e-07, "logits/chosen": -19.794490814208984, "logits/rejected": -18.628189086914062, "logps/chosen": -273.35504150390625, "logps/rejected": -225.22048950195312, "loss": 0.7193, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.0275750160217285, "rewards/margins": 1.0714385509490967, "rewards/rejected": 0.9561364054679871, "step": 46210 }, { "epoch": 2.1458749245554576, "grad_norm": 37.50614929199219, "learning_rate": 1.4252441308014917e-07, "logits/chosen": -20.427701950073242, "logits/rejected": -19.35603904724121, "logps/chosen": -481.41033935546875, "logps/rejected": -424.00946044921875, "loss": 0.451, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.269471168518066, "rewards/margins": 1.6278202533721924, "rewards/rejected": 2.641651153564453, "step": 46220 }, { "epoch": 
2.146339198662891, "grad_norm": 98.15292358398438, "learning_rate": 1.4244703406224368e-07, "logits/chosen": -19.839401245117188, "logits/rejected": -19.3060359954834, "logps/chosen": -483.70465087890625, "logps/rejected": -345.85845947265625, "loss": 0.733, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.9688656330108643, "rewards/margins": 0.5322455167770386, "rewards/rejected": 3.4366202354431152, "step": 46230 }, { "epoch": 2.1468034727703236, "grad_norm": 84.04419708251953, "learning_rate": 1.4236965504433817e-07, "logits/chosen": -18.891084671020508, "logits/rejected": -18.964988708496094, "logps/chosen": -390.59869384765625, "logps/rejected": -376.3833312988281, "loss": 1.2132, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.17008900642395, "rewards/margins": 0.37288764119148254, "rewards/rejected": 2.797201156616211, "step": 46240 }, { "epoch": 2.147267746877757, "grad_norm": 66.13687133789062, "learning_rate": 1.4229227602643268e-07, "logits/chosen": -20.168460845947266, "logits/rejected": -20.320932388305664, "logps/chosen": -311.51544189453125, "logps/rejected": -296.9961242675781, "loss": 1.0646, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.8391740322113037, "rewards/margins": -0.2750023305416107, "rewards/rejected": 3.1141762733459473, "step": 46250 }, { "epoch": 2.1477320209851896, "grad_norm": 34.55917739868164, "learning_rate": 1.4221489700852716e-07, "logits/chosen": -18.15304946899414, "logits/rejected": -16.948965072631836, "logps/chosen": -449.0482482910156, "logps/rejected": -307.4422912597656, "loss": 0.2309, "rewards/accuracies": 1.0, "rewards/chosen": 3.3772075176239014, "rewards/margins": 2.0208983421325684, "rewards/rejected": 1.356309175491333, "step": 46260 }, { "epoch": 2.1481962950926228, "grad_norm": 69.4583740234375, "learning_rate": 1.4213751799062165e-07, "logits/chosen": -20.048297882080078, "logits/rejected": -19.572711944580078, "logps/chosen": -505.933349609375, "logps/rejected": 
-379.03564453125, "loss": 0.3358, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.586602210998535, "rewards/margins": 1.7220468521118164, "rewards/rejected": 2.864555597305298, "step": 46270 }, { "epoch": 2.1486605692000555, "grad_norm": 165.09901428222656, "learning_rate": 1.4206013897271616e-07, "logits/chosen": -19.946338653564453, "logits/rejected": -17.83637809753418, "logps/chosen": -482.0338439941406, "logps/rejected": -252.62612915039062, "loss": 0.2299, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.205108642578125, "rewards/margins": 2.5185062885284424, "rewards/rejected": 1.686602234840393, "step": 46280 }, { "epoch": 2.1491248433074888, "grad_norm": 1.7097110748291016, "learning_rate": 1.4198275995481064e-07, "logits/chosen": -19.234447479248047, "logits/rejected": -17.891164779663086, "logps/chosen": -344.7555236816406, "logps/rejected": -233.95242309570312, "loss": 0.2771, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.124780654907227, "rewards/margins": 2.4663565158843994, "rewards/rejected": 1.6584237813949585, "step": 46290 }, { "epoch": 2.149589117414922, "grad_norm": 73.30511474609375, "learning_rate": 1.4190538093690516e-07, "logits/chosen": -19.731931686401367, "logits/rejected": -19.109323501586914, "logps/chosen": -245.5128173828125, "logps/rejected": -218.6890106201172, "loss": 0.5736, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.4499189853668213, "rewards/margins": 0.6543445587158203, "rewards/rejected": 0.795574426651001, "step": 46300 }, { "epoch": 2.1500533915223548, "grad_norm": 20.495927810668945, "learning_rate": 1.418357398207902e-07, "logits/chosen": -18.447702407836914, "logits/rejected": -17.85324478149414, "logps/chosen": -383.52740478515625, "logps/rejected": -297.83380126953125, "loss": 0.6011, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.3889126777648926, "rewards/margins": 1.3607916831970215, "rewards/rejected": 1.0281208753585815, "step": 46310 }, { 
"epoch": 2.150517665629788, "grad_norm": 44.62126541137695, "learning_rate": 1.417583608028847e-07, "logits/chosen": -20.925012588500977, "logits/rejected": -19.411869049072266, "logps/chosen": -475.6736755371094, "logps/rejected": -372.45269775390625, "loss": 0.6817, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.5145158767700195, "rewards/margins": 1.0793026685714722, "rewards/rejected": 3.435213565826416, "step": 46320 }, { "epoch": 2.1509819397372207, "grad_norm": 3.2832694053649902, "learning_rate": 1.4168098178497918e-07, "logits/chosen": -19.58987045288086, "logits/rejected": -18.314899444580078, "logps/chosen": -478.76031494140625, "logps/rejected": -302.7521057128906, "loss": 0.4239, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.917888164520264, "rewards/margins": 2.4442734718322754, "rewards/rejected": 2.47361421585083, "step": 46330 }, { "epoch": 2.151446213844654, "grad_norm": 1.3349967002868652, "learning_rate": 1.4160360276707366e-07, "logits/chosen": -18.362316131591797, "logits/rejected": -17.703201293945312, "logps/chosen": -396.39605712890625, "logps/rejected": -359.67889404296875, "loss": 0.834, "rewards/accuracies": 0.5, "rewards/chosen": 3.4641849994659424, "rewards/margins": 1.1072442531585693, "rewards/rejected": 2.356940507888794, "step": 46340 }, { "epoch": 2.1519104879520867, "grad_norm": 65.96430206298828, "learning_rate": 1.4152622374916817e-07, "logits/chosen": -19.381681442260742, "logits/rejected": -18.481822967529297, "logps/chosen": -483.22119140625, "logps/rejected": -323.92010498046875, "loss": 0.6114, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.116278648376465, "rewards/margins": 1.286070466041565, "rewards/rejected": 2.8302078247070312, "step": 46350 }, { "epoch": 2.15237476205952, "grad_norm": 26.39028549194336, "learning_rate": 1.4144884473126268e-07, "logits/chosen": -19.333887100219727, "logits/rejected": -19.67647933959961, "logps/chosen": -373.420166015625, "logps/rejected": 
-382.6513671875, "loss": 0.8862, "rewards/accuracies": 0.5, "rewards/chosen": 3.1432104110717773, "rewards/margins": 0.3440927565097809, "rewards/rejected": 2.7991175651550293, "step": 46360 }, { "epoch": 2.152839036166953, "grad_norm": 0.7922843098640442, "learning_rate": 1.4137146571335714e-07, "logits/chosen": -18.839134216308594, "logits/rejected": -17.12610626220703, "logps/chosen": -495.2667541503906, "logps/rejected": -280.51348876953125, "loss": 0.2143, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.448665142059326, "rewards/margins": 2.849243640899658, "rewards/rejected": 1.5994219779968262, "step": 46370 }, { "epoch": 2.153303310274386, "grad_norm": 42.260799407958984, "learning_rate": 1.4129408669545165e-07, "logits/chosen": -19.350839614868164, "logits/rejected": -18.42849349975586, "logps/chosen": -383.61346435546875, "logps/rejected": -269.57464599609375, "loss": 0.7297, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.402503252029419, "rewards/margins": 1.6305935382843018, "rewards/rejected": 1.7719099521636963, "step": 46380 }, { "epoch": 2.153767584381819, "grad_norm": 102.74724578857422, "learning_rate": 1.4121670767754614e-07, "logits/chosen": -19.271146774291992, "logits/rejected": -18.948028564453125, "logps/chosen": -286.5395202636719, "logps/rejected": -263.11785888671875, "loss": 0.4602, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.70520281791687, "rewards/margins": 1.3644112348556519, "rewards/rejected": 2.3407914638519287, "step": 46390 }, { "epoch": 2.154231858489252, "grad_norm": 183.1321563720703, "learning_rate": 1.4113932865964065e-07, "logits/chosen": -18.405353546142578, "logits/rejected": -19.01955223083496, "logps/chosen": -343.0476379394531, "logps/rejected": -433.5264587402344, "loss": 1.4586, "rewards/accuracies": 0.5, "rewards/chosen": 2.865600824356079, "rewards/margins": -0.6557713747024536, "rewards/rejected": 3.5213723182678223, "step": 46400 }, { "epoch": 2.154696132596685, 
"grad_norm": 1.9720526933670044, "learning_rate": 1.4106194964173516e-07, "logits/chosen": -19.106426239013672, "logits/rejected": -18.393390655517578, "logps/chosen": -468.28204345703125, "logps/rejected": -393.3132019042969, "loss": 0.5127, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.677059173583984, "rewards/margins": 1.0652652978897095, "rewards/rejected": 3.6117939949035645, "step": 46410 }, { "epoch": 2.155160406704118, "grad_norm": 82.36841583251953, "learning_rate": 1.4098457062382962e-07, "logits/chosen": -20.368087768554688, "logits/rejected": -19.851449966430664, "logps/chosen": -491.1832580566406, "logps/rejected": -461.164306640625, "loss": 0.2013, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.784062385559082, "rewards/margins": 2.0549309253692627, "rewards/rejected": 3.729130983352661, "step": 46420 }, { "epoch": 2.155624680811551, "grad_norm": 7.222930431365967, "learning_rate": 1.4090719160592413e-07, "logits/chosen": -19.197181701660156, "logits/rejected": -17.77008628845215, "logps/chosen": -373.2122802734375, "logps/rejected": -235.7384033203125, "loss": 0.5358, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.63059401512146, "rewards/margins": 1.6189517974853516, "rewards/rejected": 1.0116422176361084, "step": 46430 }, { "epoch": 2.1560889549189843, "grad_norm": 142.75706481933594, "learning_rate": 1.4082981258801861e-07, "logits/chosen": -19.193124771118164, "logits/rejected": -19.654970169067383, "logps/chosen": -399.3094787597656, "logps/rejected": -370.4019775390625, "loss": 1.5575, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.4646174907684326, "rewards/margins": 0.1052815169095993, "rewards/rejected": 3.3593356609344482, "step": 46440 }, { "epoch": 2.156553229026417, "grad_norm": 174.398681640625, "learning_rate": 1.4075243357011313e-07, "logits/chosen": -18.761775970458984, "logits/rejected": -17.990341186523438, "logps/chosen": -475.70849609375, "logps/rejected": 
-376.68389892578125, "loss": 0.6209, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.062679290771484, "rewards/margins": 1.671534776687622, "rewards/rejected": 2.3911447525024414, "step": 46450 }, { "epoch": 2.1570175031338503, "grad_norm": 50.65518569946289, "learning_rate": 1.4067505455220764e-07, "logits/chosen": -19.108890533447266, "logits/rejected": -19.54135513305664, "logps/chosen": -452.4269104003906, "logps/rejected": -461.0865173339844, "loss": 1.1539, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.963308811187744, "rewards/margins": 0.41113823652267456, "rewards/rejected": 3.5521702766418457, "step": 46460 }, { "epoch": 2.157481777241283, "grad_norm": 52.55311584472656, "learning_rate": 1.405976755343021e-07, "logits/chosen": -18.69312858581543, "logits/rejected": -18.228092193603516, "logps/chosen": -384.8632507324219, "logps/rejected": -320.8753967285156, "loss": 1.102, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.989391803741455, "rewards/margins": 0.44073209166526794, "rewards/rejected": 2.5486598014831543, "step": 46470 }, { "epoch": 2.1579460513487163, "grad_norm": 117.91228485107422, "learning_rate": 1.405202965163966e-07, "logits/chosen": -19.49454689025879, "logits/rejected": -18.88559341430664, "logps/chosen": -483.6858825683594, "logps/rejected": -361.907470703125, "loss": 0.4627, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.367393493652344, "rewards/margins": 1.153743863105774, "rewards/rejected": 3.2136497497558594, "step": 46480 }, { "epoch": 2.158410325456149, "grad_norm": 1.300036072731018, "learning_rate": 1.404429174984911e-07, "logits/chosen": -18.782644271850586, "logits/rejected": -18.641263961791992, "logps/chosen": -361.14996337890625, "logps/rejected": -327.764892578125, "loss": 1.2442, "rewards/accuracies": 0.5, "rewards/chosen": 3.574734926223755, "rewards/margins": 1.0691020488739014, "rewards/rejected": 2.5056324005126953, "step": 46490 }, { "epoch": 
2.1588745995635823, "grad_norm": 42.691856384277344, "learning_rate": 1.403655384805856e-07, "logits/chosen": -18.421279907226562, "logits/rejected": -18.202037811279297, "logps/chosen": -317.51947021484375, "logps/rejected": -326.7822265625, "loss": 0.824, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9074459075927734, "rewards/margins": 1.2219270467758179, "rewards/rejected": 1.6855186223983765, "step": 46500 }, { "epoch": 2.1593388736710155, "grad_norm": 142.6796875, "learning_rate": 1.4028815946268011e-07, "logits/chosen": -20.14849281311035, "logits/rejected": -18.79172134399414, "logps/chosen": -480.56378173828125, "logps/rejected": -344.5960388183594, "loss": 0.4854, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.160391807556152, "rewards/margins": 1.403314232826233, "rewards/rejected": 2.757077932357788, "step": 46510 }, { "epoch": 2.1598031477784483, "grad_norm": 41.225425720214844, "learning_rate": 1.4021078044477457e-07, "logits/chosen": -19.173215866088867, "logits/rejected": -17.744823455810547, "logps/chosen": -405.9722595214844, "logps/rejected": -310.8084716796875, "loss": 0.4642, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.8861308097839355, "rewards/margins": 1.7132043838500977, "rewards/rejected": 2.172926425933838, "step": 46520 }, { "epoch": 2.1602674218858815, "grad_norm": 99.0025405883789, "learning_rate": 1.4013340142686908e-07, "logits/chosen": -20.183361053466797, "logits/rejected": -19.89126968383789, "logps/chosen": -509.9781188964844, "logps/rejected": -491.24334716796875, "loss": 0.5059, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.940517425537109, "rewards/margins": 1.8742790222167969, "rewards/rejected": 3.0662384033203125, "step": 46530 }, { "epoch": 2.1607316959933143, "grad_norm": 171.42669677734375, "learning_rate": 1.4005602240896357e-07, "logits/chosen": -19.60957145690918, "logits/rejected": -19.6337947845459, "logps/chosen": -444.7891540527344, "logps/rejected": 
-403.85400390625, "loss": 1.0475, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 3.919931411743164, "rewards/margins": 0.12903182208538055, "rewards/rejected": 3.7908992767333984, "step": 46540 }, { "epoch": 2.1611959701007475, "grad_norm": 33.89794921875, "learning_rate": 1.3997864339105808e-07, "logits/chosen": -19.411518096923828, "logits/rejected": -19.297374725341797, "logps/chosen": -392.6081237792969, "logps/rejected": -389.1720886230469, "loss": 0.7894, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2065868377685547, "rewards/margins": 0.6609011292457581, "rewards/rejected": 2.5456857681274414, "step": 46550 }, { "epoch": 2.1616602442081807, "grad_norm": 54.7841796875, "learning_rate": 1.399012643731526e-07, "logits/chosen": -19.383861541748047, "logits/rejected": -18.765588760375977, "logps/chosen": -406.00555419921875, "logps/rejected": -324.3957214355469, "loss": 0.5293, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9399943351745605, "rewards/margins": 1.2573871612548828, "rewards/rejected": 1.6826069355010986, "step": 46560 }, { "epoch": 2.1621245183156135, "grad_norm": 1.4763015508651733, "learning_rate": 1.3982388535524705e-07, "logits/chosen": -19.96225929260254, "logits/rejected": -19.01470947265625, "logps/chosen": -381.5290222167969, "logps/rejected": -280.3420104980469, "loss": 0.6688, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.7314155101776123, "rewards/margins": 1.1978834867477417, "rewards/rejected": 2.53353214263916, "step": 46570 }, { "epoch": 2.1625887924230467, "grad_norm": 86.91130828857422, "learning_rate": 1.3974650633734156e-07, "logits/chosen": -19.156890869140625, "logits/rejected": -17.529541015625, "logps/chosen": -509.9849548339844, "logps/rejected": -264.1738586425781, "loss": 0.731, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2619717121124268, "rewards/margins": 1.6112102270126343, "rewards/rejected": 1.6507612466812134, "step": 46580 }, { "epoch": 
2.1630530665304795, "grad_norm": 25.462209701538086, "learning_rate": 1.3966912731943607e-07, "logits/chosen": -18.81632423400879, "logits/rejected": -18.102855682373047, "logps/chosen": -406.03973388671875, "logps/rejected": -323.4173889160156, "loss": 0.3047, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.5184597969055176, "rewards/margins": 1.7245609760284424, "rewards/rejected": 1.7938982248306274, "step": 46590 }, { "epoch": 2.1635173406379127, "grad_norm": 1.7526495456695557, "learning_rate": 1.3959174830153056e-07, "logits/chosen": -18.39836883544922, "logits/rejected": -17.59967613220215, "logps/chosen": -308.74102783203125, "logps/rejected": -209.42477416992188, "loss": 0.4606, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.2061500549316406, "rewards/margins": 1.446323037147522, "rewards/rejected": 0.7598272562026978, "step": 46600 }, { "epoch": 2.1639816147453454, "grad_norm": 24.991737365722656, "learning_rate": 1.3951436928362504e-07, "logits/chosen": -18.979290008544922, "logits/rejected": -17.10334014892578, "logps/chosen": -357.64215087890625, "logps/rejected": -246.4036865234375, "loss": 0.4477, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.259636402130127, "rewards/margins": 1.5542864799499512, "rewards/rejected": 0.7053501009941101, "step": 46610 }, { "epoch": 2.1644458888527787, "grad_norm": 271.7593078613281, "learning_rate": 1.3943699026571953e-07, "logits/chosen": -18.362302780151367, "logits/rejected": -18.353330612182617, "logps/chosen": -293.2906188964844, "logps/rejected": -330.2109375, "loss": 0.8002, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.262176036834717, "rewards/margins": 0.2954323887825012, "rewards/rejected": 1.9667437076568604, "step": 46620 }, { "epoch": 2.164910162960212, "grad_norm": 52.51002502441406, "learning_rate": 1.3935961124781404e-07, "logits/chosen": -18.55731201171875, "logits/rejected": -17.95389175415039, "logps/chosen": -364.43304443359375, 
"logps/rejected": -256.4346008300781, "loss": 0.6007, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.801595449447632, "rewards/margins": 0.7552834153175354, "rewards/rejected": 2.046312093734741, "step": 46630 }, { "epoch": 2.1653744370676447, "grad_norm": 117.46411895751953, "learning_rate": 1.3928223222990855e-07, "logits/chosen": -20.334758758544922, "logits/rejected": -19.64153289794922, "logps/chosen": -415.6639709472656, "logps/rejected": -346.84222412109375, "loss": 0.3551, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.378235816955566, "rewards/margins": 1.6552995443344116, "rewards/rejected": 2.722935676574707, "step": 46640 }, { "epoch": 2.165838711175078, "grad_norm": 18.92319679260254, "learning_rate": 1.3920485321200303e-07, "logits/chosen": -19.629281997680664, "logits/rejected": -19.141094207763672, "logps/chosen": -459.5296936035156, "logps/rejected": -494.4903259277344, "loss": 1.3749, "rewards/accuracies": 0.5, "rewards/chosen": 3.750917434692383, "rewards/margins": -0.35349178314208984, "rewards/rejected": 4.104409217834473, "step": 46650 }, { "epoch": 2.1663029852825106, "grad_norm": 217.1620635986328, "learning_rate": 1.3912747419409752e-07, "logits/chosen": -19.402109146118164, "logits/rejected": -19.403568267822266, "logps/chosen": -359.75103759765625, "logps/rejected": -377.7682800292969, "loss": 1.1193, "rewards/accuracies": 0.5, "rewards/chosen": 2.1265463829040527, "rewards/margins": -0.13605792820453644, "rewards/rejected": 2.262603998184204, "step": 46660 }, { "epoch": 2.166767259389944, "grad_norm": 166.97116088867188, "learning_rate": 1.39050095176192e-07, "logits/chosen": -19.70707130432129, "logits/rejected": -18.836952209472656, "logps/chosen": -340.6947021484375, "logps/rejected": -303.63775634765625, "loss": 0.3166, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.8153915405273438, "rewards/margins": 1.9543901681900024, "rewards/rejected": 1.8610013723373413, "step": 46670 }, { "epoch": 
2.167231533497377, "grad_norm": 2.2112653255462646, "learning_rate": 1.3897271615828651e-07, "logits/chosen": -19.071544647216797, "logits/rejected": -18.4522705078125, "logps/chosen": -447.7386779785156, "logps/rejected": -440.88165283203125, "loss": 0.574, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.336587429046631, "rewards/margins": 1.6700788736343384, "rewards/rejected": 3.666508913040161, "step": 46680 }, { "epoch": 2.16769580760481, "grad_norm": 45.25165557861328, "learning_rate": 1.3889533714038103e-07, "logits/chosen": -19.05141830444336, "logits/rejected": -18.946212768554688, "logps/chosen": -357.0248718261719, "logps/rejected": -321.8647155761719, "loss": 0.6825, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0022013187408447, "rewards/margins": 0.6259311437606812, "rewards/rejected": 2.376270055770874, "step": 46690 }, { "epoch": 2.168160081712243, "grad_norm": 32.53641891479492, "learning_rate": 1.388179581224755e-07, "logits/chosen": -18.412588119506836, "logits/rejected": -18.2658748626709, "logps/chosen": -286.68206787109375, "logps/rejected": -262.7809753417969, "loss": 0.5286, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.241224765777588, "rewards/margins": 1.2404173612594604, "rewards/rejected": 2.000807523727417, "step": 46700 }, { "epoch": 2.168624355819676, "grad_norm": 195.1881866455078, "learning_rate": 1.3874057910457e-07, "logits/chosen": -18.371341705322266, "logits/rejected": -18.414072036743164, "logps/chosen": -286.13262939453125, "logps/rejected": -297.21221923828125, "loss": 1.4108, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.425337314605713, "rewards/margins": -0.3356664180755615, "rewards/rejected": 2.7610039710998535, "step": 46710 }, { "epoch": 2.169088629927109, "grad_norm": 16.031085968017578, "learning_rate": 1.3866320008666448e-07, "logits/chosen": -18.752574920654297, "logits/rejected": -18.72779655456543, "logps/chosen": -418.54571533203125, "logps/rejected": 
-389.2054138183594, "loss": 0.7572, "rewards/accuracies": 0.5, "rewards/chosen": 4.016878604888916, "rewards/margins": 0.3723260462284088, "rewards/rejected": 3.644552707672119, "step": 46720 }, { "epoch": 2.169552904034542, "grad_norm": 13.032320976257324, "learning_rate": 1.38585821068759e-07, "logits/chosen": -19.33095359802246, "logits/rejected": -18.227609634399414, "logps/chosen": -321.03948974609375, "logps/rejected": -218.80996704101562, "loss": 0.5032, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.966649293899536, "rewards/margins": 1.8032976388931274, "rewards/rejected": 1.1633517742156982, "step": 46730 }, { "epoch": 2.170017178141975, "grad_norm": 13.572994232177734, "learning_rate": 1.385084420508535e-07, "logits/chosen": -19.364139556884766, "logits/rejected": -18.7166748046875, "logps/chosen": -356.7471618652344, "logps/rejected": -314.6592102050781, "loss": 0.7649, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.7249369621276855, "rewards/margins": 0.8005515336990356, "rewards/rejected": 1.924385666847229, "step": 46740 }, { "epoch": 2.1704814522494082, "grad_norm": 132.9683837890625, "learning_rate": 1.38431063032948e-07, "logits/chosen": -19.23305892944336, "logits/rejected": -18.676651000976562, "logps/chosen": -342.79400634765625, "logps/rejected": -367.19525146484375, "loss": 1.0346, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.638972759246826, "rewards/margins": 0.564242959022522, "rewards/rejected": 2.0747299194335938, "step": 46750 }, { "epoch": 2.170945726356841, "grad_norm": 53.2081298828125, "learning_rate": 1.3835368401504247e-07, "logits/chosen": -18.563146591186523, "logits/rejected": -18.47612190246582, "logps/chosen": -424.18841552734375, "logps/rejected": -393.24945068359375, "loss": 0.8221, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.416574478149414, "rewards/margins": 1.0474101305007935, "rewards/rejected": 2.36916446685791, "step": 46760 }, { "epoch": 
2.1714100004642742, "grad_norm": 58.583492279052734, "learning_rate": 1.3827630499713696e-07, "logits/chosen": -18.73186683654785, "logits/rejected": -18.27032470703125, "logps/chosen": -416.57012939453125, "logps/rejected": -368.0399475097656, "loss": 0.8488, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.836977481842041, "rewards/margins": 0.9268752932548523, "rewards/rejected": 1.910102128982544, "step": 46770 }, { "epoch": 2.171874274571707, "grad_norm": 6.134212017059326, "learning_rate": 1.3819892597923147e-07, "logits/chosen": -19.405742645263672, "logits/rejected": -18.41720199584961, "logps/chosen": -433.70806884765625, "logps/rejected": -325.290283203125, "loss": 0.6208, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.959157943725586, "rewards/margins": 1.6337331533432007, "rewards/rejected": 2.325424909591675, "step": 46780 }, { "epoch": 2.1723385486791402, "grad_norm": 52.280311584472656, "learning_rate": 1.3812154696132598e-07, "logits/chosen": -19.306941986083984, "logits/rejected": -18.48255729675293, "logps/chosen": -343.53948974609375, "logps/rejected": -318.37664794921875, "loss": 1.1098, "rewards/accuracies": 0.5, "rewards/chosen": 2.8841593265533447, "rewards/margins": 0.22880029678344727, "rewards/rejected": 2.6553590297698975, "step": 46790 }, { "epoch": 2.172802822786573, "grad_norm": 215.19935607910156, "learning_rate": 1.3804416794342044e-07, "logits/chosen": -18.69332504272461, "logits/rejected": -18.371440887451172, "logps/chosen": -478.7552185058594, "logps/rejected": -462.2110290527344, "loss": 1.0059, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.111708164215088, "rewards/margins": 0.7181158065795898, "rewards/rejected": 3.393592119216919, "step": 46800 }, { "epoch": 2.173267096894006, "grad_norm": 218.91880798339844, "learning_rate": 1.3796678892551495e-07, "logits/chosen": -19.388269424438477, "logits/rejected": -18.811283111572266, "logps/chosen": -351.06549072265625, "logps/rejected": 
-405.6315002441406, "loss": 0.5588, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1945011615753174, "rewards/margins": 0.634817898273468, "rewards/rejected": 2.5596835613250732, "step": 46810 }, { "epoch": 2.1737313710014394, "grad_norm": 58.953880310058594, "learning_rate": 1.3788940990760943e-07, "logits/chosen": -19.6583251953125, "logits/rejected": -18.889747619628906, "logps/chosen": -505.2826232910156, "logps/rejected": -392.66424560546875, "loss": 0.6665, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.9513773918151855, "rewards/margins": 1.6944503784179688, "rewards/rejected": 2.256927251815796, "step": 46820 }, { "epoch": 2.174195645108872, "grad_norm": 77.29151916503906, "learning_rate": 1.3781203088970395e-07, "logits/chosen": -18.465137481689453, "logits/rejected": -18.678396224975586, "logps/chosen": -378.77178955078125, "logps/rejected": -327.6981506347656, "loss": 1.1705, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.3765530586242676, "rewards/margins": -0.23804469406604767, "rewards/rejected": 2.6145973205566406, "step": 46830 }, { "epoch": 2.1746599192163054, "grad_norm": 28.199050903320312, "learning_rate": 1.3773465187179846e-07, "logits/chosen": -20.397602081298828, "logits/rejected": -19.73828887939453, "logps/chosen": -329.7938537597656, "logps/rejected": -293.0541076660156, "loss": 0.869, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0472354888916016, "rewards/margins": 0.6372033357620239, "rewards/rejected": 2.410032033920288, "step": 46840 }, { "epoch": 2.175124193323738, "grad_norm": 203.6134490966797, "learning_rate": 1.3765727285389292e-07, "logits/chosen": -19.05495834350586, "logits/rejected": -18.77170181274414, "logps/chosen": -317.3575134277344, "logps/rejected": -301.97314453125, "loss": 1.5973, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.0024266242980957, "rewards/margins": -0.7843674421310425, "rewards/rejected": 2.7867941856384277, "step": 46850 }, { 
"epoch": 2.1755884674311714, "grad_norm": 0.03245452791452408, "learning_rate": 1.3757989383598743e-07, "logits/chosen": -19.551265716552734, "logits/rejected": -18.43201446533203, "logps/chosen": -454.09930419921875, "logps/rejected": -434.16033935546875, "loss": 0.7747, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 5.264081001281738, "rewards/margins": 1.5839182138442993, "rewards/rejected": 3.6801624298095703, "step": 46860 }, { "epoch": 2.176052741538604, "grad_norm": 280.6514892578125, "learning_rate": 1.375025148180819e-07, "logits/chosen": -19.038925170898438, "logits/rejected": -18.35886001586914, "logps/chosen": -286.11712646484375, "logps/rejected": -284.259521484375, "loss": 0.7975, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.165787696838379, "rewards/margins": 1.6002042293548584, "rewards/rejected": 1.5655837059020996, "step": 46870 }, { "epoch": 2.1765170156460374, "grad_norm": 43.85625457763672, "learning_rate": 1.3742513580017642e-07, "logits/chosen": -18.3233585357666, "logits/rejected": -17.868850708007812, "logps/chosen": -343.913818359375, "logps/rejected": -290.61419677734375, "loss": 0.7483, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.027841091156006, "rewards/margins": 1.429673433303833, "rewards/rejected": 1.5981680154800415, "step": 46880 }, { "epoch": 2.1769812897534706, "grad_norm": 91.00689697265625, "learning_rate": 1.3734775678227093e-07, "logits/chosen": -19.464052200317383, "logits/rejected": -17.41324234008789, "logps/chosen": -399.46856689453125, "logps/rejected": -258.83709716796875, "loss": 0.4136, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 5.54840087890625, "rewards/margins": 2.688368320465088, "rewards/rejected": 2.860032796859741, "step": 46890 }, { "epoch": 2.1774455638609034, "grad_norm": 146.29013061523438, "learning_rate": 1.372703777643654e-07, "logits/chosen": -19.19491958618164, "logits/rejected": -18.974939346313477, "logps/chosen": -355.02410888671875, 
"logps/rejected": -318.7579650878906, "loss": 0.6916, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.9925537109375, "rewards/margins": 1.540747046470642, "rewards/rejected": 1.451806664466858, "step": 46900 }, { "epoch": 2.1779098379683366, "grad_norm": 77.59335327148438, "learning_rate": 1.371929987464599e-07, "logits/chosen": -19.314462661743164, "logits/rejected": -17.794368743896484, "logps/chosen": -351.81610107421875, "logps/rejected": -213.73934936523438, "loss": 0.5027, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.7512848377227783, "rewards/margins": 1.9580094814300537, "rewards/rejected": 1.7932755947113037, "step": 46910 }, { "epoch": 2.1783741120757694, "grad_norm": 67.48815155029297, "learning_rate": 1.371156197285544e-07, "logits/chosen": -19.292951583862305, "logits/rejected": -18.936262130737305, "logps/chosen": -365.4549560546875, "logps/rejected": -356.12286376953125, "loss": 0.7322, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.259995937347412, "rewards/margins": 0.7199826836585999, "rewards/rejected": 1.5400134325027466, "step": 46920 }, { "epoch": 2.1788383861832026, "grad_norm": 16.97645378112793, "learning_rate": 1.370382407106489e-07, "logits/chosen": -19.043338775634766, "logits/rejected": -18.46576499938965, "logps/chosen": -351.3094787597656, "logps/rejected": -304.8255310058594, "loss": 0.9074, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.746100902557373, "rewards/margins": 0.6909027099609375, "rewards/rejected": 3.0551981925964355, "step": 46930 }, { "epoch": 2.179302660290636, "grad_norm": 115.12787628173828, "learning_rate": 1.369608616927434e-07, "logits/chosen": -18.559040069580078, "logits/rejected": -17.136127471923828, "logps/chosen": -356.3940734863281, "logps/rejected": -222.1490936279297, "loss": 0.3366, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.709357261657715, "rewards/margins": 2.3493404388427734, "rewards/rejected": 0.36001691222190857, 
"step": 46940 }, { "epoch": 2.1797669343980686, "grad_norm": 95.53494262695312, "learning_rate": 1.3688348267483787e-07, "logits/chosen": -19.494291305541992, "logits/rejected": -18.049755096435547, "logps/chosen": -511.5767517089844, "logps/rejected": -353.60491943359375, "loss": 0.591, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.448147773742676, "rewards/margins": 1.9980003833770752, "rewards/rejected": 2.4501469135284424, "step": 46950 }, { "epoch": 2.180231208505502, "grad_norm": 31.028772354125977, "learning_rate": 1.3680610365693238e-07, "logits/chosen": -18.673887252807617, "logits/rejected": -17.963518142700195, "logps/chosen": -301.66046142578125, "logps/rejected": -220.1142120361328, "loss": 0.4595, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.5387825965881348, "rewards/margins": 1.578590989112854, "rewards/rejected": 0.9601919054985046, "step": 46960 }, { "epoch": 2.1806954826129346, "grad_norm": 12.428231239318848, "learning_rate": 1.3672872463902687e-07, "logits/chosen": -19.11641502380371, "logits/rejected": -17.84213638305664, "logps/chosen": -397.1878356933594, "logps/rejected": -319.5691833496094, "loss": 0.4195, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.86032772064209, "rewards/margins": 2.2415404319763184, "rewards/rejected": 2.6187872886657715, "step": 46970 }, { "epoch": 2.1811597567203678, "grad_norm": 3.579716444015503, "learning_rate": 1.3665134562112138e-07, "logits/chosen": -20.181743621826172, "logits/rejected": -18.117835998535156, "logps/chosen": -450.6318359375, "logps/rejected": -289.848388671875, "loss": 0.5804, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.298709392547607, "rewards/margins": 1.8615005016326904, "rewards/rejected": 2.437209129333496, "step": 46980 }, { "epoch": 2.1816240308278005, "grad_norm": 189.69564819335938, "learning_rate": 1.365739666032159e-07, "logits/chosen": -19.08487319946289, "logits/rejected": -18.473419189453125, "logps/chosen": 
-367.7419738769531, "logps/rejected": -304.63250732421875, "loss": 0.7404, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.685884952545166, "rewards/margins": 0.5894829034805298, "rewards/rejected": 2.096402168273926, "step": 46990 }, { "epoch": 2.1820883049352338, "grad_norm": 20.582868576049805, "learning_rate": 1.3649658758531035e-07, "logits/chosen": -20.01116943359375, "logits/rejected": -18.71200942993164, "logps/chosen": -377.1781005859375, "logps/rejected": -297.76788330078125, "loss": 0.4755, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.824997663497925, "rewards/margins": 1.9373500347137451, "rewards/rejected": 1.8876478672027588, "step": 47000 }, { "epoch": 2.182552579042667, "grad_norm": 128.4571075439453, "learning_rate": 1.3641920856740486e-07, "logits/chosen": -18.175683975219727, "logits/rejected": -17.990741729736328, "logps/chosen": -381.1490783691406, "logps/rejected": -386.07513427734375, "loss": 0.6672, "rewards/accuracies": 0.5, "rewards/chosen": 3.341411590576172, "rewards/margins": 1.4741847515106201, "rewards/rejected": 1.8672269582748413, "step": 47010 }, { "epoch": 2.1830168531500997, "grad_norm": 12.513261795043945, "learning_rate": 1.3634182954949934e-07, "logits/chosen": -20.57489585876465, "logits/rejected": -19.686342239379883, "logps/chosen": -406.19805908203125, "logps/rejected": -380.47918701171875, "loss": 0.3593, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.042436599731445, "rewards/margins": 2.397637128829956, "rewards/rejected": 1.6447992324829102, "step": 47020 }, { "epoch": 2.183481127257533, "grad_norm": 120.58574676513672, "learning_rate": 1.3626445053159385e-07, "logits/chosen": -18.66900634765625, "logits/rejected": -17.772964477539062, "logps/chosen": -329.15423583984375, "logps/rejected": -255.2848358154297, "loss": 0.8524, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.828228712081909, "rewards/margins": 1.579574704170227, "rewards/rejected": 
1.248653769493103, "step": 47030 }, { "epoch": 2.1839454013649657, "grad_norm": 93.61547088623047, "learning_rate": 1.3618707151368834e-07, "logits/chosen": -19.848867416381836, "logits/rejected": -18.473752975463867, "logps/chosen": -339.7508544921875, "logps/rejected": -231.5645294189453, "loss": 0.3896, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.069528341293335, "rewards/margins": 1.8551013469696045, "rewards/rejected": 1.2144269943237305, "step": 47040 }, { "epoch": 2.184409675472399, "grad_norm": 162.7008056640625, "learning_rate": 1.3610969249578282e-07, "logits/chosen": -19.21293067932129, "logits/rejected": -18.21261215209961, "logps/chosen": -377.137939453125, "logps/rejected": -320.75811767578125, "loss": 0.3826, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.007136106491089, "rewards/margins": 1.5838391780853271, "rewards/rejected": 1.4232970476150513, "step": 47050 }, { "epoch": 2.184873949579832, "grad_norm": 159.81202697753906, "learning_rate": 1.3603231347787733e-07, "logits/chosen": -18.847537994384766, "logits/rejected": -19.340213775634766, "logps/chosen": -450.46466064453125, "logps/rejected": -579.2865600585938, "loss": 1.2025, "rewards/accuracies": 0.5, "rewards/chosen": 4.231733798980713, "rewards/margins": -0.027734851464629173, "rewards/rejected": 4.2594685554504395, "step": 47060 }, { "epoch": 2.185338223687265, "grad_norm": 38.086124420166016, "learning_rate": 1.3595493445997182e-07, "logits/chosen": -19.829147338867188, "logits/rejected": -19.18856430053711, "logps/chosen": -427.4956970214844, "logps/rejected": -360.39093017578125, "loss": 0.2432, "rewards/accuracies": 1.0, "rewards/chosen": 3.95585298538208, "rewards/margins": 1.865896224975586, "rewards/rejected": 2.089956760406494, "step": 47070 }, { "epoch": 2.185802497794698, "grad_norm": 150.30844116210938, "learning_rate": 1.3587755544206633e-07, "logits/chosen": -18.80493927001953, "logits/rejected": -19.100574493408203, "logps/chosen": 
-335.302734375, "logps/rejected": -330.11688232421875, "loss": 0.8846, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.0103650093078613, "rewards/margins": 0.6571086645126343, "rewards/rejected": 2.3532564640045166, "step": 47080 }, { "epoch": 2.186266771902131, "grad_norm": 47.401466369628906, "learning_rate": 1.3580017642416082e-07, "logits/chosen": -19.394062042236328, "logits/rejected": -19.262720108032227, "logps/chosen": -437.4080505371094, "logps/rejected": -416.94610595703125, "loss": 0.9096, "rewards/accuracies": 0.5, "rewards/chosen": 4.0088348388671875, "rewards/margins": 0.437549352645874, "rewards/rejected": 3.5712857246398926, "step": 47090 }, { "epoch": 2.186731046009564, "grad_norm": 103.76119232177734, "learning_rate": 1.357227974062553e-07, "logits/chosen": -18.674823760986328, "logits/rejected": -18.063899993896484, "logps/chosen": -422.88690185546875, "logps/rejected": -319.0794372558594, "loss": 0.9355, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.09554386138916, "rewards/margins": 0.8181180953979492, "rewards/rejected": 3.2774252891540527, "step": 47100 }, { "epoch": 2.187195320116997, "grad_norm": 14.17915153503418, "learning_rate": 1.356454183883498e-07, "logits/chosen": -19.25905990600586, "logits/rejected": -18.59836769104004, "logps/chosen": -423.8990173339844, "logps/rejected": -347.893310546875, "loss": 0.2016, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.20550537109375, "rewards/margins": 2.4659957885742188, "rewards/rejected": 1.7395099401474, "step": 47110 }, { "epoch": 2.18765959422443, "grad_norm": 35.806583404541016, "learning_rate": 1.355680393704443e-07, "logits/chosen": -18.483905792236328, "logits/rejected": -17.852087020874023, "logps/chosen": -363.74700927734375, "logps/rejected": -351.6943359375, "loss": 0.8532, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.331117630004883, "rewards/margins": 1.186509370803833, "rewards/rejected": 2.14460825920105, "step": 47120 
}, { "epoch": 2.1881238683318633, "grad_norm": 100.6536636352539, "learning_rate": 1.354906603525388e-07, "logits/chosen": -18.8695125579834, "logits/rejected": -18.686147689819336, "logps/chosen": -375.54327392578125, "logps/rejected": -382.56298828125, "loss": 0.949, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.249518632888794, "rewards/margins": 0.36950066685676575, "rewards/rejected": 2.8800177574157715, "step": 47130 }, { "epoch": 2.188588142439296, "grad_norm": 21.626012802124023, "learning_rate": 1.354132813346333e-07, "logits/chosen": -19.0687313079834, "logits/rejected": -18.483272552490234, "logps/chosen": -351.5306701660156, "logps/rejected": -264.75762939453125, "loss": 0.5465, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.301345348358154, "rewards/margins": 2.2532548904418945, "rewards/rejected": 2.0480904579162598, "step": 47140 }, { "epoch": 2.1890524165467293, "grad_norm": 11.323012351989746, "learning_rate": 1.3533590231672778e-07, "logits/chosen": -18.289630889892578, "logits/rejected": -19.354747772216797, "logps/chosen": -328.49029541015625, "logps/rejected": -345.76220703125, "loss": 1.7071, "rewards/accuracies": 0.5, "rewards/chosen": 2.261898994445801, "rewards/margins": -0.8682994842529297, "rewards/rejected": 3.1301987171173096, "step": 47150 }, { "epoch": 2.189516690654162, "grad_norm": 128.29940795898438, "learning_rate": 1.352585232988223e-07, "logits/chosen": -19.534082412719727, "logits/rejected": -18.92845344543457, "logps/chosen": -473.12713623046875, "logps/rejected": -399.9187927246094, "loss": 0.6383, "rewards/accuracies": 0.5, "rewards/chosen": 3.844088315963745, "rewards/margins": 0.964270293712616, "rewards/rejected": 2.8798179626464844, "step": 47160 }, { "epoch": 2.1899809647615953, "grad_norm": 69.35325622558594, "learning_rate": 1.3518114428091677e-07, "logits/chosen": -19.003570556640625, "logits/rejected": -19.13974380493164, "logps/chosen": -436.3741149902344, "logps/rejected": 
-360.7523498535156, "loss": 0.5985, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.529613971710205, "rewards/margins": 0.6791614294052124, "rewards/rejected": 1.8504524230957031, "step": 47170 }, { "epoch": 2.190445238869028, "grad_norm": 152.83079528808594, "learning_rate": 1.3510376526301129e-07, "logits/chosen": -18.528217315673828, "logits/rejected": -18.404727935791016, "logps/chosen": -400.5928039550781, "logps/rejected": -321.7173767089844, "loss": 0.9278, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.878239631652832, "rewards/margins": -0.07658664882183075, "rewards/rejected": 2.9548263549804688, "step": 47180 }, { "epoch": 2.1909095129764613, "grad_norm": 0.5343432426452637, "learning_rate": 1.3502638624510577e-07, "logits/chosen": -18.843914031982422, "logits/rejected": -18.055492401123047, "logps/chosen": -286.6153564453125, "logps/rejected": -260.51385498046875, "loss": 0.5827, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2139225006103516, "rewards/margins": 1.2680573463439941, "rewards/rejected": 1.9458653926849365, "step": 47190 }, { "epoch": 2.1913737870838945, "grad_norm": 101.76986694335938, "learning_rate": 1.3494900722720025e-07, "logits/chosen": -18.480806350708008, "logits/rejected": -18.34929847717285, "logps/chosen": -283.14849853515625, "logps/rejected": -242.89431762695312, "loss": 0.4784, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.127628803253174, "rewards/margins": 1.1188380718231201, "rewards/rejected": 1.008790373802185, "step": 47200 }, { "epoch": 2.1918380611913273, "grad_norm": 74.14558410644531, "learning_rate": 1.3487162820929477e-07, "logits/chosen": -18.840965270996094, "logits/rejected": -17.877376556396484, "logps/chosen": -325.0279235839844, "logps/rejected": -200.95118713378906, "loss": 0.5078, "rewards/accuracies": 0.5, "rewards/chosen": 2.929811477661133, "rewards/margins": 1.5181944370269775, "rewards/rejected": 1.4116170406341553, "step": 47210 }, { "epoch": 
2.1923023352987605, "grad_norm": 22.88888168334961, "learning_rate": 1.3479424919138928e-07, "logits/chosen": -20.068967819213867, "logits/rejected": -18.982982635498047, "logps/chosen": -335.6365051269531, "logps/rejected": -227.60531616210938, "loss": 0.2851, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.5920348167419434, "rewards/margins": 2.001619815826416, "rewards/rejected": 1.5904147624969482, "step": 47220 }, { "epoch": 2.1927666094061933, "grad_norm": 141.3307342529297, "learning_rate": 1.3471687017348376e-07, "logits/chosen": -18.94137191772461, "logits/rejected": -18.292264938354492, "logps/chosen": -501.2371520996094, "logps/rejected": -380.36669921875, "loss": 0.3979, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.3543753623962402, "rewards/margins": 1.9450132846832275, "rewards/rejected": 1.409362554550171, "step": 47230 }, { "epoch": 2.1932308835136265, "grad_norm": 1.33689546585083, "learning_rate": 1.3463949115557825e-07, "logits/chosen": -19.568801879882812, "logits/rejected": -19.0760440826416, "logps/chosen": -292.89349365234375, "logps/rejected": -222.0282745361328, "loss": 0.9847, "rewards/accuracies": 0.5, "rewards/chosen": 2.1526381969451904, "rewards/margins": 0.49731722474098206, "rewards/rejected": 1.6553208827972412, "step": 47240 }, { "epoch": 2.1936951576210593, "grad_norm": 43.568511962890625, "learning_rate": 1.3456211213767273e-07, "logits/chosen": -19.96590232849121, "logits/rejected": -18.44107437133789, "logps/chosen": -372.0172119140625, "logps/rejected": -270.69683837890625, "loss": 0.3959, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.830878496170044, "rewards/margins": 1.3550808429718018, "rewards/rejected": 2.475797176361084, "step": 47250 }, { "epoch": 2.1941594317284925, "grad_norm": 28.50288200378418, "learning_rate": 1.3448473311976724e-07, "logits/chosen": -19.069944381713867, "logits/rejected": -18.746116638183594, "logps/chosen": -358.98052978515625, "logps/rejected": 
-295.12774658203125, "loss": 0.9137, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0372157096862793, "rewards/margins": 1.130002737045288, "rewards/rejected": 1.9072134494781494, "step": 47260 }, { "epoch": 2.1946237058359257, "grad_norm": 10.134218215942383, "learning_rate": 1.3440735410186175e-07, "logits/chosen": -19.28738021850586, "logits/rejected": -17.1041202545166, "logps/chosen": -495.34637451171875, "logps/rejected": -303.45159912109375, "loss": 0.19, "rewards/accuracies": 1.0, "rewards/chosen": 4.917529106140137, "rewards/margins": 3.203056812286377, "rewards/rejected": 1.714472770690918, "step": 47270 }, { "epoch": 2.1950879799433585, "grad_norm": 0.5891667008399963, "learning_rate": 1.343299750839562e-07, "logits/chosen": -19.6625919342041, "logits/rejected": -18.807310104370117, "logps/chosen": -358.771484375, "logps/rejected": -349.6980895996094, "loss": 1.0105, "rewards/accuracies": 0.5, "rewards/chosen": 2.290640354156494, "rewards/margins": 0.43769851326942444, "rewards/rejected": 1.8529422283172607, "step": 47280 }, { "epoch": 2.1955522540507917, "grad_norm": 17.232398986816406, "learning_rate": 1.3425259606605072e-07, "logits/chosen": -18.679048538208008, "logits/rejected": -17.5390625, "logps/chosen": -290.85284423828125, "logps/rejected": -192.58529663085938, "loss": 0.1876, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.556567668914795, "rewards/margins": 2.5617756843566895, "rewards/rejected": 0.9947921633720398, "step": 47290 }, { "epoch": 2.1960165281582245, "grad_norm": 37.02775573730469, "learning_rate": 1.341752170481452e-07, "logits/chosen": -19.04609489440918, "logits/rejected": -18.185091018676758, "logps/chosen": -416.59112548828125, "logps/rejected": -326.3624572753906, "loss": 0.3815, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.888702869415283, "rewards/margins": 1.0938674211502075, "rewards/rejected": 1.7948356866836548, "step": 47300 }, { "epoch": 2.1964808022656577, "grad_norm": 
1.1400226354599, "learning_rate": 1.3409783803023972e-07, "logits/chosen": -19.727184295654297, "logits/rejected": -18.214170455932617, "logps/chosen": -372.27862548828125, "logps/rejected": -256.988525390625, "loss": 0.5229, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.460200786590576, "rewards/margins": 2.0302329063415527, "rewards/rejected": 2.4299676418304443, "step": 47310 }, { "epoch": 2.1969450763730904, "grad_norm": 121.5804214477539, "learning_rate": 1.3402045901233423e-07, "logits/chosen": -18.579439163208008, "logits/rejected": -18.42180824279785, "logps/chosen": -363.1866760253906, "logps/rejected": -363.39447021484375, "loss": 0.8843, "rewards/accuracies": 0.5, "rewards/chosen": 3.5486557483673096, "rewards/margins": 0.2167131006717682, "rewards/rejected": 3.331942319869995, "step": 47320 }, { "epoch": 2.1974093504805237, "grad_norm": 0.21394236385822296, "learning_rate": 1.339430799944287e-07, "logits/chosen": -18.55978775024414, "logits/rejected": -17.5582332611084, "logps/chosen": -349.9202575683594, "logps/rejected": -239.88827514648438, "loss": 0.4128, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9739270210266113, "rewards/margins": 2.0567338466644287, "rewards/rejected": 0.9171932339668274, "step": 47330 }, { "epoch": 2.197873624587957, "grad_norm": 33.431129455566406, "learning_rate": 1.338657009765232e-07, "logits/chosen": -18.95195198059082, "logits/rejected": -17.87932586669922, "logps/chosen": -479.3331604003906, "logps/rejected": -371.10052490234375, "loss": 0.8841, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.7634358406066895, "rewards/margins": 1.2442247867584229, "rewards/rejected": 3.5192108154296875, "step": 47340 }, { "epoch": 2.1983378986953896, "grad_norm": 80.98392486572266, "learning_rate": 1.3378832195861769e-07, "logits/chosen": -18.651622772216797, "logits/rejected": -18.19332504272461, "logps/chosen": -402.9142150878906, "logps/rejected": -320.6359558105469, "loss": 0.5437, 
"rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.8751628398895264, "rewards/margins": 0.8519479632377625, "rewards/rejected": 2.023214817047119, "step": 47350 }, { "epoch": 2.198802172802823, "grad_norm": 127.7263412475586, "learning_rate": 1.337109429407122e-07, "logits/chosen": -19.159448623657227, "logits/rejected": -19.307634353637695, "logps/chosen": -376.7716979980469, "logps/rejected": -434.781005859375, "loss": 1.0648, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.0436880588531494, "rewards/margins": -0.3507593274116516, "rewards/rejected": 3.3944478034973145, "step": 47360 }, { "epoch": 2.1992664469102556, "grad_norm": 185.29859924316406, "learning_rate": 1.336335639228067e-07, "logits/chosen": -18.584749221801758, "logits/rejected": -17.62420654296875, "logps/chosen": -422.4560546875, "logps/rejected": -331.22064208984375, "loss": 0.8597, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.4364051818847656, "rewards/margins": 1.459566354751587, "rewards/rejected": 1.9768390655517578, "step": 47370 }, { "epoch": 2.199730721017689, "grad_norm": 241.8072509765625, "learning_rate": 1.3355618490490117e-07, "logits/chosen": -18.02478790283203, "logits/rejected": -18.076122283935547, "logps/chosen": -270.4791564941406, "logps/rejected": -277.59259033203125, "loss": 0.9891, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.468759059906006, "rewards/margins": 0.6862072944641113, "rewards/rejected": 1.7825520038604736, "step": 47380 }, { "epoch": 2.200194995125122, "grad_norm": 203.5984649658203, "learning_rate": 1.3347880588699568e-07, "logits/chosen": -19.529109954833984, "logits/rejected": -18.968814849853516, "logps/chosen": -440.5359802246094, "logps/rejected": -390.7411193847656, "loss": 1.3085, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.22363018989563, "rewards/margins": 0.028489088639616966, "rewards/rejected": 3.195141315460205, "step": 47390 }, { "epoch": 2.200659269232555, "grad_norm": 
0.19116109609603882, "learning_rate": 1.3340142686909016e-07, "logits/chosen": -19.101816177368164, "logits/rejected": -17.9091796875, "logps/chosen": -555.6968994140625, "logps/rejected": -410.60186767578125, "loss": 0.6446, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.694055557250977, "rewards/margins": 1.5925469398498535, "rewards/rejected": 3.101508617401123, "step": 47400 }, { "epoch": 2.201123543339988, "grad_norm": 12.315177917480469, "learning_rate": 1.3332404785118467e-07, "logits/chosen": -19.000919342041016, "logits/rejected": -18.25889778137207, "logps/chosen": -314.05975341796875, "logps/rejected": -232.0290985107422, "loss": 0.5464, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.4325828552246094, "rewards/margins": 1.4440540075302124, "rewards/rejected": 0.988528847694397, "step": 47410 }, { "epoch": 2.201587817447421, "grad_norm": 31.602584838867188, "learning_rate": 1.3324666883327919e-07, "logits/chosen": -19.5512752532959, "logits/rejected": -19.74164581298828, "logps/chosen": -330.63128662109375, "logps/rejected": -321.338623046875, "loss": 1.1357, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.033782958984375, "rewards/margins": 0.44177117943763733, "rewards/rejected": 2.5920119285583496, "step": 47420 }, { "epoch": 2.202052091554854, "grad_norm": 66.02809143066406, "learning_rate": 1.3316928981537364e-07, "logits/chosen": -18.370372772216797, "logits/rejected": -17.84622573852539, "logps/chosen": -343.7940368652344, "logps/rejected": -297.05364990234375, "loss": 0.4886, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.5446901321411133, "rewards/margins": 0.9116263389587402, "rewards/rejected": 1.6330639123916626, "step": 47430 }, { "epoch": 2.202516365662287, "grad_norm": 190.73728942871094, "learning_rate": 1.3309191079746816e-07, "logits/chosen": -18.399585723876953, "logits/rejected": -18.224748611450195, "logps/chosen": -307.7901916503906, "logps/rejected": -332.09930419921875, "loss": 
0.9842, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.5370726585388184, "rewards/margins": 0.5336312055587769, "rewards/rejected": 2.003441572189331, "step": 47440 }, { "epoch": 2.20298063976972, "grad_norm": 127.36519622802734, "learning_rate": 1.3301453177956264e-07, "logits/chosen": -18.554302215576172, "logits/rejected": -18.008243560791016, "logps/chosen": -289.59527587890625, "logps/rejected": -250.66104125976562, "loss": 0.5904, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.3020806312561035, "rewards/margins": 1.1526676416397095, "rewards/rejected": 1.1494128704071045, "step": 47450 }, { "epoch": 2.2034449138771532, "grad_norm": 0.619354248046875, "learning_rate": 1.3293715276165715e-07, "logits/chosen": -19.45944595336914, "logits/rejected": -18.349712371826172, "logps/chosen": -467.60748291015625, "logps/rejected": -374.0882873535156, "loss": 0.2481, "rewards/accuracies": 1.0, "rewards/chosen": 5.630837440490723, "rewards/margins": 2.3586597442626953, "rewards/rejected": 3.2721774578094482, "step": 47460 }, { "epoch": 2.203909187984586, "grad_norm": 125.88345336914062, "learning_rate": 1.3285977374375166e-07, "logits/chosen": -18.36502456665039, "logits/rejected": -17.500957489013672, "logps/chosen": -386.52099609375, "logps/rejected": -299.4201965332031, "loss": 0.3413, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.413630247116089, "rewards/margins": 1.9353396892547607, "rewards/rejected": 1.4782906770706177, "step": 47470 }, { "epoch": 2.2043734620920192, "grad_norm": 24.18565559387207, "learning_rate": 1.3278239472584612e-07, "logits/chosen": -18.956798553466797, "logits/rejected": -17.936176300048828, "logps/chosen": -371.70867919921875, "logps/rejected": -244.35665893554688, "loss": 0.3533, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.7275009155273438, "rewards/margins": 1.4288551807403564, "rewards/rejected": 1.2986454963684082, "step": 47480 }, { "epoch": 2.204837736199452, "grad_norm": 
36.50209045410156, "learning_rate": 1.3270501570794063e-07, "logits/chosen": -19.68804359436035, "logits/rejected": -18.320575714111328, "logps/chosen": -311.6837158203125, "logps/rejected": -252.20156860351562, "loss": 0.9221, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.402937412261963, "rewards/margins": 0.71379554271698, "rewards/rejected": 1.6891415119171143, "step": 47490 }, { "epoch": 2.205302010306885, "grad_norm": 2.942322254180908, "learning_rate": 1.3262763669003512e-07, "logits/chosen": -18.567567825317383, "logits/rejected": -17.417587280273438, "logps/chosen": -452.385009765625, "logps/rejected": -246.4429168701172, "loss": 0.5522, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.3642241954803467, "rewards/margins": 1.5296287536621094, "rewards/rejected": 1.8345954418182373, "step": 47500 }, { "epoch": 2.2057662844143184, "grad_norm": 50.72163391113281, "learning_rate": 1.3255025767212963e-07, "logits/chosen": -18.36956787109375, "logits/rejected": -18.828838348388672, "logps/chosen": -330.8645935058594, "logps/rejected": -336.6202392578125, "loss": 0.6531, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.315798282623291, "rewards/margins": 1.1596542596817017, "rewards/rejected": 2.1561439037323, "step": 47510 }, { "epoch": 2.206230558521751, "grad_norm": 22.13621711730957, "learning_rate": 1.3247287865422411e-07, "logits/chosen": -19.25922393798828, "logits/rejected": -18.439434051513672, "logps/chosen": -386.7960510253906, "logps/rejected": -317.6695251464844, "loss": 0.8648, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.801287651062012, "rewards/margins": 1.0435044765472412, "rewards/rejected": 3.7577831745147705, "step": 47520 }, { "epoch": 2.2066948326291844, "grad_norm": 254.1313018798828, "learning_rate": 1.323954996363186e-07, "logits/chosen": -18.81332015991211, "logits/rejected": -18.411365509033203, "logps/chosen": -452.9635314941406, "logps/rejected": -406.1042785644531, "loss": 
0.9255, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.372034072875977, "rewards/margins": 0.7717176079750061, "rewards/rejected": 3.6003165245056152, "step": 47530 }, { "epoch": 2.207159106736617, "grad_norm": 3.431727409362793, "learning_rate": 1.323181206184131e-07, "logits/chosen": -18.212221145629883, "logits/rejected": -17.12118148803711, "logps/chosen": -380.31396484375, "logps/rejected": -272.1005554199219, "loss": 0.6758, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.198428630828857, "rewards/margins": 2.054161787033081, "rewards/rejected": 2.1442670822143555, "step": 47540 }, { "epoch": 2.2076233808440504, "grad_norm": 105.39350128173828, "learning_rate": 1.322407416005076e-07, "logits/chosen": -19.29475212097168, "logits/rejected": -17.949024200439453, "logps/chosen": -445.671875, "logps/rejected": -312.57440185546875, "loss": 0.8379, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.2764854431152344, "rewards/margins": 1.627401351928711, "rewards/rejected": 1.6490840911865234, "step": 47550 }, { "epoch": 2.208087654951483, "grad_norm": 200.01651000976562, "learning_rate": 1.321633625826021e-07, "logits/chosen": -19.4636287689209, "logits/rejected": -18.688915252685547, "logps/chosen": -430.9768981933594, "logps/rejected": -350.41741943359375, "loss": 0.6946, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.243382692337036, "rewards/margins": 0.816148579120636, "rewards/rejected": 2.427233934402466, "step": 47560 }, { "epoch": 2.2085519290589164, "grad_norm": 78.85716247558594, "learning_rate": 1.320859835646966e-07, "logits/chosen": -18.480716705322266, "logits/rejected": -18.51784896850586, "logps/chosen": -305.26361083984375, "logps/rejected": -281.3788757324219, "loss": 0.8855, "rewards/accuracies": 0.5, "rewards/chosen": 2.6351797580718994, "rewards/margins": 0.7109566926956177, "rewards/rejected": 1.9242231845855713, "step": 47570 }, { "epoch": 2.2090162031663496, "grad_norm": 59.91316223144531, 
"learning_rate": 1.3200860454679108e-07, "logits/chosen": -19.51751708984375, "logits/rejected": -18.880859375, "logps/chosen": -391.63067626953125, "logps/rejected": -343.3740539550781, "loss": 0.7743, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.405682325363159, "rewards/margins": 0.28130626678466797, "rewards/rejected": 3.124375820159912, "step": 47580 }, { "epoch": 2.2094804772737824, "grad_norm": 111.38314819335938, "learning_rate": 1.3193122552888559e-07, "logits/chosen": -18.88515281677246, "logits/rejected": -18.039196014404297, "logps/chosen": -404.2290954589844, "logps/rejected": -276.44708251953125, "loss": 0.3465, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.438525438308716, "rewards/margins": 1.7804899215698242, "rewards/rejected": 1.6580352783203125, "step": 47590 }, { "epoch": 2.2099447513812156, "grad_norm": 57.902130126953125, "learning_rate": 1.3185384651098007e-07, "logits/chosen": -18.244821548461914, "logits/rejected": -17.509906768798828, "logps/chosen": -352.3826904296875, "logps/rejected": -265.15924072265625, "loss": 0.9743, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.4624757766723633, "rewards/margins": 0.6837457418441772, "rewards/rejected": 1.7787303924560547, "step": 47600 }, { "epoch": 2.2104090254886484, "grad_norm": 181.8629913330078, "learning_rate": 1.3177646749307458e-07, "logits/chosen": -19.03152084350586, "logits/rejected": -18.6468505859375, "logps/chosen": -461.44091796875, "logps/rejected": -422.92596435546875, "loss": 1.1272, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.611539840698242, "rewards/margins": -0.065812848508358, "rewards/rejected": 3.6773524284362793, "step": 47610 }, { "epoch": 2.2108732995960816, "grad_norm": 47.41392135620117, "learning_rate": 1.3169908847516907e-07, "logits/chosen": -19.688610076904297, "logits/rejected": -19.193655014038086, "logps/chosen": -339.69329833984375, "logps/rejected": -227.0854034423828, "loss": 0.6695, 
"rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.8652706146240234, "rewards/margins": 1.6333229541778564, "rewards/rejected": 2.231947422027588, "step": 47620 }, { "epoch": 2.2113375737035144, "grad_norm": 115.57815551757812, "learning_rate": 1.3162170945726355e-07, "logits/chosen": -18.990230560302734, "logits/rejected": -17.25176239013672, "logps/chosen": -326.1847229003906, "logps/rejected": -197.64285278320312, "loss": 0.5126, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.577709197998047, "rewards/margins": 3.217203140258789, "rewards/rejected": 0.36050641536712646, "step": 47630 }, { "epoch": 2.2118018478109476, "grad_norm": 8.191479682922363, "learning_rate": 1.3154433043935806e-07, "logits/chosen": -18.404638290405273, "logits/rejected": -17.785036087036133, "logps/chosen": -351.2052307128906, "logps/rejected": -391.4911804199219, "loss": 1.3372, "rewards/accuracies": 0.5, "rewards/chosen": 2.6722607612609863, "rewards/margins": 0.025757789611816406, "rewards/rejected": 2.64650297164917, "step": 47640 }, { "epoch": 2.212266121918381, "grad_norm": 234.32135009765625, "learning_rate": 1.3146695142145255e-07, "logits/chosen": -19.210567474365234, "logits/rejected": -19.364503860473633, "logps/chosen": -499.3177185058594, "logps/rejected": -413.2792053222656, "loss": 0.4205, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.46375036239624, "rewards/margins": 1.77910578250885, "rewards/rejected": 2.6846444606781006, "step": 47650 }, { "epoch": 2.2127303960258136, "grad_norm": 144.6669921875, "learning_rate": 1.3138957240354706e-07, "logits/chosen": -19.836933135986328, "logits/rejected": -18.58590316772461, "logps/chosen": -410.22894287109375, "logps/rejected": -334.46417236328125, "loss": 0.291, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.7158806324005127, "rewards/margins": 1.9220988750457764, "rewards/rejected": 1.7937812805175781, "step": 47660 }, { "epoch": 2.2131946701332468, "grad_norm": 
3.3340911865234375, "learning_rate": 1.3131219338564154e-07, "logits/chosen": -18.68330955505371, "logits/rejected": -17.50484848022461, "logps/chosen": -432.12188720703125, "logps/rejected": -350.9248962402344, "loss": 0.4395, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.6154022216796875, "rewards/margins": 2.30554461479187, "rewards/rejected": 2.309857130050659, "step": 47670 }, { "epoch": 2.2136589442406795, "grad_norm": 185.3313446044922, "learning_rate": 1.3123481436773603e-07, "logits/chosen": -19.493967056274414, "logits/rejected": -18.692577362060547, "logps/chosen": -458.41986083984375, "logps/rejected": -431.1668395996094, "loss": 0.5973, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.145112991333008, "rewards/margins": 1.1696960926055908, "rewards/rejected": 2.975416898727417, "step": 47680 }, { "epoch": 2.2141232183481128, "grad_norm": 104.07015991210938, "learning_rate": 1.3115743534983054e-07, "logits/chosen": -18.895191192626953, "logits/rejected": -18.353105545043945, "logps/chosen": -414.8363342285156, "logps/rejected": -306.4601745605469, "loss": 0.4723, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.116243839263916, "rewards/margins": 1.7620372772216797, "rewards/rejected": 1.3542065620422363, "step": 47690 }, { "epoch": 2.2145874924555455, "grad_norm": 82.13330078125, "learning_rate": 1.3108005633192503e-07, "logits/chosen": -19.07613754272461, "logits/rejected": -18.62975311279297, "logps/chosen": -364.1336669921875, "logps/rejected": -323.42608642578125, "loss": 0.7388, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.730445146560669, "rewards/margins": 0.6919878721237183, "rewards/rejected": 2.0384573936462402, "step": 47700 }, { "epoch": 2.2150517665629788, "grad_norm": 35.80500030517578, "learning_rate": 1.310026773140195e-07, "logits/chosen": -18.49202537536621, "logits/rejected": -18.36306381225586, "logps/chosen": -439.2164001464844, "logps/rejected": -460.2862243652344, "loss": 
0.6072, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7523484230041504, "rewards/margins": 0.8005258440971375, "rewards/rejected": 2.951822280883789, "step": 47710 }, { "epoch": 2.215516040670412, "grad_norm": 0.5587204098701477, "learning_rate": 1.3092529829611402e-07, "logits/chosen": -18.934045791625977, "logits/rejected": -18.77334976196289, "logps/chosen": -411.92572021484375, "logps/rejected": -348.18719482421875, "loss": 0.8245, "rewards/accuracies": 0.5, "rewards/chosen": 3.710291624069214, "rewards/margins": 0.8795350790023804, "rewards/rejected": 2.830756425857544, "step": 47720 }, { "epoch": 2.2159803147778447, "grad_norm": 188.18592834472656, "learning_rate": 1.308479192782085e-07, "logits/chosen": -19.857688903808594, "logits/rejected": -19.70780372619629, "logps/chosen": -445.88360595703125, "logps/rejected": -438.4884338378906, "loss": 0.7444, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.474116802215576, "rewards/margins": 0.5722177028656006, "rewards/rejected": 2.901899576187134, "step": 47730 }, { "epoch": 2.216444588885278, "grad_norm": 18.66620635986328, "learning_rate": 1.3077054026030302e-07, "logits/chosen": -18.35795783996582, "logits/rejected": -18.396587371826172, "logps/chosen": -385.4718322753906, "logps/rejected": -473.474609375, "loss": 0.9505, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4299869537353516, "rewards/margins": 0.42822790145874023, "rewards/rejected": 3.0017590522766113, "step": 47740 }, { "epoch": 2.2169088629927107, "grad_norm": 42.06885528564453, "learning_rate": 1.306931612423975e-07, "logits/chosen": -18.128177642822266, "logits/rejected": -17.776729583740234, "logps/chosen": -498.01708984375, "logps/rejected": -436.4278869628906, "loss": 0.9908, "rewards/accuracies": 0.5, "rewards/chosen": 3.461094617843628, "rewards/margins": 0.628923237323761, "rewards/rejected": 2.832171678543091, "step": 47750 }, { "epoch": 2.217373137100144, "grad_norm": 42.92601013183594, 
"learning_rate": 1.30615782224492e-07, "logits/chosen": -18.917293548583984, "logits/rejected": -17.836780548095703, "logps/chosen": -370.050048828125, "logps/rejected": -229.4915313720703, "loss": 0.4426, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.241241931915283, "rewards/margins": 0.8760737180709839, "rewards/rejected": 1.3651679754257202, "step": 47760 }, { "epoch": 2.217837411207577, "grad_norm": 239.0183563232422, "learning_rate": 1.305384032065865e-07, "logits/chosen": -18.828210830688477, "logits/rejected": -18.22507095336914, "logps/chosen": -456.892822265625, "logps/rejected": -383.1485900878906, "loss": 0.4963, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.830469846725464, "rewards/margins": 1.4569817781448364, "rewards/rejected": 1.373488187789917, "step": 47770 }, { "epoch": 2.21830168531501, "grad_norm": 172.6162872314453, "learning_rate": 1.3046102418868098e-07, "logits/chosen": -19.019681930541992, "logits/rejected": -18.88119125366211, "logps/chosen": -461.97747802734375, "logps/rejected": -418.1917419433594, "loss": 0.715, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.941094398498535, "rewards/margins": 1.2507691383361816, "rewards/rejected": 3.6903252601623535, "step": 47780 }, { "epoch": 2.218765959422443, "grad_norm": 138.5512237548828, "learning_rate": 1.303836451707755e-07, "logits/chosen": -18.79410171508789, "logits/rejected": -18.853702545166016, "logps/chosen": -406.728759765625, "logps/rejected": -369.00372314453125, "loss": 0.8381, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.735567569732666, "rewards/margins": 1.2434570789337158, "rewards/rejected": 2.49211049079895, "step": 47790 }, { "epoch": 2.219230233529876, "grad_norm": 57.90736389160156, "learning_rate": 1.3030626615286998e-07, "logits/chosen": -20.06100845336914, "logits/rejected": -18.447246551513672, "logps/chosen": -403.3743896484375, "logps/rejected": -347.5372619628906, "loss": 0.4163, "rewards/accuracies": 
0.699999988079071, "rewards/chosen": 4.009605407714844, "rewards/margins": 2.3506643772125244, "rewards/rejected": 1.6589412689208984, "step": 47800 }, { "epoch": 2.219694507637309, "grad_norm": 8.308503150939941, "learning_rate": 1.3022888713496446e-07, "logits/chosen": -19.21441078186035, "logits/rejected": -18.041656494140625, "logps/chosen": -416.576904296875, "logps/rejected": -269.9273681640625, "loss": 0.3233, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.0897440910339355, "rewards/margins": 1.3037084341049194, "rewards/rejected": 1.786035180091858, "step": 47810 }, { "epoch": 2.220158781744742, "grad_norm": 0.930754542350769, "learning_rate": 1.3015150811705898e-07, "logits/chosen": -18.48495101928711, "logits/rejected": -17.854907989501953, "logps/chosen": -248.6041717529297, "logps/rejected": -193.46954345703125, "loss": 0.9778, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.9861030578613281, "rewards/margins": 1.3198447227478027, "rewards/rejected": 0.6662582159042358, "step": 47820 }, { "epoch": 2.220623055852175, "grad_norm": 33.58244705200195, "learning_rate": 1.3007412909915346e-07, "logits/chosen": -19.10655403137207, "logits/rejected": -19.097187042236328, "logps/chosen": -379.69451904296875, "logps/rejected": -377.45281982421875, "loss": 0.7635, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.131131887435913, "rewards/margins": 0.22758281230926514, "rewards/rejected": 2.9035491943359375, "step": 47830 }, { "epoch": 2.2210873299596083, "grad_norm": 47.027069091796875, "learning_rate": 1.2999675008124797e-07, "logits/chosen": -19.379837036132812, "logits/rejected": -18.59389877319336, "logps/chosen": -373.85882568359375, "logps/rejected": -293.1117858886719, "loss": 0.435, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.802978038787842, "rewards/margins": 1.3458434343338013, "rewards/rejected": 2.45713472366333, "step": 47840 }, { "epoch": 2.221551604067041, "grad_norm": 19.305465698242188, 
"learning_rate": 1.2991937106334248e-07, "logits/chosen": -18.582061767578125, "logits/rejected": -18.157794952392578, "logps/chosen": -397.0272521972656, "logps/rejected": -419.14947509765625, "loss": 0.8197, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7686171531677246, "rewards/margins": 0.995361328125, "rewards/rejected": 2.7732558250427246, "step": 47850 }, { "epoch": 2.2220158781744743, "grad_norm": 86.20555114746094, "learning_rate": 1.2984199204543694e-07, "logits/chosen": -18.858158111572266, "logits/rejected": -18.9238338470459, "logps/chosen": -454.0957946777344, "logps/rejected": -481.7818298339844, "loss": 0.6145, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.597261428833008, "rewards/margins": 0.47710877656936646, "rewards/rejected": 3.120152711868286, "step": 47860 }, { "epoch": 2.222480152281907, "grad_norm": 16.919389724731445, "learning_rate": 1.2976461302753145e-07, "logits/chosen": -19.151535034179688, "logits/rejected": -18.273244857788086, "logps/chosen": -449.1080627441406, "logps/rejected": -365.3011169433594, "loss": 0.2937, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.222540855407715, "rewards/margins": 1.459919810295105, "rewards/rejected": 2.7626209259033203, "step": 47870 }, { "epoch": 2.2229444263893403, "grad_norm": 25.33974266052246, "learning_rate": 1.2968723400962594e-07, "logits/chosen": -19.87837028503418, "logits/rejected": -19.062232971191406, "logps/chosen": -420.49896240234375, "logps/rejected": -356.66400146484375, "loss": 0.3505, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.634965896606445, "rewards/margins": 1.5099681615829468, "rewards/rejected": 3.124997615814209, "step": 47880 }, { "epoch": 2.2234087004967735, "grad_norm": 0.8370918035507202, "learning_rate": 1.2960985499172045e-07, "logits/chosen": -19.070390701293945, "logits/rejected": -19.457508087158203, "logps/chosen": -422.59674072265625, "logps/rejected": -437.29095458984375, "loss": 0.9082, 
"rewards/accuracies": 0.4000000059604645, "rewards/chosen": 4.3252153396606445, "rewards/margins": 0.335819810628891, "rewards/rejected": 3.9893951416015625, "step": 47890 }, { "epoch": 2.2238729746042063, "grad_norm": 47.927391052246094, "learning_rate": 1.2953247597381496e-07, "logits/chosen": -19.46937370300293, "logits/rejected": -19.33194351196289, "logps/chosen": -410.2015686035156, "logps/rejected": -400.0554504394531, "loss": 0.8963, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.2832260131835938, "rewards/margins": 0.18055778741836548, "rewards/rejected": 3.102668523788452, "step": 47900 }, { "epoch": 2.2243372487116395, "grad_norm": 90.26610565185547, "learning_rate": 1.2945509695590942e-07, "logits/chosen": -18.718399047851562, "logits/rejected": -17.82515525817871, "logps/chosen": -429.78582763671875, "logps/rejected": -404.8733825683594, "loss": 0.4325, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.214639663696289, "rewards/margins": 1.255998134613037, "rewards/rejected": 2.958641529083252, "step": 47910 }, { "epoch": 2.2248015228190723, "grad_norm": 157.6435546875, "learning_rate": 1.2937771793800393e-07, "logits/chosen": -18.99732208251953, "logits/rejected": -18.34867286682129, "logps/chosen": -336.5526123046875, "logps/rejected": -273.3663635253906, "loss": 0.8155, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.4633569717407227, "rewards/margins": 1.3978493213653564, "rewards/rejected": 1.0655076503753662, "step": 47920 }, { "epoch": 2.2252657969265055, "grad_norm": 172.1907958984375, "learning_rate": 1.2930033892009841e-07, "logits/chosen": -19.739152908325195, "logits/rejected": -19.025053024291992, "logps/chosen": -423.8382873535156, "logps/rejected": -363.94696044921875, "loss": 0.6197, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.0615785121917725, "rewards/margins": 0.48683324456214905, "rewards/rejected": 2.5747456550598145, "step": 47930 }, { "epoch": 2.2257300710339383, 
"grad_norm": 61.1480598449707, "learning_rate": 1.2922295990219293e-07, "logits/chosen": -18.765193939208984, "logits/rejected": -18.388988494873047, "logps/chosen": -412.348388671875, "logps/rejected": -323.67205810546875, "loss": 0.7896, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.9410998821258545, "rewards/margins": 0.9451386332511902, "rewards/rejected": 1.9959611892700195, "step": 47940 }, { "epoch": 2.2261943451413715, "grad_norm": 160.06002807617188, "learning_rate": 1.2914558088428738e-07, "logits/chosen": -19.681360244750977, "logits/rejected": -18.993532180786133, "logps/chosen": -473.83465576171875, "logps/rejected": -399.684326171875, "loss": 0.261, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.455152988433838, "rewards/margins": 1.7817890644073486, "rewards/rejected": 2.673363447189331, "step": 47950 }, { "epoch": 2.2266586192488047, "grad_norm": 2.1094846725463867, "learning_rate": 1.290682018663819e-07, "logits/chosen": -19.60957908630371, "logits/rejected": -17.845081329345703, "logps/chosen": -498.0235290527344, "logps/rejected": -414.55419921875, "loss": 0.6416, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.722636699676514, "rewards/margins": 1.9045131206512451, "rewards/rejected": 2.8181233406066895, "step": 47960 }, { "epoch": 2.2271228933562375, "grad_norm": 71.05442810058594, "learning_rate": 1.289908228484764e-07, "logits/chosen": -20.166757583618164, "logits/rejected": -18.452125549316406, "logps/chosen": -306.001953125, "logps/rejected": -193.7216033935547, "loss": 0.4657, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.199331045150757, "rewards/margins": 1.5294177532196045, "rewards/rejected": 0.6699134111404419, "step": 47970 }, { "epoch": 2.2275871674636707, "grad_norm": 30.50956153869629, "learning_rate": 1.289134438305709e-07, "logits/chosen": -19.41812515258789, "logits/rejected": -18.953046798706055, "logps/chosen": -283.13128662109375, "logps/rejected": -289.8423767089844, 
"loss": 0.713, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.3022267818450928, "rewards/margins": 0.7041743993759155, "rewards/rejected": 1.5980526208877563, "step": 47980 }, { "epoch": 2.2280514415711035, "grad_norm": 28.552284240722656, "learning_rate": 1.288360648126654e-07, "logits/chosen": -18.724781036376953, "logits/rejected": -18.728351593017578, "logps/chosen": -321.50421142578125, "logps/rejected": -332.0616149902344, "loss": 0.6797, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0464041233062744, "rewards/margins": 0.42973747849464417, "rewards/rejected": 2.616666793823242, "step": 47990 }, { "epoch": 2.2285157156785367, "grad_norm": 1.9950731992721558, "learning_rate": 1.287586857947599e-07, "logits/chosen": -19.352903366088867, "logits/rejected": -18.904714584350586, "logps/chosen": -328.3970642089844, "logps/rejected": -289.444580078125, "loss": 0.6648, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.793285846710205, "rewards/margins": 1.163625955581665, "rewards/rejected": 1.6296600103378296, "step": 48000 }, { "epoch": 2.2289799897859695, "grad_norm": 178.78392028808594, "learning_rate": 1.2868130677685437e-07, "logits/chosen": -19.16986846923828, "logits/rejected": -18.994558334350586, "logps/chosen": -273.5465393066406, "logps/rejected": -245.91171264648438, "loss": 0.9576, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.517733097076416, "rewards/margins": 0.8473923802375793, "rewards/rejected": 2.6703407764434814, "step": 48010 }, { "epoch": 2.2294442638934027, "grad_norm": 45.79673385620117, "learning_rate": 1.2860392775894888e-07, "logits/chosen": -18.23582649230957, "logits/rejected": -17.910686492919922, "logps/chosen": -405.24530029296875, "logps/rejected": -317.15899658203125, "loss": 1.061, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.61202335357666, "rewards/margins": 0.24451586604118347, "rewards/rejected": 2.3675074577331543, "step": 48020 }, { "epoch": 
2.229908538000836, "grad_norm": 144.12413024902344, "learning_rate": 1.2852654874104337e-07, "logits/chosen": -19.309139251708984, "logits/rejected": -19.751785278320312, "logps/chosen": -357.53741455078125, "logps/rejected": -378.52532958984375, "loss": 1.8082, "rewards/accuracies": 0.5, "rewards/chosen": 1.9852708578109741, "rewards/margins": -1.1269280910491943, "rewards/rejected": 3.112199068069458, "step": 48030 }, { "epoch": 2.2303728121082687, "grad_norm": 61.72270202636719, "learning_rate": 1.2844916972313788e-07, "logits/chosen": -19.178142547607422, "logits/rejected": -18.431407928466797, "logps/chosen": -394.6253967285156, "logps/rejected": -341.74462890625, "loss": 0.4184, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.628500461578369, "rewards/margins": 1.4433377981185913, "rewards/rejected": 2.18516206741333, "step": 48040 }, { "epoch": 2.230837086215702, "grad_norm": 241.2208251953125, "learning_rate": 1.2837179070523236e-07, "logits/chosen": -19.597341537475586, "logits/rejected": -18.019229888916016, "logps/chosen": -504.6151428222656, "logps/rejected": -359.7589111328125, "loss": 0.5442, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.868552207946777, "rewards/margins": 2.2304606437683105, "rewards/rejected": 2.638091564178467, "step": 48050 }, { "epoch": 2.2313013603231346, "grad_norm": 184.64881896972656, "learning_rate": 1.2829441168732685e-07, "logits/chosen": -19.242210388183594, "logits/rejected": -18.638996124267578, "logps/chosen": -425.8775329589844, "logps/rejected": -325.63775634765625, "loss": 0.8823, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.553797245025635, "rewards/margins": 1.7700281143188477, "rewards/rejected": 2.783768653869629, "step": 48060 }, { "epoch": 2.231765634430568, "grad_norm": 120.2800521850586, "learning_rate": 1.2821703266942136e-07, "logits/chosen": -18.880550384521484, "logits/rejected": -18.818166732788086, "logps/chosen": -436.611083984375, "logps/rejected": 
-396.41558837890625, "loss": 0.8907, "rewards/accuracies": 0.5, "rewards/chosen": 3.2406227588653564, "rewards/margins": 0.4449094831943512, "rewards/rejected": 2.795713186264038, "step": 48070 }, { "epoch": 2.2322299085380006, "grad_norm": 101.8835678100586, "learning_rate": 1.2813965365151585e-07, "logits/chosen": -19.492006301879883, "logits/rejected": -19.30124282836914, "logps/chosen": -437.69390869140625, "logps/rejected": -449.61944580078125, "loss": 0.7228, "rewards/accuracies": 0.5, "rewards/chosen": 3.6402969360351562, "rewards/margins": 0.3934803903102875, "rewards/rejected": 3.2468161582946777, "step": 48080 }, { "epoch": 2.232694182645434, "grad_norm": 29.528167724609375, "learning_rate": 1.2806227463361036e-07, "logits/chosen": -18.18361473083496, "logits/rejected": -17.99896240234375, "logps/chosen": -406.6979064941406, "logps/rejected": -345.0995788574219, "loss": 0.9583, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.80137038230896, "rewards/margins": 0.3076961934566498, "rewards/rejected": 2.4936742782592773, "step": 48090 }, { "epoch": 2.233158456752867, "grad_norm": 0.8046380281448364, "learning_rate": 1.2798489561570484e-07, "logits/chosen": -17.8125057220459, "logits/rejected": -16.732446670532227, "logps/chosen": -377.73028564453125, "logps/rejected": -274.5352783203125, "loss": 0.8091, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5837044715881348, "rewards/margins": 1.3899128437042236, "rewards/rejected": 2.193791627883911, "step": 48100 }, { "epoch": 2.2336227308603, "grad_norm": 26.862133026123047, "learning_rate": 1.2790751659779933e-07, "logits/chosen": -18.473209381103516, "logits/rejected": -17.487808227539062, "logps/chosen": -572.2471923828125, "logps/rejected": -365.65435791015625, "loss": 0.2921, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.144677639007568, "rewards/margins": 2.327644109725952, "rewards/rejected": 2.817033529281616, "step": 48110 }, { "epoch": 2.234087004967733, 
"grad_norm": 49.50883102416992, "learning_rate": 1.2783013757989384e-07, "logits/chosen": -18.762557983398438, "logits/rejected": -17.88831329345703, "logps/chosen": -365.34674072265625, "logps/rejected": -313.9187316894531, "loss": 0.5623, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.2282967567443848, "rewards/margins": 1.6338164806365967, "rewards/rejected": 1.594480276107788, "step": 48120 }, { "epoch": 2.234551279075166, "grad_norm": 8.14099407196045, "learning_rate": 1.2775275856198832e-07, "logits/chosen": -18.85181427001953, "logits/rejected": -17.599403381347656, "logps/chosen": -419.0210876464844, "logps/rejected": -307.1964111328125, "loss": 0.2827, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.71342134475708, "rewards/margins": 3.491811752319336, "rewards/rejected": 1.221609115600586, "step": 48130 }, { "epoch": 2.235015553182599, "grad_norm": 29.0576114654541, "learning_rate": 1.2767537954408283e-07, "logits/chosen": -18.14259910583496, "logits/rejected": -18.31584358215332, "logps/chosen": -313.06817626953125, "logps/rejected": -230.870849609375, "loss": 0.6022, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.424164295196533, "rewards/margins": 1.1950596570968628, "rewards/rejected": 1.22910475730896, "step": 48140 }, { "epoch": 2.235479827290032, "grad_norm": 21.176942825317383, "learning_rate": 1.2759800052617732e-07, "logits/chosen": -19.95561408996582, "logits/rejected": -18.474136352539062, "logps/chosen": -349.9950256347656, "logps/rejected": -208.61666870117188, "loss": 0.3076, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.5795986652374268, "rewards/margins": 2.784585475921631, "rewards/rejected": 0.7950128316879272, "step": 48150 }, { "epoch": 2.235944101397465, "grad_norm": 21.59510040283203, "learning_rate": 1.275206215082718e-07, "logits/chosen": -19.64406967163086, "logits/rejected": -18.535890579223633, "logps/chosen": -355.1490783691406, "logps/rejected": -251.70791625976562, 
"loss": 0.3234, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.564948320388794, "rewards/margins": 1.542986273765564, "rewards/rejected": 2.0219619274139404, "step": 48160 }, { "epoch": 2.2364083755048982, "grad_norm": 2.0962538719177246, "learning_rate": 1.2744324249036632e-07, "logits/chosen": -19.843040466308594, "logits/rejected": -18.578645706176758, "logps/chosen": -372.3313903808594, "logps/rejected": -302.9393615722656, "loss": 0.3032, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.711777448654175, "rewards/margins": 2.2818424701690674, "rewards/rejected": 1.4299345016479492, "step": 48170 }, { "epoch": 2.236872649612331, "grad_norm": 135.180908203125, "learning_rate": 1.273658634724608e-07, "logits/chosen": -19.2406005859375, "logits/rejected": -18.992860794067383, "logps/chosen": -358.49908447265625, "logps/rejected": -292.5971374511719, "loss": 0.5775, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.2922523021698, "rewards/margins": 0.7445885539054871, "rewards/rejected": 2.547663450241089, "step": 48180 }, { "epoch": 2.2373369237197642, "grad_norm": 78.54228210449219, "learning_rate": 1.2728848445455528e-07, "logits/chosen": -19.79549217224121, "logits/rejected": -18.70895004272461, "logps/chosen": -424.2327575683594, "logps/rejected": -328.4817199707031, "loss": 0.2714, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.137007236480713, "rewards/margins": 2.054762363433838, "rewards/rejected": 2.082245111465454, "step": 48190 }, { "epoch": 2.237801197827197, "grad_norm": 149.91046142578125, "learning_rate": 1.272111054366498e-07, "logits/chosen": -19.50929832458496, "logits/rejected": -18.600589752197266, "logps/chosen": -580.3818359375, "logps/rejected": -400.0980224609375, "loss": 0.3696, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.765198707580566, "rewards/margins": 2.315347909927368, "rewards/rejected": 2.449850559234619, "step": 48200 }, { "epoch": 2.23826547193463, "grad_norm": 
1.5339679718017578, "learning_rate": 1.2713372641874428e-07, "logits/chosen": -18.61396598815918, "logits/rejected": -17.18778419494629, "logps/chosen": -392.3238830566406, "logps/rejected": -257.25933837890625, "loss": 0.5044, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.2949042320251465, "rewards/margins": 2.0187489986419678, "rewards/rejected": 1.2761552333831787, "step": 48210 }, { "epoch": 2.2387297460420634, "grad_norm": 5.6728010177612305, "learning_rate": 1.270563474008388e-07, "logits/chosen": -19.140588760375977, "logits/rejected": -18.422115325927734, "logps/chosen": -333.18572998046875, "logps/rejected": -295.17535400390625, "loss": 0.3563, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.781059980392456, "rewards/margins": 1.9407020807266235, "rewards/rejected": 1.840358018875122, "step": 48220 }, { "epoch": 2.239194020149496, "grad_norm": 200.15313720703125, "learning_rate": 1.2697896838293328e-07, "logits/chosen": -19.509138107299805, "logits/rejected": -18.489316940307617, "logps/chosen": -395.02557373046875, "logps/rejected": -333.3259582519531, "loss": 0.4947, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.8017821311950684, "rewards/margins": 1.9337255954742432, "rewards/rejected": 1.8680568933486938, "step": 48230 }, { "epoch": 2.2396582942569294, "grad_norm": 56.950931549072266, "learning_rate": 1.2690158936502776e-07, "logits/chosen": -18.58955192565918, "logits/rejected": -18.378887176513672, "logps/chosen": -271.3702392578125, "logps/rejected": -212.8507843017578, "loss": 0.3632, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.4650416374206543, "rewards/margins": 1.3846551179885864, "rewards/rejected": 1.080386757850647, "step": 48240 }, { "epoch": 2.240122568364362, "grad_norm": 78.5865478515625, "learning_rate": 1.2682421034712227e-07, "logits/chosen": -18.462993621826172, "logits/rejected": -17.297319412231445, "logps/chosen": -452.5189514160156, "logps/rejected": 
-333.3835754394531, "loss": 0.4232, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.381643772125244, "rewards/margins": 1.7321531772613525, "rewards/rejected": 1.6494905948638916, "step": 48250 }, { "epoch": 2.2405868424717954, "grad_norm": 213.5125274658203, "learning_rate": 1.2674683132921676e-07, "logits/chosen": -18.191755294799805, "logits/rejected": -18.16366958618164, "logps/chosen": -395.14959716796875, "logps/rejected": -401.069091796875, "loss": 0.6216, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.7730202674865723, "rewards/margins": 1.1417677402496338, "rewards/rejected": 2.6312527656555176, "step": 48260 }, { "epoch": 2.241051116579228, "grad_norm": 1.8404065370559692, "learning_rate": 1.2666945231131127e-07, "logits/chosen": -18.560163497924805, "logits/rejected": -17.64781951904297, "logps/chosen": -433.63690185546875, "logps/rejected": -345.9521789550781, "loss": 0.5527, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.467285633087158, "rewards/margins": 1.7188844680786133, "rewards/rejected": 1.7484010457992554, "step": 48270 }, { "epoch": 2.2415153906866614, "grad_norm": 7.694446563720703, "learning_rate": 1.2659207329340575e-07, "logits/chosen": -18.088083267211914, "logits/rejected": -17.483186721801758, "logps/chosen": -416.4466857910156, "logps/rejected": -302.7184143066406, "loss": 0.3816, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.035214424133301, "rewards/margins": 2.1871936321258545, "rewards/rejected": 1.8480209112167358, "step": 48280 }, { "epoch": 2.2419796647940946, "grad_norm": 147.90208435058594, "learning_rate": 1.2651469427550024e-07, "logits/chosen": -18.9075927734375, "logits/rejected": -18.571592330932617, "logps/chosen": -372.3385009765625, "logps/rejected": -309.87506103515625, "loss": 0.9179, "rewards/accuracies": 0.5, "rewards/chosen": 2.6047239303588867, "rewards/margins": 0.22013166546821594, "rewards/rejected": 2.384592294692993, "step": 48290 }, { "epoch": 
2.2424439389015274, "grad_norm": 76.2752914428711, "learning_rate": 1.2643731525759475e-07, "logits/chosen": -18.75632095336914, "logits/rejected": -18.40428352355957, "logps/chosen": -398.434814453125, "logps/rejected": -372.0731201171875, "loss": 0.351, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.207862377166748, "rewards/margins": 1.5134103298187256, "rewards/rejected": 2.6944525241851807, "step": 48300 }, { "epoch": 2.2429082130089606, "grad_norm": 122.74921417236328, "learning_rate": 1.2635993623968923e-07, "logits/chosen": -18.692718505859375, "logits/rejected": -18.85287094116211, "logps/chosen": -386.836669921875, "logps/rejected": -359.2540588378906, "loss": 1.4963, "rewards/accuracies": 0.5, "rewards/chosen": 2.6348187923431396, "rewards/margins": -0.39249274134635925, "rewards/rejected": 3.0273118019104004, "step": 48310 }, { "epoch": 2.2433724871163934, "grad_norm": 18.018787384033203, "learning_rate": 1.2628255722178375e-07, "logits/chosen": -18.783039093017578, "logits/rejected": -17.963754653930664, "logps/chosen": -429.4081115722656, "logps/rejected": -419.35455322265625, "loss": 0.8672, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.176363945007324, "rewards/margins": 0.15606307983398438, "rewards/rejected": 4.02030086517334, "step": 48320 }, { "epoch": 2.2438367612238266, "grad_norm": 42.954368591308594, "learning_rate": 1.2620517820387823e-07, "logits/chosen": -18.650548934936523, "logits/rejected": -16.673656463623047, "logps/chosen": -465.5062561035156, "logps/rejected": -257.4568176269531, "loss": 0.1854, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.9275848865509033, "rewards/margins": 3.7409820556640625, "rewards/rejected": 0.18660281598567963, "step": 48330 }, { "epoch": 2.24430103533126, "grad_norm": 5.612911701202393, "learning_rate": 1.2612779918597272e-07, "logits/chosen": -19.825246810913086, "logits/rejected": -18.91085433959961, "logps/chosen": -361.9878234863281, "logps/rejected": 
-310.01312255859375, "loss": 0.5353, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.548816680908203, "rewards/margins": 1.692383050918579, "rewards/rejected": 2.856433391571045, "step": 48340 }, { "epoch": 2.2447653094386926, "grad_norm": 3.124021291732788, "learning_rate": 1.2605042016806723e-07, "logits/chosen": -18.057186126708984, "logits/rejected": -18.479618072509766, "logps/chosen": -328.77435302734375, "logps/rejected": -394.41693115234375, "loss": 1.2887, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.2343711853027344, "rewards/margins": -0.09902322292327881, "rewards/rejected": 2.3333945274353027, "step": 48350 }, { "epoch": 2.245229583546126, "grad_norm": 153.39938354492188, "learning_rate": 1.259730411501617e-07, "logits/chosen": -17.920040130615234, "logits/rejected": -17.68435287475586, "logps/chosen": -300.0909118652344, "logps/rejected": -336.22900390625, "loss": 0.8204, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.34397292137146, "rewards/margins": 0.3625035285949707, "rewards/rejected": 1.9814693927764893, "step": 48360 }, { "epoch": 2.2456938576535586, "grad_norm": 79.67333984375, "learning_rate": 1.2589566213225622e-07, "logits/chosen": -18.466026306152344, "logits/rejected": -18.819046020507812, "logps/chosen": -302.89337158203125, "logps/rejected": -366.6675109863281, "loss": 1.9076, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.1698355674743652, "rewards/margins": -1.2175296545028687, "rewards/rejected": 3.3873653411865234, "step": 48370 }, { "epoch": 2.2461581317609918, "grad_norm": 80.32495880126953, "learning_rate": 1.258182831143507e-07, "logits/chosen": -18.01007843017578, "logits/rejected": -18.054603576660156, "logps/chosen": -404.48052978515625, "logps/rejected": -382.28826904296875, "loss": 0.8978, "rewards/accuracies": 0.5, "rewards/chosen": 3.6674256324768066, "rewards/margins": 0.9389973878860474, "rewards/rejected": 2.728428363800049, "step": 48380 }, { "epoch": 
2.2466224058684245, "grad_norm": 4.880553245544434, "learning_rate": 1.257409040964452e-07, "logits/chosen": -19.048601150512695, "logits/rejected": -17.98322868347168, "logps/chosen": -426.0133361816406, "logps/rejected": -317.75360107421875, "loss": 0.4193, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.6537163257598877, "rewards/margins": 2.4622414112091064, "rewards/rejected": 1.1914751529693604, "step": 48390 }, { "epoch": 2.2470866799758578, "grad_norm": 2.7384443283081055, "learning_rate": 1.256635250785397e-07, "logits/chosen": -19.000673294067383, "logits/rejected": -18.300779342651367, "logps/chosen": -406.65545654296875, "logps/rejected": -349.88970947265625, "loss": 0.639, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.5474212169647217, "rewards/margins": 1.417908787727356, "rewards/rejected": 2.129512310028076, "step": 48400 }, { "epoch": 2.247550954083291, "grad_norm": 15.992958068847656, "learning_rate": 1.255861460606342e-07, "logits/chosen": -19.352033615112305, "logits/rejected": -18.79184341430664, "logps/chosen": -312.7740783691406, "logps/rejected": -254.5224609375, "loss": 0.7377, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.514035701751709, "rewards/margins": 0.9730668067932129, "rewards/rejected": 1.540968894958496, "step": 48410 }, { "epoch": 2.2480152281907237, "grad_norm": 83.81327819824219, "learning_rate": 1.255087670427287e-07, "logits/chosen": -18.725650787353516, "logits/rejected": -18.187620162963867, "logps/chosen": -451.4161071777344, "logps/rejected": -377.6519470214844, "loss": 0.7248, "rewards/accuracies": 0.5, "rewards/chosen": 3.653667449951172, "rewards/margins": 0.5476795434951782, "rewards/rejected": 3.105988025665283, "step": 48420 }, { "epoch": 2.248479502298157, "grad_norm": 85.00305938720703, "learning_rate": 1.2543138802482316e-07, "logits/chosen": -19.48013687133789, "logits/rejected": -18.32073211669922, "logps/chosen": -409.43572998046875, "logps/rejected": 
-279.4845886230469, "loss": 0.312, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.373159408569336, "rewards/margins": 1.6183443069458008, "rewards/rejected": 1.7548153400421143, "step": 48430 }, { "epoch": 2.2489437764055897, "grad_norm": 51.88655090332031, "learning_rate": 1.2535400900691767e-07, "logits/chosen": -19.148048400878906, "logits/rejected": -17.967662811279297, "logps/chosen": -372.6764831542969, "logps/rejected": -274.69268798828125, "loss": 0.1739, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.620617389678955, "rewards/margins": 2.452775478363037, "rewards/rejected": 1.167842149734497, "step": 48440 }, { "epoch": 2.249408050513023, "grad_norm": 12.07708740234375, "learning_rate": 1.2527662998901218e-07, "logits/chosen": -19.90736961364746, "logits/rejected": -18.321718215942383, "logps/chosen": -376.16265869140625, "logps/rejected": -335.8001403808594, "loss": 0.3316, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.40538215637207, "rewards/margins": 2.1108031272888184, "rewards/rejected": 2.294579267501831, "step": 48450 }, { "epoch": 2.2498723246204557, "grad_norm": 41.053550720214844, "learning_rate": 1.2519925097110667e-07, "logits/chosen": -20.17097282409668, "logits/rejected": -19.534578323364258, "logps/chosen": -422.205078125, "logps/rejected": -334.57745361328125, "loss": 0.898, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.6720492839813232, "rewards/margins": 1.069549798965454, "rewards/rejected": 2.60249924659729, "step": 48460 }, { "epoch": 2.250336598727889, "grad_norm": 71.15138244628906, "learning_rate": 1.2512187195320118e-07, "logits/chosen": -18.213340759277344, "logits/rejected": -17.91838836669922, "logps/chosen": -375.7818603515625, "logps/rejected": -367.261962890625, "loss": 1.0595, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.865093469619751, "rewards/margins": 0.8191041946411133, "rewards/rejected": 2.0459890365600586, "step": 48470 }, { "epoch": 
2.250800872835322, "grad_norm": 0.03540743142366409, "learning_rate": 1.2504449293529564e-07, "logits/chosen": -19.554302215576172, "logits/rejected": -16.981794357299805, "logps/chosen": -465.28021240234375, "logps/rejected": -186.3820037841797, "loss": 0.1547, "rewards/accuracies": 1.0, "rewards/chosen": 5.282497882843018, "rewards/margins": 4.3107991218566895, "rewards/rejected": 0.9716987609863281, "step": 48480 }, { "epoch": 2.251265146942755, "grad_norm": 142.4523468017578, "learning_rate": 1.2496711391739015e-07, "logits/chosen": -19.514129638671875, "logits/rejected": -19.2210693359375, "logps/chosen": -395.450439453125, "logps/rejected": -357.89617919921875, "loss": 0.7256, "rewards/accuracies": 0.5, "rewards/chosen": 3.19571852684021, "rewards/margins": 0.8080103993415833, "rewards/rejected": 2.3877081871032715, "step": 48490 }, { "epoch": 2.251729421050188, "grad_norm": 36.15022277832031, "learning_rate": 1.2488973489948466e-07, "logits/chosen": -19.658369064331055, "logits/rejected": -18.739431381225586, "logps/chosen": -377.39337158203125, "logps/rejected": -269.3875732421875, "loss": 0.3903, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4329724311828613, "rewards/margins": 1.7061550617218018, "rewards/rejected": 1.7268173694610596, "step": 48500 }, { "epoch": 2.252193695157621, "grad_norm": 65.07100677490234, "learning_rate": 1.2481235588157914e-07, "logits/chosen": -19.72159767150879, "logits/rejected": -19.5330810546875, "logps/chosen": -512.0459594726562, "logps/rejected": -378.1333923339844, "loss": 1.08, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.659666538238525, "rewards/margins": 0.4011245369911194, "rewards/rejected": 4.258542060852051, "step": 48510 }, { "epoch": 2.252657969265054, "grad_norm": 126.38406372070312, "learning_rate": 1.2473497686367363e-07, "logits/chosen": -18.97199821472168, "logits/rejected": -18.360565185546875, "logps/chosen": -420.62298583984375, "logps/rejected": -322.59027099609375, 
"loss": 0.3911, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.357480525970459, "rewards/margins": 1.540536642074585, "rewards/rejected": 2.816944122314453, "step": 48520 }, { "epoch": 2.253122243372487, "grad_norm": 31.062576293945312, "learning_rate": 1.2465759784576814e-07, "logits/chosen": -19.823503494262695, "logits/rejected": -18.717578887939453, "logps/chosen": -451.71893310546875, "logps/rejected": -307.4293212890625, "loss": 0.2475, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.1238603591918945, "rewards/margins": 2.6578240394592285, "rewards/rejected": 2.466036319732666, "step": 48530 }, { "epoch": 2.25358651747992, "grad_norm": 59.99653625488281, "learning_rate": 1.2458021882786262e-07, "logits/chosen": -19.448261260986328, "logits/rejected": -18.387569427490234, "logps/chosen": -503.12628173828125, "logps/rejected": -361.37042236328125, "loss": 0.4943, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.975600242614746, "rewards/margins": 1.9806970357894897, "rewards/rejected": 2.994903087615967, "step": 48540 }, { "epoch": 2.2540507915873533, "grad_norm": 9.997907638549805, "learning_rate": 1.2450283980995714e-07, "logits/chosen": -18.833433151245117, "logits/rejected": -18.254436492919922, "logps/chosen": -420.1332092285156, "logps/rejected": -301.8024597167969, "loss": 0.7365, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.5819153785705566, "rewards/margins": 0.9290720224380493, "rewards/rejected": 2.6528429985046387, "step": 48550 }, { "epoch": 2.254515065694786, "grad_norm": 139.29177856445312, "learning_rate": 1.2442546079205162e-07, "logits/chosen": -18.393829345703125, "logits/rejected": -18.601924896240234, "logps/chosen": -382.363037109375, "logps/rejected": -430.00177001953125, "loss": 0.8472, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6048362255096436, "rewards/margins": 0.824212908744812, "rewards/rejected": 2.780622959136963, "step": 48560 }, { "epoch": 
2.2549793398022193, "grad_norm": 42.662750244140625, "learning_rate": 1.243480817741461e-07, "logits/chosen": -19.00777816772461, "logits/rejected": -18.50986671447754, "logps/chosen": -391.4454345703125, "logps/rejected": -345.81988525390625, "loss": 0.7502, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7600905895233154, "rewards/margins": 1.0354034900665283, "rewards/rejected": 2.7246875762939453, "step": 48570 }, { "epoch": 2.255443613909652, "grad_norm": 274.3146057128906, "learning_rate": 1.2427070275624062e-07, "logits/chosen": -19.370121002197266, "logits/rejected": -17.891801834106445, "logps/chosen": -384.9032897949219, "logps/rejected": -288.5068054199219, "loss": 0.7016, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.052892684936523, "rewards/margins": 1.0303537845611572, "rewards/rejected": 3.022538423538208, "step": 48580 }, { "epoch": 2.2559078880170853, "grad_norm": 153.76060485839844, "learning_rate": 1.241933237383351e-07, "logits/chosen": -18.40404510498047, "logits/rejected": -17.880083084106445, "logps/chosen": -309.1376037597656, "logps/rejected": -265.44921875, "loss": 0.7825, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.164041519165039, "rewards/margins": 0.8008167147636414, "rewards/rejected": 1.363224744796753, "step": 48590 }, { "epoch": 2.256372162124518, "grad_norm": 22.266586303710938, "learning_rate": 1.241159447204296e-07, "logits/chosen": -18.66843032836914, "logits/rejected": -17.797515869140625, "logps/chosen": -393.2762451171875, "logps/rejected": -303.56524658203125, "loss": 0.4162, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.275146007537842, "rewards/margins": 2.1416778564453125, "rewards/rejected": 1.1334683895111084, "step": 48600 }, { "epoch": 2.2568364362319513, "grad_norm": 0.13347995281219482, "learning_rate": 1.240385657025241e-07, "logits/chosen": -18.124244689941406, "logits/rejected": -17.800228118896484, "logps/chosen": -313.81927490234375, 
"logps/rejected": -384.1224365234375, "loss": 1.7975, "rewards/accuracies": 0.5, "rewards/chosen": 3.1346678733825684, "rewards/margins": 0.13325290381908417, "rewards/rejected": 3.0014150142669678, "step": 48610 }, { "epoch": 2.2573007103393845, "grad_norm": 22.823610305786133, "learning_rate": 1.2396118668461858e-07, "logits/chosen": -18.74597930908203, "logits/rejected": -18.213930130004883, "logps/chosen": -400.47052001953125, "logps/rejected": -322.6221618652344, "loss": 1.0173, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.8178858757019043, "rewards/margins": 0.5693492889404297, "rewards/rejected": 3.2485363483428955, "step": 48620 }, { "epoch": 2.2577649844468173, "grad_norm": 107.74459075927734, "learning_rate": 1.238838076667131e-07, "logits/chosen": -18.738435745239258, "logits/rejected": -18.700517654418945, "logps/chosen": -486.61065673828125, "logps/rejected": -395.55755615234375, "loss": 0.96, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 4.074936389923096, "rewards/margins": 0.44621363282203674, "rewards/rejected": 3.6287224292755127, "step": 48630 }, { "epoch": 2.2582292585542505, "grad_norm": 24.67232894897461, "learning_rate": 1.2380642864880758e-07, "logits/chosen": -18.772380828857422, "logits/rejected": -18.681503295898438, "logps/chosen": -370.3028259277344, "logps/rejected": -350.89935302734375, "loss": 0.9242, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.610896348953247, "rewards/margins": 0.13079099357128143, "rewards/rejected": 3.4801056385040283, "step": 48640 }, { "epoch": 2.2586935326616833, "grad_norm": 15.449776649475098, "learning_rate": 1.237290496309021e-07, "logits/chosen": -17.69015884399414, "logits/rejected": -17.261316299438477, "logps/chosen": -384.4583435058594, "logps/rejected": -316.6711120605469, "loss": 0.7191, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.8814144134521484, "rewards/margins": 1.2590148448944092, "rewards/rejected": 2.6223995685577393, "step": 
48650 }, { "epoch": 2.2591578067691165, "grad_norm": 1.0501257181167603, "learning_rate": 1.2365167061299657e-07, "logits/chosen": -20.45631980895996, "logits/rejected": -19.20882225036621, "logps/chosen": -372.65777587890625, "logps/rejected": -315.9621887207031, "loss": 0.4589, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.7146103382110596, "rewards/margins": 2.0065600872039795, "rewards/rejected": 1.7080501317977905, "step": 48660 }, { "epoch": 2.2596220808765497, "grad_norm": 11.875863075256348, "learning_rate": 1.2357429159509106e-07, "logits/chosen": -19.843141555786133, "logits/rejected": -17.719118118286133, "logps/chosen": -378.8621520996094, "logps/rejected": -240.1802520751953, "loss": 0.4016, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.475852966308594, "rewards/margins": 2.9358067512512207, "rewards/rejected": 1.5400458574295044, "step": 48670 }, { "epoch": 2.2600863549839825, "grad_norm": 1.0938435792922974, "learning_rate": 1.2349691257718557e-07, "logits/chosen": -17.835674285888672, "logits/rejected": -17.36811065673828, "logps/chosen": -311.8175048828125, "logps/rejected": -327.5482177734375, "loss": 0.8616, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.8351707458496094, "rewards/margins": 0.7667819857597351, "rewards/rejected": 2.0683884620666504, "step": 48680 }, { "epoch": 2.2605506290914157, "grad_norm": 96.91085815429688, "learning_rate": 1.2341953355928006e-07, "logits/chosen": -18.779172897338867, "logits/rejected": -18.839069366455078, "logps/chosen": -454.9124450683594, "logps/rejected": -402.49871826171875, "loss": 0.7443, "rewards/accuracies": 0.5, "rewards/chosen": 3.299353837966919, "rewards/margins": 0.7707023024559021, "rewards/rejected": 2.528651475906372, "step": 48690 }, { "epoch": 2.2610149031988485, "grad_norm": 125.94276428222656, "learning_rate": 1.2334215454137457e-07, "logits/chosen": -19.29244041442871, "logits/rejected": -18.949186325073242, "logps/chosen": 
-321.41912841796875, "logps/rejected": -289.0907287597656, "loss": 0.3266, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.699026584625244, "rewards/margins": 1.6358184814453125, "rewards/rejected": 2.06320858001709, "step": 48700 }, { "epoch": 2.2614791773062817, "grad_norm": 66.87522888183594, "learning_rate": 1.2326477552346905e-07, "logits/chosen": -18.870197296142578, "logits/rejected": -17.685791015625, "logps/chosen": -427.04949951171875, "logps/rejected": -374.9136657714844, "loss": 0.6873, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.487226963043213, "rewards/margins": 1.0249614715576172, "rewards/rejected": 3.4622650146484375, "step": 48710 }, { "epoch": 2.261943451413715, "grad_norm": 73.69522094726562, "learning_rate": 1.2318739650556354e-07, "logits/chosen": -18.82659912109375, "logits/rejected": -17.834407806396484, "logps/chosen": -420.548828125, "logps/rejected": -293.34674072265625, "loss": 0.5283, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7779178619384766, "rewards/margins": 1.2632006406784058, "rewards/rejected": 2.5147173404693604, "step": 48720 }, { "epoch": 2.2624077255211477, "grad_norm": 38.97298049926758, "learning_rate": 1.2311001748765805e-07, "logits/chosen": -18.957286834716797, "logits/rejected": -18.48497772216797, "logps/chosen": -610.8343505859375, "logps/rejected": -464.62554931640625, "loss": 0.4393, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.836501121520996, "rewards/margins": 1.236581563949585, "rewards/rejected": 3.599919557571411, "step": 48730 }, { "epoch": 2.262871999628581, "grad_norm": 184.411865234375, "learning_rate": 1.2303263846975253e-07, "logits/chosen": -19.48948860168457, "logits/rejected": -19.490957260131836, "logps/chosen": -460.44317626953125, "logps/rejected": -514.6350708007812, "loss": 1.2584, "rewards/accuracies": 0.5, "rewards/chosen": 4.024735450744629, "rewards/margins": -0.18767495453357697, "rewards/rejected": 4.2124104499816895, 
"step": 48740 }, { "epoch": 2.2633362737360136, "grad_norm": 78.37833404541016, "learning_rate": 1.2295525945184704e-07, "logits/chosen": -18.704666137695312, "logits/rejected": -18.294836044311523, "logps/chosen": -367.3099670410156, "logps/rejected": -266.7979736328125, "loss": 0.4955, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.7667738199234009, "rewards/margins": 0.6560443043708801, "rewards/rejected": 1.110729455947876, "step": 48750 }, { "epoch": 2.263800547843447, "grad_norm": 58.87654495239258, "learning_rate": 1.2287788043394153e-07, "logits/chosen": -19.12198829650879, "logits/rejected": -17.985153198242188, "logps/chosen": -296.38543701171875, "logps/rejected": -174.69859313964844, "loss": 0.486, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.0926265716552734, "rewards/margins": 1.8275737762451172, "rewards/rejected": 0.2650529742240906, "step": 48760 }, { "epoch": 2.2642648219508796, "grad_norm": 62.44227600097656, "learning_rate": 1.2280050141603601e-07, "logits/chosen": -19.653213500976562, "logits/rejected": -19.573129653930664, "logps/chosen": -441.6150817871094, "logps/rejected": -397.58740234375, "loss": 0.4531, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.723141670227051, "rewards/margins": 1.2724475860595703, "rewards/rejected": 3.4506938457489014, "step": 48770 }, { "epoch": 2.264729096058313, "grad_norm": 285.625732421875, "learning_rate": 1.2272312239813052e-07, "logits/chosen": -19.27806282043457, "logits/rejected": -18.421674728393555, "logps/chosen": -333.63287353515625, "logps/rejected": -313.7829284667969, "loss": 0.8508, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.7910429239273071, "rewards/margins": 0.03280411660671234, "rewards/rejected": 1.7582387924194336, "step": 48780 }, { "epoch": 2.265193370165746, "grad_norm": 153.88734436035156, "learning_rate": 1.22645743380225e-07, "logits/chosen": -18.60249137878418, "logits/rejected": -17.517223358154297, "logps/chosen": 
-414.30108642578125, "logps/rejected": -280.01654052734375, "loss": 0.5846, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.436634063720703, "rewards/margins": 2.1280860900878906, "rewards/rejected": 2.3085482120513916, "step": 48790 }, { "epoch": 2.265657644273179, "grad_norm": 28.814973831176758, "learning_rate": 1.225683643623195e-07, "logits/chosen": -18.696308135986328, "logits/rejected": -17.570877075195312, "logps/chosen": -337.91644287109375, "logps/rejected": -294.626953125, "loss": 0.9106, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.5806267261505127, "rewards/margins": 1.2043359279632568, "rewards/rejected": 1.3762904405593872, "step": 48800 }, { "epoch": 2.266121918380612, "grad_norm": 32.608028411865234, "learning_rate": 1.22490985344414e-07, "logits/chosen": -19.835294723510742, "logits/rejected": -19.546052932739258, "logps/chosen": -398.2005310058594, "logps/rejected": -321.31292724609375, "loss": 0.7175, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.965578079223633, "rewards/margins": 1.734574556350708, "rewards/rejected": 2.231003522872925, "step": 48810 }, { "epoch": 2.266586192488045, "grad_norm": 113.9476318359375, "learning_rate": 1.224136063265085e-07, "logits/chosen": -18.49078369140625, "logits/rejected": -17.483619689941406, "logps/chosen": -478.8536071777344, "logps/rejected": -293.12286376953125, "loss": 0.5461, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.168940544128418, "rewards/margins": 2.1033146381378174, "rewards/rejected": 2.0656259059906006, "step": 48820 }, { "epoch": 2.267050466595478, "grad_norm": 90.24771881103516, "learning_rate": 1.22336227308603e-07, "logits/chosen": -19.153146743774414, "logits/rejected": -17.91482162475586, "logps/chosen": -378.5452880859375, "logps/rejected": -298.3785400390625, "loss": 0.739, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.8322486877441406, "rewards/margins": 1.7779725790023804, "rewards/rejected": 
2.0542759895324707, "step": 48830 }, { "epoch": 2.267514740702911, "grad_norm": 94.96527862548828, "learning_rate": 1.2225884829069749e-07, "logits/chosen": -19.403532028198242, "logits/rejected": -19.507583618164062, "logps/chosen": -393.26080322265625, "logps/rejected": -395.13665771484375, "loss": 0.5755, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.464169979095459, "rewards/margins": 1.2037925720214844, "rewards/rejected": 3.2603774070739746, "step": 48840 }, { "epoch": 2.267979014810344, "grad_norm": 78.04344177246094, "learning_rate": 1.2218146927279197e-07, "logits/chosen": -19.999120712280273, "logits/rejected": -19.168710708618164, "logps/chosen": -384.37432861328125, "logps/rejected": -370.6874084472656, "loss": 0.4941, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.8087358474731445, "rewards/margins": 1.28523850440979, "rewards/rejected": 1.523497223854065, "step": 48850 }, { "epoch": 2.2684432889177772, "grad_norm": 0.8709651231765747, "learning_rate": 1.2210409025488648e-07, "logits/chosen": -18.66568374633789, "logits/rejected": -17.794422149658203, "logps/chosen": -447.39263916015625, "logps/rejected": -300.55328369140625, "loss": 1.0106, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.006077766418457, "rewards/margins": 1.5576289892196655, "rewards/rejected": 2.4484493732452393, "step": 48860 }, { "epoch": 2.26890756302521, "grad_norm": 165.82998657226562, "learning_rate": 1.22026711236981e-07, "logits/chosen": -20.086322784423828, "logits/rejected": -19.190837860107422, "logps/chosen": -290.7212219238281, "logps/rejected": -297.0143737792969, "loss": 0.8391, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.9686601161956787, "rewards/margins": 0.7941542863845825, "rewards/rejected": 2.1745057106018066, "step": 48870 }, { "epoch": 2.2693718371326432, "grad_norm": 152.92518615722656, "learning_rate": 1.2194933221907548e-07, "logits/chosen": -18.713058471679688, "logits/rejected": 
-17.944171905517578, "logps/chosen": -398.2002258300781, "logps/rejected": -365.18011474609375, "loss": 0.457, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.532910346984863, "rewards/margins": 2.42421817779541, "rewards/rejected": 2.108691930770874, "step": 48880 }, { "epoch": 2.269836111240076, "grad_norm": 0.0454682856798172, "learning_rate": 1.2187195320116996e-07, "logits/chosen": -18.729093551635742, "logits/rejected": -17.73213005065918, "logps/chosen": -427.00634765625, "logps/rejected": -351.59002685546875, "loss": 0.3596, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.9132602214813232, "rewards/margins": 2.249533176422119, "rewards/rejected": 1.6637274026870728, "step": 48890 }, { "epoch": 2.270300385347509, "grad_norm": 25.589824676513672, "learning_rate": 1.2179457418326445e-07, "logits/chosen": -18.188274383544922, "logits/rejected": -17.327693939208984, "logps/chosen": -436.78985595703125, "logps/rejected": -312.0946960449219, "loss": 1.0151, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.3596577644348145, "rewards/margins": 1.3148235082626343, "rewards/rejected": 2.0448343753814697, "step": 48900 }, { "epoch": 2.270764659454942, "grad_norm": 1.164022445678711, "learning_rate": 1.2171719516535896e-07, "logits/chosen": -19.02567481994629, "logits/rejected": -17.838369369506836, "logps/chosen": -342.6036071777344, "logps/rejected": -280.8992614746094, "loss": 0.7759, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.1767425537109375, "rewards/margins": 1.4738513231277466, "rewards/rejected": 1.7028915882110596, "step": 48910 }, { "epoch": 2.271228933562375, "grad_norm": 196.373046875, "learning_rate": 1.2163981614745344e-07, "logits/chosen": -18.925752639770508, "logits/rejected": -18.402976989746094, "logps/chosen": -283.5876770019531, "logps/rejected": -242.9748992919922, "loss": 0.9181, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.5365688800811768, "rewards/margins": 
0.7404434084892273, "rewards/rejected": 1.7961254119873047, "step": 48920 }, { "epoch": 2.2716932076698084, "grad_norm": 47.768245697021484, "learning_rate": 1.2156243712954796e-07, "logits/chosen": -20.027502059936523, "logits/rejected": -19.00118637084961, "logps/chosen": -391.12896728515625, "logps/rejected": -259.02099609375, "loss": 0.6625, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.369302749633789, "rewards/margins": 1.3001601696014404, "rewards/rejected": 2.0691428184509277, "step": 48930 }, { "epoch": 2.272157481777241, "grad_norm": 78.41287231445312, "learning_rate": 1.2148505811164244e-07, "logits/chosen": -18.641464233398438, "logits/rejected": -18.004535675048828, "logps/chosen": -393.29412841796875, "logps/rejected": -316.53985595703125, "loss": 0.4688, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.5648293495178223, "rewards/margins": 0.9220815896987915, "rewards/rejected": 1.6427481174468994, "step": 48940 }, { "epoch": 2.2726217558846744, "grad_norm": 68.64684295654297, "learning_rate": 1.2140767909373693e-07, "logits/chosen": -19.17037582397461, "logits/rejected": -19.020044326782227, "logps/chosen": -367.13336181640625, "logps/rejected": -280.71612548828125, "loss": 0.584, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9175338745117188, "rewards/margins": 0.7424816489219666, "rewards/rejected": 2.1750519275665283, "step": 48950 }, { "epoch": 2.273086029992107, "grad_norm": 249.23452758789062, "learning_rate": 1.2133030007583144e-07, "logits/chosen": -19.398530960083008, "logits/rejected": -18.5330753326416, "logps/chosen": -394.6222229003906, "logps/rejected": -348.5527648925781, "loss": 0.4734, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.23302960395813, "rewards/margins": 1.0894190073013306, "rewards/rejected": 2.1436104774475098, "step": 48960 }, { "epoch": 2.2735503040995404, "grad_norm": 75.1928482055664, "learning_rate": 1.2125292105792592e-07, "logits/chosen": 
-19.055139541625977, "logits/rejected": -18.49027442932129, "logps/chosen": -457.32904052734375, "logps/rejected": -370.78094482421875, "loss": 0.7123, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.7598891258239746, "rewards/margins": 1.2170007228851318, "rewards/rejected": 2.542888641357422, "step": 48970 }, { "epoch": 2.274014578206973, "grad_norm": 220.6499786376953, "learning_rate": 1.2117554204002043e-07, "logits/chosen": -18.70575523376465, "logits/rejected": -18.922590255737305, "logps/chosen": -314.97247314453125, "logps/rejected": -298.50006103515625, "loss": 1.0394, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.8195669651031494, "rewards/margins": 0.15110114216804504, "rewards/rejected": 2.668466091156006, "step": 48980 }, { "epoch": 2.2744788523144064, "grad_norm": 195.34490966796875, "learning_rate": 1.2109816302211492e-07, "logits/chosen": -20.182212829589844, "logits/rejected": -18.30957794189453, "logps/chosen": -297.87261962890625, "logps/rejected": -238.30154418945312, "loss": 0.423, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.6214680671691895, "rewards/margins": 1.6330595016479492, "rewards/rejected": 0.9884087443351746, "step": 48990 }, { "epoch": 2.2749431264218396, "grad_norm": 179.62142944335938, "learning_rate": 1.210207840042094e-07, "logits/chosen": -18.927034378051758, "logits/rejected": -18.71286392211914, "logps/chosen": -347.78338623046875, "logps/rejected": -303.5568542480469, "loss": 0.6303, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.759366989135742, "rewards/margins": 0.9744014739990234, "rewards/rejected": 1.7849657535552979, "step": 49000 }, { "epoch": 2.2754074005292724, "grad_norm": 90.81837463378906, "learning_rate": 1.2094340498630391e-07, "logits/chosen": -19.272388458251953, "logits/rejected": -18.51251220703125, "logps/chosen": -445.3212890625, "logps/rejected": -294.65875244140625, "loss": 0.6517, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 
4.707705497741699, "rewards/margins": 2.324410915374756, "rewards/rejected": 2.3832945823669434, "step": 49010 }, { "epoch": 2.2758716746367056, "grad_norm": 111.83588409423828, "learning_rate": 1.208660259683984e-07, "logits/chosen": -19.321857452392578, "logits/rejected": -18.39084243774414, "logps/chosen": -413.44976806640625, "logps/rejected": -293.2261657714844, "loss": 0.6809, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.615910291671753, "rewards/margins": 1.633679986000061, "rewards/rejected": 1.9822304248809814, "step": 49020 }, { "epoch": 2.2763359487441384, "grad_norm": 66.90518188476562, "learning_rate": 1.207886469504929e-07, "logits/chosen": -19.499401092529297, "logits/rejected": -19.283512115478516, "logps/chosen": -344.11968994140625, "logps/rejected": -291.91021728515625, "loss": 0.7893, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.952091693878174, "rewards/margins": 1.0293607711791992, "rewards/rejected": 1.9227313995361328, "step": 49030 }, { "epoch": 2.2768002228515716, "grad_norm": 135.45716857910156, "learning_rate": 1.207112679325874e-07, "logits/chosen": -17.762290954589844, "logits/rejected": -17.52490234375, "logps/chosen": -305.41070556640625, "logps/rejected": -275.50616455078125, "loss": 0.9255, "rewards/accuracies": 0.5, "rewards/chosen": 1.4667223691940308, "rewards/margins": 0.5416664481163025, "rewards/rejected": 0.9250558614730835, "step": 49040 }, { "epoch": 2.2772644969590043, "grad_norm": 91.10179901123047, "learning_rate": 1.2063388891468188e-07, "logits/chosen": -19.059757232666016, "logits/rejected": -17.430927276611328, "logps/chosen": -460.1766052246094, "logps/rejected": -288.46630859375, "loss": 0.2432, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.979816436767578, "rewards/margins": 2.5152792930603027, "rewards/rejected": 1.4645369052886963, "step": 49050 }, { "epoch": 2.2777287710664376, "grad_norm": 21.811296463012695, "learning_rate": 1.205565098967764e-07, 
"logits/chosen": -19.010812759399414, "logits/rejected": -18.664897918701172, "logps/chosen": -341.5997314453125, "logps/rejected": -311.4327392578125, "loss": 0.2929, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.216978073120117, "rewards/margins": 1.4971050024032593, "rewards/rejected": 1.719873070716858, "step": 49060 }, { "epoch": 2.278193045173871, "grad_norm": 9.909829139709473, "learning_rate": 1.2047913087887088e-07, "logits/chosen": -19.479022979736328, "logits/rejected": -18.021665573120117, "logps/chosen": -430.0882263183594, "logps/rejected": -337.81817626953125, "loss": 0.9247, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.6444783210754395, "rewards/margins": 0.6032081842422485, "rewards/rejected": 3.0412702560424805, "step": 49070 }, { "epoch": 2.2786573192813036, "grad_norm": 16.1317195892334, "learning_rate": 1.204017518609654e-07, "logits/chosen": -19.78021240234375, "logits/rejected": -18.69063377380371, "logps/chosen": -401.68438720703125, "logps/rejected": -376.0710754394531, "loss": 0.6917, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.105236530303955, "rewards/margins": 1.7856838703155518, "rewards/rejected": 2.3195528984069824, "step": 49080 }, { "epoch": 2.2791215933887368, "grad_norm": 39.080745697021484, "learning_rate": 1.2032437284305987e-07, "logits/chosen": -19.364185333251953, "logits/rejected": -18.295143127441406, "logps/chosen": -350.16650390625, "logps/rejected": -260.2926025390625, "loss": 0.5942, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.7121453285217285, "rewards/margins": 1.6188606023788452, "rewards/rejected": 2.093285083770752, "step": 49090 }, { "epoch": 2.27958586749617, "grad_norm": 165.87481689453125, "learning_rate": 1.2024699382515436e-07, "logits/chosen": -19.29070281982422, "logits/rejected": -18.922245025634766, "logps/chosen": -278.2463684082031, "logps/rejected": -272.3848876953125, "loss": 0.8102, "rewards/accuracies": 0.6000000238418579, 
"rewards/chosen": 1.8860149383544922, "rewards/margins": 0.6791423559188843, "rewards/rejected": 1.2068727016448975, "step": 49100 }, { "epoch": 2.2800501416036028, "grad_norm": null, "learning_rate": 1.201773527090394e-07, "logits/chosen": -18.658790588378906, "logits/rejected": -18.900291442871094, "logps/chosen": -410.27001953125, "logps/rejected": -370.41912841796875, "loss": 0.6429, "rewards/accuracies": 0.5, "rewards/chosen": 3.9257240295410156, "rewards/margins": 0.9142335057258606, "rewards/rejected": 3.011490821838379, "step": 49110 }, { "epoch": 2.280514415711036, "grad_norm": 52.0681266784668, "learning_rate": 1.2009997369113392e-07, "logits/chosen": -19.145885467529297, "logits/rejected": -16.966922760009766, "logps/chosen": -462.6380920410156, "logps/rejected": -220.85986328125, "loss": 0.1523, "rewards/accuracies": 1.0, "rewards/chosen": 4.339855670928955, "rewards/margins": 3.0790412425994873, "rewards/rejected": 1.2608143091201782, "step": 49120 }, { "epoch": 2.2809786898184687, "grad_norm": 73.81060791015625, "learning_rate": 1.200225946732284e-07, "logits/chosen": -18.727046966552734, "logits/rejected": -17.521501541137695, "logps/chosen": -468.0960998535156, "logps/rejected": -334.60052490234375, "loss": 0.5543, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.251832485198975, "rewards/margins": 1.6702167987823486, "rewards/rejected": 2.581615924835205, "step": 49130 }, { "epoch": 2.281442963925902, "grad_norm": 73.77495574951172, "learning_rate": 1.199452156553229e-07, "logits/chosen": -20.254175186157227, "logits/rejected": -18.25925064086914, "logps/chosen": -322.6603088378906, "logps/rejected": -282.44903564453125, "loss": 0.5065, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.6896233558654785, "rewards/margins": 1.2257643938064575, "rewards/rejected": 1.4638588428497314, "step": 49140 }, { "epoch": 2.2819072380333347, "grad_norm": 0.38738900423049927, "learning_rate": 1.198678366374174e-07, "logits/chosen": 
-19.78132438659668, "logits/rejected": -19.74307632446289, "logps/chosen": -430.54791259765625, "logps/rejected": -430.7080078125, "loss": 0.8291, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.121673107147217, "rewards/margins": 1.1896586418151855, "rewards/rejected": 2.9320149421691895, "step": 49150 }, { "epoch": 2.282371512140768, "grad_norm": 325.4396667480469, "learning_rate": 1.1979045761951188e-07, "logits/chosen": -18.690319061279297, "logits/rejected": -19.117706298828125, "logps/chosen": -306.7487487792969, "logps/rejected": -341.8228759765625, "loss": 0.8898, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.9792709350585938, "rewards/margins": 0.7994214296340942, "rewards/rejected": 2.179849624633789, "step": 49160 }, { "epoch": 2.282835786248201, "grad_norm": 12.259069442749023, "learning_rate": 1.197130786016064e-07, "logits/chosen": -19.29385757446289, "logits/rejected": -18.320669174194336, "logps/chosen": -413.65234375, "logps/rejected": -323.9017639160156, "loss": 0.4304, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.692547559738159, "rewards/margins": 1.9300276041030884, "rewards/rejected": 1.7625198364257812, "step": 49170 }, { "epoch": 2.283300060355634, "grad_norm": 289.72186279296875, "learning_rate": 1.1963569958370088e-07, "logits/chosen": -18.5949764251709, "logits/rejected": -18.38869857788086, "logps/chosen": -346.2918395996094, "logps/rejected": -315.14312744140625, "loss": 0.8036, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0475988388061523, "rewards/margins": 0.7821947932243347, "rewards/rejected": 2.265404224395752, "step": 49180 }, { "epoch": 2.283764334463067, "grad_norm": 29.78386116027832, "learning_rate": 1.1955832056579536e-07, "logits/chosen": -18.639934539794922, "logits/rejected": -18.635923385620117, "logps/chosen": -363.2157287597656, "logps/rejected": -385.3576965332031, "loss": 0.5587, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 
3.3630359172821045, "rewards/margins": 0.8339546918869019, "rewards/rejected": 2.529081344604492, "step": 49190 }, { "epoch": 2.2842286085705, "grad_norm": 95.48957824707031, "learning_rate": 1.1948094154788988e-07, "logits/chosen": -19.528249740600586, "logits/rejected": -18.573850631713867, "logps/chosen": -394.04510498046875, "logps/rejected": -261.7771911621094, "loss": 0.3871, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.361073970794678, "rewards/margins": 2.3421473503112793, "rewards/rejected": 2.0189268589019775, "step": 49200 }, { "epoch": 2.284692882677933, "grad_norm": 37.230350494384766, "learning_rate": 1.1940356252998436e-07, "logits/chosen": -19.35154151916504, "logits/rejected": -18.978391647338867, "logps/chosen": -386.73895263671875, "logps/rejected": -328.6669616699219, "loss": 0.4353, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.943376064300537, "rewards/margins": 1.6290957927703857, "rewards/rejected": 3.3142802715301514, "step": 49210 }, { "epoch": 2.285157156785366, "grad_norm": 33.52630615234375, "learning_rate": 1.1932618351207887e-07, "logits/chosen": -19.086063385009766, "logits/rejected": -17.98005485534668, "logps/chosen": -477.67449951171875, "logps/rejected": -298.22918701171875, "loss": 0.2174, "rewards/accuracies": 1.0, "rewards/chosen": 3.922593593597412, "rewards/margins": 2.45036244392395, "rewards/rejected": 1.472231388092041, "step": 49220 }, { "epoch": 2.285621430892799, "grad_norm": 55.1295051574707, "learning_rate": 1.1924880449417336e-07, "logits/chosen": -19.371623992919922, "logits/rejected": -19.47340965270996, "logps/chosen": -342.7206726074219, "logps/rejected": -287.8827819824219, "loss": 0.4613, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0362536907196045, "rewards/margins": 1.358202576637268, "rewards/rejected": 1.678051233291626, "step": 49230 }, { "epoch": 2.2860857050002323, "grad_norm": 3.7805819511413574, "learning_rate": 1.1917142547626785e-07, "logits/chosen": 
-19.11871910095215, "logits/rejected": -19.16736602783203, "logps/chosen": -434.8720703125, "logps/rejected": -361.6612243652344, "loss": 1.1054, "rewards/accuracies": 0.5, "rewards/chosen": 3.529799699783325, "rewards/margins": 0.6299239993095398, "rewards/rejected": 2.8998758792877197, "step": 49240 }, { "epoch": 2.286549979107665, "grad_norm": 207.07781982421875, "learning_rate": 1.1909404645836235e-07, "logits/chosen": -18.520580291748047, "logits/rejected": -17.912935256958008, "logps/chosen": -440.3406677246094, "logps/rejected": -609.516845703125, "loss": 1.4701, "rewards/accuracies": 0.5, "rewards/chosen": 3.705038547515869, "rewards/margins": 0.38011109828948975, "rewards/rejected": 3.324927568435669, "step": 49250 }, { "epoch": 2.2870142532150983, "grad_norm": 45.97379684448242, "learning_rate": 1.1901666744045684e-07, "logits/chosen": -19.19329833984375, "logits/rejected": -17.92219352722168, "logps/chosen": -381.67919921875, "logps/rejected": -285.402587890625, "loss": 0.3325, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.7927982807159424, "rewards/margins": 1.8660955429077148, "rewards/rejected": 1.9267024993896484, "step": 49260 }, { "epoch": 2.287478527322531, "grad_norm": 106.95782470703125, "learning_rate": 1.1893928842255134e-07, "logits/chosen": -19.090106964111328, "logits/rejected": -19.179386138916016, "logps/chosen": -347.75457763671875, "logps/rejected": -390.3179626464844, "loss": 0.926, "rewards/accuracies": 0.5, "rewards/chosen": 3.1483993530273438, "rewards/margins": 0.01764211617410183, "rewards/rejected": 3.1307573318481445, "step": 49270 }, { "epoch": 2.2879428014299643, "grad_norm": 2.7168984413146973, "learning_rate": 1.1886190940464583e-07, "logits/chosen": -19.411165237426758, "logits/rejected": -18.233362197875977, "logps/chosen": -530.5300903320312, "logps/rejected": -377.0667419433594, "loss": 0.4631, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.0605010986328125, "rewards/margins": 
2.0862224102020264, "rewards/rejected": 2.974278450012207, "step": 49280 }, { "epoch": 2.288407075537397, "grad_norm": 0.2252519577741623, "learning_rate": 1.1878453038674033e-07, "logits/chosen": -20.11768913269043, "logits/rejected": -18.675718307495117, "logps/chosen": -364.294189453125, "logps/rejected": -251.8408203125, "loss": 0.2503, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.7574527263641357, "rewards/margins": 2.651808261871338, "rewards/rejected": 1.1056445837020874, "step": 49290 }, { "epoch": 2.2888713496448303, "grad_norm": 16.580717086791992, "learning_rate": 1.1870715136883482e-07, "logits/chosen": -18.16810417175293, "logits/rejected": -17.234590530395508, "logps/chosen": -430.88031005859375, "logps/rejected": -308.63714599609375, "loss": 0.7158, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0929384231567383, "rewards/margins": 2.0143918991088867, "rewards/rejected": 1.0785465240478516, "step": 49300 }, { "epoch": 2.2893356237522635, "grad_norm": 16.152685165405273, "learning_rate": 1.1862977235092931e-07, "logits/chosen": -18.378049850463867, "logits/rejected": -17.87647819519043, "logps/chosen": -306.57623291015625, "logps/rejected": -262.34991455078125, "loss": 1.1158, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.947587251663208, "rewards/margins": 0.6011716723442078, "rewards/rejected": 1.346415638923645, "step": 49310 }, { "epoch": 2.2897998978596963, "grad_norm": 28.80752182006836, "learning_rate": 1.1855239333302381e-07, "logits/chosen": -19.04432487487793, "logits/rejected": -19.412668228149414, "logps/chosen": -312.6873474121094, "logps/rejected": -319.3626708984375, "loss": 0.9447, "rewards/accuracies": 0.5, "rewards/chosen": 1.9814090728759766, "rewards/margins": 0.10114939510822296, "rewards/rejected": 1.8802595138549805, "step": 49320 }, { "epoch": 2.2902641719671295, "grad_norm": 61.90559387207031, "learning_rate": 1.1847501431511831e-07, "logits/chosen": -18.690200805664062, 
"logits/rejected": -17.684885025024414, "logps/chosen": -377.18902587890625, "logps/rejected": -289.63897705078125, "loss": 0.437, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0131993293762207, "rewards/margins": 1.3012930154800415, "rewards/rejected": 1.711905837059021, "step": 49330 }, { "epoch": 2.2907284460745623, "grad_norm": 185.82748413085938, "learning_rate": 1.1839763529721281e-07, "logits/chosen": -19.646045684814453, "logits/rejected": -18.241619110107422, "logps/chosen": -442.02508544921875, "logps/rejected": -337.5299072265625, "loss": 0.7533, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.706188917160034, "rewards/margins": 1.1384036540985107, "rewards/rejected": 2.5677852630615234, "step": 49340 }, { "epoch": 2.2911927201819955, "grad_norm": 2.7272279262542725, "learning_rate": 1.183202562793073e-07, "logits/chosen": -18.562183380126953, "logits/rejected": -18.540132522583008, "logps/chosen": -440.5535583496094, "logps/rejected": -418.0370178222656, "loss": 0.8876, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6767449378967285, "rewards/margins": 0.7914038896560669, "rewards/rejected": 2.885341167449951, "step": 49350 }, { "epoch": 2.2916569942894283, "grad_norm": 31.21487045288086, "learning_rate": 1.1824287726140179e-07, "logits/chosen": -19.365449905395508, "logits/rejected": -18.65207290649414, "logps/chosen": -396.2258605957031, "logps/rejected": -285.71380615234375, "loss": 0.5515, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4193062782287598, "rewards/margins": 1.145025372505188, "rewards/rejected": 2.2742810249328613, "step": 49360 }, { "epoch": 2.2921212683968615, "grad_norm": 140.45550537109375, "learning_rate": 1.1816549824349629e-07, "logits/chosen": -18.451400756835938, "logits/rejected": -17.403440475463867, "logps/chosen": -376.8382873535156, "logps/rejected": -264.5853271484375, "loss": 0.9226, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7134366035461426, 
"rewards/margins": 1.5441378355026245, "rewards/rejected": 2.169297933578491, "step": 49370 }, { "epoch": 2.2925855425042947, "grad_norm": 19.650365829467773, "learning_rate": 1.1808811922559079e-07, "logits/chosen": -18.753314971923828, "logits/rejected": -18.752796173095703, "logps/chosen": -456.98419189453125, "logps/rejected": -446.11724853515625, "loss": 0.8839, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.7136096954345703, "rewards/margins": 0.4080325961112976, "rewards/rejected": 3.305577039718628, "step": 49380 }, { "epoch": 2.2930498166117275, "grad_norm": 27.73661994934082, "learning_rate": 1.1801074020768529e-07, "logits/chosen": -19.727779388427734, "logits/rejected": -19.11618423461914, "logps/chosen": -344.4054870605469, "logps/rejected": -317.68084716796875, "loss": 0.8304, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9801299571990967, "rewards/margins": 0.8195106387138367, "rewards/rejected": 2.1606192588806152, "step": 49390 }, { "epoch": 2.2935140907191607, "grad_norm": 0.7171890735626221, "learning_rate": 1.1793336118977977e-07, "logits/chosen": -20.045936584472656, "logits/rejected": -18.99074935913086, "logps/chosen": -433.49090576171875, "logps/rejected": -342.785400390625, "loss": 0.4182, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.7208142280578613, "rewards/margins": 1.6230342388153076, "rewards/rejected": 2.0977797508239746, "step": 49400 }, { "epoch": 2.2939783648265935, "grad_norm": 51.82822036743164, "learning_rate": 1.1785598217187427e-07, "logits/chosen": -19.56187629699707, "logits/rejected": -18.79109764099121, "logps/chosen": -388.69024658203125, "logps/rejected": -318.7559814453125, "loss": 0.7542, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.9091315269470215, "rewards/margins": 0.586207389831543, "rewards/rejected": 3.3229243755340576, "step": 49410 }, { "epoch": 2.2944426389340267, "grad_norm": 11.940564155578613, "learning_rate": 1.1777860315396875e-07, 
"logits/chosen": -18.006481170654297, "logits/rejected": -17.90756607055664, "logps/chosen": -371.7447204589844, "logps/rejected": -383.8550720214844, "loss": 1.1105, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.0426025390625, "rewards/margins": 0.8522111773490906, "rewards/rejected": 3.1903913021087646, "step": 49420 }, { "epoch": 2.2949069130414594, "grad_norm": 10.479959487915039, "learning_rate": 1.1770122413606326e-07, "logits/chosen": -18.56275177001953, "logits/rejected": -17.368816375732422, "logps/chosen": -383.66351318359375, "logps/rejected": -266.5758972167969, "loss": 0.2094, "rewards/accuracies": 1.0, "rewards/chosen": 3.424773693084717, "rewards/margins": 2.3282341957092285, "rewards/rejected": 1.0965393781661987, "step": 49430 }, { "epoch": 2.2953711871488927, "grad_norm": 2.3841848373413086, "learning_rate": 1.1762384511815776e-07, "logits/chosen": -18.736501693725586, "logits/rejected": -17.33749008178711, "logps/chosen": -395.3234558105469, "logps/rejected": -275.76873779296875, "loss": 0.296, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.606273651123047, "rewards/margins": 2.9411728382110596, "rewards/rejected": 1.6651010513305664, "step": 49440 }, { "epoch": 2.295835461256326, "grad_norm": 269.1985778808594, "learning_rate": 1.1754646610025225e-07, "logits/chosen": -19.36582374572754, "logits/rejected": -18.455921173095703, "logps/chosen": -381.5390625, "logps/rejected": -363.1421813964844, "loss": 1.084, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2620625495910645, "rewards/margins": 0.11960703134536743, "rewards/rejected": 3.142455577850342, "step": 49450 }, { "epoch": 2.2962997353637586, "grad_norm": 34.66826248168945, "learning_rate": 1.1746908708234675e-07, "logits/chosen": -18.608552932739258, "logits/rejected": -18.554927825927734, "logps/chosen": -227.4562530517578, "logps/rejected": -275.1769714355469, "loss": 0.6983, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 
1.8047449588775635, "rewards/margins": 0.48363131284713745, "rewards/rejected": 1.3211135864257812, "step": 49460 }, { "epoch": 2.296764009471192, "grad_norm": 130.64590454101562, "learning_rate": 1.1739170806444123e-07, "logits/chosen": -19.766386032104492, "logits/rejected": -19.2122802734375, "logps/chosen": -415.23406982421875, "logps/rejected": -270.2650146484375, "loss": 1.4199, "rewards/accuracies": 0.5, "rewards/chosen": 3.337338924407959, "rewards/margins": 0.23237808048725128, "rewards/rejected": 3.1049611568450928, "step": 49470 }, { "epoch": 2.2972282835786246, "grad_norm": 125.22200012207031, "learning_rate": 1.1731432904653574e-07, "logits/chosen": -19.651226043701172, "logits/rejected": -19.353708267211914, "logps/chosen": -486.51068115234375, "logps/rejected": -466.034912109375, "loss": 1.3571, "rewards/accuracies": 0.5, "rewards/chosen": 4.570712089538574, "rewards/margins": 0.03465750068426132, "rewards/rejected": 4.536055088043213, "step": 49480 }, { "epoch": 2.297692557686058, "grad_norm": 34.15616989135742, "learning_rate": 1.1723695002863024e-07, "logits/chosen": -18.881013870239258, "logits/rejected": -18.33675193786621, "logps/chosen": -405.0200500488281, "logps/rejected": -300.3785095214844, "loss": 0.5466, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.457829475402832, "rewards/margins": 1.0268770456314087, "rewards/rejected": 1.4309526681900024, "step": 49490 }, { "epoch": 2.298156831793491, "grad_norm": 2.001628875732422, "learning_rate": 1.1715957101072472e-07, "logits/chosen": -18.737506866455078, "logits/rejected": -17.805288314819336, "logps/chosen": -415.8892517089844, "logps/rejected": -389.33154296875, "loss": 0.6332, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.1383256912231445, "rewards/margins": 1.2395892143249512, "rewards/rejected": 2.8987362384796143, "step": 49500 }, { "epoch": 2.298621105900924, "grad_norm": 77.46568298339844, "learning_rate": 1.1708219199281922e-07, "logits/chosen": 
-19.93773651123047, "logits/rejected": -19.94646453857422, "logps/chosen": -270.30419921875, "logps/rejected": -284.7364196777344, "loss": 1.2462, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.789876937866211, "rewards/margins": 0.0981285348534584, "rewards/rejected": 2.6917483806610107, "step": 49510 }, { "epoch": 2.299085380008357, "grad_norm": 24.545263290405273, "learning_rate": 1.1700481297491371e-07, "logits/chosen": -18.726417541503906, "logits/rejected": -18.32331085205078, "logps/chosen": -375.27520751953125, "logps/rejected": -283.8210754394531, "loss": 0.5645, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4687209129333496, "rewards/margins": 1.6040351390838623, "rewards/rejected": 1.864686369895935, "step": 49520 }, { "epoch": 2.29954965411579, "grad_norm": 0.0008203360484912992, "learning_rate": 1.1692743395700822e-07, "logits/chosen": -18.766342163085938, "logits/rejected": -17.50778579711914, "logps/chosen": -347.60400390625, "logps/rejected": -204.32882690429688, "loss": 0.3495, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.283400774002075, "rewards/margins": 2.6049375534057617, "rewards/rejected": 0.6784634590148926, "step": 49530 }, { "epoch": 2.300013928223223, "grad_norm": 26.196077346801758, "learning_rate": 1.168500549391027e-07, "logits/chosen": -18.9074764251709, "logits/rejected": -18.44282341003418, "logps/chosen": -448.4849548339844, "logps/rejected": -392.2050476074219, "loss": 0.7033, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.5705437660217285, "rewards/margins": 1.296934962272644, "rewards/rejected": 3.273608684539795, "step": 49540 }, { "epoch": 2.3004782023306563, "grad_norm": 76.8907699584961, "learning_rate": 1.167726759211972e-07, "logits/chosen": -19.307641983032227, "logits/rejected": -17.365764617919922, "logps/chosen": -424.5565490722656, "logps/rejected": -299.5284729003906, "loss": 0.3547, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 
4.075333118438721, "rewards/margins": 2.536123752593994, "rewards/rejected": 1.539209246635437, "step": 49550 }, { "epoch": 2.300942476438089, "grad_norm": 100.59783935546875, "learning_rate": 1.166952969032917e-07, "logits/chosen": -19.797866821289062, "logits/rejected": -18.152420043945312, "logps/chosen": -349.60162353515625, "logps/rejected": -258.2093200683594, "loss": 0.5712, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.132810354232788, "rewards/margins": 1.5297526121139526, "rewards/rejected": 1.603057861328125, "step": 49560 }, { "epoch": 2.3014067505455222, "grad_norm": 2.125816822052002, "learning_rate": 1.1661791788538618e-07, "logits/chosen": -19.263778686523438, "logits/rejected": -19.12078094482422, "logps/chosen": -422.88092041015625, "logps/rejected": -330.13470458984375, "loss": 0.8022, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.108330249786377, "rewards/margins": 1.1523422002792358, "rewards/rejected": 1.9559879302978516, "step": 49570 }, { "epoch": 2.301871024652955, "grad_norm": 40.080291748046875, "learning_rate": 1.165405388674807e-07, "logits/chosen": -19.201993942260742, "logits/rejected": -18.618257522583008, "logps/chosen": -467.94329833984375, "logps/rejected": -367.263427734375, "loss": 0.2435, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.782109260559082, "rewards/margins": 2.1564676761627197, "rewards/rejected": 2.625641345977783, "step": 49580 }, { "epoch": 2.3023352987603882, "grad_norm": 126.93033599853516, "learning_rate": 1.1646315984957518e-07, "logits/chosen": -19.462114334106445, "logits/rejected": -18.840862274169922, "logps/chosen": -384.84014892578125, "logps/rejected": -348.12762451171875, "loss": 0.8803, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4883689880371094, "rewards/margins": 0.88420170545578, "rewards/rejected": 2.6041674613952637, "step": 49590 }, { "epoch": 2.302799572867821, "grad_norm": 42.41307067871094, "learning_rate": 
1.1638578083166968e-07, "logits/chosen": -20.11719512939453, "logits/rejected": -19.520313262939453, "logps/chosen": -434.5738830566406, "logps/rejected": -414.17608642578125, "loss": 0.7098, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.493582248687744, "rewards/margins": 0.46642884612083435, "rewards/rejected": 4.027153015136719, "step": 49600 }, { "epoch": 2.303263846975254, "grad_norm": 46.74150085449219, "learning_rate": 1.1630840181376416e-07, "logits/chosen": -19.361799240112305, "logits/rejected": -17.57949447631836, "logps/chosen": -496.0440368652344, "logps/rejected": -277.83837890625, "loss": 0.1687, "rewards/accuracies": 1.0, "rewards/chosen": 5.109863758087158, "rewards/margins": 2.615687608718872, "rewards/rejected": 2.4941763877868652, "step": 49610 }, { "epoch": 2.3037281210826874, "grad_norm": 55.93336486816406, "learning_rate": 1.1623102279585867e-07, "logits/chosen": -18.716569900512695, "logits/rejected": -18.275707244873047, "logps/chosen": -330.86639404296875, "logps/rejected": -302.8436584472656, "loss": 1.2219, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.580716133117676, "rewards/margins": 0.4130930006504059, "rewards/rejected": 2.1676230430603027, "step": 49620 }, { "epoch": 2.30419239519012, "grad_norm": 86.19282531738281, "learning_rate": 1.1615364377795317e-07, "logits/chosen": -18.399555206298828, "logits/rejected": -17.102598190307617, "logps/chosen": -396.0342712402344, "logps/rejected": -231.7109832763672, "loss": 0.4053, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.8165926933288574, "rewards/margins": 2.506408214569092, "rewards/rejected": 1.3101847171783447, "step": 49630 }, { "epoch": 2.3046566692975534, "grad_norm": 70.81926727294922, "learning_rate": 1.1607626476004766e-07, "logits/chosen": -19.39370346069336, "logits/rejected": -18.214588165283203, "logps/chosen": -431.2210388183594, "logps/rejected": -358.18133544921875, "loss": 0.545, "rewards/accuracies": 0.6000000238418579, 
"rewards/chosen": 3.1791229248046875, "rewards/margins": 1.585023045539856, "rewards/rejected": 1.5941003561019897, "step": 49640 }, { "epoch": 2.305120943404986, "grad_norm": 41.01969528198242, "learning_rate": 1.1599888574214216e-07, "logits/chosen": -18.481952667236328, "logits/rejected": -17.91059684753418, "logps/chosen": -306.2436828613281, "logps/rejected": -275.2229919433594, "loss": 0.4615, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.731400966644287, "rewards/margins": 1.3192861080169678, "rewards/rejected": 2.4121153354644775, "step": 49650 }, { "epoch": 2.3055852175124194, "grad_norm": 17.99207305908203, "learning_rate": 1.1592150672423664e-07, "logits/chosen": -19.301067352294922, "logits/rejected": -18.993040084838867, "logps/chosen": -308.2817687988281, "logps/rejected": -347.41534423828125, "loss": 0.889, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.436295986175537, "rewards/margins": 0.37613680958747864, "rewards/rejected": 2.060159206390381, "step": 49660 }, { "epoch": 2.306049491619852, "grad_norm": 147.3331756591797, "learning_rate": 1.1584412770633115e-07, "logits/chosen": -18.72165870666504, "logits/rejected": -18.116037368774414, "logps/chosen": -472.2943420410156, "logps/rejected": -394.4642028808594, "loss": 0.8699, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.8971550464630127, "rewards/margins": 0.6475347280502319, "rewards/rejected": 3.2496204376220703, "step": 49670 }, { "epoch": 2.3065137657272854, "grad_norm": 42.515045166015625, "learning_rate": 1.1576674868842565e-07, "logits/chosen": -19.77475357055664, "logits/rejected": -18.672359466552734, "logps/chosen": -309.9624328613281, "logps/rejected": -228.4188995361328, "loss": 0.3527, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.525583267211914, "rewards/margins": 1.4262748956680298, "rewards/rejected": 2.099308490753174, "step": 49680 }, { "epoch": 2.3069780398347186, "grad_norm": 0.035052549093961716, "learning_rate": 
1.1568936967052013e-07, "logits/chosen": -19.165714263916016, "logits/rejected": -18.864591598510742, "logps/chosen": -382.94744873046875, "logps/rejected": -386.4185485839844, "loss": 1.077, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.019068241119385, "rewards/margins": 0.6055030822753906, "rewards/rejected": 3.413565158843994, "step": 49690 }, { "epoch": 2.3074423139421514, "grad_norm": 0.938563883304596, "learning_rate": 1.1561199065261463e-07, "logits/chosen": -18.11653709411621, "logits/rejected": -17.19829750061035, "logps/chosen": -449.91522216796875, "logps/rejected": -281.1397399902344, "loss": 0.6353, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.646148204803467, "rewards/margins": 3.2055046558380127, "rewards/rejected": 1.4406445026397705, "step": 49700 }, { "epoch": 2.3079065880495846, "grad_norm": 80.56715393066406, "learning_rate": 1.1553461163470912e-07, "logits/chosen": -19.46274757385254, "logits/rejected": -19.99014663696289, "logps/chosen": -363.45001220703125, "logps/rejected": -376.90350341796875, "loss": 1.1742, "rewards/accuracies": 0.5, "rewards/chosen": 3.093975305557251, "rewards/margins": -0.19705459475517273, "rewards/rejected": 3.291029691696167, "step": 49710 }, { "epoch": 2.3083708621570174, "grad_norm": 24.238264083862305, "learning_rate": 1.1545723261680363e-07, "logits/chosen": -19.336200714111328, "logits/rejected": -17.857513427734375, "logps/chosen": -442.79217529296875, "logps/rejected": -307.18548583984375, "loss": 0.3383, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.298482418060303, "rewards/margins": 2.4288625717163086, "rewards/rejected": 1.8696205615997314, "step": 49720 }, { "epoch": 2.3088351362644506, "grad_norm": 5.899300575256348, "learning_rate": 1.1537985359889811e-07, "logits/chosen": -19.33365249633789, "logits/rejected": -17.549938201904297, "logps/chosen": -428.826416015625, "logps/rejected": -310.0191650390625, "loss": 0.439, "rewards/accuracies": 
0.800000011920929, "rewards/chosen": 3.9118599891662598, "rewards/margins": 1.9431995153427124, "rewards/rejected": 1.9686603546142578, "step": 49730 }, { "epoch": 2.3092994103718834, "grad_norm": 54.06083297729492, "learning_rate": 1.1530247458099261e-07, "logits/chosen": -20.722543716430664, "logits/rejected": -19.929025650024414, "logps/chosen": -356.5089416503906, "logps/rejected": -347.27593994140625, "loss": 0.5738, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.8616302013397217, "rewards/margins": 1.3066288232803345, "rewards/rejected": 2.5550010204315186, "step": 49740 }, { "epoch": 2.3097636844793166, "grad_norm": 45.569435119628906, "learning_rate": 1.1522509556308711e-07, "logits/chosen": -19.063684463500977, "logits/rejected": -17.39714241027832, "logps/chosen": -557.3013916015625, "logps/rejected": -385.96533203125, "loss": 0.1314, "rewards/accuracies": 1.0, "rewards/chosen": 5.3130364418029785, "rewards/margins": 3.18180513381958, "rewards/rejected": 2.1312315464019775, "step": 49750 }, { "epoch": 2.31022795858675, "grad_norm": 12.176347732543945, "learning_rate": 1.151477165451816e-07, "logits/chosen": -19.349712371826172, "logits/rejected": -18.138086318969727, "logps/chosen": -436.77593994140625, "logps/rejected": -254.75405883789062, "loss": 0.3613, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.8407299518585205, "rewards/margins": 2.1523799896240234, "rewards/rejected": 1.688349962234497, "step": 49760 }, { "epoch": 2.3106922326941826, "grad_norm": 36.44378662109375, "learning_rate": 1.150703375272761e-07, "logits/chosen": -20.337413787841797, "logits/rejected": -19.015764236450195, "logps/chosen": -364.9980773925781, "logps/rejected": -229.1834259033203, "loss": 0.2658, "rewards/accuracies": 1.0, "rewards/chosen": 4.410312175750732, "rewards/margins": 2.0440897941589355, "rewards/rejected": 2.3662216663360596, "step": 49770 }, { "epoch": 2.3111565068016158, "grad_norm": 12.467851638793945, "learning_rate": 
1.1499295850937059e-07, "logits/chosen": -20.992692947387695, "logits/rejected": -20.112598419189453, "logps/chosen": -381.237060546875, "logps/rejected": -352.16156005859375, "loss": 0.4818, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.822924613952637, "rewards/margins": 1.8640148639678955, "rewards/rejected": 2.958909511566162, "step": 49780 }, { "epoch": 2.3116207809090485, "grad_norm": 37.9397087097168, "learning_rate": 1.1491557949146509e-07, "logits/chosen": -19.672367095947266, "logits/rejected": -18.145015716552734, "logps/chosen": -394.36199951171875, "logps/rejected": -275.365234375, "loss": 0.4532, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.3008320331573486, "rewards/margins": 1.7024815082550049, "rewards/rejected": 1.5983504056930542, "step": 49790 }, { "epoch": 2.3120850550164818, "grad_norm": 8.872573852539062, "learning_rate": 1.1483820047355959e-07, "logits/chosen": -18.557884216308594, "logits/rejected": -16.878812789916992, "logps/chosen": -406.4165954589844, "logps/rejected": -251.1995849609375, "loss": 0.3493, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.384111404418945, "rewards/margins": 3.2085113525390625, "rewards/rejected": 1.1756006479263306, "step": 49800 }, { "epoch": 2.3125493291239145, "grad_norm": 2.6486477851867676, "learning_rate": 1.1476082145565407e-07, "logits/chosen": -19.455900192260742, "logits/rejected": -19.269145965576172, "logps/chosen": -427.7032165527344, "logps/rejected": -379.904296875, "loss": 0.8943, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.9564738273620605, "rewards/margins": 1.1718717813491821, "rewards/rejected": 2.784602642059326, "step": 49810 }, { "epoch": 2.3130136032313477, "grad_norm": 171.6950225830078, "learning_rate": 1.1468344243774858e-07, "logits/chosen": -19.581497192382812, "logits/rejected": -18.57392120361328, "logps/chosen": -341.2920227050781, "logps/rejected": -279.02899169921875, "loss": 0.4153, "rewards/accuracies": 
0.800000011920929, "rewards/chosen": 3.8501486778259277, "rewards/margins": 1.2970216274261475, "rewards/rejected": 2.5531272888183594, "step": 49820 }, { "epoch": 2.313477877338781, "grad_norm": 0.024575449526309967, "learning_rate": 1.1460606341984307e-07, "logits/chosen": -19.53810691833496, "logits/rejected": -18.656108856201172, "logps/chosen": -342.6286926269531, "logps/rejected": -251.05075073242188, "loss": 0.4665, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.667235851287842, "rewards/margins": 2.0317864418029785, "rewards/rejected": 0.6354494690895081, "step": 49830 }, { "epoch": 2.3139421514462137, "grad_norm": 113.741455078125, "learning_rate": 1.1452868440193757e-07, "logits/chosen": -19.012582778930664, "logits/rejected": -18.367416381835938, "logps/chosen": -430.0054626464844, "logps/rejected": -430.3121643066406, "loss": 1.0403, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.0372586250305176, "rewards/margins": 0.379576712846756, "rewards/rejected": 2.657681941986084, "step": 49840 }, { "epoch": 2.314406425553647, "grad_norm": 1.0312341451644897, "learning_rate": 1.1445130538403205e-07, "logits/chosen": -20.541736602783203, "logits/rejected": -19.087141036987305, "logps/chosen": -379.91339111328125, "logps/rejected": -234.69808959960938, "loss": 0.2281, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.251641273498535, "rewards/margins": 2.6417717933654785, "rewards/rejected": 1.609870195388794, "step": 49850 }, { "epoch": 2.3148706996610797, "grad_norm": 145.77230834960938, "learning_rate": 1.1437392636612655e-07, "logits/chosen": -19.22061538696289, "logits/rejected": -18.579879760742188, "logps/chosen": -476.09375, "logps/rejected": -439.29425048828125, "loss": 0.7981, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.920447826385498, "rewards/margins": 1.08953857421875, "rewards/rejected": 2.8309097290039062, "step": 49860 }, { "epoch": 2.315334973768513, "grad_norm": 10.90469741821289, 
"learning_rate": 1.1429654734822106e-07, "logits/chosen": -19.033462524414062, "logits/rejected": -18.059471130371094, "logps/chosen": -286.30792236328125, "logps/rejected": -155.00027465820312, "loss": 0.4688, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.215209484100342, "rewards/margins": 1.897762656211853, "rewards/rejected": 0.3174469769001007, "step": 49870 }, { "epoch": 2.3157992478759457, "grad_norm": 216.00315856933594, "learning_rate": 1.1421916833031555e-07, "logits/chosen": -19.252017974853516, "logits/rejected": -18.59964370727539, "logps/chosen": -397.6937255859375, "logps/rejected": -357.8157958984375, "loss": 0.7957, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.0953216552734375, "rewards/margins": 0.4159668982028961, "rewards/rejected": 2.679354429244995, "step": 49880 }, { "epoch": 2.316263521983379, "grad_norm": 64.63367462158203, "learning_rate": 1.1414178931241004e-07, "logits/chosen": -19.25421905517578, "logits/rejected": -18.860820770263672, "logps/chosen": -385.87353515625, "logps/rejected": -283.91949462890625, "loss": 0.9476, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.301177978515625, "rewards/margins": 0.8845831751823425, "rewards/rejected": 2.416594982147217, "step": 49890 }, { "epoch": 2.316727796090812, "grad_norm": 109.05713653564453, "learning_rate": 1.1406441029450453e-07, "logits/chosen": -18.904306411743164, "logits/rejected": -18.151397705078125, "logps/chosen": -453.59844970703125, "logps/rejected": -432.3907165527344, "loss": 1.0201, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.462475299835205, "rewards/margins": 0.29237011075019836, "rewards/rejected": 3.17010498046875, "step": 49900 }, { "epoch": 2.317192070198245, "grad_norm": 5.770057678222656, "learning_rate": 1.1398703127659904e-07, "logits/chosen": -18.326608657836914, "logits/rejected": -17.414623260498047, "logps/chosen": -418.79071044921875, "logps/rejected": -281.2838134765625, "loss": 0.2256, 
"rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.2784552574157715, "rewards/margins": 2.9160144329071045, "rewards/rejected": 1.3624413013458252, "step": 49910 }, { "epoch": 2.317656344305678, "grad_norm": 0.48018741607666016, "learning_rate": 1.1390965225869354e-07, "logits/chosen": -19.34076690673828, "logits/rejected": -17.485761642456055, "logps/chosen": -370.7799072265625, "logps/rejected": -243.00576782226562, "loss": 0.2165, "rewards/accuracies": 1.0, "rewards/chosen": 3.8175406455993652, "rewards/margins": 2.550258159637451, "rewards/rejected": 1.2672823667526245, "step": 49920 }, { "epoch": 2.3181206184131113, "grad_norm": 14.405365943908691, "learning_rate": 1.1383227324078802e-07, "logits/chosen": -18.38602638244629, "logits/rejected": -17.530742645263672, "logps/chosen": -392.5121154785156, "logps/rejected": -285.7557678222656, "loss": 0.5192, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.3224360942840576, "rewards/margins": 1.7453734874725342, "rewards/rejected": 1.577062726020813, "step": 49930 }, { "epoch": 2.318584892520544, "grad_norm": 37.525882720947266, "learning_rate": 1.1375489422288252e-07, "logits/chosen": -18.249588012695312, "logits/rejected": -17.560413360595703, "logps/chosen": -459.68768310546875, "logps/rejected": -383.5227966308594, "loss": 0.5389, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0017590522766113, "rewards/margins": 0.7641502022743225, "rewards/rejected": 2.2376084327697754, "step": 49940 }, { "epoch": 2.3190491666279773, "grad_norm": 28.148530960083008, "learning_rate": 1.13677515204977e-07, "logits/chosen": -19.117263793945312, "logits/rejected": -18.40978240966797, "logps/chosen": -454.5966796875, "logps/rejected": -344.27496337890625, "loss": 0.6219, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.711355209350586, "rewards/margins": 1.467015266418457, "rewards/rejected": 3.244340419769287, "step": 49950 }, { "epoch": 2.31951344073541, "grad_norm": 
44.57265090942383, "learning_rate": 1.1360013618707152e-07, "logits/chosen": -19.4714298248291, "logits/rejected": -18.87410545349121, "logps/chosen": -392.7992858886719, "logps/rejected": -294.9001770019531, "loss": 0.81, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.201857805252075, "rewards/margins": 0.4030976891517639, "rewards/rejected": 1.798760175704956, "step": 49960 }, { "epoch": 2.3199777148428433, "grad_norm": 4.914966106414795, "learning_rate": 1.13522757169166e-07, "logits/chosen": -18.318737030029297, "logits/rejected": -16.786548614501953, "logps/chosen": -365.6429138183594, "logps/rejected": -214.0236053466797, "loss": 0.3859, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.8665804862976074, "rewards/margins": 2.095923900604248, "rewards/rejected": 0.7706564664840698, "step": 49970 }, { "epoch": 2.320441988950276, "grad_norm": 11.008932113647461, "learning_rate": 1.134453781512605e-07, "logits/chosen": -19.58452796936035, "logits/rejected": -19.22124481201172, "logps/chosen": -469.3700256347656, "logps/rejected": -454.50341796875, "loss": 0.6265, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.3787617683410645, "rewards/margins": 0.6823018789291382, "rewards/rejected": 3.6964588165283203, "step": 49980 }, { "epoch": 2.3209062630577093, "grad_norm": 67.3553695678711, "learning_rate": 1.13367999133355e-07, "logits/chosen": -19.19979476928711, "logits/rejected": -18.08270835876465, "logps/chosen": -428.77947998046875, "logps/rejected": -274.1176452636719, "loss": 0.3859, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.951674699783325, "rewards/margins": 2.2839388847351074, "rewards/rejected": 1.6677356958389282, "step": 49990 }, { "epoch": 2.3213705371651425, "grad_norm": 85.32784271240234, "learning_rate": 1.1329062011544948e-07, "logits/chosen": -19.08176040649414, "logits/rejected": -18.7038516998291, "logps/chosen": -382.85205078125, "logps/rejected": -298.134521484375, "loss": 0.7643, 
"rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6512560844421387, "rewards/margins": 0.6200577616691589, "rewards/rejected": 3.031198263168335, "step": 50000 }, { "epoch": 2.3218348112725753, "grad_norm": 11.577462196350098, "learning_rate": 1.1321324109754399e-07, "logits/chosen": -19.058815002441406, "logits/rejected": -18.57894515991211, "logps/chosen": -392.5751953125, "logps/rejected": -296.98736572265625, "loss": 0.5974, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.35955810546875, "rewards/margins": 1.4019088745117188, "rewards/rejected": 1.9576489925384521, "step": 50010 }, { "epoch": 2.3222990853800085, "grad_norm": 0.663914680480957, "learning_rate": 1.1313586207963848e-07, "logits/chosen": -20.213943481445312, "logits/rejected": -19.28788948059082, "logps/chosen": -452.9325256347656, "logps/rejected": -323.16961669921875, "loss": 0.7768, "rewards/accuracies": 0.5, "rewards/chosen": 3.9399619102478027, "rewards/margins": 1.3645579814910889, "rewards/rejected": 2.575404167175293, "step": 50020 }, { "epoch": 2.3227633594874413, "grad_norm": 14.391131401062012, "learning_rate": 1.1305848306173298e-07, "logits/chosen": -19.179927825927734, "logits/rejected": -18.559972763061523, "logps/chosen": -417.716064453125, "logps/rejected": -381.34661865234375, "loss": 0.357, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.4413490295410156, "rewards/margins": 1.1182835102081299, "rewards/rejected": 2.3230655193328857, "step": 50030 }, { "epoch": 2.3232276335948745, "grad_norm": 54.142181396484375, "learning_rate": 1.1298110404382747e-07, "logits/chosen": -18.957775115966797, "logits/rejected": -18.243165969848633, "logps/chosen": -415.6537170410156, "logps/rejected": -344.1680603027344, "loss": 0.463, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.469444990158081, "rewards/margins": 1.520716667175293, "rewards/rejected": 1.9487285614013672, "step": 50040 }, { "epoch": 2.3236919077023073, "grad_norm": 
247.4488525390625, "learning_rate": 1.1290372502592196e-07, "logits/chosen": -19.207256317138672, "logits/rejected": -18.734294891357422, "logps/chosen": -453.2469787597656, "logps/rejected": -326.6224670410156, "loss": 0.4129, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.258849143981934, "rewards/margins": 2.193618059158325, "rewards/rejected": 2.0652315616607666, "step": 50050 }, { "epoch": 2.3241561818097405, "grad_norm": 49.96281433105469, "learning_rate": 1.1282634600801647e-07, "logits/chosen": -18.64442253112793, "logits/rejected": -18.78622055053711, "logps/chosen": -346.07977294921875, "logps/rejected": -357.95318603515625, "loss": 0.8673, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.5124430656433105, "rewards/margins": 0.23310931026935577, "rewards/rejected": 2.2793338298797607, "step": 50060 }, { "epoch": 2.3246204559171737, "grad_norm": 319.1328430175781, "learning_rate": 1.1274896699011096e-07, "logits/chosen": -19.144775390625, "logits/rejected": -17.71942138671875, "logps/chosen": -322.5876770019531, "logps/rejected": -221.12240600585938, "loss": 0.4301, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4518322944641113, "rewards/margins": 2.3418972492218018, "rewards/rejected": 1.1099350452423096, "step": 50070 }, { "epoch": 2.3250847300246065, "grad_norm": 7.527309894561768, "learning_rate": 1.1267158797220545e-07, "logits/chosen": -18.622905731201172, "logits/rejected": -18.69915199279785, "logps/chosen": -377.7876281738281, "logps/rejected": -392.4043273925781, "loss": 1.4289, "rewards/accuracies": 0.5, "rewards/chosen": 3.1018013954162598, "rewards/margins": -0.06106722354888916, "rewards/rejected": 3.1628689765930176, "step": 50080 }, { "epoch": 2.3255490041320397, "grad_norm": 33.46430587768555, "learning_rate": 1.1259420895429994e-07, "logits/chosen": -18.14204978942871, "logits/rejected": -19.01332664489746, "logps/chosen": -428.66009521484375, "logps/rejected": -434.5668029785156, "loss": 1.4871, 
"rewards/accuracies": 0.20000000298023224, "rewards/chosen": 3.436164140701294, "rewards/margins": -0.9216070175170898, "rewards/rejected": 4.357771396636963, "step": 50090 }, { "epoch": 2.3260132782394725, "grad_norm": 1.7354042530059814, "learning_rate": 1.1251682993639444e-07, "logits/chosen": -18.721406936645508, "logits/rejected": -17.083879470825195, "logps/chosen": -419.74310302734375, "logps/rejected": -256.94000244140625, "loss": 0.4359, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.439016819000244, "rewards/margins": 2.6963722705841064, "rewards/rejected": 1.7426445484161377, "step": 50100 }, { "epoch": 2.3264775523469057, "grad_norm": 143.64280700683594, "learning_rate": 1.1243945091848895e-07, "logits/chosen": -19.147464752197266, "logits/rejected": -18.590364456176758, "logps/chosen": -460.8006896972656, "logps/rejected": -406.79327392578125, "loss": 0.8394, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.362429618835449, "rewards/margins": 1.1002472639083862, "rewards/rejected": 3.2621827125549316, "step": 50110 }, { "epoch": 2.3269418264543384, "grad_norm": 97.7774658203125, "learning_rate": 1.1236207190058343e-07, "logits/chosen": -18.253358840942383, "logits/rejected": -18.366008758544922, "logps/chosen": -337.1078796386719, "logps/rejected": -376.9445495605469, "loss": 0.7075, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.586278200149536, "rewards/margins": 0.8410733938217163, "rewards/rejected": 2.7452049255371094, "step": 50120 }, { "epoch": 2.3274061005617717, "grad_norm": 54.30851745605469, "learning_rate": 1.1228469288267793e-07, "logits/chosen": -19.736072540283203, "logits/rejected": -18.964527130126953, "logps/chosen": -365.45684814453125, "logps/rejected": -282.223876953125, "loss": 0.4228, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.422480583190918, "rewards/margins": 2.5199711322784424, "rewards/rejected": 1.902509331703186, "step": 50130 }, { "epoch": 2.327870374669205, 
"grad_norm": 42.22492980957031, "learning_rate": 1.1220731386477242e-07, "logits/chosen": -18.735715866088867, "logits/rejected": -17.661787033081055, "logps/chosen": -402.9789123535156, "logps/rejected": -267.8605041503906, "loss": 0.4788, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.6470820903778076, "rewards/margins": 1.120774745941162, "rewards/rejected": 1.5263073444366455, "step": 50140 }, { "epoch": 2.3283346487766376, "grad_norm": 65.98057556152344, "learning_rate": 1.1212993484686691e-07, "logits/chosen": -20.609943389892578, "logits/rejected": -19.306446075439453, "logps/chosen": -428.11920166015625, "logps/rejected": -332.07904052734375, "loss": 0.6982, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.3761396408081055, "rewards/margins": 1.6416199207305908, "rewards/rejected": 2.7345199584960938, "step": 50150 }, { "epoch": 2.328798922884071, "grad_norm": 54.87825012207031, "learning_rate": 1.1205255582896142e-07, "logits/chosen": -18.881351470947266, "logits/rejected": -19.481433868408203, "logps/chosen": -252.4002685546875, "logps/rejected": -288.97705078125, "loss": 1.0642, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.0487401485443115, "rewards/margins": -0.03626983240246773, "rewards/rejected": 2.085010051727295, "step": 50160 }, { "epoch": 2.3292631969915036, "grad_norm": 166.5458526611328, "learning_rate": 1.1197517681105591e-07, "logits/chosen": -18.388172149658203, "logits/rejected": -18.260269165039062, "logps/chosen": -452.60394287109375, "logps/rejected": -384.70001220703125, "loss": 0.9244, "rewards/accuracies": 0.5, "rewards/chosen": 2.508233070373535, "rewards/margins": 0.5434675216674805, "rewards/rejected": 1.9647653102874756, "step": 50170 }, { "epoch": 2.329727471098937, "grad_norm": 114.82982635498047, "learning_rate": 1.1189779779315041e-07, "logits/chosen": -19.58678436279297, "logits/rejected": -17.892061233520508, "logps/chosen": -398.341796875, "logps/rejected": -275.8819274902344, 
"loss": 0.5781, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.9830498695373535, "rewards/margins": 1.7826896905899048, "rewards/rejected": 2.200360059738159, "step": 50180 }, { "epoch": 2.3301917452063696, "grad_norm": 3.9931697845458984, "learning_rate": 1.1182041877524489e-07, "logits/chosen": -19.668495178222656, "logits/rejected": -18.907880783081055, "logps/chosen": -417.5418395996094, "logps/rejected": -342.6850280761719, "loss": 0.3914, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.9982807636260986, "rewards/margins": 2.0807557106018066, "rewards/rejected": 1.917525053024292, "step": 50190 }, { "epoch": 2.330656019313803, "grad_norm": 94.46359252929688, "learning_rate": 1.1174303975733939e-07, "logits/chosen": -19.68079948425293, "logits/rejected": -18.467288970947266, "logps/chosen": -400.6124572753906, "logps/rejected": -350.19256591796875, "loss": 0.2965, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.905031681060791, "rewards/margins": 2.4953699111938477, "rewards/rejected": 2.4096617698669434, "step": 50200 }, { "epoch": 2.331120293421236, "grad_norm": 2.030937433242798, "learning_rate": 1.1166566073943389e-07, "logits/chosen": -18.37946891784668, "logits/rejected": -18.236984252929688, "logps/chosen": -328.259033203125, "logps/rejected": -374.89471435546875, "loss": 1.061, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.096611022949219, "rewards/margins": 0.5843722224235535, "rewards/rejected": 3.5122387409210205, "step": 50210 }, { "epoch": 2.331584567528669, "grad_norm": 76.73477935791016, "learning_rate": 1.1158828172152839e-07, "logits/chosen": -19.554346084594727, "logits/rejected": -19.309511184692383, "logps/chosen": -312.60345458984375, "logps/rejected": -315.64520263671875, "loss": 1.079, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.615290403366089, "rewards/margins": -0.15243546664714813, "rewards/rejected": 2.767725706100464, "step": 50220 }, { "epoch": 
2.332048841636102, "grad_norm": 42.787689208984375, "learning_rate": 1.1151090270362288e-07, "logits/chosen": -18.602590560913086, "logits/rejected": -18.52733039855957, "logps/chosen": -356.9593505859375, "logps/rejected": -380.0087890625, "loss": 0.5692, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.857999801635742, "rewards/margins": 0.9756816625595093, "rewards/rejected": 2.8823180198669434, "step": 50230 }, { "epoch": 2.332513115743535, "grad_norm": 31.84331512451172, "learning_rate": 1.1143352368571737e-07, "logits/chosen": -18.307382583618164, "logits/rejected": -18.36689567565918, "logps/chosen": -374.422119140625, "logps/rejected": -384.4783020019531, "loss": 0.7973, "rewards/accuracies": 0.5, "rewards/chosen": 2.466790199279785, "rewards/margins": 0.37342873215675354, "rewards/rejected": 2.0933613777160645, "step": 50240 }, { "epoch": 2.332977389850968, "grad_norm": 98.23603057861328, "learning_rate": 1.1135614466781188e-07, "logits/chosen": -19.43479347229004, "logits/rejected": -18.374217987060547, "logps/chosen": -356.25811767578125, "logps/rejected": -273.1447448730469, "loss": 0.4533, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.4241394996643066, "rewards/margins": 1.9716408252716064, "rewards/rejected": 1.4524987936019897, "step": 50250 }, { "epoch": 2.333441663958401, "grad_norm": 45.8820915222168, "learning_rate": 1.1127876564990637e-07, "logits/chosen": -18.39630699157715, "logits/rejected": -18.08915138244629, "logps/chosen": -454.9638671875, "logps/rejected": -323.8406677246094, "loss": 1.1019, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1128029823303223, "rewards/margins": 0.976751446723938, "rewards/rejected": 2.1360514163970947, "step": 50260 }, { "epoch": 2.333905938065834, "grad_norm": 81.70341491699219, "learning_rate": 1.1120138663200086e-07, "logits/chosen": -19.03196907043457, "logits/rejected": -19.279640197753906, "logps/chosen": -405.7814025878906, "logps/rejected": 
-394.2832336425781, "loss": 0.6666, "rewards/accuracies": 0.5, "rewards/chosen": 3.3861968517303467, "rewards/margins": 1.0764108896255493, "rewards/rejected": 2.309785842895508, "step": 50270 }, { "epoch": 2.3343702121732672, "grad_norm": 4.799286365509033, "learning_rate": 1.1112400761409536e-07, "logits/chosen": -18.681598663330078, "logits/rejected": -17.926502227783203, "logps/chosen": -356.1922302246094, "logps/rejected": -229.4464569091797, "loss": 0.7714, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.71854829788208, "rewards/margins": 1.9131367206573486, "rewards/rejected": 1.8054113388061523, "step": 50280 }, { "epoch": 2.3348344862807, "grad_norm": 14.02425765991211, "learning_rate": 1.1104662859618985e-07, "logits/chosen": -18.81037712097168, "logits/rejected": -18.686851501464844, "logps/chosen": -418.4183044433594, "logps/rejected": -374.24346923828125, "loss": 0.9382, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7622737884521484, "rewards/margins": 0.5357305407524109, "rewards/rejected": 3.2265429496765137, "step": 50290 }, { "epoch": 2.335298760388133, "grad_norm": 207.797119140625, "learning_rate": 1.1096924957828436e-07, "logits/chosen": -19.393911361694336, "logits/rejected": -18.21783447265625, "logps/chosen": -461.9087829589844, "logps/rejected": -343.53436279296875, "loss": 1.0562, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.162657737731934, "rewards/margins": 1.6779972314834595, "rewards/rejected": 2.4846606254577637, "step": 50300 }, { "epoch": 2.335763034495566, "grad_norm": 189.9642791748047, "learning_rate": 1.1089187056037884e-07, "logits/chosen": -19.758159637451172, "logits/rejected": -18.143001556396484, "logps/chosen": -426.3892517089844, "logps/rejected": -370.0565490722656, "loss": 0.7527, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.239657878875732, "rewards/margins": 1.3352839946746826, "rewards/rejected": 2.90437388420105, "step": 50310 }, { "epoch": 
2.336227308602999, "grad_norm": 10.528942108154297, "learning_rate": 1.1081449154247334e-07, "logits/chosen": -18.932199478149414, "logits/rejected": -18.53120231628418, "logps/chosen": -286.54681396484375, "logps/rejected": -202.5475311279297, "loss": 0.5081, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.582636833190918, "rewards/margins": 0.8706437349319458, "rewards/rejected": 0.7119930982589722, "step": 50320 }, { "epoch": 2.3366915827104324, "grad_norm": 242.90638732910156, "learning_rate": 1.1073711252456783e-07, "logits/chosen": -19.235902786254883, "logits/rejected": -18.556888580322266, "logps/chosen": -471.08123779296875, "logps/rejected": -417.95587158203125, "loss": 0.68, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 5.137364387512207, "rewards/margins": 1.0068212747573853, "rewards/rejected": 4.1305437088012695, "step": 50330 }, { "epoch": 2.337155856817865, "grad_norm": 16.32832145690918, "learning_rate": 1.1065973350666232e-07, "logits/chosen": -19.78609848022461, "logits/rejected": -18.467266082763672, "logps/chosen": -293.2395324707031, "logps/rejected": -248.3306427001953, "loss": 0.5211, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.7287776470184326, "rewards/margins": 1.4805858135223389, "rewards/rejected": 1.2481917142868042, "step": 50340 }, { "epoch": 2.3376201309252984, "grad_norm": 57.175052642822266, "learning_rate": 1.1058235448875683e-07, "logits/chosen": -18.4930362701416, "logits/rejected": -17.885292053222656, "logps/chosen": -261.6459045410156, "logps/rejected": -239.2626495361328, "loss": 0.7034, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.3144633769989014, "rewards/margins": 1.2233935594558716, "rewards/rejected": 1.0910698175430298, "step": 50350 }, { "epoch": 2.338084405032731, "grad_norm": 15.304092407226562, "learning_rate": 1.1050497547085132e-07, "logits/chosen": -19.296749114990234, "logits/rejected": -18.851884841918945, "logps/chosen": -469.96600341796875, 
"logps/rejected": -343.841064453125, "loss": 0.3242, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.792334079742432, "rewards/margins": 2.133700132369995, "rewards/rejected": 2.6586337089538574, "step": 50360 }, { "epoch": 2.3385486791401644, "grad_norm": 55.14570617675781, "learning_rate": 1.1042759645294582e-07, "logits/chosen": -19.034366607666016, "logits/rejected": -17.794506072998047, "logps/chosen": -336.6707458496094, "logps/rejected": -201.3890838623047, "loss": 0.4513, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.2870278358459473, "rewards/margins": 1.9934009313583374, "rewards/rejected": 1.2936267852783203, "step": 50370 }, { "epoch": 2.3390129532475976, "grad_norm": 134.13131713867188, "learning_rate": 1.103502174350403e-07, "logits/chosen": -18.945091247558594, "logits/rejected": -17.752866744995117, "logps/chosen": -477.710205078125, "logps/rejected": -352.99932861328125, "loss": 0.4342, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.545027732849121, "rewards/margins": 2.113931894302368, "rewards/rejected": 2.431095838546753, "step": 50380 }, { "epoch": 2.3394772273550304, "grad_norm": 11.05502700805664, "learning_rate": 1.102728384171348e-07, "logits/chosen": -19.013301849365234, "logits/rejected": -17.982349395751953, "logps/chosen": -353.2056884765625, "logps/rejected": -282.144775390625, "loss": 0.3655, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1167356967926025, "rewards/margins": 1.8351480960845947, "rewards/rejected": 1.2815876007080078, "step": 50390 }, { "epoch": 2.3399415014624636, "grad_norm": 14.605975151062012, "learning_rate": 1.1019545939922931e-07, "logits/chosen": -19.632083892822266, "logits/rejected": -18.791400909423828, "logps/chosen": -420.509033203125, "logps/rejected": -343.9130554199219, "loss": 0.4466, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.9850735664367676, "rewards/margins": 1.2037080526351929, "rewards/rejected": 1.7813657522201538, 
"step": 50400 }, { "epoch": 2.3404057755698964, "grad_norm": 107.68400573730469, "learning_rate": 1.101180803813238e-07, "logits/chosen": -18.834346771240234, "logits/rejected": -18.02680778503418, "logps/chosen": -447.52899169921875, "logps/rejected": -287.05523681640625, "loss": 0.4994, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.991978883743286, "rewards/margins": 1.971634864807129, "rewards/rejected": 2.0203440189361572, "step": 50410 }, { "epoch": 2.3408700496773296, "grad_norm": 60.39045333862305, "learning_rate": 1.100407013634183e-07, "logits/chosen": -19.342937469482422, "logits/rejected": -18.658056259155273, "logps/chosen": -390.96124267578125, "logps/rejected": -303.3495178222656, "loss": 0.2787, "rewards/accuracies": 1.0, "rewards/chosen": 3.1079580783843994, "rewards/margins": 1.7200372219085693, "rewards/rejected": 1.387920618057251, "step": 50420 }, { "epoch": 2.3413343237847624, "grad_norm": 220.1587371826172, "learning_rate": 1.0996332234551278e-07, "logits/chosen": -17.957683563232422, "logits/rejected": -18.53077507019043, "logps/chosen": -260.64459228515625, "logps/rejected": -350.25128173828125, "loss": 1.7826, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.7805182933807373, "rewards/margins": -0.18796448409557343, "rewards/rejected": 1.9684827327728271, "step": 50430 }, { "epoch": 2.3417985978921956, "grad_norm": 85.9382553100586, "learning_rate": 1.0988594332760728e-07, "logits/chosen": -18.684101104736328, "logits/rejected": -18.54054832458496, "logps/chosen": -366.24676513671875, "logps/rejected": -390.14501953125, "loss": 1.1738, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.4585070610046387, "rewards/margins": -0.2372249811887741, "rewards/rejected": 2.695732355117798, "step": 50440 }, { "epoch": 2.342262871999629, "grad_norm": 0.08380007743835449, "learning_rate": 1.0980856430970178e-07, "logits/chosen": -19.60588836669922, "logits/rejected": -18.558998107910156, "logps/chosen": 
-373.8392028808594, "logps/rejected": -276.0927429199219, "loss": 0.9444, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.918109178543091, "rewards/margins": 1.3548362255096436, "rewards/rejected": 2.563272476196289, "step": 50450 }, { "epoch": 2.3427271461070616, "grad_norm": 55.314937591552734, "learning_rate": 1.0973118529179627e-07, "logits/chosen": -18.638538360595703, "logits/rejected": -17.883899688720703, "logps/chosen": -371.0359191894531, "logps/rejected": -238.158203125, "loss": 0.3444, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.7098255157470703, "rewards/margins": 1.191241979598999, "rewards/rejected": 1.5185837745666504, "step": 50460 }, { "epoch": 2.343191420214495, "grad_norm": 25.810102462768555, "learning_rate": 1.0965380627389077e-07, "logits/chosen": -19.232614517211914, "logits/rejected": -18.709369659423828, "logps/chosen": -357.5069274902344, "logps/rejected": -294.61212158203125, "loss": 1.0226, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.2261269092559814, "rewards/margins": 0.6165154576301575, "rewards/rejected": 2.609611988067627, "step": 50470 }, { "epoch": 2.3436556943219276, "grad_norm": 23.84708595275879, "learning_rate": 1.0957642725598526e-07, "logits/chosen": -18.840511322021484, "logits/rejected": -18.597652435302734, "logps/chosen": -492.0926818847656, "logps/rejected": -460.77508544921875, "loss": 1.3407, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.9197514057159424, "rewards/margins": 0.1742035448551178, "rewards/rejected": 3.7455477714538574, "step": 50480 }, { "epoch": 2.3441199684293608, "grad_norm": 17.135576248168945, "learning_rate": 1.0949904823807975e-07, "logits/chosen": -18.913787841796875, "logits/rejected": -18.021753311157227, "logps/chosen": -430.2911682128906, "logps/rejected": -327.3677978515625, "loss": 0.5602, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.9353795051574707, "rewards/margins": 2.2351574897766113, "rewards/rejected": 
1.700222373008728, "step": 50490 }, { "epoch": 2.3445842425367935, "grad_norm": 4.513642311096191, "learning_rate": 1.0942166922017425e-07, "logits/chosen": -19.564727783203125, "logits/rejected": -17.9561824798584, "logps/chosen": -361.50933837890625, "logps/rejected": -294.3038635253906, "loss": 0.1929, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.715710163116455, "rewards/margins": 2.1913371086120605, "rewards/rejected": 1.5243732929229736, "step": 50500 }, { "epoch": 2.3450485166442268, "grad_norm": 54.78626251220703, "learning_rate": 1.0934429020226875e-07, "logits/chosen": -19.038448333740234, "logits/rejected": -18.730743408203125, "logps/chosen": -383.57427978515625, "logps/rejected": -323.3662414550781, "loss": 0.9125, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.9448819160461426, "rewards/margins": 1.359909176826477, "rewards/rejected": 2.584972858428955, "step": 50510 }, { "epoch": 2.34551279075166, "grad_norm": 6.406073570251465, "learning_rate": 1.0926691118436324e-07, "logits/chosen": -19.872127532958984, "logits/rejected": -18.65487289428711, "logps/chosen": -433.77642822265625, "logps/rejected": -353.7384948730469, "loss": 0.7181, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.8949382305145264, "rewards/margins": 1.1496961116790771, "rewards/rejected": 2.74524188041687, "step": 50520 }, { "epoch": 2.3459770648590927, "grad_norm": 93.54692840576172, "learning_rate": 1.0918953216645773e-07, "logits/chosen": -19.351818084716797, "logits/rejected": -17.629844665527344, "logps/chosen": -471.7822265625, "logps/rejected": -315.18560791015625, "loss": 0.4834, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.031140327453613, "rewards/margins": 2.3479208946228027, "rewards/rejected": 1.6832191944122314, "step": 50530 }, { "epoch": 2.346441338966526, "grad_norm": 5.786893367767334, "learning_rate": 1.0911215314855224e-07, "logits/chosen": -20.03485679626465, "logits/rejected": -18.428714752197266, 
"logps/chosen": -332.29168701171875, "logps/rejected": -201.92735290527344, "loss": 0.2673, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.8054957389831543, "rewards/margins": 3.3785042762756348, "rewards/rejected": 0.42699161171913147, "step": 50540 }, { "epoch": 2.3469056130739587, "grad_norm": 64.2606201171875, "learning_rate": 1.0903477413064673e-07, "logits/chosen": -19.041101455688477, "logits/rejected": -17.349140167236328, "logps/chosen": -414.41217041015625, "logps/rejected": -315.3678283691406, "loss": 1.2016, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.44450044631958, "rewards/margins": 0.9083612561225891, "rewards/rejected": 2.5361390113830566, "step": 50550 }, { "epoch": 2.347369887181392, "grad_norm": 141.1687774658203, "learning_rate": 1.0895739511274123e-07, "logits/chosen": -18.068864822387695, "logits/rejected": -18.702045440673828, "logps/chosen": -323.7880859375, "logps/rejected": -396.7835693359375, "loss": 1.1062, "rewards/accuracies": 0.5, "rewards/chosen": 2.505220890045166, "rewards/margins": 0.08638377487659454, "rewards/rejected": 2.418837070465088, "step": 50560 }, { "epoch": 2.3478341612888247, "grad_norm": 0.009399677626788616, "learning_rate": 1.0888001609483571e-07, "logits/chosen": -19.523717880249023, "logits/rejected": -17.580123901367188, "logps/chosen": -461.48004150390625, "logps/rejected": -193.84127807617188, "loss": 0.1778, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.035440921783447, "rewards/margins": 3.1723875999450684, "rewards/rejected": 0.8630535006523132, "step": 50570 }, { "epoch": 2.348298435396258, "grad_norm": 77.02774047851562, "learning_rate": 1.0880263707693021e-07, "logits/chosen": -18.593460083007812, "logits/rejected": -17.739530563354492, "logps/chosen": -383.7080078125, "logps/rejected": -328.08819580078125, "loss": 0.3513, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.395287036895752, "rewards/margins": 2.274237871170044, "rewards/rejected": 
1.121049165725708, "step": 50580 }, { "epoch": 2.348762709503691, "grad_norm": 6.57335090637207, "learning_rate": 1.0872525805902472e-07, "logits/chosen": -19.803958892822266, "logits/rejected": -18.686233520507812, "logps/chosen": -543.032958984375, "logps/rejected": -372.29388427734375, "loss": 0.8536, "rewards/accuracies": 0.5, "rewards/chosen": 3.4972119331359863, "rewards/margins": 1.2067558765411377, "rewards/rejected": 2.2904560565948486, "step": 50590 }, { "epoch": 2.349226983611124, "grad_norm": 46.13620376586914, "learning_rate": 1.0864787904111921e-07, "logits/chosen": -18.79116439819336, "logits/rejected": -19.119272232055664, "logps/chosen": -394.73431396484375, "logps/rejected": -412.9327087402344, "loss": 1.2292, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.450364828109741, "rewards/margins": -0.4509105086326599, "rewards/rejected": 2.901275873184204, "step": 50600 }, { "epoch": 2.349691257718557, "grad_norm": 31.150449752807617, "learning_rate": 1.085705000232137e-07, "logits/chosen": -19.832307815551758, "logits/rejected": -19.04766845703125, "logps/chosen": -344.42974853515625, "logps/rejected": -317.76214599609375, "loss": 0.5453, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4082794189453125, "rewards/margins": 1.0422765016555786, "rewards/rejected": 2.3660025596618652, "step": 50610 }, { "epoch": 2.35015553182599, "grad_norm": 166.63446044921875, "learning_rate": 1.0849312100530819e-07, "logits/chosen": -19.94345474243164, "logits/rejected": -19.08135986328125, "logps/chosen": -412.8988342285156, "logps/rejected": -367.8439025878906, "loss": 0.6158, "rewards/accuracies": 0.5, "rewards/chosen": 3.641756772994995, "rewards/margins": 0.7910820245742798, "rewards/rejected": 2.850675106048584, "step": 50620 }, { "epoch": 2.350619805933423, "grad_norm": 34.506919860839844, "learning_rate": 1.0841574198740269e-07, "logits/chosen": -20.862293243408203, "logits/rejected": -19.61157989501953, "logps/chosen": 
-522.8045654296875, "logps/rejected": -351.0613708496094, "loss": 0.2632, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.329773902893066, "rewards/margins": 2.2194669246673584, "rewards/rejected": 2.110307216644287, "step": 50630 }, { "epoch": 2.351084080040856, "grad_norm": 187.49781799316406, "learning_rate": 1.0833836296949719e-07, "logits/chosen": -18.327014923095703, "logits/rejected": -17.634572982788086, "logps/chosen": -470.40960693359375, "logps/rejected": -407.67218017578125, "loss": 0.8988, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.7866530418396, "rewards/margins": 1.5314679145812988, "rewards/rejected": 3.2551848888397217, "step": 50640 }, { "epoch": 2.351548354148289, "grad_norm": 65.60550689697266, "learning_rate": 1.0826098395159168e-07, "logits/chosen": -18.412029266357422, "logits/rejected": -17.876922607421875, "logps/chosen": -379.8757629394531, "logps/rejected": -338.5580139160156, "loss": 1.8841, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.645224094390869, "rewards/margins": -0.5974835753440857, "rewards/rejected": 3.2427074909210205, "step": 50650 }, { "epoch": 2.3520126282557223, "grad_norm": 35.05329132080078, "learning_rate": 1.0818360493368618e-07, "logits/chosen": -18.53701400756836, "logits/rejected": -18.137004852294922, "logps/chosen": -299.80950927734375, "logps/rejected": -277.8412780761719, "loss": 1.0798, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.280639410018921, "rewards/margins": 0.23893657326698303, "rewards/rejected": 1.0417028665542603, "step": 50660 }, { "epoch": 2.352476902363155, "grad_norm": 176.89004516601562, "learning_rate": 1.0810622591578067e-07, "logits/chosen": -18.731338500976562, "logits/rejected": -18.737890243530273, "logps/chosen": -357.90460205078125, "logps/rejected": -316.70458984375, "loss": 0.8187, "rewards/accuracies": 0.5, "rewards/chosen": 2.783336639404297, "rewards/margins": 1.1662280559539795, "rewards/rejected": 
1.6171085834503174, "step": 50670 }, { "epoch": 2.3529411764705883, "grad_norm": 6.646115779876709, "learning_rate": 1.0802884689787516e-07, "logits/chosen": -17.663558959960938, "logits/rejected": -17.275869369506836, "logps/chosen": -299.7032165527344, "logps/rejected": -290.37860107421875, "loss": 0.5019, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.7511894702911377, "rewards/margins": 1.7775331735610962, "rewards/rejected": 0.9736565351486206, "step": 50680 }, { "epoch": 2.353405450578021, "grad_norm": 26.647043228149414, "learning_rate": 1.0795146787996966e-07, "logits/chosen": -20.28953742980957, "logits/rejected": -18.6671199798584, "logps/chosen": -550.0303955078125, "logps/rejected": -405.27947998046875, "loss": 0.325, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.0871357917785645, "rewards/margins": 2.3356661796569824, "rewards/rejected": 2.751469135284424, "step": 50690 }, { "epoch": 2.3538697246854543, "grad_norm": 63.305747985839844, "learning_rate": 1.0787408886206416e-07, "logits/chosen": -18.309772491455078, "logits/rejected": -18.327905654907227, "logps/chosen": -335.9209289550781, "logps/rejected": -375.8184814453125, "loss": 1.1654, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.0629334449768066, "rewards/margins": 0.5826355814933777, "rewards/rejected": 1.4802979230880737, "step": 50700 }, { "epoch": 2.354333998792887, "grad_norm": 0.14425955712795258, "learning_rate": 1.0779670984415866e-07, "logits/chosen": -18.653249740600586, "logits/rejected": -17.495723724365234, "logps/chosen": -436.8954162597656, "logps/rejected": -275.2532653808594, "loss": 0.3899, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.397054672241211, "rewards/margins": 3.1713778972625732, "rewards/rejected": 1.225676417350769, "step": 50710 }, { "epoch": 2.3547982729003203, "grad_norm": 49.445587158203125, "learning_rate": 1.0771933082625314e-07, "logits/chosen": -20.703189849853516, "logits/rejected": 
-19.168140411376953, "logps/chosen": -276.1772766113281, "logps/rejected": -249.8974151611328, "loss": 0.6025, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0387778282165527, "rewards/margins": 1.2950260639190674, "rewards/rejected": 1.743751883506775, "step": 50720 }, { "epoch": 2.3552625470077535, "grad_norm": 31.15139389038086, "learning_rate": 1.0764195180834764e-07, "logits/chosen": -19.247169494628906, "logits/rejected": -18.229990005493164, "logps/chosen": -411.8779296875, "logps/rejected": -349.03460693359375, "loss": 0.4169, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.482402324676514, "rewards/margins": 1.9877605438232422, "rewards/rejected": 2.4946422576904297, "step": 50730 }, { "epoch": 2.3557268211151863, "grad_norm": 141.8727569580078, "learning_rate": 1.0756457279044214e-07, "logits/chosen": -18.853492736816406, "logits/rejected": -18.1348876953125, "logps/chosen": -460.3484802246094, "logps/rejected": -398.6622619628906, "loss": 0.9161, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.7488296031951904, "rewards/margins": 1.0416412353515625, "rewards/rejected": 2.707188367843628, "step": 50740 }, { "epoch": 2.3561910952226195, "grad_norm": 244.78330993652344, "learning_rate": 1.0748719377253664e-07, "logits/chosen": -19.039104461669922, "logits/rejected": -18.4815731048584, "logps/chosen": -372.72222900390625, "logps/rejected": -276.1877136230469, "loss": 1.0635, "rewards/accuracies": 0.5, "rewards/chosen": 4.468903541564941, "rewards/margins": 1.8868303298950195, "rewards/rejected": 2.58207368850708, "step": 50750 }, { "epoch": 2.3566553693300527, "grad_norm": 210.09437561035156, "learning_rate": 1.0740981475463112e-07, "logits/chosen": -18.852691650390625, "logits/rejected": -18.626060485839844, "logps/chosen": -351.2876281738281, "logps/rejected": -360.0190124511719, "loss": 0.7839, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.8653311729431152, "rewards/margins": 0.3946695327758789, 
"rewards/rejected": 2.4706616401672363, "step": 50760 }, { "epoch": 2.3571196434374855, "grad_norm": 166.25657653808594, "learning_rate": 1.0733243573672562e-07, "logits/chosen": -19.139253616333008, "logits/rejected": -18.449092864990234, "logps/chosen": -404.560791015625, "logps/rejected": -369.9821472167969, "loss": 1.0676, "rewards/accuracies": 0.5, "rewards/chosen": 4.443466663360596, "rewards/margins": 1.2297977209091187, "rewards/rejected": 3.2136693000793457, "step": 50770 }, { "epoch": 2.3575839175449187, "grad_norm": 86.8367919921875, "learning_rate": 1.0725505671882012e-07, "logits/chosen": -18.75832748413086, "logits/rejected": -18.708023071289062, "logps/chosen": -368.37762451171875, "logps/rejected": -331.6116638183594, "loss": 1.3266, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.041246175765991, "rewards/margins": -0.24438941478729248, "rewards/rejected": 2.2856357097625732, "step": 50780 }, { "epoch": 2.3580481916523515, "grad_norm": 10.473212242126465, "learning_rate": 1.0717767770091462e-07, "logits/chosen": -18.721433639526367, "logits/rejected": -18.330060958862305, "logps/chosen": -378.5962219238281, "logps/rejected": -296.87689208984375, "loss": 0.5101, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.834139585494995, "rewards/margins": 1.2927227020263672, "rewards/rejected": 2.541417360305786, "step": 50790 }, { "epoch": 2.3585124657597847, "grad_norm": 60.46617889404297, "learning_rate": 1.0710029868300912e-07, "logits/chosen": -18.74034309387207, "logits/rejected": -18.29692840576172, "logps/chosen": -395.9425964355469, "logps/rejected": -425.5392150878906, "loss": 0.8913, "rewards/accuracies": 0.5, "rewards/chosen": 3.3850700855255127, "rewards/margins": 0.027260709553956985, "rewards/rejected": 3.357809543609619, "step": 50800 }, { "epoch": 2.3589767398672175, "grad_norm": 6.230620384216309, "learning_rate": 1.070229196651036e-07, "logits/chosen": -19.406688690185547, "logits/rejected": -18.246295928955078, 
"logps/chosen": -397.0964050292969, "logps/rejected": -279.0213928222656, "loss": 0.6673, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.338258266448975, "rewards/margins": 2.0960912704467773, "rewards/rejected": 2.2421669960021973, "step": 50810 }, { "epoch": 2.3594410139746507, "grad_norm": 4.007818222045898, "learning_rate": 1.069455406471981e-07, "logits/chosen": -19.494176864624023, "logits/rejected": -18.64341163635254, "logps/chosen": -341.34930419921875, "logps/rejected": -291.2767028808594, "loss": 0.4256, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.726879119873047, "rewards/margins": 1.915765404701233, "rewards/rejected": 1.811113715171814, "step": 50820 }, { "epoch": 2.359905288082084, "grad_norm": 0.37965431809425354, "learning_rate": 1.068681616292926e-07, "logits/chosen": -18.706880569458008, "logits/rejected": -17.854015350341797, "logps/chosen": -407.981689453125, "logps/rejected": -348.1717224121094, "loss": 0.6075, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.8329427242279053, "rewards/margins": 1.4711984395980835, "rewards/rejected": 1.3617444038391113, "step": 50830 }, { "epoch": 2.3603695621895167, "grad_norm": 38.8393440246582, "learning_rate": 1.067907826113871e-07, "logits/chosen": -19.70010757446289, "logits/rejected": -19.444324493408203, "logps/chosen": -423.391357421875, "logps/rejected": -388.4625244140625, "loss": 1.0174, "rewards/accuracies": 0.5, "rewards/chosen": 3.300495147705078, "rewards/margins": 0.20935888588428497, "rewards/rejected": 3.0911355018615723, "step": 50840 }, { "epoch": 2.36083383629695, "grad_norm": 3.7672362327575684, "learning_rate": 1.0671340359348159e-07, "logits/chosen": -19.80504608154297, "logits/rejected": -16.977153778076172, "logps/chosen": -381.9172058105469, "logps/rejected": -171.1837615966797, "loss": 0.1451, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.6693472862243652, "rewards/margins": 3.0918197631835938, "rewards/rejected": 
0.5775278806686401, "step": 50850 }, { "epoch": 2.3612981104043826, "grad_norm": 15.660600662231445, "learning_rate": 1.0663602457557608e-07, "logits/chosen": -18.504661560058594, "logits/rejected": -18.09307098388672, "logps/chosen": -393.905029296875, "logps/rejected": -347.09906005859375, "loss": 0.631, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.2613766193389893, "rewards/margins": 0.5955191850662231, "rewards/rejected": 1.6658570766448975, "step": 50860 }, { "epoch": 2.361762384511816, "grad_norm": 40.705379486083984, "learning_rate": 1.0655864555767058e-07, "logits/chosen": -19.259883880615234, "logits/rejected": -19.20676040649414, "logps/chosen": -340.60406494140625, "logps/rejected": -350.87005615234375, "loss": 0.3147, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.5269203186035156, "rewards/margins": 1.7384014129638672, "rewards/rejected": 1.7885195016860962, "step": 50870 }, { "epoch": 2.3622266586192486, "grad_norm": 43.690513610839844, "learning_rate": 1.0648126653976506e-07, "logits/chosen": -17.51712417602539, "logits/rejected": -17.463895797729492, "logps/chosen": -391.6995544433594, "logps/rejected": -338.8078308105469, "loss": 1.2367, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.214463472366333, "rewards/margins": 0.5361863374710083, "rewards/rejected": 1.6782770156860352, "step": 50880 }, { "epoch": 2.362690932726682, "grad_norm": 213.64743041992188, "learning_rate": 1.0640388752185957e-07, "logits/chosen": -19.14822006225586, "logits/rejected": -19.2153377532959, "logps/chosen": -406.4618835449219, "logps/rejected": -432.2034606933594, "loss": 1.0364, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 4.345488548278809, "rewards/margins": 0.2626630365848541, "rewards/rejected": 4.082825660705566, "step": 50890 }, { "epoch": 2.363155206834115, "grad_norm": 58.683082580566406, "learning_rate": 1.0632650850395407e-07, "logits/chosen": -18.60445785522461, "logits/rejected": 
-18.159053802490234, "logps/chosen": -315.03814697265625, "logps/rejected": -311.141357421875, "loss": 0.7254, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0581350326538086, "rewards/margins": 1.2944271564483643, "rewards/rejected": 1.7637078762054443, "step": 50900 }, { "epoch": 2.363619480941548, "grad_norm": 258.2039794921875, "learning_rate": 1.0624912948604855e-07, "logits/chosen": -19.313297271728516, "logits/rejected": -18.853349685668945, "logps/chosen": -434.33154296875, "logps/rejected": -369.859619140625, "loss": 0.4932, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.424551010131836, "rewards/margins": 1.5336414575576782, "rewards/rejected": 2.8909099102020264, "step": 50910 }, { "epoch": 2.364083755048981, "grad_norm": 67.25167083740234, "learning_rate": 1.0617175046814305e-07, "logits/chosen": -19.353073120117188, "logits/rejected": -18.101560592651367, "logps/chosen": -401.8206481933594, "logps/rejected": -305.4599304199219, "loss": 0.5023, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.707487106323242, "rewards/margins": 1.568601369857788, "rewards/rejected": 2.138885974884033, "step": 50920 }, { "epoch": 2.364548029156414, "grad_norm": 246.5919647216797, "learning_rate": 1.0609437145023755e-07, "logits/chosen": -18.846385955810547, "logits/rejected": -18.622421264648438, "logps/chosen": -414.6363220214844, "logps/rejected": -390.94439697265625, "loss": 0.9143, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.0498476028442383, "rewards/margins": 0.7325287461280823, "rewards/rejected": 2.3173186779022217, "step": 50930 }, { "epoch": 2.365012303263847, "grad_norm": 45.17414093017578, "learning_rate": 1.0601699243233205e-07, "logits/chosen": -18.381620407104492, "logits/rejected": -17.767484664916992, "logps/chosen": -394.72430419921875, "logps/rejected": -334.33758544921875, "loss": 0.6137, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.977792263031006, "rewards/margins": 
1.5889008045196533, "rewards/rejected": 1.3888914585113525, "step": 50940 }, { "epoch": 2.36547657737128, "grad_norm": 28.768444061279297, "learning_rate": 1.0593961341442655e-07, "logits/chosen": -19.072397232055664, "logits/rejected": -18.916940689086914, "logps/chosen": -439.61346435546875, "logps/rejected": -380.37640380859375, "loss": 0.3114, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.209721565246582, "rewards/margins": 2.2245941162109375, "rewards/rejected": 1.9851276874542236, "step": 50950 }, { "epoch": 2.365940851478713, "grad_norm": 32.93187713623047, "learning_rate": 1.0586223439652103e-07, "logits/chosen": -19.486785888671875, "logits/rejected": -18.68716049194336, "logps/chosen": -314.8328857421875, "logps/rejected": -240.85302734375, "loss": 0.3304, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.48799204826355, "rewards/margins": 2.6217780113220215, "rewards/rejected": 0.8662139773368835, "step": 50960 }, { "epoch": 2.3664051255861462, "grad_norm": 61.55921173095703, "learning_rate": 1.0578485537861553e-07, "logits/chosen": -20.16868782043457, "logits/rejected": -18.8912296295166, "logps/chosen": -331.184326171875, "logps/rejected": -225.3132781982422, "loss": 0.5293, "rewards/accuracies": 0.5, "rewards/chosen": 2.4550061225891113, "rewards/margins": 1.1955641508102417, "rewards/rejected": 1.2594420909881592, "step": 50970 }, { "epoch": 2.366869399693579, "grad_norm": 286.2605285644531, "learning_rate": 1.0570747636071003e-07, "logits/chosen": -18.552751541137695, "logits/rejected": -18.394489288330078, "logps/chosen": -416.4664001464844, "logps/rejected": -381.32501220703125, "loss": 0.8248, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.506077289581299, "rewards/margins": 1.8429205417633057, "rewards/rejected": 2.6631569862365723, "step": 50980 }, { "epoch": 2.3673336738010122, "grad_norm": 57.03959655761719, "learning_rate": 1.0563009734280453e-07, "logits/chosen": -18.929920196533203, 
"logits/rejected": -17.94839096069336, "logps/chosen": -319.07403564453125, "logps/rejected": -270.02252197265625, "loss": 0.5889, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.484832286834717, "rewards/margins": 1.315815806388855, "rewards/rejected": 1.1690163612365723, "step": 50990 }, { "epoch": 2.367797947908445, "grad_norm": 76.73939514160156, "learning_rate": 1.0555271832489901e-07, "logits/chosen": -19.004255294799805, "logits/rejected": -19.720355987548828, "logps/chosen": -382.5801696777344, "logps/rejected": -345.65069580078125, "loss": 0.7539, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.891735792160034, "rewards/margins": 1.180748701095581, "rewards/rejected": 1.7109873294830322, "step": 51000 }, { "epoch": 2.368262222015878, "grad_norm": 47.76373291015625, "learning_rate": 1.0547533930699351e-07, "logits/chosen": -19.9011173248291, "logits/rejected": -19.176546096801758, "logps/chosen": -358.5382080078125, "logps/rejected": -384.6855163574219, "loss": 0.781, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.772059679031372, "rewards/margins": 0.8973544836044312, "rewards/rejected": 2.8747050762176514, "step": 51010 }, { "epoch": 2.368726496123311, "grad_norm": 96.05882263183594, "learning_rate": 1.05397960289088e-07, "logits/chosen": -18.864110946655273, "logits/rejected": -17.908227920532227, "logps/chosen": -402.67718505859375, "logps/rejected": -382.48529052734375, "loss": 0.3373, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.496852159500122, "rewards/margins": 1.2303407192230225, "rewards/rejected": 2.2665114402770996, "step": 51020 }, { "epoch": 2.369190770230744, "grad_norm": 19.27985191345215, "learning_rate": 1.053205812711825e-07, "logits/chosen": -19.204912185668945, "logits/rejected": -18.454877853393555, "logps/chosen": -305.2166442871094, "logps/rejected": -312.3861083984375, "loss": 0.4579, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.4655375480651855, 
"rewards/margins": 1.0517746210098267, "rewards/rejected": 1.4137630462646484, "step": 51030 }, { "epoch": 2.3696550443381774, "grad_norm": 111.04641723632812, "learning_rate": 1.05243202253277e-07, "logits/chosen": -19.67498779296875, "logits/rejected": -18.900684356689453, "logps/chosen": -359.46563720703125, "logps/rejected": -348.81610107421875, "loss": 1.4337, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.080155611038208, "rewards/margins": 0.2882879376411438, "rewards/rejected": 2.791867733001709, "step": 51040 }, { "epoch": 2.37011931844561, "grad_norm": 3.3313148021698, "learning_rate": 1.0516582323537149e-07, "logits/chosen": -19.643592834472656, "logits/rejected": -18.329479217529297, "logps/chosen": -385.58544921875, "logps/rejected": -275.45281982421875, "loss": 0.3043, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.122452259063721, "rewards/margins": 2.5287671089172363, "rewards/rejected": 2.593684673309326, "step": 51050 }, { "epoch": 2.3705835925530434, "grad_norm": 47.20683670043945, "learning_rate": 1.0508844421746599e-07, "logits/chosen": -18.929567337036133, "logits/rejected": -18.162525177001953, "logps/chosen": -356.1730651855469, "logps/rejected": -374.67413330078125, "loss": 1.0369, "rewards/accuracies": 0.5, "rewards/chosen": 2.2907662391662598, "rewards/margins": 0.5114272236824036, "rewards/rejected": 1.7793388366699219, "step": 51060 }, { "epoch": 2.371047866660476, "grad_norm": 86.56478881835938, "learning_rate": 1.0501106519956048e-07, "logits/chosen": -19.490991592407227, "logits/rejected": -18.134748458862305, "logps/chosen": -496.02099609375, "logps/rejected": -355.6267395019531, "loss": 0.536, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.39315128326416, "rewards/margins": 1.9796479940414429, "rewards/rejected": 2.4135029315948486, "step": 51070 }, { "epoch": 2.3715121407679094, "grad_norm": 42.167442321777344, "learning_rate": 1.0493368618165498e-07, "logits/chosen": -19.784860610961914, 
"logits/rejected": -18.153072357177734, "logps/chosen": -420.875, "logps/rejected": -301.5435485839844, "loss": 0.2032, "rewards/accuracies": 1.0, "rewards/chosen": 4.244149208068848, "rewards/margins": 2.738173007965088, "rewards/rejected": 1.5059759616851807, "step": 51080 }, { "epoch": 2.371976414875342, "grad_norm": 1.9980123043060303, "learning_rate": 1.0485630716374948e-07, "logits/chosen": -19.003503799438477, "logits/rejected": -17.92862892150879, "logps/chosen": -412.0262756347656, "logps/rejected": -297.09222412109375, "loss": 0.4683, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.5906567573547363, "rewards/margins": 1.8575102090835571, "rewards/rejected": 1.7331463098526, "step": 51090 }, { "epoch": 2.3724406889827754, "grad_norm": 137.41839599609375, "learning_rate": 1.0477892814584396e-07, "logits/chosen": -19.12710952758789, "logits/rejected": -18.244842529296875, "logps/chosen": -371.51470947265625, "logps/rejected": -315.2466125488281, "loss": 1.3982, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.0459015369415283, "rewards/margins": -0.0700279250741005, "rewards/rejected": 3.11592960357666, "step": 51100 }, { "epoch": 2.3729049630902086, "grad_norm": 0.7907148599624634, "learning_rate": 1.0470154912793846e-07, "logits/chosen": -18.742584228515625, "logits/rejected": -16.823566436767578, "logps/chosen": -499.2388610839844, "logps/rejected": -319.8853454589844, "loss": 0.3187, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.3038105964660645, "rewards/margins": 3.7218031883239746, "rewards/rejected": 1.5820077657699585, "step": 51110 }, { "epoch": 2.3733692371976414, "grad_norm": 78.59832000732422, "learning_rate": 1.0462417011003295e-07, "logits/chosen": -18.74997329711914, "logits/rejected": -17.68095588684082, "logps/chosen": -406.99359130859375, "logps/rejected": -265.8130798339844, "loss": 0.3288, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.460443496704102, "rewards/margins": 
2.740327835083008, "rewards/rejected": 1.7201154232025146, "step": 51120 }, { "epoch": 2.3738335113050746, "grad_norm": 8.494327545166016, "learning_rate": 1.0454679109212746e-07, "logits/chosen": -19.29255485534668, "logits/rejected": -18.675888061523438, "logps/chosen": -391.17889404296875, "logps/rejected": -360.09234619140625, "loss": 0.5337, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.498968601226807, "rewards/margins": 1.8557758331298828, "rewards/rejected": 2.6431922912597656, "step": 51130 }, { "epoch": 2.3742977854125074, "grad_norm": 8.576530456542969, "learning_rate": 1.0446941207422196e-07, "logits/chosen": -19.374364852905273, "logits/rejected": -18.236066818237305, "logps/chosen": -357.5295715332031, "logps/rejected": -302.35491943359375, "loss": 0.5325, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.066038131713867, "rewards/margins": 0.9868025779724121, "rewards/rejected": 2.079235792160034, "step": 51140 }, { "epoch": 2.3747620595199406, "grad_norm": 0.4200402796268463, "learning_rate": 1.0439203305631644e-07, "logits/chosen": -18.49759864807129, "logits/rejected": -18.239604949951172, "logps/chosen": -351.0871276855469, "logps/rejected": -297.9448547363281, "loss": 1.178, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.692166566848755, "rewards/margins": 0.837685763835907, "rewards/rejected": 1.8544807434082031, "step": 51150 }, { "epoch": 2.375226333627374, "grad_norm": 2.8503260612487793, "learning_rate": 1.0431465403841094e-07, "logits/chosen": -19.60689353942871, "logits/rejected": -19.273527145385742, "logps/chosen": -370.6208801269531, "logps/rejected": -355.48480224609375, "loss": 0.8875, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7349612712860107, "rewards/margins": 0.5982919931411743, "rewards/rejected": 3.136669635772705, "step": 51160 }, { "epoch": 2.3756906077348066, "grad_norm": 234.6033935546875, "learning_rate": 1.0423727502050542e-07, "logits/chosen": 
-19.07663345336914, "logits/rejected": -18.476158142089844, "logps/chosen": -491.880859375, "logps/rejected": -438.79058837890625, "loss": 0.7663, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.4449591636657715, "rewards/margins": 1.2249089479446411, "rewards/rejected": 3.220050096511841, "step": 51170 }, { "epoch": 2.3761548818422398, "grad_norm": 0.37142977118492126, "learning_rate": 1.0415989600259994e-07, "logits/chosen": -18.704998016357422, "logits/rejected": -18.353548049926758, "logps/chosen": -336.72271728515625, "logps/rejected": -310.13104248046875, "loss": 1.0933, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2536563873291016, "rewards/margins": 1.041743516921997, "rewards/rejected": 2.2119126319885254, "step": 51180 }, { "epoch": 2.3766191559496725, "grad_norm": 0.014751414768397808, "learning_rate": 1.0408251698469443e-07, "logits/chosen": -18.524675369262695, "logits/rejected": -17.869962692260742, "logps/chosen": -448.0167541503906, "logps/rejected": -367.82891845703125, "loss": 0.5391, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.685042381286621, "rewards/margins": 1.8671118021011353, "rewards/rejected": 2.8179306983947754, "step": 51190 }, { "epoch": 2.3770834300571058, "grad_norm": 12.307623863220215, "learning_rate": 1.0400513796678892e-07, "logits/chosen": -20.01214027404785, "logits/rejected": -18.53944206237793, "logps/chosen": -474.2706604003906, "logps/rejected": -376.0108642578125, "loss": 0.6241, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.264299392700195, "rewards/margins": 1.8125946521759033, "rewards/rejected": 2.451704978942871, "step": 51200 }, { "epoch": 2.377547704164539, "grad_norm": 243.43069458007812, "learning_rate": 1.0392775894888342e-07, "logits/chosen": -19.48248291015625, "logits/rejected": -18.174182891845703, "logps/chosen": -430.02703857421875, "logps/rejected": -292.10467529296875, "loss": 1.0658, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 
3.611330509185791, "rewards/margins": 0.888654887676239, "rewards/rejected": 2.722675085067749, "step": 51210 }, { "epoch": 2.3780119782719717, "grad_norm": 11.932686805725098, "learning_rate": 1.038503799309779e-07, "logits/chosen": -18.8424072265625, "logits/rejected": -17.00754737854004, "logps/chosen": -386.54266357421875, "logps/rejected": -220.25497436523438, "loss": 0.2238, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.0257792472839355, "rewards/margins": 3.3864760398864746, "rewards/rejected": 1.639303207397461, "step": 51220 }, { "epoch": 2.378476252379405, "grad_norm": 3.8455541133880615, "learning_rate": 1.0377300091307241e-07, "logits/chosen": -19.388269424438477, "logits/rejected": -19.40372657775879, "logps/chosen": -387.7572326660156, "logps/rejected": -304.7286682128906, "loss": 0.7631, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4728522300720215, "rewards/margins": 1.0715510845184326, "rewards/rejected": 2.401301145553589, "step": 51230 }, { "epoch": 2.3789405264868377, "grad_norm": 68.82952880859375, "learning_rate": 1.036956218951669e-07, "logits/chosen": -18.526443481445312, "logits/rejected": -18.569780349731445, "logps/chosen": -360.24017333984375, "logps/rejected": -402.8949890136719, "loss": 0.7537, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.7274527549743652, "rewards/margins": 0.2901988923549652, "rewards/rejected": 2.437253713607788, "step": 51240 }, { "epoch": 2.379404800594271, "grad_norm": 36.10953140258789, "learning_rate": 1.036182428772614e-07, "logits/chosen": -19.60948944091797, "logits/rejected": -18.241233825683594, "logps/chosen": -463.40716552734375, "logps/rejected": -316.0796813964844, "loss": 0.3015, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.7624564170837402, "rewards/margins": 1.6315584182739258, "rewards/rejected": 2.1308987140655518, "step": 51250 }, { "epoch": 2.3798690747017037, "grad_norm": 8.898277282714844, "learning_rate": 1.035408638593559e-07, 
"logits/chosen": -19.321557998657227, "logits/rejected": -19.30527687072754, "logps/chosen": -360.084716796875, "logps/rejected": -320.5736083984375, "loss": 0.9013, "rewards/accuracies": 0.5, "rewards/chosen": 2.6931824684143066, "rewards/margins": 0.2379571497440338, "rewards/rejected": 2.4552252292633057, "step": 51260 }, { "epoch": 2.380333348809137, "grad_norm": 148.74647521972656, "learning_rate": 1.0346348484145039e-07, "logits/chosen": -18.71042823791504, "logits/rejected": -18.009973526000977, "logps/chosen": -542.6558837890625, "logps/rejected": -412.0111389160156, "loss": 0.4487, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.460585594177246, "rewards/margins": 1.0668079853057861, "rewards/rejected": 3.393777370452881, "step": 51270 }, { "epoch": 2.38079762291657, "grad_norm": 0.46371564269065857, "learning_rate": 1.0338610582354489e-07, "logits/chosen": -18.5612735748291, "logits/rejected": -17.523120880126953, "logps/chosen": -363.31842041015625, "logps/rejected": -281.05609130859375, "loss": 0.652, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.132053852081299, "rewards/margins": 1.764980673789978, "rewards/rejected": 1.3670728206634521, "step": 51280 }, { "epoch": 2.381261897024003, "grad_norm": 53.577354431152344, "learning_rate": 1.0330872680563937e-07, "logits/chosen": -18.118619918823242, "logits/rejected": -17.166120529174805, "logps/chosen": -392.076416015625, "logps/rejected": -256.7294006347656, "loss": 0.5486, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.968066692352295, "rewards/margins": 1.5918455123901367, "rewards/rejected": 1.376220941543579, "step": 51290 }, { "epoch": 2.381726171131436, "grad_norm": 141.2989501953125, "learning_rate": 1.0323134778773387e-07, "logits/chosen": -19.197376251220703, "logits/rejected": -19.35801887512207, "logps/chosen": -485.19921875, "logps/rejected": -414.4263610839844, "loss": 0.5373, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 
4.547079563140869, "rewards/margins": 0.8055227398872375, "rewards/rejected": 3.7415568828582764, "step": 51300 }, { "epoch": 2.382190445238869, "grad_norm": 16.180580139160156, "learning_rate": 1.0315396876982837e-07, "logits/chosen": -18.998371124267578, "logits/rejected": -18.50840950012207, "logps/chosen": -320.07464599609375, "logps/rejected": -269.78704833984375, "loss": 1.0826, "rewards/accuracies": 0.5, "rewards/chosen": 2.8029589653015137, "rewards/margins": 1.1066253185272217, "rewards/rejected": 1.696333885192871, "step": 51310 }, { "epoch": 2.382654719346302, "grad_norm": 75.7812728881836, "learning_rate": 1.0307658975192287e-07, "logits/chosen": -19.514678955078125, "logits/rejected": -19.230379104614258, "logps/chosen": -494.924072265625, "logps/rejected": -441.9117736816406, "loss": 0.8929, "rewards/accuracies": 0.5, "rewards/chosen": 3.737959384918213, "rewards/margins": 0.6149019002914429, "rewards/rejected": 3.1230576038360596, "step": 51320 }, { "epoch": 2.383118993453735, "grad_norm": 13.104023933410645, "learning_rate": 1.0299921073401737e-07, "logits/chosen": -18.682552337646484, "logits/rejected": -18.083797454833984, "logps/chosen": -294.5064697265625, "logps/rejected": -229.171875, "loss": 0.8356, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.3591208457946777, "rewards/margins": 1.217313289642334, "rewards/rejected": 1.1418075561523438, "step": 51330 }, { "epoch": 2.383583267561168, "grad_norm": 88.47373962402344, "learning_rate": 1.0292183171611185e-07, "logits/chosen": -18.785160064697266, "logits/rejected": -18.255321502685547, "logps/chosen": -405.98004150390625, "logps/rejected": -360.2223205566406, "loss": 0.5621, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5617775917053223, "rewards/margins": 1.1379728317260742, "rewards/rejected": 2.423804998397827, "step": 51340 }, { "epoch": 2.3840475416686013, "grad_norm": 51.11948776245117, "learning_rate": 1.0284445269820635e-07, "logits/chosen": 
-19.06980323791504, "logits/rejected": -18.3315486907959, "logps/chosen": -382.4485168457031, "logps/rejected": -352.3001403808594, "loss": 0.6459, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9713382720947266, "rewards/margins": 0.9435240030288696, "rewards/rejected": 2.0278143882751465, "step": 51350 }, { "epoch": 2.384511815776034, "grad_norm": 3.851264715194702, "learning_rate": 1.0276707368030083e-07, "logits/chosen": -19.538347244262695, "logits/rejected": -18.13565444946289, "logps/chosen": -341.94439697265625, "logps/rejected": -214.6077117919922, "loss": 0.302, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.7072179317474365, "rewards/margins": 1.4981027841567993, "rewards/rejected": 1.2091152667999268, "step": 51360 }, { "epoch": 2.3849760898834673, "grad_norm": 1.942116379737854, "learning_rate": 1.0268969466239535e-07, "logits/chosen": -18.49443244934082, "logits/rejected": -18.602970123291016, "logps/chosen": -379.584228515625, "logps/rejected": -348.29498291015625, "loss": 0.8204, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2837777137756348, "rewards/margins": 1.1207544803619385, "rewards/rejected": 2.1630234718322754, "step": 51370 }, { "epoch": 2.3854403639909, "grad_norm": 114.67428588867188, "learning_rate": 1.0261231564448984e-07, "logits/chosen": -18.48276138305664, "logits/rejected": -17.545368194580078, "logps/chosen": -430.5609436035156, "logps/rejected": -330.8449401855469, "loss": 0.8518, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.297046184539795, "rewards/margins": 1.7463794946670532, "rewards/rejected": 2.5506668090820312, "step": 51380 }, { "epoch": 2.3859046380983333, "grad_norm": 145.4461212158203, "learning_rate": 1.0253493662658433e-07, "logits/chosen": -20.188312530517578, "logits/rejected": -19.9738712310791, "logps/chosen": -396.8358154296875, "logps/rejected": -391.87554931640625, "loss": 0.8487, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 
3.102787494659424, "rewards/margins": 0.35118165612220764, "rewards/rejected": 2.751605749130249, "step": 51390 }, { "epoch": 2.386368912205766, "grad_norm": 219.77023315429688, "learning_rate": 1.0245755760867883e-07, "logits/chosen": -20.35994529724121, "logits/rejected": -20.114917755126953, "logps/chosen": -293.78802490234375, "logps/rejected": -293.07257080078125, "loss": 0.6435, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.289848804473877, "rewards/margins": 0.9965587854385376, "rewards/rejected": 2.293290138244629, "step": 51400 }, { "epoch": 2.3868331863131993, "grad_norm": 65.83842468261719, "learning_rate": 1.0238017859077331e-07, "logits/chosen": -18.786523818969727, "logits/rejected": -17.39992904663086, "logps/chosen": -360.1717224121094, "logps/rejected": -274.24627685546875, "loss": 1.1627, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.336371660232544, "rewards/margins": 1.498626470565796, "rewards/rejected": 1.8377447128295898, "step": 51410 }, { "epoch": 2.3872974604206325, "grad_norm": 190.81947326660156, "learning_rate": 1.0230279957286782e-07, "logits/chosen": -19.582828521728516, "logits/rejected": -18.543121337890625, "logps/chosen": -382.9510192871094, "logps/rejected": -317.4969787597656, "loss": 0.9626, "rewards/accuracies": 0.5, "rewards/chosen": 3.480125904083252, "rewards/margins": 1.4513275623321533, "rewards/rejected": 2.0287981033325195, "step": 51420 }, { "epoch": 2.3877617345280653, "grad_norm": 52.26060485839844, "learning_rate": 1.0222542055496231e-07, "logits/chosen": -19.20132064819336, "logits/rejected": -18.356201171875, "logps/chosen": -411.04534912109375, "logps/rejected": -319.68377685546875, "loss": 0.4365, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.6393344402313232, "rewards/margins": 1.9548790454864502, "rewards/rejected": 1.6844558715820312, "step": 51430 }, { "epoch": 2.3882260086354985, "grad_norm": 113.76664733886719, "learning_rate": 1.021480415370568e-07, 
"logits/chosen": -19.94146728515625, "logits/rejected": -18.877927780151367, "logps/chosen": -374.10284423828125, "logps/rejected": -274.5773010253906, "loss": 0.5608, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.8010525703430176, "rewards/margins": 0.9391506314277649, "rewards/rejected": 2.8619015216827393, "step": 51440 }, { "epoch": 2.3886902827429313, "grad_norm": 1.7726850509643555, "learning_rate": 1.020706625191513e-07, "logits/chosen": -19.083951950073242, "logits/rejected": -18.71772003173828, "logps/chosen": -448.427978515625, "logps/rejected": -366.5491027832031, "loss": 0.5125, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.397812366485596, "rewards/margins": 1.80336594581604, "rewards/rejected": 2.5944464206695557, "step": 51450 }, { "epoch": 2.3891545568503645, "grad_norm": 29.724279403686523, "learning_rate": 1.0199328350124579e-07, "logits/chosen": -18.482730865478516, "logits/rejected": -18.509187698364258, "logps/chosen": -372.3534240722656, "logps/rejected": -318.1612548828125, "loss": 0.5909, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.278967380523682, "rewards/margins": 1.7010467052459717, "rewards/rejected": 2.577920436859131, "step": 51460 }, { "epoch": 2.3896188309577973, "grad_norm": 0.21326984465122223, "learning_rate": 1.019159044833403e-07, "logits/chosen": -19.311805725097656, "logits/rejected": -18.54511833190918, "logps/chosen": -379.0792541503906, "logps/rejected": -274.1875, "loss": 1.1329, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.0273332595825195, "rewards/margins": 1.6152549982070923, "rewards/rejected": 2.4120779037475586, "step": 51470 }, { "epoch": 2.3900831050652305, "grad_norm": 209.643310546875, "learning_rate": 1.0183852546543478e-07, "logits/chosen": -18.227066040039062, "logits/rejected": -17.31473159790039, "logps/chosen": -414.5692443847656, "logps/rejected": -328.5973205566406, "loss": 0.6882, "rewards/accuracies": 0.6000000238418579, 
"rewards/chosen": 3.449751377105713, "rewards/margins": 1.3278337717056274, "rewards/rejected": 2.121918201446533, "step": 51480 }, { "epoch": 2.3905473791726637, "grad_norm": 180.6197509765625, "learning_rate": 1.0176114644752928e-07, "logits/chosen": -18.70557975769043, "logits/rejected": -18.02695655822754, "logps/chosen": -417.8544006347656, "logps/rejected": -354.65081787109375, "loss": 0.6514, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.500402450561523, "rewards/margins": 0.6617228388786316, "rewards/rejected": 3.838679790496826, "step": 51490 }, { "epoch": 2.3910116532800965, "grad_norm": 87.1932601928711, "learning_rate": 1.0168376742962378e-07, "logits/chosen": -18.63644790649414, "logits/rejected": -17.80426025390625, "logps/chosen": -400.9890441894531, "logps/rejected": -325.39263916015625, "loss": 0.6339, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.0373315811157227, "rewards/margins": 1.0772984027862549, "rewards/rejected": 1.9600334167480469, "step": 51500 }, { "epoch": 2.3914759273875297, "grad_norm": 27.3400821685791, "learning_rate": 1.0160638841171827e-07, "logits/chosen": -18.977603912353516, "logits/rejected": -18.94558334350586, "logps/chosen": -497.86553955078125, "logps/rejected": -426.6211853027344, "loss": 0.5244, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.9503679275512695, "rewards/margins": 1.101853609085083, "rewards/rejected": 3.848515033721924, "step": 51510 }, { "epoch": 2.3919402014949624, "grad_norm": 2.151137113571167, "learning_rate": 1.0152900939381278e-07, "logits/chosen": -19.197525024414062, "logits/rejected": -18.659812927246094, "logps/chosen": -261.99371337890625, "logps/rejected": -250.1627655029297, "loss": 0.8158, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.363233804702759, "rewards/margins": 0.9608932733535767, "rewards/rejected": 1.402340292930603, "step": 51520 }, { "epoch": 2.3924044756023957, "grad_norm": 205.57408142089844, "learning_rate": 
1.0145163037590726e-07, "logits/chosen": -19.380817413330078, "logits/rejected": -18.717966079711914, "logps/chosen": -393.9321594238281, "logps/rejected": -382.5892639160156, "loss": 1.0132, "rewards/accuracies": 0.5, "rewards/chosen": 3.6742711067199707, "rewards/margins": 1.402467966079712, "rewards/rejected": 2.271803140640259, "step": 51530 }, { "epoch": 2.3928687497098284, "grad_norm": 19.28977394104004, "learning_rate": 1.0137425135800176e-07, "logits/chosen": -19.101055145263672, "logits/rejected": -18.459197998046875, "logps/chosen": -516.5225219726562, "logps/rejected": -366.4501953125, "loss": 0.5893, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4146218299865723, "rewards/margins": 0.8957303762435913, "rewards/rejected": 2.5188910961151123, "step": 51540 }, { "epoch": 2.3933330238172617, "grad_norm": 57.229774475097656, "learning_rate": 1.0129687234009624e-07, "logits/chosen": -18.96953010559082, "logits/rejected": -19.08859634399414, "logps/chosen": -358.33758544921875, "logps/rejected": -359.37664794921875, "loss": 0.8662, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7435479164123535, "rewards/margins": 1.1739795207977295, "rewards/rejected": 2.569568157196045, "step": 51550 }, { "epoch": 2.393797297924695, "grad_norm": 14.727118492126465, "learning_rate": 1.0121949332219076e-07, "logits/chosen": -18.867544174194336, "logits/rejected": -18.32331085205078, "logps/chosen": -402.74041748046875, "logps/rejected": -341.03570556640625, "loss": 1.0532, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1338067054748535, "rewards/margins": 0.7154667377471924, "rewards/rejected": 2.418339967727661, "step": 51560 }, { "epoch": 2.3942615720321276, "grad_norm": 131.1521759033203, "learning_rate": 1.0114211430428525e-07, "logits/chosen": -18.4467830657959, "logits/rejected": -18.607013702392578, "logps/chosen": -354.77166748046875, "logps/rejected": -348.0090637207031, "loss": 0.9954, "rewards/accuracies": 0.5, 
"rewards/chosen": 2.9547839164733887, "rewards/margins": 0.7853895425796509, "rewards/rejected": 2.1693942546844482, "step": 51570 }, { "epoch": 2.394725846139561, "grad_norm": 24.19586944580078, "learning_rate": 1.0106473528637974e-07, "logits/chosen": -19.4401912689209, "logits/rejected": -18.791927337646484, "logps/chosen": -504.3501892089844, "logps/rejected": -372.21527099609375, "loss": 0.9007, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7999744415283203, "rewards/margins": 0.8713167905807495, "rewards/rejected": 2.9286580085754395, "step": 51580 }, { "epoch": 2.395190120246994, "grad_norm": 67.16671752929688, "learning_rate": 1.0098735626847424e-07, "logits/chosen": -17.998231887817383, "logits/rejected": -17.763429641723633, "logps/chosen": -322.7484436035156, "logps/rejected": -396.15533447265625, "loss": 1.0881, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.303891658782959, "rewards/margins": -0.10059042274951935, "rewards/rejected": 2.404481887817383, "step": 51590 }, { "epoch": 2.395654394354427, "grad_norm": 59.933982849121094, "learning_rate": 1.0090997725056872e-07, "logits/chosen": -19.01504135131836, "logits/rejected": -17.996692657470703, "logps/chosen": -430.29534912109375, "logps/rejected": -270.4065856933594, "loss": 0.4782, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0019874572753906, "rewards/margins": 1.7473504543304443, "rewards/rejected": 1.2546367645263672, "step": 51600 }, { "epoch": 2.39611866846186, "grad_norm": 13.36488151550293, "learning_rate": 1.0083259823266323e-07, "logits/chosen": -18.81401824951172, "logits/rejected": -17.560298919677734, "logps/chosen": -465.77197265625, "logps/rejected": -262.65997314453125, "loss": 0.8106, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.6305108070373535, "rewards/margins": 1.9113311767578125, "rewards/rejected": 2.719179153442383, "step": 51610 }, { "epoch": 2.396582942569293, "grad_norm": 0.3901233375072479, "learning_rate": 
1.0075521921475773e-07, "logits/chosen": -19.213748931884766, "logits/rejected": -19.16867446899414, "logps/chosen": -386.4517517089844, "logps/rejected": -327.13226318359375, "loss": 0.6504, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.344595432281494, "rewards/margins": 1.8749099969863892, "rewards/rejected": 1.4696853160858154, "step": 51620 }, { "epoch": 2.397047216676726, "grad_norm": 73.43248748779297, "learning_rate": 1.0067784019685222e-07, "logits/chosen": -19.118656158447266, "logits/rejected": -17.9606990814209, "logps/chosen": -302.5402526855469, "logps/rejected": -171.32046508789062, "loss": 0.6215, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.025266170501709, "rewards/margins": 2.182446241378784, "rewards/rejected": -0.1571800708770752, "step": 51630 }, { "epoch": 2.397511490784159, "grad_norm": 33.107444763183594, "learning_rate": 1.0060046117894671e-07, "logits/chosen": -18.422863006591797, "logits/rejected": -17.850372314453125, "logps/chosen": -360.97137451171875, "logps/rejected": -292.0047912597656, "loss": 0.4066, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1651206016540527, "rewards/margins": 1.4369102716445923, "rewards/rejected": 1.728210210800171, "step": 51640 }, { "epoch": 2.397975764891592, "grad_norm": 125.01863098144531, "learning_rate": 1.005230821610412e-07, "logits/chosen": -18.947507858276367, "logits/rejected": -18.706836700439453, "logps/chosen": -329.66168212890625, "logps/rejected": -354.1485290527344, "loss": 1.1748, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.9297101497650146, "rewards/margins": -0.07830516993999481, "rewards/rejected": 3.0080151557922363, "step": 51650 }, { "epoch": 2.3984400389990252, "grad_norm": 10.735454559326172, "learning_rate": 1.0044570314313571e-07, "logits/chosen": -18.693389892578125, "logits/rejected": -17.915664672851562, "logps/chosen": -287.12042236328125, "logps/rejected": -215.4589080810547, "loss": 0.3284, 
"rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.0148329734802246, "rewards/margins": 2.061913013458252, "rewards/rejected": 0.9529203176498413, "step": 51660 }, { "epoch": 2.398904313106458, "grad_norm": 16.845937728881836, "learning_rate": 1.003683241252302e-07, "logits/chosen": -18.877750396728516, "logits/rejected": -18.282123565673828, "logps/chosen": -348.3383483886719, "logps/rejected": -303.865478515625, "loss": 1.1351, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.145986557006836, "rewards/margins": 1.118457555770874, "rewards/rejected": 2.027529001235962, "step": 51670 }, { "epoch": 2.3993685872138912, "grad_norm": 0.038835082203149796, "learning_rate": 1.0029094510732469e-07, "logits/chosen": -19.44815444946289, "logits/rejected": -18.536636352539062, "logps/chosen": -495.884521484375, "logps/rejected": -389.6971435546875, "loss": 0.6673, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.061233043670654, "rewards/margins": 1.8709516525268555, "rewards/rejected": 2.190281391143799, "step": 51680 }, { "epoch": 2.399832861321324, "grad_norm": 99.52351379394531, "learning_rate": 1.0021356608941919e-07, "logits/chosen": -20.029094696044922, "logits/rejected": -18.8177490234375, "logps/chosen": -420.6897888183594, "logps/rejected": -272.43536376953125, "loss": 0.2313, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.946845054626465, "rewards/margins": 2.8574931621551514, "rewards/rejected": 2.0893514156341553, "step": 51690 }, { "epoch": 2.4002971354287572, "grad_norm": 12.44329833984375, "learning_rate": 1.0013618707151368e-07, "logits/chosen": -19.639568328857422, "logits/rejected": -18.543041229248047, "logps/chosen": -377.177001953125, "logps/rejected": -294.1299133300781, "loss": 0.4859, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.8400967121124268, "rewards/margins": 1.6557689905166626, "rewards/rejected": 1.1843277215957642, "step": 51700 }, { "epoch": 2.40076140953619, "grad_norm": 
11.441489219665527, "learning_rate": 1.0005880805360819e-07, "logits/chosen": -18.824283599853516, "logits/rejected": -17.890384674072266, "logps/chosen": -357.45819091796875, "logps/rejected": -334.12158203125, "loss": 0.7466, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.8542377948760986, "rewards/margins": 0.898654580116272, "rewards/rejected": 1.9555832147598267, "step": 51710 }, { "epoch": 2.401225683643623, "grad_norm": 1.5158218145370483, "learning_rate": 9.998142903570267e-08, "logits/chosen": -19.594179153442383, "logits/rejected": -19.10431671142578, "logps/chosen": -471.06915283203125, "logps/rejected": -398.9303283691406, "loss": 0.3929, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6787216663360596, "rewards/margins": 1.7157341241836548, "rewards/rejected": 1.9629875421524048, "step": 51720 }, { "epoch": 2.4016899577510564, "grad_norm": 220.5394287109375, "learning_rate": 9.990405001779717e-08, "logits/chosen": -18.891674041748047, "logits/rejected": -17.748287200927734, "logps/chosen": -398.8538513183594, "logps/rejected": -295.05767822265625, "loss": 0.4986, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.159210681915283, "rewards/margins": 1.835622787475586, "rewards/rejected": 2.3235878944396973, "step": 51730 }, { "epoch": 2.402154231858489, "grad_norm": 3.258880853652954, "learning_rate": 9.982667099989167e-08, "logits/chosen": -18.67497444152832, "logits/rejected": -18.2148494720459, "logps/chosen": -381.9164123535156, "logps/rejected": -308.48126220703125, "loss": 0.4929, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.37144136428833, "rewards/margins": 1.2162315845489502, "rewards/rejected": 1.1552098989486694, "step": 51740 }, { "epoch": 2.4026185059659224, "grad_norm": 139.8908233642578, "learning_rate": 9.974929198198615e-08, "logits/chosen": -18.36475944519043, "logits/rejected": -16.84976577758789, "logps/chosen": -469.4867248535156, "logps/rejected": -342.63275146484375, "loss": 
0.7162, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.440042972564697, "rewards/margins": 2.1883010864257812, "rewards/rejected": 2.251741886138916, "step": 51750 }, { "epoch": 2.403082780073355, "grad_norm": 62.4910774230957, "learning_rate": 9.967191296408066e-08, "logits/chosen": -19.45227813720703, "logits/rejected": -18.67721939086914, "logps/chosen": -478.73944091796875, "logps/rejected": -321.05694580078125, "loss": 0.3735, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.9253430366516113, "rewards/margins": 2.2212724685668945, "rewards/rejected": 1.7040704488754272, "step": 51760 }, { "epoch": 2.4035470541807884, "grad_norm": 63.2062873840332, "learning_rate": 9.959453394617515e-08, "logits/chosen": -19.5044002532959, "logits/rejected": -17.74415397644043, "logps/chosen": -438.17462158203125, "logps/rejected": -291.5958557128906, "loss": 0.4113, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.54937744140625, "rewards/margins": 2.0691323280334473, "rewards/rejected": 1.4802449941635132, "step": 51770 }, { "epoch": 2.404011328288221, "grad_norm": 1.39053475856781, "learning_rate": 9.951715492826965e-08, "logits/chosen": -18.553550720214844, "logits/rejected": -17.795251846313477, "logps/chosen": -432.5482482910156, "logps/rejected": -292.0967712402344, "loss": 0.8726, "rewards/accuracies": 0.5, "rewards/chosen": 3.164581775665283, "rewards/margins": 1.1747334003448486, "rewards/rejected": 1.9898481369018555, "step": 51780 }, { "epoch": 2.4044756023956544, "grad_norm": 61.9300537109375, "learning_rate": 9.943977591036413e-08, "logits/chosen": -19.298011779785156, "logits/rejected": -18.179187774658203, "logps/chosen": -346.08465576171875, "logps/rejected": -214.44210815429688, "loss": 0.7252, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.565793991088867, "rewards/margins": 1.6651222705841064, "rewards/rejected": 0.9006717801094055, "step": 51790 }, { "epoch": 2.4049398765030876, "grad_norm": 
116.13648223876953, "learning_rate": 9.936239689245863e-08, "logits/chosen": -19.739818572998047, "logits/rejected": -20.08095932006836, "logps/chosen": -413.28277587890625, "logps/rejected": -485.154296875, "loss": 0.8987, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.5176143646240234, "rewards/margins": 0.17857417464256287, "rewards/rejected": 3.3390402793884277, "step": 51800 }, { "epoch": 2.4054041506105204, "grad_norm": 3.1118946075439453, "learning_rate": 9.928501787455314e-08, "logits/chosen": -18.62567138671875, "logits/rejected": -18.73052978515625, "logps/chosen": -227.87600708007812, "logps/rejected": -227.6412811279297, "loss": 0.7666, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.9105383157730103, "rewards/margins": 0.8479295969009399, "rewards/rejected": 1.0626085996627808, "step": 51810 }, { "epoch": 2.4058684247179536, "grad_norm": 48.59450912475586, "learning_rate": 9.920763885664763e-08, "logits/chosen": -18.534353256225586, "logits/rejected": -17.42264747619629, "logps/chosen": -361.76397705078125, "logps/rejected": -249.1409454345703, "loss": 0.3513, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.305997371673584, "rewards/margins": 2.0229132175445557, "rewards/rejected": 1.2830843925476074, "step": 51820 }, { "epoch": 2.4063326988253864, "grad_norm": 187.103271484375, "learning_rate": 9.913025983874212e-08, "logits/chosen": -20.379302978515625, "logits/rejected": -19.509700775146484, "logps/chosen": -523.5079956054688, "logps/rejected": -459.09991455078125, "loss": 0.5118, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 5.147825241088867, "rewards/margins": 1.4817036390304565, "rewards/rejected": 3.6661217212677, "step": 51830 }, { "epoch": 2.4067969729328196, "grad_norm": 1.777469515800476, "learning_rate": 9.905288082083661e-08, "logits/chosen": -18.585538864135742, "logits/rejected": -18.93744659423828, "logps/chosen": -420.966552734375, "logps/rejected": -419.7073669433594, "loss": 
0.9507, "rewards/accuracies": 0.5, "rewards/chosen": 3.7387442588806152, "rewards/margins": 0.8122587203979492, "rewards/rejected": 2.926485538482666, "step": 51840 }, { "epoch": 2.4072612470402523, "grad_norm": 89.76358032226562, "learning_rate": 9.897550180293111e-08, "logits/chosen": -17.909374237060547, "logits/rejected": -18.153308868408203, "logps/chosen": -354.12200927734375, "logps/rejected": -374.02386474609375, "loss": 1.0691, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.777101993560791, "rewards/margins": -0.1343080997467041, "rewards/rejected": 2.911410093307495, "step": 51850 }, { "epoch": 2.4077255211476856, "grad_norm": 60.91559600830078, "learning_rate": 9.889812278502562e-08, "logits/chosen": -19.2753963470459, "logits/rejected": -18.94025230407715, "logps/chosen": -349.82305908203125, "logps/rejected": -291.37225341796875, "loss": 0.5989, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.912205219268799, "rewards/margins": 1.3693430423736572, "rewards/rejected": 1.5428626537322998, "step": 51860 }, { "epoch": 2.408189795255119, "grad_norm": 36.89433288574219, "learning_rate": 9.88207437671201e-08, "logits/chosen": -18.991750717163086, "logits/rejected": -17.740907669067383, "logps/chosen": -485.8447265625, "logps/rejected": -330.17315673828125, "loss": 0.6709, "rewards/accuracies": 0.5, "rewards/chosen": 3.313660144805908, "rewards/margins": 1.3574717044830322, "rewards/rejected": 1.9561882019042969, "step": 51870 }, { "epoch": 2.4086540693625516, "grad_norm": 11.770181655883789, "learning_rate": 9.87433647492146e-08, "logits/chosen": -19.09296226501465, "logits/rejected": -18.635835647583008, "logps/chosen": -367.4580078125, "logps/rejected": -307.18096923828125, "loss": 0.4042, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.878718137741089, "rewards/margins": 0.9475467801094055, "rewards/rejected": 2.931171417236328, "step": 51880 }, { "epoch": 2.4091183434699848, "grad_norm": 14.259027481079102, 
"learning_rate": 9.866598573130909e-08, "logits/chosen": -19.739261627197266, "logits/rejected": -18.576934814453125, "logps/chosen": -428.44134521484375, "logps/rejected": -373.3174133300781, "loss": 0.3365, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.771768093109131, "rewards/margins": 1.7672624588012695, "rewards/rejected": 2.004505157470703, "step": 51890 }, { "epoch": 2.4095826175774175, "grad_norm": 3.353588819503784, "learning_rate": 9.85886067134036e-08, "logits/chosen": -18.847808837890625, "logits/rejected": -17.63507080078125, "logps/chosen": -407.3900146484375, "logps/rejected": -283.23046875, "loss": 0.4437, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0951433181762695, "rewards/margins": 1.7235647439956665, "rewards/rejected": 1.3715784549713135, "step": 51900 }, { "epoch": 2.4100468916848508, "grad_norm": 55.684024810791016, "learning_rate": 9.851122769549808e-08, "logits/chosen": -19.388416290283203, "logits/rejected": -18.151737213134766, "logps/chosen": -376.3045654296875, "logps/rejected": -276.1285705566406, "loss": 0.6266, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.3767426013946533, "rewards/margins": 1.7809966802597046, "rewards/rejected": 1.595746397972107, "step": 51910 }, { "epoch": 2.4105111657922835, "grad_norm": 3.995222806930542, "learning_rate": 9.843384867759258e-08, "logits/chosen": -18.547046661376953, "logits/rejected": -17.967113494873047, "logps/chosen": -357.4786376953125, "logps/rejected": -284.8984680175781, "loss": 0.8387, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.493614912033081, "rewards/margins": 0.9701966047286987, "rewards/rejected": 1.523418664932251, "step": 51920 }, { "epoch": 2.4109754398997167, "grad_norm": 5.204098224639893, "learning_rate": 9.835646965968708e-08, "logits/chosen": -19.049259185791016, "logits/rejected": -19.18890380859375, "logps/chosen": -424.159423828125, "logps/rejected": -372.1482849121094, "loss": 0.6098, 
"rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.6314892768859863, "rewards/margins": 0.7361665964126587, "rewards/rejected": 2.89532208442688, "step": 51930 }, { "epoch": 2.41143971400715, "grad_norm": 207.64723205566406, "learning_rate": 9.827909064178156e-08, "logits/chosen": -18.411388397216797, "logits/rejected": -17.826448440551758, "logps/chosen": -430.28582763671875, "logps/rejected": -346.96759033203125, "loss": 0.5153, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.669625759124756, "rewards/margins": 1.3769997358322144, "rewards/rejected": 2.292625904083252, "step": 51940 }, { "epoch": 2.4119039881145827, "grad_norm": 36.48777389526367, "learning_rate": 9.820171162387607e-08, "logits/chosen": -18.447856903076172, "logits/rejected": -17.629344940185547, "logps/chosen": -454.5301818847656, "logps/rejected": -304.80145263671875, "loss": 0.9499, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.074080228805542, "rewards/margins": 0.7622987031936646, "rewards/rejected": 2.311781406402588, "step": 51950 }, { "epoch": 2.412368262222016, "grad_norm": 0.952692449092865, "learning_rate": 9.812433260597056e-08, "logits/chosen": -18.83212661743164, "logits/rejected": -17.428775787353516, "logps/chosen": -437.0194396972656, "logps/rejected": -255.8266143798828, "loss": 0.4213, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.858105421066284, "rewards/margins": 1.8289155960083008, "rewards/rejected": 1.0291898250579834, "step": 51960 }, { "epoch": 2.4128325363294487, "grad_norm": 60.009979248046875, "learning_rate": 9.804695358806506e-08, "logits/chosen": -18.6541748046875, "logits/rejected": -18.08004379272461, "logps/chosen": -390.1468200683594, "logps/rejected": -323.66314697265625, "loss": 0.8833, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.1272809505462646, "rewards/margins": 1.219989538192749, "rewards/rejected": 1.9072914123535156, "step": 51970 }, { "epoch": 2.413296810436882, "grad_norm": 
1.4386781454086304, "learning_rate": 9.796957457015956e-08, "logits/chosen": -18.88196563720703, "logits/rejected": -19.407482147216797, "logps/chosen": -330.9807434082031, "logps/rejected": -360.6976013183594, "loss": 1.1377, "rewards/accuracies": 0.5, "rewards/chosen": 2.109034299850464, "rewards/margins": 0.6988223791122437, "rewards/rejected": 1.4102119207382202, "step": 51980 }, { "epoch": 2.413761084544315, "grad_norm": 93.01100158691406, "learning_rate": 9.789219555225404e-08, "logits/chosen": -19.044099807739258, "logits/rejected": -17.55759620666504, "logps/chosen": -313.3808898925781, "logps/rejected": -265.101806640625, "loss": 0.3758, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.9221177101135254, "rewards/margins": 1.7525627613067627, "rewards/rejected": 1.1695549488067627, "step": 51990 }, { "epoch": 2.414225358651748, "grad_norm": 121.72410583496094, "learning_rate": 9.781481653434855e-08, "logits/chosen": -18.861492156982422, "logits/rejected": -17.335567474365234, "logps/chosen": -442.38470458984375, "logps/rejected": -306.69647216796875, "loss": 0.2827, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.944361925125122, "rewards/margins": 2.5655853748321533, "rewards/rejected": 1.3787771463394165, "step": 52000 }, { "epoch": 2.414689632759181, "grad_norm": 247.82315063476562, "learning_rate": 9.773743751644304e-08, "logits/chosen": -19.31220817565918, "logits/rejected": -18.911052703857422, "logps/chosen": -433.8993225097656, "logps/rejected": -396.59027099609375, "loss": 0.8045, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.5829837322235107, "rewards/margins": 0.6110235452651978, "rewards/rejected": 2.9719605445861816, "step": 52010 }, { "epoch": 2.415153906866614, "grad_norm": 53.0450553894043, "learning_rate": 9.766005849853753e-08, "logits/chosen": -19.486494064331055, "logits/rejected": -19.33405113220215, "logps/chosen": -356.32415771484375, "logps/rejected": -346.77117919921875, "loss": 0.7533, 
"rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.407480239868164, "rewards/margins": 0.6843968629837036, "rewards/rejected": 2.72308349609375, "step": 52020 }, { "epoch": 2.415618180974047, "grad_norm": 11.67152214050293, "learning_rate": 9.758267948063202e-08, "logits/chosen": -18.54569435119629, "logits/rejected": -18.38919448852539, "logps/chosen": -335.4137268066406, "logps/rejected": -362.8252868652344, "loss": 1.0359, "rewards/accuracies": 0.5, "rewards/chosen": 2.408193588256836, "rewards/margins": 0.5299569368362427, "rewards/rejected": 1.8782367706298828, "step": 52030 }, { "epoch": 2.4160824550814803, "grad_norm": 15.861530303955078, "learning_rate": 9.750530046272652e-08, "logits/chosen": -18.55472755432129, "logits/rejected": -18.258499145507812, "logps/chosen": -389.3084716796875, "logps/rejected": -335.7750244140625, "loss": 0.8526, "rewards/accuracies": 0.5, "rewards/chosen": 2.887620687484741, "rewards/margins": 1.1130011081695557, "rewards/rejected": 1.774619698524475, "step": 52040 }, { "epoch": 2.416546729188913, "grad_norm": 71.24702453613281, "learning_rate": 9.742792144482103e-08, "logits/chosen": -18.41921043395996, "logits/rejected": -17.09954833984375, "logps/chosen": -407.12921142578125, "logps/rejected": -285.5580139160156, "loss": 0.8502, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.1432085037231445, "rewards/margins": 2.0234978199005127, "rewards/rejected": 2.1197104454040527, "step": 52050 }, { "epoch": 2.4170110032963463, "grad_norm": 164.77871704101562, "learning_rate": 9.735054242691551e-08, "logits/chosen": -19.043474197387695, "logits/rejected": -18.14158058166504, "logps/chosen": -473.52874755859375, "logps/rejected": -333.6964111328125, "loss": 0.5282, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.711422920227051, "rewards/margins": 1.7349092960357666, "rewards/rejected": 2.976513624191284, "step": 52060 }, { "epoch": 2.417475277403779, "grad_norm": 233.1392822265625, "learning_rate": 
9.727316340901001e-08, "logits/chosen": -18.93708610534668, "logits/rejected": -19.10481071472168, "logps/chosen": -429.8255310058594, "logps/rejected": -381.65869140625, "loss": 1.1567, "rewards/accuracies": 0.5, "rewards/chosen": 4.023995876312256, "rewards/margins": 0.5188432931900024, "rewards/rejected": 3.505152463912964, "step": 52070 }, { "epoch": 2.4179395515112123, "grad_norm": 5.26289176940918, "learning_rate": 9.71957843911045e-08, "logits/chosen": -18.642488479614258, "logits/rejected": -17.48469352722168, "logps/chosen": -414.99468994140625, "logps/rejected": -249.78872680664062, "loss": 0.2887, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.309592247009277, "rewards/margins": 3.1689841747283936, "rewards/rejected": 1.1406080722808838, "step": 52080 }, { "epoch": 2.418403825618645, "grad_norm": 190.0361328125, "learning_rate": 9.7118405373199e-08, "logits/chosen": -19.178966522216797, "logits/rejected": -18.223796844482422, "logps/chosen": -372.02093505859375, "logps/rejected": -257.0486145019531, "loss": 0.4067, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.100710391998291, "rewards/margins": 1.9120762348175049, "rewards/rejected": 1.1886341571807861, "step": 52090 }, { "epoch": 2.4188680997260783, "grad_norm": 210.98521423339844, "learning_rate": 9.70410263552935e-08, "logits/chosen": -20.015100479125977, "logits/rejected": -18.476001739501953, "logps/chosen": -435.72027587890625, "logps/rejected": -351.61541748046875, "loss": 0.9167, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.152809143066406, "rewards/margins": 0.7297316789627075, "rewards/rejected": 3.42307710647583, "step": 52100 }, { "epoch": 2.4193323738335115, "grad_norm": 61.07728576660156, "learning_rate": 9.696364733738799e-08, "logits/chosen": -19.386646270751953, "logits/rejected": -18.857465744018555, "logps/chosen": -396.23388671875, "logps/rejected": -325.0874328613281, "loss": 0.9666, "rewards/accuracies": 0.699999988079071, 
"rewards/chosen": 3.8059639930725098, "rewards/margins": 0.5547760725021362, "rewards/rejected": 3.251187801361084, "step": 52110 }, { "epoch": 2.4197966479409443, "grad_norm": 2.5451056957244873, "learning_rate": 9.688626831948249e-08, "logits/chosen": -18.29681968688965, "logits/rejected": -17.21796226501465, "logps/chosen": -398.6857604980469, "logps/rejected": -319.892822265625, "loss": 0.3665, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.484520435333252, "rewards/margins": 2.1288645267486572, "rewards/rejected": 1.3556561470031738, "step": 52120 }, { "epoch": 2.4202609220483775, "grad_norm": 11.688169479370117, "learning_rate": 9.680888930157697e-08, "logits/chosen": -19.720426559448242, "logits/rejected": -19.634052276611328, "logps/chosen": -451.9898376464844, "logps/rejected": -383.82623291015625, "loss": 0.8832, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0892210006713867, "rewards/margins": 0.5241972804069519, "rewards/rejected": 2.565023422241211, "step": 52130 }, { "epoch": 2.4207251961558103, "grad_norm": 70.41138458251953, "learning_rate": 9.673151028367147e-08, "logits/chosen": -18.742496490478516, "logits/rejected": -19.069053649902344, "logps/chosen": -376.3521423339844, "logps/rejected": -319.2285461425781, "loss": 0.5659, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.2767438888549805, "rewards/margins": 1.473394751548767, "rewards/rejected": 2.803349018096924, "step": 52140 }, { "epoch": 2.4211894702632435, "grad_norm": 4.1715407371521, "learning_rate": 9.665413126576597e-08, "logits/chosen": -19.344358444213867, "logits/rejected": -18.11751365661621, "logps/chosen": -364.1631164550781, "logps/rejected": -280.76300048828125, "loss": 0.996, "rewards/accuracies": 0.5, "rewards/chosen": 2.793828010559082, "rewards/margins": 1.0050227642059326, "rewards/rejected": 1.7888050079345703, "step": 52150 }, { "epoch": 2.4216537443706763, "grad_norm": 68.1063461303711, "learning_rate": 9.657675224786047e-08, 
"logits/chosen": -19.42993927001953, "logits/rejected": -18.547107696533203, "logps/chosen": -385.07989501953125, "logps/rejected": -340.74798583984375, "loss": 0.2944, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.9509620666503906, "rewards/margins": 1.8872137069702148, "rewards/rejected": 2.063748598098755, "step": 52160 }, { "epoch": 2.4221180184781095, "grad_norm": 61.22378921508789, "learning_rate": 9.649937322995497e-08, "logits/chosen": -18.177026748657227, "logits/rejected": -17.62015724182129, "logps/chosen": -365.1144714355469, "logps/rejected": -211.22982788085938, "loss": 0.5194, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.914196252822876, "rewards/margins": 1.0684211254119873, "rewards/rejected": 0.8457750082015991, "step": 52170 }, { "epoch": 2.4225822925855427, "grad_norm": 3.3793153762817383, "learning_rate": 9.642199421204945e-08, "logits/chosen": -18.716903686523438, "logits/rejected": -19.48043441772461, "logps/chosen": -234.75357055664062, "logps/rejected": -271.7528076171875, "loss": 1.5349, "rewards/accuracies": 0.5, "rewards/chosen": 2.1100399494171143, "rewards/margins": -0.42889365553855896, "rewards/rejected": 2.538933515548706, "step": 52180 }, { "epoch": 2.4230465666929755, "grad_norm": 335.6700744628906, "learning_rate": 9.634461519414396e-08, "logits/chosen": -18.92059898376465, "logits/rejected": -17.944387435913086, "logps/chosen": -449.3589782714844, "logps/rejected": -385.86956787109375, "loss": 0.765, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.369206190109253, "rewards/margins": 1.936889886856079, "rewards/rejected": 1.4323166608810425, "step": 52190 }, { "epoch": 2.4235108408004087, "grad_norm": 39.92717361450195, "learning_rate": 9.626723617623845e-08, "logits/chosen": -19.385841369628906, "logits/rejected": -18.747020721435547, "logps/chosen": -485.3379821777344, "logps/rejected": -447.6564025878906, "loss": 0.7988, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 
4.391772270202637, "rewards/margins": 0.4892605245113373, "rewards/rejected": 3.9025120735168457, "step": 52200 }, { "epoch": 2.4239751149078415, "grad_norm": 268.2527770996094, "learning_rate": 9.618985715833294e-08, "logits/chosen": -19.131336212158203, "logits/rejected": -18.611759185791016, "logps/chosen": -454.94873046875, "logps/rejected": -424.8526306152344, "loss": 0.7917, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.107016563415527, "rewards/margins": 1.2688068151474, "rewards/rejected": 2.838209629058838, "step": 52210 }, { "epoch": 2.4244393890152747, "grad_norm": 67.31273651123047, "learning_rate": 9.611247814042743e-08, "logits/chosen": -19.58257484436035, "logits/rejected": -19.145116806030273, "logps/chosen": -410.9646911621094, "logps/rejected": -373.5148620605469, "loss": 0.8056, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.543532371520996, "rewards/margins": 1.447000503540039, "rewards/rejected": 3.0965325832366943, "step": 52220 }, { "epoch": 2.4249036631227074, "grad_norm": 95.01344299316406, "learning_rate": 9.603509912252193e-08, "logits/chosen": -20.291866302490234, "logits/rejected": -19.30344009399414, "logps/chosen": -431.7395935058594, "logps/rejected": -420.78314208984375, "loss": 0.7288, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.815448045730591, "rewards/margins": 0.5667324066162109, "rewards/rejected": 3.2487151622772217, "step": 52230 }, { "epoch": 2.4253679372301407, "grad_norm": 37.09100341796875, "learning_rate": 9.595772010461644e-08, "logits/chosen": -19.09465217590332, "logits/rejected": -18.6724910736084, "logps/chosen": -499.55487060546875, "logps/rejected": -419.8435974121094, "loss": 0.4738, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.639775037765503, "rewards/margins": 0.9980878829956055, "rewards/rejected": 2.6416871547698975, "step": 52240 }, { "epoch": 2.425832211337574, "grad_norm": 28.545669555664062, "learning_rate": 9.588034108671092e-08, 
"logits/chosen": -20.02505874633789, "logits/rejected": -18.350934982299805, "logps/chosen": -385.7862243652344, "logps/rejected": -316.2553405761719, "loss": 0.3858, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.3477790355682373, "rewards/margins": 1.3749287128448486, "rewards/rejected": 0.9728503227233887, "step": 52250 }, { "epoch": 2.4262964854450066, "grad_norm": 260.5021057128906, "learning_rate": 9.580296206880542e-08, "logits/chosen": -19.548221588134766, "logits/rejected": -19.696874618530273, "logps/chosen": -404.3484802246094, "logps/rejected": -379.11468505859375, "loss": 1.0191, "rewards/accuracies": 0.5, "rewards/chosen": 3.300870180130005, "rewards/margins": 0.8078147172927856, "rewards/rejected": 2.4930553436279297, "step": 52260 }, { "epoch": 2.42676075955244, "grad_norm": 103.360107421875, "learning_rate": 9.57255830508999e-08, "logits/chosen": -18.490524291992188, "logits/rejected": -17.18595314025879, "logps/chosen": -344.0563659667969, "logps/rejected": -212.19259643554688, "loss": 0.2878, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.1339821815490723, "rewards/margins": 2.7866318225860596, "rewards/rejected": 0.34735047817230225, "step": 52270 }, { "epoch": 2.4272250336598726, "grad_norm": 19.73297119140625, "learning_rate": 9.56482040329944e-08, "logits/chosen": -18.960020065307617, "logits/rejected": -18.225086212158203, "logps/chosen": -535.1553955078125, "logps/rejected": -444.36480712890625, "loss": 0.8077, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.164656639099121, "rewards/margins": 1.5804800987243652, "rewards/rejected": 2.584176540374756, "step": 52280 }, { "epoch": 2.427689307767306, "grad_norm": 225.62701416015625, "learning_rate": 9.557082501508892e-08, "logits/chosen": -18.25754165649414, "logits/rejected": -17.746898651123047, "logps/chosen": -399.66241455078125, "logps/rejected": -452.5951232910156, "loss": 1.101, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 
3.2492358684539795, "rewards/margins": -0.07280679047107697, "rewards/rejected": 3.322042465209961, "step": 52290 }, { "epoch": 2.4281535818747386, "grad_norm": 265.11431884765625, "learning_rate": 9.54934459971834e-08, "logits/chosen": -19.040817260742188, "logits/rejected": -19.282855987548828, "logps/chosen": -367.5455322265625, "logps/rejected": -360.9029846191406, "loss": 1.4776, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.4387145042419434, "rewards/margins": -0.10917673259973526, "rewards/rejected": 3.547891616821289, "step": 52300 }, { "epoch": 2.428617855982172, "grad_norm": 0.6855429410934448, "learning_rate": 9.54160669792779e-08, "logits/chosen": -19.495344161987305, "logits/rejected": -17.825565338134766, "logps/chosen": -361.51666259765625, "logps/rejected": -237.7659912109375, "loss": 0.2143, "rewards/accuracies": 1.0, "rewards/chosen": 4.33581018447876, "rewards/margins": 2.5715904235839844, "rewards/rejected": 1.7642199993133545, "step": 52310 }, { "epoch": 2.429082130089605, "grad_norm": 9.429095268249512, "learning_rate": 9.533868796137238e-08, "logits/chosen": -18.981863021850586, "logits/rejected": -17.62860679626465, "logps/chosen": -403.694580078125, "logps/rejected": -253.60635375976562, "loss": 0.3165, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.9050586223602295, "rewards/margins": 2.5331859588623047, "rewards/rejected": 1.371872901916504, "step": 52320 }, { "epoch": 2.429546404197038, "grad_norm": 12.733175277709961, "learning_rate": 9.526130894346688e-08, "logits/chosen": -18.8961124420166, "logits/rejected": -17.83388328552246, "logps/chosen": -269.7683410644531, "logps/rejected": -223.19595336914062, "loss": 0.5668, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.9254958629608154, "rewards/margins": 1.8061634302139282, "rewards/rejected": 0.11933255195617676, "step": 52330 }, { "epoch": 2.430010678304471, "grad_norm": 62.89555358886719, "learning_rate": 9.518392992556138e-08, 
"logits/chosen": -19.470108032226562, "logits/rejected": -18.411853790283203, "logps/chosen": -383.11627197265625, "logps/rejected": -376.66180419921875, "loss": 0.3843, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.6498630046844482, "rewards/margins": 1.763105034828186, "rewards/rejected": 1.8867578506469727, "step": 52340 }, { "epoch": 2.430474952411904, "grad_norm": 4.017137050628662, "learning_rate": 9.510655090765588e-08, "logits/chosen": -19.439382553100586, "logits/rejected": -18.440305709838867, "logps/chosen": -325.8505554199219, "logps/rejected": -236.19296264648438, "loss": 0.3826, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.992668628692627, "rewards/margins": 1.6473503112792969, "rewards/rejected": 1.3453184366226196, "step": 52350 }, { "epoch": 2.430939226519337, "grad_norm": 52.62515640258789, "learning_rate": 9.502917188975038e-08, "logits/chosen": -18.538410186767578, "logits/rejected": -17.981000900268555, "logps/chosen": -370.2021179199219, "logps/rejected": -319.71551513671875, "loss": 1.0006, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.0811667442321777, "rewards/margins": 0.6264278888702393, "rewards/rejected": 2.4547390937805176, "step": 52360 }, { "epoch": 2.43140350062677, "grad_norm": 2.6176702976226807, "learning_rate": 9.495179287184486e-08, "logits/chosen": -20.018611907958984, "logits/rejected": -18.94346046447754, "logps/chosen": -391.28802490234375, "logps/rejected": -335.72332763671875, "loss": 0.545, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.296046733856201, "rewards/margins": 1.9711921215057373, "rewards/rejected": 2.324854850769043, "step": 52370 }, { "epoch": 2.431867774734203, "grad_norm": 65.51981353759766, "learning_rate": 9.487441385393936e-08, "logits/chosen": -18.710460662841797, "logits/rejected": -18.299415588378906, "logps/chosen": -400.90203857421875, "logps/rejected": -325.2452392578125, "loss": 0.7529, "rewards/accuracies": 0.800000011920929, 
"rewards/chosen": 3.168440103530884, "rewards/margins": 0.9421218633651733, "rewards/rejected": 2.226318359375, "step": 52380 }, { "epoch": 2.4323320488416362, "grad_norm": 35.24729919433594, "learning_rate": 9.479703483603386e-08, "logits/chosen": -19.472591400146484, "logits/rejected": -17.886943817138672, "logps/chosen": -471.2684631347656, "logps/rejected": -297.1360778808594, "loss": 0.598, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.9512603282928467, "rewards/margins": 1.8925727605819702, "rewards/rejected": 2.058687686920166, "step": 52390 }, { "epoch": 2.432796322949069, "grad_norm": 0.9837467670440674, "learning_rate": 9.471965581812835e-08, "logits/chosen": -18.991466522216797, "logits/rejected": -18.452585220336914, "logps/chosen": -468.66229248046875, "logps/rejected": -490.6407775878906, "loss": 0.7006, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.613011360168457, "rewards/margins": 1.273472785949707, "rewards/rejected": 3.33953857421875, "step": 52400 }, { "epoch": 2.433260597056502, "grad_norm": 51.9882698059082, "learning_rate": 9.464227680022285e-08, "logits/chosen": -18.842952728271484, "logits/rejected": -17.872234344482422, "logps/chosen": -385.2704162597656, "logps/rejected": -298.8762512207031, "loss": 1.044, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4585983753204346, "rewards/margins": 0.715036928653717, "rewards/rejected": 2.743561029434204, "step": 52410 }, { "epoch": 2.4337248711639354, "grad_norm": 21.27477264404297, "learning_rate": 9.456489778231734e-08, "logits/chosen": -19.348602294921875, "logits/rejected": -17.79212188720703, "logps/chosen": -441.9908142089844, "logps/rejected": -280.52490234375, "loss": 0.3749, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.364941120147705, "rewards/margins": 1.9124571084976196, "rewards/rejected": 1.452483892440796, "step": 52420 }, { "epoch": 2.434189145271368, "grad_norm": 7.728953838348389, "learning_rate": 
9.448751876441184e-08, "logits/chosen": -18.980234146118164, "logits/rejected": -19.56497573852539, "logps/chosen": -407.90081787109375, "logps/rejected": -380.39141845703125, "loss": 1.1328, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.27661395072937, "rewards/margins": 0.2240484058856964, "rewards/rejected": 3.0525660514831543, "step": 52430 }, { "epoch": 2.4346534193788014, "grad_norm": 61.566993713378906, "learning_rate": 9.441013974650633e-08, "logits/chosen": -18.508586883544922, "logits/rejected": -17.901988983154297, "logps/chosen": -412.35394287109375, "logps/rejected": -361.46148681640625, "loss": 0.6326, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.680971145629883, "rewards/margins": 1.4762018918991089, "rewards/rejected": 1.2047691345214844, "step": 52440 }, { "epoch": 2.435117693486234, "grad_norm": 86.65106964111328, "learning_rate": 9.433276072860083e-08, "logits/chosen": -18.841856002807617, "logits/rejected": -19.053911209106445, "logps/chosen": -441.7528381347656, "logps/rejected": -428.40252685546875, "loss": 0.7317, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.8133575916290283, "rewards/margins": 0.2723000645637512, "rewards/rejected": 2.541057586669922, "step": 52450 }, { "epoch": 2.4355819675936674, "grad_norm": 22.24408721923828, "learning_rate": 9.425538171069532e-08, "logits/chosen": -18.589418411254883, "logits/rejected": -17.868900299072266, "logps/chosen": -344.62646484375, "logps/rejected": -293.4200134277344, "loss": 1.2009, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.3485727310180664, "rewards/margins": 0.9490371942520142, "rewards/rejected": 1.3995354175567627, "step": 52460 }, { "epoch": 2.4360462417011, "grad_norm": 43.82035827636719, "learning_rate": 9.417800269278981e-08, "logits/chosen": -18.79083824157715, "logits/rejected": -18.10642433166504, "logps/chosen": -412.048583984375, "logps/rejected": -361.8126220703125, "loss": 0.335, "rewards/accuracies": 
0.8999999761581421, "rewards/chosen": 3.997581958770752, "rewards/margins": 1.4330065250396729, "rewards/rejected": 2.564574956893921, "step": 52470 }, { "epoch": 2.4365105158085334, "grad_norm": 239.89466857910156, "learning_rate": 9.410062367488431e-08, "logits/chosen": -19.813955307006836, "logits/rejected": -18.817594528198242, "logps/chosen": -376.54400634765625, "logps/rejected": -320.82305908203125, "loss": 0.8553, "rewards/accuracies": 0.5, "rewards/chosen": 2.831690788269043, "rewards/margins": 0.7741363644599915, "rewards/rejected": 2.057554244995117, "step": 52480 }, { "epoch": 2.4369747899159666, "grad_norm": 176.9278564453125, "learning_rate": 9.402324465697881e-08, "logits/chosen": -18.306371688842773, "logits/rejected": -17.978893280029297, "logps/chosen": -311.85107421875, "logps/rejected": -278.4219665527344, "loss": 0.7758, "rewards/accuracies": 0.5, "rewards/chosen": 2.603126049041748, "rewards/margins": 1.0571595430374146, "rewards/rejected": 1.545966386795044, "step": 52490 }, { "epoch": 2.4374390640233994, "grad_norm": 82.06341552734375, "learning_rate": 9.394586563907331e-08, "logits/chosen": -17.938610076904297, "logits/rejected": -18.555442810058594, "logps/chosen": -382.46221923828125, "logps/rejected": -447.2582092285156, "loss": 1.0068, "rewards/accuracies": 0.5, "rewards/chosen": 2.775521993637085, "rewards/margins": -0.1527244746685028, "rewards/rejected": 2.9282467365264893, "step": 52500 }, { "epoch": 2.4379033381308326, "grad_norm": 30.483251571655273, "learning_rate": 9.387622452295835e-08, "logits/chosen": -19.4129581451416, "logits/rejected": -18.32835578918457, "logps/chosen": -474.1927185058594, "logps/rejected": -333.5645446777344, "loss": 0.5214, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.394877910614014, "rewards/margins": 2.0099589824676514, "rewards/rejected": 2.3849194049835205, "step": 52510 }, { "epoch": 2.4383676122382654, "grad_norm": 29.894630432128906, "learning_rate": 9.379884550505284e-08, 
"logits/chosen": -18.630565643310547, "logits/rejected": -18.822792053222656, "logps/chosen": -346.56427001953125, "logps/rejected": -351.0835876464844, "loss": 1.2251, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.55667781829834, "rewards/margins": 0.29460838437080383, "rewards/rejected": 4.262069225311279, "step": 52520 }, { "epoch": 2.4388318863456986, "grad_norm": 228.75965881347656, "learning_rate": 9.372146648714734e-08, "logits/chosen": -20.007043838500977, "logits/rejected": -18.988082885742188, "logps/chosen": -325.32537841796875, "logps/rejected": -321.65838623046875, "loss": 0.5471, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.29528546333313, "rewards/margins": 0.9203794598579407, "rewards/rejected": 2.374906063079834, "step": 52530 }, { "epoch": 2.4392961604531314, "grad_norm": 31.74364471435547, "learning_rate": 9.364408746924184e-08, "logits/chosen": -19.483692169189453, "logits/rejected": -18.414670944213867, "logps/chosen": -406.73980712890625, "logps/rejected": -293.53399658203125, "loss": 0.3711, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.1793854236602783, "rewards/margins": 1.6516258716583252, "rewards/rejected": 1.5277594327926636, "step": 52540 }, { "epoch": 2.4397604345605646, "grad_norm": 14.17137336730957, "learning_rate": 9.356670845133634e-08, "logits/chosen": -18.50222396850586, "logits/rejected": -18.195751190185547, "logps/chosen": -504.5497131347656, "logps/rejected": -405.3612365722656, "loss": 0.3711, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.272032260894775, "rewards/margins": 1.7687833309173584, "rewards/rejected": 3.503248929977417, "step": 52550 }, { "epoch": 2.440224708667998, "grad_norm": 115.93478393554688, "learning_rate": 9.348932943343082e-08, "logits/chosen": -19.936439514160156, "logits/rejected": -18.934062957763672, "logps/chosen": -407.0545959472656, "logps/rejected": -333.84649658203125, "loss": 0.5622, "rewards/accuracies": 0.800000011920929, 
"rewards/chosen": 3.4450275897979736, "rewards/margins": 1.081411361694336, "rewards/rejected": 2.363616466522217, "step": 52560 }, { "epoch": 2.4406889827754306, "grad_norm": 5.906702041625977, "learning_rate": 9.341195041552532e-08, "logits/chosen": -18.681503295898438, "logits/rejected": -18.308345794677734, "logps/chosen": -272.72540283203125, "logps/rejected": -272.2685852050781, "loss": 0.7372, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.8592753410339355, "rewards/margins": 0.7191359400749207, "rewards/rejected": 2.14013934135437, "step": 52570 }, { "epoch": 2.4411532568828638, "grad_norm": 0.06123937666416168, "learning_rate": 9.333457139761982e-08, "logits/chosen": -18.986026763916016, "logits/rejected": -17.27159881591797, "logps/chosen": -390.8397521972656, "logps/rejected": -237.1435089111328, "loss": 0.2895, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.631551265716553, "rewards/margins": 2.8016412258148193, "rewards/rejected": 1.8299099206924438, "step": 52580 }, { "epoch": 2.4416175309902965, "grad_norm": 104.17240905761719, "learning_rate": 9.325719237971432e-08, "logits/chosen": -19.04607391357422, "logits/rejected": -18.740581512451172, "logps/chosen": -381.3258056640625, "logps/rejected": -365.1946716308594, "loss": 1.2128, "rewards/accuracies": 0.5, "rewards/chosen": 2.6529481410980225, "rewards/margins": -0.1938338279724121, "rewards/rejected": 2.8467822074890137, "step": 52590 }, { "epoch": 2.4420818050977298, "grad_norm": 61.45869445800781, "learning_rate": 9.31798133618088e-08, "logits/chosen": -19.3354434967041, "logits/rejected": -19.45724868774414, "logps/chosen": -451.00604248046875, "logps/rejected": -428.3560485839844, "loss": 0.7332, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.672015428543091, "rewards/margins": 0.25322091579437256, "rewards/rejected": 3.4187941551208496, "step": 52600 }, { "epoch": 2.4425460792051625, "grad_norm": 167.76502990722656, "learning_rate": 
9.31024343439033e-08, "logits/chosen": -18.717998504638672, "logits/rejected": -18.29434585571289, "logps/chosen": -377.97100830078125, "logps/rejected": -327.9827575683594, "loss": 1.132, "rewards/accuracies": 0.5, "rewards/chosen": 4.40394926071167, "rewards/margins": 0.7450919151306152, "rewards/rejected": 3.6588573455810547, "step": 52610 }, { "epoch": 2.4430103533125958, "grad_norm": 99.6194839477539, "learning_rate": 9.30250553259978e-08, "logits/chosen": -19.006526947021484, "logits/rejected": -18.90946388244629, "logps/chosen": -401.0516662597656, "logps/rejected": -377.1720886230469, "loss": 0.9345, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.506688356399536, "rewards/margins": 0.33547207713127136, "rewards/rejected": 2.1712160110473633, "step": 52620 }, { "epoch": 2.443474627420029, "grad_norm": 38.46796798706055, "learning_rate": 9.29476763080923e-08, "logits/chosen": -18.773027420043945, "logits/rejected": -17.943408966064453, "logps/chosen": -522.7804565429688, "logps/rejected": -419.05877685546875, "loss": 0.9786, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 5.053083896636963, "rewards/margins": 1.996546983718872, "rewards/rejected": 3.0565364360809326, "step": 52630 }, { "epoch": 2.4439389015274617, "grad_norm": 133.35906982421875, "learning_rate": 9.287029729018679e-08, "logits/chosen": -18.325847625732422, "logits/rejected": -17.695087432861328, "logps/chosen": -360.81695556640625, "logps/rejected": -284.5075988769531, "loss": 0.5442, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0320754051208496, "rewards/margins": 1.30808424949646, "rewards/rejected": 1.723990797996521, "step": 52640 }, { "epoch": 2.444403175634895, "grad_norm": 22.38286781311035, "learning_rate": 9.279291827228128e-08, "logits/chosen": -18.534826278686523, "logits/rejected": -17.937820434570312, "logps/chosen": -481.7925720214844, "logps/rejected": -450.39892578125, "loss": 0.7398, "rewards/accuracies": 0.699999988079071, 
"rewards/chosen": 4.103934288024902, "rewards/margins": 1.07676362991333, "rewards/rejected": 3.0271713733673096, "step": 52650 }, { "epoch": 2.4448674497423277, "grad_norm": 41.00062942504883, "learning_rate": 9.271553925437578e-08, "logits/chosen": -18.831567764282227, "logits/rejected": -18.4093017578125, "logps/chosen": -396.80059814453125, "logps/rejected": -341.8354187011719, "loss": 0.7802, "rewards/accuracies": 0.5, "rewards/chosen": 2.540584087371826, "rewards/margins": 0.8081091046333313, "rewards/rejected": 1.73247492313385, "step": 52660 }, { "epoch": 2.445331723849761, "grad_norm": 1.0888993740081787, "learning_rate": 9.263816023647027e-08, "logits/chosen": -18.652469635009766, "logits/rejected": -18.504383087158203, "logps/chosen": -249.54129028320312, "logps/rejected": -277.07550048828125, "loss": 1.2585, "rewards/accuracies": 0.5, "rewards/chosen": 1.9825786352157593, "rewards/margins": 0.5754782557487488, "rewards/rejected": 1.4071003198623657, "step": 52670 }, { "epoch": 2.4457959979571937, "grad_norm": 73.51753997802734, "learning_rate": 9.256078121856477e-08, "logits/chosen": -19.370777130126953, "logits/rejected": -18.682300567626953, "logps/chosen": -433.98651123046875, "logps/rejected": -410.171875, "loss": 0.525, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.273977756500244, "rewards/margins": 1.553009033203125, "rewards/rejected": 2.720968723297119, "step": 52680 }, { "epoch": 2.446260272064627, "grad_norm": 0.1528969705104828, "learning_rate": 9.248340220065927e-08, "logits/chosen": -18.682632446289062, "logits/rejected": -17.353055953979492, "logps/chosen": -359.2645263671875, "logps/rejected": -227.337890625, "loss": 0.5307, "rewards/accuracies": 0.5, "rewards/chosen": 3.327235460281372, "rewards/margins": 2.0401480197906494, "rewards/rejected": 1.2870875597000122, "step": 52690 }, { "epoch": 2.44672454617206, "grad_norm": 4.255746841430664, "learning_rate": 9.240602318275376e-08, "logits/chosen": -19.75338363647461, 
"logits/rejected": -19.42854881286621, "logps/chosen": -405.2943115234375, "logps/rejected": -295.4364318847656, "loss": 0.7064, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.2330257892608643, "rewards/margins": 1.5029652118682861, "rewards/rejected": 1.7300605773925781, "step": 52700 }, { "epoch": 2.447188820279493, "grad_norm": 43.488773345947266, "learning_rate": 9.232864416484825e-08, "logits/chosen": -18.151065826416016, "logits/rejected": -18.57870101928711, "logps/chosen": -341.4718322753906, "logps/rejected": -376.32122802734375, "loss": 0.9431, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.3741185665130615, "rewards/margins": 0.06093263626098633, "rewards/rejected": 2.313185691833496, "step": 52710 }, { "epoch": 2.447653094386926, "grad_norm": 33.23506164550781, "learning_rate": 9.225126514694274e-08, "logits/chosen": -20.553958892822266, "logits/rejected": -18.78965950012207, "logps/chosen": -576.7752685546875, "logps/rejected": -362.69891357421875, "loss": 0.2061, "rewards/accuracies": 1.0, "rewards/chosen": 5.528518199920654, "rewards/margins": 2.809966564178467, "rewards/rejected": 2.7185518741607666, "step": 52720 }, { "epoch": 2.448117368494359, "grad_norm": 27.410512924194336, "learning_rate": 9.217388612903725e-08, "logits/chosen": -18.361522674560547, "logits/rejected": -18.051362991333008, "logps/chosen": -404.3231201171875, "logps/rejected": -363.07830810546875, "loss": 0.9379, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.9871978759765625, "rewards/margins": 0.41599082946777344, "rewards/rejected": 2.57120680809021, "step": 52730 }, { "epoch": 2.448581642601792, "grad_norm": 136.61514282226562, "learning_rate": 9.209650711113175e-08, "logits/chosen": -18.935365676879883, "logits/rejected": -19.54258155822754, "logps/chosen": -393.152099609375, "logps/rejected": -398.12640380859375, "loss": 0.6969, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.8916122913360596, "rewards/margins": 
0.3901534676551819, "rewards/rejected": 3.5014591217041016, "step": 52740 }, { "epoch": 2.449045916709225, "grad_norm": 89.72128295898438, "learning_rate": 9.201912809322623e-08, "logits/chosen": -19.81081771850586, "logits/rejected": -19.831668853759766, "logps/chosen": -471.0439453125, "logps/rejected": -379.93414306640625, "loss": 0.6923, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.915484666824341, "rewards/margins": 0.832839846611023, "rewards/rejected": 3.0826451778411865, "step": 52750 }, { "epoch": 2.449510190816658, "grad_norm": 100.608154296875, "learning_rate": 9.194174907532073e-08, "logits/chosen": -18.67953872680664, "logits/rejected": -18.181060791015625, "logps/chosen": -394.12579345703125, "logps/rejected": -320.3655700683594, "loss": 0.6298, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.715498685836792, "rewards/margins": 1.5097973346710205, "rewards/rejected": 2.2057013511657715, "step": 52760 }, { "epoch": 2.4499744649240913, "grad_norm": 52.639732360839844, "learning_rate": 9.186437005741523e-08, "logits/chosen": -18.90070343017578, "logits/rejected": -18.622203826904297, "logps/chosen": -438.00872802734375, "logps/rejected": -399.0712585449219, "loss": 0.6666, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4323055744171143, "rewards/margins": 0.7303697466850281, "rewards/rejected": 2.7019360065460205, "step": 52770 }, { "epoch": 2.450438739031524, "grad_norm": 47.49347686767578, "learning_rate": 9.178699103950973e-08, "logits/chosen": -18.164684295654297, "logits/rejected": -17.511207580566406, "logps/chosen": -374.7956848144531, "logps/rejected": -290.7565002441406, "loss": 0.7473, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.195460796356201, "rewards/margins": 1.2942014932632446, "rewards/rejected": 1.901259422302246, "step": 52780 }, { "epoch": 2.4509030131389573, "grad_norm": 13.33978271484375, "learning_rate": 9.170961202160422e-08, "logits/chosen": -19.36500358581543, 
"logits/rejected": -18.6283016204834, "logps/chosen": -471.19207763671875, "logps/rejected": -440.49969482421875, "loss": 0.8058, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.8131608963012695, "rewards/margins": 1.637880563735962, "rewards/rejected": 3.1752800941467285, "step": 52790 }, { "epoch": 2.45136728724639, "grad_norm": 17.483999252319336, "learning_rate": 9.163223300369871e-08, "logits/chosen": -19.565330505371094, "logits/rejected": -18.915849685668945, "logps/chosen": -411.73095703125, "logps/rejected": -424.59027099609375, "loss": 0.6911, "rewards/accuracies": 0.5, "rewards/chosen": 3.695040464401245, "rewards/margins": 0.9028486013412476, "rewards/rejected": 2.792192220687866, "step": 52800 }, { "epoch": 2.4518315613538233, "grad_norm": 115.097900390625, "learning_rate": 9.155485398579321e-08, "logits/chosen": -18.886022567749023, "logits/rejected": -18.426990509033203, "logps/chosen": -378.2606506347656, "logps/rejected": -283.294921875, "loss": 0.4471, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.045058250427246, "rewards/margins": 2.3137729167938232, "rewards/rejected": 1.7312850952148438, "step": 52810 }, { "epoch": 2.4522958354612565, "grad_norm": 20.44198226928711, "learning_rate": 9.14774749678877e-08, "logits/chosen": -18.928180694580078, "logits/rejected": -17.916667938232422, "logps/chosen": -384.2861328125, "logps/rejected": -297.7209777832031, "loss": 0.5785, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.968132495880127, "rewards/margins": 1.7691224813461304, "rewards/rejected": 2.199009656906128, "step": 52820 }, { "epoch": 2.4527601095686893, "grad_norm": 92.74085998535156, "learning_rate": 9.14000959499822e-08, "logits/chosen": -18.637142181396484, "logits/rejected": -19.18198585510254, "logps/chosen": -357.756103515625, "logps/rejected": -400.45550537109375, "loss": 0.6593, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.9977521896362305, "rewards/margins": 0.287645548582077, 
"rewards/rejected": 2.710106372833252, "step": 52830 }, { "epoch": 2.4532243836761225, "grad_norm": 1.0766388177871704, "learning_rate": 9.132271693207669e-08, "logits/chosen": -20.358016967773438, "logits/rejected": -17.887798309326172, "logps/chosen": -519.4771728515625, "logps/rejected": -319.60858154296875, "loss": 0.3389, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.569159507751465, "rewards/margins": 3.1991684436798096, "rewards/rejected": 2.3699915409088135, "step": 52840 }, { "epoch": 2.4536886577835553, "grad_norm": 98.19666290283203, "learning_rate": 9.124533791417119e-08, "logits/chosen": -18.867164611816406, "logits/rejected": -18.286638259887695, "logps/chosen": -352.6781921386719, "logps/rejected": -423.68865966796875, "loss": 0.8887, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.8656017780303955, "rewards/margins": 0.5468343496322632, "rewards/rejected": 3.318767547607422, "step": 52850 }, { "epoch": 2.4541529318909885, "grad_norm": 0.5152074098587036, "learning_rate": 9.116795889626568e-08, "logits/chosen": -18.795087814331055, "logits/rejected": -18.222179412841797, "logps/chosen": -464.79119873046875, "logps/rejected": -336.62701416015625, "loss": 0.5001, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.2391610145568848, "rewards/margins": 1.5329691171646118, "rewards/rejected": 1.7061916589736938, "step": 52860 }, { "epoch": 2.4546172059984217, "grad_norm": 165.82797241210938, "learning_rate": 9.109057987836018e-08, "logits/chosen": -19.11812400817871, "logits/rejected": -17.554203033447266, "logps/chosen": -433.6280212402344, "logps/rejected": -291.87200927734375, "loss": 0.8729, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.765183925628662, "rewards/margins": 2.024695873260498, "rewards/rejected": 1.7404884099960327, "step": 52870 }, { "epoch": 2.4550814801058545, "grad_norm": 207.3883819580078, "learning_rate": 9.101320086045468e-08, "logits/chosen": -19.208358764648438, 
"logits/rejected": -18.267791748046875, "logps/chosen": -405.10107421875, "logps/rejected": -316.56964111328125, "loss": 0.5686, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4466490745544434, "rewards/margins": 1.9768043756484985, "rewards/rejected": 1.4698448181152344, "step": 52880 }, { "epoch": 2.4555457542132877, "grad_norm": 28.134014129638672, "learning_rate": 9.093582184254917e-08, "logits/chosen": -19.403486251831055, "logits/rejected": -17.526958465576172, "logps/chosen": -451.88519287109375, "logps/rejected": -262.77056884765625, "loss": 0.4406, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.6767914295196533, "rewards/margins": 2.0350022315979004, "rewards/rejected": 1.641789197921753, "step": 52890 }, { "epoch": 2.4560100283207205, "grad_norm": 2.50492525100708, "learning_rate": 9.085844282464366e-08, "logits/chosen": -18.728912353515625, "logits/rejected": -17.81188201904297, "logps/chosen": -342.4564208984375, "logps/rejected": -277.9976806640625, "loss": 0.7088, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.398334503173828, "rewards/margins": 1.5261520147323608, "rewards/rejected": 2.872182607650757, "step": 52900 }, { "epoch": 2.4564743024281537, "grad_norm": 59.48302459716797, "learning_rate": 9.078106380673816e-08, "logits/chosen": -19.073383331298828, "logits/rejected": -18.32935905456543, "logps/chosen": -372.93878173828125, "logps/rejected": -322.39935302734375, "loss": 1.1825, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1938908100128174, "rewards/margins": 0.846282958984375, "rewards/rejected": 2.3476078510284424, "step": 52910 }, { "epoch": 2.4569385765355864, "grad_norm": 57.137630462646484, "learning_rate": 9.070368478883266e-08, "logits/chosen": -18.608339309692383, "logits/rejected": -18.9163761138916, "logps/chosen": -347.2594909667969, "logps/rejected": -336.2740783691406, "loss": 1.3921, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.000788927078247, 
"rewards/margins": 0.41367483139038086, "rewards/rejected": 2.587114095687866, "step": 52920 }, { "epoch": 2.4574028506430197, "grad_norm": 3.5220093727111816, "learning_rate": 9.062630577092716e-08, "logits/chosen": -18.708972930908203, "logits/rejected": -17.82581901550293, "logps/chosen": -312.96844482421875, "logps/rejected": -188.66281127929688, "loss": 0.3046, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.6834282875061035, "rewards/margins": 2.65857195854187, "rewards/rejected": 1.0248565673828125, "step": 52930 }, { "epoch": 2.457867124750453, "grad_norm": 63.9622917175293, "learning_rate": 9.054892675302164e-08, "logits/chosen": -18.825542449951172, "logits/rejected": -19.410221099853516, "logps/chosen": -370.0029602050781, "logps/rejected": -387.0541687011719, "loss": 2.0488, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.430605411529541, "rewards/margins": -1.0037038326263428, "rewards/rejected": 3.434309482574463, "step": 52940 }, { "epoch": 2.4583313988578857, "grad_norm": 1.538900375366211, "learning_rate": 9.047154773511614e-08, "logits/chosen": -19.5253849029541, "logits/rejected": -19.056690216064453, "logps/chosen": -305.8416442871094, "logps/rejected": -298.82696533203125, "loss": 1.3064, "rewards/accuracies": 0.5, "rewards/chosen": 4.301974296569824, "rewards/margins": 0.7796311378479004, "rewards/rejected": 3.522343397140503, "step": 52950 }, { "epoch": 2.458795672965319, "grad_norm": 3.3379945755004883, "learning_rate": 9.039416871721063e-08, "logits/chosen": -19.16591453552246, "logits/rejected": -19.372528076171875, "logps/chosen": -486.78350830078125, "logps/rejected": -492.36053466796875, "loss": 0.6044, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.310935020446777, "rewards/margins": 1.1355581283569336, "rewards/rejected": 3.1753761768341064, "step": 52960 }, { "epoch": 2.4592599470727516, "grad_norm": 6.526524543762207, "learning_rate": 9.031678969930514e-08, "logits/chosen": 
-18.231739044189453, "logits/rejected": -17.77451515197754, "logps/chosen": -284.8003234863281, "logps/rejected": -250.16622924804688, "loss": 1.131, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.9994010925292969, "rewards/margins": 0.7895939946174622, "rewards/rejected": 1.20980703830719, "step": 52970 }, { "epoch": 2.459724221180185, "grad_norm": 179.8590545654297, "learning_rate": 9.023941068139963e-08, "logits/chosen": -19.610605239868164, "logits/rejected": -19.337602615356445, "logps/chosen": -458.3441467285156, "logps/rejected": -324.51544189453125, "loss": 0.4999, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.9318695068359375, "rewards/margins": 1.996029257774353, "rewards/rejected": 1.935840368270874, "step": 52980 }, { "epoch": 2.4601884952876176, "grad_norm": 27.212787628173828, "learning_rate": 9.016203166349412e-08, "logits/chosen": -20.020885467529297, "logits/rejected": -19.021320343017578, "logps/chosen": -414.17230224609375, "logps/rejected": -342.9061584472656, "loss": 0.371, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.2324554920196533, "rewards/margins": 1.744394063949585, "rewards/rejected": 1.4880611896514893, "step": 52990 }, { "epoch": 2.460652769395051, "grad_norm": 0.8494476675987244, "learning_rate": 9.008465264558862e-08, "logits/chosen": -20.436975479125977, "logits/rejected": -18.909521102905273, "logps/chosen": -375.67413330078125, "logps/rejected": -250.97280883789062, "loss": 0.3849, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.3519160747528076, "rewards/margins": 1.3119792938232422, "rewards/rejected": 1.0399367809295654, "step": 53000 }, { "epoch": 2.461117043502484, "grad_norm": 1.7234383821487427, "learning_rate": 9.00072736276831e-08, "logits/chosen": -19.96510124206543, "logits/rejected": -19.11067008972168, "logps/chosen": -314.42230224609375, "logps/rejected": -227.0402069091797, "loss": 0.5106, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 
2.9555227756500244, "rewards/margins": 1.769323706626892, "rewards/rejected": 1.1861991882324219, "step": 53010 }, { "epoch": 2.461581317609917, "grad_norm": 117.92572021484375, "learning_rate": 8.992989460977761e-08, "logits/chosen": -17.91692543029785, "logits/rejected": -18.0966854095459, "logps/chosen": -423.23193359375, "logps/rejected": -312.2185974121094, "loss": 1.2481, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.2778098583221436, "rewards/margins": -0.45647192001342773, "rewards/rejected": 2.734281539916992, "step": 53020 }, { "epoch": 2.46204559171735, "grad_norm": 22.213788986206055, "learning_rate": 8.985251559187211e-08, "logits/chosen": -19.167652130126953, "logits/rejected": -18.94988441467285, "logps/chosen": -339.1443786621094, "logps/rejected": -328.0871887207031, "loss": 0.552, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.726442813873291, "rewards/margins": 1.0538861751556396, "rewards/rejected": 1.6725571155548096, "step": 53030 }, { "epoch": 2.462509865824783, "grad_norm": 125.41015625, "learning_rate": 8.97751365739666e-08, "logits/chosen": -18.577163696289062, "logits/rejected": -17.85585594177246, "logps/chosen": -423.9007873535156, "logps/rejected": -340.72003173828125, "loss": 0.5954, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.525798797607422, "rewards/margins": 1.5325441360473633, "rewards/rejected": 1.99325430393219, "step": 53040 }, { "epoch": 2.462974139932216, "grad_norm": 73.17523956298828, "learning_rate": 8.96977575560611e-08, "logits/chosen": -18.730716705322266, "logits/rejected": -18.67973518371582, "logps/chosen": -362.34283447265625, "logps/rejected": -323.02142333984375, "loss": 0.7921, "rewards/accuracies": 0.5, "rewards/chosen": 3.061082363128662, "rewards/margins": 1.0455119609832764, "rewards/rejected": 2.0155701637268066, "step": 53050 }, { "epoch": 2.463438414039649, "grad_norm": 11.709811210632324, "learning_rate": 8.962037853815558e-08, "logits/chosen": 
-19.602874755859375, "logits/rejected": -18.765933990478516, "logps/chosen": -382.92919921875, "logps/rejected": -341.40411376953125, "loss": 1.0895, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.851043224334717, "rewards/margins": 0.2390923798084259, "rewards/rejected": 3.6119511127471924, "step": 53060 }, { "epoch": 2.463902688147082, "grad_norm": 15.805018424987793, "learning_rate": 8.954299952025009e-08, "logits/chosen": -19.131664276123047, "logits/rejected": -18.665271759033203, "logps/chosen": -375.32513427734375, "logps/rejected": -360.06964111328125, "loss": 0.5224, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.8366332054138184, "rewards/margins": 0.5162451863288879, "rewards/rejected": 2.3203883171081543, "step": 53070 }, { "epoch": 2.4643669622545152, "grad_norm": 0.030355559661984444, "learning_rate": 8.946562050234458e-08, "logits/chosen": -18.62700080871582, "logits/rejected": -17.834274291992188, "logps/chosen": -358.45098876953125, "logps/rejected": -209.24301147460938, "loss": 0.2796, "rewards/accuracies": 1.0, "rewards/chosen": 4.56339168548584, "rewards/margins": 2.9727768898010254, "rewards/rejected": 1.5906155109405518, "step": 53080 }, { "epoch": 2.464831236361948, "grad_norm": 26.099252700805664, "learning_rate": 8.938824148443907e-08, "logits/chosen": -19.454259872436523, "logits/rejected": -18.244550704956055, "logps/chosen": -338.2759094238281, "logps/rejected": -282.4815673828125, "loss": 0.6346, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.281080722808838, "rewards/margins": 1.2919307947158813, "rewards/rejected": 1.989149808883667, "step": 53090 }, { "epoch": 2.4652955104693812, "grad_norm": 4.565036296844482, "learning_rate": 8.931086246653357e-08, "logits/chosen": -18.929279327392578, "logits/rejected": -18.146692276000977, "logps/chosen": -339.83514404296875, "logps/rejected": -294.15252685546875, "loss": 0.5485, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 
2.814359188079834, "rewards/margins": 1.5388084650039673, "rewards/rejected": 1.2755509614944458, "step": 53100 }, { "epoch": 2.465759784576814, "grad_norm": 32.71097183227539, "learning_rate": 8.923348344862807e-08, "logits/chosen": -18.59048843383789, "logits/rejected": -18.35839080810547, "logps/chosen": -488.56256103515625, "logps/rejected": -407.59857177734375, "loss": 0.8837, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.739154815673828, "rewards/margins": 0.27950435876846313, "rewards/rejected": 3.4596505165100098, "step": 53110 }, { "epoch": 2.466224058684247, "grad_norm": 1.3312326669692993, "learning_rate": 8.915610443072257e-08, "logits/chosen": -18.447404861450195, "logits/rejected": -17.55900764465332, "logps/chosen": -441.84747314453125, "logps/rejected": -347.44976806640625, "loss": 0.5435, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.0779266357421875, "rewards/margins": 1.6186764240264893, "rewards/rejected": 1.4592500925064087, "step": 53120 }, { "epoch": 2.46668833279168, "grad_norm": 17.232173919677734, "learning_rate": 8.907872541281705e-08, "logits/chosen": -19.084997177124023, "logits/rejected": -17.85615348815918, "logps/chosen": -383.1418151855469, "logps/rejected": -273.7635803222656, "loss": 0.4535, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.016648769378662, "rewards/margins": 1.9093023538589478, "rewards/rejected": 2.107346534729004, "step": 53130 }, { "epoch": 2.467152606899113, "grad_norm": 278.799560546875, "learning_rate": 8.900134639491155e-08, "logits/chosen": -19.168113708496094, "logits/rejected": -18.65479278564453, "logps/chosen": -297.0926513671875, "logps/rejected": -267.53656005859375, "loss": 0.6042, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.4971084594726562, "rewards/margins": 0.45006808638572693, "rewards/rejected": 2.0470404624938965, "step": 53140 }, { "epoch": 2.4676168810065464, "grad_norm": 22.172725677490234, "learning_rate": 8.892396737700605e-08, 
"logits/chosen": -18.334415435791016, "logits/rejected": -18.569988250732422, "logps/chosen": -388.02581787109375, "logps/rejected": -342.24920654296875, "loss": 1.1268, "rewards/accuracies": 0.5, "rewards/chosen": 2.509913921356201, "rewards/margins": 0.4919828474521637, "rewards/rejected": 2.0179309844970703, "step": 53150 }, { "epoch": 2.468081155113979, "grad_norm": 120.73959350585938, "learning_rate": 8.884658835910055e-08, "logits/chosen": -18.330135345458984, "logits/rejected": -17.080896377563477, "logps/chosen": -311.81329345703125, "logps/rejected": -191.91319274902344, "loss": 0.32, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.5095980167388916, "rewards/margins": 2.052821159362793, "rewards/rejected": 0.45677700638771057, "step": 53160 }, { "epoch": 2.4685454292214124, "grad_norm": 63.09660720825195, "learning_rate": 8.876920934119504e-08, "logits/chosen": -18.749719619750977, "logits/rejected": -18.41802406311035, "logps/chosen": -364.3975524902344, "logps/rejected": -372.7663879394531, "loss": 0.8826, "rewards/accuracies": 0.5, "rewards/chosen": 3.352736234664917, "rewards/margins": 0.6537086963653564, "rewards/rejected": 2.6990272998809814, "step": 53170 }, { "epoch": 2.469009703328845, "grad_norm": 165.71434020996094, "learning_rate": 8.869183032328953e-08, "logits/chosen": -20.36628532409668, "logits/rejected": -19.352813720703125, "logps/chosen": -317.58758544921875, "logps/rejected": -263.81256103515625, "loss": 1.069, "rewards/accuracies": 0.5, "rewards/chosen": 2.4509520530700684, "rewards/margins": 0.11588659137487411, "rewards/rejected": 2.3350656032562256, "step": 53180 }, { "epoch": 2.4694739774362784, "grad_norm": 144.1902313232422, "learning_rate": 8.861445130538403e-08, "logits/chosen": -18.31009292602539, "logits/rejected": -18.365192413330078, "logps/chosen": -386.2212219238281, "logps/rejected": -293.32537841796875, "loss": 0.8142, "rewards/accuracies": 0.5, "rewards/chosen": 3.780604600906372, "rewards/margins": 
0.46417856216430664, "rewards/rejected": 3.3164258003234863, "step": 53190 }, { "epoch": 2.469938251543711, "grad_norm": 15.944648742675781, "learning_rate": 8.853707228747851e-08, "logits/chosen": -19.37063980102539, "logits/rejected": -19.011266708374023, "logps/chosen": -409.3065490722656, "logps/rejected": -384.59405517578125, "loss": 0.7104, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.8880457878112793, "rewards/margins": 0.7466461658477783, "rewards/rejected": 3.14139986038208, "step": 53200 }, { "epoch": 2.4704025256511444, "grad_norm": 286.2671813964844, "learning_rate": 8.845969326957302e-08, "logits/chosen": -19.867326736450195, "logits/rejected": -19.506471633911133, "logps/chosen": -353.8517761230469, "logps/rejected": -344.84381103515625, "loss": 0.9777, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.249138593673706, "rewards/margins": 0.2949987053871155, "rewards/rejected": 2.9541401863098145, "step": 53210 }, { "epoch": 2.4708667997585776, "grad_norm": 71.57482147216797, "learning_rate": 8.838231425166752e-08, "logits/chosen": -19.194286346435547, "logits/rejected": -18.517467498779297, "logps/chosen": -483.19281005859375, "logps/rejected": -330.912841796875, "loss": 0.3027, "rewards/accuracies": 1.0, "rewards/chosen": 4.121125221252441, "rewards/margins": 1.87026846408844, "rewards/rejected": 2.2508559226989746, "step": 53220 }, { "epoch": 2.4713310738660104, "grad_norm": 23.716842651367188, "learning_rate": 8.830493523376201e-08, "logits/chosen": -19.47345733642578, "logits/rejected": -18.805740356445312, "logps/chosen": -426.19305419921875, "logps/rejected": -374.88262939453125, "loss": 0.6532, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.3777718544006348, "rewards/margins": 1.0386416912078857, "rewards/rejected": 2.33912992477417, "step": 53230 }, { "epoch": 2.4717953479734436, "grad_norm": 2.0976531505584717, "learning_rate": 8.82275562158565e-08, "logits/chosen": -19.05476951599121, 
"logits/rejected": -19.19516372680664, "logps/chosen": -378.136474609375, "logps/rejected": -366.2614440917969, "loss": 0.8058, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.856606960296631, "rewards/margins": 1.3201172351837158, "rewards/rejected": 2.536489963531494, "step": 53240 }, { "epoch": 2.472259622080877, "grad_norm": 8.913054466247559, "learning_rate": 8.815017719795099e-08, "logits/chosen": -20.176612854003906, "logits/rejected": -19.19167137145996, "logps/chosen": -458.4794006347656, "logps/rejected": -345.5992736816406, "loss": 0.5178, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.971259593963623, "rewards/margins": 1.585656762123108, "rewards/rejected": 3.385603427886963, "step": 53250 }, { "epoch": 2.4727238961883096, "grad_norm": 3.5922014713287354, "learning_rate": 8.80727981800455e-08, "logits/chosen": -18.091312408447266, "logits/rejected": -17.382474899291992, "logps/chosen": -349.3730163574219, "logps/rejected": -308.455322265625, "loss": 0.6609, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.069509506225586, "rewards/margins": 1.9819352626800537, "rewards/rejected": 1.0875743627548218, "step": 53260 }, { "epoch": 2.473188170295743, "grad_norm": 248.99569702148438, "learning_rate": 8.799541916213999e-08, "logits/chosen": -17.220966339111328, "logits/rejected": -17.905864715576172, "logps/chosen": -347.62896728515625, "logps/rejected": -366.1524658203125, "loss": 1.1449, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.231924057006836, "rewards/margins": 0.289397656917572, "rewards/rejected": 1.9425264596939087, "step": 53270 }, { "epoch": 2.4736524444031756, "grad_norm": 91.98844909667969, "learning_rate": 8.791804014423448e-08, "logits/chosen": -19.388484954833984, "logits/rejected": -18.790449142456055, "logps/chosen": -432.3544921875, "logps/rejected": -389.8876953125, "loss": 0.717, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.9015345573425293, "rewards/margins": 
0.5672696828842163, "rewards/rejected": 3.3342652320861816, "step": 53280 }, { "epoch": 2.4741167185106088, "grad_norm": 76.56887817382812, "learning_rate": 8.784066112632898e-08, "logits/chosen": -19.05177879333496, "logits/rejected": -18.316617965698242, "logps/chosen": -429.0751953125, "logps/rejected": -427.743408203125, "loss": 0.5107, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.687058687210083, "rewards/margins": 1.5331649780273438, "rewards/rejected": 2.1538939476013184, "step": 53290 }, { "epoch": 2.4745809926180415, "grad_norm": 35.849884033203125, "learning_rate": 8.776328210842347e-08, "logits/chosen": -19.694427490234375, "logits/rejected": -19.29973793029785, "logps/chosen": -550.4076538085938, "logps/rejected": -430.33563232421875, "loss": 0.5302, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.3989973068237305, "rewards/margins": 0.875471293926239, "rewards/rejected": 3.5235257148742676, "step": 53300 }, { "epoch": 2.4750452667254748, "grad_norm": 54.363895416259766, "learning_rate": 8.768590309051798e-08, "logits/chosen": -18.163816452026367, "logits/rejected": -18.446014404296875, "logps/chosen": -422.26416015625, "logps/rejected": -397.75018310546875, "loss": 0.8362, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.174086332321167, "rewards/margins": 0.07224148511886597, "rewards/rejected": 3.101844549179077, "step": 53310 }, { "epoch": 2.475509540832908, "grad_norm": 30.76283836364746, "learning_rate": 8.760852407261246e-08, "logits/chosen": -18.631948471069336, "logits/rejected": -17.356552124023438, "logps/chosen": -300.1866455078125, "logps/rejected": -218.138671875, "loss": 0.5332, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.2683374881744385, "rewards/margins": 1.0104385614395142, "rewards/rejected": 1.2578989267349243, "step": 53320 }, { "epoch": 2.4759738149403407, "grad_norm": 108.11909484863281, "learning_rate": 8.753114505470696e-08, "logits/chosen": -19.02565574645996, 
"logits/rejected": -18.67680549621582, "logps/chosen": -333.6589660644531, "logps/rejected": -284.5177307128906, "loss": 0.4891, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.9268546104431152, "rewards/margins": 1.9797637462615967, "rewards/rejected": 1.94709050655365, "step": 53330 }, { "epoch": 2.476438089047774, "grad_norm": 253.16171264648438, "learning_rate": 8.745376603680146e-08, "logits/chosen": -19.385765075683594, "logits/rejected": -18.99085235595703, "logps/chosen": -380.48736572265625, "logps/rejected": -324.2588195800781, "loss": 0.9475, "rewards/accuracies": 0.5, "rewards/chosen": 2.650084972381592, "rewards/margins": 0.6486150622367859, "rewards/rejected": 2.0014700889587402, "step": 53340 }, { "epoch": 2.4769023631552067, "grad_norm": 3.28078293800354, "learning_rate": 8.737638701889594e-08, "logits/chosen": -18.54046630859375, "logits/rejected": -17.929048538208008, "logps/chosen": -458.5543518066406, "logps/rejected": -328.6601867675781, "loss": 0.4842, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.086897850036621, "rewards/margins": 2.0996716022491455, "rewards/rejected": 1.9872262477874756, "step": 53350 }, { "epoch": 2.47736663726264, "grad_norm": 69.78158569335938, "learning_rate": 8.729900800099046e-08, "logits/chosen": -19.07516098022461, "logits/rejected": -18.898061752319336, "logps/chosen": -466.8509826660156, "logps/rejected": -415.8826599121094, "loss": 0.5941, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2420146465301514, "rewards/margins": 0.9909971356391907, "rewards/rejected": 2.2510178089141846, "step": 53360 }, { "epoch": 2.4778309113700727, "grad_norm": 83.76651763916016, "learning_rate": 8.722162898308494e-08, "logits/chosen": -19.071195602416992, "logits/rejected": -18.6489315032959, "logps/chosen": -458.22369384765625, "logps/rejected": -402.3866882324219, "loss": 0.7882, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.3286638259887695, "rewards/margins": 
1.2615770101547241, "rewards/rejected": 3.067086935043335, "step": 53370 }, { "epoch": 2.478295185477506, "grad_norm": 76.27908325195312, "learning_rate": 8.714424996517944e-08, "logits/chosen": -18.605783462524414, "logits/rejected": -17.44391632080078, "logps/chosen": -436.6500549316406, "logps/rejected": -292.59130859375, "loss": 0.247, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.574066638946533, "rewards/margins": 1.9949960708618164, "rewards/rejected": 1.579070806503296, "step": 53380 }, { "epoch": 2.478759459584939, "grad_norm": 2.2546212673187256, "learning_rate": 8.706687094727392e-08, "logits/chosen": -19.344757080078125, "logits/rejected": -18.65651512145996, "logps/chosen": -415.41973876953125, "logps/rejected": -328.61785888671875, "loss": 0.5262, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.114940643310547, "rewards/margins": 2.0174167156219482, "rewards/rejected": 2.0975241661071777, "step": 53390 }, { "epoch": 2.479223733692372, "grad_norm": 139.08445739746094, "learning_rate": 8.698949192936843e-08, "logits/chosen": -19.50443458557129, "logits/rejected": -19.828866958618164, "logps/chosen": -320.37030029296875, "logps/rejected": -323.11370849609375, "loss": 0.815, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2189106941223145, "rewards/margins": 0.4753190875053406, "rewards/rejected": 2.743591547012329, "step": 53400 }, { "epoch": 2.479688007799805, "grad_norm": 4.875733852386475, "learning_rate": 8.691211291146293e-08, "logits/chosen": -18.94982147216797, "logits/rejected": -18.02223014831543, "logps/chosen": -404.48284912109375, "logps/rejected": -340.37054443359375, "loss": 0.3998, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.4327049255371094, "rewards/margins": 1.1514228582382202, "rewards/rejected": 2.2812821865081787, "step": 53410 }, { "epoch": 2.480152281907238, "grad_norm": 58.507354736328125, "learning_rate": 8.683473389355742e-08, "logits/chosen": -18.333839416503906, 
"logits/rejected": -17.4576358795166, "logps/chosen": -259.11138916015625, "logps/rejected": -173.1888427734375, "loss": 0.4174, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.212473154067993, "rewards/margins": 1.8705991506576538, "rewards/rejected": 0.34187403321266174, "step": 53420 }, { "epoch": 2.480616556014671, "grad_norm": 188.66941833496094, "learning_rate": 8.675735487565192e-08, "logits/chosen": -20.095548629760742, "logits/rejected": -19.471776962280273, "logps/chosen": -404.57415771484375, "logps/rejected": -333.8827209472656, "loss": 0.7554, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.7608859539031982, "rewards/margins": 0.6866496801376343, "rewards/rejected": 3.0742363929748535, "step": 53430 }, { "epoch": 2.481080830122104, "grad_norm": 220.65220642089844, "learning_rate": 8.66799758577464e-08, "logits/chosen": -19.11285972595215, "logits/rejected": -18.74709701538086, "logps/chosen": -335.41363525390625, "logps/rejected": -335.6896057128906, "loss": 1.1509, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5152411460876465, "rewards/margins": 0.6855764389038086, "rewards/rejected": 2.829664945602417, "step": 53440 }, { "epoch": 2.481545104229537, "grad_norm": 98.54949188232422, "learning_rate": 8.660259683984091e-08, "logits/chosen": -18.7879638671875, "logits/rejected": -18.73197364807129, "logps/chosen": -325.0191345214844, "logps/rejected": -339.6106262207031, "loss": 0.6583, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.7989861965179443, "rewards/margins": 0.4626041352748871, "rewards/rejected": 2.3363823890686035, "step": 53450 }, { "epoch": 2.4820093783369703, "grad_norm": 0.14856994152069092, "learning_rate": 8.652521782193541e-08, "logits/chosen": -19.973384857177734, "logits/rejected": -18.178897857666016, "logps/chosen": -447.8164978027344, "logps/rejected": -341.5196228027344, "loss": 0.6404, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.823359489440918, 
"rewards/margins": 2.0016844272613525, "rewards/rejected": 2.8216755390167236, "step": 53460 }, { "epoch": 2.482473652444403, "grad_norm": 17.779829025268555, "learning_rate": 8.64478388040299e-08, "logits/chosen": -19.149608612060547, "logits/rejected": -18.628461837768555, "logps/chosen": -313.5843200683594, "logps/rejected": -215.615966796875, "loss": 0.6324, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.7319278717041016, "rewards/margins": 1.7689815759658813, "rewards/rejected": 0.9629461169242859, "step": 53470 }, { "epoch": 2.4829379265518363, "grad_norm": 186.136962890625, "learning_rate": 8.637045978612439e-08, "logits/chosen": -18.97147560119629, "logits/rejected": -18.863422393798828, "logps/chosen": -428.45562744140625, "logps/rejected": -424.4153747558594, "loss": 0.5665, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.215961456298828, "rewards/margins": 0.5632351040840149, "rewards/rejected": 2.652726411819458, "step": 53480 }, { "epoch": 2.483402200659269, "grad_norm": 54.51342010498047, "learning_rate": 8.629308076821888e-08, "logits/chosen": -18.626623153686523, "logits/rejected": -18.20929718017578, "logps/chosen": -302.24456787109375, "logps/rejected": -316.717041015625, "loss": 1.2621, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.948499321937561, "rewards/margins": 0.46555987000465393, "rewards/rejected": 1.4829394817352295, "step": 53490 }, { "epoch": 2.4838664747667023, "grad_norm": 99.69163513183594, "learning_rate": 8.621570175031339e-08, "logits/chosen": -19.226640701293945, "logits/rejected": -18.151525497436523, "logps/chosen": -476.545654296875, "logps/rejected": -365.4356994628906, "loss": 0.7091, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.324535369873047, "rewards/margins": 0.6950254440307617, "rewards/rejected": 2.629509925842285, "step": 53500 }, { "epoch": 2.484330748874135, "grad_norm": 1.2326356172561646, "learning_rate": 8.613832273240787e-08, "logits/chosen": 
-19.609134674072266, "logits/rejected": -18.474056243896484, "logps/chosen": -397.9308166503906, "logps/rejected": -243.3746795654297, "loss": 0.7927, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2678229808807373, "rewards/margins": 1.3750050067901611, "rewards/rejected": 1.8928178548812866, "step": 53510 }, { "epoch": 2.4847950229815683, "grad_norm": 107.68511199951172, "learning_rate": 8.606094371450237e-08, "logits/chosen": -19.737064361572266, "logits/rejected": -19.66309928894043, "logps/chosen": -356.148681640625, "logps/rejected": -308.10089111328125, "loss": 0.703, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.204614162445068, "rewards/margins": 1.3358930349349976, "rewards/rejected": 2.8687210083007812, "step": 53520 }, { "epoch": 2.4852592970890015, "grad_norm": 211.10874938964844, "learning_rate": 8.598356469659687e-08, "logits/chosen": -18.832120895385742, "logits/rejected": -18.86856460571289, "logps/chosen": -377.77642822265625, "logps/rejected": -379.9696350097656, "loss": 0.9824, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.691464900970459, "rewards/margins": 0.3335005044937134, "rewards/rejected": 2.357964277267456, "step": 53530 }, { "epoch": 2.4857235711964343, "grad_norm": 7.3348283767700195, "learning_rate": 8.590618567869135e-08, "logits/chosen": -19.046499252319336, "logits/rejected": -18.685344696044922, "logps/chosen": -338.3724670410156, "logps/rejected": -297.64984130859375, "loss": 0.9089, "rewards/accuracies": 0.5, "rewards/chosen": 2.856642961502075, "rewards/margins": 0.6850873231887817, "rewards/rejected": 2.171555519104004, "step": 53540 }, { "epoch": 2.4861878453038675, "grad_norm": 83.54499816894531, "learning_rate": 8.582880666078587e-08, "logits/chosen": -20.067665100097656, "logits/rejected": -19.422441482543945, "logps/chosen": -407.74371337890625, "logps/rejected": -356.9292907714844, "loss": 0.6243, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.005712985992432, 
"rewards/margins": 1.105999231338501, "rewards/rejected": 2.8997139930725098, "step": 53550 }, { "epoch": 2.4866521194113003, "grad_norm": 245.66639709472656, "learning_rate": 8.575142764288035e-08, "logits/chosen": -19.280170440673828, "logits/rejected": -17.75485610961914, "logps/chosen": -398.192626953125, "logps/rejected": -276.3846130371094, "loss": 0.4343, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.490891933441162, "rewards/margins": 2.931364059448242, "rewards/rejected": 1.5595283508300781, "step": 53560 }, { "epoch": 2.4871163935187335, "grad_norm": 123.10244750976562, "learning_rate": 8.567404862497485e-08, "logits/chosen": -18.271160125732422, "logits/rejected": -17.975830078125, "logps/chosen": -248.340576171875, "logps/rejected": -178.339111328125, "loss": 0.5648, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.9941542148590088, "rewards/margins": 0.973589301109314, "rewards/rejected": 1.0205649137496948, "step": 53570 }, { "epoch": 2.4875806676261663, "grad_norm": 77.8828125, "learning_rate": 8.559666960706935e-08, "logits/chosen": -18.96395492553711, "logits/rejected": -18.628936767578125, "logps/chosen": -314.6376037597656, "logps/rejected": -267.9477233886719, "loss": 0.4903, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.405991315841675, "rewards/margins": 1.1832730770111084, "rewards/rejected": 2.2227184772491455, "step": 53580 }, { "epoch": 2.4880449417335995, "grad_norm": 360.98541259765625, "learning_rate": 8.551929058916383e-08, "logits/chosen": -19.263202667236328, "logits/rejected": -17.79410171508789, "logps/chosen": -353.4685974121094, "logps/rejected": -247.2805938720703, "loss": 0.5038, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.82234525680542, "rewards/margins": 1.6214616298675537, "rewards/rejected": 1.2008836269378662, "step": 53590 }, { "epoch": 2.4885092158410327, "grad_norm": 115.45328521728516, "learning_rate": 8.544191157125834e-08, "logits/chosen": 
-17.800121307373047, "logits/rejected": -18.685184478759766, "logps/chosen": -301.74822998046875, "logps/rejected": -362.6490783691406, "loss": 1.269, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.2973926067352295, "rewards/margins": -0.24720540642738342, "rewards/rejected": 3.544598340988159, "step": 53600 }, { "epoch": 2.4889734899484655, "grad_norm": 76.09783172607422, "learning_rate": 8.536453255335283e-08, "logits/chosen": -19.697397232055664, "logits/rejected": -18.684818267822266, "logps/chosen": -409.31683349609375, "logps/rejected": -264.6957702636719, "loss": 0.5136, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.1042113304138184, "rewards/margins": 1.7545182704925537, "rewards/rejected": 1.3496931791305542, "step": 53610 }, { "epoch": 2.4894377640558987, "grad_norm": 34.116146087646484, "learning_rate": 8.528715353544733e-08, "logits/chosen": -18.900646209716797, "logits/rejected": -18.51753807067871, "logps/chosen": -208.6640167236328, "logps/rejected": -188.58322143554688, "loss": 1.3026, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.9553970098495483, "rewards/margins": 0.36800307035446167, "rewards/rejected": 0.5873939990997314, "step": 53620 }, { "epoch": 2.4899020381633314, "grad_norm": 36.88664627075195, "learning_rate": 8.520977451754181e-08, "logits/chosen": -19.509984970092773, "logits/rejected": -17.848281860351562, "logps/chosen": -355.3365173339844, "logps/rejected": -265.18890380859375, "loss": 0.3871, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.24348521232605, "rewards/margins": 1.972354531288147, "rewards/rejected": 1.271130919456482, "step": 53630 }, { "epoch": 2.4903663122707647, "grad_norm": 23.410045623779297, "learning_rate": 8.513239549963631e-08, "logits/chosen": -18.448612213134766, "logits/rejected": -18.10691261291504, "logps/chosen": -343.8152770996094, "logps/rejected": -270.81109619140625, "loss": 0.7765, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 
1.6125301122665405, "rewards/margins": 0.559585690498352, "rewards/rejected": 1.0529447793960571, "step": 53640 }, { "epoch": 2.490830586378198, "grad_norm": 104.30764770507812, "learning_rate": 8.505501648173082e-08, "logits/chosen": -18.89996337890625, "logits/rejected": -17.51784324645996, "logps/chosen": -369.9268798828125, "logps/rejected": -251.29306030273438, "loss": 0.3779, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.6559250354766846, "rewards/margins": 2.4044861793518066, "rewards/rejected": 1.251438856124878, "step": 53650 }, { "epoch": 2.4912948604856306, "grad_norm": 28.26565933227539, "learning_rate": 8.49776374638253e-08, "logits/chosen": -19.049779891967773, "logits/rejected": -18.108871459960938, "logps/chosen": -336.7347412109375, "logps/rejected": -218.9002685546875, "loss": 0.4646, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.869357109069824, "rewards/margins": 2.255253314971924, "rewards/rejected": 0.6141039133071899, "step": 53660 }, { "epoch": 2.491759134593064, "grad_norm": 190.58038330078125, "learning_rate": 8.49002584459198e-08, "logits/chosen": -18.973350524902344, "logits/rejected": -17.834827423095703, "logps/chosen": -527.7327270507812, "logps/rejected": -310.0960693359375, "loss": 0.5214, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.342756748199463, "rewards/margins": 2.536733388900757, "rewards/rejected": 1.8060235977172852, "step": 53670 }, { "epoch": 2.4922234087004966, "grad_norm": 12.284629821777344, "learning_rate": 8.482287942801429e-08, "logits/chosen": -19.837190628051758, "logits/rejected": -18.366344451904297, "logps/chosen": -357.1782531738281, "logps/rejected": -275.4052734375, "loss": 0.453, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.098523139953613, "rewards/margins": 2.2793641090393066, "rewards/rejected": 1.8191595077514648, "step": 53680 }, { "epoch": 2.49268768280793, "grad_norm": 0.9392388463020325, "learning_rate": 8.474550041010879e-08, 
"logits/chosen": -19.430648803710938, "logits/rejected": -18.65458106994629, "logps/chosen": -410.0128479003906, "logps/rejected": -388.520751953125, "loss": 1.0342, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.2095565795898438, "rewards/margins": 0.855869472026825, "rewards/rejected": 2.353686809539795, "step": 53690 }, { "epoch": 2.493151956915363, "grad_norm": 10.942546844482422, "learning_rate": 8.46681213922033e-08, "logits/chosen": -19.725341796875, "logits/rejected": -19.505786895751953, "logps/chosen": -349.85711669921875, "logps/rejected": -380.4553527832031, "loss": 0.6725, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.127626419067383, "rewards/margins": 0.8846712112426758, "rewards/rejected": 3.242954969406128, "step": 53700 }, { "epoch": 2.493616231022796, "grad_norm": 269.6287841796875, "learning_rate": 8.459074237429778e-08, "logits/chosen": -19.108606338500977, "logits/rejected": -18.965892791748047, "logps/chosen": -310.8584899902344, "logps/rejected": -325.774169921875, "loss": 1.0901, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.9117989540100098, "rewards/margins": 0.48322105407714844, "rewards/rejected": 2.4285778999328613, "step": 53710 }, { "epoch": 2.494080505130229, "grad_norm": 40.91228485107422, "learning_rate": 8.451336335639228e-08, "logits/chosen": -19.386669158935547, "logits/rejected": -19.113224029541016, "logps/chosen": -401.64239501953125, "logps/rejected": -367.6700134277344, "loss": 0.6477, "rewards/accuracies": 0.5, "rewards/chosen": 3.919750213623047, "rewards/margins": 0.5928489565849304, "rewards/rejected": 3.32690167427063, "step": 53720 }, { "epoch": 2.494544779237662, "grad_norm": 135.75062561035156, "learning_rate": 8.443598433848676e-08, "logits/chosen": -19.327556610107422, "logits/rejected": -18.80722427368164, "logps/chosen": -366.6414489746094, "logps/rejected": -300.42498779296875, "loss": 0.5764, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 
2.943704605102539, "rewards/margins": 0.3822179436683655, "rewards/rejected": 2.5614869594573975, "step": 53730 }, { "epoch": 2.495009053345095, "grad_norm": 4.995602607727051, "learning_rate": 8.435860532058128e-08, "logits/chosen": -20.146394729614258, "logits/rejected": -18.183238983154297, "logps/chosen": -439.76806640625, "logps/rejected": -213.9208526611328, "loss": 0.2812, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.9351115226745605, "rewards/margins": 2.52641224861145, "rewards/rejected": 1.4086992740631104, "step": 53740 }, { "epoch": 2.495473327452528, "grad_norm": 128.35423278808594, "learning_rate": 8.428122630267576e-08, "logits/chosen": -19.098241806030273, "logits/rejected": -18.390838623046875, "logps/chosen": -436.10308837890625, "logps/rejected": -415.09088134765625, "loss": 0.8738, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5469470024108887, "rewards/margins": 1.2164666652679443, "rewards/rejected": 2.3304805755615234, "step": 53750 }, { "epoch": 2.495937601559961, "grad_norm": 152.64451599121094, "learning_rate": 8.420384728477026e-08, "logits/chosen": -18.091838836669922, "logits/rejected": -18.1778564453125, "logps/chosen": -424.23602294921875, "logps/rejected": -379.45751953125, "loss": 1.36, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.972853422164917, "rewards/margins": 0.16335619986057281, "rewards/rejected": 3.809497356414795, "step": 53760 }, { "epoch": 2.4964018756673942, "grad_norm": 28.96641731262207, "learning_rate": 8.412646826686476e-08, "logits/chosen": -19.282331466674805, "logits/rejected": -18.369089126586914, "logps/chosen": -461.0733947753906, "logps/rejected": -344.15972900390625, "loss": 0.4615, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.2730441093444824, "rewards/margins": 1.570630431175232, "rewards/rejected": 1.70241379737854, "step": 53770 }, { "epoch": 2.496866149774827, "grad_norm": 31.702655792236328, "learning_rate": 8.404908924895924e-08, 
"logits/chosen": -18.858619689941406, "logits/rejected": -18.795291900634766, "logps/chosen": -398.6681213378906, "logps/rejected": -347.64593505859375, "loss": 0.6798, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.005345106124878, "rewards/margins": 0.6347044706344604, "rewards/rejected": 2.370640754699707, "step": 53780 }, { "epoch": 2.4973304238822602, "grad_norm": 105.97293090820312, "learning_rate": 8.397171023105375e-08, "logits/chosen": -18.865062713623047, "logits/rejected": -18.346208572387695, "logps/chosen": -404.2882080078125, "logps/rejected": -342.22344970703125, "loss": 0.531, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.8367090225219727, "rewards/margins": 1.080268144607544, "rewards/rejected": 1.7564408779144287, "step": 53790 }, { "epoch": 2.497794697989693, "grad_norm": 215.0267333984375, "learning_rate": 8.389433121314824e-08, "logits/chosen": -19.571029663085938, "logits/rejected": -18.803415298461914, "logps/chosen": -370.8804931640625, "logps/rejected": -308.23089599609375, "loss": 0.9369, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.306057929992676, "rewards/margins": 0.9731396436691284, "rewards/rejected": 3.3329176902770996, "step": 53800 }, { "epoch": 2.498258972097126, "grad_norm": 106.15505981445312, "learning_rate": 8.381695219524274e-08, "logits/chosen": -19.010360717773438, "logits/rejected": -18.475719451904297, "logps/chosen": -387.68780517578125, "logps/rejected": -296.32080078125, "loss": 0.7913, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.658822536468506, "rewards/margins": 1.9572092294692993, "rewards/rejected": 1.701613187789917, "step": 53810 }, { "epoch": 2.498723246204559, "grad_norm": 37.96015548706055, "learning_rate": 8.373957317733723e-08, "logits/chosen": -19.094648361206055, "logits/rejected": -18.46685218811035, "logps/chosen": -357.6614685058594, "logps/rejected": -371.4713439941406, "loss": 0.6702, "rewards/accuracies": 0.699999988079071, 
"rewards/chosen": 3.487732410430908, "rewards/margins": 1.2110650539398193, "rewards/rejected": 2.2766671180725098, "step": 53820 }, { "epoch": 2.499187520311992, "grad_norm": 263.375732421875, "learning_rate": 8.366219415943172e-08, "logits/chosen": -18.902164459228516, "logits/rejected": -19.083446502685547, "logps/chosen": -359.1360778808594, "logps/rejected": -323.69171142578125, "loss": 1.0941, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.433236598968506, "rewards/margins": -0.08345906436443329, "rewards/rejected": 3.516695499420166, "step": 53830 }, { "epoch": 2.4996517944194254, "grad_norm": 50.46466827392578, "learning_rate": 8.358481514152623e-08, "logits/chosen": -19.641992568969727, "logits/rejected": -19.396259307861328, "logps/chosen": -336.76300048828125, "logps/rejected": -340.0735168457031, "loss": 1.1065, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.8388659954071045, "rewards/margins": -0.2813982665538788, "rewards/rejected": 4.120265007019043, "step": 53840 }, { "epoch": 2.500116068526858, "grad_norm": 330.20458984375, "learning_rate": 8.350743612362071e-08, "logits/chosen": -18.39107322692871, "logits/rejected": -17.665346145629883, "logps/chosen": -400.41046142578125, "logps/rejected": -316.883056640625, "loss": 0.7792, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.6189677715301514, "rewards/margins": 0.9095166921615601, "rewards/rejected": 2.709451198577881, "step": 53850 }, { "epoch": 2.5005803426342914, "grad_norm": 122.33988952636719, "learning_rate": 8.343005710571521e-08, "logits/chosen": -20.051713943481445, "logits/rejected": -19.844226837158203, "logps/chosen": -443.07342529296875, "logps/rejected": -426.5535583496094, "loss": 0.4823, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.79666805267334, "rewards/margins": 0.9863706827163696, "rewards/rejected": 3.810297727584839, "step": 53860 }, { "epoch": 2.501044616741724, "grad_norm": 102.21971130371094, "learning_rate": 
8.33526780878097e-08, "logits/chosen": -19.39248275756836, "logits/rejected": -18.871509552001953, "logps/chosen": -377.7098083496094, "logps/rejected": -325.81817626953125, "loss": 0.5019, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.727071762084961, "rewards/margins": 1.5588924884796143, "rewards/rejected": 2.1681792736053467, "step": 53870 }, { "epoch": 2.5015088908491574, "grad_norm": 97.00584411621094, "learning_rate": 8.32752990699042e-08, "logits/chosen": -18.20606231689453, "logits/rejected": -18.21183204650879, "logps/chosen": -361.90557861328125, "logps/rejected": -378.30023193359375, "loss": 0.899, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6970551013946533, "rewards/margins": 0.8417078852653503, "rewards/rejected": 2.8553466796875, "step": 53880 }, { "epoch": 2.50197316495659, "grad_norm": 55.248695373535156, "learning_rate": 8.319792005199871e-08, "logits/chosen": -18.718088150024414, "logits/rejected": -18.33150291442871, "logps/chosen": -276.4688720703125, "logps/rejected": -310.47760009765625, "loss": 0.8675, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.9493458271026611, "rewards/margins": 0.3022159934043884, "rewards/rejected": 1.6471296548843384, "step": 53890 }, { "epoch": 2.5024374390640234, "grad_norm": 1.7977843284606934, "learning_rate": 8.312054103409319e-08, "logits/chosen": -19.4404296875, "logits/rejected": -17.832426071166992, "logps/chosen": -431.62420654296875, "logps/rejected": -315.80072021484375, "loss": 0.4379, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.9174888134002686, "rewards/margins": 2.281881332397461, "rewards/rejected": 1.635607361793518, "step": 53900 }, { "epoch": 2.5029017131714566, "grad_norm": 36.326969146728516, "learning_rate": 8.304316201618769e-08, "logits/chosen": -18.095766067504883, "logits/rejected": -17.93752670288086, "logps/chosen": -407.86932373046875, "logps/rejected": -350.9817810058594, "loss": 0.9429, "rewards/accuracies": 
0.699999988079071, "rewards/chosen": 2.770719528198242, "rewards/margins": 0.5685328841209412, "rewards/rejected": 2.2021865844726562, "step": 53910 }, { "epoch": 2.5033659872788894, "grad_norm": 10.205700874328613, "learning_rate": 8.296578299828217e-08, "logits/chosen": -19.044240951538086, "logits/rejected": -17.97563934326172, "logps/chosen": -375.70257568359375, "logps/rejected": -262.80108642578125, "loss": 0.3217, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.077542781829834, "rewards/margins": 1.7873642444610596, "rewards/rejected": 1.290178894996643, "step": 53920 }, { "epoch": 2.5038302613863226, "grad_norm": 188.55836486816406, "learning_rate": 8.288840398037667e-08, "logits/chosen": -18.294437408447266, "logits/rejected": -18.542518615722656, "logps/chosen": -415.1698303222656, "logps/rejected": -350.06396484375, "loss": 0.9107, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.643603801727295, "rewards/margins": 0.24280250072479248, "rewards/rejected": 2.400801420211792, "step": 53930 }, { "epoch": 2.5042945354937554, "grad_norm": 16.18670654296875, "learning_rate": 8.281102496247118e-08, "logits/chosen": -18.915952682495117, "logits/rejected": -17.60325050354004, "logps/chosen": -478.2610778808594, "logps/rejected": -373.43145751953125, "loss": 0.6013, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.143131732940674, "rewards/margins": 2.6414635181427, "rewards/rejected": 2.5016684532165527, "step": 53940 }, { "epoch": 2.5047588096011886, "grad_norm": 121.19420623779297, "learning_rate": 8.273364594456567e-08, "logits/chosen": -19.407670974731445, "logits/rejected": -18.32898712158203, "logps/chosen": -403.13360595703125, "logps/rejected": -291.06964111328125, "loss": 0.4118, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.137895107269287, "rewards/margins": 1.9287312030792236, "rewards/rejected": 2.209163188934326, "step": 53950 }, { "epoch": 2.5052230837086213, "grad_norm": 326.3648376464844, 
"learning_rate": 8.265626692666017e-08, "logits/chosen": -17.991785049438477, "logits/rejected": -18.604890823364258, "logps/chosen": -266.6833190917969, "logps/rejected": -316.60638427734375, "loss": 1.0906, "rewards/accuracies": 0.5, "rewards/chosen": 2.2042672634124756, "rewards/margins": 0.011788678355515003, "rewards/rejected": 2.192478656768799, "step": 53960 }, { "epoch": 2.5056873578160546, "grad_norm": 135.82435607910156, "learning_rate": 8.257888790875465e-08, "logits/chosen": -19.05805015563965, "logits/rejected": -18.05478286743164, "logps/chosen": -331.8569641113281, "logps/rejected": -215.08828735351562, "loss": 0.4294, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.057201862335205, "rewards/margins": 2.259704351425171, "rewards/rejected": 0.7974973917007446, "step": 53970 }, { "epoch": 2.5061516319234878, "grad_norm": 79.70831298828125, "learning_rate": 8.250150889084915e-08, "logits/chosen": -19.354488372802734, "logits/rejected": -18.444250106811523, "logps/chosen": -426.43511962890625, "logps/rejected": -325.32061767578125, "loss": 0.4333, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.8536205291748047, "rewards/margins": 1.2762290239334106, "rewards/rejected": 2.5773913860321045, "step": 53980 }, { "epoch": 2.5066159060309205, "grad_norm": 69.0022964477539, "learning_rate": 8.242412987294365e-08, "logits/chosen": -18.972797393798828, "logits/rejected": -18.851945877075195, "logps/chosen": -353.7364501953125, "logps/rejected": -317.151611328125, "loss": 1.1108, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.3085224628448486, "rewards/margins": 0.9609369039535522, "rewards/rejected": 2.3475852012634277, "step": 53990 }, { "epoch": 2.5070801801383538, "grad_norm": 91.55262756347656, "learning_rate": 8.234675085503815e-08, "logits/chosen": -19.481372833251953, "logits/rejected": -19.56943130493164, "logps/chosen": -349.7156677246094, "logps/rejected": -438.32354736328125, "loss": 1.0748, 
"rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.818302631378174, "rewards/margins": -0.1479829102754593, "rewards/rejected": 3.9662857055664062, "step": 54000 }, { "epoch": 2.507544454245787, "grad_norm": 80.12690734863281, "learning_rate": 8.226937183713264e-08, "logits/chosen": -18.866243362426758, "logits/rejected": -18.103923797607422, "logps/chosen": -318.6654052734375, "logps/rejected": -228.8127899169922, "loss": 0.3799, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.9263381958007812, "rewards/margins": 1.4587554931640625, "rewards/rejected": 0.46758270263671875, "step": 54010 }, { "epoch": 2.5080087283532198, "grad_norm": 207.01344299316406, "learning_rate": 8.219199281922713e-08, "logits/chosen": -20.056804656982422, "logits/rejected": -20.10563087463379, "logps/chosen": -420.8235778808594, "logps/rejected": -415.22235107421875, "loss": 1.1076, "rewards/accuracies": 0.5, "rewards/chosen": 4.370710372924805, "rewards/margins": 1.0216505527496338, "rewards/rejected": 3.34906005859375, "step": 54020 }, { "epoch": 2.5084730024606525, "grad_norm": 31.21950912475586, "learning_rate": 8.211461380132164e-08, "logits/chosen": -19.40744400024414, "logits/rejected": -19.100358963012695, "logps/chosen": -425.24017333984375, "logps/rejected": -334.8640441894531, "loss": 0.9515, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5148673057556152, "rewards/margins": 0.5253328680992126, "rewards/rejected": 2.989534616470337, "step": 54030 }, { "epoch": 2.5089372765680857, "grad_norm": 36.54635238647461, "learning_rate": 8.203723478341612e-08, "logits/chosen": -20.09905433654785, "logits/rejected": -19.072053909301758, "logps/chosen": -492.0765686035156, "logps/rejected": -414.63409423828125, "loss": 1.0185, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.588956356048584, "rewards/margins": 0.3795808255672455, "rewards/rejected": 3.2093753814697266, "step": 54040 }, { "epoch": 2.509401550675519, "grad_norm": 
2.063119649887085, "learning_rate": 8.195985576551062e-08, "logits/chosen": -19.603776931762695, "logits/rejected": -18.773860931396484, "logps/chosen": -386.67291259765625, "logps/rejected": -356.6725158691406, "loss": 0.497, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.6275811195373535, "rewards/margins": 1.446895956993103, "rewards/rejected": 2.1806843280792236, "step": 54050 }, { "epoch": 2.5098658247829517, "grad_norm": 52.09254455566406, "learning_rate": 8.188247674760512e-08, "logits/chosen": -18.860872268676758, "logits/rejected": -17.444795608520508, "logps/chosen": -451.4346618652344, "logps/rejected": -296.079345703125, "loss": 0.508, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.6958184242248535, "rewards/margins": 1.9370616674423218, "rewards/rejected": 1.7587568759918213, "step": 54060 }, { "epoch": 2.510330098890385, "grad_norm": 11.762587547302246, "learning_rate": 8.18050977296996e-08, "logits/chosen": -18.278791427612305, "logits/rejected": -18.218364715576172, "logps/chosen": -382.0989990234375, "logps/rejected": -412.425537109375, "loss": 1.1905, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.232388973236084, "rewards/margins": 0.5838390588760376, "rewards/rejected": 2.648550033569336, "step": 54070 }, { "epoch": 2.510794372997818, "grad_norm": 168.77906799316406, "learning_rate": 8.172771871179412e-08, "logits/chosen": -20.062328338623047, "logits/rejected": -19.23191261291504, "logps/chosen": -446.6214294433594, "logps/rejected": -427.77276611328125, "loss": 0.9069, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.938988208770752, "rewards/margins": 1.0627063512802124, "rewards/rejected": 2.876281976699829, "step": 54080 }, { "epoch": 2.511258647105251, "grad_norm": 161.59556579589844, "learning_rate": 8.16503396938886e-08, "logits/chosen": -18.852741241455078, "logits/rejected": -17.98486328125, "logps/chosen": -377.3684997558594, "logps/rejected": -308.49481201171875, "loss": 0.6235, 
"rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1626906394958496, "rewards/margins": 0.9856745600700378, "rewards/rejected": 2.177016019821167, "step": 54090 }, { "epoch": 2.5117229212126837, "grad_norm": 31.46587371826172, "learning_rate": 8.15729606759831e-08, "logits/chosen": -19.155658721923828, "logits/rejected": -17.840112686157227, "logps/chosen": -421.85394287109375, "logps/rejected": -294.80853271484375, "loss": 0.5417, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.5892410278320312, "rewards/margins": 2.1538429260253906, "rewards/rejected": 1.4353984594345093, "step": 54100 }, { "epoch": 2.512187195320117, "grad_norm": 0.8636305928230286, "learning_rate": 8.149558165807758e-08, "logits/chosen": -19.741981506347656, "logits/rejected": -18.578468322753906, "logps/chosen": -366.52117919921875, "logps/rejected": -258.0943298339844, "loss": 0.5552, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.823425769805908, "rewards/margins": 2.167595624923706, "rewards/rejected": 1.655829668045044, "step": 54110 }, { "epoch": 2.51265146942755, "grad_norm": 53.708091735839844, "learning_rate": 8.141820264017208e-08, "logits/chosen": -19.05162239074707, "logits/rejected": -18.849790573120117, "logps/chosen": -343.1431884765625, "logps/rejected": -336.2559814453125, "loss": 0.5893, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7202563285827637, "rewards/margins": 1.0266530513763428, "rewards/rejected": 2.693603515625, "step": 54120 }, { "epoch": 2.513115743534983, "grad_norm": 6.298488616943359, "learning_rate": 8.13408236222666e-08, "logits/chosen": -19.686338424682617, "logits/rejected": -19.091806411743164, "logps/chosen": -377.8185729980469, "logps/rejected": -357.6543884277344, "loss": 0.6994, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.8421683311462402, "rewards/margins": 1.095746397972107, "rewards/rejected": 1.7464218139648438, "step": 54130 }, { "epoch": 2.513580017642416, "grad_norm": 
173.65826416015625, "learning_rate": 8.126344460436108e-08, "logits/chosen": -19.018619537353516, "logits/rejected": -19.19554901123047, "logps/chosen": -377.5548400878906, "logps/rejected": -385.84375, "loss": 1.2255, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.2606453895568848, "rewards/margins": -0.4662984311580658, "rewards/rejected": 3.7269434928894043, "step": 54140 }, { "epoch": 2.5140442917498493, "grad_norm": 105.83858489990234, "learning_rate": 8.118606558645558e-08, "logits/chosen": -18.800718307495117, "logits/rejected": -17.788740158081055, "logps/chosen": -436.2928161621094, "logps/rejected": -365.50701904296875, "loss": 0.64, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.486415147781372, "rewards/margins": 1.4523704051971436, "rewards/rejected": 2.0340445041656494, "step": 54150 }, { "epoch": 2.514508565857282, "grad_norm": 49.73006820678711, "learning_rate": 8.110868656855006e-08, "logits/chosen": -18.963741302490234, "logits/rejected": -18.620887756347656, "logps/chosen": -296.19903564453125, "logps/rejected": -264.01422119140625, "loss": 1.216, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.7437493801116943, "rewards/margins": 0.16995787620544434, "rewards/rejected": 2.57379150390625, "step": 54160 }, { "epoch": 2.5149728399647153, "grad_norm": 2.7862908840179443, "learning_rate": 8.103130755064456e-08, "logits/chosen": -18.97774887084961, "logits/rejected": -18.32953453063965, "logps/chosen": -358.66632080078125, "logps/rejected": -312.9026794433594, "loss": 0.4519, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.1004233360290527, "rewards/margins": 1.3511466979980469, "rewards/rejected": 1.7492763996124268, "step": 54170 }, { "epoch": 2.515437114072148, "grad_norm": 0.3370543122291565, "learning_rate": 8.095392853273906e-08, "logits/chosen": -19.055213928222656, "logits/rejected": -17.48361587524414, "logps/chosen": -479.6539001464844, "logps/rejected": -300.00042724609375, "loss": 
0.2418, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.322644233703613, "rewards/margins": 2.8689615726470947, "rewards/rejected": 1.4536832571029663, "step": 54180 }, { "epoch": 2.5159013881795813, "grad_norm": 29.644445419311523, "learning_rate": 8.087654951483356e-08, "logits/chosen": -19.1003475189209, "logits/rejected": -17.687889099121094, "logps/chosen": -388.3245544433594, "logps/rejected": -239.51174926757812, "loss": 0.4481, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.044161796569824, "rewards/margins": 3.0111546516418457, "rewards/rejected": 1.0330065488815308, "step": 54190 }, { "epoch": 2.516365662287014, "grad_norm": 66.66234588623047, "learning_rate": 8.079917049692805e-08, "logits/chosen": -18.942989349365234, "logits/rejected": -17.87514877319336, "logps/chosen": -422.350830078125, "logps/rejected": -292.26141357421875, "loss": 1.3069, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6144776344299316, "rewards/margins": 1.2772722244262695, "rewards/rejected": 2.337205648422241, "step": 54200 }, { "epoch": 2.5168299363944473, "grad_norm": 2.163933038711548, "learning_rate": 8.072179147902254e-08, "logits/chosen": -18.586563110351562, "logits/rejected": -17.55841064453125, "logps/chosen": -326.2398986816406, "logps/rejected": -219.6607208251953, "loss": 0.3687, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.7070202827453613, "rewards/margins": 1.810340166091919, "rewards/rejected": 0.8966803550720215, "step": 54210 }, { "epoch": 2.5172942105018805, "grad_norm": 13.175304412841797, "learning_rate": 8.064441246111704e-08, "logits/chosen": -19.04960060119629, "logits/rejected": -18.470375061035156, "logps/chosen": -394.628173828125, "logps/rejected": -350.69403076171875, "loss": 0.7845, "rewards/accuracies": 0.5, "rewards/chosen": 2.42189359664917, "rewards/margins": 0.9412460327148438, "rewards/rejected": 1.4806476831436157, "step": 54220 }, { "epoch": 2.5177584846093133, "grad_norm": 
32.698333740234375, "learning_rate": 8.056703344321153e-08, "logits/chosen": -19.777111053466797, "logits/rejected": -19.009235382080078, "logps/chosen": -409.87384033203125, "logps/rejected": -303.2106628417969, "loss": 0.2884, "rewards/accuracies": 1.0, "rewards/chosen": 4.742629051208496, "rewards/margins": 2.6962809562683105, "rewards/rejected": 2.0463483333587646, "step": 54230 }, { "epoch": 2.5182227587167465, "grad_norm": 33.666358947753906, "learning_rate": 8.048965442530603e-08, "logits/chosen": -18.665782928466797, "logits/rejected": -18.227094650268555, "logps/chosen": -365.6739196777344, "logps/rejected": -309.3407287597656, "loss": 0.6539, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.713881254196167, "rewards/margins": 1.986039161682129, "rewards/rejected": 1.7278423309326172, "step": 54240 }, { "epoch": 2.5186870328241793, "grad_norm": 44.960243225097656, "learning_rate": 8.041227540740053e-08, "logits/chosen": -18.681337356567383, "logits/rejected": -19.085590362548828, "logps/chosen": -277.6570739746094, "logps/rejected": -332.35064697265625, "loss": 1.1344, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.4656474590301514, "rewards/margins": -0.3155592978000641, "rewards/rejected": 1.781206727027893, "step": 54250 }, { "epoch": 2.5191513069316125, "grad_norm": 167.43017578125, "learning_rate": 8.033489638949502e-08, "logits/chosen": -19.81979751586914, "logits/rejected": -18.683399200439453, "logps/chosen": -425.18560791015625, "logps/rejected": -290.9864807128906, "loss": 0.5781, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.890190124511719, "rewards/margins": 1.5756995677947998, "rewards/rejected": 3.314490795135498, "step": 54260 }, { "epoch": 2.5196155810390453, "grad_norm": 9.18310260772705, "learning_rate": 8.025751737158951e-08, "logits/chosen": -18.057661056518555, "logits/rejected": -16.954439163208008, "logps/chosen": -381.00616455078125, "logps/rejected": -230.34323120117188, "loss": 0.2246, 
"rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.9733705520629883, "rewards/margins": 2.3967413902282715, "rewards/rejected": 0.5766294598579407, "step": 54270 }, { "epoch": 2.5200798551464785, "grad_norm": 228.67994689941406, "learning_rate": 8.018013835368401e-08, "logits/chosen": -19.71365737915039, "logits/rejected": -18.790512084960938, "logps/chosen": -450.316162109375, "logps/rejected": -331.5166320800781, "loss": 0.8969, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.761637210845947, "rewards/margins": 1.6075689792633057, "rewards/rejected": 3.1540684700012207, "step": 54280 }, { "epoch": 2.5205441292539117, "grad_norm": 0.35657358169555664, "learning_rate": 8.010275933577851e-08, "logits/chosen": -19.206527709960938, "logits/rejected": -19.226530075073242, "logps/chosen": -442.33465576171875, "logps/rejected": -376.23431396484375, "loss": 0.6561, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.557469844818115, "rewards/margins": 1.6741039752960205, "rewards/rejected": 2.8833658695220947, "step": 54290 }, { "epoch": 2.5210084033613445, "grad_norm": 55.73152542114258, "learning_rate": 8.0025380317873e-08, "logits/chosen": -18.16449737548828, "logits/rejected": -16.676097869873047, "logps/chosen": -347.73870849609375, "logps/rejected": -205.64517211914062, "loss": 0.4797, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.753887891769409, "rewards/margins": 1.8448641300201416, "rewards/rejected": 0.9090237617492676, "step": 54300 }, { "epoch": 2.5214726774687777, "grad_norm": 9.537267684936523, "learning_rate": 7.994800129996749e-08, "logits/chosen": -18.502471923828125, "logits/rejected": -18.106201171875, "logps/chosen": -513.0084838867188, "logps/rejected": -483.604736328125, "loss": 0.8961, "rewards/accuracies": 0.5, "rewards/chosen": 4.4040327072143555, "rewards/margins": 0.2791176438331604, "rewards/rejected": 4.12491512298584, "step": 54310 }, { "epoch": 2.5219369515762105, "grad_norm": 
72.59895324707031, "learning_rate": 7.987062228206199e-08, "logits/chosen": -19.651578903198242, "logits/rejected": -18.283252716064453, "logps/chosen": -320.10308837890625, "logps/rejected": -254.7614288330078, "loss": 0.4069, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.703157663345337, "rewards/margins": 1.7170593738555908, "rewards/rejected": 0.9860982894897461, "step": 54320 }, { "epoch": 2.5224012256836437, "grad_norm": 0.5612923502922058, "learning_rate": 7.979324326415649e-08, "logits/chosen": -19.679616928100586, "logits/rejected": -18.577342987060547, "logps/chosen": -370.90447998046875, "logps/rejected": -297.22943115234375, "loss": 0.8776, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.5298213958740234, "rewards/margins": 1.3946729898452759, "rewards/rejected": 2.135148286819458, "step": 54330 }, { "epoch": 2.5228654997910764, "grad_norm": 2.6250154972076416, "learning_rate": 7.971586424625099e-08, "logits/chosen": -19.47846221923828, "logits/rejected": -19.36996841430664, "logps/chosen": -400.43975830078125, "logps/rejected": -355.7469177246094, "loss": 0.6412, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.6613011360168457, "rewards/margins": 1.056254267692566, "rewards/rejected": 2.6050467491149902, "step": 54340 }, { "epoch": 2.5233297738985097, "grad_norm": 47.05551528930664, "learning_rate": 7.963848522834547e-08, "logits/chosen": -17.904905319213867, "logits/rejected": -17.112369537353516, "logps/chosen": -393.7509460449219, "logps/rejected": -270.13580322265625, "loss": 0.7035, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1208672523498535, "rewards/margins": 1.67226243019104, "rewards/rejected": 1.4486048221588135, "step": 54350 }, { "epoch": 2.523794048005943, "grad_norm": 0.6337706446647644, "learning_rate": 7.956110621043997e-08, "logits/chosen": -18.98416519165039, "logits/rejected": -17.848953247070312, "logps/chosen": -387.62017822265625, "logps/rejected": -317.70880126953125, 
"loss": 0.2924, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.132012367248535, "rewards/margins": 2.067434787750244, "rewards/rejected": 2.064577579498291, "step": 54360 }, { "epoch": 2.5242583221133756, "grad_norm": 120.34144592285156, "learning_rate": 7.948372719253448e-08, "logits/chosen": -19.326021194458008, "logits/rejected": -18.941545486450195, "logps/chosen": -425.7681579589844, "logps/rejected": -391.7333984375, "loss": 0.9937, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.116074562072754, "rewards/margins": 0.33585476875305176, "rewards/rejected": 3.780219554901123, "step": 54370 }, { "epoch": 2.524722596220809, "grad_norm": 109.67433166503906, "learning_rate": 7.940634817462897e-08, "logits/chosen": -19.952960968017578, "logits/rejected": -19.451263427734375, "logps/chosen": -309.892578125, "logps/rejected": -303.9499206542969, "loss": 1.1906, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.5675394535064697, "rewards/margins": -0.2854211926460266, "rewards/rejected": 2.8529605865478516, "step": 54380 }, { "epoch": 2.5251868703282416, "grad_norm": 23.93048667907715, "learning_rate": 7.932896915672346e-08, "logits/chosen": -19.698001861572266, "logits/rejected": -18.958696365356445, "logps/chosen": -401.606201171875, "logps/rejected": -364.7103576660156, "loss": 0.8129, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4744582176208496, "rewards/margins": 1.0145456790924072, "rewards/rejected": 2.4599127769470215, "step": 54390 }, { "epoch": 2.525651144435675, "grad_norm": 57.18449020385742, "learning_rate": 7.925159013881795e-08, "logits/chosen": -20.10976219177246, "logits/rejected": -19.809539794921875, "logps/chosen": -388.01922607421875, "logps/rejected": -356.24798583984375, "loss": 0.6203, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.3196167945861816, "rewards/margins": 0.6650981903076172, "rewards/rejected": 2.6545186042785645, "step": 54400 }, { "epoch": 2.5261154185431076, 
"grad_norm": 36.93132019042969, "learning_rate": 7.917421112091245e-08, "logits/chosen": -19.61636734008789, "logits/rejected": -18.883283615112305, "logps/chosen": -414.91650390625, "logps/rejected": -369.68829345703125, "loss": 0.723, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.993360757827759, "rewards/margins": 0.5626777410507202, "rewards/rejected": 2.43068265914917, "step": 54410 }, { "epoch": 2.526579692650541, "grad_norm": 101.14733123779297, "learning_rate": 7.909683210300695e-08, "logits/chosen": -18.3664608001709, "logits/rejected": -17.95293617248535, "logps/chosen": -435.30462646484375, "logps/rejected": -492.2730407714844, "loss": 0.8376, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2522709369659424, "rewards/margins": 0.14634335041046143, "rewards/rejected": 3.1059274673461914, "step": 54420 }, { "epoch": 2.527043966757974, "grad_norm": 19.978046417236328, "learning_rate": 7.901945308510144e-08, "logits/chosen": -19.569028854370117, "logits/rejected": -18.719274520874023, "logps/chosen": -447.0675354003906, "logps/rejected": -335.33447265625, "loss": 0.615, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.8540310859680176, "rewards/margins": 1.5312130451202393, "rewards/rejected": 2.322817802429199, "step": 54430 }, { "epoch": 2.527508240865407, "grad_norm": 75.9168930053711, "learning_rate": 7.894207406719594e-08, "logits/chosen": -19.944835662841797, "logits/rejected": -18.659381866455078, "logps/chosen": -359.88372802734375, "logps/rejected": -266.0103454589844, "loss": 0.1743, "rewards/accuracies": 1.0, "rewards/chosen": 3.668170928955078, "rewards/margins": 2.371164560317993, "rewards/rejected": 1.2970062494277954, "step": 54440 }, { "epoch": 2.52797251497284, "grad_norm": 40.94920349121094, "learning_rate": 7.886469504929043e-08, "logits/chosen": -17.96775245666504, "logits/rejected": -17.069961547851562, "logps/chosen": -393.59417724609375, "logps/rejected": -339.4466552734375, "loss": 0.2823, 
"rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.3988895416259766, "rewards/margins": 2.0736660957336426, "rewards/rejected": 1.3252230882644653, "step": 54450 }, { "epoch": 2.5284367890802733, "grad_norm": 27.15101432800293, "learning_rate": 7.878731603138492e-08, "logits/chosen": -18.678836822509766, "logits/rejected": -17.912769317626953, "logps/chosen": -353.36407470703125, "logps/rejected": -268.2435302734375, "loss": 1.15, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.098850965499878, "rewards/margins": 0.9530082941055298, "rewards/rejected": 2.1458425521850586, "step": 54460 }, { "epoch": 2.528901063187706, "grad_norm": 71.86665344238281, "learning_rate": 7.870993701347942e-08, "logits/chosen": -18.858386993408203, "logits/rejected": -18.46622085571289, "logps/chosen": -386.9933166503906, "logps/rejected": -322.1316833496094, "loss": 0.9077, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.538438320159912, "rewards/margins": 1.3281240463256836, "rewards/rejected": 2.2103142738342285, "step": 54470 }, { "epoch": 2.529365337295139, "grad_norm": 37.26945495605469, "learning_rate": 7.863255799557392e-08, "logits/chosen": -19.351057052612305, "logits/rejected": -19.25033187866211, "logps/chosen": -398.0727233886719, "logps/rejected": -360.728759765625, "loss": 0.8802, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.041454792022705, "rewards/margins": -0.0023897290229797363, "rewards/rejected": 3.04384446144104, "step": 54480 }, { "epoch": 2.529829611402572, "grad_norm": 240.3038330078125, "learning_rate": 7.855517897766842e-08, "logits/chosen": -19.3857479095459, "logits/rejected": -18.368188858032227, "logps/chosen": -397.3552551269531, "logps/rejected": -320.508544921875, "loss": 0.5314, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.305813789367676, "rewards/margins": 2.4607465267181396, "rewards/rejected": 1.8450676202774048, "step": 54490 }, { "epoch": 2.5302938855100052, "grad_norm": 
3.4418137073516846, "learning_rate": 7.84777999597629e-08, "logits/chosen": -18.729806900024414, "logits/rejected": -17.52313995361328, "logps/chosen": -345.21575927734375, "logps/rejected": -260.86376953125, "loss": 0.4738, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.5761876106262207, "rewards/margins": 2.102412700653076, "rewards/rejected": 0.47377490997314453, "step": 54500 }, { "epoch": 2.530758159617438, "grad_norm": 27.65077781677246, "learning_rate": 7.84004209418574e-08, "logits/chosen": -19.30941390991211, "logits/rejected": -17.392988204956055, "logps/chosen": -488.58837890625, "logps/rejected": -301.39996337890625, "loss": 0.3933, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.712573289871216, "rewards/margins": 2.4240875244140625, "rewards/rejected": 1.2884857654571533, "step": 54510 }, { "epoch": 2.531222433724871, "grad_norm": 226.8343048095703, "learning_rate": 7.83230419239519e-08, "logits/chosen": -19.013378143310547, "logits/rejected": -18.820436477661133, "logps/chosen": -287.6404113769531, "logps/rejected": -227.4700164794922, "loss": 1.1857, "rewards/accuracies": 0.5, "rewards/chosen": 1.8931041955947876, "rewards/margins": 0.1945093423128128, "rewards/rejected": 1.6985950469970703, "step": 54520 }, { "epoch": 2.5316867078323044, "grad_norm": 5.201613426208496, "learning_rate": 7.82456629060464e-08, "logits/chosen": -18.782121658325195, "logits/rejected": -17.001964569091797, "logps/chosen": -408.5199279785156, "logps/rejected": -251.9858856201172, "loss": 0.1866, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.737311601638794, "rewards/margins": 2.575961112976074, "rewards/rejected": 1.1613502502441406, "step": 54530 }, { "epoch": 2.532150981939737, "grad_norm": 25.69222068786621, "learning_rate": 7.816828388814088e-08, "logits/chosen": -19.58173370361328, "logits/rejected": -19.332849502563477, "logps/chosen": -450.7832946777344, "logps/rejected": -428.2777404785156, "loss": 0.5057, 
"rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.401752471923828, "rewards/margins": 0.8338495492935181, "rewards/rejected": 3.5679030418395996, "step": 54540 }, { "epoch": 2.5326152560471704, "grad_norm": 23.492958068847656, "learning_rate": 7.809090487023538e-08, "logits/chosen": -19.331026077270508, "logits/rejected": -18.86983871459961, "logps/chosen": -460.708251953125, "logps/rejected": -381.48089599609375, "loss": 0.4157, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.9821362495422363, "rewards/margins": 1.5071995258331299, "rewards/rejected": 2.4749362468719482, "step": 54550 }, { "epoch": 2.533079530154603, "grad_norm": 2.6263606548309326, "learning_rate": 7.801352585232988e-08, "logits/chosen": -19.02248764038086, "logits/rejected": -17.830942153930664, "logps/chosen": -241.06179809570312, "logps/rejected": -168.26162719726562, "loss": 0.2782, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.5594079494476318, "rewards/margins": 1.8528001308441162, "rewards/rejected": -0.29339221119880676, "step": 54560 }, { "epoch": 2.5335438042620364, "grad_norm": 1.2425763607025146, "learning_rate": 7.794388473621493e-08, "logits/chosen": -19.780536651611328, "logits/rejected": -18.930675506591797, "logps/chosen": -360.6665954589844, "logps/rejected": -287.3761901855469, "loss": 0.9146, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.4984335899353027, "rewards/margins": 0.3789198100566864, "rewards/rejected": 2.119513511657715, "step": 54570 }, { "epoch": 2.534008078369469, "grad_norm": 122.36275482177734, "learning_rate": 7.786650571830943e-08, "logits/chosen": -19.94815444946289, "logits/rejected": -19.665325164794922, "logps/chosen": -420.07550048828125, "logps/rejected": -339.2468566894531, "loss": 1.1189, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.3212482929229736, "rewards/margins": 0.49619922041893005, "rewards/rejected": 2.8250489234924316, "step": 54580 }, { "epoch": 2.5344723524769024, 
"grad_norm": 108.9420394897461, "learning_rate": 7.778912670040391e-08, "logits/chosen": -19.03604507446289, "logits/rejected": -18.630908966064453, "logps/chosen": -358.7483825683594, "logps/rejected": -275.1123046875, "loss": 0.9258, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.8368709087371826, "rewards/margins": 0.5240747928619385, "rewards/rejected": 2.312796115875244, "step": 54590 }, { "epoch": 2.5349366265843356, "grad_norm": 25.743898391723633, "learning_rate": 7.771174768249841e-08, "logits/chosen": -19.36260986328125, "logits/rejected": -18.41651725769043, "logps/chosen": -502.26336669921875, "logps/rejected": -409.42388916015625, "loss": 0.4568, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.019888401031494, "rewards/margins": 1.87612783908844, "rewards/rejected": 3.1437604427337646, "step": 54600 }, { "epoch": 2.5354009006917684, "grad_norm": 26.535234451293945, "learning_rate": 7.763436866459291e-08, "logits/chosen": -19.063669204711914, "logits/rejected": -18.286937713623047, "logps/chosen": -408.60333251953125, "logps/rejected": -329.6680603027344, "loss": 0.4629, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.3496158123016357, "rewards/margins": 0.989698052406311, "rewards/rejected": 2.3599178791046143, "step": 54610 }, { "epoch": 2.5358651747992016, "grad_norm": 56.500083923339844, "learning_rate": 7.75569896466874e-08, "logits/chosen": -19.408641815185547, "logits/rejected": -17.916868209838867, "logps/chosen": -417.8001403808594, "logps/rejected": -225.3544921875, "loss": 0.2221, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.305515766143799, "rewards/margins": 3.5129997730255127, "rewards/rejected": 0.7925159931182861, "step": 54620 }, { "epoch": 2.5363294489066344, "grad_norm": 64.67574310302734, "learning_rate": 7.74796106287819e-08, "logits/chosen": -18.59302520751953, "logits/rejected": -18.180133819580078, "logps/chosen": -263.2541809082031, "logps/rejected": -217.88369750976562, 
"loss": 0.9395, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.4141640663146973, "rewards/margins": 0.7054482698440552, "rewards/rejected": 1.7087156772613525, "step": 54630 }, { "epoch": 2.5367937230140676, "grad_norm": 7.201085090637207, "learning_rate": 7.740223161087639e-08, "logits/chosen": -19.349470138549805, "logits/rejected": -17.59544563293457, "logps/chosen": -396.289794921875, "logps/rejected": -333.31982421875, "loss": 0.4571, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.5438642501831055, "rewards/margins": 2.712425708770752, "rewards/rejected": 1.8314390182495117, "step": 54640 }, { "epoch": 2.5372579971215004, "grad_norm": 55.68620300292969, "learning_rate": 7.732485259297089e-08, "logits/chosen": -18.636550903320312, "logits/rejected": -18.292375564575195, "logps/chosen": -391.9720764160156, "logps/rejected": -365.2564697265625, "loss": 1.2227, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.2444496154785156, "rewards/margins": 0.7130132913589478, "rewards/rejected": 2.531435966491699, "step": 54650 }, { "epoch": 2.5377222712289336, "grad_norm": 78.3026351928711, "learning_rate": 7.724747357506538e-08, "logits/chosen": -19.305362701416016, "logits/rejected": -19.332033157348633, "logps/chosen": -311.71630859375, "logps/rejected": -328.4394836425781, "loss": 0.707, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.9343057870864868, "rewards/margins": 0.45602256059646606, "rewards/rejected": 1.4782830476760864, "step": 54660 }, { "epoch": 2.538186545336367, "grad_norm": 142.90769958496094, "learning_rate": 7.717009455715988e-08, "logits/chosen": -18.228534698486328, "logits/rejected": -18.320514678955078, "logps/chosen": -415.74725341796875, "logps/rejected": -392.6052551269531, "loss": 1.0016, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.932635545730591, "rewards/margins": 0.21674585342407227, "rewards/rejected": 2.7158894538879395, "step": 54670 }, { "epoch": 
2.5386508194437996, "grad_norm": 172.23513793945312, "learning_rate": 7.709271553925437e-08, "logits/chosen": -18.62196922302246, "logits/rejected": -18.050214767456055, "logps/chosen": -424.8802185058594, "logps/rejected": -305.6571960449219, "loss": 0.8496, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.9491188526153564, "rewards/margins": 0.9916505813598633, "rewards/rejected": 1.9574683904647827, "step": 54680 }, { "epoch": 2.5391150935512328, "grad_norm": 41.81306838989258, "learning_rate": 7.701533652134886e-08, "logits/chosen": -18.722938537597656, "logits/rejected": -18.090469360351562, "logps/chosen": -408.3788146972656, "logps/rejected": -294.0762939453125, "loss": 0.4046, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.571812868118286, "rewards/margins": 1.6008532047271729, "rewards/rejected": 1.9709593057632446, "step": 54690 }, { "epoch": 2.5395793676586655, "grad_norm": 16.159915924072266, "learning_rate": 7.693795750344336e-08, "logits/chosen": -19.546916961669922, "logits/rejected": -18.513050079345703, "logps/chosen": -376.95196533203125, "logps/rejected": -309.72320556640625, "loss": 0.2635, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.239063262939453, "rewards/margins": 1.8889023065567017, "rewards/rejected": 1.350160837173462, "step": 54700 }, { "epoch": 2.5400436417660988, "grad_norm": 37.03419876098633, "learning_rate": 7.686057848553786e-08, "logits/chosen": -19.074382781982422, "logits/rejected": -18.45604133605957, "logps/chosen": -373.63970947265625, "logps/rejected": -268.0008850097656, "loss": 0.39, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.9233620166778564, "rewards/margins": 1.9238182306289673, "rewards/rejected": 1.9995439052581787, "step": 54710 }, { "epoch": 2.5405079158735315, "grad_norm": 21.502517700195312, "learning_rate": 7.678319946763236e-08, "logits/chosen": -20.334983825683594, "logits/rejected": -18.63649559020996, "logps/chosen": -432.173583984375, 
"logps/rejected": -277.63287353515625, "loss": 0.3186, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.374199390411377, "rewards/margins": 3.2970588207244873, "rewards/rejected": 1.0771403312683105, "step": 54720 }, { "epoch": 2.5409721899809647, "grad_norm": 3.007941484451294, "learning_rate": 7.670582044972684e-08, "logits/chosen": -19.357440948486328, "logits/rejected": -19.63397789001465, "logps/chosen": -339.6900634765625, "logps/rejected": -395.75048828125, "loss": 0.7344, "rewards/accuracies": 0.5, "rewards/chosen": 4.176013469696045, "rewards/margins": 0.9557368159294128, "rewards/rejected": 3.2202773094177246, "step": 54730 }, { "epoch": 2.541436464088398, "grad_norm": 7.8142218589782715, "learning_rate": 7.662844143182134e-08, "logits/chosen": -18.846147537231445, "logits/rejected": -17.756824493408203, "logps/chosen": -399.220703125, "logps/rejected": -230.0708465576172, "loss": 0.615, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5197348594665527, "rewards/margins": 2.5211594104766846, "rewards/rejected": 0.9985752105712891, "step": 54740 }, { "epoch": 2.5419007381958307, "grad_norm": 121.39570617675781, "learning_rate": 7.655106241391584e-08, "logits/chosen": -18.9931640625, "logits/rejected": -17.894052505493164, "logps/chosen": -531.2593994140625, "logps/rejected": -378.74212646484375, "loss": 0.4936, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.463115215301514, "rewards/margins": 2.0272459983825684, "rewards/rejected": 2.4358692169189453, "step": 54750 }, { "epoch": 2.542365012303264, "grad_norm": 7.934357643127441, "learning_rate": 7.647368339601034e-08, "logits/chosen": -19.00531768798828, "logits/rejected": -17.795631408691406, "logps/chosen": -503.59075927734375, "logps/rejected": -376.1075134277344, "loss": 0.2783, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.550715446472168, "rewards/margins": 2.2875447273254395, "rewards/rejected": 2.263171434402466, "step": 54760 }, { "epoch": 
2.5428292864106967, "grad_norm": 18.544986724853516, "learning_rate": 7.639630437810484e-08, "logits/chosen": -19.48401641845703, "logits/rejected": -18.97789764404297, "logps/chosen": -394.2744140625, "logps/rejected": -426.6456604003906, "loss": 1.0411, "rewards/accuracies": 0.5, "rewards/chosen": 3.6517562866210938, "rewards/margins": 0.0220812801271677, "rewards/rejected": 3.6296753883361816, "step": 54770 }, { "epoch": 2.54329356051813, "grad_norm": 136.6360321044922, "learning_rate": 7.631892536019932e-08, "logits/chosen": -19.078899383544922, "logits/rejected": -18.852128982543945, "logps/chosen": -351.4009094238281, "logps/rejected": -373.1954650878906, "loss": 1.1886, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.710731267929077, "rewards/margins": -0.18903227150440216, "rewards/rejected": 2.899763584136963, "step": 54780 }, { "epoch": 2.5437578346255627, "grad_norm": 42.64887237548828, "learning_rate": 7.624154634229382e-08, "logits/chosen": -19.570804595947266, "logits/rejected": -18.878999710083008, "logps/chosen": -495.14923095703125, "logps/rejected": -421.2774353027344, "loss": 0.5643, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.334043025970459, "rewards/margins": 0.6221681237220764, "rewards/rejected": 3.7118752002716064, "step": 54790 }, { "epoch": 2.544222108732996, "grad_norm": 36.3234748840332, "learning_rate": 7.61641673243883e-08, "logits/chosen": -18.380455017089844, "logits/rejected": -18.386188507080078, "logps/chosen": -312.6361389160156, "logps/rejected": -326.32513427734375, "loss": 1.1843, "rewards/accuracies": 0.5, "rewards/chosen": 3.373155117034912, "rewards/margins": 0.6165400743484497, "rewards/rejected": 2.7566158771514893, "step": 54800 }, { "epoch": 2.544686382840429, "grad_norm": 84.9775161743164, "learning_rate": 7.608678830648281e-08, "logits/chosen": -19.010448455810547, "logits/rejected": -18.13347816467285, "logps/chosen": -363.680419921875, "logps/rejected": -298.789306640625, "loss": 
0.4256, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.3196475505828857, "rewards/margins": 1.2566521167755127, "rewards/rejected": 2.062995433807373, "step": 54810 }, { "epoch": 2.545150656947862, "grad_norm": 47.10289001464844, "learning_rate": 7.600940928857731e-08, "logits/chosen": -18.998554229736328, "logits/rejected": -18.124805450439453, "logps/chosen": -390.28387451171875, "logps/rejected": -281.4339294433594, "loss": 0.324, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.861804246902466, "rewards/margins": 1.967098593711853, "rewards/rejected": 1.8947054147720337, "step": 54820 }, { "epoch": 2.545614931055295, "grad_norm": 8.907926559448242, "learning_rate": 7.59320302706718e-08, "logits/chosen": -18.728330612182617, "logits/rejected": -18.051593780517578, "logps/chosen": -251.69979858398438, "logps/rejected": -220.1450653076172, "loss": 0.7151, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.5383150577545166, "rewards/margins": 1.2061327695846558, "rewards/rejected": 1.3321824073791504, "step": 54830 }, { "epoch": 2.5460792051627283, "grad_norm": 31.475051879882812, "learning_rate": 7.58546512527663e-08, "logits/chosen": -19.405702590942383, "logits/rejected": -18.508472442626953, "logps/chosen": -414.36187744140625, "logps/rejected": -351.1705322265625, "loss": 0.5438, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.181290149688721, "rewards/margins": 1.637451410293579, "rewards/rejected": 2.5438380241394043, "step": 54840 }, { "epoch": 2.546543479270161, "grad_norm": 71.24211883544922, "learning_rate": 7.577727223486078e-08, "logits/chosen": -19.29884910583496, "logits/rejected": -17.775463104248047, "logps/chosen": -402.1449890136719, "logps/rejected": -275.9080810546875, "loss": 0.3499, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.149206161499023, "rewards/margins": 2.3629310131073, "rewards/rejected": 1.7862751483917236, "step": 54850 }, { "epoch": 2.547007753377594, "grad_norm": 
37.739990234375, "learning_rate": 7.569989321695529e-08, "logits/chosen": -18.871383666992188, "logits/rejected": -17.945554733276367, "logps/chosen": -470.6590881347656, "logps/rejected": -423.9617614746094, "loss": 0.5652, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.925959825515747, "rewards/margins": 0.7270843982696533, "rewards/rejected": 3.1988751888275146, "step": 54860 }, { "epoch": 2.547472027485027, "grad_norm": 1.8113130331039429, "learning_rate": 7.562251419904979e-08, "logits/chosen": -18.87325668334961, "logits/rejected": -17.952899932861328, "logps/chosen": -342.6473083496094, "logps/rejected": -249.8728790283203, "loss": 0.3207, "rewards/accuracies": 1.0, "rewards/chosen": 3.326615571975708, "rewards/margins": 1.975762963294983, "rewards/rejected": 1.350852608680725, "step": 54870 }, { "epoch": 2.5479363015924603, "grad_norm": 0.11895716935396194, "learning_rate": 7.554513518114427e-08, "logits/chosen": -18.679851531982422, "logits/rejected": -17.130935668945312, "logps/chosen": -397.5906677246094, "logps/rejected": -225.44485473632812, "loss": 0.3233, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.8578429222106934, "rewards/margins": 2.124831438064575, "rewards/rejected": 0.7330114245414734, "step": 54880 }, { "epoch": 2.548400575699893, "grad_norm": 84.10002899169922, "learning_rate": 7.546775616323877e-08, "logits/chosen": -18.3309268951416, "logits/rejected": -17.695409774780273, "logps/chosen": -432.4037170410156, "logps/rejected": -304.5199279785156, "loss": 0.8163, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7436604499816895, "rewards/margins": 1.432574987411499, "rewards/rejected": 2.3110859394073486, "step": 54890 }, { "epoch": 2.5488648498073263, "grad_norm": 117.62744903564453, "learning_rate": 7.539037714533326e-08, "logits/chosen": -18.86503028869629, "logits/rejected": -17.530704498291016, "logps/chosen": -468.708251953125, "logps/rejected": -347.153076171875, "loss": 0.3188, 
"rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.7177860736846924, "rewards/margins": 2.0770390033721924, "rewards/rejected": 1.6407474279403687, "step": 54900 }, { "epoch": 2.5493291239147595, "grad_norm": 88.41354370117188, "learning_rate": 7.531299812742777e-08, "logits/chosen": -19.26659393310547, "logits/rejected": -18.650482177734375, "logps/chosen": -465.5403747558594, "logps/rejected": -370.12884521484375, "loss": 0.5341, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.8928394317626953, "rewards/margins": 1.3646715879440308, "rewards/rejected": 2.528167963027954, "step": 54910 }, { "epoch": 2.5497933980221923, "grad_norm": 26.55124855041504, "learning_rate": 7.523561910952225e-08, "logits/chosen": -19.42980194091797, "logits/rejected": -19.163738250732422, "logps/chosen": -353.80621337890625, "logps/rejected": -362.92523193359375, "loss": 0.779, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.884603500366211, "rewards/margins": 0.665570855140686, "rewards/rejected": 2.2190327644348145, "step": 54920 }, { "epoch": 2.550257672129625, "grad_norm": 0.991335928440094, "learning_rate": 7.515824009161675e-08, "logits/chosen": -19.93954849243164, "logits/rejected": -18.385000228881836, "logps/chosen": -417.3793029785156, "logps/rejected": -373.71356201171875, "loss": 0.4011, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.3026628494262695, "rewards/margins": 1.9880577325820923, "rewards/rejected": 2.314605236053467, "step": 54930 }, { "epoch": 2.5507219462370583, "grad_norm": 100.70641326904297, "learning_rate": 7.508859897550179e-08, "logits/chosen": -20.02811050415039, "logits/rejected": -18.617368698120117, "logps/chosen": -421.03045654296875, "logps/rejected": -376.9678039550781, "loss": 1.5819, "rewards/accuracies": 0.5, "rewards/chosen": 2.924423933029175, "rewards/margins": -0.1305297166109085, "rewards/rejected": 3.0549538135528564, "step": 54940 }, { "epoch": 2.5511862203444915, "grad_norm": 
61.529212951660156, "learning_rate": 7.50112199575963e-08, "logits/chosen": -18.643320083618164, "logits/rejected": -18.710895538330078, "logps/chosen": -454.22454833984375, "logps/rejected": -362.28997802734375, "loss": 0.5171, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4895691871643066, "rewards/margins": 1.4843015670776367, "rewards/rejected": 2.005267858505249, "step": 54950 }, { "epoch": 2.5516504944519243, "grad_norm": 59.41868209838867, "learning_rate": 7.49338409396908e-08, "logits/chosen": -19.736228942871094, "logits/rejected": -19.542360305786133, "logps/chosen": -276.5712585449219, "logps/rejected": -238.8253173828125, "loss": 0.4474, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.322568893432617, "rewards/margins": 1.2167223691940308, "rewards/rejected": 1.105846643447876, "step": 54960 }, { "epoch": 2.5521147685593575, "grad_norm": 15.29263687133789, "learning_rate": 7.485646192178528e-08, "logits/chosen": -20.497894287109375, "logits/rejected": -19.61888885498047, "logps/chosen": -391.1288146972656, "logps/rejected": -337.3211669921875, "loss": 1.3496, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.6774697303771973, "rewards/margins": 0.3476361632347107, "rewards/rejected": 3.3298332691192627, "step": 54970 }, { "epoch": 2.5525790426667907, "grad_norm": 32.11476516723633, "learning_rate": 7.477908290387978e-08, "logits/chosen": -18.526275634765625, "logits/rejected": -17.77145767211914, "logps/chosen": -384.65447998046875, "logps/rejected": -289.6183776855469, "loss": 0.556, "rewards/accuracies": 0.5, "rewards/chosen": 3.080334186553955, "rewards/margins": 0.876447319984436, "rewards/rejected": 2.2038872241973877, "step": 54980 }, { "epoch": 2.5530433167742235, "grad_norm": 62.89265441894531, "learning_rate": 7.470170388597427e-08, "logits/chosen": -19.171327590942383, "logits/rejected": -19.148334503173828, "logps/chosen": -311.4287109375, "logps/rejected": -240.4308624267578, "loss": 1.1165, 
"rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.6250007152557373, "rewards/margins": 0.42329511046409607, "rewards/rejected": 2.2017054557800293, "step": 54990 }, { "epoch": 2.5535075908816567, "grad_norm": 89.96465301513672, "learning_rate": 7.462432486806878e-08, "logits/chosen": -20.640134811401367, "logits/rejected": -19.215375900268555, "logps/chosen": -425.8805236816406, "logps/rejected": -336.12957763671875, "loss": 0.3436, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.196527004241943, "rewards/margins": 1.8168790340423584, "rewards/rejected": 2.379647731781006, "step": 55000 }, { "epoch": 2.5539718649890895, "grad_norm": 100.44786071777344, "learning_rate": 7.454694585016327e-08, "logits/chosen": -19.845661163330078, "logits/rejected": -19.449235916137695, "logps/chosen": -385.5675354003906, "logps/rejected": -339.3880920410156, "loss": 0.8112, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.153028726577759, "rewards/margins": 0.9540963172912598, "rewards/rejected": 2.19893217086792, "step": 55010 }, { "epoch": 2.5544361390965227, "grad_norm": 23.636564254760742, "learning_rate": 7.446956683225776e-08, "logits/chosen": -18.93398666381836, "logits/rejected": -18.61253547668457, "logps/chosen": -426.3758239746094, "logps/rejected": -419.6199645996094, "loss": 1.3753, "rewards/accuracies": 0.5, "rewards/chosen": 3.0440869331359863, "rewards/margins": 0.08348403126001358, "rewards/rejected": 2.9606027603149414, "step": 55020 }, { "epoch": 2.5549004132039554, "grad_norm": 1.3646732568740845, "learning_rate": 7.439218781435226e-08, "logits/chosen": -19.17134666442871, "logits/rejected": -19.226125717163086, "logps/chosen": -409.08074951171875, "logps/rejected": -431.58270263671875, "loss": 1.6799, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.9635462760925293, "rewards/margins": -0.3767324388027191, "rewards/rejected": 3.3402791023254395, "step": 55030 }, { "epoch": 2.5553646873113887, "grad_norm": 
56.0216178894043, "learning_rate": 7.431480879644674e-08, "logits/chosen": -18.783262252807617, "logits/rejected": -18.919294357299805, "logps/chosen": -388.10369873046875, "logps/rejected": -389.7932434082031, "loss": 0.8043, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.134265184402466, "rewards/margins": 0.7654391527175903, "rewards/rejected": 2.368825912475586, "step": 55040 }, { "epoch": 2.555828961418822, "grad_norm": 83.88658905029297, "learning_rate": 7.423742977854125e-08, "logits/chosen": -18.76030921936035, "logits/rejected": -18.81698989868164, "logps/chosen": -264.29315185546875, "logps/rejected": -216.37258911132812, "loss": 0.7488, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.917417287826538, "rewards/margins": 0.8299823999404907, "rewards/rejected": 1.087435007095337, "step": 55050 }, { "epoch": 2.5562932355262546, "grad_norm": 148.6691131591797, "learning_rate": 7.416005076063574e-08, "logits/chosen": -18.87004280090332, "logits/rejected": -18.09832763671875, "logps/chosen": -499.87481689453125, "logps/rejected": -437.7247619628906, "loss": 0.2676, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.8728132247924805, "rewards/margins": 2.559023380279541, "rewards/rejected": 2.3137898445129395, "step": 55060 }, { "epoch": 2.556757509633688, "grad_norm": 55.85272216796875, "learning_rate": 7.408267174273024e-08, "logits/chosen": -20.455121994018555, "logits/rejected": -19.26797866821289, "logps/chosen": -447.49493408203125, "logps/rejected": -349.8556823730469, "loss": 0.3427, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.095511436462402, "rewards/margins": 2.0348896980285645, "rewards/rejected": 3.060621500015259, "step": 55070 }, { "epoch": 2.5572217837411206, "grad_norm": 72.9500732421875, "learning_rate": 7.400529272482473e-08, "logits/chosen": -19.587467193603516, "logits/rejected": -18.826398849487305, "logps/chosen": -400.724365234375, "logps/rejected": -341.19049072265625, "loss": 
0.7103, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.7200634479522705, "rewards/margins": 0.7651049494743347, "rewards/rejected": 2.954958438873291, "step": 55080 }, { "epoch": 2.557686057848554, "grad_norm": 34.2793083190918, "learning_rate": 7.392791370691922e-08, "logits/chosen": -18.502349853515625, "logits/rejected": -17.87405014038086, "logps/chosen": -373.9720764160156, "logps/rejected": -266.0234069824219, "loss": 0.6141, "rewards/accuracies": 0.5, "rewards/chosen": 2.4935355186462402, "rewards/margins": 1.0015475749969482, "rewards/rejected": 1.491987943649292, "step": 55090 }, { "epoch": 2.5581503319559866, "grad_norm": 79.26988220214844, "learning_rate": 7.385053468901373e-08, "logits/chosen": -19.02507972717285, "logits/rejected": -19.102914810180664, "logps/chosen": -344.49627685546875, "logps/rejected": -362.97283935546875, "loss": 0.574, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.254856586456299, "rewards/margins": 0.6523000597953796, "rewards/rejected": 2.6025562286376953, "step": 55100 }, { "epoch": 2.55861460606342, "grad_norm": 31.943546295166016, "learning_rate": 7.377315567110822e-08, "logits/chosen": -18.207481384277344, "logits/rejected": -17.708044052124023, "logps/chosen": -293.4794616699219, "logps/rejected": -284.79840087890625, "loss": 0.9078, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.9789403676986694, "rewards/margins": 0.3659832775592804, "rewards/rejected": 0.6129571199417114, "step": 55110 }, { "epoch": 2.559078880170853, "grad_norm": 177.5049591064453, "learning_rate": 7.369577665320271e-08, "logits/chosen": -18.286914825439453, "logits/rejected": -18.641834259033203, "logps/chosen": -326.7882385253906, "logps/rejected": -384.7091369628906, "loss": 1.6069, "rewards/accuracies": 0.5, "rewards/chosen": 2.361009120941162, "rewards/margins": -0.6974659562110901, "rewards/rejected": 3.0584750175476074, "step": 55120 }, { "epoch": 2.559543154278286, "grad_norm": 0.582993745803833, 
"learning_rate": 7.361839763529721e-08, "logits/chosen": -18.275676727294922, "logits/rejected": -17.310077667236328, "logps/chosen": -420.444091796875, "logps/rejected": -272.2598571777344, "loss": 0.45, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.3477184772491455, "rewards/margins": 1.6470985412597656, "rewards/rejected": 1.7006202936172485, "step": 55130 }, { "epoch": 2.560007428385719, "grad_norm": 42.20568084716797, "learning_rate": 7.354101861739171e-08, "logits/chosen": -19.308753967285156, "logits/rejected": -19.128284454345703, "logps/chosen": -406.82470703125, "logps/rejected": -382.16070556640625, "loss": 0.2327, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.282712936401367, "rewards/margins": 2.4658203125, "rewards/rejected": 1.8168929815292358, "step": 55140 }, { "epoch": 2.560471702493152, "grad_norm": 26.689510345458984, "learning_rate": 7.346363959948621e-08, "logits/chosen": -19.486148834228516, "logits/rejected": -18.427520751953125, "logps/chosen": -339.4742126464844, "logps/rejected": -239.93563842773438, "loss": 0.4411, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.224135160446167, "rewards/margins": 1.0980280637741089, "rewards/rejected": 1.126106858253479, "step": 55150 }, { "epoch": 2.560935976600585, "grad_norm": 117.84323120117188, "learning_rate": 7.338626058158069e-08, "logits/chosen": -17.52554702758789, "logits/rejected": -18.037853240966797, "logps/chosen": -354.5119934082031, "logps/rejected": -368.17730712890625, "loss": 1.2155, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.5966246128082275, "rewards/margins": 0.29932188987731934, "rewards/rejected": 2.297302722930908, "step": 55160 }, { "epoch": 2.561400250708018, "grad_norm": 36.9780387878418, "learning_rate": 7.330888156367519e-08, "logits/chosen": -18.38951301574707, "logits/rejected": -18.074825286865234, "logps/chosen": -388.8338317871094, "logps/rejected": -274.4925537109375, "loss": 0.423, "rewards/accuracies": 
0.800000011920929, "rewards/chosen": 2.6927874088287354, "rewards/margins": 1.519845962524414, "rewards/rejected": 1.1729412078857422, "step": 55170 }, { "epoch": 2.561864524815451, "grad_norm": 37.272300720214844, "learning_rate": 7.323150254576968e-08, "logits/chosen": -19.73582649230957, "logits/rejected": -18.486949920654297, "logps/chosen": -350.60699462890625, "logps/rejected": -210.839599609375, "loss": 0.5107, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.285004138946533, "rewards/margins": 1.5437763929367065, "rewards/rejected": 0.7412273287773132, "step": 55180 }, { "epoch": 2.5623287989228842, "grad_norm": 59.04814529418945, "learning_rate": 7.315412352786419e-08, "logits/chosen": -20.447650909423828, "logits/rejected": -20.002534866333008, "logps/chosen": -398.7685546875, "logps/rejected": -385.9552917480469, "loss": 1.0084, "rewards/accuracies": 0.5, "rewards/chosen": 2.6288976669311523, "rewards/margins": 0.02436993084847927, "rewards/rejected": 2.604527711868286, "step": 55190 }, { "epoch": 2.562793073030317, "grad_norm": 61.879451751708984, "learning_rate": 7.307674450995868e-08, "logits/chosen": -18.68392562866211, "logits/rejected": -17.684707641601562, "logps/chosen": -332.31280517578125, "logps/rejected": -242.26181030273438, "loss": 0.4713, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.098733901977539, "rewards/margins": 1.266807198524475, "rewards/rejected": 0.831926703453064, "step": 55200 }, { "epoch": 2.56325734713775, "grad_norm": 119.64881134033203, "learning_rate": 7.299936549205317e-08, "logits/chosen": -18.646381378173828, "logits/rejected": -19.339862823486328, "logps/chosen": -307.5018005371094, "logps/rejected": -381.0362548828125, "loss": 1.1435, "rewards/accuracies": 0.5, "rewards/chosen": 2.4405906200408936, "rewards/margins": -0.437509149312973, "rewards/rejected": 2.8780996799468994, "step": 55210 }, { "epoch": 2.563721621245183, "grad_norm": 38.9754524230957, "learning_rate": 
7.292198647414767e-08, "logits/chosen": -19.36127471923828, "logits/rejected": -17.337890625, "logps/chosen": -461.5556640625, "logps/rejected": -252.9806671142578, "loss": 0.2291, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.799459457397461, "rewards/margins": 3.42232084274292, "rewards/rejected": 1.3771384954452515, "step": 55220 }, { "epoch": 2.564185895352616, "grad_norm": 48.883480072021484, "learning_rate": 7.284460745624215e-08, "logits/chosen": -20.105316162109375, "logits/rejected": -19.76650047302246, "logps/chosen": -340.7430725097656, "logps/rejected": -335.6733093261719, "loss": 0.6251, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.193535566329956, "rewards/margins": 0.9055142402648926, "rewards/rejected": 2.2880208492279053, "step": 55230 }, { "epoch": 2.564650169460049, "grad_norm": 0.9420232176780701, "learning_rate": 7.276722843833666e-08, "logits/chosen": -18.640338897705078, "logits/rejected": -18.733369827270508, "logps/chosen": -334.03094482421875, "logps/rejected": -259.90618896484375, "loss": 1.1075, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.085174798965454, "rewards/margins": 0.45177143812179565, "rewards/rejected": 1.6334034204483032, "step": 55240 }, { "epoch": 2.565114443567482, "grad_norm": 161.94436645507812, "learning_rate": 7.268984942043115e-08, "logits/chosen": -19.642770767211914, "logits/rejected": -19.279788970947266, "logps/chosen": -370.85296630859375, "logps/rejected": -328.9444580078125, "loss": 0.774, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.255945920944214, "rewards/margins": 0.6237925291061401, "rewards/rejected": 2.632153272628784, "step": 55250 }, { "epoch": 2.5655787176749154, "grad_norm": 162.98484802246094, "learning_rate": 7.261247040252565e-08, "logits/chosen": -18.608875274658203, "logits/rejected": -17.480527877807617, "logps/chosen": -360.80572509765625, "logps/rejected": -227.9396209716797, "loss": 0.8328, "rewards/accuracies": 
0.6000000238418579, "rewards/chosen": 2.410078287124634, "rewards/margins": 1.1401559114456177, "rewards/rejected": 1.2699220180511475, "step": 55260 }, { "epoch": 2.566042991782348, "grad_norm": 195.56480407714844, "learning_rate": 7.253509138462014e-08, "logits/chosen": -19.48777198791504, "logits/rejected": -17.847930908203125, "logps/chosen": -355.8170166015625, "logps/rejected": -305.49334716796875, "loss": 0.5209, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 5.120147228240967, "rewards/margins": 2.7172322273254395, "rewards/rejected": 2.4029154777526855, "step": 55270 }, { "epoch": 2.5665072658897814, "grad_norm": 0.25612616539001465, "learning_rate": 7.245771236671463e-08, "logits/chosen": -18.43195152282715, "logits/rejected": -18.952842712402344, "logps/chosen": -443.37664794921875, "logps/rejected": -376.8715515136719, "loss": 0.824, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2499656677246094, "rewards/margins": 1.2731691598892212, "rewards/rejected": 1.9767965078353882, "step": 55280 }, { "epoch": 2.5669715399972146, "grad_norm": 13.097200393676758, "learning_rate": 7.238033334880914e-08, "logits/chosen": -19.43903923034668, "logits/rejected": -19.00360679626465, "logps/chosen": -491.9598083496094, "logps/rejected": -372.90185546875, "loss": 0.6465, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.005360126495361, "rewards/margins": 1.2381969690322876, "rewards/rejected": 2.7671632766723633, "step": 55290 }, { "epoch": 2.5674358141046474, "grad_norm": 84.42119598388672, "learning_rate": 7.230295433090363e-08, "logits/chosen": -17.877172470092773, "logits/rejected": -18.470142364501953, "logps/chosen": -275.402099609375, "logps/rejected": -298.33184814453125, "loss": 1.5426, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.513889789581299, "rewards/margins": -0.0071905613876879215, "rewards/rejected": 2.5210800170898438, "step": 55300 }, { "epoch": 2.56790008821208, "grad_norm": 178.20556640625, 
"learning_rate": 7.222557531299812e-08, "logits/chosen": -20.454975128173828, "logits/rejected": -18.58953857421875, "logps/chosen": -464.8194274902344, "logps/rejected": -295.18682861328125, "loss": 0.6257, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.380660057067871, "rewards/margins": 2.139742374420166, "rewards/rejected": 2.240917682647705, "step": 55310 }, { "epoch": 2.5683643623195134, "grad_norm": 93.30158996582031, "learning_rate": 7.214819629509262e-08, "logits/chosen": -18.274707794189453, "logits/rejected": -18.178041458129883, "logps/chosen": -266.86639404296875, "logps/rejected": -223.2352294921875, "loss": 1.0419, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.0979371070861816, "rewards/margins": 0.7320374250411987, "rewards/rejected": 1.3658993244171143, "step": 55320 }, { "epoch": 2.5688286364269466, "grad_norm": 196.073974609375, "learning_rate": 7.207081727718711e-08, "logits/chosen": -18.942352294921875, "logits/rejected": -18.465457916259766, "logps/chosen": -337.3128662109375, "logps/rejected": -244.87863159179688, "loss": 0.9046, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.815962314605713, "rewards/margins": 0.6075140237808228, "rewards/rejected": 2.2084479331970215, "step": 55330 }, { "epoch": 2.5692929105343794, "grad_norm": 0.8916886448860168, "learning_rate": 7.199343825928162e-08, "logits/chosen": -18.266536712646484, "logits/rejected": -17.309375762939453, "logps/chosen": -391.0038146972656, "logps/rejected": -263.55810546875, "loss": 0.4667, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.565453052520752, "rewards/margins": 2.4929051399230957, "rewards/rejected": 1.0725480318069458, "step": 55340 }, { "epoch": 2.5697571846418126, "grad_norm": 19.5936222076416, "learning_rate": 7.19160592413761e-08, "logits/chosen": -19.53192138671875, "logits/rejected": -19.448389053344727, "logps/chosen": -293.1213073730469, "logps/rejected": -309.5860900878906, "loss": 0.9232, 
"rewards/accuracies": 0.5, "rewards/chosen": 2.6597750186920166, "rewards/margins": 0.28972965478897095, "rewards/rejected": 2.3700451850891113, "step": 55350 }, { "epoch": 2.570221458749246, "grad_norm": 0.128727987408638, "learning_rate": 7.18386802234706e-08, "logits/chosen": -19.93429946899414, "logits/rejected": -19.400774002075195, "logps/chosen": -508.04376220703125, "logps/rejected": -384.3841857910156, "loss": 0.2369, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.9376816749572754, "rewards/margins": 2.4388396739959717, "rewards/rejected": 1.4988420009613037, "step": 55360 }, { "epoch": 2.5706857328566786, "grad_norm": 117.0697250366211, "learning_rate": 7.176130120556509e-08, "logits/chosen": -18.494525909423828, "logits/rejected": -17.172306060791016, "logps/chosen": -453.06121826171875, "logps/rejected": -318.5806884765625, "loss": 0.4985, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.032362937927246, "rewards/margins": 2.500641345977783, "rewards/rejected": 1.5317217111587524, "step": 55370 }, { "epoch": 2.5711500069641113, "grad_norm": 10.338272094726562, "learning_rate": 7.168392218765958e-08, "logits/chosen": -19.27803611755371, "logits/rejected": -18.172744750976562, "logps/chosen": -343.108154296875, "logps/rejected": -298.2586975097656, "loss": 0.7046, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.5003247261047363, "rewards/margins": 1.1791728734970093, "rewards/rejected": 2.3211517333984375, "step": 55380 }, { "epoch": 2.5716142810715445, "grad_norm": 128.2122039794922, "learning_rate": 7.16065431697541e-08, "logits/chosen": -19.738494873046875, "logits/rejected": -18.797382354736328, "logps/chosen": -404.9769287109375, "logps/rejected": -344.431884765625, "loss": 0.6681, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.7796432971954346, "rewards/margins": 1.4672654867172241, "rewards/rejected": 2.312377452850342, "step": 55390 }, { "epoch": 2.5720785551789778, "grad_norm": 
33.76868438720703, "learning_rate": 7.152916415184858e-08, "logits/chosen": -19.756229400634766, "logits/rejected": -18.250892639160156, "logps/chosen": -367.3284606933594, "logps/rejected": -319.3873596191406, "loss": 0.8058, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.3465590476989746, "rewards/margins": 2.079643726348877, "rewards/rejected": 1.2669153213500977, "step": 55400 }, { "epoch": 2.5725428292864105, "grad_norm": 0.5750074982643127, "learning_rate": 7.145178513394308e-08, "logits/chosen": -19.526042938232422, "logits/rejected": -18.49312973022461, "logps/chosen": -433.8680725097656, "logps/rejected": -366.41619873046875, "loss": 0.7632, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.33555269241333, "rewards/margins": 1.090498685836792, "rewards/rejected": 3.245053768157959, "step": 55410 }, { "epoch": 2.5730071033938438, "grad_norm": 39.934364318847656, "learning_rate": 7.137440611603756e-08, "logits/chosen": -18.79538917541504, "logits/rejected": -18.599868774414062, "logps/chosen": -466.81011962890625, "logps/rejected": -379.09149169921875, "loss": 0.8888, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.506706714630127, "rewards/margins": 0.6798730492591858, "rewards/rejected": 2.826833724975586, "step": 55420 }, { "epoch": 2.573471377501277, "grad_norm": 3.772614002227783, "learning_rate": 7.129702709813207e-08, "logits/chosen": -18.915454864501953, "logits/rejected": -18.961427688598633, "logps/chosen": -439.2158203125, "logps/rejected": -441.61669921875, "loss": 0.8872, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.744019985198975, "rewards/margins": 0.4966585040092468, "rewards/rejected": 4.247361183166504, "step": 55430 }, { "epoch": 2.5739356516087097, "grad_norm": 45.61404800415039, "learning_rate": 7.121964808022657e-08, "logits/chosen": -19.01608657836914, "logits/rejected": -18.315587997436523, "logps/chosen": -394.1138916015625, "logps/rejected": -321.41400146484375, "loss": 
0.9445, "rewards/accuracies": 0.5, "rewards/chosen": 2.4549946784973145, "rewards/margins": 0.3700399100780487, "rewards/rejected": 2.0849547386169434, "step": 55440 }, { "epoch": 2.574399925716143, "grad_norm": 0.029220473021268845, "learning_rate": 7.114226906232106e-08, "logits/chosen": -19.24654197692871, "logits/rejected": -19.23463249206543, "logps/chosen": -293.69012451171875, "logps/rejected": -264.869873046875, "loss": 0.5761, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.845184326171875, "rewards/margins": 1.7874867916107178, "rewards/rejected": 1.057697057723999, "step": 55450 }, { "epoch": 2.5748641998235757, "grad_norm": 49.58427429199219, "learning_rate": 7.106489004441556e-08, "logits/chosen": -19.26822280883789, "logits/rejected": -18.9401912689209, "logps/chosen": -516.4686279296875, "logps/rejected": -466.3287658691406, "loss": 0.3583, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.6130733489990234, "rewards/margins": 1.3336727619171143, "rewards/rejected": 2.279400587081909, "step": 55460 }, { "epoch": 2.575328473931009, "grad_norm": 33.112998962402344, "learning_rate": 7.098751102651004e-08, "logits/chosen": -18.9254093170166, "logits/rejected": -17.790088653564453, "logps/chosen": -401.4954528808594, "logps/rejected": -282.45037841796875, "loss": 0.3418, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.8305885791778564, "rewards/margins": 1.5770502090454102, "rewards/rejected": 1.2535386085510254, "step": 55470 }, { "epoch": 2.5757927480384417, "grad_norm": 6.606535911560059, "learning_rate": 7.091013200860455e-08, "logits/chosen": -18.753589630126953, "logits/rejected": -17.837251663208008, "logps/chosen": -452.8370666503906, "logps/rejected": -352.02423095703125, "loss": 0.4411, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.335818767547607, "rewards/margins": 1.931089997291565, "rewards/rejected": 2.404729127883911, "step": 55480 }, { "epoch": 2.576257022145875, "grad_norm": 
101.4499282836914, "learning_rate": 7.083275299069904e-08, "logits/chosen": -18.449111938476562, "logits/rejected": -18.769071578979492, "logps/chosen": -276.95953369140625, "logps/rejected": -295.4195251464844, "loss": 1.27, "rewards/accuracies": 0.5, "rewards/chosen": 1.250640869140625, "rewards/margins": -0.3392745852470398, "rewards/rejected": 1.58991539478302, "step": 55490 }, { "epoch": 2.576721296253308, "grad_norm": 18.75156593322754, "learning_rate": 7.075537397279353e-08, "logits/chosen": -19.197919845581055, "logits/rejected": -18.22177505493164, "logps/chosen": -347.18524169921875, "logps/rejected": -270.29510498046875, "loss": 0.8911, "rewards/accuracies": 0.5, "rewards/chosen": 3.1481575965881348, "rewards/margins": 0.9527220726013184, "rewards/rejected": 2.1954355239868164, "step": 55500 }, { "epoch": 2.577185570360741, "grad_norm": 0.046269491314888, "learning_rate": 7.067799495488803e-08, "logits/chosen": -18.9309024810791, "logits/rejected": -17.179454803466797, "logps/chosen": -478.745849609375, "logps/rejected": -252.52108764648438, "loss": 0.1765, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.069224834442139, "rewards/margins": 3.9910054206848145, "rewards/rejected": 1.0782192945480347, "step": 55510 }, { "epoch": 2.577649844468174, "grad_norm": 89.75064849853516, "learning_rate": 7.060061593698252e-08, "logits/chosen": -18.93280029296875, "logits/rejected": -18.83261489868164, "logps/chosen": -374.97723388671875, "logps/rejected": -370.67730712890625, "loss": 1.211, "rewards/accuracies": 0.5, "rewards/chosen": 3.337505340576172, "rewards/margins": -0.45681291818618774, "rewards/rejected": 3.794318437576294, "step": 55520 }, { "epoch": 2.578114118575607, "grad_norm": 46.66748046875, "learning_rate": 7.052323691907703e-08, "logits/chosen": -18.721647262573242, "logits/rejected": -18.132047653198242, "logps/chosen": -434.16455078125, "logps/rejected": -372.9755859375, "loss": 0.9117, "rewards/accuracies": 0.6000000238418579, 
"rewards/chosen": 3.9512038230895996, "rewards/margins": 0.562181830406189, "rewards/rejected": 3.3890223503112793, "step": 55530 }, { "epoch": 2.57857839268304, "grad_norm": 2.504976987838745, "learning_rate": 7.044585790117151e-08, "logits/chosen": -18.86113166809082, "logits/rejected": -18.030868530273438, "logps/chosen": -287.0489807128906, "logps/rejected": -246.99130249023438, "loss": 0.6793, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.225066661834717, "rewards/margins": 1.1544944047927856, "rewards/rejected": 1.0705724954605103, "step": 55540 }, { "epoch": 2.579042666790473, "grad_norm": 17.778188705444336, "learning_rate": 7.036847888326601e-08, "logits/chosen": -19.522449493408203, "logits/rejected": -19.512928009033203, "logps/chosen": -386.01483154296875, "logps/rejected": -374.07073974609375, "loss": 0.7659, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.8670389652252197, "rewards/margins": 0.8161064386367798, "rewards/rejected": 2.0509324073791504, "step": 55550 }, { "epoch": 2.579506940897906, "grad_norm": 46.41068649291992, "learning_rate": 7.029109986536051e-08, "logits/chosen": -18.889179229736328, "logits/rejected": -18.76412582397461, "logps/chosen": -365.1340637207031, "logps/rejected": -337.1041564941406, "loss": 0.7041, "rewards/accuracies": 0.5, "rewards/chosen": 3.6943981647491455, "rewards/margins": 0.45239192247390747, "rewards/rejected": 3.242006301879883, "step": 55560 }, { "epoch": 2.5799712150053393, "grad_norm": 89.38848114013672, "learning_rate": 7.0213720847455e-08, "logits/chosen": -18.702777862548828, "logits/rejected": -18.39582061767578, "logps/chosen": -299.27459716796875, "logps/rejected": -316.01605224609375, "loss": 0.8824, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.6687989234924316, "rewards/margins": 0.7448211312294006, "rewards/rejected": 1.9239776134490967, "step": 55570 }, { "epoch": 2.580435489112772, "grad_norm": 79.64315795898438, "learning_rate": 
7.01363418295495e-08, "logits/chosen": -19.76597023010254, "logits/rejected": -19.338544845581055, "logps/chosen": -330.9100036621094, "logps/rejected": -304.8959045410156, "loss": 0.3379, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.3639004230499268, "rewards/margins": 1.41226065158844, "rewards/rejected": 1.9516395330429077, "step": 55580 }, { "epoch": 2.5808997632202053, "grad_norm": 23.83411979675293, "learning_rate": 7.005896281164399e-08, "logits/chosen": -18.606868743896484, "logits/rejected": -17.609500885009766, "logps/chosen": -305.683837890625, "logps/rejected": -319.9801025390625, "loss": 1.1427, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.565221071243286, "rewards/margins": 0.8855813145637512, "rewards/rejected": 2.6796391010284424, "step": 55590 }, { "epoch": 2.581364037327638, "grad_norm": 225.67166137695312, "learning_rate": 6.998158379373849e-08, "logits/chosen": -19.00846290588379, "logits/rejected": -18.446826934814453, "logps/chosen": -340.1927490234375, "logps/rejected": -385.95281982421875, "loss": 1.745, "rewards/accuracies": 0.5, "rewards/chosen": 3.2798168659210205, "rewards/margins": -0.36441919207572937, "rewards/rejected": 3.6442363262176514, "step": 55600 }, { "epoch": 2.5818283114350713, "grad_norm": 0.43535420298576355, "learning_rate": 6.990420477583297e-08, "logits/chosen": -19.34627342224121, "logits/rejected": -18.39390754699707, "logps/chosen": -365.4703369140625, "logps/rejected": -257.4093933105469, "loss": 0.4861, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1620821952819824, "rewards/margins": 1.6777416467666626, "rewards/rejected": 1.484340786933899, "step": 55610 }, { "epoch": 2.582292585542504, "grad_norm": 33.264259338378906, "learning_rate": 6.982682575792747e-08, "logits/chosen": -19.491561889648438, "logits/rejected": -18.39120101928711, "logps/chosen": -355.2612609863281, "logps/rejected": -295.0618591308594, "loss": 0.3698, "rewards/accuracies": 0.800000011920929, 
"rewards/chosen": 3.829059600830078, "rewards/margins": 2.052225351333618, "rewards/rejected": 1.776834487915039, "step": 55620 }, { "epoch": 2.5827568596499373, "grad_norm": 89.72740173339844, "learning_rate": 6.974944674002198e-08, "logits/chosen": -18.786521911621094, "logits/rejected": -18.041921615600586, "logps/chosen": -445.937744140625, "logps/rejected": -371.2403259277344, "loss": 0.38, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.24153995513916, "rewards/margins": 1.2694473266601562, "rewards/rejected": 2.972093105316162, "step": 55630 }, { "epoch": 2.5832211337573705, "grad_norm": 26.501310348510742, "learning_rate": 6.967206772211647e-08, "logits/chosen": -19.31026840209961, "logits/rejected": -18.190420150756836, "logps/chosen": -439.31817626953125, "logps/rejected": -388.3258972167969, "loss": 0.7096, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.005049228668213, "rewards/margins": 1.3695570230484009, "rewards/rejected": 2.6354920864105225, "step": 55640 }, { "epoch": 2.5836854078648033, "grad_norm": 27.842639923095703, "learning_rate": 6.959468870421097e-08, "logits/chosen": -19.777103424072266, "logits/rejected": -19.22294807434082, "logps/chosen": -416.3085021972656, "logps/rejected": -365.2491455078125, "loss": 0.3261, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.9436256885528564, "rewards/margins": 1.561363935470581, "rewards/rejected": 2.3822619915008545, "step": 55650 }, { "epoch": 2.5841496819722365, "grad_norm": 156.80970764160156, "learning_rate": 6.951730968630545e-08, "logits/chosen": -18.91041374206543, "logits/rejected": -18.002731323242188, "logps/chosen": -469.0606384277344, "logps/rejected": -369.3053283691406, "loss": 0.9396, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.515784740447998, "rewards/margins": 1.6330009698867798, "rewards/rejected": 2.882784128189087, "step": 55660 }, { "epoch": 2.5846139560796697, "grad_norm": 3.661198139190674, "learning_rate": 
6.943993066839995e-08, "logits/chosen": -19.254558563232422, "logits/rejected": -18.939922332763672, "logps/chosen": -397.484375, "logps/rejected": -348.0068359375, "loss": 1.0674, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.754126787185669, "rewards/margins": 0.5280613303184509, "rewards/rejected": 3.2260653972625732, "step": 55670 }, { "epoch": 2.5850782301871025, "grad_norm": 34.68476867675781, "learning_rate": 6.936255165049446e-08, "logits/chosen": -18.68960189819336, "logits/rejected": -18.067123413085938, "logps/chosen": -437.46856689453125, "logps/rejected": -383.8670349121094, "loss": 0.5247, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.705883741378784, "rewards/margins": 0.957994818687439, "rewards/rejected": 2.7478888034820557, "step": 55680 }, { "epoch": 2.5855425042945352, "grad_norm": 306.029541015625, "learning_rate": 6.928517263258894e-08, "logits/chosen": -18.97705841064453, "logits/rejected": -18.662357330322266, "logps/chosen": -374.8796081542969, "logps/rejected": -309.30828857421875, "loss": 0.8755, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.958451509475708, "rewards/margins": 1.3807157278060913, "rewards/rejected": 2.577735424041748, "step": 55690 }, { "epoch": 2.5860067784019685, "grad_norm": 230.93685913085938, "learning_rate": 6.920779361468344e-08, "logits/chosen": -20.08309555053711, "logits/rejected": -18.66305160522461, "logps/chosen": -531.768310546875, "logps/rejected": -382.92767333984375, "loss": 0.4013, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.172922134399414, "rewards/margins": 1.8565938472747803, "rewards/rejected": 3.316328525543213, "step": 55700 }, { "epoch": 2.5864710525094017, "grad_norm": 3.4249167442321777, "learning_rate": 6.913041459677793e-08, "logits/chosen": -18.64818572998047, "logits/rejected": -18.22664451599121, "logps/chosen": -358.89569091796875, "logps/rejected": -307.7379150390625, "loss": 1.07, "rewards/accuracies": 0.699999988079071, 
"rewards/chosen": 3.7862274646759033, "rewards/margins": 1.3318763971328735, "rewards/rejected": 2.4543514251708984, "step": 55710 }, { "epoch": 2.5869353266168345, "grad_norm": 1.0231329202651978, "learning_rate": 6.905303557887243e-08, "logits/chosen": -19.758859634399414, "logits/rejected": -19.227933883666992, "logps/chosen": -482.250244140625, "logps/rejected": -432.4317932128906, "loss": 0.6233, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.512604713439941, "rewards/margins": 1.8641411066055298, "rewards/rejected": 3.648463726043701, "step": 55720 }, { "epoch": 2.5873996007242677, "grad_norm": 246.45384216308594, "learning_rate": 6.897565656096692e-08, "logits/chosen": -18.178850173950195, "logits/rejected": -17.78174591064453, "logps/chosen": -376.3438415527344, "logps/rejected": -350.3406066894531, "loss": 1.4233, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.7712905406951904, "rewards/margins": 0.3685416877269745, "rewards/rejected": 3.4027488231658936, "step": 55730 }, { "epoch": 2.587863874831701, "grad_norm": 155.42831420898438, "learning_rate": 6.889827754306142e-08, "logits/chosen": -19.087968826293945, "logits/rejected": -19.273296356201172, "logps/chosen": -463.9246520996094, "logps/rejected": -450.89727783203125, "loss": 1.0934, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 4.342103481292725, "rewards/margins": 0.06192593649029732, "rewards/rejected": 4.280177593231201, "step": 55740 }, { "epoch": 2.5883281489391337, "grad_norm": 36.186248779296875, "learning_rate": 6.882089852515592e-08, "logits/chosen": -18.451663970947266, "logits/rejected": -17.453296661376953, "logps/chosen": -441.233154296875, "logps/rejected": -324.9415283203125, "loss": 0.7733, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7091050148010254, "rewards/margins": 1.757615327835083, "rewards/rejected": 1.9514896869659424, "step": 55750 }, { "epoch": 2.5887924230465664, "grad_norm": 50.684104919433594, "learning_rate": 
6.87435195072504e-08, "logits/chosen": -19.359664916992188, "logits/rejected": -19.394758224487305, "logps/chosen": -328.60833740234375, "logps/rejected": -319.69561767578125, "loss": 0.9005, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.8621065616607666, "rewards/margins": 0.04386730119585991, "rewards/rejected": 2.8182389736175537, "step": 55760 }, { "epoch": 2.5892566971539996, "grad_norm": 98.35283660888672, "learning_rate": 6.866614048934492e-08, "logits/chosen": -18.899658203125, "logits/rejected": -18.60818099975586, "logps/chosen": -434.0438537597656, "logps/rejected": -410.8968811035156, "loss": 1.1543, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 4.3806538581848145, "rewards/margins": -0.012861823663115501, "rewards/rejected": 4.393515586853027, "step": 55770 }, { "epoch": 2.589720971261433, "grad_norm": 18.437610626220703, "learning_rate": 6.85887614714394e-08, "logits/chosen": -19.013072967529297, "logits/rejected": -18.133541107177734, "logps/chosen": -405.80206298828125, "logps/rejected": -378.5815734863281, "loss": 1.5302, "rewards/accuracies": 0.5, "rewards/chosen": 3.7202823162078857, "rewards/margins": 0.7490787506103516, "rewards/rejected": 2.971203565597534, "step": 55780 }, { "epoch": 2.5901852453688656, "grad_norm": 159.7034149169922, "learning_rate": 6.85113824535339e-08, "logits/chosen": -19.151782989501953, "logits/rejected": -17.615524291992188, "logps/chosen": -461.7955627441406, "logps/rejected": -344.62469482421875, "loss": 0.3188, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.452948570251465, "rewards/margins": 2.9710755348205566, "rewards/rejected": 1.4818731546401978, "step": 55790 }, { "epoch": 2.590649519476299, "grad_norm": 13.14233684539795, "learning_rate": 6.84340034356284e-08, "logits/chosen": -20.58344268798828, "logits/rejected": -19.451501846313477, "logps/chosen": -456.30084228515625, "logps/rejected": -436.54974365234375, "loss": 0.8281, "rewards/accuracies": 
0.6000000238418579, "rewards/chosen": 4.337139129638672, "rewards/margins": 0.3255557119846344, "rewards/rejected": 4.01158332824707, "step": 55800 }, { "epoch": 2.591113793583732, "grad_norm": 213.94236755371094, "learning_rate": 6.835662441772288e-08, "logits/chosen": -18.340539932250977, "logits/rejected": -17.96535301208496, "logps/chosen": -380.02740478515625, "logps/rejected": -301.7413635253906, "loss": 1.2258, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.7210559844970703, "rewards/margins": 0.6834331750869751, "rewards/rejected": 3.0376229286193848, "step": 55810 }, { "epoch": 2.591578067691165, "grad_norm": 4.928842544555664, "learning_rate": 6.827924539981739e-08, "logits/chosen": -19.290817260742188, "logits/rejected": -18.057281494140625, "logps/chosen": -517.861572265625, "logps/rejected": -426.51904296875, "loss": 1.1018, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.954925060272217, "rewards/margins": 1.0608766078948975, "rewards/rejected": 3.8940486907958984, "step": 55820 }, { "epoch": 2.592042341798598, "grad_norm": 12.884347915649414, "learning_rate": 6.820186638191188e-08, "logits/chosen": -18.6080322265625, "logits/rejected": -17.19556999206543, "logps/chosen": -358.17193603515625, "logps/rejected": -160.90501403808594, "loss": 0.4022, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.170419692993164, "rewards/margins": 2.4668381214141846, "rewards/rejected": -0.29641860723495483, "step": 55830 }, { "epoch": 2.592506615906031, "grad_norm": 6.400677680969238, "learning_rate": 6.812448736400638e-08, "logits/chosen": -18.651710510253906, "logits/rejected": -17.935636520385742, "logps/chosen": -379.80291748046875, "logps/rejected": -290.01068115234375, "loss": 0.9348, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.207307815551758, "rewards/margins": 1.4940799474716187, "rewards/rejected": 1.7132278680801392, "step": 55840 }, { "epoch": 2.592970890013464, "grad_norm": 35.162113189697266, 
"learning_rate": 6.804710834610086e-08, "logits/chosen": -18.058837890625, "logits/rejected": -17.787885665893555, "logps/chosen": -408.3616638183594, "logps/rejected": -348.68621826171875, "loss": 0.558, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.934832811355591, "rewards/margins": 1.3026787042617798, "rewards/rejected": 1.632153868675232, "step": 55850 }, { "epoch": 2.593435164120897, "grad_norm": 131.2801055908203, "learning_rate": 6.796972932819536e-08, "logits/chosen": -18.410602569580078, "logits/rejected": -17.78457260131836, "logps/chosen": -425.26910400390625, "logps/rejected": -321.51190185546875, "loss": 0.5444, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6886088848114014, "rewards/margins": 1.8785688877105713, "rewards/rejected": 1.8100401163101196, "step": 55860 }, { "epoch": 2.59389943822833, "grad_norm": 0.03252244368195534, "learning_rate": 6.789235031028987e-08, "logits/chosen": -19.163000106811523, "logits/rejected": -18.3362979888916, "logps/chosen": -302.0025329589844, "logps/rejected": -283.6254577636719, "loss": 0.4849, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.3317489624023438, "rewards/margins": 1.7701923847198486, "rewards/rejected": 1.5615565776824951, "step": 55870 }, { "epoch": 2.5943637123357632, "grad_norm": 48.366493225097656, "learning_rate": 6.781497129238435e-08, "logits/chosen": -19.337453842163086, "logits/rejected": -19.4108943939209, "logps/chosen": -339.8298645019531, "logps/rejected": -400.19476318359375, "loss": 0.8327, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.539764881134033, "rewards/margins": 0.3249230682849884, "rewards/rejected": 2.214841842651367, "step": 55880 }, { "epoch": 2.594827986443196, "grad_norm": 1.4551130533218384, "learning_rate": 6.773759227447885e-08, "logits/chosen": -18.22614860534668, "logits/rejected": -17.821441650390625, "logps/chosen": -391.0312194824219, "logps/rejected": -286.7099609375, "loss": 0.5258, 
"rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.148542881011963, "rewards/margins": 1.484349250793457, "rewards/rejected": 1.6641935110092163, "step": 55890 }, { "epoch": 2.5952922605506292, "grad_norm": 226.10765075683594, "learning_rate": 6.766021325657334e-08, "logits/chosen": -19.758045196533203, "logits/rejected": -19.852157592773438, "logps/chosen": -387.0216369628906, "logps/rejected": -352.94097900390625, "loss": 0.8753, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4703686237335205, "rewards/margins": 0.705524742603302, "rewards/rejected": 2.7648444175720215, "step": 55900 }, { "epoch": 2.595756534658062, "grad_norm": 83.69791412353516, "learning_rate": 6.758283423866784e-08, "logits/chosen": -18.67463493347168, "logits/rejected": -19.028032302856445, "logps/chosen": -396.4700622558594, "logps/rejected": -442.7022399902344, "loss": 0.9446, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.734534740447998, "rewards/margins": 0.6162213087081909, "rewards/rejected": 3.1183133125305176, "step": 55910 }, { "epoch": 2.596220808765495, "grad_norm": 11.422734260559082, "learning_rate": 6.750545522076235e-08, "logits/chosen": -19.12360191345215, "logits/rejected": -19.003063201904297, "logps/chosen": -384.58905029296875, "logps/rejected": -380.07159423828125, "loss": 0.923, "rewards/accuracies": 0.5, "rewards/chosen": 3.318470001220703, "rewards/margins": -0.005701732821762562, "rewards/rejected": 3.324171781539917, "step": 55920 }, { "epoch": 2.596685082872928, "grad_norm": 137.65870666503906, "learning_rate": 6.742807620285683e-08, "logits/chosen": -19.414106369018555, "logits/rejected": -19.004711151123047, "logps/chosen": -305.6461486816406, "logps/rejected": -299.29986572265625, "loss": 0.76, "rewards/accuracies": 0.5, "rewards/chosen": 2.600534439086914, "rewards/margins": 0.33546051383018494, "rewards/rejected": 2.2650742530822754, "step": 55930 }, { "epoch": 2.597149356980361, "grad_norm": 15.917695999145508, 
"learning_rate": 6.735069718495133e-08, "logits/chosen": -19.085941314697266, "logits/rejected": -18.307640075683594, "logps/chosen": -436.779052734375, "logps/rejected": -296.30517578125, "loss": 0.6499, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.183231830596924, "rewards/margins": 0.9963432550430298, "rewards/rejected": 2.186889171600342, "step": 55940 }, { "epoch": 2.5976136310877944, "grad_norm": 307.993896484375, "learning_rate": 6.727331816704581e-08, "logits/chosen": -19.92198944091797, "logits/rejected": -18.647249221801758, "logps/chosen": -514.8213500976562, "logps/rejected": -319.3261413574219, "loss": 0.3876, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.427736759185791, "rewards/margins": 2.935789108276367, "rewards/rejected": 2.491947650909424, "step": 55950 }, { "epoch": 2.598077905195227, "grad_norm": 10.282662391662598, "learning_rate": 6.719593914914031e-08, "logits/chosen": -20.05818748474121, "logits/rejected": -19.144969940185547, "logps/chosen": -504.2167053222656, "logps/rejected": -325.2435607910156, "loss": 0.1573, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.130173206329346, "rewards/margins": 2.9997949600219727, "rewards/rejected": 2.130378484725952, "step": 55960 }, { "epoch": 2.5985421793026604, "grad_norm": 3.2276227474212646, "learning_rate": 6.711856013123481e-08, "logits/chosen": -19.55341339111328, "logits/rejected": -18.790136337280273, "logps/chosen": -367.03314208984375, "logps/rejected": -263.7518310546875, "loss": 0.6558, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.1083953380584717, "rewards/margins": 1.2695410251617432, "rewards/rejected": 1.838854432106018, "step": 55970 }, { "epoch": 2.599006453410093, "grad_norm": 52.88205337524414, "learning_rate": 6.704118111332931e-08, "logits/chosen": -18.27476692199707, "logits/rejected": -18.038034439086914, "logps/chosen": -307.78582763671875, "logps/rejected": -291.6128845214844, "loss": 1.1278, 
"rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.3986153602600098, "rewards/margins": 0.014276576228439808, "rewards/rejected": 2.384338855743408, "step": 55980 }, { "epoch": 2.5994707275175264, "grad_norm": 90.46521759033203, "learning_rate": 6.696380209542381e-08, "logits/chosen": -18.05438804626465, "logits/rejected": -18.71347427368164, "logps/chosen": -290.109375, "logps/rejected": -388.53936767578125, "loss": 1.3061, "rewards/accuracies": 0.5, "rewards/chosen": 2.5186614990234375, "rewards/margins": -0.14602582156658173, "rewards/rejected": 2.664687156677246, "step": 55990 }, { "epoch": 2.599935001624959, "grad_norm": 34.731849670410156, "learning_rate": 6.688642307751829e-08, "logits/chosen": -19.754169464111328, "logits/rejected": -19.291616439819336, "logps/chosen": -451.6236267089844, "logps/rejected": -345.4239196777344, "loss": 0.7802, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 5.3052754402160645, "rewards/margins": 1.4731776714324951, "rewards/rejected": 3.8320980072021484, "step": 56000 }, { "epoch": 2.6003992757323924, "grad_norm": 196.837890625, "learning_rate": 6.680904405961279e-08, "logits/chosen": -18.235240936279297, "logits/rejected": -17.018646240234375, "logps/chosen": -507.57440185546875, "logps/rejected": -365.1752624511719, "loss": 0.7166, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.5739612579345703, "rewards/margins": 1.9573934078216553, "rewards/rejected": 1.616567611694336, "step": 56010 }, { "epoch": 2.6008635498398256, "grad_norm": 39.198341369628906, "learning_rate": 6.673166504170729e-08, "logits/chosen": -18.572330474853516, "logits/rejected": -18.55998992919922, "logps/chosen": -398.87213134765625, "logps/rejected": -370.21380615234375, "loss": 0.9678, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4679789543151855, "rewards/margins": 0.35296112298965454, "rewards/rejected": 3.115017890930176, "step": 56020 }, { "epoch": 2.6013278239472584, "grad_norm": 
40.63895797729492, "learning_rate": 6.665428602380179e-08, "logits/chosen": -19.806392669677734, "logits/rejected": -19.757930755615234, "logps/chosen": -345.6514892578125, "logps/rejected": -368.1250915527344, "loss": 1.0468, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.524125337600708, "rewards/margins": 0.6129779815673828, "rewards/rejected": 2.911147356033325, "step": 56030 }, { "epoch": 2.6017920980546916, "grad_norm": 56.06557846069336, "learning_rate": 6.657690700589628e-08, "logits/chosen": -19.47694969177246, "logits/rejected": -17.836360931396484, "logps/chosen": -483.06964111328125, "logps/rejected": -414.5889587402344, "loss": 0.443, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.404747486114502, "rewards/margins": 2.3238725662231445, "rewards/rejected": 2.0808751583099365, "step": 56040 }, { "epoch": 2.6022563721621244, "grad_norm": 1.72554349899292, "learning_rate": 6.649952798799077e-08, "logits/chosen": -18.839157104492188, "logits/rejected": -17.80169105529785, "logps/chosen": -398.5960693359375, "logps/rejected": -327.6250915527344, "loss": 0.4511, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.04189920425415, "rewards/margins": 1.6044807434082031, "rewards/rejected": 2.437418222427368, "step": 56050 }, { "epoch": 2.6027206462695576, "grad_norm": 87.61849975585938, "learning_rate": 6.642214897008528e-08, "logits/chosen": -18.84401512145996, "logits/rejected": -18.522491455078125, "logps/chosen": -401.57794189453125, "logps/rejected": -358.9267272949219, "loss": 0.5023, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.808464765548706, "rewards/margins": 1.456329584121704, "rewards/rejected": 2.352134943008423, "step": 56060 }, { "epoch": 2.6031849203769903, "grad_norm": 13.2479829788208, "learning_rate": 6.634476995217976e-08, "logits/chosen": -19.63948631286621, "logits/rejected": -18.29409408569336, "logps/chosen": -378.1085510253906, "logps/rejected": -301.75634765625, "loss": 0.7596, 
"rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.498429775238037, "rewards/margins": 1.2170307636260986, "rewards/rejected": 2.2813992500305176, "step": 56070 }, { "epoch": 2.6036491944844236, "grad_norm": 179.49659729003906, "learning_rate": 6.626739093427426e-08, "logits/chosen": -19.90313720703125, "logits/rejected": -18.7067928314209, "logps/chosen": -426.849365234375, "logps/rejected": -347.05889892578125, "loss": 0.7699, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.925602436065674, "rewards/margins": 1.3557815551757812, "rewards/rejected": 2.5698208808898926, "step": 56080 }, { "epoch": 2.6041134685918568, "grad_norm": 254.84487915039062, "learning_rate": 6.619001191636875e-08, "logits/chosen": -19.168800354003906, "logits/rejected": -17.584224700927734, "logps/chosen": -431.72503662109375, "logps/rejected": -292.91949462890625, "loss": 0.5443, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.288902759552002, "rewards/margins": 3.1594021320343018, "rewards/rejected": 1.1295006275177002, "step": 56090 }, { "epoch": 2.6045777426992895, "grad_norm": 273.51361083984375, "learning_rate": 6.611263289846325e-08, "logits/chosen": -19.32180404663086, "logits/rejected": -18.5722713470459, "logps/chosen": -461.5111389160156, "logps/rejected": -384.45233154296875, "loss": 0.8139, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.8091320991516113, "rewards/margins": 1.5001229047775269, "rewards/rejected": 2.3090085983276367, "step": 56100 }, { "epoch": 2.6050420168067228, "grad_norm": 142.7447509765625, "learning_rate": 6.603525388055776e-08, "logits/chosen": -17.928577423095703, "logits/rejected": -17.88016700744629, "logps/chosen": -313.2626953125, "logps/rejected": -287.287109375, "loss": 1.0641, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.9391634464263916, "rewards/margins": 0.6051099896430969, "rewards/rejected": 1.3340532779693604, "step": 56110 }, { "epoch": 2.605506290914156, "grad_norm": 
94.14120483398438, "learning_rate": 6.595787486265224e-08, "logits/chosen": -18.775772094726562, "logits/rejected": -17.872570037841797, "logps/chosen": -432.79833984375, "logps/rejected": -313.35516357421875, "loss": 0.6495, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.7434897422790527, "rewards/margins": 1.6308225393295288, "rewards/rejected": 2.1126675605773926, "step": 56120 }, { "epoch": 2.6059705650215887, "grad_norm": 14.91661262512207, "learning_rate": 6.588049584474674e-08, "logits/chosen": -19.053558349609375, "logits/rejected": -20.1671085357666, "logps/chosen": -385.74920654296875, "logps/rejected": -372.4044189453125, "loss": 1.0415, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.997065544128418, "rewards/margins": 0.32506951689720154, "rewards/rejected": 2.6719958782196045, "step": 56130 }, { "epoch": 2.6064348391290215, "grad_norm": 135.20529174804688, "learning_rate": 6.580311682684122e-08, "logits/chosen": -19.162511825561523, "logits/rejected": -19.08993148803711, "logps/chosen": -344.9150085449219, "logps/rejected": -293.0050964355469, "loss": 0.6446, "rewards/accuracies": 0.5, "rewards/chosen": 2.61340069770813, "rewards/margins": 0.7574673295021057, "rewards/rejected": 1.855933427810669, "step": 56140 }, { "epoch": 2.6068991132364547, "grad_norm": 79.05218505859375, "learning_rate": 6.572573780893572e-08, "logits/chosen": -18.588054656982422, "logits/rejected": -17.89051055908203, "logps/chosen": -372.6177673339844, "logps/rejected": -293.4461975097656, "loss": 1.0589, "rewards/accuracies": 0.5, "rewards/chosen": 3.2981467247009277, "rewards/margins": 1.03871488571167, "rewards/rejected": 2.259431838989258, "step": 56150 }, { "epoch": 2.607363387343888, "grad_norm": 73.7314224243164, "learning_rate": 6.564835879103022e-08, "logits/chosen": -19.984729766845703, "logits/rejected": -18.446075439453125, "logps/chosen": -493.8583068847656, "logps/rejected": -362.8478088378906, "loss": 0.214, "rewards/accuracies": 1.0, 
"rewards/chosen": 4.0655364990234375, "rewards/margins": 2.7151410579681396, "rewards/rejected": 1.3503955602645874, "step": 56160 }, { "epoch": 2.6078276614513207, "grad_norm": 0.05507350340485573, "learning_rate": 6.557097977312472e-08, "logits/chosen": -20.174238204956055, "logits/rejected": -19.267162322998047, "logps/chosen": -419.9256286621094, "logps/rejected": -255.9470977783203, "loss": 0.3853, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.075837135314941, "rewards/margins": 2.7216832637786865, "rewards/rejected": 2.354154109954834, "step": 56170 }, { "epoch": 2.608291935558754, "grad_norm": 0.2542901039123535, "learning_rate": 6.549360075521922e-08, "logits/chosen": -19.033966064453125, "logits/rejected": -17.930500030517578, "logps/chosen": -332.5145263671875, "logps/rejected": -190.22384643554688, "loss": 0.7002, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.038778066635132, "rewards/margins": 1.9735389947891235, "rewards/rejected": 1.0652393102645874, "step": 56180 }, { "epoch": 2.608756209666187, "grad_norm": 31.565921783447266, "learning_rate": 6.54162217373137e-08, "logits/chosen": -19.24602699279785, "logits/rejected": -19.440771102905273, "logps/chosen": -339.66888427734375, "logps/rejected": -374.2463684082031, "loss": 1.3807, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.0137734413146973, "rewards/margins": 0.18328571319580078, "rewards/rejected": 2.8304874897003174, "step": 56190 }, { "epoch": 2.60922048377362, "grad_norm": 31.462844848632812, "learning_rate": 6.53388427194082e-08, "logits/chosen": -19.672239303588867, "logits/rejected": -18.569011688232422, "logps/chosen": -402.5902404785156, "logps/rejected": -267.5203552246094, "loss": 0.6039, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.161701202392578, "rewards/margins": 1.5678025484085083, "rewards/rejected": 1.5938981771469116, "step": 56200 }, { "epoch": 2.6096847578810527, "grad_norm": 34.60498809814453, "learning_rate": 
6.52614637015027e-08, "logits/chosen": -18.317129135131836, "logits/rejected": -17.637470245361328, "logps/chosen": -444.0812072753906, "logps/rejected": -430.28155517578125, "loss": 1.2663, "rewards/accuracies": 0.5, "rewards/chosen": 2.710702419281006, "rewards/margins": -0.11270129680633545, "rewards/rejected": 2.82340407371521, "step": 56210 }, { "epoch": 2.610149031988486, "grad_norm": 8.391944885253906, "learning_rate": 6.51840846835972e-08, "logits/chosen": -18.63414764404297, "logits/rejected": -17.440753936767578, "logps/chosen": -406.7266540527344, "logps/rejected": -354.0189208984375, "loss": 0.4314, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7391390800476074, "rewards/margins": 1.5521570444107056, "rewards/rejected": 2.1869821548461914, "step": 56220 }, { "epoch": 2.610613306095919, "grad_norm": 28.94209861755371, "learning_rate": 6.51067056656917e-08, "logits/chosen": -19.928028106689453, "logits/rejected": -18.171768188476562, "logps/chosen": -345.2848205566406, "logps/rejected": -276.8045959472656, "loss": 0.3109, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.409102201461792, "rewards/margins": 1.59943687915802, "rewards/rejected": 0.8096655011177063, "step": 56230 }, { "epoch": 2.611077580203352, "grad_norm": 0.07860340923070908, "learning_rate": 6.502932664778618e-08, "logits/chosen": -19.333133697509766, "logits/rejected": -18.709781646728516, "logps/chosen": -388.95892333984375, "logps/rejected": -317.8708801269531, "loss": 0.8064, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.046636581420898, "rewards/margins": 1.6508671045303345, "rewards/rejected": 2.3957693576812744, "step": 56240 }, { "epoch": 2.611541854310785, "grad_norm": 6.069218158721924, "learning_rate": 6.495194762988068e-08, "logits/chosen": -20.28506851196289, "logits/rejected": -18.86552619934082, "logps/chosen": -465.85736083984375, "logps/rejected": -351.2755126953125, "loss": 0.3788, "rewards/accuracies": 0.800000011920929, 
"rewards/chosen": 3.718623638153076, "rewards/margins": 1.3906633853912354, "rewards/rejected": 2.327960252761841, "step": 56250 }, { "epoch": 2.6120061284182183, "grad_norm": 92.05447387695312, "learning_rate": 6.487456861197517e-08, "logits/chosen": -21.133752822875977, "logits/rejected": -19.192699432373047, "logps/chosen": -433.85125732421875, "logps/rejected": -317.5811767578125, "loss": 0.6592, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.144664764404297, "rewards/margins": 2.457496166229248, "rewards/rejected": 2.687168598175049, "step": 56260 }, { "epoch": 2.612470402525651, "grad_norm": 45.189815521240234, "learning_rate": 6.479718959406967e-08, "logits/chosen": -19.155559539794922, "logits/rejected": -18.58754539489746, "logps/chosen": -435.02783203125, "logps/rejected": -405.7008361816406, "loss": 0.7646, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.5597147941589355, "rewards/margins": 1.1551549434661865, "rewards/rejected": 2.404560089111328, "step": 56270 }, { "epoch": 2.6129346766330843, "grad_norm": 79.14313507080078, "learning_rate": 6.471981057616416e-08, "logits/chosen": -18.656734466552734, "logits/rejected": -17.62460708618164, "logps/chosen": -386.4151916503906, "logps/rejected": -363.31195068359375, "loss": 0.3605, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.7085018157958984, "rewards/margins": 1.4040660858154297, "rewards/rejected": 2.3044357299804688, "step": 56280 }, { "epoch": 2.613398950740517, "grad_norm": 248.50872802734375, "learning_rate": 6.464243155825866e-08, "logits/chosen": -18.783702850341797, "logits/rejected": -17.89082145690918, "logps/chosen": -356.47784423828125, "logps/rejected": -304.6725769042969, "loss": 0.9574, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.113429546356201, "rewards/margins": 0.8714041709899902, "rewards/rejected": 2.242025136947632, "step": 56290 }, { "epoch": 2.6138632248479503, "grad_norm": 16.667118072509766, "learning_rate": 
6.456505254035315e-08, "logits/chosen": -19.498504638671875, "logits/rejected": -17.987855911254883, "logps/chosen": -505.3091735839844, "logps/rejected": -349.50335693359375, "loss": 0.4249, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.502497673034668, "rewards/margins": 2.15743350982666, "rewards/rejected": 2.345064163208008, "step": 56300 }, { "epoch": 2.614327498955383, "grad_norm": 145.50717163085938, "learning_rate": 6.448767352244765e-08, "logits/chosen": -18.96116065979004, "logits/rejected": -18.003286361694336, "logps/chosen": -378.4290771484375, "logps/rejected": -271.68902587890625, "loss": 0.6225, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0586960315704346, "rewards/margins": 1.4401445388793945, "rewards/rejected": 1.618551254272461, "step": 56310 }, { "epoch": 2.6147917730628163, "grad_norm": 17.23040008544922, "learning_rate": 6.441029450454215e-08, "logits/chosen": -18.55422592163086, "logits/rejected": -17.985347747802734, "logps/chosen": -484.44549560546875, "logps/rejected": -352.1076354980469, "loss": 0.3102, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.534884452819824, "rewards/margins": 2.150780439376831, "rewards/rejected": 2.3841042518615723, "step": 56320 }, { "epoch": 2.6152560471702495, "grad_norm": 137.17562866210938, "learning_rate": 6.433291548663663e-08, "logits/chosen": -18.916343688964844, "logits/rejected": -18.44920539855957, "logps/chosen": -321.3363342285156, "logps/rejected": -325.96136474609375, "loss": 1.3747, "rewards/accuracies": 0.5, "rewards/chosen": 1.3190491199493408, "rewards/margins": -0.1343051642179489, "rewards/rejected": 1.4533541202545166, "step": 56330 }, { "epoch": 2.6157203212776823, "grad_norm": 11.223443031311035, "learning_rate": 6.425553646873113e-08, "logits/chosen": -18.563491821289062, "logits/rejected": -17.963336944580078, "logps/chosen": -395.98284912109375, "logps/rejected": -313.0026550292969, "loss": 0.9639, "rewards/accuracies": 
0.699999988079071, "rewards/chosen": 2.6828932762145996, "rewards/margins": 0.9614235758781433, "rewards/rejected": 1.721469521522522, "step": 56340 }, { "epoch": 2.6161845953851155, "grad_norm": 159.63327026367188, "learning_rate": 6.417815745082563e-08, "logits/chosen": -19.054546356201172, "logits/rejected": -18.520238876342773, "logps/chosen": -429.0038146972656, "logps/rejected": -323.75677490234375, "loss": 0.7854, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.9551024436950684, "rewards/margins": 0.8913348913192749, "rewards/rejected": 2.063767671585083, "step": 56350 }, { "epoch": 2.6166488694925483, "grad_norm": 207.30810546875, "learning_rate": 6.410077843292013e-08, "logits/chosen": -18.88754653930664, "logits/rejected": -18.657133102416992, "logps/chosen": -419.0065002441406, "logps/rejected": -337.9260559082031, "loss": 0.9512, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.216207504272461, "rewards/margins": 0.6548317670822144, "rewards/rejected": 2.561375856399536, "step": 56360 }, { "epoch": 2.6171131435999815, "grad_norm": 82.07455444335938, "learning_rate": 6.402339941501463e-08, "logits/chosen": -20.354618072509766, "logits/rejected": -19.1260929107666, "logps/chosen": -418.89654541015625, "logps/rejected": -330.7002868652344, "loss": 0.3734, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.5186896324157715, "rewards/margins": 2.139431953430176, "rewards/rejected": 2.379257917404175, "step": 56370 }, { "epoch": 2.6175774177074143, "grad_norm": 7.973239421844482, "learning_rate": 6.394602039710911e-08, "logits/chosen": -18.884254455566406, "logits/rejected": -18.064332962036133, "logps/chosen": -337.0741882324219, "logps/rejected": -237.7130584716797, "loss": 0.6575, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.417792558670044, "rewards/margins": 1.3949352502822876, "rewards/rejected": 1.0228573083877563, "step": 56380 }, { "epoch": 2.6180416918148475, "grad_norm": 55.69997024536133, 
"learning_rate": 6.386864137920361e-08, "logits/chosen": -20.074159622192383, "logits/rejected": -20.05202865600586, "logps/chosen": -412.7196350097656, "logps/rejected": -333.36602783203125, "loss": 0.3795, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.028395652770996, "rewards/margins": 1.560177206993103, "rewards/rejected": 2.4682183265686035, "step": 56390 }, { "epoch": 2.6185059659222807, "grad_norm": 4.7864813804626465, "learning_rate": 6.37912623612981e-08, "logits/chosen": -18.273330688476562, "logits/rejected": -19.113569259643555, "logps/chosen": -311.2265625, "logps/rejected": -359.9248352050781, "loss": 1.0948, "rewards/accuracies": 0.5, "rewards/chosen": 1.9748637676239014, "rewards/margins": 0.1737765520811081, "rewards/rejected": 1.8010871410369873, "step": 56400 }, { "epoch": 2.6189702400297135, "grad_norm": 21.55830192565918, "learning_rate": 6.37138833433926e-08, "logits/chosen": -20.329557418823242, "logits/rejected": -19.0897216796875, "logps/chosen": -486.142333984375, "logps/rejected": -303.72882080078125, "loss": 0.3753, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.609987258911133, "rewards/margins": 2.4222264289855957, "rewards/rejected": 2.1877615451812744, "step": 56410 }, { "epoch": 2.6194345141371467, "grad_norm": 0.4921260178089142, "learning_rate": 6.36365043254871e-08, "logits/chosen": -19.94338607788086, "logits/rejected": -18.94589614868164, "logps/chosen": -490.22247314453125, "logps/rejected": -429.0726013183594, "loss": 0.7215, "rewards/accuracies": 0.5, "rewards/chosen": 4.261475086212158, "rewards/margins": 1.1147379875183105, "rewards/rejected": 3.1467366218566895, "step": 56420 }, { "epoch": 2.6198987882445794, "grad_norm": 26.19515037536621, "learning_rate": 6.355912530758159e-08, "logits/chosen": -19.299898147583008, "logits/rejected": -18.445402145385742, "logps/chosen": -463.2244567871094, "logps/rejected": -401.39312744140625, "loss": 0.6178, "rewards/accuracies": 0.699999988079071, 
"rewards/chosen": 4.666152000427246, "rewards/margins": 1.1750956773757935, "rewards/rejected": 3.491055727005005, "step": 56430 }, { "epoch": 2.6203630623520127, "grad_norm": 0.875242292881012, "learning_rate": 6.348174628967609e-08, "logits/chosen": -19.35240364074707, "logits/rejected": -18.069259643554688, "logps/chosen": -414.9662170410156, "logps/rejected": -270.30438232421875, "loss": 0.6365, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.3470757007598877, "rewards/margins": 1.6596676111221313, "rewards/rejected": 1.6874080896377563, "step": 56440 }, { "epoch": 2.6208273364594454, "grad_norm": 58.02153778076172, "learning_rate": 6.340436727177059e-08, "logits/chosen": -18.196258544921875, "logits/rejected": -18.123653411865234, "logps/chosen": -416.83111572265625, "logps/rejected": -389.45770263671875, "loss": 1.4324, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.843655824661255, "rewards/margins": -0.3921739161014557, "rewards/rejected": 3.2358298301696777, "step": 56450 }, { "epoch": 2.6212916105668786, "grad_norm": 28.41605567932129, "learning_rate": 6.332698825386508e-08, "logits/chosen": -19.370464324951172, "logits/rejected": -18.356903076171875, "logps/chosen": -371.47430419921875, "logps/rejected": -327.40496826171875, "loss": 0.4023, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.7695424556732178, "rewards/margins": 1.4567493200302124, "rewards/rejected": 1.3127930164337158, "step": 56460 }, { "epoch": 2.621755884674312, "grad_norm": 11.311243057250977, "learning_rate": 6.324960923595958e-08, "logits/chosen": -18.62380599975586, "logits/rejected": -17.549150466918945, "logps/chosen": -456.4546813964844, "logps/rejected": -321.86236572265625, "loss": 0.6052, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.26840353012085, "rewards/margins": 1.9779205322265625, "rewards/rejected": 2.290482997894287, "step": 56470 }, { "epoch": 2.6222201587817446, "grad_norm": 0.8928366899490356, 
"learning_rate": 6.317223021805407e-08, "logits/chosen": -19.44841194152832, "logits/rejected": -17.723041534423828, "logps/chosen": -312.0704040527344, "logps/rejected": -206.18716430664062, "loss": 0.5744, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.047621011734009, "rewards/margins": 2.1508986949920654, "rewards/rejected": 0.8967218399047852, "step": 56480 }, { "epoch": 2.622684432889178, "grad_norm": 338.5568542480469, "learning_rate": 6.309485120014856e-08, "logits/chosen": -19.167329788208008, "logits/rejected": -18.325218200683594, "logps/chosen": -357.8228759765625, "logps/rejected": -325.6902160644531, "loss": 0.6846, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.2023510932922363, "rewards/margins": 1.2351306676864624, "rewards/rejected": 0.9672204256057739, "step": 56490 }, { "epoch": 2.623148706996611, "grad_norm": 165.3843231201172, "learning_rate": 6.301747218224306e-08, "logits/chosen": -19.136333465576172, "logits/rejected": -19.104402542114258, "logps/chosen": -461.6461486816406, "logps/rejected": -367.90655517578125, "loss": 0.7604, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.751842021942139, "rewards/margins": 1.5423870086669922, "rewards/rejected": 3.2094550132751465, "step": 56500 }, { "epoch": 2.623612981104044, "grad_norm": 169.0718231201172, "learning_rate": 6.294009316433756e-08, "logits/chosen": -19.375823974609375, "logits/rejected": -18.985782623291016, "logps/chosen": -313.0251770019531, "logps/rejected": -298.8614807128906, "loss": 0.4959, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0590603351593018, "rewards/margins": 0.8216264843940735, "rewards/rejected": 2.237433671951294, "step": 56510 }, { "epoch": 2.6240772552114766, "grad_norm": 45.505699157714844, "learning_rate": 6.286271414643204e-08, "logits/chosen": -18.378067016601562, "logits/rejected": -18.280017852783203, "logps/chosen": -324.9523620605469, "logps/rejected": -304.7310485839844, "loss": 0.5962, 
"rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.8461639881134033, "rewards/margins": 0.5386017560958862, "rewards/rejected": 2.3075621128082275, "step": 56520 }, { "epoch": 2.62454152931891, "grad_norm": 12.072193145751953, "learning_rate": 6.278533512852654e-08, "logits/chosen": -17.899452209472656, "logits/rejected": -17.7840518951416, "logps/chosen": -344.9826965332031, "logps/rejected": -358.6157531738281, "loss": 0.9462, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.9118435382843018, "rewards/margins": 0.22872856259346008, "rewards/rejected": 2.683114767074585, "step": 56530 }, { "epoch": 2.625005803426343, "grad_norm": 210.13417053222656, "learning_rate": 6.270795611062104e-08, "logits/chosen": -18.133411407470703, "logits/rejected": -17.798686981201172, "logps/chosen": -355.0377197265625, "logps/rejected": -339.7251892089844, "loss": 1.1489, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.218925952911377, "rewards/margins": 0.5958935022354126, "rewards/rejected": 2.623032331466675, "step": 56540 }, { "epoch": 2.625470077533776, "grad_norm": 21.180957794189453, "learning_rate": 6.263057709271554e-08, "logits/chosen": -18.23712158203125, "logits/rejected": -18.428821563720703, "logps/chosen": -375.8143310546875, "logps/rejected": -339.69073486328125, "loss": 1.5917, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.565176486968994, "rewards/margins": 0.9659919738769531, "rewards/rejected": 2.599184513092041, "step": 56550 }, { "epoch": 2.625934351641209, "grad_norm": 221.56399536132812, "learning_rate": 6.255319807481004e-08, "logits/chosen": -18.915122985839844, "logits/rejected": -18.39356803894043, "logps/chosen": -347.26824951171875, "logps/rejected": -273.6830749511719, "loss": 0.9003, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.8736464977264404, "rewards/margins": 0.6252268552780151, "rewards/rejected": 3.2484195232391357, "step": 56560 }, { "epoch": 2.6263986257486422, 
"grad_norm": 55.831459045410156, "learning_rate": 6.247581905690454e-08, "logits/chosen": -18.675992965698242, "logits/rejected": -18.15937614440918, "logps/chosen": -397.8287353515625, "logps/rejected": -335.1580810546875, "loss": 0.5455, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.7304179668426514, "rewards/margins": 1.72051203250885, "rewards/rejected": 2.00990629196167, "step": 56570 }, { "epoch": 2.626862899856075, "grad_norm": 110.6386489868164, "learning_rate": 6.239844003899902e-08, "logits/chosen": -20.071046829223633, "logits/rejected": -18.705951690673828, "logps/chosen": -482.8268127441406, "logps/rejected": -371.4866638183594, "loss": 0.609, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.455503940582275, "rewards/margins": 1.6316049098968506, "rewards/rejected": 2.823899030685425, "step": 56580 }, { "epoch": 2.627327173963508, "grad_norm": 0.14560087025165558, "learning_rate": 6.232106102109352e-08, "logits/chosen": -19.549604415893555, "logits/rejected": -18.992692947387695, "logps/chosen": -451.94195556640625, "logps/rejected": -415.043701171875, "loss": 0.5488, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.749551296234131, "rewards/margins": 1.8557870388031006, "rewards/rejected": 2.893764019012451, "step": 56590 }, { "epoch": 2.627791448070941, "grad_norm": 68.09764862060547, "learning_rate": 6.224368200318802e-08, "logits/chosen": -19.987173080444336, "logits/rejected": -19.144132614135742, "logps/chosen": -412.3108825683594, "logps/rejected": -367.27728271484375, "loss": 0.319, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.6447315216064453, "rewards/margins": 1.9744842052459717, "rewards/rejected": 1.6702474355697632, "step": 56600 }, { "epoch": 2.628255722178374, "grad_norm": 181.24139404296875, "learning_rate": 6.21663029852825e-08, "logits/chosen": -18.906455993652344, "logits/rejected": -17.92512321472168, "logps/chosen": -416.931884765625, "logps/rejected": -295.0736083984375, 
"loss": 0.5354, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5754153728485107, "rewards/margins": 2.0854179859161377, "rewards/rejected": 1.489997148513794, "step": 56610 }, { "epoch": 2.628719996285807, "grad_norm": 61.97702407836914, "learning_rate": 6.208892396737701e-08, "logits/chosen": -19.665428161621094, "logits/rejected": -18.314172744750977, "logps/chosen": -402.7064514160156, "logps/rejected": -291.2527770996094, "loss": 0.4886, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.044175624847412, "rewards/margins": 0.8358144760131836, "rewards/rejected": 2.2083609104156494, "step": 56620 }, { "epoch": 2.62918427039324, "grad_norm": 25.98499298095703, "learning_rate": 6.20115449494715e-08, "logits/chosen": -18.594961166381836, "logits/rejected": -18.550262451171875, "logps/chosen": -354.2868957519531, "logps/rejected": -322.0398254394531, "loss": 0.9443, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.575733184814453, "rewards/margins": 0.876611053943634, "rewards/rejected": 2.6991219520568848, "step": 56630 }, { "epoch": 2.6296485445006734, "grad_norm": 54.10962677001953, "learning_rate": 6.1934165931566e-08, "logits/chosen": -19.73720932006836, "logits/rejected": -18.16775894165039, "logps/chosen": -287.0361633300781, "logps/rejected": -220.2946014404297, "loss": 0.4188, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.169178009033203, "rewards/margins": 1.8727853298187256, "rewards/rejected": 1.296392798423767, "step": 56640 }, { "epoch": 2.630112818608106, "grad_norm": 215.1270751953125, "learning_rate": 6.185678691366049e-08, "logits/chosen": -19.372936248779297, "logits/rejected": -19.00830078125, "logps/chosen": -426.1182556152344, "logps/rejected": -307.93096923828125, "loss": 0.7541, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.56764030456543, "rewards/margins": 1.9965002536773682, "rewards/rejected": 2.5711400508880615, "step": 56650 }, { "epoch": 2.6305770927155394, "grad_norm": 
132.0328369140625, "learning_rate": 6.177940789575498e-08, "logits/chosen": -18.993932723999023, "logits/rejected": -17.631389617919922, "logps/chosen": -406.4754943847656, "logps/rejected": -255.4242706298828, "loss": 0.5648, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.8477959632873535, "rewards/margins": 2.3916468620300293, "rewards/rejected": 1.4561493396759033, "step": 56660 }, { "epoch": 2.631041366822972, "grad_norm": 0.8280134797096252, "learning_rate": 6.170202887784948e-08, "logits/chosen": -18.11957550048828, "logits/rejected": -18.72763442993164, "logps/chosen": -288.06146240234375, "logps/rejected": -299.09808349609375, "loss": 0.9886, "rewards/accuracies": 0.5, "rewards/chosen": 2.783984661102295, "rewards/margins": 0.4671594500541687, "rewards/rejected": 2.3168251514434814, "step": 56670 }, { "epoch": 2.6315056409304054, "grad_norm": 59.518192291259766, "learning_rate": 6.162464985994397e-08, "logits/chosen": -19.4533748626709, "logits/rejected": -18.88360595703125, "logps/chosen": -339.1608581542969, "logps/rejected": -309.80743408203125, "loss": 0.952, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.4343271255493164, "rewards/margins": 0.4287756085395813, "rewards/rejected": 2.00555157661438, "step": 56680 }, { "epoch": 2.631969915037838, "grad_norm": 42.84462356567383, "learning_rate": 6.154727084203847e-08, "logits/chosen": -18.87287139892578, "logits/rejected": -18.90580177307129, "logps/chosen": -385.39581298828125, "logps/rejected": -433.43695068359375, "loss": 1.8259, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.1966586112976074, "rewards/margins": -0.8077646493911743, "rewards/rejected": 4.00442361831665, "step": 56690 }, { "epoch": 2.6324341891452714, "grad_norm": 52.889705657958984, "learning_rate": 6.146989182413297e-08, "logits/chosen": -18.987857818603516, "logits/rejected": -19.06668472290039, "logps/chosen": -363.262939453125, "logps/rejected": -363.1925964355469, "loss": 0.7366, 
"rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2537102699279785, "rewards/margins": 0.5991044044494629, "rewards/rejected": 2.6546061038970947, "step": 56700 }, { "epoch": 2.6328984632527046, "grad_norm": 17.299156188964844, "learning_rate": 6.139251280622746e-08, "logits/chosen": -18.761764526367188, "logits/rejected": -18.69491195678711, "logps/chosen": -338.4232482910156, "logps/rejected": -437.2632751464844, "loss": 1.3358, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.432734727859497, "rewards/margins": -0.07061998546123505, "rewards/rejected": 2.503354549407959, "step": 56710 }, { "epoch": 2.6333627373601374, "grad_norm": 76.1083984375, "learning_rate": 6.131513378832195e-08, "logits/chosen": -18.969890594482422, "logits/rejected": -18.63107681274414, "logps/chosen": -353.813720703125, "logps/rejected": -322.0372314453125, "loss": 0.5862, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.029540538787842, "rewards/margins": 1.0933864116668701, "rewards/rejected": 1.9361541271209717, "step": 56720 }, { "epoch": 2.6338270114675706, "grad_norm": 12.561323165893555, "learning_rate": 6.123775477041645e-08, "logits/chosen": -19.371341705322266, "logits/rejected": -18.746387481689453, "logps/chosen": -381.2680358886719, "logps/rejected": -345.7767639160156, "loss": 0.5923, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.786646604537964, "rewards/margins": 1.5901172161102295, "rewards/rejected": 2.1965291500091553, "step": 56730 }, { "epoch": 2.6342912855750034, "grad_norm": 0.010216863825917244, "learning_rate": 6.116037575251095e-08, "logits/chosen": -19.204273223876953, "logits/rejected": -18.365934371948242, "logps/chosen": -507.2198181152344, "logps/rejected": -413.9891662597656, "loss": 0.9249, "rewards/accuracies": 0.5, "rewards/chosen": 5.011647701263428, "rewards/margins": 0.8091505169868469, "rewards/rejected": 4.202497482299805, "step": 56740 }, { "epoch": 2.6347555596824366, "grad_norm": 
62.32191848754883, "learning_rate": 6.108299673460545e-08, "logits/chosen": -19.49946403503418, "logits/rejected": -19.301265716552734, "logps/chosen": -423.2164611816406, "logps/rejected": -444.0240173339844, "loss": 0.4765, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.054145336151123, "rewards/margins": 1.0558748245239258, "rewards/rejected": 2.9982705116271973, "step": 56750 }, { "epoch": 2.6352198337898693, "grad_norm": 112.24400329589844, "learning_rate": 6.100561771669993e-08, "logits/chosen": -19.182743072509766, "logits/rejected": -19.75379753112793, "logps/chosen": -341.5419921875, "logps/rejected": -409.8631286621094, "loss": 1.0852, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 3.232992649078369, "rewards/margins": -0.3739016354084015, "rewards/rejected": 3.6068942546844482, "step": 56760 }, { "epoch": 2.6356841078973026, "grad_norm": 38.52796936035156, "learning_rate": 6.092823869879443e-08, "logits/chosen": -18.42950439453125, "logits/rejected": -18.508317947387695, "logps/chosen": -319.3627014160156, "logps/rejected": -292.6137390136719, "loss": 1.0666, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.1865744590759277, "rewards/margins": 0.22772717475891113, "rewards/rejected": 0.9588473439216614, "step": 56770 }, { "epoch": 2.636148382004736, "grad_norm": 7.984971523284912, "learning_rate": 6.085085968088893e-08, "logits/chosen": -19.87432861328125, "logits/rejected": -18.662160873413086, "logps/chosen": -393.1149597167969, "logps/rejected": -298.0509338378906, "loss": 0.4915, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.70906138420105, "rewards/margins": 2.0510611534118652, "rewards/rejected": 1.6580002307891846, "step": 56780 }, { "epoch": 2.6366126561121686, "grad_norm": 3.043447256088257, "learning_rate": 6.077348066298343e-08, "logits/chosen": -18.79305076599121, "logits/rejected": -18.200992584228516, "logps/chosen": -307.28436279296875, "logps/rejected": -246.0379638671875, "loss": 
1.0142, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.7994625568389893, "rewards/margins": 1.573054313659668, "rewards/rejected": 1.2264083623886108, "step": 56790 }, { "epoch": 2.6370769302196018, "grad_norm": 30.156095504760742, "learning_rate": 6.069610164507792e-08, "logits/chosen": -19.874744415283203, "logits/rejected": -19.70425033569336, "logps/chosen": -465.79327392578125, "logps/rejected": -400.4447326660156, "loss": 0.8156, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.810434341430664, "rewards/margins": 1.255092978477478, "rewards/rejected": 2.5553414821624756, "step": 56800 }, { "epoch": 2.6375412043270345, "grad_norm": 35.38092041015625, "learning_rate": 6.061872262717241e-08, "logits/chosen": -19.402862548828125, "logits/rejected": -18.341066360473633, "logps/chosen": -426.54071044921875, "logps/rejected": -330.42510986328125, "loss": 0.6127, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.509848594665527, "rewards/margins": 1.5905604362487793, "rewards/rejected": 2.919287919998169, "step": 56810 }, { "epoch": 2.6380054784344678, "grad_norm": 131.31832885742188, "learning_rate": 6.054134360926691e-08, "logits/chosen": -19.5737361907959, "logits/rejected": -18.535337448120117, "logps/chosen": -519.5802001953125, "logps/rejected": -434.92462158203125, "loss": 0.4831, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.690074443817139, "rewards/margins": 2.322469711303711, "rewards/rejected": 3.3676047325134277, "step": 56820 }, { "epoch": 2.6384697525419005, "grad_norm": 11.708304405212402, "learning_rate": 6.04639645913614e-08, "logits/chosen": -18.96578598022461, "logits/rejected": -18.937191009521484, "logps/chosen": -327.5643615722656, "logps/rejected": -327.79522705078125, "loss": 1.5173, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9651150703430176, "rewards/margins": -0.02149066887795925, "rewards/rejected": 2.9866058826446533, "step": 56830 }, { "epoch": 2.6389340266493337, 
"grad_norm": 54.1292610168457, "learning_rate": 6.03865855734559e-08, "logits/chosen": -20.651687622070312, "logits/rejected": -19.49172019958496, "logps/chosen": -454.57598876953125, "logps/rejected": -359.987060546875, "loss": 0.9124, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.719833850860596, "rewards/margins": 1.3918803930282593, "rewards/rejected": 3.327953815460205, "step": 56840 }, { "epoch": 2.639398300756767, "grad_norm": 240.40736389160156, "learning_rate": 6.030920655555039e-08, "logits/chosen": -19.780107498168945, "logits/rejected": -19.770374298095703, "logps/chosen": -482.52593994140625, "logps/rejected": -422.76727294921875, "loss": 0.4716, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6597836017608643, "rewards/margins": 1.1858307123184204, "rewards/rejected": 2.4739527702331543, "step": 56850 }, { "epoch": 2.6398625748641997, "grad_norm": 0.16480150818824768, "learning_rate": 6.023182753764489e-08, "logits/chosen": -17.861684799194336, "logits/rejected": -17.292781829833984, "logps/chosen": -435.0741271972656, "logps/rejected": -352.56756591796875, "loss": 1.0439, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7796459197998047, "rewards/margins": 1.6437766551971436, "rewards/rejected": 2.1358695030212402, "step": 56860 }, { "epoch": 2.640326848971633, "grad_norm": 33.98348617553711, "learning_rate": 6.015444851973938e-08, "logits/chosen": -19.770601272583008, "logits/rejected": -18.71255874633789, "logps/chosen": -429.6727600097656, "logps/rejected": -419.5691833496094, "loss": 0.9322, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.7558586597442627, "rewards/margins": 0.62565678358078, "rewards/rejected": 2.130201816558838, "step": 56870 }, { "epoch": 2.6407911230790657, "grad_norm": 17.436317443847656, "learning_rate": 6.007706950183388e-08, "logits/chosen": -19.11207389831543, "logits/rejected": -17.798751831054688, "logps/chosen": -482.25732421875, "logps/rejected": 
-367.3709411621094, "loss": 0.6398, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6872150897979736, "rewards/margins": 1.1186039447784424, "rewards/rejected": 2.5686111450195312, "step": 56880 }, { "epoch": 2.641255397186499, "grad_norm": 11.500931739807129, "learning_rate": 5.999969048392838e-08, "logits/chosen": -18.957813262939453, "logits/rejected": -17.353641510009766, "logps/chosen": -422.48602294921875, "logps/rejected": -226.5150604248047, "loss": 0.7268, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.050559997558594, "rewards/margins": 2.4404008388519287, "rewards/rejected": 1.610158920288086, "step": 56890 }, { "epoch": 2.6417196712939317, "grad_norm": 37.39013671875, "learning_rate": 5.992231146602287e-08, "logits/chosen": -18.50395393371582, "logits/rejected": -18.391557693481445, "logps/chosen": -372.6817321777344, "logps/rejected": -288.2365417480469, "loss": 0.9238, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.175785779953003, "rewards/margins": 0.9479510188102722, "rewards/rejected": 1.2278345823287964, "step": 56900 }, { "epoch": 2.642183945401365, "grad_norm": 82.5304183959961, "learning_rate": 5.984493244811736e-08, "logits/chosen": -18.701457977294922, "logits/rejected": -19.072917938232422, "logps/chosen": -344.53656005859375, "logps/rejected": -340.0497131347656, "loss": 0.9877, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.4018545150756836, "rewards/margins": 0.09781596809625626, "rewards/rejected": 2.3040387630462646, "step": 56910 }, { "epoch": 2.642648219508798, "grad_norm": 17.21022605895996, "learning_rate": 5.976755343021186e-08, "logits/chosen": -19.43794822692871, "logits/rejected": -17.820804595947266, "logps/chosen": -404.659912109375, "logps/rejected": -295.0928649902344, "loss": 0.7295, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.717388153076172, "rewards/margins": 1.6414169073104858, "rewards/rejected": 2.0759711265563965, "step": 56920 }, { "epoch": 
2.643112493616231, "grad_norm": 77.2278823852539, "learning_rate": 5.969017441230636e-08, "logits/chosen": -18.955656051635742, "logits/rejected": -18.705202102661133, "logps/chosen": -470.1988830566406, "logps/rejected": -428.77618408203125, "loss": 1.4262, "rewards/accuracies": 0.5, "rewards/chosen": 4.510411262512207, "rewards/margins": 0.28686752915382385, "rewards/rejected": 4.223543167114258, "step": 56930 }, { "epoch": 2.643576767723664, "grad_norm": 152.50949096679688, "learning_rate": 5.961279539440086e-08, "logits/chosen": -18.770732879638672, "logits/rejected": -18.135482788085938, "logps/chosen": -361.0079040527344, "logps/rejected": -313.5481872558594, "loss": 0.5638, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.983302354812622, "rewards/margins": 1.092139482498169, "rewards/rejected": 1.8911631107330322, "step": 56940 }, { "epoch": 2.6440410418310973, "grad_norm": 75.20771026611328, "learning_rate": 5.953541637649535e-08, "logits/chosen": -18.48067855834961, "logits/rejected": -17.514293670654297, "logps/chosen": -417.40087890625, "logps/rejected": -298.5630798339844, "loss": 0.5792, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5875370502471924, "rewards/margins": 1.7201077938079834, "rewards/rejected": 1.8674290180206299, "step": 56950 }, { "epoch": 2.64450531593853, "grad_norm": 211.4349822998047, "learning_rate": 5.945803735858984e-08, "logits/chosen": -18.852075576782227, "logits/rejected": -18.27286148071289, "logps/chosen": -476.7493591308594, "logps/rejected": -376.57135009765625, "loss": 0.8966, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.7192606925964355, "rewards/margins": 0.9797548055648804, "rewards/rejected": 2.7395055294036865, "step": 56960 }, { "epoch": 2.644969590045963, "grad_norm": 1.9173401594161987, "learning_rate": 5.938065834068433e-08, "logits/chosen": -19.913410186767578, "logits/rejected": -19.51560401916504, "logps/chosen": -395.08746337890625, "logps/rejected": 
-338.47698974609375, "loss": 0.7305, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.148434162139893, "rewards/margins": 1.4988361597061157, "rewards/rejected": 2.6495978832244873, "step": 56970 }, { "epoch": 2.645433864153396, "grad_norm": 24.647178649902344, "learning_rate": 5.930327932277883e-08, "logits/chosen": -18.43643569946289, "logits/rejected": -18.419437408447266, "logps/chosen": -511.5057678222656, "logps/rejected": -450.8983459472656, "loss": 0.8359, "rewards/accuracies": 0.5, "rewards/chosen": 4.847569465637207, "rewards/margins": 0.8953048586845398, "rewards/rejected": 3.9522652626037598, "step": 56980 }, { "epoch": 2.6458981382608293, "grad_norm": 201.97238159179688, "learning_rate": 5.922590030487333e-08, "logits/chosen": -18.513866424560547, "logits/rejected": -17.67160415649414, "logps/chosen": -411.46722412109375, "logps/rejected": -274.4467468261719, "loss": 0.3943, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.358532667160034, "rewards/margins": 2.037677526473999, "rewards/rejected": 1.3208553791046143, "step": 56990 }, { "epoch": 2.646362412368262, "grad_norm": 5.3349928855896, "learning_rate": 5.9148521286967826e-08, "logits/chosen": -19.344676971435547, "logits/rejected": -18.043405532836914, "logps/chosen": -427.19122314453125, "logps/rejected": -297.9854736328125, "loss": 0.2445, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.7239089012145996, "rewards/margins": 2.516052722930908, "rewards/rejected": 1.2078568935394287, "step": 57000 }, { "epoch": 2.6468266864756953, "grad_norm": 153.1414031982422, "learning_rate": 5.907114226906232e-08, "logits/chosen": -19.26735496520996, "logits/rejected": -18.52676010131836, "logps/chosen": -376.68572998046875, "logps/rejected": -336.7314758300781, "loss": 1.1057, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.0353376865386963, "rewards/margins": 0.8389251828193665, "rewards/rejected": 2.1964125633239746, "step": 57010 }, { "epoch": 
2.6472909605831285, "grad_norm": 182.0892791748047, "learning_rate": 5.8993763251156816e-08, "logits/chosen": -19.461124420166016, "logits/rejected": -18.693506240844727, "logps/chosen": -416.45758056640625, "logps/rejected": -294.06927490234375, "loss": 0.8023, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0592758655548096, "rewards/margins": 1.4543142318725586, "rewards/rejected": 1.6049613952636719, "step": 57020 }, { "epoch": 2.6477552346905613, "grad_norm": 87.11577606201172, "learning_rate": 5.891638423325131e-08, "logits/chosen": -20.045711517333984, "logits/rejected": -19.485530853271484, "logps/chosen": -398.1213684082031, "logps/rejected": -321.5506896972656, "loss": 0.5942, "rewards/accuracies": 0.5, "rewards/chosen": 3.5714995861053467, "rewards/margins": 0.8778742551803589, "rewards/rejected": 2.693624973297119, "step": 57030 }, { "epoch": 2.648219508797994, "grad_norm": 5.696763038635254, "learning_rate": 5.88390052153458e-08, "logits/chosen": -18.811861038208008, "logits/rejected": -18.389545440673828, "logps/chosen": -362.1131896972656, "logps/rejected": -278.0303649902344, "loss": 0.8536, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.0893423557281494, "rewards/margins": 1.529032826423645, "rewards/rejected": 1.5603092908859253, "step": 57040 }, { "epoch": 2.6486837829054273, "grad_norm": 10.868837356567383, "learning_rate": 5.87616261974403e-08, "logits/chosen": -19.70164680480957, "logits/rejected": -19.700572967529297, "logps/chosen": -296.81011962890625, "logps/rejected": -309.65509033203125, "loss": 0.7695, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.803313732147217, "rewards/margins": 0.4915722906589508, "rewards/rejected": 2.311741590499878, "step": 57050 }, { "epoch": 2.6491480570128605, "grad_norm": 34.93886947631836, "learning_rate": 5.8684247179534795e-08, "logits/chosen": -19.983623504638672, "logits/rejected": -18.539920806884766, "logps/chosen": -422.29150390625, "logps/rejected": 
-298.1265869140625, "loss": 0.3135, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.476235866546631, "rewards/margins": 2.108564853668213, "rewards/rejected": 1.367671012878418, "step": 57060 }, { "epoch": 2.6496123311202933, "grad_norm": 0.4400447905063629, "learning_rate": 5.860686816162929e-08, "logits/chosen": -19.46660804748535, "logits/rejected": -17.90288543701172, "logps/chosen": -359.8083801269531, "logps/rejected": -283.9556884765625, "loss": 0.3651, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.173182487487793, "rewards/margins": 2.568909168243408, "rewards/rejected": 1.6042734384536743, "step": 57070 }, { "epoch": 2.6500766052277265, "grad_norm": 114.53514862060547, "learning_rate": 5.8529489143723784e-08, "logits/chosen": -19.03420639038086, "logits/rejected": -17.655527114868164, "logps/chosen": -436.1078186035156, "logps/rejected": -277.01495361328125, "loss": 1.1056, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.155333995819092, "rewards/margins": 1.7736259698867798, "rewards/rejected": 1.381708025932312, "step": 57080 }, { "epoch": 2.6505408793351597, "grad_norm": 46.281028747558594, "learning_rate": 5.8452110125818276e-08, "logits/chosen": -19.48294448852539, "logits/rejected": -18.7685546875, "logps/chosen": -438.55560302734375, "logps/rejected": -332.62457275390625, "loss": 0.5172, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0501885414123535, "rewards/margins": 0.9622892141342163, "rewards/rejected": 2.0878994464874268, "step": 57090 }, { "epoch": 2.6510051534425925, "grad_norm": 22.88945198059082, "learning_rate": 5.8374731107912774e-08, "logits/chosen": -19.02748680114746, "logits/rejected": -18.667207717895508, "logps/chosen": -380.259521484375, "logps/rejected": -333.5543518066406, "loss": 0.8154, "rewards/accuracies": 0.5, "rewards/chosen": 2.7465245723724365, "rewards/margins": 0.2252543419599533, "rewards/rejected": 2.5212700366973877, "step": 57100 }, { "epoch": 
2.6514694275500257, "grad_norm": 2.3726704120635986, "learning_rate": 5.829735209000727e-08, "logits/chosen": -20.188400268554688, "logits/rejected": -18.779804229736328, "logps/chosen": -436.3585510253906, "logps/rejected": -285.1673889160156, "loss": 0.3101, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.9798667430877686, "rewards/margins": 2.1048879623413086, "rewards/rejected": 1.8749784231185913, "step": 57110 }, { "epoch": 2.6519337016574585, "grad_norm": 57.87104415893555, "learning_rate": 5.821997307210177e-08, "logits/chosen": -19.80086326599121, "logits/rejected": -18.744029998779297, "logps/chosen": -369.0555725097656, "logps/rejected": -285.2833557128906, "loss": 0.9104, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1453871726989746, "rewards/margins": 0.8431676626205444, "rewards/rejected": 2.3022193908691406, "step": 57120 }, { "epoch": 2.6523979757648917, "grad_norm": 36.92704391479492, "learning_rate": 5.814259405419626e-08, "logits/chosen": -19.828548431396484, "logits/rejected": -18.762706756591797, "logps/chosen": -433.73638916015625, "logps/rejected": -358.7469482421875, "loss": 0.7091, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.415148973464966, "rewards/margins": 1.050943374633789, "rewards/rejected": 2.3642053604125977, "step": 57130 }, { "epoch": 2.6528622498723244, "grad_norm": 24.903017044067383, "learning_rate": 5.806521503629075e-08, "logits/chosen": -18.893468856811523, "logits/rejected": -18.244186401367188, "logps/chosen": -438.22802734375, "logps/rejected": -393.88714599609375, "loss": 0.6484, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.339886665344238, "rewards/margins": 1.4235903024673462, "rewards/rejected": 2.9162964820861816, "step": 57140 }, { "epoch": 2.6533265239797577, "grad_norm": 22.823196411132812, "learning_rate": 5.798783601838525e-08, "logits/chosen": -20.324426651000977, "logits/rejected": -19.578887939453125, "logps/chosen": -353.6721496582031, 
"logps/rejected": -327.32525634765625, "loss": 0.6718, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5194363594055176, "rewards/margins": 1.2382490634918213, "rewards/rejected": 2.2811872959136963, "step": 57150 }, { "epoch": 2.653790798087191, "grad_norm": 75.78120422363281, "learning_rate": 5.791045700047974e-08, "logits/chosen": -19.62353515625, "logits/rejected": -19.88142967224121, "logps/chosen": -385.81378173828125, "logps/rejected": -344.15679931640625, "loss": 1.2482, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.22239351272583, "rewards/margins": 0.6019934415817261, "rewards/rejected": 2.6203999519348145, "step": 57160 }, { "epoch": 2.6542550721946236, "grad_norm": 141.068359375, "learning_rate": 5.783307798257425e-08, "logits/chosen": -18.133853912353516, "logits/rejected": -18.077442169189453, "logps/chosen": -353.0821533203125, "logps/rejected": -414.75714111328125, "loss": 1.7429, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.3775241374969482, "rewards/margins": 0.042243100702762604, "rewards/rejected": 2.3352808952331543, "step": 57170 }, { "epoch": 2.654719346302057, "grad_norm": 23.2262020111084, "learning_rate": 5.775569896466874e-08, "logits/chosen": -20.099628448486328, "logits/rejected": -18.816879272460938, "logps/chosen": -352.56695556640625, "logps/rejected": -276.759033203125, "loss": 0.4355, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.405855178833008, "rewards/margins": 1.4741290807724, "rewards/rejected": 1.9317258596420288, "step": 57180 }, { "epoch": 2.6551836204094896, "grad_norm": 135.48133850097656, "learning_rate": 5.7678319946763236e-08, "logits/chosen": -19.5093994140625, "logits/rejected": -18.28318977355957, "logps/chosen": -436.9365234375, "logps/rejected": -317.1233215332031, "loss": 0.5261, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.225615978240967, "rewards/margins": 1.7424676418304443, "rewards/rejected": 2.4831480979919434, "step": 57190 }, 
{ "epoch": 2.655647894516923, "grad_norm": 0.34706148505210876, "learning_rate": 5.760094092885773e-08, "logits/chosen": -19.424198150634766, "logits/rejected": -18.47479820251465, "logps/chosen": -392.6664123535156, "logps/rejected": -291.97113037109375, "loss": 0.9079, "rewards/accuracies": 0.5, "rewards/chosen": 3.6361117362976074, "rewards/margins": 0.8382906913757324, "rewards/rejected": 2.797821521759033, "step": 57200 }, { "epoch": 2.6561121686243556, "grad_norm": 23.289705276489258, "learning_rate": 5.752356191095222e-08, "logits/chosen": -18.73593521118164, "logits/rejected": -17.48166275024414, "logps/chosen": -454.10986328125, "logps/rejected": -309.22412109375, "loss": 0.4674, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.526066780090332, "rewards/margins": 2.761669158935547, "rewards/rejected": 1.7643979787826538, "step": 57210 }, { "epoch": 2.656576442731789, "grad_norm": 3.4411795139312744, "learning_rate": 5.744618289304672e-08, "logits/chosen": -19.479347229003906, "logits/rejected": -18.388477325439453, "logps/chosen": -491.11920166015625, "logps/rejected": -401.01116943359375, "loss": 0.2761, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.0556745529174805, "rewards/margins": 1.9934591054916382, "rewards/rejected": 2.0622153282165527, "step": 57220 }, { "epoch": 2.657040716839222, "grad_norm": 4.610424995422363, "learning_rate": 5.7368803875141215e-08, "logits/chosen": -19.89545440673828, "logits/rejected": -18.9755916595459, "logps/chosen": -350.35675048828125, "logps/rejected": -328.74505615234375, "loss": 0.7987, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.3888180255889893, "rewards/margins": 1.0221049785614014, "rewards/rejected": 1.3667131662368774, "step": 57230 }, { "epoch": 2.657504990946655, "grad_norm": 141.2620849609375, "learning_rate": 5.7291424857235713e-08, "logits/chosen": -18.523662567138672, "logits/rejected": -18.626052856445312, "logps/chosen": -278.08160400390625, 
"logps/rejected": -248.5911865234375, "loss": 0.6979, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.1195013523101807, "rewards/margins": 1.093069314956665, "rewards/rejected": 2.0264320373535156, "step": 57240 }, { "epoch": 2.657969265054088, "grad_norm": 68.95881652832031, "learning_rate": 5.7214045839330205e-08, "logits/chosen": -19.011091232299805, "logits/rejected": -18.474184036254883, "logps/chosen": -420.35906982421875, "logps/rejected": -356.68194580078125, "loss": 0.7513, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.782439708709717, "rewards/margins": 0.2624756693840027, "rewards/rejected": 2.5199639797210693, "step": 57250 }, { "epoch": 2.658433539161521, "grad_norm": 227.9499053955078, "learning_rate": 5.7136666821424696e-08, "logits/chosen": -18.442764282226562, "logits/rejected": -17.991374969482422, "logps/chosen": -401.55682373046875, "logps/rejected": -428.95416259765625, "loss": 1.0496, "rewards/accuracies": 0.5, "rewards/chosen": 3.3587028980255127, "rewards/margins": 0.20121264457702637, "rewards/rejected": 3.1574900150299072, "step": 57260 }, { "epoch": 2.658897813268954, "grad_norm": 28.72571563720703, "learning_rate": 5.7059287803519194e-08, "logits/chosen": -19.2713623046875, "logits/rejected": -19.125024795532227, "logps/chosen": -372.1997985839844, "logps/rejected": -365.68402099609375, "loss": 0.4123, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.808223009109497, "rewards/margins": 1.5511376857757568, "rewards/rejected": 2.2570853233337402, "step": 57270 }, { "epoch": 2.659362087376387, "grad_norm": 144.3704376220703, "learning_rate": 5.6981908785613686e-08, "logits/chosen": -18.846271514892578, "logits/rejected": -18.807222366333008, "logps/chosen": -414.68475341796875, "logps/rejected": -340.79266357421875, "loss": 1.4029, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.502805709838867, "rewards/margins": -0.1530323177576065, "rewards/rejected": 2.6558380126953125, "step": 57280 
}, { "epoch": 2.65982636148382, "grad_norm": 5.7042036056518555, "learning_rate": 5.690452976770819e-08, "logits/chosen": -19.152931213378906, "logits/rejected": -17.75476837158203, "logps/chosen": -454.16912841796875, "logps/rejected": -312.67120361328125, "loss": 0.858, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.1173386573791504, "rewards/margins": 1.6473766565322876, "rewards/rejected": 1.4699620008468628, "step": 57290 }, { "epoch": 2.6602906355912532, "grad_norm": 74.44990539550781, "learning_rate": 5.682715074980268e-08, "logits/chosen": -18.66716194152832, "logits/rejected": -18.617406845092773, "logps/chosen": -410.041748046875, "logps/rejected": -332.78448486328125, "loss": 1.254, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.59187650680542, "rewards/margins": 0.520839512348175, "rewards/rejected": 3.0710368156433105, "step": 57300 }, { "epoch": 2.660754909698686, "grad_norm": 0.6085052490234375, "learning_rate": 5.674977173189718e-08, "logits/chosen": -19.209524154663086, "logits/rejected": -18.330644607543945, "logps/chosen": -338.02923583984375, "logps/rejected": -270.45062255859375, "loss": 0.995, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.8096420764923096, "rewards/margins": 1.736424207687378, "rewards/rejected": 2.0732178688049316, "step": 57310 }, { "epoch": 2.661219183806119, "grad_norm": 152.69386291503906, "learning_rate": 5.667239271399167e-08, "logits/chosen": -19.093326568603516, "logits/rejected": -18.81021499633789, "logps/chosen": -358.3109436035156, "logps/rejected": -340.92291259765625, "loss": 0.9259, "rewards/accuracies": 0.5, "rewards/chosen": 2.5829765796661377, "rewards/margins": 0.8158715963363647, "rewards/rejected": 1.7671048641204834, "step": 57320 }, { "epoch": 2.6616834579135524, "grad_norm": 16.863277435302734, "learning_rate": 5.659501369608616e-08, "logits/chosen": -18.812910079956055, "logits/rejected": -18.82166290283203, "logps/chosen": -379.5899963378906, 
"logps/rejected": -394.22418212890625, "loss": 0.9392, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.3252310752868652, "rewards/margins": 0.6873635053634644, "rewards/rejected": 2.6378679275512695, "step": 57330 }, { "epoch": 2.662147732020985, "grad_norm": 92.77565002441406, "learning_rate": 5.651763467818066e-08, "logits/chosen": -19.694576263427734, "logits/rejected": -18.351213455200195, "logps/chosen": -502.3204040527344, "logps/rejected": -321.1113586425781, "loss": 0.4129, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.7186100482940674, "rewards/margins": 1.9201465845108032, "rewards/rejected": 1.7984638214111328, "step": 57340 }, { "epoch": 2.662612006128418, "grad_norm": 1.865460753440857, "learning_rate": 5.644025566027516e-08, "logits/chosen": -18.41805076599121, "logits/rejected": -17.115861892700195, "logps/chosen": -387.689453125, "logps/rejected": -243.7777862548828, "loss": 0.6128, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.936898708343506, "rewards/margins": 1.869081735610962, "rewards/rejected": 1.0678167343139648, "step": 57350 }, { "epoch": 2.663076280235851, "grad_norm": 6.105927467346191, "learning_rate": 5.636287664236966e-08, "logits/chosen": -19.797508239746094, "logits/rejected": -19.237464904785156, "logps/chosen": -435.32818603515625, "logps/rejected": -387.79437255859375, "loss": 0.509, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.95824670791626, "rewards/margins": 1.6957166194915771, "rewards/rejected": 3.262530565261841, "step": 57360 }, { "epoch": 2.6635405543432844, "grad_norm": 12.796475410461426, "learning_rate": 5.628549762446415e-08, "logits/chosen": -18.44363784790039, "logits/rejected": -18.737438201904297, "logps/chosen": -480.2060546875, "logps/rejected": -448.68231201171875, "loss": 0.5996, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.442432403564453, "rewards/margins": 0.9048404693603516, "rewards/rejected": 2.5375916957855225, "step": 57370 
}, { "epoch": 2.664004828450717, "grad_norm": 126.954345703125, "learning_rate": 5.620811860655864e-08, "logits/chosen": -19.03946304321289, "logits/rejected": -18.581615447998047, "logps/chosen": -431.77886962890625, "logps/rejected": -303.189208984375, "loss": 0.62, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.9889392852783203, "rewards/margins": 1.4988930225372314, "rewards/rejected": 2.490046501159668, "step": 57380 }, { "epoch": 2.6644691025581504, "grad_norm": 129.83604431152344, "learning_rate": 5.613073958865314e-08, "logits/chosen": -20.567453384399414, "logits/rejected": -18.879962921142578, "logps/chosen": -392.0470275878906, "logps/rejected": -295.60137939453125, "loss": 0.2076, "rewards/accuracies": 1.0, "rewards/chosen": 4.570093631744385, "rewards/margins": 2.715756893157959, "rewards/rejected": 1.8543370962142944, "step": 57390 }, { "epoch": 2.6649333766655836, "grad_norm": 118.5203628540039, "learning_rate": 5.605336057074763e-08, "logits/chosen": -18.618844985961914, "logits/rejected": -18.338417053222656, "logps/chosen": -307.8837585449219, "logps/rejected": -221.17660522460938, "loss": 0.7534, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.408684015274048, "rewards/margins": 1.6819908618927002, "rewards/rejected": 0.726692795753479, "step": 57400 }, { "epoch": 2.6653976507730164, "grad_norm": 122.83875274658203, "learning_rate": 5.5975981552842134e-08, "logits/chosen": -18.91292953491211, "logits/rejected": -18.74056053161621, "logps/chosen": -271.26654052734375, "logps/rejected": -317.4757385253906, "loss": 1.8515, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.999330520629883, "rewards/margins": -0.2360517978668213, "rewards/rejected": 3.235382080078125, "step": 57410 }, { "epoch": 2.665861924880449, "grad_norm": 72.7417984008789, "learning_rate": 5.5898602534936625e-08, "logits/chosen": -19.659555435180664, "logits/rejected": -19.0050048828125, "logps/chosen": -396.34478759765625, 
"logps/rejected": -287.36328125, "loss": 0.7668, "rewards/accuracies": 0.5, "rewards/chosen": 3.668912172317505, "rewards/margins": 1.43765127658844, "rewards/rejected": 2.2312612533569336, "step": 57420 }, { "epoch": 2.6663261989878824, "grad_norm": 42.75590896606445, "learning_rate": 5.582122351703112e-08, "logits/chosen": -19.832130432128906, "logits/rejected": -19.313846588134766, "logps/chosen": -344.5655212402344, "logps/rejected": -312.7525634765625, "loss": 0.539, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4428024291992188, "rewards/margins": 0.9098216891288757, "rewards/rejected": 2.5329809188842773, "step": 57430 }, { "epoch": 2.6667904730953156, "grad_norm": 104.56144714355469, "learning_rate": 5.5743844499125615e-08, "logits/chosen": -18.6049861907959, "logits/rejected": -17.48752212524414, "logps/chosen": -459.38140869140625, "logps/rejected": -315.3674621582031, "loss": 0.2279, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.7820961475372314, "rewards/margins": 3.2418265342712402, "rewards/rejected": 0.5402695536613464, "step": 57440 }, { "epoch": 2.6672547472027484, "grad_norm": 171.6323699951172, "learning_rate": 5.5666465481220106e-08, "logits/chosen": -18.210355758666992, "logits/rejected": -17.854860305786133, "logps/chosen": -351.2178955078125, "logps/rejected": -290.0685119628906, "loss": 0.8091, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.3258755207061768, "rewards/margins": 0.3064068853855133, "rewards/rejected": 2.0194687843322754, "step": 57450 }, { "epoch": 2.6677190213101816, "grad_norm": 81.01359558105469, "learning_rate": 5.5589086463314604e-08, "logits/chosen": -17.78816795349121, "logits/rejected": -17.266618728637695, "logps/chosen": -330.5394592285156, "logps/rejected": -292.15435791015625, "loss": 0.455, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.971132516860962, "rewards/margins": 0.8779303431510925, "rewards/rejected": 1.0932022333145142, "step": 57460 }, { 
"epoch": 2.668183295417615, "grad_norm": 5.052005767822266, "learning_rate": 5.55117074454091e-08, "logits/chosen": -18.150897979736328, "logits/rejected": -17.839406967163086, "logps/chosen": -319.4587097167969, "logps/rejected": -333.0724182128906, "loss": 1.446, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.5935819149017334, "rewards/margins": -0.5454027056694031, "rewards/rejected": 2.138984441757202, "step": 57470 }, { "epoch": 2.6686475695250476, "grad_norm": 167.32728576660156, "learning_rate": 5.54343284275036e-08, "logits/chosen": -19.716787338256836, "logits/rejected": -17.777164459228516, "logps/chosen": -505.98651123046875, "logps/rejected": -415.75164794921875, "loss": 0.6223, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.161306381225586, "rewards/margins": 0.9593184590339661, "rewards/rejected": 3.2019877433776855, "step": 57480 }, { "epoch": 2.6691118436324808, "grad_norm": 69.34346008300781, "learning_rate": 5.535694940959809e-08, "logits/chosen": -19.77682113647461, "logits/rejected": -18.848007202148438, "logps/chosen": -441.3162536621094, "logps/rejected": -320.454345703125, "loss": 0.2538, "rewards/accuracies": 1.0, "rewards/chosen": 5.07381534576416, "rewards/margins": 2.6408112049102783, "rewards/rejected": 2.4330039024353027, "step": 57490 }, { "epoch": 2.6695761177399135, "grad_norm": 68.81838989257812, "learning_rate": 5.5279570391692583e-08, "logits/chosen": -19.001117706298828, "logits/rejected": -18.279830932617188, "logps/chosen": -420.61444091796875, "logps/rejected": -328.99542236328125, "loss": 0.575, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.965852975845337, "rewards/margins": 0.8434404134750366, "rewards/rejected": 2.1224124431610107, "step": 57500 }, { "epoch": 2.6700403918473468, "grad_norm": 42.92580795288086, "learning_rate": 5.520219137378708e-08, "logits/chosen": -19.33544921875, "logits/rejected": -18.340736389160156, "logps/chosen": -417.454833984375, "logps/rejected": 
-341.73321533203125, "loss": 0.4427, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.261898517608643, "rewards/margins": 1.9327001571655273, "rewards/rejected": 2.3291983604431152, "step": 57510 }, { "epoch": 2.6705046659547795, "grad_norm": 1.3846797943115234, "learning_rate": 5.512481235588157e-08, "logits/chosen": -19.545429229736328, "logits/rejected": -18.843952178955078, "logps/chosen": -390.9539794921875, "logps/rejected": -318.72857666015625, "loss": 0.5739, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.607189893722534, "rewards/margins": 1.134399652481079, "rewards/rejected": 2.472790479660034, "step": 57520 }, { "epoch": 2.6709689400622127, "grad_norm": 15.612747192382812, "learning_rate": 5.504743333797608e-08, "logits/chosen": -19.145320892333984, "logits/rejected": -17.364585876464844, "logps/chosen": -451.89678955078125, "logps/rejected": -269.7239074707031, "loss": 0.2664, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.1645941734313965, "rewards/margins": 2.1408705711364746, "rewards/rejected": 1.0237236022949219, "step": 57530 }, { "epoch": 2.671433214169646, "grad_norm": 145.55133056640625, "learning_rate": 5.497005432007057e-08, "logits/chosen": -18.358642578125, "logits/rejected": -18.546812057495117, "logps/chosen": -267.2247619628906, "logps/rejected": -258.283203125, "loss": 0.817, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.9078048467636108, "rewards/margins": 0.3688647747039795, "rewards/rejected": 1.5389400720596313, "step": 57540 }, { "epoch": 2.6718974882770787, "grad_norm": 77.26924896240234, "learning_rate": 5.489267530216506e-08, "logits/chosen": -19.29714584350586, "logits/rejected": -18.66387939453125, "logps/chosen": -360.64959716796875, "logps/rejected": -325.078125, "loss": 0.9261, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.6153042316436768, "rewards/margins": 0.43338871002197266, "rewards/rejected": 3.181915760040283, "step": 57550 }, { "epoch": 
2.672361762384512, "grad_norm": 0.41453683376312256, "learning_rate": 5.481529628425956e-08, "logits/chosen": -18.621870040893555, "logits/rejected": -17.893211364746094, "logps/chosen": -419.65521240234375, "logps/rejected": -353.1375427246094, "loss": 0.5036, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.762819766998291, "rewards/margins": 1.5662018060684204, "rewards/rejected": 2.196617603302002, "step": 57560 }, { "epoch": 2.6728260364919447, "grad_norm": 1.4469088315963745, "learning_rate": 5.473791726635405e-08, "logits/chosen": -19.522136688232422, "logits/rejected": -18.29854393005371, "logps/chosen": -389.89056396484375, "logps/rejected": -358.22003173828125, "loss": 0.4492, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.726466417312622, "rewards/margins": 1.693289041519165, "rewards/rejected": 2.033177375793457, "step": 57570 }, { "epoch": 2.673290310599378, "grad_norm": 3.944084882736206, "learning_rate": 5.466053824844855e-08, "logits/chosen": -19.20831298828125, "logits/rejected": -17.95747947692871, "logps/chosen": -374.05987548828125, "logps/rejected": -282.94580078125, "loss": 0.3718, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.2863311767578125, "rewards/margins": 2.3712124824523926, "rewards/rejected": 1.9151182174682617, "step": 57580 }, { "epoch": 2.6737545847068107, "grad_norm": 0.03680887818336487, "learning_rate": 5.4583159230543046e-08, "logits/chosen": -18.83902359008789, "logits/rejected": -18.481250762939453, "logps/chosen": -253.401123046875, "logps/rejected": -175.91006469726562, "loss": 0.5942, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.481267213821411, "rewards/margins": 2.124600648880005, "rewards/rejected": 0.3566665053367615, "step": 57590 }, { "epoch": 2.674218858814244, "grad_norm": 3.2886459827423096, "learning_rate": 5.450578021263754e-08, "logits/chosen": -20.294273376464844, "logits/rejected": -18.962474822998047, "logps/chosen": -405.3888244628906, 
"logps/rejected": -306.3377990722656, "loss": 0.3564, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.1289825439453125, "rewards/margins": 1.4982829093933105, "rewards/rejected": 2.630699634552002, "step": 57600 }, { "epoch": 2.674683132921677, "grad_norm": 0.0022413143888115883, "learning_rate": 5.4428401194732036e-08, "logits/chosen": -18.481822967529297, "logits/rejected": -18.75748062133789, "logps/chosen": -266.18341064453125, "logps/rejected": -287.31463623046875, "loss": 2.0618, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.384769916534424, "rewards/margins": 0.3426974415779114, "rewards/rejected": 2.0420730113983154, "step": 57610 }, { "epoch": 2.67514740702911, "grad_norm": 1.472154974937439, "learning_rate": 5.435102217682653e-08, "logits/chosen": -18.30766487121582, "logits/rejected": -18.43518829345703, "logps/chosen": -345.0249938964844, "logps/rejected": -364.1633605957031, "loss": 1.0545, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.249664306640625, "rewards/margins": 0.5629770755767822, "rewards/rejected": 2.686687469482422, "step": 57620 }, { "epoch": 2.675611681136543, "grad_norm": 1.7039011716842651, "learning_rate": 5.4273643158921025e-08, "logits/chosen": -19.159425735473633, "logits/rejected": -18.433940887451172, "logps/chosen": -346.21832275390625, "logps/rejected": -268.40185546875, "loss": 0.5366, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.256063222885132, "rewards/margins": 1.0931669473648071, "rewards/rejected": 1.1628963947296143, "step": 57630 }, { "epoch": 2.676075955243976, "grad_norm": 29.36978530883789, "learning_rate": 5.4196264141015517e-08, "logits/chosen": -19.4248104095459, "logits/rejected": -18.526447296142578, "logps/chosen": -392.25128173828125, "logps/rejected": -277.5588684082031, "loss": 0.5155, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.293039321899414, "rewards/margins": 1.9472687244415283, "rewards/rejected": 2.3457703590393066, "step": 
57640 }, { "epoch": 2.676540229351409, "grad_norm": 147.03932189941406, "learning_rate": 5.411888512311002e-08, "logits/chosen": -19.144262313842773, "logits/rejected": -19.491844177246094, "logps/chosen": -405.6964111328125, "logps/rejected": -441.57476806640625, "loss": 1.3192, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.4400405883789062, "rewards/margins": 0.03852396085858345, "rewards/rejected": 3.4015164375305176, "step": 57650 }, { "epoch": 2.677004503458842, "grad_norm": 29.776315689086914, "learning_rate": 5.404150610520451e-08, "logits/chosen": -18.29898452758789, "logits/rejected": -17.961803436279297, "logps/chosen": -336.65032958984375, "logps/rejected": -300.48114013671875, "loss": 0.6482, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.152299642562866, "rewards/margins": 1.0600463151931763, "rewards/rejected": 1.09225332736969, "step": 57660 }, { "epoch": 2.677468777566275, "grad_norm": 62.5098876953125, "learning_rate": 5.3964127087299004e-08, "logits/chosen": -19.785602569580078, "logits/rejected": -18.501144409179688, "logps/chosen": -363.4323425292969, "logps/rejected": -261.449951171875, "loss": 1.2628, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.9075534343719482, "rewards/margins": 0.6126993894577026, "rewards/rejected": 2.294853687286377, "step": 57670 }, { "epoch": 2.6779330516737083, "grad_norm": 126.26884460449219, "learning_rate": 5.38867480693935e-08, "logits/chosen": -19.164794921875, "logits/rejected": -18.812578201293945, "logps/chosen": -376.300537109375, "logps/rejected": -394.60205078125, "loss": 0.9855, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5530266761779785, "rewards/margins": 0.9909148216247559, "rewards/rejected": 2.562112331390381, "step": 57680 }, { "epoch": 2.678397325781141, "grad_norm": 68.81788635253906, "learning_rate": 5.3809369051487994e-08, "logits/chosen": -19.29986000061035, "logits/rejected": -17.86319351196289, "logps/chosen": 
-380.57659912109375, "logps/rejected": -257.21484375, "loss": 0.6237, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.562364101409912, "rewards/margins": 2.398563861846924, "rewards/rejected": 1.1637998819351196, "step": 57690 }, { "epoch": 2.6788615998885743, "grad_norm": 18.198312759399414, "learning_rate": 5.373199003358249e-08, "logits/chosen": -18.636768341064453, "logits/rejected": -17.55203628540039, "logps/chosen": -361.7582702636719, "logps/rejected": -268.3848876953125, "loss": 0.4632, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.652820587158203, "rewards/margins": 2.6162333488464355, "rewards/rejected": 1.036587119102478, "step": 57700 }, { "epoch": 2.679325873996007, "grad_norm": 193.3428192138672, "learning_rate": 5.365461101567699e-08, "logits/chosen": -18.950790405273438, "logits/rejected": -18.129261016845703, "logps/chosen": -359.7503967285156, "logps/rejected": -319.8318786621094, "loss": 0.3126, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.369994163513184, "rewards/margins": 2.260143756866455, "rewards/rejected": 2.1098504066467285, "step": 57710 }, { "epoch": 2.6797901481034403, "grad_norm": 147.57769775390625, "learning_rate": 5.357723199777148e-08, "logits/chosen": -19.065393447875977, "logits/rejected": -18.15694808959961, "logps/chosen": -466.44000244140625, "logps/rejected": -318.9461975097656, "loss": 0.478, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.8572373390197754, "rewards/margins": 2.1432995796203613, "rewards/rejected": 1.713937759399414, "step": 57720 }, { "epoch": 2.680254422210873, "grad_norm": 85.24288940429688, "learning_rate": 5.349985297986598e-08, "logits/chosen": -18.328937530517578, "logits/rejected": -18.21432113647461, "logps/chosen": -365.757080078125, "logps/rejected": -367.0425109863281, "loss": 1.2107, "rewards/accuracies": 0.5, "rewards/chosen": 3.1563589572906494, "rewards/margins": 0.39726901054382324, "rewards/rejected": 2.759089946746826, "step": 
57730 }, { "epoch": 2.6807186963183063, "grad_norm": 49.539634704589844, "learning_rate": 5.342247396196047e-08, "logits/chosen": -19.331979751586914, "logits/rejected": -18.876012802124023, "logps/chosen": -426.10369873046875, "logps/rejected": -385.00787353515625, "loss": 0.5829, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4462523460388184, "rewards/margins": 0.45215725898742676, "rewards/rejected": 2.9940953254699707, "step": 57740 }, { "epoch": 2.6811829704257395, "grad_norm": 9.049201011657715, "learning_rate": 5.334509494405497e-08, "logits/chosen": -19.539325714111328, "logits/rejected": -18.360103607177734, "logps/chosen": -444.8206481933594, "logps/rejected": -346.19940185546875, "loss": 0.2477, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.253218650817871, "rewards/margins": 1.6645267009735107, "rewards/rejected": 2.5886924266815186, "step": 57750 }, { "epoch": 2.6816472445331723, "grad_norm": 30.872297286987305, "learning_rate": 5.326771592614946e-08, "logits/chosen": -19.098529815673828, "logits/rejected": -18.331806182861328, "logps/chosen": -279.2171325683594, "logps/rejected": -273.5424499511719, "loss": 0.7378, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.3314950466156006, "rewards/margins": 0.17386139929294586, "rewards/rejected": 1.157633662223816, "step": 57760 }, { "epoch": 2.6821115186406055, "grad_norm": 6.489558219909668, "learning_rate": 5.319033690824396e-08, "logits/chosen": -19.86796760559082, "logits/rejected": -19.347383499145508, "logps/chosen": -431.2388610839844, "logps/rejected": -331.21856689453125, "loss": 0.4065, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.81697416305542, "rewards/margins": 1.753938913345337, "rewards/rejected": 2.063035249710083, "step": 57770 }, { "epoch": 2.6825757927480387, "grad_norm": 92.65777587890625, "learning_rate": 5.3112957890338456e-08, "logits/chosen": -19.120891571044922, "logits/rejected": -18.344280242919922, "logps/chosen": 
-402.0611572265625, "logps/rejected": -370.33245849609375, "loss": 0.9276, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7401375770568848, "rewards/margins": 1.2123547792434692, "rewards/rejected": 2.5277822017669678, "step": 57780 }, { "epoch": 2.6830400668554715, "grad_norm": 4.483494281768799, "learning_rate": 5.303557887243295e-08, "logits/chosen": -19.673336029052734, "logits/rejected": -17.582088470458984, "logps/chosen": -521.6292114257812, "logps/rejected": -324.73809814453125, "loss": 0.3238, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.682242393493652, "rewards/margins": 3.7607104778289795, "rewards/rejected": 1.9215316772460938, "step": 57790 }, { "epoch": 2.6835043409629042, "grad_norm": 136.3313446044922, "learning_rate": 5.2958199854527446e-08, "logits/chosen": -19.554765701293945, "logits/rejected": -18.477741241455078, "logps/chosen": -432.034423828125, "logps/rejected": -326.118408203125, "loss": 0.8057, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.6822731494903564, "rewards/margins": 1.048138976097107, "rewards/rejected": 2.6341347694396973, "step": 57800 }, { "epoch": 2.6839686150703375, "grad_norm": 0.46410587430000305, "learning_rate": 5.288082083662194e-08, "logits/chosen": -19.463953018188477, "logits/rejected": -18.414915084838867, "logps/chosen": -349.8153991699219, "logps/rejected": -248.59158325195312, "loss": 1.1077, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.4146857261657715, "rewards/margins": 1.0395926237106323, "rewards/rejected": 1.3750932216644287, "step": 57810 }, { "epoch": 2.6844328891777707, "grad_norm": 27.109092712402344, "learning_rate": 5.2803441818716435e-08, "logits/chosen": -18.912845611572266, "logits/rejected": -18.901765823364258, "logps/chosen": -451.68035888671875, "logps/rejected": -507.50494384765625, "loss": 1.2342, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.0363261699676514, "rewards/margins": -0.3546615540981293, 
"rewards/rejected": 3.3909873962402344, "step": 57820 }, { "epoch": 2.6848971632852034, "grad_norm": 12.45335578918457, "learning_rate": 5.2726062800810933e-08, "logits/chosen": -19.54459571838379, "logits/rejected": -17.739505767822266, "logps/chosen": -416.51898193359375, "logps/rejected": -238.89785766601562, "loss": 0.8724, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.885568141937256, "rewards/margins": 2.9784176349639893, "rewards/rejected": 1.9071508646011353, "step": 57830 }, { "epoch": 2.6853614373926367, "grad_norm": 13.808361053466797, "learning_rate": 5.2648683782905425e-08, "logits/chosen": -18.60413360595703, "logits/rejected": -18.686580657958984, "logps/chosen": -281.25897216796875, "logps/rejected": -335.4183044433594, "loss": 1.2735, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.6431289911270142, "rewards/margins": -0.060684822499752045, "rewards/rejected": 1.703813910484314, "step": 57840 }, { "epoch": 2.68582571150007, "grad_norm": 0.019910525530576706, "learning_rate": 5.257130476499992e-08, "logits/chosen": -18.855159759521484, "logits/rejected": -17.603281021118164, "logps/chosen": -486.8214416503906, "logps/rejected": -320.33624267578125, "loss": 0.5687, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.7039995193481445, "rewards/margins": 2.7263951301574707, "rewards/rejected": 1.9776042699813843, "step": 57850 }, { "epoch": 2.6862899856075027, "grad_norm": 35.19640350341797, "learning_rate": 5.2493925747094414e-08, "logits/chosen": -18.542781829833984, "logits/rejected": -18.453142166137695, "logps/chosen": -342.76409912109375, "logps/rejected": -317.35137939453125, "loss": 0.8212, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.165910243988037, "rewards/margins": 0.6236782073974609, "rewards/rejected": 1.5422322750091553, "step": 57860 }, { "epoch": 2.6867542597149354, "grad_norm": 0.5439744591712952, "learning_rate": 5.241654672918891e-08, "logits/chosen": -19.383634567260742, 
"logits/rejected": -19.118709564208984, "logps/chosen": -434.2100524902344, "logps/rejected": -380.14556884765625, "loss": 1.3661, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.770463228225708, "rewards/margins": 0.43065619468688965, "rewards/rejected": 3.3398067951202393, "step": 57870 }, { "epoch": 2.6872185338223686, "grad_norm": 39.311893463134766, "learning_rate": 5.2339167711283404e-08, "logits/chosen": -19.16552734375, "logits/rejected": -18.466632843017578, "logps/chosen": -362.61212158203125, "logps/rejected": -311.48956298828125, "loss": 0.3233, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.4579975605010986, "rewards/margins": 1.5867445468902588, "rewards/rejected": 1.8712527751922607, "step": 57880 }, { "epoch": 2.687682807929802, "grad_norm": 97.83260345458984, "learning_rate": 5.2261788693377895e-08, "logits/chosen": -18.846656799316406, "logits/rejected": -18.84855842590332, "logps/chosen": -332.885498046875, "logps/rejected": -331.3346252441406, "loss": 1.0805, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.6819424629211426, "rewards/margins": 0.12479307502508163, "rewards/rejected": 2.5571491718292236, "step": 57890 }, { "epoch": 2.6881470820372346, "grad_norm": 20.91256332397461, "learning_rate": 5.21844096754724e-08, "logits/chosen": -19.129161834716797, "logits/rejected": -18.086097717285156, "logps/chosen": -397.81109619140625, "logps/rejected": -316.9416809082031, "loss": 0.7644, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.719027042388916, "rewards/margins": 0.8286385536193848, "rewards/rejected": 2.890388250350952, "step": 57900 }, { "epoch": 2.688611356144668, "grad_norm": 3.6661899089813232, "learning_rate": 5.210703065756689e-08, "logits/chosen": -19.556346893310547, "logits/rejected": -18.350170135498047, "logps/chosen": -438.9307556152344, "logps/rejected": -323.8097839355469, "loss": 0.1902, "rewards/accuracies": 1.0, "rewards/chosen": 4.896243095397949, "rewards/margins": 
2.403024196624756, "rewards/rejected": 2.4932186603546143, "step": 57910 }, { "epoch": 2.689075630252101, "grad_norm": 34.32686996459961, "learning_rate": 5.202965163966139e-08, "logits/chosen": -18.697683334350586, "logits/rejected": -18.44609260559082, "logps/chosen": -407.65771484375, "logps/rejected": -314.7159423828125, "loss": 0.4536, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.570754051208496, "rewards/margins": 2.2656543254852295, "rewards/rejected": 2.3050999641418457, "step": 57920 }, { "epoch": 2.689539904359534, "grad_norm": 97.2195053100586, "learning_rate": 5.195227262175588e-08, "logits/chosen": -18.637760162353516, "logits/rejected": -18.555604934692383, "logps/chosen": -409.2331237792969, "logps/rejected": -369.9620056152344, "loss": 1.3804, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.7524776458740234, "rewards/margins": 0.5632284283638, "rewards/rejected": 3.189249038696289, "step": 57930 }, { "epoch": 2.690004178466967, "grad_norm": 142.53892517089844, "learning_rate": 5.187489360385037e-08, "logits/chosen": -17.75832176208496, "logits/rejected": -18.48568344116211, "logps/chosen": -258.72564697265625, "logps/rejected": -353.6182861328125, "loss": 1.8317, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": 0.8070162534713745, "rewards/margins": -1.2515190839767456, "rewards/rejected": 2.058535099029541, "step": 57940 }, { "epoch": 2.6904684525744, "grad_norm": 30.038732528686523, "learning_rate": 5.179751458594487e-08, "logits/chosen": -18.648544311523438, "logits/rejected": -19.307796478271484, "logps/chosen": -401.5752258300781, "logps/rejected": -389.4127502441406, "loss": 1.2757, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.4942612648010254, "rewards/margins": 0.6841145753860474, "rewards/rejected": 2.8101468086242676, "step": 57950 }, { "epoch": 2.690932726681833, "grad_norm": 60.21796417236328, "learning_rate": 5.172013556803937e-08, "logits/chosen": -19.1483211517334, 
"logits/rejected": -18.390552520751953, "logps/chosen": -463.477294921875, "logps/rejected": -432.9768981933594, "loss": 0.6242, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.461217164993286, "rewards/margins": 0.8892046213150024, "rewards/rejected": 2.572012424468994, "step": 57960 }, { "epoch": 2.691397000789266, "grad_norm": 125.6036148071289, "learning_rate": 5.1642756550133867e-08, "logits/chosen": -19.721233367919922, "logits/rejected": -18.38765525817871, "logps/chosen": -471.59100341796875, "logps/rejected": -334.32232666015625, "loss": 0.338, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.057048320770264, "rewards/margins": 2.552642822265625, "rewards/rejected": 2.5044052600860596, "step": 57970 }, { "epoch": 2.691861274896699, "grad_norm": 4.410009860992432, "learning_rate": 5.156537753222836e-08, "logits/chosen": -19.279109954833984, "logits/rejected": -17.754608154296875, "logps/chosen": -460.85650634765625, "logps/rejected": -308.932373046875, "loss": 0.1377, "rewards/accuracies": 1.0, "rewards/chosen": 5.400076389312744, "rewards/margins": 3.142622470855713, "rewards/rejected": 2.257453680038452, "step": 57980 }, { "epoch": 2.6923255490041322, "grad_norm": 119.49750518798828, "learning_rate": 5.1487998514322856e-08, "logits/chosen": -19.5418701171875, "logits/rejected": -18.511594772338867, "logps/chosen": -351.8727111816406, "logps/rejected": -300.21563720703125, "loss": 0.6405, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.8302905559539795, "rewards/margins": 0.7045159935951233, "rewards/rejected": 2.125774621963501, "step": 57990 }, { "epoch": 2.692789823111565, "grad_norm": 4.148409843444824, "learning_rate": 5.141061949641735e-08, "logits/chosen": -19.000652313232422, "logits/rejected": -18.65728187561035, "logps/chosen": -325.64105224609375, "logps/rejected": -273.74847412109375, "loss": 0.6759, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.140749931335449, "rewards/margins": 
0.6754209995269775, "rewards/rejected": 1.4653289318084717, "step": 58000 }, { "epoch": 2.693254097218998, "grad_norm": 0.41825100779533386, "learning_rate": 5.133324047851184e-08, "logits/chosen": -20.14902114868164, "logits/rejected": -17.621259689331055, "logps/chosen": -467.0386657714844, "logps/rejected": -239.9864044189453, "loss": 0.2483, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.769557237625122, "rewards/margins": 2.6499624252319336, "rewards/rejected": 1.119594693183899, "step": 58010 }, { "epoch": 2.693718371326431, "grad_norm": 2.1783437728881836, "learning_rate": 5.1255861460606344e-08, "logits/chosen": -19.42528533935547, "logits/rejected": -18.046070098876953, "logps/chosen": -433.77789306640625, "logps/rejected": -313.700927734375, "loss": 0.2833, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.7178642749786377, "rewards/margins": 1.899545431137085, "rewards/rejected": 1.8183187246322632, "step": 58020 }, { "epoch": 2.694182645433864, "grad_norm": 115.89385223388672, "learning_rate": 5.1178482442700835e-08, "logits/chosen": -19.252086639404297, "logits/rejected": -18.3051700592041, "logps/chosen": -417.75445556640625, "logps/rejected": -351.7005310058594, "loss": 0.6047, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.602379560470581, "rewards/margins": 1.9707939624786377, "rewards/rejected": 1.6315854787826538, "step": 58030 }, { "epoch": 2.694646919541297, "grad_norm": 57.16963577270508, "learning_rate": 5.110110342479533e-08, "logits/chosen": -18.832422256469727, "logits/rejected": -18.44216537475586, "logps/chosen": -362.8773498535156, "logps/rejected": -280.60009765625, "loss": 0.7969, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.594008684158325, "rewards/margins": 1.5207343101501465, "rewards/rejected": 2.0732738971710205, "step": 58040 }, { "epoch": 2.69511119364873, "grad_norm": 1.3236373662948608, "learning_rate": 5.1023724406889825e-08, "logits/chosen": -18.483766555786133, 
"logits/rejected": -18.183082580566406, "logps/chosen": -321.50653076171875, "logps/rejected": -281.0090637207031, "loss": 1.0087, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.082738161087036, "rewards/margins": 1.5340458154678345, "rewards/rejected": 1.5486924648284912, "step": 58050 }, { "epoch": 2.6955754677561634, "grad_norm": 63.28017807006836, "learning_rate": 5.0946345388984316e-08, "logits/chosen": -19.62221908569336, "logits/rejected": -19.003032684326172, "logps/chosen": -380.1203918457031, "logps/rejected": -332.4698791503906, "loss": 0.9347, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.4659149646759033, "rewards/margins": 0.8921483159065247, "rewards/rejected": 2.5737667083740234, "step": 58060 }, { "epoch": 2.696039741863596, "grad_norm": 97.11514282226562, "learning_rate": 5.0868966371078814e-08, "logits/chosen": -18.55713653564453, "logits/rejected": -17.80570411682129, "logps/chosen": -406.6793518066406, "logps/rejected": -331.1705017089844, "loss": 0.5083, "rewards/accuracies": 0.5, "rewards/chosen": 3.2164387702941895, "rewards/margins": 1.5687910318374634, "rewards/rejected": 1.6476472616195679, "step": 58070 }, { "epoch": 2.6965040159710294, "grad_norm": 28.163555145263672, "learning_rate": 5.079158735317331e-08, "logits/chosen": -19.059696197509766, "logits/rejected": -18.472240447998047, "logps/chosen": -345.7027893066406, "logps/rejected": -294.12646484375, "loss": 0.459, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.590893507003784, "rewards/margins": 2.422797203063965, "rewards/rejected": 1.1680961847305298, "step": 58080 }, { "epoch": 2.696968290078462, "grad_norm": 19.034934997558594, "learning_rate": 5.071420833526781e-08, "logits/chosen": -17.80438804626465, "logits/rejected": -17.47189712524414, "logps/chosen": -375.28070068359375, "logps/rejected": -370.7679138183594, "loss": 0.9874, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.7659597396850586, "rewards/margins": 
0.5356496572494507, "rewards/rejected": 2.2303099632263184, "step": 58090 }, { "epoch": 2.6974325641858954, "grad_norm": 101.13965606689453, "learning_rate": 5.06368293173623e-08, "logits/chosen": -19.894821166992188, "logits/rejected": -19.816682815551758, "logps/chosen": -415.64434814453125, "logps/rejected": -400.9637756347656, "loss": 1.4382, "rewards/accuracies": 0.5, "rewards/chosen": 3.281982421875, "rewards/margins": 0.6085235476493835, "rewards/rejected": 2.6734588146209717, "step": 58100 }, { "epoch": 2.697896838293328, "grad_norm": 199.2140350341797, "learning_rate": 5.055945029945679e-08, "logits/chosen": -18.83218002319336, "logits/rejected": -19.04261589050293, "logps/chosen": -465.14044189453125, "logps/rejected": -443.12042236328125, "loss": 1.5918, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.938465118408203, "rewards/margins": -0.047086041420698166, "rewards/rejected": 3.985551118850708, "step": 58110 }, { "epoch": 2.6983611124007614, "grad_norm": 235.16848754882812, "learning_rate": 5.048207128155129e-08, "logits/chosen": -19.919313430786133, "logits/rejected": -19.561752319335938, "logps/chosen": -385.9895935058594, "logps/rejected": -382.9721984863281, "loss": 0.5433, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.7928972244262695, "rewards/margins": 0.697311520576477, "rewards/rejected": 2.095585584640503, "step": 58120 }, { "epoch": 2.6988253865081946, "grad_norm": 0.6499546766281128, "learning_rate": 5.040469226364578e-08, "logits/chosen": -19.403274536132812, "logits/rejected": -17.70084571838379, "logps/chosen": -294.64263916015625, "logps/rejected": -226.60488891601562, "loss": 0.6418, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.841054916381836, "rewards/margins": 1.891096830368042, "rewards/rejected": 0.9499581456184387, "step": 58130 }, { "epoch": 2.6992896606156274, "grad_norm": 3.083303689956665, "learning_rate": 5.032731324574029e-08, "logits/chosen": -19.610584259033203, 
"logits/rejected": -18.870595932006836, "logps/chosen": -414.31976318359375, "logps/rejected": -254.2926788330078, "loss": 0.4291, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.052065849304199, "rewards/margins": 1.4018957614898682, "rewards/rejected": 2.650169849395752, "step": 58140 }, { "epoch": 2.6997539347230606, "grad_norm": 109.75116729736328, "learning_rate": 5.024993422783478e-08, "logits/chosen": -18.74510383605957, "logits/rejected": -18.149883270263672, "logps/chosen": -294.55755615234375, "logps/rejected": -282.31475830078125, "loss": 0.918, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.699827194213867, "rewards/margins": 2.0775275230407715, "rewards/rejected": 1.6222999095916748, "step": 58150 }, { "epoch": 2.700218208830494, "grad_norm": 0.08002574741840363, "learning_rate": 5.017255520992928e-08, "logits/chosen": -20.561717987060547, "logits/rejected": -18.395946502685547, "logps/chosen": -438.05389404296875, "logps/rejected": -294.77825927734375, "loss": 0.476, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.9761385917663574, "rewards/margins": 2.2949466705322266, "rewards/rejected": 1.6811918020248413, "step": 58160 }, { "epoch": 2.7006824829379266, "grad_norm": 45.86160659790039, "learning_rate": 5.009517619202377e-08, "logits/chosen": -18.670740127563477, "logits/rejected": -17.756378173828125, "logps/chosen": -480.3526916503906, "logps/rejected": -383.2982177734375, "loss": 0.5609, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7107326984405518, "rewards/margins": 1.6593878269195557, "rewards/rejected": 2.051344871520996, "step": 58170 }, { "epoch": 2.7011467570453593, "grad_norm": 27.511402130126953, "learning_rate": 5.001779717411826e-08, "logits/chosen": -19.443584442138672, "logits/rejected": -18.671709060668945, "logps/chosen": -391.8641662597656, "logps/rejected": -300.5791015625, "loss": 0.2666, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.795268535614014, 
"rewards/margins": 2.116365909576416, "rewards/rejected": 2.6789023876190186, "step": 58180 }, { "epoch": 2.7016110311527926, "grad_norm": 124.68326568603516, "learning_rate": 4.994041815621276e-08, "logits/chosen": -18.541019439697266, "logits/rejected": -18.107280731201172, "logps/chosen": -360.6407165527344, "logps/rejected": -299.3941955566406, "loss": 0.6107, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.8672046661376953, "rewards/margins": 1.7183847427368164, "rewards/rejected": 2.148820400238037, "step": 58190 }, { "epoch": 2.7020753052602258, "grad_norm": 23.071826934814453, "learning_rate": 4.9863039138307256e-08, "logits/chosen": -19.5669002532959, "logits/rejected": -18.338794708251953, "logps/chosen": -359.6601257324219, "logps/rejected": -226.00997924804688, "loss": 0.3719, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.885317325592041, "rewards/margins": 2.2569785118103027, "rewards/rejected": 1.6283382177352905, "step": 58200 }, { "epoch": 2.7025395793676585, "grad_norm": 41.52839279174805, "learning_rate": 4.9785660120401754e-08, "logits/chosen": -20.284860610961914, "logits/rejected": -19.807510375976562, "logps/chosen": -330.4809265136719, "logps/rejected": -279.665283203125, "loss": 0.5481, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.1197926998138428, "rewards/margins": 0.8315666317939758, "rewards/rejected": 2.2882261276245117, "step": 58210 }, { "epoch": 2.7030038534750918, "grad_norm": 102.90913391113281, "learning_rate": 4.9708281102496245e-08, "logits/chosen": -19.03598403930664, "logits/rejected": -17.229768753051758, "logps/chosen": -496.869140625, "logps/rejected": -274.041748046875, "loss": 0.2317, "rewards/accuracies": 1.0, "rewards/chosen": 4.789244174957275, "rewards/margins": 2.7149887084960938, "rewards/rejected": 2.0742557048797607, "step": 58220 }, { "epoch": 2.703468127582525, "grad_norm": 50.312034606933594, "learning_rate": 4.9630902084590737e-08, "logits/chosen": 
-18.38619613647461, "logits/rejected": -17.458721160888672, "logps/chosen": -425.907470703125, "logps/rejected": -344.62921142578125, "loss": 0.5145, "rewards/accuracies": 0.5, "rewards/chosen": 3.723719358444214, "rewards/margins": 1.9659464359283447, "rewards/rejected": 1.7577730417251587, "step": 58230 }, { "epoch": 2.7039324016899577, "grad_norm": 72.26419067382812, "learning_rate": 4.9553523066685235e-08, "logits/chosen": -19.54437255859375, "logits/rejected": -19.340267181396484, "logps/chosen": -330.18389892578125, "logps/rejected": -334.58856201171875, "loss": 0.8693, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.196712017059326, "rewards/margins": 0.16815835237503052, "rewards/rejected": 3.0285539627075195, "step": 58240 }, { "epoch": 2.7043966757973905, "grad_norm": 25.668834686279297, "learning_rate": 4.9476144048779726e-08, "logits/chosen": -18.499744415283203, "logits/rejected": -17.434642791748047, "logps/chosen": -470.82666015625, "logps/rejected": -322.9507751464844, "loss": 0.3536, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.695914268493652, "rewards/margins": 2.88927960395813, "rewards/rejected": 1.8066349029541016, "step": 58250 }, { "epoch": 2.7048609499048237, "grad_norm": 98.04305267333984, "learning_rate": 4.939876503087423e-08, "logits/chosen": -19.260860443115234, "logits/rejected": -19.83578872680664, "logps/chosen": -386.2065124511719, "logps/rejected": -335.9608154296875, "loss": 0.722, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.0848231315612793, "rewards/margins": 0.7236142158508301, "rewards/rejected": 2.3612093925476074, "step": 58260 }, { "epoch": 2.705325224012257, "grad_norm": 187.71221923828125, "learning_rate": 4.932138601296872e-08, "logits/chosen": -18.26651954650879, "logits/rejected": -17.88235855102539, "logps/chosen": -399.61083984375, "logps/rejected": -302.7896423339844, "loss": 0.4985, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.02042818069458, 
"rewards/margins": 2.2794227600097656, "rewards/rejected": 1.741005301475525, "step": 58270 }, { "epoch": 2.7057894981196897, "grad_norm": 91.14716339111328, "learning_rate": 4.9244006995063214e-08, "logits/chosen": -18.85573387145996, "logits/rejected": -19.101238250732422, "logps/chosen": -364.3769226074219, "logps/rejected": -365.6379699707031, "loss": 0.5467, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.2123172283172607, "rewards/margins": 1.1284171342849731, "rewards/rejected": 2.083899974822998, "step": 58280 }, { "epoch": 2.706253772227123, "grad_norm": 6.121181011199951, "learning_rate": 4.916662797715771e-08, "logits/chosen": -19.705402374267578, "logits/rejected": -18.62019157409668, "logps/chosen": -387.7720031738281, "logps/rejected": -266.3705139160156, "loss": 0.4267, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.80873441696167, "rewards/margins": 1.9601719379425049, "rewards/rejected": 1.848562479019165, "step": 58290 }, { "epoch": 2.706718046334556, "grad_norm": 9.047181129455566, "learning_rate": 4.90892489592522e-08, "logits/chosen": -18.91899299621582, "logits/rejected": -17.773731231689453, "logps/chosen": -398.2132263183594, "logps/rejected": -254.70254516601562, "loss": 0.2531, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.2311387062072754, "rewards/margins": 2.387932538986206, "rewards/rejected": 0.8432058095932007, "step": 58300 }, { "epoch": 2.707182320441989, "grad_norm": 224.3571319580078, "learning_rate": 4.90118699413467e-08, "logits/chosen": -18.476581573486328, "logits/rejected": -18.4307918548584, "logps/chosen": -331.1667175292969, "logps/rejected": -324.94622802734375, "loss": 1.3944, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.82855486869812, "rewards/margins": 0.018161917105317116, "rewards/rejected": 2.8103930950164795, "step": 58310 }, { "epoch": 2.707646594549422, "grad_norm": 46.245086669921875, "learning_rate": 4.89344909234412e-08, "logits/chosen": 
-18.93267822265625, "logits/rejected": -18.661745071411133, "logps/chosen": -449.918701171875, "logps/rejected": -343.917724609375, "loss": 0.6513, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.1704366207122803, "rewards/margins": 0.9875176548957825, "rewards/rejected": 2.1829190254211426, "step": 58320 }, { "epoch": 2.708110868656855, "grad_norm": 63.84743881225586, "learning_rate": 4.88571119055357e-08, "logits/chosen": -20.60071563720703, "logits/rejected": -20.5490665435791, "logps/chosen": -436.2567443847656, "logps/rejected": -374.66510009765625, "loss": 0.8674, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.301295280456543, "rewards/margins": 0.9510940313339233, "rewards/rejected": 3.35020112991333, "step": 58330 }, { "epoch": 2.708575142764288, "grad_norm": 38.6795539855957, "learning_rate": 4.877973288763019e-08, "logits/chosen": -19.926855087280273, "logits/rejected": -18.84103775024414, "logps/chosen": -414.397216796875, "logps/rejected": -322.9136657714844, "loss": 0.4932, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.8511059284210205, "rewards/margins": 1.7757682800292969, "rewards/rejected": 2.0753374099731445, "step": 58340 }, { "epoch": 2.709039416871721, "grad_norm": 174.3206024169922, "learning_rate": 4.870235386972468e-08, "logits/chosen": -19.94411849975586, "logits/rejected": -18.797151565551758, "logps/chosen": -427.55975341796875, "logps/rejected": -368.35369873046875, "loss": 0.3903, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.380428791046143, "rewards/margins": 1.0675092935562134, "rewards/rejected": 3.3129191398620605, "step": 58350 }, { "epoch": 2.709503690979154, "grad_norm": 26.47576904296875, "learning_rate": 4.862497485181918e-08, "logits/chosen": -19.234073638916016, "logits/rejected": -18.42519760131836, "logps/chosen": -358.919189453125, "logps/rejected": -311.5982971191406, "loss": 0.5087, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.405168533325195, 
"rewards/margins": 1.6432220935821533, "rewards/rejected": 2.7619469165802, "step": 58360 }, { "epoch": 2.7099679650865873, "grad_norm": 227.2133331298828, "learning_rate": 4.854759583391367e-08, "logits/chosen": -19.78738021850586, "logits/rejected": -19.592880249023438, "logps/chosen": -275.2892761230469, "logps/rejected": -302.730224609375, "loss": 1.3178, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.5806022882461548, "rewards/margins": -0.12877750396728516, "rewards/rejected": 1.70937979221344, "step": 58370 }, { "epoch": 2.71043223919402, "grad_norm": 26.09197998046875, "learning_rate": 4.8470216816008174e-08, "logits/chosen": -18.740493774414062, "logits/rejected": -18.72100067138672, "logps/chosen": -325.64141845703125, "logps/rejected": -308.4922790527344, "loss": 0.6514, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.4787819385528564, "rewards/margins": 0.5892113447189331, "rewards/rejected": 1.8895705938339233, "step": 58380 }, { "epoch": 2.7108965133014533, "grad_norm": 6.117457866668701, "learning_rate": 4.8392837798102666e-08, "logits/chosen": -18.456119537353516, "logits/rejected": -17.773815155029297, "logps/chosen": -392.22784423828125, "logps/rejected": -309.14971923828125, "loss": 0.713, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.580451965332031, "rewards/margins": 2.292996883392334, "rewards/rejected": 2.2874550819396973, "step": 58390 }, { "epoch": 2.711360787408886, "grad_norm": 246.48753356933594, "learning_rate": 4.831545878019716e-08, "logits/chosen": -19.418277740478516, "logits/rejected": -18.488574981689453, "logps/chosen": -453.4418029785156, "logps/rejected": -330.1694030761719, "loss": 1.1252, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4809329509735107, "rewards/margins": 0.8091634511947632, "rewards/rejected": 2.671769618988037, "step": 58400 }, { "epoch": 2.7118250615163193, "grad_norm": 31.790918350219727, "learning_rate": 4.8238079762291655e-08, "logits/chosen": 
-19.45144271850586, "logits/rejected": -18.07500457763672, "logps/chosen": -383.9973449707031, "logps/rejected": -307.57147216796875, "loss": 0.374, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.713064193725586, "rewards/margins": 2.2047460079193115, "rewards/rejected": 1.5083179473876953, "step": 58410 }, { "epoch": 2.712289335623752, "grad_norm": 211.71385192871094, "learning_rate": 4.816070074438615e-08, "logits/chosen": -19.319589614868164, "logits/rejected": -18.49466896057129, "logps/chosen": -335.3471374511719, "logps/rejected": -293.78436279296875, "loss": 0.6429, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.254685401916504, "rewards/margins": 0.848550021648407, "rewards/rejected": 1.4061352014541626, "step": 58420 }, { "epoch": 2.7127536097311853, "grad_norm": 26.329450607299805, "learning_rate": 4.8083321726480645e-08, "logits/chosen": -18.791778564453125, "logits/rejected": -18.547163009643555, "logps/chosen": -304.6803283691406, "logps/rejected": -318.05743408203125, "loss": 1.3973, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.0142059326171875, "rewards/margins": -0.6403507590293884, "rewards/rejected": 3.6545567512512207, "step": 58430 }, { "epoch": 2.7132178838386185, "grad_norm": 4.37026834487915, "learning_rate": 4.800594270857514e-08, "logits/chosen": -19.604135513305664, "logits/rejected": -18.323665618896484, "logps/chosen": -425.34429931640625, "logps/rejected": -313.84716796875, "loss": 0.2836, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.193162441253662, "rewards/margins": 2.140834331512451, "rewards/rejected": 1.05232834815979, "step": 58440 }, { "epoch": 2.7136821579460513, "grad_norm": 56.942359924316406, "learning_rate": 4.792856369066964e-08, "logits/chosen": -19.055511474609375, "logits/rejected": -18.0853328704834, "logps/chosen": -420.3601989746094, "logps/rejected": -287.98577880859375, "loss": 0.5403, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 
3.3202757835388184, "rewards/margins": 2.27935791015625, "rewards/rejected": 1.040917992591858, "step": 58450 }, { "epoch": 2.7141464320534845, "grad_norm": 165.1094512939453, "learning_rate": 4.785118467276413e-08, "logits/chosen": -19.001934051513672, "logits/rejected": -18.84811019897461, "logps/chosen": -442.20068359375, "logps/rejected": -413.45330810546875, "loss": 1.0781, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.8876404762268066, "rewards/margins": -0.4215128421783447, "rewards/rejected": 4.3091535568237305, "step": 58460 }, { "epoch": 2.7146107061609173, "grad_norm": 27.628253936767578, "learning_rate": 4.7773805654858624e-08, "logits/chosen": -19.410404205322266, "logits/rejected": -18.040481567382812, "logps/chosen": -373.0256652832031, "logps/rejected": -233.45632934570312, "loss": 0.5339, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.310794353485107, "rewards/margins": 2.4532809257507324, "rewards/rejected": 1.8575128316879272, "step": 58470 }, { "epoch": 2.7150749802683505, "grad_norm": 181.2357177734375, "learning_rate": 4.769642663695312e-08, "logits/chosen": -19.239933013916016, "logits/rejected": -18.07005500793457, "logps/chosen": -364.28131103515625, "logps/rejected": -279.4364318847656, "loss": 0.7179, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.183152675628662, "rewards/margins": 1.9885663986206055, "rewards/rejected": 1.1945860385894775, "step": 58480 }, { "epoch": 2.7155392543757833, "grad_norm": 229.91720581054688, "learning_rate": 4.7619047619047613e-08, "logits/chosen": -17.802621841430664, "logits/rejected": -18.375652313232422, "logps/chosen": -288.23846435546875, "logps/rejected": -370.0317687988281, "loss": 2.0293, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.631247043609619, "rewards/margins": -0.5792545080184937, "rewards/rejected": 3.2105014324188232, "step": 58490 }, { "epoch": 2.7160035284832165, "grad_norm": 22.84164047241211, "learning_rate": 
4.754166860114212e-08, "logits/chosen": -19.993946075439453, "logits/rejected": -18.88042640686035, "logps/chosen": -399.50970458984375, "logps/rejected": -314.6690979003906, "loss": 0.5704, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.0056023597717285, "rewards/margins": 1.335852861404419, "rewards/rejected": 2.6697497367858887, "step": 58500 }, { "epoch": 2.7164678025906497, "grad_norm": 40.81633758544922, "learning_rate": 4.746428958323661e-08, "logits/chosen": -18.976842880249023, "logits/rejected": -17.47824478149414, "logps/chosen": -434.70849609375, "logps/rejected": -361.83056640625, "loss": 0.7342, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.932746171951294, "rewards/margins": 1.5928552150726318, "rewards/rejected": 1.3398911952972412, "step": 58510 }, { "epoch": 2.7169320766980825, "grad_norm": 232.3850860595703, "learning_rate": 4.73869105653311e-08, "logits/chosen": -19.018077850341797, "logits/rejected": -19.407520294189453, "logps/chosen": -400.1048278808594, "logps/rejected": -435.59185791015625, "loss": 1.6394, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.243802309036255, "rewards/margins": -0.17305922508239746, "rewards/rejected": 3.4168617725372314, "step": 58520 }, { "epoch": 2.7173963508055157, "grad_norm": 10.186110496520996, "learning_rate": 4.73095315474256e-08, "logits/chosen": -19.372129440307617, "logits/rejected": -18.295793533325195, "logps/chosen": -391.79681396484375, "logps/rejected": -265.39141845703125, "loss": 0.2123, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.012132167816162, "rewards/margins": 2.860495090484619, "rewards/rejected": 1.1516368389129639, "step": 58530 }, { "epoch": 2.7178606249129484, "grad_norm": 45.882164001464844, "learning_rate": 4.723215252952009e-08, "logits/chosen": -19.167926788330078, "logits/rejected": -18.041641235351562, "logps/chosen": -322.968994140625, "logps/rejected": -261.24993896484375, "loss": 0.7078, "rewards/accuracies": 
0.6000000238418579, "rewards/chosen": 2.2637197971343994, "rewards/margins": 1.0286219120025635, "rewards/rejected": 1.2350982427597046, "step": 58540 }, { "epoch": 2.7183248990203817, "grad_norm": 182.8191375732422, "learning_rate": 4.715477351161459e-08, "logits/chosen": -18.52971649169922, "logits/rejected": -17.814218521118164, "logps/chosen": -442.4541931152344, "logps/rejected": -332.6598815917969, "loss": 0.4923, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7005374431610107, "rewards/margins": 1.7626453638076782, "rewards/rejected": 1.9378925561904907, "step": 58550 }, { "epoch": 2.7187891731278144, "grad_norm": 18.08501434326172, "learning_rate": 4.7077394493709087e-08, "logits/chosen": -18.091066360473633, "logits/rejected": -17.90350341796875, "logps/chosen": -383.5677490234375, "logps/rejected": -360.27081298828125, "loss": 0.8788, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.116238832473755, "rewards/margins": 0.5137475728988647, "rewards/rejected": 2.6024909019470215, "step": 58560 }, { "epoch": 2.7192534472352476, "grad_norm": 212.3417205810547, "learning_rate": 4.700001547580358e-08, "logits/chosen": -18.294254302978516, "logits/rejected": -18.4918155670166, "logps/chosen": -425.16168212890625, "logps/rejected": -451.7191467285156, "loss": 0.9855, "rewards/accuracies": 0.5, "rewards/chosen": 2.955871105194092, "rewards/margins": -0.1351514756679535, "rewards/rejected": 3.091022491455078, "step": 58570 }, { "epoch": 2.719717721342681, "grad_norm": 42.07914352416992, "learning_rate": 4.6922636457898076e-08, "logits/chosen": -19.482206344604492, "logits/rejected": -17.720643997192383, "logps/chosen": -358.22723388671875, "logps/rejected": -263.70550537109375, "loss": 0.6864, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.6977272033691406, "rewards/margins": 1.95914626121521, "rewards/rejected": 1.7385810613632202, "step": 58580 }, { "epoch": 2.7201819954501136, "grad_norm": 40.99363327026367, 
"learning_rate": 4.684525743999257e-08, "logits/chosen": -18.907339096069336, "logits/rejected": -17.615385055541992, "logps/chosen": -394.0683288574219, "logps/rejected": -314.21893310546875, "loss": 0.5216, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.2186479568481445, "rewards/margins": 1.6358829736709595, "rewards/rejected": 2.5827646255493164, "step": 58590 }, { "epoch": 2.720646269557547, "grad_norm": 1.1426246166229248, "learning_rate": 4.6767878422087066e-08, "logits/chosen": -18.42300033569336, "logits/rejected": -17.209041595458984, "logps/chosen": -366.20977783203125, "logps/rejected": -248.26437377929688, "loss": 0.3815, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.133592128753662, "rewards/margins": 1.9671642780303955, "rewards/rejected": 1.166427731513977, "step": 58600 }, { "epoch": 2.72111054366498, "grad_norm": 102.69498443603516, "learning_rate": 4.669049940418156e-08, "logits/chosen": -18.9678897857666, "logits/rejected": -19.228057861328125, "logps/chosen": -377.89434814453125, "logps/rejected": -309.87945556640625, "loss": 1.2243, "rewards/accuracies": 0.5, "rewards/chosen": 2.766543388366699, "rewards/margins": 0.1139988899230957, "rewards/rejected": 2.6525444984436035, "step": 58610 }, { "epoch": 2.721574817772413, "grad_norm": 275.8888244628906, "learning_rate": 4.661312038627606e-08, "logits/chosen": -18.742855072021484, "logits/rejected": -18.820266723632812, "logps/chosen": -378.514892578125, "logps/rejected": -444.3848571777344, "loss": 1.1479, "rewards/accuracies": 0.5, "rewards/chosen": 3.0094053745269775, "rewards/margins": -0.5038373470306396, "rewards/rejected": 3.513242721557617, "step": 58620 }, { "epoch": 2.7220390918798456, "grad_norm": 128.34422302246094, "learning_rate": 4.653574136837055e-08, "logits/chosen": -19.305706024169922, "logits/rejected": -19.036060333251953, "logps/chosen": -320.305419921875, "logps/rejected": -299.32977294921875, "loss": 0.6523, "rewards/accuracies": 
0.6000000238418579, "rewards/chosen": 2.8154098987579346, "rewards/margins": 0.5532063245773315, "rewards/rejected": 2.2622036933898926, "step": 58630 }, { "epoch": 2.722503365987279, "grad_norm": 15.68356990814209, "learning_rate": 4.6458362350465045e-08, "logits/chosen": -19.459014892578125, "logits/rejected": -17.853857040405273, "logps/chosen": -381.9283752441406, "logps/rejected": -207.7337646484375, "loss": 0.2575, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.987848997116089, "rewards/margins": 3.3661460876464844, "rewards/rejected": 0.6217026114463806, "step": 58640 }, { "epoch": 2.722967640094712, "grad_norm": 32.771305084228516, "learning_rate": 4.638098333255954e-08, "logits/chosen": -19.529659271240234, "logits/rejected": -18.612464904785156, "logps/chosen": -483.343017578125, "logps/rejected": -364.5062561035156, "loss": 0.4803, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.409201622009277, "rewards/margins": 1.4184354543685913, "rewards/rejected": 2.9907665252685547, "step": 58650 }, { "epoch": 2.723431914202145, "grad_norm": 79.77824401855469, "learning_rate": 4.6303604314654034e-08, "logits/chosen": -20.454296112060547, "logits/rejected": -20.27197265625, "logps/chosen": -343.39593505859375, "logps/rejected": -301.41741943359375, "loss": 0.9393, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.377760887145996, "rewards/margins": 0.030295301228761673, "rewards/rejected": 2.3474655151367188, "step": 58660 }, { "epoch": 2.723896188309578, "grad_norm": 1.3109210729599, "learning_rate": 4.622622529674853e-08, "logits/chosen": -19.34591293334961, "logits/rejected": -18.58619499206543, "logps/chosen": -366.43280029296875, "logps/rejected": -295.16131591796875, "loss": 0.729, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.1184985637664795, "rewards/margins": 1.2231645584106445, "rewards/rejected": 1.8953338861465454, "step": 58670 }, { "epoch": 2.7243604624170112, "grad_norm": 74.91442108154297, 
"learning_rate": 4.614884627884303e-08, "logits/chosen": -18.32900619506836, "logits/rejected": -18.177732467651367, "logps/chosen": -480.5455017089844, "logps/rejected": -437.335693359375, "loss": 0.5552, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.9766457080841064, "rewards/margins": 1.134956955909729, "rewards/rejected": 2.841689109802246, "step": 58680 }, { "epoch": 2.724824736524444, "grad_norm": 0.13749109208583832, "learning_rate": 4.607146726093752e-08, "logits/chosen": -18.980262756347656, "logits/rejected": -18.205551147460938, "logps/chosen": -305.9949645996094, "logps/rejected": -245.5011444091797, "loss": 0.6947, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.495628833770752, "rewards/margins": 2.532151699066162, "rewards/rejected": 0.9634771347045898, "step": 58690 }, { "epoch": 2.725289010631877, "grad_norm": 7.655878067016602, "learning_rate": 4.599408824303202e-08, "logits/chosen": -19.15054702758789, "logits/rejected": -18.51872444152832, "logps/chosen": -302.9328308105469, "logps/rejected": -291.5271301269531, "loss": 0.8933, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.8007919788360596, "rewards/margins": 1.1988850831985474, "rewards/rejected": 1.6019071340560913, "step": 58700 }, { "epoch": 2.72575328473931, "grad_norm": 0.3884521424770355, "learning_rate": 4.591670922512651e-08, "logits/chosen": -18.837858200073242, "logits/rejected": -17.767120361328125, "logps/chosen": -323.55718994140625, "logps/rejected": -219.88461303710938, "loss": 1.1005, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.202622413635254, "rewards/margins": 0.8688480257987976, "rewards/rejected": 1.3337739706039429, "step": 58710 }, { "epoch": 2.726217558846743, "grad_norm": 0.598694384098053, "learning_rate": 4.583933020722101e-08, "logits/chosen": -18.484596252441406, "logits/rejected": -18.050960540771484, "logps/chosen": -332.9026794433594, "logps/rejected": -305.711181640625, "loss": 0.9248, 
"rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2521071434020996, "rewards/margins": 0.9559162259101868, "rewards/rejected": 2.2961907386779785, "step": 58720 }, { "epoch": 2.726681832954176, "grad_norm": 154.6409149169922, "learning_rate": 4.57619511893155e-08, "logits/chosen": -18.763383865356445, "logits/rejected": -18.032527923583984, "logps/chosen": -401.69183349609375, "logps/rejected": -309.873779296875, "loss": 0.3771, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.288647413253784, "rewards/margins": 1.4759823083877563, "rewards/rejected": 1.812665343284607, "step": 58730 }, { "epoch": 2.727146107061609, "grad_norm": 11.319327354431152, "learning_rate": 4.568457217141e-08, "logits/chosen": -18.97635269165039, "logits/rejected": -17.251380920410156, "logps/chosen": -369.9820861816406, "logps/rejected": -191.48609924316406, "loss": 0.2073, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.8560783863067627, "rewards/margins": 2.8790764808654785, "rewards/rejected": -0.02299807034432888, "step": 58740 }, { "epoch": 2.7276103811690424, "grad_norm": 117.65485382080078, "learning_rate": 4.56071931535045e-08, "logits/chosen": -20.34219741821289, "logits/rejected": -19.74795913696289, "logps/chosen": -431.08428955078125, "logps/rejected": -384.51239013671875, "loss": 0.3976, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.288944721221924, "rewards/margins": 0.8292549848556519, "rewards/rejected": 2.4596898555755615, "step": 58750 }, { "epoch": 2.728074655276475, "grad_norm": 52.939796447753906, "learning_rate": 4.552981413559899e-08, "logits/chosen": -18.680309295654297, "logits/rejected": -18.687353134155273, "logps/chosen": -373.11102294921875, "logps/rejected": -381.3067321777344, "loss": 0.5296, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.936370849609375, "rewards/margins": 0.8421497344970703, "rewards/rejected": 3.094221353530884, "step": 58760 }, { "epoch": 2.7285389293839084, "grad_norm": 
2.1186132431030273, "learning_rate": 4.5452435117693486e-08, "logits/chosen": -19.41095542907715, "logits/rejected": -18.51380729675293, "logps/chosen": -374.9515686035156, "logps/rejected": -282.2132873535156, "loss": 0.5069, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.040767192840576, "rewards/margins": 1.9170341491699219, "rewards/rejected": 1.1237331628799438, "step": 58770 }, { "epoch": 2.729003203491341, "grad_norm": 92.62166595458984, "learning_rate": 4.537505609978798e-08, "logits/chosen": -18.914705276489258, "logits/rejected": -18.984729766845703, "logps/chosen": -391.713623046875, "logps/rejected": -378.6802673339844, "loss": 0.9664, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.1383986473083496, "rewards/margins": 0.4606854319572449, "rewards/rejected": 2.67771315574646, "step": 58780 }, { "epoch": 2.7294674775987744, "grad_norm": 102.63768005371094, "learning_rate": 4.529767708188247e-08, "logits/chosen": -19.872323989868164, "logits/rejected": -18.98422622680664, "logps/chosen": -293.8544006347656, "logps/rejected": -263.86846923828125, "loss": 0.6664, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2318501472473145, "rewards/margins": 0.7658685445785522, "rewards/rejected": 2.4659814834594727, "step": 58790 }, { "epoch": 2.729931751706207, "grad_norm": 91.16485595703125, "learning_rate": 4.522029806397697e-08, "logits/chosen": -19.466625213623047, "logits/rejected": -18.259410858154297, "logps/chosen": -518.6326293945312, "logps/rejected": -337.4438781738281, "loss": 0.3077, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.8538336753845215, "rewards/margins": 2.2906007766723633, "rewards/rejected": 2.563232898712158, "step": 58800 }, { "epoch": 2.7303960258136404, "grad_norm": 109.03235626220703, "learning_rate": 4.5142919046071465e-08, "logits/chosen": -19.858280181884766, "logits/rejected": -18.941967010498047, "logps/chosen": -432.91680908203125, "logps/rejected": -371.514892578125, 
"loss": 0.448, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6776771545410156, "rewards/margins": 1.7353944778442383, "rewards/rejected": 1.9422826766967773, "step": 58810 }, { "epoch": 2.7308602999210736, "grad_norm": 75.98391723632812, "learning_rate": 4.5065540028165963e-08, "logits/chosen": -19.85411262512207, "logits/rejected": -19.05904769897461, "logps/chosen": -361.5434875488281, "logps/rejected": -260.89874267578125, "loss": 0.7631, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0760304927825928, "rewards/margins": 0.9511641263961792, "rewards/rejected": 2.124866008758545, "step": 58820 }, { "epoch": 2.7313245740285064, "grad_norm": 103.9148178100586, "learning_rate": 4.4988161010260455e-08, "logits/chosen": -19.671245574951172, "logits/rejected": -19.790132522583008, "logps/chosen": -333.9518127441406, "logps/rejected": -277.74786376953125, "loss": 0.6502, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.6949992179870605, "rewards/margins": 0.9066675305366516, "rewards/rejected": 2.7883315086364746, "step": 58830 }, { "epoch": 2.7317888481359396, "grad_norm": 24.27691650390625, "learning_rate": 4.491078199235495e-08, "logits/chosen": -18.718130111694336, "logits/rejected": -18.07282829284668, "logps/chosen": -312.7641906738281, "logps/rejected": -222.49752807617188, "loss": 0.4057, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.1580986976623535, "rewards/margins": 1.0217000246047974, "rewards/rejected": 1.1363987922668457, "step": 58840 }, { "epoch": 2.7322531222433724, "grad_norm": 0.748274028301239, "learning_rate": 4.4833402974449444e-08, "logits/chosen": -18.65970802307129, "logits/rejected": -17.02057647705078, "logps/chosen": -501.05950927734375, "logps/rejected": -346.34454345703125, "loss": 0.8585, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.925508975982666, "rewards/margins": 2.176123857498169, "rewards/rejected": 1.7493852376937866, "step": 58850 }, { "epoch": 
2.7327173963508056, "grad_norm": 51.04777526855469, "learning_rate": 4.4756023956543936e-08, "logits/chosen": -18.81809425354004, "logits/rejected": -17.964750289916992, "logps/chosen": -459.89373779296875, "logps/rejected": -366.2869873046875, "loss": 0.6263, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.0861871242523193, "rewards/margins": 1.380164623260498, "rewards/rejected": 1.7060226202011108, "step": 58860 }, { "epoch": 2.7331816704582383, "grad_norm": 35.09929275512695, "learning_rate": 4.467864493863844e-08, "logits/chosen": -19.23753547668457, "logits/rejected": -18.519039154052734, "logps/chosen": -440.59112548828125, "logps/rejected": -374.01971435546875, "loss": 0.8598, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0002636909484863, "rewards/margins": 0.5177599191665649, "rewards/rejected": 2.482503652572632, "step": 58870 }, { "epoch": 2.7336459445656716, "grad_norm": 17.80154800415039, "learning_rate": 4.460126592073293e-08, "logits/chosen": -19.274089813232422, "logits/rejected": -18.375473022460938, "logps/chosen": -480.2332458496094, "logps/rejected": -368.41229248046875, "loss": 0.3858, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.71651554107666, "rewards/margins": 2.3251829147338867, "rewards/rejected": 2.3913323879241943, "step": 58880 }, { "epoch": 2.7341102186731048, "grad_norm": 10.899353981018066, "learning_rate": 4.452388690282743e-08, "logits/chosen": -18.30954933166504, "logits/rejected": -17.311260223388672, "logps/chosen": -309.5979919433594, "logps/rejected": -221.6389617919922, "loss": 0.4523, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.338813543319702, "rewards/margins": 1.772834062576294, "rewards/rejected": 0.5659793615341187, "step": 58890 }, { "epoch": 2.7345744927805375, "grad_norm": 21.130626678466797, "learning_rate": 4.444650788492192e-08, "logits/chosen": -19.564559936523438, "logits/rejected": -18.298282623291016, "logps/chosen": -464.91387939453125, 
"logps/rejected": -386.21295166015625, "loss": 0.8565, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.563652515411377, "rewards/margins": 1.3204301595687866, "rewards/rejected": 3.243222713470459, "step": 58900 }, { "epoch": 2.7350387668879708, "grad_norm": 33.57794952392578, "learning_rate": 4.436912886701641e-08, "logits/chosen": -18.742170333862305, "logits/rejected": -18.499156951904297, "logps/chosen": -334.37384033203125, "logps/rejected": -322.26763916015625, "loss": 0.4928, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0233023166656494, "rewards/margins": 1.399207353591919, "rewards/rejected": 1.6240949630737305, "step": 58910 }, { "epoch": 2.7355030409954035, "grad_norm": 44.52644729614258, "learning_rate": 4.429174984911091e-08, "logits/chosen": -18.815574645996094, "logits/rejected": -18.85199737548828, "logps/chosen": -373.5246276855469, "logps/rejected": -373.76446533203125, "loss": 0.7862, "rewards/accuracies": 0.5, "rewards/chosen": 3.0323548316955566, "rewards/margins": 0.6471889615058899, "rewards/rejected": 2.3851656913757324, "step": 58920 }, { "epoch": 2.7359673151028367, "grad_norm": 57.843448638916016, "learning_rate": 4.421437083120541e-08, "logits/chosen": -18.359786987304688, "logits/rejected": -18.565326690673828, "logps/chosen": -325.6714172363281, "logps/rejected": -350.1630554199219, "loss": 0.6124, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.3196942806243896, "rewards/margins": 0.9033111333847046, "rewards/rejected": 1.416383147239685, "step": 58930 }, { "epoch": 2.7364315892102695, "grad_norm": 47.174678802490234, "learning_rate": 4.413699181329991e-08, "logits/chosen": -20.089069366455078, "logits/rejected": -19.4603328704834, "logps/chosen": -419.72125244140625, "logps/rejected": -479.12786865234375, "loss": 0.8779, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 5.318645000457764, "rewards/margins": 0.7147426009178162, "rewards/rejected": 4.603902816772461, "step": 58940 }, 
{ "epoch": 2.7368958633177027, "grad_norm": 58.397972106933594, "learning_rate": 4.40596127953944e-08, "logits/chosen": -19.578393936157227, "logits/rejected": -18.7275447845459, "logps/chosen": -348.7649841308594, "logps/rejected": -279.4207458496094, "loss": 0.4871, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.385042667388916, "rewards/margins": 1.436554193496704, "rewards/rejected": 2.9484879970550537, "step": 58950 }, { "epoch": 2.737360137425136, "grad_norm": 101.5904769897461, "learning_rate": 4.3982233777488896e-08, "logits/chosen": -19.276927947998047, "logits/rejected": -18.244243621826172, "logps/chosen": -417.4803161621094, "logps/rejected": -337.2727966308594, "loss": 1.1476, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.8919239044189453, "rewards/margins": 1.028658151626587, "rewards/rejected": 2.8632659912109375, "step": 58960 }, { "epoch": 2.7378244115325687, "grad_norm": 29.788959503173828, "learning_rate": 4.390485475958339e-08, "logits/chosen": -19.591686248779297, "logits/rejected": -19.210437774658203, "logps/chosen": -474.71240234375, "logps/rejected": -365.03704833984375, "loss": 0.2812, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.638236045837402, "rewards/margins": 2.025782823562622, "rewards/rejected": 2.612452983856201, "step": 58970 }, { "epoch": 2.738288685640002, "grad_norm": 10.22471809387207, "learning_rate": 4.382747574167788e-08, "logits/chosen": -19.80010223388672, "logits/rejected": -19.737586975097656, "logps/chosen": -323.2929382324219, "logps/rejected": -318.36383056640625, "loss": 1.0754, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0554282665252686, "rewards/margins": 0.9656036496162415, "rewards/rejected": 2.089824676513672, "step": 58980 }, { "epoch": 2.738752959747435, "grad_norm": 181.75360107421875, "learning_rate": 4.3750096723772384e-08, "logits/chosen": -19.43191909790039, "logits/rejected": -18.218719482421875, "logps/chosen": -376.84368896484375, 
"logps/rejected": -319.4272766113281, "loss": 0.5056, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.893062114715576, "rewards/margins": 1.6621391773223877, "rewards/rejected": 1.2309232950210571, "step": 58990 }, { "epoch": 2.739217233854868, "grad_norm": 25.923194885253906, "learning_rate": 4.3672717705866875e-08, "logits/chosen": -19.471105575561523, "logits/rejected": -18.968164443969727, "logps/chosen": -403.8594055175781, "logps/rejected": -393.0386657714844, "loss": 0.6742, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.989307403564453, "rewards/margins": 0.8896082043647766, "rewards/rejected": 3.0996994972229004, "step": 59000 }, { "epoch": 2.7396815079623007, "grad_norm": 20.77415657043457, "learning_rate": 4.3595338687961374e-08, "logits/chosen": -19.37858772277832, "logits/rejected": -19.156774520874023, "logps/chosen": -414.7088317871094, "logps/rejected": -394.4284362792969, "loss": 0.6733, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.310981273651123, "rewards/margins": 0.25673753023147583, "rewards/rejected": 3.05424427986145, "step": 59010 }, { "epoch": 2.740145782069734, "grad_norm": 135.59207153320312, "learning_rate": 4.3517959670055865e-08, "logits/chosen": -18.78415870666504, "logits/rejected": -18.40920639038086, "logps/chosen": -477.1787109375, "logps/rejected": -378.2183837890625, "loss": 0.7798, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.341464996337891, "rewards/margins": 1.006378173828125, "rewards/rejected": 3.3350863456726074, "step": 59020 }, { "epoch": 2.740610056177167, "grad_norm": 186.42100524902344, "learning_rate": 4.3440580652150356e-08, "logits/chosen": -18.35842514038086, "logits/rejected": -18.148509979248047, "logps/chosen": -331.91168212890625, "logps/rejected": -276.23565673828125, "loss": 0.78, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.6211154460906982, "rewards/margins": 0.734709620475769, "rewards/rejected": 1.8864057064056396, "step": 
59030 }, { "epoch": 2.7410743302846, "grad_norm": 36.35850524902344, "learning_rate": 4.3363201634244854e-08, "logits/chosen": -18.503841400146484, "logits/rejected": -17.748371124267578, "logps/chosen": -326.4063415527344, "logps/rejected": -244.95620727539062, "loss": 0.5192, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.6652660369873047, "rewards/margins": 1.3411895036697388, "rewards/rejected": 1.3240764141082764, "step": 59040 }, { "epoch": 2.741538604392033, "grad_norm": 21.484249114990234, "learning_rate": 4.328582261633935e-08, "logits/chosen": -18.76361656188965, "logits/rejected": -18.10202407836914, "logps/chosen": -379.92303466796875, "logps/rejected": -318.9097900390625, "loss": 0.6923, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.755042552947998, "rewards/margins": 1.2808477878570557, "rewards/rejected": 2.4741950035095215, "step": 59050 }, { "epoch": 2.7420028784994663, "grad_norm": 3.8509838581085205, "learning_rate": 4.320844359843385e-08, "logits/chosen": -20.155988693237305, "logits/rejected": -18.903194427490234, "logps/chosen": -372.76031494140625, "logps/rejected": -298.1728210449219, "loss": 0.4127, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.204662799835205, "rewards/margins": 1.3696260452270508, "rewards/rejected": 1.8350368738174438, "step": 59060 }, { "epoch": 2.742467152606899, "grad_norm": 124.73009490966797, "learning_rate": 4.313106458052834e-08, "logits/chosen": -18.766902923583984, "logits/rejected": -17.45070457458496, "logps/chosen": -257.7754821777344, "logps/rejected": -198.75814819335938, "loss": 0.4708, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.8029091358184814, "rewards/margins": 1.2795140743255615, "rewards/rejected": 0.5233949422836304, "step": 59070 }, { "epoch": 2.742931426714332, "grad_norm": 1.4708222150802612, "learning_rate": 4.3053685562622833e-08, "logits/chosen": -20.634279251098633, "logits/rejected": -18.818511962890625, "logps/chosen": 
-409.75274658203125, "logps/rejected": -262.2252502441406, "loss": 0.2276, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.032158851623535, "rewards/margins": 2.91377854347229, "rewards/rejected": 2.118380069732666, "step": 59080 }, { "epoch": 2.743395700821765, "grad_norm": 88.777099609375, "learning_rate": 4.297630654471733e-08, "logits/chosen": -19.441936492919922, "logits/rejected": -19.041961669921875, "logps/chosen": -416.2848205566406, "logps/rejected": -362.9179992675781, "loss": 1.0898, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.2611072063446045, "rewards/margins": -0.3036009669303894, "rewards/rejected": 3.5647082328796387, "step": 59090 }, { "epoch": 2.7438599749291983, "grad_norm": 0.08599071949720383, "learning_rate": 4.289892752681182e-08, "logits/chosen": -19.79216766357422, "logits/rejected": -18.772001266479492, "logps/chosen": -365.7276916503906, "logps/rejected": -266.23223876953125, "loss": 0.5913, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7479310035705566, "rewards/margins": 1.8896995782852173, "rewards/rejected": 1.858231782913208, "step": 59100 }, { "epoch": 2.744324249036631, "grad_norm": 14.154829025268555, "learning_rate": 4.282154850890633e-08, "logits/chosen": -18.6575870513916, "logits/rejected": -18.172847747802734, "logps/chosen": -436.12689208984375, "logps/rejected": -341.94512939453125, "loss": 0.3091, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.9880690574645996, "rewards/margins": 1.7668921947479248, "rewards/rejected": 2.221176862716675, "step": 59110 }, { "epoch": 2.7447885231440643, "grad_norm": 205.20822143554688, "learning_rate": 4.274416949100082e-08, "logits/chosen": -20.355236053466797, "logits/rejected": -20.01193618774414, "logps/chosen": -420.48388671875, "logps/rejected": -359.49017333984375, "loss": 0.5935, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.050615310668945, "rewards/margins": 1.0830926895141602, "rewards/rejected": 
2.967522621154785, "step": 59120 }, { "epoch": 2.7452527972514975, "grad_norm": 87.52129364013672, "learning_rate": 4.266679047309532e-08, "logits/chosen": -19.62192726135254, "logits/rejected": -18.775766372680664, "logps/chosen": -351.4476318359375, "logps/rejected": -303.49407958984375, "loss": 0.9433, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.5778872966766357, "rewards/margins": 0.49246659874916077, "rewards/rejected": 2.085420846939087, "step": 59130 }, { "epoch": 2.7457170713589303, "grad_norm": 160.58900451660156, "learning_rate": 4.258941145518981e-08, "logits/chosen": -19.71139907836914, "logits/rejected": -19.062969207763672, "logps/chosen": -452.2828674316406, "logps/rejected": -336.69329833984375, "loss": 0.4811, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.3928399085998535, "rewards/margins": 1.137441635131836, "rewards/rejected": 2.2553982734680176, "step": 59140 }, { "epoch": 2.7461813454663635, "grad_norm": 70.90331268310547, "learning_rate": 4.25120324372843e-08, "logits/chosen": -18.759817123413086, "logits/rejected": -18.728723526000977, "logps/chosen": -451.1288146972656, "logps/rejected": -463.1190490722656, "loss": 0.9, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.93900990486145, "rewards/margins": 0.4084312319755554, "rewards/rejected": 3.53057861328125, "step": 59150 }, { "epoch": 2.7466456195737963, "grad_norm": 67.21403503417969, "learning_rate": 4.24346534193788e-08, "logits/chosen": -18.934276580810547, "logits/rejected": -18.118770599365234, "logps/chosen": -401.51226806640625, "logps/rejected": -318.46539306640625, "loss": 0.811, "rewards/accuracies": 0.5, "rewards/chosen": 3.9768593311309814, "rewards/margins": 1.2796767950057983, "rewards/rejected": 2.6971821784973145, "step": 59160 }, { "epoch": 2.7471098936812295, "grad_norm": 182.67605590820312, "learning_rate": 4.2357274401473296e-08, "logits/chosen": -18.734882354736328, "logits/rejected": -18.36783218383789, "logps/chosen": 
-338.87066650390625, "logps/rejected": -285.97894287109375, "loss": 1.6711, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.961145043373108, "rewards/margins": -0.6267573237419128, "rewards/rejected": 2.587902545928955, "step": 59170 }, { "epoch": 2.7475741677886623, "grad_norm": 29.480195999145508, "learning_rate": 4.2279895383567794e-08, "logits/chosen": -20.008285522460938, "logits/rejected": -19.038562774658203, "logps/chosen": -422.1407775878906, "logps/rejected": -311.81744384765625, "loss": 0.8411, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.7657241821289062, "rewards/margins": 1.0591812133789062, "rewards/rejected": 2.706543207168579, "step": 59180 }, { "epoch": 2.7480384418960955, "grad_norm": 37.77796936035156, "learning_rate": 4.2202516365662286e-08, "logits/chosen": -19.98259735107422, "logits/rejected": -19.585948944091797, "logps/chosen": -363.9250793457031, "logps/rejected": -345.7076721191406, "loss": 0.8159, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.9548897743225098, "rewards/margins": 1.1396632194519043, "rewards/rejected": 2.8152260780334473, "step": 59190 }, { "epoch": 2.7485027160035287, "grad_norm": 181.3603515625, "learning_rate": 4.212513734775678e-08, "logits/chosen": -19.37662124633789, "logits/rejected": -18.463558197021484, "logps/chosen": -339.5971374511719, "logps/rejected": -319.96429443359375, "loss": 0.5195, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.337993621826172, "rewards/margins": 1.644561767578125, "rewards/rejected": 1.6934322118759155, "step": 59200 }, { "epoch": 2.7489669901109615, "grad_norm": 95.57971954345703, "learning_rate": 4.2047758329851275e-08, "logits/chosen": -18.51169204711914, "logits/rejected": -18.853496551513672, "logps/chosen": -292.0693359375, "logps/rejected": -326.74603271484375, "loss": 2.0436, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.9736721515655518, "rewards/margins": -1.3573087453842163, "rewards/rejected": 
3.3309810161590576, "step": 59210 }, { "epoch": 2.7494312642183947, "grad_norm": 3.963315725326538, "learning_rate": 4.1970379311945767e-08, "logits/chosen": -19.160751342773438, "logits/rejected": -18.972667694091797, "logps/chosen": -377.9804382324219, "logps/rejected": -368.9649353027344, "loss": 0.8828, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.092952251434326, "rewards/margins": 1.3659671545028687, "rewards/rejected": 2.726985454559326, "step": 59220 }, { "epoch": 2.7498955383258274, "grad_norm": 49.99829864501953, "learning_rate": 4.189300029404027e-08, "logits/chosen": -18.734668731689453, "logits/rejected": -17.67426109313965, "logps/chosen": -400.6256408691406, "logps/rejected": -311.08221435546875, "loss": 0.7627, "rewards/accuracies": 0.5, "rewards/chosen": 3.1286940574645996, "rewards/margins": 0.9533838033676147, "rewards/rejected": 2.1753106117248535, "step": 59230 }, { "epoch": 2.7503598124332607, "grad_norm": 189.76173400878906, "learning_rate": 4.181562127613476e-08, "logits/chosen": -18.58124351501465, "logits/rejected": -17.798782348632812, "logps/chosen": -392.6538391113281, "logps/rejected": -302.9228820800781, "loss": 0.833, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.69315242767334, "rewards/margins": 1.0493972301483154, "rewards/rejected": 1.6437549591064453, "step": 59240 }, { "epoch": 2.7508240865406934, "grad_norm": 23.40775489807129, "learning_rate": 4.1738242258229254e-08, "logits/chosen": -18.8942928314209, "logits/rejected": -18.269079208374023, "logps/chosen": -457.1009826660156, "logps/rejected": -382.55755615234375, "loss": 1.0021, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.7699317932128906, "rewards/margins": 0.8351038694381714, "rewards/rejected": 2.9348275661468506, "step": 59250 }, { "epoch": 2.7512883606481267, "grad_norm": 47.69805145263672, "learning_rate": 4.166086324032375e-08, "logits/chosen": -19.015209197998047, "logits/rejected": -18.81902313232422, "logps/chosen": 
-394.3583679199219, "logps/rejected": -330.6649475097656, "loss": 0.28, "rewards/accuracies": 1.0, "rewards/chosen": 3.7681808471679688, "rewards/margins": 1.5445502996444702, "rewards/rejected": 2.223630428314209, "step": 59260 }, { "epoch": 2.75175263475556, "grad_norm": 20.41316032409668, "learning_rate": 4.1583484222418244e-08, "logits/chosen": -18.83561134338379, "logits/rejected": -17.762163162231445, "logps/chosen": -241.99032592773438, "logps/rejected": -255.7477264404297, "loss": 1.1584, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.066664218902588, "rewards/margins": 0.7483023405075073, "rewards/rejected": 1.3183619976043701, "step": 59270 }, { "epoch": 2.7522169088629926, "grad_norm": 145.16891479492188, "learning_rate": 4.150610520451274e-08, "logits/chosen": -18.68138885498047, "logits/rejected": -18.168994903564453, "logps/chosen": -354.3319091796875, "logps/rejected": -324.10498046875, "loss": 0.7663, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.1926217079162598, "rewards/margins": 1.234148621559143, "rewards/rejected": 1.9584732055664062, "step": 59280 }, { "epoch": 2.752681182970426, "grad_norm": 67.20758819580078, "learning_rate": 4.142872618660724e-08, "logits/chosen": -19.030019760131836, "logits/rejected": -18.185352325439453, "logps/chosen": -355.51275634765625, "logps/rejected": -328.80804443359375, "loss": 1.4668, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.1684277057647705, "rewards/margins": -0.3274306356906891, "rewards/rejected": 2.4958581924438477, "step": 59290 }, { "epoch": 2.7531454570778586, "grad_norm": 59.18201446533203, "learning_rate": 4.135134716870174e-08, "logits/chosen": -20.032743453979492, "logits/rejected": -19.47813606262207, "logps/chosen": -440.98223876953125, "logps/rejected": -363.31396484375, "loss": 0.7584, "rewards/accuracies": 0.5, "rewards/chosen": 4.038945198059082, "rewards/margins": 0.7118522524833679, "rewards/rejected": 3.3270938396453857, "step": 59300 }, { 
"epoch": 2.753609731185292, "grad_norm": 1.532355546951294, "learning_rate": 4.127396815079623e-08, "logits/chosen": -18.862192153930664, "logits/rejected": -18.18439292907715, "logps/chosen": -349.5970764160156, "logps/rejected": -262.77142333984375, "loss": 0.7322, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.5559396743774414, "rewards/margins": 0.9994109869003296, "rewards/rejected": 1.5565288066864014, "step": 59310 }, { "epoch": 2.7540740052927246, "grad_norm": 139.56109619140625, "learning_rate": 4.119658913289072e-08, "logits/chosen": -19.570068359375, "logits/rejected": -19.118391036987305, "logps/chosen": -346.7214050292969, "logps/rejected": -358.82855224609375, "loss": 1.0239, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.806631088256836, "rewards/margins": 1.3066927194595337, "rewards/rejected": 2.499938488006592, "step": 59320 }, { "epoch": 2.754538279400158, "grad_norm": 0.7236853837966919, "learning_rate": 4.111921011498522e-08, "logits/chosen": -17.68362045288086, "logits/rejected": -17.820533752441406, "logps/chosen": -266.75262451171875, "logps/rejected": -324.25274658203125, "loss": 1.4983, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.6746660470962524, "rewards/margins": 0.5801804661750793, "rewards/rejected": 1.0944856405258179, "step": 59330 }, { "epoch": 2.755002553507591, "grad_norm": 12.711581230163574, "learning_rate": 4.104183109707971e-08, "logits/chosen": -20.231048583984375, "logits/rejected": -19.381938934326172, "logps/chosen": -374.58367919921875, "logps/rejected": -259.0543212890625, "loss": 0.4257, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.009680271148682, "rewards/margins": 1.2548938989639282, "rewards/rejected": 2.754786252975464, "step": 59340 }, { "epoch": 2.755466827615024, "grad_norm": 1.3085156679153442, "learning_rate": 4.0964452079174215e-08, "logits/chosen": -18.988208770751953, "logits/rejected": -17.55759048461914, "logps/chosen": -366.53387451171875, 
"logps/rejected": -297.6141357421875, "loss": 0.3119, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.4228146076202393, "rewards/margins": 2.4276108741760254, "rewards/rejected": 0.9952036738395691, "step": 59350 }, { "epoch": 2.755931101722457, "grad_norm": 39.62864685058594, "learning_rate": 4.0887073061268706e-08, "logits/chosen": -18.1268310546875, "logits/rejected": -17.998897552490234, "logps/chosen": -351.9866943359375, "logps/rejected": -353.43218994140625, "loss": 1.1137, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.4797921180725098, "rewards/margins": 0.1696525365114212, "rewards/rejected": 2.3101396560668945, "step": 59360 }, { "epoch": 2.75639537582989, "grad_norm": 43.5434684753418, "learning_rate": 4.08096940433632e-08, "logits/chosen": -18.644750595092773, "logits/rejected": -18.363569259643555, "logps/chosen": -405.1190185546875, "logps/rejected": -341.93218994140625, "loss": 0.4402, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.203568458557129, "rewards/margins": 1.1974457502365112, "rewards/rejected": 3.006122350692749, "step": 59370 }, { "epoch": 2.756859649937323, "grad_norm": 28.128841400146484, "learning_rate": 4.0732315025457696e-08, "logits/chosen": -19.23787498474121, "logits/rejected": -18.668235778808594, "logps/chosen": -409.52911376953125, "logps/rejected": -365.2546691894531, "loss": 1.3096, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.1395530700683594, "rewards/margins": 0.18691331148147583, "rewards/rejected": 2.9526398181915283, "step": 59380 }, { "epoch": 2.757323924044756, "grad_norm": 229.0045928955078, "learning_rate": 4.065493600755219e-08, "logits/chosen": -18.326332092285156, "logits/rejected": -18.836488723754883, "logps/chosen": -318.1676025390625, "logps/rejected": -269.5885925292969, "loss": 1.014, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.233226776123047, "rewards/margins": 0.7031009197235107, "rewards/rejected": 1.5301259756088257, 
"step": 59390 }, { "epoch": 2.757788198152189, "grad_norm": 16.17303466796875, "learning_rate": 4.0577556989646685e-08, "logits/chosen": -19.65463638305664, "logits/rejected": -18.473907470703125, "logps/chosen": -383.2078552246094, "logps/rejected": -287.12591552734375, "loss": 0.3113, "rewards/accuracies": 1.0, "rewards/chosen": 3.8179306983947754, "rewards/margins": 2.111433267593384, "rewards/rejected": 1.7064975500106812, "step": 59400 }, { "epoch": 2.7582524722596222, "grad_norm": 5.625439643859863, "learning_rate": 4.0500177971741183e-08, "logits/chosen": -18.962482452392578, "logits/rejected": -18.011287689208984, "logps/chosen": -362.89666748046875, "logps/rejected": -323.76251220703125, "loss": 0.7606, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.710465431213379, "rewards/margins": 0.9687157869338989, "rewards/rejected": 1.7417500019073486, "step": 59410 }, { "epoch": 2.758716746367055, "grad_norm": 133.0155792236328, "learning_rate": 4.0422798953835675e-08, "logits/chosen": -19.118854522705078, "logits/rejected": -17.715679168701172, "logps/chosen": -399.91229248046875, "logps/rejected": -311.55908203125, "loss": 0.6218, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.073094367980957, "rewards/margins": 1.8479411602020264, "rewards/rejected": 1.2251532077789307, "step": 59420 }, { "epoch": 2.759181020474488, "grad_norm": 37.25522232055664, "learning_rate": 4.034541993593017e-08, "logits/chosen": -18.968280792236328, "logits/rejected": -18.953706741333008, "logps/chosen": -430.9818420410156, "logps/rejected": -353.2046813964844, "loss": 0.5193, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.7387032508850098, "rewards/margins": 1.4018510580062866, "rewards/rejected": 2.3368518352508545, "step": 59430 }, { "epoch": 2.7596452945819214, "grad_norm": 80.23554229736328, "learning_rate": 4.0268040918024664e-08, "logits/chosen": -18.471813201904297, "logits/rejected": -17.871925354003906, "logps/chosen": 
-394.7131652832031, "logps/rejected": -392.2359313964844, "loss": 0.7805, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.5003302097320557, "rewards/margins": 0.7039092183113098, "rewards/rejected": 1.7964208126068115, "step": 59440 }, { "epoch": 2.760109568689354, "grad_norm": 15.222352981567383, "learning_rate": 4.019066190011916e-08, "logits/chosen": -19.974918365478516, "logits/rejected": -18.36743927001953, "logps/chosen": -325.19793701171875, "logps/rejected": -245.1110382080078, "loss": 0.7967, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1357991695404053, "rewards/margins": 1.8772363662719727, "rewards/rejected": 1.2585628032684326, "step": 59450 }, { "epoch": 2.760573842796787, "grad_norm": 59.03990173339844, "learning_rate": 4.0113282882213654e-08, "logits/chosen": -19.29168701171875, "logits/rejected": -19.065731048583984, "logps/chosen": -292.03143310546875, "logps/rejected": -310.63433837890625, "loss": 0.6032, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.8680076599121094, "rewards/margins": 0.7809674143791199, "rewards/rejected": 2.0870401859283447, "step": 59460 }, { "epoch": 2.76103811690422, "grad_norm": 204.07928466796875, "learning_rate": 4.003590386430816e-08, "logits/chosen": -18.75720977783203, "logits/rejected": -18.954816818237305, "logps/chosen": -311.2066650390625, "logps/rejected": -349.093505859375, "loss": 1.5751, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": 1.6916929483413696, "rewards/margins": -1.0601911544799805, "rewards/rejected": 2.7518842220306396, "step": 59470 }, { "epoch": 2.7615023910116534, "grad_norm": 72.1351089477539, "learning_rate": 3.995852484640265e-08, "logits/chosen": -19.562747955322266, "logits/rejected": -18.435012817382812, "logps/chosen": -398.00714111328125, "logps/rejected": -274.45538330078125, "loss": 0.425, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.075438976287842, "rewards/margins": 1.6400763988494873, "rewards/rejected": 
2.4353628158569336, "step": 59480 }, { "epoch": 2.761966665119086, "grad_norm": 164.35397338867188, "learning_rate": 3.988114582849714e-08, "logits/chosen": -19.252132415771484, "logits/rejected": -17.330434799194336, "logps/chosen": -366.0439453125, "logps/rejected": -250.8982391357422, "loss": 0.6021, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.733940601348877, "rewards/margins": 1.9338858127593994, "rewards/rejected": 0.8000551462173462, "step": 59490 }, { "epoch": 2.7624309392265194, "grad_norm": 0.00372777390293777, "learning_rate": 3.981150471238219e-08, "logits/chosen": -19.280838012695312, "logits/rejected": -18.95650863647461, "logps/chosen": -378.39923095703125, "logps/rejected": -335.5484924316406, "loss": 0.7518, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.479646682739258, "rewards/margins": 1.300410509109497, "rewards/rejected": 2.17923641204834, "step": 59500 }, { "epoch": 2.7628952133339526, "grad_norm": 131.1044921875, "learning_rate": 3.973412569447668e-08, "logits/chosen": -19.29061508178711, "logits/rejected": -19.0426082611084, "logps/chosen": -310.433837890625, "logps/rejected": -358.8728942871094, "loss": 1.462, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.5949819087982178, "rewards/margins": 0.2510111331939697, "rewards/rejected": 2.343970775604248, "step": 59510 }, { "epoch": 2.7633594874413854, "grad_norm": 3.5169785022735596, "learning_rate": 3.965674667657118e-08, "logits/chosen": -18.825695037841797, "logits/rejected": -18.777999877929688, "logps/chosen": -394.8525695800781, "logps/rejected": -343.05364990234375, "loss": 0.5246, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.250584840774536, "rewards/margins": 1.0873594284057617, "rewards/rejected": 2.163224935531616, "step": 59520 }, { "epoch": 2.763823761548818, "grad_norm": 41.851600646972656, "learning_rate": 3.957936765866567e-08, "logits/chosen": -18.785663604736328, "logits/rejected": -17.574777603149414, 
"logps/chosen": -422.57421875, "logps/rejected": -341.2525329589844, "loss": 0.8229, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.6931803226470947, "rewards/margins": 1.1254392862319946, "rewards/rejected": 1.5677410364151, "step": 59530 }, { "epoch": 2.7642880356562514, "grad_norm": 219.68524169921875, "learning_rate": 3.950198864076017e-08, "logits/chosen": -19.605823516845703, "logits/rejected": -18.940784454345703, "logps/chosen": -350.0062561035156, "logps/rejected": -325.0898132324219, "loss": 0.7917, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.573737382888794, "rewards/margins": 2.1564223766326904, "rewards/rejected": 1.4173154830932617, "step": 59540 }, { "epoch": 2.7647523097636846, "grad_norm": 112.0606460571289, "learning_rate": 3.942460962285467e-08, "logits/chosen": -18.36541748046875, "logits/rejected": -18.23992919921875, "logps/chosen": -374.9885559082031, "logps/rejected": -330.59686279296875, "loss": 1.7121, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.5570013523101807, "rewards/margins": -0.30136001110076904, "rewards/rejected": 2.85836124420166, "step": 59550 }, { "epoch": 2.7652165838711174, "grad_norm": 49.616363525390625, "learning_rate": 3.934723060494916e-08, "logits/chosen": -19.133325576782227, "logits/rejected": -18.484642028808594, "logps/chosen": -382.5107727050781, "logps/rejected": -331.75872802734375, "loss": 0.2657, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.559431791305542, "rewards/margins": 1.7119970321655273, "rewards/rejected": 1.847434639930725, "step": 59560 }, { "epoch": 2.7656808579785506, "grad_norm": 1.456366777420044, "learning_rate": 3.926985158704366e-08, "logits/chosen": -18.616376876831055, "logits/rejected": -18.06080436706543, "logps/chosen": -253.63070678710938, "logps/rejected": -278.73175048828125, "loss": 1.0029, "rewards/accuracies": 0.5, "rewards/chosen": 2.8525166511535645, "rewards/margins": 0.8308089375495911, "rewards/rejected": 
2.021707534790039, "step": 59570 }, { "epoch": 2.766145132085984, "grad_norm": 116.22541046142578, "learning_rate": 3.919247256913815e-08, "logits/chosen": -19.802425384521484, "logits/rejected": -18.52791404724121, "logps/chosen": -490.3999938964844, "logps/rejected": -383.28692626953125, "loss": 0.2539, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.853188991546631, "rewards/margins": 2.142479419708252, "rewards/rejected": 2.7107093334198, "step": 59580 }, { "epoch": 2.7666094061934166, "grad_norm": 0.6380792260169983, "learning_rate": 3.911509355123265e-08, "logits/chosen": -18.75936508178711, "logits/rejected": -18.89596176147461, "logps/chosen": -519.9276123046875, "logps/rejected": -447.413330078125, "loss": 0.8678, "rewards/accuracies": 0.5, "rewards/chosen": 4.453635215759277, "rewards/margins": 0.6986796259880066, "rewards/rejected": 3.754955291748047, "step": 59590 }, { "epoch": 2.7670736803008498, "grad_norm": 157.9362335205078, "learning_rate": 3.903771453332714e-08, "logits/chosen": -18.78668975830078, "logits/rejected": -17.712528228759766, "logps/chosen": -432.7911071777344, "logps/rejected": -319.48095703125, "loss": 0.6677, "rewards/accuracies": 0.5, "rewards/chosen": 3.435824155807495, "rewards/margins": 1.082519769668579, "rewards/rejected": 2.353303909301758, "step": 59600 }, { "epoch": 2.7675379544082825, "grad_norm": 60.623939514160156, "learning_rate": 3.8960335515421637e-08, "logits/chosen": -19.90032386779785, "logits/rejected": -19.045255661010742, "logps/chosen": -409.8990173339844, "logps/rejected": -364.81805419921875, "loss": 0.4067, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.0114545822143555, "rewards/margins": 1.441878080368042, "rewards/rejected": 2.5695767402648926, "step": 59610 }, { "epoch": 2.7680022285157158, "grad_norm": 119.15081787109375, "learning_rate": 3.8882956497516135e-08, "logits/chosen": -19.35580062866211, "logits/rejected": -18.049266815185547, "logps/chosen": -530.62744140625, 
"logps/rejected": -373.87933349609375, "loss": 0.2647, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.912642002105713, "rewards/margins": 1.912530541419983, "rewards/rejected": 3.0001113414764404, "step": 59620 }, { "epoch": 2.7684665026231485, "grad_norm": 174.5307159423828, "learning_rate": 3.8805577479610626e-08, "logits/chosen": -18.93509292602539, "logits/rejected": -18.340618133544922, "logps/chosen": -416.8282165527344, "logps/rejected": -416.22662353515625, "loss": 0.7021, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.9750380516052246, "rewards/margins": 0.9669851064682007, "rewards/rejected": 3.0080530643463135, "step": 59630 }, { "epoch": 2.7689307767305817, "grad_norm": 38.237247467041016, "learning_rate": 3.8728198461705124e-08, "logits/chosen": -18.939529418945312, "logits/rejected": -18.676774978637695, "logps/chosen": -408.20196533203125, "logps/rejected": -352.41363525390625, "loss": 0.6102, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.7515978813171387, "rewards/margins": 1.5742905139923096, "rewards/rejected": 2.177307605743408, "step": 59640 }, { "epoch": 2.769395050838015, "grad_norm": 0.8204280138015747, "learning_rate": 3.8650819443799616e-08, "logits/chosen": -19.830690383911133, "logits/rejected": -18.61347770690918, "logps/chosen": -463.0869140625, "logps/rejected": -406.5577087402344, "loss": 0.6356, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.279409885406494, "rewards/margins": 1.7634309530258179, "rewards/rejected": 3.5159783363342285, "step": 59650 }, { "epoch": 2.7698593249454477, "grad_norm": 7.564657211303711, "learning_rate": 3.8573440425894114e-08, "logits/chosen": -18.81597328186035, "logits/rejected": -18.38082504272461, "logps/chosen": -353.8319396972656, "logps/rejected": -323.43475341796875, "loss": 0.7325, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.349243640899658, "rewards/margins": 0.6865351796150208, "rewards/rejected": 1.6627082824707031, 
"step": 59660 }, { "epoch": 2.770323599052881, "grad_norm": 120.72943115234375, "learning_rate": 3.849606140798861e-08, "logits/chosen": -18.976945877075195, "logits/rejected": -18.18242835998535, "logps/chosen": -402.2425231933594, "logps/rejected": -343.36187744140625, "loss": 0.7187, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.6956753730773926, "rewards/margins": 1.6369167566299438, "rewards/rejected": 2.05875825881958, "step": 59670 }, { "epoch": 2.7707878731603137, "grad_norm": 25.20392417907715, "learning_rate": 3.84186823900831e-08, "logits/chosen": -19.229930877685547, "logits/rejected": -18.324485778808594, "logps/chosen": -352.74420166015625, "logps/rejected": -342.8168029785156, "loss": 0.6523, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.287813663482666, "rewards/margins": 1.3499904870986938, "rewards/rejected": 1.9378232955932617, "step": 59680 }, { "epoch": 2.771252147267747, "grad_norm": 37.32230758666992, "learning_rate": 3.83413033721776e-08, "logits/chosen": -20.007633209228516, "logits/rejected": -19.11355209350586, "logps/chosen": -300.65972900390625, "logps/rejected": -256.03192138671875, "loss": 1.0495, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2967567443847656, "rewards/margins": 1.5198538303375244, "rewards/rejected": 1.7769027948379517, "step": 59690 }, { "epoch": 2.7717164213751797, "grad_norm": 156.9017333984375, "learning_rate": 3.826392435427209e-08, "logits/chosen": -18.84385108947754, "logits/rejected": -18.716747283935547, "logps/chosen": -391.5856018066406, "logps/rejected": -319.47137451171875, "loss": 1.1051, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.0412092208862305, "rewards/margins": 1.479952096939087, "rewards/rejected": 2.5612564086914062, "step": 59700 }, { "epoch": 2.772180695482613, "grad_norm": 0.5275711417198181, "learning_rate": 3.818654533636659e-08, "logits/chosen": -19.37356185913086, "logits/rejected": -17.80239486694336, "logps/chosen": 
-424.9278869628906, "logps/rejected": -317.68896484375, "loss": 0.7639, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.6193904876708984, "rewards/margins": 1.9345117807388306, "rewards/rejected": 1.6848785877227783, "step": 59710 }, { "epoch": 2.772644969590046, "grad_norm": 132.70765686035156, "learning_rate": 3.810916631846108e-08, "logits/chosen": -19.773391723632812, "logits/rejected": -19.360652923583984, "logps/chosen": -392.9937744140625, "logps/rejected": -393.0665588378906, "loss": 0.773, "rewards/accuracies": 0.5, "rewards/chosen": 3.517627239227295, "rewards/margins": 0.1358756721019745, "rewards/rejected": 3.3817520141601562, "step": 59720 }, { "epoch": 2.773109243697479, "grad_norm": 231.02822875976562, "learning_rate": 3.8031787300555574e-08, "logits/chosen": -18.664724349975586, "logits/rejected": -19.206161499023438, "logps/chosen": -433.2044982910156, "logps/rejected": -367.50433349609375, "loss": 1.0473, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2790908813476562, "rewards/margins": 0.49807628989219666, "rewards/rejected": 2.7810144424438477, "step": 59730 }, { "epoch": 2.773573517804912, "grad_norm": 102.1248779296875, "learning_rate": 3.795440828265008e-08, "logits/chosen": -20.035043716430664, "logits/rejected": -19.73187828063965, "logps/chosen": -453.09112548828125, "logps/rejected": -424.13104248046875, "loss": 0.711, "rewards/accuracies": 0.5, "rewards/chosen": 3.901158094406128, "rewards/margins": 0.029888689517974854, "rewards/rejected": 3.8712692260742188, "step": 59740 }, { "epoch": 2.774037791912345, "grad_norm": 196.587158203125, "learning_rate": 3.787702926474457e-08, "logits/chosen": -18.840023040771484, "logits/rejected": -18.233964920043945, "logps/chosen": -528.6159057617188, "logps/rejected": -401.88836669921875, "loss": 0.9101, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.160473346710205, "rewards/margins": 0.7723003625869751, "rewards/rejected": 3.3881728649139404, "step": 
59750 }, { "epoch": 2.774502066019778, "grad_norm": 87.5400390625, "learning_rate": 3.779965024683907e-08, "logits/chosen": -19.397336959838867, "logits/rejected": -18.78481674194336, "logps/chosen": -402.2747802734375, "logps/rejected": -413.95062255859375, "loss": 0.5567, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.545112133026123, "rewards/margins": 0.8572031855583191, "rewards/rejected": 2.6879093647003174, "step": 59760 }, { "epoch": 2.774966340127211, "grad_norm": 107.24584197998047, "learning_rate": 3.772227122893356e-08, "logits/chosen": -18.65337371826172, "logits/rejected": -17.22165870666504, "logps/chosen": -407.53106689453125, "logps/rejected": -275.67230224609375, "loss": 0.609, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.7741494178771973, "rewards/margins": 1.9561570882797241, "rewards/rejected": 0.817992091178894, "step": 59770 }, { "epoch": 2.775430614234644, "grad_norm": 40.11353302001953, "learning_rate": 3.764489221102805e-08, "logits/chosen": -19.502450942993164, "logits/rejected": -18.857807159423828, "logps/chosen": -460.45660400390625, "logps/rejected": -452.58001708984375, "loss": 0.5492, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.915999174118042, "rewards/margins": 1.7125232219696045, "rewards/rejected": 2.2034754753112793, "step": 59780 }, { "epoch": 2.7758948883420773, "grad_norm": 2.007361650466919, "learning_rate": 3.756751319312255e-08, "logits/chosen": -18.244457244873047, "logits/rejected": -17.41927146911621, "logps/chosen": -335.0970153808594, "logps/rejected": -253.6962432861328, "loss": 0.6644, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.4606642723083496, "rewards/margins": 1.5917714834213257, "rewards/rejected": 0.8688924908638, "step": 59790 }, { "epoch": 2.77635916244951, "grad_norm": 237.79287719726562, "learning_rate": 3.749013417521705e-08, "logits/chosen": -19.092899322509766, "logits/rejected": -18.299863815307617, "logps/chosen": -459.84222412109375, 
"logps/rejected": -355.3162536621094, "loss": 0.8362, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 5.011143684387207, "rewards/margins": 1.9747140407562256, "rewards/rejected": 3.0364294052124023, "step": 59800 }, { "epoch": 2.7768234365569433, "grad_norm": 0.0034735845401883125, "learning_rate": 3.7412755157311545e-08, "logits/chosen": -19.42925262451172, "logits/rejected": -17.83797836303711, "logps/chosen": -375.09796142578125, "logps/rejected": -315.26727294921875, "loss": 0.5737, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.6027050018310547, "rewards/margins": 2.052311420440674, "rewards/rejected": 1.5503934621810913, "step": 59810 }, { "epoch": 2.7772877106643765, "grad_norm": 228.24375915527344, "learning_rate": 3.7335376139406036e-08, "logits/chosen": -19.164974212646484, "logits/rejected": -19.65412712097168, "logps/chosen": -400.7558898925781, "logps/rejected": -397.728759765625, "loss": 1.2289, "rewards/accuracies": 0.5, "rewards/chosen": 3.0625405311584473, "rewards/margins": -0.09903963655233383, "rewards/rejected": 3.1615796089172363, "step": 59820 }, { "epoch": 2.7777519847718093, "grad_norm": 69.10083770751953, "learning_rate": 3.7257997121500534e-08, "logits/chosen": -19.42547607421875, "logits/rejected": -18.563207626342773, "logps/chosen": -362.53167724609375, "logps/rejected": -327.03387451171875, "loss": 0.5821, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.6703529357910156, "rewards/margins": 0.8160821795463562, "rewards/rejected": 2.854271173477173, "step": 59830 }, { "epoch": 2.778216258879242, "grad_norm": 3.664991855621338, "learning_rate": 3.7180618103595026e-08, "logits/chosen": -18.4719181060791, "logits/rejected": -18.091175079345703, "logps/chosen": -344.1636657714844, "logps/rejected": -271.95782470703125, "loss": 0.6547, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.050736904144287, "rewards/margins": 1.8392305374145508, "rewards/rejected": 1.2115062475204468, "step": 
59840 }, { "epoch": 2.7786805329866753, "grad_norm": 6.086808204650879, "learning_rate": 3.710323908568952e-08, "logits/chosen": -19.752344131469727, "logits/rejected": -18.26006507873535, "logps/chosen": -463.6712951660156, "logps/rejected": -308.25628662109375, "loss": 0.8191, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.265265464782715, "rewards/margins": 2.3823630809783936, "rewards/rejected": 1.88290274143219, "step": 59850 }, { "epoch": 2.7791448070941085, "grad_norm": 114.11355590820312, "learning_rate": 3.702586006778402e-08, "logits/chosen": -19.68856430053711, "logits/rejected": -19.316621780395508, "logps/chosen": -466.1351623535156, "logps/rejected": -335.25555419921875, "loss": 0.6773, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.0665974617004395, "rewards/margins": 1.294872522354126, "rewards/rejected": 2.7717251777648926, "step": 59860 }, { "epoch": 2.7796090812015413, "grad_norm": 0.5942036509513855, "learning_rate": 3.6948481049878513e-08, "logits/chosen": -19.051366806030273, "logits/rejected": -18.514856338500977, "logps/chosen": -337.14208984375, "logps/rejected": -265.71844482421875, "loss": 0.8197, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1782968044281006, "rewards/margins": 0.9291197061538696, "rewards/rejected": 2.2491774559020996, "step": 59870 }, { "epoch": 2.7800733553089745, "grad_norm": 28.142684936523438, "learning_rate": 3.687110203197301e-08, "logits/chosen": -19.160839080810547, "logits/rejected": -18.69424057006836, "logps/chosen": -302.53985595703125, "logps/rejected": -220.276611328125, "loss": 0.5941, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.3361942768096924, "rewards/margins": 0.8786091804504395, "rewards/rejected": 1.457585096359253, "step": 59880 }, { "epoch": 2.7805376294164077, "grad_norm": 111.09806060791016, "learning_rate": 3.67937230140675e-08, "logits/chosen": -20.290952682495117, "logits/rejected": -19.57134246826172, "logps/chosen": 
-360.1438293457031, "logps/rejected": -259.78717041015625, "loss": 0.4712, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.3998863697052, "rewards/margins": 1.5989686250686646, "rewards/rejected": 1.8009178638458252, "step": 59890 }, { "epoch": 2.7810019035238405, "grad_norm": 35.67654037475586, "learning_rate": 3.6716343996161994e-08, "logits/chosen": -20.10213279724121, "logits/rejected": -18.58206558227539, "logps/chosen": -320.8604736328125, "logps/rejected": -234.1230010986328, "loss": 0.4396, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.621019124984741, "rewards/margins": 2.0319552421569824, "rewards/rejected": 0.5890642404556274, "step": 59900 }, { "epoch": 2.7814661776312732, "grad_norm": 48.59783935546875, "learning_rate": 3.663896497825649e-08, "logits/chosen": -19.62917137145996, "logits/rejected": -19.014347076416016, "logps/chosen": -423.7508239746094, "logps/rejected": -326.1412353515625, "loss": 0.6079, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7863357067108154, "rewards/margins": 1.6844441890716553, "rewards/rejected": 2.1018917560577393, "step": 59910 }, { "epoch": 2.7819304517387065, "grad_norm": 317.79595947265625, "learning_rate": 3.656158596035099e-08, "logits/chosen": -18.79039192199707, "logits/rejected": -17.878870010375977, "logps/chosen": -358.32525634765625, "logps/rejected": -286.24261474609375, "loss": 0.7575, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.944129705429077, "rewards/margins": 1.4693410396575928, "rewards/rejected": 1.4747889041900635, "step": 59920 }, { "epoch": 2.7823947258461397, "grad_norm": 1.2279210090637207, "learning_rate": 3.648420694244549e-08, "logits/chosen": -19.34183120727539, "logits/rejected": -18.056377410888672, "logps/chosen": -413.1177673339844, "logps/rejected": -263.0393371582031, "loss": 0.3451, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.8691978454589844, "rewards/margins": 2.628355026245117, "rewards/rejected": 
1.2408428192138672, "step": 59930 }, { "epoch": 2.7828589999535724, "grad_norm": 7.940678596496582, "learning_rate": 3.640682792453998e-08, "logits/chosen": -17.55258560180664, "logits/rejected": -17.590002059936523, "logps/chosen": -440.082275390625, "logps/rejected": -489.67303466796875, "loss": 1.5816, "rewards/accuracies": 0.5, "rewards/chosen": 2.9893460273742676, "rewards/margins": 0.30262723565101624, "rewards/rejected": 2.6867189407348633, "step": 59940 }, { "epoch": 2.7833232740610057, "grad_norm": 7.506056785583496, "learning_rate": 3.632944890663447e-08, "logits/chosen": -19.764629364013672, "logits/rejected": -18.848657608032227, "logps/chosen": -414.7184143066406, "logps/rejected": -314.1194763183594, "loss": 0.4594, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.798304080963135, "rewards/margins": 1.6681525707244873, "rewards/rejected": 3.1301510334014893, "step": 59950 }, { "epoch": 2.783787548168439, "grad_norm": 43.73996353149414, "learning_rate": 3.625206988872897e-08, "logits/chosen": -18.958232879638672, "logits/rejected": -18.720966339111328, "logps/chosen": -442.9375, "logps/rejected": -394.5741271972656, "loss": 1.0429, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.0189952850341797, "rewards/margins": 0.11832217127084732, "rewards/rejected": 2.9006731510162354, "step": 59960 }, { "epoch": 2.7842518222758716, "grad_norm": 21.171772003173828, "learning_rate": 3.617469087082346e-08, "logits/chosen": -18.496519088745117, "logits/rejected": -18.90769386291504, "logps/chosen": -341.5960693359375, "logps/rejected": -272.37347412109375, "loss": 0.9601, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.067321300506592, "rewards/margins": 0.16475360095500946, "rewards/rejected": 2.9025678634643555, "step": 59970 }, { "epoch": 2.784716096383305, "grad_norm": 169.4695281982422, "learning_rate": 3.6097311852917966e-08, "logits/chosen": -18.76538848876953, "logits/rejected": -19.248638153076172, "logps/chosen": 
-352.0882263183594, "logps/rejected": -405.1314697265625, "loss": 1.1469, "rewards/accuracies": 0.5, "rewards/chosen": 2.3612029552459717, "rewards/margins": -0.2339911013841629, "rewards/rejected": 2.5951943397521973, "step": 59980 }, { "epoch": 2.7851803704907376, "grad_norm": 149.96466064453125, "learning_rate": 3.601993283501246e-08, "logits/chosen": -18.54361343383789, "logits/rejected": -18.208993911743164, "logps/chosen": -398.13970947265625, "logps/rejected": -360.103515625, "loss": 0.7081, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0435376167297363, "rewards/margins": 0.9487168192863464, "rewards/rejected": 2.094820737838745, "step": 59990 }, { "epoch": 2.785644644598171, "grad_norm": 66.18913269042969, "learning_rate": 3.5942553817106955e-08, "logits/chosen": -19.30019760131836, "logits/rejected": -18.263763427734375, "logps/chosen": -518.6048583984375, "logps/rejected": -402.9433288574219, "loss": 0.7863, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.1365461349487305, "rewards/margins": 1.1473643779754639, "rewards/rejected": 2.9891815185546875, "step": 60000 }, { "epoch": 2.7861089187056036, "grad_norm": 0.5205546617507935, "learning_rate": 3.5865174799201446e-08, "logits/chosen": -19.576478958129883, "logits/rejected": -18.282320022583008, "logps/chosen": -519.91552734375, "logps/rejected": -380.3846435546875, "loss": 0.7523, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.236899375915527, "rewards/margins": 2.7824339866638184, "rewards/rejected": 2.454465389251709, "step": 60010 }, { "epoch": 2.786573192813037, "grad_norm": 40.445743560791016, "learning_rate": 3.578779578129594e-08, "logits/chosen": -19.509729385375977, "logits/rejected": -17.977169036865234, "logps/chosen": -346.5068664550781, "logps/rejected": -244.3920440673828, "loss": 0.303, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.3921542167663574, "rewards/margins": 1.9354711771011353, "rewards/rejected": 1.4566832780838013, 
"step": 60020 }, { "epoch": 2.78703746692047, "grad_norm": 6.521240711212158, "learning_rate": 3.5710416763390436e-08, "logits/chosen": -20.180519104003906, "logits/rejected": -19.313079833984375, "logps/chosen": -562.5025634765625, "logps/rejected": -388.4693908691406, "loss": 0.4859, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.871251106262207, "rewards/margins": 1.6889324188232422, "rewards/rejected": 3.1823184490203857, "step": 60030 }, { "epoch": 2.787501741027903, "grad_norm": 49.729270935058594, "learning_rate": 3.5633037745484934e-08, "logits/chosen": -19.84433937072754, "logits/rejected": -17.44647979736328, "logps/chosen": -509.7603454589844, "logps/rejected": -282.62164306640625, "loss": 0.1303, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.5099310874938965, "rewards/margins": 4.109489917755127, "rewards/rejected": 0.40044140815734863, "step": 60040 }, { "epoch": 2.787966015135336, "grad_norm": 55.11420440673828, "learning_rate": 3.555565872757943e-08, "logits/chosen": -19.157466888427734, "logits/rejected": -18.05525779724121, "logps/chosen": -384.69317626953125, "logps/rejected": -355.08892822265625, "loss": 0.4427, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.5375735759735107, "rewards/margins": 1.7571680545806885, "rewards/rejected": 0.7804054021835327, "step": 60050 }, { "epoch": 2.788430289242769, "grad_norm": 115.13323211669922, "learning_rate": 3.5478279709673924e-08, "logits/chosen": -19.86978530883789, "logits/rejected": -19.911788940429688, "logps/chosen": -436.44879150390625, "logps/rejected": -404.5885314941406, "loss": 0.4607, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.548295497894287, "rewards/margins": 0.8561960458755493, "rewards/rejected": 3.6920993328094482, "step": 60060 }, { "epoch": 2.788894563350202, "grad_norm": 46.08122634887695, "learning_rate": 3.5400900691768415e-08, "logits/chosen": -19.307811737060547, "logits/rejected": -18.431535720825195, "logps/chosen": 
-371.84918212890625, "logps/rejected": -268.478271484375, "loss": 0.3656, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.5287368297576904, "rewards/margins": 2.266453742980957, "rewards/rejected": 1.2622830867767334, "step": 60070 }, { "epoch": 2.789358837457635, "grad_norm": 77.2310562133789, "learning_rate": 3.532352167386291e-08, "logits/chosen": -19.23999786376953, "logits/rejected": -18.33768081665039, "logps/chosen": -263.39874267578125, "logps/rejected": -174.34579467773438, "loss": 0.8626, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.9487159252166748, "rewards/margins": 1.1299796104431152, "rewards/rejected": 0.8187362551689148, "step": 60080 }, { "epoch": 2.789823111565068, "grad_norm": 77.42724609375, "learning_rate": 3.5246142655957404e-08, "logits/chosen": -18.394758224487305, "logits/rejected": -18.93132781982422, "logps/chosen": -379.61346435546875, "logps/rejected": -415.73065185546875, "loss": 1.585, "rewards/accuracies": 0.5, "rewards/chosen": 2.6872458457946777, "rewards/margins": -0.3138110637664795, "rewards/rejected": 3.0010569095611572, "step": 60090 }, { "epoch": 2.7902873856725012, "grad_norm": 2.9431707859039307, "learning_rate": 3.516876363805191e-08, "logits/chosen": -18.960693359375, "logits/rejected": -18.577482223510742, "logps/chosen": -487.1289978027344, "logps/rejected": -454.4703063964844, "loss": 0.9469, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.9259700775146484, "rewards/margins": 0.6089451909065247, "rewards/rejected": 3.3170249462127686, "step": 60100 }, { "epoch": 2.790751659779934, "grad_norm": 20.548006057739258, "learning_rate": 3.50913846201464e-08, "logits/chosen": -19.32168197631836, "logits/rejected": -18.956300735473633, "logps/chosen": -397.55377197265625, "logps/rejected": -402.01776123046875, "loss": 0.7023, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.6413685083389282, "rewards/margins": 0.5236414074897766, "rewards/rejected": 1.117727279663086, 
"step": 60110 }, { "epoch": 2.791215933887367, "grad_norm": 7.75833797454834, "learning_rate": 3.501400560224089e-08, "logits/chosen": -19.94846534729004, "logits/rejected": -19.18942642211914, "logps/chosen": -305.0689392089844, "logps/rejected": -341.96209716796875, "loss": 0.8232, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.4439594745635986, "rewards/margins": 0.7761563658714294, "rewards/rejected": 1.667803168296814, "step": 60120 }, { "epoch": 2.7916802079948, "grad_norm": 51.72294235229492, "learning_rate": 3.493662658433539e-08, "logits/chosen": -19.01947593688965, "logits/rejected": -18.629253387451172, "logps/chosen": -427.0556640625, "logps/rejected": -451.3624572753906, "loss": 1.0419, "rewards/accuracies": 0.5, "rewards/chosen": 3.5125739574432373, "rewards/margins": 0.4355706572532654, "rewards/rejected": 3.0770034790039062, "step": 60130 }, { "epoch": 2.792144482102233, "grad_norm": 7.469959735870361, "learning_rate": 3.485924756642988e-08, "logits/chosen": -18.765722274780273, "logits/rejected": -19.104045867919922, "logps/chosen": -336.8287048339844, "logps/rejected": -296.2330017089844, "loss": 1.7829, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.8515821695327759, "rewards/margins": -0.7386935353279114, "rewards/rejected": 2.590275526046753, "step": 60140 }, { "epoch": 2.792608756209666, "grad_norm": 0.12942636013031006, "learning_rate": 3.478186854852438e-08, "logits/chosen": -18.837791442871094, "logits/rejected": -17.768495559692383, "logps/chosen": -413.50042724609375, "logps/rejected": -373.7899475097656, "loss": 0.9408, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.771424293518066, "rewards/margins": 1.867340087890625, "rewards/rejected": 2.9040839672088623, "step": 60150 }, { "epoch": 2.793073030317099, "grad_norm": 16.028310775756836, "learning_rate": 3.470448953061888e-08, "logits/chosen": -19.479265213012695, "logits/rejected": -18.717302322387695, "logps/chosen": -331.35040283203125, 
"logps/rejected": -259.73712158203125, "loss": 0.4974, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.560974597930908, "rewards/margins": 1.3150765895843506, "rewards/rejected": 2.2458982467651367, "step": 60160 }, { "epoch": 2.7935373044245324, "grad_norm": 93.89778137207031, "learning_rate": 3.4627110512713376e-08, "logits/chosen": -19.18191146850586, "logits/rejected": -17.89638900756836, "logps/chosen": -469.21466064453125, "logps/rejected": -343.6881103515625, "loss": 0.3941, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.0850019454956055, "rewards/margins": 2.1956112384796143, "rewards/rejected": 1.889390230178833, "step": 60170 }, { "epoch": 2.794001578531965, "grad_norm": 33.75676345825195, "learning_rate": 3.454973149480787e-08, "logits/chosen": -19.85537338256836, "logits/rejected": -18.749746322631836, "logps/chosen": -397.07916259765625, "logps/rejected": -284.171142578125, "loss": 0.4102, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4864158630371094, "rewards/margins": 1.9673494100570679, "rewards/rejected": 1.519066333770752, "step": 60180 }, { "epoch": 2.7944658526393984, "grad_norm": 37.27665710449219, "learning_rate": 3.447235247690236e-08, "logits/chosen": -19.591760635375977, "logits/rejected": -18.290800094604492, "logps/chosen": -236.62808227539062, "logps/rejected": -152.06690979003906, "loss": 0.3811, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.375171661376953, "rewards/margins": 2.32710599899292, "rewards/rejected": 0.04806581884622574, "step": 60190 }, { "epoch": 2.794930126746831, "grad_norm": 54.41367721557617, "learning_rate": 3.4394973458996857e-08, "logits/chosen": -18.703413009643555, "logits/rejected": -18.021610260009766, "logps/chosen": -325.4598388671875, "logps/rejected": -284.882080078125, "loss": 0.7072, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0359854698181152, "rewards/margins": 1.0220246315002441, "rewards/rejected": 2.01396107673645, "step": 
60200 }, { "epoch": 2.7953944008542644, "grad_norm": 0.2659437358379364, "learning_rate": 3.431759444109135e-08, "logits/chosen": -18.929601669311523, "logits/rejected": -17.741945266723633, "logps/chosen": -395.35784912109375, "logps/rejected": -335.77593994140625, "loss": 0.1812, "rewards/accuracies": 1.0, "rewards/chosen": 4.700595855712891, "rewards/margins": 2.9246983528137207, "rewards/rejected": 1.7758972644805908, "step": 60210 }, { "epoch": 2.795858674961697, "grad_norm": 166.22679138183594, "learning_rate": 3.424021542318585e-08, "logits/chosen": -18.31708526611328, "logits/rejected": -18.047075271606445, "logps/chosen": -548.9512329101562, "logps/rejected": -431.1085510253906, "loss": 0.8834, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.791433334350586, "rewards/margins": 1.427563190460205, "rewards/rejected": 3.363870143890381, "step": 60220 }, { "epoch": 2.7963229490691304, "grad_norm": 54.05704116821289, "learning_rate": 3.4162836405280344e-08, "logits/chosen": -19.559606552124023, "logits/rejected": -18.927024841308594, "logps/chosen": -395.31024169921875, "logps/rejected": -251.1350860595703, "loss": 0.3031, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.919882297515869, "rewards/margins": 2.293024778366089, "rewards/rejected": 1.6268573999404907, "step": 60230 }, { "epoch": 2.7967872231765636, "grad_norm": 0.45835113525390625, "learning_rate": 3.4085457387374836e-08, "logits/chosen": -20.038488388061523, "logits/rejected": -18.831483840942383, "logps/chosen": -474.7706604003906, "logps/rejected": -346.8702087402344, "loss": 0.7491, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.183863639831543, "rewards/margins": 1.5765340328216553, "rewards/rejected": 2.607329845428467, "step": 60240 }, { "epoch": 2.7972514972839964, "grad_norm": 12.465906143188477, "learning_rate": 3.4008078369469334e-08, "logits/chosen": -18.98531723022461, "logits/rejected": -18.93826675415039, "logps/chosen": -369.7417907714844, 
"logps/rejected": -357.16400146484375, "loss": 0.7745, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.556392192840576, "rewards/margins": 0.3568403124809265, "rewards/rejected": 2.199551820755005, "step": 60250 }, { "epoch": 2.7977157713914296, "grad_norm": 27.195331573486328, "learning_rate": 3.3930699351563825e-08, "logits/chosen": -19.840551376342773, "logits/rejected": -19.738365173339844, "logps/chosen": -241.3076934814453, "logps/rejected": -260.22857666015625, "loss": 0.8838, "rewards/accuracies": 0.5, "rewards/chosen": 2.165348529815674, "rewards/margins": 0.48743337392807007, "rewards/rejected": 1.6779152154922485, "step": 60260 }, { "epoch": 2.798180045498863, "grad_norm": 46.47853088378906, "learning_rate": 3.385332033365832e-08, "logits/chosen": -19.773822784423828, "logits/rejected": -17.929725646972656, "logps/chosen": -512.9637451171875, "logps/rejected": -350.17120361328125, "loss": 0.2478, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.882210731506348, "rewards/margins": 2.6705713272094727, "rewards/rejected": 2.2116384506225586, "step": 60270 }, { "epoch": 2.7986443196062956, "grad_norm": 12.804585456848145, "learning_rate": 3.377594131575282e-08, "logits/chosen": -19.03660774230957, "logits/rejected": -18.70172882080078, "logps/chosen": -325.05499267578125, "logps/rejected": -270.99456787109375, "loss": 0.5603, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0077829360961914, "rewards/margins": 1.4275643825531006, "rewards/rejected": 1.5802189111709595, "step": 60280 }, { "epoch": 2.7991085937137283, "grad_norm": 0.056372758001089096, "learning_rate": 3.369856229784732e-08, "logits/chosen": -18.65121078491211, "logits/rejected": -17.136850357055664, "logps/chosen": -354.2556457519531, "logps/rejected": -171.19692993164062, "loss": 0.6978, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.17972731590271, "rewards/margins": 2.355743885040283, "rewards/rejected": 0.823983371257782, "step": 60290 
}, { "epoch": 2.7995728678211615, "grad_norm": 0.8333981037139893, "learning_rate": 3.362118327994181e-08, "logits/chosen": -18.92032241821289, "logits/rejected": -17.9993953704834, "logps/chosen": -320.1302185058594, "logps/rejected": -256.8815002441406, "loss": 0.6425, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.232283115386963, "rewards/margins": 1.01882004737854, "rewards/rejected": 1.2134630680084229, "step": 60300 }, { "epoch": 2.8000371419285948, "grad_norm": 51.938079833984375, "learning_rate": 3.35438042620363e-08, "logits/chosen": -19.966991424560547, "logits/rejected": -19.387388229370117, "logps/chosen": -353.3884582519531, "logps/rejected": -318.36663818359375, "loss": 0.2266, "rewards/accuracies": 1.0, "rewards/chosen": 4.05513858795166, "rewards/margins": 1.9738476276397705, "rewards/rejected": 2.0812911987304688, "step": 60310 }, { "epoch": 2.8005014160360275, "grad_norm": 129.8931884765625, "learning_rate": 3.34664252441308e-08, "logits/chosen": -19.909503936767578, "logits/rejected": -19.333911895751953, "logps/chosen": -400.8539123535156, "logps/rejected": -376.2073669433594, "loss": 1.0207, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 3.0597381591796875, "rewards/margins": 0.08065304905176163, "rewards/rejected": 2.9790854454040527, "step": 60320 }, { "epoch": 2.8009656901434608, "grad_norm": 85.21504974365234, "learning_rate": 3.338904622622529e-08, "logits/chosen": -19.594547271728516, "logits/rejected": -18.712604522705078, "logps/chosen": -517.6473388671875, "logps/rejected": -437.4495544433594, "loss": 0.8779, "rewards/accuracies": 0.5, "rewards/chosen": 4.636052131652832, "rewards/margins": 0.7864327430725098, "rewards/rejected": 3.8496196269989014, "step": 60330 }, { "epoch": 2.801429964250894, "grad_norm": 0.09217079728841782, "learning_rate": 3.3311667208319796e-08, "logits/chosen": -18.157451629638672, "logits/rejected": -17.587968826293945, "logps/chosen": -380.9698181152344, "logps/rejected": 
-328.7392578125, "loss": 1.0151, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.5051937103271484, "rewards/margins": 0.5626177191734314, "rewards/rejected": 1.9425760507583618, "step": 60340 }, { "epoch": 2.8018942383583267, "grad_norm": 100.60796356201172, "learning_rate": 3.323428819041429e-08, "logits/chosen": -18.697301864624023, "logits/rejected": -18.30954360961914, "logps/chosen": -411.184326171875, "logps/rejected": -350.68450927734375, "loss": 0.9676, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.4533495903015137, "rewards/margins": 0.5437966585159302, "rewards/rejected": 2.909553050994873, "step": 60350 }, { "epoch": 2.8023585124657595, "grad_norm": 240.01739501953125, "learning_rate": 3.315690917250878e-08, "logits/chosen": -19.043659210205078, "logits/rejected": -18.18844223022461, "logps/chosen": -411.59442138671875, "logps/rejected": -307.2515563964844, "loss": 0.6022, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.9715123176574707, "rewards/margins": 1.377295732498169, "rewards/rejected": 2.5942165851593018, "step": 60360 }, { "epoch": 2.8028227865731927, "grad_norm": 2.252875328063965, "learning_rate": 3.307953015460328e-08, "logits/chosen": -18.45680809020996, "logits/rejected": -17.088077545166016, "logps/chosen": -392.78045654296875, "logps/rejected": -296.01470947265625, "loss": 0.7147, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.2839250564575195, "rewards/margins": 2.139050006866455, "rewards/rejected": 2.1448757648468018, "step": 60370 }, { "epoch": 2.803287060680626, "grad_norm": 210.91082763671875, "learning_rate": 3.300215113669777e-08, "logits/chosen": -19.026447296142578, "logits/rejected": -18.411046981811523, "logps/chosen": -326.99017333984375, "logps/rejected": -299.73101806640625, "loss": 0.8484, "rewards/accuracies": 0.5, "rewards/chosen": 2.6381027698516846, "rewards/margins": 0.393301784992218, "rewards/rejected": 2.2448012828826904, "step": 60380 }, { "epoch": 
2.8037513347880587, "grad_norm": 170.01402282714844, "learning_rate": 3.292477211879227e-08, "logits/chosen": -18.705623626708984, "logits/rejected": -18.986120223999023, "logps/chosen": -300.5252990722656, "logps/rejected": -329.2696838378906, "loss": 0.8508, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.712905168533325, "rewards/margins": 0.29082581400871277, "rewards/rejected": 2.42207932472229, "step": 60390 }, { "epoch": 2.804215608895492, "grad_norm": 223.87710571289062, "learning_rate": 3.2847393100886765e-08, "logits/chosen": -17.861858367919922, "logits/rejected": -17.943655014038086, "logps/chosen": -285.1910095214844, "logps/rejected": -263.4991760253906, "loss": 1.216, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 1.7432914972305298, "rewards/margins": -0.464459091424942, "rewards/rejected": 2.2077507972717285, "step": 60400 }, { "epoch": 2.804679883002925, "grad_norm": 15.403422355651855, "learning_rate": 3.2770014082981256e-08, "logits/chosen": -18.99190902709961, "logits/rejected": -17.79192543029785, "logps/chosen": -465.3277893066406, "logps/rejected": -319.5022277832031, "loss": 0.2827, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.653660297393799, "rewards/margins": 1.8715198040008545, "rewards/rejected": 1.7821404933929443, "step": 60410 }, { "epoch": 2.805144157110358, "grad_norm": 83.1667709350586, "learning_rate": 3.2692635065075754e-08, "logits/chosen": -19.00743293762207, "logits/rejected": -17.58482551574707, "logps/chosen": -501.9706115722656, "logps/rejected": -371.7751770019531, "loss": 0.9299, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.961822032928467, "rewards/margins": 2.1686508655548096, "rewards/rejected": 2.7931714057922363, "step": 60420 }, { "epoch": 2.805608431217791, "grad_norm": 1.2312406301498413, "learning_rate": 3.2615256047170246e-08, "logits/chosen": -19.193485260009766, "logits/rejected": -18.902427673339844, "logps/chosen": -362.97003173828125, 
"logps/rejected": -347.19232177734375, "loss": 1.1541, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1957201957702637, "rewards/margins": 1.161195993423462, "rewards/rejected": 2.034524440765381, "step": 60430 }, { "epoch": 2.806072705325224, "grad_norm": 4.2214155197143555, "learning_rate": 3.2537877029264744e-08, "logits/chosen": -18.73871421813965, "logits/rejected": -19.20566749572754, "logps/chosen": -303.70782470703125, "logps/rejected": -368.4179382324219, "loss": 2.0793, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.406270980834961, "rewards/margins": -1.0512646436691284, "rewards/rejected": 3.4575355052948, "step": 60440 }, { "epoch": 2.806536979432657, "grad_norm": 46.81911087036133, "learning_rate": 3.2460498011359235e-08, "logits/chosen": -19.207406997680664, "logits/rejected": -19.100589752197266, "logps/chosen": -357.27880859375, "logps/rejected": -285.5422668457031, "loss": 1.9372, "rewards/accuracies": 0.5, "rewards/chosen": 1.9126802682876587, "rewards/margins": -0.6698107719421387, "rewards/rejected": 2.582490921020508, "step": 60450 }, { "epoch": 2.80700125354009, "grad_norm": 218.32301330566406, "learning_rate": 3.238311899345374e-08, "logits/chosen": -18.994735717773438, "logits/rejected": -18.323474884033203, "logps/chosen": -379.0965576171875, "logps/rejected": -293.9613952636719, "loss": 0.7547, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.492030382156372, "rewards/margins": 1.7129700183868408, "rewards/rejected": 1.7790606021881104, "step": 60460 }, { "epoch": 2.807465527647523, "grad_norm": 68.30162811279297, "learning_rate": 3.230573997554823e-08, "logits/chosen": -19.19679832458496, "logits/rejected": -18.628231048583984, "logps/chosen": -477.8681640625, "logps/rejected": -471.42791748046875, "loss": 0.6958, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.566689968109131, "rewards/margins": 0.5499745607376099, "rewards/rejected": 3.0167157649993896, "step": 60470 }, { "epoch": 
2.8079298017549563, "grad_norm": 0.45322927832603455, "learning_rate": 3.222836095764272e-08, "logits/chosen": -19.34964370727539, "logits/rejected": -17.721302032470703, "logps/chosen": -407.4567565917969, "logps/rejected": -267.67425537109375, "loss": 0.2951, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.9583518505096436, "rewards/margins": 2.9088311195373535, "rewards/rejected": 1.0495209693908691, "step": 60480 }, { "epoch": 2.808394075862389, "grad_norm": 37.91359329223633, "learning_rate": 3.215098193973722e-08, "logits/chosen": -20.391841888427734, "logits/rejected": -20.29793930053711, "logps/chosen": -393.176025390625, "logps/rejected": -341.7313537597656, "loss": 0.5364, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.209292411804199, "rewards/margins": 0.7862779498100281, "rewards/rejected": 3.4230148792266846, "step": 60490 }, { "epoch": 2.8088583499698223, "grad_norm": 96.38735961914062, "learning_rate": 3.207360292183171e-08, "logits/chosen": -18.187015533447266, "logits/rejected": -18.459064483642578, "logps/chosen": -311.38983154296875, "logps/rejected": -384.9384460449219, "loss": 1.9586, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.029339551925659, "rewards/margins": -0.9850614666938782, "rewards/rejected": 3.0144009590148926, "step": 60500 }, { "epoch": 2.809322624077255, "grad_norm": 66.57051086425781, "learning_rate": 3.199622390392621e-08, "logits/chosen": -18.850276947021484, "logits/rejected": -18.671504974365234, "logps/chosen": -376.7726135253906, "logps/rejected": -335.08831787109375, "loss": 0.8591, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.5126922130584717, "rewards/margins": 0.3236997127532959, "rewards/rejected": 2.188992500305176, "step": 60510 }, { "epoch": 2.8097868981846883, "grad_norm": 85.99663543701172, "learning_rate": 3.191884488602071e-08, "logits/chosen": -18.999874114990234, "logits/rejected": -18.540664672851562, "logps/chosen": -397.959228515625, 
"logps/rejected": -332.2203369140625, "loss": 0.4316, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.031548023223877, "rewards/margins": 1.620100736618042, "rewards/rejected": 2.4114468097686768, "step": 60520 }, { "epoch": 2.810251172292121, "grad_norm": 76.11730194091797, "learning_rate": 3.18414658681152e-08, "logits/chosen": -19.27518081665039, "logits/rejected": -18.26460838317871, "logps/chosen": -436.4723205566406, "logps/rejected": -287.748779296875, "loss": 0.4412, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.599417209625244, "rewards/margins": 2.560688018798828, "rewards/rejected": 2.038729190826416, "step": 60530 }, { "epoch": 2.8107154463995543, "grad_norm": 5.914029121398926, "learning_rate": 3.17640868502097e-08, "logits/chosen": -18.84952735900879, "logits/rejected": -17.850162506103516, "logps/chosen": -286.716796875, "logps/rejected": -208.8408660888672, "loss": 0.8044, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.193110466003418, "rewards/margins": 1.801561951637268, "rewards/rejected": 0.39154860377311707, "step": 60540 }, { "epoch": 2.8111797205069875, "grad_norm": 17.766630172729492, "learning_rate": 3.168670783230419e-08, "logits/chosen": -19.540266036987305, "logits/rejected": -18.601675033569336, "logps/chosen": -445.2118225097656, "logps/rejected": -413.6136169433594, "loss": 0.3112, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.246851921081543, "rewards/margins": 1.9822137355804443, "rewards/rejected": 3.2646377086639404, "step": 60550 }, { "epoch": 2.8116439946144203, "grad_norm": 278.13916015625, "learning_rate": 3.160932881439869e-08, "logits/chosen": -19.504247665405273, "logits/rejected": -18.337528228759766, "logps/chosen": -395.57196044921875, "logps/rejected": -315.86944580078125, "loss": 0.5448, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.7886734008789062, "rewards/margins": 2.2291159629821777, "rewards/rejected": 1.5595571994781494, "step": 60560 }, 
{ "epoch": 2.8121082687218535, "grad_norm": 97.08979797363281, "learning_rate": 3.153194979649318e-08, "logits/chosen": -20.08602523803711, "logits/rejected": -17.866519927978516, "logps/chosen": -500.8663635253906, "logps/rejected": -293.88323974609375, "loss": 0.2799, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.746031761169434, "rewards/margins": 3.148080587387085, "rewards/rejected": 1.5979511737823486, "step": 60570 }, { "epoch": 2.8125725428292863, "grad_norm": 24.285249710083008, "learning_rate": 3.145457077858768e-08, "logits/chosen": -19.407085418701172, "logits/rejected": -18.18904685974121, "logps/chosen": -489.6761169433594, "logps/rejected": -340.83013916015625, "loss": 0.5053, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.139462471008301, "rewards/margins": 2.609708070755005, "rewards/rejected": 1.5297542810440063, "step": 60580 }, { "epoch": 2.8130368169367195, "grad_norm": 0.48739534616470337, "learning_rate": 3.1377191760682175e-08, "logits/chosen": -19.532201766967773, "logits/rejected": -18.499801635742188, "logps/chosen": -524.1182861328125, "logps/rejected": -325.477294921875, "loss": 0.4265, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 5.188429355621338, "rewards/margins": 2.988614082336426, "rewards/rejected": 2.199815273284912, "step": 60590 }, { "epoch": 2.8135010910441522, "grad_norm": 272.51348876953125, "learning_rate": 3.1299812742776667e-08, "logits/chosen": -19.409042358398438, "logits/rejected": -19.04976463317871, "logps/chosen": -393.30035400390625, "logps/rejected": -382.0182800292969, "loss": 1.108, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.3057961463928223, "rewards/margins": -0.5172432065010071, "rewards/rejected": 3.8230392932891846, "step": 60600 }, { "epoch": 2.8139653651515855, "grad_norm": 146.77044677734375, "learning_rate": 3.1222433724871165e-08, "logits/chosen": -19.423908233642578, "logits/rejected": -19.433605194091797, "logps/chosen": 
-369.2432556152344, "logps/rejected": -311.33624267578125, "loss": 1.0212, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.83143949508667, "rewards/margins": 0.06772241741418839, "rewards/rejected": 2.7637171745300293, "step": 60610 }, { "epoch": 2.8144296392590187, "grad_norm": 202.0327911376953, "learning_rate": 3.1145054706965656e-08, "logits/chosen": -18.872295379638672, "logits/rejected": -18.581079483032227, "logps/chosen": -371.9928283691406, "logps/rejected": -373.052001953125, "loss": 0.9324, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.3527112007141113, "rewards/margins": 0.4174973964691162, "rewards/rejected": 2.935213565826416, "step": 60620 }, { "epoch": 2.8148939133664515, "grad_norm": 191.4888916015625, "learning_rate": 3.1067675689060154e-08, "logits/chosen": -18.21617317199707, "logits/rejected": -17.53189468383789, "logps/chosen": -267.0057067871094, "logps/rejected": -172.57888793945312, "loss": 0.8328, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.1920695304870605, "rewards/margins": 2.0161044597625732, "rewards/rejected": 0.17596498131752014, "step": 60630 }, { "epoch": 2.8153581874738847, "grad_norm": 55.21879577636719, "learning_rate": 3.0990296671154646e-08, "logits/chosen": -18.586833953857422, "logits/rejected": -16.935344696044922, "logps/chosen": -394.05474853515625, "logps/rejected": -233.56472778320312, "loss": 0.2379, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.951857328414917, "rewards/margins": 3.1609230041503906, "rewards/rejected": 0.7909345030784607, "step": 60640 }, { "epoch": 2.815822461581318, "grad_norm": 15.987579345703125, "learning_rate": 3.0912917653249144e-08, "logits/chosen": -19.78659439086914, "logits/rejected": -19.384437561035156, "logps/chosen": -546.216796875, "logps/rejected": -480.57550048828125, "loss": 0.5566, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.760105133056641, "rewards/margins": 0.8641121983528137, "rewards/rejected": 
3.89599347114563, "step": 60650 }, { "epoch": 2.8162867356887507, "grad_norm": 11.254143714904785, "learning_rate": 3.083553863534364e-08, "logits/chosen": -19.45726203918457, "logits/rejected": -17.99209976196289, "logps/chosen": -333.4560546875, "logps/rejected": -227.13623046875, "loss": 0.5873, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.9971396923065186, "rewards/margins": 2.3278229236602783, "rewards/rejected": 0.6693168878555298, "step": 60660 }, { "epoch": 2.8167510097961834, "grad_norm": 155.88677978515625, "learning_rate": 3.075815961743813e-08, "logits/chosen": -19.307886123657227, "logits/rejected": -18.780284881591797, "logps/chosen": -325.1607971191406, "logps/rejected": -295.51727294921875, "loss": 1.2873, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.7507944107055664, "rewards/margins": 0.10163426399230957, "rewards/rejected": 2.649160146713257, "step": 60670 }, { "epoch": 2.8172152839036166, "grad_norm": 95.7159423828125, "learning_rate": 3.068078059953263e-08, "logits/chosen": -19.430789947509766, "logits/rejected": -17.38827896118164, "logps/chosen": -328.36444091796875, "logps/rejected": -212.59677124023438, "loss": 0.4205, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2950997352600098, "rewards/margins": 2.7184128761291504, "rewards/rejected": 0.5766867399215698, "step": 60680 }, { "epoch": 2.81767955801105, "grad_norm": 168.92433166503906, "learning_rate": 3.060340158162712e-08, "logits/chosen": -20.19873046875, "logits/rejected": -20.28936004638672, "logps/chosen": -477.00079345703125, "logps/rejected": -509.7959899902344, "loss": 1.5641, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.4079596996307373, "rewards/margins": -0.4044707715511322, "rewards/rejected": 3.8124308586120605, "step": 60690 }, { "epoch": 2.8181438321184826, "grad_norm": 46.50137710571289, "learning_rate": 3.052602256372162e-08, "logits/chosen": -19.439226150512695, "logits/rejected": -18.561643600463867, 
"logps/chosen": -385.6994323730469, "logps/rejected": -285.11395263671875, "loss": 0.4173, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1774661540985107, "rewards/margins": 1.6337693929672241, "rewards/rejected": 1.543696641921997, "step": 60700 }, { "epoch": 2.818608106225916, "grad_norm": 125.7203140258789, "learning_rate": 3.044864354581611e-08, "logits/chosen": -19.23950958251953, "logits/rejected": -18.931774139404297, "logps/chosen": -315.517578125, "logps/rejected": -361.2135314941406, "loss": 1.1069, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.0908303260803223, "rewards/margins": -0.43933433294296265, "rewards/rejected": 2.5301644802093506, "step": 60710 }, { "epoch": 2.819072380333349, "grad_norm": 6.073196887969971, "learning_rate": 3.037126452791061e-08, "logits/chosen": -19.8604793548584, "logits/rejected": -19.477754592895508, "logps/chosen": -367.0304870605469, "logps/rejected": -323.35284423828125, "loss": 0.5106, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.306563138961792, "rewards/margins": 1.2435230016708374, "rewards/rejected": 2.063040256500244, "step": 60720 }, { "epoch": 2.819536654440782, "grad_norm": 14.086995124816895, "learning_rate": 3.029388551000511e-08, "logits/chosen": -19.676162719726562, "logits/rejected": -18.563703536987305, "logps/chosen": -409.916748046875, "logps/rejected": -351.29296875, "loss": 0.2846, "rewards/accuracies": 1.0, "rewards/chosen": 4.707508087158203, "rewards/margins": 1.987754225730896, "rewards/rejected": 2.7197539806365967, "step": 60730 }, { "epoch": 2.8200009285482146, "grad_norm": 172.2434539794922, "learning_rate": 3.02165064920996e-08, "logits/chosen": -19.571765899658203, "logits/rejected": -17.80270004272461, "logps/chosen": -557.7879638671875, "logps/rejected": -393.2101745605469, "loss": 0.7169, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 5.882832050323486, "rewards/margins": 2.2541496753692627, "rewards/rejected": 
3.6286826133728027, "step": 60740 }, { "epoch": 2.820465202655648, "grad_norm": 81.93839263916016, "learning_rate": 3.01391274741941e-08, "logits/chosen": -18.58339500427246, "logits/rejected": -17.689197540283203, "logps/chosen": -301.5099792480469, "logps/rejected": -231.30648803710938, "loss": 0.4683, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.8568918704986572, "rewards/margins": 1.4052717685699463, "rewards/rejected": 1.4516199827194214, "step": 60750 }, { "epoch": 2.820929476763081, "grad_norm": 74.61740112304688, "learning_rate": 3.006174845628859e-08, "logits/chosen": -18.817119598388672, "logits/rejected": -17.52470588684082, "logps/chosen": -391.44732666015625, "logps/rejected": -281.18572998046875, "loss": 0.2818, "rewards/accuracies": 1.0, "rewards/chosen": 3.3212730884552, "rewards/margins": 2.316589832305908, "rewards/rejected": 1.004683256149292, "step": 60760 }, { "epoch": 2.821393750870514, "grad_norm": 0.1651236116886139, "learning_rate": 2.998436943838309e-08, "logits/chosen": -19.27153205871582, "logits/rejected": -17.910146713256836, "logps/chosen": -486.46051025390625, "logps/rejected": -355.9686584472656, "loss": 0.5806, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.908639430999756, "rewards/margins": 1.5377721786499023, "rewards/rejected": 3.3708674907684326, "step": 60770 }, { "epoch": 2.821858024977947, "grad_norm": 7.202604293823242, "learning_rate": 2.9906990420477585e-08, "logits/chosen": -19.40320587158203, "logits/rejected": -19.293546676635742, "logps/chosen": -493.8741760253906, "logps/rejected": -381.346435546875, "loss": 0.5256, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.141356468200684, "rewards/margins": 1.251497745513916, "rewards/rejected": 2.8898584842681885, "step": 60780 }, { "epoch": 2.8223222990853802, "grad_norm": 61.37794876098633, "learning_rate": 2.982961140257208e-08, "logits/chosen": -19.682756423950195, "logits/rejected": -17.441051483154297, "logps/chosen": 
-508.2103576660156, "logps/rejected": -265.7004699707031, "loss": 0.1829, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.890166759490967, "rewards/margins": 3.3194496631622314, "rewards/rejected": 1.5707167387008667, "step": 60790 }, { "epoch": 2.822786573192813, "grad_norm": 81.3836441040039, "learning_rate": 2.975223238466657e-08, "logits/chosen": -18.90908432006836, "logits/rejected": -18.41481590270996, "logps/chosen": -420.36676025390625, "logps/rejected": -360.27490234375, "loss": 0.7919, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.9518561363220215, "rewards/margins": 0.6527873873710632, "rewards/rejected": 2.2990686893463135, "step": 60800 }, { "epoch": 2.8232508473002462, "grad_norm": 107.2825927734375, "learning_rate": 2.967485336676107e-08, "logits/chosen": -19.49742889404297, "logits/rejected": -18.462909698486328, "logps/chosen": -375.87762451171875, "logps/rejected": -291.97430419921875, "loss": 0.3168, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.6208348274230957, "rewards/margins": 1.4455161094665527, "rewards/rejected": 2.175318717956543, "step": 60810 }, { "epoch": 2.823715121407679, "grad_norm": 90.41682434082031, "learning_rate": 2.9597474348855564e-08, "logits/chosen": -18.947729110717773, "logits/rejected": -17.84817886352539, "logps/chosen": -435.6722717285156, "logps/rejected": -317.90240478515625, "loss": 1.014, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.7588202953338623, "rewards/margins": 1.322014570236206, "rewards/rejected": 1.4368058443069458, "step": 60820 }, { "epoch": 2.824179395515112, "grad_norm": 161.486572265625, "learning_rate": 2.9520095330950056e-08, "logits/chosen": -18.80730438232422, "logits/rejected": -18.678499221801758, "logps/chosen": -429.00726318359375, "logps/rejected": -363.84765625, "loss": 0.7716, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.923611879348755, "rewards/margins": 1.2249860763549805, "rewards/rejected": 
2.6986258029937744, "step": 60830 }, { "epoch": 2.824643669622545, "grad_norm": 125.41731262207031, "learning_rate": 2.9442716313044554e-08, "logits/chosen": -18.96689796447754, "logits/rejected": -18.516483306884766, "logps/chosen": -458.01470947265625, "logps/rejected": -333.87127685546875, "loss": 0.5136, "rewards/accuracies": 0.5, "rewards/chosen": 4.361236572265625, "rewards/margins": 2.374497652053833, "rewards/rejected": 1.9867385625839233, "step": 60840 }, { "epoch": 2.825107943729978, "grad_norm": 4.426964282989502, "learning_rate": 2.936533729513905e-08, "logits/chosen": -18.837976455688477, "logits/rejected": -17.638097763061523, "logps/chosen": -476.077392578125, "logps/rejected": -340.80108642578125, "loss": 0.4937, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.726840019226074, "rewards/margins": 2.218540668487549, "rewards/rejected": 2.5082993507385254, "step": 60850 }, { "epoch": 2.8255722178374114, "grad_norm": 82.30863189697266, "learning_rate": 2.9287958277233543e-08, "logits/chosen": -19.180953979492188, "logits/rejected": -18.662050247192383, "logps/chosen": -268.9222412109375, "logps/rejected": -215.92910766601562, "loss": 0.4892, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.0550451278686523, "rewards/margins": 1.2515153884887695, "rewards/rejected": 0.8035297393798828, "step": 60860 }, { "epoch": 2.826036491944844, "grad_norm": 133.584228515625, "learning_rate": 2.921057925932804e-08, "logits/chosen": -18.736751556396484, "logits/rejected": -18.90243148803711, "logps/chosen": -385.7751159667969, "logps/rejected": -430.29998779296875, "loss": 1.0582, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": 2.3896613121032715, "rewards/margins": -0.3572661280632019, "rewards/rejected": 2.746927261352539, "step": 60870 }, { "epoch": 2.8265007660522774, "grad_norm": 108.26473236083984, "learning_rate": 2.9133200241422536e-08, "logits/chosen": -19.949792861938477, "logits/rejected": -19.126771926879883, 
"logps/chosen": -340.8934020996094, "logps/rejected": -264.65740966796875, "loss": 0.511, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7985007762908936, "rewards/margins": 1.3330621719360352, "rewards/rejected": 2.4654386043548584, "step": 60880 }, { "epoch": 2.82696504015971, "grad_norm": 68.52778625488281, "learning_rate": 2.9055821223517028e-08, "logits/chosen": -19.477380752563477, "logits/rejected": -17.931657791137695, "logps/chosen": -451.4385681152344, "logps/rejected": -333.50201416015625, "loss": 0.1794, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.527793884277344, "rewards/margins": 3.1575164794921875, "rewards/rejected": 1.3702778816223145, "step": 60890 }, { "epoch": 2.8274293142671434, "grad_norm": 30.95488166809082, "learning_rate": 2.8978442205611526e-08, "logits/chosen": -18.478382110595703, "logits/rejected": -18.54413604736328, "logps/chosen": -330.2553405761719, "logps/rejected": -342.5038757324219, "loss": 1.0017, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.4317989349365234, "rewards/margins": 0.8046091794967651, "rewards/rejected": 1.6271898746490479, "step": 60900 }, { "epoch": 2.827893588374576, "grad_norm": 203.1626739501953, "learning_rate": 2.890106318770602e-08, "logits/chosen": -18.38419532775879, "logits/rejected": -18.298826217651367, "logps/chosen": -315.3009948730469, "logps/rejected": -328.62933349609375, "loss": 1.3098, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.9059213399887085, "rewards/margins": 0.0157012939453125, "rewards/rejected": 1.890220284461975, "step": 60910 }, { "epoch": 2.8283578624820094, "grad_norm": 3.1465561389923096, "learning_rate": 2.8823684169800515e-08, "logits/chosen": -18.976198196411133, "logits/rejected": -18.376733779907227, "logps/chosen": -381.55670166015625, "logps/rejected": -306.67181396484375, "loss": 0.2173, "rewards/accuracies": 1.0, "rewards/chosen": 4.288878440856934, "rewards/margins": 2.4768338203430176, "rewards/rejected": 
1.8120445013046265, "step": 60920 }, { "epoch": 2.8288221365894426, "grad_norm": 245.98883056640625, "learning_rate": 2.8746305151895013e-08, "logits/chosen": -18.748579025268555, "logits/rejected": -18.215925216674805, "logps/chosen": -482.6004943847656, "logps/rejected": -378.1627502441406, "loss": 1.1553, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.8352813720703125, "rewards/margins": 1.166757345199585, "rewards/rejected": 2.6685240268707275, "step": 60930 }, { "epoch": 2.8292864106968754, "grad_norm": 16.868614196777344, "learning_rate": 2.8668926133989508e-08, "logits/chosen": -18.104297637939453, "logits/rejected": -17.44934844970703, "logps/chosen": -476.8544921875, "logps/rejected": -380.270751953125, "loss": 1.4695, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.8155694007873535, "rewards/margins": 0.3008652925491333, "rewards/rejected": 2.5147042274475098, "step": 60940 }, { "epoch": 2.8297506848043086, "grad_norm": 228.8357391357422, "learning_rate": 2.8591547116084e-08, "logits/chosen": -19.345211029052734, "logits/rejected": -18.512935638427734, "logps/chosen": -488.2076721191406, "logps/rejected": -435.1015625, "loss": 0.8384, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.286665439605713, "rewards/margins": 0.8111727833747864, "rewards/rejected": 3.4754929542541504, "step": 60950 }, { "epoch": 2.8302149589117414, "grad_norm": 138.57261657714844, "learning_rate": 2.8514168098178497e-08, "logits/chosen": -19.20430564880371, "logits/rejected": -18.19765853881836, "logps/chosen": -395.38433837890625, "logps/rejected": -287.44146728515625, "loss": 0.4672, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6043286323547363, "rewards/margins": 1.4730371236801147, "rewards/rejected": 2.131291389465332, "step": 60960 }, { "epoch": 2.8306792330191746, "grad_norm": 1.4140185117721558, "learning_rate": 2.8436789080272992e-08, "logits/chosen": -19.50495147705078, "logits/rejected": -19.249652862548828, 
"logps/chosen": -425.717041015625, "logps/rejected": -352.990966796875, "loss": 0.8224, "rewards/accuracies": 0.5, "rewards/chosen": 4.066357612609863, "rewards/margins": 1.3879473209381104, "rewards/rejected": 2.678410530090332, "step": 60970 }, { "epoch": 2.8311435071266073, "grad_norm": 84.6687240600586, "learning_rate": 2.8359410062367487e-08, "logits/chosen": -19.63931655883789, "logits/rejected": -18.315387725830078, "logps/chosen": -499.4461975097656, "logps/rejected": -322.14093017578125, "loss": 0.2394, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.147889137268066, "rewards/margins": 2.7077693939208984, "rewards/rejected": 1.4401195049285889, "step": 60980 }, { "epoch": 2.8316077812340406, "grad_norm": 41.98154067993164, "learning_rate": 2.8282031044461985e-08, "logits/chosen": -18.396108627319336, "logits/rejected": -17.79513168334961, "logps/chosen": -307.7290954589844, "logps/rejected": -246.47067260742188, "loss": 1.1537, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.4720160961151123, "rewards/margins": 0.9043936729431152, "rewards/rejected": 1.5676229000091553, "step": 60990 }, { "epoch": 2.8320720553414738, "grad_norm": 142.0328826904297, "learning_rate": 2.820465202655648e-08, "logits/chosen": -18.773921966552734, "logits/rejected": -17.836246490478516, "logps/chosen": -258.9104919433594, "logps/rejected": -180.08660888671875, "loss": 0.4075, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.135594129562378, "rewards/margins": 1.2367929220199585, "rewards/rejected": 0.8988014459609985, "step": 61000 }, { "epoch": 2.8325363294489065, "grad_norm": 63.177974700927734, "learning_rate": 2.812727300865097e-08, "logits/chosen": -19.737720489501953, "logits/rejected": -19.026798248291016, "logps/chosen": -459.94769287109375, "logps/rejected": -381.0565490722656, "loss": 0.5748, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.8136701583862305, "rewards/margins": 1.1612879037857056, "rewards/rejected": 
3.6523823738098145, "step": 61010 }, { "epoch": 2.8330006035563398, "grad_norm": 0.7473381161689758, "learning_rate": 2.804989399074547e-08, "logits/chosen": -19.40406036376953, "logits/rejected": -18.43618392944336, "logps/chosen": -369.0544128417969, "logps/rejected": -305.3167724609375, "loss": 0.7189, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.993757724761963, "rewards/margins": 2.170475482940674, "rewards/rejected": 0.8232821226119995, "step": 61020 }, { "epoch": 2.8334648776637725, "grad_norm": 70.083740234375, "learning_rate": 2.7972514972839964e-08, "logits/chosen": -18.685588836669922, "logits/rejected": -18.300880432128906, "logps/chosen": -363.5565185546875, "logps/rejected": -337.7018127441406, "loss": 0.9651, "rewards/accuracies": 0.5, "rewards/chosen": 3.715693235397339, "rewards/margins": 1.239774465560913, "rewards/rejected": 2.4759185314178467, "step": 61030 }, { "epoch": 2.8339291517712057, "grad_norm": 1.2101138830184937, "learning_rate": 2.789513595493446e-08, "logits/chosen": -19.189529418945312, "logits/rejected": -18.96578598022461, "logps/chosen": -447.9859313964844, "logps/rejected": -443.84088134765625, "loss": 1.0961, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.7106881141662598, "rewards/margins": 0.3737332820892334, "rewards/rejected": 3.3369545936584473, "step": 61040 }, { "epoch": 2.8343934258786385, "grad_norm": 78.65328216552734, "learning_rate": 2.7817756937028957e-08, "logits/chosen": -19.478466033935547, "logits/rejected": -19.253759384155273, "logps/chosen": -315.13726806640625, "logps/rejected": -337.73138427734375, "loss": 0.9493, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.4615111351013184, "rewards/margins": 0.6402050256729126, "rewards/rejected": 2.821305990219116, "step": 61050 }, { "epoch": 2.8348576999860717, "grad_norm": 54.10173416137695, "learning_rate": 2.7740377919123448e-08, "logits/chosen": -18.547435760498047, "logits/rejected": -18.95271873474121, 
"logps/chosen": -387.1690673828125, "logps/rejected": -311.1714782714844, "loss": 0.4007, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.2612464427948, "rewards/margins": 1.1318037509918213, "rewards/rejected": 2.1294426918029785, "step": 61060 }, { "epoch": 2.835321974093505, "grad_norm": 88.44969940185547, "learning_rate": 2.7662998901217943e-08, "logits/chosen": -19.894771575927734, "logits/rejected": -18.328622817993164, "logps/chosen": -431.96075439453125, "logps/rejected": -346.6556091308594, "loss": 1.3811, "rewards/accuracies": 0.5, "rewards/chosen": 3.470590591430664, "rewards/margins": 0.6347266435623169, "rewards/rejected": 2.835864305496216, "step": 61070 }, { "epoch": 2.8357862482009377, "grad_norm": 31.612829208374023, "learning_rate": 2.7585619883312438e-08, "logits/chosen": -20.789398193359375, "logits/rejected": -18.991342544555664, "logps/chosen": -496.70758056640625, "logps/rejected": -253.47006225585938, "loss": 0.3793, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.502193450927734, "rewards/margins": 2.0885684490203857, "rewards/rejected": 2.4136252403259277, "step": 61080 }, { "epoch": 2.836250522308371, "grad_norm": 52.61833572387695, "learning_rate": 2.7508240865406936e-08, "logits/chosen": -19.76105499267578, "logits/rejected": -19.391796112060547, "logps/chosen": -370.0645751953125, "logps/rejected": -291.5503234863281, "loss": 0.312, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.995940685272217, "rewards/margins": 1.9048116207122803, "rewards/rejected": 2.0911290645599365, "step": 61090 }, { "epoch": 2.836714796415804, "grad_norm": 85.30211639404297, "learning_rate": 2.743086184750143e-08, "logits/chosen": -18.430383682250977, "logits/rejected": -17.488191604614258, "logps/chosen": -420.53045654296875, "logps/rejected": -308.9305114746094, "loss": 0.6123, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2033743858337402, "rewards/margins": 1.5105082988739014, "rewards/rejected": 
1.6928659677505493, "step": 61100 }, { "epoch": 2.837179070523237, "grad_norm": 253.67198181152344, "learning_rate": 2.7353482829595925e-08, "logits/chosen": -18.667749404907227, "logits/rejected": -18.02450180053711, "logps/chosen": -427.7474670410156, "logps/rejected": -438.6957092285156, "loss": 0.4418, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.263354301452637, "rewards/margins": 1.860658049583435, "rewards/rejected": 2.4026968479156494, "step": 61110 }, { "epoch": 2.8376433446306697, "grad_norm": 199.5867919921875, "learning_rate": 2.727610381169042e-08, "logits/chosen": -18.387897491455078, "logits/rejected": -17.828411102294922, "logps/chosen": -367.3372497558594, "logps/rejected": -304.1094970703125, "loss": 0.5479, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.547360897064209, "rewards/margins": 1.7688791751861572, "rewards/rejected": 1.7784814834594727, "step": 61120 }, { "epoch": 2.838107618738103, "grad_norm": 99.69519805908203, "learning_rate": 2.7198724793784915e-08, "logits/chosen": -19.130695343017578, "logits/rejected": -18.148468017578125, "logps/chosen": -328.3117370605469, "logps/rejected": -307.04754638671875, "loss": 0.5356, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.096439838409424, "rewards/margins": 1.2006101608276367, "rewards/rejected": 1.8958295583724976, "step": 61130 }, { "epoch": 2.838571892845536, "grad_norm": 35.26104736328125, "learning_rate": 2.712134577587941e-08, "logits/chosen": -19.019916534423828, "logits/rejected": -19.01841163635254, "logps/chosen": -375.4248962402344, "logps/rejected": -332.6283264160156, "loss": 1.005, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.4497807025909424, "rewards/margins": 0.13433977961540222, "rewards/rejected": 2.3154408931732178, "step": 61140 }, { "epoch": 2.839036166952969, "grad_norm": 41.41191482543945, "learning_rate": 2.7043966757973908e-08, "logits/chosen": -19.95328140258789, "logits/rejected": -17.858566284179688, 
"logps/chosen": -454.08734130859375, "logps/rejected": -236.6901397705078, "loss": 0.2211, "rewards/accuracies": 1.0, "rewards/chosen": 4.050225734710693, "rewards/margins": 2.7349355220794678, "rewards/rejected": 1.3152902126312256, "step": 61150 }, { "epoch": 2.839500441060402, "grad_norm": 57.508949279785156, "learning_rate": 2.6966587740068402e-08, "logits/chosen": -18.498653411865234, "logits/rejected": -18.528738021850586, "logps/chosen": -348.70635986328125, "logps/rejected": -334.55755615234375, "loss": 1.5419, "rewards/accuracies": 0.5, "rewards/chosen": 2.4676880836486816, "rewards/margins": -0.49140501022338867, "rewards/rejected": 2.9590930938720703, "step": 61160 }, { "epoch": 2.8399647151678353, "grad_norm": 39.82143783569336, "learning_rate": 2.6889208722162897e-08, "logits/chosen": -18.55171012878418, "logits/rejected": -18.161746978759766, "logps/chosen": -381.984619140625, "logps/rejected": -337.75567626953125, "loss": 0.7999, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.402315855026245, "rewards/margins": 0.6009843349456787, "rewards/rejected": 2.8013315200805664, "step": 61170 }, { "epoch": 2.840428989275268, "grad_norm": 18.37026596069336, "learning_rate": 2.6811829704257392e-08, "logits/chosen": -19.593399047851562, "logits/rejected": -18.795753479003906, "logps/chosen": -404.00421142578125, "logps/rejected": -350.25787353515625, "loss": 0.5815, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.5868964195251465, "rewards/margins": 1.3982446193695068, "rewards/rejected": 2.1886518001556396, "step": 61180 }, { "epoch": 2.840893263382701, "grad_norm": 102.86539459228516, "learning_rate": 2.6734450686351887e-08, "logits/chosen": -19.92821502685547, "logits/rejected": -19.280399322509766, "logps/chosen": -429.3487243652344, "logps/rejected": -335.90740966796875, "loss": 0.6497, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7104504108428955, "rewards/margins": 1.0835517644882202, "rewards/rejected": 
2.6268982887268066, "step": 61190 }, { "epoch": 2.841357537490134, "grad_norm": 10.010420799255371, "learning_rate": 2.665707166844638e-08, "logits/chosen": -19.590351104736328, "logits/rejected": -18.391536712646484, "logps/chosen": -371.269775390625, "logps/rejected": -258.19122314453125, "loss": 0.2592, "rewards/accuracies": 1.0, "rewards/chosen": 4.177994251251221, "rewards/margins": 2.3026373386383057, "rewards/rejected": 1.8753570318222046, "step": 61200 }, { "epoch": 2.8418218115975673, "grad_norm": 12.940826416015625, "learning_rate": 2.657969265054088e-08, "logits/chosen": -19.043716430664062, "logits/rejected": -18.158592224121094, "logps/chosen": -543.7022094726562, "logps/rejected": -454.0311584472656, "loss": 0.5146, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.057478904724121, "rewards/margins": 1.2804434299468994, "rewards/rejected": 2.7770354747772217, "step": 61210 }, { "epoch": 2.842286085705, "grad_norm": 0.7414228916168213, "learning_rate": 2.6502313632635374e-08, "logits/chosen": -19.15596580505371, "logits/rejected": -17.978151321411133, "logps/chosen": -351.7572326660156, "logps/rejected": -326.48260498046875, "loss": 0.4674, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.315983533859253, "rewards/margins": 1.4456470012664795, "rewards/rejected": 1.8703362941741943, "step": 61220 }, { "epoch": 2.8427503598124333, "grad_norm": 18.572933197021484, "learning_rate": 2.6424934614729866e-08, "logits/chosen": -20.093347549438477, "logits/rejected": -18.539813995361328, "logps/chosen": -412.0331115722656, "logps/rejected": -360.39361572265625, "loss": 0.3076, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.582385540008545, "rewards/margins": 1.9825376272201538, "rewards/rejected": 2.5998477935791016, "step": 61230 }, { "epoch": 2.8432146339198665, "grad_norm": 130.3671417236328, "learning_rate": 2.6347555596824364e-08, "logits/chosen": -19.646778106689453, "logits/rejected": -18.736852645874023, 
"logps/chosen": -428.06805419921875, "logps/rejected": -331.6280517578125, "loss": 0.5565, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.395206451416016, "rewards/margins": 2.102931499481201, "rewards/rejected": 2.2922749519348145, "step": 61240 }, { "epoch": 2.8436789080272993, "grad_norm": 113.56413269042969, "learning_rate": 2.627017657891886e-08, "logits/chosen": -20.252790451049805, "logits/rejected": -18.3665714263916, "logps/chosen": -450.8023376464844, "logps/rejected": -283.90130615234375, "loss": 0.2646, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.590827465057373, "rewards/margins": 3.0852584838867188, "rewards/rejected": 1.505569577217102, "step": 61250 }, { "epoch": 2.8441431821347325, "grad_norm": 17.947710037231445, "learning_rate": 2.6192797561013353e-08, "logits/chosen": -18.570022583007812, "logits/rejected": -17.975845336914062, "logps/chosen": -364.18206787109375, "logps/rejected": -302.2073974609375, "loss": 1.4934, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.7706844806671143, "rewards/margins": 0.9934806823730469, "rewards/rejected": 1.777203917503357, "step": 61260 }, { "epoch": 2.8446074562421653, "grad_norm": 90.61585235595703, "learning_rate": 2.611541854310785e-08, "logits/chosen": -19.73477554321289, "logits/rejected": -19.484783172607422, "logps/chosen": -305.3650207519531, "logps/rejected": -232.3033447265625, "loss": 0.7272, "rewards/accuracies": 0.5, "rewards/chosen": 2.7644338607788086, "rewards/margins": 0.4725423753261566, "rewards/rejected": 2.291891574859619, "step": 61270 }, { "epoch": 2.8450717303495985, "grad_norm": 221.71914672851562, "learning_rate": 2.6038039525202346e-08, "logits/chosen": -19.6021671295166, "logits/rejected": -18.857275009155273, "logps/chosen": -412.982177734375, "logps/rejected": -386.6372985839844, "loss": 0.8229, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.122756242752075, "rewards/margins": 0.570547878742218, "rewards/rejected": 
2.552208185195923, "step": 61280 }, { "epoch": 2.8455360044570313, "grad_norm": 105.9270248413086, "learning_rate": 2.5960660507296837e-08, "logits/chosen": -17.99954605102539, "logits/rejected": -18.107868194580078, "logps/chosen": -285.52777099609375, "logps/rejected": -378.897705078125, "loss": 1.2266, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.9421272277832031, "rewards/margins": 0.3738480806350708, "rewards/rejected": 1.5682793855667114, "step": 61290 }, { "epoch": 2.8460002785644645, "grad_norm": 11.04665756225586, "learning_rate": 2.5883281489391335e-08, "logits/chosen": -18.413808822631836, "logits/rejected": -17.51839828491211, "logps/chosen": -444.2596130371094, "logps/rejected": -386.54779052734375, "loss": 1.3888, "rewards/accuracies": 0.5, "rewards/chosen": 4.521933555603027, "rewards/margins": 1.2926876544952393, "rewards/rejected": 3.229245662689209, "step": 61300 }, { "epoch": 2.8464645526718977, "grad_norm": 175.30982971191406, "learning_rate": 2.580590247148583e-08, "logits/chosen": -19.809659957885742, "logits/rejected": -19.34757423400879, "logps/chosen": -381.98974609375, "logps/rejected": -370.95281982421875, "loss": 0.8659, "rewards/accuracies": 0.5, "rewards/chosen": 2.9775664806365967, "rewards/margins": -0.024668991565704346, "rewards/rejected": 3.0022358894348145, "step": 61310 }, { "epoch": 2.8469288267793305, "grad_norm": 105.95797729492188, "learning_rate": 2.5728523453580325e-08, "logits/chosen": -18.801761627197266, "logits/rejected": -18.129335403442383, "logps/chosen": -381.1380310058594, "logps/rejected": -303.18328857421875, "loss": 0.7169, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.238338947296143, "rewards/margins": 2.1131231784820557, "rewards/rejected": 2.125215768814087, "step": 61320 }, { "epoch": 2.8473931008867637, "grad_norm": 1.9124486446380615, "learning_rate": 2.5651144435674823e-08, "logits/chosen": -18.346786499023438, "logits/rejected": -18.688541412353516, "logps/chosen": 
-355.0382385253906, "logps/rejected": -264.9402770996094, "loss": 0.7264, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.238976001739502, "rewards/margins": 0.7955209016799927, "rewards/rejected": 1.4434552192687988, "step": 61330 }, { "epoch": 2.8478573749941964, "grad_norm": 26.242647171020508, "learning_rate": 2.5573765417769318e-08, "logits/chosen": -19.74146842956543, "logits/rejected": -19.88768768310547, "logps/chosen": -471.17041015625, "logps/rejected": -338.58013916015625, "loss": 0.3137, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.094372272491455, "rewards/margins": 2.6264946460723877, "rewards/rejected": 2.4678778648376465, "step": 61340 }, { "epoch": 2.8483216491016297, "grad_norm": 27.664236068725586, "learning_rate": 2.549638639986381e-08, "logits/chosen": -18.444671630859375, "logits/rejected": -17.850467681884766, "logps/chosen": -416.58526611328125, "logps/rejected": -335.36566162109375, "loss": 0.4602, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.389042615890503, "rewards/margins": 1.9182803630828857, "rewards/rejected": 1.4707621335983276, "step": 61350 }, { "epoch": 2.8487859232090624, "grad_norm": 1.5863656997680664, "learning_rate": 2.5419007381958307e-08, "logits/chosen": -19.189205169677734, "logits/rejected": -17.8935546875, "logps/chosen": -420.2308044433594, "logps/rejected": -264.4487609863281, "loss": 0.2533, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.546050071716309, "rewards/margins": 2.7776596546173096, "rewards/rejected": 1.7683900594711304, "step": 61360 }, { "epoch": 2.8492501973164956, "grad_norm": 102.86585998535156, "learning_rate": 2.5341628364052802e-08, "logits/chosen": -19.798357009887695, "logits/rejected": -18.945430755615234, "logps/chosen": -546.5221557617188, "logps/rejected": -390.99554443359375, "loss": 0.6542, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 5.225188255310059, "rewards/margins": 2.0190443992614746, "rewards/rejected": 
3.206143617630005, "step": 61370 }, { "epoch": 2.849714471423929, "grad_norm": 20.217313766479492, "learning_rate": 2.5264249346147297e-08, "logits/chosen": -19.0338191986084, "logits/rejected": -18.107492446899414, "logps/chosen": -371.90228271484375, "logps/rejected": -324.2776184082031, "loss": 0.679, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.963879108428955, "rewards/margins": 0.8701078295707703, "rewards/rejected": 2.09377121925354, "step": 61380 }, { "epoch": 2.8501787455313616, "grad_norm": 2.7922134399414062, "learning_rate": 2.5186870328241795e-08, "logits/chosen": -20.001728057861328, "logits/rejected": -18.112998962402344, "logps/chosen": -585.9317626953125, "logps/rejected": -289.0982360839844, "loss": 0.08, "rewards/accuracies": 1.0, "rewards/chosen": 4.935309410095215, "rewards/margins": 4.190427303314209, "rewards/rejected": 0.7448815703392029, "step": 61390 }, { "epoch": 2.850643019638795, "grad_norm": 23.735002517700195, "learning_rate": 2.5109491310336286e-08, "logits/chosen": -18.702363967895508, "logits/rejected": -17.39914894104004, "logps/chosen": -309.0626525878906, "logps/rejected": -233.52938842773438, "loss": 0.9858, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.687522292137146, "rewards/margins": 0.8165546655654907, "rewards/rejected": 0.8709675669670105, "step": 61400 }, { "epoch": 2.8511072937462276, "grad_norm": 8.038228034973145, "learning_rate": 2.503211229243078e-08, "logits/chosen": -19.31661033630371, "logits/rejected": -18.124710083007812, "logps/chosen": -338.2858581542969, "logps/rejected": -245.8456268310547, "loss": 1.1108, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.280813217163086, "rewards/margins": 0.9837799072265625, "rewards/rejected": 1.297033429145813, "step": 61410 }, { "epoch": 2.851571567853661, "grad_norm": 158.59939575195312, "learning_rate": 2.495473327452528e-08, "logits/chosen": -19.95010757446289, "logits/rejected": -19.575401306152344, "logps/chosen": 
-400.64971923828125, "logps/rejected": -367.076904296875, "loss": 1.3042, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.2962679862976074, "rewards/margins": -0.194468691945076, "rewards/rejected": 3.490736722946167, "step": 61420 }, { "epoch": 2.8520358419610936, "grad_norm": 187.9275665283203, "learning_rate": 2.4877354256619774e-08, "logits/chosen": -20.29330825805664, "logits/rejected": -18.925241470336914, "logps/chosen": -531.3043212890625, "logps/rejected": -484.06060791015625, "loss": 0.4558, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.728327751159668, "rewards/margins": 1.1253111362457275, "rewards/rejected": 3.6030163764953613, "step": 61430 }, { "epoch": 2.852500116068527, "grad_norm": 59.64840316772461, "learning_rate": 2.479997523871427e-08, "logits/chosen": -18.28191375732422, "logits/rejected": -18.17953872680664, "logps/chosen": -338.9147644042969, "logps/rejected": -307.4339294433594, "loss": 0.8442, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.4283947944641113, "rewards/margins": 0.10141830146312714, "rewards/rejected": 2.3269765377044678, "step": 61440 }, { "epoch": 2.85296439017596, "grad_norm": 170.5992431640625, "learning_rate": 2.4722596220808767e-08, "logits/chosen": -18.620397567749023, "logits/rejected": -18.106502532958984, "logps/chosen": -460.582275390625, "logps/rejected": -370.1664123535156, "loss": 0.352, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 5.2328338623046875, "rewards/margins": 2.822971820831299, "rewards/rejected": 2.4098620414733887, "step": 61450 }, { "epoch": 2.853428664283393, "grad_norm": 0.2256951481103897, "learning_rate": 2.4645217202903258e-08, "logits/chosen": -19.587963104248047, "logits/rejected": -18.5300235748291, "logps/chosen": -439.9845275878906, "logps/rejected": -290.68768310546875, "loss": 0.183, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.339794635772705, "rewards/margins": 2.625692844390869, "rewards/rejected": 
1.7141015529632568, "step": 61460 }, { "epoch": 2.853892938390826, "grad_norm": 54.78183364868164, "learning_rate": 2.4567838184997753e-08, "logits/chosen": -19.33441162109375, "logits/rejected": -19.537626266479492, "logps/chosen": -329.89703369140625, "logps/rejected": -323.4066162109375, "loss": 0.4916, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.2578067779541016, "rewards/margins": 1.2315752506256104, "rewards/rejected": 2.026231288909912, "step": 61470 }, { "epoch": 2.8543572124982592, "grad_norm": 3.262068033218384, "learning_rate": 2.449045916709225e-08, "logits/chosen": -20.074026107788086, "logits/rejected": -19.396394729614258, "logps/chosen": -365.8713073730469, "logps/rejected": -332.269775390625, "loss": 0.5198, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.6608147621154785, "rewards/margins": 1.4693524837493896, "rewards/rejected": 2.1914620399475098, "step": 61480 }, { "epoch": 2.854821486605692, "grad_norm": 106.55620574951172, "learning_rate": 2.4413080149186746e-08, "logits/chosen": -19.221784591674805, "logits/rejected": -18.361879348754883, "logps/chosen": -411.674560546875, "logps/rejected": -290.2685546875, "loss": 0.8316, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6849753856658936, "rewards/margins": 1.326895833015442, "rewards/rejected": 2.358079671859741, "step": 61490 }, { "epoch": 2.855285760713125, "grad_norm": 260.5374755859375, "learning_rate": 2.433570113128124e-08, "logits/chosen": -19.320528030395508, "logits/rejected": -18.48984146118164, "logps/chosen": -499.16192626953125, "logps/rejected": -468.14080810546875, "loss": 0.812, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.4142537117004395, "rewards/margins": 0.7886953353881836, "rewards/rejected": 3.625558853149414, "step": 61500 }, { "epoch": 2.855750034820558, "grad_norm": 26.099760055541992, "learning_rate": 2.425832211337574e-08, "logits/chosen": -19.754255294799805, "logits/rejected": -18.321592330932617, 
"logps/chosen": -454.0047912597656, "logps/rejected": -327.8229064941406, "loss": 0.1834, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.1974196434021, "rewards/margins": 3.451002597808838, "rewards/rejected": 1.7464174032211304, "step": 61510 }, { "epoch": 2.856214308927991, "grad_norm": 27.351825714111328, "learning_rate": 2.418094309547023e-08, "logits/chosen": -19.58696746826172, "logits/rejected": -18.914794921875, "logps/chosen": -416.998291015625, "logps/rejected": -401.1993713378906, "loss": 0.5992, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.2962822914123535, "rewards/margins": 1.1153663396835327, "rewards/rejected": 3.180915594100952, "step": 61520 }, { "epoch": 2.856678583035424, "grad_norm": 6.8454108238220215, "learning_rate": 2.4103564077564725e-08, "logits/chosen": -19.79415512084961, "logits/rejected": -17.561782836914062, "logps/chosen": -409.64093017578125, "logps/rejected": -245.8749237060547, "loss": 0.1272, "rewards/accuracies": 1.0, "rewards/chosen": 4.2592058181762695, "rewards/margins": 3.1329898834228516, "rewards/rejected": 1.1262162923812866, "step": 61530 }, { "epoch": 2.857142857142857, "grad_norm": 80.89649200439453, "learning_rate": 2.4026185059659223e-08, "logits/chosen": -19.555307388305664, "logits/rejected": -18.778762817382812, "logps/chosen": -279.72613525390625, "logps/rejected": -289.3020935058594, "loss": 0.7308, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.1876893043518066, "rewards/margins": 1.1259139776229858, "rewards/rejected": 1.0617753267288208, "step": 61540 }, { "epoch": 2.8576071312502904, "grad_norm": 14.918234825134277, "learning_rate": 2.3948806041753717e-08, "logits/chosen": -19.867074966430664, "logits/rejected": -18.960365295410156, "logps/chosen": -405.8540344238281, "logps/rejected": -346.1605224609375, "loss": 0.6207, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.052183151245117, "rewards/margins": 1.1367394924163818, "rewards/rejected": 
2.915443181991577, "step": 61550 }, { "epoch": 2.858071405357723, "grad_norm": 31.12175750732422, "learning_rate": 2.3871427023848212e-08, "logits/chosen": -19.854299545288086, "logits/rejected": -18.91731071472168, "logps/chosen": -461.40338134765625, "logps/rejected": -382.6915588378906, "loss": 0.5507, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.77202033996582, "rewards/margins": 1.1971900463104248, "rewards/rejected": 3.5748298168182373, "step": 61560 }, { "epoch": 2.858535679465156, "grad_norm": 104.66858673095703, "learning_rate": 2.379404800594271e-08, "logits/chosen": -19.36116600036621, "logits/rejected": -18.980140686035156, "logps/chosen": -431.274169921875, "logps/rejected": -335.04718017578125, "loss": 0.5705, "rewards/accuracies": 0.5, "rewards/chosen": 3.844078779220581, "rewards/margins": 0.682766854763031, "rewards/rejected": 3.1613118648529053, "step": 61570 }, { "epoch": 2.858999953572589, "grad_norm": 108.59778594970703, "learning_rate": 2.3716668988037202e-08, "logits/chosen": -18.664024353027344, "logits/rejected": -18.840084075927734, "logps/chosen": -379.02325439453125, "logps/rejected": -314.601806640625, "loss": 1.4754, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.1447651386260986, "rewards/margins": -0.07827337086200714, "rewards/rejected": 2.2230384349823, "step": 61580 }, { "epoch": 2.8594642276800224, "grad_norm": 96.35549926757812, "learning_rate": 2.3639289970131696e-08, "logits/chosen": -18.443565368652344, "logits/rejected": -18.012718200683594, "logps/chosen": -363.242919921875, "logps/rejected": -265.136474609375, "loss": 0.3352, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.250514507293701, "rewards/margins": 1.9664266109466553, "rewards/rejected": 1.2840880155563354, "step": 61590 }, { "epoch": 2.859928501787455, "grad_norm": 196.40574645996094, "learning_rate": 2.3561910952226195e-08, "logits/chosen": -19.766101837158203, "logits/rejected": -19.495311737060547, "logps/chosen": 
-420.32568359375, "logps/rejected": -322.9595642089844, "loss": 0.7961, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.5571956634521484, "rewards/margins": 0.3252670168876648, "rewards/rejected": 3.231928586959839, "step": 61600 }, { "epoch": 2.8603927758948884, "grad_norm": 35.578216552734375, "learning_rate": 2.348453193432069e-08, "logits/chosen": -19.386234283447266, "logits/rejected": -18.757343292236328, "logps/chosen": -487.0035705566406, "logps/rejected": -385.9532165527344, "loss": 0.7646, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.6776123046875, "rewards/margins": 0.5823055505752563, "rewards/rejected": 3.095306873321533, "step": 61610 }, { "epoch": 2.8608570500023216, "grad_norm": 34.59233856201172, "learning_rate": 2.3407152916415184e-08, "logits/chosen": -20.39348030090332, "logits/rejected": -18.35915756225586, "logps/chosen": -491.84063720703125, "logps/rejected": -417.72857666015625, "loss": 0.321, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 5.1977434158325195, "rewards/margins": 2.2339775562286377, "rewards/rejected": 2.96376633644104, "step": 61620 }, { "epoch": 2.8613213241097544, "grad_norm": 150.7161407470703, "learning_rate": 2.332977389850968e-08, "logits/chosen": -18.860187530517578, "logits/rejected": -18.042407989501953, "logps/chosen": -380.492431640625, "logps/rejected": -286.98541259765625, "loss": 0.9103, "rewards/accuracies": 0.5, "rewards/chosen": 3.6254029273986816, "rewards/margins": 1.598169207572937, "rewards/rejected": 2.027233600616455, "step": 61630 }, { "epoch": 2.8617855982171876, "grad_norm": 0.0245828814804554, "learning_rate": 2.3252394880604174e-08, "logits/chosen": -19.0009822845459, "logits/rejected": -17.46552085876465, "logps/chosen": -417.60400390625, "logps/rejected": -260.65545654296875, "loss": 0.4748, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.4646809101104736, "rewards/margins": 2.654289722442627, "rewards/rejected": 0.8103916049003601, "step": 
61640 }, { "epoch": 2.8622498723246204, "grad_norm": 107.5757064819336, "learning_rate": 2.3175015862698668e-08, "logits/chosen": -18.84382438659668, "logits/rejected": -17.942859649658203, "logps/chosen": -348.7373046875, "logps/rejected": -294.1321105957031, "loss": 0.4928, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.1026923656463623, "rewards/margins": 1.4249188899993896, "rewards/rejected": 1.6777732372283936, "step": 61650 }, { "epoch": 2.8627141464320536, "grad_norm": 164.2776336669922, "learning_rate": 2.3097636844793166e-08, "logits/chosen": -19.3291015625, "logits/rejected": -18.69756507873535, "logps/chosen": -387.6837158203125, "logps/rejected": -321.4195861816406, "loss": 0.9768, "rewards/accuracies": 0.5, "rewards/chosen": 3.2679836750030518, "rewards/margins": 0.4522159695625305, "rewards/rejected": 2.815767765045166, "step": 61660 }, { "epoch": 2.8631784205394863, "grad_norm": 192.09609985351562, "learning_rate": 2.302025782688766e-08, "logits/chosen": -18.883712768554688, "logits/rejected": -17.785228729248047, "logps/chosen": -274.73663330078125, "logps/rejected": -284.05023193359375, "loss": 0.3929, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.5250191688537598, "rewards/margins": 1.4076263904571533, "rewards/rejected": 1.1173923015594482, "step": 61670 }, { "epoch": 2.8636426946469196, "grad_norm": 11.70590591430664, "learning_rate": 2.2950616710772707e-08, "logits/chosen": -18.57084846496582, "logits/rejected": -17.8842830657959, "logps/chosen": -407.3951110839844, "logps/rejected": -322.51348876953125, "loss": 0.7607, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.8523736000061035, "rewards/margins": 1.6371757984161377, "rewards/rejected": 2.2151975631713867, "step": 61680 }, { "epoch": 2.8641069687543528, "grad_norm": 107.73880004882812, "learning_rate": 2.2873237692867202e-08, "logits/chosen": -18.997310638427734, "logits/rejected": -18.079654693603516, "logps/chosen": -346.4002380371094, 
"logps/rejected": -257.194580078125, "loss": 0.4759, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.1873414516448975, "rewards/margins": 1.75518798828125, "rewards/rejected": 1.4321534633636475, "step": 61690 }, { "epoch": 2.8645712428617855, "grad_norm": 71.09982299804688, "learning_rate": 2.2795858674961697e-08, "logits/chosen": -19.019580841064453, "logits/rejected": -17.418243408203125, "logps/chosen": -384.4430236816406, "logps/rejected": -279.97210693359375, "loss": 0.4184, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.126372337341309, "rewards/margins": 2.6521055698394775, "rewards/rejected": 1.4742664098739624, "step": 61700 }, { "epoch": 2.8650355169692188, "grad_norm": 57.252017974853516, "learning_rate": 2.271847965705619e-08, "logits/chosen": -18.775569915771484, "logits/rejected": -18.172107696533203, "logps/chosen": -390.76776123046875, "logps/rejected": -296.1448669433594, "loss": 0.7018, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.887648582458496, "rewards/margins": 1.5915606021881104, "rewards/rejected": 1.2960878610610962, "step": 61710 }, { "epoch": 2.8654997910766515, "grad_norm": 21.24279022216797, "learning_rate": 2.2641100639150686e-08, "logits/chosen": -19.673681259155273, "logits/rejected": -18.756271362304688, "logps/chosen": -368.25634765625, "logps/rejected": -246.85299682617188, "loss": 0.6712, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.829404354095459, "rewards/margins": 2.0557005405426025, "rewards/rejected": 1.7737038135528564, "step": 61720 }, { "epoch": 2.8659640651840848, "grad_norm": 7.310616493225098, "learning_rate": 2.256372162124518e-08, "logits/chosen": -19.88947105407715, "logits/rejected": -19.862667083740234, "logps/chosen": -362.7107849121094, "logps/rejected": -400.0823669433594, "loss": 1.2001, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.980722427368164, "rewards/margins": -0.19115674495697021, "rewards/rejected": 3.171879291534424, 
"step": 61730 }, { "epoch": 2.8664283392915175, "grad_norm": 35.40876007080078, "learning_rate": 2.248634260333968e-08, "logits/chosen": -18.895591735839844, "logits/rejected": -18.2884578704834, "logps/chosen": -398.6673889160156, "logps/rejected": -344.45562744140625, "loss": 0.4219, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.9796805381774902, "rewards/margins": 1.817129135131836, "rewards/rejected": 2.1625514030456543, "step": 61740 }, { "epoch": 2.8668926133989507, "grad_norm": 76.73666381835938, "learning_rate": 2.2408963585434174e-08, "logits/chosen": -19.784740447998047, "logits/rejected": -18.5124454498291, "logps/chosen": -451.1640625, "logps/rejected": -380.4479064941406, "loss": 0.5941, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0348448753356934, "rewards/margins": 1.0564700365066528, "rewards/rejected": 1.9783750772476196, "step": 61750 }, { "epoch": 2.867356887506384, "grad_norm": 134.77670288085938, "learning_rate": 2.2331584567528665e-08, "logits/chosen": -18.68756103515625, "logits/rejected": -17.927961349487305, "logps/chosen": -293.3405456542969, "logps/rejected": -340.8144836425781, "loss": 1.242, "rewards/accuracies": 0.5, "rewards/chosen": 3.1774282455444336, "rewards/margins": 0.749796450138092, "rewards/rejected": 2.427631139755249, "step": 61760 }, { "epoch": 2.8678211616138167, "grad_norm": 7.580418586730957, "learning_rate": 2.2254205549623163e-08, "logits/chosen": -19.107616424560547, "logits/rejected": -18.282434463500977, "logps/chosen": -357.23455810546875, "logps/rejected": -284.55657958984375, "loss": 0.6698, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.7237613201141357, "rewards/margins": 2.4453539848327637, "rewards/rejected": 1.278407335281372, "step": 61770 }, { "epoch": 2.86828543572125, "grad_norm": 10.984620094299316, "learning_rate": 2.2176826531717658e-08, "logits/chosen": -19.818384170532227, "logits/rejected": -18.62384605407715, "logps/chosen": -447.8517150878906, 
"logps/rejected": -333.98583984375, "loss": 0.3612, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.169951915740967, "rewards/margins": 1.871912956237793, "rewards/rejected": 2.298038959503174, "step": 61780 }, { "epoch": 2.8687497098286827, "grad_norm": 15.183855056762695, "learning_rate": 2.2099447513812153e-08, "logits/chosen": -18.745038986206055, "logits/rejected": -17.718360900878906, "logps/chosen": -331.87451171875, "logps/rejected": -333.0885009765625, "loss": 0.4664, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.2063076496124268, "rewards/margins": 1.439732551574707, "rewards/rejected": 1.7665752172470093, "step": 61790 }, { "epoch": 2.869213983936116, "grad_norm": 16.55256462097168, "learning_rate": 2.202206849590665e-08, "logits/chosen": -20.255786895751953, "logits/rejected": -19.378253936767578, "logps/chosen": -379.728515625, "logps/rejected": -332.76116943359375, "loss": 0.8712, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.797595024108887, "rewards/margins": 1.4976165294647217, "rewards/rejected": 3.299978256225586, "step": 61800 }, { "epoch": 2.8696782580435487, "grad_norm": 0.26839250326156616, "learning_rate": 2.1944689478001146e-08, "logits/chosen": -19.93815803527832, "logits/rejected": -17.368671417236328, "logps/chosen": -432.86492919921875, "logps/rejected": -320.4324951171875, "loss": 0.4925, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.4746599197387695, "rewards/margins": 2.7595763206481934, "rewards/rejected": 1.7150834798812866, "step": 61810 }, { "epoch": 2.870142532150982, "grad_norm": 100.9148941040039, "learning_rate": 2.1867310460095637e-08, "logits/chosen": -17.348852157592773, "logits/rejected": -18.588083267211914, "logps/chosen": -268.5557556152344, "logps/rejected": -267.3472900390625, "loss": 1.2175, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.7087217569351196, "rewards/margins": 0.18755501508712769, "rewards/rejected": 1.5211668014526367, "step": 
61820 }, { "epoch": 2.870606806258415, "grad_norm": 32.7228889465332, "learning_rate": 2.1789931442190135e-08, "logits/chosen": -19.192487716674805, "logits/rejected": -18.44744873046875, "logps/chosen": -402.86346435546875, "logps/rejected": -351.3221435546875, "loss": 0.5337, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.6434645652770996, "rewards/margins": 0.8878753781318665, "rewards/rejected": 2.755589246749878, "step": 61830 }, { "epoch": 2.871071080365848, "grad_norm": 5.5277814865112305, "learning_rate": 2.171255242428463e-08, "logits/chosen": -20.3909854888916, "logits/rejected": -18.372533798217773, "logps/chosen": -330.2802429199219, "logps/rejected": -318.93304443359375, "loss": 0.9246, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4704864025115967, "rewards/margins": 0.4444383680820465, "rewards/rejected": 3.026048183441162, "step": 61840 }, { "epoch": 2.871535354473281, "grad_norm": 34.020782470703125, "learning_rate": 2.1635173406379125e-08, "logits/chosen": -19.46448516845703, "logits/rejected": -19.126983642578125, "logps/chosen": -436.8787536621094, "logps/rejected": -365.93359375, "loss": 0.8689, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 5.189786434173584, "rewards/margins": 0.9930096864700317, "rewards/rejected": 4.196776390075684, "step": 61850 }, { "epoch": 2.871999628580714, "grad_norm": 0.14307446777820587, "learning_rate": 2.1557794388473623e-08, "logits/chosen": -18.917163848876953, "logits/rejected": -19.401317596435547, "logps/chosen": -358.0055236816406, "logps/rejected": -311.80950927734375, "loss": 1.6927, "rewards/accuracies": 0.5, "rewards/chosen": 2.6093761920928955, "rewards/margins": -0.14842991530895233, "rewards/rejected": 2.7578063011169434, "step": 61860 }, { "epoch": 2.872463902688147, "grad_norm": 3.2622201442718506, "learning_rate": 2.1480415370568118e-08, "logits/chosen": -18.128252029418945, "logits/rejected": -18.244464874267578, "logps/chosen": -356.5174560546875, 
"logps/rejected": -370.4137268066406, "loss": 1.1738, "rewards/accuracies": 0.5, "rewards/chosen": 2.758883237838745, "rewards/margins": -0.01242753304541111, "rewards/rejected": 2.771310329437256, "step": 61870 }, { "epoch": 2.87292817679558, "grad_norm": 157.7417755126953, "learning_rate": 2.140303635266261e-08, "logits/chosen": -18.38951873779297, "logits/rejected": -17.72087860107422, "logps/chosen": -389.49566650390625, "logps/rejected": -282.44183349609375, "loss": 0.7255, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.2140114307403564, "rewards/margins": 1.3719184398651123, "rewards/rejected": 1.8420928716659546, "step": 61880 }, { "epoch": 2.873392450903013, "grad_norm": 34.71269989013672, "learning_rate": 2.1325657334757107e-08, "logits/chosen": -18.136837005615234, "logits/rejected": -17.673986434936523, "logps/chosen": -333.94256591796875, "logps/rejected": -236.66409301757812, "loss": 0.6721, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.8393135070800781, "rewards/margins": 0.5925248265266418, "rewards/rejected": 1.246788740158081, "step": 61890 }, { "epoch": 2.8738567250104463, "grad_norm": 167.26593017578125, "learning_rate": 2.1248278316851602e-08, "logits/chosen": -19.16815757751465, "logits/rejected": -18.198162078857422, "logps/chosen": -388.69732666015625, "logps/rejected": -374.5394592285156, "loss": 0.6657, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.13944673538208, "rewards/margins": 1.0361464023590088, "rewards/rejected": 2.103300094604492, "step": 61900 }, { "epoch": 2.874320999117879, "grad_norm": 185.96023559570312, "learning_rate": 2.1170899298946097e-08, "logits/chosen": -19.355913162231445, "logits/rejected": -18.187896728515625, "logps/chosen": -531.2595825195312, "logps/rejected": -410.71990966796875, "loss": 0.8045, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.8020756244659424, "rewards/margins": 1.1674540042877197, "rewards/rejected": 2.6346218585968018, "step": 61910 }, 
{ "epoch": 2.8747852732253123, "grad_norm": 228.01573181152344, "learning_rate": 2.1093520281040595e-08, "logits/chosen": -19.76127052307129, "logits/rejected": -19.4339599609375, "logps/chosen": -325.40985107421875, "logps/rejected": -319.6907653808594, "loss": 0.701, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.7264342308044434, "rewards/margins": 0.7704064846038818, "rewards/rejected": 1.956027626991272, "step": 61920 }, { "epoch": 2.8752495473327455, "grad_norm": 47.83038330078125, "learning_rate": 2.101614126313509e-08, "logits/chosen": -19.269956588745117, "logits/rejected": -18.457130432128906, "logps/chosen": -330.1097106933594, "logps/rejected": -286.83453369140625, "loss": 0.7281, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.338130474090576, "rewards/margins": 0.6926256418228149, "rewards/rejected": 1.6455047130584717, "step": 61930 }, { "epoch": 2.8757138214401783, "grad_norm": 49.499446868896484, "learning_rate": 2.093876224522958e-08, "logits/chosen": -19.46099853515625, "logits/rejected": -17.529544830322266, "logps/chosen": -355.3307800292969, "logps/rejected": -211.6637725830078, "loss": 0.293, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.816434144973755, "rewards/margins": 3.008183240890503, "rewards/rejected": 0.8082504272460938, "step": 61940 }, { "epoch": 2.876178095547611, "grad_norm": 48.947208404541016, "learning_rate": 2.086138322732408e-08, "logits/chosen": -19.12221336364746, "logits/rejected": -18.616703033447266, "logps/chosen": -384.04132080078125, "logps/rejected": -300.36029052734375, "loss": 0.9834, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.392399549484253, "rewards/margins": 1.678261160850525, "rewards/rejected": 1.7141382694244385, "step": 61950 }, { "epoch": 2.8766423696550443, "grad_norm": 96.4481201171875, "learning_rate": 2.0784004209418574e-08, "logits/chosen": -19.449344635009766, "logits/rejected": -18.722187042236328, "logps/chosen": -420.791259765625, 
"logps/rejected": -340.72149658203125, "loss": 0.7744, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.5326313972473145, "rewards/margins": 0.9403254389762878, "rewards/rejected": 1.5923060178756714, "step": 61960 }, { "epoch": 2.8771066437624775, "grad_norm": 64.56097412109375, "learning_rate": 2.070662519151307e-08, "logits/chosen": -18.437854766845703, "logits/rejected": -18.047100067138672, "logps/chosen": -395.40008544921875, "logps/rejected": -338.2452392578125, "loss": 1.5652, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.764875888824463, "rewards/margins": -0.5737538933753967, "rewards/rejected": 4.338629722595215, "step": 61970 }, { "epoch": 2.8775709178699103, "grad_norm": 9.339611053466797, "learning_rate": 2.0629246173607566e-08, "logits/chosen": -19.572307586669922, "logits/rejected": -18.652402877807617, "logps/chosen": -403.7001953125, "logps/rejected": -338.4736022949219, "loss": 0.6425, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.381176471710205, "rewards/margins": 1.4545621871948242, "rewards/rejected": 2.926614284515381, "step": 61980 }, { "epoch": 2.8780351919773435, "grad_norm": 21.38888168334961, "learning_rate": 2.0551867155702058e-08, "logits/chosen": -19.24667739868164, "logits/rejected": -18.598962783813477, "logps/chosen": -326.75701904296875, "logps/rejected": -246.9448699951172, "loss": 0.4026, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.979762315750122, "rewards/margins": 1.1629002094268799, "rewards/rejected": 0.8168624043464661, "step": 61990 }, { "epoch": 2.8784994660847767, "grad_norm": 71.12268829345703, "learning_rate": 2.0474488137796553e-08, "logits/chosen": -19.855884552001953, "logits/rejected": -18.837432861328125, "logps/chosen": -550.9351806640625, "logps/rejected": -415.7367248535156, "loss": 0.531, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 5.085168361663818, "rewards/margins": 1.5250622034072876, "rewards/rejected": 3.560105800628662, 
"step": 62000 }, { "epoch": 2.8789637401922095, "grad_norm": 6.288009166717529, "learning_rate": 2.039710911989105e-08, "logits/chosen": -18.693220138549805, "logits/rejected": -17.36122703552246, "logps/chosen": -555.2647705078125, "logps/rejected": -401.3725891113281, "loss": 0.3327, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.550671100616455, "rewards/margins": 2.9538071155548096, "rewards/rejected": 2.5968639850616455, "step": 62010 }, { "epoch": 2.8794280142996422, "grad_norm": 110.7878189086914, "learning_rate": 2.0319730101985545e-08, "logits/chosen": -19.199552536010742, "logits/rejected": -18.303470611572266, "logps/chosen": -307.5445251464844, "logps/rejected": -542.4971313476562, "loss": 0.8643, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.799492835998535, "rewards/margins": 2.527498722076416, "rewards/rejected": 0.2719945013523102, "step": 62020 }, { "epoch": 2.8798922884070755, "grad_norm": 250.57647705078125, "learning_rate": 2.024235108408004e-08, "logits/chosen": -18.309249877929688, "logits/rejected": -18.774293899536133, "logps/chosen": -331.2767639160156, "logps/rejected": -345.67822265625, "loss": 1.5603, "rewards/accuracies": 0.5, "rewards/chosen": 2.275792121887207, "rewards/margins": -0.7491413950920105, "rewards/rejected": 3.0249335765838623, "step": 62030 }, { "epoch": 2.8803565625145087, "grad_norm": 62.60655975341797, "learning_rate": 2.0164972066174538e-08, "logits/chosen": -18.737781524658203, "logits/rejected": -18.436595916748047, "logps/chosen": -441.0672302246094, "logps/rejected": -380.5555725097656, "loss": 0.8711, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.5759687423706055, "rewards/margins": 1.0773003101348877, "rewards/rejected": 1.4986684322357178, "step": 62040 }, { "epoch": 2.8808208366219414, "grad_norm": 127.97105407714844, "learning_rate": 2.008759304826903e-08, "logits/chosen": -19.175569534301758, "logits/rejected": -18.607927322387695, "logps/chosen": 
-488.5646057128906, "logps/rejected": -410.02056884765625, "loss": 0.8088, "rewards/accuracies": 0.5, "rewards/chosen": 4.4205169677734375, "rewards/margins": 0.5441175103187561, "rewards/rejected": 3.876399517059326, "step": 62050 }, { "epoch": 2.8812851107293747, "grad_norm": 115.041748046875, "learning_rate": 2.0010214030363524e-08, "logits/chosen": -19.884258270263672, "logits/rejected": -18.687101364135742, "logps/chosen": -343.82025146484375, "logps/rejected": -254.41384887695312, "loss": 0.4713, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.23778772354126, "rewards/margins": 2.1142027378082275, "rewards/rejected": 2.1235849857330322, "step": 62060 }, { "epoch": 2.881749384836808, "grad_norm": 176.7801513671875, "learning_rate": 1.993283501245802e-08, "logits/chosen": -18.611984252929688, "logits/rejected": -18.83533477783203, "logps/chosen": -310.8855285644531, "logps/rejected": -368.05645751953125, "loss": 1.0757, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 2.775813102722168, "rewards/margins": -0.0685567632317543, "rewards/rejected": 2.844369888305664, "step": 62070 }, { "epoch": 2.8822136589442406, "grad_norm": 11.33565902709961, "learning_rate": 1.9855455994552517e-08, "logits/chosen": -20.03585433959961, "logits/rejected": -18.347034454345703, "logps/chosen": -398.944580078125, "logps/rejected": -234.7912139892578, "loss": 0.4044, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.58612322807312, "rewards/margins": 1.9885746240615845, "rewards/rejected": 1.5975487232208252, "step": 62080 }, { "epoch": 2.882677933051674, "grad_norm": 69.5378646850586, "learning_rate": 1.9778076976647012e-08, "logits/chosen": -19.255643844604492, "logits/rejected": -18.702775955200195, "logps/chosen": -398.79827880859375, "logps/rejected": -306.6669616699219, "loss": 0.3305, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.9038479328155518, "rewards/margins": 1.8405545949935913, "rewards/rejected": 2.0632925033569336, 
"step": 62090 }, { "epoch": 2.8831422071591066, "grad_norm": 37.512454986572266, "learning_rate": 1.9700697958741503e-08, "logits/chosen": -19.040124893188477, "logits/rejected": -19.39145278930664, "logps/chosen": -319.12841796875, "logps/rejected": -320.53326416015625, "loss": 0.8154, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.439302682876587, "rewards/margins": 0.7562500834465027, "rewards/rejected": 1.683052659034729, "step": 62100 }, { "epoch": 2.88360648126654, "grad_norm": 180.0022430419922, "learning_rate": 1.9623318940836e-08, "logits/chosen": -19.311147689819336, "logits/rejected": -18.188268661499023, "logps/chosen": -434.10382080078125, "logps/rejected": -363.85406494140625, "loss": 0.5239, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.5865478515625, "rewards/margins": 1.3426692485809326, "rewards/rejected": 2.2438786029815674, "step": 62110 }, { "epoch": 2.8840707553739726, "grad_norm": 52.29737091064453, "learning_rate": 1.9545939922930496e-08, "logits/chosen": -18.3684139251709, "logits/rejected": -17.652362823486328, "logps/chosen": -379.9224548339844, "logps/rejected": -305.7598571777344, "loss": 0.4268, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.4832472801208496, "rewards/margins": 1.194362759590149, "rewards/rejected": 1.2888844013214111, "step": 62120 }, { "epoch": 2.884535029481406, "grad_norm": 1.6751657724380493, "learning_rate": 1.946856090502499e-08, "logits/chosen": -19.190330505371094, "logits/rejected": -18.774974822998047, "logps/chosen": -312.68829345703125, "logps/rejected": -331.87127685546875, "loss": 0.7981, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.6568400859832764, "rewards/margins": 0.7520006895065308, "rewards/rejected": 1.9048397541046143, "step": 62130 }, { "epoch": 2.884999303588839, "grad_norm": 6.989424705505371, "learning_rate": 1.939118188711949e-08, "logits/chosen": -19.15575408935547, "logits/rejected": -18.200138092041016, "logps/chosen": 
-339.613037109375, "logps/rejected": -281.1024169921875, "loss": 0.3819, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.316112995147705, "rewards/margins": 1.8214327096939087, "rewards/rejected": 1.4946801662445068, "step": 62140 }, { "epoch": 2.885463577696272, "grad_norm": 125.21521759033203, "learning_rate": 1.9313802869213984e-08, "logits/chosen": -18.825088500976562, "logits/rejected": -18.172866821289062, "logps/chosen": -430.14044189453125, "logps/rejected": -320.7886047363281, "loss": 0.4767, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.581428050994873, "rewards/margins": 1.5069211721420288, "rewards/rejected": 2.074506998062134, "step": 62150 }, { "epoch": 2.885927851803705, "grad_norm": 59.93654251098633, "learning_rate": 1.9236423851308475e-08, "logits/chosen": -19.319625854492188, "logits/rejected": -18.933486938476562, "logps/chosen": -541.67529296875, "logps/rejected": -468.4150390625, "loss": 0.8871, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.711645126342773, "rewards/margins": 0.9526758193969727, "rewards/rejected": 3.75896954536438, "step": 62160 }, { "epoch": 2.886392125911138, "grad_norm": 37.11730194091797, "learning_rate": 1.9159044833402973e-08, "logits/chosen": -18.68503189086914, "logits/rejected": -18.499576568603516, "logps/chosen": -468.8955993652344, "logps/rejected": -348.0218811035156, "loss": 0.6436, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.968064069747925, "rewards/margins": 1.610654592514038, "rewards/rejected": 2.3574090003967285, "step": 62170 }, { "epoch": 2.886856400018571, "grad_norm": 4.0087809562683105, "learning_rate": 1.9081665815497468e-08, "logits/chosen": -19.235614776611328, "logits/rejected": -17.978073120117188, "logps/chosen": -307.70562744140625, "logps/rejected": -269.2423095703125, "loss": 0.177, "rewards/accuracies": 1.0, "rewards/chosen": 3.3558356761932373, "rewards/margins": 2.3647208213806152, "rewards/rejected": 0.9911152720451355, "step": 
62180 }, { "epoch": 2.887320674126004, "grad_norm": 129.79586791992188, "learning_rate": 1.9004286797591963e-08, "logits/chosen": -18.751327514648438, "logits/rejected": -18.269062042236328, "logps/chosen": -244.3124542236328, "logps/rejected": -201.78184509277344, "loss": 0.6046, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.205333948135376, "rewards/margins": 0.3599799573421478, "rewards/rejected": 0.8453539609909058, "step": 62190 }, { "epoch": 2.887784948233437, "grad_norm": 64.7137451171875, "learning_rate": 1.892690777968646e-08, "logits/chosen": -19.586320877075195, "logits/rejected": -18.47389793395996, "logps/chosen": -404.5673828125, "logps/rejected": -329.76861572265625, "loss": 0.667, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.635920524597168, "rewards/margins": 1.5435197353363037, "rewards/rejected": 3.0924010276794434, "step": 62200 }, { "epoch": 2.8882492223408702, "grad_norm": 12.215314865112305, "learning_rate": 1.8849528761780956e-08, "logits/chosen": -18.782350540161133, "logits/rejected": -18.06211280822754, "logps/chosen": -377.16290283203125, "logps/rejected": -337.4378356933594, "loss": 0.9593, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.9407525062561035, "rewards/margins": 1.1086206436157227, "rewards/rejected": 2.8321316242218018, "step": 62210 }, { "epoch": 2.888713496448303, "grad_norm": 26.405818939208984, "learning_rate": 1.8772149743875447e-08, "logits/chosen": -18.867033004760742, "logits/rejected": -18.100231170654297, "logps/chosen": -392.3008117675781, "logps/rejected": -271.108154296875, "loss": 1.3153, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.0374250411987305, "rewards/margins": 1.7254245281219482, "rewards/rejected": 2.312000036239624, "step": 62220 }, { "epoch": 2.889177770555736, "grad_norm": 98.46910095214844, "learning_rate": 1.8694770725969945e-08, "logits/chosen": -18.698705673217773, "logits/rejected": -17.933916091918945, "logps/chosen": 
-350.47357177734375, "logps/rejected": -233.7924346923828, "loss": 0.6568, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.5148983001708984, "rewards/margins": 1.654016137123108, "rewards/rejected": 1.8608827590942383, "step": 62230 }, { "epoch": 2.889642044663169, "grad_norm": 157.12229919433594, "learning_rate": 1.861739170806444e-08, "logits/chosen": -19.39145278930664, "logits/rejected": -20.057003021240234, "logps/chosen": -364.59716796875, "logps/rejected": -440.70355224609375, "loss": 1.4382, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.908202886581421, "rewards/margins": -0.5879687070846558, "rewards/rejected": 3.496171474456787, "step": 62240 }, { "epoch": 2.890106318770602, "grad_norm": 0.093539297580719, "learning_rate": 1.8540012690158935e-08, "logits/chosen": -18.921117782592773, "logits/rejected": -18.82566261291504, "logps/chosen": -408.2550354003906, "logps/rejected": -358.01043701171875, "loss": 1.6922, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6809844970703125, "rewards/margins": 1.2621610164642334, "rewards/rejected": 2.4188232421875, "step": 62250 }, { "epoch": 2.890570592878035, "grad_norm": 65.61473083496094, "learning_rate": 1.8462633672253433e-08, "logits/chosen": -19.15458106994629, "logits/rejected": -17.875198364257812, "logps/chosen": -330.9140625, "logps/rejected": -255.40420532226562, "loss": 0.409, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.6996805667877197, "rewards/margins": 2.2414016723632812, "rewards/rejected": 0.4582786560058594, "step": 62260 }, { "epoch": 2.891034866985468, "grad_norm": 13.912505149841309, "learning_rate": 1.8385254654347927e-08, "logits/chosen": -18.61199951171875, "logits/rejected": -18.228519439697266, "logps/chosen": -397.7393798828125, "logps/rejected": -326.74212646484375, "loss": 0.9149, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.8792171478271484, "rewards/margins": 0.8684536814689636, "rewards/rejected": 
3.01076340675354, "step": 62270 }, { "epoch": 2.8914991410929014, "grad_norm": 227.71900939941406, "learning_rate": 1.830787563644242e-08, "logits/chosen": -19.194339752197266, "logits/rejected": -19.239940643310547, "logps/chosen": -296.29681396484375, "logps/rejected": -229.09912109375, "loss": 0.9581, "rewards/accuracies": 0.5, "rewards/chosen": 2.2519092559814453, "rewards/margins": 0.9198077321052551, "rewards/rejected": 1.332101583480835, "step": 62280 }, { "epoch": 2.891963415200334, "grad_norm": 64.86380767822266, "learning_rate": 1.8230496618536917e-08, "logits/chosen": -18.100284576416016, "logits/rejected": -18.114198684692383, "logps/chosen": -237.6507568359375, "logps/rejected": -215.6730194091797, "loss": 1.0142, "rewards/accuracies": 0.5, "rewards/chosen": 1.9756639003753662, "rewards/margins": 0.3539584279060364, "rewards/rejected": 1.6217056512832642, "step": 62290 }, { "epoch": 2.8924276893077674, "grad_norm": 31.5147647857666, "learning_rate": 1.8153117600631412e-08, "logits/chosen": -19.74161720275879, "logits/rejected": -18.346050262451172, "logps/chosen": -352.9870300292969, "logps/rejected": -232.31741333007812, "loss": 0.2978, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.020678997039795, "rewards/margins": 3.233417510986328, "rewards/rejected": 0.7872614860534668, "step": 62300 }, { "epoch": 2.8928919634152006, "grad_norm": 44.403175354003906, "learning_rate": 1.8075738582725906e-08, "logits/chosen": -18.823345184326172, "logits/rejected": -18.29739761352539, "logps/chosen": -362.03955078125, "logps/rejected": -344.7802734375, "loss": 0.6461, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.731546401977539, "rewards/margins": 1.57806396484375, "rewards/rejected": 2.153482437133789, "step": 62310 }, { "epoch": 2.8933562375226334, "grad_norm": 0.6719360947608948, "learning_rate": 1.7998359564820405e-08, "logits/chosen": -18.77791404724121, "logits/rejected": -18.123027801513672, "logps/chosen": -369.86956787109375, 
"logps/rejected": -273.9646911621094, "loss": 0.6642, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.067754745483398, "rewards/margins": 1.7016003131866455, "rewards/rejected": 2.366154909133911, "step": 62320 }, { "epoch": 2.893820511630066, "grad_norm": 0.1406467705965042, "learning_rate": 1.7920980546914896e-08, "logits/chosen": -19.079608917236328, "logits/rejected": -18.052989959716797, "logps/chosen": -355.49383544921875, "logps/rejected": -303.82232666015625, "loss": 0.3859, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.2584240436553955, "rewards/margins": 1.668357491493225, "rewards/rejected": 0.5900663733482361, "step": 62330 }, { "epoch": 2.8942847857374994, "grad_norm": 15.867907524108887, "learning_rate": 1.784360152900939e-08, "logits/chosen": -18.156009674072266, "logits/rejected": -17.745134353637695, "logps/chosen": -341.7884826660156, "logps/rejected": -269.379150390625, "loss": 0.3485, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.4444096088409424, "rewards/margins": 1.6083176136016846, "rewards/rejected": 0.8360918164253235, "step": 62340 }, { "epoch": 2.8947490598449326, "grad_norm": 194.906982421875, "learning_rate": 1.776622251110389e-08, "logits/chosen": -18.289714813232422, "logits/rejected": -18.838623046875, "logps/chosen": -291.0772399902344, "logps/rejected": -300.0336608886719, "loss": 1.2047, "rewards/accuracies": 0.5, "rewards/chosen": 1.566344141960144, "rewards/margins": -0.6355170011520386, "rewards/rejected": 2.201861619949341, "step": 62350 }, { "epoch": 2.8952133339523654, "grad_norm": 24.14899444580078, "learning_rate": 1.7688843493198384e-08, "logits/chosen": -19.634023666381836, "logits/rejected": -19.066015243530273, "logps/chosen": -339.4436340332031, "logps/rejected": -291.8758544921875, "loss": 0.4164, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.7774460315704346, "rewards/margins": 1.6035499572753906, "rewards/rejected": 2.173896074295044, "step": 62360 }, { 
"epoch": 2.8956776080597986, "grad_norm": 0.1824997514486313, "learning_rate": 1.7611464475292878e-08, "logits/chosen": -19.54830551147461, "logits/rejected": -18.712871551513672, "logps/chosen": -303.04754638671875, "logps/rejected": -242.78225708007812, "loss": 1.1686, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.8210169076919556, "rewards/margins": 0.6517950892448425, "rewards/rejected": 1.1692218780517578, "step": 62370 }, { "epoch": 2.896141882167232, "grad_norm": 57.635353088378906, "learning_rate": 1.7534085457387376e-08, "logits/chosen": -18.78167724609375, "logits/rejected": -18.514352798461914, "logps/chosen": -316.13043212890625, "logps/rejected": -299.7733154296875, "loss": 0.6506, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.36950945854187, "rewards/margins": 0.42034730315208435, "rewards/rejected": 1.949162244796753, "step": 62380 }, { "epoch": 2.8966061562746646, "grad_norm": 179.83409118652344, "learning_rate": 1.7456706439481868e-08, "logits/chosen": -18.97584342956543, "logits/rejected": -18.381332397460938, "logps/chosen": -438.54107666015625, "logps/rejected": -378.6663513183594, "loss": 1.2028, "rewards/accuracies": 0.5, "rewards/chosen": 4.322152614593506, "rewards/margins": 0.7049911618232727, "rewards/rejected": 3.617161989212036, "step": 62390 }, { "epoch": 2.8970704303820973, "grad_norm": 177.2047576904297, "learning_rate": 1.7379327421576363e-08, "logits/chosen": -18.88506507873535, "logits/rejected": -18.470596313476562, "logps/chosen": -378.666748046875, "logps/rejected": -429.62603759765625, "loss": 0.6443, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.047471284866333, "rewards/margins": 0.6310631036758423, "rewards/rejected": 2.416408061981201, "step": 62400 }, { "epoch": 2.8975347044895305, "grad_norm": 0.0033731739968061447, "learning_rate": 1.730194840367086e-08, "logits/chosen": -19.360532760620117, "logits/rejected": -17.266414642333984, "logps/chosen": -489.09661865234375, 
"logps/rejected": -338.2030334472656, "loss": 0.2057, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.3936285972595215, "rewards/margins": 3.433211088180542, "rewards/rejected": 1.96041738986969, "step": 62410 }, { "epoch": 2.8979989785969638, "grad_norm": 0.27606266736984253, "learning_rate": 1.7224569385765355e-08, "logits/chosen": -19.484140396118164, "logits/rejected": -18.183841705322266, "logps/chosen": -308.6123046875, "logps/rejected": -299.8498840332031, "loss": 0.9138, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.911567449569702, "rewards/margins": 0.602872908115387, "rewards/rejected": 2.30869460105896, "step": 62420 }, { "epoch": 2.8984632527043965, "grad_norm": 155.75180053710938, "learning_rate": 1.714719036785985e-08, "logits/chosen": -19.48392105102539, "logits/rejected": -18.984329223632812, "logps/chosen": -503.0137634277344, "logps/rejected": -398.499267578125, "loss": 0.8564, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.679137706756592, "rewards/margins": 0.7589993476867676, "rewards/rejected": 2.9201388359069824, "step": 62430 }, { "epoch": 2.8989275268118297, "grad_norm": 77.57327270507812, "learning_rate": 1.7069811349954348e-08, "logits/chosen": -18.86839485168457, "logits/rejected": -18.013370513916016, "logps/chosen": -316.9817810058594, "logps/rejected": -290.2445983886719, "loss": 0.3858, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.3854026794433594, "rewards/margins": 1.854797124862671, "rewards/rejected": 1.530605673789978, "step": 62440 }, { "epoch": 2.899391800919263, "grad_norm": 35.21255111694336, "learning_rate": 1.699243233204884e-08, "logits/chosen": -17.876806259155273, "logits/rejected": -17.877269744873047, "logps/chosen": -363.3744201660156, "logps/rejected": -341.77825927734375, "loss": 1.3039, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1405723094940186, "rewards/margins": -0.004336738493293524, "rewards/rejected": 3.144908905029297, "step": 
62450 }, { "epoch": 2.8998560750266957, "grad_norm": 12.803351402282715, "learning_rate": 1.6915053314143334e-08, "logits/chosen": -19.992300033569336, "logits/rejected": -18.796985626220703, "logps/chosen": -429.8646545410156, "logps/rejected": -309.6079406738281, "loss": 0.2324, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.342311382293701, "rewards/margins": 2.321049213409424, "rewards/rejected": 1.0212624073028564, "step": 62460 }, { "epoch": 2.900320349134129, "grad_norm": 0.24430842697620392, "learning_rate": 1.6837674296237832e-08, "logits/chosen": -19.158405303955078, "logits/rejected": -18.44001579284668, "logps/chosen": -357.70233154296875, "logps/rejected": -271.8277893066406, "loss": 0.7379, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.4072556495666504, "rewards/margins": 1.7280170917510986, "rewards/rejected": 1.6792383193969727, "step": 62470 }, { "epoch": 2.9007846232415617, "grad_norm": 0.055129751563072205, "learning_rate": 1.6760295278332327e-08, "logits/chosen": -20.06614875793457, "logits/rejected": -18.5833797454834, "logps/chosen": -389.36944580078125, "logps/rejected": -287.85089111328125, "loss": 0.514, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.423336505889893, "rewards/margins": 2.6979563236236572, "rewards/rejected": 1.7253801822662354, "step": 62480 }, { "epoch": 2.901248897348995, "grad_norm": 24.86804962158203, "learning_rate": 1.6682916260426822e-08, "logits/chosen": -19.035839080810547, "logits/rejected": -17.942546844482422, "logps/chosen": -411.69696044921875, "logps/rejected": -370.3114929199219, "loss": 1.0474, "rewards/accuracies": 0.5, "rewards/chosen": 3.1869969367980957, "rewards/margins": 0.5616936087608337, "rewards/rejected": 2.6253037452697754, "step": 62490 }, { "epoch": 2.9017131714564277, "grad_norm": 39.477386474609375, "learning_rate": 1.660553724252132e-08, "logits/chosen": -19.560758590698242, "logits/rejected": -18.832035064697266, "logps/chosen": -348.4833984375, 
"logps/rejected": -331.8253479003906, "loss": 0.6975, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.747760057449341, "rewards/margins": 0.9830910563468933, "rewards/rejected": 2.7646689414978027, "step": 62500 }, { "epoch": 2.902177445563861, "grad_norm": 33.46603012084961, "learning_rate": 1.652815822461581e-08, "logits/chosen": -19.16268539428711, "logits/rejected": -17.98440933227539, "logps/chosen": -387.35186767578125, "logps/rejected": -282.2189636230469, "loss": 0.565, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.6928164958953857, "rewards/margins": 1.3813438415527344, "rewards/rejected": 1.3114726543426514, "step": 62510 }, { "epoch": 2.902641719671294, "grad_norm": 15.414779663085938, "learning_rate": 1.6450779206710306e-08, "logits/chosen": -18.85879135131836, "logits/rejected": -17.340654373168945, "logps/chosen": -499.9676818847656, "logps/rejected": -250.2388458251953, "loss": 0.1461, "rewards/accuracies": 1.0, "rewards/chosen": 4.523498058319092, "rewards/margins": 3.549591541290283, "rewards/rejected": 0.9739063382148743, "step": 62520 }, { "epoch": 2.903105993778727, "grad_norm": 34.84385681152344, "learning_rate": 1.6373400188804804e-08, "logits/chosen": -19.10829734802246, "logits/rejected": -18.19307518005371, "logps/chosen": -437.27557373046875, "logps/rejected": -289.0514221191406, "loss": 0.5217, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.4531867504119873, "rewards/margins": 1.3913801908493042, "rewards/rejected": 2.0618066787719727, "step": 62530 }, { "epoch": 2.90357026788616, "grad_norm": 0.2939368486404419, "learning_rate": 1.6303759072689847e-08, "logits/chosen": -19.273082733154297, "logits/rejected": -18.920114517211914, "logps/chosen": -382.80389404296875, "logps/rejected": -232.7003631591797, "loss": 0.6526, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.3475308418273926, "rewards/margins": 1.1502147912979126, "rewards/rejected": 1.1973161697387695, "step": 62540 }, { 
"epoch": 2.904034541993593, "grad_norm": 47.646873474121094, "learning_rate": 1.6226380054784345e-08, "logits/chosen": -19.0203857421875, "logits/rejected": -17.409854888916016, "logps/chosen": -464.2147521972656, "logps/rejected": -259.3519592285156, "loss": 0.5015, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.55193567276001, "rewards/margins": 1.893446683883667, "rewards/rejected": 2.6584885120391846, "step": 62550 }, { "epoch": 2.904498816101026, "grad_norm": 44.746212005615234, "learning_rate": 1.614900103687884e-08, "logits/chosen": -20.049671173095703, "logits/rejected": -19.059110641479492, "logps/chosen": -330.0125427246094, "logps/rejected": -288.401123046875, "loss": 0.6523, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.456526279449463, "rewards/margins": 0.8126134872436523, "rewards/rejected": 1.6439129114151, "step": 62560 }, { "epoch": 2.904963090208459, "grad_norm": 32.53147506713867, "learning_rate": 1.6071622018973335e-08, "logits/chosen": -18.76736831665039, "logits/rejected": -18.080968856811523, "logps/chosen": -544.8247680664062, "logps/rejected": -399.9391174316406, "loss": 0.3718, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.028402328491211, "rewards/margins": 2.0406320095062256, "rewards/rejected": 1.9877703189849854, "step": 62570 }, { "epoch": 2.905427364315892, "grad_norm": 246.45016479492188, "learning_rate": 1.599424300106783e-08, "logits/chosen": -18.91509246826172, "logits/rejected": -17.876501083374023, "logps/chosen": -506.345947265625, "logps/rejected": -420.24078369140625, "loss": 0.3379, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.915363311767578, "rewards/margins": 2.4291775226593018, "rewards/rejected": 2.486185312271118, "step": 62580 }, { "epoch": 2.9058916384233253, "grad_norm": 61.165374755859375, "learning_rate": 1.5916863983162324e-08, "logits/chosen": -18.159236907958984, "logits/rejected": -17.018444061279297, "logps/chosen": -333.51690673828125, 
"logps/rejected": -269.8168640136719, "loss": 0.8596, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.629800319671631, "rewards/margins": 2.3821537494659424, "rewards/rejected": 1.247646689414978, "step": 62590 }, { "epoch": 2.906355912530758, "grad_norm": 10.879914283752441, "learning_rate": 1.583948496525682e-08, "logits/chosen": -18.724058151245117, "logits/rejected": -18.32961654663086, "logps/chosen": -372.1816711425781, "logps/rejected": -298.40692138671875, "loss": 0.8039, "rewards/accuracies": 0.5, "rewards/chosen": 3.1961703300476074, "rewards/margins": 1.7791883945465088, "rewards/rejected": 1.4169822931289673, "step": 62600 }, { "epoch": 2.9068201866381913, "grad_norm": 24.171165466308594, "learning_rate": 1.5762105947351317e-08, "logits/chosen": -19.402568817138672, "logits/rejected": -18.601810455322266, "logps/chosen": -240.8731231689453, "logps/rejected": -265.38043212890625, "loss": 1.146, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.2909865379333496, "rewards/margins": 0.1757895052433014, "rewards/rejected": 2.115196704864502, "step": 62610 }, { "epoch": 2.907284460745624, "grad_norm": 114.29432678222656, "learning_rate": 1.5684726929445812e-08, "logits/chosen": -18.71129608154297, "logits/rejected": -18.123340606689453, "logps/chosen": -414.481689453125, "logps/rejected": -360.32623291015625, "loss": 1.1136, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.695927858352661, "rewards/margins": 0.7095538973808289, "rewards/rejected": 2.9863739013671875, "step": 62620 }, { "epoch": 2.9077487348530573, "grad_norm": 159.02008056640625, "learning_rate": 1.5607347911540307e-08, "logits/chosen": -18.684152603149414, "logits/rejected": -18.220247268676758, "logps/chosen": -428.9051818847656, "logps/rejected": -376.10186767578125, "loss": 0.7336, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.5146305561065674, "rewards/margins": 1.131088137626648, "rewards/rejected": 2.383542060852051, "step": 62630 
}, { "epoch": 2.90821300896049, "grad_norm": 37.263755798339844, "learning_rate": 1.55299688936348e-08, "logits/chosen": -19.17149543762207, "logits/rejected": -18.43326187133789, "logps/chosen": -354.0694885253906, "logps/rejected": -329.5850830078125, "loss": 0.6609, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.013033628463745, "rewards/margins": 1.2206026315689087, "rewards/rejected": 1.7924308776855469, "step": 62640 }, { "epoch": 2.9086772830679233, "grad_norm": 146.38034057617188, "learning_rate": 1.5452589875729296e-08, "logits/chosen": -18.84819221496582, "logits/rejected": -19.29354476928711, "logps/chosen": -428.2403869628906, "logps/rejected": -396.7400207519531, "loss": 0.7802, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.088843822479248, "rewards/margins": 0.9454701542854309, "rewards/rejected": 3.143373966217041, "step": 62650 }, { "epoch": 2.9091415571753565, "grad_norm": 27.66463851928711, "learning_rate": 1.5375210857823794e-08, "logits/chosen": -18.81509017944336, "logits/rejected": -17.558420181274414, "logps/chosen": -387.06353759765625, "logps/rejected": -224.8051300048828, "loss": 0.6683, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.341859817504883, "rewards/margins": 1.9005264043807983, "rewards/rejected": 1.441333532333374, "step": 62660 }, { "epoch": 2.9096058312827893, "grad_norm": 0.33334994316101074, "learning_rate": 1.5297831839918286e-08, "logits/chosen": -19.31989860534668, "logits/rejected": -17.530126571655273, "logps/chosen": -496.59539794921875, "logps/rejected": -245.9513702392578, "loss": 0.1752, "rewards/accuracies": 1.0, "rewards/chosen": 5.241449356079102, "rewards/margins": 4.164783477783203, "rewards/rejected": 1.0766664743423462, "step": 62670 }, { "epoch": 2.9100701053902225, "grad_norm": 173.3858184814453, "learning_rate": 1.5220452822012784e-08, "logits/chosen": -19.52604866027832, "logits/rejected": -19.036785125732422, "logps/chosen": -413.2537536621094, 
"logps/rejected": -298.7513427734375, "loss": 0.7094, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.125894069671631, "rewards/margins": 0.9959150552749634, "rewards/rejected": 1.129978895187378, "step": 62680 }, { "epoch": 2.9105343794976553, "grad_norm": 180.21636962890625, "learning_rate": 1.514307380410728e-08, "logits/chosen": -18.96371078491211, "logits/rejected": -18.49209976196289, "logps/chosen": -365.1775207519531, "logps/rejected": -348.205810546875, "loss": 0.6202, "rewards/accuracies": 0.5, "rewards/chosen": 4.055589199066162, "rewards/margins": 0.9213545918464661, "rewards/rejected": 3.13423490524292, "step": 62690 }, { "epoch": 2.9109986536050885, "grad_norm": 51.60935592651367, "learning_rate": 1.5065694786201773e-08, "logits/chosen": -19.6904296875, "logits/rejected": -18.561786651611328, "logps/chosen": -433.3385314941406, "logps/rejected": -368.2992858886719, "loss": 0.9811, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.4001097679138184, "rewards/margins": 0.4659203588962555, "rewards/rejected": 2.934189558029175, "step": 62700 }, { "epoch": 2.9114629277125212, "grad_norm": 41.65443420410156, "learning_rate": 1.4988315768296268e-08, "logits/chosen": -19.499202728271484, "logits/rejected": -19.026138305664062, "logps/chosen": -436.6717834472656, "logps/rejected": -379.7742614746094, "loss": 0.2914, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.60487699508667, "rewards/margins": 1.8938652276992798, "rewards/rejected": 2.7110114097595215, "step": 62710 }, { "epoch": 2.9119272018199545, "grad_norm": 2.4478609561920166, "learning_rate": 1.4910936750390763e-08, "logits/chosen": -19.386363983154297, "logits/rejected": -18.78342056274414, "logps/chosen": -341.5317077636719, "logps/rejected": -239.07138061523438, "loss": 0.5857, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.3827271461486816, "rewards/margins": 1.8333141803741455, "rewards/rejected": 1.5494126081466675, "step": 62720 }, { 
"epoch": 2.9123914759273877, "grad_norm": 136.00540161132812, "learning_rate": 1.4833557732485259e-08, "logits/chosen": -19.28434181213379, "logits/rejected": -18.571733474731445, "logps/chosen": -402.95843505859375, "logps/rejected": -363.27508544921875, "loss": 0.4741, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.9509174823760986, "rewards/margins": 1.5546505451202393, "rewards/rejected": 2.3962669372558594, "step": 62730 }, { "epoch": 2.9128557500348204, "grad_norm": 52.01122283935547, "learning_rate": 1.4756178714579754e-08, "logits/chosen": -19.561004638671875, "logits/rejected": -18.220905303955078, "logps/chosen": -389.17266845703125, "logps/rejected": -361.03912353515625, "loss": 0.2922, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.29665994644165, "rewards/margins": 2.602259635925293, "rewards/rejected": 1.694400429725647, "step": 62740 }, { "epoch": 2.9133200241422537, "grad_norm": 1.2022701501846313, "learning_rate": 1.467879969667425e-08, "logits/chosen": -18.88775062561035, "logits/rejected": -19.760595321655273, "logps/chosen": -394.4169921875, "logps/rejected": -445.26275634765625, "loss": 0.8717, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.805804491043091, "rewards/margins": 0.34669095277786255, "rewards/rejected": 3.459113359451294, "step": 62750 }, { "epoch": 2.913784298249687, "grad_norm": 39.00605392456055, "learning_rate": 1.4601420678768743e-08, "logits/chosen": -20.11210823059082, "logits/rejected": -19.04058837890625, "logps/chosen": -348.750732421875, "logps/rejected": -303.6513671875, "loss": 0.9731, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.7426438331604004, "rewards/margins": 0.6574617624282837, "rewards/rejected": 2.0851824283599854, "step": 62760 }, { "epoch": 2.9142485723571196, "grad_norm": 82.32688903808594, "learning_rate": 1.452404166086324e-08, "logits/chosen": -18.79931640625, "logits/rejected": -17.957374572753906, "logps/chosen": -349.71490478515625, 
"logps/rejected": -284.9981384277344, "loss": 0.8144, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.7803330421447754, "rewards/margins": 1.232503890991211, "rewards/rejected": 1.5478293895721436, "step": 62770 }, { "epoch": 2.9147128464645524, "grad_norm": 91.52769470214844, "learning_rate": 1.4446662642957736e-08, "logits/chosen": -19.438871383666992, "logits/rejected": -18.4603214263916, "logps/chosen": -335.11883544921875, "logps/rejected": -236.1344757080078, "loss": 0.4833, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.5122132301330566, "rewards/margins": 1.665712594985962, "rewards/rejected": 0.8465008735656738, "step": 62780 }, { "epoch": 2.9151771205719856, "grad_norm": 135.3477020263672, "learning_rate": 1.4369283625052229e-08, "logits/chosen": -19.020488739013672, "logits/rejected": -18.517351150512695, "logps/chosen": -356.15826416015625, "logps/rejected": -283.7445373535156, "loss": 0.8277, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.8368935585021973, "rewards/margins": 0.9861310124397278, "rewards/rejected": 1.8507623672485352, "step": 62790 }, { "epoch": 2.915641394679419, "grad_norm": 0.2090195119380951, "learning_rate": 1.4291904607146726e-08, "logits/chosen": -20.421344757080078, "logits/rejected": -19.76797866821289, "logps/chosen": -474.0264587402344, "logps/rejected": -314.4911193847656, "loss": 0.282, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.291567802429199, "rewards/margins": 2.4653468132019043, "rewards/rejected": 2.826220989227295, "step": 62800 }, { "epoch": 2.9161056687868516, "grad_norm": 6.5890913009643555, "learning_rate": 1.4214525589241222e-08, "logits/chosen": -19.546321868896484, "logits/rejected": -18.32114028930664, "logps/chosen": -423.53460693359375, "logps/rejected": -385.44866943359375, "loss": 0.9589, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.819126605987549, "rewards/margins": 1.3555428981781006, "rewards/rejected": 3.4635841846466064, 
"step": 62810 }, { "epoch": 2.916569942894285, "grad_norm": 34.26865768432617, "learning_rate": 1.4137146571335715e-08, "logits/chosen": -19.979101181030273, "logits/rejected": -19.76276206970215, "logps/chosen": -419.9098205566406, "logps/rejected": -375.48736572265625, "loss": 0.8394, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.133004665374756, "rewards/margins": 0.3701843023300171, "rewards/rejected": 2.7628204822540283, "step": 62820 }, { "epoch": 2.917034217001718, "grad_norm": 39.509891510009766, "learning_rate": 1.4059767553430211e-08, "logits/chosen": -20.458805084228516, "logits/rejected": -19.415037155151367, "logps/chosen": -388.0038146972656, "logps/rejected": -274.07098388671875, "loss": 0.5791, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.003822326660156, "rewards/margins": 1.694598913192749, "rewards/rejected": 2.309222936630249, "step": 62830 }, { "epoch": 2.917498491109151, "grad_norm": 200.1149444580078, "learning_rate": 1.3982388535524706e-08, "logits/chosen": -19.26798439025879, "logits/rejected": -17.651447296142578, "logps/chosen": -396.5460510253906, "logps/rejected": -321.38177490234375, "loss": 0.604, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.391648292541504, "rewards/margins": 1.9117259979248047, "rewards/rejected": 2.47992205619812, "step": 62840 }, { "epoch": 2.9179627652165836, "grad_norm": 163.4866485595703, "learning_rate": 1.3905009517619201e-08, "logits/chosen": -19.310623168945312, "logits/rejected": -17.893217086791992, "logps/chosen": -462.79327392578125, "logps/rejected": -483.21270751953125, "loss": 0.6068, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.056542873382568, "rewards/margins": 0.7414524555206299, "rewards/rejected": 3.3150908946990967, "step": 62850 }, { "epoch": 2.918427039324017, "grad_norm": 44.13114929199219, "learning_rate": 1.3827630499713697e-08, "logits/chosen": -19.222484588623047, "logits/rejected": -18.793209075927734, "logps/chosen": 
-388.2583923339844, "logps/rejected": -290.6603088378906, "loss": 1.6505, "rewards/accuracies": 0.5, "rewards/chosen": 2.066542387008667, "rewards/margins": -0.05132431909441948, "rewards/rejected": 2.1178665161132812, "step": 62860 }, { "epoch": 2.91889131343145, "grad_norm": 0.7323942184448242, "learning_rate": 1.3750251481808192e-08, "logits/chosen": -19.0310115814209, "logits/rejected": -18.33598518371582, "logps/chosen": -464.023193359375, "logps/rejected": -404.82086181640625, "loss": 0.541, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.182535171508789, "rewards/margins": 1.5098586082458496, "rewards/rejected": 2.6726763248443604, "step": 62870 }, { "epoch": 2.919355587538883, "grad_norm": 141.2051239013672, "learning_rate": 1.3672872463902687e-08, "logits/chosen": -20.77778434753418, "logits/rejected": -19.444808959960938, "logps/chosen": -386.44476318359375, "logps/rejected": -264.021240234375, "loss": 0.4203, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.812952756881714, "rewards/margins": 1.0958654880523682, "rewards/rejected": 2.717087745666504, "step": 62880 }, { "epoch": 2.919819861646316, "grad_norm": 49.61457824707031, "learning_rate": 1.3595493445997183e-08, "logits/chosen": -18.62496566772461, "logits/rejected": -19.082935333251953, "logps/chosen": -269.1299133300781, "logps/rejected": -368.693359375, "loss": 1.166, "rewards/accuracies": 0.5, "rewards/chosen": 2.3071401119232178, "rewards/margins": -0.5046289563179016, "rewards/rejected": 2.8117692470550537, "step": 62890 }, { "epoch": 2.9202841357537492, "grad_norm": 85.9299087524414, "learning_rate": 1.3518114428091678e-08, "logits/chosen": -19.43354034423828, "logits/rejected": -20.34816551208496, "logps/chosen": -451.4637756347656, "logps/rejected": -477.80914306640625, "loss": 1.0942, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.742452621459961, "rewards/margins": 0.02347099781036377, "rewards/rejected": 3.7189815044403076, "step": 62900 }, { 
"epoch": 2.920748409861182, "grad_norm": 146.17984008789062, "learning_rate": 1.3440735410186173e-08, "logits/chosen": -19.61733055114746, "logits/rejected": -19.06870460510254, "logps/chosen": -369.1634216308594, "logps/rejected": -349.95501708984375, "loss": 0.8168, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.5550312995910645, "rewards/margins": 0.6590278744697571, "rewards/rejected": 2.896003246307373, "step": 62910 }, { "epoch": 2.921212683968615, "grad_norm": 60.093631744384766, "learning_rate": 1.336335639228067e-08, "logits/chosen": -18.91248321533203, "logits/rejected": -18.253765106201172, "logps/chosen": -358.45758056640625, "logps/rejected": -335.63995361328125, "loss": 1.2099, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.588608980178833, "rewards/margins": 0.08894586563110352, "rewards/rejected": 2.4996628761291504, "step": 62920 }, { "epoch": 2.921676958076048, "grad_norm": 128.21119689941406, "learning_rate": 1.3285977374375162e-08, "logits/chosen": -18.081090927124023, "logits/rejected": -18.183467864990234, "logps/chosen": -315.13421630859375, "logps/rejected": -311.7238464355469, "loss": 0.7471, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.943627953529358, "rewards/margins": 0.5421635508537292, "rewards/rejected": 1.4014642238616943, "step": 62930 }, { "epoch": 2.922141232183481, "grad_norm": 141.45703125, "learning_rate": 1.3208598356469659e-08, "logits/chosen": -19.250764846801758, "logits/rejected": -18.63730239868164, "logps/chosen": -367.6690368652344, "logps/rejected": -273.5338134765625, "loss": 0.6789, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.5273966789245605, "rewards/margins": 1.3988971710205078, "rewards/rejected": 1.1284993886947632, "step": 62940 }, { "epoch": 2.922605506290914, "grad_norm": 0.0232476107776165, "learning_rate": 1.3131219338564155e-08, "logits/chosen": -19.926340103149414, "logits/rejected": -18.681640625, "logps/chosen": -396.50909423828125, 
"logps/rejected": -282.760986328125, "loss": 0.6065, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4804344177246094, "rewards/margins": 2.0271201133728027, "rewards/rejected": 1.4533145427703857, "step": 62950 }, { "epoch": 2.923069780398347, "grad_norm": 282.553466796875, "learning_rate": 1.3053840320658648e-08, "logits/chosen": -18.957605361938477, "logits/rejected": -19.435134887695312, "logps/chosen": -393.1471252441406, "logps/rejected": -448.4610900878906, "loss": 1.3334, "rewards/accuracies": 0.5, "rewards/chosen": 2.748786211013794, "rewards/margins": -0.623124361038208, "rewards/rejected": 3.3719100952148438, "step": 62960 }, { "epoch": 2.9235340545057804, "grad_norm": 4.168710708618164, "learning_rate": 1.2976461302753145e-08, "logits/chosen": -19.274423599243164, "logits/rejected": -18.00263786315918, "logps/chosen": -371.66119384765625, "logps/rejected": -218.9080352783203, "loss": 0.3429, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.1661112308502197, "rewards/margins": 1.914419174194336, "rewards/rejected": 0.2516923248767853, "step": 62970 }, { "epoch": 2.923998328613213, "grad_norm": 80.5140380859375, "learning_rate": 1.2899082284847641e-08, "logits/chosen": -19.14917755126953, "logits/rejected": -18.668378829956055, "logps/chosen": -372.5247497558594, "logps/rejected": -305.46392822265625, "loss": 0.8121, "rewards/accuracies": 0.5, "rewards/chosen": 2.594163417816162, "rewards/margins": 0.7420774698257446, "rewards/rejected": 1.852085828781128, "step": 62980 }, { "epoch": 2.9244626027206464, "grad_norm": 49.08409881591797, "learning_rate": 1.2821703266942134e-08, "logits/chosen": -19.868947982788086, "logits/rejected": -19.982852935791016, "logps/chosen": -351.82635498046875, "logps/rejected": -370.7434997558594, "loss": 1.0539, "rewards/accuracies": 0.5, "rewards/chosen": 3.9075515270233154, "rewards/margins": 0.012343907728791237, "rewards/rejected": 3.895207643508911, "step": 62990 }, { "epoch": 2.924926876828079, 
"grad_norm": 15.080891609191895, "learning_rate": 1.274432424903663e-08, "logits/chosen": -19.557819366455078, "logits/rejected": -18.948244094848633, "logps/chosen": -390.29010009765625, "logps/rejected": -339.6202697753906, "loss": 0.9307, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.943220615386963, "rewards/margins": 1.7609164714813232, "rewards/rejected": 3.1823041439056396, "step": 63000 }, { "epoch": 2.9253911509355124, "grad_norm": 0.03947184607386589, "learning_rate": 1.2666945231131127e-08, "logits/chosen": -18.464176177978516, "logits/rejected": -17.744020462036133, "logps/chosen": -382.44696044921875, "logps/rejected": -315.7063903808594, "loss": 0.5854, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.343935966491699, "rewards/margins": 2.0199849605560303, "rewards/rejected": 2.323951244354248, "step": 63010 }, { "epoch": 2.925855425042945, "grad_norm": 9.809271812438965, "learning_rate": 1.258956621322562e-08, "logits/chosen": -18.342870712280273, "logits/rejected": -17.682104110717773, "logps/chosen": -406.02154541015625, "logps/rejected": -383.36163330078125, "loss": 0.9293, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.9812350273132324, "rewards/margins": 0.9402132034301758, "rewards/rejected": 2.0410220623016357, "step": 63020 }, { "epoch": 2.9263196991503784, "grad_norm": 166.28097534179688, "learning_rate": 1.2512187195320116e-08, "logits/chosen": -18.360210418701172, "logits/rejected": -17.39967918395996, "logps/chosen": -431.72509765625, "logps/rejected": -278.3192138671875, "loss": 0.7667, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.640211820602417, "rewards/margins": 0.8792134523391724, "rewards/rejected": 2.760998487472534, "step": 63030 }, { "epoch": 2.9267839732578116, "grad_norm": 0.7101399302482605, "learning_rate": 1.2434808177414613e-08, "logits/chosen": -18.975605010986328, "logits/rejected": -17.7200870513916, "logps/chosen": -338.7779846191406, "logps/rejected": 
-306.0378723144531, "loss": 0.5969, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.5797383785247803, "rewards/margins": 1.5538946390151978, "rewards/rejected": 1.0258437395095825, "step": 63040 }, { "epoch": 2.9272482473652444, "grad_norm": 8.1329984664917, "learning_rate": 1.2357429159509106e-08, "logits/chosen": -19.461084365844727, "logits/rejected": -19.07440757751465, "logps/chosen": -499.11505126953125, "logps/rejected": -401.1037292480469, "loss": 0.4537, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.928254127502441, "rewards/margins": 1.2036349773406982, "rewards/rejected": 3.724618434906006, "step": 63050 }, { "epoch": 2.9277125214726776, "grad_norm": 85.06858825683594, "learning_rate": 1.2280050141603602e-08, "logits/chosen": -18.501142501831055, "logits/rejected": -18.021484375, "logps/chosen": -469.4480895996094, "logps/rejected": -388.7203369140625, "loss": 0.8296, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.5329673290252686, "rewards/margins": 0.9284394979476929, "rewards/rejected": 2.6045279502868652, "step": 63060 }, { "epoch": 2.9281767955801103, "grad_norm": 76.67442321777344, "learning_rate": 1.2202671123698099e-08, "logits/chosen": -18.861827850341797, "logits/rejected": -18.16364097595215, "logps/chosen": -417.6622009277344, "logps/rejected": -385.53289794921875, "loss": 0.6891, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.918639659881592, "rewards/margins": 1.8499733209609985, "rewards/rejected": 2.0686659812927246, "step": 63070 }, { "epoch": 2.9286410696875436, "grad_norm": 92.99991607666016, "learning_rate": 1.2125292105792592e-08, "logits/chosen": -18.839218139648438, "logits/rejected": -18.711551666259766, "logps/chosen": -260.67022705078125, "logps/rejected": -252.4824676513672, "loss": 0.9519, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.622123122215271, "rewards/margins": 0.20415715873241425, "rewards/rejected": 1.4179656505584717, "step": 63080 }, { 
"epoch": 2.9291053437949763, "grad_norm": 18.989648818969727, "learning_rate": 1.2047913087887088e-08, "logits/chosen": -18.692354202270508, "logits/rejected": -18.67922592163086, "logps/chosen": -467.7096252441406, "logps/rejected": -362.60400390625, "loss": 0.7683, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.7086894512176514, "rewards/margins": 1.0325804948806763, "rewards/rejected": 2.6761093139648438, "step": 63090 }, { "epoch": 2.9295696179024096, "grad_norm": 191.47056579589844, "learning_rate": 1.1970534069981585e-08, "logits/chosen": -18.908153533935547, "logits/rejected": -19.03645133972168, "logps/chosen": -323.01873779296875, "logps/rejected": -383.910400390625, "loss": 1.5475, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.1938376426696777, "rewards/margins": -0.5669049024581909, "rewards/rejected": 3.760742664337158, "step": 63100 }, { "epoch": 2.9300338920098428, "grad_norm": 4.58560848236084, "learning_rate": 1.1893155052076078e-08, "logits/chosen": -19.092615127563477, "logits/rejected": -18.833377838134766, "logps/chosen": -399.39666748046875, "logps/rejected": -422.57794189453125, "loss": 1.1946, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.9771695137023926, "rewards/margins": 0.30399951338768005, "rewards/rejected": 3.673170566558838, "step": 63110 }, { "epoch": 2.9304981661172755, "grad_norm": 32.77881622314453, "learning_rate": 1.1815776034170574e-08, "logits/chosen": -19.10677719116211, "logits/rejected": -18.745014190673828, "logps/chosen": -381.4226989746094, "logps/rejected": -345.52020263671875, "loss": 0.6871, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.043301582336426, "rewards/margins": 0.6207238435745239, "rewards/rejected": 3.4225780963897705, "step": 63120 }, { "epoch": 2.9309624402247088, "grad_norm": 0.8809614777565002, "learning_rate": 1.173839701626507e-08, "logits/chosen": -19.39345932006836, "logits/rejected": -17.823780059814453, "logps/chosen": 
-320.26702880859375, "logps/rejected": -201.9505615234375, "loss": 0.3421, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.4981656074523926, "rewards/margins": 2.3719756603240967, "rewards/rejected": 0.1261899173259735, "step": 63130 }, { "epoch": 2.931426714332142, "grad_norm": 38.07555389404297, "learning_rate": 1.1661017998359564e-08, "logits/chosen": -19.868907928466797, "logits/rejected": -18.368724822998047, "logps/chosen": -524.39208984375, "logps/rejected": -428.3668518066406, "loss": 0.246, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.181858062744141, "rewards/margins": 2.8141274452209473, "rewards/rejected": 2.3677308559417725, "step": 63140 }, { "epoch": 2.9318909884395747, "grad_norm": 276.501708984375, "learning_rate": 1.158363898045406e-08, "logits/chosen": -20.37261962890625, "logits/rejected": -20.134859085083008, "logps/chosen": -387.6092529296875, "logps/rejected": -422.871337890625, "loss": 0.8613, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.5562853813171387, "rewards/margins": 0.4381233751773834, "rewards/rejected": 3.118161916732788, "step": 63150 }, { "epoch": 2.9323552625470075, "grad_norm": 13.385658264160156, "learning_rate": 1.1506259962548555e-08, "logits/chosen": -18.63135528564453, "logits/rejected": -17.892162322998047, "logps/chosen": -405.68768310546875, "logps/rejected": -350.04620361328125, "loss": 1.0262, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.9768900871276855, "rewards/margins": 0.5279723405838013, "rewards/rejected": 2.448917865753174, "step": 63160 }, { "epoch": 2.9328195366544407, "grad_norm": 76.21797180175781, "learning_rate": 1.142888094464305e-08, "logits/chosen": -19.379825592041016, "logits/rejected": -18.593608856201172, "logps/chosen": -417.78448486328125, "logps/rejected": -385.04315185546875, "loss": 0.3476, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.781824588775635, "rewards/margins": 2.237159252166748, "rewards/rejected": 
2.5446653366088867, "step": 63170 }, { "epoch": 2.933283810761874, "grad_norm": 125.00099182128906, "learning_rate": 1.1351501926737546e-08, "logits/chosen": -18.790695190429688, "logits/rejected": -18.626522064208984, "logps/chosen": -498.50177001953125, "logps/rejected": -460.96875, "loss": 0.8381, "rewards/accuracies": 0.5, "rewards/chosen": 4.016531944274902, "rewards/margins": 0.3657844662666321, "rewards/rejected": 3.650747299194336, "step": 63180 }, { "epoch": 2.9337480848693067, "grad_norm": 6.9237823486328125, "learning_rate": 1.127412290883204e-08, "logits/chosen": -19.258241653442383, "logits/rejected": -18.30856704711914, "logps/chosen": -407.8460388183594, "logps/rejected": -315.1536560058594, "loss": 0.7969, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.660008192062378, "rewards/margins": 1.059550166130066, "rewards/rejected": 1.600457787513733, "step": 63190 }, { "epoch": 2.93421235897674, "grad_norm": 43.16706085205078, "learning_rate": 1.1196743890926535e-08, "logits/chosen": -19.697572708129883, "logits/rejected": -18.968292236328125, "logps/chosen": -482.32342529296875, "logps/rejected": -400.3322448730469, "loss": 1.048, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.336520671844482, "rewards/margins": 1.1673370599746704, "rewards/rejected": 3.1691837310791016, "step": 63200 }, { "epoch": 2.934676633084173, "grad_norm": 8.263763427734375, "learning_rate": 1.1119364873021032e-08, "logits/chosen": -19.112712860107422, "logits/rejected": -18.49486541748047, "logps/chosen": -419.4698181152344, "logps/rejected": -346.9933776855469, "loss": 0.4626, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.0296642780303955, "rewards/margins": 1.4176700115203857, "rewards/rejected": 1.6119941473007202, "step": 63210 }, { "epoch": 2.935140907191606, "grad_norm": 61.4590950012207, "learning_rate": 1.1041985855115527e-08, "logits/chosen": -18.692649841308594, "logits/rejected": -17.72109603881836, "logps/chosen": 
-455.52398681640625, "logps/rejected": -321.66595458984375, "loss": 0.4894, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.8558292388916016, "rewards/margins": 1.4458582401275635, "rewards/rejected": 2.409971237182617, "step": 63220 }, { "epoch": 2.9356051812990387, "grad_norm": 157.3949737548828, "learning_rate": 1.0964606837210021e-08, "logits/chosen": -19.479272842407227, "logits/rejected": -18.923954010009766, "logps/chosen": -505.6979064941406, "logps/rejected": -492.84539794921875, "loss": 1.0027, "rewards/accuracies": 0.5, "rewards/chosen": 5.332608222961426, "rewards/margins": 1.0606712102890015, "rewards/rejected": 4.271937370300293, "step": 63230 }, { "epoch": 2.936069455406472, "grad_norm": 0.5614960789680481, "learning_rate": 1.0887227819304518e-08, "logits/chosen": -18.586883544921875, "logits/rejected": -18.35769271850586, "logps/chosen": -406.8082580566406, "logps/rejected": -329.1111145019531, "loss": 0.7489, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4713122844696045, "rewards/margins": 2.2410991191864014, "rewards/rejected": 1.230212926864624, "step": 63240 }, { "epoch": 2.936533729513905, "grad_norm": 20.672075271606445, "learning_rate": 1.0809848801399013e-08, "logits/chosen": -18.144371032714844, "logits/rejected": -17.42608642578125, "logps/chosen": -419.9505310058594, "logps/rejected": -336.82794189453125, "loss": 0.7312, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.8441267013549805, "rewards/margins": 1.175667643547058, "rewards/rejected": 1.6684592962265015, "step": 63250 }, { "epoch": 2.936998003621338, "grad_norm": 160.15133666992188, "learning_rate": 1.0732469783493507e-08, "logits/chosen": -18.892780303955078, "logits/rejected": -18.147640228271484, "logps/chosen": -351.57177734375, "logps/rejected": -291.3020935058594, "loss": 1.1046, "rewards/accuracies": 0.5, "rewards/chosen": 3.4558982849121094, "rewards/margins": 0.9760038256645203, "rewards/rejected": 2.4798941612243652, "step": 
63260 }, { "epoch": 2.937462277728771, "grad_norm": 32.5698356628418, "learning_rate": 1.0655090765588004e-08, "logits/chosen": -19.12320899963379, "logits/rejected": -19.02440071105957, "logps/chosen": -332.16448974609375, "logps/rejected": -341.3023986816406, "loss": 1.078, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.593189239501953, "rewards/margins": 0.6144152879714966, "rewards/rejected": 1.978773832321167, "step": 63270 }, { "epoch": 2.9379265518362043, "grad_norm": 235.39431762695312, "learning_rate": 1.0577711747682497e-08, "logits/chosen": -18.683063507080078, "logits/rejected": -18.245155334472656, "logps/chosen": -406.98065185546875, "logps/rejected": -328.1530456542969, "loss": 0.5334, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4303994178771973, "rewards/margins": 1.3518311977386475, "rewards/rejected": 2.0785679817199707, "step": 63280 }, { "epoch": 2.938390825943637, "grad_norm": 270.0656433105469, "learning_rate": 1.0500332729776993e-08, "logits/chosen": -19.861215591430664, "logits/rejected": -18.673900604248047, "logps/chosen": -452.07550048828125, "logps/rejected": -312.65875244140625, "loss": 1.0844, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.4368696212768555, "rewards/margins": 1.6763862371444702, "rewards/rejected": 2.7604832649230957, "step": 63290 }, { "epoch": 2.9388551000510703, "grad_norm": 0.45698055624961853, "learning_rate": 1.042295371187149e-08, "logits/chosen": -19.08003807067871, "logits/rejected": -18.23033332824707, "logps/chosen": -365.92431640625, "logps/rejected": -331.3402099609375, "loss": 0.834, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.5970215797424316, "rewards/margins": 1.257380485534668, "rewards/rejected": 2.3396408557891846, "step": 63300 }, { "epoch": 2.939319374158503, "grad_norm": 11.4324951171875, "learning_rate": 1.0345574693965983e-08, "logits/chosen": -18.820531845092773, "logits/rejected": -18.368267059326172, "logps/chosen": 
-305.4960632324219, "logps/rejected": -261.8818054199219, "loss": 0.4832, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.5064854621887207, "rewards/margins": 1.2309248447418213, "rewards/rejected": 1.275560736656189, "step": 63310 }, { "epoch": 2.9397836482659363, "grad_norm": 38.86336898803711, "learning_rate": 1.0268195676060479e-08, "logits/chosen": -20.09585952758789, "logits/rejected": -19.31838607788086, "logps/chosen": -336.72210693359375, "logps/rejected": -269.29071044921875, "loss": 0.3038, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.4976043701171875, "rewards/margins": 1.9639403820037842, "rewards/rejected": 1.5336639881134033, "step": 63320 }, { "epoch": 2.940247922373369, "grad_norm": 43.3725700378418, "learning_rate": 1.0190816658154974e-08, "logits/chosen": -17.93142318725586, "logits/rejected": -17.7575626373291, "logps/chosen": -319.539794921875, "logps/rejected": -247.18032836914062, "loss": 1.0411, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.8881733417510986, "rewards/margins": 0.6076579093933105, "rewards/rejected": 1.2805149555206299, "step": 63330 }, { "epoch": 2.9407121964808023, "grad_norm": 1.6935573816299438, "learning_rate": 1.0113437640249469e-08, "logits/chosen": -18.639102935791016, "logits/rejected": -17.814105987548828, "logps/chosen": -345.4657287597656, "logps/rejected": -220.5426788330078, "loss": 0.8621, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.981570839881897, "rewards/margins": 1.4181758165359497, "rewards/rejected": 0.5633949041366577, "step": 63340 }, { "epoch": 2.9411764705882355, "grad_norm": 19.57352638244629, "learning_rate": 1.0036058622343965e-08, "logits/chosen": -18.6789608001709, "logits/rejected": -17.348003387451172, "logps/chosen": -461.84307861328125, "logps/rejected": -321.37030029296875, "loss": 0.7946, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.739809513092041, "rewards/margins": 2.1809723377227783, "rewards/rejected": 
1.5588362216949463, "step": 63350 }, { "epoch": 2.9416407446956683, "grad_norm": 10.427389144897461, "learning_rate": 9.95867960443846e-09, "logits/chosen": -20.13474464416504, "logits/rejected": -18.922222137451172, "logps/chosen": -413.2447814941406, "logps/rejected": -311.70294189453125, "loss": 0.4556, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.250246047973633, "rewards/margins": 1.7207542657852173, "rewards/rejected": 2.529491901397705, "step": 63360 }, { "epoch": 2.9421050188031015, "grad_norm": 152.38722229003906, "learning_rate": 9.881300586532954e-09, "logits/chosen": -18.461233139038086, "logits/rejected": -19.121355056762695, "logps/chosen": -363.1803894042969, "logps/rejected": -406.94000244140625, "loss": 1.35, "rewards/accuracies": 0.5, "rewards/chosen": 3.149876832962036, "rewards/margins": -0.4624730944633484, "rewards/rejected": 3.612349271774292, "step": 63370 }, { "epoch": 2.9425692929105343, "grad_norm": 184.50941467285156, "learning_rate": 9.803921568627451e-09, "logits/chosen": -18.20082664489746, "logits/rejected": -17.951324462890625, "logps/chosen": -319.74371337890625, "logps/rejected": -279.7455749511719, "loss": 1.0792, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.5130343437194824, "rewards/margins": 0.19928660988807678, "rewards/rejected": 2.3137478828430176, "step": 63380 }, { "epoch": 2.9430335670179675, "grad_norm": 8.302878379821777, "learning_rate": 9.726542550721946e-09, "logits/chosen": -19.067256927490234, "logits/rejected": -18.149383544921875, "logps/chosen": -388.7002868652344, "logps/rejected": -273.5213317871094, "loss": 0.3287, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.8879427909851074, "rewards/margins": 1.5578439235687256, "rewards/rejected": 1.3300989866256714, "step": 63390 }, { "epoch": 2.9434978411254002, "grad_norm": 62.58248519897461, "learning_rate": 9.64916353281644e-09, "logits/chosen": -18.760801315307617, "logits/rejected": -18.50014305114746, 
"logps/chosen": -356.60406494140625, "logps/rejected": -300.4912414550781, "loss": 0.7255, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.32031512260437, "rewards/margins": 1.1760923862457275, "rewards/rejected": 2.1442227363586426, "step": 63400 }, { "epoch": 2.9439621152328335, "grad_norm": 3.8404715061187744, "learning_rate": 9.571784514910937e-09, "logits/chosen": -19.46702003479004, "logits/rejected": -17.85446548461914, "logps/chosen": -411.9203186035156, "logps/rejected": -271.03662109375, "loss": 0.4981, "rewards/accuracies": 0.5, "rewards/chosen": 2.972280263900757, "rewards/margins": 1.800215721130371, "rewards/rejected": 1.1720649003982544, "step": 63410 }, { "epoch": 2.9444263893402667, "grad_norm": 34.46311569213867, "learning_rate": 9.494405497005432e-09, "logits/chosen": -19.24308967590332, "logits/rejected": -18.422855377197266, "logps/chosen": -399.5541076660156, "logps/rejected": -299.99200439453125, "loss": 0.523, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.5823841094970703, "rewards/margins": 1.2639398574829102, "rewards/rejected": 1.3184444904327393, "step": 63420 }, { "epoch": 2.9448906634476995, "grad_norm": 26.391368865966797, "learning_rate": 9.417026479099926e-09, "logits/chosen": -19.340255737304688, "logits/rejected": -18.68606948852539, "logps/chosen": -344.5391540527344, "logps/rejected": -300.8072204589844, "loss": 0.4589, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.133695125579834, "rewards/margins": 1.6176894903182983, "rewards/rejected": 1.5160057544708252, "step": 63430 }, { "epoch": 2.9453549375551327, "grad_norm": 116.92404174804688, "learning_rate": 9.339647461194423e-09, "logits/chosen": -20.084362030029297, "logits/rejected": -18.77627944946289, "logps/chosen": -422.118408203125, "logps/rejected": -233.64389038085938, "loss": 0.2917, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.539298057556152, "rewards/margins": 2.794764280319214, "rewards/rejected": 
1.7445333003997803, "step": 63440 }, { "epoch": 2.9458192116625654, "grad_norm": 112.33289337158203, "learning_rate": 9.262268443288917e-09, "logits/chosen": -20.037227630615234, "logits/rejected": -19.947566986083984, "logps/chosen": -354.6704406738281, "logps/rejected": -353.0900573730469, "loss": 0.8983, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.985321044921875, "rewards/margins": 0.24144014716148376, "rewards/rejected": 2.7438807487487793, "step": 63450 }, { "epoch": 2.9462834857699987, "grad_norm": 0.3942772150039673, "learning_rate": 9.184889425383412e-09, "logits/chosen": -19.332311630249023, "logits/rejected": -17.922922134399414, "logps/chosen": -376.8211669921875, "logps/rejected": -295.7621154785156, "loss": 0.3492, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.2652459144592285, "rewards/margins": 2.196244955062866, "rewards/rejected": 2.069000720977783, "step": 63460 }, { "epoch": 2.9467477598774314, "grad_norm": 89.50272369384766, "learning_rate": 9.107510407477909e-09, "logits/chosen": -19.186967849731445, "logits/rejected": -18.595470428466797, "logps/chosen": -398.5306396484375, "logps/rejected": -375.85028076171875, "loss": 0.7825, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.5078773498535156, "rewards/margins": 1.0897166728973389, "rewards/rejected": 2.4181606769561768, "step": 63470 }, { "epoch": 2.9472120339848646, "grad_norm": 19.720130920410156, "learning_rate": 9.030131389572403e-09, "logits/chosen": -18.553848266601562, "logits/rejected": -18.079296112060547, "logps/chosen": -430.9627990722656, "logps/rejected": -392.33697509765625, "loss": 1.4662, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.6621155738830566, "rewards/margins": 0.06122231483459473, "rewards/rejected": 2.600893497467041, "step": 63480 }, { "epoch": 2.947676308092298, "grad_norm": 38.876808166503906, "learning_rate": 8.952752371666898e-09, "logits/chosen": -19.274368286132812, "logits/rejected": 
-17.521589279174805, "logps/chosen": -478.19122314453125, "logps/rejected": -321.85760498046875, "loss": 0.3441, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.629337310791016, "rewards/margins": 2.4054646492004395, "rewards/rejected": 2.2238731384277344, "step": 63490 }, { "epoch": 2.9481405821997306, "grad_norm": 146.09942626953125, "learning_rate": 8.875373353761393e-09, "logits/chosen": -18.802265167236328, "logits/rejected": -18.186792373657227, "logps/chosen": -387.4720153808594, "logps/rejected": -350.63812255859375, "loss": 0.395, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.418729305267334, "rewards/margins": 1.0270493030548096, "rewards/rejected": 1.391680121421814, "step": 63500 }, { "epoch": 2.948604856307164, "grad_norm": 117.15955352783203, "learning_rate": 8.79799433585589e-09, "logits/chosen": -19.29549217224121, "logits/rejected": -17.867237091064453, "logps/chosen": -432.5777282714844, "logps/rejected": -334.92681884765625, "loss": 0.3838, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.974084854125977, "rewards/margins": 2.3547306060791016, "rewards/rejected": 2.619354486465454, "step": 63510 }, { "epoch": 2.9490691304145966, "grad_norm": 88.57947540283203, "learning_rate": 8.720615317950384e-09, "logits/chosen": -18.87405014038086, "logits/rejected": -18.96420669555664, "logps/chosen": -464.7275390625, "logps/rejected": -438.697021484375, "loss": 1.5751, "rewards/accuracies": 0.5, "rewards/chosen": 2.9478378295898438, "rewards/margins": -0.4053087830543518, "rewards/rejected": 3.3531463146209717, "step": 63520 }, { "epoch": 2.94953340452203, "grad_norm": 36.567806243896484, "learning_rate": 8.643236300044879e-09, "logits/chosen": -19.299327850341797, "logits/rejected": -18.48658561706543, "logps/chosen": -408.9994812011719, "logps/rejected": -265.6467590332031, "loss": 0.4637, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.9722752571105957, "rewards/margins": 1.9971323013305664, 
"rewards/rejected": 1.9751427173614502, "step": 63530 }, { "epoch": 2.9499976786294626, "grad_norm": 53.36912536621094, "learning_rate": 8.565857282139375e-09, "logits/chosen": -19.13668441772461, "logits/rejected": -19.209226608276367, "logps/chosen": -419.3035583496094, "logps/rejected": -433.6531677246094, "loss": 0.6511, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.9022412300109863, "rewards/margins": 0.7456243634223938, "rewards/rejected": 3.156616687774658, "step": 63540 }, { "epoch": 2.950461952736896, "grad_norm": 94.32598114013672, "learning_rate": 8.48847826423387e-09, "logits/chosen": -19.62711524963379, "logits/rejected": -18.495555877685547, "logps/chosen": -479.14154052734375, "logps/rejected": -283.70672607421875, "loss": 0.3216, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.7394332885742188, "rewards/margins": 1.678739309310913, "rewards/rejected": 2.0606939792633057, "step": 63550 }, { "epoch": 2.950926226844329, "grad_norm": 129.90423583984375, "learning_rate": 8.411099246328365e-09, "logits/chosen": -18.583972930908203, "logits/rejected": -18.773347854614258, "logps/chosen": -403.3540954589844, "logps/rejected": -414.9962463378906, "loss": 0.9575, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.6615047454833984, "rewards/margins": 0.5222578048706055, "rewards/rejected": 3.139246940612793, "step": 63560 }, { "epoch": 2.951390500951762, "grad_norm": 17.666297912597656, "learning_rate": 8.333720228422861e-09, "logits/chosen": -19.618459701538086, "logits/rejected": -18.976621627807617, "logps/chosen": -418.4278259277344, "logps/rejected": -336.16278076171875, "loss": 0.411, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.076007604598999, "rewards/margins": 1.9026172161102295, "rewards/rejected": 1.1733906269073486, "step": 63570 }, { "epoch": 2.951854775059195, "grad_norm": 229.45693969726562, "learning_rate": 8.256341210517356e-09, "logits/chosen": -18.968603134155273, "logits/rejected": 
-19.00949478149414, "logps/chosen": -442.1372985839844, "logps/rejected": -402.3027648925781, "loss": 0.7767, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.7232041358947754, "rewards/margins": 1.0763757228851318, "rewards/rejected": 2.6468284130096436, "step": 63580 }, { "epoch": 2.9523190491666282, "grad_norm": 8.435269355773926, "learning_rate": 8.17896219261185e-09, "logits/chosen": -19.264368057250977, "logits/rejected": -18.490013122558594, "logps/chosen": -381.3330383300781, "logps/rejected": -365.0892028808594, "loss": 0.5895, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.9148688316345215, "rewards/margins": 1.5127077102661133, "rewards/rejected": 2.402161121368408, "step": 63590 }, { "epoch": 2.952783323274061, "grad_norm": 173.5928955078125, "learning_rate": 8.101583174706347e-09, "logits/chosen": -19.1892147064209, "logits/rejected": -17.79949378967285, "logps/chosen": -527.0875854492188, "logps/rejected": -359.1689147949219, "loss": 1.0308, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.027287483215332, "rewards/margins": 1.5102750062942505, "rewards/rejected": 2.51701283454895, "step": 63600 }, { "epoch": 2.953247597381494, "grad_norm": 19.15772819519043, "learning_rate": 8.024204156800842e-09, "logits/chosen": -20.042760848999023, "logits/rejected": -19.47269058227539, "logps/chosen": -471.3304748535156, "logps/rejected": -402.03594970703125, "loss": 0.6611, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.356480598449707, "rewards/margins": 1.5545551776885986, "rewards/rejected": 2.8019251823425293, "step": 63610 }, { "epoch": 2.953711871488927, "grad_norm": 44.352813720703125, "learning_rate": 7.946825138895336e-09, "logits/chosen": -19.203351974487305, "logits/rejected": -18.283174514770508, "logps/chosen": -375.0257873535156, "logps/rejected": -376.7403564453125, "loss": 0.7232, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.564298629760742, "rewards/margins": 
0.7756611108779907, "rewards/rejected": 2.788637161254883, "step": 63620 }, { "epoch": 2.95417614559636, "grad_norm": 44.43687057495117, "learning_rate": 7.869446120989833e-09, "logits/chosen": -19.042720794677734, "logits/rejected": -19.71090316772461, "logps/chosen": -315.1040344238281, "logps/rejected": -439.59161376953125, "loss": 1.6195, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 3.2612671852111816, "rewards/margins": -0.8275675773620605, "rewards/rejected": 4.0888352394104, "step": 63630 }, { "epoch": 2.954640419703793, "grad_norm": 50.83656311035156, "learning_rate": 7.792067103084328e-09, "logits/chosen": -18.556598663330078, "logits/rejected": -17.714513778686523, "logps/chosen": -372.36279296875, "logps/rejected": -297.8599548339844, "loss": 0.394, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.071818828582764, "rewards/margins": 2.728187084197998, "rewards/rejected": 1.3436315059661865, "step": 63640 }, { "epoch": 2.955104693811226, "grad_norm": 39.25721740722656, "learning_rate": 7.714688085178822e-09, "logits/chosen": -18.769609451293945, "logits/rejected": -19.163494110107422, "logps/chosen": -331.0924987792969, "logps/rejected": -319.47283935546875, "loss": 0.9731, "rewards/accuracies": 0.5, "rewards/chosen": 2.629626750946045, "rewards/margins": -0.08780093491077423, "rewards/rejected": 2.7174274921417236, "step": 63650 }, { "epoch": 2.9555689679186594, "grad_norm": 101.27825927734375, "learning_rate": 7.637309067273317e-09, "logits/chosen": -19.370166778564453, "logits/rejected": -18.247446060180664, "logps/chosen": -498.32342529296875, "logps/rejected": -442.9103088378906, "loss": 0.7748, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.10097074508667, "rewards/margins": 1.407588005065918, "rewards/rejected": 2.693382501602173, "step": 63660 }, { "epoch": 2.956033242026092, "grad_norm": 116.28361511230469, "learning_rate": 7.559930049367814e-09, "logits/chosen": -18.953935623168945, "logits/rejected": 
-17.16024398803711, "logps/chosen": -298.14373779296875, "logps/rejected": -221.9779815673828, "loss": 1.0876, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.5429766178131104, "rewards/margins": 2.150608539581299, "rewards/rejected": 0.3923679292201996, "step": 63670 }, { "epoch": 2.956497516133525, "grad_norm": 115.1240463256836, "learning_rate": 7.482551031462308e-09, "logits/chosen": -20.162982940673828, "logits/rejected": -18.917566299438477, "logps/chosen": -526.87158203125, "logps/rejected": -439.345947265625, "loss": 0.5154, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 5.557232856750488, "rewards/margins": 1.6763975620269775, "rewards/rejected": 3.880835771560669, "step": 63680 }, { "epoch": 2.956961790240958, "grad_norm": 200.7189178466797, "learning_rate": 7.405172013556804e-09, "logits/chosen": -18.607769012451172, "logits/rejected": -18.198272705078125, "logps/chosen": -349.81976318359375, "logps/rejected": -307.0232849121094, "loss": 0.8492, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.6982131004333496, "rewards/margins": 1.007981538772583, "rewards/rejected": 2.6902318000793457, "step": 63690 }, { "epoch": 2.9574260643483914, "grad_norm": 77.03125762939453, "learning_rate": 7.327792995651299e-09, "logits/chosen": -18.946224212646484, "logits/rejected": -18.528911590576172, "logps/chosen": -379.47882080078125, "logps/rejected": -429.9541931152344, "loss": 1.3638, "rewards/accuracies": 0.5, "rewards/chosen": 3.847553253173828, "rewards/margins": 0.04406120628118515, "rewards/rejected": 3.8034920692443848, "step": 63700 }, { "epoch": 2.957890338455824, "grad_norm": 24.092130661010742, "learning_rate": 7.250413977745794e-09, "logits/chosen": -19.30542755126953, "logits/rejected": -17.665109634399414, "logps/chosen": -445.74615478515625, "logps/rejected": -249.2321319580078, "loss": 0.2942, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.813684463500977, "rewards/margins": 4.171639442443848, 
"rewards/rejected": 1.642045259475708, "step": 63710 }, { "epoch": 2.9583546125632574, "grad_norm": 45.56019973754883, "learning_rate": 7.17303495984029e-09, "logits/chosen": -19.066509246826172, "logits/rejected": -17.948331832885742, "logps/chosen": -327.693603515625, "logps/rejected": -224.58377075195312, "loss": 0.443, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.169151782989502, "rewards/margins": 2.0477538108825684, "rewards/rejected": 1.1213973760604858, "step": 63720 }, { "epoch": 2.9588188866706906, "grad_norm": 1.174931526184082, "learning_rate": 7.0956559419347845e-09, "logits/chosen": -19.389493942260742, "logits/rejected": -18.604015350341797, "logps/chosen": -407.25531005859375, "logps/rejected": -326.709228515625, "loss": 0.2464, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 5.107586860656738, "rewards/margins": 3.1137335300445557, "rewards/rejected": 1.9938533306121826, "step": 63730 }, { "epoch": 2.9592831607781234, "grad_norm": 67.6560287475586, "learning_rate": 7.01827692402928e-09, "logits/chosen": -19.16959571838379, "logits/rejected": -18.685100555419922, "logps/chosen": -316.84515380859375, "logps/rejected": -210.6899871826172, "loss": 0.8869, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.102489948272705, "rewards/margins": 1.133746862411499, "rewards/rejected": 0.968742847442627, "step": 63740 }, { "epoch": 2.9597474348855566, "grad_norm": 274.14617919921875, "learning_rate": 6.940897906123775e-09, "logits/chosen": -18.961345672607422, "logits/rejected": -18.95792007446289, "logps/chosen": -455.3094787597656, "logps/rejected": -450.94354248046875, "loss": 1.2726, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.681582450866699, "rewards/margins": 1.093282699584961, "rewards/rejected": 3.588299512863159, "step": 63750 }, { "epoch": 2.9602117089929894, "grad_norm": 32.9719352722168, "learning_rate": 6.8635188882182704e-09, "logits/chosen": -18.841535568237305, "logits/rejected": 
-19.29227066040039, "logps/chosen": -371.24566650390625, "logps/rejected": -364.55072021484375, "loss": 1.5583, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.2995705604553223, "rewards/margins": -0.22940507531166077, "rewards/rejected": 2.5289759635925293, "step": 63760 }, { "epoch": 2.9606759831004226, "grad_norm": 135.63204956054688, "learning_rate": 6.786139870312766e-09, "logits/chosen": -18.912914276123047, "logits/rejected": -18.491092681884766, "logps/chosen": -342.7139587402344, "logps/rejected": -318.7275695800781, "loss": 0.4675, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.7155163288116455, "rewards/margins": 1.4404252767562866, "rewards/rejected": 1.2750909328460693, "step": 63770 }, { "epoch": 2.9611402572078553, "grad_norm": 22.59868812561035, "learning_rate": 6.708760852407261e-09, "logits/chosen": -19.346511840820312, "logits/rejected": -18.23521614074707, "logps/chosen": -353.63507080078125, "logps/rejected": -304.43133544921875, "loss": 0.3361, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.2168071269989014, "rewards/margins": 1.8342756032943726, "rewards/rejected": 1.382531762123108, "step": 63780 }, { "epoch": 2.9616045313152886, "grad_norm": 48.19343948364258, "learning_rate": 6.631381834501756e-09, "logits/chosen": -18.891780853271484, "logits/rejected": -17.768939971923828, "logps/chosen": -516.5865478515625, "logps/rejected": -324.52764892578125, "loss": 0.1592, "rewards/accuracies": 1.0, "rewards/chosen": 4.7208709716796875, "rewards/margins": 2.7274222373962402, "rewards/rejected": 1.9934488534927368, "step": 63790 }, { "epoch": 2.9620688054227218, "grad_norm": 110.93583679199219, "learning_rate": 6.554002816596252e-09, "logits/chosen": -19.756759643554688, "logits/rejected": -19.68555450439453, "logps/chosen": -401.0480041503906, "logps/rejected": -407.21124267578125, "loss": 1.185, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.33297061920166, "rewards/margins": 
0.4323626160621643, "rewards/rejected": 3.9006080627441406, "step": 63800 }, { "epoch": 2.9625330795301545, "grad_norm": 32.78126525878906, "learning_rate": 6.476623798690747e-09, "logits/chosen": -19.38965606689453, "logits/rejected": -18.331928253173828, "logps/chosen": -409.8981018066406, "logps/rejected": -217.08810424804688, "loss": 0.5056, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.1024973392486572, "rewards/margins": 2.2230658531188965, "rewards/rejected": 0.879431426525116, "step": 63810 }, { "epoch": 2.9629973536375878, "grad_norm": 144.45648193359375, "learning_rate": 6.3992447807852414e-09, "logits/chosen": -20.347652435302734, "logits/rejected": -19.343761444091797, "logps/chosen": -441.77618408203125, "logps/rejected": -379.3394775390625, "loss": 0.554, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.123818397521973, "rewards/margins": 1.3754743337631226, "rewards/rejected": 2.7483437061309814, "step": 63820 }, { "epoch": 2.9634616277450205, "grad_norm": 144.17742919921875, "learning_rate": 6.321865762879737e-09, "logits/chosen": -19.4165096282959, "logits/rejected": -19.579341888427734, "logps/chosen": -372.0664978027344, "logps/rejected": -413.3545837402344, "loss": 1.0387, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 4.111189842224121, "rewards/margins": 0.12926307320594788, "rewards/rejected": 3.981926441192627, "step": 63830 }, { "epoch": 2.9639259018524537, "grad_norm": 36.793880462646484, "learning_rate": 6.244486744974233e-09, "logits/chosen": -18.786075592041016, "logits/rejected": -17.879962921142578, "logps/chosen": -433.5289001464844, "logps/rejected": -336.24200439453125, "loss": 0.383, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.6168103218078613, "rewards/margins": 1.9332422018051147, "rewards/rejected": 1.6835676431655884, "step": 63840 }, { "epoch": 2.9643901759598865, "grad_norm": 11.000682830810547, "learning_rate": 6.167107727068727e-09, "logits/chosen": 
-20.307960510253906, "logits/rejected": -19.906803131103516, "logps/chosen": -461.07928466796875, "logps/rejected": -382.8768615722656, "loss": 0.5027, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.9964847564697266, "rewards/margins": 1.5823159217834473, "rewards/rejected": 2.4141688346862793, "step": 63850 }, { "epoch": 2.9648544500673197, "grad_norm": 79.23046112060547, "learning_rate": 6.089728709163223e-09, "logits/chosen": -19.017005920410156, "logits/rejected": -18.416452407836914, "logps/chosen": -405.5909118652344, "logps/rejected": -370.87298583984375, "loss": 0.5634, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6605401039123535, "rewards/margins": 1.729303002357483, "rewards/rejected": 1.9312368631362915, "step": 63860 }, { "epoch": 2.965318724174753, "grad_norm": 135.2073516845703, "learning_rate": 6.0123496912577185e-09, "logits/chosen": -19.363412857055664, "logits/rejected": -19.217144012451172, "logps/chosen": -272.5785217285156, "logps/rejected": -267.0726013183594, "loss": 0.5364, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.7365574836730957, "rewards/margins": 1.3333380222320557, "rewards/rejected": 1.40321946144104, "step": 63870 }, { "epoch": 2.9657829982821857, "grad_norm": 15.821976661682129, "learning_rate": 5.934970673352213e-09, "logits/chosen": -19.24368667602539, "logits/rejected": -17.926210403442383, "logps/chosen": -460.6332092285156, "logps/rejected": -261.91680908203125, "loss": 0.2181, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.433373928070068, "rewards/margins": 3.047441005706787, "rewards/rejected": 1.3859331607818604, "step": 63880 }, { "epoch": 2.966247272389619, "grad_norm": 161.42063903808594, "learning_rate": 5.857591655446709e-09, "logits/chosen": -18.355985641479492, "logits/rejected": -18.042797088623047, "logps/chosen": -451.3236389160156, "logps/rejected": -371.4873962402344, "loss": 1.1839, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 
4.621646404266357, "rewards/margins": 1.2354505062103271, "rewards/rejected": 3.386195659637451, "step": 63890 }, { "epoch": 2.9667115464970517, "grad_norm": 117.94379425048828, "learning_rate": 5.780212637541204e-09, "logits/chosen": -19.40583610534668, "logits/rejected": -17.88268280029297, "logps/chosen": -447.98321533203125, "logps/rejected": -251.1012725830078, "loss": 0.3833, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.0254359245300293, "rewards/margins": 1.9458097219467163, "rewards/rejected": 1.0796263217926025, "step": 63900 }, { "epoch": 2.967175820604485, "grad_norm": 2.0010781288146973, "learning_rate": 5.702833619635699e-09, "logits/chosen": -19.36237335205078, "logits/rejected": -17.605609893798828, "logps/chosen": -537.894775390625, "logps/rejected": -373.8617858886719, "loss": 0.1005, "rewards/accuracies": 1.0, "rewards/chosen": 5.5990729331970215, "rewards/margins": 3.5107827186584473, "rewards/rejected": 2.088290214538574, "step": 63910 }, { "epoch": 2.9676400947119177, "grad_norm": 72.46248626708984, "learning_rate": 5.625454601730194e-09, "logits/chosen": -18.427791595458984, "logits/rejected": -17.373659133911133, "logps/chosen": -413.4175720214844, "logps/rejected": -310.24853515625, "loss": 0.5078, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.232882022857666, "rewards/margins": 1.8337180614471436, "rewards/rejected": 1.3991641998291016, "step": 63920 }, { "epoch": 2.968104368819351, "grad_norm": 30.723613739013672, "learning_rate": 5.54807558382469e-09, "logits/chosen": -19.143329620361328, "logits/rejected": -17.942272186279297, "logps/chosen": -406.57696533203125, "logps/rejected": -278.3713073730469, "loss": 0.8615, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.266946792602539, "rewards/margins": 1.458074688911438, "rewards/rejected": 1.808871865272522, "step": 63930 }, { "epoch": 2.968568642926784, "grad_norm": 33.53153991699219, "learning_rate": 5.470696565919185e-09, "logits/chosen": 
-19.317279815673828, "logits/rejected": -18.428499221801758, "logps/chosen": -354.9376220703125, "logps/rejected": -265.9207763671875, "loss": 0.7796, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.105449914932251, "rewards/margins": 1.5579437017440796, "rewards/rejected": 1.5475060939788818, "step": 63940 }, { "epoch": 2.969032917034217, "grad_norm": 107.67124938964844, "learning_rate": 5.39331754801368e-09, "logits/chosen": -18.648914337158203, "logits/rejected": -17.89626121520996, "logps/chosen": -425.01898193359375, "logps/rejected": -413.1446228027344, "loss": 1.1901, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.3884999752044678, "rewards/margins": 0.31767457723617554, "rewards/rejected": 2.0708255767822266, "step": 63950 }, { "epoch": 2.96949719114165, "grad_norm": 96.20186614990234, "learning_rate": 5.315938530108176e-09, "logits/chosen": -20.59189224243164, "logits/rejected": -19.14801025390625, "logps/chosen": -476.55426025390625, "logps/rejected": -363.6645202636719, "loss": 0.538, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.450714111328125, "rewards/margins": 1.6662395000457764, "rewards/rejected": 2.7844746112823486, "step": 63960 }, { "epoch": 2.9699614652490833, "grad_norm": 220.8039093017578, "learning_rate": 5.238559512202671e-09, "logits/chosen": -19.217973709106445, "logits/rejected": -19.070491790771484, "logps/chosen": -452.3885803222656, "logps/rejected": -409.82440185546875, "loss": 0.5702, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 5.018075466156006, "rewards/margins": 1.0342670679092407, "rewards/rejected": 3.9838080406188965, "step": 63970 }, { "epoch": 2.970425739356516, "grad_norm": 0.31198909878730774, "learning_rate": 5.161180494297166e-09, "logits/chosen": -20.295604705810547, "logits/rejected": -19.319442749023438, "logps/chosen": -373.7816162109375, "logps/rejected": -264.14654541015625, "loss": 1.0744, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 
3.8410840034484863, "rewards/margins": 1.596454381942749, "rewards/rejected": 2.244629383087158, "step": 63980 }, { "epoch": 2.970890013463949, "grad_norm": 120.90410614013672, "learning_rate": 5.083801476391662e-09, "logits/chosen": -19.30991554260254, "logits/rejected": -17.398422241210938, "logps/chosen": -423.63330078125, "logps/rejected": -329.6810607910156, "loss": 0.3912, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.564778804779053, "rewards/margins": 1.9683078527450562, "rewards/rejected": 2.5964713096618652, "step": 63990 }, { "epoch": 2.971354287571382, "grad_norm": 286.4439697265625, "learning_rate": 5.006422458486157e-09, "logits/chosen": -19.01304054260254, "logits/rejected": -18.427085876464844, "logps/chosen": -339.3192138671875, "logps/rejected": -331.94024658203125, "loss": 0.6579, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.3187096118927, "rewards/margins": 1.1883783340454102, "rewards/rejected": 2.130331039428711, "step": 64000 }, { "epoch": 2.9718185616788153, "grad_norm": 30.929391860961914, "learning_rate": 4.929043440580652e-09, "logits/chosen": -18.153608322143555, "logits/rejected": -17.769309997558594, "logps/chosen": -290.253662109375, "logps/rejected": -249.47207641601562, "loss": 0.6259, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.530981421470642, "rewards/margins": 1.183826208114624, "rewards/rejected": 0.3471551835536957, "step": 64010 }, { "epoch": 2.972282835786248, "grad_norm": 44.96504211425781, "learning_rate": 4.851664422675147e-09, "logits/chosen": -18.978260040283203, "logits/rejected": -18.627296447753906, "logps/chosen": -441.01544189453125, "logps/rejected": -400.45404052734375, "loss": 0.979, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 3.3873684406280518, "rewards/margins": 0.509831964969635, "rewards/rejected": 2.8775367736816406, "step": 64020 }, { "epoch": 2.9727471098936813, "grad_norm": 9.905720710754395, "learning_rate": 4.774285404769643e-09, 
"logits/chosen": -18.103347778320312, "logits/rejected": -18.520164489746094, "logps/chosen": -358.15692138671875, "logps/rejected": -374.6416931152344, "loss": 1.2767, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.0183629989624023, "rewards/margins": 0.05718288570642471, "rewards/rejected": 2.9611799716949463, "step": 64030 }, { "epoch": 2.9732113840011145, "grad_norm": 211.87188720703125, "learning_rate": 4.6969063868641375e-09, "logits/chosen": -18.725528717041016, "logits/rejected": -17.884340286254883, "logps/chosen": -400.7248229980469, "logps/rejected": -260.8279113769531, "loss": 0.8126, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.7676949501037598, "rewards/margins": 1.5328716039657593, "rewards/rejected": 2.234823226928711, "step": 64040 }, { "epoch": 2.9736756581085473, "grad_norm": 210.0655975341797, "learning_rate": 4.619527368958632e-09, "logits/chosen": -18.9584903717041, "logits/rejected": -18.934329986572266, "logps/chosen": -500.5013732910156, "logps/rejected": -430.95501708984375, "loss": 0.7923, "rewards/accuracies": 0.5, "rewards/chosen": 4.015151023864746, "rewards/margins": 0.16717728972434998, "rewards/rejected": 3.847973346710205, "step": 64050 }, { "epoch": 2.97413993221598, "grad_norm": 57.228572845458984, "learning_rate": 4.542148351053129e-09, "logits/chosen": -20.388185501098633, "logits/rejected": -18.740901947021484, "logps/chosen": -447.9723205566406, "logps/rejected": -361.0072021484375, "loss": 0.295, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.869755268096924, "rewards/margins": 1.9521700143814087, "rewards/rejected": 1.9175856113433838, "step": 64060 }, { "epoch": 2.9746042063234133, "grad_norm": 136.8776092529297, "learning_rate": 4.4647693331476234e-09, "logits/chosen": -20.194072723388672, "logits/rejected": -19.11325454711914, "logps/chosen": -379.66412353515625, "logps/rejected": -317.05682373046875, "loss": 0.4003, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 
4.035046577453613, "rewards/margins": 1.5601640939712524, "rewards/rejected": 2.4748823642730713, "step": 64070 }, { "epoch": 2.9750684804308465, "grad_norm": 3.0398499965667725, "learning_rate": 4.387390315242118e-09, "logits/chosen": -18.201839447021484, "logits/rejected": -18.274492263793945, "logps/chosen": -366.6111755371094, "logps/rejected": -311.7259216308594, "loss": 0.6966, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.2711315155029297, "rewards/margins": 1.1068607568740845, "rewards/rejected": 2.1642708778381348, "step": 64080 }, { "epoch": 2.9755327545382793, "grad_norm": 5.821261882781982, "learning_rate": 4.310011297336615e-09, "logits/chosen": -19.239595413208008, "logits/rejected": -17.670589447021484, "logps/chosen": -399.17559814453125, "logps/rejected": -235.4336700439453, "loss": 0.276, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.1839053630828857, "rewards/margins": 2.8196024894714355, "rewards/rejected": 0.36430293321609497, "step": 64090 }, { "epoch": 2.9759970286457125, "grad_norm": 6.089181900024414, "learning_rate": 4.232632279431109e-09, "logits/chosen": -19.375308990478516, "logits/rejected": -18.401432037353516, "logps/chosen": -434.60089111328125, "logps/rejected": -377.44207763671875, "loss": 1.0955, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.8018195629119873, "rewards/margins": 0.818414568901062, "rewards/rejected": 2.983405351638794, "step": 64100 }, { "epoch": 2.9764613027531457, "grad_norm": 162.1511993408203, "learning_rate": 4.155253261525604e-09, "logits/chosen": -18.90231704711914, "logits/rejected": -18.37557029724121, "logps/chosen": -394.9859619140625, "logps/rejected": -340.90130615234375, "loss": 0.7339, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.8039848804473877, "rewards/margins": 1.4649146795272827, "rewards/rejected": 2.3390705585479736, "step": 64110 }, { "epoch": 2.9769255768605785, "grad_norm": 186.88290405273438, "learning_rate": 
4.0778742436201e-09, "logits/chosen": -18.26791000366211, "logits/rejected": -18.2582950592041, "logps/chosen": -386.36370849609375, "logps/rejected": -345.60504150390625, "loss": 1.0538, "rewards/accuracies": 0.5, "rewards/chosen": 2.5067811012268066, "rewards/margins": -0.14445587992668152, "rewards/rejected": 2.6512372493743896, "step": 64120 }, { "epoch": 2.9773898509680117, "grad_norm": 186.62142944335938, "learning_rate": 4.000495225714595e-09, "logits/chosen": -19.12310218811035, "logits/rejected": -18.949071884155273, "logps/chosen": -385.8204040527344, "logps/rejected": -407.24285888671875, "loss": 1.106, "rewards/accuracies": 0.5, "rewards/chosen": 2.6946451663970947, "rewards/margins": 0.07973375171422958, "rewards/rejected": 2.6149115562438965, "step": 64130 }, { "epoch": 2.9778541250754444, "grad_norm": 0.07767084240913391, "learning_rate": 3.92311620780909e-09, "logits/chosen": -20.47315216064453, "logits/rejected": -18.957988739013672, "logps/chosen": -400.9795837402344, "logps/rejected": -306.3951416015625, "loss": 0.3952, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.1551971435546875, "rewards/margins": 2.7715728282928467, "rewards/rejected": 1.3836246728897095, "step": 64140 }, { "epoch": 2.9783183991828777, "grad_norm": 53.36372375488281, "learning_rate": 3.8457371899035856e-09, "logits/chosen": -19.421743392944336, "logits/rejected": -18.671987533569336, "logps/chosen": -520.4072875976562, "logps/rejected": -375.94512939453125, "loss": 0.1907, "rewards/accuracies": 1.0, "rewards/chosen": 5.736571788787842, "rewards/margins": 2.948863983154297, "rewards/rejected": 2.787707805633545, "step": 64150 }, { "epoch": 2.9787826732903104, "grad_norm": 47.87133026123047, "learning_rate": 3.768358171998081e-09, "logits/chosen": -18.50984764099121, "logits/rejected": -18.737668991088867, "logps/chosen": -388.8401184082031, "logps/rejected": -383.98773193359375, "loss": 0.9578, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 
3.3968780040740967, "rewards/margins": 0.010895365849137306, "rewards/rejected": 3.3859825134277344, "step": 64160 }, { "epoch": 2.9792469473977437, "grad_norm": 284.05413818359375, "learning_rate": 3.6909791540925763e-09, "logits/chosen": -18.72211265563965, "logits/rejected": -17.99986457824707, "logps/chosen": -332.44488525390625, "logps/rejected": -337.0299072265625, "loss": 1.1617, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.6705543994903564, "rewards/margins": 0.8502914309501648, "rewards/rejected": 2.820262908935547, "step": 64170 }, { "epoch": 2.979711221505177, "grad_norm": 24.81778335571289, "learning_rate": 3.613600136187071e-09, "logits/chosen": -19.3748779296875, "logits/rejected": -18.314645767211914, "logps/chosen": -420.61993408203125, "logps/rejected": -346.587158203125, "loss": 0.5837, "rewards/accuracies": 0.5, "rewards/chosen": 3.3672637939453125, "rewards/margins": 1.2869884967803955, "rewards/rejected": 2.080275058746338, "step": 64180 }, { "epoch": 2.9801754956126096, "grad_norm": 13.819673538208008, "learning_rate": 3.5362211182815666e-09, "logits/chosen": -19.15408706665039, "logits/rejected": -18.178985595703125, "logps/chosen": -345.0204162597656, "logps/rejected": -265.0532531738281, "loss": 0.2021, "rewards/accuracies": 1.0, "rewards/chosen": 3.3100883960723877, "rewards/margins": 2.107321262359619, "rewards/rejected": 1.202767014503479, "step": 64190 }, { "epoch": 2.980639769720043, "grad_norm": 109.50016021728516, "learning_rate": 3.4588421003760622e-09, "logits/chosen": -19.179134368896484, "logits/rejected": -18.989288330078125, "logps/chosen": -468.7109375, "logps/rejected": -397.3555603027344, "loss": 0.5705, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.9899673461914062, "rewards/margins": 1.1819789409637451, "rewards/rejected": 2.8079888820648193, "step": 64200 }, { "epoch": 2.9811040438274756, "grad_norm": 27.06171417236328, "learning_rate": 3.381463082470557e-09, "logits/chosen": 
-19.813764572143555, "logits/rejected": -19.29509925842285, "logps/chosen": -476.13995361328125, "logps/rejected": -349.61785888671875, "loss": 0.8394, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.990935802459717, "rewards/margins": 0.9520605206489563, "rewards/rejected": 3.0388755798339844, "step": 64210 }, { "epoch": 2.981568317934909, "grad_norm": 69.41825866699219, "learning_rate": 3.3040840645650526e-09, "logits/chosen": -18.621395111083984, "logits/rejected": -18.67079734802246, "logps/chosen": -316.64324951171875, "logps/rejected": -322.53912353515625, "loss": 1.1222, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.69309401512146, "rewards/margins": -0.05790863186120987, "rewards/rejected": 2.751002788543701, "step": 64220 }, { "epoch": 2.9820325920423416, "grad_norm": 35.05979919433594, "learning_rate": 3.2267050466595473e-09, "logits/chosen": -19.150001525878906, "logits/rejected": -18.274362564086914, "logps/chosen": -395.49432373046875, "logps/rejected": -353.23516845703125, "loss": 0.2974, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.049111843109131, "rewards/margins": 1.4829455614089966, "rewards/rejected": 2.566166400909424, "step": 64230 }, { "epoch": 2.982496866149775, "grad_norm": 158.83828735351562, "learning_rate": 3.149326028754043e-09, "logits/chosen": -18.83196449279785, "logits/rejected": -17.899511337280273, "logps/chosen": -312.4810791015625, "logps/rejected": -275.80902099609375, "loss": 0.9478, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.4812417030334473, "rewards/margins": 1.3191487789154053, "rewards/rejected": 2.162092924118042, "step": 64240 }, { "epoch": 2.982961140257208, "grad_norm": 4.869825839996338, "learning_rate": 3.0719470108485385e-09, "logits/chosen": -19.32152557373047, "logits/rejected": -18.0798397064209, "logps/chosen": -514.0711669921875, "logps/rejected": -332.1479797363281, "loss": 0.276, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 
5.340117454528809, "rewards/margins": 2.8123106956481934, "rewards/rejected": 2.5278067588806152, "step": 64250 }, { "epoch": 2.983425414364641, "grad_norm": 239.08302307128906, "learning_rate": 2.9945679929430332e-09, "logits/chosen": -19.376585006713867, "logits/rejected": -17.830135345458984, "logps/chosen": -427.461669921875, "logps/rejected": -289.7210998535156, "loss": 0.5107, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.777593612670898, "rewards/margins": 3.160266160964966, "rewards/rejected": 1.6173276901245117, "step": 64260 }, { "epoch": 2.983889688472074, "grad_norm": 7.915149688720703, "learning_rate": 2.9171889750375288e-09, "logits/chosen": -19.14497947692871, "logits/rejected": -17.709218978881836, "logps/chosen": -443.7813415527344, "logps/rejected": -291.70208740234375, "loss": 0.4982, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.6569318771362305, "rewards/margins": 2.461566925048828, "rewards/rejected": 2.1953654289245605, "step": 64270 }, { "epoch": 2.984353962579507, "grad_norm": 233.68667602539062, "learning_rate": 2.839809957132024e-09, "logits/chosen": -18.594074249267578, "logits/rejected": -18.310680389404297, "logps/chosen": -434.6293029785156, "logps/rejected": -405.56243896484375, "loss": 1.0125, "rewards/accuracies": 0.5, "rewards/chosen": 4.337759017944336, "rewards/margins": 0.3251453638076782, "rewards/rejected": 4.012613296508789, "step": 64280 }, { "epoch": 2.98481823668694, "grad_norm": 0.1780867874622345, "learning_rate": 2.762430939226519e-09, "logits/chosen": -19.174413681030273, "logits/rejected": -19.13595962524414, "logps/chosen": -380.63421630859375, "logps/rejected": -374.24383544921875, "loss": 1.3735, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.1896538734436035, "rewards/margins": 0.6769925951957703, "rewards/rejected": 2.5126614570617676, "step": 64290 }, { "epoch": 2.985282510794373, "grad_norm": 211.4679412841797, "learning_rate": 2.6850519213210147e-09, 
"logits/chosen": -20.677534103393555, "logits/rejected": -19.465564727783203, "logps/chosen": -393.5762634277344, "logps/rejected": -280.42388916015625, "loss": 0.6475, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.225144147872925, "rewards/margins": 1.8978464603424072, "rewards/rejected": 1.3272976875305176, "step": 64300 }, { "epoch": 2.985746784901806, "grad_norm": 210.4291229248047, "learning_rate": 2.60767290341551e-09, "logits/chosen": -18.030742645263672, "logits/rejected": -18.252161026000977, "logps/chosen": -357.04473876953125, "logps/rejected": -367.68463134765625, "loss": 1.4377, "rewards/accuracies": 0.5, "rewards/chosen": 3.124939441680908, "rewards/margins": 0.4655916094779968, "rewards/rejected": 2.6593480110168457, "step": 64310 }, { "epoch": 2.986211059009239, "grad_norm": 76.801025390625, "learning_rate": 2.530293885510005e-09, "logits/chosen": -18.84645652770996, "logits/rejected": -17.76552391052246, "logps/chosen": -331.83868408203125, "logps/rejected": -248.66665649414062, "loss": 0.3271, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.484487533569336, "rewards/margins": 1.9169527292251587, "rewards/rejected": 1.5675350427627563, "step": 64320 }, { "epoch": 2.986675333116672, "grad_norm": 177.88954162597656, "learning_rate": 2.4529148676045e-09, "logits/chosen": -19.556947708129883, "logits/rejected": -18.438800811767578, "logps/chosen": -425.6004333496094, "logps/rejected": -370.51739501953125, "loss": 0.5136, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.692996025085449, "rewards/margins": 1.5887008905410767, "rewards/rejected": 3.104295015335083, "step": 64330 }, { "epoch": 2.987139607224105, "grad_norm": 98.8775863647461, "learning_rate": 2.3755358496989954e-09, "logits/chosen": -20.42262840270996, "logits/rejected": -18.742305755615234, "logps/chosen": -389.95263671875, "logps/rejected": -289.11663818359375, "loss": 0.3076, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 
3.326416492462158, "rewards/margins": 1.6453750133514404, "rewards/rejected": 1.6810414791107178, "step": 64340 }, { "epoch": 2.987603881331538, "grad_norm": 1.8982335329055786, "learning_rate": 2.298156831793491e-09, "logits/chosen": -19.92592430114746, "logits/rejected": -19.5489559173584, "logps/chosen": -363.09820556640625, "logps/rejected": -321.96435546875, "loss": 0.9868, "rewards/accuracies": 0.5, "rewards/chosen": 2.8871378898620605, "rewards/margins": 0.8400553464889526, "rewards/rejected": 2.0470826625823975, "step": 64350 }, { "epoch": 2.988068155438971, "grad_norm": 80.52617645263672, "learning_rate": 2.220777813887986e-09, "logits/chosen": -19.184432983398438, "logits/rejected": -18.292577743530273, "logps/chosen": -413.08160400390625, "logps/rejected": -357.41082763671875, "loss": 0.5263, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.626481533050537, "rewards/margins": 0.9552600979804993, "rewards/rejected": 1.671221375465393, "step": 64360 }, { "epoch": 2.988532429546404, "grad_norm": 51.89841079711914, "learning_rate": 2.1433987959824813e-09, "logits/chosen": -18.69496726989746, "logits/rejected": -17.81989288330078, "logps/chosen": -400.8126525878906, "logps/rejected": -322.6590270996094, "loss": 0.9561, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.1041440963745117, "rewards/margins": 1.2911878824234009, "rewards/rejected": 1.8129562139511108, "step": 64370 }, { "epoch": 2.988996703653837, "grad_norm": 4.121476650238037, "learning_rate": 2.0660197780769764e-09, "logits/chosen": -18.717605590820312, "logits/rejected": -17.700868606567383, "logps/chosen": -391.874267578125, "logps/rejected": -361.4237976074219, "loss": 0.7353, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.289586067199707, "rewards/margins": 0.37958523631095886, "rewards/rejected": 1.9100008010864258, "step": 64380 }, { "epoch": 2.9894609777612704, "grad_norm": 45.9666748046875, "learning_rate": 1.988640760171472e-09, "logits/chosen": 
-18.795021057128906, "logits/rejected": -18.008275985717773, "logps/chosen": -457.971923828125, "logps/rejected": -371.06591796875, "loss": 0.7009, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.3124260902404785, "rewards/margins": 0.937320351600647, "rewards/rejected": 2.375105619430542, "step": 64390 }, { "epoch": 2.989925251868703, "grad_norm": 49.03023147583008, "learning_rate": 1.911261742265967e-09, "logits/chosen": -17.964366912841797, "logits/rejected": -18.295949935913086, "logps/chosen": -328.37054443359375, "logps/rejected": -328.14764404296875, "loss": 0.94, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.291367292404175, "rewards/margins": 0.5229073762893677, "rewards/rejected": 1.768459677696228, "step": 64400 }, { "epoch": 2.9903895259761364, "grad_norm": 145.5160369873047, "learning_rate": 1.8338827243604623e-09, "logits/chosen": -18.702938079833984, "logits/rejected": -17.650754928588867, "logps/chosen": -342.4078674316406, "logps/rejected": -239.8215789794922, "loss": 0.6132, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.3181445598602295, "rewards/margins": 2.330747604370117, "rewards/rejected": 0.9873968958854675, "step": 64410 }, { "epoch": 2.9908538000835696, "grad_norm": 229.45596313476562, "learning_rate": 1.7565037064549575e-09, "logits/chosen": -19.437660217285156, "logits/rejected": -20.566186904907227, "logps/chosen": -340.09857177734375, "logps/rejected": -438.349365234375, "loss": 1.2271, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 2.993788719177246, "rewards/margins": -0.541567325592041, "rewards/rejected": 3.535356044769287, "step": 64420 }, { "epoch": 2.9913180741910024, "grad_norm": 2.221419095993042, "learning_rate": 1.6791246885494529e-09, "logits/chosen": -18.963176727294922, "logits/rejected": -17.997699737548828, "logps/chosen": -558.4782104492188, "logps/rejected": -390.2214050292969, "loss": 0.3796, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 
4.494084358215332, "rewards/margins": 1.9791005849838257, "rewards/rejected": 2.514984130859375, "step": 64430 }, { "epoch": 2.991782348298435, "grad_norm": 63.51863479614258, "learning_rate": 1.6017456706439482e-09, "logits/chosen": -19.588769912719727, "logits/rejected": -18.612873077392578, "logps/chosen": -393.45355224609375, "logps/rejected": -292.05657958984375, "loss": 0.7071, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.728790760040283, "rewards/margins": 1.0068410634994507, "rewards/rejected": 2.721949815750122, "step": 64440 }, { "epoch": 2.9922466224058684, "grad_norm": 186.8231658935547, "learning_rate": 1.5243666527384434e-09, "logits/chosen": -19.121370315551758, "logits/rejected": -19.081684112548828, "logps/chosen": -428.5420837402344, "logps/rejected": -403.1763610839844, "loss": 0.5627, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.189263343811035, "rewards/margins": 1.4930980205535889, "rewards/rejected": 2.696165084838867, "step": 64450 }, { "epoch": 2.9927108965133016, "grad_norm": 262.63421630859375, "learning_rate": 1.4469876348329386e-09, "logits/chosen": -19.821195602416992, "logits/rejected": -18.526079177856445, "logps/chosen": -462.8419494628906, "logps/rejected": -390.6852722167969, "loss": 0.5127, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.486141204833984, "rewards/margins": 1.8779737949371338, "rewards/rejected": 2.6081674098968506, "step": 64460 }, { "epoch": 2.9931751706207343, "grad_norm": 29.505359649658203, "learning_rate": 1.3696086169274337e-09, "logits/chosen": -19.628028869628906, "logits/rejected": -18.70142364501953, "logps/chosen": -437.78594970703125, "logps/rejected": -348.18499755859375, "loss": 0.6126, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.5150344371795654, "rewards/margins": 1.1184654235839844, "rewards/rejected": 2.39656925201416, "step": 64470 }, { "epoch": 2.9936394447281676, "grad_norm": 0.0027863590512424707, "learning_rate": 
1.2922295990219293e-09, "logits/chosen": -19.75725555419922, "logits/rejected": -18.22597885131836, "logps/chosen": -392.52191162109375, "logps/rejected": -291.5907897949219, "loss": 0.3287, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.9199070930480957, "rewards/margins": 3.252061367034912, "rewards/rejected": 0.6678458452224731, "step": 64480 }, { "epoch": 2.994103718835601, "grad_norm": 236.48306274414062, "learning_rate": 1.2148505811164245e-09, "logits/chosen": -18.583297729492188, "logits/rejected": -17.48786163330078, "logps/chosen": -416.68267822265625, "logps/rejected": -376.51434326171875, "loss": 0.6223, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 4.414263725280762, "rewards/margins": 1.283869981765747, "rewards/rejected": 3.1303939819335938, "step": 64490 }, { "epoch": 2.9945679929430336, "grad_norm": 3.972989797592163, "learning_rate": 1.1374715632109196e-09, "logits/chosen": -18.92525863647461, "logits/rejected": -18.48126792907715, "logps/chosen": -314.0093688964844, "logps/rejected": -223.7864227294922, "loss": 0.5987, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.2532427310943604, "rewards/margins": 1.2997839450836182, "rewards/rejected": 0.9534587860107422, "step": 64500 }, { "epoch": 2.9950322670504663, "grad_norm": 18.336917877197266, "learning_rate": 1.060092545305415e-09, "logits/chosen": -19.7398738861084, "logits/rejected": -20.098575592041016, "logps/chosen": -296.74884033203125, "logps/rejected": -342.4451904296875, "loss": 0.9822, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.9645323753356934, "rewards/margins": 0.3644176125526428, "rewards/rejected": 2.600114583969116, "step": 64510 }, { "epoch": 2.9954965411578995, "grad_norm": 9.55074691772461, "learning_rate": 9.827135273999102e-10, "logits/chosen": -19.69094467163086, "logits/rejected": -18.43178367614746, "logps/chosen": -380.78564453125, "logps/rejected": -316.83734130859375, "loss": 0.7129, "rewards/accuracies": 0.5, 
"rewards/chosen": 3.5048165321350098, "rewards/margins": 0.8661490678787231, "rewards/rejected": 2.638666868209839, "step": 64520 }, { "epoch": 2.9959608152653328, "grad_norm": 160.69857788085938, "learning_rate": 9.053345094944054e-10, "logits/chosen": -18.9793643951416, "logits/rejected": -18.12508773803711, "logps/chosen": -406.3864440917969, "logps/rejected": -325.47784423828125, "loss": 0.3924, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.7793869972229004, "rewards/margins": 1.3974696397781372, "rewards/rejected": 2.3819172382354736, "step": 64530 }, { "epoch": 2.9964250893727655, "grad_norm": 0.42347708344459534, "learning_rate": 8.279554915889007e-10, "logits/chosen": -19.12906265258789, "logits/rejected": -18.323820114135742, "logps/chosen": -344.9168701171875, "logps/rejected": -280.91961669921875, "loss": 0.4587, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 3.449845552444458, "rewards/margins": 1.893072485923767, "rewards/rejected": 1.5567728281021118, "step": 64540 }, { "epoch": 2.9968893634801987, "grad_norm": 21.205596923828125, "learning_rate": 7.50576473683396e-10, "logits/chosen": -19.174114227294922, "logits/rejected": -18.844633102416992, "logps/chosen": -334.5975036621094, "logps/rejected": -311.613037109375, "loss": 0.6368, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1843132972717285, "rewards/margins": 0.6387394666671753, "rewards/rejected": 2.5455737113952637, "step": 64550 }, { "epoch": 2.997353637587632, "grad_norm": 70.11481475830078, "learning_rate": 6.731974557778912e-10, "logits/chosen": -20.231977462768555, "logits/rejected": -19.38433837890625, "logps/chosen": -365.53228759765625, "logps/rejected": -186.3464813232422, "loss": 0.6404, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.576854705810547, "rewards/margins": 2.3843588829040527, "rewards/rejected": 1.192495584487915, "step": 64560 }, { "epoch": 2.9978179116950647, "grad_norm": 123.03356170654297, "learning_rate": 
5.958184378723864e-10, "logits/chosen": -19.953584671020508, "logits/rejected": -19.319625854492188, "logps/chosen": -376.47991943359375, "logps/rejected": -305.0920715332031, "loss": 0.3205, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 4.509020805358887, "rewards/margins": 1.8508962392807007, "rewards/rejected": 2.658125162124634, "step": 64570 }, { "epoch": 2.998282185802498, "grad_norm": 32.829898834228516, "learning_rate": 5.184394199668818e-10, "logits/chosen": -18.959651947021484, "logits/rejected": -17.8935489654541, "logps/chosen": -430.1485290527344, "logps/rejected": -319.8473205566406, "loss": 0.3868, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 4.612296104431152, "rewards/margins": 2.1019530296325684, "rewards/rejected": 2.510343074798584, "step": 64580 }, { "epoch": 2.9987464599099307, "grad_norm": 212.6770477294922, "learning_rate": 4.41060402061377e-10, "logits/chosen": -19.613597869873047, "logits/rejected": -18.98232650756836, "logps/chosen": -563.52099609375, "logps/rejected": -419.409912109375, "loss": 0.7248, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 5.386358261108398, "rewards/margins": 1.1960694789886475, "rewards/rejected": 4.19028902053833, "step": 64590 }, { "epoch": 2.999210734017364, "grad_norm": 79.72403717041016, "learning_rate": 3.6368138415587226e-10, "logits/chosen": -18.641956329345703, "logits/rejected": -18.10024642944336, "logps/chosen": -469.4883728027344, "logps/rejected": -358.2345275878906, "loss": 0.4926, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 5.174752712249756, "rewards/margins": 1.1776841878890991, "rewards/rejected": 3.9970688819885254, "step": 64600 }, { "epoch": 2.9996750081247967, "grad_norm": 79.18889617919922, "learning_rate": 2.8630236625036753e-10, "logits/chosen": -18.985483169555664, "logits/rejected": -18.063180923461914, "logps/chosen": -540.1299438476562, "logps/rejected": -435.4956970214844, "loss": 0.2588, "rewards/accuracies": 1.0, 
"rewards/chosen": 6.295302391052246, "rewards/margins": 1.6691440343856812, "rewards/rejected": 4.626158714294434, "step": 64610 } ], "logging_steps": 10, "max_steps": 64617, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }