{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9973828840617638, "eval_steps": 100, "global_step": 954, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002093692750588851, "grad_norm": 38.333577570579074, "learning_rate": 1.0416666666666666e-08, "logits/chosen": 5.468747138977051, "logits/rejected": 5.353150367736816, "logps/chosen": -399.0700988769531, "logps/rejected": -414.2703857421875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.02093692750588851, "grad_norm": 36.45097456473781, "learning_rate": 1.0416666666666667e-07, "logits/chosen": 4.634159088134766, "logits/rejected": 4.8650617599487305, "logps/chosen": -481.9865417480469, "logps/rejected": -402.9172668457031, "loss": 0.7192, "rewards/accuracies": 0.4236111044883728, "rewards/chosen": 0.036201052367687225, "rewards/margins": 0.05521820858120918, "rewards/rejected": -0.01901715248823166, "step": 10 }, { "epoch": 0.04187385501177702, "grad_norm": 38.31461130932718, "learning_rate": 2.0833333333333333e-07, "logits/chosen": 4.8017449378967285, "logits/rejected": 5.193596363067627, "logps/chosen": -428.74591064453125, "logps/rejected": -379.7098693847656, "loss": 0.7525, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.014457901008427143, "rewards/margins": -0.03727109357714653, "rewards/rejected": 0.0517289862036705, "step": 20 }, { "epoch": 0.06281078251766553, "grad_norm": 38.39033659648525, "learning_rate": 3.1249999999999997e-07, "logits/chosen": 4.625308513641357, "logits/rejected": 4.913487434387207, "logps/chosen": -459.8934631347656, "logps/rejected": -365.87176513671875, "loss": 0.7389, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 0.032367587089538574, "rewards/margins": 0.07106685638427734, "rewards/rejected": -0.03869926929473877, "step": 30 }, { "epoch": 0.08374771002355404, "grad_norm": 34.959297165636315, "learning_rate": 4.1666666666666667e-07, "logits/chosen": 5.135643005371094, "logits/rejected": 5.29467248916626, "logps/chosen": -388.5003662109375, "logps/rejected": -341.11138916015625, "loss": 0.7521, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.061884332448244095, "rewards/margins": -0.051958512514829636, "rewards/rejected": -0.009925814345479012, "step": 40 }, { "epoch": 0.10468463752944256, "grad_norm": 40.59746593571697, "learning_rate": 5.208333333333334e-07, "logits/chosen": 4.794947147369385, "logits/rejected": 5.206262111663818, "logps/chosen": -418.7637634277344, "logps/rejected": -366.21783447265625, "loss": 0.7539, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.0046131848357617855, "rewards/margins": 0.009517465718090534, "rewards/rejected": -0.014130651950836182, "step": 50 }, { "epoch": 0.12562156503533106, "grad_norm": 35.116262927070615, "learning_rate": 6.249999999999999e-07, "logits/chosen": 4.984349250793457, "logits/rejected": 5.210784435272217, "logps/chosen": -389.5479431152344, "logps/rejected": -355.3258361816406, "loss": 0.7337, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.09027661383152008, "rewards/margins": 0.13924987614154816, "rewards/rejected": -0.048973266035318375, "step": 60 }, { "epoch": 0.14655849254121958, "grad_norm": 37.65630163274615, "learning_rate": 7.291666666666666e-07, "logits/chosen": 5.079476356506348, "logits/rejected": 5.1062331199646, "logps/chosen": -472.6788635253906, "logps/rejected": -410.6566467285156, "loss": 0.7532, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 0.01137494295835495, "rewards/margins": -0.019245151430368423, "rewards/rejected": 0.030620098114013672, "step": 70 }, { "epoch": 0.16749542004710807, "grad_norm": 36.35209638887961, "learning_rate": 8.333333333333333e-07, "logits/chosen": 4.831971645355225, "logits/rejected": 5.179555892944336, "logps/chosen": -465.8661193847656, "logps/rejected": -352.46063232421875, "loss": 0.7337, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.029347699135541916, "rewards/margins": 0.011580700054764748, "rewards/rejected": 0.017767000943422318, "step": 80 }, { "epoch": 0.1884323475529966, "grad_norm": 40.247342074541066, "learning_rate": 9.374999999999999e-07, "logits/chosen": 4.667853355407715, "logits/rejected": 5.083367347717285, "logps/chosen": -410.145263671875, "logps/rejected": -347.24871826171875, "loss": 0.7325, "rewards/accuracies": 0.5625, "rewards/chosen": 0.04706032946705818, "rewards/margins": 0.05596155673265457, "rewards/rejected": -0.008901228196918964, "step": 90 }, { "epoch": 0.2093692750588851, "grad_norm": 36.88448509348576, "learning_rate": 9.999463737538052e-07, "logits/chosen": 5.017066955566406, "logits/rejected": 5.157826900482178, "logps/chosen": -453.6114196777344, "logps/rejected": -376.13214111328125, "loss": 0.7296, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 0.03123999759554863, "rewards/margins": 0.038888636976480484, "rewards/rejected": -0.007648637983947992, "step": 100 }, { "epoch": 0.2093692750588851, "eval_logits/chosen": 4.755386829376221, "eval_logits/rejected": 5.127224445343018, "eval_logps/chosen": -443.48101806640625, "eval_logps/rejected": -377.5273742675781, "eval_loss": 0.7357296347618103, "eval_rewards/accuracies": 0.5515872836112976, "eval_rewards/chosen": 0.01168334111571312, "eval_rewards/margins": 0.03692733868956566, "eval_rewards/rejected": -0.025244001299142838, "eval_runtime": 21.3186, "eval_samples_per_second": 93.815, "eval_steps_per_second": 2.955, "step": 100 }, { "epoch": 0.23030620256477363, "grad_norm": 36.21924654761057, "learning_rate": 9.993432105822034e-07, "logits/chosen": 4.768385887145996, "logits/rejected": 5.076653957366943, "logps/chosen": -449.16375732421875, "logps/rejected": -369.2919006347656, "loss": 0.7211, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": -0.04003235697746277, "rewards/margins": -0.059906214475631714, "rewards/rejected": 0.019873863086104393, "step": 110 }, { "epoch": 0.2512431300706621, "grad_norm": 36.12599439727971, "learning_rate": 9.980706626858607e-07, "logits/chosen": 5.0697174072265625, "logits/rejected": 5.350961208343506, "logps/chosen": -392.9084777832031, "logps/rejected": -342.9964294433594, "loss": 0.7213, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 0.005513651762157679, "rewards/margins": 0.07320307195186615, "rewards/rejected": -0.06768941879272461, "step": 120 }, { "epoch": 0.2721800575765506, "grad_norm": 43.81008737879554, "learning_rate": 9.961304359538434e-07, "logits/chosen": 4.7396440505981445, "logits/rejected": 5.110291957855225, "logps/chosen": -445.08209228515625, "logps/rejected": -356.9689636230469, "loss": 0.7319, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 0.03245898336172104, "rewards/margins": 0.0837341919541359, "rewards/rejected": -0.05127520486712456, "step": 130 }, { "epoch": 0.29311698508243916, "grad_norm": 37.12091478913465, "learning_rate": 9.935251313189563e-07, "logits/chosen": 4.5339274406433105, "logits/rejected": 5.020459175109863, "logps/chosen": -473.4126892089844, "logps/rejected": -364.12939453125, "loss": 0.7193, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 0.008073748089373112, "rewards/margins": 0.035515300929546356, "rewards/rejected": -0.02744155190885067, "step": 140 }, { "epoch": 0.31405391258832765, "grad_norm": 36.18564480374988, "learning_rate": 9.902582412711118e-07, "logits/chosen": 4.540812969207764, "logits/rejected": 4.964258193969727, "logps/chosen": -426.5033264160156, "logps/rejected": -353.1463317871094, "loss": 0.7232, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.01480414904654026, "rewards/margins": 0.06241898611187935, "rewards/rejected": -0.07722313702106476, "step": 150 }, { "epoch": 0.33499084009421615, "grad_norm": 32.90233228487631, "learning_rate": 9.86334145175542e-07, "logits/chosen": 4.807779788970947, "logits/rejected": 5.042156219482422, "logps/chosen": -396.0440673828125, "logps/rejected": -360.52886962890625, "loss": 0.7013, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.010227044112980366, "rewards/margins": 0.055318038910627365, "rewards/rejected": -0.045090995728969574, "step": 160 }, { "epoch": 0.3559277676001047, "grad_norm": 37.311964741290105, "learning_rate": 9.817581034021272e-07, "logits/chosen": 4.897703170776367, "logits/rejected": 5.062272071838379, "logps/chosen": -389.55810546875, "logps/rejected": -329.748779296875, "loss": 0.7043, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.008063828572630882, "rewards/margins": 0.049024712294340134, "rewards/rejected": -0.05708853527903557, "step": 170 }, { "epoch": 0.3768646951059932, "grad_norm": 34.14102924438106, "learning_rate": 9.765362502737097e-07, "logits/chosen": 5.039429187774658, "logits/rejected": 5.049492835998535, "logps/chosen": -384.9471130371094, "logps/rejected": -381.6601257324219, "loss": 0.7091, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 0.004380516707897186, "rewards/margins": 0.038515396416187286, "rewards/rejected": -0.0341348834335804, "step": 180 }, { "epoch": 0.39780162261188173, "grad_norm": 35.88568083270778, "learning_rate": 9.706755858428485e-07, "logits/chosen": 5.025214195251465, "logits/rejected": 5.097342014312744, "logps/chosen": -397.56402587890625, "logps/rejected": -396.12799072265625, "loss": 0.7161, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.056682147085666656, "rewards/margins": 0.023842817172408104, "rewards/rejected": -0.08052496612071991, "step": 190 }, { "epoch": 0.4187385501177702, "grad_norm": 38.60485416145746, "learning_rate": 9.641839665080363e-07, "logits/chosen": 5.1590471267700195, "logits/rejected": 5.290652275085449, "logps/chosen": -399.9522399902344, "logps/rejected": -363.53936767578125, "loss": 0.7062, "rewards/accuracies": 0.53125, "rewards/chosen": 0.04297986626625061, "rewards/margins": 0.06266864389181137, "rewards/rejected": -0.019688773900270462, "step": 200 }, { "epoch": 0.4187385501177702, "eval_logits/chosen": 4.725487232208252, "eval_logits/rejected": 5.087867736816406, "eval_logps/chosen": -443.55450439453125, "eval_logps/rejected": -377.6705627441406, "eval_loss": 0.6988219022750854, "eval_rewards/accuracies": 0.567460298538208, "eval_rewards/chosen": -0.025081120431423187, "eval_rewards/margins": 0.07174728065729141, "eval_rewards/rejected": -0.0968284010887146, "eval_runtime": 21.5315, "eval_samples_per_second": 92.887, "eval_steps_per_second": 2.926, "step": 200 }, { "epoch": 0.4396754776236587, "grad_norm": 36.10684145537435, "learning_rate": 9.570700944819582e-07, "logits/chosen": 4.827897548675537, "logits/rejected": 5.154609680175781, "logps/chosen": -451.2969665527344, "logps/rejected": -372.116455078125, "loss": 0.7141, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.07117662578821182, "rewards/margins": 0.02444976009428501, "rewards/rejected": -0.09562637656927109, "step": 210 }, { "epoch": 0.46061240512954726, "grad_norm": 33.13953697631782, "learning_rate": 9.493435061259129e-07, "logits/chosen": 5.24191427230835, "logits/rejected": 5.477172374725342, "logps/chosen": -365.3572692871094, "logps/rejected": -345.34814453125, "loss": 0.7138, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.07684006541967392, "rewards/margins": -0.033464811742305756, "rewards/rejected": -0.043375253677368164, "step": 220 }, { "epoch": 0.48154933263543576, "grad_norm": 36.46869252662201, "learning_rate": 9.4101455916603e-07, "logits/chosen": 4.996638298034668, "logits/rejected": 5.203185081481934, "logps/chosen": -390.2169494628906, "logps/rejected": -381.74090576171875, "loss": 0.7027, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.006761978380382061, "rewards/margins": 0.11298926174640656, "rewards/rejected": -0.11975125223398209, "step": 230 }, { "epoch": 0.5024862601413242, "grad_norm": 32.465417090707724, "learning_rate": 9.320944188084241e-07, "logits/chosen": 4.961588382720947, "logits/rejected": 5.104936122894287, "logps/chosen": -405.73651123046875, "logps/rejected": -368.39312744140625, "loss": 0.6966, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.04590672254562378, "rewards/margins": 0.21882851421833038, "rewards/rejected": -0.1729217916727066, "step": 240 }, { "epoch": 0.5234231876472127, "grad_norm": 32.56446548910322, "learning_rate": 9.225950427718974e-07, "logits/chosen": 4.295259475708008, "logits/rejected": 4.731950759887695, "logps/chosen": -457.6085510253906, "logps/rejected": -378.92083740234375, "loss": 0.6895, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.08796132355928421, "rewards/margins": 0.07854396849870682, "rewards/rejected": -0.16650527715682983, "step": 250 }, { "epoch": 0.5443601151531012, "grad_norm": 37.3555609307497, "learning_rate": 9.125291652582547e-07, "logits/chosen": 4.772681713104248, "logits/rejected": 4.774602890014648, "logps/chosen": -429.554931640625, "logps/rejected": -350.5638122558594, "loss": 0.697, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.017600687220692635, "rewards/margins": 0.15435068309307098, "rewards/rejected": -0.1367500126361847, "step": 260 }, { "epoch": 0.5652970426589898, "grad_norm": 33.42195670184768, "learning_rate": 9.019102798817195e-07, "logits/chosen": 4.580355644226074, "logits/rejected": 5.011557102203369, "logps/chosen": -446.68438720703125, "logps/rejected": -380.5400390625, "loss": 0.6778, "rewards/accuracies": 0.5625, "rewards/chosen": 0.019351882860064507, "rewards/margins": 0.15824225544929504, "rewards/rejected": -0.13889038562774658, "step": 270 }, { "epoch": 0.5862339701648783, "grad_norm": 39.119947805234894, "learning_rate": 8.90752621580335e-07, "logits/chosen": 5.025314807891846, "logits/rejected": 5.18468713760376, "logps/chosen": -424.27191162109375, "logps/rejected": -344.4115295410156, "loss": 0.7073, "rewards/accuracies": 0.625, "rewards/chosen": 0.023305395618081093, "rewards/margins": 0.14927226305007935, "rewards/rejected": -0.125966876745224, "step": 280 }, { "epoch": 0.6071708976707668, "grad_norm": 33.472133866310024, "learning_rate": 8.79071147533597e-07, "logits/chosen": 4.961835861206055, "logits/rejected": 5.123082637786865, "logps/chosen": -400.4245300292969, "logps/rejected": -388.8963623046875, "loss": 0.6687, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.08449111878871918, "rewards/margins": 0.2874522805213928, "rewards/rejected": -0.20296116173267365, "step": 290 }, { "epoch": 0.6281078251766553, "grad_norm": 38.29458361274397, "learning_rate": 8.668815171119019e-07, "logits/chosen": 4.6071085929870605, "logits/rejected": 4.85768985748291, "logps/chosen": -445.59393310546875, "logps/rejected": -373.83636474609375, "loss": 0.6782, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.043323811143636703, "rewards/margins": 0.2540794312953949, "rewards/rejected": -0.2107556313276291, "step": 300 }, { "epoch": 0.6281078251766553, "eval_logits/chosen": 4.662118434906006, "eval_logits/rejected": 5.016141414642334, "eval_logps/chosen": -443.56884765625, "eval_logps/rejected": -377.883056640625, "eval_loss": 0.6942777037620544, "eval_rewards/accuracies": 0.567460298538208, "eval_rewards/chosen": -0.03225937858223915, "eval_rewards/margins": 0.17080551385879517, "eval_rewards/rejected": -0.20306488871574402, "eval_runtime": 21.6344, "eval_samples_per_second": 92.445, "eval_steps_per_second": 2.912, "step": 300 }, { "epoch": 0.6490447526825438, "grad_norm": 35.277636776310345, "learning_rate": 8.54200070884685e-07, "logits/chosen": 4.7398271560668945, "logits/rejected": 5.0438690185546875, "logps/chosen": -455.08074951171875, "logps/rejected": -346.21905517578125, "loss": 0.6648, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.00532907247543335, "rewards/margins": 0.22818481922149658, "rewards/rejected": -0.23351387679576874, "step": 310 }, { "epoch": 0.6699816801884323, "grad_norm": 36.1242464612453, "learning_rate": 8.410438087153911e-07, "logits/chosen": 4.823008060455322, "logits/rejected": 4.949624538421631, "logps/chosen": -420.04150390625, "logps/rejected": -346.31134033203125, "loss": 0.6633, "rewards/accuracies": 0.5625, "rewards/chosen": -0.02057427167892456, "rewards/margins": 0.1925923228263855, "rewards/rejected": -0.21316656470298767, "step": 320 }, { "epoch": 0.6909186076943209, "grad_norm": 34.485635067716444, "learning_rate": 8.274303669726426e-07, "logits/chosen": 4.866278171539307, "logits/rejected": 5.084838390350342, "logps/chosen": -413.07647705078125, "logps/rejected": -359.72637939453125, "loss": 0.6964, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.0661710649728775, "rewards/margins": 0.11275775730609894, "rewards/rejected": -0.17892882227897644, "step": 330 }, { "epoch": 0.7118555352002094, "grad_norm": 36.021204090397475, "learning_rate": 8.133779948881513e-07, "logits/chosen": 4.962647914886475, "logits/rejected": 5.274256229400635, "logps/chosen": -423.34796142578125, "logps/rejected": -374.83831787109375, "loss": 0.6843, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.06308840215206146, "rewards/margins": 0.14286582171916962, "rewards/rejected": -0.20595422387123108, "step": 340 }, { "epoch": 0.7327924627060979, "grad_norm": 40.80633953486743, "learning_rate": 7.989055300930704e-07, "logits/chosen": 4.9410552978515625, "logits/rejected": 5.171365737915039, "logps/chosen": -401.3800048828125, "logps/rejected": -339.8207702636719, "loss": 0.6799, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.05132729932665825, "rewards/margins": 0.14917483925819397, "rewards/rejected": -0.20050212740898132, "step": 350 }, { "epoch": 0.7537293902119864, "grad_norm": 32.290367261199215, "learning_rate": 7.840323733655778e-07, "logits/chosen": 4.760105609893799, "logits/rejected": 4.936800956726074, "logps/chosen": -475.94305419921875, "logps/rejected": -373.4317626953125, "loss": 0.6723, "rewards/accuracies": 0.65625, "rewards/chosen": 0.040363796055316925, "rewards/margins": 0.26596716046333313, "rewards/rejected": -0.2256033718585968, "step": 360 }, { "epoch": 0.7746663177178749, "grad_norm": 33.44911142948228, "learning_rate": 7.687784626235447e-07, "logits/chosen": 4.649796485900879, "logits/rejected": 4.882054328918457, "logps/chosen": -437.54791259765625, "logps/rejected": -343.1330871582031, "loss": 0.6722, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.03584844991564751, "rewards/margins": 0.2080194056034088, "rewards/rejected": -0.2438678741455078, "step": 370 }, { "epoch": 0.7956032452237635, "grad_norm": 32.64019199720913, "learning_rate": 7.531642461971514e-07, "logits/chosen": 4.7331953048706055, "logits/rejected": 5.047934532165527, "logps/chosen": -434.0751953125, "logps/rejected": -363.1179504394531, "loss": 0.673, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.03724004700779915, "rewards/margins": 0.23209133744239807, "rewards/rejected": -0.2693313956260681, "step": 380 }, { "epoch": 0.816540172729652, "grad_norm": 33.38515796622506, "learning_rate": 7.372106554172801e-07, "logits/chosen": 4.660643577575684, "logits/rejected": 4.7719621658325195, "logps/chosen": -434.41015625, "logps/rejected": -394.9471130371094, "loss": 0.6778, "rewards/accuracies": 0.59375, "rewards/chosen": 0.021881069988012314, "rewards/margins": 0.2415298968553543, "rewards/rejected": -0.2196488082408905, "step": 390 }, { "epoch": 0.8374771002355405, "grad_norm": 32.60957530709497, "learning_rate": 7.209390765564318e-07, "logits/chosen": 4.807684421539307, "logits/rejected": 5.217709541320801, "logps/chosen": -368.07122802734375, "logps/rejected": -328.12066650390625, "loss": 0.6863, "rewards/accuracies": 0.59375, "rewards/chosen": -0.03271085396409035, "rewards/margins": 0.1988353729248047, "rewards/rejected": -0.23154623806476593, "step": 400 }, { "epoch": 0.8374771002355405, "eval_logits/chosen": 4.645900249481201, "eval_logits/rejected": 4.999230861663818, "eval_logps/chosen": -443.68084716796875, "eval_logps/rejected": -378.0348205566406, "eval_loss": 0.6756832003593445, "eval_rewards/accuracies": 0.5992063283920288, "eval_rewards/chosen": -0.08822782337665558, "eval_rewards/margins": 0.19070643186569214, "eval_rewards/rejected": -0.2789342403411865, "eval_runtime": 21.4973, "eval_samples_per_second": 93.035, "eval_steps_per_second": 2.931, "step": 400 }, { "epoch": 0.8584140277414289, "grad_norm": 80.32344495768098, "learning_rate": 7.043713221597773e-07, "logits/chosen": 4.9558234214782715, "logits/rejected": 5.171336650848389, "logps/chosen": -464.4634704589844, "logps/rejected": -378.52130126953125, "loss": 0.6691, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.028427015990018845, "rewards/margins": 0.16248683631420135, "rewards/rejected": -0.1909138560295105, "step": 410 }, { "epoch": 0.8793509552473174, "grad_norm": 38.003974257278905, "learning_rate": 6.875296018047809e-07, "logits/chosen": 5.062918663024902, "logits/rejected": 5.093894958496094, "logps/chosen": -414.597900390625, "logps/rejected": -392.76422119140625, "loss": 0.6778, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.05205658823251724, "rewards/margins": 0.166357159614563, "rewards/rejected": -0.21841374039649963, "step": 420 }, { "epoch": 0.9002878827532059, "grad_norm": 33.39326538286459, "learning_rate": 6.704364923285857e-07, "logits/chosen": 4.783626556396484, "logits/rejected": 5.061443328857422, "logps/chosen": -454.7694396972656, "logps/rejected": -349.71099853515625, "loss": 0.6613, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.03129550814628601, "rewards/margins": 0.2672887146472931, "rewards/rejected": -0.2985842227935791, "step": 430 }, { "epoch": 0.9212248102590945, "grad_norm": 33.88594593881104, "learning_rate": 6.531149075630796e-07, "logits/chosen": 4.762629985809326, "logits/rejected": 4.992688179016113, "logps/chosen": -422.49639892578125, "logps/rejected": -342.6626892089844, "loss": 0.6829, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.08267354220151901, "rewards/margins": 0.18921074271202087, "rewards/rejected": -0.2718842923641205, "step": 440 }, { "epoch": 0.942161737764983, "grad_norm": 34.194378360359096, "learning_rate": 6.355880676182085e-07, "logits/chosen": 4.86130952835083, "logits/rejected": 5.088041305541992, "logps/chosen": -423.82366943359375, "logps/rejected": -386.20172119140625, "loss": 0.6777, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.029518108814954758, "rewards/margins": 0.2812921106815338, "rewards/rejected": -0.3108102083206177, "step": 450 }, { "epoch": 0.9630986652708715, "grad_norm": 35.220161431379815, "learning_rate": 6.178794677547137e-07, "logits/chosen": 4.96859073638916, "logits/rejected": 5.295912265777588, "logps/chosen": -408.28228759765625, "logps/rejected": -337.6819763183594, "loss": 0.6573, "rewards/accuracies": 0.65625, "rewards/chosen": -0.02989841438829899, "rewards/margins": 0.3131854832172394, "rewards/rejected": -0.343083918094635, "step": 460 }, { "epoch": 0.98403559277676, "grad_norm": 36.11741005068747, "learning_rate": 6.000128468880222e-07, "logits/chosen": 4.616504669189453, "logits/rejected": 4.935946464538574, "logps/chosen": -435.3017578125, "logps/rejected": -375.13800048828125, "loss": 0.6647, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.11524273455142975, "rewards/margins": 0.2546766698360443, "rewards/rejected": -0.3699193596839905, "step": 470 }, { "epoch": 1.0049725202826485, "grad_norm": 32.471857091487834, "learning_rate": 5.820121557655108e-07, "logits/chosen": 4.9493536949157715, "logits/rejected": 5.226868152618408, "logps/chosen": -423.6285095214844, "logps/rejected": -362.1949768066406, "loss": 0.6629, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.05261852219700813, "rewards/margins": 0.3280298113822937, "rewards/rejected": -0.27541130781173706, "step": 480 }, { "epoch": 1.025909447788537, "grad_norm": 39.51652905408664, "learning_rate": 5.639015248598023e-07, "logits/chosen": 4.762259006500244, "logits/rejected": 5.021244525909424, "logps/chosen": -424.96697998046875, "logps/rejected": -342.76666259765625, "loss": 0.6644, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.03716667741537094, "rewards/margins": 0.2011403739452362, "rewards/rejected": -0.23830704391002655, "step": 490 }, { "epoch": 1.0468463752944255, "grad_norm": 34.43579926142672, "learning_rate": 5.457052320211339e-07, "logits/chosen": 4.543593406677246, "logits/rejected": 4.786489009857178, "logps/chosen": -434.46746826171875, "logps/rejected": -367.75689697265625, "loss": 0.6836, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.08059108257293701, "rewards/margins": 0.23346829414367676, "rewards/rejected": -0.3140593469142914, "step": 500 }, { "epoch": 1.0468463752944255, "eval_logits/chosen": 4.617003440856934, "eval_logits/rejected": 4.9695563316345215, "eval_logps/chosen": -443.69580078125, "eval_logps/rejected": -378.1418762207031, "eval_loss": 0.6708300113677979, "eval_rewards/accuracies": 0.6349206566810608, "eval_rewards/chosen": -0.09571509808301926, "eval_rewards/margins": 0.23677198588848114, "eval_rewards/rejected": -0.3324871063232422, "eval_runtime": 20.9626, "eval_samples_per_second": 95.408, "eval_steps_per_second": 3.005, "step": 500 }, { "epoch": 1.067783302800314, "grad_norm": 33.3201987416808, "learning_rate": 5.274476699321637e-07, "logits/chosen": 4.583409786224365, "logits/rejected": 4.803020477294922, "logps/chosen": -390.48565673828125, "logps/rejected": -351.6776123046875, "loss": 0.6779, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.0327589213848114, "rewards/margins": 0.32544368505477905, "rewards/rejected": -0.35820263624191284, "step": 510 }, { "epoch": 1.0887202303062025, "grad_norm": 30.742730814185435, "learning_rate": 5.091533134088387e-07, "logits/chosen": 4.493949890136719, "logits/rejected": 4.9839911460876465, "logps/chosen": -383.7958984375, "logps/rejected": -354.36480712890625, "loss": 0.656, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.07341745495796204, "rewards/margins": 0.19709812104701996, "rewards/rejected": -0.2705155909061432, "step": 520 }, { "epoch": 1.109657157812091, "grad_norm": 34.05900047947194, "learning_rate": 4.908466865911614e-07, "logits/chosen": 4.793222904205322, "logits/rejected": 5.078155517578125, "logps/chosen": -401.0002746582031, "logps/rejected": -340.4061279296875, "loss": 0.6618, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.05051114410161972, "rewards/margins": 0.27152642607688904, "rewards/rejected": -0.32203757762908936, "step": 530 }, { "epoch": 1.1305940853179797, "grad_norm": 30.483486401054424, "learning_rate": 4.7255233006783624e-07, "logits/chosen": 4.857717990875244, "logits/rejected": 5.0497636795043945, "logps/chosen": -375.65362548828125, "logps/rejected": -330.26165771484375, "loss": 0.6544, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.02352207899093628, "rewards/margins": 0.35729408264160156, "rewards/rejected": -0.38081610202789307, "step": 540 }, { "epoch": 1.151531012823868, "grad_norm": 35.09603470685652, "learning_rate": 4.5429476797886617e-07, "logits/chosen": 4.932369232177734, "logits/rejected": 5.050224781036377, "logps/chosen": -430.0126953125, "logps/rejected": -331.1691589355469, "loss": 0.6599, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.0060789333656430244, "rewards/margins": 0.25288811326026917, "rewards/rejected": -0.25896701216697693, "step": 550 }, { "epoch": 1.1724679403297567, "grad_norm": 40.64422646125966, "learning_rate": 4.3609847514019763e-07, "logits/chosen": 4.637743949890137, "logits/rejected": 5.000674724578857, "logps/chosen": -420.3258361816406, "logps/rejected": -362.2751159667969, "loss": 0.6718, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.0094971414655447, "rewards/margins": 0.22678783535957336, "rewards/rejected": -0.23628497123718262, "step": 560 }, { "epoch": 1.193404867835645, "grad_norm": 32.638009640148645, "learning_rate": 4.179878442344892e-07, "logits/chosen": 4.855754375457764, "logits/rejected": 4.871184349060059, "logps/chosen": -384.08660888671875, "logps/rejected": -371.4262390136719, "loss": 0.6766, "rewards/accuracies": 0.625, "rewards/chosen": -0.05095939710736275, "rewards/margins": 0.28148993849754333, "rewards/rejected": -0.332449346780777, "step": 570 }, { "epoch": 1.2143417953415336, "grad_norm": 35.519971577107064, "learning_rate": 3.9998715311197783e-07, "logits/chosen": 4.73850679397583, "logits/rejected": 5.173120021820068, "logps/chosen": -414.8775329589844, "logps/rejected": -341.5818786621094, "loss": 0.6508, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.09668377041816711, "rewards/margins": 0.25211262702941895, "rewards/rejected": -0.34879642724990845, "step": 580 }, { "epoch": 1.235278722847422, "grad_norm": 34.20580765627037, "learning_rate": 3.821205322452863e-07, "logits/chosen": 4.916988372802734, "logits/rejected": 5.1998610496521, "logps/chosen": -448.5626525878906, "logps/rejected": -367.84027099609375, "loss": 0.644, "rewards/accuracies": 0.625, "rewards/chosen": -0.07886572182178497, "rewards/margins": 0.3578983247280121, "rewards/rejected": -0.43676406145095825, "step": 590 }, { "epoch": 1.2562156503533106, "grad_norm": 33.854286929995176, "learning_rate": 3.6441193238179146e-07, "logits/chosen": 4.852269649505615, "logits/rejected": 4.903324127197266, "logps/chosen": -446.4149475097656, "logps/rejected": -423.3356018066406, "loss": 0.6349, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.14010193943977356, "rewards/margins": 0.15067996084690094, "rewards/rejected": -0.2907818853855133, "step": 600 }, { "epoch": 1.2562156503533106, "eval_logits/chosen": 4.62031364440918, "eval_logits/rejected": 4.9707465171813965, "eval_logps/chosen": -443.61212158203125, "eval_logps/rejected": -378.1197204589844, "eval_loss": 0.6720485091209412, "eval_rewards/accuracies": 0.5992063283920288, "eval_rewards/chosen": -0.053870752453804016, "eval_rewards/margins": 0.267531156539917, "eval_rewards/rejected": -0.321401983499527, "eval_runtime": 20.8046, "eval_samples_per_second": 96.133, "eval_steps_per_second": 3.028, "step": 600 }, { "epoch": 1.2771525778591992, "grad_norm": 36.085842973391074, "learning_rate": 3.4688509243692034e-07, "logits/chosen": 4.767918586730957, "logits/rejected": 4.757430553436279, "logps/chosen": -407.41668701171875, "logps/rejected": -317.3873596191406, "loss": 0.6402, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.08183420449495316, "rewards/margins": 0.33883604407310486, "rewards/rejected": -0.42067021131515503, "step": 610 }, { "epoch": 1.2980895053650876, "grad_norm": 29.698333183198105, "learning_rate": 3.295635076714144e-07, "logits/chosen": 5.085806846618652, "logits/rejected": 5.415268898010254, "logps/chosen": -395.627685546875, "logps/rejected": -331.7653503417969, "loss": 0.6266, "rewards/accuracies": 0.65625, "rewards/chosen": -0.09060301631689072, "rewards/margins": 0.3094441294670105, "rewards/rejected": -0.4000471234321594, "step": 620 }, { "epoch": 1.3190264328709762, "grad_norm": 35.208773349468885, "learning_rate": 3.12470398195219e-07, "logits/chosen": 4.828533172607422, "logits/rejected": 4.925856113433838, "logps/chosen": -418.5848083496094, "logps/rejected": -376.3353576660156, "loss": 0.6486, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.06391973793506622, "rewards/margins": 0.44674786925315857, "rewards/rejected": -0.3828281760215759, "step": 630 }, { "epoch": 1.3399633603768648, "grad_norm": 29.673309842493335, "learning_rate": 2.956286778402226e-07, "logits/chosen": 4.896113872528076, "logits/rejected": 5.183098793029785, "logps/chosen": -394.4980773925781, "logps/rejected": -374.76422119140625, "loss": 0.6394, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.04267222806811333, "rewards/margins": 0.2913575768470764, "rewards/rejected": -0.33402982354164124, "step": 640 }, { "epoch": 1.3609002878827532, "grad_norm": 35.03684415648848, "learning_rate": 2.7906092344356826e-07, "logits/chosen": 4.610795021057129, "logits/rejected": 4.8373188972473145, "logps/chosen": -379.4288024902344, "logps/rejected": -345.05596923828125, "loss": 0.6646, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.1311950385570526, "rewards/margins": 0.2357216328382492, "rewards/rejected": -0.366916686296463, "step": 650 }, { "epoch": 1.3818372153886418, "grad_norm": 33.06984951084542, "learning_rate": 2.6278934458271996e-07, "logits/chosen": 4.830328941345215, "logits/rejected": 5.017812252044678, "logps/chosen": -377.4278564453125, "logps/rejected": -343.86529541015625, "loss": 0.6613, "rewards/accuracies": 0.5625, "rewards/chosen": -0.10503290593624115, "rewards/margins": 0.11762680858373642, "rewards/rejected": -0.22265975177288055, "step": 660 }, { "epoch": 1.4027741428945302, "grad_norm": 31.761556922593446, "learning_rate": 2.468357538028487e-07, "logits/chosen": 4.728631496429443, "logits/rejected": 4.90619421005249, "logps/chosen": -413.2724609375, "logps/rejected": -346.9877624511719, "loss": 0.6393, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.03663766756653786, "rewards/margins": 0.2855125069618225, "rewards/rejected": -0.32215017080307007, "step": 670 }, { "epoch": 1.4237110704004188, "grad_norm": 34.93162849349177, "learning_rate": 2.312215373764551e-07, "logits/chosen": 4.728277206420898, "logits/rejected": 5.018845558166504, "logps/chosen": -421.8961486816406, "logps/rejected": -403.57354736328125, "loss": 0.6533, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.1260446161031723, "rewards/margins": 0.2200162708759308, "rewards/rejected": -0.3460609018802643, "step": 680 }, { "epoch": 1.4446479979063072, "grad_norm": 33.66523822793528, "learning_rate": 2.1596762663442213e-07, "logits/chosen": 4.863284111022949, "logits/rejected": 4.840500354766846, "logps/chosen": -422.4331970214844, "logps/rejected": -355.96868896484375, "loss": 0.6477, "rewards/accuracies": 0.625, "rewards/chosen": -0.10065089166164398, "rewards/margins": 0.24511468410491943, "rewards/rejected": -0.3457655906677246, "step": 690 }, { "epoch": 1.4655849254121958, "grad_norm": 34.48257400044076, "learning_rate": 2.0109446990692963e-07, "logits/chosen": 4.709015846252441, "logits/rejected": 4.914425849914551, "logps/chosen": -452.9461364746094, "logps/rejected": -442.56658935546875, "loss": 0.6427, "rewards/accuracies": 0.65625, "rewards/chosen": 0.08032918721437454, "rewards/margins": 0.39584842324256897, "rewards/rejected": -0.3155192732810974, "step": 700 }, { "epoch": 1.4655849254121958, "eval_logits/chosen": 4.592012882232666, "eval_logits/rejected": 4.943046569824219, "eval_logps/chosen": -443.6796875, "eval_logps/rejected": -378.1680908203125, "eval_loss": 0.6795812845230103, "eval_rewards/accuracies": 0.60317462682724, "eval_rewards/chosen": -0.08766676485538483, "eval_rewards/margins": 0.25792673230171204, "eval_rewards/rejected": -0.34559354186058044, "eval_runtime": 21.1978, "eval_samples_per_second": 94.349, "eval_steps_per_second": 2.972, "step": 700 }, { "epoch": 1.4865218529180844, "grad_norm": 36.350524448045036, "learning_rate": 1.8662200511184872e-07, "logits/chosen": 4.871232509613037, "logits/rejected": 4.886293411254883, "logps/chosen": -417.8133850097656, "logps/rejected": -384.177490234375, "loss": 0.6669, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.12399878352880478, "rewards/margins": 0.27855515480041504, "rewards/rejected": -0.40255388617515564, "step": 710 }, { "epoch": 1.5074587804239727, "grad_norm": 34.52058371975813, "learning_rate": 1.725696330273575e-07, "logits/chosen": 4.8079633712768555, "logits/rejected": 5.118483543395996, "logps/chosen": -433.02032470703125, "logps/rejected": -383.21539306640625, "loss": 0.6234, "rewards/accuracies": 0.65625, "rewards/chosen": 0.049599818885326385, "rewards/margins": 0.36388009786605835, "rewards/rejected": -0.31428030133247375, "step": 720 }, { "epoch": 1.5283957079298613, "grad_norm": 36.62094520000859, "learning_rate": 1.589561912846089e-07, "logits/chosen": 4.67967414855957, "logits/rejected": 4.974714756011963, "logps/chosen": -402.2828063964844, "logps/rejected": -343.87939453125, "loss": 0.6419, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.02172028087079525, "rewards/margins": 0.3966042995452881, "rewards/rejected": -0.4183245599269867, "step": 730 }, { "epoch": 1.54933263543575, "grad_norm": 34.85140828972076, "learning_rate": 1.4579992911531496e-07, "logits/chosen": 4.999066352844238, "logits/rejected": 5.089913845062256, "logps/chosen": -442.08538818359375, "logps/rejected": -387.76953125, "loss": 0.6641, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.024145543575286865, "rewards/margins": 0.3119828999042511, "rewards/rejected": -0.28783735632896423, "step": 740 }, { "epoch": 1.5702695629416383, "grad_norm": 33.55559408410901, "learning_rate": 1.3311848288809813e-07, "logits/chosen": 4.944571018218994, "logits/rejected": 4.949963569641113, "logps/chosen": -422.9165954589844, "logps/rejected": -378.2356262207031, "loss": 0.6431, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.10791780799627304, "rewards/margins": 0.16808216273784637, "rewards/rejected": -0.2759999632835388, "step": 750 }, { "epoch": 1.5912064904475267, "grad_norm": 33.284252993746314, "learning_rate": 1.209288524664029e-07, "logits/chosen": 4.269396781921387, "logits/rejected": 4.640176296234131, "logps/chosen": -513.432861328125, "logps/rejected": -464.742431640625, "loss": 0.6505, "rewards/accuracies": 0.59375, "rewards/chosen": 0.00313050439581275, "rewards/margins": 0.3427557051181793, "rewards/rejected": -0.33962517976760864, "step": 760 }, { "epoch": 1.6121434179534153, "grad_norm": 33.301123590813035, "learning_rate": 1.0924737841966497e-07, "logits/chosen": 4.588865756988525, "logits/rejected": 4.75103235244751, "logps/chosen": -465.42059326171875, "logps/rejected": -370.064697265625, "loss": 0.653, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.01498096901923418, "rewards/margins": 0.34728002548217773, "rewards/rejected": -0.3622610569000244, "step": 770 }, { "epoch": 1.633080345459304, "grad_norm": 33.707974100314466, "learning_rate": 9.808972011828054e-08, "logits/chosen": 4.657374382019043, "logits/rejected": 5.004950523376465, "logps/chosen": -452.0787048339844, "logps/rejected": -383.26824951171875, "loss": 0.6419, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.06604544818401337, "rewards/margins": 0.4590230882167816, "rewards/rejected": -0.39297762513160706, "step": 780 }, { "epoch": 1.6540172729651923, "grad_norm": 36.400512256730096, "learning_rate": 8.747083474174527e-08, "logits/chosen": 4.775164604187012, "logits/rejected": 5.237417221069336, "logps/chosen": -431.0052185058594, "logps/rejected": -372.1168212890625, "loss": 0.6398, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.017582783475518227, "rewards/margins": 0.35258156061172485, "rewards/rejected": -0.37016433477401733, "step": 790 }, { "epoch": 1.674954200471081, "grad_norm": 29.96252260731642, "learning_rate": 7.740495722810269e-08, "logits/chosen": 4.998331546783447, "logits/rejected": 4.909043312072754, "logps/chosen": -489.783447265625, "logps/rejected": -415.0606384277344, "loss": 0.6128, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.04877934604883194, "rewards/margins": 0.3542923033237457, "rewards/rejected": -0.3055129647254944, "step": 800 }, { "epoch": 1.674954200471081, "eval_logits/chosen": 4.6106181144714355, "eval_logits/rejected": 4.968925476074219, "eval_logps/chosen": -443.625244140625, "eval_logps/rejected": -378.2127990722656, "eval_loss": 0.6703739166259766, "eval_rewards/accuracies": 0.6071428656578064, "eval_rewards/chosen": -0.06042463704943657, "eval_rewards/margins": 0.30752548575401306, "eval_rewards/rejected": -0.3679501414299011, "eval_runtime": 21.1621, "eval_samples_per_second": 94.509, "eval_steps_per_second": 2.977, "step": 800 }, { "epoch": 1.6958911279769695, "grad_norm": 34.07376933070953, "learning_rate": 6.790558119157597e-08, "logits/chosen": 4.842529773712158, "logits/rejected": 4.945174217224121, "logps/chosen": -446.68682861328125, "logps/rejected": -379.9209899902344, "loss": 0.6409, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.010895573534071445, "rewards/margins": 0.47363200783729553, "rewards/rejected": -0.46273642778396606, "step": 810 }, { "epoch": 1.7168280554828579, "grad_norm": 31.41564508164701, "learning_rate": 5.898544083397e-08, "logits/chosen": 4.57013463973999, "logits/rejected": 4.8762030601501465, "logps/chosen": -459.298583984375, "logps/rejected": -376.189208984375, "loss": 0.6381, "rewards/accuracies": 0.625, "rewards/chosen": -0.04928427189588547, "rewards/margins": 0.33450883626937866, "rewards/rejected": -0.38379308581352234, "step": 820 }, { "epoch": 1.7377649829887463, "grad_norm": 38.55984096337612, "learning_rate": 5.065649387408705e-08, "logits/chosen": 4.863150596618652, "logits/rejected": 4.996617317199707, "logps/chosen": -405.2935485839844, "logps/rejected": -383.06756591796875, "loss": 0.6587, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.14060600101947784, "rewards/margins": 0.1646648645401001, "rewards/rejected": -0.30527088046073914, "step": 830 }, { "epoch": 1.7587019104946349, "grad_norm": 32.69891650352482, "learning_rate": 4.292990551804171e-08, "logits/chosen": 4.561503887176514, "logits/rejected": 4.661375522613525, "logps/chosen": -374.9468688964844, "logps/rejected": -359.5188293457031, "loss": 0.6426, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.05613694339990616, "rewards/margins": 0.3448534607887268, "rewards/rejected": -0.40099042654037476, "step": 840 }, { "epoch": 1.7796388380005235, "grad_norm": 32.82316724445512, "learning_rate": 3.581603349196371e-08, "logits/chosen": 4.668177604675293, "logits/rejected": 5.044764518737793, "logps/chosen": -391.29534912109375, "logps/rejected": -374.1195068359375, "loss": 0.6501, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.14754648506641388, "rewards/margins": 0.16757197678089142, "rewards/rejected": -0.3151184618473053, "step": 850 }, { "epoch": 1.8005757655064119, "grad_norm": 32.36442235611696, "learning_rate": 2.9324414157151367e-08, "logits/chosen": 4.706895351409912, "logits/rejected": 5.021437644958496, "logps/chosen": -417.41021728515625, "logps/rejected": -335.3275451660156, "loss": 0.6534, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.06291428953409195, "rewards/margins": 0.3059840798377991, "rewards/rejected": -0.36889833211898804, "step": 860 }, { "epoch": 1.8215126930123005, "grad_norm": 29.740377909388123, "learning_rate": 2.3463749726290284e-08, "logits/chosen": 4.696743965148926, "logits/rejected": 4.8797287940979, "logps/chosen": -477.77783203125, "logps/rejected": -390.98175048828125, "loss": 0.6614, "rewards/accuracies": 0.53125, "rewards/chosen": -0.07511474192142487, "rewards/margins": 0.17798468470573425, "rewards/rejected": -0.2530994415283203, "step": 870 }, { "epoch": 1.842449620518189, "grad_norm": 30.952090476967147, "learning_rate": 1.824189659787284e-08, "logits/chosen": 4.781184196472168, "logits/rejected": 5.032862663269043, "logps/chosen": -387.22906494140625, "logps/rejected": -360.9486389160156, "loss": 0.6618, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.11400938034057617, "rewards/margins": 0.21471650898456573, "rewards/rejected": -0.3287258744239807, "step": 880 }, { "epoch": 1.8633865480240774, "grad_norm": 31.64887361264221, "learning_rate": 1.3665854824458035e-08, "logits/chosen": 4.322469234466553, "logits/rejected": 4.672883033752441, "logps/chosen": -445.35699462890625, "logps/rejected": -390.5237731933594, "loss": 0.624, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.032880861312150955, "rewards/margins": 0.3543739914894104, "rewards/rejected": -0.38725486397743225, "step": 890 }, { "epoch": 1.8843234755299658, "grad_norm": 33.85502008551422, "learning_rate": 9.741758728888217e-09, "logits/chosen": 4.4365644454956055, "logits/rejected": 4.837357997894287, "logps/chosen": -472.887451171875, "logps/rejected": -367.82611083984375, "loss": 0.6474, "rewards/accuracies": 0.65625, "rewards/chosen": -0.05196143314242363, "rewards/margins": 0.3476230800151825, "rewards/rejected": -0.39958447217941284, "step": 900 }, { "epoch": 1.8843234755299658, "eval_logits/chosen": 4.5737175941467285, "eval_logits/rejected": 4.921082496643066, "eval_logps/chosen": -443.622314453125, "eval_logps/rejected": -378.2174377441406, "eval_loss": 0.6692253351211548, "eval_rewards/accuracies": 0.6269841194152832, "eval_rewards/chosen": -0.05897674709558487, "eval_rewards/margins": 0.31128397583961487, "eval_rewards/rejected": -0.37026071548461914, "eval_runtime": 21.3225, "eval_samples_per_second": 93.797, "eval_steps_per_second": 2.955, "step": 900 }, { "epoch": 1.9052604030358546, "grad_norm": 33.28147635399462, "learning_rate": 6.474868681043577e-09, "logits/chosen": 4.713411808013916, "logits/rejected": 4.913935661315918, "logps/chosen": -384.9287109375, "logps/rejected": -316.16265869140625, "loss": 0.6491, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.02645047940313816, "rewards/margins": 0.40163594484329224, "rewards/rejected": -0.42808642983436584, "step": 910 }, { "epoch": 1.926197330541743, "grad_norm": 35.875215811609834, "learning_rate": 3.869564046156459e-09, "logits/chosen": 4.6749348640441895, "logits/rejected": 4.898279190063477, "logps/chosen": -441.083740234375, "logps/rejected": -361.4406433105469, "loss": 0.6389, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.0031513571739196777, "rewards/margins": 0.41674357652664185, "rewards/rejected": -0.4198949337005615, "step": 920 }, { "epoch": 1.9471342580476314, "grad_norm": 32.946064523302205, "learning_rate": 1.929337314139412e-09, "logits/chosen": 4.862700462341309, "logits/rejected": 4.817538261413574, "logps/chosen": -429.21051025390625, "logps/rejected": -370.45745849609375, "loss": 0.6312, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0764947384595871, "rewards/margins": 0.20215356349945068, "rewards/rejected": -0.2786482870578766, "step": 930 }, { "epoch": 1.96807118555352, "grad_norm": 37.53766060677335, "learning_rate": 6.567894177967325e-10, "logits/chosen": 5.056074142456055, "logits/rejected": 5.200203895568848, "logps/chosen": -382.3914489746094, "logps/rejected": -319.7542419433594, "loss": 0.6475, "rewards/accuracies": 0.625, "rewards/chosen": -0.012996235862374306, "rewards/margins": 0.2734270989894867, "rewards/rejected": -0.28642335534095764, "step": 940 }, { "epoch": 1.9890081130594086, "grad_norm": 32.67422145978211, "learning_rate": 5.3626246194704575e-11, "logits/chosen": 4.634739875793457, "logits/rejected": 4.890820503234863, "logps/chosen": -425.7994689941406, "logps/rejected": -344.5509033203125, "loss": 0.6372, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.18660762906074524, "rewards/margins": 0.23620739579200745, "rewards/rejected": -0.4228149950504303, "step": 950 }, { "epoch": 1.9973828840617638, "step": 954, "total_flos": 0.0, "train_loss": 0.675485389037702, "train_runtime": 5897.7907, "train_samples_per_second": 20.731, "train_steps_per_second": 0.162 } ], "logging_steps": 10, "max_steps": 954, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }