{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9998751404669747, "eval_steps": 1000, "global_step": 4004, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 0.6953125, "learning_rate": 1.2468827930174565e-08, "logits/chosen": -2.4102063179016113, "logits/rejected": -2.672837734222412, "logps/chosen": -21.34674835205078, "logps/rejected": -42.586097717285156, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0, "grad_norm": 0.7421875, "learning_rate": 1.2468827930174566e-07, "logits/chosen": -2.239577293395996, "logits/rejected": -2.476416826248169, "logps/chosen": -21.881580352783203, "logps/rejected": -54.84682083129883, "loss": 0.693, "rewards/accuracies": 0.5555555820465088, "rewards/chosen": 0.00018471028306521475, "rewards/margins": 0.00028743690927512944, "rewards/rejected": -0.00010272659710608423, "step": 10 }, { "epoch": 0.0, "grad_norm": 0.83984375, "learning_rate": 2.493765586034913e-07, "logits/chosen": -2.163784980773926, "logits/rejected": -2.405578136444092, "logps/chosen": -21.341472625732422, "logps/rejected": -55.192710876464844, "loss": 0.6927, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.000193244923138991, "rewards/margins": 0.0009195079328492284, "rewards/rejected": -0.0007262630388140678, "step": 20 }, { "epoch": 0.01, "grad_norm": 0.6796875, "learning_rate": 3.7406483790523695e-07, "logits/chosen": -2.0837199687957764, "logits/rejected": -2.361438274383545, "logps/chosen": -21.834430694580078, "logps/rejected": -51.4864501953125, "loss": 0.6929, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 6.211231811903417e-05, "rewards/margins": 0.000506018113810569, "rewards/rejected": -0.00044390588300302625, "step": 30 }, { "epoch": 0.01, "grad_norm": 0.80078125, "learning_rate": 4.987531172069826e-07, "logits/chosen": -2.088737726211548, "logits/rejected": -2.3435609340667725, "logps/chosen": -22.16689682006836, "logps/rejected": -55.5480842590332, "loss": 0.6924, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.0003017825947608799, "rewards/margins": 0.001477475045248866, "rewards/rejected": -0.0011756925377994776, "step": 40 }, { "epoch": 0.01, "grad_norm": 0.70703125, "learning_rate": 6.234413965087283e-07, "logits/chosen": -2.1819872856140137, "logits/rejected": -2.480788469314575, "logps/chosen": -22.51431655883789, "logps/rejected": -58.81789016723633, "loss": 0.6917, "rewards/accuracies": 0.875, "rewards/chosen": 0.0005659356247633696, "rewards/margins": 0.00293327565304935, "rewards/rejected": -0.0023673397954553366, "step": 50 }, { "epoch": 0.01, "grad_norm": 1.1171875, "learning_rate": 7.481296758104739e-07, "logits/chosen": -2.13201642036438, "logits/rejected": -2.3695003986358643, "logps/chosen": -22.39255142211914, "logps/rejected": -57.26397705078125, "loss": 0.6906, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.0013241939013823867, "rewards/margins": 0.0052015529945492744, "rewards/rejected": -0.0038773592095822096, "step": 60 }, { "epoch": 0.02, "grad_norm": 0.640625, "learning_rate": 8.728179551122195e-07, "logits/chosen": -2.2473509311676025, "logits/rejected": -2.4891819953918457, "logps/chosen": -21.353261947631836, "logps/rejected": -50.47459411621094, "loss": 0.6888, "rewards/accuracies": 1.0, "rewards/chosen": 0.0020654837135225534, "rewards/margins": 0.00875779427587986, "rewards/rejected": -0.0066923112608492374, "step": 70 }, { "epoch": 0.02, "grad_norm": 0.6796875, "learning_rate": 9.975062344139653e-07, "logits/chosen": -2.0938005447387695, "logits/rejected": -2.3415422439575195, "logps/chosen": -21.880977630615234, "logps/rejected": -55.579505920410156, "loss": 0.6847, "rewards/accuracies": 1.0, "rewards/chosen": 0.004472785629332066, "rewards/margins": 0.016874177381396294, "rewards/rejected": -0.012401392683386803, "step": 80 }, { "epoch": 0.02, "grad_norm": 0.69140625, "learning_rate": 1.1221945137157108e-06, "logits/chosen": -2.2169880867004395, "logits/rejected": -2.43099308013916, "logps/chosen": -21.09603500366211, "logps/rejected": -52.89866256713867, "loss": 0.681, "rewards/accuracies": 1.0, "rewards/chosen": 0.007039895746856928, "rewards/margins": 0.02454659901559353, "rewards/rejected": -0.017506705597043037, "step": 90 }, { "epoch": 0.02, "grad_norm": 0.8203125, "learning_rate": 1.2468827930174565e-06, "logits/chosen": -2.079484701156616, "logits/rejected": -2.3457648754119873, "logps/chosen": -20.769153594970703, "logps/rejected": -60.0728645324707, "loss": 0.6753, "rewards/accuracies": 1.0, "rewards/chosen": 0.010782149620354176, "rewards/margins": 0.03612237051129341, "rewards/rejected": -0.02534021995961666, "step": 100 }, { "epoch": 0.03, "grad_norm": 0.7890625, "learning_rate": 1.3715710723192023e-06, "logits/chosen": -2.1157209873199463, "logits/rejected": -2.373112440109253, "logps/chosen": -20.276954650878906, "logps/rejected": -56.75362014770508, "loss": 0.6659, "rewards/accuracies": 1.0, "rewards/chosen": 0.016899898648262024, "rewards/margins": 0.055193256586790085, "rewards/rejected": -0.03829335793852806, "step": 110 }, { "epoch": 0.03, "grad_norm": 1.4296875, "learning_rate": 1.4962593516209478e-06, "logits/chosen": -2.0827651023864746, "logits/rejected": -2.3430802822113037, "logps/chosen": -20.093006134033203, "logps/rejected": -55.97692108154297, "loss": 0.6578, "rewards/accuracies": 1.0, "rewards/chosen": 0.02318699099123478, "rewards/margins": 0.07196114957332611, "rewards/rejected": -0.04877415671944618, "step": 120 }, { "epoch": 0.03, "grad_norm": 0.88671875, "learning_rate": 1.6209476309226935e-06, "logits/chosen": -2.2666783332824707, "logits/rejected": -2.533306837081909, "logps/chosen": -18.66347885131836, "logps/rejected": -60.43751907348633, "loss": 0.6421, "rewards/accuracies": 1.0, "rewards/chosen": 0.03544957563281059, "rewards/margins": 0.10495258867740631, "rewards/rejected": -0.06950302422046661, "step": 130 }, { "epoch": 0.03, "grad_norm": 0.734375, "learning_rate": 1.745635910224439e-06, "logits/chosen": -2.132418155670166, "logits/rejected": -2.3867433071136475, "logps/chosen": -16.58405876159668, "logps/rejected": -65.14090728759766, "loss": 0.6258, "rewards/accuracies": 1.0, "rewards/chosen": 0.052351079881191254, "rewards/margins": 0.1396940052509308, "rewards/rejected": -0.08734293282032013, "step": 140 }, { "epoch": 0.04, "grad_norm": 0.859375, "learning_rate": 1.8703241895261848e-06, "logits/chosen": -2.2100329399108887, "logits/rejected": -2.446981906890869, "logps/chosen": -14.807760238647461, "logps/rejected": -61.867767333984375, "loss": 0.6085, "rewards/accuracies": 1.0, "rewards/chosen": 0.07087540626525879, "rewards/margins": 0.17729689180850983, "rewards/rejected": -0.10642149299383163, "step": 150 }, { "epoch": 0.04, "grad_norm": 0.890625, "learning_rate": 1.9950124688279305e-06, "logits/chosen": -2.2164454460144043, "logits/rejected": -2.4408745765686035, "logps/chosen": -12.931081771850586, "logps/rejected": -65.07794189453125, "loss": 0.5835, "rewards/accuracies": 1.0, "rewards/chosen": 0.09038490056991577, "rewards/margins": 0.2332114726305008, "rewards/rejected": -0.14282655715942383, "step": 160 }, { "epoch": 0.04, "grad_norm": 0.7578125, "learning_rate": 2.119700748129676e-06, "logits/chosen": -2.3733327388763428, "logits/rejected": -2.637248992919922, "logps/chosen": -10.747810363769531, "logps/rejected": -66.96683502197266, "loss": 0.5576, "rewards/accuracies": 1.0, "rewards/chosen": 0.1092229038476944, "rewards/margins": 0.2931229770183563, "rewards/rejected": -0.18390007317066193, "step": 170 }, { "epoch": 0.04, "grad_norm": 0.94921875, "learning_rate": 2.2443890274314216e-06, "logits/chosen": -2.059483051300049, "logits/rejected": -2.3210699558258057, "logps/chosen": -9.155550003051758, "logps/rejected": -81.08940124511719, "loss": 0.525, "rewards/accuracies": 1.0, "rewards/chosen": 0.1273214966058731, "rewards/margins": 0.371276319026947, "rewards/rejected": -0.24395480751991272, "step": 180 }, { "epoch": 0.05, "grad_norm": 0.88671875, "learning_rate": 2.3690773067331675e-06, "logits/chosen": -2.1012320518493652, "logits/rejected": -2.380855083465576, "logps/chosen": -8.220524787902832, "logps/rejected": -86.42562866210938, "loss": 0.4879, "rewards/accuracies": 1.0, "rewards/chosen": 0.13605494797229767, "rewards/margins": 0.4646981358528137, "rewards/rejected": -0.32864317297935486, "step": 190 }, { "epoch": 0.05, "grad_norm": 1.0546875, "learning_rate": 2.493765586034913e-06, "logits/chosen": -2.178351879119873, "logits/rejected": -2.448537826538086, "logps/chosen": -7.208306312561035, "logps/rejected": -91.26810455322266, "loss": 0.4484, "rewards/accuracies": 1.0, "rewards/chosen": 0.15183252096176147, "rewards/margins": 0.5721064805984497, "rewards/rejected": -0.42027395963668823, "step": 200 }, { "epoch": 0.05, "grad_norm": 0.90625, "learning_rate": 2.6184538653366586e-06, "logits/chosen": -2.1630733013153076, "logits/rejected": -2.411649703979492, "logps/chosen": -4.617680072784424, "logps/rejected": -102.69218444824219, "loss": 0.4025, "rewards/accuracies": 1.0, "rewards/chosen": 0.1683187633752823, "rewards/margins": 0.705622136592865, "rewards/rejected": -0.5373033881187439, "step": 210 }, { "epoch": 0.05, "grad_norm": 0.8828125, "learning_rate": 2.7431421446384045e-06, "logits/chosen": -2.1586241722106934, "logits/rejected": -2.3992838859558105, "logps/chosen": -3.4560561180114746, "logps/rejected": -116.10273742675781, "loss": 0.3606, "rewards/accuracies": 1.0, "rewards/chosen": 0.18259310722351074, "rewards/margins": 0.8416939973831177, "rewards/rejected": -0.6591008901596069, "step": 220 }, { "epoch": 0.06, "grad_norm": 0.953125, "learning_rate": 2.86783042394015e-06, "logits/chosen": -2.1585848331451416, "logits/rejected": -2.381598949432373, "logps/chosen": -2.7871248722076416, "logps/rejected": -130.73558044433594, "loss": 0.3171, "rewards/accuracies": 1.0, "rewards/chosen": 0.18752917647361755, "rewards/margins": 0.9992051124572754, "rewards/rejected": -0.8116759061813354, "step": 230 }, { "epoch": 0.06, "grad_norm": 1.0390625, "learning_rate": 2.9925187032418956e-06, "logits/chosen": -2.2422091960906982, "logits/rejected": -2.491379499435425, "logps/chosen": -2.960984230041504, "logps/rejected": -167.91981506347656, "loss": 0.2364, "rewards/accuracies": 1.0, "rewards/chosen": 0.1910472810268402, "rewards/margins": 1.3611987829208374, "rewards/rejected": -1.1701514720916748, "step": 240 }, { "epoch": 0.06, "grad_norm": 0.64453125, "learning_rate": 3.117206982543641e-06, "logits/chosen": -2.1426875591278076, "logits/rejected": -2.385005474090576, "logps/chosen": -2.7859835624694824, "logps/rejected": -208.47451782226562, "loss": 0.1849, "rewards/accuracies": 1.0, "rewards/chosen": 0.1911415457725525, "rewards/margins": 1.752079963684082, "rewards/rejected": -1.5609384775161743, "step": 250 }, { "epoch": 0.06, "grad_norm": 0.3359375, "learning_rate": 3.241895261845387e-06, "logits/chosen": -2.171957492828369, "logits/rejected": -2.405855894088745, "logps/chosen": -2.6331560611724854, "logps/rejected": -266.33441162109375, "loss": 0.1401, "rewards/accuracies": 1.0, "rewards/chosen": 0.18891991674900055, "rewards/margins": 2.3281731605529785, "rewards/rejected": -2.1392531394958496, "step": 260 }, { "epoch": 0.07, "grad_norm": 0.46484375, "learning_rate": 3.3665835411471326e-06, "logits/chosen": -2.136756181716919, "logits/rejected": -2.3706109523773193, "logps/chosen": -2.3929905891418457, "logps/rejected": -311.50164794921875, "loss": 0.1147, "rewards/accuracies": 1.0, "rewards/chosen": 0.19246290624141693, "rewards/margins": 2.773569107055664, "rewards/rejected": -2.581106185913086, "step": 270 }, { "epoch": 0.07, "grad_norm": 0.1064453125, "learning_rate": 3.491271820448878e-06, "logits/chosen": -2.04775071144104, "logits/rejected": -2.256491184234619, "logps/chosen": -3.071178436279297, "logps/rejected": -338.50042724609375, "loss": 0.1247, "rewards/accuracies": 1.0, "rewards/chosen": 0.18945232033729553, "rewards/margins": 3.0333011150360107, "rewards/rejected": -2.8438491821289062, "step": 280 }, { "epoch": 0.07, "grad_norm": 0.255859375, "learning_rate": 3.615960099750624e-06, "logits/chosen": -2.0987088680267334, "logits/rejected": -2.3304831981658936, "logps/chosen": -2.6518688201904297, "logps/rejected": -317.264892578125, "loss": 0.1174, "rewards/accuracies": 1.0, "rewards/chosen": 0.1924947053194046, "rewards/margins": 2.8529839515686035, "rewards/rejected": -2.660489082336426, "step": 290 }, { "epoch": 0.07, "grad_norm": 0.416015625, "learning_rate": 3.7406483790523696e-06, "logits/chosen": -2.090928792953491, "logits/rejected": -2.331522226333618, "logps/chosen": -2.140141725540161, "logps/rejected": -386.000244140625, "loss": 0.0843, "rewards/accuracies": 1.0, "rewards/chosen": 0.19298282265663147, "rewards/margins": 3.519921064376831, "rewards/rejected": -3.3269379138946533, "step": 300 }, { "epoch": 0.08, "grad_norm": 0.341796875, "learning_rate": 3.8653366583541155e-06, "logits/chosen": -2.0327889919281006, "logits/rejected": -2.2435851097106934, "logps/chosen": -2.820808172225952, "logps/rejected": -395.51666259765625, "loss": 0.1065, "rewards/accuracies": 1.0, "rewards/chosen": 0.19103531539440155, "rewards/margins": 3.636303424835205, "rewards/rejected": -3.445268154144287, "step": 310 }, { "epoch": 0.08, "grad_norm": 0.361328125, "learning_rate": 3.990024937655861e-06, "logits/chosen": -2.0326077938079834, "logits/rejected": -2.2442269325256348, "logps/chosen": -2.771969795227051, "logps/rejected": -503.487548828125, "loss": 0.0783, "rewards/accuracies": 1.0, "rewards/chosen": 0.18934586644172668, "rewards/margins": 4.642784595489502, "rewards/rejected": -4.453438758850098, "step": 320 }, { "epoch": 0.08, "grad_norm": 0.330078125, "learning_rate": 4.114713216957607e-06, "logits/chosen": -2.1069068908691406, "logits/rejected": -2.2923953533172607, "logps/chosen": -4.020249366760254, "logps/rejected": -392.1181335449219, "loss": 0.087, "rewards/accuracies": 1.0, "rewards/chosen": 0.18417596817016602, "rewards/margins": 3.6148147583007812, "rewards/rejected": -3.4306392669677734, "step": 330 }, { "epoch": 0.08, "grad_norm": 0.361328125, "learning_rate": 4.239401496259352e-06, "logits/chosen": -2.040024518966675, "logits/rejected": -2.235044240951538, "logps/chosen": -4.0639448165893555, "logps/rejected": -495.93304443359375, "loss": 0.07, "rewards/accuracies": 1.0, "rewards/chosen": 0.17745602130889893, "rewards/margins": 4.601096153259277, "rewards/rejected": -4.423640251159668, "step": 340 }, { "epoch": 0.09, "grad_norm": 0.1513671875, "learning_rate": 4.364089775561098e-06, "logits/chosen": -2.1371054649353027, "logits/rejected": -2.314563274383545, "logps/chosen": -5.727438449859619, "logps/rejected": -448.86077880859375, "loss": 0.0817, "rewards/accuracies": 1.0, "rewards/chosen": 0.1613588184118271, "rewards/margins": 4.152359962463379, "rewards/rejected": -3.991001605987549, "step": 350 }, { "epoch": 0.09, "grad_norm": 0.44921875, "learning_rate": 4.488778054862843e-06, "logits/chosen": -2.1167104244232178, "logits/rejected": -2.327972412109375, "logps/chosen": -7.92899227142334, "logps/rejected": -551.7445068359375, "loss": 0.0461, "rewards/accuracies": 1.0, "rewards/chosen": 0.14445874094963074, "rewards/margins": 5.136443138122559, "rewards/rejected": -4.9919843673706055, "step": 360 }, { "epoch": 0.09, "grad_norm": 0.5625, "learning_rate": 4.6134663341645895e-06, "logits/chosen": -2.1294167041778564, "logits/rejected": -2.320831537246704, "logps/chosen": -14.308802604675293, "logps/rejected": -556.9048461914062, "loss": 0.0389, "rewards/accuracies": 1.0, "rewards/chosen": 0.07796537131071091, "rewards/margins": 5.144980430603027, "rewards/rejected": -5.067015171051025, "step": 370 }, { "epoch": 0.09, "grad_norm": 0.74609375, "learning_rate": 4.738154613466335e-06, "logits/chosen": -2.059129238128662, "logits/rejected": -2.283139705657959, "logps/chosen": -30.504648208618164, "logps/rejected": -904.5690307617188, "loss": 0.0201, "rewards/accuracies": 1.0, "rewards/chosen": -0.08387070894241333, "rewards/margins": 8.345690727233887, "rewards/rejected": -8.429560661315918, "step": 380 }, { "epoch": 0.1, "grad_norm": 0.0244140625, "learning_rate": 4.862842892768081e-06, "logits/chosen": -2.1258416175842285, "logits/rejected": -2.3157804012298584, "logps/chosen": -44.502899169921875, "logps/rejected": -788.58056640625, "loss": 0.0145, "rewards/accuracies": 1.0, "rewards/chosen": -0.2255050241947174, "rewards/margins": 7.145709037780762, "rewards/rejected": -7.371213436126709, "step": 390 }, { "epoch": 0.1, "grad_norm": 0.2001953125, "learning_rate": 4.987531172069826e-06, "logits/chosen": -2.0560178756713867, "logits/rejected": -2.2473480701446533, "logps/chosen": -65.08824157714844, "logps/rejected": -957.4326171875, "loss": 0.022, "rewards/accuracies": 1.0, "rewards/chosen": -0.4304937422275543, "rewards/margins": 8.619054794311523, "rewards/rejected": -9.04954719543457, "step": 400 }, { "epoch": 0.1, "grad_norm": 0.1484375, "learning_rate": 4.999923022460671e-06, "logits/chosen": -2.0123705863952637, "logits/rejected": -2.2279648780822754, "logps/chosen": -74.45851135253906, "logps/rejected": -1195.67138671875, "loss": 0.0106, "rewards/accuracies": 1.0, "rewards/chosen": -0.5188314318656921, "rewards/margins": 10.849455833435059, "rewards/rejected": -11.368288040161133, "step": 410 }, { "epoch": 0.1, "grad_norm": 0.51953125, "learning_rate": 4.999656933348981e-06, "logits/chosen": -2.234529972076416, "logits/rejected": -2.406409740447998, "logps/chosen": -88.35977935791016, "logps/rejected": -900.6641845703125, "loss": 0.0226, "rewards/accuracies": 1.0, "rewards/chosen": -0.6683001518249512, "rewards/margins": 7.860695838928223, "rewards/rejected": -8.528995513916016, "step": 420 }, { "epoch": 0.11, "grad_norm": 0.005340576171875, "learning_rate": 4.99920080255011e-06, "logits/chosen": -2.054624080657959, "logits/rejected": -2.283973217010498, "logps/chosen": -83.51972198486328, "logps/rejected": -1245.379150390625, "loss": 0.0093, "rewards/accuracies": 1.0, "rewards/chosen": -0.6176443099975586, "rewards/margins": 11.277586936950684, "rewards/rejected": -11.895231246948242, "step": 430 }, { "epoch": 0.11, "grad_norm": 0.5, "learning_rate": 4.998554664742362e-06, "logits/chosen": -2.136657476425171, "logits/rejected": -2.320204973220825, "logps/chosen": -88.63077545166016, "logps/rejected": -1043.793701171875, "loss": 0.0087, "rewards/accuracies": 1.0, "rewards/chosen": -0.6717870831489563, "rewards/margins": 9.280545234680176, "rewards/rejected": -9.95233154296875, "step": 440 }, { "epoch": 0.11, "grad_norm": 0.287109375, "learning_rate": 4.997718569049726e-06, "logits/chosen": -2.074990749359131, "logits/rejected": -2.277477979660034, "logps/chosen": -94.67475891113281, "logps/rejected": -1131.5499267578125, "loss": 0.0145, "rewards/accuracies": 1.0, "rewards/chosen": -0.7206478714942932, "rewards/margins": 10.060680389404297, "rewards/rejected": -10.781328201293945, "step": 450 }, { "epoch": 0.11, "grad_norm": 0.0244140625, "learning_rate": 4.9966925790381404e-06, "logits/chosen": -2.1405222415924072, "logits/rejected": -2.3201403617858887, "logps/chosen": -73.81873321533203, "logps/rejected": -1013.6409912109375, "loss": 0.0119, "rewards/accuracies": 1.0, "rewards/chosen": -0.528481125831604, "rewards/margins": 9.096994400024414, "rewards/rejected": -9.625473976135254, "step": 460 }, { "epoch": 0.12, "grad_norm": 0.19140625, "learning_rate": 4.995476772710657e-06, "logits/chosen": -2.0950608253479004, "logits/rejected": -2.316931962966919, "logps/chosen": -103.3821792602539, "logps/rejected": -1335.799560546875, "loss": 0.021, "rewards/accuracies": 1.0, "rewards/chosen": -0.8135234117507935, "rewards/margins": 11.993521690368652, "rewards/rejected": -12.807044982910156, "step": 470 }, { "epoch": 0.12, "grad_norm": 0.2451171875, "learning_rate": 4.994071242501516e-06, "logits/chosen": -2.185049057006836, "logits/rejected": -2.3854544162750244, "logps/chosen": -70.46857452392578, "logps/rejected": -1074.28271484375, "loss": 0.0079, "rewards/accuracies": 1.0, "rewards/chosen": -0.48124104738235474, "rewards/margins": 9.766096115112305, "rewards/rejected": -10.247336387634277, "step": 480 }, { "epoch": 0.12, "grad_norm": 2.8967857360839844e-05, "learning_rate": 4.992476095269112e-06, "logits/chosen": -2.1872477531433105, "logits/rejected": -2.3788347244262695, "logps/chosen": -60.58274459838867, "logps/rejected": -1168.738037109375, "loss": 0.0126, "rewards/accuracies": 1.0, "rewards/chosen": -0.3954273462295532, "rewards/margins": 10.741006851196289, "rewards/rejected": -11.136434555053711, "step": 490 }, { "epoch": 0.12, "grad_norm": 0.267578125, "learning_rate": 4.990691452287877e-06, "logits/chosen": -2.034578800201416, "logits/rejected": -2.228356122970581, "logps/chosen": -70.90155029296875, "logps/rejected": -1098.248779296875, "loss": 0.0126, "rewards/accuracies": 1.0, "rewards/chosen": -0.49016299843788147, "rewards/margins": 9.965959548950195, "rewards/rejected": -10.456122398376465, "step": 500 }, { "epoch": 0.13, "grad_norm": 0.2412109375, "learning_rate": 4.988717449239056e-06, "logits/chosen": -2.086670398712158, "logits/rejected": -2.27720046043396, "logps/chosen": -79.89573669433594, "logps/rejected": -1124.399658203125, "loss": 0.0201, "rewards/accuracies": 1.0, "rewards/chosen": -0.5849625468254089, "rewards/margins": 10.160270690917969, "rewards/rejected": -10.745233535766602, "step": 510 }, { "epoch": 0.13, "grad_norm": 0.0245361328125, "learning_rate": 4.98655423620039e-06, "logits/chosen": -2.119935989379883, "logits/rejected": -2.3267104625701904, "logps/chosen": -77.09886169433594, "logps/rejected": -1248.1905517578125, "loss": 0.0062, "rewards/accuracies": 1.0, "rewards/chosen": -0.5508411526679993, "rewards/margins": 11.388254165649414, "rewards/rejected": -11.939095497131348, "step": 520 }, { "epoch": 0.13, "grad_norm": 0.01080322265625, "learning_rate": 4.984201977634711e-06, "logits/chosen": -2.213916301727295, "logits/rejected": -2.4463677406311035, "logps/chosen": -90.18511199951172, "logps/rejected": -1377.499755859375, "loss": 0.007, "rewards/accuracies": 1.0, "rewards/chosen": -0.6767342686653137, "rewards/margins": 12.541799545288086, "rewards/rejected": -13.218534469604492, "step": 530 }, { "epoch": 0.13, "grad_norm": 0.0634765625, "learning_rate": 4.9816608523774345e-06, "logits/chosen": -2.105821132659912, "logits/rejected": -2.3127095699310303, "logps/chosen": -79.32666015625, "logps/rejected": -1143.241943359375, "loss": 0.0101, "rewards/accuracies": 1.0, "rewards/chosen": -0.5732973217964172, "rewards/margins": 10.353887557983398, "rewards/rejected": -10.927184104919434, "step": 540 }, { "epoch": 0.14, "grad_norm": 0.0145263671875, "learning_rate": 4.978931053622964e-06, "logits/chosen": -2.1495628356933594, "logits/rejected": -2.370626449584961, "logps/chosen": -78.36927795410156, "logps/rejected": -1290.0146484375, "loss": 0.0164, "rewards/accuracies": 1.0, "rewards/chosen": -0.5716887712478638, "rewards/margins": 11.808379173278809, "rewards/rejected": -12.380067825317383, "step": 550 }, { "epoch": 0.14, "grad_norm": 0.0003566741943359375, "learning_rate": 4.9760127889100044e-06, "logits/chosen": -2.1675076484680176, "logits/rejected": -2.3700671195983887, "logps/chosen": -62.899436950683594, "logps/rejected": -1186.429443359375, "loss": 0.0126, "rewards/accuracies": 1.0, "rewards/chosen": -0.4144817888736725, "rewards/margins": 10.914512634277344, "rewards/rejected": -11.328994750976562, "step": 560 }, { "epoch": 0.14, "grad_norm": 0.23828125, "learning_rate": 4.972906280105781e-06, "logits/chosen": -2.0299549102783203, "logits/rejected": -2.252498149871826, "logps/chosen": -80.07968139648438, "logps/rejected": -1246.3656005859375, "loss": 0.0092, "rewards/accuracies": 1.0, "rewards/chosen": -0.5788360238075256, "rewards/margins": 11.359301567077637, "rewards/rejected": -11.938138008117676, "step": 570 }, { "epoch": 0.14, "grad_norm": 0.0030670166015625, "learning_rate": 4.969611763389175e-06, "logits/chosen": -2.19167423248291, "logits/rejected": -2.402195453643799, "logps/chosen": -83.22602844238281, "logps/rejected": -1125.298095703125, "loss": 0.0083, "rewards/accuracies": 1.0, "rewards/chosen": -0.6159827709197998, "rewards/margins": 10.143194198608398, "rewards/rejected": -10.759176254272461, "step": 580 }, { "epoch": 0.15, "grad_norm": 0.197265625, "learning_rate": 4.966129489232762e-06, "logits/chosen": -2.1329731941223145, "logits/rejected": -2.375246286392212, "logps/chosen": -77.91215515136719, "logps/rejected": -1410.508056640625, "loss": 0.0062, "rewards/accuracies": 1.0, "rewards/chosen": -0.5520095825195312, "rewards/margins": 12.985345840454102, "rewards/rejected": -13.537355422973633, "step": 590 }, { "epoch": 0.15, "grad_norm": 0.431640625, "learning_rate": 4.962459722383775e-06, "logits/chosen": -2.0712943077087402, "logits/rejected": -2.288693428039551, "logps/chosen": -73.90785217285156, "logps/rejected": -1434.8343505859375, "loss": 0.0101, "rewards/accuracies": 1.0, "rewards/chosen": -0.5121776461601257, "rewards/margins": 13.254251480102539, "rewards/rejected": -13.766427993774414, "step": 600 }, { "epoch": 0.15, "grad_norm": 0.000926971435546875, "learning_rate": 4.958602741843975e-06, "logits/chosen": -2.0742838382720947, "logits/rejected": -2.333592176437378, "logps/chosen": -74.49140167236328, "logps/rejected": -1376.306396484375, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": -0.5260452032089233, "rewards/margins": 12.687009811401367, "rewards/rejected": -13.213055610656738, "step": 610 }, { "epoch": 0.15, "grad_norm": 0.08154296875, "learning_rate": 4.954558840848437e-06, "logits/chosen": -2.213879346847534, "logits/rejected": -2.4216055870056152, "logps/chosen": -65.52778625488281, "logps/rejected": -1092.6256103515625, "loss": 0.0097, "rewards/accuracies": 1.0, "rewards/chosen": -0.43167224526405334, "rewards/margins": 10.010717391967773, "rewards/rejected": -10.442389488220215, "step": 620 }, { "epoch": 0.16, "grad_norm": 0.1318359375, "learning_rate": 4.950328326843258e-06, "logits/chosen": -2.0717647075653076, "logits/rejected": -2.3038885593414307, "logps/chosen": -59.320228576660156, "logps/rejected": -1350.045654296875, "loss": 0.0095, "rewards/accuracies": 1.0, "rewards/chosen": -0.37677788734436035, "rewards/margins": 12.551568984985352, "rewards/rejected": -12.92834758758545, "step": 630 }, { "epoch": 0.16, "grad_norm": 0.00848388671875, "learning_rate": 4.945911521462182e-06, "logits/chosen": -2.2182841300964355, "logits/rejected": -2.4369709491729736, "logps/chosen": -66.91700744628906, "logps/rejected": -1338.757080078125, "loss": 0.0113, "rewards/accuracies": 1.0, "rewards/chosen": -0.45198917388916016, "rewards/margins": 12.385056495666504, "rewards/rejected": -12.837045669555664, "step": 640 }, { "epoch": 0.16, "grad_norm": 0.26171875, "learning_rate": 4.941308760502149e-06, "logits/chosen": -2.2334372997283936, "logits/rejected": -2.4091877937316895, "logps/chosen": -77.1009292602539, "logps/rejected": -1184.353759765625, "loss": 0.0162, "rewards/accuracies": 1.0, "rewards/chosen": -0.5503292083740234, "rewards/margins": 10.71965217590332, "rewards/rejected": -11.26998233795166, "step": 650 }, { "epoch": 0.16, "grad_norm": 0.423828125, "learning_rate": 4.936520393897762e-06, "logits/chosen": -2.174837589263916, "logits/rejected": -2.3993821144104004, "logps/chosen": -66.07880401611328, "logps/rejected": -1318.9051513671875, "loss": 0.0104, "rewards/accuracies": 1.0, "rewards/chosen": -0.43549758195877075, "rewards/margins": 12.193208694458008, "rewards/rejected": -12.628705978393555, "step": 660 }, { "epoch": 0.17, "grad_norm": 0.0062255859375, "learning_rate": 4.931546785694684e-06, "logits/chosen": -2.2091901302337646, "logits/rejected": -2.44826078414917, "logps/chosen": -83.01612091064453, "logps/rejected": -1474.8822021484375, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": -0.6095518469810486, "rewards/margins": 13.620869636535645, "rewards/rejected": -14.230420112609863, "step": 670 }, { "epoch": 0.17, "grad_norm": 0.017822265625, "learning_rate": 4.926388314021964e-06, "logits/chosen": -2.2539894580841064, "logits/rejected": -2.4782536029815674, "logps/chosen": -97.8957748413086, "logps/rejected": -1248.599365234375, "loss": 0.0055, "rewards/accuracies": 1.0, "rewards/chosen": -0.7543063759803772, "rewards/margins": 11.235261917114258, "rewards/rejected": -11.989568710327148, "step": 680 }, { "epoch": 0.17, "grad_norm": 0.00060272216796875, "learning_rate": 4.921045371063283e-06, "logits/chosen": -2.241508960723877, "logits/rejected": -2.45992112159729, "logps/chosen": -75.34230041503906, "logps/rejected": -1410.6185302734375, "loss": 0.0053, "rewards/accuracies": 1.0, "rewards/chosen": -0.5371032953262329, "rewards/margins": 13.022886276245117, "rewards/rejected": -13.559989929199219, "step": 690 }, { "epoch": 0.17, "grad_norm": 0.09423828125, "learning_rate": 4.915518363027142e-06, "logits/chosen": -2.3091747760772705, "logits/rejected": -2.516079902648926, "logps/chosen": -77.0430679321289, "logps/rejected": -1162.167236328125, "loss": 0.0093, "rewards/accuracies": 1.0, "rewards/chosen": -0.5587199926376343, "rewards/margins": 10.586333274841309, "rewards/rejected": -11.145052909851074, "step": 700 }, { "epoch": 0.18, "grad_norm": 0.21484375, "learning_rate": 4.909807710115977e-06, "logits/chosen": -2.06872820854187, "logits/rejected": -2.280989170074463, "logps/chosen": -57.82390213012695, "logps/rejected": -1309.2158203125, "loss": 0.0073, "rewards/accuracies": 1.0, "rewards/chosen": -0.36199483275413513, "rewards/margins": 12.196971893310547, "rewards/rejected": -12.558965682983398, "step": 710 }, { "epoch": 0.18, "grad_norm": 0.10595703125, "learning_rate": 4.903913846494211e-06, "logits/chosen": -2.057790994644165, "logits/rejected": -2.2983431816101074, "logps/chosen": -63.02252960205078, "logps/rejected": -1628.169677734375, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": -0.4083371162414551, "rewards/margins": 15.24769115447998, "rewards/rejected": -15.656028747558594, "step": 720 }, { "epoch": 0.18, "grad_norm": 0.283203125, "learning_rate": 4.897837220255251e-06, "logits/chosen": -2.101783275604248, "logits/rejected": -2.2945144176483154, "logps/chosen": -62.76520538330078, "logps/rejected": -1316.2451171875, "loss": 0.0105, "rewards/accuracies": 1.0, "rewards/chosen": -0.4040141999721527, "rewards/margins": 12.210726737976074, "rewards/rejected": -12.614742279052734, "step": 730 }, { "epoch": 0.18, "grad_norm": 0.0218505859375, "learning_rate": 4.891578293387413e-06, "logits/chosen": -2.183640241622925, "logits/rejected": -2.3983137607574463, "logps/chosen": -72.8852767944336, "logps/rejected": -1332.2720947265625, "loss": 0.009, "rewards/accuracies": 1.0, "rewards/chosen": -0.5061370134353638, "rewards/margins": 12.295533180236816, "rewards/rejected": -12.801671028137207, "step": 740 }, { "epoch": 0.19, "grad_norm": 0.064453125, "learning_rate": 4.885137541738808e-06, "logits/chosen": -2.1432900428771973, "logits/rejected": -2.3399500846862793, "logps/chosen": -52.877479553222656, "logps/rejected": -1188.441650390625, "loss": 0.0053, "rewards/accuracies": 1.0, "rewards/chosen": -0.3169303238391876, "rewards/margins": 11.048254013061523, "rewards/rejected": -11.365182876586914, "step": 750 }, { "epoch": 0.19, "grad_norm": 2.086162567138672e-05, "learning_rate": 4.878515454981153e-06, "logits/chosen": -2.013054370880127, "logits/rejected": -2.238393783569336, "logps/chosen": -60.31416702270508, "logps/rejected": -1504.335693359375, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": -0.38582319021224976, "rewards/margins": 14.089462280273438, "rewards/rejected": -14.475286483764648, "step": 760 }, { "epoch": 0.19, "grad_norm": 0.29296875, "learning_rate": 4.8717125365725545e-06, "logits/chosen": -2.2411911487579346, "logits/rejected": -2.4217007160186768, "logps/chosen": -71.92973327636719, "logps/rejected": -1072.138427734375, "loss": 0.0112, "rewards/accuracies": 1.0, "rewards/chosen": -0.4977429509162903, "rewards/margins": 9.747591018676758, "rewards/rejected": -10.245333671569824, "step": 770 }, { "epoch": 0.19, "grad_norm": 0.076171875, "learning_rate": 4.864729303719221e-06, "logits/chosen": -2.183976650238037, "logits/rejected": -2.4096364974975586, "logps/chosen": -75.01698303222656, "logps/rejected": -1566.1385498046875, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": -0.5285240411758423, "rewards/margins": 14.545486450195312, "rewards/rejected": -15.074010848999023, "step": 780 }, { "epoch": 0.2, "grad_norm": 0.142578125, "learning_rate": 4.857566287336152e-06, "logits/chosen": -2.1151528358459473, "logits/rejected": -2.352687358856201, "logps/chosen": -81.51287841796875, "logps/rejected": -1503.718505859375, "loss": 0.0134, "rewards/accuracies": 1.0, "rewards/chosen": -0.5906480550765991, "rewards/margins": 13.887521743774414, "rewards/rejected": -14.478169441223145, "step": 790 }, { "epoch": 0.2, "grad_norm": 0.000972747802734375, "learning_rate": 4.850224032006765e-06, "logits/chosen": -2.2330470085144043, "logits/rejected": -2.4612553119659424, "logps/chosen": -86.96638488769531, "logps/rejected": -1362.5970458984375, "loss": 0.0078, "rewards/accuracies": 1.0, "rewards/chosen": -0.6454036831855774, "rewards/margins": 12.467041969299316, "rewards/rejected": -13.112444877624512, "step": 800 }, { "epoch": 0.2, "grad_norm": 0.220703125, "learning_rate": 4.8427030959414984e-06, "logits/chosen": -2.0308804512023926, "logits/rejected": -2.2706708908081055, "logps/chosen": -80.2787857055664, "logps/rejected": -1434.3277587890625, "loss": 0.0083, "rewards/accuracies": 1.0, "rewards/chosen": -0.5847833752632141, "rewards/margins": 13.225725173950195, "rewards/rejected": -13.810508728027344, "step": 810 }, { "epoch": 0.2, "grad_norm": 0.193359375, "learning_rate": 4.835004050935369e-06, "logits/chosen": -2.134955644607544, "logits/rejected": -2.338745594024658, "logps/chosen": -71.83667755126953, "logps/rejected": -1364.427978515625, "loss": 0.0155, "rewards/accuracies": 1.0, "rewards/chosen": -0.49762091040611267, "rewards/margins": 12.594762802124023, "rewards/rejected": -13.092384338378906, "step": 820 }, { "epoch": 0.21, "grad_norm": 0.13671875, "learning_rate": 4.8271274823245e-06, "logits/chosen": -2.1413967609405518, "logits/rejected": -2.343967914581299, "logps/chosen": -51.91362762451172, "logps/rejected": -1269.490478515625, "loss": 0.012, "rewards/accuracies": 1.0, "rewards/chosen": -0.2962699830532074, "rewards/margins": 11.855816841125488, "rewards/rejected": -12.152085304260254, "step": 830 }, { "epoch": 0.21, "grad_norm": 0.208984375, "learning_rate": 4.8190739889416264e-06, "logits/chosen": -2.1291534900665283, "logits/rejected": -2.3538265228271484, "logps/chosen": -51.05685043334961, "logps/rejected": -1429.0635986328125, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": -0.28625136613845825, "rewards/margins": 13.460894584655762, "rewards/rejected": -13.74714469909668, "step": 840 }, { "epoch": 0.21, "grad_norm": 0.06884765625, "learning_rate": 4.810844183070553e-06, "logits/chosen": -2.2312417030334473, "logits/rejected": -2.45286226272583, "logps/chosen": -65.88993072509766, "logps/rejected": -1232.466064453125, "loss": 0.005, "rewards/accuracies": 1.0, "rewards/chosen": -0.4434642195701599, "rewards/margins": 11.373895645141602, "rewards/rejected": -11.817359924316406, "step": 850 }, { "epoch": 0.21, "grad_norm": 0.20703125, "learning_rate": 4.802438690399622e-06, "logits/chosen": -2.1778035163879395, "logits/rejected": -2.4104442596435547, "logps/chosen": -61.74702835083008, "logps/rejected": -1364.88525390625, "loss": 0.0092, "rewards/accuracies": 1.0, "rewards/chosen": -0.4024318754673004, "rewards/margins": 12.702482223510742, "rewards/rejected": -13.104913711547852, "step": 860 }, { "epoch": 0.22, "grad_norm": 0.00174713134765625, "learning_rate": 4.793858149974129e-06, "logits/chosen": -2.142401933670044, "logits/rejected": -2.3973865509033203, "logps/chosen": -64.52376556396484, "logps/rejected": -1546.420654296875, "loss": 0.0062, "rewards/accuracies": 1.0, "rewards/chosen": -0.4257756769657135, "rewards/margins": 14.488656997680664, "rewards/rejected": -14.91443157196045, "step": 870 }, { "epoch": 0.22, "grad_norm": 0.08544921875, "learning_rate": 4.785103214147747e-06, "logits/chosen": -2.2586052417755127, "logits/rejected": -2.4925296306610107, "logps/chosen": -58.837852478027344, "logps/rejected": -1360.659423828125, "loss": 0.0054, "rewards/accuracies": 1.0, "rewards/chosen": -0.3716233968734741, "rewards/margins": 12.72169017791748, "rewards/rejected": -13.093313217163086, "step": 880 }, { "epoch": 0.22, "grad_norm": 0.0027008056640625, "learning_rate": 4.776174548532926e-06, "logits/chosen": -2.158493757247925, "logits/rejected": -2.3726634979248047, "logps/chosen": -60.35230255126953, "logps/rejected": -1398.0426025390625, "loss": 0.0055, "rewards/accuracies": 1.0, "rewards/chosen": -0.3876059055328369, "rewards/margins": 13.057897567749023, "rewards/rejected": -13.445503234863281, "step": 890 }, { "epoch": 0.22, "grad_norm": 0.1650390625, "learning_rate": 4.767072831950288e-06, "logits/chosen": -2.205594539642334, "logits/rejected": -2.447887420654297, "logps/chosen": -58.41968536376953, "logps/rejected": -1474.5992431640625, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": -0.36307448148727417, "rewards/margins": 13.841937065124512, "rewards/rejected": -14.205012321472168, "step": 900 }, { "epoch": 0.23, "grad_norm": 0.05029296875, "learning_rate": 4.7577987563770226e-06, "logits/chosen": -2.0987536907196045, "logits/rejected": -2.3415169715881348, "logps/chosen": -63.24462890625, "logps/rejected": -1456.5894775390625, "loss": 0.0095, "rewards/accuracies": 1.0, "rewards/chosen": -0.40772971510887146, "rewards/margins": 13.594934463500977, "rewards/rejected": -14.002664566040039, "step": 910 }, { "epoch": 0.23, "grad_norm": 0.001953125, "learning_rate": 4.748353026894273e-06, "logits/chosen": -2.176764965057373, "logits/rejected": -2.3934457302093506, "logps/chosen": -77.48558044433594, "logps/rejected": -1403.7001953125, "loss": 0.0071, "rewards/accuracies": 1.0, "rewards/chosen": -0.5516784191131592, "rewards/margins": 12.959416389465332, "rewards/rejected": -13.51109504699707, "step": 920 }, { "epoch": 0.23, "grad_norm": 0.01708984375, "learning_rate": 4.738736361633532e-06, "logits/chosen": -2.2761058807373047, "logits/rejected": -2.475376605987549, "logps/chosen": -69.98649597167969, "logps/rejected": -1317.3599853515625, "loss": 0.0068, "rewards/accuracies": 1.0, "rewards/chosen": -0.4822467267513275, "rewards/margins": 12.17310619354248, "rewards/rejected": -12.655353546142578, "step": 930 }, { "epoch": 0.23, "grad_norm": 0.00396728515625, "learning_rate": 4.728949491722046e-06, "logits/chosen": -2.3034911155700684, "logits/rejected": -2.5063111782073975, "logps/chosen": -82.03058624267578, "logps/rejected": -1236.0631103515625, "loss": 0.0124, "rewards/accuracies": 1.0, "rewards/chosen": -0.6094164848327637, "rewards/margins": 11.258821487426758, "rewards/rejected": -11.868239402770996, "step": 940 }, { "epoch": 0.24, "grad_norm": 0.154296875, "learning_rate": 4.718993161227231e-06, "logits/chosen": -2.156198740005493, "logits/rejected": -2.4342427253723145, "logps/chosen": -52.259849548339844, "logps/rejected": -1551.4473876953125, "loss": 0.0057, "rewards/accuracies": 1.0, "rewards/chosen": -0.2965083420276642, "rewards/margins": 14.673095703125, "rewards/rejected": -14.9696044921875, "step": 950 }, { "epoch": 0.24, "grad_norm": 0.0185546875, "learning_rate": 4.708868127100098e-06, "logits/chosen": -2.225891351699829, "logits/rejected": -2.446601629257202, "logps/chosen": -45.30867385864258, "logps/rejected": -1178.07958984375, "loss": 0.0057, "rewards/accuracies": 1.0, "rewards/chosen": -0.24066181480884552, "rewards/margins": 11.030438423156738, "rewards/rejected": -11.271100044250488, "step": 960 }, { "epoch": 0.24, "grad_norm": 0.000530242919921875, "learning_rate": 4.6985751591177075e-06, "logits/chosen": -2.071913242340088, "logits/rejected": -2.3076140880584717, "logps/chosen": -40.99647521972656, "logps/rejected": -1356.589599609375, "loss": 0.006, "rewards/accuracies": 1.0, "rewards/chosen": -0.1879800409078598, "rewards/margins": 12.83641529083252, "rewards/rejected": -13.024395942687988, "step": 970 }, { "epoch": 0.24, "grad_norm": 0.0279541015625, "learning_rate": 4.688115039824648e-06, "logits/chosen": -2.138272523880005, "logits/rejected": -2.3490092754364014, "logps/chosen": -39.265869140625, "logps/rejected": -1271.980712890625, "loss": 0.006, "rewards/accuracies": 1.0, "rewards/chosen": -0.17672276496887207, "rewards/margins": 12.01789665222168, "rewards/rejected": -12.194620132446289, "step": 980 }, { "epoch": 0.25, "grad_norm": 8.96453857421875e-05, "learning_rate": 4.677488564473535e-06, "logits/chosen": -2.0846240520477295, "logits/rejected": -2.3261351585388184, "logps/chosen": -54.3425178527832, "logps/rejected": -1450.612548828125, "loss": 0.004, "rewards/accuracies": 1.0, "rewards/chosen": -0.32301291823387146, "rewards/margins": 13.639193534851074, "rewards/rejected": -13.96220588684082, "step": 990 }, { "epoch": 0.25, "grad_norm": 0.08349609375, "learning_rate": 4.666696540964556e-06, "logits/chosen": -2.2266921997070312, "logits/rejected": -2.44096040725708, "logps/chosen": -60.16071701049805, "logps/rejected": -1275.765380859375, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": -0.3776375949382782, "rewards/margins": 11.887288093566895, "rewards/rejected": -12.264925003051758, "step": 1000 }, { "epoch": 0.25, "eval_logits/chosen": -2.585369348526001, "eval_logits/rejected": -2.6955134868621826, "eval_logps/chosen": -101.94501495361328, "eval_logps/rejected": -625.497314453125, "eval_loss": 0.01159477885812521, "eval_rewards/accuracies": 1.0, "eval_rewards/chosen": -0.7603151202201843, "eval_rewards/margins": 5.045938014984131, "eval_rewards/rejected": -5.806252956390381, "eval_runtime": 0.6566, "eval_samples_per_second": 7.615, "eval_steps_per_second": 4.569, "step": 1000 }, { "epoch": 0.25, "grad_norm": 0.000972747802734375, "learning_rate": 4.6557397897840454e-06, "logits/chosen": -2.227430820465088, "logits/rejected": -2.466034412384033, "logps/chosen": -49.18635940551758, "logps/rejected": -1351.74853515625, "loss": 0.0138, "rewards/accuracies": 1.0, "rewards/chosen": -0.27514129877090454, "rewards/margins": 12.695302963256836, "rewards/rejected": -12.970443725585938, "step": 1010 }, { "epoch": 0.25, "grad_norm": 0.0011138916015625, "learning_rate": 4.644619143942108e-06, "logits/chosen": -2.2175045013427734, "logits/rejected": -2.4644241333007812, "logps/chosen": -38.532127380371094, "logps/rejected": -1415.847412109375, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": -0.16876588761806488, "rewards/margins": 13.413320541381836, "rewards/rejected": -13.582087516784668, "step": 1020 }, { "epoch": 0.26, "grad_norm": 0.08740234375, "learning_rate": 4.633335448909284e-06, "logits/chosen": -2.0612175464630127, "logits/rejected": -2.274484157562256, "logps/chosen": -39.870052337646484, "logps/rejected": -1330.5323486328125, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": -0.17578956484794617, "rewards/margins": 12.581718444824219, "rewards/rejected": -12.75750732421875, "step": 1030 }, { "epoch": 0.26, "grad_norm": 0.09912109375, "learning_rate": 4.621889562552272e-06, "logits/chosen": -2.163442850112915, "logits/rejected": -2.4233555793762207, "logps/chosen": -62.47473907470703, "logps/rejected": -1504.832763671875, "loss": 0.0076, "rewards/accuracies": 1.0, "rewards/chosen": -0.40125980973243713, "rewards/margins": 14.0916748046875, "rewards/rejected": -14.492935180664062, "step": 1040 }, { "epoch": 0.26, "grad_norm": 0.004669189453125, "learning_rate": 4.610282355068707e-06, "logits/chosen": -2.2863821983337402, "logits/rejected": -2.5355706214904785, "logps/chosen": -59.4514274597168, "logps/rejected": -1562.39013671875, "loss": 0.0042, "rewards/accuracies": 1.0, "rewards/chosen": -0.36575835943222046, "rewards/margins": 14.687586784362793, "rewards/rejected": -15.053342819213867, "step": 1050 }, { "epoch": 0.26, "grad_norm": 0.64453125, "learning_rate": 4.598514708921006e-06, "logits/chosen": -2.262545108795166, "logits/rejected": -2.510559558868408, "logps/chosen": -55.71985626220703, "logps/rejected": -1498.1640625, "loss": 0.0066, "rewards/accuracies": 1.0, "rewards/chosen": -0.3440173268318176, "rewards/margins": 14.074090957641602, "rewards/rejected": -14.418106079101562, "step": 1060 }, { "epoch": 0.27, "grad_norm": 0.00142669677734375, "learning_rate": 4.5865875187692695e-06, "logits/chosen": -2.2046749591827393, "logits/rejected": -2.423334836959839, "logps/chosen": -48.60809326171875, "logps/rejected": -1244.1680908203125, "loss": 0.0079, "rewards/accuracies": 1.0, "rewards/chosen": -0.2774764597415924, "rewards/margins": 11.670097351074219, "rewards/rejected": -11.9475736618042, "step": 1070 }, { "epoch": 0.27, "grad_norm": 0.041015625, "learning_rate": 4.57450169140327e-06, "logits/chosen": -2.0672097206115723, "logits/rejected": -2.3301241397857666, "logps/chosen": -47.35566711425781, "logps/rejected": -1535.633544921875, "loss": 0.0036, "rewards/accuracies": 1.0, "rewards/chosen": -0.25116461515426636, "rewards/margins": 14.5623779296875, "rewards/rejected": -14.813543319702148, "step": 1080 }, { "epoch": 0.27, "grad_norm": 0.0615234375, "learning_rate": 4.562258145673507e-06, "logits/chosen": -2.2260966300964355, "logits/rejected": -2.4950501918792725, "logps/chosen": -40.86091613769531, "logps/rejected": -1499.596435546875, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -0.18965426087379456, "rewards/margins": 14.258898735046387, "rewards/rejected": -14.448553085327148, "step": 1090 }, { "epoch": 0.27, "grad_norm": 0.01068115234375, "learning_rate": 4.549857812421353e-06, "logits/chosen": -2.14607572555542, "logits/rejected": -2.3866307735443115, "logps/chosen": -44.410030364990234, "logps/rejected": -1332.9017333984375, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": -0.23083043098449707, "rewards/margins": 12.585546493530273, "rewards/rejected": -12.816377639770508, "step": 1100 }, { "epoch": 0.28, "grad_norm": 0.040283203125, "learning_rate": 4.537301634408281e-06, "logits/chosen": -2.169417142868042, "logits/rejected": -2.4057748317718506, "logps/chosen": -44.095577239990234, "logps/rejected": -1315.9925537109375, "loss": 0.0055, "rewards/accuracies": 1.0, "rewards/chosen": -0.2211724817752838, "rewards/margins": 12.42273235321045, "rewards/rejected": -12.643904685974121, "step": 1110 }, { "epoch": 0.28, "grad_norm": 0.11376953125, "learning_rate": 4.52459056624419e-06, "logits/chosen": -2.217676877975464, "logits/rejected": -2.4193835258483887, "logps/chosen": -46.805503845214844, "logps/rejected": -1376.5738525390625, "loss": 0.0054, "rewards/accuracies": 1.0, "rewards/chosen": -0.2504929006099701, "rewards/margins": 12.966726303100586, "rewards/rejected": -13.217218399047852, "step": 1120 }, { "epoch": 0.28, "grad_norm": 0.12451171875, "learning_rate": 4.51172557431483e-06, "logits/chosen": -2.1065962314605713, "logits/rejected": -2.3267951011657715, "logps/chosen": -61.67560958862305, "logps/rejected": -1427.5928955078125, "loss": 0.0055, "rewards/accuracies": 1.0, "rewards/chosen": -0.39770394563674927, "rewards/margins": 13.320897102355957, "rewards/rejected": -13.718599319458008, "step": 1130 }, { "epoch": 0.28, "grad_norm": 0.06494140625, "learning_rate": 4.49870763670833e-06, "logits/chosen": -2.1609268188476562, "logits/rejected": -2.4237403869628906, "logps/chosen": -55.2051887512207, "logps/rejected": -1529.240966796875, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -0.3337005376815796, "rewards/margins": 14.417539596557617, "rewards/rejected": -14.751240730285645, "step": 1140 }, { "epoch": 0.29, "grad_norm": 0.1953125, "learning_rate": 4.4855377431408335e-06, "logits/chosen": -2.152674436569214, "logits/rejected": -2.3682188987731934, "logps/chosen": -57.719703674316406, "logps/rejected": -1428.2706298828125, "loss": 0.0041, "rewards/accuracies": 1.0, "rewards/chosen": -0.3568641245365143, "rewards/margins": 13.377996444702148, "rewards/rejected": -13.73486042022705, "step": 1150 }, { "epoch": 0.29, "grad_norm": 0.0169677734375, "learning_rate": 4.472216894881261e-06, "logits/chosen": -2.146556854248047, "logits/rejected": -2.361703872680664, "logps/chosen": -56.385284423828125, "logps/rejected": -1308.488525390625, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": -0.34260934591293335, "rewards/margins": 12.229646682739258, "rewards/rejected": -12.572256088256836, "step": 1160 }, { "epoch": 0.29, "grad_norm": 0.0908203125, "learning_rate": 4.4587461046751815e-06, "logits/chosen": -2.1846487522125244, "logits/rejected": -2.4170939922332764, "logps/chosen": -47.7278938293457, "logps/rejected": -1271.083740234375, "loss": 0.0092, "rewards/accuracies": 1.0, "rewards/chosen": -0.2519899308681488, "rewards/margins": 11.94546890258789, "rewards/rejected": -12.197460174560547, "step": 1170 }, { "epoch": 0.29, "grad_norm": 0.0247802734375, "learning_rate": 4.44512639666781e-06, "logits/chosen": -2.1769089698791504, "logits/rejected": -2.394580602645874, "logps/chosen": -61.13446044921875, "logps/rejected": -1223.753662109375, "loss": 0.0085, "rewards/accuracies": 1.0, "rewards/chosen": -0.3933146893978119, "rewards/margins": 11.356694221496582, "rewards/rejected": -11.75001049041748, "step": 1180 }, { "epoch": 0.3, "grad_norm": 0.390625, "learning_rate": 4.431358806326158e-06, "logits/chosen": -2.1201298236846924, "logits/rejected": -2.3456811904907227, "logps/chosen": -81.98688507080078, "logps/rejected": -1611.583984375, "loss": 0.0084, "rewards/accuracies": 1.0, "rewards/chosen": -0.6020825505256653, "rewards/margins": 14.962076187133789, "rewards/rejected": -15.56415843963623, "step": 1190 }, { "epoch": 0.3, "grad_norm": 0.609375, "learning_rate": 4.4174443803603e-06, "logits/chosen": -2.204873561859131, "logits/rejected": -2.4108097553253174, "logps/chosen": -82.76813507080078, "logps/rejected": -1430.592041015625, "loss": 0.0083, "rewards/accuracies": 1.0, "rewards/chosen": -0.6107802391052246, "rewards/margins": 13.188611030578613, "rewards/rejected": -13.79939079284668, "step": 1200 }, { "epoch": 0.3, "grad_norm": 0.123046875, "learning_rate": 4.4033841766438e-06, "logits/chosen": -2.178987503051758, "logits/rejected": -2.39570689201355, "logps/chosen": -57.776702880859375, "logps/rejected": -1284.997802734375, "loss": 0.0118, "rewards/accuracies": 1.0, "rewards/chosen": -0.3557528853416443, "rewards/margins": 11.97689151763916, "rewards/rejected": -12.33264446258545, "step": 1210 }, { "epoch": 0.3, "grad_norm": 0.004180908203125, "learning_rate": 4.389179264133281e-06, "logits/chosen": -2.260874032974243, "logits/rejected": -2.495485305786133, "logps/chosen": -35.43501663208008, "logps/rejected": -1262.712890625, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": -0.1415407359600067, "rewards/margins": 11.985517501831055, "rewards/rejected": -12.127059936523438, "step": 1220 }, { "epoch": 0.31, "grad_norm": 0.337890625, "learning_rate": 4.374830722787159e-06, "logits/chosen": -2.265794277191162, "logits/rejected": -2.539062976837158, "logps/chosen": -40.56992721557617, "logps/rejected": -1323.746826171875, "loss": 0.0105, "rewards/accuracies": 1.0, "rewards/chosen": -0.184851735830307, "rewards/margins": 12.533957481384277, "rewards/rejected": -12.718809127807617, "step": 1230 }, { "epoch": 0.31, "grad_norm": 0.10986328125, "learning_rate": 4.360339643483533e-06, "logits/chosen": -2.2265820503234863, "logits/rejected": -2.4537243843078613, "logps/chosen": -40.92462921142578, "logps/rejected": -1421.2384033203125, "loss": 0.0068, "rewards/accuracies": 1.0, "rewards/chosen": -0.19266225397586823, "rewards/margins": 13.46106243133545, "rewards/rejected": -13.653724670410156, "step": 1240 }, { "epoch": 0.31, "grad_norm": 0.00179290771484375, "learning_rate": 4.345707127937253e-06, "logits/chosen": -2.136321544647217, "logits/rejected": -2.4158737659454346, "logps/chosen": -47.67406463623047, "logps/rejected": -1579.249267578125, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -0.2534434497356415, "rewards/margins": 15.00439453125, "rewards/rejected": -15.257838249206543, "step": 1250 }, { "epoch": 0.31, "grad_norm": 0.060302734375, "learning_rate": 4.330934288616154e-06, "logits/chosen": -2.168765068054199, "logits/rejected": -2.4067187309265137, "logps/chosen": -62.91276931762695, "logps/rejected": -1362.3446044921875, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": -0.40555182099342346, "rewards/margins": 12.703886032104492, "rewards/rejected": -13.10943603515625, "step": 1260 }, { "epoch": 0.32, "grad_norm": 0.150390625, "learning_rate": 4.316022248656485e-06, "logits/chosen": -2.1002354621887207, "logits/rejected": -2.365851402282715, "logps/chosen": -53.953285217285156, "logps/rejected": -1585.8782958984375, "loss": 0.0045, "rewards/accuracies": 1.0, "rewards/chosen": -0.32181739807128906, "rewards/margins": 14.933857917785645, "rewards/rejected": -15.255674362182617, "step": 1270 }, { "epoch": 0.32, "grad_norm": 0.024658203125, "learning_rate": 4.3009721417775166e-06, "logits/chosen": -2.1251707077026367, "logits/rejected": -2.363041639328003, "logps/chosen": -58.41363525390625, "logps/rejected": -1543.182861328125, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": -0.36327052116394043, "rewards/margins": 14.513801574707031, "rewards/rejected": -14.87707233428955, "step": 1280 }, { "epoch": 0.32, "grad_norm": 0.0230712890625, "learning_rate": 4.285785112195346e-06, "logits/chosen": -2.1945090293884277, "logits/rejected": -2.4488844871520996, "logps/chosen": -69.85707092285156, "logps/rejected": -1662.5419921875, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -0.47537103295326233, "rewards/margins": 15.588732719421387, "rewards/rejected": -16.064102172851562, "step": 1290 }, { "epoch": 0.32, "grad_norm": 0.035888671875, "learning_rate": 4.27046231453591e-06, "logits/chosen": -2.1391608715057373, "logits/rejected": -2.379563808441162, "logps/chosen": -63.22686004638672, "logps/rejected": -1555.8231201171875, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -0.41675299406051636, "rewards/margins": 14.574475288391113, "rewards/rejected": -14.991228103637695, "step": 1300 }, { "epoch": 0.33, "grad_norm": 2.682209014892578e-06, "learning_rate": 4.255004913747196e-06, "logits/chosen": -2.1814258098602295, "logits/rejected": -2.415797710418701, "logps/chosen": -57.22446823120117, "logps/rejected": -1578.1937255859375, "loss": 0.0042, "rewards/accuracies": 1.0, "rewards/chosen": -0.35579347610473633, "rewards/margins": 14.885470390319824, "rewards/rejected": -15.241262435913086, "step": 1310 }, { "epoch": 0.33, "grad_norm": 0.0059814453125, "learning_rate": 4.2394140850106825e-06, "logits/chosen": -2.1057560443878174, "logits/rejected": -2.3444247245788574, "logps/chosen": -60.09722900390625, "logps/rejected": -1566.6488037109375, "loss": 0.0071, "rewards/accuracies": 1.0, "rewards/chosen": -0.37801143527030945, "rewards/margins": 14.739140510559082, "rewards/rejected": -15.117152214050293, "step": 1320 }, { "epoch": 0.33, "grad_norm": 0.00049591064453125, "learning_rate": 4.223691013651986e-06, "logits/chosen": -2.145397424697876, "logits/rejected": -2.3859896659851074, "logps/chosen": -50.876380920410156, "logps/rejected": -1696.628173828125, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -0.28237268328666687, "rewards/margins": 16.069416046142578, "rewards/rejected": -16.351787567138672, "step": 1330 }, { "epoch": 0.33, "grad_norm": 0.076171875, "learning_rate": 4.207836895050748e-06, "logits/chosen": -2.290546178817749, "logits/rejected": -2.601999282836914, "logps/chosen": -49.566925048828125, "logps/rejected": -1810.896240234375, "loss": 0.0039, "rewards/accuracies": 1.0, "rewards/chosen": -0.27409371733665466, "rewards/margins": 17.24148178100586, "rewards/rejected": -17.515573501586914, "step": 1340 }, { "epoch": 0.34, "grad_norm": 0.23046875, "learning_rate": 4.1918529345497525e-06, "logits/chosen": -2.2135214805603027, "logits/rejected": -2.4138569831848145, "logps/chosen": -51.47322463989258, "logps/rejected": -1197.149169921875, "loss": 0.0063, "rewards/accuracies": 1.0, "rewards/chosen": -0.2979530692100525, "rewards/margins": 11.184531211853027, "rewards/rejected": -11.482483863830566, "step": 1350 }, { "epoch": 0.34, "grad_norm": 0.22265625, "learning_rate": 4.175740347363289e-06, "logits/chosen": -2.2823052406311035, "logits/rejected": -2.500483989715576, "logps/chosen": -50.924964904785156, "logps/rejected": -1341.169189453125, "loss": 0.006, "rewards/accuracies": 1.0, "rewards/chosen": -0.2940273880958557, "rewards/margins": 12.595098495483398, "rewards/rejected": -12.889126777648926, "step": 1360 }, { "epoch": 0.34, "grad_norm": 0.001129150390625, "learning_rate": 4.159500358484759e-06, "logits/chosen": -2.1221683025360107, "logits/rejected": -2.388002872467041, "logps/chosen": -52.10107421875, "logps/rejected": -1701.734375, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": -0.2977118492126465, "rewards/margins": 16.128461837768555, "rewards/rejected": -16.42617416381836, "step": 1370 }, { "epoch": 0.34, "grad_norm": 0.26171875, "learning_rate": 4.143134202593549e-06, "logits/chosen": -2.1562037467956543, "logits/rejected": -2.3721659183502197, "logps/chosen": -50.73106002807617, "logps/rejected": -1416.411376953125, "loss": 0.0053, "rewards/accuracies": 1.0, "rewards/chosen": -0.28152209520339966, "rewards/margins": 13.32117748260498, "rewards/rejected": -13.602702140808105, "step": 1380 }, { "epoch": 0.35, "grad_norm": 0.00372314453125, "learning_rate": 4.126643123961158e-06, "logits/chosen": -2.2438769340515137, "logits/rejected": -2.4929661750793457, "logps/chosen": -71.16793060302734, "logps/rejected": -1686.4351806640625, "loss": 0.0042, "rewards/accuracies": 1.0, "rewards/chosen": -0.4894431233406067, "rewards/margins": 15.813896179199219, "rewards/rejected": -16.303340911865234, "step": 1390 }, { "epoch": 0.35, "grad_norm": 0.03466796875, "learning_rate": 4.110028376356599e-06, "logits/chosen": -2.222071647644043, "logits/rejected": -2.447359323501587, "logps/chosen": -70.91515350341797, "logps/rejected": -1337.4664306640625, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": -0.4935643672943115, "rewards/margins": 12.358075141906738, "rewards/rejected": -12.851638793945312, "step": 1400 }, { "epoch": 0.35, "grad_norm": 0.0791015625, "learning_rate": 4.093291222951079e-06, "logits/chosen": -2.1609065532684326, "logits/rejected": -2.4100985527038574, "logps/chosen": -71.06592559814453, "logps/rejected": -1599.0948486328125, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -0.49402037262916565, "rewards/margins": 14.944366455078125, "rewards/rejected": -15.438386917114258, "step": 1410 }, { "epoch": 0.35, "grad_norm": 0.234375, "learning_rate": 4.076432936221965e-06, "logits/chosen": -2.1633338928222656, "logits/rejected": -2.3718645572662354, "logps/chosen": -76.24402618408203, "logps/rejected": -1331.0867919921875, "loss": 0.0082, "rewards/accuracies": 1.0, "rewards/chosen": -0.5511754751205444, "rewards/margins": 12.27659797668457, "rewards/rejected": -12.827774047851562, "step": 1420 }, { "epoch": 0.36, "grad_norm": 0.07177734375, "learning_rate": 4.059454797856039e-06, "logits/chosen": -2.200438976287842, "logits/rejected": -2.4105000495910645, "logps/chosen": -72.47054290771484, "logps/rejected": -1285.55029296875, "loss": 0.0082, "rewards/accuracies": 1.0, "rewards/chosen": -0.5113335251808167, "rewards/margins": 11.851224899291992, "rewards/rejected": -12.362558364868164, "step": 1430 }, { "epoch": 0.36, "grad_norm": 0.0015869140625, "learning_rate": 4.042358098652057e-06, "logits/chosen": -2.257859468460083, "logits/rejected": -2.485215187072754, "logps/chosen": -52.50494384765625, "logps/rejected": -1284.864990234375, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": -0.30147919058799744, "rewards/margins": 12.056472778320312, "rewards/rejected": -12.357951164245605, "step": 1440 }, { "epoch": 0.36, "grad_norm": 0.00494384765625, "learning_rate": 4.025144138422615e-06, "logits/chosen": -2.1999363899230957, "logits/rejected": -2.436066150665283, "logps/chosen": -60.535972595214844, "logps/rejected": -1517.8021240234375, "loss": 0.0082, "rewards/accuracies": 1.0, "rewards/chosen": -0.3815072178840637, "rewards/margins": 14.274576187133789, "rewards/rejected": -14.656084060668945, "step": 1450 }, { "epoch": 0.36, "grad_norm": 0.048583984375, "learning_rate": 4.007814225895321e-06, "logits/chosen": -2.1949074268341064, "logits/rejected": -2.453916549682617, "logps/chosen": -40.10565948486328, "logps/rejected": -1380.6016845703125, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": -0.184524804353714, "rewards/margins": 13.0862398147583, "rewards/rejected": -13.270764350891113, "step": 1460 }, { "epoch": 0.37, "grad_norm": 0.0703125, "learning_rate": 3.990369678613303e-06, "logits/chosen": -2.1046247482299805, "logits/rejected": -2.339478015899658, "logps/chosen": -32.17253875732422, "logps/rejected": -1487.5965576171875, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -0.1069754809141159, "rewards/margins": 14.181404113769531, "rewards/rejected": -14.288378715515137, "step": 1470 }, { "epoch": 0.37, "grad_norm": 0.0157470703125, "learning_rate": 3.97281182283504e-06, "logits/chosen": -2.168814182281494, "logits/rejected": -2.4204602241516113, "logps/chosen": -33.689884185791016, "logps/rejected": -1507.740966796875, "loss": 0.0059, "rewards/accuracies": 1.0, "rewards/chosen": -0.11984201520681381, "rewards/margins": 14.405647277832031, "rewards/rejected": -14.525489807128906, "step": 1480 }, { "epoch": 0.37, "grad_norm": 0.3828125, "learning_rate": 3.955141993433526e-06, "logits/chosen": -2.2266287803649902, "logits/rejected": -2.45817494392395, "logps/chosen": -52.63502883911133, "logps/rejected": -1366.678955078125, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": -0.3042375445365906, "rewards/margins": 12.845235824584961, "rewards/rejected": -13.14947509765625, "step": 1490 }, { "epoch": 0.37, "grad_norm": 0.08544921875, "learning_rate": 3.937361533794784e-06, "logits/chosen": -2.156094551086426, "logits/rejected": -2.3926451206207275, "logps/chosen": -44.07966232299805, "logps/rejected": -1358.091064453125, "loss": 0.0064, "rewards/accuracies": 1.0, "rewards/chosen": -0.22154085338115692, "rewards/margins": 12.819009780883789, "rewards/rejected": -13.040552139282227, "step": 1500 }, { "epoch": 0.38, "grad_norm": 0.021484375, "learning_rate": 3.919471795715738e-06, "logits/chosen": -2.212313652038574, "logits/rejected": -2.4430899620056152, "logps/chosen": -40.03847122192383, "logps/rejected": -1265.60009765625, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -0.1857212483882904, "rewards/margins": 11.979241371154785, "rewards/rejected": -12.164961814880371, "step": 1510 }, { "epoch": 0.38, "grad_norm": 0.150390625, "learning_rate": 3.901474139301433e-06, "logits/chosen": -2.100083112716675, "logits/rejected": -2.327531337738037, "logps/chosen": -47.98102569580078, "logps/rejected": -1396.822021484375, "loss": 0.0042, "rewards/accuracies": 1.0, "rewards/chosen": -0.2628587484359741, "rewards/margins": 13.183023452758789, "rewards/rejected": -13.445881843566895, "step": 1520 }, { "epoch": 0.38, "grad_norm": 0.08740234375, "learning_rate": 3.883369932861634e-06, "logits/chosen": -2.2499475479125977, "logits/rejected": -2.4626846313476562, "logps/chosen": -53.71254348754883, "logps/rejected": -1261.4793701171875, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -0.31738823652267456, "rewards/margins": 11.826452255249023, "rewards/rejected": -12.143839836120605, "step": 1530 }, { "epoch": 0.38, "grad_norm": 0.000232696533203125, "learning_rate": 3.865160552806796e-06, "logits/chosen": -2.293903350830078, "logits/rejected": -2.5309927463531494, "logps/chosen": -59.31644821166992, "logps/rejected": -1348.3590087890625, "loss": 0.0049, "rewards/accuracies": 1.0, "rewards/chosen": -0.37211018800735474, "rewards/margins": 12.622639656066895, "rewards/rejected": -12.994749069213867, "step": 1540 }, { "epoch": 0.39, "grad_norm": 0.0002765655517578125, "learning_rate": 3.84684738354342e-06, "logits/chosen": -2.301741361618042, "logits/rejected": -2.5269277095794678, "logps/chosen": -35.07439422607422, "logps/rejected": -1298.1329345703125, "loss": 0.0081, "rewards/accuracies": 1.0, "rewards/chosen": -0.1266619861125946, "rewards/margins": 12.352213859558105, "rewards/rejected": -12.478874206542969, "step": 1550 }, { "epoch": 0.39, "grad_norm": 0.0712890625, "learning_rate": 3.828431817368798e-06, "logits/chosen": -2.15970778465271, "logits/rejected": -2.3912577629089355, "logps/chosen": -23.495868682861328, "logps/rejected": -1347.333740234375, "loss": 0.0055, "rewards/accuracies": 1.0, "rewards/chosen": -0.02085093781352043, "rewards/margins": 12.912274360656738, "rewards/rejected": -12.933123588562012, "step": 1560 }, { "epoch": 0.39, "grad_norm": 0.421875, "learning_rate": 3.8099152543651684e-06, "logits/chosen": -2.3851158618927, "logits/rejected": -2.659996509552002, "logps/chosen": -34.04401397705078, "logps/rejected": -1443.980712890625, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": -0.12582853436470032, "rewards/margins": 13.780733108520508, "rewards/rejected": -13.906559944152832, "step": 1570 }, { "epoch": 0.39, "grad_norm": 0.154296875, "learning_rate": 3.791299102293261e-06, "logits/chosen": -2.125797748565674, "logits/rejected": -2.3718996047973633, "logps/chosen": -31.654226303100586, "logps/rejected": -1515.550048828125, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": -0.0940675288438797, "rewards/margins": 14.493083000183105, "rewards/rejected": -14.587150573730469, "step": 1580 }, { "epoch": 0.4, "grad_norm": 0.076171875, "learning_rate": 3.7725847764852774e-06, "logits/chosen": -2.117516040802002, "logits/rejected": -2.376412868499756, "logps/chosen": -33.58929443359375, "logps/rejected": -1522.32470703125, "loss": 0.0054, "rewards/accuracies": 1.0, "rewards/chosen": -0.11228612810373306, "rewards/margins": 14.516647338867188, "rewards/rejected": -14.628933906555176, "step": 1590 }, { "epoch": 0.4, "grad_norm": 0.051025390625, "learning_rate": 3.7537736997372833e-06, "logits/chosen": -2.183899402618408, "logits/rejected": -2.4056055545806885, "logps/chosen": -38.9683723449707, "logps/rejected": -1303.519287109375, "loss": 0.0041, "rewards/accuracies": 1.0, "rewards/chosen": -0.17212000489234924, "rewards/margins": 12.313672065734863, "rewards/rejected": -12.485791206359863, "step": 1600 }, { "epoch": 0.4, "grad_norm": 0.00689697265625, "learning_rate": 3.734867302201038e-06, "logits/chosen": -2.2842166423797607, "logits/rejected": -2.4898123741149902, "logps/chosen": -38.427486419677734, "logps/rejected": -1249.1448974609375, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": -0.17140671610832214, "rewards/margins": 11.829398155212402, "rewards/rejected": -12.000804901123047, "step": 1610 }, { "epoch": 0.4, "grad_norm": 0.1435546875, "learning_rate": 3.7158670212752666e-06, "logits/chosen": -2.1761648654937744, "logits/rejected": -2.4285309314727783, "logps/chosen": -43.9667854309082, "logps/rejected": -1409.6014404296875, "loss": 0.0068, "rewards/accuracies": 1.0, "rewards/chosen": -0.21925847232341766, "rewards/margins": 13.351308822631836, "rewards/rejected": -13.570569038391113, "step": 1620 }, { "epoch": 0.41, "grad_norm": 0.028076171875, "learning_rate": 3.696774301496376e-06, "logits/chosen": -2.253307342529297, "logits/rejected": -2.4998929500579834, "logps/chosen": -39.94139862060547, "logps/rejected": -1315.309814453125, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -0.1798352301120758, "rewards/margins": 12.484976768493652, "rewards/rejected": -12.664812088012695, "step": 1630 }, { "epoch": 0.41, "grad_norm": 0.0152587890625, "learning_rate": 3.677590594428629e-06, "logits/chosen": -2.187530517578125, "logits/rejected": -2.411306142807007, "logps/chosen": -46.19135284423828, "logps/rejected": -1337.900390625, "loss": 0.0063, "rewards/accuracies": 1.0, "rewards/chosen": -0.2449018955230713, "rewards/margins": 12.619891166687012, "rewards/rejected": -12.86479377746582, "step": 1640 }, { "epoch": 0.41, "grad_norm": 0.0035400390625, "learning_rate": 3.658317358553794e-06, "logits/chosen": -2.1583094596862793, "logits/rejected": -2.399893045425415, "logps/chosen": -42.413978576660156, "logps/rejected": -1464.1385498046875, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -0.20776596665382385, "rewards/margins": 13.914667129516602, "rewards/rejected": -14.122431755065918, "step": 1650 }, { "epoch": 0.41, "grad_norm": 0.1435546875, "learning_rate": 3.638956059160252e-06, "logits/chosen": -2.2085630893707275, "logits/rejected": -2.465798854827881, "logps/chosen": -51.00899887084961, "logps/rejected": -1475.9312744140625, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": -0.28529030084609985, "rewards/margins": 13.978610038757324, "rewards/rejected": -14.26390266418457, "step": 1660 }, { "epoch": 0.42, "grad_norm": 0.05029296875, "learning_rate": 3.6195081682315972e-06, "logits/chosen": -2.2395682334899902, "logits/rejected": -2.461138963699341, "logps/chosen": -52.74529266357422, "logps/rejected": -1418.346923828125, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -0.3086877167224884, "rewards/margins": 13.38988208770752, "rewards/rejected": -13.698568344116211, "step": 1670 }, { "epoch": 0.42, "grad_norm": 0.08544921875, "learning_rate": 3.5999751643347342e-06, "logits/chosen": -2.16579008102417, "logits/rejected": -2.4046080112457275, "logps/chosen": -46.71515655517578, "logps/rejected": -1608.938232421875, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": -0.24464385211467743, "rewards/margins": 15.266571044921875, "rewards/rejected": -15.51121711730957, "step": 1680 }, { "epoch": 0.42, "grad_norm": 0.1923828125, "learning_rate": 3.5803585325074536e-06, "logits/chosen": -2.1881327629089355, "logits/rejected": -2.427145481109619, "logps/chosen": -37.16319274902344, "logps/rejected": -1421.040771484375, "loss": 0.0042, "rewards/accuracies": 1.0, "rewards/chosen": -0.15972432494163513, "rewards/margins": 13.539385795593262, "rewards/rejected": -13.69911003112793, "step": 1690 }, { "epoch": 0.42, "grad_norm": 0.07275390625, "learning_rate": 3.5606597641455387e-06, "logits/chosen": -2.219900369644165, "logits/rejected": -2.4398694038391113, "logps/chosen": -32.802005767822266, "logps/rejected": -1393.4263916015625, "loss": 0.0058, "rewards/accuracies": 1.0, "rewards/chosen": -0.11474724858999252, "rewards/margins": 13.298954963684082, "rewards/rejected": -13.413702011108398, "step": 1700 }, { "epoch": 0.43, "grad_norm": 0.130859375, "learning_rate": 3.540880356889376e-06, "logits/chosen": -2.23069429397583, "logits/rejected": -2.4424965381622314, "logps/chosen": -42.188209533691406, "logps/rejected": -1353.078857421875, "loss": 0.0042, "rewards/accuracies": 1.0, "rewards/chosen": -0.2047566920518875, "rewards/margins": 12.800407409667969, "rewards/rejected": -13.005162239074707, "step": 1710 }, { "epoch": 0.43, "grad_norm": 0.326171875, "learning_rate": 3.5210218145100934e-06, "logits/chosen": -2.1350436210632324, "logits/rejected": -2.3985211849212646, "logps/chosen": -51.05349349975586, "logps/rejected": -1367.61767578125, "loss": 0.0079, "rewards/accuracies": 1.0, "rewards/chosen": -0.29429805278778076, "rewards/margins": 12.873303413391113, "rewards/rejected": -13.167600631713867, "step": 1720 }, { "epoch": 0.43, "grad_norm": 0.134765625, "learning_rate": 3.5010856467952335e-06, "logits/chosen": -2.1528429985046387, "logits/rejected": -2.3955628871917725, "logps/chosen": -42.9320068359375, "logps/rejected": -1482.957763671875, "loss": 0.0051, "rewards/accuracies": 1.0, "rewards/chosen": -0.20944638550281525, "rewards/margins": 14.0554780960083, "rewards/rejected": -14.264923095703125, "step": 1730 }, { "epoch": 0.43, "grad_norm": 0.58984375, "learning_rate": 3.4810733694339687e-06, "logits/chosen": -2.2495784759521484, "logits/rejected": -2.512760639190674, "logps/chosen": -57.50274658203125, "logps/rejected": -1577.8023681640625, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": -0.3561248183250427, "rewards/margins": 14.898828506469727, "rewards/rejected": -15.25495433807373, "step": 1740 }, { "epoch": 0.44, "grad_norm": 0.1435546875, "learning_rate": 3.4609865039018676e-06, "logits/chosen": -2.2507643699645996, "logits/rejected": -2.475839614868164, "logps/chosen": -41.08405685424805, "logps/rejected": -1401.9703369140625, "loss": 0.0048, "rewards/accuracies": 1.0, "rewards/chosen": -0.20163026452064514, "rewards/margins": 13.3068265914917, "rewards/rejected": -13.508456230163574, "step": 1750 }, { "epoch": 0.44, "grad_norm": 0.044677734375, "learning_rate": 3.4408265773452226e-06, "logits/chosen": -2.1668903827667236, "logits/rejected": -2.4009640216827393, "logps/chosen": -43.23725891113281, "logps/rejected": -1427.717041015625, "loss": 0.006, "rewards/accuracies": 1.0, "rewards/chosen": -0.21243014931678772, "rewards/margins": 13.541119575500488, "rewards/rejected": -13.753549575805664, "step": 1760 }, { "epoch": 0.44, "grad_norm": 0.0021514892578125, "learning_rate": 3.420595122464942e-06, "logits/chosen": -2.2544631958007812, "logits/rejected": -2.4994306564331055, "logps/chosen": -50.723716735839844, "logps/rejected": -1400.42236328125, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": -0.2909182906150818, "rewards/margins": 13.212321281433105, "rewards/rejected": -13.503240585327148, "step": 1770 }, { "epoch": 0.44, "grad_norm": 0.1767578125, "learning_rate": 3.4002936774000284e-06, "logits/chosen": -2.1552722454071045, "logits/rejected": -2.4494900703430176, "logps/chosen": -53.8035888671875, "logps/rejected": -1743.7855224609375, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -0.31691521406173706, "rewards/margins": 16.568302154541016, "rewards/rejected": -16.885215759277344, "step": 1780 }, { "epoch": 0.45, "grad_norm": 0.04296875, "learning_rate": 3.3799237856106348e-06, "logits/chosen": -2.1529643535614014, "logits/rejected": -2.4126904010772705, "logps/chosen": -55.90287399291992, "logps/rejected": -1550.77783203125, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": -0.34558922052383423, "rewards/margins": 14.627325057983398, "rewards/rejected": -14.97291374206543, "step": 1790 }, { "epoch": 0.45, "grad_norm": 0.004913330078125, "learning_rate": 3.35948699576072e-06, "logits/chosen": -2.108168363571167, "logits/rejected": -2.371859550476074, "logps/chosen": -63.180198669433594, "logps/rejected": -1683.2808837890625, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -0.40645408630371094, "rewards/margins": 15.862528800964355, "rewards/rejected": -16.268983840942383, "step": 1800 }, { "epoch": 0.45, "grad_norm": 0.37890625, "learning_rate": 3.3389848616003085e-06, "logits/chosen": -2.202070951461792, "logits/rejected": -2.4270646572113037, "logps/chosen": -47.17142105102539, "logps/rejected": -1418.73046875, "loss": 0.0088, "rewards/accuracies": 1.0, "rewards/chosen": -0.25016140937805176, "rewards/margins": 13.404383659362793, "rewards/rejected": -13.654545783996582, "step": 1810 }, { "epoch": 0.45, "grad_norm": 0.054443359375, "learning_rate": 3.3184189418473674e-06, "logits/chosen": -2.0919992923736572, "logits/rejected": -2.3279192447662354, "logps/chosen": -37.22324752807617, "logps/rejected": -1371.5806884765625, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": -0.15740033984184265, "rewards/margins": 13.067277908325195, "rewards/rejected": -13.224676132202148, "step": 1820 }, { "epoch": 0.46, "grad_norm": 0.0159912109375, "learning_rate": 3.2977908000692925e-06, "logits/chosen": -2.1699509620666504, "logits/rejected": -2.4078266620635986, "logps/chosen": -46.939552307128906, "logps/rejected": -1496.64501953125, "loss": 0.0078, "rewards/accuracies": 1.0, "rewards/chosen": -0.24631306529045105, "rewards/margins": 14.200462341308594, "rewards/rejected": -14.44677448272705, "step": 1830 }, { "epoch": 0.46, "grad_norm": 0.490234375, "learning_rate": 3.2771020045640435e-06, "logits/chosen": -2.314471960067749, "logits/rejected": -2.533036708831787, "logps/chosen": -49.747779846191406, "logps/rejected": -1293.560302734375, "loss": 0.0061, "rewards/accuracies": 1.0, "rewards/chosen": -0.27524739503860474, "rewards/margins": 12.140924453735352, "rewards/rejected": -12.416172981262207, "step": 1840 }, { "epoch": 0.46, "grad_norm": 0.19140625, "learning_rate": 3.256354128240907e-06, "logits/chosen": -2.101799488067627, "logits/rejected": -2.320006847381592, "logps/chosen": -58.1518669128418, "logps/rejected": -1474.82666015625, "loss": 0.005, "rewards/accuracies": 1.0, "rewards/chosen": -0.361042320728302, "rewards/margins": 13.842974662780762, "rewards/rejected": -14.204015731811523, "step": 1850 }, { "epoch": 0.46, "grad_norm": 2.551823854446411e-07, "learning_rate": 3.235548748500914e-06, "logits/chosen": -2.3442602157592773, "logits/rejected": -2.5813608169555664, "logps/chosen": -64.3367691040039, "logps/rejected": -1516.5281982421875, "loss": 0.0049, "rewards/accuracies": 1.0, "rewards/chosen": -0.42836618423461914, "rewards/margins": 14.244784355163574, "rewards/rejected": -14.673149108886719, "step": 1860 }, { "epoch": 0.47, "grad_norm": 0.130859375, "learning_rate": 3.214687447116913e-06, "logits/chosen": -2.129812717437744, "logits/rejected": -2.35500168800354, "logps/chosen": -60.48137664794922, "logps/rejected": -1468.074462890625, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": -0.3904629647731781, "rewards/margins": 13.763618469238281, "rewards/rejected": -14.154080390930176, "step": 1870 }, { "epoch": 0.47, "grad_norm": 0.005218505859375, "learning_rate": 3.193771810113313e-06, "logits/chosen": -2.1812546253204346, "logits/rejected": -2.450334072113037, "logps/chosen": -57.156097412109375, "logps/rejected": -1621.7850341796875, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -0.34869837760925293, "rewards/margins": 15.336013793945312, "rewards/rejected": -15.684713363647461, "step": 1880 }, { "epoch": 0.47, "grad_norm": 0.03271484375, "learning_rate": 3.1728034276455032e-06, "logits/chosen": -2.1772501468658447, "logits/rejected": -2.4167187213897705, "logps/chosen": -47.676063537597656, "logps/rejected": -1501.980224609375, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": -0.2580206096172333, "rewards/margins": 14.23988151550293, "rewards/rejected": -14.497901916503906, "step": 1890 }, { "epoch": 0.47, "grad_norm": 0.00274658203125, "learning_rate": 3.1517838938789597e-06, "logits/chosen": -2.1416432857513428, "logits/rejected": -2.3887360095977783, "logps/chosen": -31.932031631469727, "logps/rejected": -1682.0501708984375, "loss": 0.0042, "rewards/accuracies": 1.0, "rewards/chosen": -0.10166473686695099, "rewards/margins": 16.101634979248047, "rewards/rejected": -16.203296661376953, "step": 1900 }, { "epoch": 0.48, "grad_norm": 0.36328125, "learning_rate": 3.130714806868041e-06, "logits/chosen": -2.132199764251709, "logits/rejected": -2.3675732612609863, "logps/chosen": -38.96401596069336, "logps/rejected": -1434.172607421875, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -0.1679878979921341, "rewards/margins": 13.631708145141602, "rewards/rejected": -13.799695014953613, "step": 1910 }, { "epoch": 0.48, "grad_norm": 0.1484375, "learning_rate": 3.1095977684344976e-06, "logits/chosen": -2.221590042114258, "logits/rejected": -2.477220296859741, "logps/chosen": -42.42957305908203, "logps/rejected": -1500.699462890625, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -0.2044091671705246, "rewards/margins": 14.278982162475586, "rewards/rejected": -14.483392715454102, "step": 1920 }, { "epoch": 0.48, "grad_norm": 0.0040283203125, "learning_rate": 3.0884343840456874e-06, "logits/chosen": -2.280695915222168, "logits/rejected": -2.5356380939483643, "logps/chosen": -51.98859405517578, "logps/rejected": -1650.245361328125, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -0.3021391034126282, "rewards/margins": 15.651565551757812, "rewards/rejected": -15.953704833984375, "step": 1930 }, { "epoch": 0.48, "grad_norm": 0.0002536773681640625, "learning_rate": 3.0672262626925174e-06, "logits/chosen": -2.1820268630981445, "logits/rejected": -2.439319133758545, "logps/chosen": -47.39429473876953, "logps/rejected": -1611.6102294921875, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": -0.24379031360149384, "rewards/margins": 15.318346977233887, "rewards/rejected": -15.56213665008545, "step": 1940 }, { "epoch": 0.49, "grad_norm": 3.910064697265625e-05, "learning_rate": 3.0459750167671147e-06, "logits/chosen": -2.1863160133361816, "logits/rejected": -2.450911283493042, "logps/chosen": -57.97031784057617, "logps/rejected": -1733.2484130859375, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -0.35936951637268066, "rewards/margins": 16.4171199798584, "rewards/rejected": -16.776485443115234, "step": 1950 }, { "epoch": 0.49, "grad_norm": 0.375, "learning_rate": 3.024682261940247e-06, "logits/chosen": -2.1711161136627197, "logits/rejected": -2.381054401397705, "logps/chosen": -63.16656494140625, "logps/rejected": -1473.282958984375, "loss": 0.0036, "rewards/accuracies": 1.0, "rewards/chosen": -0.40881744027137756, "rewards/margins": 13.772600173950195, "rewards/rejected": -14.181416511535645, "step": 1960 }, { "epoch": 0.49, "grad_norm": 0.000568389892578125, "learning_rate": 3.0033496170384803e-06, "logits/chosen": -2.232100009918213, "logits/rejected": -2.4612276554107666, "logps/chosen": -56.055152893066406, "logps/rejected": -1356.71484375, "loss": 0.0077, "rewards/accuracies": 1.0, "rewards/chosen": -0.3471956253051758, "rewards/margins": 12.729546546936035, "rewards/rejected": -13.076742172241211, "step": 1970 }, { "epoch": 0.49, "grad_norm": 0.10791015625, "learning_rate": 2.9819787039211068e-06, "logits/chosen": -2.1615240573883057, "logits/rejected": -2.393810510635376, "logps/chosen": -35.02969741821289, "logps/rejected": -1524.7955322265625, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -0.13164165616035461, "rewards/margins": 14.575594902038574, "rewards/rejected": -14.707235336303711, "step": 1980 }, { "epoch": 0.5, "grad_norm": 0.0054931640625, "learning_rate": 2.960571147356845e-06, "logits/chosen": -2.256544828414917, "logits/rejected": -2.5309910774230957, "logps/chosen": -49.80757522583008, "logps/rejected": -1592.794677734375, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -0.2703229784965515, "rewards/margins": 15.133091926574707, "rewards/rejected": -15.403416633605957, "step": 1990 }, { "epoch": 0.5, "grad_norm": 0.0023040771484375, "learning_rate": 2.9391285749003046e-06, "logits/chosen": -2.15415620803833, "logits/rejected": -2.405571460723877, "logps/chosen": -40.737998962402344, "logps/rejected": -1701.052734375, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": -0.18067236244678497, "rewards/margins": 16.24726104736328, "rewards/rejected": -16.427934646606445, "step": 2000 }, { "epoch": 0.5, "eval_logits/chosen": -2.6136603355407715, "eval_logits/rejected": -2.7333316802978516, "eval_logps/chosen": -48.08984375, "eval_logps/rejected": -693.2846069335938, "eval_loss": 0.0037064917851239443, "eval_rewards/accuracies": 1.0, "eval_rewards/chosen": -0.22176341712474823, "eval_rewards/margins": 6.262362003326416, "eval_rewards/rejected": -6.48412561416626, "eval_runtime": 0.6544, "eval_samples_per_second": 7.641, "eval_steps_per_second": 4.585, "step": 2000 }, { "epoch": 0.5, "grad_norm": 0.6328125, "learning_rate": 2.9176526167682543e-06, "logits/chosen": -2.1183362007141113, "logits/rejected": -2.351123571395874, "logps/chosen": -37.299964904785156, "logps/rejected": -1437.7230224609375, "loss": 0.0057, "rewards/accuracies": 1.0, "rewards/chosen": -0.15952737629413605, "rewards/margins": 13.696490287780762, "rewards/rejected": -13.85601806640625, "step": 2010 }, { "epoch": 0.5, "grad_norm": 0.03857421875, "learning_rate": 2.8961449057156775e-06, "logits/chosen": -2.200801372528076, "logits/rejected": -2.4389915466308594, "logps/chosen": -42.25465774536133, "logps/rejected": -1569.040771484375, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -0.20262226462364197, "rewards/margins": 14.958514213562012, "rewards/rejected": -15.16113567352295, "step": 2020 }, { "epoch": 0.51, "grad_norm": 0.0037689208984375, "learning_rate": 2.874607076911642e-06, "logits/chosen": -2.212007999420166, "logits/rejected": -2.4628169536590576, "logps/chosen": -54.49187088012695, "logps/rejected": -1452.176513671875, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": -0.31496500968933105, "rewards/margins": 13.68268871307373, "rewards/rejected": -13.997654914855957, "step": 2030 }, { "epoch": 0.51, "grad_norm": 0.0026092529296875, "learning_rate": 2.8530407678149806e-06, "logits/chosen": -2.1855294704437256, "logits/rejected": -2.428863525390625, "logps/chosen": -61.762428283691406, "logps/rejected": -1588.792236328125, "loss": 0.0051, "rewards/accuracies": 1.0, "rewards/chosen": -0.4000323414802551, "rewards/margins": 14.938389778137207, "rewards/rejected": -15.338421821594238, "step": 2040 }, { "epoch": 0.51, "grad_norm": 0.001739501953125, "learning_rate": 2.8314476180498003e-06, "logits/chosen": -2.0332534313201904, "logits/rejected": -2.267488718032837, "logps/chosen": -41.453369140625, "logps/rejected": -1475.7647705078125, "loss": 0.0048, "rewards/accuracies": 1.0, "rewards/chosen": -0.1973056197166443, "rewards/margins": 14.028945922851562, "rewards/rejected": -14.226251602172852, "step": 2050 }, { "epoch": 0.51, "grad_norm": 0.13671875, "learning_rate": 2.8098292692808253e-06, "logits/chosen": -2.2281060218811035, "logits/rejected": -2.422762632369995, "logps/chosen": -41.2132453918457, "logps/rejected": -1153.19775390625, "loss": 0.0045, "rewards/accuracies": 1.0, "rewards/chosen": -0.1940850019454956, "rewards/margins": 10.885441780090332, "rewards/rejected": -11.079526901245117, "step": 2060 }, { "epoch": 0.52, "grad_norm": 0.162109375, "learning_rate": 2.7881873650885904e-06, "logits/chosen": -2.227834463119507, "logits/rejected": -2.4453253746032715, "logps/chosen": -50.43096160888672, "logps/rejected": -1375.1741943359375, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": -0.2844979166984558, "rewards/margins": 12.969167709350586, "rewards/rejected": -13.25366497039795, "step": 2070 }, { "epoch": 0.52, "grad_norm": 0.1513671875, "learning_rate": 2.7665235508444772e-06, "logits/chosen": -2.1580593585968018, "logits/rejected": -2.404978036880493, "logps/chosen": -47.8787841796875, "logps/rejected": -1663.496826171875, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": -0.2667320966720581, "rewards/margins": 15.83378791809082, "rewards/rejected": -16.10051727294922, "step": 2080 }, { "epoch": 0.52, "grad_norm": 0.000820159912109375, "learning_rate": 2.7448394735856275e-06, "logits/chosen": -2.1202292442321777, "logits/rejected": -2.387399196624756, "logps/chosen": -29.072830200195312, "logps/rejected": -1652.7601318359375, "loss": 0.0069, "rewards/accuracies": 1.0, "rewards/chosen": -0.07453112304210663, "rewards/margins": 15.884991645812988, "rewards/rejected": -15.959524154663086, "step": 2090 }, { "epoch": 0.52, "grad_norm": 0.1259765625, "learning_rate": 2.723136781889722e-06, "logits/chosen": -2.248565912246704, "logits/rejected": -2.483459949493408, "logps/chosen": -49.5106201171875, "logps/rejected": -1374.344482421875, "loss": 0.008, "rewards/accuracies": 1.0, "rewards/chosen": -0.26804444193840027, "rewards/margins": 12.9815092086792, "rewards/rejected": -13.249552726745605, "step": 2100 }, { "epoch": 0.53, "grad_norm": 0.0208740234375, "learning_rate": 2.7014171257496414e-06, "logits/chosen": -2.2338385581970215, "logits/rejected": -2.4451489448547363, "logps/chosen": -47.859092712402344, "logps/rejected": -1475.451416015625, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -0.259570837020874, "rewards/margins": 13.942883491516113, "rewards/rejected": -14.20245361328125, "step": 2110 }, { "epoch": 0.53, "grad_norm": 0.2216796875, "learning_rate": 2.6796821564480237e-06, "logits/chosen": -2.1667749881744385, "logits/rejected": -2.3811049461364746, "logps/chosen": -51.062232971191406, "logps/rejected": -1320.582763671875, "loss": 0.005, "rewards/accuracies": 1.0, "rewards/chosen": -0.2894384562969208, "rewards/margins": 12.419047355651855, "rewards/rejected": -12.708486557006836, "step": 2120 }, { "epoch": 0.53, "grad_norm": 0.00604248046875, "learning_rate": 2.6579335264317253e-06, "logits/chosen": -2.3176040649414062, "logits/rejected": -2.558061361312866, "logps/chosen": -36.845001220703125, "logps/rejected": -1507.6351318359375, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": -0.15489184856414795, "rewards/margins": 14.385536193847656, "rewards/rejected": -14.540430068969727, "step": 2130 }, { "epoch": 0.53, "grad_norm": 0.00830078125, "learning_rate": 2.6361728891861843e-06, "logits/chosen": -2.067624568939209, "logits/rejected": -2.2963385581970215, "logps/chosen": -45.42739486694336, "logps/rejected": -1546.36083984375, "loss": 0.0049, "rewards/accuracies": 1.0, "rewards/chosen": -0.23402588069438934, "rewards/margins": 14.646380424499512, "rewards/rejected": -14.880406379699707, "step": 2140 }, { "epoch": 0.54, "grad_norm": 0.0032806396484375, "learning_rate": 2.614401899109716e-06, "logits/chosen": -2.247525930404663, "logits/rejected": -2.4837863445281982, "logps/chosen": -48.529815673828125, "logps/rejected": -1460.3306884765625, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -0.2696106433868408, "rewards/margins": 13.825261116027832, "rewards/rejected": -14.094873428344727, "step": 2150 }, { "epoch": 0.54, "grad_norm": 0.06201171875, "learning_rate": 2.5926222113877282e-06, "logits/chosen": -2.243438482284546, "logits/rejected": -2.4923970699310303, "logps/chosen": -43.63848114013672, "logps/rejected": -1591.591064453125, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": -0.2216021567583084, "rewards/margins": 15.112271308898926, "rewards/rejected": -15.333871841430664, "step": 2160 }, { "epoch": 0.54, "grad_norm": 0.205078125, "learning_rate": 2.570835481866889e-06, "logits/chosen": -2.144465923309326, "logits/rejected": -2.3723580837249756, "logps/chosen": -45.58980178833008, "logps/rejected": -1466.3011474609375, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": -0.2433461844921112, "rewards/margins": 13.90100383758545, "rewards/rejected": -14.1443510055542, "step": 2170 }, { "epoch": 0.54, "grad_norm": 0.140625, "learning_rate": 2.5490433669292337e-06, "logits/chosen": -2.0634944438934326, "logits/rejected": -2.311782121658325, "logps/chosen": -37.41926193237305, "logps/rejected": -1625.005126953125, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": -0.1547044962644577, "rewards/margins": 15.56786060333252, "rewards/rejected": -15.722564697265625, "step": 2180 }, { "epoch": 0.55, "grad_norm": 0.00244140625, "learning_rate": 2.527247523366232e-06, "logits/chosen": -2.2304885387420654, "logits/rejected": -2.4748549461364746, "logps/chosen": -54.11591339111328, "logps/rejected": -1548.42578125, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -0.31723862886428833, "rewards/margins": 14.627525329589844, "rewards/rejected": -14.9447660446167, "step": 2190 }, { "epoch": 0.55, "grad_norm": 0.01214599609375, "learning_rate": 2.5054496082528336e-06, "logits/chosen": -2.2945401668548584, "logits/rejected": -2.553946018218994, "logps/chosen": -50.36088180541992, "logps/rejected": -1503.6251220703125, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -0.2821517586708069, "rewards/margins": 14.262479782104492, "rewards/rejected": -14.544631958007812, "step": 2200 }, { "epoch": 0.55, "grad_norm": 0.07373046875, "learning_rate": 2.483651278821481e-06, "logits/chosen": -2.240737199783325, "logits/rejected": -2.468348264694214, "logps/chosen": -38.926151275634766, "logps/rejected": -1415.637939453125, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": -0.17295430600643158, "rewards/margins": 13.47205638885498, "rewards/rejected": -13.645009994506836, "step": 2210 }, { "epoch": 0.55, "grad_norm": 0.111328125, "learning_rate": 2.4618541923361166e-06, "logits/chosen": -2.4229185581207275, "logits/rejected": -2.6283278465270996, "logps/chosen": -44.134647369384766, "logps/rejected": -1301.507080078125, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": -0.2265511453151703, "rewards/margins": 12.291707038879395, "rewards/rejected": -12.518258094787598, "step": 2220 }, { "epoch": 0.56, "grad_norm": 0.001983642578125, "learning_rate": 2.4400600059661836e-06, "logits/chosen": -2.0719246864318848, "logits/rejected": -2.375192165374756, "logps/chosen": -46.18827819824219, "logps/rejected": -1760.173095703125, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -0.24284347891807556, "rewards/margins": 16.816072463989258, "rewards/rejected": -17.058916091918945, "step": 2230 }, { "epoch": 0.56, "grad_norm": 0.046630859375, "learning_rate": 2.41827037666064e-06, "logits/chosen": -2.2636351585388184, "logits/rejected": -2.4840915203094482, "logps/chosen": -47.29922103881836, "logps/rejected": -1315.336181640625, "loss": 0.0042, "rewards/accuracies": 1.0, "rewards/chosen": -0.25474974513053894, "rewards/margins": 12.407878875732422, "rewards/rejected": -12.662630081176758, "step": 2240 }, { "epoch": 0.56, "grad_norm": 0.01141357421875, "learning_rate": 2.396486961021983e-06, "logits/chosen": -2.1793510913848877, "logits/rejected": -2.4308459758758545, "logps/chosen": -41.266380310058594, "logps/rejected": -1442.6708984375, "loss": 0.0068, "rewards/accuracies": 1.0, "rewards/chosen": -0.19039396941661835, "rewards/margins": 13.722501754760742, "rewards/rejected": -13.912895202636719, "step": 2250 }, { "epoch": 0.56, "grad_norm": 0.010498046875, "learning_rate": 2.3747114151802993e-06, "logits/chosen": -2.3280482292175293, "logits/rejected": -2.5701987743377686, "logps/chosen": -47.68052673339844, "logps/rejected": -1394.991455078125, "loss": 0.0042, "rewards/accuracies": 1.0, "rewards/chosen": -0.2576830983161926, "rewards/margins": 13.206995010375977, "rewards/rejected": -13.464675903320312, "step": 2260 }, { "epoch": 0.57, "grad_norm": 0.08349609375, "learning_rate": 2.352945394667363e-06, "logits/chosen": -2.0980782508850098, "logits/rejected": -2.364197254180908, "logps/chosen": -47.831058502197266, "logps/rejected": -1665.154296875, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -0.25840622186660767, "rewards/margins": 15.833898544311523, "rewards/rejected": -16.092304229736328, "step": 2270 }, { "epoch": 0.57, "grad_norm": 0.44921875, "learning_rate": 2.3311905542907627e-06, "logits/chosen": -2.256291389465332, "logits/rejected": -2.486441135406494, "logps/chosen": -42.5937614440918, "logps/rejected": -1361.2073974609375, "loss": 0.0041, "rewards/accuracies": 1.0, "rewards/chosen": -0.2082025557756424, "rewards/margins": 12.911099433898926, "rewards/rejected": -13.119302749633789, "step": 2280 }, { "epoch": 0.57, "grad_norm": 0.06591796875, "learning_rate": 2.30944854800809e-06, "logits/chosen": -2.2147023677825928, "logits/rejected": -2.4364144802093506, "logps/chosen": -40.498531341552734, "logps/rejected": -1479.181396484375, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": -0.18588443100452423, "rewards/margins": 14.101339340209961, "rewards/rejected": -14.287226676940918, "step": 2290 }, { "epoch": 0.57, "grad_norm": 0.004364013671875, "learning_rate": 2.287721028801204e-06, "logits/chosen": -2.175849676132202, "logits/rejected": -2.4008584022521973, "logps/chosen": -43.332298278808594, "logps/rejected": -1385.960205078125, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": -0.21884088218212128, "rewards/margins": 13.133634567260742, "rewards/rejected": -13.352476119995117, "step": 2300 }, { "epoch": 0.58, "grad_norm": 0.057861328125, "learning_rate": 2.26600964855055e-06, "logits/chosen": -2.2437031269073486, "logits/rejected": -2.4617691040039062, "logps/chosen": -43.779388427734375, "logps/rejected": -1358.084716796875, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -0.22216272354125977, "rewards/margins": 12.88685417175293, "rewards/rejected": -13.109016418457031, "step": 2310 }, { "epoch": 0.58, "grad_norm": 0.0230712890625, "learning_rate": 2.244316057909573e-06, "logits/chosen": -2.205610752105713, "logits/rejected": -2.4241251945495605, "logps/chosen": -37.175682067871094, "logps/rejected": -1405.6822509765625, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": -0.15614905953407288, "rewards/margins": 13.411378860473633, "rewards/rejected": -13.567527770996094, "step": 2320 }, { "epoch": 0.58, "grad_norm": 0.0208740234375, "learning_rate": 2.2226419061792282e-06, "logits/chosen": -2.284442901611328, "logits/rejected": -2.527775526046753, "logps/chosen": -48.442630767822266, "logps/rejected": -1552.012939453125, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -0.2705609202384949, "rewards/margins": 14.728759765625, "rewards/rejected": -14.999322891235352, "step": 2330 }, { "epoch": 0.58, "grad_norm": 0.0081787109375, "learning_rate": 2.200988841182589e-06, "logits/chosen": -2.219576358795166, "logits/rejected": -2.4669265747070312, "logps/chosen": -44.78432083129883, "logps/rejected": -1667.3538818359375, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": -0.2340162992477417, "rewards/margins": 15.922113418579102, "rewards/rejected": -16.156129837036133, "step": 2340 }, { "epoch": 0.59, "grad_norm": 0.01287841796875, "learning_rate": 2.179358509139559e-06, "logits/chosen": -2.171391010284424, "logits/rejected": -2.4009640216827393, "logps/chosen": -61.962059020996094, "logps/rejected": -1350.2457275390625, "loss": 0.0053, "rewards/accuracies": 1.0, "rewards/chosen": -0.3955707848072052, "rewards/margins": 12.617085456848145, "rewards/rejected": -13.012655258178711, "step": 2350 }, { "epoch": 0.59, "grad_norm": 0.0517578125, "learning_rate": 2.1577525545417254e-06, "logits/chosen": -2.1860475540161133, "logits/rejected": -2.418872594833374, "logps/chosen": -58.040443420410156, "logps/rejected": -1468.6392822265625, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": -0.3626258969306946, "rewards/margins": 13.818751335144043, "rewards/rejected": -14.181378364562988, "step": 2360 }, { "epoch": 0.59, "grad_norm": 0.0186767578125, "learning_rate": 2.1361726200273293e-06, "logits/chosen": -2.2700607776641846, "logits/rejected": -2.521707057952881, "logps/chosen": -48.01632308959961, "logps/rejected": -1544.2449951171875, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -0.25974512100219727, "rewards/margins": 14.68183708190918, "rewards/rejected": -14.941583633422852, "step": 2370 }, { "epoch": 0.59, "grad_norm": 0.0859375, "learning_rate": 2.1146203462563773e-06, "logits/chosen": -2.335644483566284, "logits/rejected": -2.5736241340637207, "logps/chosen": -39.67052459716797, "logps/rejected": -1430.16796875, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -0.18928556144237518, "rewards/margins": 13.622029304504395, "rewards/rejected": -13.811314582824707, "step": 2380 }, { "epoch": 0.6, "grad_norm": 0.08056640625, "learning_rate": 2.0930973717859117e-06, "logits/chosen": -2.352358341217041, "logits/rejected": -2.598140239715576, "logps/chosen": -44.863258361816406, "logps/rejected": -1464.2244873046875, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": -0.23050542175769806, "rewards/margins": 13.890615463256836, "rewards/rejected": -14.121121406555176, "step": 2390 }, { "epoch": 0.6, "grad_norm": 0.00011587142944335938, "learning_rate": 2.0716053329454337e-06, "logits/chosen": -2.07816481590271, "logits/rejected": -2.320413112640381, "logps/chosen": -42.12782287597656, "logps/rejected": -1603.09716796875, "loss": 0.004, "rewards/accuracies": 1.0, "rewards/chosen": -0.20372910797595978, "rewards/margins": 15.295297622680664, "rewards/rejected": -15.49902629852295, "step": 2400 }, { "epoch": 0.6, "grad_norm": 0.0255126953125, "learning_rate": 2.0501458637124963e-06, "logits/chosen": -2.2174525260925293, "logits/rejected": -2.5070488452911377, "logps/chosen": -49.52367401123047, "logps/rejected": -1763.182861328125, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -0.2707751393318176, "rewards/margins": 16.840404510498047, "rewards/rejected": -17.11117935180664, "step": 2410 }, { "epoch": 0.6, "grad_norm": 0.0206298828125, "learning_rate": 2.0287205955884812e-06, "logits/chosen": -2.2282018661499023, "logits/rejected": -2.47560453414917, "logps/chosen": -39.33561706542969, "logps/rejected": -1586.3204345703125, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": -0.17531004548072815, "rewards/margins": 15.11853313446045, "rewards/rejected": -15.293844223022461, "step": 2420 }, { "epoch": 0.61, "grad_norm": 0.0439453125, "learning_rate": 2.0073311574745583e-06, "logits/chosen": -2.1908931732177734, "logits/rejected": -2.4531962871551514, "logps/chosen": -46.56280517578125, "logps/rejected": -1645.0823974609375, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -0.24412044882774353, "rewards/margins": 15.665544509887695, "rewards/rejected": -15.909663200378418, "step": 2430 }, { "epoch": 0.61, "grad_norm": 0.01141357421875, "learning_rate": 1.9859791755478453e-06, "logits/chosen": -2.2081665992736816, "logits/rejected": -2.4285478591918945, "logps/chosen": -36.21527099609375, "logps/rejected": -1296.198486328125, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -0.14344918727874756, "rewards/margins": 12.341351509094238, "rewards/rejected": -12.484800338745117, "step": 2440 }, { "epoch": 0.61, "grad_norm": 0.2353515625, "learning_rate": 1.9646662731377737e-06, "logits/chosen": -2.157654285430908, "logits/rejected": -2.3908090591430664, "logps/chosen": -45.09668731689453, "logps/rejected": -1404.2435302734375, "loss": 0.0089, "rewards/accuracies": 1.0, "rewards/chosen": -0.23492522537708282, "rewards/margins": 13.305872917175293, "rewards/rejected": -13.540797233581543, "step": 2450 }, { "epoch": 0.61, "grad_norm": 0.01287841796875, "learning_rate": 1.9433940706026743e-06, "logits/chosen": -2.1844208240509033, "logits/rejected": -2.438828468322754, "logps/chosen": -47.74811935424805, "logps/rejected": -1656.3568115234375, "loss": 0.0045, "rewards/accuracies": 1.0, "rewards/chosen": -0.2575169503688812, "rewards/margins": 15.76098346710205, "rewards/rejected": -16.01849937438965, "step": 2460 }, { "epoch": 0.62, "grad_norm": 0.000972747802734375, "learning_rate": 1.9221641852065807e-06, "logits/chosen": -2.18261456489563, "logits/rejected": -2.4000496864318848, "logps/chosen": -44.85232925415039, "logps/rejected": -1401.519775390625, "loss": 0.0039, "rewards/accuracies": 1.0, "rewards/chosen": -0.22903266549110413, "rewards/margins": 13.285311698913574, "rewards/rejected": -13.51434326171875, "step": 2470 }, { "epoch": 0.62, "grad_norm": 0.06201171875, "learning_rate": 1.9009782309962805e-06, "logits/chosen": -2.281862497329712, "logits/rejected": -2.5180306434631348, "logps/chosen": -35.415897369384766, "logps/rejected": -1375.5728759765625, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -0.13269564509391785, "rewards/margins": 13.11363697052002, "rewards/rejected": -13.246332168579102, "step": 2480 }, { "epoch": 0.62, "grad_norm": 0.09423828125, "learning_rate": 1.8798378186785979e-06, "logits/chosen": -2.2361299991607666, "logits/rejected": -2.4721415042877197, "logps/chosen": -30.1846923828125, "logps/rejected": -1444.7518310546875, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": -0.0831189677119255, "rewards/margins": 13.8583402633667, "rewards/rejected": -13.941459655761719, "step": 2490 }, { "epoch": 0.62, "grad_norm": 0.035400390625, "learning_rate": 1.8587445554979404e-06, "logits/chosen": -2.073253870010376, "logits/rejected": -2.3244481086730957, "logps/chosen": -36.15102005004883, "logps/rejected": -1567.7164306640625, "loss": 0.0049, "rewards/accuracies": 1.0, "rewards/chosen": -0.1387818604707718, "rewards/margins": 14.997014045715332, "rewards/rejected": -15.135795593261719, "step": 2500 }, { "epoch": 0.63, "grad_norm": 0.0006103515625, "learning_rate": 1.8377000451141013e-06, "logits/chosen": -2.120227336883545, "logits/rejected": -2.379242420196533, "logps/chosen": -42.131507873535156, "logps/rejected": -1565.419189453125, "loss": 0.0073, "rewards/accuracies": 1.0, "rewards/chosen": -0.207178995013237, "rewards/margins": 14.909128189086914, "rewards/rejected": -15.116305351257324, "step": 2510 }, { "epoch": 0.63, "grad_norm": 0.0927734375, "learning_rate": 1.8167058874803405e-06, "logits/chosen": -2.234502077102661, "logits/rejected": -2.4847466945648193, "logps/chosen": -42.673118591308594, "logps/rejected": -1594.222900390625, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": -0.2050865888595581, "rewards/margins": 15.173876762390137, "rewards/rejected": -15.3789644241333, "step": 2520 }, { "epoch": 0.63, "grad_norm": 0.138671875, "learning_rate": 1.7957636787217451e-06, "logits/chosen": -2.1729538440704346, "logits/rejected": -2.4276270866394043, "logps/chosen": -26.112987518310547, "logps/rejected": -1523.0291748046875, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -0.044597070664167404, "rewards/margins": 14.675150871276855, "rewards/rejected": -14.719749450683594, "step": 2530 }, { "epoch": 0.63, "grad_norm": 0.0322265625, "learning_rate": 1.7748750110138768e-06, "logits/chosen": -2.106745481491089, "logits/rejected": -2.3529787063598633, "logps/chosen": -38.04988098144531, "logps/rejected": -1700.769287109375, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": -0.15463842451572418, "rewards/margins": 16.276233673095703, "rewards/rejected": -16.430871963500977, "step": 2540 }, { "epoch": 0.64, "grad_norm": 0.0849609375, "learning_rate": 1.7540414724617282e-06, "logits/chosen": -2.070836067199707, "logits/rejected": -2.3102221488952637, "logps/chosen": -41.751487731933594, "logps/rejected": -1488.04931640625, "loss": 0.0045, "rewards/accuracies": 1.0, "rewards/chosen": -0.19269555807113647, "rewards/margins": 14.147076606750488, "rewards/rejected": -14.33977222442627, "step": 2550 }, { "epoch": 0.64, "grad_norm": 0.00946044921875, "learning_rate": 1.7332646469789827e-06, "logits/chosen": -2.2572789192199707, "logits/rejected": -2.481287956237793, "logps/chosen": -29.41888427734375, "logps/rejected": -1229.242431640625, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -0.07652698457241058, "rewards/margins": 11.74826717376709, "rewards/rejected": -11.824793815612793, "step": 2560 }, { "epoch": 0.64, "grad_norm": 0.0830078125, "learning_rate": 1.7125461141675881e-06, "logits/chosen": -2.1423022747039795, "logits/rejected": -2.3926641941070557, "logps/chosen": -30.7061710357666, "logps/rejected": -1465.9014892578125, "loss": 0.006, "rewards/accuracies": 1.0, "rewards/chosen": -0.09345726668834686, "rewards/margins": 14.04102897644043, "rewards/rejected": -14.134485244750977, "step": 2570 }, { "epoch": 0.64, "grad_norm": 0.00014209747314453125, "learning_rate": 1.6918874491976744e-06, "logits/chosen": -2.290851354598999, "logits/rejected": -2.5240445137023926, "logps/chosen": -36.6445198059082, "logps/rejected": -1480.7818603515625, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -0.149479478597641, "rewards/margins": 14.131547927856445, "rewards/rejected": -14.281025886535645, "step": 2580 }, { "epoch": 0.65, "grad_norm": 0.002655029296875, "learning_rate": 1.6712902226877917e-06, "logits/chosen": -2.1575067043304443, "logits/rejected": -2.402039051055908, "logps/chosen": -45.549842834472656, "logps/rejected": -1545.114990234375, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": -0.23408398032188416, "rewards/margins": 14.681310653686523, "rewards/rejected": -14.91539478302002, "step": 2590 }, { "epoch": 0.65, "grad_norm": 0.0028839111328125, "learning_rate": 1.6507560005854977e-06, "logits/chosen": -2.066991090774536, "logits/rejected": -2.3206119537353516, "logps/chosen": -47.11815643310547, "logps/rejected": -1413.30126953125, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -0.24164950847625732, "rewards/margins": 13.29053020477295, "rewards/rejected": -13.532180786132812, "step": 2600 }, { "epoch": 0.65, "grad_norm": 0.03759765625, "learning_rate": 1.6302863440483121e-06, "logits/chosen": -2.1091551780700684, "logits/rejected": -2.394484043121338, "logps/chosen": -54.07494354248047, "logps/rejected": -1674.350341796875, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": -0.3039209842681885, "rewards/margins": 15.897099494934082, "rewards/rejected": -16.201021194458008, "step": 2610 }, { "epoch": 0.65, "grad_norm": 0.035400390625, "learning_rate": 1.6098828093250203e-06, "logits/chosen": -2.0393662452697754, "logits/rejected": -2.2912774085998535, "logps/chosen": -43.22583770751953, "logps/rejected": -1745.9964599609375, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": -0.20912513136863708, "rewards/margins": 16.64576530456543, "rewards/rejected": -16.85489273071289, "step": 2620 }, { "epoch": 0.66, "grad_norm": 0.038330078125, "learning_rate": 1.5895469476373545e-06, "logits/chosen": -2.12833833694458, "logits/rejected": -2.353044033050537, "logps/chosen": -51.28118133544922, "logps/rejected": -1477.751708984375, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -0.2927466034889221, "rewards/margins": 13.93517017364502, "rewards/rejected": -14.227917671203613, "step": 2630 }, { "epoch": 0.66, "grad_norm": 0.002105712890625, "learning_rate": 1.5692803050620642e-06, "logits/chosen": -2.146883726119995, "logits/rejected": -2.3877830505371094, "logps/chosen": -42.891048431396484, "logps/rejected": -1572.1334228515625, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -0.20707789063453674, "rewards/margins": 14.972661018371582, "rewards/rejected": -15.17973804473877, "step": 2640 }, { "epoch": 0.66, "grad_norm": 0.05029296875, "learning_rate": 1.5490844224133717e-06, "logits/chosen": -2.2065834999084473, "logits/rejected": -2.4583041667938232, "logps/chosen": -58.87604522705078, "logps/rejected": -1606.8402099609375, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -0.3703632652759552, "rewards/margins": 15.166918754577637, "rewards/rejected": -15.53728199005127, "step": 2650 }, { "epoch": 0.66, "grad_norm": 0.06298828125, "learning_rate": 1.528960835125822e-06, "logits/chosen": -2.3619742393493652, "logits/rejected": -2.5886929035186768, "logps/chosen": -47.88628005981445, "logps/rejected": -1394.3492431640625, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -0.2605898976325989, "rewards/margins": 13.201173782348633, "rewards/rejected": -13.461764335632324, "step": 2660 }, { "epoch": 0.67, "grad_norm": 0.373046875, "learning_rate": 1.5089110731375568e-06, "logits/chosen": -2.1769912242889404, "logits/rejected": -2.4125704765319824, "logps/chosen": -54.75007247924805, "logps/rejected": -1521.18310546875, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": -0.32575544714927673, "rewards/margins": 14.370445251464844, "rewards/rejected": -14.696202278137207, "step": 2670 }, { "epoch": 0.67, "grad_norm": 0.080078125, "learning_rate": 1.4889366607739925e-06, "logits/chosen": -2.322796583175659, "logits/rejected": -2.5181009769439697, "logps/chosen": -45.69524383544922, "logps/rejected": -1201.08251953125, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -0.24491195380687714, "rewards/margins": 11.296446800231934, "rewards/rejected": -11.541359901428223, "step": 2680 }, { "epoch": 0.67, "grad_norm": 0.03076171875, "learning_rate": 1.4690391166319307e-06, "logits/chosen": -2.1181106567382812, "logits/rejected": -2.3545029163360596, "logps/chosen": -43.6742057800293, "logps/rejected": -1542.8385009765625, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -0.21549615263938904, "rewards/margins": 14.664543151855469, "rewards/rejected": -14.880040168762207, "step": 2690 }, { "epoch": 0.67, "grad_norm": 0.5078125, "learning_rate": 1.4492199534641055e-06, "logits/chosen": -2.21667218208313, "logits/rejected": -2.4625155925750732, "logps/chosen": -47.34065628051758, "logps/rejected": -1472.347900390625, "loss": 0.0059, "rewards/accuracies": 1.0, "rewards/chosen": -0.25726571679115295, "rewards/margins": 13.980672836303711, "rewards/rejected": -14.237937927246094, "step": 2700 }, { "epoch": 0.68, "grad_norm": 0.0111083984375, "learning_rate": 1.429480678064174e-06, "logits/chosen": -2.2022199630737305, "logits/rejected": -2.4795124530792236, "logps/chosen": -51.5767822265625, "logps/rejected": -1817.763671875, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -0.2908291518688202, "rewards/margins": 17.308137893676758, "rewards/rejected": -17.598966598510742, "step": 2710 }, { "epoch": 0.68, "grad_norm": 0.76171875, "learning_rate": 1.4098227911521523e-06, "logits/chosen": -2.219804286956787, "logits/rejected": -2.462226390838623, "logps/chosen": -46.08092498779297, "logps/rejected": -1526.5423583984375, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": -0.23279635608196259, "rewards/margins": 14.499191284179688, "rewards/rejected": -14.731986999511719, "step": 2720 }, { "epoch": 0.68, "grad_norm": 0.058349609375, "learning_rate": 1.3902477872603295e-06, "logits/chosen": -2.319612503051758, "logits/rejected": -2.517526149749756, "logps/chosen": -40.09135055541992, "logps/rejected": -1286.315185546875, "loss": 0.005, "rewards/accuracies": 1.0, "rewards/chosen": -0.18802298605442047, "rewards/margins": 12.167932510375977, "rewards/rejected": -12.355955123901367, "step": 2730 }, { "epoch": 0.68, "grad_norm": 0.0859375, "learning_rate": 1.370757154619638e-06, "logits/chosen": -2.2395832538604736, "logits/rejected": -2.470933198928833, "logps/chosen": -56.124351501464844, "logps/rejected": -1607.262939453125, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": -0.33287256956100464, "rewards/margins": 15.186471939086914, "rewards/rejected": -15.519342422485352, "step": 2740 }, { "epoch": 0.69, "grad_norm": 8.761882781982422e-06, "learning_rate": 1.3513523750465049e-06, "logits/chosen": -2.2328319549560547, "logits/rejected": -2.4625821113586426, "logps/chosen": -39.597564697265625, "logps/rejected": -1417.1002197265625, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": -0.17806950211524963, "rewards/margins": 13.474327087402344, "rewards/rejected": -13.652397155761719, "step": 2750 }, { "epoch": 0.69, "grad_norm": 0.0030517578125, "learning_rate": 1.332034923830199e-06, "logits/chosen": -2.136444568634033, "logits/rejected": -2.3981611728668213, "logps/chosen": -44.04825210571289, "logps/rejected": -1527.0830078125, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -0.22466015815734863, "rewards/margins": 14.532748222351074, "rewards/rejected": -14.757411003112793, "step": 2760 }, { "epoch": 0.69, "grad_norm": 0.130859375, "learning_rate": 1.31280626962067e-06, "logits/chosen": -2.2737619876861572, "logits/rejected": -2.488204002380371, "logps/chosen": -49.427528381347656, "logps/rejected": -1363.083740234375, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -0.27385297417640686, "rewards/margins": 12.863523483276367, "rewards/rejected": -13.137374877929688, "step": 2770 }, { "epoch": 0.69, "grad_norm": 0.0162353515625, "learning_rate": 1.2936678743168813e-06, "logits/chosen": -2.2063140869140625, "logits/rejected": -2.450613498687744, "logps/chosen": -47.45142364501953, "logps/rejected": -1494.2115478515625, "loss": 0.0048, "rewards/accuracies": 1.0, "rewards/chosen": -0.2554013133049011, "rewards/margins": 14.182083129882812, "rewards/rejected": -14.437482833862305, "step": 2780 }, { "epoch": 0.7, "grad_norm": 0.06298828125, "learning_rate": 1.2746211929556777e-06, "logits/chosen": -2.171708583831787, "logits/rejected": -2.4892578125, "logps/chosen": -47.59801483154297, "logps/rejected": -1927.90234375, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -0.25736138224601746, "rewards/margins": 18.46487808227539, "rewards/rejected": -18.722238540649414, "step": 2790 }, { "epoch": 0.7, "grad_norm": 5.900859832763672e-06, "learning_rate": 1.2556676736011558e-06, "logits/chosen": -2.200247287750244, "logits/rejected": -2.433065891265869, "logps/chosen": -47.600555419921875, "logps/rejected": -1647.335693359375, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": -0.25239020586013794, "rewards/margins": 15.647438049316406, "rewards/rejected": -15.89982795715332, "step": 2800 }, { "epoch": 0.7, "grad_norm": 0.0279541015625, "learning_rate": 1.2368087572345772e-06, "logits/chosen": -2.235849380493164, "logits/rejected": -2.4413654804229736, "logps/chosen": -48.26280975341797, "logps/rejected": -1258.86572265625, "loss": 0.005, "rewards/accuracies": 1.0, "rewards/chosen": -0.26613086462020874, "rewards/margins": 11.844823837280273, "rewards/rejected": -12.110954284667969, "step": 2810 }, { "epoch": 0.7, "grad_norm": 1.2734375, "learning_rate": 1.2180458776448067e-06, "logits/chosen": -2.187344551086426, "logits/rejected": -2.4354748725891113, "logps/chosen": -40.0956916809082, "logps/rejected": -1655.0335693359375, "loss": 0.0064, "rewards/accuracies": 1.0, "rewards/chosen": -0.18102389574050903, "rewards/margins": 15.813700675964355, "rewards/rejected": -15.994723320007324, "step": 2820 }, { "epoch": 0.71, "grad_norm": 0.0234375, "learning_rate": 1.1993804613193158e-06, "logits/chosen": -2.18884539604187, "logits/rejected": -2.4379589557647705, "logps/chosen": -64.91096496582031, "logps/rejected": -1493.458740234375, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -0.4194706082344055, "rewards/margins": 13.966886520385742, "rewards/rejected": -14.386357307434082, "step": 2830 }, { "epoch": 0.71, "grad_norm": 1.3649463653564453e-05, "learning_rate": 1.1808139273357232e-06, "logits/chosen": -2.1439809799194336, "logits/rejected": -2.3814640045166016, "logps/chosen": -47.53407669067383, "logps/rejected": -1624.342529296875, "loss": 0.0067, "rewards/accuracies": 1.0, "rewards/chosen": -0.25394895672798157, "rewards/margins": 15.44206714630127, "rewards/rejected": -15.696017265319824, "step": 2840 }, { "epoch": 0.71, "grad_norm": 0.0003833770751953125, "learning_rate": 1.1623476872539108e-06, "logits/chosen": -2.1601688861846924, "logits/rejected": -2.4301838874816895, "logps/chosen": -46.782203674316406, "logps/rejected": -1727.318115234375, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -0.24715037643909454, "rewards/margins": 16.47604751586914, "rewards/rejected": -16.72319984436035, "step": 2850 }, { "epoch": 0.71, "grad_norm": 0.0908203125, "learning_rate": 1.1439831450087032e-06, "logits/chosen": -2.204617738723755, "logits/rejected": -2.4746241569519043, "logps/chosen": -70.88643646240234, "logps/rejected": -1754.577392578125, "loss": 0.0066, "rewards/accuracies": 1.0, "rewards/chosen": -0.48623570799827576, "rewards/margins": 16.506263732910156, "rewards/rejected": -16.992502212524414, "step": 2860 }, { "epoch": 0.72, "grad_norm": 0.68359375, "learning_rate": 1.1257216968031357e-06, "logits/chosen": -2.172727108001709, "logits/rejected": -2.4189679622650146, "logps/chosen": -53.47440719604492, "logps/rejected": -1510.17333984375, "loss": 0.0042, "rewards/accuracies": 1.0, "rewards/chosen": -0.3195067048072815, "rewards/margins": 14.29955768585205, "rewards/rejected": -14.619064331054688, "step": 2870 }, { "epoch": 0.72, "grad_norm": 0.0003604888916015625, "learning_rate": 1.1075647310022974e-06, "logits/chosen": -2.324207305908203, "logits/rejected": -2.5505588054656982, "logps/chosen": -48.77964401245117, "logps/rejected": -1276.3153076171875, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -0.2692165970802307, "rewards/margins": 12.017995834350586, "rewards/rejected": -12.287213325500488, "step": 2880 }, { "epoch": 0.72, "grad_norm": 0.00640869140625, "learning_rate": 1.0895136280277863e-06, "logits/chosen": -2.1405930519104004, "logits/rejected": -2.389354705810547, "logps/chosen": -52.074989318847656, "logps/rejected": -1792.354248046875, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": -0.2989426851272583, "rewards/margins": 17.034626007080078, "rewards/rejected": -17.33357048034668, "step": 2890 }, { "epoch": 0.72, "grad_norm": 0.0634765625, "learning_rate": 1.0715697602527542e-06, "logits/chosen": -2.0093884468078613, "logits/rejected": -2.2803351879119873, "logps/chosen": -60.875953674316406, "logps/rejected": -1733.0355224609375, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": -0.3902908265590668, "rewards/margins": 16.359115600585938, "rewards/rejected": -16.749406814575195, "step": 2900 }, { "epoch": 0.73, "grad_norm": 0.10546875, "learning_rate": 1.0537344918975708e-06, "logits/chosen": -2.2281734943389893, "logits/rejected": -2.414677381515503, "logps/chosen": -56.65943145751953, "logps/rejected": -1399.209228515625, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -0.3390200734138489, "rewards/margins": 13.123127937316895, "rewards/rejected": -13.46214771270752, "step": 2910 }, { "epoch": 0.73, "grad_norm": 3.910064697265625e-05, "learning_rate": 1.036009178926107e-06, "logits/chosen": -2.1897904872894287, "logits/rejected": -2.426058530807495, "logps/chosen": -48.45244216918945, "logps/rejected": -1510.665771484375, "loss": 0.0036, "rewards/accuracies": 1.0, "rewards/chosen": -0.26533347368240356, "rewards/margins": 14.326631546020508, "rewards/rejected": -14.59196662902832, "step": 2920 }, { "epoch": 0.73, "grad_norm": 0.02099609375, "learning_rate": 1.0183951689426438e-06, "logits/chosen": -2.1068902015686035, "logits/rejected": -2.3621068000793457, "logps/chosen": -49.524208068847656, "logps/rejected": -1805.392333984375, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": -0.2760918438434601, "rewards/margins": 17.226123809814453, "rewards/rejected": -17.502214431762695, "step": 2930 }, { "epoch": 0.73, "grad_norm": 0.109375, "learning_rate": 1.0008938010894156e-06, "logits/chosen": -2.0732312202453613, "logits/rejected": -2.359827756881714, "logps/chosen": -49.63148880004883, "logps/rejected": -1718.0699462890625, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -0.2772377133369446, "rewards/margins": 16.361858367919922, "rewards/rejected": -16.639095306396484, "step": 2940 }, { "epoch": 0.74, "grad_norm": 0.169921875, "learning_rate": 9.83506405944804e-07, "logits/chosen": -2.0447497367858887, "logits/rejected": -2.2771499156951904, "logps/chosen": -39.81549072265625, "logps/rejected": -1609.721923828125, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -0.17990897595882416, "rewards/margins": 15.340120315551758, "rewards/rejected": -15.520029067993164, "step": 2950 }, { "epoch": 0.74, "grad_norm": 9.894371032714844e-06, "learning_rate": 9.662343054221743e-07, "logits/chosen": -2.053480625152588, "logits/rejected": -2.3034961223602295, "logps/chosen": -49.807559967041016, "logps/rejected": -1805.352294921875, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -0.2724885642528534, "rewards/margins": 17.18954849243164, "rewards/rejected": -17.462038040161133, "step": 2960 }, { "epoch": 0.74, "grad_norm": 0.08349609375, "learning_rate": 9.490788126693754e-07, "logits/chosen": -2.081925868988037, "logits/rejected": -2.337897777557373, "logps/chosen": -39.89293670654297, "logps/rejected": -1609.5567626953125, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -0.18545950949192047, "rewards/margins": 15.35168170928955, "rewards/rejected": -15.537139892578125, "step": 2970 }, { "epoch": 0.74, "grad_norm": 0.007781982421875, "learning_rate": 9.32041231968904e-07, "logits/chosen": -2.1510047912597656, "logits/rejected": -2.4004898071289062, "logps/chosen": -42.21772766113281, "logps/rejected": -1647.2308349609375, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -0.20343096554279327, "rewards/margins": 15.7313232421875, "rewards/rejected": -15.93475341796875, "step": 2980 }, { "epoch": 0.75, "grad_norm": 0.030029296875, "learning_rate": 9.151228586387464e-07, "logits/chosen": -2.2137999534606934, "logits/rejected": -2.4432384967803955, "logps/chosen": -46.389976501464844, "logps/rejected": -1448.36328125, "loss": 0.0055, "rewards/accuracies": 1.0, "rewards/chosen": -0.24593684077262878, "rewards/margins": 13.737565994262695, "rewards/rejected": -13.983503341674805, "step": 2990 }, { "epoch": 0.75, "grad_norm": 0.19921875, "learning_rate": 8.983249789338941e-07, "logits/chosen": -2.1793341636657715, "logits/rejected": -2.4067111015319824, "logps/chosen": -53.83050537109375, "logps/rejected": -1414.4827880859375, "loss": 0.0039, "rewards/accuracies": 1.0, "rewards/chosen": -0.3225783705711365, "rewards/margins": 13.334991455078125, "rewards/rejected": -13.657569885253906, "step": 3000 }, { "epoch": 0.75, "eval_logits/chosen": -2.61519455909729, "eval_logits/rejected": -2.734154462814331, "eval_logps/chosen": -62.61328125, "eval_logps/rejected": -755.4296264648438, "eval_loss": 0.002784780925139785, "eval_rewards/accuracies": 1.0, "eval_rewards/chosen": -0.36699774861335754, "eval_rewards/margins": 6.738577365875244, "eval_rewards/rejected": -7.1055755615234375, "eval_runtime": 0.6552, "eval_samples_per_second": 7.632, "eval_steps_per_second": 4.579, "step": 3000 }, { "epoch": 0.75, "grad_norm": 0.0478515625, "learning_rate": 8.816488699485593e-07, "logits/chosen": -2.2049620151519775, "logits/rejected": -2.431889057159424, "logps/chosen": -43.32733917236328, "logps/rejected": -1466.1322021484375, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -0.21807484328746796, "rewards/margins": 13.928044319152832, "rewards/rejected": -14.146120071411133, "step": 3010 }, { "epoch": 0.75, "grad_norm": 0.00012302398681640625, "learning_rate": 8.650957995190784e-07, "logits/chosen": -2.168497085571289, "logits/rejected": -2.439462661743164, "logps/chosen": -43.25156784057617, "logps/rejected": -1766.5902099609375, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -0.20799896121025085, "rewards/margins": 16.896240234375, "rewards/rejected": -17.104238510131836, "step": 3020 }, { "epoch": 0.76, "grad_norm": 0.0830078125, "learning_rate": 8.486670261275193e-07, "logits/chosen": -2.28559947013855, "logits/rejected": -2.535123348236084, "logps/chosen": -46.22868347167969, "logps/rejected": -1491.29638671875, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -0.24624836444854736, "rewards/margins": 14.186907768249512, "rewards/rejected": -14.43315601348877, "step": 3030 }, { "epoch": 0.76, "grad_norm": 0.01068115234375, "learning_rate": 8.32363798806011e-07, "logits/chosen": -2.2580156326293945, "logits/rejected": -2.499662160873413, "logps/chosen": -43.39426803588867, "logps/rejected": -1570.83154296875, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": -0.2131904661655426, "rewards/margins": 14.96166706085205, "rewards/rejected": -15.174858093261719, "step": 3040 }, { "epoch": 0.76, "grad_norm": 0.0703125, "learning_rate": 8.161873570417742e-07, "logits/chosen": -2.205913543701172, "logits/rejected": -2.461812973022461, "logps/chosen": -51.799095153808594, "logps/rejected": -1628.696044921875, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -0.2980988919734955, "rewards/margins": 15.454202651977539, "rewards/rejected": -15.752302169799805, "step": 3050 }, { "epoch": 0.76, "grad_norm": 0.0263671875, "learning_rate": 8.001389306828897e-07, "logits/chosen": -2.1009681224823, "logits/rejected": -2.3735690116882324, "logps/chosen": -57.14350128173828, "logps/rejected": -1912.2584228515625, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": -0.3501451313495636, "rewards/margins": 18.137147903442383, "rewards/rejected": -18.487293243408203, "step": 3060 }, { "epoch": 0.77, "grad_norm": 2.002716064453125e-05, "learning_rate": 7.842197398447993e-07, "logits/chosen": -2.145404100418091, "logits/rejected": -2.3879191875457764, "logps/chosen": -46.874935150146484, "logps/rejected": -1601.4229736328125, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": -0.25245028734207153, "rewards/margins": 15.218485832214355, "rewards/rejected": -15.470934867858887, "step": 3070 }, { "epoch": 0.77, "grad_norm": 0.0014801025390625, "learning_rate": 7.684309948175414e-07, "logits/chosen": -2.1167359352111816, "logits/rejected": -2.340625047683716, "logps/chosen": -41.6113395690918, "logps/rejected": -1543.5413818359375, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -0.19553497433662415, "rewards/margins": 14.705103874206543, "rewards/rejected": -14.900639533996582, "step": 3080 }, { "epoch": 0.77, "grad_norm": 0.0126953125, "learning_rate": 7.527738959737371e-07, "logits/chosen": -2.1751418113708496, "logits/rejected": -2.421253204345703, "logps/chosen": -55.10563278198242, "logps/rejected": -1535.4105224609375, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -0.33046358823776245, "rewards/margins": 14.514165878295898, "rewards/rejected": -14.844629287719727, "step": 3090 }, { "epoch": 0.77, "grad_norm": 0.1435546875, "learning_rate": 7.372496336773269e-07, "logits/chosen": -2.143078565597534, "logits/rejected": -2.3641624450683594, "logps/chosen": -44.76749038696289, "logps/rejected": -1385.771728515625, "loss": 0.0058, "rewards/accuracies": 1.0, "rewards/chosen": -0.23040492832660675, "rewards/margins": 13.126627922058105, "rewards/rejected": -13.357030868530273, "step": 3100 }, { "epoch": 0.78, "grad_norm": 0.1650390625, "learning_rate": 7.218593881930744e-07, "logits/chosen": -2.237316370010376, "logits/rejected": -2.46457839012146, "logps/chosen": -43.913902282714844, "logps/rejected": -1421.6866455078125, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": -0.21920108795166016, "rewards/margins": 13.50879955291748, "rewards/rejected": -13.728001594543457, "step": 3110 }, { "epoch": 0.78, "grad_norm": 0.0218505859375, "learning_rate": 7.066043295968342e-07, "logits/chosen": -2.2042956352233887, "logits/rejected": -2.437238931655884, "logps/chosen": -38.968666076660156, "logps/rejected": -1539.2254638671875, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -0.1654917150735855, "rewards/margins": 14.6710786819458, "rewards/rejected": -14.836568832397461, "step": 3120 }, { "epoch": 0.78, "grad_norm": 0.00147247314453125, "learning_rate": 6.914856176865891e-07, "logits/chosen": -2.2930877208709717, "logits/rejected": -2.530980110168457, "logps/chosen": -39.30299377441406, "logps/rejected": -1486.0648193359375, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": -0.18379421532154083, "rewards/margins": 14.16865348815918, "rewards/rejected": -14.352447509765625, "step": 3130 }, { "epoch": 0.78, "grad_norm": 0.0001888275146484375, "learning_rate": 6.765044018942804e-07, "logits/chosen": -2.2794032096862793, "logits/rejected": -2.5219268798828125, "logps/chosen": -37.822265625, "logps/rejected": -1375.9476318359375, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -0.16687843203544617, "rewards/margins": 13.096723556518555, "rewards/rejected": -13.263601303100586, "step": 3140 }, { "epoch": 0.79, "grad_norm": 0.0673828125, "learning_rate": 6.616618211984169e-07, "logits/chosen": -2.189056873321533, "logits/rejected": -2.428335666656494, "logps/chosen": -45.38810348510742, "logps/rejected": -1504.1986083984375, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -0.24011921882629395, "rewards/margins": 14.305384635925293, "rewards/rejected": -14.545504570007324, "step": 3150 }, { "epoch": 0.79, "grad_norm": 0.0289306640625, "learning_rate": 6.469590040374799e-07, "logits/chosen": -2.135713815689087, "logits/rejected": -2.3790910243988037, "logps/chosen": -32.32978057861328, "logps/rejected": -1641.7115478515625, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -0.10296233743429184, "rewards/margins": 15.768649101257324, "rewards/rejected": -15.871612548828125, "step": 3160 }, { "epoch": 0.79, "grad_norm": 0.050537109375, "learning_rate": 6.32397068224136e-07, "logits/chosen": -2.248927593231201, "logits/rejected": -2.501868963241577, "logps/chosen": -40.18678283691406, "logps/rejected": -1579.708251953125, "loss": 0.0054, "rewards/accuracies": 1.0, "rewards/chosen": -0.18390187621116638, "rewards/margins": 15.054614067077637, "rewards/rejected": -15.238515853881836, "step": 3170 }, { "epoch": 0.79, "grad_norm": 0.1005859375, "learning_rate": 6.17977120860249e-07, "logits/chosen": -2.2377326488494873, "logits/rejected": -2.4842400550842285, "logps/chosen": -68.27392578125, "logps/rejected": -1504.4427490234375, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": -0.4537542462348938, "rewards/margins": 14.08979606628418, "rewards/rejected": -14.543548583984375, "step": 3180 }, { "epoch": 0.8, "grad_norm": 0.1435546875, "learning_rate": 6.037002582527121e-07, "logits/chosen": -2.17307710647583, "logits/rejected": -2.4036478996276855, "logps/chosen": -38.560646057128906, "logps/rejected": -1505.03662109375, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -0.169607013463974, "rewards/margins": 14.335187911987305, "rewards/rejected": -14.504794120788574, "step": 3190 }, { "epoch": 0.8, "grad_norm": 0.07666015625, "learning_rate": 5.895675658300981e-07, "logits/chosen": -2.3447728157043457, "logits/rejected": -2.5695431232452393, "logps/chosen": -52.10234451293945, "logps/rejected": -1275.713134765625, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": -0.3004547953605652, "rewards/margins": 11.987954139709473, "rewards/rejected": -12.288411140441895, "step": 3200 }, { "epoch": 0.8, "grad_norm": 0.140625, "learning_rate": 5.755801180601381e-07, "logits/chosen": -2.2320406436920166, "logits/rejected": -2.4947474002838135, "logps/chosen": -46.99077224731445, "logps/rejected": -1553.0062255859375, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -0.25020334124565125, "rewards/margins": 14.758695602416992, "rewards/rejected": -15.008898735046387, "step": 3210 }, { "epoch": 0.8, "grad_norm": 0.1240234375, "learning_rate": 5.617389783680307e-07, "logits/chosen": -2.0936381816864014, "logits/rejected": -2.3752357959747314, "logps/chosen": -44.832740783691406, "logps/rejected": -1831.029296875, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -0.22681677341461182, "rewards/margins": 17.518943786621094, "rewards/rejected": -17.74576187133789, "step": 3220 }, { "epoch": 0.81, "grad_norm": 0.197265625, "learning_rate": 5.48045199055596e-07, "logits/chosen": -2.19124174118042, "logits/rejected": -2.438732624053955, "logps/chosen": -44.39277648925781, "logps/rejected": -1470.6522216796875, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": -0.22872138023376465, "rewards/margins": 13.976783752441406, "rewards/rejected": -14.205507278442383, "step": 3230 }, { "epoch": 0.81, "grad_norm": 7.264316082000732e-07, "learning_rate": 5.344998212212704e-07, "logits/chosen": -2.103717565536499, "logits/rejected": -2.3787877559661865, "logps/chosen": -46.40209197998047, "logps/rejected": -1813.8245849609375, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -0.23586265742778778, "rewards/margins": 17.30778694152832, "rewards/rejected": -17.543649673461914, "step": 3240 }, { "epoch": 0.81, "grad_norm": 0.01300048828125, "learning_rate": 5.211038746809551e-07, "logits/chosen": -2.2235634326934814, "logits/rejected": -2.4578189849853516, "logps/chosen": -50.56513595581055, "logps/rejected": -1465.8817138671875, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": -0.29284316301345825, "rewards/margins": 13.869397163391113, "rewards/rejected": -14.162240982055664, "step": 3250 }, { "epoch": 0.81, "grad_norm": 0.1123046875, "learning_rate": 5.078583778897216e-07, "logits/chosen": -2.2172188758850098, "logits/rejected": -2.4327051639556885, "logps/chosen": -58.36212158203125, "logps/rejected": -1398.1490478515625, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": -0.36190560460090637, "rewards/margins": 13.11742877960205, "rewards/rejected": -13.479333877563477, "step": 3260 }, { "epoch": 0.82, "grad_norm": 0.11962890625, "learning_rate": 4.94764337864384e-07, "logits/chosen": -2.304565668106079, "logits/rejected": -2.5306572914123535, "logps/chosen": -43.99140548706055, "logps/rejected": -1435.986572265625, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -0.2204219549894333, "rewards/margins": 13.629618644714355, "rewards/rejected": -13.8500394821167, "step": 3270 }, { "epoch": 0.82, "grad_norm": 0.0184326171875, "learning_rate": 4.818227501069328e-07, "logits/chosen": -2.259232521057129, "logits/rejected": -2.5605130195617676, "logps/chosen": -62.873863220214844, "logps/rejected": -1876.69140625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -0.39689213037490845, "rewards/margins": 17.804046630859375, "rewards/rejected": -18.200939178466797, "step": 3280 }, { "epoch": 0.82, "grad_norm": 0.0203857421875, "learning_rate": 4.690345985288572e-07, "logits/chosen": -2.158508777618408, "logits/rejected": -2.399550437927246, "logps/chosen": -40.623416900634766, "logps/rejected": -1611.9981689453125, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": -0.18651778995990753, "rewards/margins": 15.386564254760742, "rewards/rejected": -15.573080062866211, "step": 3290 }, { "epoch": 0.82, "grad_norm": 1.5273690223693848e-06, "learning_rate": 4.5640085537633633e-07, "logits/chosen": -2.185797691345215, "logits/rejected": -2.462428331375122, "logps/chosen": -64.81330871582031, "logps/rejected": -1722.494384765625, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -0.42417916655540466, "rewards/margins": 16.25531578063965, "rewards/rejected": -16.679494857788086, "step": 3300 }, { "epoch": 0.83, "grad_norm": 5.936622619628906e-05, "learning_rate": 4.439224811563211e-07, "logits/chosen": -2.0893611907958984, "logits/rejected": -2.3352718353271484, "logps/chosen": -42.231239318847656, "logps/rejected": -1722.9840087890625, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": -0.20374104380607605, "rewards/margins": 16.47307586669922, "rewards/rejected": -16.67681884765625, "step": 3310 }, { "epoch": 0.83, "grad_norm": 0.000370025634765625, "learning_rate": 4.316004245635158e-07, "logits/chosen": -2.1728897094726562, "logits/rejected": -2.4218878746032715, "logps/chosen": -47.56116485595703, "logps/rejected": -1730.8961181640625, "loss": 0.0051, "rewards/accuracies": 1.0, "rewards/chosen": -0.25542110204696655, "rewards/margins": 16.49363899230957, "rewards/rejected": -16.74905776977539, "step": 3320 }, { "epoch": 0.83, "grad_norm": 9.1552734375e-05, "learning_rate": 4.194356224082455e-07, "logits/chosen": -2.095263957977295, "logits/rejected": -2.3779187202453613, "logps/chosen": -44.129798889160156, "logps/rejected": -1774.712158203125, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": -0.22583921253681183, "rewards/margins": 16.942506790161133, "rewards/rejected": -17.168346405029297, "step": 3330 }, { "epoch": 0.83, "grad_norm": 0.00445556640625, "learning_rate": 4.074289995452338e-07, "logits/chosen": -2.1644439697265625, "logits/rejected": -2.4039626121520996, "logps/chosen": -55.25883102416992, "logps/rejected": -1481.6717529296875, "loss": 0.0057, "rewards/accuracies": 1.0, "rewards/chosen": -0.3257646858692169, "rewards/margins": 13.985359191894531, "rewards/rejected": -14.311124801635742, "step": 3340 }, { "epoch": 0.84, "grad_norm": 0.00010395050048828125, "learning_rate": 3.9558146880329246e-07, "logits/chosen": -2.1858904361724854, "logits/rejected": -2.422576427459717, "logps/chosen": -38.67001724243164, "logps/rejected": -1623.956298828125, "loss": 0.0039, "rewards/accuracies": 1.0, "rewards/chosen": -0.1640951931476593, "rewards/margins": 15.4973783493042, "rewards/rejected": -15.661474227905273, "step": 3350 }, { "epoch": 0.84, "grad_norm": 0.03369140625, "learning_rate": 3.838939309159187e-07, "logits/chosen": -2.179760694503784, "logits/rejected": -2.4091451168060303, "logps/chosen": -44.970279693603516, "logps/rejected": -1523.0604248046875, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -0.23116175830364227, "rewards/margins": 14.479741096496582, "rewards/rejected": -14.710905075073242, "step": 3360 }, { "epoch": 0.84, "grad_norm": 0.005523681640625, "learning_rate": 3.723672744528162e-07, "logits/chosen": -2.256727695465088, "logits/rejected": -2.5047221183776855, "logps/chosen": -41.15488815307617, "logps/rejected": -1610.76220703125, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -0.19345328211784363, "rewards/margins": 15.386802673339844, "rewards/rejected": -15.580256462097168, "step": 3370 }, { "epoch": 0.84, "grad_norm": 0.00286865234375, "learning_rate": 3.6100237575233647e-07, "logits/chosen": -2.3228962421417236, "logits/rejected": -2.5387914180755615, "logps/chosen": -51.057395935058594, "logps/rejected": -1306.785400390625, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -0.2881939113140106, "rewards/margins": 12.320541381835938, "rewards/rejected": -12.608736038208008, "step": 3380 }, { "epoch": 0.85, "grad_norm": 0.06591796875, "learning_rate": 3.4980009885486054e-07, "logits/chosen": -2.25309157371521, "logits/rejected": -2.4575724601745605, "logps/chosen": -41.59147644042969, "logps/rejected": -1270.520751953125, "loss": 0.0051, "rewards/accuracies": 1.0, "rewards/chosen": -0.20340308547019958, "rewards/margins": 12.038886070251465, "rewards/rejected": -12.242289543151855, "step": 3390 }, { "epoch": 0.85, "grad_norm": 0.09423828125, "learning_rate": 3.3876129543710197e-07, "logits/chosen": -2.2136871814727783, "logits/rejected": -2.4533486366271973, "logps/chosen": -39.60608673095703, "logps/rejected": -1685.295654296875, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -0.1774289309978485, "rewards/margins": 16.114295959472656, "rewards/rejected": -16.291725158691406, "step": 3400 }, { "epoch": 0.85, "grad_norm": 0.002960205078125, "learning_rate": 3.2788680474735687e-07, "logits/chosen": -2.194180488586426, "logits/rejected": -2.439089298248291, "logps/chosen": -38.77867889404297, "logps/rejected": -1473.8935546875, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -0.174292653799057, "rewards/margins": 14.067087173461914, "rewards/rejected": -14.24138069152832, "step": 3410 }, { "epoch": 0.85, "grad_norm": 0.01177978515625, "learning_rate": 3.1717745354170214e-07, "logits/chosen": -2.0905921459198, "logits/rejected": -2.367194175720215, "logps/chosen": -52.29819869995117, "logps/rejected": -1684.010986328125, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -0.3022548258304596, "rewards/margins": 16.003009796142578, "rewards/rejected": -16.30526351928711, "step": 3420 }, { "epoch": 0.86, "grad_norm": 0.002471923828125, "learning_rate": 3.0663405602113727e-07, "logits/chosen": -2.258749485015869, "logits/rejected": -2.5248327255249023, "logps/chosen": -48.49363327026367, "logps/rejected": -1557.198486328125, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -0.2676192820072174, "rewards/margins": 14.78343391418457, "rewards/rejected": -15.051053047180176, "step": 3430 }, { "epoch": 0.86, "grad_norm": 0.021240234375, "learning_rate": 2.9625741376968107e-07, "logits/chosen": -2.0779199600219727, "logits/rejected": -2.3387115001678467, "logps/chosen": -61.849632263183594, "logps/rejected": -1759.1298828125, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -0.3940272629261017, "rewards/margins": 16.581546783447266, "rewards/rejected": -16.975570678710938, "step": 3440 }, { "epoch": 0.86, "grad_norm": 0.0257568359375, "learning_rate": 2.8604831569343324e-07, "logits/chosen": -2.3138976097106934, "logits/rejected": -2.529716968536377, "logps/chosen": -50.220458984375, "logps/rejected": -1407.71826171875, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": -0.2772979736328125, "rewards/margins": 13.286694526672363, "rewards/rejected": -13.563992500305176, "step": 3450 }, { "epoch": 0.86, "grad_norm": 0.0184326171875, "learning_rate": 2.760075379605942e-07, "logits/chosen": -2.144134759902954, "logits/rejected": -2.3687326908111572, "logps/chosen": -49.850528717041016, "logps/rejected": -1546.928955078125, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -0.2831845283508301, "rewards/margins": 14.664377212524414, "rewards/rejected": -14.947561264038086, "step": 3460 }, { "epoch": 0.87, "grad_norm": 0.291015625, "learning_rate": 2.661358439424552e-07, "logits/chosen": -2.209009885787964, "logits/rejected": -2.4345531463623047, "logps/chosen": -46.4788818359375, "logps/rejected": -1371.7965087890625, "loss": 0.0041, "rewards/accuracies": 1.0, "rewards/chosen": -0.24716854095458984, "rewards/margins": 12.960786819458008, "rewards/rejected": -13.207954406738281, "step": 3470 }, { "epoch": 0.87, "grad_norm": 0.005462646484375, "learning_rate": 2.564339841553615e-07, "logits/chosen": -2.200819969177246, "logits/rejected": -2.416544198989868, "logps/chosen": -43.68715286254883, "logps/rejected": -1402.603759765625, "loss": 0.0036, "rewards/accuracies": 1.0, "rewards/chosen": -0.22630712389945984, "rewards/margins": 13.287277221679688, "rewards/rejected": -13.513586044311523, "step": 3480 }, { "epoch": 0.87, "grad_norm": 0.462890625, "learning_rate": 2.469026962036539e-07, "logits/chosen": -2.1682403087615967, "logits/rejected": -2.384089946746826, "logps/chosen": -43.91130447387695, "logps/rejected": -1500.0106201171875, "loss": 0.0071, "rewards/accuracies": 1.0, "rewards/chosen": -0.22072705626487732, "rewards/margins": 14.21442699432373, "rewards/rejected": -14.435153007507324, "step": 3490 }, { "epoch": 0.87, "grad_norm": 0.0478515625, "learning_rate": 2.3754270472358786e-07, "logits/chosen": -2.17598032951355, "logits/rejected": -2.39375638961792, "logps/chosen": -40.356300354003906, "logps/rejected": -1466.735595703125, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": -0.18523935973644257, "rewards/margins": 13.940629959106445, "rewards/rejected": -14.125869750976562, "step": 3500 }, { "epoch": 0.88, "grad_norm": 0.51953125, "learning_rate": 2.283547213282458e-07, "logits/chosen": -2.2732253074645996, "logits/rejected": -2.502781629562378, "logps/chosen": -46.22926330566406, "logps/rejected": -1531.5400390625, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": -0.24311251938343048, "rewards/margins": 14.52368450164795, "rewards/rejected": -14.76679515838623, "step": 3510 }, { "epoch": 0.88, "grad_norm": 0.1328125, "learning_rate": 2.1933944455343166e-07, "logits/chosen": -2.0053231716156006, "logits/rejected": -2.297400951385498, "logps/chosen": -57.17345428466797, "logps/rejected": -1729.294921875, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -0.35318198800086975, "rewards/margins": 16.38203239440918, "rewards/rejected": -16.735218048095703, "step": 3520 }, { "epoch": 0.88, "grad_norm": 0.00653076171875, "learning_rate": 2.104975598045647e-07, "logits/chosen": -2.1619279384613037, "logits/rejected": -2.387904167175293, "logps/chosen": -37.9974250793457, "logps/rejected": -1388.173583984375, "loss": 0.0042, "rewards/accuracies": 1.0, "rewards/chosen": -0.16347357630729675, "rewards/margins": 13.225934982299805, "rewards/rejected": -13.38940715789795, "step": 3530 }, { "epoch": 0.88, "grad_norm": 0.19921875, "learning_rate": 2.018297393045701e-07, "logits/chosen": -2.201099395751953, "logits/rejected": -2.4077112674713135, "logps/chosen": -43.07371139526367, "logps/rejected": -1454.396728515625, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": -0.21469160914421082, "rewards/margins": 13.828866958618164, "rewards/rejected": -14.043559074401855, "step": 3540 }, { "epoch": 0.89, "grad_norm": 0.05859375, "learning_rate": 1.9333664204277236e-07, "logits/chosen": -2.0957770347595215, "logits/rejected": -2.3300156593322754, "logps/chosen": -40.987003326416016, "logps/rejected": -1783.749755859375, "loss": 0.0048, "rewards/accuracies": 1.0, "rewards/chosen": -0.1888490617275238, "rewards/margins": 17.056163787841797, "rewards/rejected": -17.245014190673828, "step": 3550 }, { "epoch": 0.89, "grad_norm": 0.0001068115234375, "learning_rate": 1.8501891372479124e-07, "logits/chosen": -2.1852810382843018, "logits/rejected": -2.4316954612731934, "logps/chosen": -44.98752975463867, "logps/rejected": -1581.531982421875, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -0.22910073399543762, "rewards/margins": 15.049044609069824, "rewards/rejected": -15.278146743774414, "step": 3560 }, { "epoch": 0.89, "grad_norm": 0.115234375, "learning_rate": 1.7687718672345533e-07, "logits/chosen": -2.1352264881134033, "logits/rejected": -2.3711695671081543, "logps/chosen": -51.448890686035156, "logps/rejected": -1699.1624755859375, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -0.29201555252075195, "rewards/margins": 16.14071273803711, "rewards/rejected": -16.432727813720703, "step": 3570 }, { "epoch": 0.89, "grad_norm": 0.002471923828125, "learning_rate": 1.689120800307212e-07, "logits/chosen": -2.0329132080078125, "logits/rejected": -2.287954092025757, "logps/chosen": -46.30390167236328, "logps/rejected": -1923.584716796875, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -0.24167513847351074, "rewards/margins": 18.367229461669922, "rewards/rejected": -18.608905792236328, "step": 3580 }, { "epoch": 0.9, "grad_norm": 0.1376953125, "learning_rate": 1.6112419921061357e-07, "logits/chosen": -2.1787500381469727, "logits/rejected": -2.4122672080993652, "logps/chosen": -50.2169075012207, "logps/rejected": -1459.5537109375, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": -0.2793883681297302, "rewards/margins": 13.814018249511719, "rewards/rejected": -14.093404769897461, "step": 3590 }, { "epoch": 0.9, "grad_norm": 0.2060546875, "learning_rate": 1.5351413635318807e-07, "logits/chosen": -2.2764883041381836, "logits/rejected": -2.5183794498443604, "logps/chosen": -47.43731689453125, "logps/rejected": -1477.555419921875, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -0.2550382614135742, "rewards/margins": 13.986851692199707, "rewards/rejected": -14.241891860961914, "step": 3600 }, { "epoch": 0.9, "grad_norm": 0.1279296875, "learning_rate": 1.460824700295138e-07, "logits/chosen": -2.268395185470581, "logits/rejected": -2.5008223056793213, "logps/chosen": -55.70990753173828, "logps/rejected": -1575.289794921875, "loss": 0.0045, "rewards/accuracies": 1.0, "rewards/chosen": -0.3271048367023468, "rewards/margins": 14.906885147094727, "rewards/rejected": -15.233988761901855, "step": 3610 }, { "epoch": 0.9, "grad_norm": 0.05224609375, "learning_rate": 1.3882976524768694e-07, "logits/chosen": -2.2560763359069824, "logits/rejected": -2.474360942840576, "logps/chosen": -48.290626525878906, "logps/rejected": -1297.683349609375, "loss": 0.0088, "rewards/accuracies": 1.0, "rewards/chosen": -0.25945621728897095, "rewards/margins": 12.217463493347168, "rewards/rejected": -12.476920127868652, "step": 3620 }, { "epoch": 0.91, "grad_norm": 0.0004482269287109375, "learning_rate": 1.3175657340987664e-07, "logits/chosen": -2.1752967834472656, "logits/rejected": -2.405945301055908, "logps/chosen": -39.73582077026367, "logps/rejected": -1538.538818359375, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -0.1809137761592865, "rewards/margins": 14.694430351257324, "rewards/rejected": -14.875345230102539, "step": 3630 }, { "epoch": 0.91, "grad_norm": 0.018798828125, "learning_rate": 1.2486343227040122e-07, "logits/chosen": -2.286973476409912, "logits/rejected": -2.5369372367858887, "logps/chosen": -47.38744354248047, "logps/rejected": -1557.68798828125, "loss": 0.0088, "rewards/accuracies": 1.0, "rewards/chosen": -0.24152731895446777, "rewards/margins": 14.780197143554688, "rewards/rejected": -15.021723747253418, "step": 3640 }, { "epoch": 0.91, "grad_norm": 0.91796875, "learning_rate": 1.181508658948452e-07, "logits/chosen": -2.2179925441741943, "logits/rejected": -2.4420266151428223, "logps/chosen": -38.083351135253906, "logps/rejected": -1484.4072265625, "loss": 0.0063, "rewards/accuracies": 1.0, "rewards/chosen": -0.16277261078357697, "rewards/margins": 14.176862716674805, "rewards/rejected": -14.339635848999023, "step": 3650 }, { "epoch": 0.91, "grad_norm": 0.10107421875, "learning_rate": 1.1161938462021627e-07, "logits/chosen": -2.1011550426483154, "logits/rejected": -2.328584671020508, "logps/chosen": -42.358421325683594, "logps/rejected": -1466.247802734375, "loss": 0.007, "rewards/accuracies": 1.0, "rewards/chosen": -0.19884276390075684, "rewards/margins": 13.929216384887695, "rewards/rejected": -14.128057479858398, "step": 3660 }, { "epoch": 0.92, "grad_norm": 0.45703125, "learning_rate": 1.0526948501614536e-07, "logits/chosen": -2.129077434539795, "logits/rejected": -2.3908514976501465, "logps/chosen": -53.09492874145508, "logps/rejected": -1710.9476318359375, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": -0.3066931366920471, "rewards/margins": 16.256519317626953, "rewards/rejected": -16.563209533691406, "step": 3670 }, { "epoch": 0.92, "grad_norm": 0.4296875, "learning_rate": 9.910164984713477e-08, "logits/chosen": -2.135789394378662, "logits/rejected": -2.3950417041778564, "logps/chosen": -43.19747543334961, "logps/rejected": -1662.0009765625, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": -0.2104502171278, "rewards/margins": 15.8755464553833, "rewards/rejected": -16.085996627807617, "step": 3680 }, { "epoch": 0.92, "grad_norm": 0.0751953125, "learning_rate": 9.311634803585323e-08, "logits/chosen": -2.1814115047454834, "logits/rejected": -2.445276975631714, "logps/chosen": -53.1719970703125, "logps/rejected": -1662.022705078125, "loss": 0.0069, "rewards/accuracies": 1.0, "rewards/chosen": -0.3151262402534485, "rewards/margins": 15.78093433380127, "rewards/rejected": -16.0960636138916, "step": 3690 }, { "epoch": 0.92, "grad_norm": 0.00125885009765625, "learning_rate": 8.7314034627487e-08, "logits/chosen": -2.230149507522583, "logits/rejected": -2.479522705078125, "logps/chosen": -36.06591033935547, "logps/rejected": -1619.81884765625, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": -0.14542549848556519, "rewards/margins": 15.533602714538574, "rewards/rejected": -15.679028511047363, "step": 3700 }, { "epoch": 0.93, "grad_norm": 0.023681640625, "learning_rate": 8.16951507551439e-08, "logits/chosen": -2.2386298179626465, "logits/rejected": -2.456665515899658, "logps/chosen": -45.547454833984375, "logps/rejected": -1492.232666015625, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -0.2374843806028366, "rewards/margins": 14.150115966796875, "rewards/rejected": -14.387600898742676, "step": 3710 }, { "epoch": 0.93, "grad_norm": 0.00019931793212890625, "learning_rate": 7.626012360631291e-08, "logits/chosen": -2.266707420349121, "logits/rejected": -2.5029749870300293, "logps/chosen": -49.74803924560547, "logps/rejected": -1482.993896484375, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": -0.2803342640399933, "rewards/margins": 14.048635482788086, "rewards/rejected": -14.328969955444336, "step": 3720 }, { "epoch": 0.93, "grad_norm": 0.03662109375, "learning_rate": 7.100936639038936e-08, "logits/chosen": -2.040673017501831, "logits/rejected": -2.3331856727600098, "logps/chosen": -43.85186004638672, "logps/rejected": -1894.5054931640625, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -0.2176673710346222, "rewards/margins": 18.155574798583984, "rewards/rejected": -18.373241424560547, "step": 3730 }, { "epoch": 0.93, "grad_norm": 1.0609626770019531e-05, "learning_rate": 6.594327830725916e-08, "logits/chosen": -2.190392017364502, "logits/rejected": -2.451129913330078, "logps/chosen": -54.63507080078125, "logps/rejected": -1576.1376953125, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -0.32724398374557495, "rewards/margins": 14.939753532409668, "rewards/rejected": -15.266998291015625, "step": 3740 }, { "epoch": 0.94, "grad_norm": 0.1083984375, "learning_rate": 6.106224451694592e-08, "logits/chosen": -2.2175679206848145, "logits/rejected": -2.4564685821533203, "logps/chosen": -46.39513397216797, "logps/rejected": -1616.2586669921875, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": -0.24283328652381897, "rewards/margins": 15.379629135131836, "rewards/rejected": -15.622464179992676, "step": 3750 }, { "epoch": 0.94, "grad_norm": 0.1669921875, "learning_rate": 5.636663611033266e-08, "logits/chosen": -2.0778698921203613, "logits/rejected": -2.3482048511505127, "logps/chosen": -44.864990234375, "logps/rejected": -1604.7249755859375, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": -0.23550191521644592, "rewards/margins": 15.299467086791992, "rewards/rejected": -15.534968376159668, "step": 3760 }, { "epoch": 0.94, "grad_norm": 0.0162353515625, "learning_rate": 5.185681008094579e-08, "logits/chosen": -2.284482479095459, "logits/rejected": -2.5175766944885254, "logps/chosen": -47.569602966308594, "logps/rejected": -1563.1016845703125, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": -0.25453075766563416, "rewards/margins": 14.83684253692627, "rewards/rejected": -15.0913724899292, "step": 3770 }, { "epoch": 0.94, "grad_norm": 0.00015735626220703125, "learning_rate": 4.753310929781513e-08, "logits/chosen": -2.2356629371643066, "logits/rejected": -2.451608657836914, "logps/chosen": -53.69340133666992, "logps/rejected": -1443.2952880859375, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": -0.3177313208580017, "rewards/margins": 13.615710258483887, "rewards/rejected": -13.933443069458008, "step": 3780 }, { "epoch": 0.95, "grad_norm": 6.246566772460938e-05, "learning_rate": 4.3395862479405914e-08, "logits/chosen": -2.156893253326416, "logits/rejected": -2.395084857940674, "logps/chosen": -46.39254379272461, "logps/rejected": -1630.5118408203125, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -0.24043354392051697, "rewards/margins": 15.48585319519043, "rewards/rejected": -15.726287841796875, "step": 3790 }, { "epoch": 0.95, "grad_norm": 0.0162353515625, "learning_rate": 3.9445384168628474e-08, "logits/chosen": -2.328781843185425, "logits/rejected": -2.580176591873169, "logps/chosen": -52.544822692871094, "logps/rejected": -1474.49365234375, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": -0.3075386881828308, "rewards/margins": 13.93040657043457, "rewards/rejected": -14.237945556640625, "step": 3800 }, { "epoch": 0.95, "grad_norm": 0.046630859375, "learning_rate": 3.5681974708923484e-08, "logits/chosen": -2.120448589324951, "logits/rejected": -2.3445982933044434, "logps/chosen": -37.53495788574219, "logps/rejected": -1442.9287109375, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -0.1521245837211609, "rewards/margins": 13.743891716003418, "rewards/rejected": -13.896017074584961, "step": 3810 }, { "epoch": 0.95, "grad_norm": 0.01068115234375, "learning_rate": 3.210592022142717e-08, "logits/chosen": -2.1601128578186035, "logits/rejected": -2.3705685138702393, "logps/chosen": -52.31241989135742, "logps/rejected": -1557.874755859375, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -0.3095288872718811, "rewards/margins": 14.724299430847168, "rewards/rejected": -15.033828735351562, "step": 3820 }, { "epoch": 0.96, "grad_norm": 0.09912109375, "learning_rate": 2.8717492583220095e-08, "logits/chosen": -2.2527565956115723, "logits/rejected": -2.4976978302001953, "logps/chosen": -45.60851287841797, "logps/rejected": -1558.29931640625, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -0.24002361297607422, "rewards/margins": 14.838605880737305, "rewards/rejected": -15.078630447387695, "step": 3830 }, { "epoch": 0.96, "grad_norm": 1.1171875, "learning_rate": 2.551694940665539e-08, "logits/chosen": -2.191880464553833, "logits/rejected": -2.41998291015625, "logps/chosen": -51.693626403808594, "logps/rejected": -1440.926513671875, "loss": 0.004, "rewards/accuracies": 1.0, "rewards/chosen": -0.29828059673309326, "rewards/margins": 13.601608276367188, "rewards/rejected": -13.899889945983887, "step": 3840 }, { "epoch": 0.96, "grad_norm": 0.0006561279296875, "learning_rate": 2.2504534019774092e-08, "logits/chosen": -2.347978353500366, "logits/rejected": -2.5543174743652344, "logps/chosen": -42.8640251159668, "logps/rejected": -1363.5374755859375, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -0.2091234177350998, "rewards/margins": 12.935150146484375, "rewards/rejected": -13.14427375793457, "step": 3850 }, { "epoch": 0.96, "grad_norm": 0.099609375, "learning_rate": 1.9680475447805826e-08, "logits/chosen": -2.231818437576294, "logits/rejected": -2.4574391841888428, "logps/chosen": -58.65113067626953, "logps/rejected": -1416.2275390625, "loss": 0.0064, "rewards/accuracies": 1.0, "rewards/chosen": -0.3713977038860321, "rewards/margins": 13.290824890136719, "rewards/rejected": -13.662221908569336, "step": 3860 }, { "epoch": 0.97, "grad_norm": 0.00102996826171875, "learning_rate": 1.70449883957563e-08, "logits/chosen": -2.254411220550537, "logits/rejected": -2.486921787261963, "logps/chosen": -50.42361831665039, "logps/rejected": -1524.43603515625, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -0.28153225779533386, "rewards/margins": 14.412869453430176, "rewards/rejected": -14.694402694702148, "step": 3870 }, { "epoch": 0.97, "grad_norm": 0.07080078125, "learning_rate": 1.4598273232083182e-08, "logits/chosen": -2.2290568351745605, "logits/rejected": -2.4446380138397217, "logps/chosen": -40.07840347290039, "logps/rejected": -1425.9326171875, "loss": 0.0053, "rewards/accuracies": 1.0, "rewards/chosen": -0.18418380618095398, "rewards/margins": 13.563642501831055, "rewards/rejected": -13.747825622558594, "step": 3880 }, { "epoch": 0.97, "grad_norm": 0.000240325927734375, "learning_rate": 1.2340515973464917e-08, "logits/chosen": -2.1592583656311035, "logits/rejected": -2.4244697093963623, "logps/chosen": -54.51990509033203, "logps/rejected": -1623.231689453125, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -0.32529979944229126, "rewards/margins": 15.373883247375488, "rewards/rejected": -15.699182510375977, "step": 3890 }, { "epoch": 0.97, "grad_norm": 0.007537841796875, "learning_rate": 1.0271888270655118e-08, "logits/chosen": -2.0656638145446777, "logits/rejected": -2.2895803451538086, "logps/chosen": -38.120445251464844, "logps/rejected": -1580.198486328125, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -0.16423656046390533, "rewards/margins": 15.07524299621582, "rewards/rejected": -15.239479064941406, "step": 3900 }, { "epoch": 0.98, "grad_norm": 0.107421875, "learning_rate": 8.392547395435769e-09, "logits/chosen": -2.413311243057251, "logits/rejected": -2.6234774589538574, "logps/chosen": -62.75360107421875, "logps/rejected": -1339.970458984375, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": -0.40989524126052856, "rewards/margins": 12.507749557495117, "rewards/rejected": -12.917645454406738, "step": 3910 }, { "epoch": 0.98, "grad_norm": 0.00030517578125, "learning_rate": 6.702636228657911e-09, "logits/chosen": -2.292491912841797, "logits/rejected": -2.5207715034484863, "logps/chosen": -43.633766174316406, "logps/rejected": -1431.236083984375, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": -0.2170587033033371, "rewards/margins": 13.580774307250977, "rewards/rejected": -13.797833442687988, "step": 3920 }, { "epoch": 0.98, "grad_norm": 0.181640625, "learning_rate": 5.2022832493800465e-09, "logits/chosen": -2.363647937774658, "logits/rejected": -2.5729119777679443, "logps/chosen": -53.60352325439453, "logps/rejected": -1316.2359619140625, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -0.3134381175041199, "rewards/margins": 12.360208511352539, "rewards/rejected": -12.673646926879883, "step": 3930 }, { "epoch": 0.98, "grad_norm": 0.0001983642578125, "learning_rate": 3.891602525100124e-09, "logits/chosen": -2.2279720306396484, "logits/rejected": -2.4751038551330566, "logps/chosen": -46.06736373901367, "logps/rejected": -1572.96337890625, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -0.24313127994537354, "rewards/margins": 14.95093059539795, "rewards/rejected": -15.194061279296875, "step": 3940 }, { "epoch": 0.99, "grad_norm": 0.01434326171875, "learning_rate": 2.7706937030827495e-09, "logits/chosen": -2.2832016944885254, "logits/rejected": -2.5126912593841553, "logps/chosen": -51.094390869140625, "logps/rejected": -1317.524169921875, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -0.2949807047843933, "rewards/margins": 12.386547088623047, "rewards/rejected": -12.681528091430664, "step": 3950 }, { "epoch": 0.99, "grad_norm": 0.028564453125, "learning_rate": 1.839642002783859e-09, "logits/chosen": -2.2053470611572266, "logits/rejected": -2.42755389213562, "logps/chosen": -39.788963317871094, "logps/rejected": -1376.3099365234375, "loss": 0.0036, "rewards/accuracies": 1.0, "rewards/chosen": -0.18181900680065155, "rewards/margins": 13.076495170593262, "rewards/rejected": -13.25831413269043, "step": 3960 }, { "epoch": 0.99, "grad_norm": 0.03857421875, "learning_rate": 1.0985182093714574e-09, "logits/chosen": -2.253420352935791, "logits/rejected": -2.4623093605041504, "logps/chosen": -59.57801055908203, "logps/rejected": -1399.8045654296875, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": -0.3837854266166687, "rewards/margins": 13.101987838745117, "rewards/rejected": -13.485774040222168, "step": 3970 }, { "epoch": 0.99, "grad_norm": 0.0198974609375, "learning_rate": 5.473786683440896e-10, "logits/chosen": -2.1431727409362793, "logits/rejected": -2.3964176177978516, "logps/chosen": -56.668495178222656, "logps/rejected": -1672.459716796875, "loss": 0.0039, "rewards/accuracies": 1.0, "rewards/chosen": -0.34237998723983765, "rewards/margins": 15.848222732543945, "rewards/rejected": -16.190601348876953, "step": 3980 }, { "epoch": 1.0, "grad_norm": 0.09765625, "learning_rate": 1.862652812467669e-10, "logits/chosen": -2.1814169883728027, "logits/rejected": -2.422477960586548, "logps/chosen": -39.490379333496094, "logps/rejected": -1714.8070068359375, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -0.17177413403987885, "rewards/margins": 16.374120712280273, "rewards/rejected": -16.545894622802734, "step": 3990 }, { "epoch": 1.0, "grad_norm": 0.0001277923583984375, "learning_rate": 1.5205502486292932e-11, "logits/chosen": -2.172867774963379, "logits/rejected": -2.422581911087036, "logps/chosen": -44.31547164916992, "logps/rejected": -1577.330078125, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -0.23060818016529083, "rewards/margins": 15.038507461547852, "rewards/rejected": -15.269113540649414, "step": 4000 }, { "epoch": 1.0, "eval_logits/chosen": -2.614182472229004, "eval_logits/rejected": -2.732987642288208, "eval_logps/chosen": -60.27886962890625, "eval_logps/rejected": -752.9370727539062, "eval_loss": 0.0027744148392230272, "eval_rewards/accuracies": 1.0, "eval_rewards/chosen": -0.34365367889404297, "eval_rewards/margins": 6.736997127532959, "eval_rewards/rejected": -7.080650329589844, "eval_runtime": 0.6551, "eval_samples_per_second": 7.633, "eval_steps_per_second": 4.58, "step": 4000 }, { "epoch": 1.0, "step": 4004, "total_flos": 0.0, "train_loss": 0.04242442899794605, "train_runtime": 8772.4234, "train_samples_per_second": 1.826, "train_steps_per_second": 0.456 } ], "logging_steps": 10, "max_steps": 4004, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }