diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,6109 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9998751404669747, + "eval_steps": 1000, + "global_step": 4004, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 0.6953125, + "learning_rate": 1.2468827930174565e-08, + "logits/chosen": -2.4102063179016113, + "logits/rejected": -2.672837734222412, + "logps/chosen": -21.34674835205078, + "logps/rejected": -42.586097717285156, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 0.7421875, + "learning_rate": 1.2468827930174566e-07, + "logits/chosen": -2.239577293395996, + "logits/rejected": -2.476416826248169, + "logps/chosen": -21.881580352783203, + "logps/rejected": -54.84682083129883, + "loss": 0.693, + "rewards/accuracies": 0.5555555820465088, + "rewards/chosen": 0.00018471028306521475, + "rewards/margins": 0.00028743690927512944, + "rewards/rejected": -0.00010272659710608423, + "step": 10 + }, + { + "epoch": 0.0, + "grad_norm": 0.83984375, + "learning_rate": 2.493765586034913e-07, + "logits/chosen": -2.163784980773926, + "logits/rejected": -2.405578136444092, + "logps/chosen": -21.341472625732422, + "logps/rejected": -55.192710876464844, + "loss": 0.6927, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.000193244923138991, + "rewards/margins": 0.0009195079328492284, + "rewards/rejected": -0.0007262630388140678, + "step": 20 + }, + { + "epoch": 0.01, + "grad_norm": 0.6796875, + "learning_rate": 3.7406483790523695e-07, + "logits/chosen": -2.0837199687957764, + "logits/rejected": -2.361438274383545, + "logps/chosen": -21.834430694580078, + "logps/rejected": -51.4864501953125, + "loss": 0.6929, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": 6.211231811903417e-05, + "rewards/margins": 0.000506018113810569, + "rewards/rejected": -0.00044390588300302625, + "step": 30 + }, + { + "epoch": 0.01, + "grad_norm": 0.80078125, + "learning_rate": 4.987531172069826e-07, + "logits/chosen": -2.088737726211548, + "logits/rejected": -2.3435609340667725, + "logps/chosen": -22.16689682006836, + "logps/rejected": -55.5480842590332, + "loss": 0.6924, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.0003017825947608799, + "rewards/margins": 0.001477475045248866, + "rewards/rejected": -0.0011756925377994776, + "step": 40 + }, + { + "epoch": 0.01, + "grad_norm": 0.70703125, + "learning_rate": 6.234413965087283e-07, + "logits/chosen": -2.1819872856140137, + "logits/rejected": -2.480788469314575, + "logps/chosen": -22.51431655883789, + "logps/rejected": -58.81789016723633, + "loss": 0.6917, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.0005659356247633696, + "rewards/margins": 0.00293327565304935, + "rewards/rejected": -0.0023673397954553366, + "step": 50 + }, + { + "epoch": 0.01, + "grad_norm": 1.1171875, + "learning_rate": 7.481296758104739e-07, + "logits/chosen": -2.13201642036438, + "logits/rejected": -2.3695003986358643, + "logps/chosen": -22.39255142211914, + "logps/rejected": -57.26397705078125, + "loss": 0.6906, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.0013241939013823867, + "rewards/margins": 0.0052015529945492744, + "rewards/rejected": -0.0038773592095822096, + "step": 60 + }, + { + "epoch": 0.02, + "grad_norm": 0.640625, + "learning_rate": 8.728179551122195e-07, + "logits/chosen": -2.2473509311676025, + "logits/rejected": -2.4891819953918457, + "logps/chosen": -21.353261947631836, + "logps/rejected": -50.47459411621094, + "loss": 0.6888, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0020654837135225534, + "rewards/margins": 0.00875779427587986, + "rewards/rejected": -0.0066923112608492374, + "step": 70 + }, + { + "epoch": 0.02, + "grad_norm": 0.6796875, + "learning_rate": 9.975062344139653e-07, + "logits/chosen": -2.0938005447387695, + "logits/rejected": -2.3415422439575195, + "logps/chosen": -21.880977630615234, + "logps/rejected": -55.579505920410156, + "loss": 0.6847, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.004472785629332066, + "rewards/margins": 0.016874177381396294, + "rewards/rejected": -0.012401392683386803, + "step": 80 + }, + { + "epoch": 0.02, + "grad_norm": 0.69140625, + "learning_rate": 1.1221945137157108e-06, + "logits/chosen": -2.2169880867004395, + "logits/rejected": -2.43099308013916, + "logps/chosen": -21.09603500366211, + "logps/rejected": -52.89866256713867, + "loss": 0.681, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.007039895746856928, + "rewards/margins": 0.02454659901559353, + "rewards/rejected": -0.017506705597043037, + "step": 90 + }, + { + "epoch": 0.02, + "grad_norm": 0.8203125, + "learning_rate": 1.2468827930174565e-06, + "logits/chosen": -2.079484701156616, + "logits/rejected": -2.3457648754119873, + "logps/chosen": -20.769153594970703, + "logps/rejected": -60.0728645324707, + "loss": 0.6753, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.010782149620354176, + "rewards/margins": 0.03612237051129341, + "rewards/rejected": -0.02534021995961666, + "step": 100 + }, + { + "epoch": 0.03, + "grad_norm": 0.7890625, + "learning_rate": 1.3715710723192023e-06, + "logits/chosen": -2.1157209873199463, + "logits/rejected": -2.373112440109253, + "logps/chosen": -20.276954650878906, + "logps/rejected": -56.75362014770508, + "loss": 0.6659, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.016899898648262024, + "rewards/margins": 0.055193256586790085, + "rewards/rejected": -0.03829335793852806, + "step": 110 + }, + { + "epoch": 0.03, + "grad_norm": 1.4296875, + "learning_rate": 1.4962593516209478e-06, + "logits/chosen": -2.0827651023864746, + "logits/rejected": -2.3430802822113037, + "logps/chosen": -20.093006134033203, + "logps/rejected": -55.97692108154297, + "loss": 0.6578, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.02318699099123478, + "rewards/margins": 0.07196114957332611, + "rewards/rejected": -0.04877415671944618, + "step": 120 + }, + { + "epoch": 0.03, + "grad_norm": 0.88671875, + "learning_rate": 1.6209476309226935e-06, + "logits/chosen": -2.2666783332824707, + "logits/rejected": -2.533306837081909, + "logps/chosen": -18.66347885131836, + "logps/rejected": -60.43751907348633, + "loss": 0.6421, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.03544957563281059, + "rewards/margins": 0.10495258867740631, + "rewards/rejected": -0.06950302422046661, + "step": 130 + }, + { + "epoch": 0.03, + "grad_norm": 0.734375, + "learning_rate": 1.745635910224439e-06, + "logits/chosen": -2.132418155670166, + "logits/rejected": -2.3867433071136475, + "logps/chosen": -16.58405876159668, + "logps/rejected": -65.14090728759766, + "loss": 0.6258, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.052351079881191254, + "rewards/margins": 0.1396940052509308, + "rewards/rejected": -0.08734293282032013, + "step": 140 + }, + { + "epoch": 0.04, + "grad_norm": 0.859375, + "learning_rate": 1.8703241895261848e-06, + "logits/chosen": -2.2100329399108887, + "logits/rejected": -2.446981906890869, + "logps/chosen": -14.807760238647461, + "logps/rejected": -61.867767333984375, + "loss": 0.6085, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.07087540626525879, + "rewards/margins": 0.17729689180850983, + "rewards/rejected": -0.10642149299383163, + "step": 150 + }, + { + "epoch": 0.04, + "grad_norm": 0.890625, + "learning_rate": 1.9950124688279305e-06, + "logits/chosen": -2.2164454460144043, + "logits/rejected": -2.4408745765686035, + "logps/chosen": -12.931081771850586, + "logps/rejected": -65.07794189453125, + "loss": 0.5835, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.09038490056991577, + "rewards/margins": 0.2332114726305008, + "rewards/rejected": -0.14282655715942383, + "step": 160 + }, + { + "epoch": 0.04, + "grad_norm": 0.7578125, + "learning_rate": 2.119700748129676e-06, + "logits/chosen": -2.3733327388763428, + "logits/rejected": -2.637248992919922, + "logps/chosen": -10.747810363769531, + "logps/rejected": -66.96683502197266, + "loss": 0.5576, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1092229038476944, + "rewards/margins": 0.2931229770183563, + "rewards/rejected": -0.18390007317066193, + "step": 170 + }, + { + "epoch": 0.04, + "grad_norm": 0.94921875, + "learning_rate": 2.2443890274314216e-06, + "logits/chosen": -2.059483051300049, + "logits/rejected": -2.3210699558258057, + "logps/chosen": -9.155550003051758, + "logps/rejected": -81.08940124511719, + "loss": 0.525, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1273214966058731, + "rewards/margins": 0.371276319026947, + "rewards/rejected": -0.24395480751991272, + "step": 180 + }, + { + "epoch": 0.05, + "grad_norm": 0.88671875, + "learning_rate": 2.3690773067331675e-06, + "logits/chosen": -2.1012320518493652, + "logits/rejected": -2.380855083465576, + "logps/chosen": -8.220524787902832, + "logps/rejected": -86.42562866210938, + "loss": 0.4879, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.13605494797229767, + "rewards/margins": 0.4646981358528137, + "rewards/rejected": -0.32864317297935486, + "step": 190 + }, + { + "epoch": 0.05, + "grad_norm": 1.0546875, + "learning_rate": 2.493765586034913e-06, + "logits/chosen": -2.178351879119873, + "logits/rejected": -2.448537826538086, + "logps/chosen": -7.208306312561035, + "logps/rejected": -91.26810455322266, + "loss": 0.4484, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.15183252096176147, + "rewards/margins": 0.5721064805984497, + "rewards/rejected": -0.42027395963668823, + "step": 200 + }, + { + "epoch": 0.05, + "grad_norm": 0.90625, + "learning_rate": 2.6184538653366586e-06, + "logits/chosen": -2.1630733013153076, + "logits/rejected": -2.411649703979492, + "logps/chosen": -4.617680072784424, + "logps/rejected": -102.69218444824219, + "loss": 0.4025, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1683187633752823, + "rewards/margins": 0.705622136592865, + "rewards/rejected": -0.5373033881187439, + "step": 210 + }, + { + "epoch": 0.05, + "grad_norm": 0.8828125, + "learning_rate": 2.7431421446384045e-06, + "logits/chosen": -2.1586241722106934, + "logits/rejected": -2.3992838859558105, + "logps/chosen": -3.4560561180114746, + "logps/rejected": -116.10273742675781, + "loss": 0.3606, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.18259310722351074, + "rewards/margins": 0.8416939973831177, + "rewards/rejected": -0.6591008901596069, + "step": 220 + }, + { + "epoch": 0.06, + "grad_norm": 0.953125, + "learning_rate": 2.86783042394015e-06, + "logits/chosen": -2.1585848331451416, + "logits/rejected": -2.381598949432373, + "logps/chosen": -2.7871248722076416, + "logps/rejected": -130.73558044433594, + "loss": 0.3171, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.18752917647361755, + "rewards/margins": 0.9992051124572754, + "rewards/rejected": -0.8116759061813354, + "step": 230 + }, + { + "epoch": 0.06, + "grad_norm": 1.0390625, + "learning_rate": 2.9925187032418956e-06, + "logits/chosen": -2.2422091960906982, + "logits/rejected": -2.491379499435425, + "logps/chosen": -2.960984230041504, + "logps/rejected": -167.91981506347656, + "loss": 0.2364, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1910472810268402, + "rewards/margins": 1.3611987829208374, + "rewards/rejected": -1.1701514720916748, + "step": 240 + }, + { + "epoch": 0.06, + "grad_norm": 0.64453125, + "learning_rate": 3.117206982543641e-06, + "logits/chosen": -2.1426875591278076, + "logits/rejected": -2.385005474090576, + "logps/chosen": -2.7859835624694824, + "logps/rejected": -208.47451782226562, + "loss": 0.1849, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1911415457725525, + "rewards/margins": 1.752079963684082, + "rewards/rejected": -1.5609384775161743, + "step": 250 + }, + { + "epoch": 0.06, + "grad_norm": 0.3359375, + "learning_rate": 3.241895261845387e-06, + "logits/chosen": -2.171957492828369, + "logits/rejected": -2.405855894088745, + "logps/chosen": -2.6331560611724854, + "logps/rejected": -266.33441162109375, + "loss": 0.1401, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.18891991674900055, + "rewards/margins": 2.3281731605529785, + "rewards/rejected": -2.1392531394958496, + "step": 260 + }, + { + "epoch": 0.07, + "grad_norm": 0.46484375, + "learning_rate": 3.3665835411471326e-06, + "logits/chosen": -2.136756181716919, + "logits/rejected": -2.3706109523773193, + "logps/chosen": -2.3929905891418457, + "logps/rejected": -311.50164794921875, + "loss": 0.1147, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.19246290624141693, + "rewards/margins": 2.773569107055664, + "rewards/rejected": -2.581106185913086, + "step": 270 + }, + { + "epoch": 0.07, + "grad_norm": 0.1064453125, + "learning_rate": 3.491271820448878e-06, + "logits/chosen": -2.04775071144104, + "logits/rejected": -2.256491184234619, + "logps/chosen": -3.071178436279297, + "logps/rejected": -338.50042724609375, + "loss": 0.1247, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.18945232033729553, + "rewards/margins": 3.0333011150360107, + "rewards/rejected": -2.8438491821289062, + "step": 280 + }, + { + "epoch": 0.07, + "grad_norm": 0.255859375, + "learning_rate": 3.615960099750624e-06, + "logits/chosen": -2.0987088680267334, + "logits/rejected": -2.3304831981658936, + "logps/chosen": -2.6518688201904297, + "logps/rejected": -317.264892578125, + "loss": 0.1174, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1924947053194046, + "rewards/margins": 2.8529839515686035, + "rewards/rejected": -2.660489082336426, + "step": 290 + }, + { + "epoch": 0.07, + "grad_norm": 0.416015625, + "learning_rate": 3.7406483790523696e-06, + "logits/chosen": -2.090928792953491, + "logits/rejected": -2.331522226333618, + "logps/chosen": -2.140141725540161, + "logps/rejected": -386.000244140625, + "loss": 0.0843, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.19298282265663147, + "rewards/margins": 3.519921064376831, + "rewards/rejected": -3.3269379138946533, + "step": 300 + }, + { + "epoch": 0.08, + "grad_norm": 0.341796875, + "learning_rate": 3.8653366583541155e-06, + "logits/chosen": -2.0327889919281006, + "logits/rejected": -2.2435851097106934, + "logps/chosen": -2.820808172225952, + "logps/rejected": -395.51666259765625, + "loss": 0.1065, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.19103531539440155, + "rewards/margins": 3.636303424835205, + "rewards/rejected": -3.445268154144287, + "step": 310 + }, + { + "epoch": 0.08, + "grad_norm": 0.361328125, + "learning_rate": 3.990024937655861e-06, + "logits/chosen": -2.0326077938079834, + "logits/rejected": -2.2442269325256348, + "logps/chosen": -2.771969795227051, + "logps/rejected": -503.487548828125, + "loss": 0.0783, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.18934586644172668, + "rewards/margins": 4.642784595489502, + "rewards/rejected": -4.453438758850098, + "step": 320 + }, + { + "epoch": 0.08, + "grad_norm": 0.330078125, + "learning_rate": 4.114713216957607e-06, + "logits/chosen": -2.1069068908691406, + "logits/rejected": -2.2923953533172607, + "logps/chosen": -4.020249366760254, + "logps/rejected": -392.1181335449219, + "loss": 0.087, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.18417596817016602, + "rewards/margins": 3.6148147583007812, + "rewards/rejected": -3.4306392669677734, + "step": 330 + }, + { + "epoch": 0.08, + "grad_norm": 0.361328125, + "learning_rate": 4.239401496259352e-06, + "logits/chosen": -2.040024518966675, + "logits/rejected": -2.235044240951538, + "logps/chosen": -4.0639448165893555, + "logps/rejected": -495.93304443359375, + "loss": 0.07, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.17745602130889893, + "rewards/margins": 4.601096153259277, + "rewards/rejected": -4.423640251159668, + "step": 340 + }, + { + "epoch": 0.09, + "grad_norm": 0.1513671875, + "learning_rate": 4.364089775561098e-06, + "logits/chosen": -2.1371054649353027, + "logits/rejected": -2.314563274383545, + "logps/chosen": -5.727438449859619, + "logps/rejected": -448.86077880859375, + "loss": 0.0817, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1613588184118271, + "rewards/margins": 4.152359962463379, + "rewards/rejected": -3.991001605987549, + "step": 350 + }, + { + "epoch": 0.09, + "grad_norm": 0.44921875, + "learning_rate": 4.488778054862843e-06, + "logits/chosen": -2.1167104244232178, + "logits/rejected": -2.327972412109375, + "logps/chosen": -7.92899227142334, + "logps/rejected": -551.7445068359375, + "loss": 0.0461, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.14445874094963074, + "rewards/margins": 5.136443138122559, + "rewards/rejected": -4.9919843673706055, + "step": 360 + }, + { + "epoch": 0.09, + "grad_norm": 0.5625, + "learning_rate": 4.6134663341645895e-06, + "logits/chosen": -2.1294167041778564, + "logits/rejected": -2.320831537246704, + "logps/chosen": -14.308802604675293, + "logps/rejected": -556.9048461914062, + "loss": 0.0389, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.07796537131071091, + "rewards/margins": 5.144980430603027, + "rewards/rejected": -5.067015171051025, + "step": 370 + }, + { + "epoch": 0.09, + "grad_norm": 0.74609375, + "learning_rate": 4.738154613466335e-06, + "logits/chosen": -2.059129238128662, + "logits/rejected": -2.283139705657959, + "logps/chosen": -30.504648208618164, + "logps/rejected": -904.5690307617188, + "loss": 0.0201, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08387070894241333, + "rewards/margins": 8.345690727233887, + "rewards/rejected": -8.429560661315918, + "step": 380 + }, + { + "epoch": 0.1, + "grad_norm": 0.0244140625, + "learning_rate": 4.862842892768081e-06, + "logits/chosen": -2.1258416175842285, + "logits/rejected": -2.3157804012298584, + "logps/chosen": -44.502899169921875, + "logps/rejected": -788.58056640625, + "loss": 0.0145, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2255050241947174, + "rewards/margins": 7.145709037780762, + "rewards/rejected": -7.371213436126709, + "step": 390 + }, + { + "epoch": 0.1, + "grad_norm": 0.2001953125, + "learning_rate": 4.987531172069826e-06, + "logits/chosen": -2.0560178756713867, + "logits/rejected": -2.2473480701446533, + "logps/chosen": -65.08824157714844, + "logps/rejected": -957.4326171875, + "loss": 0.022, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4304937422275543, + "rewards/margins": 8.619054794311523, + "rewards/rejected": -9.04954719543457, + "step": 400 + }, + { + "epoch": 0.1, + "grad_norm": 0.1484375, + "learning_rate": 4.999923022460671e-06, + "logits/chosen": -2.0123705863952637, + "logits/rejected": -2.2279648780822754, + "logps/chosen": -74.45851135253906, + "logps/rejected": -1195.67138671875, + "loss": 0.0106, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5188314318656921, + "rewards/margins": 10.849455833435059, + "rewards/rejected": -11.368288040161133, + "step": 410 + }, + { + "epoch": 0.1, + "grad_norm": 0.51953125, + "learning_rate": 4.999656933348981e-06, + "logits/chosen": -2.234529972076416, + "logits/rejected": -2.406409740447998, + "logps/chosen": -88.35977935791016, + "logps/rejected": -900.6641845703125, + "loss": 0.0226, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6683001518249512, + "rewards/margins": 7.860695838928223, + "rewards/rejected": -8.528995513916016, + "step": 420 + }, + { + "epoch": 0.11, + "grad_norm": 0.005340576171875, + "learning_rate": 4.99920080255011e-06, + "logits/chosen": -2.054624080657959, + "logits/rejected": -2.283973217010498, + "logps/chosen": -83.51972198486328, + "logps/rejected": -1245.379150390625, + "loss": 0.0093, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6176443099975586, + "rewards/margins": 11.277586936950684, + "rewards/rejected": -11.895231246948242, + "step": 430 + }, + { + "epoch": 0.11, + "grad_norm": 0.5, + "learning_rate": 4.998554664742362e-06, + "logits/chosen": -2.136657476425171, + "logits/rejected": -2.320204973220825, + "logps/chosen": -88.63077545166016, + "logps/rejected": -1043.793701171875, + "loss": 0.0087, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6717870831489563, + "rewards/margins": 9.280545234680176, + "rewards/rejected": -9.95233154296875, + "step": 440 + }, + { + "epoch": 0.11, + "grad_norm": 0.287109375, + "learning_rate": 4.997718569049726e-06, + "logits/chosen": -2.074990749359131, + "logits/rejected": -2.277477979660034, + "logps/chosen": -94.67475891113281, + "logps/rejected": -1131.5499267578125, + "loss": 0.0145, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7206478714942932, + "rewards/margins": 10.060680389404297, + "rewards/rejected": -10.781328201293945, + "step": 450 + }, + { + "epoch": 0.11, + "grad_norm": 0.0244140625, + "learning_rate": 4.9966925790381404e-06, + "logits/chosen": -2.1405222415924072, + "logits/rejected": -2.3201403617858887, + "logps/chosen": -73.81873321533203, + "logps/rejected": -1013.6409912109375, + "loss": 0.0119, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.528481125831604, + "rewards/margins": 9.096994400024414, + "rewards/rejected": -9.625473976135254, + "step": 460 + }, + { + "epoch": 0.12, + "grad_norm": 0.19140625, + "learning_rate": 4.995476772710657e-06, + "logits/chosen": -2.0950608253479004, + "logits/rejected": -2.316931962966919, + "logps/chosen": -103.3821792602539, + "logps/rejected": -1335.799560546875, + "loss": 0.021, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8135234117507935, + "rewards/margins": 11.993521690368652, + "rewards/rejected": -12.807044982910156, + "step": 470 + }, + { + "epoch": 0.12, + "grad_norm": 0.2451171875, + "learning_rate": 4.994071242501516e-06, + "logits/chosen": -2.185049057006836, + "logits/rejected": -2.3854544162750244, + "logps/chosen": -70.46857452392578, + "logps/rejected": -1074.28271484375, + "loss": 0.0079, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.48124104738235474, + "rewards/margins": 9.766096115112305, + "rewards/rejected": -10.247336387634277, + "step": 480 + }, + { + "epoch": 0.12, + "grad_norm": 2.8967857360839844e-05, + "learning_rate": 4.992476095269112e-06, + "logits/chosen": -2.1872477531433105, + "logits/rejected": -2.3788347244262695, + "logps/chosen": -60.58274459838867, + "logps/rejected": -1168.738037109375, + "loss": 0.0126, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3954273462295532, + "rewards/margins": 10.741006851196289, + "rewards/rejected": -11.136434555053711, + "step": 490 + }, + { + "epoch": 0.12, + "grad_norm": 0.267578125, + "learning_rate": 4.990691452287877e-06, + "logits/chosen": -2.034578800201416, + "logits/rejected": -2.228356122970581, + "logps/chosen": -70.90155029296875, + "logps/rejected": -1098.248779296875, + "loss": 0.0126, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.49016299843788147, + "rewards/margins": 9.965959548950195, + "rewards/rejected": -10.456122398376465, + "step": 500 + }, + { + "epoch": 0.13, + "grad_norm": 0.2412109375, + "learning_rate": 4.988717449239056e-06, + "logits/chosen": -2.086670398712158, + "logits/rejected": -2.27720046043396, + "logps/chosen": -79.89573669433594, + "logps/rejected": -1124.399658203125, + "loss": 0.0201, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5849625468254089, + "rewards/margins": 10.160270690917969, + "rewards/rejected": -10.745233535766602, + "step": 510 + }, + { + "epoch": 0.13, + "grad_norm": 0.0245361328125, + "learning_rate": 4.98655423620039e-06, + "logits/chosen": -2.119935989379883, + "logits/rejected": -2.3267104625701904, + "logps/chosen": -77.09886169433594, + "logps/rejected": -1248.1905517578125, + "loss": 0.0062, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5508411526679993, + "rewards/margins": 11.388254165649414, + "rewards/rejected": -11.939095497131348, + "step": 520 + }, + { + "epoch": 0.13, + "grad_norm": 0.01080322265625, + "learning_rate": 4.984201977634711e-06, + "logits/chosen": -2.213916301727295, + "logits/rejected": -2.4463677406311035, + "logps/chosen": -90.18511199951172, + "logps/rejected": -1377.499755859375, + "loss": 0.007, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6767342686653137, + "rewards/margins": 12.541799545288086, + "rewards/rejected": -13.218534469604492, + "step": 530 + }, + { + "epoch": 0.13, + "grad_norm": 0.0634765625, + "learning_rate": 4.9816608523774345e-06, + "logits/chosen": -2.105821132659912, + "logits/rejected": -2.3127095699310303, + "logps/chosen": -79.32666015625, + "logps/rejected": -1143.241943359375, + "loss": 0.0101, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5732973217964172, + "rewards/margins": 10.353887557983398, + "rewards/rejected": -10.927184104919434, + "step": 540 + }, + { + "epoch": 0.14, + "grad_norm": 0.0145263671875, + "learning_rate": 4.978931053622964e-06, + "logits/chosen": -2.1495628356933594, + "logits/rejected": -2.370626449584961, + "logps/chosen": -78.36927795410156, + "logps/rejected": -1290.0146484375, + "loss": 0.0164, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5716887712478638, + "rewards/margins": 11.808379173278809, + "rewards/rejected": -12.380067825317383, + "step": 550 + }, + { + "epoch": 0.14, + "grad_norm": 0.0003566741943359375, + "learning_rate": 4.9760127889100044e-06, + "logits/chosen": -2.1675076484680176, + "logits/rejected": -2.3700671195983887, + "logps/chosen": -62.899436950683594, + "logps/rejected": -1186.429443359375, + "loss": 0.0126, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4144817888736725, + "rewards/margins": 10.914512634277344, + "rewards/rejected": -11.328994750976562, + "step": 560 + }, + { + "epoch": 0.14, + "grad_norm": 0.23828125, + "learning_rate": 4.972906280105781e-06, + "logits/chosen": -2.0299549102783203, + "logits/rejected": -2.252498149871826, + "logps/chosen": -80.07968139648438, + "logps/rejected": -1246.3656005859375, + "loss": 0.0092, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5788360238075256, + "rewards/margins": 11.359301567077637, + "rewards/rejected": -11.938138008117676, + "step": 570 + }, + { + "epoch": 0.14, + "grad_norm": 0.0030670166015625, + "learning_rate": 4.969611763389175e-06, + "logits/chosen": -2.19167423248291, + "logits/rejected": -2.402195453643799, + "logps/chosen": -83.22602844238281, + "logps/rejected": -1125.298095703125, + "loss": 0.0083, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6159827709197998, + "rewards/margins": 10.143194198608398, + "rewards/rejected": -10.759176254272461, + "step": 580 + }, + { + "epoch": 0.15, + "grad_norm": 0.197265625, + "learning_rate": 4.966129489232762e-06, + "logits/chosen": -2.1329731941223145, + "logits/rejected": -2.375246286392212, + "logps/chosen": -77.91215515136719, + "logps/rejected": -1410.508056640625, + "loss": 0.0062, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5520095825195312, + "rewards/margins": 12.985345840454102, + "rewards/rejected": -13.537355422973633, + "step": 590 + }, + { + "epoch": 0.15, + "grad_norm": 0.431640625, + "learning_rate": 4.962459722383775e-06, + "logits/chosen": -2.0712943077087402, + "logits/rejected": -2.288693428039551, + "logps/chosen": -73.90785217285156, + "logps/rejected": -1434.8343505859375, + "loss": 0.0101, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5121776461601257, + "rewards/margins": 13.254251480102539, + "rewards/rejected": -13.766427993774414, + "step": 600 + }, + { + "epoch": 0.15, + "grad_norm": 0.000926971435546875, + "learning_rate": 4.958602741843975e-06, + "logits/chosen": -2.0742838382720947, + "logits/rejected": -2.333592176437378, + "logps/chosen": -74.49140167236328, + "logps/rejected": -1376.306396484375, + "loss": 0.0047, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5260452032089233, + "rewards/margins": 12.687009811401367, + "rewards/rejected": -13.213055610656738, + "step": 610 + }, + { + "epoch": 0.15, + "grad_norm": 0.08154296875, + "learning_rate": 4.954558840848437e-06, + "logits/chosen": -2.213879346847534, + "logits/rejected": -2.4216055870056152, + "logps/chosen": -65.52778625488281, + "logps/rejected": -1092.6256103515625, + "loss": 0.0097, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.43167224526405334, + "rewards/margins": 10.010717391967773, + "rewards/rejected": -10.442389488220215, + "step": 620 + }, + { + "epoch": 0.16, + "grad_norm": 0.1318359375, + "learning_rate": 4.950328326843258e-06, + "logits/chosen": -2.0717647075653076, + "logits/rejected": -2.3038885593414307, + "logps/chosen": -59.320228576660156, + "logps/rejected": -1350.045654296875, + "loss": 0.0095, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.37677788734436035, + "rewards/margins": 12.551568984985352, + "rewards/rejected": -12.92834758758545, + "step": 630 + }, + { + "epoch": 0.16, + "grad_norm": 0.00848388671875, + "learning_rate": 4.945911521462182e-06, + "logits/chosen": -2.2182841300964355, + "logits/rejected": -2.4369709491729736, + "logps/chosen": -66.91700744628906, + "logps/rejected": -1338.757080078125, + "loss": 0.0113, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.45198917388916016, + "rewards/margins": 12.385056495666504, + "rewards/rejected": -12.837045669555664, + "step": 640 + }, + { + "epoch": 0.16, + "grad_norm": 0.26171875, + "learning_rate": 4.941308760502149e-06, + "logits/chosen": -2.2334372997283936, + "logits/rejected": -2.4091877937316895, + "logps/chosen": -77.1009292602539, + "logps/rejected": -1184.353759765625, + "loss": 0.0162, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5503292083740234, + "rewards/margins": 10.71965217590332, + "rewards/rejected": -11.26998233795166, + "step": 650 + }, + { + "epoch": 0.16, + "grad_norm": 0.423828125, + "learning_rate": 4.936520393897762e-06, + "logits/chosen": -2.174837589263916, + "logits/rejected": -2.3993821144104004, + "logps/chosen": -66.07880401611328, + "logps/rejected": -1318.9051513671875, + "loss": 0.0104, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.43549758195877075, + "rewards/margins": 12.193208694458008, + "rewards/rejected": -12.628705978393555, + "step": 660 + }, + { + "epoch": 0.17, + "grad_norm": 0.0062255859375, + "learning_rate": 4.931546785694684e-06, + "logits/chosen": -2.2091901302337646, + "logits/rejected": -2.44826078414917, + "logps/chosen": -83.01612091064453, + "logps/rejected": -1474.8822021484375, + "loss": 0.0047, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6095518469810486, + "rewards/margins": 13.620869636535645, + "rewards/rejected": -14.230420112609863, + "step": 670 + }, + { + "epoch": 0.17, + "grad_norm": 0.017822265625, + "learning_rate": 4.926388314021964e-06, + "logits/chosen": -2.2539894580841064, + "logits/rejected": -2.4782536029815674, + "logps/chosen": -97.8957748413086, + "logps/rejected": -1248.599365234375, + "loss": 0.0055, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7543063759803772, + "rewards/margins": 11.235261917114258, + "rewards/rejected": -11.989568710327148, + "step": 680 + }, + { + "epoch": 0.17, + "grad_norm": 0.00060272216796875, + "learning_rate": 4.921045371063283e-06, + "logits/chosen": -2.241508960723877, + "logits/rejected": -2.45992112159729, + "logps/chosen": -75.34230041503906, + "logps/rejected": -1410.6185302734375, + "loss": 0.0053, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5371032953262329, + "rewards/margins": 13.022886276245117, + "rewards/rejected": -13.559989929199219, + "step": 690 + }, + { + "epoch": 0.17, + "grad_norm": 0.09423828125, + "learning_rate": 4.915518363027142e-06, + "logits/chosen": -2.3091747760772705, + "logits/rejected": -2.516079902648926, + "logps/chosen": -77.0430679321289, + "logps/rejected": -1162.167236328125, + "loss": 0.0093, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5587199926376343, + "rewards/margins": 10.586333274841309, + "rewards/rejected": -11.145052909851074, + "step": 700 + }, + { + "epoch": 0.18, + "grad_norm": 0.21484375, + "learning_rate": 4.909807710115977e-06, + "logits/chosen": -2.06872820854187, + "logits/rejected": -2.280989170074463, + "logps/chosen": -57.82390213012695, + "logps/rejected": -1309.2158203125, + "loss": 0.0073, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.36199483275413513, + "rewards/margins": 12.196971893310547, + "rewards/rejected": -12.558965682983398, + "step": 710 + }, + { + "epoch": 0.18, + "grad_norm": 0.10595703125, + "learning_rate": 4.903913846494211e-06, + "logits/chosen": -2.057790994644165, + "logits/rejected": -2.2983431816101074, + "logps/chosen": -63.02252960205078, + "logps/rejected": -1628.169677734375, + "loss": 0.0027, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4083371162414551, + "rewards/margins": 15.24769115447998, + "rewards/rejected": -15.656028747558594, + "step": 720 + }, + { + "epoch": 0.18, + "grad_norm": 0.283203125, + "learning_rate": 4.897837220255251e-06, + "logits/chosen": -2.101783275604248, + "logits/rejected": -2.2945144176483154, + "logps/chosen": -62.76520538330078, + "logps/rejected": -1316.2451171875, + "loss": 0.0105, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4040141999721527, + "rewards/margins": 12.210726737976074, + "rewards/rejected": -12.614742279052734, + "step": 730 + }, + { + "epoch": 0.18, + "grad_norm": 0.0218505859375, + "learning_rate": 4.891578293387413e-06, + "logits/chosen": -2.183640241622925, + "logits/rejected": -2.3983137607574463, + "logps/chosen": -72.8852767944336, + "logps/rejected": -1332.2720947265625, + "loss": 0.009, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5061370134353638, + "rewards/margins": 12.295533180236816, + "rewards/rejected": -12.801671028137207, + "step": 740 + }, + { + "epoch": 0.19, + "grad_norm": 0.064453125, + "learning_rate": 4.885137541738808e-06, + "logits/chosen": -2.1432900428771973, + "logits/rejected": -2.3399500846862793, + "logps/chosen": -52.877479553222656, + "logps/rejected": -1188.441650390625, + "loss": 0.0053, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3169303238391876, + "rewards/margins": 11.048254013061523, + "rewards/rejected": -11.365182876586914, + "step": 750 + }, + { + "epoch": 0.19, + "grad_norm": 2.086162567138672e-05, + "learning_rate": 4.878515454981153e-06, + "logits/chosen": -2.013054370880127, + "logits/rejected": -2.238393783569336, + "logps/chosen": -60.31416702270508, + "logps/rejected": -1504.335693359375, + "loss": 0.0027, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.38582319021224976, + "rewards/margins": 14.089462280273438, + "rewards/rejected": -14.475286483764648, + "step": 760 + }, + { + "epoch": 0.19, + "grad_norm": 0.29296875, + "learning_rate": 4.8717125365725545e-06, + "logits/chosen": -2.2411911487579346, + "logits/rejected": -2.4217007160186768, + "logps/chosen": -71.92973327636719, + "logps/rejected": -1072.138427734375, + "loss": 0.0112, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4977429509162903, + "rewards/margins": 9.747591018676758, + "rewards/rejected": -10.245333671569824, + "step": 770 + }, + { + "epoch": 0.19, + "grad_norm": 0.076171875, + "learning_rate": 4.864729303719221e-06, + "logits/chosen": -2.183976650238037, + "logits/rejected": -2.4096364974975586, + "logps/chosen": -75.01698303222656, + "logps/rejected": -1566.1385498046875, + "loss": 0.0027, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5285240411758423, + "rewards/margins": 14.545486450195312, + "rewards/rejected": -15.074010848999023, + "step": 780 + }, + { + "epoch": 0.2, + "grad_norm": 0.142578125, + "learning_rate": 4.857566287336152e-06, + "logits/chosen": -2.1151528358459473, + "logits/rejected": -2.352687358856201, + "logps/chosen": -81.51287841796875, + "logps/rejected": -1503.718505859375, + "loss": 0.0134, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5906480550765991, + "rewards/margins": 13.887521743774414, + "rewards/rejected": -14.478169441223145, + "step": 790 + }, + { + "epoch": 0.2, + "grad_norm": 0.000972747802734375, + "learning_rate": 4.850224032006765e-06, + "logits/chosen": -2.2330470085144043, + "logits/rejected": -2.4612553119659424, + "logps/chosen": -86.96638488769531, + "logps/rejected": -1362.5970458984375, + "loss": 0.0078, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6454036831855774, + "rewards/margins": 12.467041969299316, + "rewards/rejected": -13.112444877624512, + "step": 800 + }, + { + "epoch": 0.2, + "grad_norm": 0.220703125, + "learning_rate": 4.8427030959414984e-06, + "logits/chosen": -2.0308804512023926, + "logits/rejected": -2.2706708908081055, + "logps/chosen": -80.2787857055664, + "logps/rejected": -1434.3277587890625, + "loss": 0.0083, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5847833752632141, + "rewards/margins": 13.225725173950195, + "rewards/rejected": -13.810508728027344, + "step": 810 + }, + { + "epoch": 0.2, + "grad_norm": 0.193359375, + "learning_rate": 4.835004050935369e-06, + "logits/chosen": -2.134955644607544, + "logits/rejected": -2.338745594024658, + "logps/chosen": -71.83667755126953, + "logps/rejected": -1364.427978515625, + "loss": 0.0155, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.49762091040611267, + "rewards/margins": 12.594762802124023, + "rewards/rejected": -13.092384338378906, + "step": 820 + }, + { + "epoch": 0.21, + "grad_norm": 0.13671875, + "learning_rate": 4.8271274823245e-06, + "logits/chosen": -2.1413967609405518, + "logits/rejected": -2.343967914581299, + "logps/chosen": -51.91362762451172, + "logps/rejected": -1269.490478515625, + "loss": 0.012, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2962699830532074, + "rewards/margins": 11.855816841125488, + "rewards/rejected": -12.152085304260254, + "step": 830 + }, + { + "epoch": 0.21, + "grad_norm": 0.208984375, + "learning_rate": 4.8190739889416264e-06, + "logits/chosen": -2.1291534900665283, + "logits/rejected": -2.3538265228271484, + "logps/chosen": -51.05685043334961, + "logps/rejected": -1429.0635986328125, + "loss": 0.0046, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.28625136613845825, + "rewards/margins": 13.460894584655762, + "rewards/rejected": -13.74714469909668, + "step": 840 + }, + { + "epoch": 0.21, + "grad_norm": 0.06884765625, + "learning_rate": 4.810844183070553e-06, + "logits/chosen": -2.2312417030334473, + "logits/rejected": -2.45286226272583, + "logps/chosen": -65.88993072509766, + "logps/rejected": -1232.466064453125, + "loss": 0.005, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4434642195701599, + "rewards/margins": 11.373895645141602, + "rewards/rejected": -11.817359924316406, + "step": 850 + }, + { + "epoch": 0.21, + "grad_norm": 0.20703125, + "learning_rate": 4.802438690399622e-06, + "logits/chosen": -2.1778035163879395, + "logits/rejected": -2.4104442596435547, + "logps/chosen": -61.74702835083008, + "logps/rejected": -1364.88525390625, + "loss": 0.0092, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4024318754673004, + "rewards/margins": 12.702482223510742, + "rewards/rejected": -13.104913711547852, + "step": 860 + }, + { + "epoch": 0.22, + "grad_norm": 0.00174713134765625, + "learning_rate": 4.793858149974129e-06, + "logits/chosen": -2.142401933670044, + "logits/rejected": -2.3973865509033203, + "logps/chosen": -64.52376556396484, + "logps/rejected": -1546.420654296875, + "loss": 0.0062, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4257756769657135, + "rewards/margins": 14.488656997680664, + "rewards/rejected": -14.91443157196045, + "step": 870 + }, + { + "epoch": 0.22, + "grad_norm": 0.08544921875, + "learning_rate": 4.785103214147747e-06, + "logits/chosen": -2.2586052417755127, + "logits/rejected": -2.4925296306610107, + "logps/chosen": -58.837852478027344, + "logps/rejected": -1360.659423828125, + "loss": 0.0054, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3716233968734741, + "rewards/margins": 12.72169017791748, + "rewards/rejected": -13.093313217163086, + "step": 880 + }, + { + "epoch": 0.22, + "grad_norm": 0.0027008056640625, + "learning_rate": 4.776174548532926e-06, + "logits/chosen": -2.158493757247925, + "logits/rejected": -2.3726634979248047, + "logps/chosen": -60.35230255126953, + "logps/rejected": -1398.0426025390625, + "loss": 0.0055, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3876059055328369, + "rewards/margins": 13.057897567749023, + "rewards/rejected": -13.445503234863281, + "step": 890 + }, + { + "epoch": 0.22, + "grad_norm": 0.1650390625, + "learning_rate": 4.767072831950288e-06, + "logits/chosen": -2.205594539642334, + "logits/rejected": -2.447887420654297, + "logps/chosen": -58.41968536376953, + "logps/rejected": -1474.5992431640625, + "loss": 0.0035, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.36307448148727417, + "rewards/margins": 13.841937065124512, + "rewards/rejected": -14.205012321472168, + "step": 900 + }, + { + "epoch": 0.23, + "grad_norm": 0.05029296875, + "learning_rate": 4.7577987563770226e-06, + "logits/chosen": -2.0987536907196045, + "logits/rejected": -2.3415169715881348, + "logps/chosen": -63.24462890625, + "logps/rejected": -1456.5894775390625, + "loss": 0.0095, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.40772971510887146, + "rewards/margins": 13.594934463500977, + "rewards/rejected": -14.002664566040039, + "step": 910 + }, + { + "epoch": 0.23, + "grad_norm": 0.001953125, + "learning_rate": 4.748353026894273e-06, + "logits/chosen": -2.176764965057373, + "logits/rejected": -2.3934457302093506, + "logps/chosen": -77.48558044433594, + "logps/rejected": -1403.7001953125, + "loss": 0.0071, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5516784191131592, + "rewards/margins": 12.959416389465332, + "rewards/rejected": -13.51109504699707, + "step": 920 + }, + { + "epoch": 0.23, + "grad_norm": 0.01708984375, + "learning_rate": 4.738736361633532e-06, + "logits/chosen": -2.2761058807373047, + "logits/rejected": -2.475376605987549, + "logps/chosen": -69.98649597167969, + "logps/rejected": -1317.3599853515625, + "loss": 0.0068, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4822467267513275, + "rewards/margins": 12.17310619354248, + "rewards/rejected": -12.655353546142578, + "step": 930 + }, + { + "epoch": 0.23, + "grad_norm": 0.00396728515625, + "learning_rate": 4.728949491722046e-06, + "logits/chosen": -2.3034911155700684, + "logits/rejected": -2.5063111782073975, + "logps/chosen": -82.03058624267578, + "logps/rejected": -1236.0631103515625, + "loss": 0.0124, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6094164848327637, + "rewards/margins": 11.258821487426758, + "rewards/rejected": -11.868239402770996, + "step": 940 + }, + { + "epoch": 0.24, + "grad_norm": 0.154296875, + "learning_rate": 4.718993161227231e-06, + "logits/chosen": -2.156198740005493, + "logits/rejected": -2.4342427253723145, + "logps/chosen": -52.259849548339844, + "logps/rejected": -1551.4473876953125, + "loss": 0.0057, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2965083420276642, + "rewards/margins": 14.673095703125, + "rewards/rejected": -14.9696044921875, + "step": 950 + }, + { + "epoch": 0.24, + "grad_norm": 0.0185546875, + "learning_rate": 4.708868127100098e-06, + "logits/chosen": -2.225891351699829, + "logits/rejected": -2.446601629257202, + "logps/chosen": -45.30867385864258, + "logps/rejected": -1178.07958984375, + "loss": 0.0057, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.24066181480884552, + "rewards/margins": 11.030438423156738, + "rewards/rejected": -11.271100044250488, + "step": 960 + }, + { + "epoch": 0.24, + "grad_norm": 0.000530242919921875, + "learning_rate": 4.6985751591177075e-06, + "logits/chosen": -2.071913242340088, + "logits/rejected": -2.3076140880584717, + "logps/chosen": -40.99647521972656, + "logps/rejected": -1356.589599609375, + "loss": 0.006, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1879800409078598, + "rewards/margins": 12.83641529083252, + "rewards/rejected": -13.024395942687988, + "step": 970 + }, + { + "epoch": 0.24, + "grad_norm": 0.0279541015625, + "learning_rate": 4.688115039824648e-06, + "logits/chosen": -2.138272523880005, + "logits/rejected": -2.3490092754364014, + "logps/chosen": -39.265869140625, + "logps/rejected": -1271.980712890625, + "loss": 0.006, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.17672276496887207, + "rewards/margins": 12.01789665222168, + "rewards/rejected": -12.194620132446289, + "step": 980 + }, + { + "epoch": 0.25, + "grad_norm": 8.96453857421875e-05, + "learning_rate": 4.677488564473535e-06, + "logits/chosen": -2.0846240520477295, + "logits/rejected": -2.3261351585388184, + "logps/chosen": -54.3425178527832, + "logps/rejected": -1450.612548828125, + "loss": 0.004, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.32301291823387146, + "rewards/margins": 13.639193534851074, + "rewards/rejected": -13.96220588684082, + "step": 990 + }, + { + "epoch": 0.25, + "grad_norm": 0.08349609375, + "learning_rate": 4.666696540964556e-06, + "logits/chosen": -2.2266921997070312, + "logits/rejected": -2.44096040725708, + "logps/chosen": -60.16071701049805, + "logps/rejected": -1275.765380859375, + "loss": 0.0046, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3776375949382782, + "rewards/margins": 11.887288093566895, + "rewards/rejected": -12.264925003051758, + "step": 1000 + }, + { + "epoch": 0.25, + "eval_logits/chosen": -2.585369348526001, + "eval_logits/rejected": -2.6955134868621826, + "eval_logps/chosen": -101.94501495361328, + "eval_logps/rejected": -625.497314453125, + "eval_loss": 0.01159477885812521, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": -0.7603151202201843, + "eval_rewards/margins": 5.045938014984131, + "eval_rewards/rejected": -5.806252956390381, + "eval_runtime": 0.6566, + "eval_samples_per_second": 7.615, + "eval_steps_per_second": 4.569, + "step": 1000 + }, + { + "epoch": 0.25, + "grad_norm": 0.000972747802734375, + "learning_rate": 4.6557397897840454e-06, + "logits/chosen": -2.227430820465088, + "logits/rejected": -2.466034412384033, + "logps/chosen": -49.18635940551758, + "logps/rejected": -1351.74853515625, + "loss": 0.0138, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.27514129877090454, + "rewards/margins": 12.695302963256836, + "rewards/rejected": -12.970443725585938, + "step": 1010 + }, + { + "epoch": 0.25, + "grad_norm": 0.0011138916015625, + "learning_rate": 4.644619143942108e-06, + "logits/chosen": -2.2175045013427734, + "logits/rejected": -2.4644241333007812, + "logps/chosen": -38.532127380371094, + "logps/rejected": -1415.847412109375, + "loss": 0.003, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.16876588761806488, + "rewards/margins": 13.413320541381836, + "rewards/rejected": -13.582087516784668, + "step": 1020 + }, + { + "epoch": 0.26, + "grad_norm": 0.08740234375, + "learning_rate": 4.633335448909284e-06, + "logits/chosen": -2.0612175464630127, + "logits/rejected": -2.274484157562256, + "logps/chosen": -39.870052337646484, + "logps/rejected": -1330.5323486328125, + "loss": 0.0046, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.17578956484794617, + "rewards/margins": 12.581718444824219, + "rewards/rejected": -12.75750732421875, + "step": 1030 + }, + { + "epoch": 0.26, + "grad_norm": 0.09912109375, + "learning_rate": 4.621889562552272e-06, + "logits/chosen": -2.163442850112915, + "logits/rejected": -2.4233555793762207, + "logps/chosen": -62.47473907470703, + "logps/rejected": -1504.832763671875, + "loss": 0.0076, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.40125980973243713, + "rewards/margins": 14.0916748046875, + "rewards/rejected": -14.492935180664062, + "step": 1040 + }, + { + "epoch": 0.26, + "grad_norm": 0.004669189453125, + "learning_rate": 4.610282355068707e-06, + "logits/chosen": -2.2863821983337402, + "logits/rejected": -2.5355706214904785, + "logps/chosen": -59.4514274597168, + "logps/rejected": -1562.39013671875, + "loss": 0.0042, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.36575835943222046, + "rewards/margins": 14.687586784362793, + "rewards/rejected": -15.053342819213867, + "step": 1050 + }, + { + "epoch": 0.26, + "grad_norm": 0.64453125, + "learning_rate": 4.598514708921006e-06, + "logits/chosen": -2.262545108795166, + "logits/rejected": -2.510559558868408, + "logps/chosen": -55.71985626220703, + "logps/rejected": -1498.1640625, + "loss": 0.0066, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3440173268318176, + "rewards/margins": 14.074090957641602, + "rewards/rejected": -14.418106079101562, + "step": 1060 + }, + { + "epoch": 0.27, + "grad_norm": 0.00142669677734375, + "learning_rate": 4.5865875187692695e-06, + "logits/chosen": -2.2046749591827393, + "logits/rejected": -2.423334836959839, + "logps/chosen": -48.60809326171875, + "logps/rejected": -1244.1680908203125, + "loss": 0.0079, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2774764597415924, + "rewards/margins": 11.670097351074219, + "rewards/rejected": -11.9475736618042, + "step": 1070 + }, + { + "epoch": 0.27, + "grad_norm": 0.041015625, + "learning_rate": 4.57450169140327e-06, + "logits/chosen": -2.0672097206115723, + "logits/rejected": -2.3301241397857666, + "logps/chosen": -47.35566711425781, + "logps/rejected": -1535.633544921875, + "loss": 0.0036, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.25116461515426636, + "rewards/margins": 14.5623779296875, + "rewards/rejected": -14.813543319702148, + "step": 1080 + }, + { + "epoch": 0.27, + "grad_norm": 0.0615234375, + "learning_rate": 4.562258145673507e-06, + "logits/chosen": -2.2260966300964355, + "logits/rejected": -2.4950501918792725, + "logps/chosen": -40.86091613769531, + "logps/rejected": -1499.596435546875, + "loss": 0.002, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.18965426087379456, + "rewards/margins": 14.258898735046387, + "rewards/rejected": -14.448553085327148, + "step": 1090 + }, + { + "epoch": 0.27, + "grad_norm": 0.01068115234375, + "learning_rate": 4.549857812421353e-06, + "logits/chosen": -2.14607572555542, + "logits/rejected": -2.3866307735443115, + "logps/chosen": -44.410030364990234, + "logps/rejected": -1332.9017333984375, + "loss": 0.003, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.23083043098449707, + "rewards/margins": 12.585546493530273, + "rewards/rejected": -12.816377639770508, + "step": 1100 + }, + { + "epoch": 0.28, + "grad_norm": 0.040283203125, + "learning_rate": 4.537301634408281e-06, + "logits/chosen": -2.169417142868042, + "logits/rejected": -2.4057748317718506, + "logps/chosen": -44.095577239990234, + "logps/rejected": -1315.9925537109375, + "loss": 0.0055, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2211724817752838, + "rewards/margins": 12.42273235321045, + "rewards/rejected": -12.643904685974121, + "step": 1110 + }, + { + "epoch": 0.28, + "grad_norm": 0.11376953125, + "learning_rate": 4.52459056624419e-06, + "logits/chosen": -2.217676877975464, + "logits/rejected": -2.4193835258483887, + "logps/chosen": -46.805503845214844, + "logps/rejected": -1376.5738525390625, + "loss": 0.0054, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2504929006099701, + "rewards/margins": 12.966726303100586, + "rewards/rejected": -13.217218399047852, + "step": 1120 + }, + { + "epoch": 0.28, + "grad_norm": 0.12451171875, + "learning_rate": 4.51172557431483e-06, + "logits/chosen": -2.1065962314605713, + "logits/rejected": -2.3267951011657715, + "logps/chosen": -61.67560958862305, + "logps/rejected": -1427.5928955078125, + "loss": 0.0055, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.39770394563674927, + "rewards/margins": 13.320897102355957, + "rewards/rejected": -13.718599319458008, + "step": 1130 + }, + { + "epoch": 0.28, + "grad_norm": 0.06494140625, + "learning_rate": 4.49870763670833e-06, + "logits/chosen": -2.1609268188476562, + "logits/rejected": -2.4237403869628906, + "logps/chosen": -55.2051887512207, + "logps/rejected": -1529.240966796875, + "loss": 0.0019, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3337005376815796, + "rewards/margins": 14.417539596557617, + "rewards/rejected": -14.751240730285645, + "step": 1140 + }, + { + "epoch": 0.29, + "grad_norm": 0.1953125, + "learning_rate": 4.4855377431408335e-06, + "logits/chosen": -2.152674436569214, + "logits/rejected": -2.3682188987731934, + "logps/chosen": -57.719703674316406, + "logps/rejected": -1428.2706298828125, + "loss": 0.0041, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3568641245365143, + "rewards/margins": 13.377996444702148, + "rewards/rejected": -13.73486042022705, + "step": 1150 + }, + { + "epoch": 0.29, + "grad_norm": 0.0169677734375, + "learning_rate": 4.472216894881261e-06, + "logits/chosen": -2.146556854248047, + "logits/rejected": -2.361703872680664, + "logps/chosen": -56.385284423828125, + "logps/rejected": -1308.488525390625, + "loss": 0.0046, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.34260934591293335, + "rewards/margins": 12.229646682739258, + "rewards/rejected": -12.572256088256836, + "step": 1160 + }, + { + "epoch": 0.29, + "grad_norm": 0.0908203125, + "learning_rate": 4.4587461046751815e-06, + "logits/chosen": -2.1846487522125244, + "logits/rejected": -2.4170939922332764, + "logps/chosen": -47.7278938293457, + "logps/rejected": -1271.083740234375, + "loss": 0.0092, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2519899308681488, + "rewards/margins": 11.94546890258789, + "rewards/rejected": -12.197460174560547, + "step": 1170 + }, + { + "epoch": 0.29, + "grad_norm": 0.0247802734375, + "learning_rate": 4.44512639666781e-06, + "logits/chosen": -2.1769089698791504, + "logits/rejected": -2.394580602645874, + "logps/chosen": -61.13446044921875, + "logps/rejected": -1223.753662109375, + "loss": 0.0085, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3933146893978119, + "rewards/margins": 11.356694221496582, + "rewards/rejected": -11.75001049041748, + "step": 1180 + }, + { + "epoch": 0.3, + "grad_norm": 0.390625, + "learning_rate": 4.431358806326158e-06, + "logits/chosen": -2.1201298236846924, + "logits/rejected": -2.3456811904907227, + "logps/chosen": -81.98688507080078, + "logps/rejected": -1611.583984375, + "loss": 0.0084, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6020825505256653, + "rewards/margins": 14.962076187133789, + "rewards/rejected": -15.56415843963623, + "step": 1190 + }, + { + "epoch": 0.3, + "grad_norm": 0.609375, + "learning_rate": 4.4174443803603e-06, + "logits/chosen": -2.204873561859131, + "logits/rejected": -2.4108097553253174, + "logps/chosen": -82.76813507080078, + "logps/rejected": -1430.592041015625, + "loss": 0.0083, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6107802391052246, + "rewards/margins": 13.188611030578613, + "rewards/rejected": -13.79939079284668, + "step": 1200 + }, + { + "epoch": 0.3, + "grad_norm": 0.123046875, + "learning_rate": 4.4033841766438e-06, + "logits/chosen": -2.178987503051758, + "logits/rejected": -2.39570689201355, + "logps/chosen": -57.776702880859375, + "logps/rejected": -1284.997802734375, + "loss": 0.0118, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3557528853416443, + "rewards/margins": 11.97689151763916, + "rewards/rejected": -12.33264446258545, + "step": 1210 + }, + { + "epoch": 0.3, + "grad_norm": 0.004180908203125, + "learning_rate": 4.389179264133281e-06, + "logits/chosen": -2.260874032974243, + "logits/rejected": -2.495485305786133, + "logps/chosen": -35.43501663208008, + "logps/rejected": -1262.712890625, + "loss": 0.0032, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1415407359600067, + "rewards/margins": 11.985517501831055, + "rewards/rejected": -12.127059936523438, + "step": 1220 + }, + { + "epoch": 0.31, + "grad_norm": 0.337890625, + "learning_rate": 4.374830722787159e-06, + "logits/chosen": -2.265794277191162, + "logits/rejected": -2.539062976837158, + "logps/chosen": -40.56992721557617, + "logps/rejected": -1323.746826171875, + "loss": 0.0105, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.184851735830307, + "rewards/margins": 12.533957481384277, + "rewards/rejected": -12.718809127807617, + "step": 1230 + }, + { + "epoch": 0.31, + "grad_norm": 0.10986328125, + "learning_rate": 4.360339643483533e-06, + "logits/chosen": -2.2265820503234863, + "logits/rejected": -2.4537243843078613, + "logps/chosen": -40.92462921142578, + "logps/rejected": -1421.2384033203125, + "loss": 0.0068, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.19266225397586823, + "rewards/margins": 13.46106243133545, + "rewards/rejected": -13.653724670410156, + "step": 1240 + }, + { + "epoch": 0.31, + "grad_norm": 0.00179290771484375, + "learning_rate": 4.345707127937253e-06, + "logits/chosen": -2.136321544647217, + "logits/rejected": -2.4158737659454346, + "logps/chosen": -47.67406463623047, + "logps/rejected": -1579.249267578125, + "loss": 0.002, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2534434497356415, + "rewards/margins": 15.00439453125, + "rewards/rejected": -15.257838249206543, + "step": 1250 + }, + { + "epoch": 0.31, + "grad_norm": 0.060302734375, + "learning_rate": 4.330934288616154e-06, + "logits/chosen": -2.168765068054199, + "logits/rejected": -2.4067187309265137, + "logps/chosen": -62.91276931762695, + "logps/rejected": -1362.3446044921875, + "loss": 0.003, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.40555182099342346, + "rewards/margins": 12.703886032104492, + "rewards/rejected": -13.10943603515625, + "step": 1260 + }, + { + "epoch": 0.32, + "grad_norm": 0.150390625, + "learning_rate": 4.316022248656485e-06, + "logits/chosen": -2.1002354621887207, + "logits/rejected": -2.365851402282715, + "logps/chosen": -53.953285217285156, + "logps/rejected": -1585.8782958984375, + "loss": 0.0045, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.32181739807128906, + "rewards/margins": 14.933857917785645, + "rewards/rejected": -15.255674362182617, + "step": 1270 + }, + { + "epoch": 0.32, + "grad_norm": 0.024658203125, + "learning_rate": 4.3009721417775166e-06, + "logits/chosen": -2.1251707077026367, + "logits/rejected": -2.363041639328003, + "logps/chosen": -58.41363525390625, + "logps/rejected": -1543.182861328125, + "loss": 0.0032, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.36327052116394043, + "rewards/margins": 14.513801574707031, + "rewards/rejected": -14.87707233428955, + "step": 1280 + }, + { + "epoch": 0.32, + "grad_norm": 0.0230712890625, + "learning_rate": 4.285785112195346e-06, + "logits/chosen": -2.1945090293884277, + "logits/rejected": -2.4488844871520996, + "logps/chosen": -69.85707092285156, + "logps/rejected": -1662.5419921875, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.47537103295326233, + "rewards/margins": 15.588732719421387, + "rewards/rejected": -16.064102172851562, + "step": 1290 + }, + { + "epoch": 0.32, + "grad_norm": 0.035888671875, + "learning_rate": 4.27046231453591e-06, + "logits/chosen": -2.1391608715057373, + "logits/rejected": -2.379563808441162, + "logps/chosen": -63.22686004638672, + "logps/rejected": -1555.8231201171875, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.41675299406051636, + "rewards/margins": 14.574475288391113, + "rewards/rejected": -14.991228103637695, + "step": 1300 + }, + { + "epoch": 0.33, + "grad_norm": 2.682209014892578e-06, + "learning_rate": 4.255004913747196e-06, + "logits/chosen": -2.1814258098602295, + "logits/rejected": -2.415797710418701, + "logps/chosen": -57.22446823120117, + "logps/rejected": -1578.1937255859375, + "loss": 0.0042, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.35579347610473633, + "rewards/margins": 14.885470390319824, + "rewards/rejected": -15.241262435913086, + "step": 1310 + }, + { + "epoch": 0.33, + "grad_norm": 0.0059814453125, + "learning_rate": 4.2394140850106825e-06, + "logits/chosen": -2.1057560443878174, + "logits/rejected": -2.3444247245788574, + "logps/chosen": -60.09722900390625, + "logps/rejected": -1566.6488037109375, + "loss": 0.0071, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.37801143527030945, + "rewards/margins": 14.739140510559082, + "rewards/rejected": -15.117152214050293, + "step": 1320 + }, + { + "epoch": 0.33, + "grad_norm": 0.00049591064453125, + "learning_rate": 4.223691013651986e-06, + "logits/chosen": -2.145397424697876, + "logits/rejected": -2.3859896659851074, + "logps/chosen": -50.876380920410156, + "logps/rejected": -1696.628173828125, + "loss": 0.0018, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.28237268328666687, + "rewards/margins": 16.069416046142578, + "rewards/rejected": -16.351787567138672, + "step": 1330 + }, + { + "epoch": 0.33, + "grad_norm": 0.076171875, + "learning_rate": 4.207836895050748e-06, + "logits/chosen": -2.290546178817749, + "logits/rejected": -2.601999282836914, + "logps/chosen": -49.566925048828125, + "logps/rejected": -1810.896240234375, + "loss": 0.0039, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.27409371733665466, + "rewards/margins": 17.24148178100586, + "rewards/rejected": -17.515573501586914, + "step": 1340 + }, + { + "epoch": 0.34, + "grad_norm": 0.23046875, + "learning_rate": 4.1918529345497525e-06, + "logits/chosen": -2.2135214805603027, + "logits/rejected": -2.4138569831848145, + "logps/chosen": -51.47322463989258, + "logps/rejected": -1197.149169921875, + "loss": 0.0063, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2979530692100525, + "rewards/margins": 11.184531211853027, + "rewards/rejected": -11.482483863830566, + "step": 1350 + }, + { + "epoch": 0.34, + "grad_norm": 0.22265625, + "learning_rate": 4.175740347363289e-06, + "logits/chosen": -2.2823052406311035, + "logits/rejected": -2.500483989715576, + "logps/chosen": -50.924964904785156, + "logps/rejected": -1341.169189453125, + "loss": 0.006, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2940273880958557, + "rewards/margins": 12.595098495483398, + "rewards/rejected": -12.889126777648926, + "step": 1360 + }, + { + "epoch": 0.34, + "grad_norm": 0.001129150390625, + "learning_rate": 4.159500358484759e-06, + "logits/chosen": -2.1221683025360107, + "logits/rejected": -2.388002872467041, + "logps/chosen": -52.10107421875, + "logps/rejected": -1701.734375, + "loss": 0.0032, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2977118492126465, + "rewards/margins": 16.128461837768555, + "rewards/rejected": -16.42617416381836, + "step": 1370 + }, + { + "epoch": 0.34, + "grad_norm": 0.26171875, + "learning_rate": 4.143134202593549e-06, + "logits/chosen": -2.1562037467956543, + "logits/rejected": -2.3721659183502197, + "logps/chosen": -50.73106002807617, + "logps/rejected": -1416.411376953125, + "loss": 0.0053, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.28152209520339966, + "rewards/margins": 13.32117748260498, + "rewards/rejected": -13.602702140808105, + "step": 1380 + }, + { + "epoch": 0.35, + "grad_norm": 0.00372314453125, + "learning_rate": 4.126643123961158e-06, + "logits/chosen": -2.2438769340515137, + "logits/rejected": -2.4929661750793457, + "logps/chosen": -71.16793060302734, + "logps/rejected": -1686.4351806640625, + "loss": 0.0042, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4894431233406067, + "rewards/margins": 15.813896179199219, + "rewards/rejected": -16.303340911865234, + "step": 1390 + }, + { + "epoch": 0.35, + "grad_norm": 0.03466796875, + "learning_rate": 4.110028376356599e-06, + "logits/chosen": -2.222071647644043, + "logits/rejected": -2.447359323501587, + "logps/chosen": -70.91515350341797, + "logps/rejected": -1337.4664306640625, + "loss": 0.0046, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4935643672943115, + "rewards/margins": 12.358075141906738, + "rewards/rejected": -12.851638793945312, + "step": 1400 + }, + { + "epoch": 0.35, + "grad_norm": 0.0791015625, + "learning_rate": 4.093291222951079e-06, + "logits/chosen": -2.1609065532684326, + "logits/rejected": -2.4100985527038574, + "logps/chosen": -71.06592559814453, + "logps/rejected": -1599.0948486328125, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.49402037262916565, + "rewards/margins": 14.944366455078125, + "rewards/rejected": -15.438386917114258, + "step": 1410 + }, + { + "epoch": 0.35, + "grad_norm": 0.234375, + "learning_rate": 4.076432936221965e-06, + "logits/chosen": -2.1633338928222656, + "logits/rejected": -2.3718645572662354, + "logps/chosen": -76.24402618408203, + "logps/rejected": -1331.0867919921875, + "loss": 0.0082, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5511754751205444, + "rewards/margins": 12.27659797668457, + "rewards/rejected": -12.827774047851562, + "step": 1420 + }, + { + "epoch": 0.36, + "grad_norm": 0.07177734375, + "learning_rate": 4.059454797856039e-06, + "logits/chosen": -2.200438976287842, + "logits/rejected": -2.4105000495910645, + "logps/chosen": -72.47054290771484, + "logps/rejected": -1285.55029296875, + "loss": 0.0082, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5113335251808167, + "rewards/margins": 11.851224899291992, + "rewards/rejected": -12.362558364868164, + "step": 1430 + }, + { + "epoch": 0.36, + "grad_norm": 0.0015869140625, + "learning_rate": 4.042358098652057e-06, + "logits/chosen": -2.257859468460083, + "logits/rejected": -2.485215187072754, + "logps/chosen": -52.50494384765625, + "logps/rejected": -1284.864990234375, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.30147919058799744, + "rewards/margins": 12.056472778320312, + "rewards/rejected": -12.357951164245605, + "step": 1440 + }, + { + "epoch": 0.36, + "grad_norm": 0.00494384765625, + "learning_rate": 4.025144138422615e-06, + "logits/chosen": -2.1999363899230957, + "logits/rejected": -2.436066150665283, + "logps/chosen": -60.535972595214844, + "logps/rejected": -1517.8021240234375, + "loss": 0.0082, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3815072178840637, + "rewards/margins": 14.274576187133789, + "rewards/rejected": -14.656084060668945, + "step": 1450 + }, + { + "epoch": 0.36, + "grad_norm": 0.048583984375, + "learning_rate": 4.007814225895321e-06, + "logits/chosen": -2.1949074268341064, + "logits/rejected": -2.453916549682617, + "logps/chosen": -40.10565948486328, + "logps/rejected": -1380.6016845703125, + "loss": 0.0047, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.184524804353714, + "rewards/margins": 13.0862398147583, + "rewards/rejected": -13.270764350891113, + "step": 1460 + }, + { + "epoch": 0.37, + "grad_norm": 0.0703125, + "learning_rate": 3.990369678613303e-06, + "logits/chosen": -2.1046247482299805, + "logits/rejected": -2.339478015899658, + "logps/chosen": -32.17253875732422, + "logps/rejected": -1487.5965576171875, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1069754809141159, + "rewards/margins": 14.181404113769531, + "rewards/rejected": -14.288378715515137, + "step": 1470 + }, + { + "epoch": 0.37, + "grad_norm": 0.0157470703125, + "learning_rate": 3.97281182283504e-06, + "logits/chosen": -2.168814182281494, + "logits/rejected": -2.4204602241516113, + "logps/chosen": -33.689884185791016, + "logps/rejected": -1507.740966796875, + "loss": 0.0059, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11984201520681381, + "rewards/margins": 14.405647277832031, + "rewards/rejected": -14.525489807128906, + "step": 1480 + }, + { + "epoch": 0.37, + "grad_norm": 0.3828125, + "learning_rate": 3.955141993433526e-06, + "logits/chosen": -2.2266287803649902, + "logits/rejected": -2.45817494392395, + "logps/chosen": -52.63502883911133, + "logps/rejected": -1366.678955078125, + "loss": 0.0046, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3042375445365906, + "rewards/margins": 12.845235824584961, + "rewards/rejected": -13.14947509765625, + "step": 1490 + }, + { + "epoch": 0.37, + "grad_norm": 0.08544921875, + "learning_rate": 3.937361533794784e-06, + "logits/chosen": -2.156094551086426, + "logits/rejected": -2.3926451206207275, + "logps/chosen": -44.07966232299805, + "logps/rejected": -1358.091064453125, + "loss": 0.0064, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.22154085338115692, + "rewards/margins": 12.819009780883789, + "rewards/rejected": -13.040552139282227, + "step": 1500 + }, + { + "epoch": 0.38, + "grad_norm": 0.021484375, + "learning_rate": 3.919471795715738e-06, + "logits/chosen": -2.212313652038574, + "logits/rejected": -2.4430899620056152, + "logps/chosen": -40.03847122192383, + "logps/rejected": -1265.60009765625, + "loss": 0.0025, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1857212483882904, + "rewards/margins": 11.979241371154785, + "rewards/rejected": -12.164961814880371, + "step": 1510 + }, + { + "epoch": 0.38, + "grad_norm": 0.150390625, + "learning_rate": 3.901474139301433e-06, + "logits/chosen": -2.100083112716675, + "logits/rejected": -2.327531337738037, + "logps/chosen": -47.98102569580078, + "logps/rejected": -1396.822021484375, + "loss": 0.0042, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2628587484359741, + "rewards/margins": 13.183023452758789, + "rewards/rejected": -13.445881843566895, + "step": 1520 + }, + { + "epoch": 0.38, + "grad_norm": 0.08740234375, + "learning_rate": 3.883369932861634e-06, + "logits/chosen": -2.2499475479125977, + "logits/rejected": -2.4626846313476562, + "logps/chosen": -53.71254348754883, + "logps/rejected": -1261.4793701171875, + "loss": 0.0016, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.31738823652267456, + "rewards/margins": 11.826452255249023, + "rewards/rejected": -12.143839836120605, + "step": 1530 + }, + { + "epoch": 0.38, + "grad_norm": 0.000232696533203125, + "learning_rate": 3.865160552806796e-06, + "logits/chosen": -2.293903350830078, + "logits/rejected": -2.5309927463531494, + "logps/chosen": -59.31644821166992, + "logps/rejected": -1348.3590087890625, + "loss": 0.0049, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.37211018800735474, + "rewards/margins": 12.622639656066895, + "rewards/rejected": -12.994749069213867, + "step": 1540 + }, + { + "epoch": 0.39, + "grad_norm": 0.0002765655517578125, + "learning_rate": 3.84684738354342e-06, + "logits/chosen": -2.301741361618042, + "logits/rejected": -2.5269277095794678, + "logps/chosen": -35.07439422607422, + "logps/rejected": -1298.1329345703125, + "loss": 0.0081, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1266619861125946, + "rewards/margins": 12.352213859558105, + "rewards/rejected": -12.478874206542969, + "step": 1550 + }, + { + "epoch": 0.39, + "grad_norm": 0.0712890625, + "learning_rate": 3.828431817368798e-06, + "logits/chosen": -2.15970778465271, + "logits/rejected": -2.3912577629089355, + "logps/chosen": -23.495868682861328, + "logps/rejected": -1347.333740234375, + "loss": 0.0055, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02085093781352043, + "rewards/margins": 12.912274360656738, + "rewards/rejected": -12.933123588562012, + "step": 1560 + }, + { + "epoch": 0.39, + "grad_norm": 0.421875, + "learning_rate": 3.8099152543651684e-06, + "logits/chosen": -2.3851158618927, + "logits/rejected": -2.659996509552002, + "logps/chosen": -34.04401397705078, + "logps/rejected": -1443.980712890625, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12582853436470032, + "rewards/margins": 13.780733108520508, + "rewards/rejected": -13.906559944152832, + "step": 1570 + }, + { + "epoch": 0.39, + "grad_norm": 0.154296875, + "learning_rate": 3.791299102293261e-06, + "logits/chosen": -2.125797748565674, + "logits/rejected": -2.3718996047973633, + "logps/chosen": -31.654226303100586, + "logps/rejected": -1515.550048828125, + "loss": 0.0037, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0940675288438797, + "rewards/margins": 14.493083000183105, + "rewards/rejected": -14.587150573730469, + "step": 1580 + }, + { + "epoch": 0.4, + "grad_norm": 0.076171875, + "learning_rate": 3.7725847764852774e-06, + "logits/chosen": -2.117516040802002, + "logits/rejected": -2.376412868499756, + "logps/chosen": -33.58929443359375, + "logps/rejected": -1522.32470703125, + "loss": 0.0054, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11228612810373306, + "rewards/margins": 14.516647338867188, + "rewards/rejected": -14.628933906555176, + "step": 1590 + }, + { + "epoch": 0.4, + "grad_norm": 0.051025390625, + "learning_rate": 3.7537736997372833e-06, + "logits/chosen": -2.183899402618408, + "logits/rejected": -2.4056055545806885, + "logps/chosen": -38.9683723449707, + "logps/rejected": -1303.519287109375, + "loss": 0.0041, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.17212000489234924, + "rewards/margins": 12.313672065734863, + "rewards/rejected": -12.485791206359863, + "step": 1600 + }, + { + "epoch": 0.4, + "grad_norm": 0.00689697265625, + "learning_rate": 3.734867302201038e-06, + "logits/chosen": -2.2842166423797607, + "logits/rejected": -2.4898123741149902, + "logps/chosen": -38.427486419677734, + "logps/rejected": -1249.1448974609375, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.17140671610832214, + "rewards/margins": 11.829398155212402, + "rewards/rejected": -12.000804901123047, + "step": 1610 + }, + { + "epoch": 0.4, + "grad_norm": 0.1435546875, + "learning_rate": 3.7158670212752666e-06, + "logits/chosen": -2.1761648654937744, + "logits/rejected": -2.4285309314727783, + "logps/chosen": -43.9667854309082, + "logps/rejected": -1409.6014404296875, + "loss": 0.0068, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.21925847232341766, + "rewards/margins": 13.351308822631836, + "rewards/rejected": -13.570569038391113, + "step": 1620 + }, + { + "epoch": 0.41, + "grad_norm": 0.028076171875, + "learning_rate": 3.696774301496376e-06, + "logits/chosen": -2.253307342529297, + "logits/rejected": -2.4998929500579834, + "logps/chosen": -39.94139862060547, + "logps/rejected": -1315.309814453125, + "loss": 0.0016, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1798352301120758, + "rewards/margins": 12.484976768493652, + "rewards/rejected": -12.664812088012695, + "step": 1630 + }, + { + "epoch": 0.41, + "grad_norm": 0.0152587890625, + "learning_rate": 3.677590594428629e-06, + "logits/chosen": -2.187530517578125, + "logits/rejected": -2.411306142807007, + "logps/chosen": -46.19135284423828, + "logps/rejected": -1337.900390625, + "loss": 0.0063, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2449018955230713, + "rewards/margins": 12.619891166687012, + "rewards/rejected": -12.86479377746582, + "step": 1640 + }, + { + "epoch": 0.41, + "grad_norm": 0.0035400390625, + "learning_rate": 3.658317358553794e-06, + "logits/chosen": -2.1583094596862793, + "logits/rejected": -2.399893045425415, + "logps/chosen": -42.413978576660156, + "logps/rejected": -1464.1385498046875, + "loss": 0.0021, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.20776596665382385, + "rewards/margins": 13.914667129516602, + "rewards/rejected": -14.122431755065918, + "step": 1650 + }, + { + "epoch": 0.41, + "grad_norm": 0.1435546875, + "learning_rate": 3.638956059160252e-06, + "logits/chosen": -2.2085630893707275, + "logits/rejected": -2.465798854827881, + "logps/chosen": -51.00899887084961, + "logps/rejected": -1475.9312744140625, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.28529030084609985, + "rewards/margins": 13.978610038757324, + "rewards/rejected": -14.26390266418457, + "step": 1660 + }, + { + "epoch": 0.42, + "grad_norm": 0.05029296875, + "learning_rate": 3.6195081682315972e-06, + "logits/chosen": -2.2395682334899902, + "logits/rejected": -2.461138963699341, + "logps/chosen": -52.74529266357422, + "logps/rejected": -1418.346923828125, + "loss": 0.0021, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3086877167224884, + "rewards/margins": 13.38988208770752, + "rewards/rejected": -13.698568344116211, + "step": 1670 + }, + { + "epoch": 0.42, + "grad_norm": 0.08544921875, + "learning_rate": 3.5999751643347342e-06, + "logits/chosen": -2.16579008102417, + "logits/rejected": -2.4046080112457275, + "logps/chosen": -46.71515655517578, + "logps/rejected": -1608.938232421875, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.24464385211467743, + "rewards/margins": 15.266571044921875, + "rewards/rejected": -15.51121711730957, + "step": 1680 + }, + { + "epoch": 0.42, + "grad_norm": 0.1923828125, + "learning_rate": 3.5803585325074536e-06, + "logits/chosen": -2.1881327629089355, + "logits/rejected": -2.427145481109619, + "logps/chosen": -37.16319274902344, + "logps/rejected": -1421.040771484375, + "loss": 0.0042, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.15972432494163513, + "rewards/margins": 13.539385795593262, + "rewards/rejected": -13.69911003112793, + "step": 1690 + }, + { + "epoch": 0.42, + "grad_norm": 0.07275390625, + "learning_rate": 3.5606597641455387e-06, + "logits/chosen": -2.219900369644165, + "logits/rejected": -2.4398694038391113, + "logps/chosen": -32.802005767822266, + "logps/rejected": -1393.4263916015625, + "loss": 0.0058, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11474724858999252, + "rewards/margins": 13.298954963684082, + "rewards/rejected": -13.413702011108398, + "step": 1700 + }, + { + "epoch": 0.43, + "grad_norm": 0.130859375, + "learning_rate": 3.540880356889376e-06, + "logits/chosen": -2.23069429397583, + "logits/rejected": -2.4424965381622314, + "logps/chosen": -42.188209533691406, + "logps/rejected": -1353.078857421875, + "loss": 0.0042, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2047566920518875, + "rewards/margins": 12.800407409667969, + "rewards/rejected": -13.005162239074707, + "step": 1710 + }, + { + "epoch": 0.43, + "grad_norm": 0.326171875, + "learning_rate": 3.5210218145100934e-06, + "logits/chosen": -2.1350436210632324, + "logits/rejected": -2.3985211849212646, + "logps/chosen": -51.05349349975586, + "logps/rejected": -1367.61767578125, + "loss": 0.0079, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.29429805278778076, + "rewards/margins": 12.873303413391113, + "rewards/rejected": -13.167600631713867, + "step": 1720 + }, + { + "epoch": 0.43, + "grad_norm": 0.134765625, + "learning_rate": 3.5010856467952335e-06, + "logits/chosen": -2.1528429985046387, + "logits/rejected": -2.3955628871917725, + "logps/chosen": -42.9320068359375, + "logps/rejected": -1482.957763671875, + "loss": 0.0051, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.20944638550281525, + "rewards/margins": 14.0554780960083, + "rewards/rejected": -14.264923095703125, + "step": 1730 + }, + { + "epoch": 0.43, + "grad_norm": 0.58984375, + "learning_rate": 3.4810733694339687e-06, + "logits/chosen": -2.2495784759521484, + "logits/rejected": -2.512760639190674, + "logps/chosen": -57.50274658203125, + "logps/rejected": -1577.8023681640625, + "loss": 0.0043, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3561248183250427, + "rewards/margins": 14.898828506469727, + "rewards/rejected": -15.25495433807373, + "step": 1740 + }, + { + "epoch": 0.44, + "grad_norm": 0.1435546875, + "learning_rate": 3.4609865039018676e-06, + "logits/chosen": -2.2507643699645996, + "logits/rejected": -2.475839614868164, + "logps/chosen": -41.08405685424805, + "logps/rejected": -1401.9703369140625, + "loss": 0.0048, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.20163026452064514, + "rewards/margins": 13.3068265914917, + "rewards/rejected": -13.508456230163574, + "step": 1750 + }, + { + "epoch": 0.44, + "grad_norm": 0.044677734375, + "learning_rate": 3.4408265773452226e-06, + "logits/chosen": -2.1668903827667236, + "logits/rejected": -2.4009640216827393, + "logps/chosen": -43.23725891113281, + "logps/rejected": -1427.717041015625, + "loss": 0.006, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.21243014931678772, + "rewards/margins": 13.541119575500488, + "rewards/rejected": -13.753549575805664, + "step": 1760 + }, + { + "epoch": 0.44, + "grad_norm": 0.0021514892578125, + "learning_rate": 3.420595122464942e-06, + "logits/chosen": -2.2544631958007812, + "logits/rejected": -2.4994306564331055, + "logps/chosen": -50.723716735839844, + "logps/rejected": -1400.42236328125, + "loss": 0.0032, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2909182906150818, + "rewards/margins": 13.212321281433105, + "rewards/rejected": -13.503240585327148, + "step": 1770 + }, + { + "epoch": 0.44, + "grad_norm": 0.1767578125, + "learning_rate": 3.4002936774000284e-06, + "logits/chosen": -2.1552722454071045, + "logits/rejected": -2.4494900703430176, + "logps/chosen": -53.8035888671875, + "logps/rejected": -1743.7855224609375, + "loss": 0.0018, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.31691521406173706, + "rewards/margins": 16.568302154541016, + "rewards/rejected": -16.885215759277344, + "step": 1780 + }, + { + "epoch": 0.45, + "grad_norm": 0.04296875, + "learning_rate": 3.3799237856106348e-06, + "logits/chosen": -2.1529643535614014, + "logits/rejected": -2.4126904010772705, + "logps/chosen": -55.90287399291992, + "logps/rejected": -1550.77783203125, + "loss": 0.0024, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.34558922052383423, + "rewards/margins": 14.627325057983398, + "rewards/rejected": -14.97291374206543, + "step": 1790 + }, + { + "epoch": 0.45, + "grad_norm": 0.004913330078125, + "learning_rate": 3.35948699576072e-06, + "logits/chosen": -2.108168363571167, + "logits/rejected": -2.371859550476074, + "logps/chosen": -63.180198669433594, + "logps/rejected": -1683.2808837890625, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.40645408630371094, + "rewards/margins": 15.862528800964355, + "rewards/rejected": -16.268983840942383, + "step": 1800 + }, + { + "epoch": 0.45, + "grad_norm": 0.37890625, + "learning_rate": 3.3389848616003085e-06, + "logits/chosen": -2.202070951461792, + "logits/rejected": -2.4270646572113037, + "logps/chosen": -47.17142105102539, + "logps/rejected": -1418.73046875, + "loss": 0.0088, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.25016140937805176, + "rewards/margins": 13.404383659362793, + "rewards/rejected": -13.654545783996582, + "step": 1810 + }, + { + "epoch": 0.45, + "grad_norm": 0.054443359375, + "learning_rate": 3.3184189418473674e-06, + "logits/chosen": -2.0919992923736572, + "logits/rejected": -2.3279192447662354, + "logps/chosen": -37.22324752807617, + "logps/rejected": -1371.5806884765625, + "loss": 0.0047, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.15740033984184265, + "rewards/margins": 13.067277908325195, + "rewards/rejected": -13.224676132202148, + "step": 1820 + }, + { + "epoch": 0.46, + "grad_norm": 0.0159912109375, + "learning_rate": 3.2977908000692925e-06, + "logits/chosen": -2.1699509620666504, + "logits/rejected": -2.4078266620635986, + "logps/chosen": -46.939552307128906, + "logps/rejected": -1496.64501953125, + "loss": 0.0078, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.24631306529045105, + "rewards/margins": 14.200462341308594, + "rewards/rejected": -14.44677448272705, + "step": 1830 + }, + { + "epoch": 0.46, + "grad_norm": 0.490234375, + "learning_rate": 3.2771020045640435e-06, + "logits/chosen": -2.314471960067749, + "logits/rejected": -2.533036708831787, + "logps/chosen": -49.747779846191406, + "logps/rejected": -1293.560302734375, + "loss": 0.0061, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.27524739503860474, + "rewards/margins": 12.140924453735352, + "rewards/rejected": -12.416172981262207, + "step": 1840 + }, + { + "epoch": 0.46, + "grad_norm": 0.19140625, + "learning_rate": 3.256354128240907e-06, + "logits/chosen": -2.101799488067627, + "logits/rejected": -2.320006847381592, + "logps/chosen": -58.1518669128418, + "logps/rejected": -1474.82666015625, + "loss": 0.005, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.361042320728302, + "rewards/margins": 13.842974662780762, + "rewards/rejected": -14.204015731811523, + "step": 1850 + }, + { + "epoch": 0.46, + "grad_norm": 2.551823854446411e-07, + "learning_rate": 3.235548748500914e-06, + "logits/chosen": -2.3442602157592773, + "logits/rejected": -2.5813608169555664, + "logps/chosen": -64.3367691040039, + "logps/rejected": -1516.5281982421875, + "loss": 0.0049, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.42836618423461914, + "rewards/margins": 14.244784355163574, + "rewards/rejected": -14.673149108886719, + "step": 1860 + }, + { + "epoch": 0.47, + "grad_norm": 0.130859375, + "learning_rate": 3.214687447116913e-06, + "logits/chosen": -2.129812717437744, + "logits/rejected": -2.35500168800354, + "logps/chosen": -60.48137664794922, + "logps/rejected": -1468.074462890625, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3904629647731781, + "rewards/margins": 13.763618469238281, + "rewards/rejected": -14.154080390930176, + "step": 1870 + }, + { + "epoch": 0.47, + "grad_norm": 0.005218505859375, + "learning_rate": 3.193771810113313e-06, + "logits/chosen": -2.1812546253204346, + "logits/rejected": -2.450334072113037, + "logps/chosen": -57.156097412109375, + "logps/rejected": -1621.7850341796875, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.34869837760925293, + "rewards/margins": 15.336013793945312, + "rewards/rejected": -15.684713363647461, + "step": 1880 + }, + { + "epoch": 0.47, + "grad_norm": 0.03271484375, + "learning_rate": 3.1728034276455032e-06, + "logits/chosen": -2.1772501468658447, + "logits/rejected": -2.4167187213897705, + "logps/chosen": -47.676063537597656, + "logps/rejected": -1501.980224609375, + "loss": 0.0038, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2580206096172333, + "rewards/margins": 14.23988151550293, + "rewards/rejected": -14.497901916503906, + "step": 1890 + }, + { + "epoch": 0.47, + "grad_norm": 0.00274658203125, + "learning_rate": 3.1517838938789597e-06, + "logits/chosen": -2.1416432857513428, + "logits/rejected": -2.3887360095977783, + "logps/chosen": -31.932031631469727, + "logps/rejected": -1682.0501708984375, + "loss": 0.0042, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10166473686695099, + "rewards/margins": 16.101634979248047, + "rewards/rejected": -16.203296661376953, + "step": 1900 + }, + { + "epoch": 0.48, + "grad_norm": 0.36328125, + "learning_rate": 3.130714806868041e-06, + "logits/chosen": -2.132199764251709, + "logits/rejected": -2.3675732612609863, + "logps/chosen": -38.96401596069336, + "logps/rejected": -1434.172607421875, + "loss": 0.0019, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1679878979921341, + "rewards/margins": 13.631708145141602, + "rewards/rejected": -13.799695014953613, + "step": 1910 + }, + { + "epoch": 0.48, + "grad_norm": 0.1484375, + "learning_rate": 3.1095977684344976e-06, + "logits/chosen": -2.221590042114258, + "logits/rejected": -2.477220296859741, + "logps/chosen": -42.42957305908203, + "logps/rejected": -1500.699462890625, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2044091671705246, + "rewards/margins": 14.278982162475586, + "rewards/rejected": -14.483392715454102, + "step": 1920 + }, + { + "epoch": 0.48, + "grad_norm": 0.0040283203125, + "learning_rate": 3.0884343840456874e-06, + "logits/chosen": -2.280695915222168, + "logits/rejected": -2.5356380939483643, + "logps/chosen": -51.98859405517578, + "logps/rejected": -1650.245361328125, + "loss": 0.002, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3021391034126282, + "rewards/margins": 15.651565551757812, + "rewards/rejected": -15.953704833984375, + "step": 1930 + }, + { + "epoch": 0.48, + "grad_norm": 0.0002536773681640625, + "learning_rate": 3.0672262626925174e-06, + "logits/chosen": -2.1820268630981445, + "logits/rejected": -2.439319133758545, + "logps/chosen": -47.39429473876953, + "logps/rejected": -1611.6102294921875, + "loss": 0.0032, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.24379031360149384, + "rewards/margins": 15.318346977233887, + "rewards/rejected": -15.56213665008545, + "step": 1940 + }, + { + "epoch": 0.49, + "grad_norm": 3.910064697265625e-05, + "learning_rate": 3.0459750167671147e-06, + "logits/chosen": -2.1863160133361816, + "logits/rejected": -2.450911283493042, + "logps/chosen": -57.97031784057617, + "logps/rejected": -1733.2484130859375, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.35936951637268066, + "rewards/margins": 16.4171199798584, + "rewards/rejected": -16.776485443115234, + "step": 1950 + }, + { + "epoch": 0.49, + "grad_norm": 0.375, + "learning_rate": 3.024682261940247e-06, + "logits/chosen": -2.1711161136627197, + "logits/rejected": -2.381054401397705, + "logps/chosen": -63.16656494140625, + "logps/rejected": -1473.282958984375, + "loss": 0.0036, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.40881744027137756, + "rewards/margins": 13.772600173950195, + "rewards/rejected": -14.181416511535645, + "step": 1960 + }, + { + "epoch": 0.49, + "grad_norm": 0.000568389892578125, + "learning_rate": 3.0033496170384803e-06, + "logits/chosen": -2.232100009918213, + "logits/rejected": -2.4612276554107666, + "logps/chosen": -56.055152893066406, + "logps/rejected": -1356.71484375, + "loss": 0.0077, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3471956253051758, + "rewards/margins": 12.729546546936035, + "rewards/rejected": -13.076742172241211, + "step": 1970 + }, + { + "epoch": 0.49, + "grad_norm": 0.10791015625, + "learning_rate": 2.9819787039211068e-06, + "logits/chosen": -2.1615240573883057, + "logits/rejected": -2.393810510635376, + "logps/chosen": -35.02969741821289, + "logps/rejected": -1524.7955322265625, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.13164165616035461, + "rewards/margins": 14.575594902038574, + "rewards/rejected": -14.707235336303711, + "step": 1980 + }, + { + "epoch": 0.5, + "grad_norm": 0.0054931640625, + "learning_rate": 2.960571147356845e-06, + "logits/chosen": -2.256544828414917, + "logits/rejected": -2.5309910774230957, + "logps/chosen": -49.80757522583008, + "logps/rejected": -1592.794677734375, + "loss": 0.0021, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2703229784965515, + "rewards/margins": 15.133091926574707, + "rewards/rejected": -15.403416633605957, + "step": 1990 + }, + { + "epoch": 0.5, + "grad_norm": 0.0023040771484375, + "learning_rate": 2.9391285749003046e-06, + "logits/chosen": -2.15415620803833, + "logits/rejected": -2.405571460723877, + "logps/chosen": -40.737998962402344, + "logps/rejected": -1701.052734375, + "loss": 0.0037, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.18067236244678497, + "rewards/margins": 16.24726104736328, + "rewards/rejected": -16.427934646606445, + "step": 2000 + }, + { + "epoch": 0.5, + "eval_logits/chosen": -2.6136603355407715, + "eval_logits/rejected": -2.7333316802978516, + "eval_logps/chosen": -48.08984375, + "eval_logps/rejected": -693.2846069335938, + "eval_loss": 0.0037064917851239443, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": -0.22176341712474823, + "eval_rewards/margins": 6.262362003326416, + "eval_rewards/rejected": -6.48412561416626, + "eval_runtime": 0.6544, + "eval_samples_per_second": 7.641, + "eval_steps_per_second": 4.585, + "step": 2000 + }, + { + "epoch": 0.5, + "grad_norm": 0.6328125, + "learning_rate": 2.9176526167682543e-06, + "logits/chosen": -2.1183362007141113, + "logits/rejected": -2.351123571395874, + "logps/chosen": -37.299964904785156, + "logps/rejected": -1437.7230224609375, + "loss": 0.0057, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.15952737629413605, + "rewards/margins": 13.696490287780762, + "rewards/rejected": -13.85601806640625, + "step": 2010 + }, + { + "epoch": 0.5, + "grad_norm": 0.03857421875, + "learning_rate": 2.8961449057156775e-06, + "logits/chosen": -2.200801372528076, + "logits/rejected": -2.4389915466308594, + "logps/chosen": -42.25465774536133, + "logps/rejected": -1569.040771484375, + "loss": 0.0018, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.20262226462364197, + "rewards/margins": 14.958514213562012, + "rewards/rejected": -15.16113567352295, + "step": 2020 + }, + { + "epoch": 0.51, + "grad_norm": 0.0037689208984375, + "learning_rate": 2.874607076911642e-06, + "logits/chosen": -2.212007999420166, + "logits/rejected": -2.4628169536590576, + "logps/chosen": -54.49187088012695, + "logps/rejected": -1452.176513671875, + "loss": 0.0043, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.31496500968933105, + "rewards/margins": 13.68268871307373, + "rewards/rejected": -13.997654914855957, + "step": 2030 + }, + { + "epoch": 0.51, + "grad_norm": 0.0026092529296875, + "learning_rate": 2.8530407678149806e-06, + "logits/chosen": -2.1855294704437256, + "logits/rejected": -2.428863525390625, + "logps/chosen": -61.762428283691406, + "logps/rejected": -1588.792236328125, + "loss": 0.0051, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4000323414802551, + "rewards/margins": 14.938389778137207, + "rewards/rejected": -15.338421821594238, + "step": 2040 + }, + { + "epoch": 0.51, + "grad_norm": 0.001739501953125, + "learning_rate": 2.8314476180498003e-06, + "logits/chosen": -2.0332534313201904, + "logits/rejected": -2.267488718032837, + "logps/chosen": -41.453369140625, + "logps/rejected": -1475.7647705078125, + "loss": 0.0048, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1973056197166443, + "rewards/margins": 14.028945922851562, + "rewards/rejected": -14.226251602172852, + "step": 2050 + }, + { + "epoch": 0.51, + "grad_norm": 0.13671875, + "learning_rate": 2.8098292692808253e-06, + "logits/chosen": -2.2281060218811035, + "logits/rejected": -2.422762632369995, + "logps/chosen": -41.2132453918457, + "logps/rejected": -1153.19775390625, + "loss": 0.0045, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1940850019454956, + "rewards/margins": 10.885441780090332, + "rewards/rejected": -11.079526901245117, + "step": 2060 + }, + { + "epoch": 0.52, + "grad_norm": 0.162109375, + "learning_rate": 2.7881873650885904e-06, + "logits/chosen": -2.227834463119507, + "logits/rejected": -2.4453253746032715, + "logps/chosen": -50.43096160888672, + "logps/rejected": -1375.1741943359375, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2844979166984558, + "rewards/margins": 12.969167709350586, + "rewards/rejected": -13.25366497039795, + "step": 2070 + }, + { + "epoch": 0.52, + "grad_norm": 0.1513671875, + "learning_rate": 2.7665235508444772e-06, + "logits/chosen": -2.1580593585968018, + "logits/rejected": -2.404978036880493, + "logps/chosen": -47.8787841796875, + "logps/rejected": -1663.496826171875, + "loss": 0.0047, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2667320966720581, + "rewards/margins": 15.83378791809082, + "rewards/rejected": -16.10051727294922, + "step": 2080 + }, + { + "epoch": 0.52, + "grad_norm": 0.000820159912109375, + "learning_rate": 2.7448394735856275e-06, + "logits/chosen": -2.1202292442321777, + "logits/rejected": -2.387399196624756, + "logps/chosen": -29.072830200195312, + "logps/rejected": -1652.7601318359375, + "loss": 0.0069, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07453112304210663, + "rewards/margins": 15.884991645812988, + "rewards/rejected": -15.959524154663086, + "step": 2090 + }, + { + "epoch": 0.52, + "grad_norm": 0.1259765625, + "learning_rate": 2.723136781889722e-06, + "logits/chosen": -2.248565912246704, + "logits/rejected": -2.483459949493408, + "logps/chosen": -49.5106201171875, + "logps/rejected": -1374.344482421875, + "loss": 0.008, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.26804444193840027, + "rewards/margins": 12.9815092086792, + "rewards/rejected": -13.249552726745605, + "step": 2100 + }, + { + "epoch": 0.53, + "grad_norm": 0.0208740234375, + "learning_rate": 2.7014171257496414e-06, + "logits/chosen": -2.2338385581970215, + "logits/rejected": -2.4451489448547363, + "logps/chosen": -47.859092712402344, + "logps/rejected": -1475.451416015625, + "loss": 0.0025, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.259570837020874, + "rewards/margins": 13.942883491516113, + "rewards/rejected": -14.20245361328125, + "step": 2110 + }, + { + "epoch": 0.53, + "grad_norm": 0.2216796875, + "learning_rate": 2.6796821564480237e-06, + "logits/chosen": -2.1667749881744385, + "logits/rejected": -2.3811049461364746, + "logps/chosen": -51.062232971191406, + "logps/rejected": -1320.582763671875, + "loss": 0.005, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2894384562969208, + "rewards/margins": 12.419047355651855, + "rewards/rejected": -12.708486557006836, + "step": 2120 + }, + { + "epoch": 0.53, + "grad_norm": 0.00604248046875, + "learning_rate": 2.6579335264317253e-06, + "logits/chosen": -2.3176040649414062, + "logits/rejected": -2.558061361312866, + "logps/chosen": -36.845001220703125, + "logps/rejected": -1507.6351318359375, + "loss": 0.0031, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.15489184856414795, + "rewards/margins": 14.385536193847656, + "rewards/rejected": -14.540430068969727, + "step": 2130 + }, + { + "epoch": 0.53, + "grad_norm": 0.00830078125, + "learning_rate": 2.6361728891861843e-06, + "logits/chosen": -2.067624568939209, + "logits/rejected": -2.2963385581970215, + "logps/chosen": -45.42739486694336, + "logps/rejected": -1546.36083984375, + "loss": 0.0049, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.23402588069438934, + "rewards/margins": 14.646380424499512, + "rewards/rejected": -14.880406379699707, + "step": 2140 + }, + { + "epoch": 0.54, + "grad_norm": 0.0032806396484375, + "learning_rate": 2.614401899109716e-06, + "logits/chosen": -2.247525930404663, + "logits/rejected": -2.4837863445281982, + "logps/chosen": -48.529815673828125, + "logps/rejected": -1460.3306884765625, + "loss": 0.0019, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2696106433868408, + "rewards/margins": 13.825261116027832, + "rewards/rejected": -14.094873428344727, + "step": 2150 + }, + { + "epoch": 0.54, + "grad_norm": 0.06201171875, + "learning_rate": 2.5926222113877282e-06, + "logits/chosen": -2.243438482284546, + "logits/rejected": -2.4923970699310303, + "logps/chosen": -43.63848114013672, + "logps/rejected": -1591.591064453125, + "loss": 0.0028, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2216021567583084, + "rewards/margins": 15.112271308898926, + "rewards/rejected": -15.333871841430664, + "step": 2160 + }, + { + "epoch": 0.54, + "grad_norm": 0.205078125, + "learning_rate": 2.570835481866889e-06, + "logits/chosen": -2.144465923309326, + "logits/rejected": -2.3723580837249756, + "logps/chosen": -45.58980178833008, + "logps/rejected": -1466.3011474609375, + "loss": 0.0027, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2433461844921112, + "rewards/margins": 13.90100383758545, + "rewards/rejected": -14.1443510055542, + "step": 2170 + }, + { + "epoch": 0.54, + "grad_norm": 0.140625, + "learning_rate": 2.5490433669292337e-06, + "logits/chosen": -2.0634944438934326, + "logits/rejected": -2.311782121658325, + "logps/chosen": -37.41926193237305, + "logps/rejected": -1625.005126953125, + "loss": 0.0029, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1547044962644577, + "rewards/margins": 15.56786060333252, + "rewards/rejected": -15.722564697265625, + "step": 2180 + }, + { + "epoch": 0.55, + "grad_norm": 0.00244140625, + "learning_rate": 2.527247523366232e-06, + "logits/chosen": -2.2304885387420654, + "logits/rejected": -2.4748549461364746, + "logps/chosen": -54.11591339111328, + "logps/rejected": -1548.42578125, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.31723862886428833, + "rewards/margins": 14.627525329589844, + "rewards/rejected": -14.9447660446167, + "step": 2190 + }, + { + "epoch": 0.55, + "grad_norm": 0.01214599609375, + "learning_rate": 2.5054496082528336e-06, + "logits/chosen": -2.2945401668548584, + "logits/rejected": -2.553946018218994, + "logps/chosen": -50.36088180541992, + "logps/rejected": -1503.6251220703125, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2821517586708069, + "rewards/margins": 14.262479782104492, + "rewards/rejected": -14.544631958007812, + "step": 2200 + }, + { + "epoch": 0.55, + "grad_norm": 0.07373046875, + "learning_rate": 2.483651278821481e-06, + "logits/chosen": -2.240737199783325, + "logits/rejected": -2.468348264694214, + "logps/chosen": -38.926151275634766, + "logps/rejected": -1415.637939453125, + "loss": 0.0029, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.17295430600643158, + "rewards/margins": 13.47205638885498, + "rewards/rejected": -13.645009994506836, + "step": 2210 + }, + { + "epoch": 0.55, + "grad_norm": 0.111328125, + "learning_rate": 2.4618541923361166e-06, + "logits/chosen": -2.4229185581207275, + "logits/rejected": -2.6283278465270996, + "logps/chosen": -44.134647369384766, + "logps/rejected": -1301.507080078125, + "loss": 0.0028, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2265511453151703, + "rewards/margins": 12.291707038879395, + "rewards/rejected": -12.518258094787598, + "step": 2220 + }, + { + "epoch": 0.56, + "grad_norm": 0.001983642578125, + "learning_rate": 2.4400600059661836e-06, + "logits/chosen": -2.0719246864318848, + "logits/rejected": -2.375192165374756, + "logps/chosen": -46.18827819824219, + "logps/rejected": -1760.173095703125, + "loss": 0.0025, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.24284347891807556, + "rewards/margins": 16.816072463989258, + "rewards/rejected": -17.058916091918945, + "step": 2230 + }, + { + "epoch": 0.56, + "grad_norm": 0.046630859375, + "learning_rate": 2.41827037666064e-06, + "logits/chosen": -2.2636351585388184, + "logits/rejected": -2.4840915203094482, + "logps/chosen": -47.29922103881836, + "logps/rejected": -1315.336181640625, + "loss": 0.0042, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.25474974513053894, + "rewards/margins": 12.407878875732422, + "rewards/rejected": -12.662630081176758, + "step": 2240 + }, + { + "epoch": 0.56, + "grad_norm": 0.01141357421875, + "learning_rate": 2.396486961021983e-06, + "logits/chosen": -2.1793510913848877, + "logits/rejected": -2.4308459758758545, + "logps/chosen": -41.266380310058594, + "logps/rejected": -1442.6708984375, + "loss": 0.0068, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.19039396941661835, + "rewards/margins": 13.722501754760742, + "rewards/rejected": -13.912895202636719, + "step": 2250 + }, + { + "epoch": 0.56, + "grad_norm": 0.010498046875, + "learning_rate": 2.3747114151802993e-06, + "logits/chosen": -2.3280482292175293, + "logits/rejected": -2.5701987743377686, + "logps/chosen": -47.68052673339844, + "logps/rejected": -1394.991455078125, + "loss": 0.0042, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2576830983161926, + "rewards/margins": 13.206995010375977, + "rewards/rejected": -13.464675903320312, + "step": 2260 + }, + { + "epoch": 0.57, + "grad_norm": 0.08349609375, + "learning_rate": 2.352945394667363e-06, + "logits/chosen": -2.0980782508850098, + "logits/rejected": -2.364197254180908, + "logps/chosen": -47.831058502197266, + "logps/rejected": -1665.154296875, + "loss": 0.0021, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.25840622186660767, + "rewards/margins": 15.833898544311523, + "rewards/rejected": -16.092304229736328, + "step": 2270 + }, + { + "epoch": 0.57, + "grad_norm": 0.44921875, + "learning_rate": 2.3311905542907627e-06, + "logits/chosen": -2.256291389465332, + "logits/rejected": -2.486441135406494, + "logps/chosen": -42.5937614440918, + "logps/rejected": -1361.2073974609375, + "loss": 0.0041, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2082025557756424, + "rewards/margins": 12.911099433898926, + "rewards/rejected": -13.119302749633789, + "step": 2280 + }, + { + "epoch": 0.57, + "grad_norm": 0.06591796875, + "learning_rate": 2.30944854800809e-06, + "logits/chosen": -2.2147023677825928, + "logits/rejected": -2.4364144802093506, + "logps/chosen": -40.498531341552734, + "logps/rejected": -1479.181396484375, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.18588443100452423, + "rewards/margins": 14.101339340209961, + "rewards/rejected": -14.287226676940918, + "step": 2290 + }, + { + "epoch": 0.57, + "grad_norm": 0.004364013671875, + "learning_rate": 2.287721028801204e-06, + "logits/chosen": -2.175849676132202, + "logits/rejected": -2.4008584022521973, + "logps/chosen": -43.332298278808594, + "logps/rejected": -1385.960205078125, + "loss": 0.0032, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.21884088218212128, + "rewards/margins": 13.133634567260742, + "rewards/rejected": -13.352476119995117, + "step": 2300 + }, + { + "epoch": 0.58, + "grad_norm": 0.057861328125, + "learning_rate": 2.26600964855055e-06, + "logits/chosen": -2.2437031269073486, + "logits/rejected": -2.4617691040039062, + "logps/chosen": -43.779388427734375, + "logps/rejected": -1358.084716796875, + "loss": 0.002, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.22216272354125977, + "rewards/margins": 12.88685417175293, + "rewards/rejected": -13.109016418457031, + "step": 2310 + }, + { + "epoch": 0.58, + "grad_norm": 0.0230712890625, + "learning_rate": 2.244316057909573e-06, + "logits/chosen": -2.205610752105713, + "logits/rejected": -2.4241251945495605, + "logps/chosen": -37.175682067871094, + "logps/rejected": -1405.6822509765625, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.15614905953407288, + "rewards/margins": 13.411378860473633, + "rewards/rejected": -13.567527770996094, + "step": 2320 + }, + { + "epoch": 0.58, + "grad_norm": 0.0208740234375, + "learning_rate": 2.2226419061792282e-06, + "logits/chosen": -2.284442901611328, + "logits/rejected": -2.527775526046753, + "logps/chosen": -48.442630767822266, + "logps/rejected": -1552.012939453125, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2705609202384949, + "rewards/margins": 14.728759765625, + "rewards/rejected": -14.999322891235352, + "step": 2330 + }, + { + "epoch": 0.58, + "grad_norm": 0.0081787109375, + "learning_rate": 2.200988841182589e-06, + "logits/chosen": -2.219576358795166, + "logits/rejected": -2.4669265747070312, + "logps/chosen": -44.78432083129883, + "logps/rejected": -1667.3538818359375, + "loss": 0.0046, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2340162992477417, + "rewards/margins": 15.922113418579102, + "rewards/rejected": -16.156129837036133, + "step": 2340 + }, + { + "epoch": 0.59, + "grad_norm": 0.01287841796875, + "learning_rate": 2.179358509139559e-06, + "logits/chosen": -2.171391010284424, + "logits/rejected": -2.4009640216827393, + "logps/chosen": -61.962059020996094, + "logps/rejected": -1350.2457275390625, + "loss": 0.0053, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3955707848072052, + "rewards/margins": 12.617085456848145, + "rewards/rejected": -13.012655258178711, + "step": 2350 + }, + { + "epoch": 0.59, + "grad_norm": 0.0517578125, + "learning_rate": 2.1577525545417254e-06, + "logits/chosen": -2.1860475540161133, + "logits/rejected": -2.418872594833374, + "logps/chosen": -58.040443420410156, + "logps/rejected": -1468.6392822265625, + "loss": 0.0035, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3626258969306946, + "rewards/margins": 13.818751335144043, + "rewards/rejected": -14.181378364562988, + "step": 2360 + }, + { + "epoch": 0.59, + "grad_norm": 0.0186767578125, + "learning_rate": 2.1361726200273293e-06, + "logits/chosen": -2.2700607776641846, + "logits/rejected": -2.521707057952881, + "logps/chosen": -48.01632308959961, + "logps/rejected": -1544.2449951171875, + "loss": 0.0019, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.25974512100219727, + "rewards/margins": 14.68183708190918, + "rewards/rejected": -14.941583633422852, + "step": 2370 + }, + { + "epoch": 0.59, + "grad_norm": 0.0859375, + "learning_rate": 2.1146203462563773e-06, + "logits/chosen": -2.335644483566284, + "logits/rejected": -2.5736241340637207, + "logps/chosen": -39.67052459716797, + "logps/rejected": -1430.16796875, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.18928556144237518, + "rewards/margins": 13.622029304504395, + "rewards/rejected": -13.811314582824707, + "step": 2380 + }, + { + "epoch": 0.6, + "grad_norm": 0.08056640625, + "learning_rate": 2.0930973717859117e-06, + "logits/chosen": -2.352358341217041, + "logits/rejected": -2.598140239715576, + "logps/chosen": -44.863258361816406, + "logps/rejected": -1464.2244873046875, + "loss": 0.0028, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.23050542175769806, + "rewards/margins": 13.890615463256836, + "rewards/rejected": -14.121121406555176, + "step": 2390 + }, + { + "epoch": 0.6, + "grad_norm": 0.00011587142944335938, + "learning_rate": 2.0716053329454337e-06, + "logits/chosen": -2.07816481590271, + "logits/rejected": -2.320413112640381, + "logps/chosen": -42.12782287597656, + "logps/rejected": -1603.09716796875, + "loss": 0.004, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.20372910797595978, + "rewards/margins": 15.295297622680664, + "rewards/rejected": -15.49902629852295, + "step": 2400 + }, + { + "epoch": 0.6, + "grad_norm": 0.0255126953125, + "learning_rate": 2.0501458637124963e-06, + "logits/chosen": -2.2174525260925293, + "logits/rejected": -2.5070488452911377, + "logps/chosen": -49.52367401123047, + "logps/rejected": -1763.182861328125, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2707751393318176, + "rewards/margins": 16.840404510498047, + "rewards/rejected": -17.11117935180664, + "step": 2410 + }, + { + "epoch": 0.6, + "grad_norm": 0.0206298828125, + "learning_rate": 2.0287205955884812e-06, + "logits/chosen": -2.2282018661499023, + "logits/rejected": -2.47560453414917, + "logps/chosen": -39.33561706542969, + "logps/rejected": -1586.3204345703125, + "loss": 0.0043, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.17531004548072815, + "rewards/margins": 15.11853313446045, + "rewards/rejected": -15.293844223022461, + "step": 2420 + }, + { + "epoch": 0.61, + "grad_norm": 0.0439453125, + "learning_rate": 2.0073311574745583e-06, + "logits/chosen": -2.1908931732177734, + "logits/rejected": -2.4531962871551514, + "logps/chosen": -46.56280517578125, + "logps/rejected": -1645.0823974609375, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.24412044882774353, + "rewards/margins": 15.665544509887695, + "rewards/rejected": -15.909663200378418, + "step": 2430 + }, + { + "epoch": 0.61, + "grad_norm": 0.01141357421875, + "learning_rate": 1.9859791755478453e-06, + "logits/chosen": -2.2081665992736816, + "logits/rejected": -2.4285478591918945, + "logps/chosen": -36.21527099609375, + "logps/rejected": -1296.198486328125, + "loss": 0.0019, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.14344918727874756, + "rewards/margins": 12.341351509094238, + "rewards/rejected": -12.484800338745117, + "step": 2440 + }, + { + "epoch": 0.61, + "grad_norm": 0.2353515625, + "learning_rate": 1.9646662731377737e-06, + "logits/chosen": -2.157654285430908, + "logits/rejected": -2.3908090591430664, + "logps/chosen": -45.09668731689453, + "logps/rejected": -1404.2435302734375, + "loss": 0.0089, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.23492522537708282, + "rewards/margins": 13.305872917175293, + "rewards/rejected": -13.540797233581543, + "step": 2450 + }, + { + "epoch": 0.61, + "grad_norm": 0.01287841796875, + "learning_rate": 1.9433940706026743e-06, + "logits/chosen": -2.1844208240509033, + "logits/rejected": -2.438828468322754, + "logps/chosen": -47.74811935424805, + "logps/rejected": -1656.3568115234375, + "loss": 0.0045, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2575169503688812, + "rewards/margins": 15.76098346710205, + "rewards/rejected": -16.01849937438965, + "step": 2460 + }, + { + "epoch": 0.62, + "grad_norm": 0.000972747802734375, + "learning_rate": 1.9221641852065807e-06, + "logits/chosen": -2.18261456489563, + "logits/rejected": -2.4000496864318848, + "logps/chosen": -44.85232925415039, + "logps/rejected": -1401.519775390625, + "loss": 0.0039, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.22903266549110413, + "rewards/margins": 13.285311698913574, + "rewards/rejected": -13.51434326171875, + "step": 2470 + }, + { + "epoch": 0.62, + "grad_norm": 0.06201171875, + "learning_rate": 1.9009782309962805e-06, + "logits/chosen": -2.281862497329712, + "logits/rejected": -2.5180306434631348, + "logps/chosen": -35.415897369384766, + "logps/rejected": -1375.5728759765625, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.13269564509391785, + "rewards/margins": 13.11363697052002, + "rewards/rejected": -13.246332168579102, + "step": 2480 + }, + { + "epoch": 0.62, + "grad_norm": 0.09423828125, + "learning_rate": 1.8798378186785979e-06, + "logits/chosen": -2.2361299991607666, + "logits/rejected": -2.4721415042877197, + "logps/chosen": -30.1846923828125, + "logps/rejected": -1444.7518310546875, + "loss": 0.003, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0831189677119255, + "rewards/margins": 13.8583402633667, + "rewards/rejected": -13.941459655761719, + "step": 2490 + }, + { + "epoch": 0.62, + "grad_norm": 0.035400390625, + "learning_rate": 1.8587445554979404e-06, + "logits/chosen": -2.073253870010376, + "logits/rejected": -2.3244481086730957, + "logps/chosen": -36.15102005004883, + "logps/rejected": -1567.7164306640625, + "loss": 0.0049, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1387818604707718, + "rewards/margins": 14.997014045715332, + "rewards/rejected": -15.135795593261719, + "step": 2500 + }, + { + "epoch": 0.63, + "grad_norm": 0.0006103515625, + "learning_rate": 1.8377000451141013e-06, + "logits/chosen": -2.120227336883545, + "logits/rejected": -2.379242420196533, + "logps/chosen": -42.131507873535156, + "logps/rejected": -1565.419189453125, + "loss": 0.0073, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.207178995013237, + "rewards/margins": 14.909128189086914, + "rewards/rejected": -15.116305351257324, + "step": 2510 + }, + { + "epoch": 0.63, + "grad_norm": 0.0927734375, + "learning_rate": 1.8167058874803405e-06, + "logits/chosen": -2.234502077102661, + "logits/rejected": -2.4847466945648193, + "logps/chosen": -42.673118591308594, + "logps/rejected": -1594.222900390625, + "loss": 0.003, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2050865888595581, + "rewards/margins": 15.173876762390137, + "rewards/rejected": -15.3789644241333, + "step": 2520 + }, + { + "epoch": 0.63, + "grad_norm": 0.138671875, + "learning_rate": 1.7957636787217451e-06, + "logits/chosen": -2.1729538440704346, + "logits/rejected": -2.4276270866394043, + "logps/chosen": -26.112987518310547, + "logps/rejected": -1523.0291748046875, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.044597070664167404, + "rewards/margins": 14.675150871276855, + "rewards/rejected": -14.719749450683594, + "step": 2530 + }, + { + "epoch": 0.63, + "grad_norm": 0.0322265625, + "learning_rate": 1.7748750110138768e-06, + "logits/chosen": -2.106745481491089, + "logits/rejected": -2.3529787063598633, + "logps/chosen": -38.04988098144531, + "logps/rejected": -1700.769287109375, + "loss": 0.0026, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.15463842451572418, + "rewards/margins": 16.276233673095703, + "rewards/rejected": -16.430871963500977, + "step": 2540 + }, + { + "epoch": 0.64, + "grad_norm": 0.0849609375, + "learning_rate": 1.7540414724617282e-06, + "logits/chosen": -2.070836067199707, + "logits/rejected": -2.3102221488952637, + "logps/chosen": -41.751487731933594, + "logps/rejected": -1488.04931640625, + "loss": 0.0045, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.19269555807113647, + "rewards/margins": 14.147076606750488, + "rewards/rejected": -14.33977222442627, + "step": 2550 + }, + { + "epoch": 0.64, + "grad_norm": 0.00946044921875, + "learning_rate": 1.7332646469789827e-06, + "logits/chosen": -2.2572789192199707, + "logits/rejected": -2.481287956237793, + "logps/chosen": -29.41888427734375, + "logps/rejected": -1229.242431640625, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07652698457241058, + "rewards/margins": 11.74826717376709, + "rewards/rejected": -11.824793815612793, + "step": 2560 + }, + { + "epoch": 0.64, + "grad_norm": 0.0830078125, + "learning_rate": 1.7125461141675881e-06, + "logits/chosen": -2.1423022747039795, + "logits/rejected": -2.3926641941070557, + "logps/chosen": -30.7061710357666, + "logps/rejected": -1465.9014892578125, + "loss": 0.006, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09345726668834686, + "rewards/margins": 14.04102897644043, + "rewards/rejected": -14.134485244750977, + "step": 2570 + }, + { + "epoch": 0.64, + "grad_norm": 0.00014209747314453125, + "learning_rate": 1.6918874491976744e-06, + "logits/chosen": -2.290851354598999, + "logits/rejected": -2.5240445137023926, + "logps/chosen": -36.6445198059082, + "logps/rejected": -1480.7818603515625, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.149479478597641, + "rewards/margins": 14.131547927856445, + "rewards/rejected": -14.281025886535645, + "step": 2580 + }, + { + "epoch": 0.65, + "grad_norm": 0.002655029296875, + "learning_rate": 1.6712902226877917e-06, + "logits/chosen": -2.1575067043304443, + "logits/rejected": -2.402039051055908, + "logps/chosen": -45.549842834472656, + "logps/rejected": -1545.114990234375, + "loss": 0.0043, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.23408398032188416, + "rewards/margins": 14.681310653686523, + "rewards/rejected": -14.91539478302002, + "step": 2590 + }, + { + "epoch": 0.65, + "grad_norm": 0.0028839111328125, + "learning_rate": 1.6507560005854977e-06, + "logits/chosen": -2.066991090774536, + "logits/rejected": -2.3206119537353516, + "logps/chosen": -47.11815643310547, + "logps/rejected": -1413.30126953125, + "loss": 0.0025, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.24164950847625732, + "rewards/margins": 13.29053020477295, + "rewards/rejected": -13.532180786132812, + "step": 2600 + }, + { + "epoch": 0.65, + "grad_norm": 0.03759765625, + "learning_rate": 1.6302863440483121e-06, + "logits/chosen": -2.1091551780700684, + "logits/rejected": -2.394484043121338, + "logps/chosen": -54.07494354248047, + "logps/rejected": -1674.350341796875, + "loss": 0.0029, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3039209842681885, + "rewards/margins": 15.897099494934082, + "rewards/rejected": -16.201021194458008, + "step": 2610 + }, + { + "epoch": 0.65, + "grad_norm": 0.035400390625, + "learning_rate": 1.6098828093250203e-06, + "logits/chosen": -2.0393662452697754, + "logits/rejected": -2.2912774085998535, + "logps/chosen": -43.22583770751953, + "logps/rejected": -1745.9964599609375, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.20912513136863708, + "rewards/margins": 16.64576530456543, + "rewards/rejected": -16.85489273071289, + "step": 2620 + }, + { + "epoch": 0.66, + "grad_norm": 0.038330078125, + "learning_rate": 1.5895469476373545e-06, + "logits/chosen": -2.12833833694458, + "logits/rejected": -2.353044033050537, + "logps/chosen": -51.28118133544922, + "logps/rejected": -1477.751708984375, + "loss": 0.0025, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2927466034889221, + "rewards/margins": 13.93517017364502, + "rewards/rejected": -14.227917671203613, + "step": 2630 + }, + { + "epoch": 0.66, + "grad_norm": 0.002105712890625, + "learning_rate": 1.5692803050620642e-06, + "logits/chosen": -2.146883726119995, + "logits/rejected": -2.3877830505371094, + "logps/chosen": -42.891048431396484, + "logps/rejected": -1572.1334228515625, + "loss": 0.0018, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.20707789063453674, + "rewards/margins": 14.972661018371582, + "rewards/rejected": -15.17973804473877, + "step": 2640 + }, + { + "epoch": 0.66, + "grad_norm": 0.05029296875, + "learning_rate": 1.5490844224133717e-06, + "logits/chosen": -2.2065834999084473, + "logits/rejected": -2.4583041667938232, + "logps/chosen": -58.87604522705078, + "logps/rejected": -1606.8402099609375, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3703632652759552, + "rewards/margins": 15.166918754577637, + "rewards/rejected": -15.53728199005127, + "step": 2650 + }, + { + "epoch": 0.66, + "grad_norm": 0.06298828125, + "learning_rate": 1.528960835125822e-06, + "logits/chosen": -2.3619742393493652, + "logits/rejected": -2.5886929035186768, + "logps/chosen": -47.88628005981445, + "logps/rejected": -1394.3492431640625, + "loss": 0.0019, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2605898976325989, + "rewards/margins": 13.201173782348633, + "rewards/rejected": -13.461764335632324, + "step": 2660 + }, + { + "epoch": 0.67, + "grad_norm": 0.373046875, + "learning_rate": 1.5089110731375568e-06, + "logits/chosen": -2.1769912242889404, + "logits/rejected": -2.4125704765319824, + "logps/chosen": -54.75007247924805, + "logps/rejected": -1521.18310546875, + "loss": 0.0043, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.32575544714927673, + "rewards/margins": 14.370445251464844, + "rewards/rejected": -14.696202278137207, + "step": 2670 + }, + { + "epoch": 0.67, + "grad_norm": 0.080078125, + "learning_rate": 1.4889366607739925e-06, + "logits/chosen": -2.322796583175659, + "logits/rejected": -2.5181009769439697, + "logps/chosen": -45.69524383544922, + "logps/rejected": -1201.08251953125, + "loss": 0.0025, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.24491195380687714, + "rewards/margins": 11.296446800231934, + "rewards/rejected": -11.541359901428223, + "step": 2680 + }, + { + "epoch": 0.67, + "grad_norm": 0.03076171875, + "learning_rate": 1.4690391166319307e-06, + "logits/chosen": -2.1181106567382812, + "logits/rejected": -2.3545029163360596, + "logps/chosen": -43.6742057800293, + "logps/rejected": -1542.8385009765625, + "loss": 0.0021, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.21549615263938904, + "rewards/margins": 14.664543151855469, + "rewards/rejected": -14.880040168762207, + "step": 2690 + }, + { + "epoch": 0.67, + "grad_norm": 0.5078125, + "learning_rate": 1.4492199534641055e-06, + "logits/chosen": -2.21667218208313, + "logits/rejected": -2.4625155925750732, + "logps/chosen": -47.34065628051758, + "logps/rejected": -1472.347900390625, + "loss": 0.0059, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.25726571679115295, + "rewards/margins": 13.980672836303711, + "rewards/rejected": -14.237937927246094, + "step": 2700 + }, + { + "epoch": 0.68, + "grad_norm": 0.0111083984375, + "learning_rate": 1.429480678064174e-06, + "logits/chosen": -2.2022199630737305, + "logits/rejected": -2.4795124530792236, + "logps/chosen": -51.5767822265625, + "logps/rejected": -1817.763671875, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2908291518688202, + "rewards/margins": 17.308137893676758, + "rewards/rejected": -17.598966598510742, + "step": 2710 + }, + { + "epoch": 0.68, + "grad_norm": 0.76171875, + "learning_rate": 1.4098227911521523e-06, + "logits/chosen": -2.219804286956787, + "logits/rejected": -2.462226390838623, + "logps/chosen": -46.08092498779297, + "logps/rejected": -1526.5423583984375, + "loss": 0.0037, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.23279635608196259, + "rewards/margins": 14.499191284179688, + "rewards/rejected": -14.731986999511719, + "step": 2720 + }, + { + "epoch": 0.68, + "grad_norm": 0.058349609375, + "learning_rate": 1.3902477872603295e-06, + "logits/chosen": -2.319612503051758, + "logits/rejected": -2.517526149749756, + "logps/chosen": -40.09135055541992, + "logps/rejected": -1286.315185546875, + "loss": 0.005, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.18802298605442047, + "rewards/margins": 12.167932510375977, + "rewards/rejected": -12.355955123901367, + "step": 2730 + }, + { + "epoch": 0.68, + "grad_norm": 0.0859375, + "learning_rate": 1.370757154619638e-06, + "logits/chosen": -2.2395832538604736, + "logits/rejected": -2.470933198928833, + "logps/chosen": -56.124351501464844, + "logps/rejected": -1607.262939453125, + "loss": 0.0026, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.33287256956100464, + "rewards/margins": 15.186471939086914, + "rewards/rejected": -15.519342422485352, + "step": 2740 + }, + { + "epoch": 0.69, + "grad_norm": 8.761882781982422e-06, + "learning_rate": 1.3513523750465049e-06, + "logits/chosen": -2.2328319549560547, + "logits/rejected": -2.4625821113586426, + "logps/chosen": -39.597564697265625, + "logps/rejected": -1417.1002197265625, + "loss": 0.0024, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.17806950211524963, + "rewards/margins": 13.474327087402344, + "rewards/rejected": -13.652397155761719, + "step": 2750 + }, + { + "epoch": 0.69, + "grad_norm": 0.0030517578125, + "learning_rate": 1.332034923830199e-06, + "logits/chosen": -2.136444568634033, + "logits/rejected": -2.3981611728668213, + "logps/chosen": -44.04825210571289, + "logps/rejected": -1527.0830078125, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.22466015815734863, + "rewards/margins": 14.532748222351074, + "rewards/rejected": -14.757411003112793, + "step": 2760 + }, + { + "epoch": 0.69, + "grad_norm": 0.130859375, + "learning_rate": 1.31280626962067e-06, + "logits/chosen": -2.2737619876861572, + "logits/rejected": -2.488204002380371, + "logps/chosen": -49.427528381347656, + "logps/rejected": -1363.083740234375, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.27385297417640686, + "rewards/margins": 12.863523483276367, + "rewards/rejected": -13.137374877929688, + "step": 2770 + }, + { + "epoch": 0.69, + "grad_norm": 0.0162353515625, + "learning_rate": 1.2936678743168813e-06, + "logits/chosen": -2.2063140869140625, + "logits/rejected": -2.450613498687744, + "logps/chosen": -47.45142364501953, + "logps/rejected": -1494.2115478515625, + "loss": 0.0048, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2554013133049011, + "rewards/margins": 14.182083129882812, + "rewards/rejected": -14.437482833862305, + "step": 2780 + }, + { + "epoch": 0.7, + "grad_norm": 0.06298828125, + "learning_rate": 1.2746211929556777e-06, + "logits/chosen": -2.171708583831787, + "logits/rejected": -2.4892578125, + "logps/chosen": -47.59801483154297, + "logps/rejected": -1927.90234375, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.25736138224601746, + "rewards/margins": 18.46487808227539, + "rewards/rejected": -18.722238540649414, + "step": 2790 + }, + { + "epoch": 0.7, + "grad_norm": 5.900859832763672e-06, + "learning_rate": 1.2556676736011558e-06, + "logits/chosen": -2.200247287750244, + "logits/rejected": -2.433065891265869, + "logps/chosen": -47.600555419921875, + "logps/rejected": -1647.335693359375, + "loss": 0.0029, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.25239020586013794, + "rewards/margins": 15.647438049316406, + "rewards/rejected": -15.89982795715332, + "step": 2800 + }, + { + "epoch": 0.7, + "grad_norm": 0.0279541015625, + "learning_rate": 1.2368087572345772e-06, + "logits/chosen": -2.235849380493164, + "logits/rejected": -2.4413654804229736, + "logps/chosen": -48.26280975341797, + "logps/rejected": -1258.86572265625, + "loss": 0.005, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.26613086462020874, + "rewards/margins": 11.844823837280273, + "rewards/rejected": -12.110954284667969, + "step": 2810 + }, + { + "epoch": 0.7, + "grad_norm": 1.2734375, + "learning_rate": 1.2180458776448067e-06, + "logits/chosen": -2.187344551086426, + "logits/rejected": -2.4354748725891113, + "logps/chosen": -40.0956916809082, + "logps/rejected": -1655.0335693359375, + "loss": 0.0064, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.18102389574050903, + "rewards/margins": 15.813700675964355, + "rewards/rejected": -15.994723320007324, + "step": 2820 + }, + { + "epoch": 0.71, + "grad_norm": 0.0234375, + "learning_rate": 1.1993804613193158e-06, + "logits/chosen": -2.18884539604187, + "logits/rejected": -2.4379589557647705, + "logps/chosen": -64.91096496582031, + "logps/rejected": -1493.458740234375, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4194706082344055, + "rewards/margins": 13.966886520385742, + "rewards/rejected": -14.386357307434082, + "step": 2830 + }, + { + "epoch": 0.71, + "grad_norm": 1.3649463653564453e-05, + "learning_rate": 1.1808139273357232e-06, + "logits/chosen": -2.1439809799194336, + "logits/rejected": -2.3814640045166016, + "logps/chosen": -47.53407669067383, + "logps/rejected": -1624.342529296875, + "loss": 0.0067, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.25394895672798157, + "rewards/margins": 15.44206714630127, + "rewards/rejected": -15.696017265319824, + "step": 2840 + }, + { + "epoch": 0.71, + "grad_norm": 0.0003833770751953125, + "learning_rate": 1.1623476872539108e-06, + "logits/chosen": -2.1601688861846924, + "logits/rejected": -2.4301838874816895, + "logps/chosen": -46.782203674316406, + "logps/rejected": -1727.318115234375, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.24715037643909454, + "rewards/margins": 16.47604751586914, + "rewards/rejected": -16.72319984436035, + "step": 2850 + }, + { + "epoch": 0.71, + "grad_norm": 0.0908203125, + "learning_rate": 1.1439831450087032e-06, + "logits/chosen": -2.204617738723755, + "logits/rejected": -2.4746241569519043, + "logps/chosen": -70.88643646240234, + "logps/rejected": -1754.577392578125, + "loss": 0.0066, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.48623570799827576, + "rewards/margins": 16.506263732910156, + "rewards/rejected": -16.992502212524414, + "step": 2860 + }, + { + "epoch": 0.72, + "grad_norm": 0.68359375, + "learning_rate": 1.1257216968031357e-06, + "logits/chosen": -2.172727108001709, + "logits/rejected": -2.4189679622650146, + "logps/chosen": -53.47440719604492, + "logps/rejected": -1510.17333984375, + "loss": 0.0042, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3195067048072815, + "rewards/margins": 14.29955768585205, + "rewards/rejected": -14.619064331054688, + "step": 2870 + }, + { + "epoch": 0.72, + "grad_norm": 0.0003604888916015625, + "learning_rate": 1.1075647310022974e-06, + "logits/chosen": -2.324207305908203, + "logits/rejected": -2.5505588054656982, + "logps/chosen": -48.77964401245117, + "logps/rejected": -1276.3153076171875, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2692165970802307, + "rewards/margins": 12.017995834350586, + "rewards/rejected": -12.287213325500488, + "step": 2880 + }, + { + "epoch": 0.72, + "grad_norm": 0.00640869140625, + "learning_rate": 1.0895136280277863e-06, + "logits/chosen": -2.1405930519104004, + "logits/rejected": -2.389354705810547, + "logps/chosen": -52.074989318847656, + "logps/rejected": -1792.354248046875, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2989426851272583, + "rewards/margins": 17.034626007080078, + "rewards/rejected": -17.33357048034668, + "step": 2890 + }, + { + "epoch": 0.72, + "grad_norm": 0.0634765625, + "learning_rate": 1.0715697602527542e-06, + "logits/chosen": -2.0093884468078613, + "logits/rejected": -2.2803351879119873, + "logps/chosen": -60.875953674316406, + "logps/rejected": -1733.0355224609375, + "loss": 0.0024, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3902908265590668, + "rewards/margins": 16.359115600585938, + "rewards/rejected": -16.749406814575195, + "step": 2900 + }, + { + "epoch": 0.73, + "grad_norm": 0.10546875, + "learning_rate": 1.0537344918975708e-06, + "logits/chosen": -2.2281734943389893, + "logits/rejected": -2.414677381515503, + "logps/chosen": -56.65943145751953, + "logps/rejected": -1399.209228515625, + "loss": 0.0016, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3390200734138489, + "rewards/margins": 13.123127937316895, + "rewards/rejected": -13.46214771270752, + "step": 2910 + }, + { + "epoch": 0.73, + "grad_norm": 3.910064697265625e-05, + "learning_rate": 1.036009178926107e-06, + "logits/chosen": -2.1897904872894287, + "logits/rejected": -2.426058530807495, + "logps/chosen": -48.45244216918945, + "logps/rejected": -1510.665771484375, + "loss": 0.0036, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.26533347368240356, + "rewards/margins": 14.326631546020508, + "rewards/rejected": -14.59196662902832, + "step": 2920 + }, + { + "epoch": 0.73, + "grad_norm": 0.02099609375, + "learning_rate": 1.0183951689426438e-06, + "logits/chosen": -2.1068902015686035, + "logits/rejected": -2.3621068000793457, + "logps/chosen": -49.524208068847656, + "logps/rejected": -1805.392333984375, + "loss": 0.0032, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2760918438434601, + "rewards/margins": 17.226123809814453, + "rewards/rejected": -17.502214431762695, + "step": 2930 + }, + { + "epoch": 0.73, + "grad_norm": 0.109375, + "learning_rate": 1.0008938010894156e-06, + "logits/chosen": -2.0732312202453613, + "logits/rejected": -2.359827756881714, + "logps/chosen": -49.63148880004883, + "logps/rejected": -1718.0699462890625, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2772377133369446, + "rewards/margins": 16.361858367919922, + "rewards/rejected": -16.639095306396484, + "step": 2940 + }, + { + "epoch": 0.74, + "grad_norm": 0.169921875, + "learning_rate": 9.83506405944804e-07, + "logits/chosen": -2.0447497367858887, + "logits/rejected": -2.2771499156951904, + "logps/chosen": -39.81549072265625, + "logps/rejected": -1609.721923828125, + "loss": 0.0019, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.17990897595882416, + "rewards/margins": 15.340120315551758, + "rewards/rejected": -15.520029067993164, + "step": 2950 + }, + { + "epoch": 0.74, + "grad_norm": 9.894371032714844e-06, + "learning_rate": 9.662343054221743e-07, + "logits/chosen": -2.053480625152588, + "logits/rejected": -2.3034961223602295, + "logps/chosen": -49.807559967041016, + "logps/rejected": -1805.352294921875, + "loss": 0.0021, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2724885642528534, + "rewards/margins": 17.18954849243164, + "rewards/rejected": -17.462038040161133, + "step": 2960 + }, + { + "epoch": 0.74, + "grad_norm": 0.08349609375, + "learning_rate": 9.490788126693754e-07, + "logits/chosen": -2.081925868988037, + "logits/rejected": -2.337897777557373, + "logps/chosen": -39.89293670654297, + "logps/rejected": -1609.5567626953125, + "loss": 0.0021, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.18545950949192047, + "rewards/margins": 15.35168170928955, + "rewards/rejected": -15.537139892578125, + "step": 2970 + }, + { + "epoch": 0.74, + "grad_norm": 0.007781982421875, + "learning_rate": 9.32041231968904e-07, + "logits/chosen": -2.1510047912597656, + "logits/rejected": -2.4004898071289062, + "logps/chosen": -42.21772766113281, + "logps/rejected": -1647.2308349609375, + "loss": 0.0021, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.20343096554279327, + "rewards/margins": 15.7313232421875, + "rewards/rejected": -15.93475341796875, + "step": 2980 + }, + { + "epoch": 0.75, + "grad_norm": 0.030029296875, + "learning_rate": 9.151228586387464e-07, + "logits/chosen": -2.2137999534606934, + "logits/rejected": -2.4432384967803955, + "logps/chosen": -46.389976501464844, + "logps/rejected": -1448.36328125, + "loss": 0.0055, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.24593684077262878, + "rewards/margins": 13.737565994262695, + "rewards/rejected": -13.983503341674805, + "step": 2990 + }, + { + "epoch": 0.75, + "grad_norm": 0.19921875, + "learning_rate": 8.983249789338941e-07, + "logits/chosen": -2.1793341636657715, + "logits/rejected": -2.4067111015319824, + "logps/chosen": -53.83050537109375, + "logps/rejected": -1414.4827880859375, + "loss": 0.0039, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3225783705711365, + "rewards/margins": 13.334991455078125, + "rewards/rejected": -13.657569885253906, + "step": 3000 + }, + { + "epoch": 0.75, + "eval_logits/chosen": -2.61519455909729, + "eval_logits/rejected": -2.734154462814331, + "eval_logps/chosen": -62.61328125, + "eval_logps/rejected": -755.4296264648438, + "eval_loss": 0.002784780925139785, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": -0.36699774861335754, + "eval_rewards/margins": 6.738577365875244, + "eval_rewards/rejected": -7.1055755615234375, + "eval_runtime": 0.6552, + "eval_samples_per_second": 7.632, + "eval_steps_per_second": 4.579, + "step": 3000 + }, + { + "epoch": 0.75, + "grad_norm": 0.0478515625, + "learning_rate": 8.816488699485593e-07, + "logits/chosen": -2.2049620151519775, + "logits/rejected": -2.431889057159424, + "logps/chosen": -43.32733917236328, + "logps/rejected": -1466.1322021484375, + "loss": 0.0016, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.21807484328746796, + "rewards/margins": 13.928044319152832, + "rewards/rejected": -14.146120071411133, + "step": 3010 + }, + { + "epoch": 0.75, + "grad_norm": 0.00012302398681640625, + "learning_rate": 8.650957995190784e-07, + "logits/chosen": -2.168497085571289, + "logits/rejected": -2.439462661743164, + "logps/chosen": -43.25156784057617, + "logps/rejected": -1766.5902099609375, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.20799896121025085, + "rewards/margins": 16.896240234375, + "rewards/rejected": -17.104238510131836, + "step": 3020 + }, + { + "epoch": 0.76, + "grad_norm": 0.0830078125, + "learning_rate": 8.486670261275193e-07, + "logits/chosen": -2.28559947013855, + "logits/rejected": -2.535123348236084, + "logps/chosen": -46.22868347167969, + "logps/rejected": -1491.29638671875, + "loss": 0.0018, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.24624836444854736, + "rewards/margins": 14.186907768249512, + "rewards/rejected": -14.43315601348877, + "step": 3030 + }, + { + "epoch": 0.76, + "grad_norm": 0.01068115234375, + "learning_rate": 8.32363798806011e-07, + "logits/chosen": -2.2580156326293945, + "logits/rejected": -2.499662160873413, + "logps/chosen": -43.39426803588867, + "logps/rejected": -1570.83154296875, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2131904661655426, + "rewards/margins": 14.96166706085205, + "rewards/rejected": -15.174858093261719, + "step": 3040 + }, + { + "epoch": 0.76, + "grad_norm": 0.0703125, + "learning_rate": 8.161873570417742e-07, + "logits/chosen": -2.205913543701172, + "logits/rejected": -2.461812973022461, + "logps/chosen": -51.799095153808594, + "logps/rejected": -1628.696044921875, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2980988919734955, + "rewards/margins": 15.454202651977539, + "rewards/rejected": -15.752302169799805, + "step": 3050 + }, + { + "epoch": 0.76, + "grad_norm": 0.0263671875, + "learning_rate": 8.001389306828897e-07, + "logits/chosen": -2.1009681224823, + "logits/rejected": -2.3735690116882324, + "logps/chosen": -57.14350128173828, + "logps/rejected": -1912.2584228515625, + "loss": 0.0031, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3501451313495636, + "rewards/margins": 18.137147903442383, + "rewards/rejected": -18.487293243408203, + "step": 3060 + }, + { + "epoch": 0.77, + "grad_norm": 2.002716064453125e-05, + "learning_rate": 7.842197398447993e-07, + "logits/chosen": -2.145404100418091, + "logits/rejected": -2.3879191875457764, + "logps/chosen": -46.874935150146484, + "logps/rejected": -1601.4229736328125, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.25245028734207153, + "rewards/margins": 15.218485832214355, + "rewards/rejected": -15.470934867858887, + "step": 3070 + }, + { + "epoch": 0.77, + "grad_norm": 0.0014801025390625, + "learning_rate": 7.684309948175414e-07, + "logits/chosen": -2.1167359352111816, + "logits/rejected": -2.340625047683716, + "logps/chosen": -41.6113395690918, + "logps/rejected": -1543.5413818359375, + "loss": 0.002, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.19553497433662415, + "rewards/margins": 14.705103874206543, + "rewards/rejected": -14.900639533996582, + "step": 3080 + }, + { + "epoch": 0.77, + "grad_norm": 0.0126953125, + "learning_rate": 7.527738959737371e-07, + "logits/chosen": -2.1751418113708496, + "logits/rejected": -2.421253204345703, + "logps/chosen": -55.10563278198242, + "logps/rejected": -1535.4105224609375, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.33046358823776245, + "rewards/margins": 14.514165878295898, + "rewards/rejected": -14.844629287719727, + "step": 3090 + }, + { + "epoch": 0.77, + "grad_norm": 0.1435546875, + "learning_rate": 7.372496336773269e-07, + "logits/chosen": -2.143078565597534, + "logits/rejected": -2.3641624450683594, + "logps/chosen": -44.76749038696289, + "logps/rejected": -1385.771728515625, + "loss": 0.0058, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.23040492832660675, + "rewards/margins": 13.126627922058105, + "rewards/rejected": -13.357030868530273, + "step": 3100 + }, + { + "epoch": 0.78, + "grad_norm": 0.1650390625, + "learning_rate": 7.218593881930744e-07, + "logits/chosen": -2.237316370010376, + "logits/rejected": -2.46457839012146, + "logps/chosen": -43.913902282714844, + "logps/rejected": -1421.6866455078125, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.21920108795166016, + "rewards/margins": 13.50879955291748, + "rewards/rejected": -13.728001594543457, + "step": 3110 + }, + { + "epoch": 0.78, + "grad_norm": 0.0218505859375, + "learning_rate": 7.066043295968342e-07, + "logits/chosen": -2.2042956352233887, + "logits/rejected": -2.437238931655884, + "logps/chosen": -38.968666076660156, + "logps/rejected": -1539.2254638671875, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1654917150735855, + "rewards/margins": 14.6710786819458, + "rewards/rejected": -14.836568832397461, + "step": 3120 + }, + { + "epoch": 0.78, + "grad_norm": 0.00147247314453125, + "learning_rate": 6.914856176865891e-07, + "logits/chosen": -2.2930877208709717, + "logits/rejected": -2.530980110168457, + "logps/chosen": -39.30299377441406, + "logps/rejected": -1486.0648193359375, + "loss": 0.003, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.18379421532154083, + "rewards/margins": 14.16865348815918, + "rewards/rejected": -14.352447509765625, + "step": 3130 + }, + { + "epoch": 0.78, + "grad_norm": 0.0001888275146484375, + "learning_rate": 6.765044018942804e-07, + "logits/chosen": -2.2794032096862793, + "logits/rejected": -2.5219268798828125, + "logps/chosen": -37.822265625, + "logps/rejected": -1375.9476318359375, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.16687843203544617, + "rewards/margins": 13.096723556518555, + "rewards/rejected": -13.263601303100586, + "step": 3140 + }, + { + "epoch": 0.79, + "grad_norm": 0.0673828125, + "learning_rate": 6.616618211984169e-07, + "logits/chosen": -2.189056873321533, + "logits/rejected": -2.428335666656494, + "logps/chosen": -45.38810348510742, + "logps/rejected": -1504.1986083984375, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.24011921882629395, + "rewards/margins": 14.305384635925293, + "rewards/rejected": -14.545504570007324, + "step": 3150 + }, + { + "epoch": 0.79, + "grad_norm": 0.0289306640625, + "learning_rate": 6.469590040374799e-07, + "logits/chosen": -2.135713815689087, + "logits/rejected": -2.3790910243988037, + "logps/chosen": -32.32978057861328, + "logps/rejected": -1641.7115478515625, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10296233743429184, + "rewards/margins": 15.768649101257324, + "rewards/rejected": -15.871612548828125, + "step": 3160 + }, + { + "epoch": 0.79, + "grad_norm": 0.050537109375, + "learning_rate": 6.32397068224136e-07, + "logits/chosen": -2.248927593231201, + "logits/rejected": -2.501868963241577, + "logps/chosen": -40.18678283691406, + "logps/rejected": -1579.708251953125, + "loss": 0.0054, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.18390187621116638, + "rewards/margins": 15.054614067077637, + "rewards/rejected": -15.238515853881836, + "step": 3170 + }, + { + "epoch": 0.79, + "grad_norm": 0.1005859375, + "learning_rate": 6.17977120860249e-07, + "logits/chosen": -2.2377326488494873, + "logits/rejected": -2.4842400550842285, + "logps/chosen": -68.27392578125, + "logps/rejected": -1504.4427490234375, + "loss": 0.0026, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4537542462348938, + "rewards/margins": 14.08979606628418, + "rewards/rejected": -14.543548583984375, + "step": 3180 + }, + { + "epoch": 0.8, + "grad_norm": 0.1435546875, + "learning_rate": 6.037002582527121e-07, + "logits/chosen": -2.17307710647583, + "logits/rejected": -2.4036478996276855, + "logps/chosen": -38.560646057128906, + "logps/rejected": -1505.03662109375, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.169607013463974, + "rewards/margins": 14.335187911987305, + "rewards/rejected": -14.504794120788574, + "step": 3190 + }, + { + "epoch": 0.8, + "grad_norm": 0.07666015625, + "learning_rate": 5.895675658300981e-07, + "logits/chosen": -2.3447728157043457, + "logits/rejected": -2.5695431232452393, + "logps/chosen": -52.10234451293945, + "logps/rejected": -1275.713134765625, + "loss": 0.0028, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3004547953605652, + "rewards/margins": 11.987954139709473, + "rewards/rejected": -12.288411140441895, + "step": 3200 + }, + { + "epoch": 0.8, + "grad_norm": 0.140625, + "learning_rate": 5.755801180601381e-07, + "logits/chosen": -2.2320406436920166, + "logits/rejected": -2.4947474002838135, + "logps/chosen": -46.99077224731445, + "logps/rejected": -1553.0062255859375, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.25020334124565125, + "rewards/margins": 14.758695602416992, + "rewards/rejected": -15.008898735046387, + "step": 3210 + }, + { + "epoch": 0.8, + "grad_norm": 0.1240234375, + "learning_rate": 5.617389783680307e-07, + "logits/chosen": -2.0936381816864014, + "logits/rejected": -2.3752357959747314, + "logps/chosen": -44.832740783691406, + "logps/rejected": -1831.029296875, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.22681677341461182, + "rewards/margins": 17.518943786621094, + "rewards/rejected": -17.74576187133789, + "step": 3220 + }, + { + "epoch": 0.81, + "grad_norm": 0.197265625, + "learning_rate": 5.48045199055596e-07, + "logits/chosen": -2.19124174118042, + "logits/rejected": -2.438732624053955, + "logps/chosen": -44.39277648925781, + "logps/rejected": -1470.6522216796875, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.22872138023376465, + "rewards/margins": 13.976783752441406, + "rewards/rejected": -14.205507278442383, + "step": 3230 + }, + { + "epoch": 0.81, + "grad_norm": 7.264316082000732e-07, + "learning_rate": 5.344998212212704e-07, + "logits/chosen": -2.103717565536499, + "logits/rejected": -2.3787877559661865, + "logps/chosen": -46.40209197998047, + "logps/rejected": -1813.8245849609375, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.23586265742778778, + "rewards/margins": 17.30778694152832, + "rewards/rejected": -17.543649673461914, + "step": 3240 + }, + { + "epoch": 0.81, + "grad_norm": 0.01300048828125, + "learning_rate": 5.211038746809551e-07, + "logits/chosen": -2.2235634326934814, + "logits/rejected": -2.4578189849853516, + "logps/chosen": -50.56513595581055, + "logps/rejected": -1465.8817138671875, + "loss": 0.0031, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.29284316301345825, + "rewards/margins": 13.869397163391113, + "rewards/rejected": -14.162240982055664, + "step": 3250 + }, + { + "epoch": 0.81, + "grad_norm": 0.1123046875, + "learning_rate": 5.078583778897216e-07, + "logits/chosen": -2.2172188758850098, + "logits/rejected": -2.4327051639556885, + "logps/chosen": -58.36212158203125, + "logps/rejected": -1398.1490478515625, + "loss": 0.0047, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.36190560460090637, + "rewards/margins": 13.11742877960205, + "rewards/rejected": -13.479333877563477, + "step": 3260 + }, + { + "epoch": 0.82, + "grad_norm": 0.11962890625, + "learning_rate": 4.94764337864384e-07, + "logits/chosen": -2.304565668106079, + "logits/rejected": -2.5306572914123535, + "logps/chosen": -43.99140548706055, + "logps/rejected": -1435.986572265625, + "loss": 0.0019, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2204219549894333, + "rewards/margins": 13.629618644714355, + "rewards/rejected": -13.8500394821167, + "step": 3270 + }, + { + "epoch": 0.82, + "grad_norm": 0.0184326171875, + "learning_rate": 4.818227501069328e-07, + "logits/chosen": -2.259232521057129, + "logits/rejected": -2.5605130195617676, + "logps/chosen": -62.873863220214844, + "logps/rejected": -1876.69140625, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.39689213037490845, + "rewards/margins": 17.804046630859375, + "rewards/rejected": -18.200939178466797, + "step": 3280 + }, + { + "epoch": 0.82, + "grad_norm": 0.0203857421875, + "learning_rate": 4.690345985288572e-07, + "logits/chosen": -2.158508777618408, + "logits/rejected": -2.399550437927246, + "logps/chosen": -40.623416900634766, + "logps/rejected": -1611.9981689453125, + "loss": 0.0024, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.18651778995990753, + "rewards/margins": 15.386564254760742, + "rewards/rejected": -15.573080062866211, + "step": 3290 + }, + { + "epoch": 0.82, + "grad_norm": 1.5273690223693848e-06, + "learning_rate": 4.5640085537633633e-07, + "logits/chosen": -2.185797691345215, + "logits/rejected": -2.462428331375122, + "logps/chosen": -64.81330871582031, + "logps/rejected": -1722.494384765625, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.42417916655540466, + "rewards/margins": 16.25531578063965, + "rewards/rejected": -16.679494857788086, + "step": 3300 + }, + { + "epoch": 0.83, + "grad_norm": 5.936622619628906e-05, + "learning_rate": 4.439224811563211e-07, + "logits/chosen": -2.0893611907958984, + "logits/rejected": -2.3352718353271484, + "logps/chosen": -42.231239318847656, + "logps/rejected": -1722.9840087890625, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.20374104380607605, + "rewards/margins": 16.47307586669922, + "rewards/rejected": -16.67681884765625, + "step": 3310 + }, + { + "epoch": 0.83, + "grad_norm": 0.000370025634765625, + "learning_rate": 4.316004245635158e-07, + "logits/chosen": -2.1728897094726562, + "logits/rejected": -2.4218878746032715, + "logps/chosen": -47.56116485595703, + "logps/rejected": -1730.8961181640625, + "loss": 0.0051, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.25542110204696655, + "rewards/margins": 16.49363899230957, + "rewards/rejected": -16.74905776977539, + "step": 3320 + }, + { + "epoch": 0.83, + "grad_norm": 9.1552734375e-05, + "learning_rate": 4.194356224082455e-07, + "logits/chosen": -2.095263957977295, + "logits/rejected": -2.3779187202453613, + "logps/chosen": -44.129798889160156, + "logps/rejected": -1774.712158203125, + "loss": 0.0035, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.22583921253681183, + "rewards/margins": 16.942506790161133, + "rewards/rejected": -17.168346405029297, + "step": 3330 + }, + { + "epoch": 0.83, + "grad_norm": 0.00445556640625, + "learning_rate": 4.074289995452338e-07, + "logits/chosen": -2.1644439697265625, + "logits/rejected": -2.4039626121520996, + "logps/chosen": -55.25883102416992, + "logps/rejected": -1481.6717529296875, + "loss": 0.0057, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3257646858692169, + "rewards/margins": 13.985359191894531, + "rewards/rejected": -14.311124801635742, + "step": 3340 + }, + { + "epoch": 0.84, + "grad_norm": 0.00010395050048828125, + "learning_rate": 3.9558146880329246e-07, + "logits/chosen": -2.1858904361724854, + "logits/rejected": -2.422576427459717, + "logps/chosen": -38.67001724243164, + "logps/rejected": -1623.956298828125, + "loss": 0.0039, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1640951931476593, + "rewards/margins": 15.4973783493042, + "rewards/rejected": -15.661474227905273, + "step": 3350 + }, + { + "epoch": 0.84, + "grad_norm": 0.03369140625, + "learning_rate": 3.838939309159187e-07, + "logits/chosen": -2.179760694503784, + "logits/rejected": -2.4091451168060303, + "logps/chosen": -44.970279693603516, + "logps/rejected": -1523.0604248046875, + "loss": 0.0018, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.23116175830364227, + "rewards/margins": 14.479741096496582, + "rewards/rejected": -14.710905075073242, + "step": 3360 + }, + { + "epoch": 0.84, + "grad_norm": 0.005523681640625, + "learning_rate": 3.723672744528162e-07, + "logits/chosen": -2.256727695465088, + "logits/rejected": -2.5047221183776855, + "logps/chosen": -41.15488815307617, + "logps/rejected": -1610.76220703125, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.19345328211784363, + "rewards/margins": 15.386802673339844, + "rewards/rejected": -15.580256462097168, + "step": 3370 + }, + { + "epoch": 0.84, + "grad_norm": 0.00286865234375, + "learning_rate": 3.6100237575233647e-07, + "logits/chosen": -2.3228962421417236, + "logits/rejected": -2.5387914180755615, + "logps/chosen": -51.057395935058594, + "logps/rejected": -1306.785400390625, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2881939113140106, + "rewards/margins": 12.320541381835938, + "rewards/rejected": -12.608736038208008, + "step": 3380 + }, + { + "epoch": 0.85, + "grad_norm": 0.06591796875, + "learning_rate": 3.4980009885486054e-07, + "logits/chosen": -2.25309157371521, + "logits/rejected": -2.4575724601745605, + "logps/chosen": -41.59147644042969, + "logps/rejected": -1270.520751953125, + "loss": 0.0051, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.20340308547019958, + "rewards/margins": 12.038886070251465, + "rewards/rejected": -12.242289543151855, + "step": 3390 + }, + { + "epoch": 0.85, + "grad_norm": 0.09423828125, + "learning_rate": 3.3876129543710197e-07, + "logits/chosen": -2.2136871814727783, + "logits/rejected": -2.4533486366271973, + "logps/chosen": -39.60608673095703, + "logps/rejected": -1685.295654296875, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1774289309978485, + "rewards/margins": 16.114295959472656, + "rewards/rejected": -16.291725158691406, + "step": 3400 + }, + { + "epoch": 0.85, + "grad_norm": 0.002960205078125, + "learning_rate": 3.2788680474735687e-07, + "logits/chosen": -2.194180488586426, + "logits/rejected": -2.439089298248291, + "logps/chosen": -38.77867889404297, + "logps/rejected": -1473.8935546875, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.174292653799057, + "rewards/margins": 14.067087173461914, + "rewards/rejected": -14.24138069152832, + "step": 3410 + }, + { + "epoch": 0.85, + "grad_norm": 0.01177978515625, + "learning_rate": 3.1717745354170214e-07, + "logits/chosen": -2.0905921459198, + "logits/rejected": -2.367194175720215, + "logps/chosen": -52.29819869995117, + "logps/rejected": -1684.010986328125, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3022548258304596, + "rewards/margins": 16.003009796142578, + "rewards/rejected": -16.30526351928711, + "step": 3420 + }, + { + "epoch": 0.86, + "grad_norm": 0.002471923828125, + "learning_rate": 3.0663405602113727e-07, + "logits/chosen": -2.258749485015869, + "logits/rejected": -2.5248327255249023, + "logps/chosen": -48.49363327026367, + "logps/rejected": -1557.198486328125, + "loss": 0.0016, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2676192820072174, + "rewards/margins": 14.78343391418457, + "rewards/rejected": -15.051053047180176, + "step": 3430 + }, + { + "epoch": 0.86, + "grad_norm": 0.021240234375, + "learning_rate": 2.9625741376968107e-07, + "logits/chosen": -2.0779199600219727, + "logits/rejected": -2.3387115001678467, + "logps/chosen": -61.849632263183594, + "logps/rejected": -1759.1298828125, + "loss": 0.0025, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3940272629261017, + "rewards/margins": 16.581546783447266, + "rewards/rejected": -16.975570678710938, + "step": 3440 + }, + { + "epoch": 0.86, + "grad_norm": 0.0257568359375, + "learning_rate": 2.8604831569343324e-07, + "logits/chosen": -2.3138976097106934, + "logits/rejected": -2.529716968536377, + "logps/chosen": -50.220458984375, + "logps/rejected": -1407.71826171875, + "loss": 0.0024, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2772979736328125, + "rewards/margins": 13.286694526672363, + "rewards/rejected": -13.563992500305176, + "step": 3450 + }, + { + "epoch": 0.86, + "grad_norm": 0.0184326171875, + "learning_rate": 2.760075379605942e-07, + "logits/chosen": -2.144134759902954, + "logits/rejected": -2.3687326908111572, + "logps/chosen": -49.850528717041016, + "logps/rejected": -1546.928955078125, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2831845283508301, + "rewards/margins": 14.664377212524414, + "rewards/rejected": -14.947561264038086, + "step": 3460 + }, + { + "epoch": 0.87, + "grad_norm": 0.291015625, + "learning_rate": 2.661358439424552e-07, + "logits/chosen": -2.209009885787964, + "logits/rejected": -2.4345531463623047, + "logps/chosen": -46.4788818359375, + "logps/rejected": -1371.7965087890625, + "loss": 0.0041, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.24716854095458984, + "rewards/margins": 12.960786819458008, + "rewards/rejected": -13.207954406738281, + "step": 3470 + }, + { + "epoch": 0.87, + "grad_norm": 0.005462646484375, + "learning_rate": 2.564339841553615e-07, + "logits/chosen": -2.200819969177246, + "logits/rejected": -2.416544198989868, + "logps/chosen": -43.68715286254883, + "logps/rejected": -1402.603759765625, + "loss": 0.0036, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.22630712389945984, + "rewards/margins": 13.287277221679688, + "rewards/rejected": -13.513586044311523, + "step": 3480 + }, + { + "epoch": 0.87, + "grad_norm": 0.462890625, + "learning_rate": 2.469026962036539e-07, + "logits/chosen": -2.1682403087615967, + "logits/rejected": -2.384089946746826, + "logps/chosen": -43.91130447387695, + "logps/rejected": -1500.0106201171875, + "loss": 0.0071, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.22072705626487732, + "rewards/margins": 14.21442699432373, + "rewards/rejected": -14.435153007507324, + "step": 3490 + }, + { + "epoch": 0.87, + "grad_norm": 0.0478515625, + "learning_rate": 2.3754270472358786e-07, + "logits/chosen": -2.17598032951355, + "logits/rejected": -2.39375638961792, + "logps/chosen": -40.356300354003906, + "logps/rejected": -1466.735595703125, + "loss": 0.0031, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.18523935973644257, + "rewards/margins": 13.940629959106445, + "rewards/rejected": -14.125869750976562, + "step": 3500 + }, + { + "epoch": 0.88, + "grad_norm": 0.51953125, + "learning_rate": 2.283547213282458e-07, + "logits/chosen": -2.2732253074645996, + "logits/rejected": -2.502781629562378, + "logps/chosen": -46.22926330566406, + "logps/rejected": -1531.5400390625, + "loss": 0.0026, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.24311251938343048, + "rewards/margins": 14.52368450164795, + "rewards/rejected": -14.76679515838623, + "step": 3510 + }, + { + "epoch": 0.88, + "grad_norm": 0.1328125, + "learning_rate": 2.1933944455343166e-07, + "logits/chosen": -2.0053231716156006, + "logits/rejected": -2.297400951385498, + "logps/chosen": -57.17345428466797, + "logps/rejected": -1729.294921875, + "loss": 0.0021, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.35318198800086975, + "rewards/margins": 16.38203239440918, + "rewards/rejected": -16.735218048095703, + "step": 3520 + }, + { + "epoch": 0.88, + "grad_norm": 0.00653076171875, + "learning_rate": 2.104975598045647e-07, + "logits/chosen": -2.1619279384613037, + "logits/rejected": -2.387904167175293, + "logps/chosen": -37.9974250793457, + "logps/rejected": -1388.173583984375, + "loss": 0.0042, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.16347357630729675, + "rewards/margins": 13.225934982299805, + "rewards/rejected": -13.38940715789795, + "step": 3530 + }, + { + "epoch": 0.88, + "grad_norm": 0.19921875, + "learning_rate": 2.018297393045701e-07, + "logits/chosen": -2.201099395751953, + "logits/rejected": -2.4077112674713135, + "logps/chosen": -43.07371139526367, + "logps/rejected": -1454.396728515625, + "loss": 0.0026, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.21469160914421082, + "rewards/margins": 13.828866958618164, + "rewards/rejected": -14.043559074401855, + "step": 3540 + }, + { + "epoch": 0.89, + "grad_norm": 0.05859375, + "learning_rate": 1.9333664204277236e-07, + "logits/chosen": -2.0957770347595215, + "logits/rejected": -2.3300156593322754, + "logps/chosen": -40.987003326416016, + "logps/rejected": -1783.749755859375, + "loss": 0.0048, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1888490617275238, + "rewards/margins": 17.056163787841797, + "rewards/rejected": -17.245014190673828, + "step": 3550 + }, + { + "epoch": 0.89, + "grad_norm": 0.0001068115234375, + "learning_rate": 1.8501891372479124e-07, + "logits/chosen": -2.1852810382843018, + "logits/rejected": -2.4316954612731934, + "logps/chosen": -44.98752975463867, + "logps/rejected": -1581.531982421875, + "loss": 0.0025, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.22910073399543762, + "rewards/margins": 15.049044609069824, + "rewards/rejected": -15.278146743774414, + "step": 3560 + }, + { + "epoch": 0.89, + "grad_norm": 0.115234375, + "learning_rate": 1.7687718672345533e-07, + "logits/chosen": -2.1352264881134033, + "logits/rejected": -2.3711695671081543, + "logps/chosen": -51.448890686035156, + "logps/rejected": -1699.1624755859375, + "loss": 0.0018, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.29201555252075195, + "rewards/margins": 16.14071273803711, + "rewards/rejected": -16.432727813720703, + "step": 3570 + }, + { + "epoch": 0.89, + "grad_norm": 0.002471923828125, + "learning_rate": 1.689120800307212e-07, + "logits/chosen": -2.0329132080078125, + "logits/rejected": -2.287954092025757, + "logps/chosen": -46.30390167236328, + "logps/rejected": -1923.584716796875, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.24167513847351074, + "rewards/margins": 18.367229461669922, + "rewards/rejected": -18.608905792236328, + "step": 3580 + }, + { + "epoch": 0.9, + "grad_norm": 0.1376953125, + "learning_rate": 1.6112419921061357e-07, + "logits/chosen": -2.1787500381469727, + "logits/rejected": -2.4122672080993652, + "logps/chosen": -50.2169075012207, + "logps/rejected": -1459.5537109375, + "loss": 0.0026, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2793883681297302, + "rewards/margins": 13.814018249511719, + "rewards/rejected": -14.093404769897461, + "step": 3590 + }, + { + "epoch": 0.9, + "grad_norm": 0.2060546875, + "learning_rate": 1.5351413635318807e-07, + "logits/chosen": -2.2764883041381836, + "logits/rejected": -2.5183794498443604, + "logps/chosen": -47.43731689453125, + "logps/rejected": -1477.555419921875, + "loss": 0.0025, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2550382614135742, + "rewards/margins": 13.986851692199707, + "rewards/rejected": -14.241891860961914, + "step": 3600 + }, + { + "epoch": 0.9, + "grad_norm": 0.1279296875, + "learning_rate": 1.460824700295138e-07, + "logits/chosen": -2.268395185470581, + "logits/rejected": -2.5008223056793213, + "logps/chosen": -55.70990753173828, + "logps/rejected": -1575.289794921875, + "loss": 0.0045, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3271048367023468, + "rewards/margins": 14.906885147094727, + "rewards/rejected": -15.233988761901855, + "step": 3610 + }, + { + "epoch": 0.9, + "grad_norm": 0.05224609375, + "learning_rate": 1.3882976524768694e-07, + "logits/chosen": -2.2560763359069824, + "logits/rejected": -2.474360942840576, + "logps/chosen": -48.290626525878906, + "logps/rejected": -1297.683349609375, + "loss": 0.0088, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.25945621728897095, + "rewards/margins": 12.217463493347168, + "rewards/rejected": -12.476920127868652, + "step": 3620 + }, + { + "epoch": 0.91, + "grad_norm": 0.0004482269287109375, + "learning_rate": 1.3175657340987664e-07, + "logits/chosen": -2.1752967834472656, + "logits/rejected": -2.405945301055908, + "logps/chosen": -39.73582077026367, + "logps/rejected": -1538.538818359375, + "loss": 0.0025, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1809137761592865, + "rewards/margins": 14.694430351257324, + "rewards/rejected": -14.875345230102539, + "step": 3630 + }, + { + "epoch": 0.91, + "grad_norm": 0.018798828125, + "learning_rate": 1.2486343227040122e-07, + "logits/chosen": -2.286973476409912, + "logits/rejected": -2.5369372367858887, + "logps/chosen": -47.38744354248047, + "logps/rejected": -1557.68798828125, + "loss": 0.0088, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.24152731895446777, + "rewards/margins": 14.780197143554688, + "rewards/rejected": -15.021723747253418, + "step": 3640 + }, + { + "epoch": 0.91, + "grad_norm": 0.91796875, + "learning_rate": 1.181508658948452e-07, + "logits/chosen": -2.2179925441741943, + "logits/rejected": -2.4420266151428223, + "logps/chosen": -38.083351135253906, + "logps/rejected": -1484.4072265625, + "loss": 0.0063, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.16277261078357697, + "rewards/margins": 14.176862716674805, + "rewards/rejected": -14.339635848999023, + "step": 3650 + }, + { + "epoch": 0.91, + "grad_norm": 0.10107421875, + "learning_rate": 1.1161938462021627e-07, + "logits/chosen": -2.1011550426483154, + "logits/rejected": -2.328584671020508, + "logps/chosen": -42.358421325683594, + "logps/rejected": -1466.247802734375, + "loss": 0.007, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.19884276390075684, + "rewards/margins": 13.929216384887695, + "rewards/rejected": -14.128057479858398, + "step": 3660 + }, + { + "epoch": 0.92, + "grad_norm": 0.45703125, + "learning_rate": 1.0526948501614536e-07, + "logits/chosen": -2.129077434539795, + "logits/rejected": -2.3908514976501465, + "logps/chosen": -53.09492874145508, + "logps/rejected": -1710.9476318359375, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3066931366920471, + "rewards/margins": 16.256519317626953, + "rewards/rejected": -16.563209533691406, + "step": 3670 + }, + { + "epoch": 0.92, + "grad_norm": 0.4296875, + "learning_rate": 9.910164984713477e-08, + "logits/chosen": -2.135789394378662, + "logits/rejected": -2.3950417041778564, + "logps/chosen": -43.19747543334961, + "logps/rejected": -1662.0009765625, + "loss": 0.003, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2104502171278, + "rewards/margins": 15.8755464553833, + "rewards/rejected": -16.085996627807617, + "step": 3680 + }, + { + "epoch": 0.92, + "grad_norm": 0.0751953125, + "learning_rate": 9.311634803585323e-08, + "logits/chosen": -2.1814115047454834, + "logits/rejected": -2.445276975631714, + "logps/chosen": -53.1719970703125, + "logps/rejected": -1662.022705078125, + "loss": 0.0069, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3151262402534485, + "rewards/margins": 15.78093433380127, + "rewards/rejected": -16.0960636138916, + "step": 3690 + }, + { + "epoch": 0.92, + "grad_norm": 0.00125885009765625, + "learning_rate": 8.7314034627487e-08, + "logits/chosen": -2.230149507522583, + "logits/rejected": -2.479522705078125, + "logps/chosen": -36.06591033935547, + "logps/rejected": -1619.81884765625, + "loss": 0.0038, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.14542549848556519, + "rewards/margins": 15.533602714538574, + "rewards/rejected": -15.679028511047363, + "step": 3700 + }, + { + "epoch": 0.93, + "grad_norm": 0.023681640625, + "learning_rate": 8.16951507551439e-08, + "logits/chosen": -2.2386298179626465, + "logits/rejected": -2.456665515899658, + "logps/chosen": -45.547454833984375, + "logps/rejected": -1492.232666015625, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2374843806028366, + "rewards/margins": 14.150115966796875, + "rewards/rejected": -14.387600898742676, + "step": 3710 + }, + { + "epoch": 0.93, + "grad_norm": 0.00019931793212890625, + "learning_rate": 7.626012360631291e-08, + "logits/chosen": -2.266707420349121, + "logits/rejected": -2.5029749870300293, + "logps/chosen": -49.74803924560547, + "logps/rejected": -1482.993896484375, + "loss": 0.0026, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2803342640399933, + "rewards/margins": 14.048635482788086, + "rewards/rejected": -14.328969955444336, + "step": 3720 + }, + { + "epoch": 0.93, + "grad_norm": 0.03662109375, + "learning_rate": 7.100936639038936e-08, + "logits/chosen": -2.040673017501831, + "logits/rejected": -2.3331856727600098, + "logps/chosen": -43.85186004638672, + "logps/rejected": -1894.5054931640625, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2176673710346222, + "rewards/margins": 18.155574798583984, + "rewards/rejected": -18.373241424560547, + "step": 3730 + }, + { + "epoch": 0.93, + "grad_norm": 1.0609626770019531e-05, + "learning_rate": 6.594327830725916e-08, + "logits/chosen": -2.190392017364502, + "logits/rejected": -2.451129913330078, + "logps/chosen": -54.63507080078125, + "logps/rejected": -1576.1376953125, + "loss": 0.0019, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.32724398374557495, + "rewards/margins": 14.939753532409668, + "rewards/rejected": -15.266998291015625, + "step": 3740 + }, + { + "epoch": 0.94, + "grad_norm": 0.1083984375, + "learning_rate": 6.106224451694592e-08, + "logits/chosen": -2.2175679206848145, + "logits/rejected": -2.4564685821533203, + "logps/chosen": -46.39513397216797, + "logps/rejected": -1616.2586669921875, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.24283328652381897, + "rewards/margins": 15.379629135131836, + "rewards/rejected": -15.622464179992676, + "step": 3750 + }, + { + "epoch": 0.94, + "grad_norm": 0.1669921875, + "learning_rate": 5.636663611033266e-08, + "logits/chosen": -2.0778698921203613, + "logits/rejected": -2.3482048511505127, + "logps/chosen": -44.864990234375, + "logps/rejected": -1604.7249755859375, + "loss": 0.003, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.23550191521644592, + "rewards/margins": 15.299467086791992, + "rewards/rejected": -15.534968376159668, + "step": 3760 + }, + { + "epoch": 0.94, + "grad_norm": 0.0162353515625, + "learning_rate": 5.185681008094579e-08, + "logits/chosen": -2.284482479095459, + "logits/rejected": -2.5175766944885254, + "logps/chosen": -47.569602966308594, + "logps/rejected": -1563.1016845703125, + "loss": 0.0027, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.25453075766563416, + "rewards/margins": 14.83684253692627, + "rewards/rejected": -15.0913724899292, + "step": 3770 + }, + { + "epoch": 0.94, + "grad_norm": 0.00015735626220703125, + "learning_rate": 4.753310929781513e-08, + "logits/chosen": -2.2356629371643066, + "logits/rejected": -2.451608657836914, + "logps/chosen": -53.69340133666992, + "logps/rejected": -1443.2952880859375, + "loss": 0.0035, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3177313208580017, + "rewards/margins": 13.615710258483887, + "rewards/rejected": -13.933443069458008, + "step": 3780 + }, + { + "epoch": 0.95, + "grad_norm": 6.246566772460938e-05, + "learning_rate": 4.3395862479405914e-08, + "logits/chosen": -2.156893253326416, + "logits/rejected": -2.395084857940674, + "logps/chosen": -46.39254379272461, + "logps/rejected": -1630.5118408203125, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.24043354392051697, + "rewards/margins": 15.48585319519043, + "rewards/rejected": -15.726287841796875, + "step": 3790 + }, + { + "epoch": 0.95, + "grad_norm": 0.0162353515625, + "learning_rate": 3.9445384168628474e-08, + "logits/chosen": -2.328781843185425, + "logits/rejected": -2.580176591873169, + "logps/chosen": -52.544822692871094, + "logps/rejected": -1474.49365234375, + "loss": 0.0026, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3075386881828308, + "rewards/margins": 13.93040657043457, + "rewards/rejected": -14.237945556640625, + "step": 3800 + }, + { + "epoch": 0.95, + "grad_norm": 0.046630859375, + "learning_rate": 3.5681974708923484e-08, + "logits/chosen": -2.120448589324951, + "logits/rejected": -2.3445982933044434, + "logps/chosen": -37.53495788574219, + "logps/rejected": -1442.9287109375, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1521245837211609, + "rewards/margins": 13.743891716003418, + "rewards/rejected": -13.896017074584961, + "step": 3810 + }, + { + "epoch": 0.95, + "grad_norm": 0.01068115234375, + "learning_rate": 3.210592022142717e-08, + "logits/chosen": -2.1601128578186035, + "logits/rejected": -2.3705685138702393, + "logps/chosen": -52.31241989135742, + "logps/rejected": -1557.874755859375, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3095288872718811, + "rewards/margins": 14.724299430847168, + "rewards/rejected": -15.033828735351562, + "step": 3820 + }, + { + "epoch": 0.96, + "grad_norm": 0.09912109375, + "learning_rate": 2.8717492583220095e-08, + "logits/chosen": -2.2527565956115723, + "logits/rejected": -2.4976978302001953, + "logps/chosen": -45.60851287841797, + "logps/rejected": -1558.29931640625, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.24002361297607422, + "rewards/margins": 14.838605880737305, + "rewards/rejected": -15.078630447387695, + "step": 3830 + }, + { + "epoch": 0.96, + "grad_norm": 1.1171875, + "learning_rate": 2.551694940665539e-08, + "logits/chosen": -2.191880464553833, + "logits/rejected": -2.41998291015625, + "logps/chosen": -51.693626403808594, + "logps/rejected": -1440.926513671875, + "loss": 0.004, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.29828059673309326, + "rewards/margins": 13.601608276367188, + "rewards/rejected": -13.899889945983887, + "step": 3840 + }, + { + "epoch": 0.96, + "grad_norm": 0.0006561279296875, + "learning_rate": 2.2504534019774092e-08, + "logits/chosen": -2.347978353500366, + "logits/rejected": -2.5543174743652344, + "logps/chosen": -42.8640251159668, + "logps/rejected": -1363.5374755859375, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2091234177350998, + "rewards/margins": 12.935150146484375, + "rewards/rejected": -13.14427375793457, + "step": 3850 + }, + { + "epoch": 0.96, + "grad_norm": 0.099609375, + "learning_rate": 1.9680475447805826e-08, + "logits/chosen": -2.231818437576294, + "logits/rejected": -2.4574391841888428, + "logps/chosen": -58.65113067626953, + "logps/rejected": -1416.2275390625, + "loss": 0.0064, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3713977038860321, + "rewards/margins": 13.290824890136719, + "rewards/rejected": -13.662221908569336, + "step": 3860 + }, + { + "epoch": 0.97, + "grad_norm": 0.00102996826171875, + "learning_rate": 1.70449883957563e-08, + "logits/chosen": -2.254411220550537, + "logits/rejected": -2.486921787261963, + "logps/chosen": -50.42361831665039, + "logps/rejected": -1524.43603515625, + "loss": 0.0021, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.28153225779533386, + "rewards/margins": 14.412869453430176, + "rewards/rejected": -14.694402694702148, + "step": 3870 + }, + { + "epoch": 0.97, + "grad_norm": 0.07080078125, + "learning_rate": 1.4598273232083182e-08, + "logits/chosen": -2.2290568351745605, + "logits/rejected": -2.4446380138397217, + "logps/chosen": -40.07840347290039, + "logps/rejected": -1425.9326171875, + "loss": 0.0053, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.18418380618095398, + "rewards/margins": 13.563642501831055, + "rewards/rejected": -13.747825622558594, + "step": 3880 + }, + { + "epoch": 0.97, + "grad_norm": 0.000240325927734375, + "learning_rate": 1.2340515973464917e-08, + "logits/chosen": -2.1592583656311035, + "logits/rejected": -2.4244697093963623, + "logps/chosen": -54.51990509033203, + "logps/rejected": -1623.231689453125, + "loss": 0.0021, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.32529979944229126, + "rewards/margins": 15.373883247375488, + "rewards/rejected": -15.699182510375977, + "step": 3890 + }, + { + "epoch": 0.97, + "grad_norm": 0.007537841796875, + "learning_rate": 1.0271888270655118e-08, + "logits/chosen": -2.0656638145446777, + "logits/rejected": -2.2895803451538086, + "logps/chosen": -38.120445251464844, + "logps/rejected": -1580.198486328125, + "loss": 0.0025, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.16423656046390533, + "rewards/margins": 15.07524299621582, + "rewards/rejected": -15.239479064941406, + "step": 3900 + }, + { + "epoch": 0.98, + "grad_norm": 0.107421875, + "learning_rate": 8.392547395435769e-09, + "logits/chosen": -2.413311243057251, + "logits/rejected": -2.6234774589538574, + "logps/chosen": -62.75360107421875, + "logps/rejected": -1339.970458984375, + "loss": 0.0035, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.40989524126052856, + "rewards/margins": 12.507749557495117, + "rewards/rejected": -12.917645454406738, + "step": 3910 + }, + { + "epoch": 0.98, + "grad_norm": 0.00030517578125, + "learning_rate": 6.702636228657911e-09, + "logits/chosen": -2.292491912841797, + "logits/rejected": -2.5207715034484863, + "logps/chosen": -43.633766174316406, + "logps/rejected": -1431.236083984375, + "loss": 0.0028, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2170587033033371, + "rewards/margins": 13.580774307250977, + "rewards/rejected": -13.797833442687988, + "step": 3920 + }, + { + "epoch": 0.98, + "grad_norm": 0.181640625, + "learning_rate": 5.2022832493800465e-09, + "logits/chosen": -2.363647937774658, + "logits/rejected": -2.5729119777679443, + "logps/chosen": -53.60352325439453, + "logps/rejected": -1316.2359619140625, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3134381175041199, + "rewards/margins": 12.360208511352539, + "rewards/rejected": -12.673646926879883, + "step": 3930 + }, + { + "epoch": 0.98, + "grad_norm": 0.0001983642578125, + "learning_rate": 3.891602525100124e-09, + "logits/chosen": -2.2279720306396484, + "logits/rejected": -2.4751038551330566, + "logps/chosen": -46.06736373901367, + "logps/rejected": -1572.96337890625, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.24313127994537354, + "rewards/margins": 14.95093059539795, + "rewards/rejected": -15.194061279296875, + "step": 3940 + }, + { + "epoch": 0.99, + "grad_norm": 0.01434326171875, + "learning_rate": 2.7706937030827495e-09, + "logits/chosen": -2.2832016944885254, + "logits/rejected": -2.5126912593841553, + "logps/chosen": -51.094390869140625, + "logps/rejected": -1317.524169921875, + "loss": 0.0025, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2949807047843933, + "rewards/margins": 12.386547088623047, + "rewards/rejected": -12.681528091430664, + "step": 3950 + }, + { + "epoch": 0.99, + "grad_norm": 0.028564453125, + "learning_rate": 1.839642002783859e-09, + "logits/chosen": -2.2053470611572266, + "logits/rejected": -2.42755389213562, + "logps/chosen": -39.788963317871094, + "logps/rejected": -1376.3099365234375, + "loss": 0.0036, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.18181900680065155, + "rewards/margins": 13.076495170593262, + "rewards/rejected": -13.25831413269043, + "step": 3960 + }, + { + "epoch": 0.99, + "grad_norm": 0.03857421875, + "learning_rate": 1.0985182093714574e-09, + "logits/chosen": -2.253420352935791, + "logits/rejected": -2.4623093605041504, + "logps/chosen": -59.57801055908203, + "logps/rejected": -1399.8045654296875, + "loss": 0.0026, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3837854266166687, + "rewards/margins": 13.101987838745117, + "rewards/rejected": -13.485774040222168, + "step": 3970 + }, + { + "epoch": 0.99, + "grad_norm": 0.0198974609375, + "learning_rate": 5.473786683440896e-10, + "logits/chosen": -2.1431727409362793, + "logits/rejected": -2.3964176177978516, + "logps/chosen": -56.668495178222656, + "logps/rejected": -1672.459716796875, + "loss": 0.0039, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.34237998723983765, + "rewards/margins": 15.848222732543945, + "rewards/rejected": -16.190601348876953, + "step": 3980 + }, + { + "epoch": 1.0, + "grad_norm": 0.09765625, + "learning_rate": 1.862652812467669e-10, + "logits/chosen": -2.1814169883728027, + "logits/rejected": -2.422477960586548, + "logps/chosen": -39.490379333496094, + "logps/rejected": -1714.8070068359375, + "loss": 0.0018, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.17177413403987885, + "rewards/margins": 16.374120712280273, + "rewards/rejected": -16.545894622802734, + "step": 3990 + }, + { + "epoch": 1.0, + "grad_norm": 0.0001277923583984375, + "learning_rate": 1.5205502486292932e-11, + "logits/chosen": -2.172867774963379, + "logits/rejected": -2.422581911087036, + "logps/chosen": -44.31547164916992, + "logps/rejected": -1577.330078125, + "loss": 0.0019, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.23060818016529083, + "rewards/margins": 15.038507461547852, + "rewards/rejected": -15.269113540649414, + "step": 4000 + }, + { + "epoch": 1.0, + "eval_logits/chosen": -2.614182472229004, + "eval_logits/rejected": -2.732987642288208, + "eval_logps/chosen": -60.27886962890625, + "eval_logps/rejected": -752.9370727539062, + "eval_loss": 0.0027744148392230272, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": -0.34365367889404297, + "eval_rewards/margins": 6.736997127532959, + "eval_rewards/rejected": -7.080650329589844, + "eval_runtime": 0.6551, + "eval_samples_per_second": 7.633, + "eval_steps_per_second": 4.58, + "step": 4000 + }, + { + "epoch": 1.0, + "step": 4004, + "total_flos": 0.0, + "train_loss": 0.04242442899794605, + "train_runtime": 8772.4234, + "train_samples_per_second": 1.826, + "train_steps_per_second": 0.456 + } + ], + "logging_steps": 10, + "max_steps": 4004, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 1000, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}