{ "best_metric": 0.6289177536964417, "best_model_checkpoint": "models/llama-3.2-3b-sft-dpo/checkpoint-500", "epoch": 3.0, "eval_steps": 100, "global_step": 633, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004739336492890996, "grad_norm": 18.306584799400138, "learning_rate": 5.2631578947368416e-08, "logits/chosen": 1.1032867431640625, "logits/rejected": 1.1176480054855347, "logps/chosen": -175.54205322265625, "logps/rejected": -196.64266967773438, "loss": 1.0, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.04739336492890995, "grad_norm": 18.19518017806804, "learning_rate": 5.263157894736842e-07, "logits/chosen": 0.6209686994552612, "logits/rejected": 0.7449740171432495, "logps/chosen": -350.8912658691406, "logps/rejected": -307.96142578125, "loss": 0.9979, "rewards/accuracies": 0.4861111044883728, "rewards/chosen": 0.00011829059076262638, "rewards/margins": 0.016186419874429703, "rewards/rejected": -0.016068127006292343, "step": 10 }, { "epoch": 0.0947867298578199, "grad_norm": 15.415652807377189, "learning_rate": 9.99993455114332e-07, "logits/chosen": 0.9229280352592468, "logits/rejected": 0.8609384298324585, "logps/chosen": -252.894775390625, "logps/rejected": -263.6702575683594, "loss": 0.9588, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.0033816881477832794, "rewards/margins": 0.16803663969039917, "rewards/rejected": -0.164654940366745, "step": 20 }, { "epoch": 0.14218009478672985, "grad_norm": 12.850588595957225, "learning_rate": 9.992082761369566e-07, "logits/chosen": 0.8715411424636841, "logits/rejected": 0.8170267343521118, "logps/chosen": -296.8494567871094, "logps/rejected": -305.7926025390625, "loss": 0.8133, "rewards/accuracies": 0.793749988079071, "rewards/chosen": 0.6128842830657959, "rewards/margins": 1.1374889612197876, "rewards/rejected": -0.5246046781539917, "step": 30 }, { "epoch": 0.1895734597156398, "grad_norm": 14.501186311778227, "learning_rate": 9.971164749660148e-07, "logits/chosen": 0.9155582189559937, "logits/rejected": 0.9567469358444214, "logps/chosen": -313.08514404296875, "logps/rejected": -309.0679626464844, "loss": 0.7405, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.23792271316051483, "rewards/margins": 2.1163926124572754, "rewards/rejected": -1.878469467163086, "step": 40 }, { "epoch": 0.23696682464454977, "grad_norm": 11.740811645701724, "learning_rate": 9.937235266586424e-07, "logits/chosen": 0.6986435651779175, "logits/rejected": 0.8309999704360962, "logps/chosen": -319.8310852050781, "logps/rejected": -317.59918212890625, "loss": 0.6552, "rewards/accuracies": 0.793749988079071, "rewards/chosen": 0.6028285622596741, "rewards/margins": 3.663621425628662, "rewards/rejected": -3.060793161392212, "step": 50 }, { "epoch": 0.2843601895734597, "grad_norm": 14.434952077378005, "learning_rate": 9.890383118800284e-07, "logits/chosen": 0.7444020509719849, "logits/rejected": 0.7484663724899292, "logps/chosen": -327.59576416015625, "logps/rejected": -349.929931640625, "loss": 0.6285, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.3002261221408844, "rewards/margins": 3.5275771617889404, "rewards/rejected": -3.227351427078247, "step": 60 }, { "epoch": 0.33175355450236965, "grad_norm": 10.030890442911925, "learning_rate": 9.830730936592615e-07, "logits/chosen": 0.7815200090408325, "logits/rejected": 0.7069059610366821, "logps/chosen": -252.94921875, "logps/rejected": -323.2224426269531, "loss": 0.6106, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 1.3401187658309937, "rewards/margins": 5.26017427444458, "rewards/rejected": -3.920055866241455, "step": 70 }, { "epoch": 0.3791469194312796, "grad_norm": 12.131364583934603, "learning_rate": 9.758434852922123e-07, "logits/chosen": 0.7100412249565125, "logits/rejected": 0.6621907353401184, "logps/chosen": -271.33331298828125, "logps/rejected": -328.0660705566406, "loss": 0.59, "rewards/accuracies": 0.71875, "rewards/chosen": 0.908360481262207, "rewards/margins": 4.926724910736084, "rewards/rejected": -4.018364429473877, "step": 80 }, { "epoch": 0.4265402843601896, "grad_norm": 11.822232959802975, "learning_rate": 9.673684094754685e-07, "logits/chosen": 0.6003296375274658, "logits/rejected": 0.6765642762184143, "logps/chosen": -293.85015869140625, "logps/rejected": -305.929443359375, "loss": 0.586, "rewards/accuracies": 0.768750011920929, "rewards/chosen": 1.231705904006958, "rewards/margins": 4.982685089111328, "rewards/rejected": -3.750978946685791, "step": 90 }, { "epoch": 0.47393364928909953, "grad_norm": 9.616291876594419, "learning_rate": 9.576700487782773e-07, "logits/chosen": 0.6642001867294312, "logits/rejected": 0.6596721410751343, "logps/chosen": -326.2373046875, "logps/rejected": -381.3326110839844, "loss": 0.5801, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 1.7316535711288452, "rewards/margins": 6.260350704193115, "rewards/rejected": -4.5286970138549805, "step": 100 }, { "epoch": 0.47393364928909953, "eval_logits/chosen": 0.610289990901947, "eval_logits/rejected": 0.6783497929573059, "eval_logps/chosen": -339.33251953125, "eval_logps/rejected": -361.24346923828125, "eval_loss": 0.6839759349822998, "eval_rewards/accuracies": 0.6898733973503113, "eval_rewards/chosen": 0.6485355496406555, "eval_rewards/margins": 3.587477684020996, "eval_rewards/rejected": -2.9389421939849854, "eval_runtime": 76.922, "eval_samples_per_second": 32.5, "eval_steps_per_second": 1.027, "step": 100 }, { "epoch": 0.5213270142180095, "grad_norm": 11.519611398516883, "learning_rate": 9.467737875821367e-07, "logits/chosen": 0.659843385219574, "logits/rejected": 0.6010033488273621, "logps/chosen": -293.62200927734375, "logps/rejected": -334.9098205566406, "loss": 0.5742, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": 1.1434353590011597, "rewards/margins": 5.331825256347656, "rewards/rejected": -4.188389301300049, "step": 110 }, { "epoch": 0.5687203791469194, "grad_norm": 10.75922014108817, "learning_rate": 9.347081456399957e-07, "logits/chosen": 0.6637296676635742, "logits/rejected": 0.5958945155143738, "logps/chosen": -272.2585144042969, "logps/rejected": -393.41949462890625, "loss": 0.5821, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.9803568124771118, "rewards/margins": 6.413501739501953, "rewards/rejected": -5.433144569396973, "step": 120 }, { "epoch": 0.6161137440758294, "grad_norm": 11.497074098204886, "learning_rate": 9.215047034289715e-07, "logits/chosen": 0.6836856603622437, "logits/rejected": 0.6638469696044922, "logps/chosen": -275.0943603515625, "logps/rejected": -332.6889343261719, "loss": 0.5752, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.4476346969604492, "rewards/margins": 6.094024658203125, "rewards/rejected": -4.646389961242676, "step": 130 }, { "epoch": 0.6635071090047393, "grad_norm": 9.658859904375, "learning_rate": 9.07198019491959e-07, "logits/chosen": 0.61662757396698, "logits/rejected": 0.5779851675033569, "logps/chosen": -272.382080078125, "logps/rejected": -355.6089172363281, "loss": 0.5468, "rewards/accuracies": 0.793749988079071, "rewards/chosen": 0.8889511227607727, "rewards/margins": 5.594452857971191, "rewards/rejected": -4.705502510070801, "step": 140 }, { "epoch": 0.7109004739336493, "grad_norm": 10.07652231167762, "learning_rate": 8.918255399844853e-07, "logits/chosen": 0.5373108983039856, "logits/rejected": 0.654308021068573, "logps/chosen": -330.0559997558594, "logps/rejected": -349.55224609375, "loss": 0.5738, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 0.3335852324962616, "rewards/margins": 4.550914287567139, "rewards/rejected": -4.217329502105713, "step": 150 }, { "epoch": 0.7582938388625592, "grad_norm": 8.965490487953566, "learning_rate": 8.754275006635572e-07, "logits/chosen": 0.565764844417572, "logits/rejected": 0.539226233959198, "logps/chosen": -269.29742431640625, "logps/rejected": -355.60589599609375, "loss": 0.5997, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.5406277179718018, "rewards/margins": 5.479567527770996, "rewards/rejected": -4.938939571380615, "step": 160 }, { "epoch": 0.8056872037914692, "grad_norm": 9.437674903727038, "learning_rate": 8.580468215750391e-07, "logits/chosen": 0.6932438611984253, "logits/rejected": 0.636594831943512, "logps/chosen": -296.7684631347656, "logps/rejected": -367.45318603515625, "loss": 0.5783, "rewards/accuracies": 0.768750011920929, "rewards/chosen": 1.147369146347046, "rewards/margins": 5.5389909744262695, "rewards/rejected": -4.391622066497803, "step": 170 }, { "epoch": 0.8530805687203792, "grad_norm": 8.5658002946873, "learning_rate": 8.39728994715202e-07, "logits/chosen": 0.6020892858505249, "logits/rejected": 0.5168766379356384, "logps/chosen": -288.558349609375, "logps/rejected": -348.62640380859375, "loss": 0.5531, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.6757786870002747, "rewards/margins": 5.149857997894287, "rewards/rejected": -4.474079132080078, "step": 180 }, { "epoch": 0.9004739336492891, "grad_norm": 11.065263225689659, "learning_rate": 8.20521964960477e-07, "logits/chosen": 0.6599653363227844, "logits/rejected": 0.6458830237388611, "logps/chosen": -289.4867858886719, "logps/rejected": -342.56243896484375, "loss": 0.5439, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 1.274778962135315, "rewards/margins": 6.3435516357421875, "rewards/rejected": -5.068772792816162, "step": 190 }, { "epoch": 0.9478672985781991, "grad_norm": 8.426424572195439, "learning_rate": 8.0047600457707e-07, "logits/chosen": 0.6277160048484802, "logits/rejected": 0.6192003488540649, "logps/chosen": -318.033447265625, "logps/rejected": -377.3500061035156, "loss": 0.537, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 1.3354051113128662, "rewards/margins": 6.755140781402588, "rewards/rejected": -5.419735431671143, "step": 200 }, { "epoch": 0.9478672985781991, "eval_logits/chosen": 0.494819700717926, "eval_logits/rejected": 0.5648438930511475, "eval_logps/chosen": -343.7730712890625, "eval_logps/rejected": -372.1695861816406, "eval_loss": 0.6514427661895752, "eval_rewards/accuracies": 0.7278481125831604, "eval_rewards/chosen": 0.20448331534862518, "eval_rewards/margins": 4.236032485961914, "eval_rewards/rejected": -4.031548976898193, "eval_runtime": 74.0508, "eval_samples_per_second": 33.761, "eval_steps_per_second": 1.067, "step": 200 }, { "epoch": 0.995260663507109, "grad_norm": 9.878709661135902, "learning_rate": 7.796435816388898e-07, "logits/chosen": 0.6760674118995667, "logits/rejected": 0.6518660187721252, "logps/chosen": -284.24749755859375, "logps/rejected": -363.0601501464844, "loss": 0.554, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.6821473836898804, "rewards/margins": 6.51880407333374, "rewards/rejected": -5.8366570472717285, "step": 210 }, { "epoch": 1.042654028436019, "grad_norm": 10.875728154843127, "learning_rate": 7.580792226981954e-07, "logits/chosen": 0.5221652984619141, "logits/rejected": 0.44479990005493164, "logps/chosen": -281.39190673828125, "logps/rejected": -370.33941650390625, "loss": 0.4911, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.0442254543304443, "rewards/margins": 7.068573951721191, "rewards/rejected": -5.024348258972168, "step": 220 }, { "epoch": 1.0900473933649288, "grad_norm": 10.04148994728917, "learning_rate": 7.358393700684032e-07, "logits/chosen": 0.5540430545806885, "logits/rejected": 0.5128260850906372, "logps/chosen": -279.4583435058594, "logps/rejected": -350.32684326171875, "loss": 0.5022, "rewards/accuracies": 0.793749988079071, "rewards/chosen": 0.9357398152351379, "rewards/margins": 5.9159369468688965, "rewards/rejected": -4.980198383331299, "step": 230 }, { "epoch": 1.1374407582938388, "grad_norm": 11.466420945945197, "learning_rate": 7.129822340926043e-07, "logits/chosen": 0.5252267122268677, "logits/rejected": 0.6392233371734619, "logps/chosen": -300.5268859863281, "logps/rejected": -328.5356750488281, "loss": 0.4908, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": 1.1534405946731567, "rewards/margins": 6.1857991218566895, "rewards/rejected": -5.032358169555664, "step": 240 }, { "epoch": 1.1848341232227488, "grad_norm": 9.714339627017372, "learning_rate": 6.895676407844586e-07, "logits/chosen": 0.5342652797698975, "logits/rejected": 0.5475658178329468, "logps/chosen": -275.02972412109375, "logps/rejected": -325.74993896484375, "loss": 0.4508, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 2.0915255546569824, "rewards/margins": 6.8750715255737305, "rewards/rejected": -4.783546447753906, "step": 250 }, { "epoch": 1.2322274881516588, "grad_norm": 8.702659887264469, "learning_rate": 6.656568752402521e-07, "logits/chosen": 0.4584909975528717, "logits/rejected": 0.5478152632713318, "logps/chosen": -314.6927185058594, "logps/rejected": -357.88226318359375, "loss": 0.4621, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": 1.3858213424682617, "rewards/margins": 6.8659563064575195, "rewards/rejected": -5.480134963989258, "step": 260 }, { "epoch": 1.2796208530805688, "grad_norm": 10.924278197277149, "learning_rate": 6.413125212319663e-07, "logits/chosen": 0.6362992525100708, "logits/rejected": 0.6484791040420532, "logps/chosen": -285.7840270996094, "logps/rejected": -360.7676086425781, "loss": 0.4712, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": 2.0224599838256836, "rewards/margins": 7.362783908843994, "rewards/rejected": -5.3403239250183105, "step": 270 }, { "epoch": 1.3270142180094786, "grad_norm": 9.286266066829205, "learning_rate": 6.165982974012104e-07, "logits/chosen": 0.48062658309936523, "logits/rejected": 0.4873732626438141, "logps/chosen": -345.07586669921875, "logps/rejected": -393.88165283203125, "loss": 0.4628, "rewards/accuracies": 0.793749988079071, "rewards/chosen": 1.449973225593567, "rewards/margins": 7.039644718170166, "rewards/rejected": -5.589670658111572, "step": 280 }, { "epoch": 1.3744075829383886, "grad_norm": 9.83819564198541, "learning_rate": 5.915788904827553e-07, "logits/chosen": 0.43026304244995117, "logits/rejected": 0.459343820810318, "logps/chosen": -294.733154296875, "logps/rejected": -363.80340576171875, "loss": 0.4507, "rewards/accuracies": 0.831250011920929, "rewards/chosen": 1.6585981845855713, "rewards/margins": 6.437933444976807, "rewards/rejected": -4.779335021972656, "step": 290 }, { "epoch": 1.4218009478672986, "grad_norm": 8.577071743246128, "learning_rate": 5.663197859941938e-07, "logits/chosen": 0.6086027026176453, "logits/rejected": 0.6251193881034851, "logps/chosen": -262.66644287109375, "logps/rejected": -320.42974853515625, "loss": 0.4787, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 1.298060655593872, "rewards/margins": 6.941515922546387, "rewards/rejected": -5.643455505371094, "step": 300 }, { "epoch": 1.4218009478672986, "eval_logits/chosen": 0.45885032415390015, "eval_logits/rejected": 0.5325651168823242, "eval_logps/chosen": -341.7187194824219, "eval_logps/rejected": -371.7361145019531, "eval_loss": 0.6386769413948059, "eval_rewards/accuracies": 0.7215189933776855, "eval_rewards/chosen": 0.40991881489753723, "eval_rewards/margins": 4.398120880126953, "eval_rewards/rejected": -3.98820161819458, "eval_runtime": 72.3153, "eval_samples_per_second": 34.571, "eval_steps_per_second": 1.092, "step": 300 }, { "epoch": 1.4691943127962086, "grad_norm": 12.642599504555136, "learning_rate": 5.408870968348749e-07, "logits/chosen": 0.46862930059432983, "logits/rejected": 0.45317015051841736, "logps/chosen": -269.1434631347656, "logps/rejected": -348.3428955078125, "loss": 0.4684, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": 1.3798935413360596, "rewards/margins": 6.562399864196777, "rewards/rejected": -5.182506561279297, "step": 310 }, { "epoch": 1.5165876777251186, "grad_norm": 9.79584839845262, "learning_rate": 5.153473902427354e-07, "logits/chosen": 0.47858723998069763, "logits/rejected": 0.5644794702529907, "logps/chosen": -321.48345947265625, "logps/rejected": -343.6278991699219, "loss": 0.4803, "rewards/accuracies": 0.793749988079071, "rewards/chosen": 1.1607013940811157, "rewards/margins": 5.799595832824707, "rewards/rejected": -4.638894557952881, "step": 320 }, { "epoch": 1.5639810426540284, "grad_norm": 8.875212778872154, "learning_rate": 4.897675135619516e-07, "logits/chosen": 0.47927242517471313, "logits/rejected": 0.605729341506958, "logps/chosen": -296.8520812988281, "logps/rejected": -339.26220703125, "loss": 0.48, "rewards/accuracies": 0.793749988079071, "rewards/chosen": 1.206688404083252, "rewards/margins": 6.4211745262146, "rewards/rejected": -5.214486598968506, "step": 330 }, { "epoch": 1.6113744075829384, "grad_norm": 9.788751062324735, "learning_rate": 4.642144192774429e-07, "logits/chosen": 0.6517030000686646, "logits/rejected": 0.6343492269515991, "logps/chosen": -256.8311767578125, "logps/rejected": -318.10504150390625, "loss": 0.4687, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 1.4574129581451416, "rewards/margins": 7.180891513824463, "rewards/rejected": -5.723478317260742, "step": 340 }, { "epoch": 1.6587677725118484, "grad_norm": 8.123068784558978, "learning_rate": 4.387549897741825e-07, "logits/chosen": 0.43539008498191833, "logits/rejected": 0.4823547303676605, "logps/chosen": -322.7386474609375, "logps/rejected": -349.6393127441406, "loss": 0.4903, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": 1.6534090042114258, "rewards/margins": 6.494222164154053, "rewards/rejected": -4.840813159942627, "step": 350 }, { "epoch": 1.7061611374407581, "grad_norm": 10.106462346167355, "learning_rate": 4.1345586227998634e-07, "logits/chosen": 0.4860106110572815, "logits/rejected": 0.48908883333206177, "logps/chosen": -289.710693359375, "logps/rejected": -384.22686767578125, "loss": 0.446, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 1.587738275527954, "rewards/margins": 7.2089128494262695, "rewards/rejected": -5.6211748123168945, "step": 360 }, { "epoch": 1.7535545023696684, "grad_norm": 10.81635763601606, "learning_rate": 3.883832544499735e-07, "logits/chosen": 0.5913195013999939, "logits/rejected": 0.5606914758682251, "logps/chosen": -292.9503173828125, "logps/rejected": -390.93878173828125, "loss": 0.4592, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 1.614689588546753, "rewards/margins": 6.656731605529785, "rewards/rejected": -5.042041301727295, "step": 370 }, { "epoch": 1.8009478672985781, "grad_norm": 10.495084061438284, "learning_rate": 3.636027910492114e-07, "logits/chosen": 0.4658740162849426, "logits/rejected": 0.5308722257614136, "logps/chosen": -305.28753662109375, "logps/rejected": -352.7513122558594, "loss": 0.4648, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": 1.0712064504623413, "rewards/margins": 6.167966365814209, "rewards/rejected": -5.096759796142578, "step": 380 }, { "epoch": 1.8483412322274881, "grad_norm": 11.413974134819627, "learning_rate": 3.3917933218718566e-07, "logits/chosen": 0.6185089349746704, "logits/rejected": 0.6838531494140625, "logps/chosen": -284.1628112792969, "logps/rejected": -333.17657470703125, "loss": 0.4426, "rewards/accuracies": 0.8125, "rewards/chosen": 1.4776874780654907, "rewards/margins": 6.398137092590332, "rewards/rejected": -4.920449733734131, "step": 390 }, { "epoch": 1.8957345971563981, "grad_norm": 9.664147195442332, "learning_rate": 3.151768035536698e-07, "logits/chosen": 0.6407091617584229, "logits/rejected": 0.6542560458183289, "logps/chosen": -284.20037841796875, "logps/rejected": -345.27880859375, "loss": 0.4559, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 2.0247559547424316, "rewards/margins": 7.09304141998291, "rewards/rejected": -5.0682854652404785, "step": 400 }, { "epoch": 1.8957345971563981, "eval_logits/chosen": 0.41101595759391785, "eval_logits/rejected": 0.4840773642063141, "eval_logps/chosen": -338.1277160644531, "eval_logps/rejected": -368.54248046875, "eval_loss": 0.6332134008407593, "eval_rewards/accuracies": 0.7341772317886353, "eval_rewards/chosen": 0.7690173983573914, "eval_rewards/margins": 4.437857151031494, "eval_rewards/rejected": -3.668839931488037, "eval_runtime": 72.5998, "eval_samples_per_second": 34.435, "eval_steps_per_second": 1.088, "step": 400 }, { "epoch": 1.943127962085308, "grad_norm": 10.263641095491934, "learning_rate": 2.9165802910033603e-07, "logits/chosen": 0.5565508604049683, "logits/rejected": 0.5877315402030945, "logps/chosen": -328.7551574707031, "logps/rejected": -364.5121154785156, "loss": 0.4644, "rewards/accuracies": 0.78125, "rewards/chosen": 1.852020502090454, "rewards/margins": 6.0383710861206055, "rewards/rejected": -4.186350345611572, "step": 410 }, { "epoch": 1.9905213270142181, "grad_norm": 8.889403142715599, "learning_rate": 2.686845666060415e-07, "logits/chosen": 0.5102426409721375, "logits/rejected": 0.43454083800315857, "logps/chosen": -271.08160400390625, "logps/rejected": -369.26458740234375, "loss": 0.461, "rewards/accuracies": 0.793749988079071, "rewards/chosen": 1.6376615762710571, "rewards/margins": 7.588493347167969, "rewards/rejected": -5.950831413269043, "step": 420 }, { "epoch": 2.037914691943128, "grad_norm": 7.4495856256114195, "learning_rate": 2.4631654655618287e-07, "logits/chosen": 0.37354058027267456, "logits/rejected": 0.4436867833137512, "logps/chosen": -310.15802001953125, "logps/rejected": -382.03253173828125, "loss": 0.3945, "rewards/accuracies": 0.8125, "rewards/chosen": 1.8288238048553467, "rewards/margins": 7.114483833312988, "rewards/rejected": -5.2856597900390625, "step": 430 }, { "epoch": 2.085308056872038, "grad_norm": 8.829254132221473, "learning_rate": 2.2461251475783155e-07, "logits/chosen": 0.5162326693534851, "logits/rejected": 0.4021889567375183, "logps/chosen": -288.923095703125, "logps/rejected": -389.34979248046875, "loss": 0.3748, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 1.8741111755371094, "rewards/margins": 7.6665802001953125, "rewards/rejected": -5.792468547821045, "step": 440 }, { "epoch": 2.132701421800948, "grad_norm": 8.156529944948277, "learning_rate": 2.0362927910258986e-07, "logits/chosen": 0.45688456296920776, "logits/rejected": 0.4526469111442566, "logps/chosen": -253.50131225585938, "logps/rejected": -349.1957702636719, "loss": 0.4147, "rewards/accuracies": 0.875, "rewards/chosen": 2.0875327587127686, "rewards/margins": 8.09435749053955, "rewards/rejected": -6.006823539733887, "step": 450 }, { "epoch": 2.1800947867298577, "grad_norm": 7.824692642426332, "learning_rate": 1.8342176087824573e-07, "logits/chosen": 0.4325633645057678, "logits/rejected": 0.3565566837787628, "logps/chosen": -284.46624755859375, "logps/rejected": -372.12091064453125, "loss": 0.3992, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 1.8221031427383423, "rewards/margins": 7.619426727294922, "rewards/rejected": -5.797322750091553, "step": 460 }, { "epoch": 2.227488151658768, "grad_norm": 13.407256371457692, "learning_rate": 1.6404285101840565e-07, "logits/chosen": 0.3386808931827545, "logits/rejected": 0.47734910249710083, "logps/chosen": -331.7251892089844, "logps/rejected": -367.4866638183594, "loss": 0.3822, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 1.9130542278289795, "rewards/margins": 7.692631721496582, "rewards/rejected": -5.779577732086182, "step": 470 }, { "epoch": 2.2748815165876777, "grad_norm": 10.86707059625683, "learning_rate": 1.455432716663517e-07, "logits/chosen": 0.36686116456985474, "logits/rejected": 0.48829737305641174, "logps/chosen": -285.77008056640625, "logps/rejected": -328.3174743652344, "loss": 0.4089, "rewards/accuracies": 0.856249988079071, "rewards/chosen": 1.7794748544692993, "rewards/margins": 6.214818477630615, "rewards/rejected": -4.435343265533447, "step": 480 }, { "epoch": 2.322274881516588, "grad_norm": 9.830177502454013, "learning_rate": 1.2797144341546883e-07, "logits/chosen": 0.3986554741859436, "logits/rejected": 0.44396382570266724, "logps/chosen": -321.13818359375, "logps/rejected": -390.934326171875, "loss": 0.4219, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": 1.6029850244522095, "rewards/margins": 7.5643768310546875, "rewards/rejected": -5.961391448974609, "step": 490 }, { "epoch": 2.3696682464454977, "grad_norm": 9.42905977432162, "learning_rate": 1.1137335857372043e-07, "logits/chosen": 0.4437794089317322, "logits/rejected": 0.42870789766311646, "logps/chosen": -287.81451416015625, "logps/rejected": -374.01873779296875, "loss": 0.4028, "rewards/accuracies": 0.856249988079071, "rewards/chosen": 2.1330111026763916, "rewards/margins": 7.767390251159668, "rewards/rejected": -5.6343793869018555, "step": 500 }, { "epoch": 2.3696682464454977, "eval_logits/chosen": 0.3730663061141968, "eval_logits/rejected": 0.4475269019603729, "eval_logps/chosen": -338.3392028808594, "eval_logps/rejected": -370.232666015625, "eval_loss": 0.6289177536964417, "eval_rewards/accuracies": 0.7405063509941101, "eval_rewards/chosen": 0.7478683590888977, "eval_rewards/margins": 4.585729122161865, "eval_rewards/rejected": -3.8378612995147705, "eval_runtime": 73.3012, "eval_samples_per_second": 34.106, "eval_steps_per_second": 1.078, "step": 500 }, { "epoch": 2.4170616113744074, "grad_norm": 10.06462647313331, "learning_rate": 9.579246078389403e-08, "logits/chosen": 0.5295278429985046, "logits/rejected": 0.43623122572898865, "logps/chosen": -258.68963623046875, "logps/rejected": -339.7721252441406, "loss": 0.3858, "rewards/accuracies": 0.84375, "rewards/chosen": 1.592254400253296, "rewards/margins": 7.2217698097229, "rewards/rejected": -5.629514694213867, "step": 510 }, { "epoch": 2.4644549763033177, "grad_norm": 9.022052721765009, "learning_rate": 8.126953131469228e-08, "logits/chosen": 0.44106584787368774, "logits/rejected": 0.39466392993927, "logps/chosen": -303.3637390136719, "logps/rejected": -370.74114990234375, "loss": 0.4143, "rewards/accuracies": 0.875, "rewards/chosen": 1.8263496160507202, "rewards/margins": 7.823184013366699, "rewards/rejected": -5.996834754943848, "step": 520 }, { "epoch": 2.5118483412322274, "grad_norm": 8.021054640921763, "learning_rate": 6.784258232029472e-08, "logits/chosen": 0.3634105622768402, "logits/rejected": 0.3859165608882904, "logps/chosen": -307.2467041015625, "logps/rejected": -376.1995849609375, "loss": 0.3822, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": 2.497091770172119, "rewards/margins": 7.9943437576293945, "rewards/rejected": -5.497252464294434, "step": 530 }, { "epoch": 2.5592417061611377, "grad_norm": 10.013425700067337, "learning_rate": 5.554675734776665e-08, "logits/chosen": 0.5024563074111938, "logits/rejected": 0.5056658387184143, "logps/chosen": -276.1619567871094, "logps/rejected": -368.4447021484375, "loss": 0.4035, "rewards/accuracies": 0.831250011920929, "rewards/chosen": 1.820339560508728, "rewards/margins": 8.141976356506348, "rewards/rejected": -6.321636199951172, "step": 540 }, { "epoch": 2.6066350710900474, "grad_norm": 9.209955480260117, "learning_rate": 4.4414239352730867e-08, "logits/chosen": 0.42310771346092224, "logits/rejected": 0.48689502477645874, "logps/chosen": -313.3210754394531, "logps/rejected": -351.4210205078125, "loss": 0.406, "rewards/accuracies": 0.893750011920929, "rewards/chosen": 2.1306679248809814, "rewards/margins": 7.7195258140563965, "rewards/rejected": -5.588858127593994, "step": 550 }, { "epoch": 2.654028436018957, "grad_norm": 9.959818332708023, "learning_rate": 3.447416646405632e-08, "logits/chosen": 0.5685544610023499, "logits/rejected": 0.5256290435791016, "logps/chosen": -287.7798156738281, "logps/rejected": -380.33685302734375, "loss": 0.4009, "rewards/accuracies": 0.893750011920929, "rewards/chosen": 1.8459497690200806, "rewards/margins": 7.295513153076172, "rewards/rejected": -5.449563503265381, "step": 560 }, { "epoch": 2.7014218009478674, "grad_norm": 8.593809820816018, "learning_rate": 2.575255571804391e-08, "logits/chosen": 0.41258078813552856, "logits/rejected": 0.4132450222969055, "logps/chosen": -287.94476318359375, "logps/rejected": -369.03656005859375, "loss": 0.4, "rewards/accuracies": 0.8125, "rewards/chosen": 1.5231783390045166, "rewards/margins": 7.392594814300537, "rewards/rejected": -5.8694167137146, "step": 570 }, { "epoch": 2.748815165876777, "grad_norm": 9.646946039027634, "learning_rate": 1.8272234961725084e-08, "logits/chosen": 0.48128992319107056, "logits/rejected": 0.4887717366218567, "logps/chosen": -303.7729797363281, "logps/rejected": -359.5372314453125, "loss": 0.3912, "rewards/accuracies": 0.90625, "rewards/chosen": 2.173060655593872, "rewards/margins": 8.012847900390625, "rewards/rejected": -5.839787006378174, "step": 580 }, { "epoch": 2.7962085308056874, "grad_norm": 11.09612482230785, "learning_rate": 1.2052783103508102e-08, "logits/chosen": 0.5081132650375366, "logits/rejected": 0.5602059364318848, "logps/chosen": -270.61737060546875, "logps/rejected": -335.85577392578125, "loss": 0.3991, "rewards/accuracies": 0.8125, "rewards/chosen": 1.619431495666504, "rewards/margins": 6.8268561363220215, "rewards/rejected": -5.207424163818359, "step": 590 }, { "epoch": 2.843601895734597, "grad_norm": 8.273064520857158, "learning_rate": 7.1104788675613315e-09, "logits/chosen": 0.32943224906921387, "logits/rejected": 0.4085375666618347, "logps/chosen": -288.88995361328125, "logps/rejected": -364.12860107421875, "loss": 0.4029, "rewards/accuracies": 0.893750011920929, "rewards/chosen": 2.0637223720550537, "rewards/margins": 7.937726020812988, "rewards/rejected": -5.874002933502197, "step": 600 }, { "epoch": 2.843601895734597, "eval_logits/chosen": 0.38198891282081604, "eval_logits/rejected": 0.45711585879325867, "eval_logps/chosen": -337.3143310546875, "eval_logps/rejected": -368.9125061035156, "eval_loss": 0.6283919215202332, "eval_rewards/accuracies": 0.7436708807945251, "eval_rewards/chosen": 0.8503568768501282, "eval_rewards/margins": 4.556199073791504, "eval_rewards/rejected": -3.7058422565460205, "eval_runtime": 73.7958, "eval_samples_per_second": 33.877, "eval_steps_per_second": 1.071, "step": 600 }, { "epoch": 2.890995260663507, "grad_norm": 9.238913123295514, "learning_rate": 3.4582581860612137e-09, "logits/chosen": 0.43385523557662964, "logits/rejected": 0.43230634927749634, "logps/chosen": -292.0911865234375, "logps/rejected": -353.61590576171875, "loss": 0.3884, "rewards/accuracies": 0.831250011920929, "rewards/chosen": 1.989989995956421, "rewards/margins": 6.724064826965332, "rewards/rejected": -4.734074115753174, "step": 610 }, { "epoch": 2.938388625592417, "grad_norm": 9.407237089972764, "learning_rate": 1.1056803408273085e-09, "logits/chosen": 0.48387131094932556, "logits/rejected": 0.4587581753730774, "logps/chosen": -282.6869201660156, "logps/rejected": -344.5205078125, "loss": 0.4089, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 1.806133508682251, "rewards/margins": 7.467283725738525, "rewards/rejected": -5.661149978637695, "step": 620 }, { "epoch": 2.985781990521327, "grad_norm": 8.481488205996529, "learning_rate": 5.890294296428955e-11, "logits/chosen": 0.44664233922958374, "logits/rejected": 0.5504810810089111, "logps/chosen": -319.47119140625, "logps/rejected": -348.36090087890625, "loss": 0.3848, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 2.1828243732452393, "rewards/margins": 6.884246826171875, "rewards/rejected": -4.701422214508057, "step": 630 }, { "epoch": 3.0, "step": 633, "total_flos": 0.0, "train_loss": 0.5009220597491634, "train_runtime": 6227.6413, "train_samples_per_second": 13.002, "train_steps_per_second": 0.102 } ], "logging_steps": 10, "max_steps": 633, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }