{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 50, "global_step": 436, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.022935779816513763, "grad_norm": 5.597671583305064, "learning_rate": 1.1363636363636363e-07, "logits/chosen": -2.6193342208862305, "logits/rejected": -2.5525386333465576, "logps/chosen": -265.3854064941406, "logps/rejected": -236.1589813232422, "loss": 0.1941, "rewards/accuracies": 0.41874998807907104, "rewards/chosen": 0.0006044252077117562, "rewards/margins": 0.0008946189773268998, "rewards/rejected": -0.000290193798718974, "step": 10 }, { "epoch": 0.045871559633027525, "grad_norm": 5.252146327563864, "learning_rate": 2.2727272727272726e-07, "logits/chosen": -2.657933473587036, "logits/rejected": -2.576028347015381, "logps/chosen": -298.8043518066406, "logps/rejected": -274.30718994140625, "loss": 0.1958, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.00019080772472079843, "rewards/margins": 0.0017816700274124742, "rewards/rejected": -0.001972477650269866, "step": 20 }, { "epoch": 0.06880733944954129, "grad_norm": 4.974318928321663, "learning_rate": 3.4090909090909085e-07, "logits/chosen": -2.6759729385375977, "logits/rejected": -2.6023097038269043, "logps/chosen": -290.3868713378906, "logps/rejected": -234.3571319580078, "loss": 0.1923, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.009458948858082294, "rewards/margins": 0.013211230747401714, "rewards/rejected": -0.0037522814236581326, "step": 30 }, { "epoch": 0.09174311926605505, "grad_norm": 5.209557025843711, "learning_rate": 4.545454545454545e-07, "logits/chosen": -2.659982204437256, "logits/rejected": -2.6102194786071777, "logps/chosen": -280.9975891113281, "logps/rejected": -267.7278747558594, "loss": 0.1836, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.04171619936823845, "rewards/margins": 0.041064582765102386, "rewards/rejected": 0.0006516153225675225, "step": 40 }, { "epoch": 0.11467889908256881, "grad_norm": 5.582751836908906, "learning_rate": 4.997110275491701e-07, "logits/chosen": -2.6198437213897705, "logits/rejected": -2.6123862266540527, "logps/chosen": -293.4254455566406, "logps/rejected": -303.90386962890625, "loss": 0.1732, "rewards/accuracies": 0.6875, "rewards/chosen": 0.028257910162210464, "rewards/margins": 0.0754549652338028, "rewards/rejected": -0.047197047621011734, "step": 50 }, { "epoch": 0.11467889908256881, "eval_logits/chosen": -2.5737149715423584, "eval_logits/rejected": -2.494300365447998, "eval_logps/chosen": -283.4266662597656, "eval_logps/rejected": -257.4446105957031, "eval_loss": 0.16405636072158813, "eval_rewards/accuracies": 0.7068965435028076, "eval_rewards/chosen": 0.016634328290820122, "eval_rewards/margins": 0.12585583329200745, "eval_rewards/rejected": -0.10922150313854218, "eval_runtime": 93.7026, "eval_samples_per_second": 19.402, "eval_steps_per_second": 0.309, "step": 50 }, { "epoch": 0.13761467889908258, "grad_norm": 7.241200335566244, "learning_rate": 4.979475034558115e-07, "logits/chosen": -2.562969207763672, "logits/rejected": -2.5053610801696777, "logps/chosen": -293.1821594238281, "logps/rejected": -272.9414367675781, "loss": 0.1593, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.03697499260306358, "rewards/margins": 0.11687393486499786, "rewards/rejected": -0.15384894609451294, "step": 60 }, { "epoch": 0.16055045871559634, "grad_norm": 7.058665086800562, "learning_rate": 4.945923025551788e-07, "logits/chosen": -2.4592204093933105, "logits/rejected": -2.4099252223968506, "logps/chosen": -327.544189453125, "logps/rejected": -284.48760986328125, "loss": 0.1564, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.00026793667348101735, "rewards/margins": 0.23106786608695984, "rewards/rejected": -0.23079994320869446, "step": 70 }, { "epoch": 0.1834862385321101, "grad_norm": 8.05919017424508, "learning_rate": 4.896669632591651e-07, "logits/chosen": -2.3360755443573, "logits/rejected": -2.2115492820739746, "logps/chosen": -298.69482421875, "logps/rejected": -292.111328125, "loss": 0.1478, "rewards/accuracies": 0.71875, "rewards/chosen": -0.128449946641922, "rewards/margins": 0.26610296964645386, "rewards/rejected": -0.39455294609069824, "step": 80 }, { "epoch": 0.20642201834862386, "grad_norm": 8.402102216426163, "learning_rate": 4.832031033425662e-07, "logits/chosen": -1.9170547723770142, "logits/rejected": -1.8550583124160767, "logps/chosen": -299.20281982421875, "logps/rejected": -289.49163818359375, "loss": 0.1441, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.1160941943526268, "rewards/margins": 0.3108092248439789, "rewards/rejected": -0.4269034266471863, "step": 90 }, { "epoch": 0.22935779816513763, "grad_norm": 8.625504630985954, "learning_rate": 4.752422169756047e-07, "logits/chosen": -0.48900121450424194, "logits/rejected": -0.44961875677108765, "logps/chosen": -311.1794738769531, "logps/rejected": -339.92254638671875, "loss": 0.1412, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.3875480592250824, "rewards/margins": 0.2923927903175354, "rewards/rejected": -0.6799408197402954, "step": 100 }, { "epoch": 0.22935779816513763, "eval_logits/chosen": -0.4184306561946869, "eval_logits/rejected": -0.1537734419107437, "eval_logps/chosen": -328.5106201171875, "eval_logps/rejected": -326.1405944824219, "eval_loss": 0.13468901813030243, "eval_rewards/accuracies": 0.7284482717514038, "eval_rewards/chosen": -0.4342052936553955, "eval_rewards/margins": 0.3619759678840637, "eval_rewards/rejected": -0.796181321144104, "eval_runtime": 92.0893, "eval_samples_per_second": 19.742, "eval_steps_per_second": 0.315, "step": 100 }, { "epoch": 0.25229357798165136, "grad_norm": 10.103043851045701, "learning_rate": 4.658354083558188e-07, "logits/chosen": -0.40221118927001953, "logits/rejected": -0.13967064023017883, "logps/chosen": -312.624267578125, "logps/rejected": -311.3497009277344, "loss": 0.1354, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.4743613302707672, "rewards/margins": 0.3018040060997009, "rewards/rejected": -0.776165246963501, "step": 110 }, { "epoch": 0.27522935779816515, "grad_norm": 10.671011000324334, "learning_rate": 4.550430636492389e-07, "logits/chosen": -0.04719700291752815, "logits/rejected": 0.09439365565776825, "logps/chosen": -324.4268798828125, "logps/rejected": -335.16424560546875, "loss": 0.1353, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.4860754609107971, "rewards/margins": 0.3509276807308197, "rewards/rejected": -0.8370031118392944, "step": 120 }, { "epoch": 0.2981651376146789, "grad_norm": 11.348522180816168, "learning_rate": 4.429344633468004e-07, "logits/chosen": -0.058751799166202545, "logits/rejected": 0.29970529675483704, "logps/chosen": -296.40399169921875, "logps/rejected": -316.8427429199219, "loss": 0.1373, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4022388458251953, "rewards/margins": 0.3400627374649048, "rewards/rejected": -0.7423015832901001, "step": 130 }, { "epoch": 0.3211009174311927, "grad_norm": 25.0836527688945, "learning_rate": 4.2958733752443187e-07, "logits/chosen": -0.29172104597091675, "logits/rejected": 0.11182677745819092, "logps/chosen": -303.4015197753906, "logps/rejected": -286.45526123046875, "loss": 0.1334, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.2721634805202484, "rewards/margins": 0.34942418336868286, "rewards/rejected": -0.6215876340866089, "step": 140 }, { "epoch": 0.3440366972477064, "grad_norm": 11.640337926409853, "learning_rate": 4.150873668617898e-07, "logits/chosen": 0.08257939666509628, "logits/rejected": 0.5254168510437012, "logps/chosen": -308.8121337890625, "logps/rejected": -314.32220458984375, "loss": 0.1307, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.42268529534339905, "rewards/margins": 0.3886614143848419, "rewards/rejected": -0.8113466501235962, "step": 150 }, { "epoch": 0.3440366972477064, "eval_logits/chosen": 0.018126631155610085, "eval_logits/rejected": 0.7143937349319458, "eval_logps/chosen": -320.6210021972656, "eval_logps/rejected": -332.353271484375, "eval_loss": 0.12613436579704285, "eval_rewards/accuracies": 0.7284482717514038, "eval_rewards/chosen": -0.3553086221218109, "eval_rewards/margins": 0.5029994249343872, "eval_rewards/rejected": -0.8583080768585205, "eval_runtime": 91.5757, "eval_samples_per_second": 19.852, "eval_steps_per_second": 0.317, "step": 150 }, { "epoch": 0.3669724770642202, "grad_norm": 11.56271576551216, "learning_rate": 3.9952763262280397e-07, "logits/chosen": 0.02932182513177395, "logits/rejected": 0.4751170575618744, "logps/chosen": -337.37237548828125, "logps/rejected": -373.10382080078125, "loss": 0.1247, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.40821242332458496, "rewards/margins": 0.46313947439193726, "rewards/rejected": -0.8713518381118774, "step": 160 }, { "epoch": 0.38990825688073394, "grad_norm": 7.985791463305948, "learning_rate": 3.8300801912883414e-07, "logits/chosen": 0.06568197906017303, "logits/rejected": 0.3852883279323578, "logps/chosen": -311.62481689453125, "logps/rejected": -353.85443115234375, "loss": 0.1191, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.40023931860923767, "rewards/margins": 0.4376160204410553, "rewards/rejected": -0.837855339050293, "step": 170 }, { "epoch": 0.41284403669724773, "grad_norm": 9.33389793686436, "learning_rate": 3.6563457256020884e-07, "logits/chosen": 0.15779247879981995, "logits/rejected": 0.6506751775741577, "logps/chosen": -348.14398193359375, "logps/rejected": -325.84637451171875, "loss": 0.1278, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.44786080718040466, "rewards/margins": 0.4256533682346344, "rewards/rejected": -0.8735141754150391, "step": 180 }, { "epoch": 0.43577981651376146, "grad_norm": 10.313716932222244, "learning_rate": 3.475188202022617e-07, "logits/chosen": -0.43816322088241577, "logits/rejected": 0.023333895951509476, "logps/chosen": -283.8222351074219, "logps/rejected": -337.9872741699219, "loss": 0.1236, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.2570773959159851, "rewards/margins": 0.47935089468955994, "rewards/rejected": -0.7364283800125122, "step": 190 }, { "epoch": 0.45871559633027525, "grad_norm": 9.248153089741166, "learning_rate": 3.287770545059052e-07, "logits/chosen": -0.08292710781097412, "logits/rejected": 0.5214661359786987, "logps/chosen": -312.56500244140625, "logps/rejected": -325.7038269042969, "loss": 0.1238, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.34283357858657837, "rewards/margins": 0.4756447672843933, "rewards/rejected": -0.8184784054756165, "step": 200 }, { "epoch": 0.45871559633027525, "eval_logits/chosen": 0.2969372272491455, "eval_logits/rejected": 1.2989132404327393, "eval_logps/chosen": -326.1716613769531, "eval_logps/rejected": -341.28619384765625, "eval_loss": 0.11991516500711441, "eval_rewards/accuracies": 0.732758641242981, "eval_rewards/chosen": -0.41081586480140686, "eval_rewards/margins": 0.5368219614028931, "eval_rewards/rejected": -0.9476377367973328, "eval_runtime": 92.1824, "eval_samples_per_second": 19.722, "eval_steps_per_second": 0.315, "step": 200 }, { "epoch": 0.481651376146789, "grad_norm": 8.486691996298621, "learning_rate": 3.0952958655864954e-07, "logits/chosen": 0.504795253276825, "logits/rejected": 1.0519931316375732, "logps/chosen": -317.31756591796875, "logps/rejected": -346.00689697265625, "loss": 0.1175, "rewards/accuracies": 0.75, "rewards/chosen": -0.44013166427612305, "rewards/margins": 0.47651535272598267, "rewards/rejected": -0.9166469573974609, "step": 210 }, { "epoch": 0.5045871559633027, "grad_norm": 9.587760322945808, "learning_rate": 2.898999737583448e-07, "logits/chosen": 0.056784339249134064, "logits/rejected": 1.057108998298645, "logps/chosen": -358.15313720703125, "logps/rejected": -375.32452392578125, "loss": 0.1134, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.3547281324863434, "rewards/margins": 0.548815131187439, "rewards/rejected": -0.9035432934761047, "step": 220 }, { "epoch": 0.5275229357798165, "grad_norm": 9.093354835969592, "learning_rate": 2.7001422664752333e-07, "logits/chosen": -0.12167753279209137, "logits/rejected": 0.5305444002151489, "logps/chosen": -298.165283203125, "logps/rejected": -341.3959045410156, "loss": 0.1206, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.3022175431251526, "rewards/margins": 0.4555422365665436, "rewards/rejected": -0.7577598094940186, "step": 230 }, { "epoch": 0.5504587155963303, "grad_norm": 9.439667913770359, "learning_rate": 2.5e-07, "logits/chosen": -0.21128520369529724, "logits/rejected": 0.5969688296318054, "logps/chosen": -321.5914306640625, "logps/rejected": -339.0221862792969, "loss": 0.1218, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.34059566259384155, "rewards/margins": 0.43612655997276306, "rewards/rejected": -0.7767221927642822, "step": 240 }, { "epoch": 0.573394495412844, "grad_norm": 9.292342727311365, "learning_rate": 2.2998577335247667e-07, "logits/chosen": -0.020272482186555862, "logits/rejected": 0.808585524559021, "logps/chosen": -337.55767822265625, "logps/rejected": -346.179443359375, "loss": 0.1185, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.337638795375824, "rewards/margins": 0.5197167992591858, "rewards/rejected": -0.8573554754257202, "step": 250 }, { "epoch": 0.573394495412844, "eval_logits/chosen": -0.1745007336139679, "eval_logits/rejected": 0.8515735268592834, "eval_logps/chosen": -315.9549560546875, "eval_logps/rejected": -335.7632751464844, "eval_loss": 0.11658623069524765, "eval_rewards/accuracies": 0.7543103694915771, "eval_rewards/chosen": -0.3086487948894501, "eval_rewards/margins": 0.5837592482566833, "eval_rewards/rejected": -0.8924079537391663, "eval_runtime": 92.3611, "eval_samples_per_second": 19.684, "eval_steps_per_second": 0.314, "step": 250 }, { "epoch": 0.5963302752293578, "grad_norm": 8.186794570934321, "learning_rate": 2.1010002624165524e-07, "logits/chosen": -0.006458556745201349, "logits/rejected": 0.6227847933769226, "logps/chosen": -308.7506408691406, "logps/rejected": -372.9380798339844, "loss": 0.1168, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.3803611397743225, "rewards/margins": 0.6050828695297241, "rewards/rejected": -0.9854438900947571, "step": 260 }, { "epoch": 0.6192660550458715, "grad_norm": 10.337683797583873, "learning_rate": 1.9047041344135043e-07, "logits/chosen": 0.44111576676368713, "logits/rejected": 0.8292443156242371, "logps/chosen": -313.4148254394531, "logps/rejected": -354.33258056640625, "loss": 0.1168, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.47352510690689087, "rewards/margins": 0.5513299703598022, "rewards/rejected": -1.0248548984527588, "step": 270 }, { "epoch": 0.6422018348623854, "grad_norm": 10.116957119119249, "learning_rate": 1.7122294549409482e-07, "logits/chosen": 0.1775570809841156, "logits/rejected": 1.0465366840362549, "logps/chosen": -317.04180908203125, "logps/rejected": -370.72479248046875, "loss": 0.1221, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.4404297471046448, "rewards/margins": 0.6288600564002991, "rewards/rejected": -1.0692898035049438, "step": 280 }, { "epoch": 0.6651376146788991, "grad_norm": 9.609746165977107, "learning_rate": 1.524811797977383e-07, "logits/chosen": 0.02145857736468315, "logits/rejected": 0.823032021522522, "logps/chosen": -322.24542236328125, "logps/rejected": -342.22503662109375, "loss": 0.1156, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.3451458811759949, "rewards/margins": 0.5379365086555481, "rewards/rejected": -0.883082389831543, "step": 290 }, { "epoch": 0.6880733944954128, "grad_norm": 9.08082569283608, "learning_rate": 1.3436542743979125e-07, "logits/chosen": -0.18167677521705627, "logits/rejected": 0.25138911604881287, "logps/chosen": -341.6298828125, "logps/rejected": -338.09869384765625, "loss": 0.1228, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.3568056523799896, "rewards/margins": 0.43558257818222046, "rewards/rejected": -0.7923881411552429, "step": 300 }, { "epoch": 0.6880733944954128, "eval_logits/chosen": -0.13157324492931366, "eval_logits/rejected": 0.8573818206787109, "eval_logps/chosen": -322.0434265136719, "eval_logps/rejected": -339.1875305175781, "eval_loss": 0.11548212170600891, "eval_rewards/accuracies": 0.7456896305084229, "eval_rewards/chosen": -0.3695334494113922, "eval_rewards/margins": 0.5571174621582031, "eval_rewards/rejected": -0.9266510009765625, "eval_runtime": 91.8257, "eval_samples_per_second": 19.798, "eval_steps_per_second": 0.316, "step": 300 }, { "epoch": 0.7110091743119266, "grad_norm": 9.32642322141669, "learning_rate": 1.1699198087116588e-07, "logits/chosen": -0.07652176916599274, "logits/rejected": 0.6259859800338745, "logps/chosen": -321.20538330078125, "logps/rejected": -355.15618896484375, "loss": 0.1239, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.43240228295326233, "rewards/margins": 0.4546627104282379, "rewards/rejected": -0.887065052986145, "step": 310 }, { "epoch": 0.7339449541284404, "grad_norm": 9.870138396091912, "learning_rate": 1.00472367377196e-07, "logits/chosen": 0.15437743067741394, "logits/rejected": 0.9939621686935425, "logps/chosen": -314.3326416015625, "logps/rejected": -335.50787353515625, "loss": 0.1193, "rewards/accuracies": 0.78125, "rewards/chosen": -0.40100646018981934, "rewards/margins": 0.6091704964637756, "rewards/rejected": -1.0101768970489502, "step": 320 }, { "epoch": 0.7568807339449541, "grad_norm": 10.470680354911353, "learning_rate": 8.49126331382102e-08, "logits/chosen": 0.41432857513427734, "logits/rejected": 0.9864773750305176, "logps/chosen": -319.98114013671875, "logps/rejected": -336.5364074707031, "loss": 0.1183, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.45458975434303284, "rewards/margins": 0.43977364897727966, "rewards/rejected": -0.8943634033203125, "step": 330 }, { "epoch": 0.7798165137614679, "grad_norm": 10.41854471188631, "learning_rate": 7.041266247556812e-08, "logits/chosen": 0.3841426968574524, "logits/rejected": 0.9351291656494141, "logps/chosen": -334.6560974121094, "logps/rejected": -349.6623229980469, "loss": 0.1152, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.4666348397731781, "rewards/margins": 0.4301484227180481, "rewards/rejected": -0.8967832326889038, "step": 340 }, { "epoch": 0.8027522935779816, "grad_norm": 9.996573814918259, "learning_rate": 5.706553665319955e-08, "logits/chosen": 0.6401292085647583, "logits/rejected": 1.3996922969818115, "logps/chosen": -324.6096496582031, "logps/rejected": -337.38555908203125, "loss": 0.1213, "rewards/accuracies": 0.78125, "rewards/chosen": -0.5011857151985168, "rewards/margins": 0.4960555136203766, "rewards/rejected": -0.997241199016571, "step": 350 }, { "epoch": 0.8027522935779816, "eval_logits/chosen": 0.5151604413986206, "eval_logits/rejected": 1.5740225315093994, "eval_logps/chosen": -329.048583984375, "eval_logps/rejected": -348.0972900390625, "eval_loss": 0.113578200340271, "eval_rewards/accuracies": 0.7629310488700867, "eval_rewards/chosen": -0.4395850598812103, "eval_rewards/margins": 0.5761635303497314, "eval_rewards/rejected": -1.0157485008239746, "eval_runtime": 92.2267, "eval_samples_per_second": 19.712, "eval_steps_per_second": 0.314, "step": 350 }, { "epoch": 0.8256880733944955, "grad_norm": 8.23863269387286, "learning_rate": 4.4956936350761005e-08, "logits/chosen": 0.49835261702537537, "logits/rejected": 1.0901247262954712, "logps/chosen": -292.41424560546875, "logps/rejected": -345.85540771484375, "loss": 0.1131, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.40094178915023804, "rewards/margins": 0.4722411632537842, "rewards/rejected": -0.8731829524040222, "step": 360 }, { "epoch": 0.8486238532110092, "grad_norm": 10.10305933177313, "learning_rate": 3.416459164418123e-08, "logits/chosen": 0.16921785473823547, "logits/rejected": 0.9673768877983093, "logps/chosen": -340.80059814453125, "logps/rejected": -353.12274169921875, "loss": 0.1129, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.41216129064559937, "rewards/margins": 0.4933200776576996, "rewards/rejected": -0.9054813385009766, "step": 370 }, { "epoch": 0.8715596330275229, "grad_norm": 9.150940276647933, "learning_rate": 2.475778302439524e-08, "logits/chosen": 0.3220874071121216, "logits/rejected": 1.2661678791046143, "logps/chosen": -332.2058410644531, "logps/rejected": -351.96856689453125, "loss": 0.1174, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.35827815532684326, "rewards/margins": 0.5848454236984253, "rewards/rejected": -0.9431236386299133, "step": 380 }, { "epoch": 0.8944954128440367, "grad_norm": 9.488885903438641, "learning_rate": 1.6796896657433805e-08, "logits/chosen": 0.6681944727897644, "logits/rejected": 1.2935806512832642, "logps/chosen": -292.4158630371094, "logps/rejected": -315.7539978027344, "loss": 0.123, "rewards/accuracies": 0.75, "rewards/chosen": -0.40458402037620544, "rewards/margins": 0.4598938524723053, "rewards/rejected": -0.8644779324531555, "step": 390 }, { "epoch": 0.9174311926605505, "grad_norm": 9.000204295912624, "learning_rate": 1.0333036740834855e-08, "logits/chosen": 0.7832537889480591, "logits/rejected": 1.2279746532440186, "logps/chosen": -260.876708984375, "logps/rejected": -314.25390625, "loss": 0.12, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.3631708323955536, "rewards/margins": 0.4872121214866638, "rewards/rejected": -0.8503829836845398, "step": 400 }, { "epoch": 0.9174311926605505, "eval_logits/chosen": 0.40672120451927185, "eval_logits/rejected": 1.4352502822875977, "eval_logps/chosen": -324.5692443847656, "eval_logps/rejected": -342.49981689453125, "eval_loss": 0.11321888864040375, "eval_rewards/accuracies": 0.7543103694915771, "eval_rewards/chosen": -0.39479103684425354, "eval_rewards/margins": 0.5649827122688293, "eval_rewards/rejected": -0.9597737789154053, "eval_runtime": 92.1464, "eval_samples_per_second": 19.729, "eval_steps_per_second": 0.315, "step": 400 }, { "epoch": 0.9403669724770642, "grad_norm": 12.797858678996972, "learning_rate": 5.4076974448211685e-09, "logits/chosen": 0.733371913433075, "logits/rejected": 1.6556295156478882, "logps/chosen": -313.32049560546875, "logps/rejected": -336.0906677246094, "loss": 0.1171, "rewards/accuracies": 0.75, "rewards/chosen": -0.42368635535240173, "rewards/margins": 0.5567531585693359, "rewards/rejected": -0.9804395437240601, "step": 410 }, { "epoch": 0.963302752293578, "grad_norm": 11.330514295442143, "learning_rate": 2.052496544188487e-09, "logits/chosen": 0.8590083122253418, "logits/rejected": 1.5339264869689941, "logps/chosen": -304.95831298828125, "logps/rejected": -347.80169677734375, "loss": 0.1153, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.4551934599876404, "rewards/margins": 0.5640500783920288, "rewards/rejected": -1.019243597984314, "step": 420 }, { "epoch": 0.9862385321100917, "grad_norm": 9.086504274831077, "learning_rate": 2.889724508297886e-10, "logits/chosen": 0.5266289710998535, "logits/rejected": 1.5435163974761963, "logps/chosen": -344.6377868652344, "logps/rejected": -336.5421447753906, "loss": 0.1158, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.41462403535842896, "rewards/margins": 0.5104321837425232, "rewards/rejected": -0.9250561594963074, "step": 430 }, { "epoch": 1.0, "step": 436, "total_flos": 0.0, "train_loss": 0.13218348616853767, "train_runtime": 11343.5766, "train_samples_per_second": 4.915, "train_steps_per_second": 0.038 } ], "logging_steps": 10, "max_steps": 436, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }