{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 50, "global_step": 436, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.022935779816513763, "grad_norm": 1.6432102724209243, "learning_rate": 1.1363636363636363e-07, "logits/chosen": -2.619717597961426, "logits/rejected": -2.552694320678711, "logps/chosen": -265.4180908203125, "logps/rejected": -236.1053924560547, "loss": 0.0154, "rewards/accuracies": 0.3812499940395355, "rewards/chosen": 0.00027717588818632066, "rewards/margins": 3.1426665373146534e-05, "rewards/rejected": 0.0002457492519170046, "step": 10 }, { "epoch": 0.045871559633027525, "grad_norm": 1.5125635433247804, "learning_rate": 2.2727272727272726e-07, "logits/chosen": -2.6578421592712402, "logits/rejected": -2.5760762691497803, "logps/chosen": -298.83837890625, "logps/rejected": -274.30596923828125, "loss": 0.0155, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.0005307936808094382, "rewards/margins": 0.0014294281136244535, "rewards/rejected": -0.0019602221436798573, "step": 20 }, { "epoch": 0.06880733944954129, "grad_norm": 1.3722478894206875, "learning_rate": 3.4090909090909085e-07, "logits/chosen": -2.6761069297790527, "logits/rejected": -2.602224826812744, "logps/chosen": -290.43157958984375, "logps/rejected": -234.3854522705078, "loss": 0.0146, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.00901214499026537, "rewards/margins": 0.013047738000750542, "rewards/rejected": -0.004035593010485172, "step": 30 }, { "epoch": 0.09174311926605505, "grad_norm": 1.2692399849923888, "learning_rate": 4.545454545454545e-07, "logits/chosen": -2.6601767539978027, "logits/rejected": -2.610305070877075, "logps/chosen": -280.9813537597656, "logps/rejected": -267.60333251953125, "loss": 0.013, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.04187846928834915, "rewards/margins": 0.0399814248085022, "rewards/rejected": 0.0018970475066453218, "step": 40 }, { "epoch": 0.11467889908256881, "grad_norm": 1.4063277329047579, "learning_rate": 4.997110275491701e-07, "logits/chosen": -2.6256864070892334, "logits/rejected": -2.6204254627227783, "logps/chosen": -289.7392578125, "logps/rejected": -299.10064697265625, "loss": 0.012, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.06511950492858887, "rewards/margins": 0.06428412348031998, "rewards/rejected": 0.0008353829616680741, "step": 50 }, { "epoch": 0.11467889908256881, "eval_logits/chosen": -2.578613519668579, "eval_logits/rejected": -2.502546787261963, "eval_logps/chosen": -277.33428955078125, "eval_logps/rejected": -247.25254821777344, "eval_loss": 0.01130068488419056, "eval_rewards/accuracies": 0.6982758641242981, "eval_rewards/chosen": 0.0775580108165741, "eval_rewards/margins": 0.0848592072725296, "eval_rewards/rejected": -0.007301207631826401, "eval_runtime": 93.3854, "eval_samples_per_second": 19.468, "eval_steps_per_second": 0.311, "step": 50 }, { "epoch": 0.13761467889908258, "grad_norm": 1.422544468484653, "learning_rate": 4.979475034558115e-07, "logits/chosen": -2.580706834793091, "logits/rejected": -2.525763750076294, "logps/chosen": -284.323486328125, "logps/rejected": -259.6044006347656, "loss": 0.0112, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.05161212012171745, "rewards/margins": 0.07209066301584244, "rewards/rejected": -0.020478537306189537, "step": 60 }, { "epoch": 0.16055045871559634, "grad_norm": 1.4554279086187263, "learning_rate": 4.945923025551788e-07, "logits/chosen": -2.5086066722869873, "logits/rejected": -2.471884250640869, "logps/chosen": -319.5674743652344, "logps/rejected": -265.2959899902344, "loss": 0.0115, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.08003517985343933, "rewards/margins": 0.11891887336969376, "rewards/rejected": -0.038883693516254425, "step": 70 }, { "epoch": 0.1834862385321101, "grad_norm": 1.3828352586845378, "learning_rate": 4.896669632591651e-07, "logits/chosen": -2.480229616165161, "logits/rejected": -2.3888580799102783, "logps/chosen": -279.42529296875, "logps/rejected": -256.35528564453125, "loss": 0.0109, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.06424586474895477, "rewards/margins": 0.10123773664236069, "rewards/rejected": -0.036991871893405914, "step": 80 }, { "epoch": 0.20642201834862386, "grad_norm": 1.341734639019618, "learning_rate": 4.832031033425662e-07, "logits/chosen": -2.430302381515503, "logits/rejected": -2.3838858604431152, "logps/chosen": -280.6355895996094, "logps/rejected": -252.35107421875, "loss": 0.0103, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.06957819312810898, "rewards/margins": 0.12507636845111847, "rewards/rejected": -0.0554981604218483, "step": 90 }, { "epoch": 0.22935779816513763, "grad_norm": 1.503281183515388, "learning_rate": 4.752422169756047e-07, "logits/chosen": -2.392822265625, "logits/rejected": -2.337982654571533, "logps/chosen": -269.06829833984375, "logps/rejected": -280.2345886230469, "loss": 0.0111, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.033564209938049316, "rewards/margins": 0.11662591993808746, "rewards/rejected": -0.08306171000003815, "step": 100 }, { "epoch": 0.22935779816513763, "eval_logits/chosen": -2.445643424987793, "eval_logits/rejected": -2.345205545425415, "eval_logps/chosen": -281.08892822265625, "eval_logps/rejected": -254.68817138671875, "eval_loss": 0.01000975538045168, "eval_rewards/accuracies": 0.7112069129943848, "eval_rewards/chosen": 0.04001150280237198, "eval_rewards/margins": 0.1216687560081482, "eval_rewards/rejected": -0.08165725320577621, "eval_runtime": 92.3022, "eval_samples_per_second": 19.696, "eval_steps_per_second": 0.314, "step": 100 }, { "epoch": 0.25229357798165136, "grad_norm": 2.297190949384611, "learning_rate": 4.658354083558188e-07, "logits/chosen": -2.466749668121338, "logits/rejected": -2.361680030822754, "logps/chosen": -263.682373046875, "logps/rejected": -241.5611572265625, "loss": 0.0113, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.015057370066642761, "rewards/margins": 0.09333699941635132, "rewards/rejected": -0.07827961444854736, "step": 110 }, { "epoch": 0.27522935779816515, "grad_norm": 1.422856756971901, "learning_rate": 4.550430636492389e-07, "logits/chosen": -2.3867597579956055, "logits/rejected": -2.3550314903259277, "logps/chosen": -275.4966125488281, "logps/rejected": -260.20843505859375, "loss": 0.0103, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.0032271749805659056, "rewards/margins": 0.09067237377166748, "rewards/rejected": -0.08744519203901291, "step": 120 }, { "epoch": 0.2981651376146789, "grad_norm": 2.508161764096464, "learning_rate": 4.429344633468004e-07, "logits/chosen": -2.4056849479675293, "logits/rejected": -2.361074686050415, "logps/chosen": -254.7312469482422, "logps/rejected": -252.63455200195312, "loss": 0.0105, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.014488674700260162, "rewards/margins": 0.11470804363489151, "rewards/rejected": -0.10021935403347015, "step": 130 }, { "epoch": 0.3211009174311927, "grad_norm": 3.762306745768528, "learning_rate": 4.2958733752443187e-07, "logits/chosen": -2.412415027618408, "logits/rejected": -2.3416905403137207, "logps/chosen": -273.1282043457031, "logps/rejected": -231.5812225341797, "loss": 0.0122, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.03056950867176056, "rewards/margins": 0.10341660678386688, "rewards/rejected": -0.07284711301326752, "step": 140 }, { "epoch": 0.3440366972477064, "grad_norm": 1.323990928834917, "learning_rate": 4.150873668617898e-07, "logits/chosen": -2.454942226409912, "logits/rejected": -2.3723654747009277, "logps/chosen": -268.99053955078125, "logps/rejected": -247.30032348632812, "loss": 0.0104, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.02446950413286686, "rewards/margins": 0.11665806919336319, "rewards/rejected": -0.1411275714635849, "step": 150 }, { "epoch": 0.3440366972477064, "eval_logits/chosen": -2.4970576763153076, "eval_logits/rejected": -2.400646924972534, "eval_logps/chosen": -286.011474609375, "eval_logps/rejected": -260.7337646484375, "eval_loss": 0.009798681363463402, "eval_rewards/accuracies": 0.7284482717514038, "eval_rewards/chosen": -0.009214152581989765, "eval_rewards/margins": 0.13289883732795715, "eval_rewards/rejected": -0.14211300015449524, "eval_runtime": 93.2661, "eval_samples_per_second": 19.493, "eval_steps_per_second": 0.311, "step": 150 }, { "epoch": 0.3669724770642202, "grad_norm": 2.7039803270775495, "learning_rate": 3.9952763262280397e-07, "logits/chosen": -2.460155963897705, "logits/rejected": -2.39618182182312, "logps/chosen": -293.5860595703125, "logps/rejected": -295.9964294433594, "loss": 0.01, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.02965056337416172, "rewards/margins": 0.1299286037683487, "rewards/rejected": -0.10027804225683212, "step": 160 }, { "epoch": 0.38990825688073394, "grad_norm": 1.3210709780236793, "learning_rate": 3.8300801912883414e-07, "logits/chosen": -2.503695249557495, "logits/rejected": -2.4564812183380127, "logps/chosen": -267.4649353027344, "logps/rejected": -279.4906921386719, "loss": 0.0091, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.041359588503837585, "rewards/margins": 0.13557755947113037, "rewards/rejected": -0.09421796351671219, "step": 170 }, { "epoch": 0.41284403669724773, "grad_norm": 1.4526636355174283, "learning_rate": 3.6563457256020884e-07, "logits/chosen": -2.5071005821228027, "logits/rejected": -2.4085752964019775, "logps/chosen": -306.22930908203125, "logps/rejected": -253.84475708007812, "loss": 0.0096, "rewards/accuracies": 0.75, "rewards/chosen": -0.028713583946228027, "rewards/margins": 0.12478481233119965, "rewards/rejected": -0.15349839627742767, "step": 180 }, { "epoch": 0.43577981651376146, "grad_norm": 4.8242221106390115, "learning_rate": 3.475188202022617e-07, "logits/chosen": -2.414685010910034, "logits/rejected": -2.3840391635894775, "logps/chosen": -256.085205078125, "logps/rejected": -276.09039306640625, "loss": 0.0106, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.020293032750487328, "rewards/margins": 0.13775238394737244, "rewards/rejected": -0.11745934188365936, "step": 190 }, { "epoch": 0.45871559633027525, "grad_norm": 1.2637397948323732, "learning_rate": 3.287770545059052e-07, "logits/chosen": -2.5293562412261963, "logits/rejected": -2.431912899017334, "logps/chosen": -277.83734130859375, "logps/rejected": -256.05206298828125, "loss": 0.0096, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.004443052224814892, "rewards/margins": 0.12640419602394104, "rewards/rejected": -0.12196113169193268, "step": 200 }, { "epoch": 0.45871559633027525, "eval_logits/chosen": -2.5115060806274414, "eval_logits/rejected": -2.4205849170684814, "eval_logps/chosen": -282.7939147949219, "eval_logps/rejected": -258.3851013183594, "eval_loss": 0.00931267999112606, "eval_rewards/accuracies": 0.7887930870056152, "eval_rewards/chosen": 0.022961637005209923, "eval_rewards/margins": 0.14158831536769867, "eval_rewards/rejected": -0.1186266764998436, "eval_runtime": 92.3817, "eval_samples_per_second": 19.679, "eval_steps_per_second": 0.314, "step": 200 }, { "epoch": 0.481651376146789, "grad_norm": 1.093362739669567, "learning_rate": 3.0952958655864954e-07, "logits/chosen": -2.4653537273406982, "logits/rejected": -2.448239803314209, "logps/chosen": -271.6222229003906, "logps/rejected": -265.7300109863281, "loss": 0.0094, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.01682201772928238, "rewards/margins": 0.13070014119148254, "rewards/rejected": -0.11387811601161957, "step": 210 }, { "epoch": 0.5045871559633027, "grad_norm": 1.5040749987763142, "learning_rate": 2.898999737583448e-07, "logits/chosen": -2.50272798538208, "logits/rejected": -2.4043869972229004, "logps/chosen": -322.4222717285156, "logps/rejected": -300.2145690917969, "loss": 0.0091, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.0025803411845117807, "rewards/margins": 0.15502406656742096, "rewards/rejected": -0.15244372189044952, "step": 220 }, { "epoch": 0.5275229357798165, "grad_norm": 1.2805773201998596, "learning_rate": 2.7001422664752333e-07, "logits/chosen": -2.396461009979248, "logits/rejected": -2.344364881515503, "logps/chosen": -268.88946533203125, "logps/rejected": -278.57122802734375, "loss": 0.0093, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.009459306485950947, "rewards/margins": 0.12005350738763809, "rewards/rejected": -0.12951281666755676, "step": 230 }, { "epoch": 0.5504587155963303, "grad_norm": 1.346452258119606, "learning_rate": 2.5e-07, "logits/chosen": -2.471860885620117, "logits/rejected": -2.379460573196411, "logps/chosen": -288.55877685546875, "logps/rejected": -274.3955078125, "loss": 0.0096, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.010269422084093094, "rewards/margins": 0.12018589675426483, "rewards/rejected": -0.13045531511306763, "step": 240 }, { "epoch": 0.573394495412844, "grad_norm": 1.1636311800865655, "learning_rate": 2.2998577335247667e-07, "logits/chosen": -2.5101351737976074, "logits/rejected": -2.4231343269348145, "logps/chosen": -303.5893249511719, "logps/rejected": -274.129638671875, "loss": 0.0093, "rewards/accuracies": 0.768750011920929, "rewards/chosen": 0.0020445596892386675, "rewards/margins": 0.13890263438224792, "rewards/rejected": -0.13685807585716248, "step": 250 }, { "epoch": 0.573394495412844, "eval_logits/chosen": -2.459127187728882, "eval_logits/rejected": -2.365269899368286, "eval_logps/chosen": -286.2547912597656, "eval_logps/rejected": -263.3385925292969, "eval_loss": 0.008932252414524555, "eval_rewards/accuracies": 0.7844827771186829, "eval_rewards/chosen": -0.011646694503724575, "eval_rewards/margins": 0.1565149575471878, "eval_rewards/rejected": -0.16816167533397675, "eval_runtime": 93.0437, "eval_samples_per_second": 19.539, "eval_steps_per_second": 0.312, "step": 250 }, { "epoch": 0.5963302752293578, "grad_norm": 1.1997721634582499, "learning_rate": 2.1010002624165524e-07, "logits/chosen": -2.4458436965942383, "logits/rejected": -2.4075264930725098, "logps/chosen": -273.95843505859375, "logps/rejected": -292.99114990234375, "loss": 0.0097, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.032438840717077255, "rewards/margins": 0.1535351574420929, "rewards/rejected": -0.18597400188446045, "step": 260 }, { "epoch": 0.6192660550458715, "grad_norm": 1.2414114105136553, "learning_rate": 1.9047041344135043e-07, "logits/chosen": -2.437638759613037, "logits/rejected": -2.409536600112915, "logps/chosen": -270.0658264160156, "logps/rejected": -270.24798583984375, "loss": 0.0092, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.04003521427512169, "rewards/margins": 0.14397411048412323, "rewards/rejected": -0.1840093433856964, "step": 270 }, { "epoch": 0.6422018348623854, "grad_norm": 1.288255010907884, "learning_rate": 1.7122294549409482e-07, "logits/chosen": -2.5124118328094482, "logits/rejected": -2.4503026008605957, "logps/chosen": -273.12841796875, "logps/rejected": -280.5234375, "loss": 0.0098, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.0012958184815943241, "rewards/margins": 0.165980726480484, "rewards/rejected": -0.167276531457901, "step": 280 }, { "epoch": 0.6651376146788991, "grad_norm": 1.3451525886655082, "learning_rate": 1.524811797977383e-07, "logits/chosen": -2.494507312774658, "logits/rejected": -2.4191977977752686, "logps/chosen": -287.3388366699219, "logps/rejected": -267.7834167480469, "loss": 0.009, "rewards/accuracies": 0.78125, "rewards/chosen": 0.003920386545360088, "rewards/margins": 0.1425868272781372, "rewards/rejected": -0.13866642117500305, "step": 290 }, { "epoch": 0.6880733944954128, "grad_norm": 1.3558837079563066, "learning_rate": 1.3436542743979125e-07, "logits/chosen": -2.498835563659668, "logits/rejected": -2.4762415885925293, "logps/chosen": -306.97705078125, "logps/rejected": -270.83367919921875, "loss": 0.0096, "rewards/accuracies": 0.71875, "rewards/chosen": -0.010277172550559044, "rewards/margins": 0.10946089029312134, "rewards/rejected": -0.11973806470632553, "step": 300 }, { "epoch": 0.6880733944954128, "eval_logits/chosen": -2.4814062118530273, "eval_logits/rejected": -2.38907790184021, "eval_logps/chosen": -285.9173278808594, "eval_logps/rejected": -262.4115295410156, "eval_loss": 0.008779831230640411, "eval_rewards/accuracies": 0.7844827771186829, "eval_rewards/chosen": -0.008272184059023857, "eval_rewards/margins": 0.15061868727207184, "eval_rewards/rejected": -0.15889087319374084, "eval_runtime": 92.1897, "eval_samples_per_second": 19.72, "eval_steps_per_second": 0.315, "step": 300 }, { "epoch": 0.7110091743119266, "grad_norm": 1.3381129170655774, "learning_rate": 1.1699198087116588e-07, "logits/chosen": -2.515507221221924, "logits/rejected": -2.4294919967651367, "logps/chosen": -280.1512756347656, "logps/rejected": -280.0994873046875, "loss": 0.0094, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.02186095342040062, "rewards/margins": 0.11463706195354462, "rewards/rejected": -0.13649800419807434, "step": 310 }, { "epoch": 0.7339449541284404, "grad_norm": 1.338991121862378, "learning_rate": 1.00472367377196e-07, "logits/chosen": -2.451615571975708, "logits/rejected": -2.404048442840576, "logps/chosen": -275.2835388183594, "logps/rejected": -251.0447998046875, "loss": 0.0093, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.010515814647078514, "rewards/margins": 0.15502998232841492, "rewards/rejected": -0.16554580628871918, "step": 320 }, { "epoch": 0.7568807339449541, "grad_norm": 1.3896883263831044, "learning_rate": 8.49126331382102e-08, "logits/chosen": -2.4412286281585693, "logits/rejected": -2.392331600189209, "logps/chosen": -275.5787353515625, "logps/rejected": -260.4464111328125, "loss": 0.0095, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.010565539821982384, "rewards/margins": 0.12289756536483765, "rewards/rejected": -0.13346309959888458, "step": 330 }, { "epoch": 0.7798165137614679, "grad_norm": 1.360283909503132, "learning_rate": 7.041266247556812e-08, "logits/chosen": -2.525031566619873, "logits/rejected": -2.481128215789795, "logps/chosen": -289.6778564453125, "logps/rejected": -272.6752014160156, "loss": 0.009, "rewards/accuracies": 0.75, "rewards/chosen": -0.016852576285600662, "rewards/margins": 0.11005936563014984, "rewards/rejected": -0.1269119530916214, "step": 340 }, { "epoch": 0.8027522935779816, "grad_norm": 1.3078891508366295, "learning_rate": 5.706553665319955e-08, "logits/chosen": -2.501155138015747, "logits/rejected": -2.399773597717285, "logps/chosen": -277.02081298828125, "logps/rejected": -253.2749786376953, "loss": 0.0096, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.025297433137893677, "rewards/margins": 0.13083800673484802, "rewards/rejected": -0.1561354398727417, "step": 350 }, { "epoch": 0.8027522935779816, "eval_logits/chosen": -2.4845948219299316, "eval_logits/rejected": -2.390641927719116, "eval_logps/chosen": -285.5014343261719, "eval_logps/rejected": -262.4816589355469, "eval_loss": 0.008705741725862026, "eval_rewards/accuracies": 0.7801724076271057, "eval_rewards/chosen": -0.004113705363124609, "eval_rewards/margins": 0.15547847747802734, "eval_rewards/rejected": -0.15959219634532928, "eval_runtime": 93.024, "eval_samples_per_second": 19.543, "eval_steps_per_second": 0.312, "step": 350 }, { "epoch": 0.8256880733944955, "grad_norm": 1.1167320443463302, "learning_rate": 4.4956936350761005e-08, "logits/chosen": -2.4814512729644775, "logits/rejected": -2.4325201511383057, "logps/chosen": -252.41159057617188, "logps/rejected": -271.5174865722656, "loss": 0.0088, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.0009152599377557635, "rewards/margins": 0.128888800740242, "rewards/rejected": -0.12980404496192932, "step": 360 }, { "epoch": 0.8486238532110092, "grad_norm": 1.2531440295822198, "learning_rate": 3.416459164418123e-08, "logits/chosen": -2.5166125297546387, "logits/rejected": -2.4559273719787598, "logps/chosen": -299.89935302734375, "logps/rejected": -276.8605041503906, "loss": 0.0087, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.003148370888084173, "rewards/margins": 0.13971056044101715, "rewards/rejected": -0.14285892248153687, "step": 370 }, { "epoch": 0.8715596330275229, "grad_norm": 1.494922388543688, "learning_rate": 2.475778302439524e-08, "logits/chosen": -2.4861674308776855, "logits/rejected": -2.4356298446655273, "logps/chosen": -296.1441650390625, "logps/rejected": -272.86956787109375, "loss": 0.009, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.0023386895190924406, "rewards/margins": 0.15447205305099487, "rewards/rejected": -0.15213337540626526, "step": 380 }, { "epoch": 0.8944954128440367, "grad_norm": 1.505735003806321, "learning_rate": 1.6796896657433805e-08, "logits/chosen": -2.4833850860595703, "logits/rejected": -2.381284236907959, "logps/chosen": -252.93844604492188, "logps/rejected": -242.3744659423828, "loss": 0.0095, "rewards/accuracies": 0.71875, "rewards/chosen": -0.009809909388422966, "rewards/margins": 0.1208723783493042, "rewards/rejected": -0.13068227469921112, "step": 390 }, { "epoch": 0.9174311926605505, "grad_norm": 1.3636138422869082, "learning_rate": 1.0333036740834855e-08, "logits/chosen": -2.41188383102417, "logits/rejected": -2.378366231918335, "logps/chosen": -223.8693389892578, "logps/rejected": -241.79000854492188, "loss": 0.0093, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.006902736611664295, "rewards/margins": 0.13264694809913635, "rewards/rejected": -0.12574420869350433, "step": 400 }, { "epoch": 0.9174311926605505, "eval_logits/chosen": -2.4834423065185547, "eval_logits/rejected": -2.389298439025879, "eval_logps/chosen": -285.1916809082031, "eval_logps/rejected": -261.8979797363281, "eval_loss": 0.008685999549925327, "eval_rewards/accuracies": 0.7758620977401733, "eval_rewards/chosen": -0.0010154928313568234, "eval_rewards/margins": 0.15273970365524292, "eval_rewards/rejected": -0.15375518798828125, "eval_runtime": 92.569, "eval_samples_per_second": 19.639, "eval_steps_per_second": 0.313, "step": 400 }, { "epoch": 0.9403669724770642, "grad_norm": 1.3075058243660094, "learning_rate": 5.4076974448211685e-09, "logits/chosen": -2.419281005859375, "logits/rejected": -2.368946075439453, "logps/chosen": -271.1397705078125, "logps/rejected": -253.4750213623047, "loss": 0.009, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.0018791112815961242, "rewards/margins": 0.15240374207496643, "rewards/rejected": -0.1542828381061554, "step": 410 }, { "epoch": 0.963302752293578, "grad_norm": 2.8175803183046364, "learning_rate": 2.052496544188487e-09, "logits/chosen": -2.462890148162842, "logits/rejected": -2.377373218536377, "logps/chosen": -260.7967834472656, "logps/rejected": -261.8900451660156, "loss": 0.0093, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.013578072190284729, "rewards/margins": 0.14654859900474548, "rewards/rejected": -0.16012665629386902, "step": 420 }, { "epoch": 0.9862385321100917, "grad_norm": 1.3745801458045255, "learning_rate": 2.889724508297886e-10, "logits/chosen": -2.4770209789276123, "logits/rejected": -2.3631789684295654, "logps/chosen": -303.4706115722656, "logps/rejected": -256.81878662109375, "loss": 0.0094, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.0029516436625272036, "rewards/margins": 0.12487111985683441, "rewards/rejected": -0.12782277166843414, "step": 430 }, { "epoch": 1.0, "step": 436, "total_flos": 0.0, "train_loss": 0.010263856175705927, "train_runtime": 11602.0724, "train_samples_per_second": 4.806, "train_steps_per_second": 0.038 } ], "logging_steps": 10, "max_steps": 436, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }