{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 50, "global_step": 436, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.022935779816513763, "grad_norm": 1.642045632140662, "learning_rate": 1.1363636363636363e-07, "logits/chosen": -2.6192917823791504, "logits/rejected": -2.5524227619171143, "logps/chosen": -265.41119384765625, "logps/rejected": -236.11862182617188, "loss": 0.0154, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.00034645519917830825, "rewards/margins": 0.00023277592845261097, "rewards/rejected": 0.00011367930710548535, "step": 10 }, { "epoch": 0.045871559633027525, "grad_norm": 1.5167637902207143, "learning_rate": 2.2727272727272726e-07, "logits/chosen": -2.657719135284424, "logits/rejected": -2.5759785175323486, "logps/chosen": -298.7945556640625, "logps/rejected": -274.304443359375, "loss": 0.0155, "rewards/accuracies": 0.625, "rewards/chosen": -9.255408076569438e-05, "rewards/margins": 0.0018523468170315027, "rewards/rejected": -0.0019449004903435707, "step": 20 }, { "epoch": 0.06880733944954129, "grad_norm": 1.3745597654790793, "learning_rate": 3.4090909090909085e-07, "logits/chosen": -2.6762423515319824, "logits/rejected": -2.6026246547698975, "logps/chosen": -290.37896728515625, "logps/rejected": -234.3507080078125, "loss": 0.0146, "rewards/accuracies": 0.71875, "rewards/chosen": 0.009538007900118828, "rewards/margins": 0.013226142153143883, "rewards/rejected": -0.0036881337873637676, "step": 30 }, { "epoch": 0.09174311926605505, "grad_norm": 1.272192316985563, "learning_rate": 4.545454545454545e-07, "logits/chosen": -2.660547971725464, "logits/rejected": -2.6108529567718506, "logps/chosen": -280.96484375, "logps/rejected": -267.6105041503906, "loss": 0.013, "rewards/accuracies": 0.6875, "rewards/chosen": 0.04204345494508743, "rewards/margins": 0.04021826013922691, "rewards/rejected": 0.0018251972505822778, "step": 40 }, { "epoch": 0.11467889908256881, "grad_norm": 1.4079236984792325, "learning_rate": 4.997110275491701e-07, "logits/chosen": -2.6261672973632812, "logits/rejected": -2.6208655834198, "logps/chosen": -289.76519775390625, "logps/rejected": -299.06353759765625, "loss": 0.012, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.06486029922962189, "rewards/margins": 0.06365373730659485, "rewards/rejected": 0.0012065758928656578, "step": 50 }, { "epoch": 0.11467889908256881, "eval_logits/chosen": -2.5786819458007812, "eval_logits/rejected": -2.502084732055664, "eval_logps/chosen": -277.3097839355469, "eval_logps/rejected": -247.28094482421875, "eval_loss": 0.011314952746033669, "eval_rewards/accuracies": 0.7025862336158752, "eval_rewards/chosen": 0.07780314981937408, "eval_rewards/margins": 0.08538833260536194, "eval_rewards/rejected": -0.007585177198052406, "eval_runtime": 94.9097, "eval_samples_per_second": 19.155, "eval_steps_per_second": 0.306, "step": 50 }, { "epoch": 0.13761467889908258, "grad_norm": 1.418614516322105, "learning_rate": 4.979475034558115e-07, "logits/chosen": -2.5812747478485107, "logits/rejected": -2.5265355110168457, "logps/chosen": -284.41473388671875, "logps/rejected": -259.66094970703125, "loss": 0.0112, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.05069952458143234, "rewards/margins": 0.0717436671257019, "rewards/rejected": -0.021044140681624413, "step": 60 }, { "epoch": 0.16055045871559634, "grad_norm": 1.4648752438020702, "learning_rate": 4.945923025551788e-07, "logits/chosen": -2.5097055435180664, "logits/rejected": -2.4729437828063965, "logps/chosen": -319.59161376953125, "logps/rejected": -265.26019287109375, "loss": 0.0115, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.07979384809732437, "rewards/margins": 0.1183195561170578, "rewards/rejected": -0.03852573037147522, "step": 70 }, { "epoch": 0.1834862385321101, "grad_norm": 1.379325651626498, "learning_rate": 4.896669632591651e-07, "logits/chosen": -2.4763333797454834, "logits/rejected": -2.385080099105835, "logps/chosen": -278.93988037109375, "logps/rejected": -255.78781127929688, "loss": 0.011, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.06909944862127304, "rewards/margins": 0.10041693598031998, "rewards/rejected": -0.031317487359046936, "step": 80 }, { "epoch": 0.20642201834862386, "grad_norm": 1.3303176283339873, "learning_rate": 4.832031033425662e-07, "logits/chosen": -2.423882246017456, "logits/rejected": -2.376399278640747, "logps/chosen": -280.73907470703125, "logps/rejected": -252.6663360595703, "loss": 0.0103, "rewards/accuracies": 0.71875, "rewards/chosen": 0.06854326277971268, "rewards/margins": 0.12719407677650452, "rewards/rejected": -0.058650821447372437, "step": 90 }, { "epoch": 0.22935779816513763, "grad_norm": 1.4785994754129348, "learning_rate": 4.752422169756047e-07, "logits/chosen": -2.3845717906951904, "logits/rejected": -2.3276596069335938, "logps/chosen": -268.87396240234375, "logps/rejected": -280.0634460449219, "loss": 0.011, "rewards/accuracies": 0.71875, "rewards/chosen": 0.035507336258888245, "rewards/margins": 0.11685723066329956, "rewards/rejected": -0.08134988695383072, "step": 100 }, { "epoch": 0.22935779816513763, "eval_logits/chosen": -2.4392149448394775, "eval_logits/rejected": -2.3370442390441895, "eval_logps/chosen": -280.90240478515625, "eval_logps/rejected": -254.33267211914062, "eval_loss": 0.009970244951546192, "eval_rewards/accuracies": 0.7068965435028076, "eval_rewards/chosen": 0.04187687486410141, "eval_rewards/margins": 0.1199791207909584, "eval_rewards/rejected": -0.078102245926857, "eval_runtime": 95.3397, "eval_samples_per_second": 19.069, "eval_steps_per_second": 0.304, "step": 100 }, { "epoch": 0.25229357798165136, "grad_norm": 2.0224189698687702, "learning_rate": 4.658354083558188e-07, "logits/chosen": -2.4632654190063477, "logits/rejected": -2.354820966720581, "logps/chosen": -263.564453125, "logps/rejected": -241.26791381835938, "loss": 0.0113, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.01623677834868431, "rewards/margins": 0.09158362448215485, "rewards/rejected": -0.07534684240818024, "step": 110 }, { "epoch": 0.27522935779816515, "grad_norm": 1.4135396422800603, "learning_rate": 4.550430636492389e-07, "logits/chosen": -2.382753610610962, "logits/rejected": -2.350785970687866, "logps/chosen": -275.2984619140625, "logps/rejected": -260.0333557128906, "loss": 0.0102, "rewards/accuracies": 0.6875, "rewards/chosen": 0.005208671558648348, "rewards/margins": 0.0909029170870781, "rewards/rejected": -0.08569424599409103, "step": 120 }, { "epoch": 0.2981651376146789, "grad_norm": 2.470336886918338, "learning_rate": 4.429344633468004e-07, "logits/chosen": -2.404059648513794, "logits/rejected": -2.3569350242614746, "logps/chosen": -254.55068969726562, "logps/rejected": -252.3956756591797, "loss": 0.0105, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.016294140368700027, "rewards/margins": 0.11412493139505386, "rewards/rejected": -0.09783079475164413, "step": 130 }, { "epoch": 0.3211009174311927, "grad_norm": 3.815944054063513, "learning_rate": 4.2958733752443187e-07, "logits/chosen": -2.4113519191741943, "logits/rejected": -2.340986728668213, "logps/chosen": -272.99420166015625, "logps/rejected": -231.37026977539062, "loss": 0.0123, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.0319095216691494, "rewards/margins": 0.10264714062213898, "rewards/rejected": -0.07073761522769928, "step": 140 }, { "epoch": 0.3440366972477064, "grad_norm": 1.3170543399714232, "learning_rate": 4.150873668617898e-07, "logits/chosen": -2.454432249069214, "logits/rejected": -2.370666265487671, "logps/chosen": -268.79541015625, "logps/rejected": -247.0717315673828, "loss": 0.0104, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.022518452256917953, "rewards/margins": 0.11632315069437027, "rewards/rejected": -0.13884159922599792, "step": 150 }, { "epoch": 0.3440366972477064, "eval_logits/chosen": -2.494401454925537, "eval_logits/rejected": -2.39561128616333, "eval_logps/chosen": -285.85308837890625, "eval_logps/rejected": -260.5493469238281, "eval_loss": 0.009796149097383022, "eval_rewards/accuracies": 0.7198275923728943, "eval_rewards/chosen": -0.007629875559359789, "eval_rewards/margins": 0.13263897597789764, "eval_rewards/rejected": -0.14026884734630585, "eval_runtime": 95.8939, "eval_samples_per_second": 18.958, "eval_steps_per_second": 0.302, "step": 150 }, { "epoch": 0.3669724770642202, "grad_norm": 2.6585100320508737, "learning_rate": 3.9952763262280397e-07, "logits/chosen": -2.4578630924224854, "logits/rejected": -2.3927180767059326, "logps/chosen": -293.46124267578125, "logps/rejected": -295.8601379394531, "loss": 0.01, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.030898962169885635, "rewards/margins": 0.12981417775154114, "rewards/rejected": -0.0989152044057846, "step": 160 }, { "epoch": 0.38990825688073394, "grad_norm": 1.3118055868117948, "learning_rate": 3.8300801912883414e-07, "logits/chosen": -2.504678249359131, "logits/rejected": -2.4544835090637207, "logps/chosen": -267.3302307128906, "logps/rejected": -279.33062744140625, "loss": 0.0091, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 0.04270657151937485, "rewards/margins": 0.13532397150993347, "rewards/rejected": -0.09261739999055862, "step": 170 }, { "epoch": 0.41284403669724773, "grad_norm": 1.4248945431420277, "learning_rate": 3.6563457256020884e-07, "logits/chosen": -2.509859561920166, "logits/rejected": -2.4090194702148438, "logps/chosen": -306.2326965332031, "logps/rejected": -253.7351531982422, "loss": 0.0096, "rewards/accuracies": 0.75, "rewards/chosen": -0.028747806325554848, "rewards/margins": 0.123654805123806, "rewards/rejected": -0.1524026244878769, "step": 180 }, { "epoch": 0.43577981651376146, "grad_norm": 4.61575651504461, "learning_rate": 3.475188202022617e-07, "logits/chosen": -2.4155473709106445, "logits/rejected": -2.3843960762023926, "logps/chosen": -256.64117431640625, "logps/rejected": -276.67425537109375, "loss": 0.0105, "rewards/accuracies": 0.78125, "rewards/chosen": 0.014733311720192432, "rewards/margins": 0.13803140819072723, "rewards/rejected": -0.12329809367656708, "step": 190 }, { "epoch": 0.45871559633027525, "grad_norm": 1.2837860780603194, "learning_rate": 3.287770545059052e-07, "logits/chosen": -2.529806613922119, "logits/rejected": -2.4307055473327637, "logps/chosen": -277.78741455078125, "logps/rejected": -256.05511474609375, "loss": 0.0096, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.004942134954035282, "rewards/margins": 0.12693332135677338, "rewards/rejected": -0.12199117988348007, "step": 200 }, { "epoch": 0.45871559633027525, "eval_logits/chosen": -2.514023542404175, "eval_logits/rejected": -2.4208788871765137, "eval_logps/chosen": -282.1954345703125, "eval_logps/rejected": -257.61944580078125, "eval_loss": 0.009310290217399597, "eval_rewards/accuracies": 0.7931034564971924, "eval_rewards/chosen": 0.028946416452527046, "eval_rewards/margins": 0.13991650938987732, "eval_rewards/rejected": -0.11097008734941483, "eval_runtime": 96.1764, "eval_samples_per_second": 18.903, "eval_steps_per_second": 0.302, "step": 200 }, { "epoch": 0.481651376146789, "grad_norm": 1.090616369349917, "learning_rate": 3.0952958655864954e-07, "logits/chosen": -2.4676451683044434, "logits/rejected": -2.4505865573883057, "logps/chosen": -270.9748840332031, "logps/rejected": -265.1389465332031, "loss": 0.0094, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.023295128718018532, "rewards/margins": 0.13126251101493835, "rewards/rejected": -0.10796739161014557, "step": 210 }, { "epoch": 0.5045871559633027, "grad_norm": 1.493461344772632, "learning_rate": 2.898999737583448e-07, "logits/chosen": -2.504462957382202, "logits/rejected": -2.404648542404175, "logps/chosen": -322.62689208984375, "logps/rejected": -300.34686279296875, "loss": 0.0091, "rewards/accuracies": 0.78125, "rewards/chosen": 0.00053420226322487, "rewards/margins": 0.15430037677288055, "rewards/rejected": -0.1537661850452423, "step": 220 }, { "epoch": 0.5275229357798165, "grad_norm": 1.2922195153842637, "learning_rate": 2.7001422664752333e-07, "logits/chosen": -2.3982276916503906, "logits/rejected": -2.3454272747039795, "logps/chosen": -269.4321594238281, "logps/rejected": -279.17926025390625, "loss": 0.0093, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.014886477962136269, "rewards/margins": 0.12070702016353607, "rewards/rejected": -0.1355935037136078, "step": 230 }, { "epoch": 0.5504587155963303, "grad_norm": 1.3338425925379636, "learning_rate": 2.5e-07, "logits/chosen": -2.4747557640075684, "logits/rejected": -2.3822951316833496, "logps/chosen": -288.63037109375, "logps/rejected": -274.5633850097656, "loss": 0.0095, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.010984973981976509, "rewards/margins": 0.12114904075860977, "rewards/rejected": -0.13213400542736053, "step": 240 }, { "epoch": 0.573394495412844, "grad_norm": 1.161806057366623, "learning_rate": 2.2998577335247667e-07, "logits/chosen": -2.513209819793701, "logits/rejected": -2.424811363220215, "logps/chosen": -303.5930480957031, "logps/rejected": -273.9664611816406, "loss": 0.0094, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 0.002007619244977832, "rewards/margins": 0.13723386824131012, "rewards/rejected": -0.1352262645959854, "step": 250 }, { "epoch": 0.573394495412844, "eval_logits/chosen": -2.4590651988983154, "eval_logits/rejected": -2.362999200820923, "eval_logps/chosen": -286.21722412109375, "eval_logps/rejected": -263.2693786621094, "eval_loss": 0.008912510238587856, "eval_rewards/accuracies": 0.7801724076271057, "eval_rewards/chosen": -0.011271164752542973, "eval_rewards/margins": 0.15619821846485138, "eval_rewards/rejected": -0.16746938228607178, "eval_runtime": 94.78, "eval_samples_per_second": 19.181, "eval_steps_per_second": 0.306, "step": 250 }, { "epoch": 0.5963302752293578, "grad_norm": 1.1995883831745127, "learning_rate": 2.1010002624165524e-07, "logits/chosen": -2.4454376697540283, "logits/rejected": -2.407090902328491, "logps/chosen": -273.8803405761719, "logps/rejected": -292.88421630859375, "loss": 0.0097, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.03165815398097038, "rewards/margins": 0.15324734151363373, "rewards/rejected": -0.1849054992198944, "step": 260 }, { "epoch": 0.6192660550458715, "grad_norm": 1.236047360327654, "learning_rate": 1.9047041344135043e-07, "logits/chosen": -2.4361116886138916, "logits/rejected": -2.4080004692077637, "logps/chosen": -270.04998779296875, "logps/rejected": -270.3052978515625, "loss": 0.0091, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.03987707942724228, "rewards/margins": 0.14470525085926056, "rewards/rejected": -0.18458232283592224, "step": 270 }, { "epoch": 0.6422018348623854, "grad_norm": 1.3000373431490932, "learning_rate": 1.7122294549409482e-07, "logits/chosen": -2.512075424194336, "logits/rejected": -2.448713779449463, "logps/chosen": -273.0007019042969, "logps/rejected": -280.35565185546875, "loss": 0.0098, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.8634367734193802e-05, "rewards/margins": 0.16557954251766205, "rewards/rejected": -0.16559818387031555, "step": 280 }, { "epoch": 0.6651376146788991, "grad_norm": 1.354856941159808, "learning_rate": 1.524811797977383e-07, "logits/chosen": -2.493873119354248, "logits/rejected": -2.416597843170166, "logps/chosen": -287.20404052734375, "logps/rejected": -267.6150817871094, "loss": 0.009, "rewards/accuracies": 0.768750011920929, "rewards/chosen": 0.005267986096441746, "rewards/margins": 0.1422511339187622, "rewards/rejected": -0.13698314130306244, "step": 290 }, { "epoch": 0.6880733944954128, "grad_norm": 1.3081620144699164, "learning_rate": 1.3436542743979125e-07, "logits/chosen": -2.4964632987976074, "logits/rejected": -2.472830057144165, "logps/chosen": -307.45733642578125, "logps/rejected": -271.1383361816406, "loss": 0.0096, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.015080097131431103, "rewards/margins": 0.10770467668771744, "rewards/rejected": -0.12278477847576141, "step": 300 }, { "epoch": 0.6880733944954128, "eval_logits/chosen": -2.477182388305664, "eval_logits/rejected": -2.382119655609131, "eval_logps/chosen": -286.4156188964844, "eval_logps/rejected": -262.90252685546875, "eval_loss": 0.008760624565184116, "eval_rewards/accuracies": 0.7844827771186829, "eval_rewards/chosen": -0.013255205936729908, "eval_rewards/margins": 0.15054550766944885, "eval_rewards/rejected": -0.16380071640014648, "eval_runtime": 95.1377, "eval_samples_per_second": 19.109, "eval_steps_per_second": 0.305, "step": 300 }, { "epoch": 0.7110091743119266, "grad_norm": 1.3153487126066306, "learning_rate": 1.1699198087116588e-07, "logits/chosen": -2.5111746788024902, "logits/rejected": -2.422899007797241, "logps/chosen": -280.66143798828125, "logps/rejected": -280.439697265625, "loss": 0.0094, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.026962900534272194, "rewards/margins": 0.11293704807758331, "rewards/rejected": -0.13989993929862976, "step": 310 }, { "epoch": 0.7339449541284404, "grad_norm": 1.337124035482138, "learning_rate": 1.00472367377196e-07, "logits/chosen": -2.4476630687713623, "logits/rejected": -2.397982120513916, "logps/chosen": -275.42584228515625, "logps/rejected": -251.29122924804688, "loss": 0.0093, "rewards/accuracies": 0.75, "rewards/chosen": -0.011938780546188354, "rewards/margins": 0.15607169270515442, "rewards/rejected": -0.16801045835018158, "step": 320 }, { "epoch": 0.7568807339449541, "grad_norm": 1.4113093599011106, "learning_rate": 8.49126331382102e-08, "logits/chosen": -2.435225009918213, "logits/rejected": -2.386702537536621, "logps/chosen": -275.8332214355469, "logps/rejected": -260.52587890625, "loss": 0.0095, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.013110649771988392, "rewards/margins": 0.12114731222391129, "rewards/rejected": -0.1342579573392868, "step": 330 }, { "epoch": 0.7798165137614679, "grad_norm": 1.366187901878708, "learning_rate": 7.041266247556812e-08, "logits/chosen": -2.521238327026367, "logits/rejected": -2.4764904975891113, "logps/chosen": -289.8477478027344, "logps/rejected": -272.86456298828125, "loss": 0.009, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.01855117455124855, "rewards/margins": 0.11025450378656387, "rewards/rejected": -0.12880566716194153, "step": 340 }, { "epoch": 0.8027522935779816, "grad_norm": 1.3145992203531836, "learning_rate": 5.706553665319955e-08, "logits/chosen": -2.4982964992523193, "logits/rejected": -2.3964176177978516, "logps/chosen": -277.12042236328125, "logps/rejected": -253.37203979492188, "loss": 0.0096, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.026293223723769188, "rewards/margins": 0.13081298768520355, "rewards/rejected": -0.1571062207221985, "step": 350 }, { "epoch": 0.8027522935779816, "eval_logits/chosen": -2.481409788131714, "eval_logits/rejected": -2.3850882053375244, "eval_logps/chosen": -285.64593505859375, "eval_logps/rejected": -262.6905517578125, "eval_loss": 0.008698553778231144, "eval_rewards/accuracies": 0.7801724076271057, "eval_rewards/chosen": -0.005558254197239876, "eval_rewards/margins": 0.15612287819385529, "eval_rewards/rejected": -0.1616811454296112, "eval_runtime": 94.1358, "eval_samples_per_second": 19.313, "eval_steps_per_second": 0.308, "step": 350 }, { "epoch": 0.8256880733944955, "grad_norm": 1.1160006320842197, "learning_rate": 4.4956936350761005e-08, "logits/chosen": -2.4785516262054443, "logits/rejected": -2.427774429321289, "logps/chosen": -252.49581909179688, "logps/rejected": -271.5592041015625, "loss": 0.0088, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.0017578303813934326, "rewards/margins": 0.12846335768699646, "rewards/rejected": -0.1302211880683899, "step": 360 }, { "epoch": 0.8486238532110092, "grad_norm": 1.2680559242352918, "learning_rate": 3.416459164418123e-08, "logits/chosen": -2.514042854309082, "logits/rejected": -2.4509449005126953, "logps/chosen": -300.03057861328125, "logps/rejected": -276.8949279785156, "loss": 0.0087, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.004460807889699936, "rewards/margins": 0.13874222338199615, "rewards/rejected": -0.1432030349969864, "step": 370 }, { "epoch": 0.8715596330275229, "grad_norm": 1.5408590452374413, "learning_rate": 2.475778302439524e-08, "logits/chosen": -2.4833552837371826, "logits/rejected": -2.4306118488311768, "logps/chosen": -296.1810607910156, "logps/rejected": -272.9661560058594, "loss": 0.009, "rewards/accuracies": 0.78125, "rewards/chosen": 0.001969636185094714, "rewards/margins": 0.15506890416145325, "rewards/rejected": -0.15309928357601166, "step": 380 }, { "epoch": 0.8944954128440367, "grad_norm": 1.4629042335763474, "learning_rate": 1.6796896657433805e-08, "logits/chosen": -2.4795479774475098, "logits/rejected": -2.3772130012512207, "logps/chosen": -253.0847930908203, "logps/rejected": -242.50405883789062, "loss": 0.0095, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.011273050680756569, "rewards/margins": 0.12070544064044952, "rewards/rejected": -0.13197848200798035, "step": 390 }, { "epoch": 0.9174311926605505, "grad_norm": 1.3631415527821928, "learning_rate": 1.0333036740834855e-08, "logits/chosen": -2.4071569442749023, "logits/rejected": -2.371851682662964, "logps/chosen": -224.0278778076172, "logps/rejected": -241.8699188232422, "loss": 0.0093, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.0053174905478954315, "rewards/margins": 0.1318611204624176, "rewards/rejected": -0.12654362618923187, "step": 400 }, { "epoch": 0.9174311926605505, "eval_logits/chosen": -2.4800400733947754, "eval_logits/rejected": -2.383610486984253, "eval_logps/chosen": -285.21075439453125, "eval_logps/rejected": -261.9634704589844, "eval_loss": 0.008683313615620136, "eval_rewards/accuracies": 0.767241358757019, "eval_rewards/chosen": -0.0012068306095898151, "eval_rewards/margins": 0.15320327877998352, "eval_rewards/rejected": -0.15441007912158966, "eval_runtime": 95.2651, "eval_samples_per_second": 19.084, "eval_steps_per_second": 0.304, "step": 400 }, { "epoch": 0.9403669724770642, "grad_norm": 1.2959245163133988, "learning_rate": 5.4076974448211685e-09, "logits/chosen": -2.414137363433838, "logits/rejected": -2.364811420440674, "logps/chosen": -271.1864318847656, "logps/rejected": -253.48568725585938, "loss": 0.009, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.0023452930618077517, "rewards/margins": 0.1520446538925171, "rewards/rejected": -0.15438991785049438, "step": 410 }, { "epoch": 0.963302752293578, "grad_norm": 2.8287990040902504, "learning_rate": 2.052496544188487e-09, "logits/chosen": -2.4582016468048096, "logits/rejected": -2.3712384700775146, "logps/chosen": -260.9129638671875, "logps/rejected": -261.7887878417969, "loss": 0.0093, "rewards/accuracies": 0.75, "rewards/chosen": -0.014739753678441048, "rewards/margins": 0.14437474310398102, "rewards/rejected": -0.1591145098209381, "step": 420 }, { "epoch": 0.9862385321100917, "grad_norm": 1.3866473112375508, "learning_rate": 2.889724508297886e-10, "logits/chosen": -2.474024772644043, "logits/rejected": -2.357807159423828, "logps/chosen": -303.5400695800781, "logps/rejected": -256.8912658691406, "loss": 0.0094, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.003646157681941986, "rewards/margins": 0.12490139901638031, "rewards/rejected": -0.1285475790500641, "step": 430 }, { "epoch": 1.0, "step": 436, "total_flos": 0.0, "train_loss": 0.010265434613673512, "train_runtime": 11699.9785, "train_samples_per_second": 4.766, "train_steps_per_second": 0.037 } ], "logging_steps": 10, "max_steps": 436, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }