diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -2,7 +2,7 @@ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9995965030262273, - "eval_steps": 500, + "eval_steps": 300, "global_step": 1858, "is_hyper_param_search": false, "is_local_process_zero": true, @@ -11,11 +11,11 @@ { "epoch": 0.0, "learning_rate": 2.6881720430107528e-09, - "logits/chosen": -2.5808520317077637, - "logits/rejected": -2.0101242065429688, - "logps/chosen": -299.3489990234375, - "logps/rejected": -186.63014221191406, - "loss": 52500.0, + "logits/chosen": -2.670260429382324, + "logits/rejected": -2.1533777713775635, + "logps/chosen": -299.33551025390625, + "logps/rejected": -186.81130981445312, + "loss": 13125.0, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, @@ -27,3032 +27,3086 @@ { "epoch": 0.01, "learning_rate": 2.6881720430107527e-08, - "logits/chosen": -2.3875138759613037, - "logits/rejected": -2.2289838790893555, - "logps/chosen": -201.84078979492188, - "logps/rejected": -189.4849090576172, - "loss": 59865.5625, - "rewards/accuracies": 0.4444444477558136, - "rewards/chosen": -3.241965168854222e-05, - "rewards/margins": -8.215905836550519e-06, - "rewards/rejected": -2.4203753127949312e-05, - "rewards/safe_rewards": 0.00010516856127651408, - "rewards/unsafe_rewards": -0.00017000787192955613, + "logits/chosen": -2.4763858318328857, + "logits/rejected": -2.354341745376587, + "logps/chosen": -201.82504272460938, + "logps/rejected": -189.45822143554688, + "loss": 14971.9097, + "rewards/accuracies": 0.4930555522441864, + "rewards/chosen": 9.199242413160391e-06, + "rewards/margins": -8.218608854804188e-05, + "rewards/rejected": 9.138535824604332e-05, + "rewards/safe_rewards": 1.8223654478788376e-05, + "rewards/unsafe_rewards": 1.7482994962847442e-07, "step": 10 }, { "epoch": 0.01, "learning_rate": 5.3763440860215054e-08, - "logits/chosen": -2.3489036560058594, - "logits/rejected": -2.053595781326294, - "logps/chosen": -226.28170776367188, - "logps/rejected": -181.14657592773438, - "loss": 60976.85, - "rewards/accuracies": 0.5062500238418579, - "rewards/chosen": 9.253517782781273e-05, - "rewards/margins": -1.7863098037196323e-05, - "rewards/rejected": 0.00011039829405490309, - "rewards/safe_rewards": 2.8339805794530548e-05, - "rewards/unsafe_rewards": 0.00015673057350795716, + "logits/chosen": -2.44279408454895, + "logits/rejected": -2.218726873397827, + "logps/chosen": -226.34970092773438, + "logps/rejected": -181.1803436279297, + "loss": 15239.6516, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0002221722388640046, + "rewards/margins": -0.0001655549422139302, + "rewards/rejected": -5.6617216614540666e-05, + "rewards/safe_rewards": -0.0002462912234477699, + "rewards/unsafe_rewards": -0.0001980532251764089, "step": 20 }, { "epoch": 0.02, "learning_rate": 8.064516129032257e-08, - "logits/chosen": -2.3402774333953857, - "logits/rejected": -2.1458330154418945, - "logps/chosen": -215.087890625, - "logps/rejected": -189.3074493408203, - "loss": 60007.85, - "rewards/accuracies": 0.4937500059604645, - "rewards/chosen": -0.00015139608876779675, - "rewards/margins": 6.369686161633581e-05, - "rewards/rejected": -0.00021509295038413256, - "rewards/safe_rewards": 0.00011413628089940175, - "rewards/unsafe_rewards": -0.00041692849481478333, + "logits/chosen": -2.4326512813568115, + "logits/rejected": -2.2918035984039307, + "logps/chosen": -215.0783233642578, + "logps/rejected": -189.28309631347656, + "loss": 14993.0922, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.0004244670271873474, + "rewards/margins": -0.000839566346257925, + "rewards/rejected": 0.0004150994645897299, + "rewards/safe_rewards": -0.0002224749478045851, + "rewards/unsafe_rewards": -0.0006264590774662793, "step": 30 }, { "epoch": 0.02, "learning_rate": 1.0752688172043011e-07, - "logits/chosen": -2.2764992713928223, - "logits/rejected": -1.9738476276397705, - "logps/chosen": -180.7752685546875, - "logps/rejected": -173.91864013671875, - "loss": 58964.075, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -0.00020164414308965206, - "rewards/margins": 0.0005626518395729363, - "rewards/rejected": -0.0007642959244549274, - "rewards/safe_rewards": 0.00011740434274543077, - "rewards/unsafe_rewards": -0.0005206926725804806, + "logits/chosen": -2.375096082687378, + "logits/rejected": -2.157045841217041, + "logps/chosen": -180.7657470703125, + "logps/rejected": -173.87054443359375, + "loss": 14723.7563, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -0.00011304272629786283, + "rewards/margins": 0.0008593280799686909, + "rewards/rejected": -0.0009723707917146385, + "rewards/safe_rewards": 0.0004990470479242504, + "rewards/unsafe_rewards": -0.0007251326460391283, "step": 40 }, { "epoch": 0.03, "learning_rate": 1.3440860215053762e-07, - "logits/chosen": -2.4042086601257324, - "logits/rejected": -2.0332813262939453, - "logps/chosen": -209.6074676513672, - "logps/rejected": -167.68734741210938, - "loss": 59476.6125, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": 0.00035317527363076806, - "rewards/margins": 0.0021696495823562145, - "rewards/rejected": -0.0018164744833484292, - "rewards/safe_rewards": 5.224679989623837e-05, - "rewards/unsafe_rewards": 0.0006541038164868951, + "logits/chosen": -2.489865303039551, + "logits/rejected": -2.1804325580596924, + "logps/chosen": -209.59664916992188, + "logps/rejected": -167.6984100341797, + "loss": 14816.0938, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": 0.0011160348076373339, + "rewards/margins": 0.0046949503012001514, + "rewards/rejected": -0.0035789154935628176, + "rewards/safe_rewards": 0.00042248546378687024, + "rewards/unsafe_rewards": 0.0018095843261107802, "step": 50 }, { "epoch": 0.03, "learning_rate": 1.6129032258064515e-07, - "logits/chosen": -2.3313159942626953, - "logits/rejected": -2.1559650897979736, - "logps/chosen": -185.9324493408203, - "logps/rejected": -185.1822052001953, - "loss": 58669.425, - "rewards/accuracies": 0.5375000238418579, - "rewards/chosen": -0.0012478150893002748, - "rewards/margins": 0.000993359019048512, - "rewards/rejected": -0.002241173991933465, - "rewards/safe_rewards": -0.0010035478044301271, - "rewards/unsafe_rewards": -0.001492082723416388, + "logits/chosen": -2.4289777278900146, + "logits/rejected": -2.2897238731384277, + "logps/chosen": -185.92630004882812, + "logps/rejected": -185.1912841796875, + "loss": 14593.3875, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.0020237788558006287, + "rewards/margins": 0.002217882312834263, + "rewards/rejected": -0.004241660702973604, + "rewards/safe_rewards": -0.001639070687815547, + "rewards/unsafe_rewards": -0.002408486558124423, "step": 60 }, { "epoch": 0.04, "learning_rate": 1.8817204301075268e-07, - "logits/chosen": -2.333059787750244, - "logits/rejected": -2.08148455619812, - "logps/chosen": -202.78738403320312, - "logps/rejected": -184.70339965820312, - "loss": 57406.7625, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -0.00300501543097198, - "rewards/margins": 0.0034041493199765682, - "rewards/rejected": -0.006409164518117905, - "rewards/safe_rewards": -0.003386855125427246, - "rewards/unsafe_rewards": -0.0026231766678392887, + "logits/chosen": -2.423598289489746, + "logits/rejected": -2.2320618629455566, + "logps/chosen": -202.7837677001953, + "logps/rejected": -184.62033081054688, + "loss": 14276.2172, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.005977807100862265, + "rewards/margins": 0.005661749746650457, + "rewards/rejected": -0.011639557778835297, + "rewards/safe_rewards": -0.00685838982462883, + "rewards/unsafe_rewards": -0.005097225774079561, "step": 70 }, { "epoch": 0.04, "learning_rate": 2.1505376344086022e-07, - "logits/chosen": -2.336958169937134, - "logits/rejected": -2.1193723678588867, - "logps/chosen": -221.5605010986328, - "logps/rejected": -196.53025817871094, - "loss": 58509.7937, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -0.010360611602663994, - "rewards/margins": 0.006883770227432251, - "rewards/rejected": -0.017244379967451096, - "rewards/safe_rewards": -0.011615331284701824, - "rewards/unsafe_rewards": -0.009105890057981014, + "logits/chosen": -2.4342398643493652, + "logits/rejected": -2.271530866622925, + "logps/chosen": -221.3511505126953, + "logps/rejected": -196.20684814453125, + "loss": 14430.5766, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.017918918281793594, + "rewards/margins": 0.013557764701545238, + "rewards/rejected": -0.03147668391466141, + "rewards/safe_rewards": -0.020387938246130943, + "rewards/unsafe_rewards": -0.015449894592165947, "step": 80 }, { "epoch": 0.05, "learning_rate": 2.4193548387096775e-07, - "logits/chosen": -2.3241987228393555, - "logits/rejected": -2.12538480758667, - "logps/chosen": -212.30160522460938, - "logps/rejected": -173.37435913085938, - "loss": 59040.6, - "rewards/accuracies": 0.59375, - "rewards/chosen": -0.03193185105919838, - "rewards/margins": 0.013278303667902946, - "rewards/rejected": -0.04521014913916588, - "rewards/safe_rewards": -0.03161809220910072, - "rewards/unsafe_rewards": -0.03224559873342514, + "logits/chosen": -2.4415786266326904, + "logits/rejected": -2.301713466644287, + "logps/chosen": -211.0481414794922, + "logps/rejected": -171.99159240722656, + "loss": 14501.5938, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.05115891247987747, + "rewards/margins": 0.025721188634634018, + "rewards/rejected": -0.07688009738922119, + "rewards/safe_rewards": -0.04981667920947075, + "rewards/unsafe_rewards": -0.05250114947557449, "step": 90 }, { "epoch": 0.05, "learning_rate": 2.6881720430107523e-07, - "logits/chosen": -2.3178677558898926, - "logits/rejected": -2.1205782890319824, - "logps/chosen": -206.67361450195312, - "logps/rejected": -181.85816955566406, - "loss": 59148.25, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -0.052975885570049286, - "rewards/margins": 0.020408818498253822, - "rewards/rejected": -0.07338471710681915, - "rewards/safe_rewards": -0.05294289067387581, - "rewards/unsafe_rewards": -0.05300889164209366, + "logits/chosen": -2.4574618339538574, + "logits/rejected": -2.3291263580322266, + "logps/chosen": -204.63087463378906, + "logps/rejected": -179.63002014160156, + "loss": 14364.2516, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.08542316406965256, + "rewards/margins": 0.03905550017952919, + "rewards/rejected": -0.12447866052389145, + "rewards/safe_rewards": -0.08581903576850891, + "rewards/unsafe_rewards": -0.0850272923707962, "step": 100 }, { "epoch": 0.06, "learning_rate": 2.956989247311828e-07, - "logits/chosen": -2.250896453857422, - "logits/rejected": -2.0068392753601074, - "logps/chosen": -225.31332397460938, - "logps/rejected": -190.78842163085938, - "loss": 60083.55, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -0.06683585792779922, - "rewards/margins": 0.027196455746889114, - "rewards/rejected": -0.09403230994939804, - "rewards/safe_rewards": -0.06477360427379608, - "rewards/unsafe_rewards": -0.06889811903238297, + "logits/chosen": -2.429097890853882, + "logits/rejected": -2.2684311866760254, + "logps/chosen": -222.95913696289062, + "logps/rejected": -187.91293334960938, + "loss": 14761.6688, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.11019313335418701, + "rewards/margins": 0.048916272819042206, + "rewards/rejected": -0.159109428524971, + "rewards/safe_rewards": -0.10632093250751495, + "rewards/unsafe_rewards": -0.11406532675027847, "step": 110 }, { "epoch": 0.06, "learning_rate": 3.225806451612903e-07, - "logits/chosen": -2.2784368991851807, - "logits/rejected": -1.933264970779419, - "logps/chosen": -223.38803100585938, - "logps/rejected": -174.36941528320312, - "loss": 57660.4313, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -0.060692548751831055, - "rewards/margins": 0.04699287936091423, - "rewards/rejected": -0.10768542438745499, - "rewards/safe_rewards": -0.05687148496508598, - "rewards/unsafe_rewards": -0.06451361626386642, + "logits/chosen": -2.4783873558044434, + "logits/rejected": -2.262389659881592, + "logps/chosen": -220.35800170898438, + "logps/rejected": -169.4020233154297, + "loss": 13980.0906, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.09087369590997696, + "rewards/margins": 0.07458756119012833, + "rewards/rejected": -0.1654612421989441, + "rewards/safe_rewards": -0.08417809009552002, + "rewards/unsafe_rewards": -0.0975693017244339, "step": 120 }, { "epoch": 0.07, "learning_rate": 3.4946236559139783e-07, - "logits/chosen": -2.209768295288086, - "logits/rejected": -1.977660894393921, - "logps/chosen": -232.1643829345703, - "logps/rejected": -191.14297485351562, - "loss": 55493.675, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -0.08990620076656342, - "rewards/margins": 0.056245338171720505, - "rewards/rejected": -0.14615154266357422, - "rewards/safe_rewards": -0.06849895417690277, - "rewards/unsafe_rewards": -0.11131343990564346, + "logits/chosen": -2.47499942779541, + "logits/rejected": -2.3391237258911133, + "logps/chosen": -224.6880645751953, + "logps/rejected": -181.26388549804688, + "loss": 13439.1469, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.10466276109218597, + "rewards/margins": 0.0889045000076294, + "rewards/rejected": -0.19356727600097656, + "rewards/safe_rewards": -0.07197071611881256, + "rewards/unsafe_rewards": -0.13735483586788177, "step": 130 }, { "epoch": 0.08, "learning_rate": 3.7634408602150537e-07, - "logits/chosen": -2.219853639602661, - "logits/rejected": -1.884545922279358, - "logps/chosen": -243.88595581054688, - "logps/rejected": -221.80856323242188, - "loss": 53782.9125, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -0.1624813824892044, - "rewards/margins": 0.04636792838573456, - "rewards/rejected": -0.20884928107261658, - "rewards/safe_rewards": -0.1539582759141922, - "rewards/unsafe_rewards": -0.1710045039653778, + "logits/chosen": -2.5111544132232666, + "logits/rejected": -2.323676347732544, + "logps/chosen": -232.02120971679688, + "logps/rejected": -207.3649444580078, + "loss": 13156.9813, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.20595864951610565, + "rewards/margins": 0.0673115998506546, + "rewards/rejected": -0.27327024936676025, + "rewards/safe_rewards": -0.19269119203090668, + "rewards/unsafe_rewards": -0.21922609210014343, "step": 140 }, { "epoch": 0.08, "learning_rate": 4.0322580645161285e-07, - "logits/chosen": -2.0882880687713623, - "logits/rejected": -1.8174384832382202, - "logps/chosen": -255.64419555664062, - "logps/rejected": -228.84622192382812, - "loss": 51780.8719, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -0.24349740147590637, - "rewards/margins": 0.0521768219769001, - "rewards/rejected": -0.29567423462867737, - "rewards/safe_rewards": -0.24871926009655, - "rewards/unsafe_rewards": -0.23827552795410156, + "logits/chosen": -2.440796375274658, + "logits/rejected": -2.294173002243042, + "logps/chosen": -229.3905792236328, + "logps/rejected": -201.4041748046875, + "loss": 12286.168, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.2244747132062912, + "rewards/margins": 0.09208513051271439, + "rewards/rejected": -0.3165598511695862, + "rewards/safe_rewards": -0.2293473184108734, + "rewards/unsafe_rewards": -0.21960210800170898, "step": 150 }, { "epoch": 0.09, "learning_rate": 4.3010752688172043e-07, - "logits/chosen": -2.030080795288086, - "logits/rejected": -1.682318925857544, - "logps/chosen": -258.22711181640625, - "logps/rejected": -242.5928955078125, - "loss": 55186.9938, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -0.19012892246246338, - "rewards/margins": 0.07434957474470139, - "rewards/rejected": -0.264478474855423, - "rewards/safe_rewards": -0.17074565589427948, - "rewards/unsafe_rewards": -0.2095121592283249, + "logits/chosen": -2.4001574516296387, + "logits/rejected": -2.2244515419006348, + "logps/chosen": -242.0061492919922, + "logps/rejected": -220.7064666748047, + "loss": 13450.5672, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.2182481735944748, + "rewards/margins": 0.09150904417037964, + "rewards/rejected": -0.30975720286369324, + "rewards/safe_rewards": -0.20791450142860413, + "rewards/unsafe_rewards": -0.22858186066150665, "step": 160 }, { "epoch": 0.09, "learning_rate": 4.569892473118279e-07, - "logits/chosen": -1.997047781944275, - "logits/rejected": -1.6876182556152344, - "logps/chosen": -265.75982666015625, - "logps/rejected": -229.61105346679688, - "loss": 53758.3063, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -0.2569185793399811, - "rewards/margins": 0.05510791391134262, - "rewards/rejected": -0.3120265007019043, - "rewards/safe_rewards": -0.2607461214065552, - "rewards/unsafe_rewards": -0.253091037273407, + "logits/chosen": -2.4316458702087402, + "logits/rejected": -2.3187336921691895, + "logps/chosen": -229.7932891845703, + "logps/rejected": -191.04086303710938, + "loss": 12580.4609, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.15405751764774323, + "rewards/margins": 0.08396536856889725, + "rewards/rejected": -0.23802292346954346, + "rewards/safe_rewards": -0.13653624057769775, + "rewards/unsafe_rewards": -0.1715788096189499, "step": 170 }, { "epoch": 0.1, "learning_rate": 4.838709677419355e-07, - "logits/chosen": -2.0198779106140137, - "logits/rejected": -1.6132383346557617, - "logps/chosen": -230.9585723876953, - "logps/rejected": -226.0393524169922, - "loss": 51863.425, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -0.1949550211429596, - "rewards/margins": 0.07894422113895416, - "rewards/rejected": -0.27389925718307495, - "rewards/safe_rewards": -0.20130440592765808, - "rewards/unsafe_rewards": -0.1886056363582611, + "logits/chosen": -2.473118305206299, + "logits/rejected": -2.3191967010498047, + "logps/chosen": -217.89785766601562, + "logps/rejected": -209.47775268554688, + "loss": 12886.1875, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.2591598629951477, + "rewards/margins": 0.1223045364022255, + "rewards/rejected": -0.3814643919467926, + "rewards/safe_rewards": -0.2669682800769806, + "rewards/unsafe_rewards": -0.25135138630867004, "step": 180 }, { "epoch": 0.1, "learning_rate": 4.999929391798331e-07, - "logits/chosen": -2.0746703147888184, - "logits/rejected": -1.6490455865859985, - "logps/chosen": -227.44564819335938, - "logps/rejected": -216.55239868164062, - "loss": 52794.2, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.1656118482351303, - "rewards/margins": 0.08124759048223495, - "rewards/rejected": -0.24685946106910706, - "rewards/safe_rewards": -0.1661626547574997, - "rewards/unsafe_rewards": -0.1650610715150833, + "logits/chosen": -2.4944000244140625, + "logits/rejected": -2.323819637298584, + "logps/chosen": -224.4022674560547, + "logps/rejected": -209.09341430664062, + "loss": 12964.0625, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.30063313245773315, + "rewards/margins": 0.11855147778987885, + "rewards/rejected": -0.4191845953464508, + "rewards/safe_rewards": -0.30268028378486633, + "rewards/unsafe_rewards": -0.2985859215259552, "step": 190 }, { "epoch": 0.11, "learning_rate": 4.9991350953333e-07, - "logits/chosen": -1.9676685333251953, - "logits/rejected": -1.6202901601791382, - "logps/chosen": -274.40704345703125, - "logps/rejected": -271.92388916015625, - "loss": 49774.425, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -0.27345985174179077, - "rewards/margins": 0.07984323054552078, - "rewards/rejected": -0.35330307483673096, - "rewards/safe_rewards": -0.2629985511302948, - "rewards/unsafe_rewards": -0.28392118215560913, + "logits/chosen": -2.4175021648406982, + "logits/rejected": -2.261751413345337, + "logps/chosen": -257.97100830078125, + "logps/rejected": -248.92697143554688, + "loss": 12408.5, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.3828560709953308, + "rewards/margins": 0.09374293684959412, + "rewards/rejected": -0.4765990376472473, + "rewards/safe_rewards": -0.37296319007873535, + "rewards/unsafe_rewards": -0.39274901151657104, "step": 200 }, { "epoch": 0.11, "learning_rate": 4.997458523498236e-07, - "logits/chosen": -2.046584367752075, - "logits/rejected": -1.7538659572601318, - "logps/chosen": -227.2253875732422, - "logps/rejected": -206.01016235351562, - "loss": 51264.0656, - "rewards/accuracies": 0.59375, - "rewards/chosen": -0.17535802721977234, - "rewards/margins": 0.057085663080215454, - "rewards/rejected": -0.2324436902999878, - "rewards/safe_rewards": -0.17087043821811676, - "rewards/unsafe_rewards": -0.17984560132026672, + "logits/chosen": -2.4530012607574463, + "logits/rejected": -2.3131752014160156, + "logps/chosen": -222.4829559326172, + "logps/rejected": -198.89447021484375, + "loss": 12109.7687, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.3033100664615631, + "rewards/margins": 0.09060867875814438, + "rewards/rejected": -0.3939187526702881, + "rewards/safe_rewards": -0.29972246289253235, + "rewards/unsafe_rewards": -0.30689769983291626, "step": 210 }, { "epoch": 0.12, "learning_rate": 4.99490026817712e-07, - "logits/chosen": -2.0286879539489746, - "logits/rejected": -1.7031996250152588, - "logps/chosen": -244.4539031982422, - "logps/rejected": -243.85653686523438, - "loss": 50839.2531, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -0.25039544701576233, - "rewards/margins": 0.12068897485733032, - "rewards/rejected": -0.37108439207077026, - "rewards/safe_rewards": -0.23208603262901306, - "rewards/unsafe_rewards": -0.2687048316001892, + "logits/chosen": -2.415255308151245, + "logits/rejected": -2.290353775024414, + "logps/chosen": -225.1759490966797, + "logps/rejected": -215.8201446533203, + "loss": 12286.7969, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.308417946100235, + "rewards/margins": 0.15336203575134277, + "rewards/rejected": -0.46177998185157776, + "rewards/safe_rewards": -0.2879489064216614, + "rewards/unsafe_rewards": -0.3288869559764862, "step": 220 }, { "epoch": 0.12, "learning_rate": 4.991461232516674e-07, - "logits/chosen": -2.0239181518554688, - "logits/rejected": -1.6534967422485352, - "logps/chosen": -280.6416015625, - "logps/rejected": -263.65679931640625, - "loss": 53486.9625, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -0.2528027892112732, - "rewards/margins": 0.07662861049175262, - "rewards/rejected": -0.329431414604187, - "rewards/safe_rewards": -0.2424205094575882, - "rewards/unsafe_rewards": -0.2631850838661194, + "logits/chosen": -2.403742551803589, + "logits/rejected": -2.2323994636535645, + "logps/chosen": -265.6297607421875, + "logps/rejected": -247.0828399658203, + "loss": 13271.4391, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.35555627942085266, + "rewards/margins": 0.13768498599529266, + "rewards/rejected": -0.49324122071266174, + "rewards/safe_rewards": -0.34540650248527527, + "rewards/unsafe_rewards": -0.36570602655410767, "step": 230 }, { "epoch": 0.13, "learning_rate": 4.98714263060751e-07, - "logits/chosen": -2.1409239768981934, - "logits/rejected": -1.7168201208114624, - "logps/chosen": -216.2005615234375, - "logps/rejected": -187.94398498535156, - "loss": 53698.725, + "logits/chosen": -2.5071444511413574, + "logits/rejected": -2.31742525100708, + "logps/chosen": -209.05905151367188, + "logps/rejected": -179.74855041503906, + "loss": 12854.6859, "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.1389429271221161, - "rewards/margins": 0.06751132011413574, - "rewards/rejected": -0.20645423233509064, - "rewards/safe_rewards": -0.14350251853466034, - "rewards/unsafe_rewards": -0.13438332080841064, + "rewards/chosen": -0.2065323293209076, + "rewards/margins": 0.12446670234203339, + "rewards/rejected": -0.33099907636642456, + "rewards/safe_rewards": -0.21961939334869385, + "rewards/unsafe_rewards": -0.19344526529312134, "step": 240 }, { "epoch": 0.13, "learning_rate": 4.98194598705552e-07, - "logits/chosen": -1.998090147972107, - "logits/rejected": -1.7716166973114014, - "logps/chosen": -246.077880859375, - "logps/rejected": -231.7785186767578, - "loss": 54199.1875, - "rewards/accuracies": 0.53125, - "rewards/chosen": -0.28174251317977905, - "rewards/margins": 0.04053831845521927, - "rewards/rejected": -0.3222808241844177, - "rewards/safe_rewards": -0.29175376892089844, - "rewards/unsafe_rewards": -0.2717311978340149, + "logits/chosen": -2.460813045501709, + "logits/rejected": -2.3778514862060547, + "logps/chosen": -238.38693237304688, + "logps/rejected": -223.18795776367188, + "loss": 13078.5102, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.48650604486465454, + "rewards/margins": 0.07239247858524323, + "rewards/rejected": -0.5588985681533813, + "rewards/safe_rewards": -0.4960268437862396, + "rewards/unsafe_rewards": -0.47698527574539185, "step": 250 }, { "epoch": 0.14, "learning_rate": 4.975873136443648e-07, - "logits/chosen": -2.067817211151123, - "logits/rejected": -1.7341163158416748, - "logps/chosen": -307.3098449707031, - "logps/rejected": -295.1159973144531, - "loss": 48378.2469, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -0.3813348710536957, - "rewards/margins": 0.11161359399557114, - "rewards/rejected": -0.4929484724998474, - "rewards/safe_rewards": -0.4026872515678406, - "rewards/unsafe_rewards": -0.35998252034187317, + "logits/chosen": -2.5023112297058105, + "logits/rejected": -2.350739002227783, + "logps/chosen": -268.8946228027344, + "logps/rejected": -247.2296600341797, + "loss": 11995.3344, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.3785567581653595, + "rewards/margins": 0.12816599011421204, + "rewards/rejected": -0.5067228078842163, + "rewards/safe_rewards": -0.39122840762138367, + "rewards/unsafe_rewards": -0.36588507890701294, "step": 260 }, { "epoch": 0.15, "learning_rate": 4.968926222684212e-07, - "logits/chosen": -1.9135534763336182, - "logits/rejected": -1.671265959739685, - "logps/chosen": -275.67645263671875, - "logps/rejected": -278.89312744140625, - "loss": 49364.65, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -0.3773640990257263, - "rewards/margins": 0.11218199878931046, - "rewards/rejected": -0.48954612016677856, - "rewards/safe_rewards": -0.37167319655418396, - "rewards/unsafe_rewards": -0.38305506110191345, + "logits/chosen": -2.4438796043395996, + "logits/rejected": -2.3461410999298096, + "logps/chosen": -229.61386108398438, + "logps/rejected": -223.92630004882812, + "loss": 12123.7047, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.294075608253479, + "rewards/margins": 0.13579340279102325, + "rewards/rejected": -0.42986902594566345, + "rewards/safe_rewards": -0.300296813249588, + "rewards/unsafe_rewards": -0.2878544330596924, "step": 270 }, { "epoch": 0.15, "learning_rate": 4.961107698262044e-07, - "logits/chosen": -1.8962326049804688, - "logits/rejected": -1.5111366510391235, - "logps/chosen": -293.2281799316406, - "logps/rejected": -275.4593811035156, - "loss": 49515.725, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -0.3748367428779602, - "rewards/margins": 0.09869714826345444, - "rewards/rejected": -0.4735339283943176, - "rewards/safe_rewards": -0.36353978514671326, - "rewards/unsafe_rewards": -0.38613370060920715, + "logits/chosen": -2.4015512466430664, + "logits/rejected": -2.263901948928833, + "logps/chosen": -255.80538940429688, + "logps/rejected": -228.9020233154297, + "loss": 12352.2758, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.375516414642334, + "rewards/margins": 0.10626471042633057, + "rewards/rejected": -0.48178109526634216, + "rewards/safe_rewards": -0.37126559019088745, + "rewards/unsafe_rewards": -0.37976714968681335, "step": 280 }, { "epoch": 0.16, "learning_rate": 4.952420323368673e-07, - "logits/chosen": -1.9515918493270874, - "logits/rejected": -1.717285394668579, - "logps/chosen": -269.0133361816406, - "logps/rejected": -283.813720703125, - "loss": 51060.2125, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.3880404531955719, - "rewards/margins": 0.12494431436061859, - "rewards/rejected": -0.5129847526550293, - "rewards/safe_rewards": -0.40422338247299194, - "rewards/unsafe_rewards": -0.37185752391815186, + "logits/chosen": -2.4498801231384277, + "logits/rejected": -2.382075548171997, + "logps/chosen": -239.36386108398438, + "logps/rejected": -241.67861938476562, + "loss": 12562.6453, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.47904032468795776, + "rewards/margins": 0.12548959255218506, + "rewards/rejected": -0.6045299172401428, + "rewards/safe_rewards": -0.4896617829799652, + "rewards/unsafe_rewards": -0.46841883659362793, "step": 290 }, { "epoch": 0.16, "learning_rate": 4.942867164927899e-07, - "logits/chosen": -1.9774839878082275, - "logits/rejected": -1.7034270763397217, - "logps/chosen": -274.8741455078125, - "logps/rejected": -267.0771484375, - "loss": 53100.8938, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.36294037103652954, - "rewards/margins": 0.12649540603160858, - "rewards/rejected": -0.48943576216697693, - "rewards/safe_rewards": -0.36649006605148315, - "rewards/unsafe_rewards": -0.35939061641693115, + "logits/chosen": -2.448484182357788, + "logits/rejected": -2.3499300479888916, + "logps/chosen": -233.24447631835938, + "logps/rejected": -213.6923828125, + "loss": 13096.7359, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.3095199167728424, + "rewards/margins": 0.13528046011924744, + "rewards/rejected": -0.44480031728744507, + "rewards/safe_rewards": -0.2981899082660675, + "rewards/unsafe_rewards": -0.3208498954772949, + "step": 300 + }, + { + "epoch": 0.16, + "eval_logits/chosen": -2.319474458694458, + "eval_logits/rejected": -2.1815290451049805, + "eval_logps/chosen": -170.00274658203125, + "eval_logps/rejected": -140.18304443359375, + "eval_loss": 4529.67333984375, + "eval_rewards/accuracies": 0.6583978533744812, + "eval_rewards/chosen": -0.39572206139564514, + "eval_rewards/margins": 0.08145187050104141, + "eval_rewards/rejected": -0.4771738648414612, + "eval_rewards/safe_rewards": -0.39296460151672363, + "eval_rewards/unsafe_rewards": -0.3956325650215149, + "eval_runtime": 996.6135, + "eval_samples_per_second": 33.156, + "eval_steps_per_second": 1.037, "step": 300 }, { "epoch": 0.17, "learning_rate": 4.932451595513062e-07, - "logits/chosen": -2.0701467990875244, - "logits/rejected": -1.671668291091919, - "logps/chosen": -291.11383056640625, - "logps/rejected": -287.5212097167969, - "loss": 47355.6312, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -0.3607241213321686, - "rewards/margins": 0.13552391529083252, - "rewards/rejected": -0.4962480068206787, - "rewards/safe_rewards": -0.36866235733032227, - "rewards/unsafe_rewards": -0.3527859151363373, + "logits/chosen": -2.464543342590332, + "logits/rejected": -2.3167717456817627, + "logps/chosen": -244.10421752929688, + "logps/rejected": -229.1746368408203, + "loss": 11442.1586, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.25108686089515686, + "rewards/margins": 0.15766267478466034, + "rewards/rejected": -0.408749520778656, + "rewards/safe_rewards": -0.2550010681152344, + "rewards/unsafe_rewards": -0.24717266857624054, "step": 310 }, { "epoch": 0.17, "learning_rate": 4.921177292156419e-07, - "logits/chosen": -2.0575995445251465, - "logits/rejected": -1.617762804031372, - "logps/chosen": -293.3443603515625, - "logps/rejected": -299.7580261230469, - "loss": 46413.5, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.436631441116333, - "rewards/margins": 0.15407030284404755, - "rewards/rejected": -0.5907017588615417, - "rewards/safe_rewards": -0.4272303581237793, - "rewards/unsafe_rewards": -0.4460326135158539, + "logits/chosen": -2.486952543258667, + "logits/rejected": -2.317573070526123, + "logps/chosen": -234.8188934326172, + "logps/rejected": -228.7853240966797, + "loss": 11535.2578, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.2879149317741394, + "rewards/margins": 0.18317629396915436, + "rewards/rejected": -0.47109121084213257, + "rewards/safe_rewards": -0.2716086506843567, + "rewards/unsafe_rewards": -0.30422115325927734, "step": 320 }, { "epoch": 0.18, "learning_rate": 4.909048235051033e-07, - "logits/chosen": -1.8155206441879272, - "logits/rejected": -1.5518898963928223, - "logps/chosen": -310.9698181152344, - "logps/rejected": -319.9571838378906, - "loss": 46899.1125, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -0.5577954053878784, - "rewards/margins": 0.12865307927131653, - "rewards/rejected": -0.6864485740661621, - "rewards/safe_rewards": -0.5377126932144165, - "rewards/unsafe_rewards": -0.5778781175613403, + "logits/chosen": -2.408730983734131, + "logits/rejected": -2.313384532928467, + "logps/chosen": -240.1051483154297, + "logps/rejected": -238.6286163330078, + "loss": 11998.6938, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.4070356488227844, + "rewards/margins": 0.15228822827339172, + "rewards/rejected": -0.559323787689209, + "rewards/safe_rewards": -0.38683491945266724, + "rewards/unsafe_rewards": -0.42723625898361206, "step": 330 }, { "epoch": 0.18, "learning_rate": 4.896068706145631e-07, - "logits/chosen": -1.8018848896026611, - "logits/rejected": -1.4418330192565918, - "logps/chosen": -329.0890808105469, - "logps/rejected": -306.06854248046875, - "loss": 49554.8969, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -0.5919759273529053, - "rewards/margins": 0.13498583436012268, - "rewards/rejected": -0.7269617319107056, - "rewards/safe_rewards": -0.5998921394348145, - "rewards/unsafe_rewards": -0.5840597152709961, + "logits/chosen": -2.430915355682373, + "logits/rejected": -2.3069653511047363, + "logps/chosen": -263.82440185546875, + "logps/rejected": -228.1202392578125, + "loss": 12415.2109, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.5314391255378723, + "rewards/margins": 0.14294834434986115, + "rewards/rejected": -0.674387514591217, + "rewards/safe_rewards": -0.5249046087265015, + "rewards/unsafe_rewards": -0.5379736423492432, "step": 340 }, { "epoch": 0.19, "learning_rate": 4.882243287632946e-07, - "logits/chosen": -2.0392603874206543, - "logits/rejected": -1.6928720474243164, - "logps/chosen": -268.0147399902344, - "logps/rejected": -274.80975341796875, - "loss": 48110.2406, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -0.3750418424606323, - "rewards/margins": 0.09303764253854752, - "rewards/rejected": -0.46807947754859924, - "rewards/safe_rewards": -0.3772274851799011, - "rewards/unsafe_rewards": -0.37285616993904114, + "logits/chosen": -2.5057172775268555, + "logits/rejected": -2.3827576637268066, + "logps/chosen": -237.1790771484375, + "logps/rejected": -237.27392578125, + "loss": 11917.3391, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.44174814224243164, + "rewards/margins": 0.1188526302576065, + "rewards/rejected": -0.5606008172035217, + "rewards/safe_rewards": -0.4454471170902252, + "rewards/unsafe_rewards": -0.4380492568016052, "step": 350 }, { "epoch": 0.19, "learning_rate": 4.867576860332048e-07, - "logits/chosen": -1.9731874465942383, - "logits/rejected": -1.6687225103378296, - "logps/chosen": -267.7218322753906, - "logps/rejected": -293.46722412109375, - "loss": 48097.2562, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.4783262610435486, - "rewards/margins": 0.1437402069568634, - "rewards/rejected": -0.6220664978027344, - "rewards/safe_rewards": -0.5029984712600708, - "rewards/unsafe_rewards": -0.45365405082702637, + "logits/chosen": -2.5187079906463623, + "logits/rejected": -2.4128689765930176, + "logps/chosen": -207.39404296875, + "logps/rejected": -220.6639404296875, + "loss": 11619.4062, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.353740930557251, + "rewards/margins": 0.16230328381061554, + "rewards/rejected": -0.5160442590713501, + "rewards/safe_rewards": -0.37073829770088196, + "rewards/unsafe_rewards": -0.33674362301826477, "step": 360 }, { "epoch": 0.2, "learning_rate": 4.85207460196526e-07, - "logits/chosen": -1.8471896648406982, - "logits/rejected": -1.4721193313598633, - "logps/chosen": -328.16876220703125, - "logps/rejected": -336.55218505859375, - "loss": 49041.6687, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -0.6189309358596802, - "rewards/margins": 0.138859361410141, - "rewards/rejected": -0.7577903866767883, - "rewards/safe_rewards": -0.6294093132019043, - "rewards/unsafe_rewards": -0.6084526777267456, + "logits/chosen": -2.5497612953186035, + "logits/rejected": -2.4256086349487305, + "logps/chosen": -251.84036254882812, + "logps/rejected": -246.384033203125, + "loss": 11846.7016, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.47489339113235474, + "rewards/margins": 0.13903862237930298, + "rewards/rejected": -0.6139320135116577, + "rewards/safe_rewards": -0.4880181849002838, + "rewards/unsafe_rewards": -0.46176856756210327, "step": 370 }, { "epoch": 0.2, "learning_rate": 4.835741985330259e-07, - "logits/chosen": -1.7684574127197266, - "logits/rejected": -1.360176682472229, - "logps/chosen": -291.7095642089844, - "logps/rejected": -290.01031494140625, - "loss": 47190.6, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -0.45449382066726685, - "rewards/margins": 0.1359688937664032, - "rewards/rejected": -0.5904628038406372, - "rewards/safe_rewards": -0.42255863547325134, - "rewards/unsafe_rewards": -0.48642903566360474, + "logits/chosen": -2.5404162406921387, + "logits/rejected": -2.4239704608917236, + "logps/chosen": -246.53939819335938, + "logps/rejected": -233.34353637695312, + "loss": 11351.725, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.45782560110092163, + "rewards/margins": 0.15662749111652374, + "rewards/rejected": -0.614453136920929, + "rewards/safe_rewards": -0.42890676856040955, + "rewards/unsafe_rewards": -0.48674440383911133, "step": 380 }, { "epoch": 0.21, "learning_rate": 4.818584776367992e-07, - "logits/chosen": -1.542189598083496, - "logits/rejected": -1.2524335384368896, - "logps/chosen": -304.78131103515625, - "logps/rejected": -325.6260986328125, - "loss": 47781.7688, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -0.5043835043907166, - "rewards/margins": 0.1637241244316101, - "rewards/rejected": -0.6681076288223267, - "rewards/safe_rewards": -0.5346864461898804, - "rewards/unsafe_rewards": -0.4740806519985199, + "logits/chosen": -2.5068519115448, + "logits/rejected": -2.4303596019744873, + "logps/chosen": -244.3930206298828, + "logps/rejected": -249.14187622070312, + "loss": 11726.7609, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.40478819608688354, + "rewards/margins": 0.16650350391864777, + "rewards/rejected": -0.5712917447090149, + "rewards/safe_rewards": -0.43173927068710327, + "rewards/unsafe_rewards": -0.377837210893631, "step": 390 }, { "epoch": 0.22, "learning_rate": 4.800609032127122e-07, - "logits/chosen": -1.6008304357528687, - "logits/rejected": -1.172656536102295, - "logps/chosen": -339.5008544921875, - "logps/rejected": -321.77838134765625, - "loss": 49869.325, - "rewards/accuracies": 0.625, - "rewards/chosen": -0.6697827577590942, - "rewards/margins": 0.0950266569852829, - "rewards/rejected": -0.7648094296455383, - "rewards/safe_rewards": -0.643047571182251, - "rewards/unsafe_rewards": -0.6965180039405823, + "logits/chosen": -2.4628491401672363, + "logits/rejected": -2.374331474304199, + "logps/chosen": -254.27822875976562, + "logps/rejected": -228.73330688476562, + "loss": 12046.5695, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.48730865120887756, + "rewards/margins": 0.11220350116491318, + "rewards/rejected": -0.5995121598243713, + "rewards/safe_rewards": -0.4591635763645172, + "rewards/unsafe_rewards": -0.5154536962509155, "step": 400 }, { "epoch": 0.22, "learning_rate": 4.78182109862569e-07, - "logits/chosen": -1.600804328918457, - "logits/rejected": -1.4069411754608154, - "logps/chosen": -276.9542541503906, - "logps/rejected": -286.2479248046875, - "loss": 51331.9031, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -0.42776933312416077, - "rewards/margins": 0.10375501960515976, - "rewards/rejected": -0.5315243005752563, - "rewards/safe_rewards": -0.40943676233291626, - "rewards/unsafe_rewards": -0.4461018145084381, + "logits/chosen": -2.4446866512298584, + "logits/rejected": -2.408660650253296, + "logps/chosen": -230.39208984375, + "logps/rejected": -231.1884765625, + "loss": 12854.2617, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.3896816670894623, + "rewards/margins": 0.12279312312602997, + "rewards/rejected": -0.5124748945236206, + "rewards/safe_rewards": -0.35095858573913574, + "rewards/unsafe_rewards": -0.4284047484397888, "step": 410 }, { "epoch": 0.23, "learning_rate": 4.7622276086107677e-07, - "logits/chosen": -1.656137466430664, - "logits/rejected": -1.2384175062179565, - "logps/chosen": -307.77423095703125, - "logps/rejected": -313.8507385253906, - "loss": 48322.2312, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -0.48740506172180176, - "rewards/margins": 0.1517179012298584, - "rewards/rejected": -0.6391229629516602, - "rewards/safe_rewards": -0.4884099066257477, - "rewards/unsafe_rewards": -0.48640021681785583, + "logits/chosen": -2.4900124073028564, + "logits/rejected": -2.3827693462371826, + "logps/chosen": -251.73007202148438, + "logps/rejected": -243.5518341064453, + "loss": 12053.4797, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.4146080017089844, + "rewards/margins": 0.16079458594322205, + "rewards/rejected": -0.5754026174545288, + "rewards/safe_rewards": -0.4179013669490814, + "rewards/unsafe_rewards": -0.41131457686424255, "step": 420 }, { "epoch": 0.23, "learning_rate": 4.741835479216879e-07, - "logits/chosen": -1.4653375148773193, - "logits/rejected": -0.8188030123710632, - "logps/chosen": -362.851806640625, - "logps/rejected": -342.6094055175781, - "loss": 47674.475, + "logits/chosen": -2.445971727371216, + "logits/rejected": -2.286686420440674, + "logps/chosen": -294.9362487792969, + "logps/rejected": -262.58404541015625, + "loss": 11785.9859, "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -0.5799063444137573, - "rewards/margins": 0.14497725665569305, - "rewards/rejected": -0.7248835563659668, - "rewards/safe_rewards": -0.6209739446640015, - "rewards/unsafe_rewards": -0.5388387441635132, + "rewards/chosen": -0.48062044382095337, + "rewards/margins": 0.16858412325382233, + "rewards/rejected": -0.6492044925689697, + "rewards/safe_rewards": -0.5107543468475342, + "rewards/unsafe_rewards": -0.450486421585083, "step": 430 }, { "epoch": 0.24, "learning_rate": 4.720651909524036e-07, - "logits/chosen": -1.1252074241638184, - "logits/rejected": -0.5117732286453247, - "logps/chosen": -297.4981994628906, - "logps/rejected": -291.9035949707031, - "loss": 47749.0969, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -0.507114052772522, - "rewards/margins": 0.12389920651912689, - "rewards/rejected": -0.6310132145881653, - "rewards/safe_rewards": -0.5425899624824524, - "rewards/unsafe_rewards": -0.47163811326026917, + "logits/chosen": -2.3577795028686523, + "logits/rejected": -2.2147059440612793, + "logps/chosen": -236.0032196044922, + "logps/rejected": -218.00820922851562, + "loss": 11980.575, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.3994632661342621, + "rewards/margins": 0.1237812489271164, + "rewards/rejected": -0.5232445001602173, + "rewards/safe_rewards": -0.4401687681674957, + "rewards/unsafe_rewards": -0.3587578237056732, "step": 440 }, { "epoch": 0.24, "learning_rate": 4.698684378016222e-07, - "logits/chosen": -0.8045175671577454, - "logits/rejected": -0.15142253041267395, - "logps/chosen": -300.20233154296875, - "logps/rejected": -310.9920654296875, - "loss": 48739.55, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.5643971562385559, - "rewards/margins": 0.13239194452762604, - "rewards/rejected": -0.6967890858650208, - "rewards/safe_rewards": -0.5555019378662109, - "rewards/unsafe_rewards": -0.5732922554016113, + "logits/chosen": -2.2142910957336426, + "logits/rejected": -2.0808310508728027, + "logps/chosen": -249.1070556640625, + "logps/rejected": -246.5348663330078, + "loss": 12188.9062, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.6178480982780457, + "rewards/margins": 0.13151735067367554, + "rewards/rejected": -0.7493655681610107, + "rewards/safe_rewards": -0.6169500350952148, + "rewards/unsafe_rewards": -0.6187463402748108, "step": 450 }, { "epoch": 0.25, "learning_rate": 4.675940639941256e-07, - "logits/chosen": -0.637567400932312, - "logits/rejected": 0.17254072427749634, - "logps/chosen": -340.4341125488281, - "logps/rejected": -343.8965148925781, - "loss": 47404.7906, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -0.6787372827529907, - "rewards/margins": 0.1538914442062378, - "rewards/rejected": -0.832628607749939, - "rewards/safe_rewards": -0.642540693283081, - "rewards/unsafe_rewards": -0.7149337530136108, + "logits/chosen": -2.2552475929260254, + "logits/rejected": -2.087047815322876, + "logps/chosen": -268.5553283691406, + "logps/rejected": -256.51849365234375, + "loss": 11744.7563, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.6386964917182922, + "rewards/margins": 0.15222086012363434, + "rewards/rejected": -0.7909173965454102, + "rewards/safe_rewards": -0.6151617765426636, + "rewards/unsafe_rewards": -0.6622311472892761, "step": 460 }, { "epoch": 0.25, "learning_rate": 4.6524287245729286e-07, - "logits/chosen": -0.46970778703689575, - "logits/rejected": 0.11218448728322983, - "logps/chosen": -324.87884521484375, - "logps/rejected": -330.160888671875, - "loss": 47668.7719, + "logits/chosen": -2.2173266410827637, + "logits/rejected": -2.0759875774383545, + "logps/chosen": -247.02444458007812, + "logps/rejected": -235.70431518554688, + "loss": 11539.7117, "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -0.6513844728469849, - "rewards/margins": 0.16710397601127625, - "rewards/rejected": -0.8184884190559387, - "rewards/safe_rewards": -0.6502984762191772, - "rewards/unsafe_rewards": -0.6524705290794373, + "rewards/chosen": -0.5241425633430481, + "rewards/margins": 0.1680615395307541, + "rewards/rejected": -0.6922041177749634, + "rewards/safe_rewards": -0.5475858449935913, + "rewards/unsafe_rewards": -0.5006993412971497, "step": 470 }, { "epoch": 0.26, "learning_rate": 4.628156932376418e-07, - "logits/chosen": -1.2571052312850952, - "logits/rejected": -0.4860140383243561, - "logps/chosen": -310.5690612792969, - "logps/rejected": -300.1791076660156, - "loss": 47230.5844, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -0.5835075974464417, - "rewards/margins": 0.16074423491954803, - "rewards/rejected": -0.7442517280578613, - "rewards/safe_rewards": -0.5902873873710632, - "rewards/unsafe_rewards": -0.5767275094985962, + "logits/chosen": -2.254505157470703, + "logits/rejected": -2.057737112045288, + "logps/chosen": -252.7125244140625, + "logps/rejected": -227.06204223632812, + "loss": 11626.5016, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.5887488126754761, + "rewards/margins": 0.16838543117046356, + "rewards/rejected": -0.7571342587471008, + "rewards/safe_rewards": -0.5993281602859497, + "rewards/unsafe_rewards": -0.5781695246696472, "step": 480 }, { "epoch": 0.26, "learning_rate": 4.603133832077953e-07, - "logits/chosen": -1.9671266078948975, - "logits/rejected": -1.6400283575057983, - "logps/chosen": -341.34503173828125, - "logps/rejected": -352.0642395019531, - "loss": 47158.0844, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -0.5433287024497986, - "rewards/margins": 0.14422836899757385, - "rewards/rejected": -0.68755704164505, - "rewards/safe_rewards": -0.5558397769927979, - "rewards/unsafe_rewards": -0.5308177471160889, + "logits/chosen": -2.2982699871063232, + "logits/rejected": -2.1872057914733887, + "logps/chosen": -293.5793762207031, + "logps/rejected": -291.3560791015625, + "loss": 11510.6367, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.609082043170929, + "rewards/margins": 0.1590423583984375, + "rewards/rejected": -0.7681244015693665, + "rewards/safe_rewards": -0.6237068176269531, + "rewards/unsafe_rewards": -0.5944572687149048, "step": 490 }, { "epoch": 0.27, "learning_rate": 4.5773682576397776e-07, - "logits/chosen": -2.0917410850524902, - "logits/rejected": -1.8676202297210693, - "logps/chosen": -287.2514953613281, - "logps/rejected": -285.0293884277344, - "loss": 48756.7969, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -0.43676862120628357, - "rewards/margins": 0.12084832042455673, - "rewards/rejected": -0.5576169490814209, - "rewards/safe_rewards": -0.43437495827674866, - "rewards/unsafe_rewards": -0.43916234374046326, - "step": 500 - }, - { - "epoch": 0.27, - "eval_logits/chosen": -1.804201602935791, - "eval_logits/rejected": -1.4277713298797607, - "eval_logps/chosen": -260.77484130859375, - "eval_logps/rejected": -230.28091430664062, - "eval_loss": 17836.07421875, - "eval_rewards/accuracies": 0.5688528418540955, - "eval_rewards/chosen": -0.6516796946525574, - "eval_rewards/margins": 0.03736867383122444, - "eval_rewards/rejected": -0.6890482902526855, - "eval_rewards/safe_rewards": -0.6482035517692566, - "eval_rewards/unsafe_rewards": -0.6494295001029968, - "eval_runtime": 1060.9169, - "eval_samples_per_second": 31.147, - "eval_steps_per_second": 0.974, + "logits/chosen": -2.2795727252960205, + "logits/rejected": -2.150864839553833, + "logps/chosen": -247.63088989257812, + "logps/rejected": -234.78408813476562, + "loss": 11965.7203, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.47748714685440063, + "rewards/margins": 0.13509830832481384, + "rewards/rejected": -0.6125854849815369, + "rewards/safe_rewards": -0.48745498061180115, + "rewards/unsafe_rewards": -0.4675193428993225, "step": 500 }, { "epoch": 0.27, "learning_rate": 4.5508693051414774e-07, - "logits/chosen": -2.2481167316436768, - "logits/rejected": -2.08062744140625, - "logps/chosen": -289.83843994140625, - "logps/rejected": -295.86517333984375, - "loss": 46488.1156, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.46742701530456543, - "rewards/margins": 0.13163916766643524, - "rewards/rejected": -0.5990661382675171, - "rewards/safe_rewards": -0.4728309214115143, - "rewards/unsafe_rewards": -0.4620230793952942, + "logits/chosen": -2.3894762992858887, + "logits/rejected": -2.2791035175323486, + "logps/chosen": -240.63394165039062, + "logps/rejected": -236.6797332763672, + "loss": 11414.6078, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.4428652822971344, + "rewards/margins": 0.16295991837978363, + "rewards/rejected": -0.6058252453804016, + "rewards/safe_rewards": -0.4437985420227051, + "rewards/unsafe_rewards": -0.44193196296691895, "step": 510 }, { "epoch": 0.28, "learning_rate": 4.52364632956877e-07, - "logits/chosen": -2.188800096511841, - "logits/rejected": -1.9668811559677124, - "logps/chosen": -305.479248046875, - "logps/rejected": -278.39288330078125, - "loss": 52177.75, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -0.4467028081417084, - "rewards/margins": 0.10257605463266373, - "rewards/rejected": -0.5492788553237915, - "rewards/safe_rewards": -0.4408493936061859, - "rewards/unsafe_rewards": -0.45255613327026367, + "logits/chosen": -2.376743793487549, + "logits/rejected": -2.2216954231262207, + "logps/chosen": -262.9234313964844, + "logps/rejected": -226.41116333007812, + "loss": 12766.6898, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.4680143892765045, + "rewards/margins": 0.11080136150121689, + "rewards/rejected": -0.578815758228302, + "rewards/safe_rewards": -0.4691530764102936, + "rewards/unsafe_rewards": -0.4668757915496826, "step": 520 }, { "epoch": 0.29, "learning_rate": 4.4957089415108895e-07, - "logits/chosen": -1.9463974237442017, - "logits/rejected": -1.6417328119277954, - "logps/chosen": -292.5940856933594, - "logps/rejected": -324.30706787109375, - "loss": 46336.2125, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -0.48848190903663635, - "rewards/margins": 0.18697507679462433, - "rewards/rejected": -0.6754569411277771, - "rewards/safe_rewards": -0.4803538918495178, - "rewards/unsafe_rewards": -0.49660998582839966, + "logits/chosen": -2.3553097248077393, + "logits/rejected": -2.2464723587036133, + "logps/chosen": -242.263671875, + "logps/rejected": -255.6951141357422, + "loss": 11431.9516, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.4732840061187744, + "rewards/margins": 0.19163894653320312, + "rewards/rejected": -0.6649229526519775, + "rewards/safe_rewards": -0.4660876393318176, + "rewards/unsafe_rewards": -0.4804803729057312, "step": 530 }, { "epoch": 0.29, "learning_rate": 4.467067003767745e-07, - "logits/chosen": -1.8115772008895874, - "logits/rejected": -1.3471866846084595, - "logps/chosen": -317.49114990234375, - "logps/rejected": -330.3946533203125, - "loss": 48224.0813, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -0.622870147228241, - "rewards/margins": 0.19709816575050354, - "rewards/rejected": -0.8199684023857117, - "rewards/safe_rewards": -0.625496506690979, - "rewards/unsafe_rewards": -0.6202439069747925, + "logits/chosen": -2.3562865257263184, + "logits/rejected": -2.181304931640625, + "logps/chosen": -247.70449829101562, + "logps/rejected": -240.8135528564453, + "loss": 12058.2938, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.5476623177528381, + "rewards/margins": 0.19659535586833954, + "rewards/rejected": -0.7442576885223389, + "rewards/safe_rewards": -0.5530461072921753, + "rewards/unsafe_rewards": -0.5422784686088562, "step": 540 }, { "epoch": 0.3, "learning_rate": 4.437730627868027e-07, - "logits/chosen": -1.8123080730438232, - "logits/rejected": -1.3614463806152344, - "logps/chosen": -293.47314453125, - "logps/rejected": -301.69744873046875, - "loss": 45175.2812, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -0.5827856063842773, - "rewards/margins": 0.19569487869739532, - "rewards/rejected": -0.7784804701805115, - "rewards/safe_rewards": -0.5721400380134583, - "rewards/unsafe_rewards": -0.5934312343597412, + "logits/chosen": -2.3266587257385254, + "logits/rejected": -2.120724678039551, + "logps/chosen": -231.5970916748047, + "logps/rejected": -221.3756103515625, + "loss": 11200.6187, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.5469081997871399, + "rewards/margins": 0.20715579390525818, + "rewards/rejected": -0.7540639638900757, + "rewards/safe_rewards": -0.5482960939407349, + "rewards/unsafe_rewards": -0.5455203652381897, "step": 550 }, { "epoch": 0.3, "learning_rate": 4.4077101704995163e-07, - "logits/chosen": -1.9096673727035522, - "logits/rejected": -1.5819532871246338, - "logps/chosen": -296.38641357421875, - "logps/rejected": -306.5096130371094, - "loss": 46444.8812, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.5214236974716187, - "rewards/margins": 0.14416955411434174, - "rewards/rejected": -0.6655932068824768, - "rewards/safe_rewards": -0.5140654444694519, - "rewards/unsafe_rewards": -0.5287817716598511, + "logits/chosen": -2.3897128105163574, + "logits/rejected": -2.2368927001953125, + "logps/chosen": -238.13796997070312, + "logps/rejected": -232.872314453125, + "loss": 11633.2375, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.4596262574195862, + "rewards/margins": 0.13524024188518524, + "rewards/rejected": -0.5948664546012878, + "rewards/safe_rewards": -0.4546341300010681, + "rewards/unsafe_rewards": -0.4646182954311371, "step": 560 }, { "epoch": 0.31, "learning_rate": 4.3770162298528356e-07, - "logits/chosen": -2.028366804122925, - "logits/rejected": -1.7259242534637451, - "logps/chosen": -292.74237060546875, - "logps/rejected": -279.4795227050781, - "loss": 48454.6937, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -0.4893677234649658, - "rewards/margins": 0.1378137469291687, - "rewards/rejected": -0.6271814107894897, - "rewards/safe_rewards": -0.4714062809944153, - "rewards/unsafe_rewards": -0.5073291063308716, + "logits/chosen": -2.396641969680786, + "logits/rejected": -2.2687771320343018, + "logps/chosen": -244.2262420654297, + "logps/rejected": -219.45248413085938, + "loss": 12088.3172, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.493887335062027, + "rewards/margins": 0.15973922610282898, + "rewards/rejected": -0.653626561164856, + "rewards/safe_rewards": -0.48037633299827576, + "rewards/unsafe_rewards": -0.5073983073234558, "step": 570 }, { "epoch": 0.31, "learning_rate": 4.3456596418799476e-07, - "logits/chosen": -1.9284662008285522, - "logits/rejected": -1.6226657629013062, - "logps/chosen": -310.54779052734375, - "logps/rejected": -310.19232177734375, - "loss": 42358.225, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -0.5339833498001099, - "rewards/margins": 0.15684859454631805, - "rewards/rejected": -0.6908319592475891, - "rewards/safe_rewards": -0.5378093719482422, - "rewards/unsafe_rewards": -0.5301573872566223, + "logits/chosen": -2.3043763637542725, + "logits/rejected": -2.173457145690918, + "logps/chosen": -257.493896484375, + "logps/rejected": -241.50222778320312, + "loss": 10751.843, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.5369631052017212, + "rewards/margins": 0.15752388536930084, + "rewards/rejected": -0.6944869756698608, + "rewards/safe_rewards": -0.5310293436050415, + "rewards/unsafe_rewards": -0.5428968667984009, "step": 580 }, { "epoch": 0.32, "learning_rate": 4.313651476468715e-07, - "logits/chosen": -1.5436499118804932, - "logits/rejected": -1.1208113431930542, - "logps/chosen": -316.5739440917969, - "logps/rejected": -325.2558288574219, - "loss": 44966.8812, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -0.6000299453735352, - "rewards/margins": 0.16303375363349915, - "rewards/rejected": -0.7630637288093567, - "rewards/safe_rewards": -0.6396278142929077, - "rewards/unsafe_rewards": -0.5604321360588074, + "logits/chosen": -2.270984172821045, + "logits/rejected": -2.1114678382873535, + "logps/chosen": -245.8052520751953, + "logps/rejected": -238.4367218017578, + "loss": 11156.9102, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.4919458329677582, + "rewards/margins": 0.16596952080726624, + "rewards/rejected": -0.6579153537750244, + "rewards/safe_rewards": -0.533513605594635, + "rewards/unsafe_rewards": -0.45037803053855896, "step": 590 }, { "epoch": 0.32, "learning_rate": 4.2810030335348693e-07, - "logits/chosen": -1.4212372303009033, - "logits/rejected": -0.8773279190063477, - "logps/chosen": -322.83013916015625, - "logps/rejected": -308.69482421875, - "loss": 45772.8812, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -0.5812667608261108, - "rewards/margins": 0.13742713630199432, - "rewards/rejected": -0.7186940312385559, - "rewards/safe_rewards": -0.5496013760566711, - "rewards/unsafe_rewards": -0.6129323244094849, + "logits/chosen": -2.199920177459717, + "logits/rejected": -2.001626491546631, + "logps/chosen": -263.06634521484375, + "logps/rejected": -236.2901153564453, + "loss": 11584.7875, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5653510689735413, + "rewards/margins": 0.14779801666736603, + "rewards/rejected": -0.7131490707397461, + "rewards/safe_rewards": -0.5651000142097473, + "rewards/unsafe_rewards": -0.56560218334198, + "step": 600 + }, + { + "epoch": 0.32, + "eval_logits/chosen": -1.993375301361084, + "eval_logits/rejected": -1.7938207387924194, + "eval_logps/chosen": -211.25747680664062, + "eval_logps/rejected": -180.6571044921875, + "eval_loss": 4406.71337890625, + "eval_rewards/accuracies": 0.6338334679603577, + "eval_rewards/chosen": -0.8082689642906189, + "eval_rewards/margins": 0.07364560663700104, + "eval_rewards/rejected": -0.8819145560264587, + "eval_rewards/safe_rewards": -0.8027816414833069, + "eval_rewards/unsafe_rewards": -0.8050090670585632, + "eval_runtime": 995.4426, + "eval_samples_per_second": 33.195, + "eval_steps_per_second": 1.038, "step": 600 }, { "epoch": 0.33, "learning_rate": 4.2477258390327806e-07, - "logits/chosen": -1.7338860034942627, - "logits/rejected": -1.173690915107727, - "logps/chosen": -292.8702392578125, - "logps/rejected": -315.7215881347656, - "loss": 45187.8, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -0.539310097694397, - "rewards/margins": 0.17059774696826935, - "rewards/rejected": -0.7099078893661499, - "rewards/safe_rewards": -0.5670661926269531, - "rewards/unsafe_rewards": -0.5115541219711304, + "logits/chosen": -2.2489686012268066, + "logits/rejected": -2.072410821914673, + "logps/chosen": -241.9006805419922, + "logps/rejected": -251.07376098632812, + "loss": 11304.5898, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.5690335035324097, + "rewards/margins": 0.20418229699134827, + "rewards/rejected": -0.7732157707214355, + "rewards/safe_rewards": -0.5881134271621704, + "rewards/unsafe_rewards": -0.5499535799026489, "step": 610 }, { "epoch": 0.33, "learning_rate": 4.2138316408864197e-07, - "logits/chosen": -1.934067964553833, - "logits/rejected": -1.371891736984253, - "logps/chosen": -311.4843444824219, - "logps/rejected": -325.8730163574219, - "loss": 41554.4938, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -0.5662498474121094, - "rewards/margins": 0.2509652376174927, - "rewards/rejected": -0.817215085029602, - "rewards/safe_rewards": -0.5518594980239868, - "rewards/unsafe_rewards": -0.5806401968002319, + "logits/chosen": -2.334721803665161, + "logits/rejected": -2.150754928588867, + "logps/chosen": -246.8868865966797, + "logps/rejected": -237.44033813476562, + "loss": 10365.4461, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.48651427030563354, + "rewards/margins": 0.2631617486476898, + "rewards/rejected": -0.7496760487556458, + "rewards/safe_rewards": -0.4882384240627289, + "rewards/unsafe_rewards": -0.4847901463508606, "step": 620 }, { "epoch": 0.34, "learning_rate": 4.179332404841962e-07, - "logits/chosen": -1.474639892578125, - "logits/rejected": -0.817841649055481, - "logps/chosen": -351.102783203125, - "logps/rejected": -350.62420654296875, - "loss": 44838.0969, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.7048746347427368, - "rewards/margins": 0.1902434527873993, - "rewards/rejected": -0.8951181173324585, - "rewards/safe_rewards": -0.7227796912193298, - "rewards/unsafe_rewards": -0.6869696378707886, + "logits/chosen": -2.2644031047821045, + "logits/rejected": -2.0903661251068115, + "logps/chosen": -267.8985290527344, + "logps/rejected": -251.5312042236328, + "loss": 10894.2219, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.5778486728668213, + "rewards/margins": 0.22166796028614044, + "rewards/rejected": -0.7995165586471558, + "rewards/safe_rewards": -0.5826883316040039, + "rewards/unsafe_rewards": -0.5730089545249939, "step": 630 }, { "epoch": 0.34, "learning_rate": 4.1442403102434954e-07, - "logits/chosen": -1.323033332824707, - "logits/rejected": -0.587798535823822, - "logps/chosen": -355.1231994628906, - "logps/rejected": -358.1360778808594, - "loss": 47672.8438, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -0.6757093667984009, - "rewards/margins": 0.22136211395263672, - "rewards/rejected": -0.8970714807510376, - "rewards/safe_rewards": -0.6709599494934082, - "rewards/unsafe_rewards": -0.680458664894104, + "logits/chosen": -2.282378911972046, + "logits/rejected": -2.113847255706787, + "logps/chosen": -270.49737548828125, + "logps/rejected": -250.124755859375, + "loss": 11471.2906, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.5048245787620544, + "rewards/margins": 0.20923128724098206, + "rewards/rejected": -0.7140558958053589, + "rewards/safe_rewards": -0.5091570615768433, + "rewards/unsafe_rewards": -0.5004920959472656, "step": 640 }, { "epoch": 0.35, "learning_rate": 4.108567745733318e-07, - "logits/chosen": -1.583213448524475, - "logits/rejected": -0.9868221282958984, - "logps/chosen": -294.44580078125, - "logps/rejected": -310.91058349609375, - "loss": 49633.6969, - "rewards/accuracies": 0.65625, - "rewards/chosen": -0.6046614646911621, - "rewards/margins": 0.14481785893440247, - "rewards/rejected": -0.749479353427887, - "rewards/safe_rewards": -0.6139060258865356, - "rewards/unsafe_rewards": -0.5954169631004333, + "logits/chosen": -2.283024311065674, + "logits/rejected": -2.1171913146972656, + "logps/chosen": -216.49264526367188, + "logps/rejected": -221.82421875, + "loss": 11842.7375, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.4298115372657776, + "rewards/margins": 0.178109809756279, + "rewards/rejected": -0.6079213619232178, + "rewards/safe_rewards": -0.41675862669944763, + "rewards/unsafe_rewards": -0.4428643584251404, "step": 650 }, { "epoch": 0.36, "learning_rate": 4.0723273048783426e-07, - "logits/chosen": -1.9343016147613525, - "logits/rejected": -1.5659198760986328, - "logps/chosen": -321.99871826171875, - "logps/rejected": -298.6355895996094, - "loss": 47186.7219, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -0.4885231554508209, - "rewards/margins": 0.15925437211990356, - "rewards/rejected": -0.6477775573730469, - "rewards/safe_rewards": -0.4665687084197998, - "rewards/unsafe_rewards": -0.5104776620864868, + "logits/chosen": -2.3322463035583496, + "logits/rejected": -2.1912810802459717, + "logps/chosen": -271.88299560546875, + "logps/rejected": -232.814208984375, + "loss": 11859.6484, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.47567135095596313, + "rewards/margins": 0.16224288940429688, + "rewards/rejected": -0.6379141807556152, + "rewards/safe_rewards": -0.4415673315525055, + "rewards/unsafe_rewards": -0.5097752809524536, "step": 660 }, { "epoch": 0.36, "learning_rate": 4.0355317817241697e-07, - "logits/chosen": -1.663923978805542, - "logits/rejected": -1.1053364276885986, - "logps/chosen": -333.9637451171875, - "logps/rejected": -300.2287902832031, - "loss": 46297.6469, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -0.5251131057739258, - "rewards/margins": 0.16899068653583527, - "rewards/rejected": -0.6941038966178894, - "rewards/safe_rewards": -0.4914467930793762, - "rewards/unsafe_rewards": -0.5587795376777649, + "logits/chosen": -2.3272366523742676, + "logits/rejected": -2.1486003398895264, + "logps/chosen": -275.4678039550781, + "logps/rejected": -224.7365264892578, + "loss": 11542.4758, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.465405136346817, + "rewards/margins": 0.16741091012954712, + "rewards/rejected": -0.6328160166740417, + "rewards/safe_rewards": -0.43667951226234436, + "rewards/unsafe_rewards": -0.4941307008266449, "step": 670 }, { "epoch": 0.37, "learning_rate": 3.998194166278367e-07, - "logits/chosen": -1.3979709148406982, - "logits/rejected": -0.9500627517700195, - "logps/chosen": -307.57305908203125, - "logps/rejected": -306.31890869140625, - "loss": 47727.2438, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -0.5814173221588135, - "rewards/margins": 0.11520357429981232, - "rewards/rejected": -0.6966208219528198, - "rewards/safe_rewards": -0.5897179841995239, - "rewards/unsafe_rewards": -0.573116660118103, + "logits/chosen": -2.378793239593506, + "logits/rejected": -2.2765145301818848, + "logps/chosen": -242.05722045898438, + "logps/rejected": -228.02822875976562, + "loss": 12028.9203, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.5077196359634399, + "rewards/margins": 0.10326583683490753, + "rewards/rejected": -0.6109854578971863, + "rewards/safe_rewards": -0.519899845123291, + "rewards/unsafe_rewards": -0.4955393671989441, "step": 680 }, { "epoch": 0.37, "learning_rate": 3.9603276399245855e-07, - "logits/chosen": -1.2701140642166138, - "logits/rejected": -0.5338165163993835, - "logps/chosen": -337.90081787109375, - "logps/rejected": -328.6393127441406, - "loss": 48448.0687, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -0.6263846158981323, - "rewards/margins": 0.17685876786708832, - "rewards/rejected": -0.8032432794570923, - "rewards/safe_rewards": -0.6113296747207642, - "rewards/unsafe_rewards": -0.6414395570755005, + "logits/chosen": -2.3500876426696777, + "logits/rejected": -2.2054431438446045, + "logps/chosen": -266.26800537109375, + "logps/rejected": -240.6906280517578, + "loss": 11937.7102, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.5368236303329468, + "rewards/margins": 0.19003283977508545, + "rewards/rejected": -0.7268564105033875, + "rewards/safe_rewards": -0.5230244994163513, + "rewards/unsafe_rewards": -0.5506226420402527, "step": 690 }, { "epoch": 0.38, "learning_rate": 3.9219455707691e-07, - "logits/chosen": -0.9895772933959961, - "logits/rejected": -0.3174577057361603, - "logps/chosen": -341.63702392578125, - "logps/rejected": -341.65838623046875, - "loss": 45612.6469, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -0.6892026662826538, - "rewards/margins": 0.15997081995010376, - "rewards/rejected": -0.8491734266281128, - "rewards/safe_rewards": -0.70364910364151, - "rewards/unsafe_rewards": -0.6747562289237976, + "logits/chosen": -2.3385093212127686, + "logits/rejected": -2.194088935852051, + "logps/chosen": -262.49957275390625, + "logps/rejected": -248.57937622070312, + "loss": 11237.9461, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.5866261720657349, + "rewards/margins": 0.18088310956954956, + "rewards/rejected": -0.7675093412399292, + "rewards/safe_rewards": -0.5994827747344971, + "rewards/unsafe_rewards": -0.5737696886062622, "step": 700 }, { "epoch": 0.38, "learning_rate": 3.883061508921439e-07, - "logits/chosen": -1.3048111200332642, - "logits/rejected": -0.8342126607894897, - "logps/chosen": -320.4631652832031, - "logps/rejected": -351.50555419921875, - "loss": 45613.9906, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -0.6085165739059448, - "rewards/margins": 0.1441672295331955, - "rewards/rejected": -0.7526838183403015, - "rewards/safe_rewards": -0.6081237196922302, - "rewards/unsafe_rewards": -0.6089093685150146, + "logits/chosen": -2.4109256267547607, + "logits/rejected": -2.309866428375244, + "logps/chosen": -253.4579620361328, + "logps/rejected": -269.43060302734375, + "loss": 11465.2805, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.5475128889083862, + "rewards/margins": 0.13771310448646545, + "rewards/rejected": -0.6852259635925293, + "rewards/safe_rewards": -0.5719602704048157, + "rewards/unsafe_rewards": -0.5230655670166016, "step": 710 }, { "epoch": 0.39, "learning_rate": 3.8436891817107555e-07, - "logits/chosen": -1.5195503234863281, - "logits/rejected": -1.1332530975341797, - "logps/chosen": -301.80035400390625, - "logps/rejected": -319.84844970703125, - "loss": 47662.1875, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -0.5566147565841675, - "rewards/margins": 0.1676829606294632, - "rewards/rejected": -0.724297821521759, - "rewards/safe_rewards": -0.5669485330581665, - "rewards/unsafe_rewards": -0.5462811589241028, + "logits/chosen": -2.3683512210845947, + "logits/rejected": -2.29433012008667, + "logps/chosen": -238.59597778320312, + "logps/rejected": -241.7389678955078, + "loss": 11748.5664, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.481682687997818, + "rewards/margins": 0.18611426651477814, + "rewards/rejected": -0.6677969694137573, + "rewards/safe_rewards": -0.4765700697898865, + "rewards/unsafe_rewards": -0.4867952764034271, "step": 720 }, { "epoch": 0.39, "learning_rate": 3.8038424888396414e-07, - "logits/chosen": -1.494560956954956, - "logits/rejected": -0.9302406311035156, - "logps/chosen": -318.72369384765625, - "logps/rejected": -327.00433349609375, - "loss": 44502.925, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -0.564653754234314, - "rewards/margins": 0.14311900734901428, - "rewards/rejected": -0.7077728509902954, - "rewards/safe_rewards": -0.5561032295227051, - "rewards/unsafe_rewards": -0.5732042193412781, + "logits/chosen": -2.3680453300476074, + "logits/rejected": -2.233586072921753, + "logps/chosen": -255.9365692138672, + "logps/rejected": -252.37393188476562, + "loss": 10985.4922, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.5011948347091675, + "rewards/margins": 0.1678631603717804, + "rewards/rejected": -0.6690580248832703, + "rewards/safe_rewards": -0.5107179880142212, + "rewards/unsafe_rewards": -0.49167174100875854, "step": 730 }, { "epoch": 0.4, "learning_rate": 3.763535497477079e-07, - "logits/chosen": -1.575844407081604, - "logits/rejected": -1.0153863430023193, - "logps/chosen": -318.7821350097656, - "logps/rejected": -318.4511413574219, - "loss": 45381.4156, - "rewards/accuracies": 0.65625, - "rewards/chosen": -0.5534173846244812, - "rewards/margins": 0.18378493189811707, - "rewards/rejected": -0.7372023463249207, - "rewards/safe_rewards": -0.5398276448249817, - "rewards/unsafe_rewards": -0.5670071840286255, + "logits/chosen": -2.376601457595825, + "logits/rejected": -2.2197787761688232, + "logps/chosen": -260.691650390625, + "logps/rejected": -245.3418731689453, + "loss": 11114.0781, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.5256686210632324, + "rewards/margins": 0.21716609597206116, + "rewards/rejected": -0.742834746837616, + "rewards/safe_rewards": -0.5268703103065491, + "rewards/unsafe_rewards": -0.524466872215271, "step": 740 }, { "epoch": 0.4, "learning_rate": 3.7227824372922795e-07, - "logits/chosen": -1.490094780921936, - "logits/rejected": -0.8493000268936157, - "logps/chosen": -314.93963623046875, - "logps/rejected": -317.8399963378906, - "loss": 45227.8063, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -0.6335738301277161, - "rewards/margins": 0.14306354522705078, - "rewards/rejected": -0.7766374349594116, - "rewards/safe_rewards": -0.6398617625236511, - "rewards/unsafe_rewards": -0.6272858381271362, + "logits/chosen": -2.3355045318603516, + "logits/rejected": -2.202324867248535, + "logps/chosen": -244.21817016601562, + "logps/rejected": -234.26766967773438, + "loss": 11000.375, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.5600683093070984, + "rewards/margins": 0.15717127919197083, + "rewards/rejected": -0.7172395586967468, + "rewards/safe_rewards": -0.5515316724777222, + "rewards/unsafe_rewards": -0.5686048269271851, "step": 750 }, { "epoch": 0.41, "learning_rate": 3.681597695431148e-07, - "logits/chosen": -1.4038476943969727, - "logits/rejected": -0.8173944354057312, - "logps/chosen": -322.56329345703125, - "logps/rejected": -344.23065185546875, - "loss": 44762.4938, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -0.6197393536567688, - "rewards/margins": 0.2086222618818283, - "rewards/rejected": -0.8283616304397583, - "rewards/safe_rewards": -0.6478582620620728, - "rewards/unsafe_rewards": -0.5916203260421753, + "logits/chosen": -2.351304292678833, + "logits/rejected": -2.2070152759552, + "logps/chosen": -249.6879425048828, + "logps/rejected": -252.49722290039062, + "loss": 11395.6094, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.5104349851608276, + "rewards/margins": 0.22875532507896423, + "rewards/rejected": -0.7391902804374695, + "rewards/safe_rewards": -0.5327340364456177, + "rewards/unsafe_rewards": -0.4881359040737152, "step": 760 }, { "epoch": 0.41, "learning_rate": 3.639995811437159e-07, - "logits/chosen": -1.5005619525909424, - "logits/rejected": -1.0413631200790405, - "logps/chosen": -322.5632019042969, - "logps/rejected": -347.0985107421875, - "loss": 43768.4469, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -0.6186314225196838, - "rewards/margins": 0.19521121680736542, - "rewards/rejected": -0.8138425946235657, - "rewards/safe_rewards": -0.6386642456054688, - "rewards/unsafe_rewards": -0.5985985994338989, + "logits/chosen": -2.3204877376556396, + "logits/rejected": -2.2027907371520996, + "logps/chosen": -252.81082153320312, + "logps/rejected": -258.1892395019531, + "loss": 11175.6797, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.5397385358810425, + "rewards/margins": 0.19889900088310242, + "rewards/rejected": -0.7386375069618225, + "rewards/safe_rewards": -0.5588012337684631, + "rewards/unsafe_rewards": -0.520675778388977, "step": 770 }, { "epoch": 0.42, "learning_rate": 3.597991472118426e-07, - "logits/chosen": -1.4816805124282837, - "logits/rejected": -0.883453369140625, - "logps/chosen": -339.1617736816406, - "logps/rejected": -346.25567626953125, - "loss": 46740.1562, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.6220620274543762, - "rewards/margins": 0.1839497685432434, - "rewards/rejected": -0.8060116767883301, - "rewards/safe_rewards": -0.6542383432388306, - "rewards/unsafe_rewards": -0.5898855924606323, + "logits/chosen": -2.3824949264526367, + "logits/rejected": -2.2542223930358887, + "logps/chosen": -267.04437255859375, + "logps/rejected": -254.86337280273438, + "loss": 11511.8547, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5231243371963501, + "rewards/margins": 0.17550811171531677, + "rewards/rejected": -0.6986324191093445, + "rewards/safe_rewards": -0.5508791208267212, + "rewards/unsafe_rewards": -0.49536967277526855, "step": 780 }, { "epoch": 0.43, "learning_rate": 3.5555995063627836e-07, - "logits/chosen": -1.280864953994751, - "logits/rejected": -0.6846857070922852, - "logps/chosen": -361.9394226074219, - "logps/rejected": -350.7897644042969, - "loss": 44129.625, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -0.6580780148506165, - "rewards/margins": 0.1662452220916748, - "rewards/rejected": -0.824323296546936, - "rewards/safe_rewards": -0.6648796796798706, - "rewards/unsafe_rewards": -0.6512764096260071, + "logits/chosen": -2.3838717937469482, + "logits/rejected": -2.264705181121826, + "logps/chosen": -288.45098876953125, + "logps/rejected": -261.7911071777344, + "loss": 11126.725, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5807030200958252, + "rewards/margins": 0.1779719889163971, + "rewards/rejected": -0.7586749792098999, + "rewards/safe_rewards": -0.5821424722671509, + "rewards/unsafe_rewards": -0.5792635679244995, "step": 790 }, { "epoch": 0.43, "learning_rate": 3.512834879902715e-07, - "logits/chosen": -1.0769619941711426, - "logits/rejected": -0.41296401619911194, - "logps/chosen": -336.5204162597656, - "logps/rejected": -348.14459228515625, - "loss": 45309.1406, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -0.6826930642127991, - "rewards/margins": 0.18147502839565277, - "rewards/rejected": -0.8641681671142578, - "rewards/safe_rewards": -0.6699363589286804, - "rewards/unsafe_rewards": -0.6954498291015625, + "logits/chosen": -2.325700521469116, + "logits/rejected": -2.20318603515625, + "logps/chosen": -262.4079284667969, + "logps/rejected": -256.8218078613281, + "loss": 11113.3211, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.6240984797477722, + "rewards/margins": 0.1907762587070465, + "rewards/rejected": -0.8148747682571411, + "rewards/safe_rewards": -0.6181318759918213, + "rewards/unsafe_rewards": -0.6300650835037231, "step": 800 }, { "epoch": 0.44, "learning_rate": 3.4697126900319616e-07, - "logits/chosen": -1.0169395208358765, - "logits/rejected": -0.2606657147407532, - "logps/chosen": -335.27630615234375, - "logps/rejected": -340.00225830078125, - "loss": 47046.7656, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -0.6733043193817139, - "rewards/margins": 0.2028558999300003, - "rewards/rejected": -0.8761602640151978, - "rewards/safe_rewards": -0.6532942056655884, - "rewards/unsafe_rewards": -0.6933144330978394, + "logits/chosen": -2.2967851161956787, + "logits/rejected": -2.1394267082214355, + "logps/chosen": -259.38018798828125, + "logps/rejected": -245.7625732421875, + "loss": 11771.4461, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -0.5871919393539429, + "rewards/margins": 0.2232244461774826, + "rewards/rejected": -0.8104164004325867, + "rewards/safe_rewards": -0.5566633939743042, + "rewards/unsafe_rewards": -0.6177204251289368, "step": 810 }, { "epoch": 0.44, "learning_rate": 3.426248160275693e-07, - "logits/chosen": -1.1688584089279175, - "logits/rejected": -0.3506912589073181, - "logps/chosen": -326.68792724609375, - "logps/rejected": -342.00372314453125, - "loss": 45918.925, - "rewards/accuracies": 0.71875, - "rewards/chosen": -0.6458049416542053, - "rewards/margins": 0.18735942244529724, - "rewards/rejected": -0.8331642150878906, - "rewards/safe_rewards": -0.6722704768180847, - "rewards/unsafe_rewards": -0.6193392872810364, + "logits/chosen": -2.33948016166687, + "logits/rejected": -2.2020907402038574, + "logps/chosen": -251.8537139892578, + "logps/rejected": -247.29443359375, + "loss": 11619.0344, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.5438165068626404, + "rewards/margins": 0.17501583695411682, + "rewards/rejected": -0.7188323736190796, + "rewards/safe_rewards": -0.5689524412155151, + "rewards/unsafe_rewards": -0.5186805725097656, "step": 820 }, { "epoch": 0.45, "learning_rate": 3.3824566350161094e-07, - "logits/chosen": -1.128920078277588, - "logits/rejected": -0.12889890372753143, - "logps/chosen": -333.7740783691406, - "logps/rejected": -334.2523193359375, - "loss": 45350.6875, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.6272070407867432, - "rewards/margins": 0.20219750702381134, - "rewards/rejected": -0.8294045329093933, - "rewards/safe_rewards": -0.6233872771263123, - "rewards/unsafe_rewards": -0.6310268044471741, + "logits/chosen": -2.3640289306640625, + "logits/rejected": -2.195350408554077, + "logps/chosen": -256.7790222167969, + "logps/rejected": -235.6261444091797, + "loss": 11238.1625, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.4845482409000397, + "rewards/margins": 0.1882091611623764, + "rewards/rejected": -0.6727573871612549, + "rewards/safe_rewards": -0.48699751496315, + "rewards/unsafe_rewards": -0.48209887742996216, "step": 830 }, { "epoch": 0.45, "learning_rate": 3.338353574075381e-07, - "logits/chosen": -0.6518674492835999, - "logits/rejected": -0.0776379331946373, - "logps/chosen": -316.5337219238281, - "logps/rejected": -331.23553466796875, - "loss": 52081.9437, - "rewards/accuracies": 0.65625, - "rewards/chosen": -0.7058514356613159, - "rewards/margins": 0.13784918189048767, - "rewards/rejected": -0.8437005281448364, - "rewards/safe_rewards": -0.7225490808486938, - "rewards/unsafe_rewards": -0.6891536712646484, + "logits/chosen": -2.319687843322754, + "logits/rejected": -2.2427151203155518, + "logps/chosen": -229.2741241455078, + "logps/rejected": -230.6369171142578, + "loss": 13030.1109, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5387877225875854, + "rewards/margins": 0.1425672322511673, + "rewards/rejected": -0.6813548803329468, + "rewards/safe_rewards": -0.5679432153701782, + "rewards/unsafe_rewards": -0.5096321702003479, "step": 840 }, { "epoch": 0.46, "learning_rate": 3.2939545472578314e-07, - "logits/chosen": -1.2321569919586182, - "logits/rejected": -0.23729200661182404, - "logps/chosen": -359.31182861328125, - "logps/rejected": -345.58441162109375, - "loss": 46003.5875, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -0.6657665967941284, - "rewards/margins": 0.1415308713912964, - "rewards/rejected": -0.8072975277900696, - "rewards/safe_rewards": -0.6434159874916077, - "rewards/unsafe_rewards": -0.688117265701294, + "logits/chosen": -2.3902430534362793, + "logits/rejected": -2.1937272548675537, + "logps/chosen": -279.47705078125, + "logps/rejected": -250.4775848388672, + "loss": 11581.1313, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.5328287482261658, + "rewards/margins": 0.13031499087810516, + "rewards/rejected": -0.6631438136100769, + "rewards/safe_rewards": -0.5133041143417358, + "rewards/unsafe_rewards": -0.5523533821105957, "step": 850 }, { "epoch": 0.46, "learning_rate": 3.2492752288532916e-07, - "logits/chosen": -1.210000991821289, - "logits/rejected": -0.28997331857681274, - "logps/chosen": -325.0906677246094, - "logps/rejected": -321.0521545410156, - "loss": 45587.1937, - "rewards/accuracies": 0.65625, - "rewards/chosen": -0.6069344282150269, - "rewards/margins": 0.1568630486726761, - "rewards/rejected": -0.7637975215911865, - "rewards/safe_rewards": -0.5850565433502197, - "rewards/unsafe_rewards": -0.6288123726844788, + "logits/chosen": -2.325867176055908, + "logits/rejected": -2.1522111892700195, + "logps/chosen": -253.1811981201172, + "logps/rejected": -233.44174194335938, + "loss": 11311.8109, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.4949401319026947, + "rewards/margins": 0.15677447617053986, + "rewards/rejected": -0.6517146825790405, + "rewards/safe_rewards": -0.47377505898475647, + "rewards/unsafe_rewards": -0.5161052942276001, "step": 860 }, { "epoch": 0.47, "learning_rate": 3.204331392103574e-07, - "logits/chosen": -1.279143214225769, - "logits/rejected": -0.15136761963367462, - "logps/chosen": -336.57080078125, - "logps/rejected": -319.11614990234375, - "loss": 46576.9969, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -0.6153108477592468, - "rewards/margins": 0.16059115529060364, - "rewards/rejected": -0.7759020328521729, - "rewards/safe_rewards": -0.6058703064918518, - "rewards/unsafe_rewards": -0.6247513890266418, + "logits/chosen": -2.337698459625244, + "logits/rejected": -2.1352732181549072, + "logps/chosen": -268.2446594238281, + "logps/rejected": -234.0172882080078, + "loss": 11545.3641, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.5469434261322021, + "rewards/margins": 0.15391257405281067, + "rewards/rejected": -0.7008560299873352, + "rewards/safe_rewards": -0.5454758405685425, + "rewards/unsafe_rewards": -0.5484111905097961, "step": 870 }, { "epoch": 0.47, "learning_rate": 3.159138903634006e-07, - "logits/chosen": -0.6373772621154785, - "logits/rejected": 0.15321095287799835, - "logps/chosen": -352.9801330566406, - "logps/rejected": -341.5760803222656, - "loss": 45152.8313, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -0.7217403650283813, - "rewards/margins": 0.12334167957305908, - "rewards/rejected": -0.8450821042060852, - "rewards/safe_rewards": -0.7119439244270325, - "rewards/unsafe_rewards": -0.731536865234375, + "logits/chosen": -2.1978166103363037, + "logits/rejected": -2.0318470001220703, + "logps/chosen": -280.46319580078125, + "logps/rejected": -256.2702941894531, + "loss": 11248.9852, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7179843187332153, + "rewards/margins": 0.11917723715305328, + "rewards/rejected": -0.837161660194397, + "rewards/safe_rewards": -0.7153393030166626, + "rewards/unsafe_rewards": -0.7206293940544128, "step": 880 }, { "epoch": 0.48, "learning_rate": 3.1137137178519977e-07, - "logits/chosen": -0.7835987210273743, - "logits/rejected": -0.08453743904829025, - "logps/chosen": -325.07196044921875, - "logps/rejected": -350.0148010253906, - "loss": 46177.3281, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -0.7151413559913635, - "rewards/margins": 0.17061486840248108, - "rewards/rejected": -0.8857563138008118, - "rewards/safe_rewards": -0.698540210723877, - "rewards/unsafe_rewards": -0.7317426204681396, + "logits/chosen": -2.1384453773498535, + "logits/rejected": -2.003512382507324, + "logps/chosen": -253.1010284423828, + "logps/rejected": -262.6402893066406, + "loss": 11450.3414, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.7104918956756592, + "rewards/margins": 0.18712057173252106, + "rewards/rejected": -0.897612452507019, + "rewards/safe_rewards": -0.6939632296562195, + "rewards/unsafe_rewards": -0.7270206212997437, "step": 890 }, { "epoch": 0.48, "learning_rate": 3.068071871314626e-07, - "logits/chosen": -0.8801188468933105, - "logits/rejected": -0.3386787474155426, - "logps/chosen": -304.0417785644531, - "logps/rejected": -309.9365234375, - "loss": 44813.5219, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -0.6266143918037415, - "rewards/margins": 0.12571533024311066, - "rewards/rejected": -0.7523297071456909, - "rewards/safe_rewards": -0.6163514852523804, - "rewards/unsafe_rewards": -0.6368770599365234, + "logits/chosen": -2.135709524154663, + "logits/rejected": -2.028604507446289, + "logps/chosen": -242.0713653564453, + "logps/rejected": -236.3900909423828, + "loss": 10862.3484, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.6334865689277649, + "rewards/margins": 0.13563254475593567, + "rewards/rejected": -0.7691190838813782, + "rewards/safe_rewards": -0.63458251953125, + "rewards/unsafe_rewards": -0.6323906183242798, + "step": 900 + }, + { + "epoch": 0.48, + "eval_logits/chosen": -1.9376695156097412, + "eval_logits/rejected": -1.7468173503875732, + "eval_logps/chosen": -218.71400451660156, + "eval_logps/rejected": -187.7608642578125, + "eval_loss": 4377.5634765625, + "eval_rewards/accuracies": 0.6195546984672546, + "eval_rewards/chosen": -0.8828346133232117, + "eval_rewards/margins": 0.07011755555868149, + "eval_rewards/rejected": -0.9529521465301514, + "eval_rewards/safe_rewards": -0.8774688839912415, + "eval_rewards/unsafe_rewards": -0.8777852058410645, + "eval_runtime": 994.9448, + "eval_samples_per_second": 33.212, + "eval_steps_per_second": 1.038, "step": 900 }, { "epoch": 0.49, "learning_rate": 3.022229477067205e-07, - "logits/chosen": -1.1570842266082764, - "logits/rejected": -0.48729056119918823, - "logps/chosen": -341.6775817871094, - "logps/rejected": -331.5805969238281, - "loss": 43438.3063, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -0.6229197978973389, - "rewards/margins": 0.18367961049079895, - "rewards/rejected": -0.8065993189811707, - "rewards/safe_rewards": -0.6110964417457581, - "rewards/unsafe_rewards": -0.6347432136535645, + "logits/chosen": -2.1737711429595947, + "logits/rejected": -1.9998136758804321, + "logps/chosen": -283.02313232421875, + "logps/rejected": -255.57656860351562, + "loss": 10507.2695, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.6587027311325073, + "rewards/margins": 0.193914994597435, + "rewards/rejected": -0.8526177406311035, + "rewards/safe_rewards": -0.6458637714385986, + "rewards/unsafe_rewards": -0.6715416312217712, "step": 910 }, { "epoch": 0.49, "learning_rate": 2.976202718954869e-07, - "logits/chosen": -1.220199465751648, - "logits/rejected": -0.414750874042511, - "logps/chosen": -348.66632080078125, - "logps/rejected": -354.89569091796875, - "loss": 47517.2937, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -0.6859378814697266, - "rewards/margins": 0.17666980624198914, - "rewards/rejected": -0.8626075983047485, - "rewards/safe_rewards": -0.6974785923957825, - "rewards/unsafe_rewards": -0.6743971109390259, + "logits/chosen": -2.183156728744507, + "logits/rejected": -1.9853588342666626, + "logps/chosen": -282.3530578613281, + "logps/rejected": -271.27069091796875, + "loss": 11861.3852, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.7082624435424805, + "rewards/margins": 0.18091826140880585, + "rewards/rejected": -0.8891807794570923, + "rewards/safe_rewards": -0.7145110368728638, + "rewards/unsafe_rewards": -0.7020138502120972, "step": 920 }, { "epoch": 0.5, "learning_rate": 2.930007845909146e-07, - "logits/chosen": -1.1370770931243896, - "logits/rejected": -0.5163604021072388, - "logps/chosen": -341.5598449707031, - "logps/rejected": -353.5052795410156, - "loss": 46465.8406, - "rewards/accuracies": 0.65625, - "rewards/chosen": -0.7197028398513794, - "rewards/margins": 0.1374286562204361, - "rewards/rejected": -0.8571313619613647, - "rewards/safe_rewards": -0.7223156094551086, - "rewards/unsafe_rewards": -0.7170900106430054, + "logits/chosen": -2.177462339401245, + "logits/rejected": -2.0282797813415527, + "logps/chosen": -268.3103942871094, + "logps/rejected": -266.30548095703125, + "loss": 11683.3781, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.7064036726951599, + "rewards/margins": 0.13596151769161224, + "rewards/rejected": -0.8423651456832886, + "rewards/safe_rewards": -0.7049628496170044, + "rewards/unsafe_rewards": -0.7078445553779602, "step": 930 }, { "epoch": 0.51, "learning_rate": 2.8836611662115634e-07, - "logits/chosen": -1.01023268699646, - "logits/rejected": -0.16050496697425842, - "logps/chosen": -363.1627502441406, - "logps/rejected": -346.3548583984375, - "loss": 47587.2, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -0.7151740789413452, - "rewards/margins": 0.1757824420928955, - "rewards/rejected": -0.8909565210342407, - "rewards/safe_rewards": -0.7078148126602173, - "rewards/unsafe_rewards": -0.7225331664085388, + "logits/chosen": -2.1535885334014893, + "logits/rejected": -1.9535331726074219, + "logps/chosen": -287.9471130371094, + "logps/rejected": -254.46157836914062, + "loss": 12032.6703, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.6779440641403198, + "rewards/margins": 0.1851780265569687, + "rewards/rejected": -0.8631221652030945, + "rewards/safe_rewards": -0.6615537405014038, + "rewards/unsafe_rewards": -0.6943344473838806, "step": 940 }, { "epoch": 0.51, "learning_rate": 2.8371790417362986e-07, - "logits/chosen": -0.6607567667961121, - "logits/rejected": 0.020231375470757484, - "logps/chosen": -321.7113342285156, - "logps/rejected": -337.2732849121094, - "loss": 50934.0375, - "rewards/accuracies": 0.65625, - "rewards/chosen": -0.67184978723526, - "rewards/margins": 0.13356643915176392, - "rewards/rejected": -0.8054162263870239, - "rewards/safe_rewards": -0.6443673372268677, - "rewards/unsafe_rewards": -0.6993322372436523, + "logits/chosen": -2.1404051780700684, + "logits/rejected": -1.9921302795410156, + "logps/chosen": -250.58895874023438, + "logps/rejected": -252.75021362304688, + "loss": 12575.8664, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6325485110282898, + "rewards/margins": 0.13314202427864075, + "rewards/rejected": -0.7656905651092529, + "rewards/safe_rewards": -0.615986168384552, + "rewards/unsafe_rewards": -0.6491108536720276, "step": 950 }, { "epoch": 0.52, "learning_rate": 2.7905778821739056e-07, - "logits/chosen": -0.6478228569030762, - "logits/rejected": 0.119105264544487, - "logps/chosen": -328.67352294921875, - "logps/rejected": -323.33843994140625, - "loss": 45763.6406, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.6695024371147156, - "rewards/margins": 0.1415373980998993, - "rewards/rejected": -0.8110397458076477, - "rewards/safe_rewards": -0.6400096416473389, - "rewards/unsafe_rewards": -0.6989951729774475, + "logits/chosen": -2.1147265434265137, + "logits/rejected": -1.9771724939346313, + "logps/chosen": -253.51351928710938, + "logps/rejected": -233.7639923095703, + "loss": 11670.4188, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5872529745101929, + "rewards/margins": 0.1395425647497177, + "rewards/rejected": -0.7267955541610718, + "rewards/safe_rewards": -0.5655093193054199, + "rewards/unsafe_rewards": -0.6089966297149658, "step": 960 }, { "epoch": 0.52, "learning_rate": 2.74387413923817e-07, - "logits/chosen": -0.09958457946777344, - "logits/rejected": 0.604131817817688, - "logps/chosen": -373.8058166503906, - "logps/rejected": -366.9934997558594, - "loss": 45703.1188, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -0.7466106414794922, - "rewards/margins": 0.16125424206256866, - "rewards/rejected": -0.9078648686408997, - "rewards/safe_rewards": -0.7459925413131714, - "rewards/unsafe_rewards": -0.7472288012504578, + "logits/chosen": -2.056212902069092, + "logits/rejected": -1.9303762912750244, + "logps/chosen": -294.0152893066406, + "logps/rejected": -272.24615478515625, + "loss": 11716.1664, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.6957898736000061, + "rewards/margins": 0.17264129221439362, + "rewards/rejected": -0.8684310913085938, + "rewards/safe_rewards": -0.688141405582428, + "rewards/unsafe_rewards": -0.7034383416175842, "step": 970 }, { "epoch": 0.53, "learning_rate": 2.69708430085812e-07, - "logits/chosen": -0.4206933379173279, - "logits/rejected": 0.8734380006790161, - "logps/chosen": -385.4507751464844, - "logps/rejected": -379.23052978515625, - "loss": 49056.8125, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.810356616973877, - "rewards/margins": 0.20261280238628387, - "rewards/rejected": -1.0129693746566772, - "rewards/safe_rewards": -0.7880682349205017, - "rewards/unsafe_rewards": -0.832645058631897, + "logits/chosen": -2.0949666500091553, + "logits/rejected": -1.8358476161956787, + "logps/chosen": -303.7754821777344, + "logps/rejected": -279.3113708496094, + "loss": 12070.975, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.8039963841438293, + "rewards/margins": 0.2226048707962036, + "rewards/rejected": -1.0266011953353882, + "rewards/safe_rewards": -0.7967602014541626, + "rewards/unsafe_rewards": -0.8112322688102722, "step": 980 }, { "epoch": 0.53, "learning_rate": 2.6502248853572504e-07, - "logits/chosen": -0.34545964002609253, - "logits/rejected": 0.5168466567993164, - "logps/chosen": -334.7643737792969, - "logps/rejected": -355.3497314453125, - "loss": 47162.0437, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -0.7344347238540649, - "rewards/margins": 0.23106582462787628, - "rewards/rejected": -0.9655006527900696, - "rewards/safe_rewards": -0.725695013999939, - "rewards/unsafe_rewards": -0.7431745529174805, + "logits/chosen": -2.018751859664917, + "logits/rejected": -1.81708562374115, + "logps/chosen": -268.05535888671875, + "logps/rejected": -263.68487548828125, + "loss": 12117.2781, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.8013098835945129, + "rewards/margins": 0.21267572045326233, + "rewards/rejected": -1.013985514640808, + "rewards/safe_rewards": -0.7930777668952942, + "rewards/unsafe_rewards": -0.8095420002937317, "step": 990 }, { "epoch": 0.54, "learning_rate": 2.6033124356220325e-07, - "logits/chosen": -0.6046223640441895, - "logits/rejected": 0.4678574204444885, - "logps/chosen": -327.68353271484375, - "logps/rejected": -325.10675048828125, - "loss": 45676.9531, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -0.6245463490486145, - "rewards/margins": 0.20411181449890137, - "rewards/rejected": -0.8286581039428711, - "rewards/safe_rewards": -0.5984090566635132, - "rewards/unsafe_rewards": -0.6506836414337158, - "step": 1000 - }, - { - "epoch": 0.54, - "eval_logits/chosen": 0.8552502393722534, - "eval_logits/rejected": 1.9950228929519653, - "eval_logps/chosen": -286.8121337890625, - "eval_logps/rejected": -266.63848876953125, - "eval_loss": 17429.828125, - "eval_rewards/accuracies": 0.6481122970581055, - "eval_rewards/chosen": -0.7818662524223328, - "eval_rewards/margins": 0.08896998316049576, - "eval_rewards/rejected": -0.8708361983299255, - "eval_rewards/safe_rewards": -0.7753673791885376, - "eval_rewards/unsafe_rewards": -0.777836263179779, - "eval_runtime": 1061.6051, - "eval_samples_per_second": 31.126, - "eval_steps_per_second": 0.973, + "logits/chosen": -2.040283441543579, + "logits/rejected": -1.8361635208129883, + "logps/chosen": -266.68841552734375, + "logps/rejected": -244.3878631591797, + "loss": 11242.6422, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.639050304889679, + "rewards/margins": 0.21101757884025574, + "rewards/rejected": -0.8500677943229675, + "rewards/safe_rewards": -0.6206936836242676, + "rewards/unsafe_rewards": -0.6574069261550903, "step": 1000 }, { "epoch": 0.54, "learning_rate": 2.55636351326173e-07, - "logits/chosen": -0.6972484588623047, - "logits/rejected": 0.22190746665000916, - "logps/chosen": -326.7273864746094, - "logps/rejected": -320.91680908203125, - "loss": 42472.95, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -0.6033691167831421, - "rewards/margins": 0.20326733589172363, - "rewards/rejected": -0.8066364526748657, - "rewards/safe_rewards": -0.6165899634361267, - "rewards/unsafe_rewards": -0.5901483297348022, + "logits/chosen": -2.074812412261963, + "logits/rejected": -1.9070053100585938, + "logps/chosen": -267.3968505859375, + "logps/rejected": -242.2158660888672, + "loss": 10413.1977, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.6132654547691345, + "rewards/margins": 0.21324153244495392, + "rewards/rejected": -0.8265069723129272, + "rewards/safe_rewards": -0.638529360294342, + "rewards/unsafe_rewards": -0.588001549243927, "step": 1010 }, { "epoch": 0.55, "learning_rate": 2.509394692761622e-07, - "logits/chosen": -0.6674490571022034, - "logits/rejected": 0.4369078576564789, - "logps/chosen": -331.0621032714844, - "logps/rejected": -328.1492919921875, - "loss": 47981.6125, + "logits/chosen": -2.1109747886657715, + "logits/rejected": -1.8711456060409546, + "logps/chosen": -266.0138244628906, + "logps/rejected": -246.04159545898438, + "loss": 11730.4734, "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.6269241571426392, - "rewards/margins": 0.18894127011299133, - "rewards/rejected": -0.8158655166625977, - "rewards/safe_rewards": -0.6308945417404175, - "rewards/unsafe_rewards": -0.6229538917541504, + "rewards/chosen": -0.6028767824172974, + "rewards/margins": 0.20766966044902802, + "rewards/rejected": -0.8105465173721313, + "rewards/safe_rewards": -0.5901221632957458, + "rewards/unsafe_rewards": -0.6156314611434937, "step": 1020 }, { "epoch": 0.55, "learning_rate": 2.462422555631674e-07, - "logits/chosen": -0.28683388233184814, - "logits/rejected": 0.8458186388015747, - "logps/chosen": -342.2594909667969, - "logps/rejected": -328.2859191894531, - "loss": 44831.7969, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -0.6991252899169922, - "rewards/margins": 0.19242990016937256, - "rewards/rejected": -0.89155513048172, - "rewards/safe_rewards": -0.6973845958709717, - "rewards/unsafe_rewards": -0.7008658647537231, + "logits/chosen": -2.073171854019165, + "logits/rejected": -1.8314094543457031, + "logps/chosen": -267.2673034667969, + "logps/rejected": -234.9337921142578, + "loss": 11384.8359, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.6479398012161255, + "rewards/margins": 0.20152795314788818, + "rewards/rejected": -0.8494676351547241, + "rewards/safe_rewards": -0.6472254991531372, + "rewards/unsafe_rewards": -0.6486541032791138, "step": 1030 }, { "epoch": 0.56, "learning_rate": 2.415463684552728e-07, - "logits/chosen": -0.028222378343343735, - "logits/rejected": 0.8066191673278809, - "logps/chosen": -347.1553649902344, - "logps/rejected": -348.95166015625, - "loss": 46981.4625, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -0.7508881092071533, - "rewards/margins": 0.15713204443454742, - "rewards/rejected": -0.9080201387405396, - "rewards/safe_rewards": -0.7454043030738831, - "rewards/unsafe_rewards": -0.7563718557357788, + "logits/chosen": -2.0377871990203857, + "logits/rejected": -1.8571240901947021, + "logps/chosen": -264.3353576660156, + "logps/rejected": -250.5574493408203, + "loss": 11869.4391, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.6739367246627808, + "rewards/margins": 0.1584298014640808, + "rewards/rejected": -0.8323665857315063, + "rewards/safe_rewards": -0.6686100363731384, + "rewards/unsafe_rewards": -0.6792632341384888, "step": 1040 }, { "epoch": 0.56, "learning_rate": 2.3685346575222807e-07, - "logits/chosen": -0.2877802848815918, - "logits/rejected": 0.9473918080329895, - "logps/chosen": -357.5995178222656, - "logps/rejected": -347.2440185546875, - "loss": 46055.2562, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -0.6998244524002075, - "rewards/margins": 0.1614559143781662, - "rewards/rejected": -0.8612804412841797, - "rewards/safe_rewards": -0.6987451314926147, - "rewards/unsafe_rewards": -0.7009039521217346, + "logits/chosen": -2.111082077026367, + "logits/rejected": -1.8854366540908813, + "logps/chosen": -280.7798767089844, + "logps/rejected": -254.7810821533203, + "loss": 11278.0977, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.6307443976402283, + "rewards/margins": 0.16730864346027374, + "rewards/rejected": -0.7980531454086304, + "rewards/safe_rewards": -0.6317691802978516, + "rewards/unsafe_rewards": -0.6297196745872498, "step": 1050 }, { "epoch": 0.57, "learning_rate": 2.321652042001919e-07, - "logits/chosen": -0.26601386070251465, - "logits/rejected": 0.8794665336608887, - "logps/chosen": -351.0436706542969, - "logps/rejected": -370.7359924316406, - "loss": 44550.6562, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -0.6992644667625427, - "rewards/margins": 0.19244304299354553, - "rewards/rejected": -0.8917075395584106, - "rewards/safe_rewards": -0.700819730758667, - "rewards/unsafe_rewards": -0.6977092623710632, + "logits/chosen": -2.124929189682007, + "logits/rejected": -1.8838306665420532, + "logps/chosen": -272.2227783203125, + "logps/rejected": -273.8869323730469, + "loss": 11306.2313, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.6101773381233215, + "rewards/margins": 0.20483005046844482, + "rewards/rejected": -0.8150073885917664, + "rewards/safe_rewards": -0.6046205759048462, + "rewards/unsafe_rewards": -0.6157340407371521, "step": 1060 }, { "epoch": 0.58, "learning_rate": 2.2748323890684662e-07, - "logits/chosen": -0.43971866369247437, - "logits/rejected": 0.918236255645752, - "logps/chosen": -349.2288513183594, - "logps/rejected": -350.5439758300781, - "loss": 44084.125, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -0.683814525604248, - "rewards/margins": 0.2440708428621292, - "rewards/rejected": -0.9278853535652161, - "rewards/safe_rewards": -0.6862753033638, - "rewards/unsafe_rewards": -0.6813536882400513, + "logits/chosen": -2.1761631965637207, + "logits/rejected": -1.8986377716064453, + "logps/chosen": -270.1111145019531, + "logps/rejected": -247.43576049804688, + "loss": 11255.1672, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.5767061710357666, + "rewards/margins": 0.2480432540178299, + "rewards/rejected": -0.8247492909431458, + "rewards/safe_rewards": -0.5869646072387695, + "rewards/unsafe_rewards": -0.5664476156234741, "step": 1070 }, { "epoch": 0.58, "learning_rate": 2.2280922275709213e-07, - "logits/chosen": -0.49487119913101196, - "logits/rejected": 0.31752508878707886, - "logps/chosen": -362.0475769042969, - "logps/rejected": -365.53973388671875, - "loss": 46928.5375, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -0.7547538876533508, - "rewards/margins": 0.1689223200082779, - "rewards/rejected": -0.9236763119697571, - "rewards/safe_rewards": -0.7296870946884155, - "rewards/unsafe_rewards": -0.7798206806182861, + "logits/chosen": -2.1585605144500732, + "logits/rejected": -2.006892681121826, + "logps/chosen": -272.51593017578125, + "logps/rejected": -259.193603515625, + "loss": 11610.1609, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.6135168671607971, + "rewards/margins": 0.16986827552318573, + "rewards/rejected": -0.783385157585144, + "rewards/safe_rewards": -0.5932785272598267, + "rewards/unsafe_rewards": -0.6337552666664124, "step": 1080 }, { "epoch": 0.59, "learning_rate": 2.1814480582952375e-07, - "logits/chosen": -0.5348813533782959, - "logits/rejected": 0.41831517219543457, - "logps/chosen": -357.3843994140625, - "logps/rejected": -369.3674621582031, - "loss": 45527.4406, - "rewards/accuracies": 0.65625, - "rewards/chosen": -0.7763819694519043, - "rewards/margins": 0.1709180772304535, - "rewards/rejected": -0.9473000764846802, - "rewards/safe_rewards": -0.762442946434021, - "rewards/unsafe_rewards": -0.7903211712837219, + "logits/chosen": -2.176574230194092, + "logits/rejected": -2.0007762908935547, + "logps/chosen": -264.99627685546875, + "logps/rejected": -260.7152099609375, + "loss": 11244.9641, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.6294008493423462, + "rewards/margins": 0.1786467581987381, + "rewards/rejected": -0.8080476522445679, + "rewards/safe_rewards": -0.610293984413147, + "rewards/unsafe_rewards": -0.6485077738761902, "step": 1090 }, { "epoch": 0.59, "learning_rate": 2.1349163481390187e-07, - "logits/chosen": -0.5055888295173645, - "logits/rejected": 0.47217974066734314, - "logps/chosen": -345.64752197265625, - "logps/rejected": -357.1788330078125, - "loss": 46146.0281, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -0.7256776690483093, - "rewards/margins": 0.17912715673446655, - "rewards/rejected": -0.9048048257827759, - "rewards/safe_rewards": -0.7069441080093384, - "rewards/unsafe_rewards": -0.7444112300872803, + "logits/chosen": -2.1403450965881348, + "logits/rejected": -1.9742145538330078, + "logps/chosen": -258.41851806640625, + "logps/rejected": -253.95596313476562, + "loss": 11432.9805, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.579275369644165, + "rewards/margins": 0.1978916972875595, + "rewards/rejected": -0.7771671414375305, + "rewards/safe_rewards": -0.5507979393005371, + "rewards/unsafe_rewards": -0.6077528595924377, "step": 1100 }, { "epoch": 0.6, "learning_rate": 2.0885135242981647e-07, - "logits/chosen": -0.38241681456565857, - "logits/rejected": 0.7439740896224976, - "logps/chosen": -382.3736572265625, - "logps/rejected": -347.1092224121094, - "loss": 43435.7688, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -0.7509212493896484, - "rewards/margins": 0.18192267417907715, - "rewards/rejected": -0.9328439831733704, - "rewards/safe_rewards": -0.7785581350326538, - "rewards/unsafe_rewards": -0.7232844829559326, + "logits/chosen": -2.110644817352295, + "logits/rejected": -1.8921115398406982, + "logps/chosen": -295.12310791015625, + "logps/rejected": -242.7633056640625, + "loss": 10866.6008, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.6292051672935486, + "rewards/margins": 0.1930016279220581, + "rewards/rejected": -0.8222068548202515, + "rewards/safe_rewards": -0.6636418104171753, + "rewards/unsafe_rewards": -0.5947686433792114, "step": 1110 }, { "epoch": 0.6, "learning_rate": 2.0422559684675494e-07, - "logits/chosen": -0.323326975107193, - "logits/rejected": 0.8406647443771362, - "logps/chosen": -359.6302795410156, - "logps/rejected": -350.448486328125, - "loss": 44759.3531, - "rewards/accuracies": 0.71875, - "rewards/chosen": -0.712114691734314, - "rewards/margins": 0.17733532190322876, - "rewards/rejected": -0.889449954032898, - "rewards/safe_rewards": -0.7039347887039185, - "rewards/unsafe_rewards": -0.7202944755554199, + "logits/chosen": -2.07668137550354, + "logits/rejected": -1.870774269104004, + "logps/chosen": -279.4585876464844, + "logps/rejected": -252.98422241210938, + "loss": 11123.7453, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.622269868850708, + "rewards/margins": 0.18196968734264374, + "rewards/rejected": -0.8042396306991577, + "rewards/safe_rewards": -0.6149351596832275, + "rewards/unsafe_rewards": -0.629604697227478, "step": 1120 }, { "epoch": 0.61, "learning_rate": 1.9961600110577457e-07, - "logits/chosen": 0.03827141597867012, - "logits/rejected": 1.122315764427185, - "logps/chosen": -363.9014892578125, - "logps/rejected": -375.36114501953125, - "loss": 46126.075, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -0.7749402523040771, - "rewards/margins": 0.15946057438850403, - "rewards/rejected": -0.934400737285614, - "rewards/safe_rewards": -0.7422567009925842, - "rewards/unsafe_rewards": -0.8076237440109253, + "logits/chosen": -2.0477161407470703, + "logits/rejected": -1.8454376459121704, + "logps/chosen": -281.2010803222656, + "logps/rejected": -276.59503173828125, + "loss": 11451.4273, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.7229688763618469, + "rewards/margins": 0.15784484148025513, + "rewards/rejected": -0.880813717842102, + "rewards/safe_rewards": -0.6920644640922546, + "rewards/unsafe_rewards": -0.7538732290267944, "step": 1130 }, { "epoch": 0.61, "learning_rate": 1.950241925429867e-07, - "logits/chosen": -0.3007999062538147, - "logits/rejected": 1.1402733325958252, - "logps/chosen": -339.51458740234375, - "logps/rejected": -346.62188720703125, - "loss": 44908.9656, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -0.6971844434738159, - "rewards/margins": 0.2297726422548294, - "rewards/rejected": -0.9269570112228394, - "rewards/safe_rewards": -0.6988447308540344, - "rewards/unsafe_rewards": -0.6955242156982422, + "logits/chosen": -2.1420865058898926, + "logits/rejected": -1.9303150177001953, + "logps/chosen": -264.2613830566406, + "logps/rejected": -247.3074493408203, + "loss": 11185.8914, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.6415218114852905, + "rewards/margins": 0.21913623809814453, + "rewards/rejected": -0.8606580495834351, + "rewards/safe_rewards": -0.6352616548538208, + "rewards/unsafe_rewards": -0.647782027721405, "step": 1140 }, { "epoch": 0.62, "learning_rate": 1.9045179221505495e-07, - "logits/chosen": -0.34602755308151245, - "logits/rejected": 0.5907880663871765, - "logps/chosen": -368.44097900390625, - "logps/rejected": -368.545654296875, - "loss": 44071.8187, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -0.6957541704177856, - "rewards/margins": 0.20806510746479034, - "rewards/rejected": -0.9038192629814148, - "rewards/safe_rewards": -0.6724596619606018, - "rewards/unsafe_rewards": -0.7190487384796143, + "logits/chosen": -2.1051979064941406, + "logits/rejected": -1.9602015018463135, + "logps/chosen": -292.8426513671875, + "logps/rejected": -272.5882568359375, + "loss": 10919.0359, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.6359490156173706, + "rewards/margins": 0.21196213364601135, + "rewards/rejected": -0.8479111790657043, + "rewards/safe_rewards": -0.605754554271698, + "rewards/unsafe_rewards": -0.666143536567688, "step": 1150 }, { "epoch": 0.62, "learning_rate": 1.8590041432690893e-07, - "logits/chosen": -0.24286985397338867, - "logits/rejected": 0.4792594909667969, - "logps/chosen": -331.3720397949219, - "logps/rejected": -338.881103515625, - "loss": 44840.5813, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -0.7186309099197388, - "rewards/margins": 0.13764968514442444, - "rewards/rejected": -0.856280505657196, - "rewards/safe_rewards": -0.7337942719459534, - "rewards/unsafe_rewards": -0.7034674882888794, + "logits/chosen": -2.0412395000457764, + "logits/rejected": -1.9340324401855469, + "logps/chosen": -256.9598693847656, + "logps/rejected": -250.4047393798828, + "loss": 11277.7938, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.6930029988288879, + "rewards/margins": 0.13424697518348694, + "rewards/rejected": -0.8272498846054077, + "rewards/safe_rewards": -0.6941550970077515, + "rewards/unsafe_rewards": -0.6918508410453796, "step": 1160 }, { "epoch": 0.63, "learning_rate": 1.813716656618788e-07, - "logits/chosen": -0.3030107319355011, - "logits/rejected": 0.4574352204799652, - "logps/chosen": -333.5826110839844, - "logps/rejected": -345.04656982421875, - "loss": 46893.8375, - "rewards/accuracies": 0.71875, - "rewards/chosen": -0.7264057993888855, - "rewards/margins": 0.1747409552335739, - "rewards/rejected": -0.9011467695236206, - "rewards/safe_rewards": -0.7102692723274231, - "rewards/unsafe_rewards": -0.7425424456596375, + "logits/chosen": -2.0102741718292236, + "logits/rejected": -1.9039026498794556, + "logps/chosen": -259.2917175292969, + "logps/rejected": -252.873046875, + "loss": 11988.2266, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.7098476886749268, + "rewards/margins": 0.1709776222705841, + "rewards/rejected": -0.8808252215385437, + "rewards/safe_rewards": -0.7011247873306274, + "rewards/unsafe_rewards": -0.7185705304145813, "step": 1170 }, { "epoch": 0.63, "learning_rate": 1.7686714501444788e-07, - "logits/chosen": -0.6258971095085144, - "logits/rejected": 0.7857614755630493, - "logps/chosen": -370.97015380859375, - "logps/rejected": -354.3018493652344, - "loss": 47053.025, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -0.7930737137794495, - "rewards/margins": 0.16986317932605743, - "rewards/rejected": -0.9629368782043457, - "rewards/safe_rewards": -0.8054510951042175, - "rewards/unsafe_rewards": -0.7806962728500366, + "logits/chosen": -2.084367275238037, + "logits/rejected": -1.8202540874481201, + "logps/chosen": -288.35626220703125, + "logps/rejected": -254.33651733398438, + "loss": 11612.3062, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.7599023580551147, + "rewards/margins": 0.16640828549861908, + "rewards/rejected": -0.926310658454895, + "rewards/safe_rewards": -0.7765257954597473, + "rewards/unsafe_rewards": -0.7432790994644165, "step": 1180 }, { "epoch": 0.64, "learning_rate": 1.7238844262582768e-07, - "logits/chosen": -0.27803701162338257, - "logits/rejected": 0.2697659730911255, - "logps/chosen": -351.29156494140625, - "logps/rejected": -366.9908752441406, - "loss": 42961.8938, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -0.7215646505355835, - "rewards/margins": 0.15517830848693848, - "rewards/rejected": -0.876742959022522, - "rewards/safe_rewards": -0.6825219988822937, - "rewards/unsafe_rewards": -0.7606073617935181, + "logits/chosen": -2.052704095840454, + "logits/rejected": -1.9864280223846436, + "logps/chosen": -275.9404602050781, + "logps/rejected": -276.5259094238281, + "loss": 10682.7672, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.6891142129898071, + "rewards/margins": 0.15954537689685822, + "rewards/rejected": -0.8486596345901489, + "rewards/safe_rewards": -0.6521228551864624, + "rewards/unsafe_rewards": -0.7261057496070862, "step": 1190 }, { "epoch": 0.65, "learning_rate": 1.679371396225504e-07, - "logits/chosen": -0.2736941874027252, - "logits/rejected": 0.9284217953681946, - "logps/chosen": -350.8744201660156, - "logps/rejected": -370.12548828125, - "loss": 46068.6625, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -0.7488675713539124, - "rewards/margins": 0.19550403952598572, - "rewards/rejected": -0.9443715810775757, - "rewards/safe_rewards": -0.7266346216201782, - "rewards/unsafe_rewards": -0.7711005210876465, + "logits/chosen": -2.0941474437713623, + "logits/rejected": -1.8897063732147217, + "logps/chosen": -271.2891540527344, + "logps/rejected": -270.78924560546875, + "loss": 11671.4219, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7015270590782166, + "rewards/margins": 0.19374321401119232, + "rewards/rejected": -0.8952702283859253, + "rewards/safe_rewards": -0.6884527802467346, + "rewards/unsafe_rewards": -0.7146013379096985, + "step": 1200 + }, + { + "epoch": 0.65, + "eval_logits/chosen": -1.8665262460708618, + "eval_logits/rejected": -1.6740189790725708, + "eval_logps/chosen": -228.5369415283203, + "eval_logps/rejected": -197.5587921142578, + "eval_loss": 4346.4052734375, + "eval_rewards/accuracies": 0.6158034801483154, + "eval_rewards/chosen": -0.9810636043548584, + "eval_rewards/margins": 0.06986771523952484, + "eval_rewards/rejected": -1.0509313344955444, + "eval_rewards/safe_rewards": -0.9763943552970886, + "eval_rewards/unsafe_rewards": -0.9767957329750061, + "eval_runtime": 992.8374, + "eval_samples_per_second": 33.282, + "eval_steps_per_second": 1.04, "step": 1200 }, { "epoch": 0.65, "learning_rate": 1.6351480745828096e-07, - "logits/chosen": -0.32376575469970703, - "logits/rejected": 0.6478773355484009, - "logps/chosen": -354.409423828125, - "logps/rejected": -359.4781494140625, - "loss": 40626.1438, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.7673383951187134, - "rewards/margins": 0.1785522997379303, - "rewards/rejected": -0.9458906054496765, - "rewards/safe_rewards": -0.7691024541854858, - "rewards/unsafe_rewards": -0.7655742764472961, + "logits/chosen": -2.1039538383483887, + "logits/rejected": -1.9394419193267822, + "logps/chosen": -265.77984619140625, + "logps/rejected": -253.25137329101562, + "loss": 10276.6594, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.6483036279678345, + "rewards/margins": 0.1812000572681427, + "rewards/rejected": -0.8295037150382996, + "rewards/safe_rewards": -0.6423134803771973, + "rewards/unsafe_rewards": -0.6542937755584717, "step": 1210 }, { "epoch": 0.66, "learning_rate": 1.5912300735904248e-07, - "logits/chosen": -0.31905311346054077, - "logits/rejected": 0.7552626729011536, - "logps/chosen": -383.111328125, - "logps/rejected": -369.0758361816406, - "loss": 46492.475, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -0.8015672564506531, - "rewards/margins": 0.15824849903583527, - "rewards/rejected": -0.9598156809806824, - "rewards/safe_rewards": -0.8104740381240845, - "rewards/unsafe_rewards": -0.7926604747772217, + "logits/chosen": -2.1374449729919434, + "logits/rejected": -1.983070969581604, + "logps/chosen": -290.1751403808594, + "logps/rejected": -259.19097900390625, + "loss": 11657.7609, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.6737682223320007, + "rewards/margins": 0.14703810214996338, + "rewards/rejected": -0.8208063244819641, + "rewards/safe_rewards": -0.6786805987358093, + "rewards/unsafe_rewards": -0.6688558459281921, "step": 1220 }, { "epoch": 0.66, "learning_rate": 1.5476328977205395e-07, - "logits/chosen": -0.05965734273195267, - "logits/rejected": 1.079012393951416, - "logps/chosen": -364.68792724609375, - "logps/rejected": -364.67218017578125, - "loss": 43970.3938, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -0.7916014194488525, - "rewards/margins": 0.20983903110027313, - "rewards/rejected": -1.001440405845642, - "rewards/safe_rewards": -0.8239424824714661, - "rewards/unsafe_rewards": -0.7592602968215942, + "logits/chosen": -2.1445086002349854, + "logits/rejected": -1.9767332077026367, + "logps/chosen": -271.8307189941406, + "logps/rejected": -251.6882781982422, + "loss": 11008.7156, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.6544319987297058, + "rewards/margins": 0.21836614608764648, + "rewards/rejected": -0.8727981448173523, + "rewards/safe_rewards": -0.6966055631637573, + "rewards/unsafe_rewards": -0.6122584342956543, "step": 1230 }, { "epoch": 0.67, "learning_rate": 1.5043719381837112e-07, - "logits/chosen": 0.07217688113451004, - "logits/rejected": 0.9429874420166016, - "logps/chosen": -371.9664001464844, - "logps/rejected": -373.0435791015625, - "loss": 45319.7312, + "logits/chosen": -2.08951473236084, + "logits/rejected": -1.9547208547592163, + "logps/chosen": -287.0835266113281, + "logps/rejected": -271.43670654296875, + "loss": 11362.9422, "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -0.7567300200462341, - "rewards/margins": 0.17278294265270233, - "rewards/rejected": -0.9295129776000977, - "rewards/safe_rewards": -0.7445767521858215, - "rewards/unsafe_rewards": -0.7688833475112915, + "rewards/chosen": -0.6643306612968445, + "rewards/margins": 0.17835856974124908, + "rewards/rejected": -0.8426891565322876, + "rewards/safe_rewards": -0.6574736833572388, + "rewards/unsafe_rewards": -0.6711875200271606, "step": 1240 }, { "epoch": 0.67, "learning_rate": 1.461462467495284e-07, - "logits/chosen": 0.011873995885252953, - "logits/rejected": 1.0727477073669434, - "logps/chosen": -334.2684631347656, - "logps/rejected": -365.14593505859375, - "loss": 41724.0625, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -0.7818735837936401, - "rewards/margins": 0.21924328804016113, - "rewards/rejected": -1.0011169910430908, - "rewards/safe_rewards": -0.7840328812599182, - "rewards/unsafe_rewards": -0.7797145843505859, + "logits/chosen": -2.0471625328063965, + "logits/rejected": -1.8923746347427368, + "logps/chosen": -252.0188446044922, + "logps/rejected": -260.54522705078125, + "loss": 10503.6422, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7411776185035706, + "rewards/margins": 0.21499836444854736, + "rewards/rejected": -0.9561759829521179, + "rewards/safe_rewards": -0.7554097175598145, + "rewards/unsafe_rewards": -0.7269455194473267, "step": 1250 }, { "epoch": 0.68, "learning_rate": 1.4189196340836865e-07, - "logits/chosen": -0.3158958852291107, - "logits/rejected": 0.9457992315292358, - "logps/chosen": -351.5196838378906, - "logps/rejected": -357.3620910644531, - "loss": 42574.2875, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.7637627124786377, - "rewards/margins": 0.17515845596790314, - "rewards/rejected": -0.9389212727546692, - "rewards/safe_rewards": -0.7908867001533508, - "rewards/unsafe_rewards": -0.7366387248039246, + "logits/chosen": -2.1763851642608643, + "logits/rejected": -1.9493157863616943, + "logps/chosen": -269.4758605957031, + "logps/rejected": -258.123291015625, + "loss": 10853.5781, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.7069443464279175, + "rewards/margins": 0.17843110859394073, + "rewards/rejected": -0.8853754997253418, + "rewards/safe_rewards": -0.7389132380485535, + "rewards/unsafe_rewards": -0.6749754548072815, "step": 1260 }, { "epoch": 0.68, "learning_rate": 1.3767584569425561e-07, - "logits/chosen": -0.42510318756103516, - "logits/rejected": 0.8703487515449524, - "logps/chosen": -360.560546875, - "logps/rejected": -357.7935485839844, - "loss": 45935.4094, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.7610708475112915, - "rewards/margins": 0.1780497133731842, - "rewards/rejected": -0.9391204714775085, - "rewards/safe_rewards": -0.7755937576293945, - "rewards/unsafe_rewards": -0.7465478181838989, + "logits/chosen": -2.19474458694458, + "logits/rejected": -1.9729722738265991, + "logps/chosen": -277.60992431640625, + "logps/rejected": -258.0458984375, + "loss": 11364.5312, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.6931368708610535, + "rewards/margins": 0.18750238418579102, + "rewards/rejected": -0.8806392550468445, + "rewards/safe_rewards": -0.7000595927238464, + "rewards/unsafe_rewards": -0.6862143278121948, "step": 1270 }, { "epoch": 0.69, "learning_rate": 1.334993820328541e-07, - "logits/chosen": -0.16254398226737976, - "logits/rejected": 0.888950526714325, - "logps/chosen": -333.65997314453125, - "logps/rejected": -353.986572265625, - "loss": 43710.4563, - "rewards/accuracies": 0.71875, - "rewards/chosen": -0.7857218384742737, - "rewards/margins": 0.23325271904468536, - "rewards/rejected": -1.0189745426177979, - "rewards/safe_rewards": -0.7682746052742004, - "rewards/unsafe_rewards": -0.8031692504882812, + "logits/chosen": -2.0907111167907715, + "logits/rejected": -1.9239038228988647, + "logps/chosen": -246.83642578125, + "logps/rejected": -245.0505828857422, + "loss": 10652.2328, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.7036882638931274, + "rewards/margins": 0.24492530524730682, + "rewards/rejected": -0.9486135244369507, + "rewards/safe_rewards": -0.6736655235290527, + "rewards/unsafe_rewards": -0.7337108850479126, "step": 1280 }, { "epoch": 0.69, "learning_rate": 1.2936404685066852e-07, - "logits/chosen": -0.14050248265266418, - "logits/rejected": 0.8069308400154114, - "logps/chosen": -374.6936950683594, - "logps/rejected": -384.58514404296875, - "loss": 45712.7562, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -0.783234715461731, - "rewards/margins": 0.17418265342712402, - "rewards/rejected": -0.9574173092842102, - "rewards/safe_rewards": -0.8019232749938965, - "rewards/unsafe_rewards": -0.7645460367202759, + "logits/chosen": -2.0505154132843018, + "logits/rejected": -1.9092705249786377, + "logps/chosen": -286.58990478515625, + "logps/rejected": -280.3271789550781, + "loss": 11526.9, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.68525230884552, + "rewards/margins": 0.18679992854595184, + "rewards/rejected": -0.8720521926879883, + "rewards/safe_rewards": -0.7125786542892456, + "rewards/unsafe_rewards": -0.6579257249832153, "step": 1290 }, { "epoch": 0.7, "learning_rate": 1.252713000545221e-07, - "logits/chosen": -0.5100895166397095, - "logits/rejected": 0.7043701410293579, - "logps/chosen": -363.90460205078125, - "logps/rejected": -361.3905334472656, - "loss": 41950.5938, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -0.7255988121032715, - "rewards/margins": 0.2226296216249466, - "rewards/rejected": -0.9482284784317017, - "rewards/safe_rewards": -0.7191574573516846, - "rewards/unsafe_rewards": -0.7320401668548584, + "logits/chosen": -2.2164149284362793, + "logits/rejected": -2.019528388977051, + "logps/chosen": -282.0648498535156, + "logps/rejected": -258.7922058105469, + "loss": 10187.7117, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.6330300569534302, + "rewards/margins": 0.23740728199481964, + "rewards/rejected": -0.8704373240470886, + "rewards/safe_rewards": -0.6281386613845825, + "rewards/unsafe_rewards": -0.6379214525222778, "step": 1300 }, { "epoch": 0.7, "learning_rate": 1.2122258651616304e-07, - "logits/chosen": -0.35903382301330566, - "logits/rejected": 0.911495566368103, - "logps/chosen": -360.46392822265625, - "logps/rejected": -342.14434814453125, - "loss": 44085.7844, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -0.7511452436447144, - "rewards/margins": 0.14979678392410278, - "rewards/rejected": -0.9009420275688171, - "rewards/safe_rewards": -0.7544023394584656, - "rewards/unsafe_rewards": -0.7478880286216736, + "logits/chosen": -2.183960437774658, + "logits/rejected": -1.976144790649414, + "logps/chosen": -275.6617126464844, + "logps/rejected": -244.86434936523438, + "loss": 10813.9688, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6538507342338562, + "rewards/margins": 0.17538292706012726, + "rewards/rejected": -0.8292337656021118, + "rewards/safe_rewards": -0.6337584853172302, + "rewards/unsafe_rewards": -0.673943042755127, "step": 1310 }, { "epoch": 0.71, "learning_rate": 1.1721933556217792e-07, - "logits/chosen": -0.04007592052221298, - "logits/rejected": 0.9895780682563782, - "logps/chosen": -348.7082824707031, - "logps/rejected": -361.0719299316406, - "loss": 44762.45, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -0.7407525777816772, - "rewards/margins": 0.18461810052394867, - "rewards/rejected": -0.9253706932067871, - "rewards/safe_rewards": -0.767977237701416, - "rewards/unsafe_rewards": -0.7135279178619385, + "logits/chosen": -2.167842388153076, + "logits/rejected": -2.0031533241271973, + "logps/chosen": -265.863525390625, + "logps/rejected": -261.1900329589844, + "loss": 11171.5523, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.6531133651733398, + "rewards/margins": 0.19861795008182526, + "rewards/rejected": -0.8517313003540039, + "rewards/safe_rewards": -0.6670490503311157, + "rewards/unsafe_rewards": -0.639177680015564, "step": 1320 }, { "epoch": 0.72, "learning_rate": 1.1326296046939333e-07, - "logits/chosen": -0.08905103802680969, - "logits/rejected": 0.9929917454719543, - "logps/chosen": -336.84429931640625, - "logps/rejected": -344.197021484375, - "loss": 42105.9531, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -0.7096694707870483, - "rewards/margins": 0.22010605037212372, - "rewards/rejected": -0.929775595664978, - "rewards/safe_rewards": -0.6870702505111694, - "rewards/unsafe_rewards": -0.732268750667572, + "logits/chosen": -2.134734869003296, + "logits/rejected": -1.9698076248168945, + "logps/chosen": -257.0758972167969, + "logps/rejected": -242.34640502929688, + "loss": 10608.8219, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.6214333772659302, + "rewards/margins": 0.21942774951457977, + "rewards/rejected": -0.8408611416816711, + "rewards/safe_rewards": -0.6008444428443909, + "rewards/unsafe_rewards": -0.6420222520828247, "step": 1330 }, { "epoch": 0.72, "learning_rate": 1.0935485796594351e-07, - "logits/chosen": -0.11782164871692657, - "logits/rejected": 1.2576526403427124, - "logps/chosen": -376.26922607421875, - "logps/rejected": -366.0262451171875, - "loss": 48384.0469, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -0.7587876915931702, - "rewards/margins": 0.22040753066539764, - "rewards/rejected": -0.979195237159729, - "rewards/safe_rewards": -0.75086510181427, - "rewards/unsafe_rewards": -0.7667102217674255, + "logits/chosen": -2.174132823944092, + "logits/rejected": -1.9792499542236328, + "logps/chosen": -291.65814208984375, + "logps/rejected": -260.0698547363281, + "loss": 11895.8789, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.6717311143875122, + "rewards/margins": 0.22725185751914978, + "rewards/rejected": -0.8989830017089844, + "rewards/safe_rewards": -0.6783491373062134, + "rewards/unsafe_rewards": -0.665113091468811, "step": 1340 }, { "epoch": 0.73, "learning_rate": 1.0549640773818028e-07, - "logits/chosen": 0.08562326431274414, - "logits/rejected": 0.8367131352424622, - "logps/chosen": -360.604736328125, - "logps/rejected": -358.8101806640625, - "loss": 46761.5906, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.8013836145401001, - "rewards/margins": 0.1653718650341034, - "rewards/rejected": -0.9667553901672363, - "rewards/safe_rewards": -0.8469769358634949, - "rewards/unsafe_rewards": -0.755790114402771, + "logits/chosen": -2.139070749282837, + "logits/rejected": -2.040194034576416, + "logps/chosen": -272.8217468261719, + "logps/rejected": -252.80606079101562, + "loss": 11776.8328, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.724553108215332, + "rewards/margins": 0.14907190203666687, + "rewards/rejected": -0.8736250996589661, + "rewards/safe_rewards": -0.764702320098877, + "rewards/unsafe_rewards": -0.6844038963317871, "step": 1350 }, { "epoch": 0.73, "learning_rate": 1.0168897194359921e-07, - "logits/chosen": -0.24259543418884277, - "logits/rejected": 0.8187444806098938, - "logps/chosen": -391.5111083984375, - "logps/rejected": -378.02239990234375, - "loss": 44688.0656, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -0.8185140490531921, - "rewards/margins": 0.1429864764213562, - "rewards/rejected": -0.9615005254745483, - "rewards/safe_rewards": -0.7842386364936829, - "rewards/unsafe_rewards": -0.8527895212173462, + "logits/chosen": -2.1789050102233887, + "logits/rejected": -1.9954169988632202, + "logps/chosen": -300.97711181640625, + "logps/rejected": -274.7542724609375, + "loss": 10877.8953, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.7322704195976257, + "rewards/margins": 0.15806125104427338, + "rewards/rejected": -0.8903317451477051, + "rewards/safe_rewards": -0.6902467608451843, + "rewards/unsafe_rewards": -0.7742940783500671, "step": 1360 }, { "epoch": 0.74, "learning_rate": 9.793389472995392e-08, - "logits/chosen": -0.15840213000774384, - "logits/rejected": 1.3275569677352905, - "logps/chosen": -361.35205078125, - "logps/rejected": -345.7695007324219, - "loss": 40645.4469, - "rewards/accuracies": 0.71875, - "rewards/chosen": -0.7290118336677551, - "rewards/margins": 0.2429216206073761, - "rewards/rejected": -0.9719333648681641, - "rewards/safe_rewards": -0.7022704482078552, - "rewards/unsafe_rewards": -0.755753219127655, + "logits/chosen": -2.155348539352417, + "logits/rejected": -1.9421207904815674, + "logps/chosen": -279.4602966308594, + "logps/rejected": -240.51791381835938, + "loss": 10089.3242, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.639030933380127, + "rewards/margins": 0.252288281917572, + "rewards/rejected": -0.8913192749023438, + "rewards/safe_rewards": -0.6204690933227539, + "rewards/unsafe_rewards": -0.6575928330421448, "step": 1370 }, { "epoch": 0.74, "learning_rate": 9.423250176072874e-08, - "logits/chosen": -0.295163094997406, - "logits/rejected": 1.0145258903503418, - "logps/chosen": -345.1089172363281, - "logps/rejected": -333.9171142578125, - "loss": 51095.5219, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -0.7809767127037048, - "rewards/margins": 0.15118253231048584, - "rewards/rejected": -0.9321592450141907, - "rewards/safe_rewards": -0.7673217058181763, - "rewards/unsafe_rewards": -0.7946317195892334, + "logits/chosen": -2.1420297622680664, + "logits/rejected": -1.946412444114685, + "logps/chosen": -260.1219177246094, + "logps/rejected": -234.1204833984375, + "loss": 12708.7586, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.71236252784729, + "rewards/margins": 0.1536431610584259, + "rewards/rejected": -0.8660055994987488, + "rewards/safe_rewards": -0.6938896179199219, + "rewards/unsafe_rewards": -0.7308354377746582, "step": 1380 }, { "epoch": 0.75, "learning_rate": 9.058609974713654e-08, - "logits/chosen": -0.4195622503757477, - "logits/rejected": 0.8018298149108887, - "logps/chosen": -347.4532775878906, - "logps/rejected": -358.97308349609375, - "loss": 44678.1062, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -0.7135136723518372, - "rewards/margins": 0.21250465512275696, - "rewards/rejected": -0.9260183572769165, - "rewards/safe_rewards": -0.6976640224456787, - "rewards/unsafe_rewards": -0.7293633222579956, + "logits/chosen": -2.1647183895111084, + "logits/rejected": -1.9520107507705688, + "logps/chosen": -270.0279235839844, + "logps/rejected": -260.9911193847656, + "loss": 10654.6578, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.6530768275260925, + "rewards/margins": 0.2193538248538971, + "rewards/rejected": -0.872430682182312, + "rewards/safe_rewards": -0.6400060653686523, + "rewards/unsafe_rewards": -0.6661475896835327, "step": 1390 }, { "epoch": 0.75, "learning_rate": 8.699597598680753e-08, - "logits/chosen": -0.25206297636032104, - "logits/rejected": 0.8641384243965149, - "logps/chosen": -327.6678771972656, - "logps/rejected": -337.4767761230469, - "loss": 41291.7125, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -0.6778867840766907, - "rewards/margins": 0.19725951552391052, - "rewards/rejected": -0.8751462697982788, - "rewards/safe_rewards": -0.6752787828445435, - "rewards/unsafe_rewards": -0.6804946660995483, + "logits/chosen": -2.1252872943878174, + "logits/rejected": -1.9515492916107178, + "logps/chosen": -257.53680419921875, + "logps/rejected": -247.7872314453125, + "loss": 10438.4352, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.6546493768692017, + "rewards/margins": 0.19867801666259766, + "rewards/rejected": -0.8533273935317993, + "rewards/safe_rewards": -0.6472882032394409, + "rewards/unsafe_rewards": -0.6620105504989624, "step": 1400 }, { "epoch": 0.76, "learning_rate": 8.346339790933166e-08, - "logits/chosen": -0.38414591550827026, - "logits/rejected": 0.8469989895820618, - "logps/chosen": -335.01214599609375, - "logps/rejected": -335.544189453125, - "loss": 45752.425, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -0.7107453346252441, - "rewards/margins": 0.18056799471378326, - "rewards/rejected": -0.891313374042511, - "rewards/safe_rewards": -0.7035800814628601, - "rewards/unsafe_rewards": -0.7179104685783386, + "logits/chosen": -2.163567543029785, + "logits/rejected": -1.9771531820297241, + "logps/chosen": -261.906982421875, + "logps/rejected": -245.43344116210938, + "loss": 11502.7297, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.6902798414230347, + "rewards/margins": 0.19094708561897278, + "rewards/rejected": -0.8812268376350403, + "rewards/safe_rewards": -0.6901260018348694, + "rewards/unsafe_rewards": -0.6904336214065552, "step": 1410 }, { "epoch": 0.76, "learning_rate": 7.998961262881506e-08, - "logits/chosen": -0.3762363791465759, - "logits/rejected": 0.9578374028205872, - "logps/chosen": -355.64178466796875, - "logps/rejected": -338.7761535644531, - "loss": 43834.4125, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -0.692502498626709, - "rewards/margins": 0.20904675126075745, - "rewards/rejected": -0.901549220085144, - "rewards/safe_rewards": -0.7310479283332825, - "rewards/unsafe_rewards": -0.6539570689201355, + "logits/chosen": -2.136033535003662, + "logits/rejected": -1.915771245956421, + "logps/chosen": -282.3092346191406, + "logps/rejected": -245.1779022216797, + "loss": 10907.2641, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.6517400741577148, + "rewards/margins": 0.2158946692943573, + "rewards/rejected": -0.8676347732543945, + "rewards/safe_rewards": -0.6872564554214478, + "rewards/unsafe_rewards": -0.6162236928939819, "step": 1420 }, { "epoch": 0.77, "learning_rate": 7.657584650360846e-08, - "logits/chosen": -0.005143237300217152, - "logits/rejected": 0.8091180920600891, - "logps/chosen": -321.630126953125, - "logps/rejected": -326.096923828125, - "loss": 46398.875, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -0.6763697266578674, - "rewards/margins": 0.20085851848125458, - "rewards/rejected": -0.8772281408309937, - "rewards/safe_rewards": -0.6963562965393066, - "rewards/unsafe_rewards": -0.6563830375671387, + "logits/chosen": -2.068126678466797, + "logits/rejected": -1.9587552547454834, + "logps/chosen": -251.7869873046875, + "logps/rejected": -236.9489288330078, + "loss": 11409.3359, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.6544846296310425, + "rewards/margins": 0.2086174488067627, + "rewards/rejected": -0.8631020784378052, + "rewards/safe_rewards": -0.6824710965156555, + "rewards/unsafe_rewards": -0.6264981627464294, "step": 1430 }, { "epoch": 0.77, "learning_rate": 7.322330470336313e-08, - "logits/chosen": -0.16294366121292114, - "logits/rejected": 1.051831841468811, - "logps/chosen": -344.1456604003906, - "logps/rejected": -363.4648742675781, - "loss": 44117.925, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -0.6761170029640198, - "rewards/margins": 0.2284509390592575, - "rewards/rejected": -0.9045678973197937, - "rewards/safe_rewards": -0.6408445239067078, - "rewards/unsafe_rewards": -0.711389422416687, + "logits/chosen": -2.122067928314209, + "logits/rejected": -1.9209505319595337, + "logps/chosen": -269.77520751953125, + "logps/rejected": -268.40972900390625, + "loss": 10821.843, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.6080222129821777, + "rewards/margins": 0.25041937828063965, + "rewards/rejected": -0.8584416508674622, + "rewards/safe_rewards": -0.5671194195747375, + "rewards/unsafe_rewards": -0.6489250659942627, "step": 1440 }, { "epoch": 0.78, "learning_rate": 6.993317078356709e-08, - "logits/chosen": -0.1736331433057785, - "logits/rejected": 0.4382328391075134, - "logps/chosen": -361.22430419921875, - "logps/rejected": -349.16925048828125, - "loss": 45505.9906, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -0.7554510831832886, - "rewards/margins": 0.12425854057073593, - "rewards/rejected": -0.8797096014022827, - "rewards/safe_rewards": -0.7775359749794006, - "rewards/unsafe_rewards": -0.7333661317825317, + "logits/chosen": -2.1524910926818848, + "logits/rejected": -2.056199789047241, + "logps/chosen": -279.29583740234375, + "logps/rejected": -255.84521484375, + "loss": 11391.4539, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.691224217414856, + "rewards/margins": 0.13461162149906158, + "rewards/rejected": -0.8258358240127563, + "rewards/safe_rewards": -0.7272582054138184, + "rewards/unsafe_rewards": -0.6551901698112488, "step": 1450 }, { "epoch": 0.79, "learning_rate": 6.67066062677118e-08, - "logits/chosen": -0.4063405394554138, - "logits/rejected": 0.7672702074050903, - "logps/chosen": -338.43865966796875, - "logps/rejected": -325.3241271972656, - "loss": 47023.7, - "rewards/accuracies": 0.65625, - "rewards/chosen": -0.6877012848854065, - "rewards/margins": 0.15492403507232666, - "rewards/rejected": -0.8426253199577332, - "rewards/safe_rewards": -0.7019062042236328, - "rewards/unsafe_rewards": -0.6734963655471802, + "logits/chosen": -2.191707134246826, + "logits/rejected": -2.0195794105529785, + "logps/chosen": -263.27642822265625, + "logps/rejected": -235.50973510742188, + "loss": 11926.3453, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.6236655116081238, + "rewards/margins": 0.16336306929588318, + "rewards/rejected": -0.7870286107063293, + "rewards/safe_rewards": -0.6308915615081787, + "rewards/unsafe_rewards": -0.6164394617080688, "step": 1460 }, { "epoch": 0.79, "learning_rate": 6.354475023723685e-08, - "logits/chosen": -0.34032854437828064, - "logits/rejected": 0.7701922655105591, - "logps/chosen": -382.75933837890625, - "logps/rejected": -370.2638244628906, - "loss": 43725.7156, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -0.7435750365257263, - "rewards/margins": 0.22489742934703827, - "rewards/rejected": -0.9684725999832153, - "rewards/safe_rewards": -0.7354261875152588, - "rewards/unsafe_rewards": -0.7517240643501282, + "logits/chosen": -2.1155574321746826, + "logits/rejected": -1.9720067977905273, + "logps/chosen": -301.7005920410156, + "logps/rejected": -267.5169372558594, + "loss": 11094.9625, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.6763675808906555, + "rewards/margins": 0.23334193229675293, + "rewards/rejected": -0.9097094535827637, + "rewards/safe_rewards": -0.6679859757423401, + "rewards/unsafe_rewards": -0.6847492456436157, "step": 1470 }, { "epoch": 0.8, "learning_rate": 6.044871892939746e-08, - "logits/chosen": -0.5503937602043152, - "logits/rejected": 0.761081337928772, - "logps/chosen": -364.8021545410156, - "logps/rejected": -368.11505126953125, - "loss": 44683.6062, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -0.7646824717521667, - "rewards/margins": 0.17859305441379547, - "rewards/rejected": -0.9432755708694458, - "rewards/safe_rewards": -0.7754616737365723, - "rewards/unsafe_rewards": -0.753903329372406, + "logits/chosen": -2.184211254119873, + "logits/rejected": -1.9999923706054688, + "logps/chosen": -281.41900634765625, + "logps/rejected": -268.82464599609375, + "loss": 10924.2703, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.6950246691703796, + "rewards/margins": 0.19827482104301453, + "rewards/rejected": -0.8932995796203613, + "rewards/safe_rewards": -0.6960557699203491, + "rewards/unsafe_rewards": -0.6939936280250549, "step": 1480 }, { "epoch": 0.8, "learning_rate": 5.741960534319676e-08, - "logits/chosen": -0.3159703016281128, - "logits/rejected": 0.39256638288497925, - "logps/chosen": -322.4219665527344, - "logps/rejected": -335.59259033203125, - "loss": 43932.2094, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -0.7464033961296082, - "rewards/margins": 0.15226200222969055, - "rewards/rejected": -0.8986655473709106, - "rewards/safe_rewards": -0.7843297123908997, - "rewards/unsafe_rewards": -0.7084770202636719, + "logits/chosen": -2.134298801422119, + "logits/rejected": -2.02238130569458, + "logps/chosen": -241.8282012939453, + "logps/rejected": -240.76290893554688, + "loss": 11064.8711, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.687341570854187, + "rewards/margins": 0.16167931258678436, + "rewards/rejected": -0.8490209579467773, + "rewards/safe_rewards": -0.7233562469482422, + "rewards/unsafe_rewards": -0.6513269543647766, "step": 1490 }, { "epoch": 0.81, "learning_rate": 5.44584788535217e-08, - "logits/chosen": -0.2922753393650055, - "logits/rejected": 0.813672661781311, - "logps/chosen": -370.59619140625, - "logps/rejected": -367.9254455566406, - "loss": 40856.9313, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -0.766488790512085, - "rewards/margins": 0.2095346748828888, - "rewards/rejected": -0.9760234951972961, - "rewards/safe_rewards": -0.7445180416107178, - "rewards/unsafe_rewards": -0.7884594202041626, + "logits/chosen": -2.1548972129821777, + "logits/rejected": -2.0020337104797363, + "logps/chosen": -288.1227111816406, + "logps/rejected": -265.52337646484375, + "loss": 10202.4125, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.7084766626358032, + "rewards/margins": 0.21978096663951874, + "rewards/rejected": -0.9282576441764832, + "rewards/safe_rewards": -0.6754325032234192, + "rewards/unsafe_rewards": -0.741520881652832, "step": 1500 }, { "epoch": 0.81, - "eval_logits/chosen": 1.2878165245056152, - "eval_logits/rejected": 2.5177412033081055, - "eval_logps/chosen": -323.39276123046875, - "eval_logps/rejected": -297.2071838378906, - "eval_loss": 17213.609375, - "eval_rewards/accuracies": 0.6079380512237549, - "eval_rewards/chosen": -0.964769184589386, - "eval_rewards/margins": 0.058910515159368515, - "eval_rewards/rejected": -1.0236797332763672, - "eval_rewards/safe_rewards": -0.9604586362838745, - "eval_rewards/unsafe_rewards": -0.9637781381607056, - "eval_runtime": 1061.5718, - "eval_samples_per_second": 31.127, - "eval_steps_per_second": 0.973, + "eval_logits/chosen": -1.9419655799865723, + "eval_logits/rejected": -1.764477252960205, + "eval_logps/chosen": -226.97747802734375, + "eval_logps/rejected": -195.17942810058594, + "eval_loss": 4320.98779296875, + "eval_rewards/accuracies": 0.6022507548332214, + "eval_rewards/chosen": -0.9654689431190491, + "eval_rewards/margins": 0.061668772250413895, + "eval_rewards/rejected": -1.0271376371383667, + "eval_rewards/safe_rewards": -0.9610660672187805, + "eval_rewards/unsafe_rewards": -0.9618369936943054, + "eval_runtime": 993.2958, + "eval_samples_per_second": 33.267, + "eval_steps_per_second": 1.04, "step": 1500 }, { "epoch": 0.81, "learning_rate": 5.156638483361933e-08, - "logits/chosen": -0.5403560400009155, - "logits/rejected": 0.570114254951477, - "logps/chosen": -364.564453125, - "logps/rejected": -375.5169372558594, - "loss": 44379.4563, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.7334067821502686, - "rewards/margins": 0.2277216613292694, - "rewards/rejected": -0.9611285328865051, - "rewards/safe_rewards": -0.7564698457717896, - "rewards/unsafe_rewards": -0.7103437185287476, + "logits/chosen": -2.2049641609191895, + "logits/rejected": -2.0106749534606934, + "logps/chosen": -285.2719421386719, + "logps/rejected": -273.9066467285156, + "loss": 11107.9672, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.6739949584007263, + "rewards/margins": 0.23203198611736298, + "rewards/rejected": -0.9060269594192505, + "rewards/safe_rewards": -0.6972517967224121, + "rewards/unsafe_rewards": -0.6507382988929749, "step": 1510 }, { "epoch": 0.82, "learning_rate": 4.8744344286046236e-08, - "logits/chosen": -0.2975967824459076, - "logits/rejected": 0.6875097155570984, - "logps/chosen": -367.8726501464844, - "logps/rejected": -365.4129333496094, - "loss": 45052.2688, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.7557625770568848, - "rewards/margins": 0.16084986925125122, - "rewards/rejected": -0.9166123270988464, - "rewards/safe_rewards": -0.7888222932815552, - "rewards/unsafe_rewards": -0.7227028012275696, + "logits/chosen": -2.148716926574707, + "logits/rejected": -2.0059280395507812, + "logps/chosen": -285.7190856933594, + "logps/rejected": -266.8186950683594, + "loss": 11427.2711, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.6893398761749268, + "rewards/margins": 0.15746144950389862, + "rewards/rejected": -0.8468014001846313, + "rewards/safe_rewards": -0.7316367030143738, + "rewards/unsafe_rewards": -0.6470431089401245, "step": 1520 }, { "epoch": 0.82, "learning_rate": 4.599335348222169e-08, - "logits/chosen": -0.24563631415367126, - "logits/rejected": 0.5304837226867676, - "logps/chosen": -373.958251953125, - "logps/rejected": -394.1438903808594, - "loss": 43362.5375, - "rewards/accuracies": 0.71875, - "rewards/chosen": -0.7600733041763306, - "rewards/margins": 0.20462003350257874, - "rewards/rejected": -0.9646932482719421, - "rewards/safe_rewards": -0.7698140740394592, - "rewards/unsafe_rewards": -0.7503325939178467, + "logits/chosen": -2.174553871154785, + "logits/rejected": -2.05711030960083, + "logps/chosen": -288.70849609375, + "logps/rejected": -289.8013610839844, + "loss": 10682.943, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.6674157977104187, + "rewards/margins": 0.21851542592048645, + "rewards/rejected": -0.8859313130378723, + "rewards/safe_rewards": -0.6756128668785095, + "rewards/unsafe_rewards": -0.6592189073562622, "step": 1530 }, { "epoch": 0.83, "learning_rate": 4.331438361071163e-08, - "logits/chosen": -0.3741500973701477, - "logits/rejected": 0.21047651767730713, - "logps/chosen": -383.0041198730469, - "logps/rejected": -384.45159912109375, - "loss": 46612.6562, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -0.716231107711792, - "rewards/margins": 0.16729871928691864, - "rewards/rejected": -0.8835298418998718, - "rewards/safe_rewards": -0.7156515121459961, - "rewards/unsafe_rewards": -0.7168108224868774, + "logits/chosen": -2.1525204181671143, + "logits/rejected": -2.059199571609497, + "logps/chosen": -302.69195556640625, + "logps/rejected": -287.97161865234375, + "loss": 11478.0016, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.6294053792953491, + "rewards/margins": 0.17254997789859772, + "rewards/rejected": -0.8019553422927856, + "rewards/safe_rewards": -0.6268101334571838, + "rewards/unsafe_rewards": -0.6320004463195801, "step": 1540 }, { "epoch": 0.83, "learning_rate": 4.0708380434367864e-08, - "logits/chosen": -0.45448416471481323, - "logits/rejected": 0.6257972717285156, - "logps/chosen": -350.1400451660156, - "logps/rejected": -364.63519287109375, - "loss": 41781.4969, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -0.7609917521476746, - "rewards/margins": 0.1914409101009369, - "rewards/rejected": -0.9524327516555786, - "rewards/safe_rewards": -0.7708221673965454, - "rewards/unsafe_rewards": -0.7511614561080933, + "logits/chosen": -2.1821987628936768, + "logits/rejected": -1.9993159770965576, + "logps/chosen": -265.56854248046875, + "logps/rejected": -261.4121398925781, + "loss": 10532.0812, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.6764332056045532, + "rewards/margins": 0.19545750319957733, + "rewards/rejected": -0.8718908429145813, + "rewards/safe_rewards": -0.6992810964584351, + "rewards/unsafe_rewards": -0.6535855531692505, "step": 1550 }, { "epoch": 0.84, "learning_rate": 3.817626395644305e-08, - "logits/chosen": -0.5271582007408142, - "logits/rejected": 0.5084184408187866, - "logps/chosen": -348.3282470703125, - "logps/rejected": -347.14801025390625, - "loss": 45690.55, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -0.7518355846405029, - "rewards/margins": 0.1429557502269745, - "rewards/rejected": -0.894791305065155, - "rewards/safe_rewards": -0.7422589063644409, - "rewards/unsafe_rewards": -0.7614122033119202, + "logits/chosen": -2.1872050762176514, + "logits/rejected": -2.0328307151794434, + "logps/chosen": -265.10205078125, + "logps/rejected": -248.8824462890625, + "loss": 11308.0203, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.6716850399971008, + "rewards/margins": 0.13508987426757812, + "rewards/rejected": -0.806774914264679, + "rewards/safe_rewards": -0.6651741862297058, + "rewards/unsafe_rewards": -0.6781958341598511, "step": 1560 }, { "epoch": 0.84, "learning_rate": 3.571892809580013e-08, - "logits/chosen": -0.366781622171402, - "logits/rejected": 0.43934473395347595, - "logps/chosen": -344.8788146972656, - "logps/rejected": -349.99615478515625, - "loss": 45845.7812, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -0.7688021659851074, - "rewards/margins": 0.13870775699615479, - "rewards/rejected": -0.9075098037719727, - "rewards/safe_rewards": -0.736695408821106, - "rewards/unsafe_rewards": -0.8009088635444641, + "logits/chosen": -2.157172203063965, + "logits/rejected": -2.0089964866638184, + "logps/chosen": -260.57525634765625, + "logps/rejected": -251.6233367919922, + "loss": 11400.9055, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.6940633058547974, + "rewards/margins": 0.136986643075943, + "rewards/rejected": -0.831049919128418, + "rewards/safe_rewards": -0.6672950983047485, + "rewards/unsafe_rewards": -0.7208314538002014, "step": 1570 }, { "epoch": 0.85, "learning_rate": 3.333724037132976e-08, - "logits/chosen": -0.42608147859573364, - "logits/rejected": 0.503635048866272, - "logps/chosen": -357.31024169921875, - "logps/rejected": -369.7557067871094, - "loss": 44812.5, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.7559465765953064, - "rewards/margins": 0.17133986949920654, - "rewards/rejected": -0.9272864460945129, - "rewards/safe_rewards": -0.7375823259353638, - "rewards/unsafe_rewards": -0.7743107080459595, + "logits/chosen": -2.180004596710205, + "logits/rejected": -2.0329430103302, + "logps/chosen": -274.8149108886719, + "logps/rejected": -270.170654296875, + "loss": 11022.3203, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.6869722008705139, + "rewards/margins": 0.17142841219902039, + "rewards/rejected": -0.8584005236625671, + "rewards/safe_rewards": -0.664495587348938, + "rewards/unsafe_rewards": -0.7094486951828003, "step": 1580 }, { "epoch": 0.86, "learning_rate": 3.1032041595688506e-08, - "logits/chosen": -0.4155375063419342, - "logits/rejected": 0.7212396264076233, - "logps/chosen": -347.390625, - "logps/rejected": -359.71722412109375, - "loss": 43479.9688, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -0.7526252865791321, - "rewards/margins": 0.18866637349128723, - "rewards/rejected": -0.9412916302680969, - "rewards/safe_rewards": -0.7736086249351501, - "rewards/unsafe_rewards": -0.7316418886184692, + "logits/chosen": -2.107234477996826, + "logits/rejected": -1.9175293445587158, + "logps/chosen": -265.2936706542969, + "logps/rejected": -258.67987060546875, + "loss": 10853.2336, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.6839567422866821, + "rewards/margins": 0.1879226267337799, + "rewards/rejected": -0.8718793988227844, + "rewards/safe_rewards": -0.6944083571434021, + "rewards/unsafe_rewards": -0.6735051870346069, "step": 1590 }, { "epoch": 0.86, "learning_rate": 2.880414557846453e-08, - "logits/chosen": -0.44633907079696655, - "logits/rejected": 0.20320913195610046, - "logps/chosen": -330.8631591796875, - "logps/rejected": -338.0090026855469, - "loss": 41196.0062, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -0.7292383313179016, - "rewards/margins": 0.17190253734588623, - "rewards/rejected": -0.9011408686637878, - "rewards/safe_rewards": -0.7252318263053894, - "rewards/unsafe_rewards": -0.733244776725769, + "logits/chosen": -2.131779432296753, + "logits/rejected": -2.045048236846924, + "logps/chosen": -249.33114624023438, + "logps/rejected": -240.8230438232422, + "loss": 10239.6547, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.6429909467697144, + "rewards/margins": 0.1874755173921585, + "rewards/rejected": -0.8304663896560669, + "rewards/safe_rewards": -0.6290788650512695, + "rewards/unsafe_rewards": -0.6569029092788696, "step": 1600 }, { "epoch": 0.87, "learning_rate": 2.6654338838876662e-08, - "logits/chosen": -0.517281174659729, - "logits/rejected": 0.856174647808075, - "logps/chosen": -361.58380126953125, - "logps/rejected": -341.9182434082031, - "loss": 43830.575, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -0.7361901998519897, - "rewards/margins": 0.21951782703399658, - "rewards/rejected": -0.9557080268859863, - "rewards/safe_rewards": -0.7586108446121216, - "rewards/unsafe_rewards": -0.7137697339057922, + "logits/chosen": -2.2182207107543945, + "logits/rejected": -1.985608458518982, + "logps/chosen": -280.660400390625, + "logps/rejected": -240.7994384765625, + "loss": 11038.0031, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.6624557971954346, + "rewards/margins": 0.23739679157733917, + "rewards/rejected": -0.8998525738716125, + "rewards/safe_rewards": -0.68045574426651, + "rewards/unsafe_rewards": -0.6444558501243591, "step": 1610 }, { "epoch": 0.87, "learning_rate": 2.4583380328107805e-08, - "logits/chosen": -0.4754597544670105, - "logits/rejected": 0.6607877016067505, - "logps/chosen": -369.90460205078125, - "logps/rejected": -357.216552734375, - "loss": 45800.725, - "rewards/accuracies": 0.65625, - "rewards/chosen": -0.7210997939109802, - "rewards/margins": 0.20530486106872559, - "rewards/rejected": -0.9264047741889954, - "rewards/safe_rewards": -0.7267229557037354, - "rewards/unsafe_rewards": -0.7154766321182251, + "logits/chosen": -2.1470413208007812, + "logits/rejected": -1.985733985900879, + "logps/chosen": -291.9235534667969, + "logps/rejected": -258.4531555175781, + "loss": 11528.2828, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.6623538136482239, + "rewards/margins": 0.20262321829795837, + "rewards/rejected": -0.8649770021438599, + "rewards/safe_rewards": -0.6641503572463989, + "rewards/unsafe_rewards": -0.660557210445404, "step": 1620 }, { "epoch": 0.88, "learning_rate": 2.259200116137039e-08, - "logits/chosen": -0.46567052602767944, - "logits/rejected": 0.40764307975769043, - "logps/chosen": -364.71600341796875, - "logps/rejected": -374.5581359863281, - "loss": 45540.2469, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.7384431958198547, - "rewards/margins": 0.17130059003829956, - "rewards/rejected": -0.9097437858581543, - "rewards/safe_rewards": -0.7265445590019226, - "rewards/unsafe_rewards": -0.7503417134284973, + "logits/chosen": -2.1394782066345215, + "logits/rejected": -1.9972803592681885, + "logps/chosen": -284.652099609375, + "logps/rejected": -278.02423095703125, + "loss": 11545.0234, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.6758708357810974, + "rewards/margins": 0.17819495499134064, + "rewards/rejected": -0.8540657758712769, + "rewards/safe_rewards": -0.6568797826766968, + "rewards/unsafe_rewards": -0.6948619484901428, "step": 1630 }, { "epoch": 0.88, "learning_rate": 2.068090435979958e-08, - "logits/chosen": -0.24766401946544647, - "logits/rejected": 0.6277973651885986, - "logps/chosen": -345.60919189453125, - "logps/rejected": -343.9813537597656, - "loss": 45637.0875, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -0.6989878416061401, - "rewards/margins": 0.1655181348323822, - "rewards/rejected": -0.8645059466362, - "rewards/safe_rewards": -0.6970330476760864, - "rewards/unsafe_rewards": -0.7009425759315491, + "logits/chosen": -2.1189610958099365, + "logits/rejected": -2.0093135833740234, + "logps/chosen": -268.87371826171875, + "logps/rejected": -250.8845672607422, + "loss": 11243.7469, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.6307465434074402, + "rewards/margins": 0.16744786500930786, + "rewards/rejected": -0.798194408416748, + "rewards/safe_rewards": -0.6202843189239502, + "rewards/unsafe_rewards": -0.6412087678909302, "step": 1640 }, { "epoch": 0.89, "learning_rate": 1.8850764602263423e-08, - "logits/chosen": -0.3011489808559418, - "logits/rejected": 0.8038506507873535, - "logps/chosen": -343.5799255371094, - "logps/rejected": -361.7254638671875, - "loss": 44789.225, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -0.7333818674087524, - "rewards/margins": 0.1577393114566803, - "rewards/rejected": -0.8911210894584656, - "rewards/safe_rewards": -0.7370047569274902, - "rewards/unsafe_rewards": -0.7297589182853699, + "logits/chosen": -2.1497740745544434, + "logits/rejected": -1.9683231115341187, + "logps/chosen": -263.89495849609375, + "logps/rejected": -266.2047424316406, + "loss": 11196.7602, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.6697946786880493, + "rewards/margins": 0.1570041924715042, + "rewards/rejected": -0.8267987966537476, + "rewards/safe_rewards": -0.6728789806365967, + "rewards/unsafe_rewards": -0.666710376739502, "step": 1650 }, { "epoch": 0.89, "learning_rate": 1.710222798718028e-08, - "logits/chosen": -0.39880818128585815, - "logits/rejected": 0.3965887725353241, - "logps/chosen": -357.93316650390625, - "logps/rejected": -377.0177307128906, - "loss": 43983.0188, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -0.7351225018501282, - "rewards/margins": 0.16761592030525208, - "rewards/rejected": -0.9027383923530579, - "rewards/safe_rewards": -0.737257719039917, - "rewards/unsafe_rewards": -0.7329872250556946, + "logits/chosen": -2.16522479057312, + "logits/rejected": -2.045431613922119, + "logps/chosen": -276.67889404296875, + "logps/rejected": -280.6698303222656, + "loss": 10997.7469, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6575156450271606, + "rewards/margins": 0.1838151067495346, + "rewards/rejected": -0.8413307070732117, + "rewards/safe_rewards": -0.6694937944412231, + "rewards/unsafe_rewards": -0.6455374360084534, "step": 1660 }, { "epoch": 0.9, "learning_rate": 1.5435911804424356e-08, - "logits/chosen": -0.4592857360839844, - "logits/rejected": 0.32937127351760864, - "logps/chosen": -369.8361511230469, - "logps/rejected": -367.25994873046875, - "loss": 48111.3063, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -0.7118639945983887, - "rewards/margins": 0.183955118060112, - "rewards/rejected": -0.8958190679550171, - "rewards/safe_rewards": -0.7287789583206177, - "rewards/unsafe_rewards": -0.6949489116668701, + "logits/chosen": -2.159721851348877, + "logits/rejected": -2.048668384552002, + "logps/chosen": -290.70916748046875, + "logps/rejected": -270.44683837890625, + "loss": 11924.0805, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.6324605345726013, + "rewards/margins": 0.19088464975357056, + "rewards/rejected": -0.8233451843261719, + "rewards/safe_rewards": -0.6401357054710388, + "rewards/unsafe_rewards": -0.624785304069519, "step": 1670 }, { "epoch": 0.9, "learning_rate": 1.3852404317403199e-08, - "logits/chosen": -0.39470523595809937, - "logits/rejected": 0.48426881432533264, - "logps/chosen": -329.90692138671875, - "logps/rejected": -360.4842224121094, - "loss": 45857.1438, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -0.7146719098091125, - "rewards/margins": 0.14826802909374237, - "rewards/rejected": -0.8629398345947266, - "rewards/safe_rewards": -0.7360923290252686, - "rewards/unsafe_rewards": -0.6932514309883118, + "logits/chosen": -2.108271360397339, + "logits/rejected": -1.9915637969970703, + "logps/chosen": -250.95361328125, + "logps/rejected": -268.6639099121094, + "loss": 11402.2625, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.6399115324020386, + "rewards/margins": 0.16807249188423157, + "rewards/rejected": -0.8079840540885925, + "rewards/safe_rewards": -0.6549087762832642, + "rewards/unsafe_rewards": -0.6249145269393921, "step": 1680 }, { "epoch": 0.91, "learning_rate": 1.235226455538113e-08, - "logits/chosen": -0.3582441210746765, - "logits/rejected": 0.43277034163475037, - "logps/chosen": -354.54376220703125, - "logps/rejected": -364.79437255859375, - "loss": 46436.4125, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -0.7465845346450806, - "rewards/margins": 0.17824284732341766, - "rewards/rejected": -0.9248273968696594, - "rewards/safe_rewards": -0.7581877112388611, - "rewards/unsafe_rewards": -0.7349813580513, + "logits/chosen": -2.1952316761016846, + "logits/rejected": -2.0874345302581787, + "logps/chosen": -270.4241943359375, + "logps/rejected": -263.0255432128906, + "loss": 11680.2766, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.6519337892532349, + "rewards/margins": 0.18009743094444275, + "rewards/rejected": -0.8320311307907104, + "rewards/safe_rewards": -0.662136435508728, + "rewards/unsafe_rewards": -0.6417311429977417, "step": 1690 }, { "epoch": 0.91, "learning_rate": 1.0936022116124321e-08, - "logits/chosen": -0.4733123779296875, - "logits/rejected": 0.5592867136001587, - "logps/chosen": -341.32080078125, - "logps/rejected": -351.58270263671875, - "loss": 43335.8156, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -0.706405520439148, - "rewards/margins": 0.2071431428194046, - "rewards/rejected": -0.9135486483573914, - "rewards/safe_rewards": -0.7256786227226257, - "rewards/unsafe_rewards": -0.6871322989463806, + "logits/chosen": -2.155649185180664, + "logits/rejected": -2.0004172325134277, + "logps/chosen": -262.668212890625, + "logps/rejected": -252.78994750976562, + "loss": 10840.3375, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.6261528134346008, + "rewards/margins": 0.2126920521259308, + "rewards/rejected": -0.838844895362854, + "rewards/safe_rewards": -0.6410089731216431, + "rewards/unsafe_rewards": -0.6112965941429138, "step": 1700 }, { "epoch": 0.92, "learning_rate": 9.60417697893534e-09, - "logits/chosen": -0.43451374769210815, - "logits/rejected": 0.4350733757019043, - "logps/chosen": -341.27191162109375, - "logps/rejected": -361.10089111328125, - "loss": 45431.4563, - "rewards/accuracies": 0.65625, - "rewards/chosen": -0.7235937118530273, - "rewards/margins": 0.17308807373046875, - "rewards/rejected": -0.8966817855834961, - "rewards/safe_rewards": -0.6771775484085083, - "rewards/unsafe_rewards": -0.7700098752975464, + "logits/chosen": -2.155482292175293, + "logits/rejected": -2.0352649688720703, + "logps/chosen": -259.88397216796875, + "logps/rejected": -262.71746826171875, + "loss": 11428.7422, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.632883608341217, + "rewards/margins": 0.17666089534759521, + "rewards/rejected": -0.8095444440841675, + "rewards/safe_rewards": -0.5953446626663208, + "rewards/unsafe_rewards": -0.6704224348068237, "step": 1710 }, { "epoch": 0.93, "learning_rate": 8.357199328144576e-09, - "logits/chosen": -0.5011011362075806, - "logits/rejected": 0.2814331650733948, - "logps/chosen": -393.62139892578125, - "logps/rejected": -389.73065185546875, - "loss": 41492.775, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.7578811645507812, - "rewards/margins": 0.1635650396347046, - "rewards/rejected": -0.9214461445808411, - "rewards/safe_rewards": -0.7962349653244019, - "rewards/unsafe_rewards": -0.7195273637771606, + "logits/chosen": -2.1424155235290527, + "logits/rejected": -2.0332658290863037, + "logps/chosen": -308.6863708496094, + "logps/rejected": -289.75830078125, + "loss": 10265.0281, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.6669701337814331, + "rewards/margins": 0.17594949901103973, + "rewards/rejected": -0.8429197072982788, + "rewards/safe_rewards": -0.7015780210494995, + "rewards/unsafe_rewards": -0.6323622465133667, "step": 1720 }, { "epoch": 0.93, "learning_rate": 7.1955293871198144e-09, - "logits/chosen": -0.17354485392570496, - "logits/rejected": 0.3540908694267273, - "logps/chosen": -324.1124572753906, - "logps/rejected": -338.9903564453125, - "loss": 47151.6719, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -0.7489806413650513, - "rewards/margins": 0.14638124406337738, - "rewards/rejected": -0.8953617811203003, - "rewards/safe_rewards": -0.7462954521179199, - "rewards/unsafe_rewards": -0.7516657114028931, + "logits/chosen": -2.112140417098999, + "logits/rejected": -2.030024528503418, + "logps/chosen": -241.85476684570312, + "logps/rejected": -241.1482391357422, + "loss": 11823.4781, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.6754211187362671, + "rewards/margins": 0.13721489906311035, + "rewards/rejected": -0.8126360177993774, + "rewards/safe_rewards": -0.6722853779792786, + "rewards/unsafe_rewards": -0.6785567998886108, "step": 1730 }, { "epoch": 0.94, "learning_rate": 6.119577262853254e-09, - "logits/chosen": -0.36382755637168884, - "logits/rejected": 0.8483802676200867, - "logps/chosen": -334.2537536621094, - "logps/rejected": -333.51611328125, - "loss": 46034.5875, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.7338821887969971, - "rewards/margins": 0.20424802601337433, - "rewards/rejected": -0.9381301999092102, - "rewards/safe_rewards": -0.7272058725357056, - "rewards/unsafe_rewards": -0.7405586242675781, + "logits/chosen": -2.1218502521514893, + "logits/rejected": -1.944239854812622, + "logps/chosen": -251.3634490966797, + "logps/rejected": -230.8335723876953, + "loss": 11556.3273, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.638350248336792, + "rewards/margins": 0.21106202900409698, + "rewards/rejected": -0.8494122624397278, + "rewards/safe_rewards": -0.613598108291626, + "rewards/unsafe_rewards": -0.6631024479866028, "step": 1740 }, { "epoch": 0.94, "learning_rate": 5.129722801180542e-09, - "logits/chosen": -0.2866384983062744, - "logits/rejected": 0.6360049247741699, - "logps/chosen": -353.8183288574219, - "logps/rejected": -358.28839111328125, - "loss": 41055.3875, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.7532376050949097, - "rewards/margins": 0.18598462641239166, - "rewards/rejected": -0.9392221570014954, - "rewards/safe_rewards": -0.7865945100784302, - "rewards/unsafe_rewards": -0.7198807001113892, + "logits/chosen": -2.1147098541259766, + "logits/rejected": -2.0030481815338135, + "logps/chosen": -270.8954162597656, + "logps/rejected": -255.99600219726562, + "loss": 10173.3906, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6771684885025024, + "rewards/margins": 0.1781720221042633, + "rewards/rejected": -0.8553404808044434, + "rewards/safe_rewards": -0.7092324495315552, + "rewards/unsafe_rewards": -0.6451044082641602, "step": 1750 }, { "epoch": 0.95, "learning_rate": 4.226315452682816e-09, - "logits/chosen": -0.34552446007728577, - "logits/rejected": 0.5495534539222717, - "logps/chosen": -344.78887939453125, - "logps/rejected": -356.21435546875, - "loss": 43965.1937, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -0.7194116115570068, - "rewards/margins": 0.19497081637382507, - "rewards/rejected": -0.9143824577331543, - "rewards/safe_rewards": -0.7274887561798096, - "rewards/unsafe_rewards": -0.7113345265388489, + "logits/chosen": -2.1494641304016113, + "logits/rejected": -2.025930404663086, + "logps/chosen": -264.47100830078125, + "logps/rejected": -256.65093994140625, + "loss": 10950.5703, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.6349852681159973, + "rewards/margins": 0.19823208451271057, + "rewards/rejected": -0.833217442035675, + "rewards/safe_rewards": -0.6437097787857056, + "rewards/unsafe_rewards": -0.6262607574462891, "step": 1760 }, { "epoch": 0.95, "learning_rate": 3.4096741493194193e-09, - "logits/chosen": -0.43240195512771606, - "logits/rejected": 0.2978712022304535, - "logps/chosen": -344.36578369140625, - "logps/rejected": -352.2105407714844, - "loss": 48400.5625, + "logits/chosen": -2.2127685546875, + "logits/rejected": -2.0972578525543213, + "logps/chosen": -264.87530517578125, + "logps/rejected": -257.4256286621094, + "loss": 11913.2609, "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -0.7159353494644165, - "rewards/margins": 0.15215197205543518, - "rewards/rejected": -0.8680872917175293, - "rewards/safe_rewards": -0.7002893686294556, - "rewards/unsafe_rewards": -0.7315811514854431, + "rewards/chosen": -0.6370722651481628, + "rewards/margins": 0.15080325305461884, + "rewards/rejected": -0.7878755927085876, + "rewards/safe_rewards": -0.617976725101471, + "rewards/unsafe_rewards": -0.6561679840087891, "step": 1770 }, { "epoch": 0.96, "learning_rate": 2.6800871918346846e-09, - "logits/chosen": -0.6116408109664917, - "logits/rejected": 0.5449002981185913, - "logps/chosen": -354.43841552734375, - "logps/rejected": -358.74676513671875, - "loss": 44591.675, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -0.7034182548522949, - "rewards/margins": 0.1897074282169342, - "rewards/rejected": -0.8931257128715515, - "rewards/safe_rewards": -0.7300769090652466, - "rewards/unsafe_rewards": -0.6767595410346985, + "logits/chosen": -2.220578670501709, + "logits/rejected": -2.04236102104187, + "logps/chosen": -274.6324768066406, + "logps/rejected": -260.865234375, + "loss": 11005.4078, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.6086142659187317, + "rewards/margins": 0.19890496134757996, + "rewards/rejected": -0.8075191378593445, + "rewards/safe_rewards": -0.6277071237564087, + "rewards/unsafe_rewards": -0.5895212888717651, "step": 1780 }, { "epoch": 0.96, "learning_rate": 2.0378121479783796e-09, - "logits/chosen": -0.2463545799255371, - "logits/rejected": 0.7831018567085266, - "logps/chosen": -349.6209716796875, - "logps/rejected": -358.6968994140625, - "loss": 46046.6531, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -0.7623225450515747, - "rewards/margins": 0.19337311387062073, - "rewards/rejected": -0.9556958079338074, - "rewards/safe_rewards": -0.7712682485580444, - "rewards/unsafe_rewards": -0.7533770799636841, + "logits/chosen": -2.132037401199341, + "logits/rejected": -1.972651481628418, + "logps/chosen": -265.41961669921875, + "logps/rejected": -256.274658203125, + "loss": 11459.243, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.6822707056999207, + "rewards/margins": 0.2049897462129593, + "rewards/rejected": -0.8872605562210083, + "rewards/safe_rewards": -0.6853871941566467, + "rewards/unsafe_rewards": -0.6791542768478394, "step": 1790 }, { "epoch": 0.97, "learning_rate": 1.4830757615760247e-09, - "logits/chosen": -0.3106337785720825, - "logits/rejected": 0.6603339910507202, - "logps/chosen": -363.07989501953125, - "logps/rejected": -356.5513610839844, - "loss": 45439.5531, + "logits/chosen": -2.1159682273864746, + "logits/rejected": -1.9551427364349365, + "logps/chosen": -281.0558166503906, + "logps/rejected": -260.3757629394531, + "loss": 11785.8336, "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -0.738674521446228, - "rewards/margins": 0.14763765037059784, - "rewards/rejected": -0.8863121271133423, - "rewards/safe_rewards": -0.7692455649375916, - "rewards/unsafe_rewards": -0.7081034779548645, + "rewards/chosen": -0.6576239466667175, + "rewards/margins": 0.15307986736297607, + "rewards/rejected": -0.8107039332389832, + "rewards/safe_rewards": -0.689802348613739, + "rewards/unsafe_rewards": -0.6254457235336304, + "step": 1800 + }, + { + "epoch": 0.97, + "eval_logits/chosen": -1.954970121383667, + "eval_logits/rejected": -1.7745355367660522, + "eval_logps/chosen": -224.6013946533203, + "eval_logps/rejected": -193.1151123046875, + "eval_loss": 4320.82080078125, + "eval_rewards/accuracies": 0.6027347445487976, + "eval_rewards/chosen": -0.9417080879211426, + "eval_rewards/margins": 0.06478659808635712, + "eval_rewards/rejected": -1.0064946413040161, + "eval_rewards/safe_rewards": -0.9369282722473145, + "eval_rewards/unsafe_rewards": -0.9372634291648865, + "eval_runtime": 993.1393, + "eval_samples_per_second": 33.272, + "eval_steps_per_second": 1.04, "step": 1800 }, { "epoch": 0.97, "learning_rate": 1.0160738724809548e-09, - "logits/chosen": -0.4888392984867096, - "logits/rejected": 0.7107800841331482, - "logps/chosen": -339.5579833984375, - "logps/rejected": -355.8209228515625, - "loss": 42139.625, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -0.7269004583358765, - "rewards/margins": 0.18830473721027374, - "rewards/rejected": -0.9152051210403442, - "rewards/safe_rewards": -0.7479559183120728, - "rewards/unsafe_rewards": -0.7058448791503906, + "logits/chosen": -2.1704697608947754, + "logits/rejected": -1.9731838703155518, + "logps/chosen": -259.1123046875, + "logps/rejected": -257.42254638671875, + "loss": 10364.2625, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.6492170691490173, + "rewards/margins": 0.19766607880592346, + "rewards/rejected": -0.8468831777572632, + "rewards/safe_rewards": -0.6583928465843201, + "rewards/unsafe_rewards": -0.6400412321090698, "step": 1810 }, { "epoch": 0.98, "learning_rate": 6.369713474366212e-10, - "logits/chosen": -0.38188737630844116, - "logits/rejected": 0.5465607643127441, - "logps/chosen": -380.78741455078125, - "logps/rejected": -392.10528564453125, - "loss": 40518.675, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -0.7784629464149475, - "rewards/margins": 0.19433815777301788, - "rewards/rejected": -0.9728010892868042, - "rewards/safe_rewards": -0.7767602205276489, - "rewards/unsafe_rewards": -0.7801656723022461, + "logits/chosen": -2.1475830078125, + "logits/rejected": -1.9969937801361084, + "logps/chosen": -294.9414367675781, + "logps/rejected": -288.337890625, + "loss": 10068.2195, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6980792880058289, + "rewards/margins": 0.2098824679851532, + "rewards/rejected": -0.9079617261886597, + "rewards/safe_rewards": -0.7020989656448364, + "rewards/unsafe_rewards": -0.6940596699714661, "step": 1820 }, { "epoch": 0.98, "learning_rate": 3.459020218731512e-10, - "logits/chosen": -0.42126020789146423, - "logits/rejected": 0.4062139093875885, - "logps/chosen": -337.8227844238281, - "logps/rejected": -343.28143310546875, - "loss": 41459.1625, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -0.7428838610649109, - "rewards/margins": 0.19188812375068665, - "rewards/rejected": -0.9347720146179199, - "rewards/safe_rewards": -0.7397066354751587, - "rewards/unsafe_rewards": -0.7460610270500183, + "logits/chosen": -2.1376261711120605, + "logits/rejected": -2.0157809257507324, + "logps/chosen": -254.5150604248047, + "logps/rejected": -243.1461181640625, + "loss": 10131.9508, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.6527233719825745, + "rewards/margins": 0.2155275046825409, + "rewards/rejected": -0.8682507276535034, + "rewards/safe_rewards": -0.6437439322471619, + "rewards/unsafe_rewards": -0.6617026329040527, "step": 1830 }, { "epoch": 0.99, "learning_rate": 1.429686526593088e-10, - "logits/chosen": -0.3659382462501526, - "logits/rejected": 0.4401054382324219, - "logps/chosen": -352.1277770996094, - "logps/rejected": -364.5936584472656, - "loss": 44849.6813, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -0.7432361245155334, - "rewards/margins": 0.17817668616771698, - "rewards/rejected": -0.9214128255844116, - "rewards/safe_rewards": -0.7562955021858215, - "rewards/unsafe_rewards": -0.7301768064498901, + "logits/chosen": -2.116459608078003, + "logits/rejected": -1.9903713464736938, + "logps/chosen": -270.08428955078125, + "logps/rejected": -265.38336181640625, + "loss": 11322.0812, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.6661794185638428, + "rewards/margins": 0.18438491225242615, + "rewards/rejected": -0.8505643606185913, + "rewards/safe_rewards": -0.6831308603286743, + "rewards/unsafe_rewards": -0.6492279767990112, "step": 1840 }, { "epoch": 1.0, "learning_rate": 2.824288182584622e-11, - "logits/chosen": -0.5492274165153503, - "logits/rejected": 0.5288087725639343, - "logps/chosen": -366.0335693359375, - "logps/rejected": -355.26727294921875, - "loss": 42273.875, - "rewards/accuracies": 0.625, - "rewards/chosen": -0.7461895942687988, - "rewards/margins": 0.15412096679210663, - "rewards/rejected": -0.9003106355667114, - "rewards/safe_rewards": -0.7434717416763306, - "rewards/unsafe_rewards": -0.7489073872566223, + "logits/chosen": -2.1975011825561523, + "logits/rejected": -2.020158290863037, + "logps/chosen": -284.1047668457031, + "logps/rejected": -259.551513671875, + "loss": 10472.5969, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.6722114682197571, + "rewards/margins": 0.17123940587043762, + "rewards/rejected": -0.8434508442878723, + "rewards/safe_rewards": -0.6627709865570068, + "rewards/unsafe_rewards": -0.6816519498825073, "step": 1850 }, { "epoch": 1.0, "step": 1858, "total_flos": 0.0, - "train_loss": 47137.349905812705, - "train_runtime": 22393.3393, - "train_samples_per_second": 2.656, - "train_steps_per_second": 0.083 + "train_loss": 11692.422362755651, + "train_runtime": 23508.9776, + "train_samples_per_second": 2.53, + "train_steps_per_second": 0.079 } ], "logging_steps": 10, "max_steps": 1858, "num_input_tokens_seen": 0, "num_train_epochs": 1, - "save_steps": 500, + "save_steps": 300, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null,