{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.999297541394882, "eval_steps": 400, "global_step": 5604, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002676032781401572, "grad_norm": 6.745415365180119, "learning_rate": 8.9126559714795e-09, "logits/chosen": -0.06497789919376373, "logits/rejected": 0.14137546718120575, "logps/chosen": -1.7163540124893188, "logps/rejected": -1.8895915746688843, "loss": 0.8353, "rewards/accuracies": 0.5625, "rewards/chosen": -1.7163540124893188, "rewards/margins": 0.17323757708072662, "rewards/rejected": -1.8895915746688843, "step": 5 }, { "epoch": 0.005352065562803144, "grad_norm": 9.615201310017493, "learning_rate": 1.7825311942959e-08, "logits/chosen": -0.0029649347998201847, "logits/rejected": 0.11484275758266449, "logps/chosen": -1.802668809890747, "logps/rejected": -1.8453788757324219, "loss": 0.9256, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.802668809890747, "rewards/margins": 0.04271003603935242, "rewards/rejected": -1.8453788757324219, "step": 10 }, { "epoch": 0.008028098344204716, "grad_norm": 9.995349138101655, "learning_rate": 2.67379679144385e-08, "logits/chosen": -0.03049122728407383, "logits/rejected": 0.06733934581279755, "logps/chosen": -1.6332365274429321, "logps/rejected": -1.7625558376312256, "loss": 0.9247, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.6332365274429321, "rewards/margins": 0.12931933999061584, "rewards/rejected": -1.7625558376312256, "step": 15 }, { "epoch": 0.010704131125606288, "grad_norm": 6.6761442021352755, "learning_rate": 3.5650623885918e-08, "logits/chosen": -0.036745935678482056, "logits/rejected": 0.04758107662200928, "logps/chosen": -1.7253265380859375, "logps/rejected": -1.806983232498169, "loss": 0.9429, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.7253265380859375, "rewards/margins": 0.08165649324655533, "rewards/rejected": -1.806983232498169, "step": 20 }, { "epoch": 0.013380163907007862, "grad_norm": 15.331653979847616, "learning_rate": 4.45632798573975e-08, "logits/chosen": -0.03747902438044548, "logits/rejected": 0.04722173884510994, "logps/chosen": -1.8687280416488647, "logps/rejected": -1.7781184911727905, "loss": 1.0807, "rewards/accuracies": 0.3812499940395355, "rewards/chosen": -1.8687280416488647, "rewards/margins": -0.09060955047607422, "rewards/rejected": -1.7781184911727905, "step": 25 }, { "epoch": 0.016056196688409432, "grad_norm": 8.44263683687928, "learning_rate": 5.3475935828877e-08, "logits/chosen": -0.0649641752243042, "logits/rejected": 0.027637016028165817, "logps/chosen": -1.9092857837677002, "logps/rejected": -1.8325525522232056, "loss": 0.982, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": -1.9092857837677002, "rewards/margins": -0.07673348486423492, "rewards/rejected": -1.8325525522232056, "step": 30 }, { "epoch": 0.018732229469811006, "grad_norm": 9.12122622830145, "learning_rate": 6.23885918003565e-08, "logits/chosen": -0.046943675726652145, "logits/rejected": 0.11221543699502945, "logps/chosen": -1.8440439701080322, "logps/rejected": -1.995361566543579, "loss": 0.9627, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.8440439701080322, "rewards/margins": 0.15131758153438568, "rewards/rejected": -1.995361566543579, "step": 35 }, { "epoch": 0.021408262251212576, "grad_norm": 8.204216269442778, "learning_rate": 7.1301247771836e-08, "logits/chosen": 0.04793083667755127, "logits/rejected": 0.22470280528068542, "logps/chosen": -1.8831361532211304, "logps/rejected": -1.7434349060058594, "loss": 1.0245, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -1.8831361532211304, "rewards/margins": -0.13970127701759338, "rewards/rejected": -1.7434349060058594, "step": 40 }, { "epoch": 0.02408429503261415, "grad_norm": 13.858265889791658, "learning_rate": 8.021390374331551e-08, "logits/chosen": 0.03839867562055588, "logits/rejected": 0.23907847702503204, "logps/chosen": -1.8365857601165771, "logps/rejected": -1.8719298839569092, "loss": 0.9824, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.8365857601165771, "rewards/margins": 0.03534409776329994, "rewards/rejected": -1.8719298839569092, "step": 45 }, { "epoch": 0.026760327814015723, "grad_norm": 11.359280491699518, "learning_rate": 8.9126559714795e-08, "logits/chosen": -0.03781601041555405, "logits/rejected": 0.11211410909891129, "logps/chosen": -1.8974231481552124, "logps/rejected": -1.7776927947998047, "loss": 1.0307, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.8974231481552124, "rewards/margins": -0.11973042786121368, "rewards/rejected": -1.7776927947998047, "step": 50 }, { "epoch": 0.029436360595417294, "grad_norm": 7.6241654165957895, "learning_rate": 9.80392156862745e-08, "logits/chosen": -0.09588184952735901, "logits/rejected": 0.12800191342830658, "logps/chosen": -1.8320366144180298, "logps/rejected": -1.8668434619903564, "loss": 0.99, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.8320366144180298, "rewards/margins": 0.034806858748197556, "rewards/rejected": -1.8668434619903564, "step": 55 }, { "epoch": 0.032112393376818864, "grad_norm": 7.470174607941018, "learning_rate": 1.06951871657754e-07, "logits/chosen": -0.08247846364974976, "logits/rejected": 0.10538563877344131, "logps/chosen": -1.7869726419448853, "logps/rejected": -1.892783761024475, "loss": 0.9022, "rewards/accuracies": 0.53125, "rewards/chosen": -1.7869726419448853, "rewards/margins": 0.10581111907958984, "rewards/rejected": -1.892783761024475, "step": 60 }, { "epoch": 0.03478842615822044, "grad_norm": 6.27372034250797, "learning_rate": 1.158645276292335e-07, "logits/chosen": -0.03359535336494446, "logits/rejected": 0.11176600307226181, "logps/chosen": -1.6362268924713135, "logps/rejected": -1.7655067443847656, "loss": 0.8758, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.6362268924713135, "rewards/margins": 0.12927986681461334, "rewards/rejected": -1.7655067443847656, "step": 65 }, { "epoch": 0.03746445893962201, "grad_norm": 11.618832155722101, "learning_rate": 1.24777183600713e-07, "logits/chosen": -0.06761655956506729, "logits/rejected": 0.08668618649244308, "logps/chosen": -1.7646713256835938, "logps/rejected": -1.8109506368637085, "loss": 0.9913, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": -1.7646713256835938, "rewards/margins": 0.04627937823534012, "rewards/rejected": -1.8109506368637085, "step": 70 }, { "epoch": 0.04014049172102358, "grad_norm": 12.401783006884278, "learning_rate": 1.3368983957219251e-07, "logits/chosen": -0.04526766017079353, "logits/rejected": 0.1374778002500534, "logps/chosen": -1.775294303894043, "logps/rejected": -2.0359389781951904, "loss": 0.8579, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.775294303894043, "rewards/margins": 0.2606446444988251, "rewards/rejected": -2.0359389781951904, "step": 75 }, { "epoch": 0.04281652450242515, "grad_norm": 8.36328738385819, "learning_rate": 1.42602495543672e-07, "logits/chosen": 0.007079380098730326, "logits/rejected": 0.11145032942295074, "logps/chosen": -1.7160499095916748, "logps/rejected": -1.749887466430664, "loss": 0.946, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.7160499095916748, "rewards/margins": 0.03383733332157135, "rewards/rejected": -1.749887466430664, "step": 80 }, { "epoch": 0.04549255728382673, "grad_norm": 6.013951417596108, "learning_rate": 1.5151515151515152e-07, "logits/chosen": -0.1325346678495407, "logits/rejected": 0.11916611343622208, "logps/chosen": -1.787327527999878, "logps/rejected": -1.9644066095352173, "loss": 0.9125, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.787327527999878, "rewards/margins": 0.17707891762256622, "rewards/rejected": -1.9644066095352173, "step": 85 }, { "epoch": 0.0481685900652283, "grad_norm": 14.957140274696403, "learning_rate": 1.6042780748663102e-07, "logits/chosen": 0.08482521772384644, "logits/rejected": 0.0444350466132164, "logps/chosen": -1.7432467937469482, "logps/rejected": -1.7715190649032593, "loss": 0.9867, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -1.7432467937469482, "rewards/margins": 0.028272151947021484, "rewards/rejected": -1.7715190649032593, "step": 90 }, { "epoch": 0.05084462284662987, "grad_norm": 5.563544661896379, "learning_rate": 1.693404634581105e-07, "logits/chosen": -0.06795939058065414, "logits/rejected": 0.08141766488552094, "logps/chosen": -1.7914842367172241, "logps/rejected": -1.901329755783081, "loss": 0.9315, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.7914842367172241, "rewards/margins": 0.10984551906585693, "rewards/rejected": -1.901329755783081, "step": 95 }, { "epoch": 0.05352065562803145, "grad_norm": 5.987202069980056, "learning_rate": 1.7825311942959e-07, "logits/chosen": -0.034299951046705246, "logits/rejected": 0.028665874153375626, "logps/chosen": -1.6831705570220947, "logps/rejected": -1.7905528545379639, "loss": 0.8938, "rewards/accuracies": 0.5, "rewards/chosen": -1.6831705570220947, "rewards/margins": 0.1073823943734169, "rewards/rejected": -1.7905528545379639, "step": 100 }, { "epoch": 0.05619668840943302, "grad_norm": 9.008088749121416, "learning_rate": 1.8716577540106952e-07, "logits/chosen": 0.04120921716094017, "logits/rejected": 0.06815817952156067, "logps/chosen": -1.6348825693130493, "logps/rejected": -1.8038127422332764, "loss": 0.8741, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.6348825693130493, "rewards/margins": 0.16893038153648376, "rewards/rejected": -1.8038127422332764, "step": 105 }, { "epoch": 0.05887272119083459, "grad_norm": 6.841612787738681, "learning_rate": 1.96078431372549e-07, "logits/chosen": 0.012015903368592262, "logits/rejected": 0.10734136402606964, "logps/chosen": -1.6701446771621704, "logps/rejected": -1.7274916172027588, "loss": 0.9474, "rewards/accuracies": 0.5, "rewards/chosen": -1.6701446771621704, "rewards/margins": 0.05734696984291077, "rewards/rejected": -1.7274916172027588, "step": 110 }, { "epoch": 0.06154875397223616, "grad_norm": 9.950350257870056, "learning_rate": 2.049910873440285e-07, "logits/chosen": 0.03455201908946037, "logits/rejected": 0.24307577311992645, "logps/chosen": -1.6569664478302002, "logps/rejected": -1.940863847732544, "loss": 0.8176, "rewards/accuracies": 0.625, "rewards/chosen": -1.6569664478302002, "rewards/margins": 0.28389739990234375, "rewards/rejected": -1.940863847732544, "step": 115 }, { "epoch": 0.06422478675363773, "grad_norm": 6.608491974353304, "learning_rate": 2.13903743315508e-07, "logits/chosen": -0.0682152807712555, "logits/rejected": 0.10647422075271606, "logps/chosen": -1.7350132465362549, "logps/rejected": -1.8560783863067627, "loss": 0.8973, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.7350132465362549, "rewards/margins": 0.12106498330831528, "rewards/rejected": -1.8560783863067627, "step": 120 }, { "epoch": 0.0669008195350393, "grad_norm": 7.136374333968139, "learning_rate": 2.2281639928698751e-07, "logits/chosen": -0.08536295592784882, "logits/rejected": 0.04411270469427109, "logps/chosen": -1.653207778930664, "logps/rejected": -1.6063206195831299, "loss": 0.9772, "rewards/accuracies": 0.5, "rewards/chosen": -1.653207778930664, "rewards/margins": -0.04688725620508194, "rewards/rejected": -1.6063206195831299, "step": 125 }, { "epoch": 0.06957685231644088, "grad_norm": 9.145341705027388, "learning_rate": 2.31729055258467e-07, "logits/chosen": 0.03893875330686569, "logits/rejected": 0.17409241199493408, "logps/chosen": -1.6900441646575928, "logps/rejected": -1.8113453388214111, "loss": 0.8563, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.6900441646575928, "rewards/margins": 0.12130121886730194, "rewards/rejected": -1.8113453388214111, "step": 130 }, { "epoch": 0.07225288509784245, "grad_norm": 15.961871635842657, "learning_rate": 2.406417112299465e-07, "logits/chosen": -0.04261577129364014, "logits/rejected": 0.07430346310138702, "logps/chosen": -1.7398273944854736, "logps/rejected": -1.760817289352417, "loss": 0.9574, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.7398273944854736, "rewards/margins": 0.020990237593650818, "rewards/rejected": -1.760817289352417, "step": 135 }, { "epoch": 0.07492891787924402, "grad_norm": 11.344247759870614, "learning_rate": 2.49554367201426e-07, "logits/chosen": -0.02744327485561371, "logits/rejected": 0.1395677775144577, "logps/chosen": -1.6990106105804443, "logps/rejected": -1.8429638147354126, "loss": 0.8678, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.6990106105804443, "rewards/margins": 0.14395304024219513, "rewards/rejected": -1.8429638147354126, "step": 140 }, { "epoch": 0.0776049506606456, "grad_norm": 10.743443563777921, "learning_rate": 2.5846702317290554e-07, "logits/chosen": -0.011493165977299213, "logits/rejected": 0.14371457695960999, "logps/chosen": -1.612004280090332, "logps/rejected": -1.7274078130722046, "loss": 0.8822, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -1.612004280090332, "rewards/margins": 0.11540355533361435, "rewards/rejected": -1.7274078130722046, "step": 145 }, { "epoch": 0.08028098344204716, "grad_norm": 11.428121428161498, "learning_rate": 2.6737967914438503e-07, "logits/chosen": -0.05299872159957886, "logits/rejected": 0.10899674892425537, "logps/chosen": -1.5665837526321411, "logps/rejected": -1.56602942943573, "loss": 0.9565, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.5665837526321411, "rewards/margins": -0.0005542755243368447, "rewards/rejected": -1.56602942943573, "step": 150 }, { "epoch": 0.08295701622344874, "grad_norm": 9.573352612884204, "learning_rate": 2.762923351158645e-07, "logits/chosen": -0.05986739322543144, "logits/rejected": -0.015056168660521507, "logps/chosen": -1.5833594799041748, "logps/rejected": -1.676007866859436, "loss": 0.8945, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.5833594799041748, "rewards/margins": 0.09264858812093735, "rewards/rejected": -1.676007866859436, "step": 155 }, { "epoch": 0.0856330490048503, "grad_norm": 8.18790309095409, "learning_rate": 2.85204991087344e-07, "logits/chosen": -0.15015223622322083, "logits/rejected": -0.009835200384259224, "logps/chosen": -1.7044143676757812, "logps/rejected": -1.6820611953735352, "loss": 0.9818, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.7044143676757812, "rewards/margins": -0.022353051230311394, "rewards/rejected": -1.6820611953735352, "step": 160 }, { "epoch": 0.08830908178625188, "grad_norm": 8.354926142076947, "learning_rate": 2.941176470588235e-07, "logits/chosen": -0.05939929559826851, "logits/rejected": 0.11361409723758698, "logps/chosen": -1.5418336391448975, "logps/rejected": -1.6844842433929443, "loss": 0.894, "rewards/accuracies": 0.53125, "rewards/chosen": -1.5418336391448975, "rewards/margins": 0.14265072345733643, "rewards/rejected": -1.6844842433929443, "step": 165 }, { "epoch": 0.09098511456765346, "grad_norm": 13.994861835179528, "learning_rate": 3.0303030303030305e-07, "logits/chosen": -0.0878124088048935, "logits/rejected": -0.034666262567043304, "logps/chosen": -1.691471815109253, "logps/rejected": -1.7404849529266357, "loss": 0.935, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.691471815109253, "rewards/margins": 0.049013249576091766, "rewards/rejected": -1.7404849529266357, "step": 170 }, { "epoch": 0.09366114734905502, "grad_norm": 10.560600420217263, "learning_rate": 3.1194295900178254e-07, "logits/chosen": 0.02487664297223091, "logits/rejected": 0.02249312400817871, "logps/chosen": -1.5649936199188232, "logps/rejected": -1.6665769815444946, "loss": 0.9257, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.5649936199188232, "rewards/margins": 0.10158351808786392, "rewards/rejected": -1.6665769815444946, "step": 175 }, { "epoch": 0.0963371801304566, "grad_norm": 8.844047479421752, "learning_rate": 3.2085561497326203e-07, "logits/chosen": -0.008646877482533455, "logits/rejected": -0.010379401035606861, "logps/chosen": -1.582965612411499, "logps/rejected": -1.7397037744522095, "loss": 0.9043, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.582965612411499, "rewards/margins": 0.1567383110523224, "rewards/rejected": -1.7397037744522095, "step": 180 }, { "epoch": 0.09901321291185818, "grad_norm": 8.561649821918001, "learning_rate": 3.297682709447415e-07, "logits/chosen": -0.13111087679862976, "logits/rejected": -0.04188833758234978, "logps/chosen": -1.540212869644165, "logps/rejected": -1.608209252357483, "loss": 0.9456, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.540212869644165, "rewards/margins": 0.06799633800983429, "rewards/rejected": -1.608209252357483, "step": 185 }, { "epoch": 0.10168924569325974, "grad_norm": 9.356147620244778, "learning_rate": 3.38680926916221e-07, "logits/chosen": -0.05670984461903572, "logits/rejected": 0.06327557563781738, "logps/chosen": -1.5478922128677368, "logps/rejected": -1.6548068523406982, "loss": 0.88, "rewards/accuracies": 0.5, "rewards/chosen": -1.5478922128677368, "rewards/margins": 0.10691466182470322, "rewards/rejected": -1.6548068523406982, "step": 190 }, { "epoch": 0.10436527847466132, "grad_norm": 6.391171751809567, "learning_rate": 3.475935828877005e-07, "logits/chosen": 0.016581177711486816, "logits/rejected": 0.1721985936164856, "logps/chosen": -1.3983471393585205, "logps/rejected": -1.5581902265548706, "loss": 0.8609, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.3983471393585205, "rewards/margins": 0.15984311699867249, "rewards/rejected": -1.5581902265548706, "step": 195 }, { "epoch": 0.1070413112560629, "grad_norm": 14.421807873262015, "learning_rate": 3.5650623885918e-07, "logits/chosen": -0.07179798930883408, "logits/rejected": 0.06579109281301498, "logps/chosen": -1.5170024633407593, "logps/rejected": -1.5453741550445557, "loss": 0.9291, "rewards/accuracies": 0.5625, "rewards/chosen": -1.5170024633407593, "rewards/margins": 0.028371721506118774, "rewards/rejected": -1.5453741550445557, "step": 200 }, { "epoch": 0.10971734403746446, "grad_norm": 14.733862703707466, "learning_rate": 3.654188948306595e-07, "logits/chosen": -0.06070606783032417, "logits/rejected": 0.08200714737176895, "logps/chosen": -1.4389501810073853, "logps/rejected": -1.498337745666504, "loss": 0.9223, "rewards/accuracies": 0.53125, "rewards/chosen": -1.4389501810073853, "rewards/margins": 0.05938757583498955, "rewards/rejected": -1.498337745666504, "step": 205 }, { "epoch": 0.11239337681886603, "grad_norm": 12.125579904380258, "learning_rate": 3.7433155080213904e-07, "logits/chosen": -0.15393272042274475, "logits/rejected": 0.034471504390239716, "logps/chosen": -1.501002311706543, "logps/rejected": -1.6698964834213257, "loss": 0.8633, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.501002311706543, "rewards/margins": 0.16889388859272003, "rewards/rejected": -1.6698964834213257, "step": 210 }, { "epoch": 0.1150694096002676, "grad_norm": 8.000455624194773, "learning_rate": 3.8324420677361853e-07, "logits/chosen": -0.18284690380096436, "logits/rejected": 0.06020547077059746, "logps/chosen": -1.503688097000122, "logps/rejected": -1.5946009159088135, "loss": 0.86, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.503688097000122, "rewards/margins": 0.09091290086507797, "rewards/rejected": -1.5946009159088135, "step": 215 }, { "epoch": 0.11774544238166917, "grad_norm": 18.195259494969463, "learning_rate": 3.92156862745098e-07, "logits/chosen": 0.04820474982261658, "logits/rejected": 0.14560113847255707, "logps/chosen": -1.4830628633499146, "logps/rejected": -1.6915302276611328, "loss": 0.8318, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.4830628633499146, "rewards/margins": 0.20846720039844513, "rewards/rejected": -1.6915302276611328, "step": 220 }, { "epoch": 0.12042147516307075, "grad_norm": 6.811881564803529, "learning_rate": 4.010695187165775e-07, "logits/chosen": -0.10213228315114975, "logits/rejected": 0.06591366231441498, "logps/chosen": -1.466652274131775, "logps/rejected": -1.615389108657837, "loss": 0.8412, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.466652274131775, "rewards/margins": 0.1487365961074829, "rewards/rejected": -1.615389108657837, "step": 225 }, { "epoch": 0.12309750794447231, "grad_norm": 6.596030737152498, "learning_rate": 4.09982174688057e-07, "logits/chosen": -0.02046014554798603, "logits/rejected": 0.05466270446777344, "logps/chosen": -1.5274227857589722, "logps/rejected": -1.6979058980941772, "loss": 0.8675, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.5274227857589722, "rewards/margins": 0.17048311233520508, "rewards/rejected": -1.6979058980941772, "step": 230 }, { "epoch": 0.1257735407258739, "grad_norm": 11.575850568398238, "learning_rate": 4.188948306595365e-07, "logits/chosen": 0.005199754144996405, "logits/rejected": 0.13961032032966614, "logps/chosen": -1.4755836725234985, "logps/rejected": -1.6504920721054077, "loss": 0.8252, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.4755836725234985, "rewards/margins": 0.1749085932970047, "rewards/rejected": -1.6504920721054077, "step": 235 }, { "epoch": 0.12844957350727546, "grad_norm": 7.650414545747714, "learning_rate": 4.27807486631016e-07, "logits/chosen": -0.031107615679502487, "logits/rejected": 0.09087537229061127, "logps/chosen": -1.4849299192428589, "logps/rejected": -1.6810789108276367, "loss": 0.8558, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.4849299192428589, "rewards/margins": 0.19614894688129425, "rewards/rejected": -1.6810789108276367, "step": 240 }, { "epoch": 0.13112560628867703, "grad_norm": 7.827859201306663, "learning_rate": 4.3672014260249554e-07, "logits/chosen": 0.03517443686723709, "logits/rejected": 0.1547480672597885, "logps/chosen": -1.5838836431503296, "logps/rejected": -1.6754264831542969, "loss": 0.8814, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.5838836431503296, "rewards/margins": 0.09154288470745087, "rewards/rejected": -1.6754264831542969, "step": 245 }, { "epoch": 0.1338016390700786, "grad_norm": 14.633103475628868, "learning_rate": 4.4563279857397503e-07, "logits/chosen": -0.05150257423520088, "logits/rejected": 0.11110256612300873, "logps/chosen": -1.580965280532837, "logps/rejected": -1.6479336023330688, "loss": 0.9333, "rewards/accuracies": 0.5, "rewards/chosen": -1.580965280532837, "rewards/margins": 0.06696829944849014, "rewards/rejected": -1.6479336023330688, "step": 250 }, { "epoch": 0.1364776718514802, "grad_norm": 8.998250476277446, "learning_rate": 4.545454545454545e-07, "logits/chosen": -0.02708597108721733, "logits/rejected": 0.11728671938180923, "logps/chosen": -1.4529967308044434, "logps/rejected": -1.622768759727478, "loss": 0.8482, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.4529967308044434, "rewards/margins": 0.16977210342884064, "rewards/rejected": -1.622768759727478, "step": 255 }, { "epoch": 0.13915370463288176, "grad_norm": 7.288196613116792, "learning_rate": 4.63458110516934e-07, "logits/chosen": -0.21352967619895935, "logits/rejected": -0.10617400705814362, "logps/chosen": -1.6058521270751953, "logps/rejected": -1.7349302768707275, "loss": 0.8312, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.6058521270751953, "rewards/margins": 0.12907829880714417, "rewards/rejected": -1.7349302768707275, "step": 260 }, { "epoch": 0.1418297374142833, "grad_norm": 16.00167073243763, "learning_rate": 4.723707664884135e-07, "logits/chosen": -0.08325710892677307, "logits/rejected": 0.0008392162853851914, "logps/chosen": -1.6183125972747803, "logps/rejected": -1.7546857595443726, "loss": 0.8793, "rewards/accuracies": 0.53125, "rewards/chosen": -1.6183125972747803, "rewards/margins": 0.13637325167655945, "rewards/rejected": -1.7546857595443726, "step": 265 }, { "epoch": 0.1445057701956849, "grad_norm": 6.589955036402904, "learning_rate": 4.81283422459893e-07, "logits/chosen": -0.08158577978610992, "logits/rejected": 0.051729291677474976, "logps/chosen": -1.513541340827942, "logps/rejected": -1.6356121301651, "loss": 0.8553, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.513541340827942, "rewards/margins": 0.12207069247961044, "rewards/rejected": -1.6356121301651, "step": 270 }, { "epoch": 0.14718180297708647, "grad_norm": 8.581853336747658, "learning_rate": 4.901960784313725e-07, "logits/chosen": -0.01784585788846016, "logits/rejected": 0.07567474246025085, "logps/chosen": -1.4707576036453247, "logps/rejected": -1.6973224878311157, "loss": 0.8477, "rewards/accuracies": 0.625, "rewards/chosen": -1.4707576036453247, "rewards/margins": 0.22656476497650146, "rewards/rejected": -1.6973224878311157, "step": 275 }, { "epoch": 0.14985783575848804, "grad_norm": 16.078729232614222, "learning_rate": 4.99108734402852e-07, "logits/chosen": -0.09840095788240433, "logits/rejected": 0.061822064220905304, "logps/chosen": -1.5649346113204956, "logps/rejected": -1.6863704919815063, "loss": 0.8653, "rewards/accuracies": 0.53125, "rewards/chosen": -1.5649346113204956, "rewards/margins": 0.12143599987030029, "rewards/rejected": -1.6863704919815063, "step": 280 }, { "epoch": 0.15253386853988962, "grad_norm": 7.882917318383516, "learning_rate": 5.080213903743315e-07, "logits/chosen": -0.08384941518306732, "logits/rejected": 0.05558781698346138, "logps/chosen": -1.5443254709243774, "logps/rejected": -1.6616817712783813, "loss": 0.895, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.5443254709243774, "rewards/margins": 0.11735613644123077, "rewards/rejected": -1.6616817712783813, "step": 285 }, { "epoch": 0.1552099013212912, "grad_norm": 8.520267864479807, "learning_rate": 5.169340463458111e-07, "logits/chosen": -0.1282961070537567, "logits/rejected": 0.16424764692783356, "logps/chosen": -1.5298783779144287, "logps/rejected": -1.7103841304779053, "loss": 0.812, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.5298783779144287, "rewards/margins": 0.18050578236579895, "rewards/rejected": -1.7103841304779053, "step": 290 }, { "epoch": 0.15788593410269275, "grad_norm": 11.825997652889331, "learning_rate": 5.258467023172905e-07, "logits/chosen": -0.05358947068452835, "logits/rejected": 0.002478866372257471, "logps/chosen": -1.4826228618621826, "logps/rejected": -1.6104042530059814, "loss": 0.8515, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.4826228618621826, "rewards/margins": 0.12778130173683167, "rewards/rejected": -1.6104042530059814, "step": 295 }, { "epoch": 0.16056196688409433, "grad_norm": 9.566997266552656, "learning_rate": 5.347593582887701e-07, "logits/chosen": -0.06900415569543839, "logits/rejected": 0.1003919392824173, "logps/chosen": -1.533227801322937, "logps/rejected": -1.646733045578003, "loss": 0.8729, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.533227801322937, "rewards/margins": 0.11350531876087189, "rewards/rejected": -1.646733045578003, "step": 300 }, { "epoch": 0.1632379996654959, "grad_norm": 7.282248499499289, "learning_rate": 5.436720142602496e-07, "logits/chosen": -0.035725586116313934, "logits/rejected": 0.0352834016084671, "logps/chosen": -1.637495994567871, "logps/rejected": -1.6349891424179077, "loss": 0.9238, "rewards/accuracies": 0.46875, "rewards/chosen": -1.637495994567871, "rewards/margins": -0.002506789518520236, "rewards/rejected": -1.6349891424179077, "step": 305 }, { "epoch": 0.16591403244689748, "grad_norm": 10.294285889158306, "learning_rate": 5.52584670231729e-07, "logits/chosen": -0.19744086265563965, "logits/rejected": -0.10811547189950943, "logps/chosen": -1.6056547164916992, "logps/rejected": -1.7236793041229248, "loss": 0.8796, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -1.6056547164916992, "rewards/margins": 0.11802470684051514, "rewards/rejected": -1.7236793041229248, "step": 310 }, { "epoch": 0.16859006522829906, "grad_norm": 12.270620433353177, "learning_rate": 5.614973262032086e-07, "logits/chosen": -0.007066287100315094, "logits/rejected": 0.15328717231750488, "logps/chosen": -1.6061559915542603, "logps/rejected": -1.7938963174819946, "loss": 0.8455, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.6061559915542603, "rewards/margins": 0.18774044513702393, "rewards/rejected": -1.7938963174819946, "step": 315 }, { "epoch": 0.1712660980097006, "grad_norm": 6.836304265434531, "learning_rate": 5.70409982174688e-07, "logits/chosen": -0.07597661018371582, "logits/rejected": 0.05206872150301933, "logps/chosen": -1.5353093147277832, "logps/rejected": -1.5962353944778442, "loss": 0.8877, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.5353093147277832, "rewards/margins": 0.060926176607608795, "rewards/rejected": -1.5962353944778442, "step": 320 }, { "epoch": 0.17394213079110218, "grad_norm": 10.173763944456734, "learning_rate": 5.793226381461676e-07, "logits/chosen": -0.12505470216274261, "logits/rejected": -0.008930942043662071, "logps/chosen": -1.56186842918396, "logps/rejected": -1.869795799255371, "loss": 0.8036, "rewards/accuracies": 0.5625, "rewards/chosen": -1.56186842918396, "rewards/margins": 0.3079273998737335, "rewards/rejected": -1.869795799255371, "step": 325 }, { "epoch": 0.17661816357250376, "grad_norm": 12.766262742563864, "learning_rate": 5.88235294117647e-07, "logits/chosen": -0.027056461200118065, "logits/rejected": 0.12089814990758896, "logps/chosen": -1.5520985126495361, "logps/rejected": -1.8318803310394287, "loss": 0.7881, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.5520985126495361, "rewards/margins": 0.2797819972038269, "rewards/rejected": -1.8318803310394287, "step": 330 }, { "epoch": 0.17929419635390534, "grad_norm": 19.720551633091002, "learning_rate": 5.971479500891266e-07, "logits/chosen": 0.01687520742416382, "logits/rejected": 0.12172763049602509, "logps/chosen": -1.6105413436889648, "logps/rejected": -1.6831810474395752, "loss": 0.8792, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.6105413436889648, "rewards/margins": 0.07263951748609543, "rewards/rejected": -1.6831810474395752, "step": 335 }, { "epoch": 0.18197022913530692, "grad_norm": 14.93909552240222, "learning_rate": 6.060606060606061e-07, "logits/chosen": -0.05352864786982536, "logits/rejected": 0.08943343907594681, "logps/chosen": -1.688166618347168, "logps/rejected": -1.7949199676513672, "loss": 0.9022, "rewards/accuracies": 0.5625, "rewards/chosen": -1.688166618347168, "rewards/margins": 0.10675332695245743, "rewards/rejected": -1.7949199676513672, "step": 340 }, { "epoch": 0.1846462619167085, "grad_norm": 12.950805455193036, "learning_rate": 6.149732620320855e-07, "logits/chosen": 0.031132811680436134, "logits/rejected": 0.05951271206140518, "logps/chosen": -1.585524320602417, "logps/rejected": -1.7751219272613525, "loss": 0.849, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.585524320602417, "rewards/margins": 0.18959780037403107, "rewards/rejected": -1.7751219272613525, "step": 345 }, { "epoch": 0.18732229469811004, "grad_norm": 11.3766986018264, "learning_rate": 6.238859180035651e-07, "logits/chosen": -0.010494798421859741, "logits/rejected": 0.07931246608495712, "logps/chosen": -1.538698434829712, "logps/rejected": -1.6715770959854126, "loss": 0.8801, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.538698434829712, "rewards/margins": 0.13287845253944397, "rewards/rejected": -1.6715770959854126, "step": 350 }, { "epoch": 0.18999832747951162, "grad_norm": 10.235427849578214, "learning_rate": 6.327985739750445e-07, "logits/chosen": -0.07612549513578415, "logits/rejected": 0.14606355130672455, "logps/chosen": -1.6175472736358643, "logps/rejected": -1.694308876991272, "loss": 0.8867, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.6175472736358643, "rewards/margins": 0.07676161825656891, "rewards/rejected": -1.694308876991272, "step": 355 }, { "epoch": 0.1926743602609132, "grad_norm": 9.446569865132822, "learning_rate": 6.417112299465241e-07, "logits/chosen": -0.062459319829940796, "logits/rejected": 0.015417033806443214, "logps/chosen": -1.5924055576324463, "logps/rejected": -1.762820839881897, "loss": 0.867, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.5924055576324463, "rewards/margins": 0.17041513323783875, "rewards/rejected": -1.762820839881897, "step": 360 }, { "epoch": 0.19535039304231477, "grad_norm": 15.008815831906713, "learning_rate": 6.506238859180035e-07, "logits/chosen": 0.0033310563303530216, "logits/rejected": 0.08453454077243805, "logps/chosen": -1.555394172668457, "logps/rejected": -1.6433461904525757, "loss": 0.8882, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.555394172668457, "rewards/margins": 0.08795187622308731, "rewards/rejected": -1.6433461904525757, "step": 365 }, { "epoch": 0.19802642582371635, "grad_norm": 12.285174367078287, "learning_rate": 6.59536541889483e-07, "logits/chosen": -0.033322982490062714, "logits/rejected": 0.05640743300318718, "logps/chosen": -1.5363000631332397, "logps/rejected": -1.622886061668396, "loss": 0.904, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.5363000631332397, "rewards/margins": 0.0865858644247055, "rewards/rejected": -1.622886061668396, "step": 370 }, { "epoch": 0.2007024586051179, "grad_norm": 11.335162886257029, "learning_rate": 6.684491978609626e-07, "logits/chosen": -0.06213964894413948, "logits/rejected": 0.08812294900417328, "logps/chosen": -1.5501368045806885, "logps/rejected": -1.7793262004852295, "loss": 0.8208, "rewards/accuracies": 0.5625, "rewards/chosen": -1.5501368045806885, "rewards/margins": 0.22918951511383057, "rewards/rejected": -1.7793262004852295, "step": 375 }, { "epoch": 0.20337849138651948, "grad_norm": 9.625852617765176, "learning_rate": 6.77361853832442e-07, "logits/chosen": -0.028366774320602417, "logits/rejected": 0.05531026050448418, "logps/chosen": -1.63349187374115, "logps/rejected": -1.845192313194275, "loss": 0.7948, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.63349187374115, "rewards/margins": 0.21170052886009216, "rewards/rejected": -1.845192313194275, "step": 380 }, { "epoch": 0.20605452416792105, "grad_norm": 5.5790435517200425, "learning_rate": 6.862745098039216e-07, "logits/chosen": -0.00018651635036803782, "logits/rejected": 0.07417208701372147, "logps/chosen": -1.6837339401245117, "logps/rejected": -1.747012734413147, "loss": 0.8742, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.6837339401245117, "rewards/margins": 0.06327863037586212, "rewards/rejected": -1.747012734413147, "step": 385 }, { "epoch": 0.20873055694932263, "grad_norm": 10.890392999334175, "learning_rate": 6.95187165775401e-07, "logits/chosen": 0.06456553190946579, "logits/rejected": 0.2283523976802826, "logps/chosen": -1.710921049118042, "logps/rejected": -1.8671058416366577, "loss": 0.8386, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.710921049118042, "rewards/margins": 0.15618488192558289, "rewards/rejected": -1.8671058416366577, "step": 390 }, { "epoch": 0.2114065897307242, "grad_norm": 10.011039988389093, "learning_rate": 7.040998217468806e-07, "logits/chosen": -0.0369555726647377, "logits/rejected": 0.12461593002080917, "logps/chosen": -1.6567661762237549, "logps/rejected": -1.7613284587860107, "loss": 0.8524, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.6567661762237549, "rewards/margins": 0.10456228256225586, "rewards/rejected": -1.7613284587860107, "step": 395 }, { "epoch": 0.2140826225121258, "grad_norm": 9.744330529857526, "learning_rate": 7.1301247771836e-07, "logits/chosen": 0.07047872245311737, "logits/rejected": 0.16159841418266296, "logps/chosen": -1.7146040201187134, "logps/rejected": -1.8575248718261719, "loss": 0.8222, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.7146040201187134, "rewards/margins": 0.14292074739933014, "rewards/rejected": -1.8575248718261719, "step": 400 }, { "epoch": 0.2140826225121258, "eval_logits/chosen": 0.2893581986427307, "eval_logits/rejected": 0.3776567578315735, "eval_logps/chosen": -1.711828589439392, "eval_logps/rejected": -1.9086363315582275, "eval_loss": 0.828525960445404, "eval_rewards/accuracies": 0.5534124374389648, "eval_rewards/chosen": -1.711828589439392, "eval_rewards/margins": 0.19680754840373993, "eval_rewards/rejected": -1.9086363315582275, "eval_runtime": 41.7006, "eval_samples_per_second": 32.254, "eval_steps_per_second": 8.081, "step": 400 }, { "epoch": 0.21675865529352734, "grad_norm": 8.867775009960125, "learning_rate": 7.219251336898395e-07, "logits/chosen": 0.010755693539977074, "logits/rejected": 0.10263502597808838, "logps/chosen": -1.7386600971221924, "logps/rejected": -1.9027583599090576, "loss": 0.8737, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -1.7386600971221924, "rewards/margins": 0.16409820318222046, "rewards/rejected": -1.9027583599090576, "step": 405 }, { "epoch": 0.2194346880749289, "grad_norm": 11.344262110986445, "learning_rate": 7.30837789661319e-07, "logits/chosen": 0.04614900425076485, "logits/rejected": 0.17397210001945496, "logps/chosen": -1.6706117391586304, "logps/rejected": -1.8571571111679077, "loss": 0.8325, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.6706117391586304, "rewards/margins": 0.18654537200927734, "rewards/rejected": -1.8571571111679077, "step": 410 }, { "epoch": 0.2221107208563305, "grad_norm": 8.588464866170263, "learning_rate": 7.397504456327985e-07, "logits/chosen": 0.04392815753817558, "logits/rejected": 0.08737512677907944, "logps/chosen": -1.6802873611450195, "logps/rejected": -1.8546464443206787, "loss": 0.8327, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.6802873611450195, "rewards/margins": 0.17435915768146515, "rewards/rejected": -1.8546464443206787, "step": 415 }, { "epoch": 0.22478675363773207, "grad_norm": 10.562846701493706, "learning_rate": 7.486631016042781e-07, "logits/chosen": 0.011984582059085369, "logits/rejected": 0.20924067497253418, "logps/chosen": -1.5698673725128174, "logps/rejected": -1.7264931201934814, "loss": 0.855, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.5698673725128174, "rewards/margins": 0.15662574768066406, "rewards/rejected": -1.7264931201934814, "step": 420 }, { "epoch": 0.22746278641913364, "grad_norm": 9.781607125913197, "learning_rate": 7.575757575757575e-07, "logits/chosen": -0.022232074290513992, "logits/rejected": 0.1726207286119461, "logps/chosen": -1.6340458393096924, "logps/rejected": -1.9014164209365845, "loss": 0.778, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.6340458393096924, "rewards/margins": 0.26737073063850403, "rewards/rejected": -1.9014164209365845, "step": 425 }, { "epoch": 0.2301388192005352, "grad_norm": 8.947119827808859, "learning_rate": 7.664884135472371e-07, "logits/chosen": -0.05679629370570183, "logits/rejected": 0.1442783772945404, "logps/chosen": -1.6273634433746338, "logps/rejected": -1.9590866565704346, "loss": 0.7863, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.6273634433746338, "rewards/margins": 0.33172309398651123, "rewards/rejected": -1.9590866565704346, "step": 430 }, { "epoch": 0.23281485198193677, "grad_norm": 21.023805952716458, "learning_rate": 7.754010695187165e-07, "logits/chosen": 0.027353938668966293, "logits/rejected": 0.11465086787939072, "logps/chosen": -1.577866792678833, "logps/rejected": -1.7316259145736694, "loss": 0.8269, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.577866792678833, "rewards/margins": 0.15375933051109314, "rewards/rejected": -1.7316259145736694, "step": 435 }, { "epoch": 0.23549088476333835, "grad_norm": 12.40447879409035, "learning_rate": 7.84313725490196e-07, "logits/chosen": 0.015877995640039444, "logits/rejected": 0.10901105403900146, "logps/chosen": -1.6282848119735718, "logps/rejected": -1.833214521408081, "loss": 0.8097, "rewards/accuracies": 0.5625, "rewards/chosen": -1.6282848119735718, "rewards/margins": 0.20492970943450928, "rewards/rejected": -1.833214521408081, "step": 440 }, { "epoch": 0.23816691754473993, "grad_norm": 12.133729943006117, "learning_rate": 7.932263814616755e-07, "logits/chosen": -0.04150679334998131, "logits/rejected": 0.0644896999001503, "logps/chosen": -1.6798919439315796, "logps/rejected": -1.9496148824691772, "loss": 0.8065, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.6798919439315796, "rewards/margins": 0.26972320675849915, "rewards/rejected": -1.9496148824691772, "step": 445 }, { "epoch": 0.2408429503261415, "grad_norm": 12.220240461644998, "learning_rate": 8.02139037433155e-07, "logits/chosen": 0.014851631596684456, "logits/rejected": 0.13735055923461914, "logps/chosen": -1.7085788249969482, "logps/rejected": -1.9029312133789062, "loss": 0.7889, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.7085788249969482, "rewards/margins": 0.1943524330854416, "rewards/rejected": -1.9029312133789062, "step": 450 }, { "epoch": 0.24351898310754308, "grad_norm": 14.180161160869961, "learning_rate": 8.110516934046346e-07, "logits/chosen": 0.012332240119576454, "logits/rejected": 0.09464090317487717, "logps/chosen": -1.6334879398345947, "logps/rejected": -1.9320026636123657, "loss": 0.7636, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.6334879398345947, "rewards/margins": 0.2985147535800934, "rewards/rejected": -1.9320026636123657, "step": 455 }, { "epoch": 0.24619501588894463, "grad_norm": 9.840708225301016, "learning_rate": 8.19964349376114e-07, "logits/chosen": -0.10136841237545013, "logits/rejected": 0.01598675176501274, "logps/chosen": -1.75235116481781, "logps/rejected": -1.8934476375579834, "loss": 0.8255, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.75235116481781, "rewards/margins": 0.14109662175178528, "rewards/rejected": -1.8934476375579834, "step": 460 }, { "epoch": 0.2488710486703462, "grad_norm": 11.938808338129723, "learning_rate": 8.288770053475936e-07, "logits/chosen": 0.11536283791065216, "logits/rejected": 0.12867453694343567, "logps/chosen": -1.7553989887237549, "logps/rejected": -1.9721330404281616, "loss": 0.82, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.7553989887237549, "rewards/margins": 0.2167343646287918, "rewards/rejected": -1.9721330404281616, "step": 465 }, { "epoch": 0.2515470814517478, "grad_norm": 8.588156036293805, "learning_rate": 8.37789661319073e-07, "logits/chosen": 0.1666308045387268, "logits/rejected": 0.1229475885629654, "logps/chosen": -1.704858422279358, "logps/rejected": -1.9202430248260498, "loss": 0.8206, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.704858422279358, "rewards/margins": 0.21538472175598145, "rewards/rejected": -1.9202430248260498, "step": 470 }, { "epoch": 0.25422311423314936, "grad_norm": 8.078163380221401, "learning_rate": 8.467023172905525e-07, "logits/chosen": -0.03521030768752098, "logits/rejected": 0.0967731699347496, "logps/chosen": -1.686549425125122, "logps/rejected": -2.1026577949523926, "loss": 0.737, "rewards/accuracies": 0.65625, "rewards/chosen": -1.686549425125122, "rewards/margins": 0.41610804200172424, "rewards/rejected": -2.1026577949523926, "step": 475 }, { "epoch": 0.2568991470145509, "grad_norm": 13.593380625128253, "learning_rate": 8.55614973262032e-07, "logits/chosen": -0.007721737027168274, "logits/rejected": 0.18299898505210876, "logps/chosen": -1.7134068012237549, "logps/rejected": -1.9060900211334229, "loss": 0.804, "rewards/accuracies": 0.5625, "rewards/chosen": -1.7134068012237549, "rewards/margins": 0.1926833689212799, "rewards/rejected": -1.9060900211334229, "step": 480 }, { "epoch": 0.2595751797959525, "grad_norm": 14.607030492124009, "learning_rate": 8.645276292335115e-07, "logits/chosen": 0.008199826814234257, "logits/rejected": 0.04754864424467087, "logps/chosen": -1.9009838104248047, "logps/rejected": -2.054046154022217, "loss": 0.8349, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.9009838104248047, "rewards/margins": 0.15306249260902405, "rewards/rejected": -2.054046154022217, "step": 485 }, { "epoch": 0.26225121257735406, "grad_norm": 9.77041726026761, "learning_rate": 8.734402852049911e-07, "logits/chosen": 0.03227580338716507, "logits/rejected": 0.09891127049922943, "logps/chosen": -1.8824619054794312, "logps/rejected": -2.027503490447998, "loss": 0.8337, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.8824619054794312, "rewards/margins": 0.14504151046276093, "rewards/rejected": -2.027503490447998, "step": 490 }, { "epoch": 0.26492724535875567, "grad_norm": 12.427280155537265, "learning_rate": 8.823529411764705e-07, "logits/chosen": 0.009776165708899498, "logits/rejected": 0.032349422574043274, "logps/chosen": -1.9015257358551025, "logps/rejected": -2.0345866680145264, "loss": 0.8232, "rewards/accuracies": 0.5, "rewards/chosen": -1.9015257358551025, "rewards/margins": 0.1330607384443283, "rewards/rejected": -2.0345866680145264, "step": 495 }, { "epoch": 0.2676032781401572, "grad_norm": 10.928833021547387, "learning_rate": 8.912655971479501e-07, "logits/chosen": 0.012938466854393482, "logits/rejected": 0.10318900644779205, "logps/chosen": -1.846160888671875, "logps/rejected": -2.087414503097534, "loss": 0.7887, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.846160888671875, "rewards/margins": 0.24125385284423828, "rewards/rejected": -2.087414503097534, "step": 500 }, { "epoch": 0.27027931092155877, "grad_norm": 11.233383604935481, "learning_rate": 9.001782531194295e-07, "logits/chosen": 0.00709199532866478, "logits/rejected": 0.1446431577205658, "logps/chosen": -1.9040355682373047, "logps/rejected": -2.0653538703918457, "loss": 0.7904, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.9040355682373047, "rewards/margins": 0.1613180935382843, "rewards/rejected": -2.0653538703918457, "step": 505 }, { "epoch": 0.2729553437029604, "grad_norm": 8.674045346853037, "learning_rate": 9.09090909090909e-07, "logits/chosen": 0.13635993003845215, "logits/rejected": 0.18865881860256195, "logps/chosen": -1.9006564617156982, "logps/rejected": -2.1958041191101074, "loss": 0.7496, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.9006564617156982, "rewards/margins": 0.29514774680137634, "rewards/rejected": -2.1958041191101074, "step": 510 }, { "epoch": 0.2756313764843619, "grad_norm": 10.436450398135673, "learning_rate": 9.180035650623885e-07, "logits/chosen": 0.10228490829467773, "logits/rejected": 0.19639454782009125, "logps/chosen": -1.8103158473968506, "logps/rejected": -2.0610549449920654, "loss": 0.7639, "rewards/accuracies": 0.5625, "rewards/chosen": -1.8103158473968506, "rewards/margins": 0.2507394254207611, "rewards/rejected": -2.0610549449920654, "step": 515 }, { "epoch": 0.27830740926576353, "grad_norm": 9.84012043961766, "learning_rate": 9.26916221033868e-07, "logits/chosen": 0.02292679250240326, "logits/rejected": 0.15393266081809998, "logps/chosen": -1.9247709512710571, "logps/rejected": -2.1875367164611816, "loss": 0.7599, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.9247709512710571, "rewards/margins": 0.262766033411026, "rewards/rejected": -2.1875367164611816, "step": 520 }, { "epoch": 0.2809834420471651, "grad_norm": 16.632987369664583, "learning_rate": 9.358288770053476e-07, "logits/chosen": 0.17620179057121277, "logits/rejected": 0.24573767185211182, "logps/chosen": -2.011390447616577, "logps/rejected": -2.375781297683716, "loss": 0.7168, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.011390447616577, "rewards/margins": 0.36439090967178345, "rewards/rejected": -2.375781297683716, "step": 525 }, { "epoch": 0.2836594748285666, "grad_norm": 15.894084870640146, "learning_rate": 9.44741532976827e-07, "logits/chosen": 0.151467964053154, "logits/rejected": 0.22984011471271515, "logps/chosen": -2.00235652923584, "logps/rejected": -2.2910542488098145, "loss": 0.771, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.00235652923584, "rewards/margins": 0.2886977791786194, "rewards/rejected": -2.2910542488098145, "step": 530 }, { "epoch": 0.28633550760996823, "grad_norm": 10.898557325604136, "learning_rate": 9.536541889483066e-07, "logits/chosen": -0.0009826838504523039, "logits/rejected": 0.2439630776643753, "logps/chosen": -2.0542657375335693, "logps/rejected": -2.343630075454712, "loss": 0.7146, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -2.0542657375335693, "rewards/margins": 0.2893642485141754, "rewards/rejected": -2.343630075454712, "step": 535 }, { "epoch": 0.2890115403913698, "grad_norm": 10.778282661227223, "learning_rate": 9.62566844919786e-07, "logits/chosen": 0.11748628318309784, "logits/rejected": 0.18395043909549713, "logps/chosen": -2.295041561126709, "logps/rejected": -2.56608247756958, "loss": 0.7436, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -2.295041561126709, "rewards/margins": 0.2710411250591278, "rewards/rejected": -2.56608247756958, "step": 540 }, { "epoch": 0.2916875731727714, "grad_norm": 10.86422613632532, "learning_rate": 9.714795008912655e-07, "logits/chosen": 0.00028176605701446533, "logits/rejected": 0.17567940056324005, "logps/chosen": -2.2681760787963867, "logps/rejected": -2.675942897796631, "loss": 0.6673, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -2.2681760787963867, "rewards/margins": 0.40776705741882324, "rewards/rejected": -2.675942897796631, "step": 545 }, { "epoch": 0.29436360595417294, "grad_norm": 10.633717081676235, "learning_rate": 9.80392156862745e-07, "logits/chosen": 0.15468603372573853, "logits/rejected": 0.21574631333351135, "logps/chosen": -2.4424657821655273, "logps/rejected": -2.7834603786468506, "loss": 0.687, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.4424657821655273, "rewards/margins": 0.3409945070743561, "rewards/rejected": -2.7834603786468506, "step": 550 }, { "epoch": 0.2970396387355745, "grad_norm": 23.50664255567351, "learning_rate": 9.893048128342244e-07, "logits/chosen": 0.06517226994037628, "logits/rejected": 0.17138729989528656, "logps/chosen": -2.6565728187561035, "logps/rejected": -2.837501287460327, "loss": 0.7644, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.6565728187561035, "rewards/margins": 0.18092870712280273, "rewards/rejected": -2.837501287460327, "step": 555 }, { "epoch": 0.2997156715169761, "grad_norm": 15.726490860346612, "learning_rate": 9.98217468805704e-07, "logits/chosen": 0.16730624437332153, "logits/rejected": 0.1779600828886032, "logps/chosen": -2.56695294380188, "logps/rejected": -2.908813238143921, "loss": 0.6901, "rewards/accuracies": 0.625, "rewards/chosen": -2.56695294380188, "rewards/margins": 0.3418603837490082, "rewards/rejected": -2.908813238143921, "step": 560 }, { "epoch": 0.30239170429837764, "grad_norm": 10.910170925157312, "learning_rate": 9.999984476788462e-07, "logits/chosen": 0.15981949865818024, "logits/rejected": 0.21009591221809387, "logps/chosen": -2.738394260406494, "logps/rejected": -3.158615827560425, "loss": 0.638, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.738394260406494, "rewards/margins": 0.42022204399108887, "rewards/rejected": -3.158615827560425, "step": 565 }, { "epoch": 0.30506773707977924, "grad_norm": 26.282447566732287, "learning_rate": 9.999921413906797e-07, "logits/chosen": 0.07072894275188446, "logits/rejected": 0.2572658956050873, "logps/chosen": -2.9132676124572754, "logps/rejected": -3.3420510292053223, "loss": 0.6496, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.9132676124572754, "rewards/margins": 0.4287834167480469, "rewards/rejected": -3.3420510292053223, "step": 570 }, { "epoch": 0.3077437698611808, "grad_norm": 12.201266811584158, "learning_rate": 9.999809841765644e-07, "logits/chosen": 0.0929381400346756, "logits/rejected": 0.1420120745897293, "logps/chosen": -3.0638320446014404, "logps/rejected": -3.459153413772583, "loss": 0.6739, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -3.0638320446014404, "rewards/margins": 0.39532148838043213, "rewards/rejected": -3.459153413772583, "step": 575 }, { "epoch": 0.3104198026425824, "grad_norm": 14.06026849126269, "learning_rate": 9.999649761447477e-07, "logits/chosen": 0.07190564274787903, "logits/rejected": 0.20766079425811768, "logps/chosen": -3.333181381225586, "logps/rejected": -3.77064847946167, "loss": 0.6595, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -3.333181381225586, "rewards/margins": 0.4374672472476959, "rewards/rejected": -3.77064847946167, "step": 580 }, { "epoch": 0.31309583542398395, "grad_norm": 16.877730621364048, "learning_rate": 9.999441174505398e-07, "logits/chosen": 0.039818353950977325, "logits/rejected": 0.11877290904521942, "logps/chosen": -3.8149142265319824, "logps/rejected": -4.079066276550293, "loss": 0.739, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -3.8149142265319824, "rewards/margins": 0.2641516625881195, "rewards/rejected": -4.079066276550293, "step": 585 }, { "epoch": 0.3157718682053855, "grad_norm": 29.214086108262144, "learning_rate": 9.999184082963116e-07, "logits/chosen": 0.07015234231948853, "logits/rejected": 0.17049241065979004, "logps/chosen": -3.6108219623565674, "logps/rejected": -3.888394832611084, "loss": 0.7275, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -3.6108219623565674, "rewards/margins": 0.27757319808006287, "rewards/rejected": -3.888394832611084, "step": 590 }, { "epoch": 0.3184479009867871, "grad_norm": 18.056496637384395, "learning_rate": 9.998878489314937e-07, "logits/chosen": 0.13749486207962036, "logits/rejected": 0.23767301440238953, "logps/chosen": -3.129044771194458, "logps/rejected": -3.470829725265503, "loss": 0.677, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -3.129044771194458, "rewards/margins": 0.34178513288497925, "rewards/rejected": -3.470829725265503, "step": 595 }, { "epoch": 0.32112393376818865, "grad_norm": 10.713614259915145, "learning_rate": 9.99852439652573e-07, "logits/chosen": 0.06214252859354019, "logits/rejected": 0.17821946740150452, "logps/chosen": -3.211444854736328, "logps/rejected": -3.5581867694854736, "loss": 0.655, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -3.211444854736328, "rewards/margins": 0.34674185514450073, "rewards/rejected": -3.5581867694854736, "step": 600 }, { "epoch": 0.32379996654959026, "grad_norm": 16.794131303334627, "learning_rate": 9.998121808030904e-07, "logits/chosen": 0.01984679326415062, "logits/rejected": 0.09208065271377563, "logps/chosen": -3.5737006664276123, "logps/rejected": -3.8592307567596436, "loss": 0.7262, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -3.5737006664276123, "rewards/margins": 0.2855294644832611, "rewards/rejected": -3.8592307567596436, "step": 605 }, { "epoch": 0.3264759993309918, "grad_norm": 30.230327552098668, "learning_rate": 9.997670727736379e-07, "logits/chosen": 0.08241340517997742, "logits/rejected": 0.2318786382675171, "logps/chosen": -3.57232403755188, "logps/rejected": -3.9421088695526123, "loss": 0.7021, "rewards/accuracies": 0.65625, "rewards/chosen": -3.57232403755188, "rewards/margins": 0.3697851300239563, "rewards/rejected": -3.9421088695526123, "step": 610 }, { "epoch": 0.32915203211239336, "grad_norm": 13.045253819106485, "learning_rate": 9.99717116001853e-07, "logits/chosen": 0.07159192860126495, "logits/rejected": 0.1577690690755844, "logps/chosen": -3.714366912841797, "logps/rejected": -4.321863651275635, "loss": 0.5915, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -3.714366912841797, "rewards/margins": 0.6074962615966797, "rewards/rejected": -4.321863651275635, "step": 615 }, { "epoch": 0.33182806489379496, "grad_norm": 14.488006107956625, "learning_rate": 9.996623109724173e-07, "logits/chosen": 0.15464843809604645, "logits/rejected": 0.19680888950824738, "logps/chosen": -3.988550901412964, "logps/rejected": -4.495797157287598, "loss": 0.6329, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -3.988550901412964, "rewards/margins": 0.5072460770606995, "rewards/rejected": -4.495797157287598, "step": 620 }, { "epoch": 0.3345040976751965, "grad_norm": 12.74641321902138, "learning_rate": 9.996026582170488e-07, "logits/chosen": 0.158662810921669, "logits/rejected": 0.2673875093460083, "logps/chosen": -3.9176974296569824, "logps/rejected": -4.413961887359619, "loss": 0.65, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -3.9176974296569824, "rewards/margins": 0.4962642192840576, "rewards/rejected": -4.413961887359619, "step": 625 }, { "epoch": 0.3371801304565981, "grad_norm": 20.852734290736144, "learning_rate": 9.995381583144996e-07, "logits/chosen": 0.11772291362285614, "logits/rejected": 0.2085292786359787, "logps/chosen": -3.919628858566284, "logps/rejected": -4.478269100189209, "loss": 0.623, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -3.919628858566284, "rewards/margins": 0.5586410760879517, "rewards/rejected": -4.478269100189209, "step": 630 }, { "epoch": 0.33985616323799966, "grad_norm": 16.994137624365706, "learning_rate": 9.994688118905471e-07, "logits/chosen": 0.13509145379066467, "logits/rejected": 0.3325079083442688, "logps/chosen": -3.966566562652588, "logps/rejected": -4.450629234313965, "loss": 0.6533, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.966566562652588, "rewards/margins": 0.4840625822544098, "rewards/rejected": -4.450629234313965, "step": 635 }, { "epoch": 0.3425321960194012, "grad_norm": 18.90268494426562, "learning_rate": 9.993946196179912e-07, "logits/chosen": 0.061876535415649414, "logits/rejected": 0.24048753082752228, "logps/chosen": -3.7621490955352783, "logps/rejected": -4.214397430419922, "loss": 0.6456, "rewards/accuracies": 0.625, "rewards/chosen": -3.7621490955352783, "rewards/margins": 0.45224839448928833, "rewards/rejected": -4.214397430419922, "step": 640 }, { "epoch": 0.3452082288008028, "grad_norm": 12.299634614750797, "learning_rate": 9.993155822166455e-07, "logits/chosen": 0.0931006520986557, "logits/rejected": 0.14876405894756317, "logps/chosen": -3.486232280731201, "logps/rejected": -3.9384491443634033, "loss": 0.6524, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -3.486232280731201, "rewards/margins": 0.4522164762020111, "rewards/rejected": -3.9384491443634033, "step": 645 }, { "epoch": 0.34788426158220437, "grad_norm": 16.513090361648555, "learning_rate": 9.992317004533313e-07, "logits/chosen": 0.1209297627210617, "logits/rejected": 0.21610169112682343, "logps/chosen": -3.719529628753662, "logps/rejected": -4.19992733001709, "loss": 0.6303, "rewards/accuracies": 0.65625, "rewards/chosen": -3.719529628753662, "rewards/margins": 0.48039764165878296, "rewards/rejected": -4.19992733001709, "step": 650 }, { "epoch": 0.350560294363606, "grad_norm": 15.93622658731135, "learning_rate": 9.991429751418696e-07, "logits/chosen": 0.14742624759674072, "logits/rejected": 0.15836049616336823, "logps/chosen": -3.6117260456085205, "logps/rejected": -4.100099563598633, "loss": 0.6764, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -3.6117260456085205, "rewards/margins": 0.48837408423423767, "rewards/rejected": -4.100099563598633, "step": 655 }, { "epoch": 0.3532363271450075, "grad_norm": 14.025271961434267, "learning_rate": 9.99049407143074e-07, "logits/chosen": 0.10184492915868759, "logits/rejected": 0.1985752284526825, "logps/chosen": -3.645207643508911, "logps/rejected": -3.9447760581970215, "loss": 0.7035, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -3.645207643508911, "rewards/margins": 0.29956820607185364, "rewards/rejected": -3.9447760581970215, "step": 660 }, { "epoch": 0.35591235992640907, "grad_norm": 10.487002639146162, "learning_rate": 9.989509973647416e-07, "logits/chosen": 0.09171708673238754, "logits/rejected": 0.21969366073608398, "logps/chosen": -3.4477672576904297, "logps/rejected": -3.829850673675537, "loss": 0.6868, "rewards/accuracies": 0.625, "rewards/chosen": -3.4477672576904297, "rewards/margins": 0.38208359479904175, "rewards/rejected": -3.829850673675537, "step": 665 }, { "epoch": 0.3585883927078107, "grad_norm": 11.930388591946208, "learning_rate": 9.988477467616445e-07, "logits/chosen": 0.08496109396219254, "logits/rejected": 0.23462577164173126, "logps/chosen": -3.398437023162842, "logps/rejected": -3.8340351581573486, "loss": 0.6163, "rewards/accuracies": 0.65625, "rewards/chosen": -3.398437023162842, "rewards/margins": 0.4355979561805725, "rewards/rejected": -3.8340351581573486, "step": 670 }, { "epoch": 0.3612644254892122, "grad_norm": 16.141902634318562, "learning_rate": 9.987396563355205e-07, "logits/chosen": 0.10136698186397552, "logits/rejected": 0.16289269924163818, "logps/chosen": -3.3947532176971436, "logps/rejected": -3.880528688430786, "loss": 0.6193, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -3.3947532176971436, "rewards/margins": 0.4857753813266754, "rewards/rejected": -3.880528688430786, "step": 675 }, { "epoch": 0.36394045827061383, "grad_norm": 14.237959261570664, "learning_rate": 9.986267271350631e-07, "logits/chosen": 0.1333998441696167, "logits/rejected": 0.26940596103668213, "logps/chosen": -3.630070447921753, "logps/rejected": -4.028790473937988, "loss": 0.6945, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -3.630070447921753, "rewards/margins": 0.39872002601623535, "rewards/rejected": -4.028790473937988, "step": 680 }, { "epoch": 0.3666164910520154, "grad_norm": 30.482560776470763, "learning_rate": 9.985089602559123e-07, "logits/chosen": 0.12947018444538116, "logits/rejected": 0.2682071924209595, "logps/chosen": -3.8325068950653076, "logps/rejected": -4.313271522521973, "loss": 0.645, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -3.8325068950653076, "rewards/margins": 0.480764776468277, "rewards/rejected": -4.313271522521973, "step": 685 }, { "epoch": 0.369292523833417, "grad_norm": 17.08715117514046, "learning_rate": 9.983863568406428e-07, "logits/chosen": 0.13011527061462402, "logits/rejected": 0.1541045606136322, "logps/chosen": -3.9572131633758545, "logps/rejected": -4.385382652282715, "loss": 0.6541, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -3.9572131633758545, "rewards/margins": 0.4281691908836365, "rewards/rejected": -4.385382652282715, "step": 690 }, { "epoch": 0.37196855661481854, "grad_norm": 12.136615982285758, "learning_rate": 9.982589180787532e-07, "logits/chosen": 0.0913553461432457, "logits/rejected": 0.161931112408638, "logps/chosen": -3.7658467292785645, "logps/rejected": -4.249650478363037, "loss": 0.607, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -3.7658467292785645, "rewards/margins": 0.48380357027053833, "rewards/rejected": -4.249650478363037, "step": 695 }, { "epoch": 0.3746445893962201, "grad_norm": 18.45103086270248, "learning_rate": 9.981266452066553e-07, "logits/chosen": 0.00033470019116066396, "logits/rejected": 0.10600058734416962, "logps/chosen": -4.113137245178223, "logps/rejected": -4.494670867919922, "loss": 0.6257, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -4.113137245178223, "rewards/margins": 0.3815329670906067, "rewards/rejected": -4.494670867919922, "step": 700 }, { "epoch": 0.3773206221776217, "grad_norm": 15.93622876608832, "learning_rate": 9.979895395076608e-07, "logits/chosen": 0.031657874584198, "logits/rejected": 0.19204388558864594, "logps/chosen": -4.230342388153076, "logps/rejected": -4.765054702758789, "loss": 0.601, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -4.230342388153076, "rewards/margins": 0.5347122550010681, "rewards/rejected": -4.765054702758789, "step": 705 }, { "epoch": 0.37999665495902324, "grad_norm": 20.973828320912652, "learning_rate": 9.9784760231197e-07, "logits/chosen": 0.09494075924158096, "logits/rejected": 0.19493290781974792, "logps/chosen": -4.270822525024414, "logps/rejected": -4.790404319763184, "loss": 0.6006, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.270822525024414, "rewards/margins": 0.5195817351341248, "rewards/rejected": -4.790404319763184, "step": 710 }, { "epoch": 0.38267268774042484, "grad_norm": 24.12876222959481, "learning_rate": 9.97700834996658e-07, "logits/chosen": 0.04487309604883194, "logits/rejected": 0.19675400853157043, "logps/chosen": -4.533698558807373, "logps/rejected": -4.955192565917969, "loss": 0.6374, "rewards/accuracies": 0.6875, "rewards/chosen": -4.533698558807373, "rewards/margins": 0.4214935898780823, "rewards/rejected": -4.955192565917969, "step": 715 }, { "epoch": 0.3853487205218264, "grad_norm": 18.687878234802408, "learning_rate": 9.97549238985662e-07, "logits/chosen": 0.15608493983745575, "logits/rejected": 0.311082124710083, "logps/chosen": -4.372023582458496, "logps/rejected": -4.8708953857421875, "loss": 0.6386, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.372023582458496, "rewards/margins": 0.4988718628883362, "rewards/rejected": -4.8708953857421875, "step": 720 }, { "epoch": 0.38802475330322794, "grad_norm": 24.57197772452728, "learning_rate": 9.973928157497674e-07, "logits/chosen": 0.06193660944700241, "logits/rejected": 0.17250049114227295, "logps/chosen": -4.096022605895996, "logps/rejected": -4.559399127960205, "loss": 0.6163, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -4.096022605895996, "rewards/margins": 0.46337661147117615, "rewards/rejected": -4.559399127960205, "step": 725 }, { "epoch": 0.39070078608462955, "grad_norm": 15.503228225050673, "learning_rate": 9.972315668065927e-07, "logits/chosen": 0.04487502947449684, "logits/rejected": 0.14266487956047058, "logps/chosen": -4.1945343017578125, "logps/rejected": -4.574312686920166, "loss": 0.6739, "rewards/accuracies": 0.65625, "rewards/chosen": -4.1945343017578125, "rewards/margins": 0.37977835536003113, "rewards/rejected": -4.574312686920166, "step": 730 }, { "epoch": 0.3933768188660311, "grad_norm": 14.589179958748876, "learning_rate": 9.97065493720576e-07, "logits/chosen": 0.026894057169556618, "logits/rejected": 0.11165788024663925, "logps/chosen": -3.8227832317352295, "logps/rejected": -4.127780437469482, "loss": 0.6823, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -3.8227832317352295, "rewards/margins": 0.30499714612960815, "rewards/rejected": -4.127780437469482, "step": 735 }, { "epoch": 0.3960528516474327, "grad_norm": 15.177285028546677, "learning_rate": 9.968945981029594e-07, "logits/chosen": 0.07921586185693741, "logits/rejected": 0.20370376110076904, "logps/chosen": -3.8551414012908936, "logps/rejected": -4.35502815246582, "loss": 0.599, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.8551414012908936, "rewards/margins": 0.499886691570282, "rewards/rejected": -4.35502815246582, "step": 740 }, { "epoch": 0.39872888442883425, "grad_norm": 12.9724978581589, "learning_rate": 9.967188816117726e-07, "logits/chosen": 0.1661972850561142, "logits/rejected": 0.23001596331596375, "logps/chosen": -4.048464775085449, "logps/rejected": -4.526503086090088, "loss": 0.6513, "rewards/accuracies": 0.65625, "rewards/chosen": -4.048464775085449, "rewards/margins": 0.47803840041160583, "rewards/rejected": -4.526503086090088, "step": 745 }, { "epoch": 0.4014049172102358, "grad_norm": 24.356171964640268, "learning_rate": 9.965383459518179e-07, "logits/chosen": 0.11209660768508911, "logits/rejected": 0.2574175000190735, "logps/chosen": -4.164693355560303, "logps/rejected": -4.628790855407715, "loss": 0.6338, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.164693355560303, "rewards/margins": 0.46409791707992554, "rewards/rejected": -4.628790855407715, "step": 750 }, { "epoch": 0.4040809499916374, "grad_norm": 14.764686256421596, "learning_rate": 9.963529928746533e-07, "logits/chosen": 0.18605700135231018, "logits/rejected": 0.2938101589679718, "logps/chosen": -4.067325592041016, "logps/rejected": -4.463907718658447, "loss": 0.6683, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.067325592041016, "rewards/margins": 0.39658260345458984, "rewards/rejected": -4.463907718658447, "step": 755 }, { "epoch": 0.40675698277303896, "grad_norm": 14.898717382797834, "learning_rate": 9.961628241785746e-07, "logits/chosen": 0.08773010224103928, "logits/rejected": 0.16184954345226288, "logps/chosen": -4.077237606048584, "logps/rejected": -4.507750034332275, "loss": 0.6524, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -4.077237606048584, "rewards/margins": 0.4305122494697571, "rewards/rejected": -4.507750034332275, "step": 760 }, { "epoch": 0.40943301555444056, "grad_norm": 14.060720027577695, "learning_rate": 9.959678417085998e-07, "logits/chosen": 0.12876908481121063, "logits/rejected": 0.20110642910003662, "logps/chosen": -4.061873912811279, "logps/rejected": -4.476706504821777, "loss": 0.6299, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.061873912811279, "rewards/margins": 0.4148319661617279, "rewards/rejected": -4.476706504821777, "step": 765 }, { "epoch": 0.4121090483358421, "grad_norm": 14.146681959357727, "learning_rate": 9.957680473564493e-07, "logits/chosen": 0.22178605198860168, "logits/rejected": 0.31649231910705566, "logps/chosen": -4.050434112548828, "logps/rejected": -4.576292991638184, "loss": 0.6116, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.050434112548828, "rewards/margins": 0.5258593559265137, "rewards/rejected": -4.576292991638184, "step": 770 }, { "epoch": 0.41478508111724366, "grad_norm": 9.437846409172463, "learning_rate": 9.95563443060529e-07, "logits/chosen": 0.06642812490463257, "logits/rejected": 0.18083670735359192, "logps/chosen": -4.088677883148193, "logps/rejected": -4.403240203857422, "loss": 0.7013, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -4.088677883148193, "rewards/margins": 0.31456226110458374, "rewards/rejected": -4.403240203857422, "step": 775 }, { "epoch": 0.41746111389864526, "grad_norm": 14.41525784838785, "learning_rate": 9.95354030805911e-07, "logits/chosen": 0.03479626402258873, "logits/rejected": 0.1540902554988861, "logps/chosen": -3.9732284545898438, "logps/rejected": -4.343350410461426, "loss": 0.6293, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -3.9732284545898438, "rewards/margins": 0.37012243270874023, "rewards/rejected": -4.343350410461426, "step": 780 }, { "epoch": 0.4201371466800468, "grad_norm": 13.446091638557753, "learning_rate": 9.951398126243133e-07, "logits/chosen": 0.15001167356967926, "logits/rejected": 0.23922619223594666, "logps/chosen": -3.9581551551818848, "logps/rejected": -4.4130940437316895, "loss": 0.6439, "rewards/accuracies": 0.65625, "rewards/chosen": -3.9581551551818848, "rewards/margins": 0.45493918657302856, "rewards/rejected": -4.4130940437316895, "step": 785 }, { "epoch": 0.4228131794614484, "grad_norm": 13.562175714834037, "learning_rate": 9.94920790594082e-07, "logits/chosen": 0.07130423933267593, "logits/rejected": 0.1596033126115799, "logps/chosen": -3.7518463134765625, "logps/rejected": -4.277368545532227, "loss": 0.5863, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.7518463134765625, "rewards/margins": 0.5255222320556641, "rewards/rejected": -4.277368545532227, "step": 790 }, { "epoch": 0.42548921224284997, "grad_norm": 12.53818772753645, "learning_rate": 9.946969668401696e-07, "logits/chosen": 0.05194659158587456, "logits/rejected": 0.18004316091537476, "logps/chosen": -4.107532501220703, "logps/rejected": -4.683318138122559, "loss": 0.5988, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.107532501220703, "rewards/margins": 0.5757859945297241, "rewards/rejected": -4.683318138122559, "step": 795 }, { "epoch": 0.4281652450242516, "grad_norm": 12.51138730933159, "learning_rate": 9.944683435341155e-07, "logits/chosen": 0.11643725633621216, "logits/rejected": 0.1717463731765747, "logps/chosen": -4.136044025421143, "logps/rejected": -4.712512969970703, "loss": 0.5698, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -4.136044025421143, "rewards/margins": 0.5764691829681396, "rewards/rejected": -4.712512969970703, "step": 800 }, { "epoch": 0.4281652450242516, "eval_logits/chosen": 0.3680090606212616, "eval_logits/rejected": 0.4423644244670868, "eval_logps/chosen": -4.308505058288574, "eval_logps/rejected": -4.878539562225342, "eval_loss": 0.5833772420883179, "eval_rewards/accuracies": 0.6899110078811646, "eval_rewards/chosen": -4.308505058288574, "eval_rewards/margins": 0.570034384727478, "eval_rewards/rejected": -4.878539562225342, "eval_runtime": 40.6154, "eval_samples_per_second": 33.116, "eval_steps_per_second": 8.297, "step": 800 }, { "epoch": 0.4308412778056531, "grad_norm": 14.136453899433716, "learning_rate": 9.942349228940236e-07, "logits/chosen": 0.0765170082449913, "logits/rejected": 0.1916988343000412, "logps/chosen": -4.293303489685059, "logps/rejected": -4.934248447418213, "loss": 0.5539, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -4.293303489685059, "rewards/margins": 0.640945315361023, "rewards/rejected": -4.934248447418213, "step": 805 }, { "epoch": 0.43351731058705467, "grad_norm": 16.337955794387945, "learning_rate": 9.939967071845424e-07, "logits/chosen": 0.13747012615203857, "logits/rejected": 0.1938168853521347, "logps/chosen": -4.470924377441406, "logps/rejected": -4.852771759033203, "loss": 0.6465, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -4.470924377441406, "rewards/margins": 0.3818475306034088, "rewards/rejected": -4.852771759033203, "step": 810 }, { "epoch": 0.4361933433684563, "grad_norm": 14.000484635571498, "learning_rate": 9.937536987168413e-07, "logits/chosen": 0.15113818645477295, "logits/rejected": 0.25059324502944946, "logps/chosen": -4.234372615814209, "logps/rejected": -4.907202243804932, "loss": 0.5986, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.234372615814209, "rewards/margins": 0.6728296279907227, "rewards/rejected": -4.907202243804932, "step": 815 }, { "epoch": 0.4388693761498578, "grad_norm": 18.092249010731592, "learning_rate": 9.935058998485896e-07, "logits/chosen": 0.16950729489326477, "logits/rejected": 0.2003379613161087, "logps/chosen": -4.349903583526611, "logps/rejected": -4.93494176864624, "loss": 0.5927, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.349903583526611, "rewards/margins": 0.5850378274917603, "rewards/rejected": -4.93494176864624, "step": 820 }, { "epoch": 0.44154540893125943, "grad_norm": 22.209423049263393, "learning_rate": 9.932533129839333e-07, "logits/chosen": 0.12965020537376404, "logits/rejected": 0.22494366765022278, "logps/chosen": -4.419579029083252, "logps/rejected": -5.02409029006958, "loss": 0.5842, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -4.419579029083252, "rewards/margins": 0.6045114994049072, "rewards/rejected": -5.02409029006958, "step": 825 }, { "epoch": 0.444221441712661, "grad_norm": 15.42445024128305, "learning_rate": 9.929959405734711e-07, "logits/chosen": 0.1293991208076477, "logits/rejected": 0.2629973590373993, "logps/chosen": -4.460977077484131, "logps/rejected": -5.001198768615723, "loss": 0.5945, "rewards/accuracies": 0.65625, "rewards/chosen": -4.460977077484131, "rewards/margins": 0.540221095085144, "rewards/rejected": -5.001198768615723, "step": 830 }, { "epoch": 0.44689747449406253, "grad_norm": 17.878526091674168, "learning_rate": 9.927337851142314e-07, "logits/chosen": 0.07012822479009628, "logits/rejected": 0.17119929194450378, "logps/chosen": -4.606560230255127, "logps/rejected": -5.08737850189209, "loss": 0.6268, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.606560230255127, "rewards/margins": 0.4808182716369629, "rewards/rejected": -5.08737850189209, "step": 835 }, { "epoch": 0.44957350727546413, "grad_norm": 17.30451084128454, "learning_rate": 9.924668491496474e-07, "logits/chosen": 0.04732608422636986, "logits/rejected": 0.22716931998729706, "logps/chosen": -4.64849328994751, "logps/rejected": -5.12246561050415, "loss": 0.6517, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.64849328994751, "rewards/margins": 0.4739730954170227, "rewards/rejected": -5.12246561050415, "step": 840 }, { "epoch": 0.4522495400568657, "grad_norm": 14.240016623069137, "learning_rate": 9.92195135269533e-07, "logits/chosen": 0.10674294084310532, "logits/rejected": 0.15598464012145996, "logps/chosen": -4.671469688415527, "logps/rejected": -5.1211419105529785, "loss": 0.6577, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.671469688415527, "rewards/margins": 0.44967150688171387, "rewards/rejected": -5.1211419105529785, "step": 845 }, { "epoch": 0.4549255728382673, "grad_norm": 22.713987490926204, "learning_rate": 9.919186461100574e-07, "logits/chosen": 0.08552362024784088, "logits/rejected": 0.15929841995239258, "logps/chosen": -4.878894329071045, "logps/rejected": -5.3015313148498535, "loss": 0.627, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.878894329071045, "rewards/margins": 0.42263659834861755, "rewards/rejected": -5.3015313148498535, "step": 850 }, { "epoch": 0.45760160561966884, "grad_norm": 48.04405800782595, "learning_rate": 9.9163738435372e-07, "logits/chosen": 0.09746511280536652, "logits/rejected": 0.21800121665000916, "logps/chosen": -4.8507399559021, "logps/rejected": -5.3712663650512695, "loss": 0.6601, "rewards/accuracies": 0.6875, "rewards/chosen": -4.8507399559021, "rewards/margins": 0.5205264091491699, "rewards/rejected": -5.3712663650512695, "step": 855 }, { "epoch": 0.4602776384010704, "grad_norm": 12.135499260191466, "learning_rate": 9.913513527293234e-07, "logits/chosen": 0.030156215652823448, "logits/rejected": 0.18335846066474915, "logps/chosen": -5.055727958679199, "logps/rejected": -5.727547645568848, "loss": 0.596, "rewards/accuracies": 0.6875, "rewards/chosen": -5.055727958679199, "rewards/margins": 0.6718195080757141, "rewards/rejected": -5.727547645568848, "step": 860 }, { "epoch": 0.462953671182472, "grad_norm": 22.787512679460278, "learning_rate": 9.910605540119474e-07, "logits/chosen": 0.10199908912181854, "logits/rejected": 0.20210430026054382, "logps/chosen": -4.92391300201416, "logps/rejected": -5.457892417907715, "loss": 0.6312, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.92391300201416, "rewards/margins": 0.5339800119400024, "rewards/rejected": -5.457892417907715, "step": 865 }, { "epoch": 0.46562970396387354, "grad_norm": 13.081639212334231, "learning_rate": 9.907649910229227e-07, "logits/chosen": 0.04462980106472969, "logits/rejected": 0.24769258499145508, "logps/chosen": -4.988059997558594, "logps/rejected": -5.593521595001221, "loss": 0.5656, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.988059997558594, "rewards/margins": 0.6054618954658508, "rewards/rejected": -5.593521595001221, "step": 870 }, { "epoch": 0.46830573674527515, "grad_norm": 18.42022346620481, "learning_rate": 9.90464666629803e-07, "logits/chosen": 0.13370801508426666, "logits/rejected": 0.19591127336025238, "logps/chosen": -5.069474220275879, "logps/rejected": -5.5045270919799805, "loss": 0.6737, "rewards/accuracies": 0.625, "rewards/chosen": -5.069474220275879, "rewards/margins": 0.4350530505180359, "rewards/rejected": -5.5045270919799805, "step": 875 }, { "epoch": 0.4709817695266767, "grad_norm": 10.418297689322024, "learning_rate": 9.901595837463363e-07, "logits/chosen": 0.10689453780651093, "logits/rejected": 0.2890579104423523, "logps/chosen": -5.058760643005371, "logps/rejected": -5.734701633453369, "loss": 0.5467, "rewards/accuracies": 0.71875, "rewards/chosen": -5.058760643005371, "rewards/margins": 0.6759408712387085, "rewards/rejected": -5.734701633453369, "step": 880 }, { "epoch": 0.47365780230807825, "grad_norm": 17.489656126744457, "learning_rate": 9.898497453324384e-07, "logits/chosen": 0.024927783757448196, "logits/rejected": 0.1131664514541626, "logps/chosen": -5.0872368812561035, "logps/rejected": -5.671210765838623, "loss": 0.5756, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -5.0872368812561035, "rewards/margins": 0.5839737057685852, "rewards/rejected": -5.671210765838623, "step": 885 }, { "epoch": 0.47633383508947985, "grad_norm": 14.91607871439468, "learning_rate": 9.895351543941628e-07, "logits/chosen": -0.014569303020834923, "logits/rejected": 0.09106171131134033, "logps/chosen": -4.97722053527832, "logps/rejected": -5.493784427642822, "loss": 0.6101, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.97722053527832, "rewards/margins": 0.5165637135505676, "rewards/rejected": -5.493784427642822, "step": 890 }, { "epoch": 0.4790098678708814, "grad_norm": 14.243081048973087, "learning_rate": 9.892158139836724e-07, "logits/chosen": 0.12994366884231567, "logits/rejected": 0.19874386489391327, "logps/chosen": -4.980952262878418, "logps/rejected": -5.45808744430542, "loss": 0.6236, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -4.980952262878418, "rewards/margins": 0.477135568857193, "rewards/rejected": -5.45808744430542, "step": 895 }, { "epoch": 0.481685900652283, "grad_norm": 22.19055660611138, "learning_rate": 9.88891727199209e-07, "logits/chosen": 0.06759266555309296, "logits/rejected": 0.14810121059417725, "logps/chosen": -4.805438041687012, "logps/rejected": -5.4054131507873535, "loss": 0.5971, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.805438041687012, "rewards/margins": 0.5999752283096313, "rewards/rejected": -5.4054131507873535, "step": 900 }, { "epoch": 0.48436193343368455, "grad_norm": 13.944126223787151, "learning_rate": 9.885628971850641e-07, "logits/chosen": 0.13829362392425537, "logits/rejected": 0.3043214678764343, "logps/chosen": -4.582094192504883, "logps/rejected": -5.295670509338379, "loss": 0.5653, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.582094192504883, "rewards/margins": 0.7135763168334961, "rewards/rejected": -5.295670509338379, "step": 905 }, { "epoch": 0.48703796621508616, "grad_norm": 12.979111882526238, "learning_rate": 9.882293271315481e-07, "logits/chosen": 0.11515919119119644, "logits/rejected": 0.19706693291664124, "logps/chosen": -4.561379909515381, "logps/rejected": -5.0516157150268555, "loss": 0.6307, "rewards/accuracies": 0.6875, "rewards/chosen": -4.561379909515381, "rewards/margins": 0.4902358055114746, "rewards/rejected": -5.0516157150268555, "step": 910 }, { "epoch": 0.4897139989964877, "grad_norm": 13.232816281927093, "learning_rate": 9.878910202749589e-07, "logits/chosen": 0.11254336684942245, "logits/rejected": 0.2646767497062683, "logps/chosen": -4.474375247955322, "logps/rejected": -5.071609973907471, "loss": 0.576, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -4.474375247955322, "rewards/margins": 0.5972346067428589, "rewards/rejected": -5.071609973907471, "step": 915 }, { "epoch": 0.49239003177788926, "grad_norm": 14.235681789081742, "learning_rate": 9.875479798975512e-07, "logits/chosen": 0.1441764533519745, "logits/rejected": 0.2878187894821167, "logps/chosen": -4.431388854980469, "logps/rejected": -5.1005682945251465, "loss": 0.5785, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.431388854980469, "rewards/margins": 0.6691794395446777, "rewards/rejected": -5.1005682945251465, "step": 920 }, { "epoch": 0.49506606455929086, "grad_norm": 16.727504428268272, "learning_rate": 9.87200209327504e-07, "logits/chosen": 0.1225767731666565, "logits/rejected": 0.27200278639793396, "logps/chosen": -4.8292107582092285, "logps/rejected": -5.343542575836182, "loss": 0.6138, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -4.8292107582092285, "rewards/margins": 0.514331042766571, "rewards/rejected": -5.343542575836182, "step": 925 }, { "epoch": 0.4977420973406924, "grad_norm": 19.36857659305303, "learning_rate": 9.868477119388894e-07, "logits/chosen": 0.0767655000090599, "logits/rejected": 0.1348889172077179, "logps/chosen": -4.617282867431641, "logps/rejected": -5.306047439575195, "loss": 0.5769, "rewards/accuracies": 0.6875, "rewards/chosen": -4.617282867431641, "rewards/margins": 0.688764750957489, "rewards/rejected": -5.306047439575195, "step": 930 }, { "epoch": 0.500418130122094, "grad_norm": 16.253422960914545, "learning_rate": 9.864904911516383e-07, "logits/chosen": 0.10183491557836533, "logits/rejected": 0.15028637647628784, "logps/chosen": -4.9497151374816895, "logps/rejected": -5.486759662628174, "loss": 0.6017, "rewards/accuracies": 0.625, "rewards/chosen": -4.9497151374816895, "rewards/margins": 0.53704434633255, "rewards/rejected": -5.486759662628174, "step": 935 }, { "epoch": 0.5030941629034956, "grad_norm": 15.578534976926822, "learning_rate": 9.861285504315084e-07, "logits/chosen": 0.10452082008123398, "logits/rejected": 0.18630391359329224, "logps/chosen": -4.9952592849731445, "logps/rejected": -5.562484264373779, "loss": 0.5745, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.9952592849731445, "rewards/margins": 0.5672253370285034, "rewards/rejected": -5.562484264373779, "step": 940 }, { "epoch": 0.5057701956848971, "grad_norm": 15.871741001533938, "learning_rate": 9.857618932900502e-07, "logits/chosen": 0.05265814810991287, "logits/rejected": 0.18030931055545807, "logps/chosen": -5.266424179077148, "logps/rejected": -5.829395771026611, "loss": 0.592, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -5.266424179077148, "rewards/margins": 0.5629717707633972, "rewards/rejected": -5.829395771026611, "step": 945 }, { "epoch": 0.5084462284662987, "grad_norm": 20.46095122131056, "learning_rate": 9.853905232845727e-07, "logits/chosen": 0.07458920031785965, "logits/rejected": 0.21533843874931335, "logps/chosen": -5.193603515625, "logps/rejected": -5.774901866912842, "loss": 0.6031, "rewards/accuracies": 0.6875, "rewards/chosen": -5.193603515625, "rewards/margins": 0.5812984704971313, "rewards/rejected": -5.774901866912842, "step": 950 }, { "epoch": 0.5111222612477003, "grad_norm": 16.96180201508439, "learning_rate": 9.850144440181095e-07, "logits/chosen": 0.12050509452819824, "logits/rejected": 0.2789466381072998, "logps/chosen": -5.481587886810303, "logps/rejected": -6.1133713722229, "loss": 0.552, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -5.481587886810303, "rewards/margins": 0.6317835450172424, "rewards/rejected": -6.1133713722229, "step": 955 }, { "epoch": 0.5137982940291018, "grad_norm": 22.057912786191807, "learning_rate": 9.846336591393832e-07, "logits/chosen": 0.13709673285484314, "logits/rejected": 0.2472180426120758, "logps/chosen": -5.536463737487793, "logps/rejected": -6.164149761199951, "loss": 0.6072, "rewards/accuracies": 0.6875, "rewards/chosen": -5.536463737487793, "rewards/margins": 0.6276865005493164, "rewards/rejected": -6.164149761199951, "step": 960 }, { "epoch": 0.5164743268105034, "grad_norm": 18.594180635002445, "learning_rate": 9.842481723427704e-07, "logits/chosen": 0.20625586807727814, "logits/rejected": 0.22861690819263458, "logps/chosen": -5.818711280822754, "logps/rejected": -6.3472819328308105, "loss": 0.6571, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -5.818711280822754, "rewards/margins": 0.5285712480545044, "rewards/rejected": -6.3472819328308105, "step": 965 }, { "epoch": 0.519150359591905, "grad_norm": 14.477138987949935, "learning_rate": 9.838579873682658e-07, "logits/chosen": 0.22909626364707947, "logits/rejected": 0.22380170226097107, "logps/chosen": -5.4652533531188965, "logps/rejected": -5.900015354156494, "loss": 0.6339, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -5.4652533531188965, "rewards/margins": 0.43476223945617676, "rewards/rejected": -5.900015354156494, "step": 970 }, { "epoch": 0.5218263923733065, "grad_norm": 10.774254409061957, "learning_rate": 9.834631080014457e-07, "logits/chosen": 0.21065068244934082, "logits/rejected": 0.40507012605667114, "logps/chosen": -5.059942722320557, "logps/rejected": -5.719550132751465, "loss": 0.5381, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -5.059942722320557, "rewards/margins": 0.6596078872680664, "rewards/rejected": -5.719550132751465, "step": 975 }, { "epoch": 0.5245024251547081, "grad_norm": 21.306311487390147, "learning_rate": 9.830635380734312e-07, "logits/chosen": 0.158293679356575, "logits/rejected": 0.3137975037097931, "logps/chosen": -5.007941246032715, "logps/rejected": -5.524171829223633, "loss": 0.6087, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -5.007941246032715, "rewards/margins": 0.5162306427955627, "rewards/rejected": -5.524171829223633, "step": 980 }, { "epoch": 0.5271784579361097, "grad_norm": 15.992785106678324, "learning_rate": 9.826592814608517e-07, "logits/chosen": 0.2361408919095993, "logits/rejected": 0.413705438375473, "logps/chosen": -4.755031108856201, "logps/rejected": -5.34442663192749, "loss": 0.5804, "rewards/accuracies": 0.6875, "rewards/chosen": -4.755031108856201, "rewards/margins": 0.5893956422805786, "rewards/rejected": -5.34442663192749, "step": 985 }, { "epoch": 0.5298544907175113, "grad_norm": 11.699896128719258, "learning_rate": 9.822503420858067e-07, "logits/chosen": 0.33061686158180237, "logits/rejected": 0.3520127832889557, "logps/chosen": -4.636082649230957, "logps/rejected": -5.253480434417725, "loss": 0.5688, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.636082649230957, "rewards/margins": 0.6173979043960571, "rewards/rejected": -5.253480434417725, "step": 990 }, { "epoch": 0.5325305234989128, "grad_norm": 14.641166127770285, "learning_rate": 9.818367239158277e-07, "logits/chosen": 0.31574827432632446, "logits/rejected": 0.3793763816356659, "logps/chosen": -4.673220157623291, "logps/rejected": -5.268143177032471, "loss": 0.5977, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.673220157623291, "rewards/margins": 0.5949229001998901, "rewards/rejected": -5.268143177032471, "step": 995 }, { "epoch": 0.5352065562803144, "grad_norm": 13.791448147799434, "learning_rate": 9.8141843096384e-07, "logits/chosen": 0.2934790849685669, "logits/rejected": 0.41704511642456055, "logps/chosen": -5.063307762145996, "logps/rejected": -5.667517185211182, "loss": 0.5676, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -5.063307762145996, "rewards/margins": 0.6042091250419617, "rewards/rejected": -5.667517185211182, "step": 1000 }, { "epoch": 0.537882589061716, "grad_norm": 21.89544394323037, "learning_rate": 9.809954672881237e-07, "logits/chosen": 0.31097519397735596, "logits/rejected": 0.4482477605342865, "logps/chosen": -4.9805097579956055, "logps/rejected": -5.498573303222656, "loss": 0.6381, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.9805097579956055, "rewards/margins": 0.5180639028549194, "rewards/rejected": -5.498573303222656, "step": 1005 }, { "epoch": 0.5405586218431175, "grad_norm": 14.288089409217719, "learning_rate": 9.80567836992274e-07, "logits/chosen": 0.30252641439437866, "logits/rejected": 0.46473073959350586, "logps/chosen": -4.748977184295654, "logps/rejected": -5.470629692077637, "loss": 0.5732, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.748977184295654, "rewards/margins": 0.7216525077819824, "rewards/rejected": -5.470629692077637, "step": 1010 }, { "epoch": 0.5432346546245191, "grad_norm": 16.450634754715974, "learning_rate": 9.801355442251625e-07, "logits/chosen": 0.2952435612678528, "logits/rejected": 0.4374850392341614, "logps/chosen": -4.6993303298950195, "logps/rejected": -5.276191711425781, "loss": 0.5967, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.6993303298950195, "rewards/margins": 0.5768610239028931, "rewards/rejected": -5.276191711425781, "step": 1015 }, { "epoch": 0.5459106874059207, "grad_norm": 17.23606485112971, "learning_rate": 9.796985931808949e-07, "logits/chosen": 0.24990224838256836, "logits/rejected": 0.39226099848747253, "logps/chosen": -4.721253395080566, "logps/rejected": -5.312615394592285, "loss": 0.5763, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.721253395080566, "rewards/margins": 0.5913619995117188, "rewards/rejected": -5.312615394592285, "step": 1020 }, { "epoch": 0.5485867201873222, "grad_norm": 18.348395960642605, "learning_rate": 9.792569880987724e-07, "logits/chosen": 0.2161470204591751, "logits/rejected": 0.3239986002445221, "logps/chosen": -4.841641426086426, "logps/rejected": -5.574504375457764, "loss": 0.5545, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -4.841641426086426, "rewards/margins": 0.7328632473945618, "rewards/rejected": -5.574504375457764, "step": 1025 }, { "epoch": 0.5512627529687238, "grad_norm": 23.68426530763003, "learning_rate": 9.788107332632493e-07, "logits/chosen": 0.2725701332092285, "logits/rejected": 0.3485357165336609, "logps/chosen": -4.837586402893066, "logps/rejected": -5.371668338775635, "loss": 0.6449, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.837586402893066, "rewards/margins": 0.5340815782546997, "rewards/rejected": -5.371668338775635, "step": 1030 }, { "epoch": 0.5539387857501255, "grad_norm": 13.60994353040657, "learning_rate": 9.783598330038924e-07, "logits/chosen": 0.2203231304883957, "logits/rejected": 0.33435872197151184, "logps/chosen": -4.887129783630371, "logps/rejected": -5.462523937225342, "loss": 0.5909, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.887129783630371, "rewards/margins": 0.5753947496414185, "rewards/rejected": -5.462523937225342, "step": 1035 }, { "epoch": 0.5566148185315271, "grad_norm": 15.992176948446557, "learning_rate": 9.779042916953376e-07, "logits/chosen": 0.21786603331565857, "logits/rejected": 0.3903740346431732, "logps/chosen": -4.740747451782227, "logps/rejected": -5.588156700134277, "loss": 0.5291, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -4.740747451782227, "rewards/margins": 0.847409725189209, "rewards/rejected": -5.588156700134277, "step": 1040 }, { "epoch": 0.5592908513129285, "grad_norm": 15.305204156503235, "learning_rate": 9.774441137572487e-07, "logits/chosen": 0.16291995346546173, "logits/rejected": 0.3054003119468689, "logps/chosen": -5.2441792488098145, "logps/rejected": -6.015246391296387, "loss": 0.5262, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -5.2441792488098145, "rewards/margins": 0.7710676193237305, "rewards/rejected": -6.015246391296387, "step": 1045 }, { "epoch": 0.5619668840943302, "grad_norm": 17.29850058563769, "learning_rate": 9.76979303654274e-07, "logits/chosen": 0.1594499945640564, "logits/rejected": 0.25126054883003235, "logps/chosen": -5.5972700119018555, "logps/rejected": -6.344670295715332, "loss": 0.5621, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -5.5972700119018555, "rewards/margins": 0.7474009394645691, "rewards/rejected": -6.344670295715332, "step": 1050 }, { "epoch": 0.5646429168757318, "grad_norm": 22.855109247078705, "learning_rate": 9.765098658960035e-07, "logits/chosen": 0.25617220997810364, "logits/rejected": 0.30602145195007324, "logps/chosen": -5.525954246520996, "logps/rejected": -6.195716857910156, "loss": 0.5662, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -5.525954246520996, "rewards/margins": 0.6697630882263184, "rewards/rejected": -6.195716857910156, "step": 1055 }, { "epoch": 0.5673189496571333, "grad_norm": 21.73298083494903, "learning_rate": 9.76035805036924e-07, "logits/chosen": 0.33891740441322327, "logits/rejected": 0.4949556887149811, "logps/chosen": -5.749730587005615, "logps/rejected": -6.356629371643066, "loss": 0.6006, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -5.749730587005615, "rewards/margins": 0.6068984270095825, "rewards/rejected": -6.356629371643066, "step": 1060 }, { "epoch": 0.5699949824385349, "grad_norm": 18.76533682629971, "learning_rate": 9.755571256763764e-07, "logits/chosen": 0.33777058124542236, "logits/rejected": 0.4592292904853821, "logps/chosen": -5.452566623687744, "logps/rejected": -6.146149158477783, "loss": 0.5666, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -5.452566623687744, "rewards/margins": 0.6935827136039734, "rewards/rejected": -6.146149158477783, "step": 1065 }, { "epoch": 0.5726710152199365, "grad_norm": 12.758802239292304, "learning_rate": 9.750738324585097e-07, "logits/chosen": 0.25770294666290283, "logits/rejected": 0.48060542345046997, "logps/chosen": -5.470420837402344, "logps/rejected": -6.1262712478637695, "loss": 0.5654, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -5.470420837402344, "rewards/margins": 0.6558502316474915, "rewards/rejected": -6.1262712478637695, "step": 1070 }, { "epoch": 0.5753470480013381, "grad_norm": 10.984300032033834, "learning_rate": 9.74585930072237e-07, "logits/chosen": 0.3312050402164459, "logits/rejected": 0.46263259649276733, "logps/chosen": -5.0946125984191895, "logps/rejected": -5.854950904846191, "loss": 0.5754, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -5.0946125984191895, "rewards/margins": 0.7603389024734497, "rewards/rejected": -5.854950904846191, "step": 1075 }, { "epoch": 0.5780230807827396, "grad_norm": 13.620929682910244, "learning_rate": 9.740934232511892e-07, "logits/chosen": 0.2597391903400421, "logits/rejected": 0.35427379608154297, "logps/chosen": -5.120131969451904, "logps/rejected": -5.784483909606934, "loss": 0.5782, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -5.120131969451904, "rewards/margins": 0.6643514633178711, "rewards/rejected": -5.784483909606934, "step": 1080 }, { "epoch": 0.5806991135641412, "grad_norm": 14.111522153176713, "learning_rate": 9.735963167736698e-07, "logits/chosen": 0.3054198622703552, "logits/rejected": 0.45205599069595337, "logps/chosen": -4.985550880432129, "logps/rejected": -5.567275047302246, "loss": 0.5954, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.985550880432129, "rewards/margins": 0.5817250609397888, "rewards/rejected": -5.567275047302246, "step": 1085 }, { "epoch": 0.5833751463455428, "grad_norm": 17.19897587538619, "learning_rate": 9.730946154626078e-07, "logits/chosen": 0.3400455713272095, "logits/rejected": 0.41743141412734985, "logps/chosen": -5.11470890045166, "logps/rejected": -5.673919677734375, "loss": 0.6433, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -5.11470890045166, "rewards/margins": 0.5592107772827148, "rewards/rejected": -5.673919677734375, "step": 1090 }, { "epoch": 0.5860511791269443, "grad_norm": 15.984826873078429, "learning_rate": 9.725883241855117e-07, "logits/chosen": 0.19843299686908722, "logits/rejected": 0.3453465402126312, "logps/chosen": -5.0446319580078125, "logps/rejected": -5.695275783538818, "loss": 0.5735, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -5.0446319580078125, "rewards/margins": 0.650644063949585, "rewards/rejected": -5.695275783538818, "step": 1095 }, { "epoch": 0.5887272119083459, "grad_norm": 14.249666845056318, "learning_rate": 9.720774478544218e-07, "logits/chosen": 0.26621291041374207, "logits/rejected": 0.40066710114479065, "logps/chosen": -4.8911848068237305, "logps/rejected": -5.623963356018066, "loss": 0.5621, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.8911848068237305, "rewards/margins": 0.7327794432640076, "rewards/rejected": -5.623963356018066, "step": 1100 }, { "epoch": 0.5914032446897475, "grad_norm": 16.259937644192508, "learning_rate": 9.715619914258624e-07, "logits/chosen": 0.20120957493782043, "logits/rejected": 0.28875917196273804, "logps/chosen": -5.166097164154053, "logps/rejected": -5.727505683898926, "loss": 0.6102, "rewards/accuracies": 0.65625, "rewards/chosen": -5.166097164154053, "rewards/margins": 0.5614089369773865, "rewards/rejected": -5.727505683898926, "step": 1105 }, { "epoch": 0.594079277471149, "grad_norm": 19.83984205517796, "learning_rate": 9.710419599007937e-07, "logits/chosen": 0.23607775568962097, "logits/rejected": 0.391889750957489, "logps/chosen": -5.258492946624756, "logps/rejected": -5.80498743057251, "loss": 0.6017, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -5.258492946624756, "rewards/margins": 0.5464946627616882, "rewards/rejected": -5.80498743057251, "step": 1110 }, { "epoch": 0.5967553102525506, "grad_norm": 22.902339107005808, "learning_rate": 9.705173583245643e-07, "logits/chosen": 0.26504239439964294, "logits/rejected": 0.4376647472381592, "logps/chosen": -5.094302177429199, "logps/rejected": -5.6042375564575195, "loss": 0.6653, "rewards/accuracies": 0.65625, "rewards/chosen": -5.094302177429199, "rewards/margins": 0.509935736656189, "rewards/rejected": -5.6042375564575195, "step": 1115 }, { "epoch": 0.5994313430339522, "grad_norm": 12.341522759881144, "learning_rate": 9.699881917868609e-07, "logits/chosen": 0.18295054137706757, "logits/rejected": 0.29605168104171753, "logps/chosen": -4.891472816467285, "logps/rejected": -5.554740905761719, "loss": 0.5742, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -4.891472816467285, "rewards/margins": 0.6632689237594604, "rewards/rejected": -5.554740905761719, "step": 1120 }, { "epoch": 0.6021073758153538, "grad_norm": 15.52676442724201, "learning_rate": 9.694544654216594e-07, "logits/chosen": 0.17597678303718567, "logits/rejected": 0.3558964729309082, "logps/chosen": -4.852838039398193, "logps/rejected": -5.499570369720459, "loss": 0.5694, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -4.852838039398193, "rewards/margins": 0.6467326879501343, "rewards/rejected": -5.499570369720459, "step": 1125 }, { "epoch": 0.6047834085967553, "grad_norm": 15.606786723117184, "learning_rate": 9.689161844071755e-07, "logits/chosen": 0.3111589848995209, "logits/rejected": 0.39077430963516235, "logps/chosen": -4.479925155639648, "logps/rejected": -5.084510326385498, "loss": 0.5795, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.479925155639648, "rewards/margins": 0.6045850515365601, "rewards/rejected": -5.084510326385498, "step": 1130 }, { "epoch": 0.6074594413781569, "grad_norm": 15.220416403553692, "learning_rate": 9.683733539658138e-07, "logits/chosen": 0.19960640370845795, "logits/rejected": 0.373668372631073, "logps/chosen": -4.842585563659668, "logps/rejected": -5.471686363220215, "loss": 0.5787, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.842585563659668, "rewards/margins": 0.6291012763977051, "rewards/rejected": -5.471686363220215, "step": 1135 }, { "epoch": 0.6101354741595585, "grad_norm": 15.06282165787837, "learning_rate": 9.678259793641178e-07, "logits/chosen": 0.23491637408733368, "logits/rejected": 0.2762298583984375, "logps/chosen": -4.9560418128967285, "logps/rejected": -5.411709785461426, "loss": 0.6156, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.9560418128967285, "rewards/margins": 0.45566827058792114, "rewards/rejected": -5.411709785461426, "step": 1140 }, { "epoch": 0.61281150694096, "grad_norm": 16.43105376415252, "learning_rate": 9.672740659127183e-07, "logits/chosen": 0.11958172172307968, "logits/rejected": 0.24528367817401886, "logps/chosen": -5.2414422035217285, "logps/rejected": -6.028550148010254, "loss": 0.5626, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -5.2414422035217285, "rewards/margins": 0.7871086597442627, "rewards/rejected": -6.028550148010254, "step": 1145 }, { "epoch": 0.6154875397223616, "grad_norm": 17.728938409226515, "learning_rate": 9.667176189662818e-07, "logits/chosen": 0.17226769030094147, "logits/rejected": 0.2921416759490967, "logps/chosen": -5.604882717132568, "logps/rejected": -6.285414695739746, "loss": 0.5648, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -5.604882717132568, "rewards/margins": 0.6805322766304016, "rewards/rejected": -6.285414695739746, "step": 1150 }, { "epoch": 0.6181635725037632, "grad_norm": 10.458517249375687, "learning_rate": 9.661566439234592e-07, "logits/chosen": 0.2221958339214325, "logits/rejected": 0.31999343633651733, "logps/chosen": -5.400237560272217, "logps/rejected": -6.015561580657959, "loss": 0.5781, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -5.400237560272217, "rewards/margins": 0.6153234243392944, "rewards/rejected": -6.015561580657959, "step": 1155 }, { "epoch": 0.6208396052851648, "grad_norm": 12.310416477301308, "learning_rate": 9.655911462268327e-07, "logits/chosen": 0.27029454708099365, "logits/rejected": 0.3843326270580292, "logps/chosen": -5.3095293045043945, "logps/rejected": -6.04095983505249, "loss": 0.554, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -5.3095293045043945, "rewards/margins": 0.7314309477806091, "rewards/rejected": -6.04095983505249, "step": 1160 }, { "epoch": 0.6235156380665663, "grad_norm": 16.49859858281843, "learning_rate": 9.650211313628636e-07, "logits/chosen": 0.16574755311012268, "logits/rejected": 0.25395551323890686, "logps/chosen": -5.306234836578369, "logps/rejected": -5.740855693817139, "loss": 0.6557, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -5.306234836578369, "rewards/margins": 0.4346213936805725, "rewards/rejected": -5.740855693817139, "step": 1165 }, { "epoch": 0.6261916708479679, "grad_norm": 14.67356722732915, "learning_rate": 9.644466048618386e-07, "logits/chosen": 0.22032615542411804, "logits/rejected": 0.35630637407302856, "logps/chosen": -5.3804168701171875, "logps/rejected": -5.948591232299805, "loss": 0.6134, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -5.3804168701171875, "rewards/margins": 0.5681743621826172, "rewards/rejected": -5.948591232299805, "step": 1170 }, { "epoch": 0.6288677036293695, "grad_norm": 12.98102507062549, "learning_rate": 9.63867572297816e-07, "logits/chosen": 0.17474202811717987, "logits/rejected": 0.3417607545852661, "logps/chosen": -5.069199085235596, "logps/rejected": -5.674149990081787, "loss": 0.5814, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -5.069199085235596, "rewards/margins": 0.6049512624740601, "rewards/rejected": -5.674149990081787, "step": 1175 }, { "epoch": 0.631543736410771, "grad_norm": 16.03585800400677, "learning_rate": 9.632840392885727e-07, "logits/chosen": 0.176101952791214, "logits/rejected": 0.3059012293815613, "logps/chosen": -5.233376502990723, "logps/rejected": -5.868188858032227, "loss": 0.5947, "rewards/accuracies": 0.6875, "rewards/chosen": -5.233376502990723, "rewards/margins": 0.6348131895065308, "rewards/rejected": -5.868188858032227, "step": 1180 }, { "epoch": 0.6342197691921726, "grad_norm": 13.352708841319558, "learning_rate": 9.626960114955483e-07, "logits/chosen": 0.25874418020248413, "logits/rejected": 0.3938962519168854, "logps/chosen": -5.079336166381836, "logps/rejected": -5.861237525939941, "loss": 0.5288, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -5.079336166381836, "rewards/margins": 0.7819018363952637, "rewards/rejected": -5.861237525939941, "step": 1185 }, { "epoch": 0.6368958019735742, "grad_norm": 19.306934551892116, "learning_rate": 9.621034946237909e-07, "logits/chosen": 0.1534922868013382, "logits/rejected": 0.27825698256492615, "logps/chosen": -5.256809234619141, "logps/rejected": -5.943486213684082, "loss": 0.559, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -5.256809234619141, "rewards/margins": 0.6866768598556519, "rewards/rejected": -5.943486213684082, "step": 1190 }, { "epoch": 0.6395718347549757, "grad_norm": 17.32811641284677, "learning_rate": 9.615064944219021e-07, "logits/chosen": 0.21924109756946564, "logits/rejected": 0.33183610439300537, "logps/chosen": -5.133397579193115, "logps/rejected": -5.791072368621826, "loss": 0.5613, "rewards/accuracies": 0.71875, "rewards/chosen": -5.133397579193115, "rewards/margins": 0.6576749086380005, "rewards/rejected": -5.791072368621826, "step": 1195 }, { "epoch": 0.6422478675363773, "grad_norm": 19.090045276363135, "learning_rate": 9.609050166819803e-07, "logits/chosen": 0.1627103090286255, "logits/rejected": 0.2272244244813919, "logps/chosen": -5.426455974578857, "logps/rejected": -6.099776744842529, "loss": 0.5645, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -5.426455974578857, "rewards/margins": 0.6733212471008301, "rewards/rejected": -6.099776744842529, "step": 1200 }, { "epoch": 0.6422478675363773, "eval_logits/chosen": 0.48668551445007324, "eval_logits/rejected": 0.5839446187019348, "eval_logps/chosen": -5.367247104644775, "eval_logps/rejected": -6.133624076843262, "eval_loss": 0.5407058596611023, "eval_rewards/accuracies": 0.719584584236145, "eval_rewards/chosen": -5.367247104644775, "eval_rewards/margins": 0.7663767337799072, "eval_rewards/rejected": -6.133624076843262, "eval_runtime": 40.2443, "eval_samples_per_second": 33.421, "eval_steps_per_second": 8.374, "step": 1200 }, { "epoch": 0.6449239003177789, "grad_norm": 19.10344409811311, "learning_rate": 9.602990672395653e-07, "logits/chosen": 0.07960721850395203, "logits/rejected": 0.24876335263252258, "logps/chosen": -5.328033447265625, "logps/rejected": -6.046864986419678, "loss": 0.5468, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -5.328033447265625, "rewards/margins": 0.7188326120376587, "rewards/rejected": -6.046864986419678, "step": 1205 }, { "epoch": 0.6475999330991805, "grad_norm": 14.2675712009498, "learning_rate": 9.59688651973581e-07, "logits/chosen": 0.18518412113189697, "logits/rejected": 0.36908772587776184, "logps/chosen": -5.282735347747803, "logps/rejected": -5.887304306030273, "loss": 0.5866, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -5.282735347747803, "rewards/margins": 0.6045690774917603, "rewards/rejected": -5.887304306030273, "step": 1210 }, { "epoch": 0.650275965880582, "grad_norm": 14.324903597274911, "learning_rate": 9.590737768062792e-07, "logits/chosen": 0.13332310318946838, "logits/rejected": 0.23672330379486084, "logps/chosen": -5.372260093688965, "logps/rejected": -5.985402584075928, "loss": 0.5959, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -5.372260093688965, "rewards/margins": 0.6131423711776733, "rewards/rejected": -5.985402584075928, "step": 1215 }, { "epoch": 0.6529519986619836, "grad_norm": 15.838155550603874, "learning_rate": 9.584544477031816e-07, "logits/chosen": 0.3152899146080017, "logits/rejected": 0.42080777883529663, "logps/chosen": -4.849628448486328, "logps/rejected": -5.483916282653809, "loss": 0.5765, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.849628448486328, "rewards/margins": 0.63428795337677, "rewards/rejected": -5.483916282653809, "step": 1220 }, { "epoch": 0.6556280314433852, "grad_norm": 16.54166650607216, "learning_rate": 9.578306706730215e-07, "logits/chosen": 0.12844684720039368, "logits/rejected": 0.30873560905456543, "logps/chosen": -5.147610664367676, "logps/rejected": -5.638329982757568, "loss": 0.645, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -5.147610664367676, "rewards/margins": 0.49071961641311646, "rewards/rejected": -5.638329982757568, "step": 1225 }, { "epoch": 0.6583040642247867, "grad_norm": 14.59212830596106, "learning_rate": 9.572024517676865e-07, "logits/chosen": 0.22263777256011963, "logits/rejected": 0.3084496855735779, "logps/chosen": -5.088961124420166, "logps/rejected": -5.6632080078125, "loss": 0.6027, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -5.088961124420166, "rewards/margins": 0.5742468237876892, "rewards/rejected": -5.6632080078125, "step": 1230 }, { "epoch": 0.6609800970061883, "grad_norm": 16.935571754229482, "learning_rate": 9.565697970821593e-07, "logits/chosen": 0.22315695881843567, "logits/rejected": 0.37225663661956787, "logps/chosen": -4.953070163726807, "logps/rejected": -5.519973278045654, "loss": 0.588, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.953070163726807, "rewards/margins": 0.5669037103652954, "rewards/rejected": -5.519973278045654, "step": 1235 }, { "epoch": 0.6636561297875899, "grad_norm": 11.176952391085932, "learning_rate": 9.559327127544585e-07, "logits/chosen": 0.13900962471961975, "logits/rejected": 0.26038673520088196, "logps/chosen": -4.944551467895508, "logps/rejected": -5.529752254486084, "loss": 0.5661, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.944551467895508, "rewards/margins": 0.5852009654045105, "rewards/rejected": -5.529752254486084, "step": 1240 }, { "epoch": 0.6663321625689914, "grad_norm": 19.407751525481448, "learning_rate": 9.552912049655789e-07, "logits/chosen": 0.16628827154636383, "logits/rejected": 0.31289148330688477, "logps/chosen": -4.769149303436279, "logps/rejected": -5.428027153015137, "loss": 0.5645, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.769149303436279, "rewards/margins": 0.6588776111602783, "rewards/rejected": -5.428027153015137, "step": 1245 }, { "epoch": 0.669008195350393, "grad_norm": 19.591376183358932, "learning_rate": 9.546452799394315e-07, "logits/chosen": 0.17146886885166168, "logits/rejected": 0.34151017665863037, "logps/chosen": -4.960524559020996, "logps/rejected": -5.436453819274902, "loss": 0.6452, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -4.960524559020996, "rewards/margins": 0.4759295582771301, "rewards/rejected": -5.436453819274902, "step": 1250 }, { "epoch": 0.6716842281317946, "grad_norm": 14.006288029277163, "learning_rate": 9.539949439427846e-07, "logits/chosen": 0.17026430368423462, "logits/rejected": 0.27707618474960327, "logps/chosen": -4.856151580810547, "logps/rejected": -5.565400123596191, "loss": 0.5357, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -4.856151580810547, "rewards/margins": 0.7092481851577759, "rewards/rejected": -5.565400123596191, "step": 1255 }, { "epoch": 0.6743602609131962, "grad_norm": 10.412736935731951, "learning_rate": 9.533402032852002e-07, "logits/chosen": 0.13797181844711304, "logits/rejected": 0.2823859453201294, "logps/chosen": -5.050716400146484, "logps/rejected": -5.834721565246582, "loss": 0.5301, "rewards/accuracies": 0.71875, "rewards/chosen": -5.050716400146484, "rewards/margins": 0.7840049266815186, "rewards/rejected": -5.834721565246582, "step": 1260 }, { "epoch": 0.6770362936945977, "grad_norm": 16.246431136470022, "learning_rate": 9.526810643189754e-07, "logits/chosen": 0.1979864090681076, "logits/rejected": 0.3522976338863373, "logps/chosen": -5.204963684082031, "logps/rejected": -5.83574104309082, "loss": 0.5602, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -5.204963684082031, "rewards/margins": 0.6307777762413025, "rewards/rejected": -5.83574104309082, "step": 1265 }, { "epoch": 0.6797123264759993, "grad_norm": 18.31560128901907, "learning_rate": 9.52017533439079e-07, "logits/chosen": 0.16014745831489563, "logits/rejected": 0.22926263511180878, "logps/chosen": -5.367801666259766, "logps/rejected": -5.904559135437012, "loss": 0.5922, "rewards/accuracies": 0.71875, "rewards/chosen": -5.367801666259766, "rewards/margins": 0.5367578268051147, "rewards/rejected": -5.904559135437012, "step": 1270 }, { "epoch": 0.6823883592574009, "grad_norm": 12.20143460916007, "learning_rate": 9.513496170830909e-07, "logits/chosen": 0.11499267816543579, "logits/rejected": 0.21672482788562775, "logps/chosen": -5.606972694396973, "logps/rejected": -6.129176139831543, "loss": 0.6505, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -5.606972694396973, "rewards/margins": 0.5222033262252808, "rewards/rejected": -6.129176139831543, "step": 1275 }, { "epoch": 0.6850643920388024, "grad_norm": 18.406612004386826, "learning_rate": 9.506773217311382e-07, "logits/chosen": 0.11978916823863983, "logits/rejected": 0.2845809757709503, "logps/chosen": -5.279967308044434, "logps/rejected": -5.923506736755371, "loss": 0.5649, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -5.279967308044434, "rewards/margins": 0.6435400247573853, "rewards/rejected": -5.923506736755371, "step": 1280 }, { "epoch": 0.687740424820204, "grad_norm": 17.39127934655346, "learning_rate": 9.500006539058334e-07, "logits/chosen": 0.153179332613945, "logits/rejected": 0.2890383303165436, "logps/chosen": -5.422624588012695, "logps/rejected": -5.834120750427246, "loss": 0.6277, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -5.422624588012695, "rewards/margins": 0.4114955961704254, "rewards/rejected": -5.834120750427246, "step": 1285 }, { "epoch": 0.6904164576016056, "grad_norm": 17.169929254739305, "learning_rate": 9.493196201722109e-07, "logits/chosen": 0.0674019306898117, "logits/rejected": 0.211639404296875, "logps/chosen": -5.254630088806152, "logps/rejected": -5.7230072021484375, "loss": 0.6189, "rewards/accuracies": 0.65625, "rewards/chosen": -5.254630088806152, "rewards/margins": 0.4683769643306732, "rewards/rejected": -5.7230072021484375, "step": 1290 }, { "epoch": 0.6930924903830072, "grad_norm": 14.594674955349564, "learning_rate": 9.486342271376628e-07, "logits/chosen": 0.13996633887290955, "logits/rejected": 0.13986703753471375, "logps/chosen": -5.283727645874023, "logps/rejected": -6.020252227783203, "loss": 0.5404, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -5.283727645874023, "rewards/margins": 0.7365242838859558, "rewards/rejected": -6.020252227783203, "step": 1295 }, { "epoch": 0.6957685231644087, "grad_norm": 20.541310812151377, "learning_rate": 9.479444814518755e-07, "logits/chosen": 0.09658704698085785, "logits/rejected": 0.3593280613422394, "logps/chosen": -5.447458267211914, "logps/rejected": -6.233078956604004, "loss": 0.537, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -5.447458267211914, "rewards/margins": 0.7856214642524719, "rewards/rejected": -6.233078956604004, "step": 1300 }, { "epoch": 0.6984445559458103, "grad_norm": 13.017763564055166, "learning_rate": 9.472503898067645e-07, "logits/chosen": 0.23063841462135315, "logits/rejected": 0.27201181650161743, "logps/chosen": -5.499756813049316, "logps/rejected": -6.101656913757324, "loss": 0.5975, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -5.499756813049316, "rewards/margins": 0.6019010543823242, "rewards/rejected": -6.101656913757324, "step": 1305 }, { "epoch": 0.701120588727212, "grad_norm": 15.639464553888544, "learning_rate": 9.465519589364099e-07, "logits/chosen": 0.21758981049060822, "logits/rejected": 0.3056447505950928, "logps/chosen": -5.345944404602051, "logps/rejected": -6.078901767730713, "loss": 0.537, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -5.345944404602051, "rewards/margins": 0.7329575419425964, "rewards/rejected": -6.078901767730713, "step": 1310 }, { "epoch": 0.7037966215086134, "grad_norm": 19.538185529744666, "learning_rate": 9.458491956169914e-07, "logits/chosen": 0.1948762685060501, "logits/rejected": 0.3358132243156433, "logps/chosen": -5.632081031799316, "logps/rejected": -6.298153877258301, "loss": 0.5879, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -5.632081031799316, "rewards/margins": 0.6660725474357605, "rewards/rejected": -6.298153877258301, "step": 1315 }, { "epoch": 0.706472654290015, "grad_norm": 15.835218103510611, "learning_rate": 9.451421066667215e-07, "logits/chosen": 0.07561054080724716, "logits/rejected": 0.2313728630542755, "logps/chosen": -5.5757646560668945, "logps/rejected": -6.270555019378662, "loss": 0.5487, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -5.5757646560668945, "rewards/margins": 0.6947903633117676, "rewards/rejected": -6.270555019378662, "step": 1320 }, { "epoch": 0.7091486870714167, "grad_norm": 20.860576258221805, "learning_rate": 9.444306989457805e-07, "logits/chosen": 0.17807747423648834, "logits/rejected": 0.29062455892562866, "logps/chosen": -5.396263122558594, "logps/rejected": -6.020081996917725, "loss": 0.639, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -5.396263122558594, "rewards/margins": 0.6238193511962891, "rewards/rejected": -6.020081996917725, "step": 1325 }, { "epoch": 0.7118247198528181, "grad_norm": 16.562027807053052, "learning_rate": 9.437149793562489e-07, "logits/chosen": 0.1450710892677307, "logits/rejected": 0.2431553155183792, "logps/chosen": -5.375389575958252, "logps/rejected": -6.067778587341309, "loss": 0.5657, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -5.375389575958252, "rewards/margins": 0.6923894286155701, "rewards/rejected": -6.067778587341309, "step": 1330 }, { "epoch": 0.7145007526342197, "grad_norm": 18.662220841740137, "learning_rate": 9.429949548420417e-07, "logits/chosen": 0.1299891471862793, "logits/rejected": 0.22579917311668396, "logps/chosen": -5.325529098510742, "logps/rejected": -5.947188377380371, "loss": 0.5778, "rewards/accuracies": 0.6875, "rewards/chosen": -5.325529098510742, "rewards/margins": 0.6216592788696289, "rewards/rejected": -5.947188377380371, "step": 1335 }, { "epoch": 0.7171767854156214, "grad_norm": 14.578958895550022, "learning_rate": 9.422706323888396e-07, "logits/chosen": 0.16260290145874023, "logits/rejected": 0.23728713393211365, "logps/chosen": -5.306668281555176, "logps/rejected": -5.951327323913574, "loss": 0.5903, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -5.306668281555176, "rewards/margins": 0.6446584463119507, "rewards/rejected": -5.951327323913574, "step": 1340 }, { "epoch": 0.719852818197023, "grad_norm": 13.271275514743074, "learning_rate": 9.415420190240225e-07, "logits/chosen": 0.19149494171142578, "logits/rejected": 0.3800589442253113, "logps/chosen": -5.500617504119873, "logps/rejected": -6.329413414001465, "loss": 0.4756, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -5.500617504119873, "rewards/margins": 0.828795313835144, "rewards/rejected": -6.329413414001465, "step": 1345 }, { "epoch": 0.7225288509784245, "grad_norm": 19.04424555316614, "learning_rate": 9.408091218166002e-07, "logits/chosen": 0.17074351012706757, "logits/rejected": 0.23852291703224182, "logps/chosen": -5.459762096405029, "logps/rejected": -5.980296611785889, "loss": 0.6066, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -5.459762096405029, "rewards/margins": 0.520534098148346, "rewards/rejected": -5.980296611785889, "step": 1350 }, { "epoch": 0.7252048837598261, "grad_norm": 25.153691199023406, "learning_rate": 9.400719478771449e-07, "logits/chosen": 0.10599102824926376, "logits/rejected": 0.3678357005119324, "logps/chosen": -5.6926679611206055, "logps/rejected": -6.28774356842041, "loss": 0.5837, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -5.6926679611206055, "rewards/margins": 0.5950756669044495, "rewards/rejected": -6.28774356842041, "step": 1355 }, { "epoch": 0.7278809165412277, "grad_norm": 15.427509106942214, "learning_rate": 9.393305043577209e-07, "logits/chosen": 0.14103470742702484, "logits/rejected": 0.23386581242084503, "logps/chosen": -5.555902004241943, "logps/rejected": -6.28226375579834, "loss": 0.5449, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -5.555902004241943, "rewards/margins": 0.7263619899749756, "rewards/rejected": -6.28226375579834, "step": 1360 }, { "epoch": 0.7305569493226292, "grad_norm": 10.963386901537348, "learning_rate": 9.38584798451817e-07, "logits/chosen": 0.09337516874074936, "logits/rejected": 0.22478385269641876, "logps/chosen": -5.415030002593994, "logps/rejected": -6.029922008514404, "loss": 0.5773, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -5.415030002593994, "rewards/margins": 0.6148921847343445, "rewards/rejected": -6.029922008514404, "step": 1365 }, { "epoch": 0.7332329821040308, "grad_norm": 28.64792763547615, "learning_rate": 9.37834837394275e-07, "logits/chosen": 0.09760557860136032, "logits/rejected": 0.2388264238834381, "logps/chosen": -5.409117221832275, "logps/rejected": -6.181727886199951, "loss": 0.5841, "rewards/accuracies": 0.71875, "rewards/chosen": -5.409117221832275, "rewards/margins": 0.7726110219955444, "rewards/rejected": -6.181727886199951, "step": 1370 }, { "epoch": 0.7359090148854324, "grad_norm": 17.835762665539168, "learning_rate": 9.370806284612203e-07, "logits/chosen": 0.08004681766033173, "logits/rejected": 0.23193173110485077, "logps/chosen": -5.294720649719238, "logps/rejected": -6.063483238220215, "loss": 0.5282, "rewards/accuracies": 0.75, "rewards/chosen": -5.294720649719238, "rewards/margins": 0.7687628269195557, "rewards/rejected": -6.063483238220215, "step": 1375 }, { "epoch": 0.738585047666834, "grad_norm": 13.476019276495645, "learning_rate": 9.363221789699912e-07, "logits/chosen": 0.024543192237615585, "logits/rejected": 0.15870940685272217, "logps/chosen": -5.2045488357543945, "logps/rejected": -5.700894355773926, "loss": 0.6341, "rewards/accuracies": 0.625, "rewards/chosen": -5.2045488357543945, "rewards/margins": 0.4963453710079193, "rewards/rejected": -5.700894355773926, "step": 1380 }, { "epoch": 0.7412610804482355, "grad_norm": 21.250545778405, "learning_rate": 9.355594962790682e-07, "logits/chosen": 0.03644163906574249, "logits/rejected": 0.16758307814598083, "logps/chosen": -4.997161388397217, "logps/rejected": -5.678461074829102, "loss": 0.5626, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -4.997161388397217, "rewards/margins": 0.6812992691993713, "rewards/rejected": -5.678461074829102, "step": 1385 }, { "epoch": 0.7439371132296371, "grad_norm": 12.746359152755561, "learning_rate": 9.34792587788002e-07, "logits/chosen": 0.05942535400390625, "logits/rejected": 0.19825051724910736, "logps/chosen": -5.076573371887207, "logps/rejected": -5.61143159866333, "loss": 0.6072, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -5.076573371887207, "rewards/margins": 0.5348578095436096, "rewards/rejected": -5.61143159866333, "step": 1390 }, { "epoch": 0.7466131460110387, "grad_norm": 17.338256083032455, "learning_rate": 9.34021460937342e-07, "logits/chosen": 0.08298267424106598, "logits/rejected": 0.17170774936676025, "logps/chosen": -5.190849304199219, "logps/rejected": -5.659786701202393, "loss": 0.6116, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -5.190849304199219, "rewards/margins": 0.46893778443336487, "rewards/rejected": -5.659786701202393, "step": 1395 }, { "epoch": 0.7492891787924402, "grad_norm": 10.600457242209838, "learning_rate": 9.332461232085646e-07, "logits/chosen": -0.10248676687479019, "logits/rejected": 0.01171294879168272, "logps/chosen": -5.310924053192139, "logps/rejected": -5.8432817459106445, "loss": 0.5907, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -5.310924053192139, "rewards/margins": 0.532357394695282, "rewards/rejected": -5.8432817459106445, "step": 1400 }, { "epoch": 0.7519652115738418, "grad_norm": 14.54154755697397, "learning_rate": 9.324665821239998e-07, "logits/chosen": 0.008988935500383377, "logits/rejected": 0.20336274802684784, "logps/chosen": -5.1461005210876465, "logps/rejected": -5.903047561645508, "loss": 0.5805, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -5.1461005210876465, "rewards/margins": 0.7569469213485718, "rewards/rejected": -5.903047561645508, "step": 1405 }, { "epoch": 0.7546412443552434, "grad_norm": 15.733881747234664, "learning_rate": 9.316828452467583e-07, "logits/chosen": -0.025676827877759933, "logits/rejected": 0.1373903900384903, "logps/chosen": -5.440557956695557, "logps/rejected": -6.089810848236084, "loss": 0.5554, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -5.440557956695557, "rewards/margins": 0.6492525935173035, "rewards/rejected": -6.089810848236084, "step": 1410 }, { "epoch": 0.7573172771366449, "grad_norm": 21.604807084128684, "learning_rate": 9.30894920180659e-07, "logits/chosen": 0.07698488980531693, "logits/rejected": 0.20073942840099335, "logps/chosen": -5.242859840393066, "logps/rejected": -5.743607521057129, "loss": 0.5978, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -5.242859840393066, "rewards/margins": 0.5007481575012207, "rewards/rejected": -5.743607521057129, "step": 1415 }, { "epoch": 0.7599933099180465, "grad_norm": 11.987727396254414, "learning_rate": 9.301028145701543e-07, "logits/chosen": 0.07262875139713287, "logits/rejected": 0.218061164021492, "logps/chosen": -5.430690765380859, "logps/rejected": -6.147607803344727, "loss": 0.5601, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -5.430690765380859, "rewards/margins": 0.7169171571731567, "rewards/rejected": -6.147607803344727, "step": 1420 }, { "epoch": 0.7626693426994481, "grad_norm": 11.946438398814731, "learning_rate": 9.293065361002563e-07, "logits/chosen": 0.07677487283945084, "logits/rejected": 0.2107279747724533, "logps/chosen": -5.488462448120117, "logps/rejected": -6.077086448669434, "loss": 0.6037, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -5.488462448120117, "rewards/margins": 0.5886243581771851, "rewards/rejected": -6.077086448669434, "step": 1425 }, { "epoch": 0.7653453754808497, "grad_norm": 17.08206881304088, "learning_rate": 9.285060924964622e-07, "logits/chosen": 0.012628954835236073, "logits/rejected": 0.14654064178466797, "logps/chosen": -5.570025444030762, "logps/rejected": -6.159371376037598, "loss": 0.559, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -5.570025444030762, "rewards/margins": 0.5893458127975464, "rewards/rejected": -6.159371376037598, "step": 1430 }, { "epoch": 0.7680214082622512, "grad_norm": 14.355318833920908, "learning_rate": 9.277014915246792e-07, "logits/chosen": 0.14896006882190704, "logits/rejected": 0.22062699496746063, "logps/chosen": -5.511810302734375, "logps/rejected": -6.194446563720703, "loss": 0.5616, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -5.511810302734375, "rewards/margins": 0.6826368570327759, "rewards/rejected": -6.194446563720703, "step": 1435 }, { "epoch": 0.7706974410436528, "grad_norm": 10.362892128999878, "learning_rate": 9.268927409911498e-07, "logits/chosen": 0.0718618854880333, "logits/rejected": 0.1886373907327652, "logps/chosen": -5.489444255828857, "logps/rejected": -6.1179680824279785, "loss": 0.5712, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -5.489444255828857, "rewards/margins": 0.6285243034362793, "rewards/rejected": -6.1179680824279785, "step": 1440 }, { "epoch": 0.7733734738250544, "grad_norm": 18.57873978413128, "learning_rate": 9.260798487423749e-07, "logits/chosen": 0.049703534692525864, "logits/rejected": 0.22829990088939667, "logps/chosen": -5.499743938446045, "logps/rejected": -6.060249328613281, "loss": 0.5796, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -5.499743938446045, "rewards/margins": 0.5605055093765259, "rewards/rejected": -6.060249328613281, "step": 1445 }, { "epoch": 0.7760495066064559, "grad_norm": 17.971929813819067, "learning_rate": 9.252628226650389e-07, "logits/chosen": 0.13852578401565552, "logits/rejected": 0.2460833042860031, "logps/chosen": -5.625993251800537, "logps/rejected": -6.224217891693115, "loss": 0.6039, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -5.625993251800537, "rewards/margins": 0.5982246994972229, "rewards/rejected": -6.224217891693115, "step": 1450 }, { "epoch": 0.7787255393878575, "grad_norm": 16.131859654082774, "learning_rate": 9.244416706859321e-07, "logits/chosen": 0.1056995838880539, "logits/rejected": 0.25705376267433167, "logps/chosen": -5.572797775268555, "logps/rejected": -6.190807819366455, "loss": 0.6063, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -5.572797775268555, "rewards/margins": 0.6180100440979004, "rewards/rejected": -6.190807819366455, "step": 1455 }, { "epoch": 0.7814015721692591, "grad_norm": 12.338113460364587, "learning_rate": 9.23616400771875e-07, "logits/chosen": 0.14175085723400116, "logits/rejected": 0.3181765079498291, "logps/chosen": -5.475106239318848, "logps/rejected": -6.177533149719238, "loss": 0.5563, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -5.475106239318848, "rewards/margins": 0.7024272084236145, "rewards/rejected": -6.177533149719238, "step": 1460 }, { "epoch": 0.7840776049506607, "grad_norm": 11.463601673948695, "learning_rate": 9.227870209296395e-07, "logits/chosen": 0.10631022602319717, "logits/rejected": 0.24700376391410828, "logps/chosen": -5.545660972595215, "logps/rejected": -6.080859184265137, "loss": 0.6066, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -5.545660972595215, "rewards/margins": 0.5351985692977905, "rewards/rejected": -6.080859184265137, "step": 1465 }, { "epoch": 0.7867536377320622, "grad_norm": 10.899788511532039, "learning_rate": 9.219535392058728e-07, "logits/chosen": 0.09463248401880264, "logits/rejected": 0.12708118557929993, "logps/chosen": -5.471802711486816, "logps/rejected": -6.077259540557861, "loss": 0.5997, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -5.471802711486816, "rewards/margins": 0.6054567694664001, "rewards/rejected": -6.077259540557861, "step": 1470 }, { "epoch": 0.7894296705134638, "grad_norm": 13.376860813634394, "learning_rate": 9.211159636870181e-07, "logits/chosen": 0.08320939540863037, "logits/rejected": 0.250630646944046, "logps/chosen": -5.6101179122924805, "logps/rejected": -6.320847988128662, "loss": 0.579, "rewards/accuracies": 0.71875, "rewards/chosen": -5.6101179122924805, "rewards/margins": 0.7107303738594055, "rewards/rejected": -6.320847988128662, "step": 1475 }, { "epoch": 0.7921057032948654, "grad_norm": 14.63909645189534, "learning_rate": 9.202743024992367e-07, "logits/chosen": 0.2532096207141876, "logits/rejected": 0.3361726403236389, "logps/chosen": -5.401800632476807, "logps/rejected": -6.142020225524902, "loss": 0.5767, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -5.401800632476807, "rewards/margins": 0.7402194738388062, "rewards/rejected": -6.142020225524902, "step": 1480 }, { "epoch": 0.7947817360762669, "grad_norm": 13.849388300398779, "learning_rate": 9.194285638083293e-07, "logits/chosen": 0.22629323601722717, "logits/rejected": 0.36272627115249634, "logps/chosen": -5.643438339233398, "logps/rejected": -6.481289863586426, "loss": 0.5072, "rewards/accuracies": 0.71875, "rewards/chosen": -5.643438339233398, "rewards/margins": 0.8378515243530273, "rewards/rejected": -6.481289863586426, "step": 1485 }, { "epoch": 0.7974577688576685, "grad_norm": 16.354661748971626, "learning_rate": 9.185787558196562e-07, "logits/chosen": 0.17721596360206604, "logits/rejected": 0.27777132391929626, "logps/chosen": -5.5958452224731445, "logps/rejected": -6.3385748863220215, "loss": 0.581, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -5.5958452224731445, "rewards/margins": 0.7427297830581665, "rewards/rejected": -6.3385748863220215, "step": 1490 }, { "epoch": 0.8001338016390701, "grad_norm": 16.3709082501682, "learning_rate": 9.177248867780583e-07, "logits/chosen": 0.2522861957550049, "logits/rejected": 0.3358996510505676, "logps/chosen": -5.87462854385376, "logps/rejected": -6.3511457443237305, "loss": 0.6265, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -5.87462854385376, "rewards/margins": 0.4765172004699707, "rewards/rejected": -6.3511457443237305, "step": 1495 }, { "epoch": 0.8028098344204716, "grad_norm": 14.493966468119874, "learning_rate": 9.168669649677769e-07, "logits/chosen": 0.17982062697410583, "logits/rejected": 0.28020036220550537, "logps/chosen": -5.541110038757324, "logps/rejected": -6.17678165435791, "loss": 0.6135, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -5.541110038757324, "rewards/margins": 0.6356716156005859, "rewards/rejected": -6.17678165435791, "step": 1500 }, { "epoch": 0.8054858672018732, "grad_norm": 12.637848622631362, "learning_rate": 9.16004998712373e-07, "logits/chosen": 0.2259092777967453, "logits/rejected": 0.304887056350708, "logps/chosen": -5.619663715362549, "logps/rejected": -6.217935085296631, "loss": 0.5969, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -5.619663715362549, "rewards/margins": 0.598271369934082, "rewards/rejected": -6.217935085296631, "step": 1505 }, { "epoch": 0.8081618999832748, "grad_norm": 12.036135640964684, "learning_rate": 9.151389963746472e-07, "logits/chosen": 0.14261266589164734, "logits/rejected": 0.4007405638694763, "logps/chosen": -5.555891513824463, "logps/rejected": -6.384294033050537, "loss": 0.5065, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -5.555891513824463, "rewards/margins": 0.8284032940864563, "rewards/rejected": -6.384294033050537, "step": 1510 }, { "epoch": 0.8108379327646764, "grad_norm": 13.2202893369627, "learning_rate": 9.142689663565577e-07, "logits/chosen": 0.20879845321178436, "logits/rejected": 0.26952487230300903, "logps/chosen": -5.465828895568848, "logps/rejected": -6.1048173904418945, "loss": 0.5606, "rewards/accuracies": 0.6875, "rewards/chosen": -5.465828895568848, "rewards/margins": 0.6389875411987305, "rewards/rejected": -6.1048173904418945, "step": 1515 }, { "epoch": 0.8135139655460779, "grad_norm": 13.92339249252641, "learning_rate": 9.133949170991397e-07, "logits/chosen": 0.1543331891298294, "logits/rejected": 0.24706482887268066, "logps/chosen": -5.337285041809082, "logps/rejected": -5.988917827606201, "loss": 0.549, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -5.337285041809082, "rewards/margins": 0.6516331434249878, "rewards/rejected": -5.988917827606201, "step": 1520 }, { "epoch": 0.8161899983274795, "grad_norm": 10.77368991868015, "learning_rate": 9.125168570824231e-07, "logits/chosen": 0.10748803615570068, "logits/rejected": 0.26399269700050354, "logps/chosen": -5.3506646156311035, "logps/rejected": -6.005049705505371, "loss": 0.558, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -5.3506646156311035, "rewards/margins": 0.6543856859207153, "rewards/rejected": -6.005049705505371, "step": 1525 }, { "epoch": 0.8188660311088811, "grad_norm": 36.35365244770725, "learning_rate": 9.116347948253496e-07, "logits/chosen": 0.06940366327762604, "logits/rejected": 0.2131757289171219, "logps/chosen": -5.62113094329834, "logps/rejected": -6.126827716827393, "loss": 0.6025, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -5.62113094329834, "rewards/margins": 0.5056957006454468, "rewards/rejected": -6.126827716827393, "step": 1530 }, { "epoch": 0.8215420638902826, "grad_norm": 14.23813183432654, "learning_rate": 9.107487388856916e-07, "logits/chosen": 0.05752000957727432, "logits/rejected": 0.2247922718524933, "logps/chosen": -5.475346088409424, "logps/rejected": -6.16228723526001, "loss": 0.5201, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -5.475346088409424, "rewards/margins": 0.6869407892227173, "rewards/rejected": -6.16228723526001, "step": 1535 }, { "epoch": 0.8242180966716842, "grad_norm": 15.19188190120627, "learning_rate": 9.098586978599673e-07, "logits/chosen": 0.11307302862405777, "logits/rejected": 0.2648463845252991, "logps/chosen": -5.60988187789917, "logps/rejected": -6.429734230041504, "loss": 0.5599, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -5.60988187789917, "rewards/margins": 0.8198525309562683, "rewards/rejected": -6.429734230041504, "step": 1540 }, { "epoch": 0.8268941294530858, "grad_norm": 12.361030014202406, "learning_rate": 9.089646803833588e-07, "logits/chosen": 0.11198661476373672, "logits/rejected": 0.2504861652851105, "logps/chosen": -5.484559059143066, "logps/rejected": -6.180167198181152, "loss": 0.5525, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -5.484559059143066, "rewards/margins": 0.6956076622009277, "rewards/rejected": -6.180167198181152, "step": 1545 }, { "epoch": 0.8295701622344873, "grad_norm": 13.27554973504844, "learning_rate": 9.080666951296276e-07, "logits/chosen": 0.01736786961555481, "logits/rejected": 0.261422336101532, "logps/chosen": -5.406741142272949, "logps/rejected": -6.4367523193359375, "loss": 0.4381, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -5.406741142272949, "rewards/margins": 1.0300109386444092, "rewards/rejected": -6.4367523193359375, "step": 1550 }, { "epoch": 0.8322461950158889, "grad_norm": 11.758687127269544, "learning_rate": 9.071647508110305e-07, "logits/chosen": 0.07991054654121399, "logits/rejected": 0.31443414092063904, "logps/chosen": -5.383767604827881, "logps/rejected": -6.304274082183838, "loss": 0.5435, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -5.383767604827881, "rewards/margins": 0.9205058813095093, "rewards/rejected": -6.304274082183838, "step": 1555 }, { "epoch": 0.8349222277972905, "grad_norm": 12.744630026281696, "learning_rate": 9.062588561782354e-07, "logits/chosen": 0.16700521111488342, "logits/rejected": 0.2418934404850006, "logps/chosen": -5.623802185058594, "logps/rejected": -6.238170146942139, "loss": 0.5951, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -5.623802185058594, "rewards/margins": 0.614367663860321, "rewards/rejected": -6.238170146942139, "step": 1560 }, { "epoch": 0.8375982605786921, "grad_norm": 12.27542353610822, "learning_rate": 9.053490200202358e-07, "logits/chosen": 0.19946250319480896, "logits/rejected": 0.28316736221313477, "logps/chosen": -5.699484348297119, "logps/rejected": -6.318110466003418, "loss": 0.5919, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -5.699484348297119, "rewards/margins": 0.6186257600784302, "rewards/rejected": -6.318110466003418, "step": 1565 }, { "epoch": 0.8402742933600936, "grad_norm": 17.408392465258704, "learning_rate": 9.044352511642661e-07, "logits/chosen": 0.18078812956809998, "logits/rejected": 0.22953173518180847, "logps/chosen": -5.640721321105957, "logps/rejected": -6.184195518493652, "loss": 0.6414, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -5.640721321105957, "rewards/margins": 0.5434733033180237, "rewards/rejected": -6.184195518493652, "step": 1570 }, { "epoch": 0.8429503261414952, "grad_norm": 13.708838724804911, "learning_rate": 9.03517558475716e-07, "logits/chosen": 0.13065364956855774, "logits/rejected": 0.23177531361579895, "logps/chosen": -5.465883255004883, "logps/rejected": -6.036374092102051, "loss": 0.5694, "rewards/accuracies": 0.6875, "rewards/chosen": -5.465883255004883, "rewards/margins": 0.5704902410507202, "rewards/rejected": -6.036374092102051, "step": 1575 }, { "epoch": 0.8456263589228968, "grad_norm": 9.745533112501015, "learning_rate": 9.025959508580436e-07, "logits/chosen": 0.18008752167224884, "logits/rejected": 0.37324437499046326, "logps/chosen": -5.709378719329834, "logps/rejected": -6.463801383972168, "loss": 0.5405, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -5.709378719329834, "rewards/margins": 0.7544230818748474, "rewards/rejected": -6.463801383972168, "step": 1580 }, { "epoch": 0.8483023917042983, "grad_norm": 12.37412813811776, "learning_rate": 9.016704372526905e-07, "logits/chosen": 0.12400758266448975, "logits/rejected": 0.25632956624031067, "logps/chosen": -5.570986747741699, "logps/rejected": -6.264461517333984, "loss": 0.5454, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -5.570986747741699, "rewards/margins": 0.693474531173706, "rewards/rejected": -6.264461517333984, "step": 1585 }, { "epoch": 0.8509784244856999, "grad_norm": 18.110292828178327, "learning_rate": 9.007410266389934e-07, "logits/chosen": 0.08309497684240341, "logits/rejected": 0.16480982303619385, "logps/chosen": -5.608468055725098, "logps/rejected": -6.174483299255371, "loss": 0.5948, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -5.608468055725098, "rewards/margins": 0.5660146474838257, "rewards/rejected": -6.174483299255371, "step": 1590 }, { "epoch": 0.8536544572671015, "grad_norm": 18.20647297796142, "learning_rate": 8.998077280340981e-07, "logits/chosen": 0.18582332134246826, "logits/rejected": 0.24724039435386658, "logps/chosen": -5.8928351402282715, "logps/rejected": -6.439637660980225, "loss": 0.5913, "rewards/accuracies": 0.6875, "rewards/chosen": -5.8928351402282715, "rewards/margins": 0.5468022227287292, "rewards/rejected": -6.439637660980225, "step": 1595 }, { "epoch": 0.8563304900485031, "grad_norm": 14.597450989317602, "learning_rate": 8.988705504928722e-07, "logits/chosen": 0.1214490756392479, "logits/rejected": 0.28071409463882446, "logps/chosen": -5.931302070617676, "logps/rejected": -6.872589111328125, "loss": 0.4723, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -5.931302070617676, "rewards/margins": 0.9412870407104492, "rewards/rejected": -6.872589111328125, "step": 1600 }, { "epoch": 0.8563304900485031, "eval_logits/chosen": 0.35798612236976624, "eval_logits/rejected": 0.4449055790901184, "eval_logps/chosen": -6.02386474609375, "eval_logps/rejected": -6.782904624938965, "eval_loss": 0.5308428406715393, "eval_rewards/accuracies": 0.7188427448272705, "eval_rewards/chosen": -6.02386474609375, "eval_rewards/margins": 0.7590389847755432, "eval_rewards/rejected": -6.782904624938965, "eval_runtime": 40.5261, "eval_samples_per_second": 33.188, "eval_steps_per_second": 8.316, "step": 1600 }, { "epoch": 0.8590065228299046, "grad_norm": 15.29857307215633, "learning_rate": 8.979295031078157e-07, "logits/chosen": 0.14387187361717224, "logits/rejected": 0.3309789299964905, "logps/chosen": -6.258933067321777, "logps/rejected": -7.016753196716309, "loss": 0.5223, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -6.258933067321777, "rewards/margins": 0.7578204870223999, "rewards/rejected": -7.016753196716309, "step": 1605 }, { "epoch": 0.8616825556113062, "grad_norm": 12.612950668406821, "learning_rate": 8.969845950089751e-07, "logits/chosen": 0.13824522495269775, "logits/rejected": 0.2949258089065552, "logps/chosen": -5.9939680099487305, "logps/rejected": -6.851494789123535, "loss": 0.5158, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -5.9939680099487305, "rewards/margins": 0.8575266003608704, "rewards/rejected": -6.851494789123535, "step": 1610 }, { "epoch": 0.8643585883927078, "grad_norm": 17.78519045821518, "learning_rate": 8.960358353638526e-07, "logits/chosen": 0.21123471856117249, "logits/rejected": 0.31997302174568176, "logps/chosen": -5.816453456878662, "logps/rejected": -6.492132663726807, "loss": 0.6052, "rewards/accuracies": 0.65625, "rewards/chosen": -5.816453456878662, "rewards/margins": 0.6756792068481445, "rewards/rejected": -6.492132663726807, "step": 1615 }, { "epoch": 0.8670346211741093, "grad_norm": 15.250062194309272, "learning_rate": 8.950832333773184e-07, "logits/chosen": 0.19270232319831848, "logits/rejected": 0.33080360293388367, "logps/chosen": -5.814416885375977, "logps/rejected": -6.456835746765137, "loss": 0.6221, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -5.814416885375977, "rewards/margins": 0.6424186825752258, "rewards/rejected": -6.456835746765137, "step": 1620 }, { "epoch": 0.869710653955511, "grad_norm": 15.674114222136074, "learning_rate": 8.941267982915213e-07, "logits/chosen": 0.220210999250412, "logits/rejected": 0.25518444180488586, "logps/chosen": -5.676119804382324, "logps/rejected": -6.0189433097839355, "loss": 0.7025, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -5.676119804382324, "rewards/margins": 0.3428238034248352, "rewards/rejected": -6.0189433097839355, "step": 1625 }, { "epoch": 0.8723866867369126, "grad_norm": 13.798818028867709, "learning_rate": 8.931665393857983e-07, "logits/chosen": 0.20737290382385254, "logits/rejected": 0.32157620787620544, "logps/chosen": -5.354896545410156, "logps/rejected": -6.0102128982543945, "loss": 0.5707, "rewards/accuracies": 0.6875, "rewards/chosen": -5.354896545410156, "rewards/margins": 0.6553162336349487, "rewards/rejected": -6.0102128982543945, "step": 1630 }, { "epoch": 0.875062719518314, "grad_norm": 12.676979654831209, "learning_rate": 8.922024659765861e-07, "logits/chosen": 0.11383476108312607, "logits/rejected": 0.24133272469043732, "logps/chosen": -5.128402233123779, "logps/rejected": -5.845340728759766, "loss": 0.5275, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -5.128402233123779, "rewards/margins": 0.716938853263855, "rewards/rejected": -5.845340728759766, "step": 1635 }, { "epoch": 0.8777387522997157, "grad_norm": 18.980815836005608, "learning_rate": 8.912345874173288e-07, "logits/chosen": 0.11772704124450684, "logits/rejected": 0.25058311223983765, "logps/chosen": -5.384622097015381, "logps/rejected": -6.073968410491943, "loss": 0.5711, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -5.384622097015381, "rewards/margins": 0.6893460750579834, "rewards/rejected": -6.073968410491943, "step": 1640 }, { "epoch": 0.8804147850811173, "grad_norm": 14.843575529330332, "learning_rate": 8.902629130983885e-07, "logits/chosen": 0.1320587396621704, "logits/rejected": 0.17445063591003418, "logps/chosen": -5.536566734313965, "logps/rejected": -6.069863796234131, "loss": 0.5892, "rewards/accuracies": 0.6875, "rewards/chosen": -5.536566734313965, "rewards/margins": 0.5332968831062317, "rewards/rejected": -6.069863796234131, "step": 1645 }, { "epoch": 0.8830908178625189, "grad_norm": 14.81577656660621, "learning_rate": 8.892874524469537e-07, "logits/chosen": 0.22955891489982605, "logits/rejected": 0.2678947150707245, "logps/chosen": -5.384530544281006, "logps/rejected": -6.112283229827881, "loss": 0.5107, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -5.384530544281006, "rewards/margins": 0.7277525067329407, "rewards/rejected": -6.112283229827881, "step": 1650 }, { "epoch": 0.8857668506439204, "grad_norm": 15.3416608653916, "learning_rate": 8.883082149269478e-07, "logits/chosen": 0.11190225183963776, "logits/rejected": 0.21479888260364532, "logps/chosen": -5.580390453338623, "logps/rejected": -6.2487568855285645, "loss": 0.5401, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -5.580390453338623, "rewards/margins": 0.6683661937713623, "rewards/rejected": -6.2487568855285645, "step": 1655 }, { "epoch": 0.888442883425322, "grad_norm": 15.444782575106254, "learning_rate": 8.873252100389377e-07, "logits/chosen": 0.17756783962249756, "logits/rejected": 0.24597899615764618, "logps/chosen": -5.533586025238037, "logps/rejected": -6.265809535980225, "loss": 0.5283, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -5.533586025238037, "rewards/margins": 0.7322239875793457, "rewards/rejected": -6.265809535980225, "step": 1660 }, { "epoch": 0.8911189162067236, "grad_norm": 20.164548927987145, "learning_rate": 8.863384473200411e-07, "logits/chosen": 0.17108054459095, "logits/rejected": 0.21203382313251495, "logps/chosen": -5.927115440368652, "logps/rejected": -6.435086727142334, "loss": 0.6023, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -5.927115440368652, "rewards/margins": 0.5079714059829712, "rewards/rejected": -6.435086727142334, "step": 1665 }, { "epoch": 0.8937949489881251, "grad_norm": 13.432471300515754, "learning_rate": 8.853479363438342e-07, "logits/chosen": 0.24180154502391815, "logits/rejected": 0.374270498752594, "logps/chosen": -5.901001930236816, "logps/rejected": -6.413620948791504, "loss": 0.6198, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -5.901001930236816, "rewards/margins": 0.512619137763977, "rewards/rejected": -6.413620948791504, "step": 1670 }, { "epoch": 0.8964709817695267, "grad_norm": 15.291190174881345, "learning_rate": 8.843536867202588e-07, "logits/chosen": 0.19205886125564575, "logits/rejected": 0.35523292422294617, "logps/chosen": -5.895748138427734, "logps/rejected": -6.573211669921875, "loss": 0.5595, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -5.895748138427734, "rewards/margins": 0.6774641275405884, "rewards/rejected": -6.573211669921875, "step": 1675 }, { "epoch": 0.8991470145509283, "grad_norm": 18.44966327739092, "learning_rate": 8.833557080955292e-07, "logits/chosen": 0.1551905870437622, "logits/rejected": 0.2461961805820465, "logps/chosen": -5.779360771179199, "logps/rejected": -6.300524711608887, "loss": 0.6082, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -5.779360771179199, "rewards/margins": 0.5211645364761353, "rewards/rejected": -6.300524711608887, "step": 1680 }, { "epoch": 0.9018230473323299, "grad_norm": 17.143441173720916, "learning_rate": 8.823540101520381e-07, "logits/chosen": 0.17627717554569244, "logits/rejected": 0.3512203097343445, "logps/chosen": -5.654678821563721, "logps/rejected": -6.333632946014404, "loss": 0.5785, "rewards/accuracies": 0.71875, "rewards/chosen": -5.654678821563721, "rewards/margins": 0.6789540648460388, "rewards/rejected": -6.333632946014404, "step": 1685 }, { "epoch": 0.9044990801137314, "grad_norm": 17.565361615785797, "learning_rate": 8.813486026082637e-07, "logits/chosen": 0.15582673251628876, "logits/rejected": 0.3443092107772827, "logps/chosen": -5.573996067047119, "logps/rejected": -6.326644420623779, "loss": 0.54, "rewards/accuracies": 0.71875, "rewards/chosen": -5.573996067047119, "rewards/margins": 0.7526488304138184, "rewards/rejected": -6.326644420623779, "step": 1690 }, { "epoch": 0.907175112895133, "grad_norm": 25.222681982337267, "learning_rate": 8.803394952186742e-07, "logits/chosen": 0.049422696232795715, "logits/rejected": 0.18728506565093994, "logps/chosen": -5.521978378295898, "logps/rejected": -6.14212703704834, "loss": 0.5479, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -5.521978378295898, "rewards/margins": 0.6201489567756653, "rewards/rejected": -6.14212703704834, "step": 1695 }, { "epoch": 0.9098511456765346, "grad_norm": 14.205692268084219, "learning_rate": 8.793266977736342e-07, "logits/chosen": 0.16259056329727173, "logits/rejected": 0.11589845269918442, "logps/chosen": -5.803591728210449, "logps/rejected": -6.200663089752197, "loss": 0.6383, "rewards/accuracies": 0.65625, "rewards/chosen": -5.803591728210449, "rewards/margins": 0.397070974111557, "rewards/rejected": -6.200663089752197, "step": 1700 }, { "epoch": 0.9125271784579361, "grad_norm": 17.341985528843246, "learning_rate": 8.783102200993085e-07, "logits/chosen": 0.18979358673095703, "logits/rejected": 0.30549517273902893, "logps/chosen": -5.923371315002441, "logps/rejected": -6.649941921234131, "loss": 0.5321, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -5.923371315002441, "rewards/margins": 0.726570725440979, "rewards/rejected": -6.649941921234131, "step": 1705 }, { "epoch": 0.9152032112393377, "grad_norm": 14.617021764299702, "learning_rate": 8.772900720575683e-07, "logits/chosen": 0.1571238487958908, "logits/rejected": 0.2531681954860687, "logps/chosen": -6.186182975769043, "logps/rejected": -6.775763511657715, "loss": 0.5761, "rewards/accuracies": 0.71875, "rewards/chosen": -6.186182975769043, "rewards/margins": 0.5895804762840271, "rewards/rejected": -6.775763511657715, "step": 1710 }, { "epoch": 0.9178792440207393, "grad_norm": 22.096470126004565, "learning_rate": 8.762662635458944e-07, "logits/chosen": 0.14979322254657745, "logits/rejected": 0.28241169452667236, "logps/chosen": -6.364068984985352, "logps/rejected": -7.038887023925781, "loss": 0.5996, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -6.364068984985352, "rewards/margins": 0.6748181581497192, "rewards/rejected": -7.038887023925781, "step": 1715 }, { "epoch": 0.9205552768021408, "grad_norm": 16.979769109378267, "learning_rate": 8.752388044972811e-07, "logits/chosen": 0.19243720173835754, "logits/rejected": 0.257870614528656, "logps/chosen": -5.914661407470703, "logps/rejected": -6.654353141784668, "loss": 0.5544, "rewards/accuracies": 0.71875, "rewards/chosen": -5.914661407470703, "rewards/margins": 0.7396913766860962, "rewards/rejected": -6.654353141784668, "step": 1720 }, { "epoch": 0.9232313095835424, "grad_norm": 15.275234445038745, "learning_rate": 8.74207704880141e-07, "logits/chosen": 0.2115100920200348, "logits/rejected": 0.3003782331943512, "logps/chosen": -5.870343208312988, "logps/rejected": -6.7727766036987305, "loss": 0.5017, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -5.870343208312988, "rewards/margins": 0.9024330973625183, "rewards/rejected": -6.7727766036987305, "step": 1725 }, { "epoch": 0.925907342364944, "grad_norm": 12.640657242743238, "learning_rate": 8.731729746982068e-07, "logits/chosen": 0.2496422529220581, "logits/rejected": 0.31352147459983826, "logps/chosen": -5.5420427322387695, "logps/rejected": -6.236983776092529, "loss": 0.5315, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -5.5420427322387695, "rewards/margins": 0.6949411630630493, "rewards/rejected": -6.236983776092529, "step": 1730 }, { "epoch": 0.9285833751463456, "grad_norm": 18.055919283499584, "learning_rate": 8.721346239904355e-07, "logits/chosen": 0.13457481563091278, "logits/rejected": 0.31524306535720825, "logps/chosen": -5.6595916748046875, "logps/rejected": -6.476833343505859, "loss": 0.5531, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -5.6595916748046875, "rewards/margins": 0.8172420263290405, "rewards/rejected": -6.476833343505859, "step": 1735 }, { "epoch": 0.9312594079277471, "grad_norm": 17.122514765381077, "learning_rate": 8.710926628309101e-07, "logits/chosen": 0.15976467728614807, "logits/rejected": 0.3133135437965393, "logps/chosen": -5.854723930358887, "logps/rejected": -6.473104000091553, "loss": 0.5602, "rewards/accuracies": 0.6875, "rewards/chosen": -5.854723930358887, "rewards/margins": 0.6183798313140869, "rewards/rejected": -6.473104000091553, "step": 1740 }, { "epoch": 0.9339354407091487, "grad_norm": 10.822084635317134, "learning_rate": 8.700471013287424e-07, "logits/chosen": 0.22708716988563538, "logits/rejected": 0.2691616415977478, "logps/chosen": -5.637332916259766, "logps/rejected": -6.276172637939453, "loss": 0.542, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -5.637332916259766, "rewards/margins": 0.6388393044471741, "rewards/rejected": -6.276172637939453, "step": 1745 }, { "epoch": 0.9366114734905503, "grad_norm": 19.85807734295468, "learning_rate": 8.689979496279746e-07, "logits/chosen": 0.20187854766845703, "logits/rejected": 0.2646445631980896, "logps/chosen": -5.8525800704956055, "logps/rejected": -6.3532609939575195, "loss": 0.6581, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -5.8525800704956055, "rewards/margins": 0.5006800889968872, "rewards/rejected": -6.3532609939575195, "step": 1750 }, { "epoch": 0.9392875062719518, "grad_norm": 12.802093742336572, "learning_rate": 8.679452179074811e-07, "logits/chosen": 0.20651888847351074, "logits/rejected": 0.3104880154132843, "logps/chosen": -5.751865386962891, "logps/rejected": -6.506569862365723, "loss": 0.5, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -5.751865386962891, "rewards/margins": 0.7547035217285156, "rewards/rejected": -6.506569862365723, "step": 1755 }, { "epoch": 0.9419635390533534, "grad_norm": 14.648920659616783, "learning_rate": 8.668889163808698e-07, "logits/chosen": 0.2184140384197235, "logits/rejected": 0.34038546681404114, "logps/chosen": -5.645837306976318, "logps/rejected": -6.247082710266113, "loss": 0.5666, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -5.645837306976318, "rewards/margins": 0.6012449860572815, "rewards/rejected": -6.247082710266113, "step": 1760 }, { "epoch": 0.944639571834755, "grad_norm": 14.26250299475767, "learning_rate": 8.658290552963827e-07, "logits/chosen": 0.23741602897644043, "logits/rejected": 0.28060996532440186, "logps/chosen": -5.616380214691162, "logps/rejected": -6.297336578369141, "loss": 0.5857, "rewards/accuracies": 0.6875, "rewards/chosen": -5.616380214691162, "rewards/margins": 0.6809557676315308, "rewards/rejected": -6.297336578369141, "step": 1765 }, { "epoch": 0.9473156046161565, "grad_norm": 10.515156516195294, "learning_rate": 8.647656449367966e-07, "logits/chosen": 0.2404049187898636, "logits/rejected": 0.36479660868644714, "logps/chosen": -5.643769264221191, "logps/rejected": -6.319647312164307, "loss": 0.5573, "rewards/accuracies": 0.71875, "rewards/chosen": -5.643769264221191, "rewards/margins": 0.6758776307106018, "rewards/rejected": -6.319647312164307, "step": 1770 }, { "epoch": 0.9499916373975581, "grad_norm": 12.436576795804049, "learning_rate": 8.636986956193235e-07, "logits/chosen": 0.16680355370044708, "logits/rejected": 0.2690460979938507, "logps/chosen": -5.517597675323486, "logps/rejected": -6.17305850982666, "loss": 0.5697, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -5.517597675323486, "rewards/margins": 0.6554608345031738, "rewards/rejected": -6.17305850982666, "step": 1775 }, { "epoch": 0.9526676701789597, "grad_norm": 12.891542063021113, "learning_rate": 8.626282176955104e-07, "logits/chosen": 0.2042749673128128, "logits/rejected": 0.3137625455856323, "logps/chosen": -5.347027778625488, "logps/rejected": -6.093008995056152, "loss": 0.5269, "rewards/accuracies": 0.75, "rewards/chosen": -5.347027778625488, "rewards/margins": 0.7459806203842163, "rewards/rejected": -6.093008995056152, "step": 1780 }, { "epoch": 0.9553437029603613, "grad_norm": 18.61876675112565, "learning_rate": 8.615542215511389e-07, "logits/chosen": 0.23789939284324646, "logits/rejected": 0.28443318605422974, "logps/chosen": -5.404725551605225, "logps/rejected": -5.885302543640137, "loss": 0.6045, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -5.404725551605225, "rewards/margins": 0.48057737946510315, "rewards/rejected": -5.885302543640137, "step": 1785 }, { "epoch": 0.9580197357417628, "grad_norm": 18.72306354792733, "learning_rate": 8.604767176061241e-07, "logits/chosen": 0.25966137647628784, "logits/rejected": 0.34388163685798645, "logps/chosen": -5.613162040710449, "logps/rejected": -6.148386478424072, "loss": 0.5912, "rewards/accuracies": 0.65625, "rewards/chosen": -5.613162040710449, "rewards/margins": 0.5352237820625305, "rewards/rejected": -6.148386478424072, "step": 1790 }, { "epoch": 0.9606957685231644, "grad_norm": 10.258642381339513, "learning_rate": 8.593957163144141e-07, "logits/chosen": 0.1562170833349228, "logits/rejected": 0.2808864712715149, "logps/chosen": -5.313093662261963, "logps/rejected": -6.068539142608643, "loss": 0.5264, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -5.313093662261963, "rewards/margins": 0.7554447054862976, "rewards/rejected": -6.068539142608643, "step": 1795 }, { "epoch": 0.963371801304566, "grad_norm": 11.208788577101435, "learning_rate": 8.58311228163888e-07, "logits/chosen": 0.17716021835803986, "logits/rejected": 0.25549858808517456, "logps/chosen": -5.498555660247803, "logps/rejected": -6.058858394622803, "loss": 0.5648, "rewards/accuracies": 0.71875, "rewards/chosen": -5.498555660247803, "rewards/margins": 0.5603026747703552, "rewards/rejected": -6.058858394622803, "step": 1800 }, { "epoch": 0.9660478340859675, "grad_norm": 16.640933022243978, "learning_rate": 8.57223263676255e-07, "logits/chosen": 0.12011446803808212, "logits/rejected": 0.22969922423362732, "logps/chosen": -5.297333717346191, "logps/rejected": -6.263106346130371, "loss": 0.4494, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -5.297333717346191, "rewards/margins": 0.9657731056213379, "rewards/rejected": -6.263106346130371, "step": 1805 }, { "epoch": 0.9687238668673691, "grad_norm": 11.787034423414816, "learning_rate": 8.561318334069511e-07, "logits/chosen": 0.2358277142047882, "logits/rejected": 0.3527395725250244, "logps/chosen": -5.541464805603027, "logps/rejected": -6.241854190826416, "loss": 0.528, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -5.541464805603027, "rewards/margins": 0.7003897428512573, "rewards/rejected": -6.241854190826416, "step": 1810 }, { "epoch": 0.9713998996487707, "grad_norm": 15.020512238539657, "learning_rate": 8.550369479450375e-07, "logits/chosen": 0.21781201660633087, "logits/rejected": 0.3269873261451721, "logps/chosen": -5.8010053634643555, "logps/rejected": -6.569317817687988, "loss": 0.5312, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -5.8010053634643555, "rewards/margins": 0.7683130502700806, "rewards/rejected": -6.569317817687988, "step": 1815 }, { "epoch": 0.9740759324301723, "grad_norm": 17.350309096267768, "learning_rate": 8.539386179130977e-07, "logits/chosen": 0.2558678984642029, "logits/rejected": 0.28598424792289734, "logps/chosen": -5.773947715759277, "logps/rejected": -6.498719692230225, "loss": 0.5309, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -5.773947715759277, "rewards/margins": 0.7247717380523682, "rewards/rejected": -6.498719692230225, "step": 1820 }, { "epoch": 0.9767519652115738, "grad_norm": 13.500365423739764, "learning_rate": 8.528368539671347e-07, "logits/chosen": 0.18265984952449799, "logits/rejected": 0.3013036847114563, "logps/chosen": -5.60908317565918, "logps/rejected": -6.699934482574463, "loss": 0.4855, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -5.60908317565918, "rewards/margins": 1.0908517837524414, "rewards/rejected": -6.699934482574463, "step": 1825 }, { "epoch": 0.9794279979929754, "grad_norm": 16.005346897523854, "learning_rate": 8.51731666796467e-07, "logits/chosen": 0.24924102425575256, "logits/rejected": 0.32233044505119324, "logps/chosen": -6.151415824890137, "logps/rejected": -6.871801853179932, "loss": 0.5725, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -6.151415824890137, "rewards/margins": 0.7203862071037292, "rewards/rejected": -6.871801853179932, "step": 1830 }, { "epoch": 0.982104030774377, "grad_norm": 17.11289163524801, "learning_rate": 8.506230671236254e-07, "logits/chosen": 0.16594907641410828, "logits/rejected": 0.24023489654064178, "logps/chosen": -6.065356731414795, "logps/rejected": -6.639665126800537, "loss": 0.5863, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -6.065356731414795, "rewards/margins": 0.5743091702461243, "rewards/rejected": -6.639665126800537, "step": 1835 }, { "epoch": 0.9847800635557785, "grad_norm": 15.878527895392969, "learning_rate": 8.495110657042488e-07, "logits/chosen": 0.22305187582969666, "logits/rejected": 0.36137571930885315, "logps/chosen": -6.219046592712402, "logps/rejected": -6.992509365081787, "loss": 0.5364, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -6.219046592712402, "rewards/margins": 0.7734623551368713, "rewards/rejected": -6.992509365081787, "step": 1840 }, { "epoch": 0.9874560963371801, "grad_norm": 16.007532699195036, "learning_rate": 8.483956733269799e-07, "logits/chosen": 0.19012035429477692, "logits/rejected": 0.2789403200149536, "logps/chosen": -6.241211891174316, "logps/rejected": -7.051243782043457, "loss": 0.5435, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -6.241211891174316, "rewards/margins": 0.8100322484970093, "rewards/rejected": -7.051243782043457, "step": 1845 }, { "epoch": 0.9901321291185817, "grad_norm": 19.618255130757156, "learning_rate": 8.472769008133602e-07, "logits/chosen": 0.1263648420572281, "logits/rejected": 0.24237623810768127, "logps/chosen": -6.539755344390869, "logps/rejected": -7.306853294372559, "loss": 0.5427, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -6.539755344390869, "rewards/margins": 0.7670982480049133, "rewards/rejected": -7.306853294372559, "step": 1850 }, { "epoch": 0.9928081618999832, "grad_norm": 20.765255536617254, "learning_rate": 8.461547590177259e-07, "logits/chosen": 0.22141221165657043, "logits/rejected": 0.3248814344406128, "logps/chosen": -6.352464199066162, "logps/rejected": -7.167881965637207, "loss": 0.5967, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -6.352464199066162, "rewards/margins": 0.8154180645942688, "rewards/rejected": -7.167881965637207, "step": 1855 }, { "epoch": 0.9954841946813848, "grad_norm": 15.14947910536799, "learning_rate": 8.450292588271014e-07, "logits/chosen": 0.21818144619464874, "logits/rejected": 0.314275324344635, "logps/chosen": -6.645957946777344, "logps/rejected": -7.430220127105713, "loss": 0.5436, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -6.645957946777344, "rewards/margins": 0.7842621207237244, "rewards/rejected": -7.430220127105713, "step": 1860 }, { "epoch": 0.9981602274627864, "grad_norm": 18.79829783213535, "learning_rate": 8.439004111610945e-07, "logits/chosen": 0.2498481571674347, "logits/rejected": 0.3003018796443939, "logps/chosen": -6.233012676239014, "logps/rejected": -7.057332515716553, "loss": 0.5589, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -6.233012676239014, "rewards/margins": 0.8243201971054077, "rewards/rejected": -7.057332515716553, "step": 1865 }, { "epoch": 1.000836260244188, "grad_norm": 14.177206507503046, "learning_rate": 8.427682269717901e-07, "logits/chosen": 0.24166591465473175, "logits/rejected": 0.335605651140213, "logps/chosen": -6.310477256774902, "logps/rejected": -7.1328558921813965, "loss": 0.525, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -6.310477256774902, "rewards/margins": 0.8223794102668762, "rewards/rejected": -7.1328558921813965, "step": 1870 }, { "epoch": 1.0035122930255895, "grad_norm": 15.19390387746427, "learning_rate": 8.416327172436446e-07, "logits/chosen": 0.21852943301200867, "logits/rejected": 0.3223131000995636, "logps/chosen": -6.043982028961182, "logps/rejected": -6.626863956451416, "loss": 0.5938, "rewards/accuracies": 0.65625, "rewards/chosen": -6.043982028961182, "rewards/margins": 0.5828819870948792, "rewards/rejected": -6.626863956451416, "step": 1875 }, { "epoch": 1.0061883258069912, "grad_norm": 11.913865729089455, "learning_rate": 8.404938929933778e-07, "logits/chosen": 0.3176259398460388, "logits/rejected": 0.4193806052207947, "logps/chosen": -5.699124336242676, "logps/rejected": -6.688092231750488, "loss": 0.4554, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -5.699124336242676, "rewards/margins": 0.9889682531356812, "rewards/rejected": -6.688092231750488, "step": 1880 }, { "epoch": 1.0088643585883927, "grad_norm": 12.751594689227122, "learning_rate": 8.39351765269868e-07, "logits/chosen": 0.25101524591445923, "logits/rejected": 0.31966131925582886, "logps/chosen": -5.547338962554932, "logps/rejected": -6.2216315269470215, "loss": 0.5795, "rewards/accuracies": 0.6875, "rewards/chosen": -5.547338962554932, "rewards/margins": 0.674292802810669, "rewards/rejected": -6.2216315269470215, "step": 1885 }, { "epoch": 1.0115403913697942, "grad_norm": 15.283329629391767, "learning_rate": 8.382063451540431e-07, "logits/chosen": 0.247838094830513, "logits/rejected": 0.4131386876106262, "logps/chosen": -5.80750036239624, "logps/rejected": -6.624395847320557, "loss": 0.5096, "rewards/accuracies": 0.75, "rewards/chosen": -5.80750036239624, "rewards/margins": 0.8168947100639343, "rewards/rejected": -6.624395847320557, "step": 1890 }, { "epoch": 1.014216424151196, "grad_norm": 12.516491977296903, "learning_rate": 8.370576437587742e-07, "logits/chosen": 0.29110175371170044, "logits/rejected": 0.3179221451282501, "logps/chosen": -5.713548183441162, "logps/rejected": -6.4964399337768555, "loss": 0.4962, "rewards/accuracies": 0.75, "rewards/chosen": -5.713548183441162, "rewards/margins": 0.7828910946846008, "rewards/rejected": -6.4964399337768555, "step": 1895 }, { "epoch": 1.0168924569325974, "grad_norm": 12.646703554483478, "learning_rate": 8.359056722287674e-07, "logits/chosen": 0.2126801460981369, "logits/rejected": 0.40291085839271545, "logps/chosen": -5.831676483154297, "logps/rejected": -6.67948055267334, "loss": 0.5049, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -5.831676483154297, "rewards/margins": 0.847804844379425, "rewards/rejected": -6.67948055267334, "step": 1900 }, { "epoch": 1.019568489713999, "grad_norm": 14.489443931073554, "learning_rate": 8.347504417404553e-07, "logits/chosen": 0.25622475147247314, "logits/rejected": 0.36224812269210815, "logps/chosen": -5.9590864181518555, "logps/rejected": -6.749197483062744, "loss": 0.5402, "rewards/accuracies": 0.71875, "rewards/chosen": -5.9590864181518555, "rewards/margins": 0.790111243724823, "rewards/rejected": -6.749197483062744, "step": 1905 }, { "epoch": 1.0222445224954007, "grad_norm": 11.065587677899948, "learning_rate": 8.335919635018893e-07, "logits/chosen": 0.12853407859802246, "logits/rejected": 0.21952052414417267, "logps/chosen": -5.948999881744385, "logps/rejected": -6.663697719573975, "loss": 0.5314, "rewards/accuracies": 0.78125, "rewards/chosen": -5.948999881744385, "rewards/margins": 0.7146986722946167, "rewards/rejected": -6.663697719573975, "step": 1910 }, { "epoch": 1.0249205552768021, "grad_norm": 12.767969162674545, "learning_rate": 8.324302487526303e-07, "logits/chosen": 0.18974968791007996, "logits/rejected": 0.2903616726398468, "logps/chosen": -6.230439186096191, "logps/rejected": -6.996090888977051, "loss": 0.5204, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -6.230439186096191, "rewards/margins": 0.7656509280204773, "rewards/rejected": -6.996090888977051, "step": 1915 }, { "epoch": 1.0275965880582036, "grad_norm": 13.152626302297893, "learning_rate": 8.312653087636398e-07, "logits/chosen": 0.18966086208820343, "logits/rejected": 0.2524445652961731, "logps/chosen": -6.036347389221191, "logps/rejected": -6.925161838531494, "loss": 0.5158, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -6.036347389221191, "rewards/margins": 0.8888150453567505, "rewards/rejected": -6.925161838531494, "step": 1920 }, { "epoch": 1.0302726208396054, "grad_norm": 18.043701624127852, "learning_rate": 8.300971548371711e-07, "logits/chosen": 0.06610165536403656, "logits/rejected": 0.2325301617383957, "logps/chosen": -6.377996921539307, "logps/rejected": -7.129579067230225, "loss": 0.535, "rewards/accuracies": 0.71875, "rewards/chosen": -6.377996921539307, "rewards/margins": 0.7515822052955627, "rewards/rejected": -7.129579067230225, "step": 1925 }, { "epoch": 1.0329486536210069, "grad_norm": 17.96252946090006, "learning_rate": 8.289257983066582e-07, "logits/chosen": 0.0962035059928894, "logits/rejected": 0.20685270428657532, "logps/chosen": -6.180153846740723, "logps/rejected": -6.946649074554443, "loss": 0.5389, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -6.180153846740723, "rewards/margins": 0.766494870185852, "rewards/rejected": -6.946649074554443, "step": 1930 }, { "epoch": 1.0356246864024083, "grad_norm": 16.188596755160887, "learning_rate": 8.277512505366077e-07, "logits/chosen": 0.04402122646570206, "logits/rejected": 0.22091951966285706, "logps/chosen": -6.179617881774902, "logps/rejected": -7.041774749755859, "loss": 0.519, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -6.179617881774902, "rewards/margins": 0.8621565699577332, "rewards/rejected": -7.041774749755859, "step": 1935 }, { "epoch": 1.03830071918381, "grad_norm": 16.767151230628347, "learning_rate": 8.265735229224868e-07, "logits/chosen": 0.1340595781803131, "logits/rejected": 0.23955588042736053, "logps/chosen": -6.057554721832275, "logps/rejected": -7.098371982574463, "loss": 0.4874, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -6.057554721832275, "rewards/margins": 1.0408169031143188, "rewards/rejected": -7.098371982574463, "step": 1940 }, { "epoch": 1.0409767519652116, "grad_norm": 13.098202171278327, "learning_rate": 8.253926268906144e-07, "logits/chosen": 0.1157931536436081, "logits/rejected": 0.25393274426460266, "logps/chosen": -6.154962062835693, "logps/rejected": -7.197298526763916, "loss": 0.4537, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -6.154962062835693, "rewards/margins": 1.0423352718353271, "rewards/rejected": -7.197298526763916, "step": 1945 }, { "epoch": 1.043652784746613, "grad_norm": 13.517495678477655, "learning_rate": 8.242085738980487e-07, "logits/chosen": 0.17463275790214539, "logits/rejected": 0.36602383852005005, "logps/chosen": -6.2473320960998535, "logps/rejected": -7.102453708648682, "loss": 0.5337, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -6.2473320960998535, "rewards/margins": 0.8551211357116699, "rewards/rejected": -7.102453708648682, "step": 1950 }, { "epoch": 1.0463288175280148, "grad_norm": 20.712260359978668, "learning_rate": 8.230213754324772e-07, "logits/chosen": 0.13900445401668549, "logits/rejected": 0.2082657366991043, "logps/chosen": -6.22186803817749, "logps/rejected": -7.050421714782715, "loss": 0.4953, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -6.22186803817749, "rewards/margins": 0.8285531997680664, "rewards/rejected": -7.050421714782715, "step": 1955 }, { "epoch": 1.0490048503094163, "grad_norm": 19.229692394592984, "learning_rate": 8.218310430121045e-07, "logits/chosen": 0.16844718158245087, "logits/rejected": 0.2008642852306366, "logps/chosen": -6.170546054840088, "logps/rejected": -6.932792663574219, "loss": 0.5512, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -6.170546054840088, "rewards/margins": 0.7622462511062622, "rewards/rejected": -6.932792663574219, "step": 1960 }, { "epoch": 1.051680883090818, "grad_norm": 14.914611475366184, "learning_rate": 8.20637588185541e-07, "logits/chosen": 0.20437756180763245, "logits/rejected": 0.2762525975704193, "logps/chosen": -6.374381065368652, "logps/rejected": -7.5163254737854, "loss": 0.4453, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -6.374381065368652, "rewards/margins": 1.1419436931610107, "rewards/rejected": -7.5163254737854, "step": 1965 }, { "epoch": 1.0543569158722195, "grad_norm": 16.03219924267573, "learning_rate": 8.194410225316906e-07, "logits/chosen": 0.14394992589950562, "logits/rejected": 0.27011603116989136, "logps/chosen": -6.165439128875732, "logps/rejected": -6.9398345947265625, "loss": 0.5303, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -6.165439128875732, "rewards/margins": 0.7743954062461853, "rewards/rejected": -6.9398345947265625, "step": 1970 }, { "epoch": 1.057032948653621, "grad_norm": 16.120836189167232, "learning_rate": 8.182413576596385e-07, "logits/chosen": 0.24175231158733368, "logits/rejected": 0.2851027548313141, "logps/chosen": -6.205076217651367, "logps/rejected": -7.016226768493652, "loss": 0.5483, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -6.205076217651367, "rewards/margins": 0.8111513257026672, "rewards/rejected": -7.016226768493652, "step": 1975 }, { "epoch": 1.0597089814350227, "grad_norm": 19.369070787397895, "learning_rate": 8.170386052085389e-07, "logits/chosen": 0.24014422297477722, "logits/rejected": 0.3286511301994324, "logps/chosen": -6.2573981285095215, "logps/rejected": -7.114475250244141, "loss": 0.5342, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -6.2573981285095215, "rewards/margins": 0.8570768237113953, "rewards/rejected": -7.114475250244141, "step": 1980 }, { "epoch": 1.0623850142164242, "grad_norm": 18.48545187316394, "learning_rate": 8.158327768475008e-07, "logits/chosen": 0.17265857756137848, "logits/rejected": 0.284074604511261, "logps/chosen": -6.185421466827393, "logps/rejected": -6.857685089111328, "loss": 0.584, "rewards/accuracies": 0.6875, "rewards/chosen": -6.185421466827393, "rewards/margins": 0.6722639203071594, "rewards/rejected": -6.857685089111328, "step": 1985 }, { "epoch": 1.0650610469978257, "grad_norm": 20.273390556120045, "learning_rate": 8.146238842754767e-07, "logits/chosen": 0.1347537785768509, "logits/rejected": 0.21253633499145508, "logps/chosen": -6.445331573486328, "logps/rejected": -7.1278276443481445, "loss": 0.5594, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -6.445331573486328, "rewards/margins": 0.6824960112571716, "rewards/rejected": -7.1278276443481445, "step": 1990 }, { "epoch": 1.0677370797792274, "grad_norm": 20.4320617951501, "learning_rate": 8.134119392211476e-07, "logits/chosen": 0.20157483220100403, "logits/rejected": 0.3361803889274597, "logps/chosen": -6.184488296508789, "logps/rejected": -7.191983222961426, "loss": 0.4957, "rewards/accuracies": 0.75, "rewards/chosen": -6.184488296508789, "rewards/margins": 1.007494568824768, "rewards/rejected": -7.191983222961426, "step": 1995 }, { "epoch": 1.0704131125606289, "grad_norm": 23.43161371763835, "learning_rate": 8.121969534428094e-07, "logits/chosen": 0.12299346923828125, "logits/rejected": 0.26505616307258606, "logps/chosen": -6.256556034088135, "logps/rejected": -7.002509117126465, "loss": 0.5671, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -6.256556034088135, "rewards/margins": 0.7459535598754883, "rewards/rejected": -7.002509117126465, "step": 2000 }, { "epoch": 1.0704131125606289, "eval_logits/chosen": 0.45361965894699097, "eval_logits/rejected": 0.5458019971847534, "eval_logps/chosen": -6.129868984222412, "eval_logps/rejected": -6.974369049072266, "eval_loss": 0.5245053768157959, "eval_rewards/accuracies": 0.7270029783248901, "eval_rewards/chosen": -6.129868984222412, "eval_rewards/margins": 0.8444996476173401, "eval_rewards/rejected": -6.974369049072266, "eval_runtime": 40.4927, "eval_samples_per_second": 33.216, "eval_steps_per_second": 8.322, "step": 2000 }, { "epoch": 1.0730891453420304, "grad_norm": 17.202130595752188, "learning_rate": 8.109789387282599e-07, "logits/chosen": 0.17678597569465637, "logits/rejected": 0.22834455966949463, "logps/chosen": -6.1362104415893555, "logps/rejected": -6.886933326721191, "loss": 0.5577, "rewards/accuracies": 0.6875, "rewards/chosen": -6.1362104415893555, "rewards/margins": 0.7507225275039673, "rewards/rejected": -6.886933326721191, "step": 2005 }, { "epoch": 1.075765178123432, "grad_norm": 18.57114026496875, "learning_rate": 8.097579068946827e-07, "logits/chosen": 0.1430993676185608, "logits/rejected": 0.25479885935783386, "logps/chosen": -5.889231204986572, "logps/rejected": -6.660426139831543, "loss": 0.5134, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -5.889231204986572, "rewards/margins": 0.7711955904960632, "rewards/rejected": -6.660426139831543, "step": 2010 }, { "epoch": 1.0784412109048336, "grad_norm": 17.049440919266214, "learning_rate": 8.085338697885344e-07, "logits/chosen": 0.18931570649147034, "logits/rejected": 0.28908759355545044, "logps/chosen": -6.022083282470703, "logps/rejected": -6.789105415344238, "loss": 0.518, "rewards/accuracies": 0.75, "rewards/chosen": -6.022083282470703, "rewards/margins": 0.7670217156410217, "rewards/rejected": -6.789105415344238, "step": 2015 }, { "epoch": 1.081117243686235, "grad_norm": 15.936337816879359, "learning_rate": 8.073068392854282e-07, "logits/chosen": 0.03720499202609062, "logits/rejected": 0.22801735997200012, "logps/chosen": -6.139120101928711, "logps/rejected": -7.007529258728027, "loss": 0.4747, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -6.139120101928711, "rewards/margins": 0.8684089779853821, "rewards/rejected": -7.007529258728027, "step": 2020 }, { "epoch": 1.0837932764676368, "grad_norm": 13.691033291814302, "learning_rate": 8.060768272900193e-07, "logits/chosen": 0.15387536585330963, "logits/rejected": 0.2870241701602936, "logps/chosen": -5.81230354309082, "logps/rejected": -6.625840187072754, "loss": 0.5444, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -5.81230354309082, "rewards/margins": 0.8135362863540649, "rewards/rejected": -6.625840187072754, "step": 2025 }, { "epoch": 1.0864693092490383, "grad_norm": 10.10387044811954, "learning_rate": 8.0484384573589e-07, "logits/chosen": 0.0756431445479393, "logits/rejected": 0.128414124250412, "logps/chosen": -5.8111958503723145, "logps/rejected": -6.627129554748535, "loss": 0.5213, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -5.8111958503723145, "rewards/margins": 0.8159326314926147, "rewards/rejected": -6.627129554748535, "step": 2030 }, { "epoch": 1.0891453420304398, "grad_norm": 17.913277336454254, "learning_rate": 8.03607906585432e-07, "logits/chosen": 0.0950445681810379, "logits/rejected": 0.2263830602169037, "logps/chosen": -5.970423221588135, "logps/rejected": -6.686938285827637, "loss": 0.553, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -5.970423221588135, "rewards/margins": 0.7165160179138184, "rewards/rejected": -6.686938285827637, "step": 2035 }, { "epoch": 1.0918213748118415, "grad_norm": 29.452473935554462, "learning_rate": 8.023690218297329e-07, "logits/chosen": 0.010259026661515236, "logits/rejected": 0.06864193826913834, "logps/chosen": -5.929680824279785, "logps/rejected": -6.849461555480957, "loss": 0.5005, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -5.929680824279785, "rewards/margins": 0.9197807312011719, "rewards/rejected": -6.849461555480957, "step": 2040 }, { "epoch": 1.094497407593243, "grad_norm": 17.263033564831858, "learning_rate": 8.01127203488458e-07, "logits/chosen": 0.10500442981719971, "logits/rejected": 0.15882495045661926, "logps/chosen": -6.237523078918457, "logps/rejected": -6.955941677093506, "loss": 0.5405, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -6.237523078918457, "rewards/margins": 0.7184182405471802, "rewards/rejected": -6.955941677093506, "step": 2045 }, { "epoch": 1.0971734403746445, "grad_norm": 19.529431209319867, "learning_rate": 7.998824636097339e-07, "logits/chosen": 0.06514953076839447, "logits/rejected": 0.18319979310035706, "logps/chosen": -6.027078151702881, "logps/rejected": -6.923079490661621, "loss": 0.5012, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -6.027078151702881, "rewards/margins": 0.8960021734237671, "rewards/rejected": -6.923079490661621, "step": 2050 }, { "epoch": 1.0998494731560462, "grad_norm": 15.349318991715643, "learning_rate": 7.986348142700328e-07, "logits/chosen": 0.11661596596240997, "logits/rejected": 0.24865713715553284, "logps/chosen": -6.0407490730285645, "logps/rejected": -7.0623979568481445, "loss": 0.4869, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -6.0407490730285645, "rewards/margins": 1.0216484069824219, "rewards/rejected": -7.0623979568481445, "step": 2055 }, { "epoch": 1.1025255059374477, "grad_norm": 24.023943434761783, "learning_rate": 7.973842675740539e-07, "logits/chosen": 0.17658278346061707, "logits/rejected": 0.23971199989318848, "logps/chosen": -5.893896102905273, "logps/rejected": -6.807537078857422, "loss": 0.4995, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -5.893896102905273, "rewards/margins": 0.9136406183242798, "rewards/rejected": -6.807537078857422, "step": 2060 }, { "epoch": 1.1052015387188494, "grad_norm": 17.890384827933413, "learning_rate": 7.961308356546066e-07, "logits/chosen": 0.15690180659294128, "logits/rejected": 0.28969475626945496, "logps/chosen": -6.024742126464844, "logps/rejected": -7.083406925201416, "loss": 0.4778, "rewards/accuracies": 0.78125, "rewards/chosen": -6.024742126464844, "rewards/margins": 1.0586647987365723, "rewards/rejected": -7.083406925201416, "step": 2065 }, { "epoch": 1.107877571500251, "grad_norm": 15.783624593467467, "learning_rate": 7.948745306724931e-07, "logits/chosen": 0.22627416253089905, "logits/rejected": 0.36442768573760986, "logps/chosen": -5.594944477081299, "logps/rejected": -6.624652862548828, "loss": 0.441, "rewards/accuracies": 0.78125, "rewards/chosen": -5.594944477081299, "rewards/margins": 1.0297082662582397, "rewards/rejected": -6.624652862548828, "step": 2070 }, { "epoch": 1.1105536042816524, "grad_norm": 22.11339926214215, "learning_rate": 7.936153648163897e-07, "logits/chosen": 0.16960184276103973, "logits/rejected": 0.2633897364139557, "logps/chosen": -5.7268147468566895, "logps/rejected": -6.534048557281494, "loss": 0.5136, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -5.7268147468566895, "rewards/margins": 0.8072345852851868, "rewards/rejected": -6.534048557281494, "step": 2075 }, { "epoch": 1.1132296370630541, "grad_norm": 16.538826769958156, "learning_rate": 7.92353350302729e-07, "logits/chosen": 0.11724768579006195, "logits/rejected": 0.25568071007728577, "logps/chosen": -5.619394302368164, "logps/rejected": -6.547186851501465, "loss": 0.4983, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -5.619394302368164, "rewards/margins": 0.9277926683425903, "rewards/rejected": -6.547186851501465, "step": 2080 }, { "epoch": 1.1159056698444556, "grad_norm": 22.12911334959255, "learning_rate": 7.910884993755816e-07, "logits/chosen": 0.16513338685035706, "logits/rejected": 0.23703798651695251, "logps/chosen": -5.650341033935547, "logps/rejected": -6.640130519866943, "loss": 0.4895, "rewards/accuracies": 0.75, "rewards/chosen": -5.650341033935547, "rewards/margins": 0.9897898435592651, "rewards/rejected": -6.640130519866943, "step": 2085 }, { "epoch": 1.118581702625857, "grad_norm": 17.36768109885876, "learning_rate": 7.898208243065367e-07, "logits/chosen": 0.1310061663389206, "logits/rejected": 0.15318405628204346, "logps/chosen": -5.502734184265137, "logps/rejected": -6.223504066467285, "loss": 0.549, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -5.502734184265137, "rewards/margins": 0.7207694053649902, "rewards/rejected": -6.223504066467285, "step": 2090 }, { "epoch": 1.1212577354072588, "grad_norm": 18.328817717618396, "learning_rate": 7.88550337394583e-07, "logits/chosen": 0.10827328264713287, "logits/rejected": 0.21655476093292236, "logps/chosen": -5.9376630783081055, "logps/rejected": -6.719278812408447, "loss": 0.5433, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -5.9376630783081055, "rewards/margins": 0.7816165089607239, "rewards/rejected": -6.719278812408447, "step": 2095 }, { "epoch": 1.1239337681886603, "grad_norm": 22.455326932435337, "learning_rate": 7.872770509659905e-07, "logits/chosen": 0.18621854484081268, "logits/rejected": 0.22991891205310822, "logps/chosen": -5.907458305358887, "logps/rejected": -6.648918151855469, "loss": 0.5582, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -5.907458305358887, "rewards/margins": 0.7414597868919373, "rewards/rejected": -6.648918151855469, "step": 2100 }, { "epoch": 1.1266098009700618, "grad_norm": 13.716933753033263, "learning_rate": 7.860009773741896e-07, "logits/chosen": 0.14775003492832184, "logits/rejected": 0.26721328496932983, "logps/chosen": -5.644070625305176, "logps/rejected": -6.704782962799072, "loss": 0.4506, "rewards/accuracies": 0.78125, "rewards/chosen": -5.644070625305176, "rewards/margins": 1.060712456703186, "rewards/rejected": -6.704782962799072, "step": 2105 }, { "epoch": 1.1292858337514635, "grad_norm": 16.71446898693671, "learning_rate": 7.84722128999652e-07, "logits/chosen": 0.13316002488136292, "logits/rejected": 0.21198323369026184, "logps/chosen": -5.823159694671631, "logps/rejected": -6.863847255706787, "loss": 0.4941, "rewards/accuracies": 0.75, "rewards/chosen": -5.823159694671631, "rewards/margins": 1.040687918663025, "rewards/rejected": -6.863847255706787, "step": 2110 }, { "epoch": 1.131961866532865, "grad_norm": 16.162722320129237, "learning_rate": 7.834405182497699e-07, "logits/chosen": 0.179213747382164, "logits/rejected": 0.22565798461437225, "logps/chosen": -6.053199768066406, "logps/rejected": -6.9789581298828125, "loss": 0.5073, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -6.053199768066406, "rewards/margins": 0.9257580637931824, "rewards/rejected": -6.9789581298828125, "step": 2115 }, { "epoch": 1.1346378993142665, "grad_norm": 17.709376557733414, "learning_rate": 7.821561575587368e-07, "logits/chosen": 0.09293942153453827, "logits/rejected": 0.13577643036842346, "logps/chosen": -5.983864784240723, "logps/rejected": -6.727473258972168, "loss": 0.5311, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -5.983864784240723, "rewards/margins": 0.7436081171035767, "rewards/rejected": -6.727473258972168, "step": 2120 }, { "epoch": 1.1373139320956682, "grad_norm": 15.212029725976805, "learning_rate": 7.808690593874254e-07, "logits/chosen": 0.10780803114175797, "logits/rejected": 0.1763109564781189, "logps/chosen": -6.165822505950928, "logps/rejected": -7.049136161804199, "loss": 0.5148, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -6.165822505950928, "rewards/margins": 0.8833147287368774, "rewards/rejected": -7.049136161804199, "step": 2125 }, { "epoch": 1.1399899648770697, "grad_norm": 18.09939469891023, "learning_rate": 7.79579236223268e-07, "logits/chosen": 0.19880545139312744, "logits/rejected": 0.35581156611442566, "logps/chosen": -5.880246162414551, "logps/rejected": -6.855000972747803, "loss": 0.4925, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -5.880246162414551, "rewards/margins": 0.9747552871704102, "rewards/rejected": -6.855000972747803, "step": 2130 }, { "epoch": 1.1426659976584714, "grad_norm": 19.227773761703453, "learning_rate": 7.782867005801346e-07, "logits/chosen": 0.1398889720439911, "logits/rejected": 0.2980535626411438, "logps/chosen": -5.7821221351623535, "logps/rejected": -6.801084041595459, "loss": 0.4919, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -5.7821221351623535, "rewards/margins": 1.0189621448516846, "rewards/rejected": -6.801084041595459, "step": 2135 }, { "epoch": 1.145342030439873, "grad_norm": 22.01558259233896, "learning_rate": 7.769914649982117e-07, "logits/chosen": 0.16456761956214905, "logits/rejected": 0.2562982439994812, "logps/chosen": -5.685970783233643, "logps/rejected": -6.577097415924072, "loss": 0.5015, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -5.685970783233643, "rewards/margins": 0.8911269903182983, "rewards/rejected": -6.577097415924072, "step": 2140 }, { "epoch": 1.1480180632212744, "grad_norm": 15.103882252563077, "learning_rate": 7.756935420438803e-07, "logits/chosen": 0.18431316316127777, "logits/rejected": 0.2743546962738037, "logps/chosen": -5.622121334075928, "logps/rejected": -6.8304290771484375, "loss": 0.4604, "rewards/accuracies": 0.78125, "rewards/chosen": -5.622121334075928, "rewards/margins": 1.2083070278167725, "rewards/rejected": -6.8304290771484375, "step": 2145 }, { "epoch": 1.1506940960026761, "grad_norm": 16.77403037306164, "learning_rate": 7.743929443095951e-07, "logits/chosen": 0.17035691440105438, "logits/rejected": 0.23822757601737976, "logps/chosen": -5.837258338928223, "logps/rejected": -6.805042266845703, "loss": 0.4627, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -5.837258338928223, "rewards/margins": 0.9677834510803223, "rewards/rejected": -6.805042266845703, "step": 2150 }, { "epoch": 1.1533701287840776, "grad_norm": 17.493072916503955, "learning_rate": 7.730896844137609e-07, "logits/chosen": 0.2099418193101883, "logits/rejected": 0.2816038429737091, "logps/chosen": -6.0883002281188965, "logps/rejected": -6.817152976989746, "loss": 0.5725, "rewards/accuracies": 0.71875, "rewards/chosen": -6.0883002281188965, "rewards/margins": 0.7288532257080078, "rewards/rejected": -6.817152976989746, "step": 2155 }, { "epoch": 1.1560461615654791, "grad_norm": 17.640230722089512, "learning_rate": 7.717837750006106e-07, "logits/chosen": 0.20208656787872314, "logits/rejected": 0.2683083415031433, "logps/chosen": -5.794085502624512, "logps/rejected": -6.851934909820557, "loss": 0.5023, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -5.794085502624512, "rewards/margins": 1.057849407196045, "rewards/rejected": -6.851934909820557, "step": 2160 }, { "epoch": 1.1587221943468808, "grad_norm": 18.484259582569003, "learning_rate": 7.704752287400832e-07, "logits/chosen": 0.1902150809764862, "logits/rejected": 0.3391354978084564, "logps/chosen": -5.876228332519531, "logps/rejected": -6.921287536621094, "loss": 0.4973, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -5.876228332519531, "rewards/margins": 1.0450587272644043, "rewards/rejected": -6.921287536621094, "step": 2165 }, { "epoch": 1.1613982271282823, "grad_norm": 11.454201196045164, "learning_rate": 7.691640583277004e-07, "logits/chosen": 0.2179061621427536, "logits/rejected": 0.3303486108779907, "logps/chosen": -5.808591365814209, "logps/rejected": -6.8543806076049805, "loss": 0.4968, "rewards/accuracies": 0.8125, "rewards/chosen": -5.808591365814209, "rewards/margins": 1.0457905530929565, "rewards/rejected": -6.8543806076049805, "step": 2170 }, { "epoch": 1.1640742599096838, "grad_norm": 13.234002780906847, "learning_rate": 7.678502764844433e-07, "logits/chosen": 0.11953376233577728, "logits/rejected": 0.2811211347579956, "logps/chosen": -5.876927375793457, "logps/rejected": -6.699154853820801, "loss": 0.5296, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -5.876927375793457, "rewards/margins": 0.822227954864502, "rewards/rejected": -6.699154853820801, "step": 2175 }, { "epoch": 1.1667502926910855, "grad_norm": 16.168015072552326, "learning_rate": 7.665338959566288e-07, "logits/chosen": 0.1649668961763382, "logits/rejected": 0.247666597366333, "logps/chosen": -5.852034568786621, "logps/rejected": -6.861608028411865, "loss": 0.4668, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -5.852034568786621, "rewards/margins": 1.009574055671692, "rewards/rejected": -6.861608028411865, "step": 2180 }, { "epoch": 1.169426325472487, "grad_norm": 18.575767271471857, "learning_rate": 7.652149295157868e-07, "logits/chosen": 0.24930758774280548, "logits/rejected": 0.3756440281867981, "logps/chosen": -6.021227836608887, "logps/rejected": -6.831601619720459, "loss": 0.5183, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -6.021227836608887, "rewards/margins": 0.8103734850883484, "rewards/rejected": -6.831601619720459, "step": 2185 }, { "epoch": 1.1721023582538885, "grad_norm": 17.960399002045953, "learning_rate": 7.638933899585354e-07, "logits/chosen": 0.2869338393211365, "logits/rejected": 0.30931708216667175, "logps/chosen": -5.834458351135254, "logps/rejected": -6.744782447814941, "loss": 0.5017, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -5.834458351135254, "rewards/margins": 0.9103237390518188, "rewards/rejected": -6.744782447814941, "step": 2190 }, { "epoch": 1.1747783910352902, "grad_norm": 16.52946452506619, "learning_rate": 7.625692901064573e-07, "logits/chosen": 0.18396279215812683, "logits/rejected": 0.2748900055885315, "logps/chosen": -6.143970489501953, "logps/rejected": -7.1088409423828125, "loss": 0.5272, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -6.143970489501953, "rewards/margins": 0.964870810508728, "rewards/rejected": -7.1088409423828125, "step": 2195 }, { "epoch": 1.1774544238166917, "grad_norm": 16.37478611059713, "learning_rate": 7.61242642805975e-07, "logits/chosen": 0.21236881613731384, "logits/rejected": 0.19955475628376007, "logps/chosen": -6.082472801208496, "logps/rejected": -6.950004577636719, "loss": 0.516, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -6.082472801208496, "rewards/margins": 0.8675323724746704, "rewards/rejected": -6.950004577636719, "step": 2200 }, { "epoch": 1.1801304565980932, "grad_norm": 17.687518598319897, "learning_rate": 7.599134609282266e-07, "logits/chosen": 0.13771533966064453, "logits/rejected": 0.2721841335296631, "logps/chosen": -6.354277610778809, "logps/rejected": -7.230761528015137, "loss": 0.5075, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -6.354277610778809, "rewards/margins": 0.8764839172363281, "rewards/rejected": -7.230761528015137, "step": 2205 }, { "epoch": 1.182806489379495, "grad_norm": 26.39987510472528, "learning_rate": 7.585817573689402e-07, "logits/chosen": 0.12672816216945648, "logits/rejected": 0.23820586502552032, "logps/chosen": -5.959419250488281, "logps/rejected": -7.042623996734619, "loss": 0.4626, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -5.959419250488281, "rewards/margins": 1.0832041501998901, "rewards/rejected": -7.042623996734619, "step": 2210 }, { "epoch": 1.1854825221608964, "grad_norm": 17.4533363132894, "learning_rate": 7.572475450483098e-07, "logits/chosen": 0.12043800204992294, "logits/rejected": 0.17995648086071014, "logps/chosen": -6.100076675415039, "logps/rejected": -6.964562892913818, "loss": 0.5223, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -6.100076675415039, "rewards/margins": 0.8644863963127136, "rewards/rejected": -6.964562892913818, "step": 2215 }, { "epoch": 1.188158554942298, "grad_norm": 17.480146823454703, "learning_rate": 7.559108369108689e-07, "logits/chosen": 0.1125221699476242, "logits/rejected": 0.21118512749671936, "logps/chosen": -5.943696022033691, "logps/rejected": -6.807468414306641, "loss": 0.5322, "rewards/accuracies": 0.71875, "rewards/chosen": -5.943696022033691, "rewards/margins": 0.8637717962265015, "rewards/rejected": -6.807468414306641, "step": 2220 }, { "epoch": 1.1908345877236997, "grad_norm": 13.092806690716051, "learning_rate": 7.54571645925366e-07, "logits/chosen": 0.1145809143781662, "logits/rejected": 0.3303011357784271, "logps/chosen": -5.782382011413574, "logps/rejected": -6.937787055969238, "loss": 0.4424, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -5.782382011413574, "rewards/margins": 1.1554043292999268, "rewards/rejected": -6.937787055969238, "step": 2225 }, { "epoch": 1.1935106205051011, "grad_norm": 16.14469305131183, "learning_rate": 7.532299850846378e-07, "logits/chosen": 0.17623932659626007, "logits/rejected": 0.32279539108276367, "logps/chosen": -5.644859313964844, "logps/rejected": -6.7017669677734375, "loss": 0.5071, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -5.644859313964844, "rewards/margins": 1.0569080114364624, "rewards/rejected": -6.7017669677734375, "step": 2230 }, { "epoch": 1.1961866532865026, "grad_norm": 21.84348454936625, "learning_rate": 7.518858674054838e-07, "logits/chosen": 0.18657724559307098, "logits/rejected": 0.3472784161567688, "logps/chosen": -5.7748613357543945, "logps/rejected": -6.686038970947266, "loss": 0.5098, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -5.7748613357543945, "rewards/margins": 0.9111774563789368, "rewards/rejected": -6.686038970947266, "step": 2235 }, { "epoch": 1.1988626860679044, "grad_norm": 17.98754835458458, "learning_rate": 7.505393059285394e-07, "logits/chosen": 0.17335431277751923, "logits/rejected": 0.31976914405822754, "logps/chosen": -5.914244174957275, "logps/rejected": -6.797031402587891, "loss": 0.5194, "rewards/accuracies": 0.78125, "rewards/chosen": -5.914244174957275, "rewards/margins": 0.8827871084213257, "rewards/rejected": -6.797031402587891, "step": 2240 }, { "epoch": 1.2015387188493059, "grad_norm": 24.60095038318492, "learning_rate": 7.491903137181501e-07, "logits/chosen": 0.20609024167060852, "logits/rejected": 0.2291346788406372, "logps/chosen": -5.750668525695801, "logps/rejected": -6.663257598876953, "loss": 0.4949, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -5.750668525695801, "rewards/margins": 0.9125885963439941, "rewards/rejected": -6.663257598876953, "step": 2245 }, { "epoch": 1.2042147516307076, "grad_norm": 14.409013081726695, "learning_rate": 7.478389038622441e-07, "logits/chosen": 0.2875801920890808, "logits/rejected": 0.3200586438179016, "logps/chosen": -6.002729892730713, "logps/rejected": -6.945242881774902, "loss": 0.5138, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -6.002729892730713, "rewards/margins": 0.9425133466720581, "rewards/rejected": -6.945242881774902, "step": 2250 }, { "epoch": 1.206890784412109, "grad_norm": 26.976855556154046, "learning_rate": 7.46485089472206e-07, "logits/chosen": 0.21935153007507324, "logits/rejected": 0.2860395908355713, "logps/chosen": -6.124142169952393, "logps/rejected": -6.994647026062012, "loss": 0.5461, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -6.124142169952393, "rewards/margins": 0.8705047369003296, "rewards/rejected": -6.994647026062012, "step": 2255 }, { "epoch": 1.2095668171935106, "grad_norm": 16.827776765150286, "learning_rate": 7.451288836827487e-07, "logits/chosen": 0.2290511578321457, "logits/rejected": 0.2399894893169403, "logps/chosen": -5.844072341918945, "logps/rejected": -6.615323066711426, "loss": 0.5407, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -5.844072341918945, "rewards/margins": 0.7712504267692566, "rewards/rejected": -6.615323066711426, "step": 2260 }, { "epoch": 1.2122428499749123, "grad_norm": 13.973064452727304, "learning_rate": 7.437702996517869e-07, "logits/chosen": 0.20473237335681915, "logits/rejected": 0.286801278591156, "logps/chosen": -5.871101379394531, "logps/rejected": -6.866879940032959, "loss": 0.4898, "rewards/accuracies": 0.78125, "rewards/chosen": -5.871101379394531, "rewards/margins": 0.9957789182662964, "rewards/rejected": -6.866879940032959, "step": 2265 }, { "epoch": 1.2149188827563138, "grad_norm": 20.788251642581187, "learning_rate": 7.424093505603087e-07, "logits/chosen": 0.12261930853128433, "logits/rejected": 0.2647572159767151, "logps/chosen": -6.016678810119629, "logps/rejected": -7.0653486251831055, "loss": 0.4557, "rewards/accuracies": 0.78125, "rewards/chosen": -6.016678810119629, "rewards/margins": 1.048669695854187, "rewards/rejected": -7.0653486251831055, "step": 2270 }, { "epoch": 1.2175949155377153, "grad_norm": 14.722286485245908, "learning_rate": 7.410460496122482e-07, "logits/chosen": 0.1587086170911789, "logits/rejected": 0.2953066825866699, "logps/chosen": -5.72568416595459, "logps/rejected": -6.942556858062744, "loss": 0.4268, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -5.72568416595459, "rewards/margins": 1.2168729305267334, "rewards/rejected": -6.942556858062744, "step": 2275 }, { "epoch": 1.220270948319117, "grad_norm": 20.649332345234036, "learning_rate": 7.396804100343572e-07, "logits/chosen": 0.14310452342033386, "logits/rejected": 0.290775328874588, "logps/chosen": -5.676429748535156, "logps/rejected": -6.626187801361084, "loss": 0.4842, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -5.676429748535156, "rewards/margins": 0.9497580528259277, "rewards/rejected": -6.626187801361084, "step": 2280 }, { "epoch": 1.2229469811005185, "grad_norm": 12.309968443395842, "learning_rate": 7.383124450760768e-07, "logits/chosen": 0.16771818697452545, "logits/rejected": 0.31539756059646606, "logps/chosen": -5.971822261810303, "logps/rejected": -6.971066951751709, "loss": 0.4803, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -5.971822261810303, "rewards/margins": 0.9992449879646301, "rewards/rejected": -6.971066951751709, "step": 2285 }, { "epoch": 1.22562301388192, "grad_norm": 19.59709062341259, "learning_rate": 7.369421680094091e-07, "logits/chosen": 0.09537863731384277, "logits/rejected": 0.21557554602622986, "logps/chosen": -5.9603800773620605, "logps/rejected": -7.002479553222656, "loss": 0.5229, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -5.9603800773620605, "rewards/margins": 1.0420993566513062, "rewards/rejected": -7.002479553222656, "step": 2290 }, { "epoch": 1.2282990466633217, "grad_norm": 24.96914235291267, "learning_rate": 7.355695921287881e-07, "logits/chosen": 0.10257432609796524, "logits/rejected": 0.1622716784477234, "logps/chosen": -6.157417297363281, "logps/rejected": -6.99847412109375, "loss": 0.5691, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -6.157417297363281, "rewards/margins": 0.8410555720329285, "rewards/rejected": -6.99847412109375, "step": 2295 }, { "epoch": 1.2309750794447232, "grad_norm": 19.314553185553457, "learning_rate": 7.341947307509513e-07, "logits/chosen": 0.14304575324058533, "logits/rejected": 0.250826895236969, "logps/chosen": -5.963129997253418, "logps/rejected": -6.9162726402282715, "loss": 0.5074, "rewards/accuracies": 0.75, "rewards/chosen": -5.963129997253418, "rewards/margins": 0.9531432390213013, "rewards/rejected": -6.9162726402282715, "step": 2300 }, { "epoch": 1.233651112226125, "grad_norm": 14.450663821618447, "learning_rate": 7.328175972148094e-07, "logits/chosen": 0.1500689536333084, "logits/rejected": 0.2425858974456787, "logps/chosen": -6.3051557540893555, "logps/rejected": -7.284122467041016, "loss": 0.4961, "rewards/accuracies": 0.78125, "rewards/chosen": -6.3051557540893555, "rewards/margins": 0.9789665341377258, "rewards/rejected": -7.284122467041016, "step": 2305 }, { "epoch": 1.2363271450075264, "grad_norm": 18.78687142240483, "learning_rate": 7.314382048813185e-07, "logits/chosen": 0.1635875552892685, "logits/rejected": 0.35941606760025024, "logps/chosen": -6.095834255218506, "logps/rejected": -7.130131721496582, "loss": 0.4741, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -6.095834255218506, "rewards/margins": 1.0342967510223389, "rewards/rejected": -7.130131721496582, "step": 2310 }, { "epoch": 1.2390031777889279, "grad_norm": 13.420998466953689, "learning_rate": 7.300565671333486e-07, "logits/chosen": 0.09271900355815887, "logits/rejected": 0.2630603611469269, "logps/chosen": -6.212649345397949, "logps/rejected": -7.1307878494262695, "loss": 0.5015, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -6.212649345397949, "rewards/margins": 0.9181381464004517, "rewards/rejected": -7.1307878494262695, "step": 2315 }, { "epoch": 1.2416792105703296, "grad_norm": 13.897025650806489, "learning_rate": 7.286726973755554e-07, "logits/chosen": 0.1887483447790146, "logits/rejected": 0.2300991714000702, "logps/chosen": -5.981460094451904, "logps/rejected": -6.973616600036621, "loss": 0.4483, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -5.981460094451904, "rewards/margins": 0.9921572804450989, "rewards/rejected": -6.973616600036621, "step": 2320 }, { "epoch": 1.244355243351731, "grad_norm": 15.317032426667316, "learning_rate": 7.272866090342493e-07, "logits/chosen": 0.22718966007232666, "logits/rejected": 0.26908794045448303, "logps/chosen": -5.7803192138671875, "logps/rejected": -6.809161186218262, "loss": 0.4262, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -5.7803192138671875, "rewards/margins": 1.0288417339324951, "rewards/rejected": -6.809161186218262, "step": 2325 }, { "epoch": 1.2470312761331326, "grad_norm": 19.184477957670573, "learning_rate": 7.258983155572656e-07, "logits/chosen": 0.13491706550121307, "logits/rejected": 0.22115369141101837, "logps/chosen": -5.7117791175842285, "logps/rejected": -6.639819145202637, "loss": 0.5109, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -5.7117791175842285, "rewards/margins": 0.9280405044555664, "rewards/rejected": -6.639819145202637, "step": 2330 }, { "epoch": 1.2497073089145343, "grad_norm": 15.469692638194354, "learning_rate": 7.245078304138335e-07, "logits/chosen": 0.18605497479438782, "logits/rejected": 0.2611579895019531, "logps/chosen": -5.945569038391113, "logps/rejected": -6.897100925445557, "loss": 0.4947, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -5.945569038391113, "rewards/margins": 0.951531708240509, "rewards/rejected": -6.897100925445557, "step": 2335 }, { "epoch": 1.2523833416959358, "grad_norm": 16.50168530356075, "learning_rate": 7.231151670944462e-07, "logits/chosen": 0.040394969284534454, "logits/rejected": 0.19212186336517334, "logps/chosen": -5.99670934677124, "logps/rejected": -6.910712242126465, "loss": 0.5084, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -5.99670934677124, "rewards/margins": 0.9140037298202515, "rewards/rejected": -6.910712242126465, "step": 2340 }, { "epoch": 1.2550593744773373, "grad_norm": 15.151994746315143, "learning_rate": 7.217203391107291e-07, "logits/chosen": 0.12506279349327087, "logits/rejected": 0.26483994722366333, "logps/chosen": -5.909339427947998, "logps/rejected": -6.89986515045166, "loss": 0.4975, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -5.909339427947998, "rewards/margins": 0.990526020526886, "rewards/rejected": -6.89986515045166, "step": 2345 }, { "epoch": 1.257735407258739, "grad_norm": 16.70194167387008, "learning_rate": 7.203233599953096e-07, "logits/chosen": 0.1429775059223175, "logits/rejected": 0.2698334753513336, "logps/chosen": -6.0720438957214355, "logps/rejected": -7.0256805419921875, "loss": 0.4798, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -6.0720438957214355, "rewards/margins": 0.9536367654800415, "rewards/rejected": -7.0256805419921875, "step": 2350 }, { "epoch": 1.2604114400401405, "grad_norm": 22.370967377965705, "learning_rate": 7.189242433016852e-07, "logits/chosen": 0.12595674395561218, "logits/rejected": 0.23196463286876678, "logps/chosen": -5.84996223449707, "logps/rejected": -6.92620325088501, "loss": 0.4656, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -5.84996223449707, "rewards/margins": 1.0762412548065186, "rewards/rejected": -6.92620325088501, "step": 2355 }, { "epoch": 1.263087472821542, "grad_norm": 20.316827313929224, "learning_rate": 7.17523002604092e-07, "logits/chosen": 0.11098027229309082, "logits/rejected": 0.24058881402015686, "logps/chosen": -6.27556848526001, "logps/rejected": -7.338924407958984, "loss": 0.4694, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -6.27556848526001, "rewards/margins": 1.0633567571640015, "rewards/rejected": -7.338924407958984, "step": 2360 }, { "epoch": 1.2657635056029437, "grad_norm": 20.504963205002127, "learning_rate": 7.161196514973734e-07, "logits/chosen": 0.14478333294391632, "logits/rejected": 0.25037336349487305, "logps/chosen": -6.167120456695557, "logps/rejected": -7.217752933502197, "loss": 0.4969, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -6.167120456695557, "rewards/margins": 1.0506327152252197, "rewards/rejected": -7.217752933502197, "step": 2365 }, { "epoch": 1.2684395383843452, "grad_norm": 18.488041736247038, "learning_rate": 7.147142035968483e-07, "logits/chosen": 0.2224707156419754, "logits/rejected": 0.34721943736076355, "logps/chosen": -6.559262275695801, "logps/rejected": -7.518952369689941, "loss": 0.4933, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -6.559262275695801, "rewards/margins": 0.9596904516220093, "rewards/rejected": -7.518952369689941, "step": 2370 }, { "epoch": 1.2711155711657467, "grad_norm": 19.763007230468382, "learning_rate": 7.133066725381781e-07, "logits/chosen": 0.10347548872232437, "logits/rejected": 0.22554615139961243, "logps/chosen": -6.3506646156311035, "logps/rejected": -7.3634467124938965, "loss": 0.4971, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -6.3506646156311035, "rewards/margins": 1.0127819776535034, "rewards/rejected": -7.3634467124938965, "step": 2375 }, { "epoch": 1.2737916039471484, "grad_norm": 22.25905205124137, "learning_rate": 7.118970719772354e-07, "logits/chosen": 0.15294012427330017, "logits/rejected": 0.29219064116477966, "logps/chosen": -6.451391696929932, "logps/rejected": -7.551220893859863, "loss": 0.4944, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -6.451391696929932, "rewards/margins": 1.0998289585113525, "rewards/rejected": -7.551220893859863, "step": 2380 }, { "epoch": 1.27646763672855, "grad_norm": 20.376588173901457, "learning_rate": 7.104854155899711e-07, "logits/chosen": 0.21640603244304657, "logits/rejected": 0.3168531060218811, "logps/chosen": -6.497882843017578, "logps/rejected": -7.4412031173706055, "loss": 0.5029, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -6.497882843017578, "rewards/margins": 0.9433202743530273, "rewards/rejected": -7.4412031173706055, "step": 2385 }, { "epoch": 1.2791436695099514, "grad_norm": 19.146432509291014, "learning_rate": 7.090717170722817e-07, "logits/chosen": 0.1998690366744995, "logits/rejected": 0.2478518784046173, "logps/chosen": -6.401918888092041, "logps/rejected": -7.590080261230469, "loss": 0.4338, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -6.401918888092041, "rewards/margins": 1.1881616115570068, "rewards/rejected": -7.590080261230469, "step": 2390 }, { "epoch": 1.2818197022913531, "grad_norm": 25.35475141745858, "learning_rate": 7.076559901398762e-07, "logits/chosen": 0.11758820712566376, "logits/rejected": 0.21281762421131134, "logps/chosen": -6.200407981872559, "logps/rejected": -7.060977935791016, "loss": 0.5226, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -6.200407981872559, "rewards/margins": 0.8605702519416809, "rewards/rejected": -7.060977935791016, "step": 2395 }, { "epoch": 1.2844957350727546, "grad_norm": 24.0486777455112, "learning_rate": 7.062382485281436e-07, "logits/chosen": 0.18151164054870605, "logits/rejected": 0.2502385079860687, "logps/chosen": -6.121356010437012, "logps/rejected": -6.976804256439209, "loss": 0.5184, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -6.121356010437012, "rewards/margins": 0.8554478883743286, "rewards/rejected": -6.976804256439209, "step": 2400 }, { "epoch": 1.2844957350727546, "eval_logits/chosen": 0.45949801802635193, "eval_logits/rejected": 0.5422627925872803, "eval_logps/chosen": -6.276676654815674, "eval_logps/rejected": -7.2502288818359375, "eval_loss": 0.5193930864334106, "eval_rewards/accuracies": 0.7299703359603882, "eval_rewards/chosen": -6.276676654815674, "eval_rewards/margins": 0.9735525846481323, "eval_rewards/rejected": -7.2502288818359375, "eval_runtime": 40.5571, "eval_samples_per_second": 33.163, "eval_steps_per_second": 8.309, "step": 2400 }, { "epoch": 1.287171767854156, "grad_norm": 15.314931049504239, "learning_rate": 7.048185059920193e-07, "logits/chosen": 0.19615164399147034, "logits/rejected": 0.31912848353385925, "logps/chosen": -6.090547561645508, "logps/rejected": -7.253050327301025, "loss": 0.4834, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -6.090547561645508, "rewards/margins": 1.1625028848648071, "rewards/rejected": -7.253050327301025, "step": 2405 }, { "epoch": 1.2898478006355578, "grad_norm": 20.405760730739928, "learning_rate": 7.033967763058516e-07, "logits/chosen": 0.12850715219974518, "logits/rejected": 0.2679179012775421, "logps/chosen": -6.171474456787109, "logps/rejected": -7.080384731292725, "loss": 0.4892, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -6.171474456787109, "rewards/margins": 0.9089103937149048, "rewards/rejected": -7.080384731292725, "step": 2410 }, { "epoch": 1.2925238334169593, "grad_norm": 17.35751364631236, "learning_rate": 7.019730732632681e-07, "logits/chosen": 0.26285654306411743, "logits/rejected": 0.34266722202301025, "logps/chosen": -6.166930675506592, "logps/rejected": -7.251943111419678, "loss": 0.4846, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -6.166930675506592, "rewards/margins": 1.085012435913086, "rewards/rejected": -7.251943111419678, "step": 2415 }, { "epoch": 1.2951998661983608, "grad_norm": 16.162076606512507, "learning_rate": 7.005474106770418e-07, "logits/chosen": 0.13909681141376495, "logits/rejected": 0.2284708321094513, "logps/chosen": -6.0758843421936035, "logps/rejected": -7.1303391456604, "loss": 0.493, "rewards/accuracies": 0.71875, "rewards/chosen": -6.0758843421936035, "rewards/margins": 1.0544540882110596, "rewards/rejected": -7.1303391456604, "step": 2420 }, { "epoch": 1.2978758989797625, "grad_norm": 16.26482173864461, "learning_rate": 6.991198023789577e-07, "logits/chosen": 0.16017483174800873, "logits/rejected": 0.23917949199676514, "logps/chosen": -5.7948713302612305, "logps/rejected": -6.659997463226318, "loss": 0.4985, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -5.7948713302612305, "rewards/margins": 0.8651263117790222, "rewards/rejected": -6.659997463226318, "step": 2425 }, { "epoch": 1.300551931761164, "grad_norm": 24.780442066875207, "learning_rate": 6.976902622196776e-07, "logits/chosen": 0.19367751479148865, "logits/rejected": 0.27410250902175903, "logps/chosen": -6.099483966827393, "logps/rejected": -7.036294460296631, "loss": 0.5218, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -6.099483966827393, "rewards/margins": 0.9368104934692383, "rewards/rejected": -7.036294460296631, "step": 2430 }, { "epoch": 1.3032279645425655, "grad_norm": 17.12728765173702, "learning_rate": 6.962588040686064e-07, "logits/chosen": 0.17056076228618622, "logits/rejected": 0.28316670656204224, "logps/chosen": -5.848304271697998, "logps/rejected": -6.768074035644531, "loss": 0.5486, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -5.848304271697998, "rewards/margins": 0.9197691679000854, "rewards/rejected": -6.768074035644531, "step": 2435 }, { "epoch": 1.3059039973239672, "grad_norm": 17.339726704621565, "learning_rate": 6.948254418137573e-07, "logits/chosen": 0.1776285618543625, "logits/rejected": 0.25934362411499023, "logps/chosen": -5.811620235443115, "logps/rejected": -6.803094387054443, "loss": 0.5178, "rewards/accuracies": 0.71875, "rewards/chosen": -5.811620235443115, "rewards/margins": 0.9914735555648804, "rewards/rejected": -6.803094387054443, "step": 2440 }, { "epoch": 1.3085800301053687, "grad_norm": 22.779486065265203, "learning_rate": 6.933901893616174e-07, "logits/chosen": 0.15147341787815094, "logits/rejected": 0.26968351006507874, "logps/chosen": -5.857061862945557, "logps/rejected": -6.726177215576172, "loss": 0.5338, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -5.857061862945557, "rewards/margins": 0.8691149950027466, "rewards/rejected": -6.726177215576172, "step": 2445 }, { "epoch": 1.3112560628867704, "grad_norm": 24.222908585069295, "learning_rate": 6.919530606370121e-07, "logits/chosen": 0.21891406178474426, "logits/rejected": 0.3230067789554596, "logps/chosen": -5.709885597229004, "logps/rejected": -6.641817569732666, "loss": 0.508, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -5.709885597229004, "rewards/margins": 0.9319319725036621, "rewards/rejected": -6.641817569732666, "step": 2450 }, { "epoch": 1.313932095668172, "grad_norm": 15.157958934430866, "learning_rate": 6.905140695829706e-07, "logits/chosen": 0.12486258894205093, "logits/rejected": 0.33449774980545044, "logps/chosen": -5.985657691955566, "logps/rejected": -6.978099822998047, "loss": 0.4788, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -5.985657691955566, "rewards/margins": 0.9924421310424805, "rewards/rejected": -6.978099822998047, "step": 2455 }, { "epoch": 1.3166081284495736, "grad_norm": 22.735210910458566, "learning_rate": 6.890732301605904e-07, "logits/chosen": 0.1811182200908661, "logits/rejected": 0.2520715892314911, "logps/chosen": -5.746588230133057, "logps/rejected": -6.6248345375061035, "loss": 0.5302, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -5.746588230133057, "rewards/margins": 0.8782461881637573, "rewards/rejected": -6.6248345375061035, "step": 2460 }, { "epoch": 1.3192841612309751, "grad_norm": 19.829917560041945, "learning_rate": 6.876305563489021e-07, "logits/chosen": 0.17515604197978973, "logits/rejected": 0.2685873806476593, "logps/chosen": -6.027820110321045, "logps/rejected": -7.131739139556885, "loss": 0.4336, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -6.027820110321045, "rewards/margins": 1.1039190292358398, "rewards/rejected": -7.131739139556885, "step": 2465 }, { "epoch": 1.3219601940123766, "grad_norm": 20.6882239173909, "learning_rate": 6.861860621447331e-07, "logits/chosen": 0.08686832338571548, "logits/rejected": 0.18017563223838806, "logps/chosen": -6.162463188171387, "logps/rejected": -6.984823703765869, "loss": 0.522, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -6.162463188171387, "rewards/margins": 0.8223612904548645, "rewards/rejected": -6.984823703765869, "step": 2470 }, { "epoch": 1.3246362267937783, "grad_norm": 19.942548487835126, "learning_rate": 6.847397615625725e-07, "logits/chosen": 0.21859896183013916, "logits/rejected": 0.26055413484573364, "logps/chosen": -6.1787004470825195, "logps/rejected": -7.019227504730225, "loss": 0.5396, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -6.1787004470825195, "rewards/margins": 0.8405271768569946, "rewards/rejected": -7.019227504730225, "step": 2475 }, { "epoch": 1.3273122595751798, "grad_norm": 14.140306099911832, "learning_rate": 6.83291668634435e-07, "logits/chosen": 0.12272389233112335, "logits/rejected": 0.2778452932834625, "logps/chosen": -6.114518165588379, "logps/rejected": -7.326783180236816, "loss": 0.4434, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -6.114518165588379, "rewards/margins": 1.2122652530670166, "rewards/rejected": -7.326783180236816, "step": 2480 }, { "epoch": 1.3299882923565813, "grad_norm": 16.680750428509285, "learning_rate": 6.818417974097246e-07, "logits/chosen": 0.3098542094230652, "logits/rejected": 0.4479276239871979, "logps/chosen": -6.061105251312256, "logps/rejected": -7.353457450866699, "loss": 0.4313, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -6.061105251312256, "rewards/margins": 1.2923524379730225, "rewards/rejected": -7.353457450866699, "step": 2485 }, { "epoch": 1.332664325137983, "grad_norm": 16.722513433982854, "learning_rate": 6.803901619550981e-07, "logits/chosen": 0.18296250700950623, "logits/rejected": 0.23452425003051758, "logps/chosen": -6.2257232666015625, "logps/rejected": -7.162517547607422, "loss": 0.4958, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -6.2257232666015625, "rewards/margins": 0.9367941617965698, "rewards/rejected": -7.162517547607422, "step": 2490 }, { "epoch": 1.3353403579193845, "grad_norm": 18.696440286974337, "learning_rate": 6.789367763543292e-07, "logits/chosen": 0.24497303366661072, "logits/rejected": 0.27524641156196594, "logps/chosen": -6.080451011657715, "logps/rejected": -6.997108459472656, "loss": 0.5526, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -6.080451011657715, "rewards/margins": 0.9166574478149414, "rewards/rejected": -6.997108459472656, "step": 2495 }, { "epoch": 1.338016390700786, "grad_norm": 25.04269188220103, "learning_rate": 6.774816547081714e-07, "logits/chosen": 0.21521957218647003, "logits/rejected": 0.3439801037311554, "logps/chosen": -6.106442451477051, "logps/rejected": -6.832700252532959, "loss": 0.5333, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -6.106442451477051, "rewards/margins": 0.726258397102356, "rewards/rejected": -6.832700252532959, "step": 2500 }, { "epoch": 1.3406924234821878, "grad_norm": 16.63209146903063, "learning_rate": 6.760248111342211e-07, "logits/chosen": 0.19699034094810486, "logits/rejected": 0.3251239061355591, "logps/chosen": -5.7267560958862305, "logps/rejected": -6.7977190017700195, "loss": 0.462, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -5.7267560958862305, "rewards/margins": 1.0709636211395264, "rewards/rejected": -6.7977190017700195, "step": 2505 }, { "epoch": 1.3433684562635893, "grad_norm": 20.235971475629395, "learning_rate": 6.745662597667813e-07, "logits/chosen": 0.17114906013011932, "logits/rejected": 0.2899430990219116, "logps/chosen": -5.537110328674316, "logps/rejected": -6.593564033508301, "loss": 0.4391, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -5.537110328674316, "rewards/margins": 1.0564533472061157, "rewards/rejected": -6.593564033508301, "step": 2510 }, { "epoch": 1.3460444890449907, "grad_norm": 15.973986201705568, "learning_rate": 6.731060147567236e-07, "logits/chosen": 0.261476069688797, "logits/rejected": 0.3387894034385681, "logps/chosen": -5.574726104736328, "logps/rejected": -6.59771728515625, "loss": 0.4756, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -5.574726104736328, "rewards/margins": 1.0229907035827637, "rewards/rejected": -6.59771728515625, "step": 2515 }, { "epoch": 1.3487205218263925, "grad_norm": 20.790584465598148, "learning_rate": 6.716440902713515e-07, "logits/chosen": 0.21439795196056366, "logits/rejected": 0.28630223870277405, "logps/chosen": -5.979474067687988, "logps/rejected": -6.945030212402344, "loss": 0.4629, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -5.979474067687988, "rewards/margins": 0.9655561447143555, "rewards/rejected": -6.945030212402344, "step": 2520 }, { "epoch": 1.351396554607794, "grad_norm": 19.26816069176633, "learning_rate": 6.701805004942627e-07, "logits/chosen": 0.2177700251340866, "logits/rejected": 0.27205830812454224, "logps/chosen": -6.0796051025390625, "logps/rejected": -7.090445518493652, "loss": 0.4944, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -6.0796051025390625, "rewards/margins": 1.0108410120010376, "rewards/rejected": -7.090445518493652, "step": 2525 }, { "epoch": 1.3540725873891954, "grad_norm": 22.77162766397486, "learning_rate": 6.687152596252119e-07, "logits/chosen": 0.2516229748725891, "logits/rejected": 0.3109031319618225, "logps/chosen": -6.38559103012085, "logps/rejected": -7.288458824157715, "loss": 0.5512, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -6.38559103012085, "rewards/margins": 0.9028676748275757, "rewards/rejected": -7.288458824157715, "step": 2530 }, { "epoch": 1.3567486201705972, "grad_norm": 27.067196338075696, "learning_rate": 6.672483818799722e-07, "logits/chosen": 0.21050748229026794, "logits/rejected": 0.32094651460647583, "logps/chosen": -6.413076877593994, "logps/rejected": -7.294304847717285, "loss": 0.5167, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -6.413076877593994, "rewards/margins": 0.8812281489372253, "rewards/rejected": -7.294304847717285, "step": 2535 }, { "epoch": 1.3594246529519987, "grad_norm": 18.796787052977887, "learning_rate": 6.657798814901978e-07, "logits/chosen": 0.22314269840717316, "logits/rejected": 0.3591897487640381, "logps/chosen": -6.5046186447143555, "logps/rejected": -7.42484188079834, "loss": 0.4933, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -6.5046186447143555, "rewards/margins": 0.9202238321304321, "rewards/rejected": -7.42484188079834, "step": 2540 }, { "epoch": 1.3621006857334002, "grad_norm": 16.517640891687257, "learning_rate": 6.643097727032863e-07, "logits/chosen": 0.18471279740333557, "logits/rejected": 0.3313792645931244, "logps/chosen": -6.379749298095703, "logps/rejected": -7.499573707580566, "loss": 0.4459, "rewards/accuracies": 0.78125, "rewards/chosen": -6.379749298095703, "rewards/margins": 1.1198241710662842, "rewards/rejected": -7.499573707580566, "step": 2545 }, { "epoch": 1.3647767185148019, "grad_norm": 18.70460775793003, "learning_rate": 6.628380697822392e-07, "logits/chosen": 0.22686338424682617, "logits/rejected": 0.33594751358032227, "logps/chosen": -6.5355424880981445, "logps/rejected": -7.333507537841797, "loss": 0.5511, "rewards/accuracies": 0.71875, "rewards/chosen": -6.5355424880981445, "rewards/margins": 0.7979653477668762, "rewards/rejected": -7.333507537841797, "step": 2550 }, { "epoch": 1.3674527512962034, "grad_norm": 20.63793668217959, "learning_rate": 6.61364787005525e-07, "logits/chosen": 0.22906167805194855, "logits/rejected": 0.33341819047927856, "logps/chosen": -5.982217788696289, "logps/rejected": -7.136512756347656, "loss": 0.4614, "rewards/accuracies": 0.8125, "rewards/chosen": -5.982217788696289, "rewards/margins": 1.1542942523956299, "rewards/rejected": -7.136512756347656, "step": 2555 }, { "epoch": 1.3701287840776049, "grad_norm": 21.39124501657475, "learning_rate": 6.598899386669395e-07, "logits/chosen": 0.2343016415834427, "logits/rejected": 0.32648029923439026, "logps/chosen": -6.168181419372559, "logps/rejected": -7.057168006896973, "loss": 0.5224, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -6.168181419372559, "rewards/margins": 0.8889864087104797, "rewards/rejected": -7.057168006896973, "step": 2560 }, { "epoch": 1.3728048168590066, "grad_norm": 24.46201688807348, "learning_rate": 6.584135390754679e-07, "logits/chosen": 0.21507863700389862, "logits/rejected": 0.3173958957195282, "logps/chosen": -5.926985263824463, "logps/rejected": -6.98834753036499, "loss": 0.4825, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -5.926985263824463, "rewards/margins": 1.0613621473312378, "rewards/rejected": -6.98834753036499, "step": 2565 }, { "epoch": 1.375480849640408, "grad_norm": 13.384207261800285, "learning_rate": 6.569356025551454e-07, "logits/chosen": 0.22934818267822266, "logits/rejected": 0.3059101998806, "logps/chosen": -5.866464614868164, "logps/rejected": -6.857785224914551, "loss": 0.5021, "rewards/accuracies": 0.75, "rewards/chosen": -5.866464614868164, "rewards/margins": 0.991320788860321, "rewards/rejected": -6.857785224914551, "step": 2570 }, { "epoch": 1.3781568824218096, "grad_norm": 25.48088697328881, "learning_rate": 6.554561434449186e-07, "logits/chosen": 0.12403549998998642, "logits/rejected": 0.2438894510269165, "logps/chosen": -5.888209342956543, "logps/rejected": -6.84426736831665, "loss": 0.5073, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -5.888209342956543, "rewards/margins": 0.9560573697090149, "rewards/rejected": -6.84426736831665, "step": 2575 }, { "epoch": 1.3808329152032113, "grad_norm": 23.80695122442853, "learning_rate": 6.539751760985063e-07, "logits/chosen": 0.18662911653518677, "logits/rejected": 0.2552419900894165, "logps/chosen": -6.183432579040527, "logps/rejected": -6.8864426612854, "loss": 0.5604, "rewards/accuracies": 0.6875, "rewards/chosen": -6.183432579040527, "rewards/margins": 0.7030097842216492, "rewards/rejected": -6.8864426612854, "step": 2580 }, { "epoch": 1.3835089479846128, "grad_norm": 20.865673265587535, "learning_rate": 6.524927148842602e-07, "logits/chosen": 0.25318893790245056, "logits/rejected": 0.3708975613117218, "logps/chosen": -5.9495649337768555, "logps/rejected": -6.879406929016113, "loss": 0.5191, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -5.9495649337768555, "rewards/margins": 0.9298413991928101, "rewards/rejected": -6.879406929016113, "step": 2585 }, { "epoch": 1.3861849807660143, "grad_norm": 21.533211858869397, "learning_rate": 6.510087741850254e-07, "logits/chosen": 0.1384793221950531, "logits/rejected": 0.2358626127243042, "logps/chosen": -5.8774027824401855, "logps/rejected": -6.806514739990234, "loss": 0.5115, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -5.8774027824401855, "rewards/margins": 0.9291118383407593, "rewards/rejected": -6.806514739990234, "step": 2590 }, { "epoch": 1.388861013547416, "grad_norm": 23.204334766677455, "learning_rate": 6.495233683980012e-07, "logits/chosen": 0.17257975041866302, "logits/rejected": 0.21821144223213196, "logps/chosen": -6.239302635192871, "logps/rejected": -7.068957328796387, "loss": 0.5111, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -6.239302635192871, "rewards/margins": 0.8296549916267395, "rewards/rejected": -7.068957328796387, "step": 2595 }, { "epoch": 1.3915370463288175, "grad_norm": 20.006112975586785, "learning_rate": 6.480365119346011e-07, "logits/chosen": 0.2793341279029846, "logits/rejected": 0.38580167293548584, "logps/chosen": -6.043140411376953, "logps/rejected": -7.001716613769531, "loss": 0.4825, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -6.043140411376953, "rewards/margins": 0.9585763812065125, "rewards/rejected": -7.001716613769531, "step": 2600 }, { "epoch": 1.394213079110219, "grad_norm": 14.45833122148035, "learning_rate": 6.465482192203129e-07, "logits/chosen": 0.24829316139221191, "logits/rejected": 0.306845486164093, "logps/chosen": -6.007363796234131, "logps/rejected": -6.947943687438965, "loss": 0.4808, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -6.007363796234131, "rewards/margins": 0.940579891204834, "rewards/rejected": -6.947943687438965, "step": 2605 }, { "epoch": 1.3968891118916207, "grad_norm": 25.767155584754562, "learning_rate": 6.45058504694559e-07, "logits/chosen": 0.2399607002735138, "logits/rejected": 0.26813334226608276, "logps/chosen": -6.171053409576416, "logps/rejected": -7.173449516296387, "loss": 0.4812, "rewards/accuracies": 0.75, "rewards/chosen": -6.171053409576416, "rewards/margins": 1.0023963451385498, "rewards/rejected": -7.173449516296387, "step": 2610 }, { "epoch": 1.3995651446730222, "grad_norm": 24.04460775689099, "learning_rate": 6.435673828105564e-07, "logits/chosen": 0.20154432952404022, "logits/rejected": 0.2984001338481903, "logps/chosen": -6.174256801605225, "logps/rejected": -7.223874568939209, "loss": 0.5052, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -6.174256801605225, "rewards/margins": 1.049617886543274, "rewards/rejected": -7.223874568939209, "step": 2615 }, { "epoch": 1.402241177454424, "grad_norm": 17.18694155274937, "learning_rate": 6.420748680351763e-07, "logits/chosen": 0.2612171471118927, "logits/rejected": 0.21179655194282532, "logps/chosen": -6.370073318481445, "logps/rejected": -7.127557277679443, "loss": 0.5594, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -6.370073318481445, "rewards/margins": 0.757483959197998, "rewards/rejected": -7.127557277679443, "step": 2620 }, { "epoch": 1.4049172102358254, "grad_norm": 27.42763231332852, "learning_rate": 6.405809748488032e-07, "logits/chosen": 0.22283296287059784, "logits/rejected": 0.3382520377635956, "logps/chosen": -6.305356979370117, "logps/rejected": -7.383749961853027, "loss": 0.5127, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -6.305356979370117, "rewards/margins": 1.0783923864364624, "rewards/rejected": -7.383749961853027, "step": 2625 }, { "epoch": 1.4075932430172269, "grad_norm": 18.261925724896905, "learning_rate": 6.390857177451956e-07, "logits/chosen": 0.11623439937829971, "logits/rejected": 0.260323166847229, "logps/chosen": -6.272123336791992, "logps/rejected": -7.137209415435791, "loss": 0.5031, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -6.272123336791992, "rewards/margins": 0.8650856018066406, "rewards/rejected": -7.137209415435791, "step": 2630 }, { "epoch": 1.4102692757986286, "grad_norm": 18.659306422535746, "learning_rate": 6.375891112313445e-07, "logits/chosen": 0.17161008715629578, "logits/rejected": 0.25503864884376526, "logps/chosen": -6.498648166656494, "logps/rejected": -7.474682807922363, "loss": 0.4784, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -6.498648166656494, "rewards/margins": 0.9760352969169617, "rewards/rejected": -7.474682807922363, "step": 2635 }, { "epoch": 1.41294530858003, "grad_norm": 19.511124687426072, "learning_rate": 6.360911698273326e-07, "logits/chosen": 0.26095929741859436, "logits/rejected": 0.3581278920173645, "logps/chosen": -6.594778537750244, "logps/rejected": -7.427119255065918, "loss": 0.5262, "rewards/accuracies": 0.71875, "rewards/chosen": -6.594778537750244, "rewards/margins": 0.8323402404785156, "rewards/rejected": -7.427119255065918, "step": 2640 }, { "epoch": 1.4156213413614318, "grad_norm": 14.94881715422757, "learning_rate": 6.345919080661944e-07, "logits/chosen": 0.23444826900959015, "logits/rejected": 0.28223916888237, "logps/chosen": -6.229555606842041, "logps/rejected": -7.274050712585449, "loss": 0.4532, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -6.229555606842041, "rewards/margins": 1.0444955825805664, "rewards/rejected": -7.274050712585449, "step": 2645 }, { "epoch": 1.4182973741428333, "grad_norm": 21.75474458834851, "learning_rate": 6.330913404937737e-07, "logits/chosen": 0.23042869567871094, "logits/rejected": 0.3141385614871979, "logps/chosen": -6.407815456390381, "logps/rejected": -7.53855037689209, "loss": 0.4603, "rewards/accuracies": 0.78125, "rewards/chosen": -6.407815456390381, "rewards/margins": 1.1307344436645508, "rewards/rejected": -7.53855037689209, "step": 2650 }, { "epoch": 1.4209734069242348, "grad_norm": 22.680627619534505, "learning_rate": 6.315894816685838e-07, "logits/chosen": 0.23714987933635712, "logits/rejected": 0.329181045293808, "logps/chosen": -6.348174571990967, "logps/rejected": -7.242630958557129, "loss": 0.4966, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -6.348174571990967, "rewards/margins": 0.8944567441940308, "rewards/rejected": -7.242630958557129, "step": 2655 }, { "epoch": 1.4236494397056365, "grad_norm": 15.8339265531164, "learning_rate": 6.300863461616657e-07, "logits/chosen": 0.23665933310985565, "logits/rejected": 0.2919309437274933, "logps/chosen": -6.0339555740356445, "logps/rejected": -6.838563442230225, "loss": 0.5661, "rewards/accuracies": 0.71875, "rewards/chosen": -6.0339555740356445, "rewards/margins": 0.804608166217804, "rewards/rejected": -6.838563442230225, "step": 2660 }, { "epoch": 1.426325472487038, "grad_norm": 13.827215078066121, "learning_rate": 6.285819485564465e-07, "logits/chosen": 0.1799328625202179, "logits/rejected": 0.2721804678440094, "logps/chosen": -6.298506259918213, "logps/rejected": -7.249512672424316, "loss": 0.469, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -6.298506259918213, "rewards/margins": 0.951005756855011, "rewards/rejected": -7.249512672424316, "step": 2665 }, { "epoch": 1.4290015052684395, "grad_norm": 17.34909261979186, "learning_rate": 6.270763034485986e-07, "logits/chosen": 0.29923123121261597, "logits/rejected": 0.3617471754550934, "logps/chosen": -6.427297115325928, "logps/rejected": -7.377211570739746, "loss": 0.4939, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -6.427297115325928, "rewards/margins": 0.9499150514602661, "rewards/rejected": -7.377211570739746, "step": 2670 }, { "epoch": 1.4316775380498412, "grad_norm": 28.92750023624755, "learning_rate": 6.255694254458972e-07, "logits/chosen": 0.28279370069503784, "logits/rejected": 0.3879690170288086, "logps/chosen": -6.5980939865112305, "logps/rejected": -7.543734550476074, "loss": 0.5308, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -6.5980939865112305, "rewards/margins": 0.945639967918396, "rewards/rejected": -7.543734550476074, "step": 2675 }, { "epoch": 1.4343535708312427, "grad_norm": 24.856973473543448, "learning_rate": 6.240613291680795e-07, "logits/chosen": 0.18259096145629883, "logits/rejected": 0.3060842454433441, "logps/chosen": -6.156343936920166, "logps/rejected": -7.090039253234863, "loss": 0.5348, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -6.156343936920166, "rewards/margins": 0.9336953163146973, "rewards/rejected": -7.090039253234863, "step": 2680 }, { "epoch": 1.4370296036126442, "grad_norm": 13.447701742733123, "learning_rate": 6.225520292467021e-07, "logits/chosen": 0.18624433875083923, "logits/rejected": 0.3402232527732849, "logps/chosen": -6.05726432800293, "logps/rejected": -7.2886528968811035, "loss": 0.4061, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -6.05726432800293, "rewards/margins": 1.2313883304595947, "rewards/rejected": -7.2886528968811035, "step": 2685 }, { "epoch": 1.439705636394046, "grad_norm": 24.36820160443709, "learning_rate": 6.210415403249993e-07, "logits/chosen": 0.08098767697811127, "logits/rejected": 0.25168949365615845, "logps/chosen": -6.051745414733887, "logps/rejected": -7.0631818771362305, "loss": 0.5172, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -6.051745414733887, "rewards/margins": 1.011436104774475, "rewards/rejected": -7.0631818771362305, "step": 2690 }, { "epoch": 1.4423816691754474, "grad_norm": 20.833016878226314, "learning_rate": 6.195298770577415e-07, "logits/chosen": 0.26706191897392273, "logits/rejected": 0.2743946611881256, "logps/chosen": -6.177419662475586, "logps/rejected": -7.165745735168457, "loss": 0.5036, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -6.177419662475586, "rewards/margins": 0.9883264303207397, "rewards/rejected": -7.165745735168457, "step": 2695 }, { "epoch": 1.445057701956849, "grad_norm": 16.847524337616605, "learning_rate": 6.180170541110923e-07, "logits/chosen": 0.1760694682598114, "logits/rejected": 0.2969212532043457, "logps/chosen": -6.273100852966309, "logps/rejected": -7.28066873550415, "loss": 0.4875, "rewards/accuracies": 0.78125, "rewards/chosen": -6.273100852966309, "rewards/margins": 1.0075680017471313, "rewards/rejected": -7.28066873550415, "step": 2700 }, { "epoch": 1.4477337347382506, "grad_norm": 15.924233626184876, "learning_rate": 6.165030861624663e-07, "logits/chosen": 0.11017143726348877, "logits/rejected": 0.23728613555431366, "logps/chosen": -6.381758213043213, "logps/rejected": -7.697253227233887, "loss": 0.4091, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -6.381758213043213, "rewards/margins": 1.3154950141906738, "rewards/rejected": -7.697253227233887, "step": 2705 }, { "epoch": 1.4504097675196521, "grad_norm": 22.324308475539247, "learning_rate": 6.149879879003876e-07, "logits/chosen": 0.20750120282173157, "logits/rejected": 0.22585979104042053, "logps/chosen": -6.222146987915039, "logps/rejected": -7.275618076324463, "loss": 0.4702, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -6.222146987915039, "rewards/margins": 1.0534698963165283, "rewards/rejected": -7.275618076324463, "step": 2710 }, { "epoch": 1.4530858003010536, "grad_norm": 19.061389200647003, "learning_rate": 6.13471774024346e-07, "logits/chosen": 0.08445904403924942, "logits/rejected": 0.17165620625019073, "logps/chosen": -6.124358654022217, "logps/rejected": -7.124139308929443, "loss": 0.4691, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -6.124358654022217, "rewards/margins": 0.9997808337211609, "rewards/rejected": -7.124139308929443, "step": 2715 }, { "epoch": 1.4557618330824553, "grad_norm": 13.871617423553499, "learning_rate": 6.119544592446551e-07, "logits/chosen": 0.10067824274301529, "logits/rejected": 0.20037582516670227, "logps/chosen": -6.243529319763184, "logps/rejected": -7.069155693054199, "loss": 0.5314, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -6.243529319763184, "rewards/margins": 0.8256263732910156, "rewards/rejected": -7.069155693054199, "step": 2720 }, { "epoch": 1.4584378658638568, "grad_norm": 23.479941237436133, "learning_rate": 6.104360582823096e-07, "logits/chosen": 0.17534135282039642, "logits/rejected": 0.22013907134532928, "logps/chosen": -5.980776309967041, "logps/rejected": -6.914759635925293, "loss": 0.4867, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -5.980776309967041, "rewards/margins": 0.9339843988418579, "rewards/rejected": -6.914759635925293, "step": 2725 }, { "epoch": 1.4611138986452583, "grad_norm": 17.23521657573716, "learning_rate": 6.089165858688423e-07, "logits/chosen": 0.1536351889371872, "logits/rejected": 0.2602033019065857, "logps/chosen": -5.878857612609863, "logps/rejected": -6.900129795074463, "loss": 0.4967, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -5.878857612609863, "rewards/margins": 1.0212715864181519, "rewards/rejected": -6.900129795074463, "step": 2730 }, { "epoch": 1.46378993142666, "grad_norm": 13.535813592318425, "learning_rate": 6.073960567461811e-07, "logits/chosen": 0.1762942671775818, "logits/rejected": 0.2913457751274109, "logps/chosen": -5.590030670166016, "logps/rejected": -6.7590532302856445, "loss": 0.4112, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -5.590030670166016, "rewards/margins": 1.1690219640731812, "rewards/rejected": -6.7590532302856445, "step": 2735 }, { "epoch": 1.4664659642080615, "grad_norm": 15.726626661186668, "learning_rate": 6.058744856665065e-07, "logits/chosen": 0.1258622407913208, "logits/rejected": 0.1856011301279068, "logps/chosen": -5.879513740539551, "logps/rejected": -6.9437737464904785, "loss": 0.464, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -5.879513740539551, "rewards/margins": 1.0642606019973755, "rewards/rejected": -6.9437737464904785, "step": 2740 }, { "epoch": 1.469141996989463, "grad_norm": 17.786581650894494, "learning_rate": 6.043518873921074e-07, "logits/chosen": 0.15566766262054443, "logits/rejected": 0.25233739614486694, "logps/chosen": -5.924063205718994, "logps/rejected": -6.872513771057129, "loss": 0.4757, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -5.924063205718994, "rewards/margins": 0.9484499096870422, "rewards/rejected": -6.872513771057129, "step": 2745 }, { "epoch": 1.4718180297708647, "grad_norm": 22.302444823654227, "learning_rate": 6.028282766952393e-07, "logits/chosen": 0.1867285817861557, "logits/rejected": 0.2665676772594452, "logps/chosen": -6.000852108001709, "logps/rejected": -7.099644660949707, "loss": 0.4732, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -6.000852108001709, "rewards/margins": 1.0987926721572876, "rewards/rejected": -7.099644660949707, "step": 2750 }, { "epoch": 1.4744940625522662, "grad_norm": 28.349457924160006, "learning_rate": 6.013036683579798e-07, "logits/chosen": 0.20536640286445618, "logits/rejected": 0.28531551361083984, "logps/chosen": -5.965720176696777, "logps/rejected": -7.034081935882568, "loss": 0.4645, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -5.965720176696777, "rewards/margins": 1.0683619976043701, "rewards/rejected": -7.034081935882568, "step": 2755 }, { "epoch": 1.4771700953336677, "grad_norm": 19.326958313207594, "learning_rate": 5.997780771720854e-07, "logits/chosen": 0.12123097479343414, "logits/rejected": 0.22387108206748962, "logps/chosen": -6.1069464683532715, "logps/rejected": -7.190478324890137, "loss": 0.4608, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -6.1069464683532715, "rewards/margins": 1.0835316181182861, "rewards/rejected": -7.190478324890137, "step": 2760 }, { "epoch": 1.4798461281150694, "grad_norm": 20.27717282805611, "learning_rate": 5.982515179388486e-07, "logits/chosen": 0.21311935782432556, "logits/rejected": 0.29622143507003784, "logps/chosen": -6.140543460845947, "logps/rejected": -7.106387138366699, "loss": 0.5132, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -6.140543460845947, "rewards/margins": 0.9658439755439758, "rewards/rejected": -7.106387138366699, "step": 2765 }, { "epoch": 1.482522160896471, "grad_norm": 14.995216436675015, "learning_rate": 5.967240054689541e-07, "logits/chosen": 0.2236863672733307, "logits/rejected": 0.28038954734802246, "logps/chosen": -6.361308574676514, "logps/rejected": -7.367376804351807, "loss": 0.4907, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -6.361308574676514, "rewards/margins": 1.0060679912567139, "rewards/rejected": -7.367376804351807, "step": 2770 }, { "epoch": 1.4851981936778724, "grad_norm": 19.219548559407972, "learning_rate": 5.951955545823342e-07, "logits/chosen": 0.21152298152446747, "logits/rejected": 0.25916406512260437, "logps/chosen": -6.640192985534668, "logps/rejected": -7.619248867034912, "loss": 0.5138, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -6.640192985534668, "rewards/margins": 0.9790557026863098, "rewards/rejected": -7.619248867034912, "step": 2775 }, { "epoch": 1.4878742264592741, "grad_norm": 15.225867961734888, "learning_rate": 5.936661801080263e-07, "logits/chosen": 0.22134506702423096, "logits/rejected": 0.29216116666793823, "logps/chosen": -6.605828762054443, "logps/rejected": -7.481935977935791, "loss": 0.5485, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -6.605828762054443, "rewards/margins": 0.8761063814163208, "rewards/rejected": -7.481935977935791, "step": 2780 }, { "epoch": 1.4905502592406756, "grad_norm": 13.398505281258863, "learning_rate": 5.92135896884028e-07, "logits/chosen": 0.20348718762397766, "logits/rejected": 0.31101515889167786, "logps/chosen": -6.470333099365234, "logps/rejected": -7.622544288635254, "loss": 0.454, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -6.470333099365234, "rewards/margins": 1.1522115468978882, "rewards/rejected": -7.622544288635254, "step": 2785 }, { "epoch": 1.4932262920220774, "grad_norm": 23.554366217498664, "learning_rate": 5.906047197571541e-07, "logits/chosen": 0.2396855652332306, "logits/rejected": 0.23777341842651367, "logps/chosen": -6.197471618652344, "logps/rejected": -7.138236999511719, "loss": 0.5163, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -6.197471618652344, "rewards/margins": 0.9407650232315063, "rewards/rejected": -7.138236999511719, "step": 2790 }, { "epoch": 1.4959023248034788, "grad_norm": 15.49707900062981, "learning_rate": 5.890726635828919e-07, "logits/chosen": 0.26965242624282837, "logits/rejected": 0.2818291187286377, "logps/chosen": -6.109783172607422, "logps/rejected": -7.177038669586182, "loss": 0.4823, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -6.109783172607422, "rewards/margins": 1.0672557353973389, "rewards/rejected": -7.177038669586182, "step": 2795 }, { "epoch": 1.4985783575848803, "grad_norm": 20.197900708663283, "learning_rate": 5.875397432252569e-07, "logits/chosen": 0.15579013526439667, "logits/rejected": 0.21123051643371582, "logps/chosen": -6.129229545593262, "logps/rejected": -7.128995418548584, "loss": 0.4823, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -6.129229545593262, "rewards/margins": 0.9997653961181641, "rewards/rejected": -7.128995418548584, "step": 2800 }, { "epoch": 1.4985783575848803, "eval_logits/chosen": 0.40029582381248474, "eval_logits/rejected": 0.46814489364624023, "eval_logps/chosen": -6.43025016784668, "eval_logps/rejected": -7.3915696144104, "eval_loss": 0.5165576934814453, "eval_rewards/accuracies": 0.7284866571426392, "eval_rewards/chosen": -6.43025016784668, "eval_rewards/margins": 0.9613184928894043, "eval_rewards/rejected": -7.3915696144104, "eval_runtime": 40.5534, "eval_samples_per_second": 33.166, "eval_steps_per_second": 8.31, "step": 2800 }, { "epoch": 1.5012543903662818, "grad_norm": 14.853307197311079, "learning_rate": 5.860059735566491e-07, "logits/chosen": 0.12676751613616943, "logits/rejected": 0.22240300476551056, "logps/chosen": -6.3314738273620605, "logps/rejected": -7.310635566711426, "loss": 0.5039, "rewards/accuracies": 0.71875, "rewards/chosen": -6.3314738273620605, "rewards/margins": 0.979160487651825, "rewards/rejected": -7.310635566711426, "step": 2805 }, { "epoch": 1.5039304231476835, "grad_norm": 20.74945130046968, "learning_rate": 5.844713694577087e-07, "logits/chosen": 0.18120229244232178, "logits/rejected": 0.2314457893371582, "logps/chosen": -6.246757984161377, "logps/rejected": -7.223686218261719, "loss": 0.4888, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -6.246757984161377, "rewards/margins": 0.9769285321235657, "rewards/rejected": -7.223686218261719, "step": 2810 }, { "epoch": 1.5066064559290853, "grad_norm": 15.430229207822924, "learning_rate": 5.829359458171714e-07, "logits/chosen": 0.2070266306400299, "logits/rejected": 0.2830543518066406, "logps/chosen": -6.30063009262085, "logps/rejected": -7.561201572418213, "loss": 0.3988, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -6.30063009262085, "rewards/margins": 1.2605712413787842, "rewards/rejected": -7.561201572418213, "step": 2815 }, { "epoch": 1.5092824887104868, "grad_norm": 17.168910476271204, "learning_rate": 5.81399717531724e-07, "logits/chosen": 0.16277417540550232, "logits/rejected": 0.27424725890159607, "logps/chosen": -6.482580661773682, "logps/rejected": -7.316930294036865, "loss": 0.5529, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -6.482580661773682, "rewards/margins": 0.8343492746353149, "rewards/rejected": -7.316930294036865, "step": 2820 }, { "epoch": 1.5119585214918883, "grad_norm": 15.695317001221413, "learning_rate": 5.798626995058602e-07, "logits/chosen": 0.14231693744659424, "logits/rejected": 0.2778443992137909, "logps/chosen": -6.491742134094238, "logps/rejected": -7.500138759613037, "loss": 0.4801, "rewards/accuracies": 0.75, "rewards/chosen": -6.491742134094238, "rewards/margins": 1.0083979368209839, "rewards/rejected": -7.500138759613037, "step": 2825 }, { "epoch": 1.51463455427329, "grad_norm": 13.11708274200767, "learning_rate": 5.783249066517354e-07, "logits/chosen": 0.1678686887025833, "logits/rejected": 0.2490328848361969, "logps/chosen": -6.209481239318848, "logps/rejected": -7.282641410827637, "loss": 0.4693, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -6.209481239318848, "rewards/margins": 1.07315993309021, "rewards/rejected": -7.282641410827637, "step": 2830 }, { "epoch": 1.5173105870546915, "grad_norm": 31.533449291042448, "learning_rate": 5.767863538890228e-07, "logits/chosen": 0.15707454085350037, "logits/rejected": 0.24957947432994843, "logps/chosen": -6.423956871032715, "logps/rejected": -7.588784694671631, "loss": 0.4369, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -6.423956871032715, "rewards/margins": 1.1648277044296265, "rewards/rejected": -7.588784694671631, "step": 2835 }, { "epoch": 1.519986619836093, "grad_norm": 18.13738353512322, "learning_rate": 5.75247056144768e-07, "logits/chosen": 0.16873756051063538, "logits/rejected": 0.2055487334728241, "logps/chosen": -6.2693657875061035, "logps/rejected": -7.170986175537109, "loss": 0.5552, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -6.2693657875061035, "rewards/margins": 0.9016210436820984, "rewards/rejected": -7.170986175537109, "step": 2840 }, { "epoch": 1.5226626526174947, "grad_norm": 17.584553813067068, "learning_rate": 5.737070283532444e-07, "logits/chosen": 0.24587778747081757, "logits/rejected": 0.27612775564193726, "logps/chosen": -6.281549453735352, "logps/rejected": -7.1617431640625, "loss": 0.5695, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -6.281549453735352, "rewards/margins": 0.8801944851875305, "rewards/rejected": -7.1617431640625, "step": 2845 }, { "epoch": 1.5253386853988962, "grad_norm": 16.745327113916556, "learning_rate": 5.721662854558084e-07, "logits/chosen": 0.15860623121261597, "logits/rejected": 0.2237144261598587, "logps/chosen": -6.311585903167725, "logps/rejected": -7.420734405517578, "loss": 0.4625, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -6.311585903167725, "rewards/margins": 1.109148383140564, "rewards/rejected": -7.420734405517578, "step": 2850 }, { "epoch": 1.5280147181802977, "grad_norm": 16.940421533272836, "learning_rate": 5.706248424007545e-07, "logits/chosen": 0.13119611144065857, "logits/rejected": 0.25429287552833557, "logps/chosen": -6.439759731292725, "logps/rejected": -7.328015327453613, "loss": 0.5233, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -6.439759731292725, "rewards/margins": 0.8882551193237305, "rewards/rejected": -7.328015327453613, "step": 2855 }, { "epoch": 1.5306907509616994, "grad_norm": 16.344107984448, "learning_rate": 5.690827141431699e-07, "logits/chosen": 0.08858231455087662, "logits/rejected": 0.2204386293888092, "logps/chosen": -6.188693046569824, "logps/rejected": -7.0304059982299805, "loss": 0.5038, "rewards/accuracies": 0.75, "rewards/chosen": -6.188693046569824, "rewards/margins": 0.8417131304740906, "rewards/rejected": -7.0304059982299805, "step": 2860 }, { "epoch": 1.5333667837431009, "grad_norm": 20.28249647333897, "learning_rate": 5.675399156447897e-07, "logits/chosen": 0.10806681960821152, "logits/rejected": 0.191233292222023, "logps/chosen": -6.146536350250244, "logps/rejected": -6.924802303314209, "loss": 0.545, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -6.146536350250244, "rewards/margins": 0.7782659530639648, "rewards/rejected": -6.924802303314209, "step": 2865 }, { "epoch": 1.5360428165245024, "grad_norm": 16.97714399198867, "learning_rate": 5.659964618738515e-07, "logits/chosen": 0.1419149786233902, "logits/rejected": 0.2160245180130005, "logps/chosen": -6.081233024597168, "logps/rejected": -6.983085632324219, "loss": 0.5185, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -6.081233024597168, "rewards/margins": 0.9018527865409851, "rewards/rejected": -6.983085632324219, "step": 2870 }, { "epoch": 1.538718849305904, "grad_norm": 20.601429498137158, "learning_rate": 5.644523678049509e-07, "logits/chosen": 0.08605459332466125, "logits/rejected": 0.1769315004348755, "logps/chosen": -6.087510585784912, "logps/rejected": -6.997509002685547, "loss": 0.5015, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -6.087510585784912, "rewards/margins": 0.9099981188774109, "rewards/rejected": -6.997509002685547, "step": 2875 }, { "epoch": 1.5413948820873056, "grad_norm": 18.891219260564643, "learning_rate": 5.629076484188952e-07, "logits/chosen": 0.18951095640659332, "logits/rejected": 0.2633412182331085, "logps/chosen": -5.817484378814697, "logps/rejected": -6.823545932769775, "loss": 0.466, "rewards/accuracies": 0.78125, "rewards/chosen": -5.817484378814697, "rewards/margins": 1.0060614347457886, "rewards/rejected": -6.823545932769775, "step": 2880 }, { "epoch": 1.544070914868707, "grad_norm": 22.307472311720375, "learning_rate": 5.613623187025587e-07, "logits/chosen": 0.1370941698551178, "logits/rejected": 0.23443543910980225, "logps/chosen": -6.017722129821777, "logps/rejected": -6.999619483947754, "loss": 0.4813, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -6.017722129821777, "rewards/margins": 0.9818977117538452, "rewards/rejected": -6.999619483947754, "step": 2885 }, { "epoch": 1.5467469476501088, "grad_norm": 16.000512036712365, "learning_rate": 5.598163936487369e-07, "logits/chosen": 0.12150659412145615, "logits/rejected": 0.23525385558605194, "logps/chosen": -6.1015214920043945, "logps/rejected": -7.231861114501953, "loss": 0.4607, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -6.1015214920043945, "rewards/margins": 1.1303402185440063, "rewards/rejected": -7.231861114501953, "step": 2890 }, { "epoch": 1.5494229804315103, "grad_norm": 15.765256837088168, "learning_rate": 5.582698882560017e-07, "logits/chosen": 0.11502428352832794, "logits/rejected": 0.20620615780353546, "logps/chosen": -5.86514949798584, "logps/rejected": -6.855123043060303, "loss": 0.488, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -5.86514949798584, "rewards/margins": 0.9899742007255554, "rewards/rejected": -6.855123043060303, "step": 2895 }, { "epoch": 1.5520990132129118, "grad_norm": 15.171152318591258, "learning_rate": 5.567228175285549e-07, "logits/chosen": 0.1926734447479248, "logits/rejected": 0.2866966724395752, "logps/chosen": -6.039583683013916, "logps/rejected": -7.14728307723999, "loss": 0.4357, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -6.039583683013916, "rewards/margins": 1.1076990365982056, "rewards/rejected": -7.14728307723999, "step": 2900 }, { "epoch": 1.5547750459943135, "grad_norm": 18.374356630758125, "learning_rate": 5.551751964760838e-07, "logits/chosen": 0.2405618131160736, "logits/rejected": 0.26600295305252075, "logps/chosen": -5.962469100952148, "logps/rejected": -6.997993469238281, "loss": 0.4628, "rewards/accuracies": 0.78125, "rewards/chosen": -5.962469100952148, "rewards/margins": 1.0355241298675537, "rewards/rejected": -6.997993469238281, "step": 2905 }, { "epoch": 1.557451078775715, "grad_norm": 22.95992453238303, "learning_rate": 5.536270401136145e-07, "logits/chosen": 0.17917904257774353, "logits/rejected": 0.24551351368427277, "logps/chosen": -6.250828742980957, "logps/rejected": -7.189608573913574, "loss": 0.4921, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -6.250828742980957, "rewards/margins": 0.9387801885604858, "rewards/rejected": -7.189608573913574, "step": 2910 }, { "epoch": 1.5601271115571165, "grad_norm": 23.061356308806094, "learning_rate": 5.520783634613667e-07, "logits/chosen": 0.19938156008720398, "logits/rejected": 0.332527220249176, "logps/chosen": -6.302947998046875, "logps/rejected": -7.36586856842041, "loss": 0.4911, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -6.302947998046875, "rewards/margins": 1.062920331954956, "rewards/rejected": -7.36586856842041, "step": 2915 }, { "epoch": 1.5628031443385182, "grad_norm": 19.050406773463493, "learning_rate": 5.505291815446082e-07, "logits/chosen": 0.21090111136436462, "logits/rejected": 0.2883283197879791, "logps/chosen": -6.425827980041504, "logps/rejected": -7.502095699310303, "loss": 0.4901, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -6.425827980041504, "rewards/margins": 1.076268196105957, "rewards/rejected": -7.502095699310303, "step": 2920 }, { "epoch": 1.5654791771199197, "grad_norm": 21.699781061151132, "learning_rate": 5.489795093935089e-07, "logits/chosen": 0.25534263253211975, "logits/rejected": 0.3196747303009033, "logps/chosen": -6.318089008331299, "logps/rejected": -7.303831577301025, "loss": 0.5121, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -6.318089008331299, "rewards/margins": 0.9857425689697266, "rewards/rejected": -7.303831577301025, "step": 2925 }, { "epoch": 1.5681552099013212, "grad_norm": 17.551773966792535, "learning_rate": 5.474293620429946e-07, "logits/chosen": 0.15170113742351532, "logits/rejected": 0.2653399705886841, "logps/chosen": -6.27533483505249, "logps/rejected": -7.6304755210876465, "loss": 0.4404, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -6.27533483505249, "rewards/margins": 1.3551397323608398, "rewards/rejected": -7.6304755210876465, "step": 2930 }, { "epoch": 1.570831242682723, "grad_norm": 20.147226089298766, "learning_rate": 5.458787545326018e-07, "logits/chosen": 0.18482725322246552, "logits/rejected": 0.25002187490463257, "logps/chosen": -6.7651543617248535, "logps/rejected": -7.816110134124756, "loss": 0.4747, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -6.7651543617248535, "rewards/margins": 1.0509551763534546, "rewards/rejected": -7.816110134124756, "step": 2935 }, { "epoch": 1.5735072754641244, "grad_norm": 23.800940038169575, "learning_rate": 5.443277019063311e-07, "logits/chosen": 0.16216909885406494, "logits/rejected": 0.2893802523612976, "logps/chosen": -6.891751289367676, "logps/rejected": -8.044198989868164, "loss": 0.4984, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -6.891751289367676, "rewards/margins": 1.1524465084075928, "rewards/rejected": -8.044198989868164, "step": 2940 }, { "epoch": 1.5761833082455259, "grad_norm": 27.035876766471304, "learning_rate": 5.427762192125023e-07, "logits/chosen": 0.19467368721961975, "logits/rejected": 0.28056275844573975, "logps/chosen": -6.858161926269531, "logps/rejected": -7.914292812347412, "loss": 0.5039, "rewards/accuracies": 0.75, "rewards/chosen": -6.858161926269531, "rewards/margins": 1.0561314821243286, "rewards/rejected": -7.914292812347412, "step": 2945 }, { "epoch": 1.5788593410269276, "grad_norm": 23.751887317275543, "learning_rate": 5.41224321503607e-07, "logits/chosen": 0.17790207266807556, "logits/rejected": 0.3198242783546448, "logps/chosen": -6.71868371963501, "logps/rejected": -7.871993064880371, "loss": 0.4283, "rewards/accuracies": 0.8125, "rewards/chosen": -6.71868371963501, "rewards/margins": 1.1533094644546509, "rewards/rejected": -7.871993064880371, "step": 2950 }, { "epoch": 1.5815353738083293, "grad_norm": 23.926031536885947, "learning_rate": 5.396720238361637e-07, "logits/chosen": 0.22789266705513, "logits/rejected": 0.3097589313983917, "logps/chosen": -6.692767143249512, "logps/rejected": -7.658299922943115, "loss": 0.515, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -6.692767143249512, "rewards/margins": 0.9655327796936035, "rewards/rejected": -7.658299922943115, "step": 2955 }, { "epoch": 1.5842114065897306, "grad_norm": 13.460267977243817, "learning_rate": 5.381193412705711e-07, "logits/chosen": 0.14565755426883698, "logits/rejected": 0.25317442417144775, "logps/chosen": -6.461983680725098, "logps/rejected": -7.573000431060791, "loss": 0.4377, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -6.461983680725098, "rewards/margins": 1.111016869544983, "rewards/rejected": -7.573000431060791, "step": 2960 }, { "epoch": 1.5868874393711323, "grad_norm": 15.259923713219294, "learning_rate": 5.365662888709622e-07, "logits/chosen": 0.17890270054340363, "logits/rejected": 0.2641114592552185, "logps/chosen": -6.717536926269531, "logps/rejected": -7.898177146911621, "loss": 0.4522, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -6.717536926269531, "rewards/margins": 1.180641770362854, "rewards/rejected": -7.898177146911621, "step": 2965 }, { "epoch": 1.589563472152534, "grad_norm": 18.34780672388678, "learning_rate": 5.350128817050585e-07, "logits/chosen": 0.16652999818325043, "logits/rejected": 0.2823794484138489, "logps/chosen": -6.743743896484375, "logps/rejected": -7.777483940124512, "loss": 0.4832, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -6.743743896484375, "rewards/margins": 1.033739447593689, "rewards/rejected": -7.777483940124512, "step": 2970 }, { "epoch": 1.5922395049339353, "grad_norm": 23.442775120612204, "learning_rate": 5.334591348440229e-07, "logits/chosen": 0.19622598588466644, "logits/rejected": 0.3198692500591278, "logps/chosen": -6.523850917816162, "logps/rejected": -7.3970746994018555, "loss": 0.5356, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -6.523850917816162, "rewards/margins": 0.8732233047485352, "rewards/rejected": -7.3970746994018555, "step": 2975 }, { "epoch": 1.594915537715337, "grad_norm": 16.32986459874557, "learning_rate": 5.319050633623141e-07, "logits/chosen": 0.19831351935863495, "logits/rejected": 0.3089882731437683, "logps/chosen": -6.537728786468506, "logps/rejected": -7.424249172210693, "loss": 0.4853, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -6.537728786468506, "rewards/margins": 0.8865203857421875, "rewards/rejected": -7.424249172210693, "step": 2980 }, { "epoch": 1.5975915704967387, "grad_norm": 19.86403792212192, "learning_rate": 5.303506823375409e-07, "logits/chosen": 0.17175276577472687, "logits/rejected": 0.3240048289299011, "logps/chosen": -6.476335048675537, "logps/rejected": -7.710141181945801, "loss": 0.4695, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -6.476335048675537, "rewards/margins": 1.2338054180145264, "rewards/rejected": -7.710141181945801, "step": 2985 }, { "epoch": 1.60026760327814, "grad_norm": 16.015164411132016, "learning_rate": 5.287960068503143e-07, "logits/chosen": 0.19814713299274445, "logits/rejected": 0.32239776849746704, "logps/chosen": -6.400434970855713, "logps/rejected": -7.563230991363525, "loss": 0.4301, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -6.400434970855713, "rewards/margins": 1.1627951860427856, "rewards/rejected": -7.563230991363525, "step": 2990 }, { "epoch": 1.6029436360595417, "grad_norm": 22.644201256623404, "learning_rate": 5.272410519841032e-07, "logits/chosen": 0.26743531227111816, "logits/rejected": 0.37001627683639526, "logps/chosen": -6.603501319885254, "logps/rejected": -7.830699920654297, "loss": 0.4564, "rewards/accuracies": 0.78125, "rewards/chosen": -6.603501319885254, "rewards/margins": 1.2271989583969116, "rewards/rejected": -7.830699920654297, "step": 2995 }, { "epoch": 1.6056196688409434, "grad_norm": 13.954548441331928, "learning_rate": 5.256858328250861e-07, "logits/chosen": 0.19470393657684326, "logits/rejected": 0.33217793703079224, "logps/chosen": -6.342373847961426, "logps/rejected": -7.289586067199707, "loss": 0.5018, "rewards/accuracies": 0.75, "rewards/chosen": -6.342373847961426, "rewards/margins": 0.9472112655639648, "rewards/rejected": -7.289586067199707, "step": 3000 }, { "epoch": 1.608295701622345, "grad_norm": 34.2570458368732, "learning_rate": 5.241303644620063e-07, "logits/chosen": 0.17494231462478638, "logits/rejected": 0.2956286668777466, "logps/chosen": -6.475231170654297, "logps/rejected": -7.3121209144592285, "loss": 0.5304, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -6.475231170654297, "rewards/margins": 0.8368890881538391, "rewards/rejected": -7.3121209144592285, "step": 3005 }, { "epoch": 1.6109717344037464, "grad_norm": 29.687537739761023, "learning_rate": 5.225746619860248e-07, "logits/chosen": 0.23682686686515808, "logits/rejected": 0.3308120369911194, "logps/chosen": -6.541970729827881, "logps/rejected": -7.451929569244385, "loss": 0.5577, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -6.541970729827881, "rewards/margins": 0.9099578857421875, "rewards/rejected": -7.451929569244385, "step": 3010 }, { "epoch": 1.6136477671851481, "grad_norm": 20.558728671530563, "learning_rate": 5.210187404905735e-07, "logits/chosen": 0.3471252918243408, "logits/rejected": 0.3935493230819702, "logps/chosen": -6.554591178894043, "logps/rejected": -7.5616559982299805, "loss": 0.4854, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -6.554591178894043, "rewards/margins": 1.0070655345916748, "rewards/rejected": -7.5616559982299805, "step": 3015 }, { "epoch": 1.6163237999665496, "grad_norm": 17.304787782690894, "learning_rate": 5.194626150712098e-07, "logits/chosen": 0.27570056915283203, "logits/rejected": 0.33130085468292236, "logps/chosen": -6.574110984802246, "logps/rejected": -7.4604620933532715, "loss": 0.5044, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -6.574110984802246, "rewards/margins": 0.8863519430160522, "rewards/rejected": -7.4604620933532715, "step": 3020 }, { "epoch": 1.6189998327479511, "grad_norm": 18.433255055304482, "learning_rate": 5.179063008254695e-07, "logits/chosen": 0.23976044356822968, "logits/rejected": 0.3457261919975281, "logps/chosen": -6.4324140548706055, "logps/rejected": -7.408753395080566, "loss": 0.5069, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -6.4324140548706055, "rewards/margins": 0.9763398170471191, "rewards/rejected": -7.408753395080566, "step": 3025 }, { "epoch": 1.6216758655293528, "grad_norm": 18.04990327921787, "learning_rate": 5.163498128527199e-07, "logits/chosen": 0.31583625078201294, "logits/rejected": 0.41882187128067017, "logps/chosen": -6.766513824462891, "logps/rejected": -7.7595534324646, "loss": 0.516, "rewards/accuracies": 0.75, "rewards/chosen": -6.766513824462891, "rewards/margins": 0.9930397272109985, "rewards/rejected": -7.7595534324646, "step": 3030 }, { "epoch": 1.6243518983107543, "grad_norm": 18.61119368455046, "learning_rate": 5.147931662540144e-07, "logits/chosen": 0.3800766170024872, "logits/rejected": 0.47233977913856506, "logps/chosen": -6.486734867095947, "logps/rejected": -7.380870819091797, "loss": 0.5033, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -6.486734867095947, "rewards/margins": 0.8941363096237183, "rewards/rejected": -7.380870819091797, "step": 3035 }, { "epoch": 1.6270279310921558, "grad_norm": 19.669806483394954, "learning_rate": 5.132363761319449e-07, "logits/chosen": 0.2459898740053177, "logits/rejected": 0.3135351240634918, "logps/chosen": -6.404805660247803, "logps/rejected": -7.605037689208984, "loss": 0.4531, "rewards/accuracies": 0.78125, "rewards/chosen": -6.404805660247803, "rewards/margins": 1.2002323865890503, "rewards/rejected": -7.605037689208984, "step": 3040 }, { "epoch": 1.6297039638735575, "grad_norm": 36.42788103959041, "learning_rate": 5.116794575904962e-07, "logits/chosen": 0.284351110458374, "logits/rejected": 0.37209925055503845, "logps/chosen": -6.136031150817871, "logps/rejected": -7.110169410705566, "loss": 0.4998, "rewards/accuracies": 0.75, "rewards/chosen": -6.136031150817871, "rewards/margins": 0.9741382598876953, "rewards/rejected": -7.110169410705566, "step": 3045 }, { "epoch": 1.632379996654959, "grad_norm": 12.450413897744921, "learning_rate": 5.101224257348987e-07, "logits/chosen": 0.27013668417930603, "logits/rejected": 0.37406450510025024, "logps/chosen": -6.360276222229004, "logps/rejected": -7.557656288146973, "loss": 0.4257, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -6.360276222229004, "rewards/margins": 1.1973804235458374, "rewards/rejected": -7.557656288146973, "step": 3050 }, { "epoch": 1.6350560294363605, "grad_norm": 17.379881058591934, "learning_rate": 5.085652956714823e-07, "logits/chosen": 0.22448793053627014, "logits/rejected": 0.33974939584732056, "logps/chosen": -6.615704536437988, "logps/rejected": -7.529040336608887, "loss": 0.5194, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -6.615704536437988, "rewards/margins": 0.9133356213569641, "rewards/rejected": -7.529040336608887, "step": 3055 }, { "epoch": 1.6377320622177622, "grad_norm": 18.471182213945827, "learning_rate": 5.070080825075298e-07, "logits/chosen": 0.24397747218608856, "logits/rejected": 0.38046401739120483, "logps/chosen": -6.413047790527344, "logps/rejected": -7.437440395355225, "loss": 0.5148, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -6.413047790527344, "rewards/margins": 1.0243926048278809, "rewards/rejected": -7.437440395355225, "step": 3060 }, { "epoch": 1.6404080949991637, "grad_norm": 17.666812463058786, "learning_rate": 5.0545080135113e-07, "logits/chosen": 0.24228878319263458, "logits/rejected": 0.30700019001960754, "logps/chosen": -6.453700065612793, "logps/rejected": -7.4567718505859375, "loss": 0.5344, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -6.453700065612793, "rewards/margins": 1.0030714273452759, "rewards/rejected": -7.4567718505859375, "step": 3065 }, { "epoch": 1.6430841277805652, "grad_norm": 24.698936772483417, "learning_rate": 5.038934673110316e-07, "logits/chosen": 0.19088833034038544, "logits/rejected": 0.27388995885849, "logps/chosen": -6.500283718109131, "logps/rejected": -7.562563896179199, "loss": 0.5133, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -6.500283718109131, "rewards/margins": 1.0622799396514893, "rewards/rejected": -7.562563896179199, "step": 3070 }, { "epoch": 1.645760160561967, "grad_norm": 17.821465235516225, "learning_rate": 5.023360954964963e-07, "logits/chosen": 0.17239531874656677, "logits/rejected": 0.24293124675750732, "logps/chosen": -6.18967342376709, "logps/rejected": -7.205197811126709, "loss": 0.4387, "rewards/accuracies": 0.8125, "rewards/chosen": -6.18967342376709, "rewards/margins": 1.0155235528945923, "rewards/rejected": -7.205197811126709, "step": 3075 }, { "epoch": 1.6484361933433684, "grad_norm": 17.08165694762414, "learning_rate": 5.007787010171524e-07, "logits/chosen": 0.12307925522327423, "logits/rejected": 0.2541181147098541, "logps/chosen": -6.175047874450684, "logps/rejected": -7.312661647796631, "loss": 0.4177, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -6.175047874450684, "rewards/margins": 1.1376142501831055, "rewards/rejected": -7.312661647796631, "step": 3080 }, { "epoch": 1.65111222612477, "grad_norm": 22.021482804456898, "learning_rate": 4.992212989828477e-07, "logits/chosen": 0.2705684304237366, "logits/rejected": 0.2951211929321289, "logps/chosen": -6.270462512969971, "logps/rejected": -7.168450355529785, "loss": 0.4911, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -6.270462512969971, "rewards/margins": 0.8979882001876831, "rewards/rejected": -7.168450355529785, "step": 3085 }, { "epoch": 1.6537882589061716, "grad_norm": 20.731898518120584, "learning_rate": 4.976639045035036e-07, "logits/chosen": 0.26649701595306396, "logits/rejected": 0.3205729126930237, "logps/chosen": -6.107386589050293, "logps/rejected": -6.9685211181640625, "loss": 0.5692, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -6.107386589050293, "rewards/margins": 0.8611332774162292, "rewards/rejected": -6.9685211181640625, "step": 3090 }, { "epoch": 1.6564642916875731, "grad_norm": 19.9014695994682, "learning_rate": 4.961065326889683e-07, "logits/chosen": 0.2631784975528717, "logits/rejected": 0.3515758216381073, "logps/chosen": -6.233813762664795, "logps/rejected": -7.1180596351623535, "loss": 0.5104, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -6.233813762664795, "rewards/margins": 0.8842450976371765, "rewards/rejected": -7.1180596351623535, "step": 3095 }, { "epoch": 1.6591403244689746, "grad_norm": 21.19052067989215, "learning_rate": 4.9454919864887e-07, "logits/chosen": 0.15125130116939545, "logits/rejected": 0.26079872250556946, "logps/chosen": -6.098044395446777, "logps/rejected": -7.0517706871032715, "loss": 0.512, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -6.098044395446777, "rewards/margins": 0.9537268877029419, "rewards/rejected": -7.0517706871032715, "step": 3100 }, { "epoch": 1.6618163572503764, "grad_norm": 19.701394051999788, "learning_rate": 4.929919174924701e-07, "logits/chosen": 0.14767125248908997, "logits/rejected": 0.27680107951164246, "logps/chosen": -5.939681053161621, "logps/rejected": -6.872743129730225, "loss": 0.4834, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -5.939681053161621, "rewards/margins": 0.9330614805221558, "rewards/rejected": -6.872743129730225, "step": 3105 }, { "epoch": 1.6644923900317778, "grad_norm": 15.557168855900036, "learning_rate": 4.914347043285177e-07, "logits/chosen": 0.20714859664440155, "logits/rejected": 0.30318066477775574, "logps/chosen": -5.960616111755371, "logps/rejected": -6.951633453369141, "loss": 0.4728, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -5.960616111755371, "rewards/margins": 0.9910171627998352, "rewards/rejected": -6.951633453369141, "step": 3110 }, { "epoch": 1.6671684228131793, "grad_norm": 18.40867715270816, "learning_rate": 4.898775742651013e-07, "logits/chosen": 0.2323804348707199, "logits/rejected": 0.3249072730541229, "logps/chosen": -6.057922840118408, "logps/rejected": -7.182548522949219, "loss": 0.4189, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -6.057922840118408, "rewards/margins": 1.1246259212493896, "rewards/rejected": -7.182548522949219, "step": 3115 }, { "epoch": 1.669844455594581, "grad_norm": 12.676565502747213, "learning_rate": 4.883205424095037e-07, "logits/chosen": 0.15468178689479828, "logits/rejected": 0.25900548696517944, "logps/chosen": -6.141829490661621, "logps/rejected": -7.190564155578613, "loss": 0.4671, "rewards/accuracies": 0.75, "rewards/chosen": -6.141829490661621, "rewards/margins": 1.0487340688705444, "rewards/rejected": -7.190564155578613, "step": 3120 }, { "epoch": 1.6725204883759828, "grad_norm": 16.577630552823745, "learning_rate": 4.86763623868055e-07, "logits/chosen": 0.20529921352863312, "logits/rejected": 0.279304176568985, "logps/chosen": -6.0611419677734375, "logps/rejected": -7.010176181793213, "loss": 0.4871, "rewards/accuracies": 0.78125, "rewards/chosen": -6.0611419677734375, "rewards/margins": 0.9490336179733276, "rewards/rejected": -7.010176181793213, "step": 3125 }, { "epoch": 1.675196521157384, "grad_norm": 20.820722254806963, "learning_rate": 4.852068337459856e-07, "logits/chosen": 0.25038784742355347, "logits/rejected": 0.3502267897129059, "logps/chosen": -6.2995429039001465, "logps/rejected": -7.2527666091918945, "loss": 0.4688, "rewards/accuracies": 0.75, "rewards/chosen": -6.2995429039001465, "rewards/margins": 0.9532238245010376, "rewards/rejected": -7.2527666091918945, "step": 3130 }, { "epoch": 1.6778725539387858, "grad_norm": 18.494234451199496, "learning_rate": 4.8365018714728e-07, "logits/chosen": 0.2604433298110962, "logits/rejected": 0.3061125874519348, "logps/chosen": -6.330544471740723, "logps/rejected": -7.280308723449707, "loss": 0.4737, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -6.330544471740723, "rewards/margins": 0.9497642517089844, "rewards/rejected": -7.280308723449707, "step": 3135 }, { "epoch": 1.6805485867201875, "grad_norm": 24.63944357525925, "learning_rate": 4.820936991745304e-07, "logits/chosen": 0.11152280867099762, "logits/rejected": 0.17737384140491486, "logps/chosen": -6.428671360015869, "logps/rejected": -7.32094669342041, "loss": 0.4943, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -6.428671360015869, "rewards/margins": 0.8922744989395142, "rewards/rejected": -7.32094669342041, "step": 3140 }, { "epoch": 1.6832246195015887, "grad_norm": 23.857651313241714, "learning_rate": 4.8053738492879e-07, "logits/chosen": 0.25600194931030273, "logits/rejected": 0.3463723063468933, "logps/chosen": -6.270092964172363, "logps/rejected": -7.496466159820557, "loss": 0.4296, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -6.270092964172363, "rewards/margins": 1.226372480392456, "rewards/rejected": -7.496466159820557, "step": 3145 }, { "epoch": 1.6859006522829905, "grad_norm": 23.713054417407722, "learning_rate": 4.789812595094265e-07, "logits/chosen": 0.17232567071914673, "logits/rejected": 0.24414309859275818, "logps/chosen": -6.438145637512207, "logps/rejected": -7.589231014251709, "loss": 0.4389, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -6.438145637512207, "rewards/margins": 1.1510860919952393, "rewards/rejected": -7.589231014251709, "step": 3150 }, { "epoch": 1.6885766850643922, "grad_norm": 18.110130128297442, "learning_rate": 4.774253380139752e-07, "logits/chosen": 0.15104824304580688, "logits/rejected": 0.2494117021560669, "logps/chosen": -6.333140850067139, "logps/rejected": -7.495399475097656, "loss": 0.4363, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -6.333140850067139, "rewards/margins": 1.1622591018676758, "rewards/rejected": -7.495399475097656, "step": 3155 }, { "epoch": 1.6912527178457935, "grad_norm": 21.114141193578536, "learning_rate": 4.758696355379936e-07, "logits/chosen": 0.21927842497825623, "logits/rejected": 0.2055889368057251, "logps/chosen": -6.214723110198975, "logps/rejected": -7.248956203460693, "loss": 0.4727, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -6.214723110198975, "rewards/margins": 1.0342328548431396, "rewards/rejected": -7.248956203460693, "step": 3160 }, { "epoch": 1.6939287506271952, "grad_norm": 20.69272059415271, "learning_rate": 4.743141671749138e-07, "logits/chosen": 0.12042804807424545, "logits/rejected": 0.22026565670967102, "logps/chosen": -6.594400882720947, "logps/rejected": -7.4010329246521, "loss": 0.5614, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -6.594400882720947, "rewards/margins": 0.806632399559021, "rewards/rejected": -7.4010329246521, "step": 3165 }, { "epoch": 1.6966047834085969, "grad_norm": 18.521160062715012, "learning_rate": 4.727589480158968e-07, "logits/chosen": 0.23713497817516327, "logits/rejected": 0.294402539730072, "logps/chosen": -6.552815914154053, "logps/rejected": -7.631190299987793, "loss": 0.4666, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -6.552815914154053, "rewards/margins": 1.0783748626708984, "rewards/rejected": -7.631190299987793, "step": 3170 }, { "epoch": 1.6992808161899984, "grad_norm": 22.734283056433625, "learning_rate": 4.712039931496855e-07, "logits/chosen": 0.21373441815376282, "logits/rejected": 0.31241974234580994, "logps/chosen": -6.539563179016113, "logps/rejected": -7.2292375564575195, "loss": 0.615, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -6.539563179016113, "rewards/margins": 0.6896741390228271, "rewards/rejected": -7.2292375564575195, "step": 3175 }, { "epoch": 1.7019568489713999, "grad_norm": 21.726406672794923, "learning_rate": 4.6964931766245905e-07, "logits/chosen": 0.30776771903038025, "logits/rejected": 0.3619793653488159, "logps/chosen": -6.617989540100098, "logps/rejected": -7.693255424499512, "loss": 0.4678, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -6.617989540100098, "rewards/margins": 1.0752661228179932, "rewards/rejected": -7.693255424499512, "step": 3180 }, { "epoch": 1.7046328817528016, "grad_norm": 22.841656117459394, "learning_rate": 4.6809493663768575e-07, "logits/chosen": 0.23914091289043427, "logits/rejected": 0.26955336332321167, "logps/chosen": -6.6125640869140625, "logps/rejected": -7.32776403427124, "loss": 0.5689, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -6.6125640869140625, "rewards/margins": 0.7152001261711121, "rewards/rejected": -7.32776403427124, "step": 3185 }, { "epoch": 1.707308914534203, "grad_norm": 14.772076301821423, "learning_rate": 4.6654086515597716e-07, "logits/chosen": 0.20221495628356934, "logits/rejected": 0.30394989252090454, "logps/chosen": -6.712398529052734, "logps/rejected": -7.827568054199219, "loss": 0.4531, "rewards/accuracies": 0.8125, "rewards/chosen": -6.712398529052734, "rewards/margins": 1.115168809890747, "rewards/rejected": -7.827568054199219, "step": 3190 }, { "epoch": 1.7099849473156046, "grad_norm": 17.259779689466356, "learning_rate": 4.6498711829494154e-07, "logits/chosen": 0.15508592128753662, "logits/rejected": 0.23378519713878632, "logps/chosen": -6.640683650970459, "logps/rejected": -7.662077903747559, "loss": 0.4965, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -6.640683650970459, "rewards/margins": 1.0213943719863892, "rewards/rejected": -7.662077903747559, "step": 3195 }, { "epoch": 1.7126609800970063, "grad_norm": 19.702029754416955, "learning_rate": 4.6343371112903777e-07, "logits/chosen": 0.2537457048892975, "logits/rejected": 0.3358924686908722, "logps/chosen": -6.726037502288818, "logps/rejected": -7.646450996398926, "loss": 0.5627, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -6.726037502288818, "rewards/margins": 0.9204134941101074, "rewards/rejected": -7.646450996398926, "step": 3200 }, { "epoch": 1.7126609800970063, "eval_logits/chosen": 0.4488539397716522, "eval_logits/rejected": 0.5174131989479065, "eval_logps/chosen": -6.657162666320801, "eval_logps/rejected": -7.66880989074707, "eval_loss": 0.5133790969848633, "eval_rewards/accuracies": 0.7351632118225098, "eval_rewards/chosen": -6.657162666320801, "eval_rewards/margins": 1.011647343635559, "eval_rewards/rejected": -7.66880989074707, "eval_runtime": 40.6378, "eval_samples_per_second": 33.097, "eval_steps_per_second": 8.293, "step": 3200 }, { "epoch": 1.7153370128784078, "grad_norm": 15.869132159832395, "learning_rate": 4.618806587294291e-07, "logits/chosen": 0.1306411176919937, "logits/rejected": 0.22377701103687286, "logps/chosen": -6.532301902770996, "logps/rejected": -7.654069423675537, "loss": 0.4774, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -6.532301902770996, "rewards/margins": 1.1217682361602783, "rewards/rejected": -7.654069423675537, "step": 3205 }, { "epoch": 1.7180130456598093, "grad_norm": 21.697678944592113, "learning_rate": 4.603279761638365e-07, "logits/chosen": 0.13842923939228058, "logits/rejected": 0.23352530598640442, "logps/chosen": -6.451352119445801, "logps/rejected": -7.373714447021484, "loss": 0.5298, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -6.451352119445801, "rewards/margins": 0.922362208366394, "rewards/rejected": -7.373714447021484, "step": 3210 }, { "epoch": 1.720689078441211, "grad_norm": 21.5368219344796, "learning_rate": 4.5877567849639315e-07, "logits/chosen": 0.2461564540863037, "logits/rejected": 0.3232786953449249, "logps/chosen": -6.603086948394775, "logps/rejected": -7.727430820465088, "loss": 0.4586, "rewards/accuracies": 0.8125, "rewards/chosen": -6.603086948394775, "rewards/margins": 1.124343752861023, "rewards/rejected": -7.727430820465088, "step": 3215 }, { "epoch": 1.7233651112226125, "grad_norm": 17.934184955159086, "learning_rate": 4.572237807874979e-07, "logits/chosen": 0.17551244795322418, "logits/rejected": 0.34024620056152344, "logps/chosen": -7.046766757965088, "logps/rejected": -7.994105339050293, "loss": 0.5557, "rewards/accuracies": 0.71875, "rewards/chosen": -7.046766757965088, "rewards/margins": 0.9473400115966797, "rewards/rejected": -7.994105339050293, "step": 3220 }, { "epoch": 1.726041144004014, "grad_norm": 23.250484700062525, "learning_rate": 4.5567229809366895e-07, "logits/chosen": 0.19587990641593933, "logits/rejected": 0.2797605097293854, "logps/chosen": -6.6569719314575195, "logps/rejected": -7.640380859375, "loss": 0.5033, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -6.6569719314575195, "rewards/margins": 0.9834094047546387, "rewards/rejected": -7.640380859375, "step": 3225 }, { "epoch": 1.7287171767854157, "grad_norm": 24.77455687165252, "learning_rate": 4.541212454673984e-07, "logits/chosen": 0.19988128542900085, "logits/rejected": 0.2967901825904846, "logps/chosen": -6.832402229309082, "logps/rejected": -8.091900825500488, "loss": 0.4688, "rewards/accuracies": 0.71875, "rewards/chosen": -6.832402229309082, "rewards/margins": 1.2594985961914062, "rewards/rejected": -8.091900825500488, "step": 3230 }, { "epoch": 1.7313932095668172, "grad_norm": 23.473239864668482, "learning_rate": 4.525706379570055e-07, "logits/chosen": 0.22459253668785095, "logits/rejected": 0.2889271378517151, "logps/chosen": -6.761626243591309, "logps/rejected": -7.800679683685303, "loss": 0.4962, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -6.761626243591309, "rewards/margins": 1.0390526056289673, "rewards/rejected": -7.800679683685303, "step": 3235 }, { "epoch": 1.7340692423482187, "grad_norm": 16.31503103215448, "learning_rate": 4.510204906064911e-07, "logits/chosen": 0.25568336248397827, "logits/rejected": 0.3477041721343994, "logps/chosen": -6.7611541748046875, "logps/rejected": -7.930889129638672, "loss": 0.443, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -6.7611541748046875, "rewards/margins": 1.1697345972061157, "rewards/rejected": -7.930889129638672, "step": 3240 }, { "epoch": 1.7367452751296204, "grad_norm": 17.99726949675002, "learning_rate": 4.4947081845539177e-07, "logits/chosen": 0.13691622018814087, "logits/rejected": 0.20457307994365692, "logps/chosen": -6.836938381195068, "logps/rejected": -7.808624267578125, "loss": 0.5044, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -6.836938381195068, "rewards/margins": 0.9716847538948059, "rewards/rejected": -7.808624267578125, "step": 3245 }, { "epoch": 1.739421307911022, "grad_norm": 16.58229021489111, "learning_rate": 4.479216365386333e-07, "logits/chosen": 0.27485713362693787, "logits/rejected": 0.4061592221260071, "logps/chosen": -6.8820624351501465, "logps/rejected": -7.99901819229126, "loss": 0.4568, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -6.8820624351501465, "rewards/margins": 1.1169553995132446, "rewards/rejected": -7.99901819229126, "step": 3250 }, { "epoch": 1.7420973406924234, "grad_norm": 14.23618269821615, "learning_rate": 4.4637295988638555e-07, "logits/chosen": 0.20007339119911194, "logits/rejected": 0.2903497815132141, "logps/chosen": -6.644203186035156, "logps/rejected": -7.748054504394531, "loss": 0.458, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -6.644203186035156, "rewards/margins": 1.1038516759872437, "rewards/rejected": -7.748054504394531, "step": 3255 }, { "epoch": 1.744773373473825, "grad_norm": 23.41584851586018, "learning_rate": 4.4482480352391623e-07, "logits/chosen": 0.16862265765666962, "logits/rejected": 0.276591956615448, "logps/chosen": -6.806331634521484, "logps/rejected": -7.845320224761963, "loss": 0.4835, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -6.806331634521484, "rewards/margins": 1.0389878749847412, "rewards/rejected": -7.845320224761963, "step": 3260 }, { "epoch": 1.7474494062552266, "grad_norm": 28.008161829483953, "learning_rate": 4.4327718247144507e-07, "logits/chosen": 0.22933514416217804, "logits/rejected": 0.32784542441368103, "logps/chosen": -6.740170478820801, "logps/rejected": -7.7988762855529785, "loss": 0.4824, "rewards/accuracies": 0.75, "rewards/chosen": -6.740170478820801, "rewards/margins": 1.058706283569336, "rewards/rejected": -7.7988762855529785, "step": 3265 }, { "epoch": 1.750125439036628, "grad_norm": 23.903915778560997, "learning_rate": 4.417301117439984e-07, "logits/chosen": 0.21390454471111298, "logits/rejected": 0.2882561683654785, "logps/chosen": -6.873053073883057, "logps/rejected": -7.864251613616943, "loss": 0.4975, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -6.873053073883057, "rewards/margins": 0.9911983609199524, "rewards/rejected": -7.864251613616943, "step": 3270 }, { "epoch": 1.7528014718180298, "grad_norm": 19.546626204647605, "learning_rate": 4.401836063512631e-07, "logits/chosen": 0.17093515396118164, "logits/rejected": 0.3686601519584656, "logps/chosen": -6.650028228759766, "logps/rejected": -7.750426292419434, "loss": 0.4793, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -6.650028228759766, "rewards/margins": 1.1003978252410889, "rewards/rejected": -7.750426292419434, "step": 3275 }, { "epoch": 1.7554775045994313, "grad_norm": 21.583069626921592, "learning_rate": 4.386376812974413e-07, "logits/chosen": 0.14424388110637665, "logits/rejected": 0.1989099234342575, "logps/chosen": -6.258247375488281, "logps/rejected": -7.313899993896484, "loss": 0.479, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -6.258247375488281, "rewards/margins": 1.0556527376174927, "rewards/rejected": -7.313899993896484, "step": 3280 }, { "epoch": 1.7581535373808328, "grad_norm": 19.742335669617617, "learning_rate": 4.370923515811048e-07, "logits/chosen": 0.1942090094089508, "logits/rejected": 0.3205759823322296, "logps/chosen": -6.529296875, "logps/rejected": -7.59033727645874, "loss": 0.4753, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -6.529296875, "rewards/margins": 1.0610404014587402, "rewards/rejected": -7.59033727645874, "step": 3285 }, { "epoch": 1.7608295701622345, "grad_norm": 18.146081982641167, "learning_rate": 4.35547632195049e-07, "logits/chosen": 0.16381600499153137, "logits/rejected": 0.25740283727645874, "logps/chosen": -6.231782913208008, "logps/rejected": -7.269131660461426, "loss": 0.4495, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -6.231782913208008, "rewards/margins": 1.0373494625091553, "rewards/rejected": -7.269131660461426, "step": 3290 }, { "epoch": 1.763505602943636, "grad_norm": 18.22473485298211, "learning_rate": 4.340035381261484e-07, "logits/chosen": 0.19077131152153015, "logits/rejected": 0.2695925533771515, "logps/chosen": -6.432314395904541, "logps/rejected": -7.508018493652344, "loss": 0.4809, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -6.432314395904541, "rewards/margins": 1.0757038593292236, "rewards/rejected": -7.508018493652344, "step": 3295 }, { "epoch": 1.7661816357250375, "grad_norm": 20.263923493581792, "learning_rate": 4.324600843552104e-07, "logits/chosen": 0.13665637373924255, "logits/rejected": 0.24925723671913147, "logps/chosen": -6.676652431488037, "logps/rejected": -7.784036159515381, "loss": 0.4805, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -6.676652431488037, "rewards/margins": 1.107384443283081, "rewards/rejected": -7.784036159515381, "step": 3300 }, { "epoch": 1.7688576685064392, "grad_norm": 20.497016023233254, "learning_rate": 4.309172858568302e-07, "logits/chosen": 0.10507404804229736, "logits/rejected": 0.24651379883289337, "logps/chosen": -6.4755659103393555, "logps/rejected": -7.529807090759277, "loss": 0.479, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -6.4755659103393555, "rewards/margins": 1.054241418838501, "rewards/rejected": -7.529807090759277, "step": 3305 }, { "epoch": 1.771533701287841, "grad_norm": 19.05127153228479, "learning_rate": 4.293751575992455e-07, "logits/chosen": 0.277086079120636, "logits/rejected": 0.3203951418399811, "logps/chosen": -6.46514368057251, "logps/rejected": -7.453202724456787, "loss": 0.4648, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -6.46514368057251, "rewards/margins": 0.9880601167678833, "rewards/rejected": -7.453202724456787, "step": 3310 }, { "epoch": 1.7742097340692422, "grad_norm": 23.675268393333273, "learning_rate": 4.278337145441916e-07, "logits/chosen": 0.21136336028575897, "logits/rejected": 0.3222659230232239, "logps/chosen": -6.553615570068359, "logps/rejected": -7.577325344085693, "loss": 0.4676, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -6.553615570068359, "rewards/margins": 1.0237104892730713, "rewards/rejected": -7.577325344085693, "step": 3315 }, { "epoch": 1.776885766850644, "grad_norm": 15.080107496063537, "learning_rate": 4.262929716467556e-07, "logits/chosen": 0.2260635793209076, "logits/rejected": 0.3735947608947754, "logps/chosen": -6.411507606506348, "logps/rejected": -7.608042240142822, "loss": 0.4633, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -6.411507606506348, "rewards/margins": 1.1965347528457642, "rewards/rejected": -7.608042240142822, "step": 3320 }, { "epoch": 1.7795617996320456, "grad_norm": 20.209121095499455, "learning_rate": 4.247529438552321e-07, "logits/chosen": 0.2311449944972992, "logits/rejected": 0.34996360540390015, "logps/chosen": -6.454397678375244, "logps/rejected": -7.426302909851074, "loss": 0.5101, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -6.454397678375244, "rewards/margins": 0.9719057083129883, "rewards/rejected": -7.426302909851074, "step": 3325 }, { "epoch": 1.782237832413447, "grad_norm": 21.98259769611044, "learning_rate": 4.232136461109773e-07, "logits/chosen": 0.26113951206207275, "logits/rejected": 0.34420132637023926, "logps/chosen": -6.4713454246521, "logps/rejected": -7.6526689529418945, "loss": 0.4486, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -6.4713454246521, "rewards/margins": 1.1813232898712158, "rewards/rejected": -7.6526689529418945, "step": 3330 }, { "epoch": 1.7849138651948486, "grad_norm": 25.25417578027106, "learning_rate": 4.216750933482646e-07, "logits/chosen": 0.21682074666023254, "logits/rejected": 0.3427589237689972, "logps/chosen": -6.749870300292969, "logps/rejected": -7.6591057777404785, "loss": 0.5322, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -6.749870300292969, "rewards/margins": 0.9092347025871277, "rewards/rejected": -7.6591057777404785, "step": 3335 }, { "epoch": 1.7875898979762503, "grad_norm": 32.66389033970582, "learning_rate": 4.2013730049413986e-07, "logits/chosen": 0.20450341701507568, "logits/rejected": 0.29150307178497314, "logps/chosen": -6.404050350189209, "logps/rejected": -7.533034324645996, "loss": 0.4826, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -6.404050350189209, "rewards/margins": 1.1289833784103394, "rewards/rejected": -7.533034324645996, "step": 3340 }, { "epoch": 1.7902659307576518, "grad_norm": 17.874969463098303, "learning_rate": 4.1860028246827594e-07, "logits/chosen": 0.22480730712413788, "logits/rejected": 0.34472066164016724, "logps/chosen": -6.208386421203613, "logps/rejected": -7.257425785064697, "loss": 0.4673, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -6.208386421203613, "rewards/margins": 1.0490391254425049, "rewards/rejected": -7.257425785064697, "step": 3345 }, { "epoch": 1.7929419635390533, "grad_norm": 18.580677936825296, "learning_rate": 4.170640541828285e-07, "logits/chosen": 0.16384154558181763, "logits/rejected": 0.2590642273426056, "logps/chosen": -6.433566093444824, "logps/rejected": -7.4382123947143555, "loss": 0.4765, "rewards/accuracies": 0.78125, "rewards/chosen": -6.433566093444824, "rewards/margins": 1.0046457052230835, "rewards/rejected": -7.4382123947143555, "step": 3350 }, { "epoch": 1.795617996320455, "grad_norm": 24.45779862275371, "learning_rate": 4.1552863054229116e-07, "logits/chosen": 0.26491600275039673, "logits/rejected": 0.3206622898578644, "logps/chosen": -6.606454372406006, "logps/rejected": -7.6248297691345215, "loss": 0.5243, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -6.606454372406006, "rewards/margins": 1.0183751583099365, "rewards/rejected": -7.6248297691345215, "step": 3355 }, { "epoch": 1.7982940291018565, "grad_norm": 21.147684028508152, "learning_rate": 4.139940264433508e-07, "logits/chosen": 0.11042556911706924, "logits/rejected": 0.21519234776496887, "logps/chosen": -6.281399726867676, "logps/rejected": -7.391129493713379, "loss": 0.4673, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -6.281399726867676, "rewards/margins": 1.1097290515899658, "rewards/rejected": -7.391129493713379, "step": 3360 }, { "epoch": 1.800970061883258, "grad_norm": 18.417589022658706, "learning_rate": 4.1246025677474303e-07, "logits/chosen": 0.15590153634548187, "logits/rejected": 0.270027220249176, "logps/chosen": -6.439671993255615, "logps/rejected": -7.293754577636719, "loss": 0.5159, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -6.439671993255615, "rewards/margins": 0.8540828824043274, "rewards/rejected": -7.293754577636719, "step": 3365 }, { "epoch": 1.8036460946646597, "grad_norm": 22.270476116996807, "learning_rate": 4.10927336417108e-07, "logits/chosen": 0.2180463969707489, "logits/rejected": 0.31267091631889343, "logps/chosen": -6.5035223960876465, "logps/rejected": -7.20119571685791, "loss": 0.5977, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -6.5035223960876465, "rewards/margins": 0.697672963142395, "rewards/rejected": -7.20119571685791, "step": 3370 }, { "epoch": 1.8063221274460612, "grad_norm": 19.73240270329718, "learning_rate": 4.093952802428457e-07, "logits/chosen": 0.27625393867492676, "logits/rejected": 0.33694809675216675, "logps/chosen": -6.6766676902771, "logps/rejected": -7.540618896484375, "loss": 0.6048, "rewards/accuracies": 0.6875, "rewards/chosen": -6.6766676902771, "rewards/margins": 0.8639516830444336, "rewards/rejected": -7.540618896484375, "step": 3375 }, { "epoch": 1.8089981602274627, "grad_norm": 17.540178654209587, "learning_rate": 4.0786410311597184e-07, "logits/chosen": 0.16687259078025818, "logits/rejected": 0.30098867416381836, "logps/chosen": -6.243479251861572, "logps/rejected": -7.212285041809082, "loss": 0.5156, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -6.243479251861572, "rewards/margins": 0.9688063859939575, "rewards/rejected": -7.212285041809082, "step": 3380 }, { "epoch": 1.8116741930088645, "grad_norm": 17.344625843600365, "learning_rate": 4.063338198919737e-07, "logits/chosen": 0.19912198185920715, "logits/rejected": 0.20550537109375, "logps/chosen": -6.367600440979004, "logps/rejected": -7.254168510437012, "loss": 0.5271, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -6.367600440979004, "rewards/margins": 0.8865678906440735, "rewards/rejected": -7.254168510437012, "step": 3385 }, { "epoch": 1.814350225790266, "grad_norm": 32.76384774560029, "learning_rate": 4.0480444541766575e-07, "logits/chosen": 0.17722304165363312, "logits/rejected": 0.25820109248161316, "logps/chosen": -6.268993854522705, "logps/rejected": -7.099878787994385, "loss": 0.5853, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -6.268993854522705, "rewards/margins": 0.830885112285614, "rewards/rejected": -7.099878787994385, "step": 3390 }, { "epoch": 1.8170262585716674, "grad_norm": 16.194223119083226, "learning_rate": 4.0327599453104606e-07, "logits/chosen": 0.12692640721797943, "logits/rejected": 0.24236471951007843, "logps/chosen": -6.213504791259766, "logps/rejected": -7.311017036437988, "loss": 0.4408, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -6.213504791259766, "rewards/margins": 1.097511649131775, "rewards/rejected": -7.311017036437988, "step": 3395 }, { "epoch": 1.8197022913530692, "grad_norm": 19.369602497734977, "learning_rate": 4.017484820611514e-07, "logits/chosen": 0.13687637448310852, "logits/rejected": 0.22643017768859863, "logps/chosen": -6.2551045417785645, "logps/rejected": -7.339658260345459, "loss": 0.4675, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -6.2551045417785645, "rewards/margins": 1.08455491065979, "rewards/rejected": -7.339658260345459, "step": 3400 }, { "epoch": 1.8223783241344707, "grad_norm": 18.995653770148508, "learning_rate": 4.002219228279148e-07, "logits/chosen": 0.13641352951526642, "logits/rejected": 0.2479369193315506, "logps/chosen": -6.512181758880615, "logps/rejected": -7.4519500732421875, "loss": 0.4749, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -6.512181758880615, "rewards/margins": 0.9397680163383484, "rewards/rejected": -7.4519500732421875, "step": 3405 }, { "epoch": 1.8250543569158721, "grad_norm": 16.751208913626197, "learning_rate": 3.9869633164202045e-07, "logits/chosen": 0.18358120322227478, "logits/rejected": 0.3124134838581085, "logps/chosen": -6.632837772369385, "logps/rejected": -7.606378078460693, "loss": 0.4687, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -6.632837772369385, "rewards/margins": 0.9735404253005981, "rewards/rejected": -7.606378078460693, "step": 3410 }, { "epoch": 1.8277303896972739, "grad_norm": 22.36400382457632, "learning_rate": 3.9717172330476077e-07, "logits/chosen": 0.1367681473493576, "logits/rejected": 0.2325524538755417, "logps/chosen": -6.523054599761963, "logps/rejected": -7.534343719482422, "loss": 0.4956, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -6.523054599761963, "rewards/margins": 1.0112890005111694, "rewards/rejected": -7.534343719482422, "step": 3415 }, { "epoch": 1.8304064224786754, "grad_norm": 18.493065064323915, "learning_rate": 3.956481126078927e-07, "logits/chosen": 0.24453692138195038, "logits/rejected": 0.31568101048469543, "logps/chosen": -6.389092445373535, "logps/rejected": -7.433390140533447, "loss": 0.5431, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -6.389092445373535, "rewards/margins": 1.0442975759506226, "rewards/rejected": -7.433390140533447, "step": 3420 }, { "epoch": 1.8330824552600768, "grad_norm": 19.23036313009905, "learning_rate": 3.941255143334937e-07, "logits/chosen": 0.1470910608768463, "logits/rejected": 0.1949562281370163, "logps/chosen": -6.6343674659729, "logps/rejected": -7.659320831298828, "loss": 0.4738, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -6.6343674659729, "rewards/margins": 1.0249537229537964, "rewards/rejected": -7.659320831298828, "step": 3425 }, { "epoch": 1.8357584880414786, "grad_norm": 20.148233678819246, "learning_rate": 3.9260394325381895e-07, "logits/chosen": 0.13003337383270264, "logits/rejected": 0.21735802292823792, "logps/chosen": -6.517716884613037, "logps/rejected": -7.860361576080322, "loss": 0.4301, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -6.517716884613037, "rewards/margins": 1.3426440954208374, "rewards/rejected": -7.860361576080322, "step": 3430 }, { "epoch": 1.83843452082288, "grad_norm": 21.512003317669382, "learning_rate": 3.9108341413115784e-07, "logits/chosen": 0.14755678176879883, "logits/rejected": 0.23817653954029083, "logps/chosen": -6.584360599517822, "logps/rejected": -7.651620388031006, "loss": 0.4387, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -6.584360599517822, "rewards/margins": 1.0672591924667358, "rewards/rejected": -7.651620388031006, "step": 3435 }, { "epoch": 1.8411105536042816, "grad_norm": 24.585953370696455, "learning_rate": 3.895639417176905e-07, "logits/chosen": 0.09006103873252869, "logits/rejected": 0.16051539778709412, "logps/chosen": -6.485753059387207, "logps/rejected": -7.43698263168335, "loss": 0.5552, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -6.485753059387207, "rewards/margins": 0.9512289762496948, "rewards/rejected": -7.43698263168335, "step": 3440 }, { "epoch": 1.8437865863856833, "grad_norm": 23.127918686045877, "learning_rate": 3.8804554075534497e-07, "logits/chosen": 0.1548965871334076, "logits/rejected": 0.29891422390937805, "logps/chosen": -6.521195411682129, "logps/rejected": -7.63997745513916, "loss": 0.4813, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -6.521195411682129, "rewards/margins": 1.1187822818756104, "rewards/rejected": -7.63997745513916, "step": 3445 }, { "epoch": 1.8464626191670848, "grad_norm": 19.099721464936273, "learning_rate": 3.8652822597565403e-07, "logits/chosen": 0.09252112358808517, "logits/rejected": 0.20740166306495667, "logps/chosen": -6.487610816955566, "logps/rejected": -7.647575378417969, "loss": 0.4427, "rewards/accuracies": 0.78125, "rewards/chosen": -6.487610816955566, "rewards/margins": 1.159964919090271, "rewards/rejected": -7.647575378417969, "step": 3450 }, { "epoch": 1.8491386519484863, "grad_norm": 20.551076331977153, "learning_rate": 3.850120120996123e-07, "logits/chosen": 0.15979830920696259, "logits/rejected": 0.28849855065345764, "logps/chosen": -6.671955108642578, "logps/rejected": -7.7100653648376465, "loss": 0.5265, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -6.671955108642578, "rewards/margins": 1.0381101369857788, "rewards/rejected": -7.7100653648376465, "step": 3455 }, { "epoch": 1.851814684729888, "grad_norm": 16.8771609193641, "learning_rate": 3.8349691383753356e-07, "logits/chosen": 0.23671522736549377, "logits/rejected": 0.32592126727104187, "logps/chosen": -6.303369045257568, "logps/rejected": -7.381552696228027, "loss": 0.4737, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -6.303369045257568, "rewards/margins": 1.0781834125518799, "rewards/rejected": -7.381552696228027, "step": 3460 }, { "epoch": 1.8544907175112895, "grad_norm": 22.66722387598913, "learning_rate": 3.819829458889078e-07, "logits/chosen": 0.18265636265277863, "logits/rejected": 0.24627861380577087, "logps/chosen": -6.601975440979004, "logps/rejected": -7.597805023193359, "loss": 0.4886, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -6.601975440979004, "rewards/margins": 0.9958289265632629, "rewards/rejected": -7.597805023193359, "step": 3465 }, { "epoch": 1.857166750292691, "grad_norm": 17.485484036001626, "learning_rate": 3.804701229422585e-07, "logits/chosen": 0.17104171216487885, "logits/rejected": 0.24990804493427277, "logps/chosen": -6.531071662902832, "logps/rejected": -7.70626974105835, "loss": 0.4525, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -6.531071662902832, "rewards/margins": 1.1751972436904907, "rewards/rejected": -7.70626974105835, "step": 3470 }, { "epoch": 1.8598427830740927, "grad_norm": 21.037468907523962, "learning_rate": 3.789584596750007e-07, "logits/chosen": 0.21032123267650604, "logits/rejected": 0.24975328147411346, "logps/chosen": -6.565499782562256, "logps/rejected": -7.537527561187744, "loss": 0.509, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -6.565499782562256, "rewards/margins": 0.9720270037651062, "rewards/rejected": -7.537527561187744, "step": 3475 }, { "epoch": 1.8625188158554944, "grad_norm": 19.21096510142112, "learning_rate": 3.77447970753298e-07, "logits/chosen": 0.24196204543113708, "logits/rejected": 0.26354843378067017, "logps/chosen": -6.75180196762085, "logps/rejected": -7.796343803405762, "loss": 0.5037, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -6.75180196762085, "rewards/margins": 1.0445427894592285, "rewards/rejected": -7.796343803405762, "step": 3480 }, { "epoch": 1.8651948486368957, "grad_norm": 23.12730490204625, "learning_rate": 3.7593867083192057e-07, "logits/chosen": 0.13923628628253937, "logits/rejected": 0.25029754638671875, "logps/chosen": -6.437714576721191, "logps/rejected": -7.4407854080200195, "loss": 0.5007, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -6.437714576721191, "rewards/margins": 1.0030710697174072, "rewards/rejected": -7.4407854080200195, "step": 3485 }, { "epoch": 1.8678708814182974, "grad_norm": 25.770639634393877, "learning_rate": 3.7443057455410276e-07, "logits/chosen": 0.2578660845756531, "logits/rejected": 0.32676053047180176, "logps/chosen": -6.359456539154053, "logps/rejected": -7.49765157699585, "loss": 0.4276, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -6.359456539154053, "rewards/margins": 1.1381956338882446, "rewards/rejected": -7.49765157699585, "step": 3490 }, { "epoch": 1.870546914199699, "grad_norm": 16.40928758868845, "learning_rate": 3.7292369655140145e-07, "logits/chosen": 0.13904188573360443, "logits/rejected": 0.26749277114868164, "logps/chosen": -6.574984550476074, "logps/rejected": -7.527205467224121, "loss": 0.4583, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -6.574984550476074, "rewards/margins": 0.9522213935852051, "rewards/rejected": -7.527205467224121, "step": 3495 }, { "epoch": 1.8732229469811004, "grad_norm": 17.412141740096136, "learning_rate": 3.714180514435534e-07, "logits/chosen": 0.19764995574951172, "logits/rejected": 0.3117005228996277, "logps/chosen": -6.262470722198486, "logps/rejected": -7.379081726074219, "loss": 0.4735, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -6.262470722198486, "rewards/margins": 1.1166101694107056, "rewards/rejected": -7.379081726074219, "step": 3500 }, { "epoch": 1.875898979762502, "grad_norm": 24.53262705324183, "learning_rate": 3.6991365383833426e-07, "logits/chosen": 0.16422924399375916, "logits/rejected": 0.2566792666912079, "logps/chosen": -6.193147659301758, "logps/rejected": -7.2510881423950195, "loss": 0.455, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -6.193147659301758, "rewards/margins": 1.0579402446746826, "rewards/rejected": -7.2510881423950195, "step": 3505 }, { "epoch": 1.8785750125439038, "grad_norm": 21.316272962975766, "learning_rate": 3.684105183314162e-07, "logits/chosen": 0.14263905584812164, "logits/rejected": 0.2233494222164154, "logps/chosen": -6.029327869415283, "logps/rejected": -7.102616310119629, "loss": 0.4448, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -6.029327869415283, "rewards/margins": 1.0732887983322144, "rewards/rejected": -7.102616310119629, "step": 3510 }, { "epoch": 1.881251045325305, "grad_norm": 25.799981764201448, "learning_rate": 3.669086595062263e-07, "logits/chosen": 0.17610026895999908, "logits/rejected": 0.30271443724632263, "logps/chosen": -6.439400672912598, "logps/rejected": -7.432157039642334, "loss": 0.4764, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -6.439400672912598, "rewards/margins": 0.9927564859390259, "rewards/rejected": -7.432157039642334, "step": 3515 }, { "epoch": 1.8839270781067068, "grad_norm": 19.330661568050292, "learning_rate": 3.654080919338056e-07, "logits/chosen": 0.14207343757152557, "logits/rejected": 0.22544574737548828, "logps/chosen": -6.330053806304932, "logps/rejected": -7.3705339431762695, "loss": 0.489, "rewards/accuracies": 0.71875, "rewards/chosen": -6.330053806304932, "rewards/margins": 1.0404796600341797, "rewards/rejected": -7.3705339431762695, "step": 3520 }, { "epoch": 1.8866031108881085, "grad_norm": 21.172458290024025, "learning_rate": 3.639088301726673e-07, "logits/chosen": 0.21120992302894592, "logits/rejected": 0.34980717301368713, "logps/chosen": -6.366480350494385, "logps/rejected": -7.365818023681641, "loss": 0.4911, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -6.366480350494385, "rewards/margins": 0.9993377923965454, "rewards/rejected": -7.365818023681641, "step": 3525 }, { "epoch": 1.88927914366951, "grad_norm": 18.084670894679792, "learning_rate": 3.624108887686556e-07, "logits/chosen": 0.1853090226650238, "logits/rejected": 0.24252355098724365, "logps/chosen": -6.3801093101501465, "logps/rejected": -7.330789089202881, "loss": 0.4727, "rewards/accuracies": 0.75, "rewards/chosen": -6.3801093101501465, "rewards/margins": 0.9506810903549194, "rewards/rejected": -7.330789089202881, "step": 3530 }, { "epoch": 1.8919551764509115, "grad_norm": 14.428694978425291, "learning_rate": 3.6091428225480433e-07, "logits/chosen": 0.16146475076675415, "logits/rejected": 0.2579134702682495, "logps/chosen": -6.463724613189697, "logps/rejected": -7.545462131500244, "loss": 0.4775, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -6.463724613189697, "rewards/margins": 1.0817375183105469, "rewards/rejected": -7.545462131500244, "step": 3535 }, { "epoch": 1.8946312092323132, "grad_norm": 23.15216601844246, "learning_rate": 3.5941902515119674e-07, "logits/chosen": 0.14911453425884247, "logits/rejected": 0.2860339879989624, "logps/chosen": -6.520458221435547, "logps/rejected": -7.3302412033081055, "loss": 0.5426, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -6.520458221435547, "rewards/margins": 0.8097823858261108, "rewards/rejected": -7.3302412033081055, "step": 3540 }, { "epoch": 1.8973072420137147, "grad_norm": 22.074598778141155, "learning_rate": 3.5792513196482373e-07, "logits/chosen": 0.09652672708034515, "logits/rejected": 0.2736071050167084, "logps/chosen": -6.366788387298584, "logps/rejected": -7.48568868637085, "loss": 0.4207, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -6.366788387298584, "rewards/margins": 1.1189005374908447, "rewards/rejected": -7.48568868637085, "step": 3545 }, { "epoch": 1.8999832747951162, "grad_norm": 20.313744463449986, "learning_rate": 3.5643261718944346e-07, "logits/chosen": 0.2503211796283722, "logits/rejected": 0.3162500262260437, "logps/chosen": -6.5696516036987305, "logps/rejected": -7.405554294586182, "loss": 0.5706, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -6.5696516036987305, "rewards/margins": 0.8359017372131348, "rewards/rejected": -7.405554294586182, "step": 3550 }, { "epoch": 1.902659307576518, "grad_norm": 15.292317740623275, "learning_rate": 3.5494149530544087e-07, "logits/chosen": 0.14119234681129456, "logits/rejected": 0.21549062430858612, "logps/chosen": -6.321418762207031, "logps/rejected": -7.37194299697876, "loss": 0.4996, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -6.321418762207031, "rewards/margins": 1.0505234003067017, "rewards/rejected": -7.37194299697876, "step": 3555 }, { "epoch": 1.9053353403579194, "grad_norm": 22.20620411026749, "learning_rate": 3.534517807796871e-07, "logits/chosen": 0.16339029371738434, "logits/rejected": 0.24118542671203613, "logps/chosen": -6.264065742492676, "logps/rejected": -7.228880405426025, "loss": 0.4912, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -6.264065742492676, "rewards/margins": 0.9648143649101257, "rewards/rejected": -7.228880405426025, "step": 3560 }, { "epoch": 1.908011373139321, "grad_norm": 16.87493196685934, "learning_rate": 3.519634880653988e-07, "logits/chosen": 0.17063704133033752, "logits/rejected": 0.25663450360298157, "logps/chosen": -6.436666965484619, "logps/rejected": -7.636883735656738, "loss": 0.4372, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -6.436666965484619, "rewards/margins": 1.20021653175354, "rewards/rejected": -7.636883735656738, "step": 3565 }, { "epoch": 1.9106874059207226, "grad_norm": 17.625573225754394, "learning_rate": 3.504766316019987e-07, "logits/chosen": 0.16094836592674255, "logits/rejected": 0.2970316708087921, "logps/chosen": -6.153613090515137, "logps/rejected": -7.277252197265625, "loss": 0.4302, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -6.153613090515137, "rewards/margins": 1.123638391494751, "rewards/rejected": -7.277252197265625, "step": 3570 }, { "epoch": 1.913363438702124, "grad_norm": 16.31400289516364, "learning_rate": 3.489912258149745e-07, "logits/chosen": 0.275907427072525, "logits/rejected": 0.34968143701553345, "logps/chosen": -6.258004188537598, "logps/rejected": -7.378037452697754, "loss": 0.4605, "rewards/accuracies": 0.78125, "rewards/chosen": -6.258004188537598, "rewards/margins": 1.120032548904419, "rewards/rejected": -7.378037452697754, "step": 3575 }, { "epoch": 1.9160394714835256, "grad_norm": 15.855950463138166, "learning_rate": 3.475072851157397e-07, "logits/chosen": 0.18137823045253754, "logits/rejected": 0.2283487617969513, "logps/chosen": -6.178892612457275, "logps/rejected": -7.254115104675293, "loss": 0.4633, "rewards/accuracies": 0.75, "rewards/chosen": -6.178892612457275, "rewards/margins": 1.0752227306365967, "rewards/rejected": -7.254115104675293, "step": 3580 }, { "epoch": 1.9187155042649273, "grad_norm": 16.62684694108839, "learning_rate": 3.460248239014936e-07, "logits/chosen": 0.23303687572479248, "logits/rejected": 0.28519847989082336, "logps/chosen": -6.376425743103027, "logps/rejected": -7.515501499176025, "loss": 0.4463, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -6.376425743103027, "rewards/margins": 1.1390752792358398, "rewards/rejected": -7.515501499176025, "step": 3585 }, { "epoch": 1.9213915370463288, "grad_norm": 22.26104304088215, "learning_rate": 3.4454385655508134e-07, "logits/chosen": 0.2415611743927002, "logits/rejected": 0.2640741765499115, "logps/chosen": -6.387829303741455, "logps/rejected": -7.295698642730713, "loss": 0.53, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -6.387829303741455, "rewards/margins": 0.9078701138496399, "rewards/rejected": -7.295698642730713, "step": 3590 }, { "epoch": 1.9240675698277303, "grad_norm": 15.382326506425983, "learning_rate": 3.4306439744485447e-07, "logits/chosen": 0.13610319793224335, "logits/rejected": 0.2552124559879303, "logps/chosen": -6.556148529052734, "logps/rejected": -7.546764373779297, "loss": 0.5065, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -6.556148529052734, "rewards/margins": 0.990615725517273, "rewards/rejected": -7.546764373779297, "step": 3595 }, { "epoch": 1.926743602609132, "grad_norm": 21.11524096712412, "learning_rate": 3.415864609245322e-07, "logits/chosen": 0.2464778870344162, "logits/rejected": 0.35940033197402954, "logps/chosen": -6.498780727386475, "logps/rejected": -7.491261959075928, "loss": 0.5355, "rewards/accuracies": 0.71875, "rewards/chosen": -6.498780727386475, "rewards/margins": 0.9924813508987427, "rewards/rejected": -7.491261959075928, "step": 3600 }, { "epoch": 1.926743602609132, "eval_logits/chosen": 0.4009808599948883, "eval_logits/rejected": 0.46716082096099854, "eval_logps/chosen": -6.359859466552734, "eval_logps/rejected": -7.362969875335693, "eval_loss": 0.5092985033988953, "eval_rewards/accuracies": 0.7351632118225098, "eval_rewards/chosen": -6.359859466552734, "eval_rewards/margins": 1.0031099319458008, "eval_rewards/rejected": -7.362969875335693, "eval_runtime": 40.5408, "eval_samples_per_second": 33.176, "eval_steps_per_second": 8.313, "step": 3600 }, { "epoch": 1.9294196353905335, "grad_norm": 22.5693100633145, "learning_rate": 3.401100613330605e-07, "logits/chosen": 0.1845104694366455, "logits/rejected": 0.19720450043678284, "logps/chosen": -6.135837554931641, "logps/rejected": -7.097196102142334, "loss": 0.4967, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -6.135837554931641, "rewards/margins": 0.9613581895828247, "rewards/rejected": -7.097196102142334, "step": 3605 }, { "epoch": 1.932095668171935, "grad_norm": 16.043583291969696, "learning_rate": 3.3863521299447514e-07, "logits/chosen": 0.132811039686203, "logits/rejected": 0.22994449734687805, "logps/chosen": -6.092348098754883, "logps/rejected": -7.191383361816406, "loss": 0.4075, "rewards/accuracies": 0.84375, "rewards/chosen": -6.092348098754883, "rewards/margins": 1.0990359783172607, "rewards/rejected": -7.191383361816406, "step": 3610 }, { "epoch": 1.9347717009533367, "grad_norm": 20.561720238032684, "learning_rate": 3.371619302177609e-07, "logits/chosen": 0.201839417219162, "logits/rejected": 0.29043930768966675, "logps/chosen": -6.447142124176025, "logps/rejected": -7.472611427307129, "loss": 0.4918, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -6.447142124176025, "rewards/margins": 1.0254695415496826, "rewards/rejected": -7.472611427307129, "step": 3615 }, { "epoch": 1.9374477337347382, "grad_norm": 21.238447292630944, "learning_rate": 3.3569022729671393e-07, "logits/chosen": 0.20971384644508362, "logits/rejected": 0.2832814157009125, "logps/chosen": -6.45370626449585, "logps/rejected": -7.381368160247803, "loss": 0.5083, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -6.45370626449585, "rewards/margins": 0.9276623725891113, "rewards/rejected": -7.381368160247803, "step": 3620 }, { "epoch": 1.9401237665161397, "grad_norm": 17.681314803586645, "learning_rate": 3.342201185098024e-07, "logits/chosen": 0.21420466899871826, "logits/rejected": 0.22679977118968964, "logps/chosen": -6.175483703613281, "logps/rejected": -7.165809631347656, "loss": 0.4664, "rewards/accuracies": 0.78125, "rewards/chosen": -6.175483703613281, "rewards/margins": 0.9903267025947571, "rewards/rejected": -7.165809631347656, "step": 3625 }, { "epoch": 1.9427997992975414, "grad_norm": 20.546958056672235, "learning_rate": 3.3275161812002807e-07, "logits/chosen": 0.19906464219093323, "logits/rejected": 0.2329658716917038, "logps/chosen": -6.296006679534912, "logps/rejected": -7.427008628845215, "loss": 0.4839, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -6.296006679534912, "rewards/margins": 1.131002426147461, "rewards/rejected": -7.427008628845215, "step": 3630 }, { "epoch": 1.945475832078943, "grad_norm": 18.818402511872844, "learning_rate": 3.312847403747883e-07, "logits/chosen": 0.15629816055297852, "logits/rejected": 0.25591129064559937, "logps/chosen": -6.170465469360352, "logps/rejected": -7.316367149353027, "loss": 0.4363, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -6.170465469360352, "rewards/margins": 1.145900845527649, "rewards/rejected": -7.316367149353027, "step": 3635 }, { "epoch": 1.9481518648603444, "grad_norm": 19.675678749252704, "learning_rate": 3.2981949950573733e-07, "logits/chosen": 0.17365175485610962, "logits/rejected": 0.23541617393493652, "logps/chosen": -6.451229095458984, "logps/rejected": -7.375931739807129, "loss": 0.482, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -6.451229095458984, "rewards/margins": 0.9247023463249207, "rewards/rejected": -7.375931739807129, "step": 3640 }, { "epoch": 1.9508278976417461, "grad_norm": 17.013326712368894, "learning_rate": 3.283559097286486e-07, "logits/chosen": 0.16418492794036865, "logits/rejected": 0.25839662551879883, "logps/chosen": -6.458489418029785, "logps/rejected": -7.2075653076171875, "loss": 0.54, "rewards/accuracies": 0.75, "rewards/chosen": -6.458489418029785, "rewards/margins": 0.7490754127502441, "rewards/rejected": -7.2075653076171875, "step": 3645 }, { "epoch": 1.9535039304231478, "grad_norm": 17.562984195001665, "learning_rate": 3.268939852432765e-07, "logits/chosen": 0.1517658233642578, "logits/rejected": 0.21947559714317322, "logps/chosen": -6.511991024017334, "logps/rejected": -7.364911079406738, "loss": 0.5085, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -6.511991024017334, "rewards/margins": 0.852920651435852, "rewards/rejected": -7.364911079406738, "step": 3650 }, { "epoch": 1.9561799632045491, "grad_norm": 22.872561651367647, "learning_rate": 3.254337402332187e-07, "logits/chosen": 0.24469709396362305, "logits/rejected": 0.30830642580986023, "logps/chosen": -6.504847526550293, "logps/rejected": -7.3935956954956055, "loss": 0.5341, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -6.504847526550293, "rewards/margins": 0.8887487649917603, "rewards/rejected": -7.3935956954956055, "step": 3655 }, { "epoch": 1.9588559959859508, "grad_norm": 19.95732127273803, "learning_rate": 3.239751888657788e-07, "logits/chosen": 0.21787777543067932, "logits/rejected": 0.30197378993034363, "logps/chosen": -6.6033782958984375, "logps/rejected": -7.485899448394775, "loss": 0.534, "rewards/accuracies": 0.71875, "rewards/chosen": -6.6033782958984375, "rewards/margins": 0.8825214505195618, "rewards/rejected": -7.485899448394775, "step": 3660 }, { "epoch": 1.9615320287673526, "grad_norm": 16.193192807206206, "learning_rate": 3.2251834529182856e-07, "logits/chosen": 0.1818004846572876, "logits/rejected": 0.24020631611347198, "logps/chosen": -6.1061248779296875, "logps/rejected": -7.202364444732666, "loss": 0.4805, "rewards/accuracies": 0.78125, "rewards/chosen": -6.1061248779296875, "rewards/margins": 1.0962388515472412, "rewards/rejected": -7.202364444732666, "step": 3665 }, { "epoch": 1.9642080615487538, "grad_norm": 19.389765764523275, "learning_rate": 3.2106322364567075e-07, "logits/chosen": 0.19006848335266113, "logits/rejected": 0.2673812806606293, "logps/chosen": -6.144223213195801, "logps/rejected": -7.327752113342285, "loss": 0.4227, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -6.144223213195801, "rewards/margins": 1.1835286617279053, "rewards/rejected": -7.327752113342285, "step": 3670 }, { "epoch": 1.9668840943301555, "grad_norm": 19.125450166435336, "learning_rate": 3.1960983804490183e-07, "logits/chosen": 0.15509849786758423, "logits/rejected": 0.2556911110877991, "logps/chosen": -6.481911659240723, "logps/rejected": -7.526867866516113, "loss": 0.5267, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -6.481911659240723, "rewards/margins": 1.0449565649032593, "rewards/rejected": -7.526867866516113, "step": 3675 }, { "epoch": 1.9695601271115573, "grad_norm": 16.28714066454134, "learning_rate": 3.1815820259027537e-07, "logits/chosen": 0.17974096536636353, "logits/rejected": 0.2654067575931549, "logps/chosen": -6.174489974975586, "logps/rejected": -7.2637200355529785, "loss": 0.438, "rewards/accuracies": 0.8125, "rewards/chosen": -6.174489974975586, "rewards/margins": 1.0892300605773926, "rewards/rejected": -7.2637200355529785, "step": 3680 }, { "epoch": 1.9722361598929585, "grad_norm": 28.482993395337576, "learning_rate": 3.16708331365565e-07, "logits/chosen": 0.1324462592601776, "logits/rejected": 0.20237961411476135, "logps/chosen": -6.627023220062256, "logps/rejected": -7.691824436187744, "loss": 0.4781, "rewards/accuracies": 0.78125, "rewards/chosen": -6.627023220062256, "rewards/margins": 1.0648006200790405, "rewards/rejected": -7.691824436187744, "step": 3685 }, { "epoch": 1.9749121926743602, "grad_norm": 18.047585799931007, "learning_rate": 3.152602384374275e-07, "logits/chosen": 0.18161442875862122, "logits/rejected": 0.2857312262058258, "logps/chosen": -6.6233015060424805, "logps/rejected": -7.610257148742676, "loss": 0.489, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -6.6233015060424805, "rewards/margins": 0.9869546890258789, "rewards/rejected": -7.610257148742676, "step": 3690 }, { "epoch": 1.977588225455762, "grad_norm": 16.584880554225006, "learning_rate": 3.1381393785526697e-07, "logits/chosen": 0.1531459093093872, "logits/rejected": 0.22634652256965637, "logps/chosen": -6.527143955230713, "logps/rejected": -7.575995445251465, "loss": 0.4848, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -6.527143955230713, "rewards/margins": 1.0488523244857788, "rewards/rejected": -7.575995445251465, "step": 3695 }, { "epoch": 1.9802642582371635, "grad_norm": 16.55343371630716, "learning_rate": 3.123694436510979e-07, "logits/chosen": 0.16725540161132812, "logits/rejected": 0.26145586371421814, "logps/chosen": -6.30380916595459, "logps/rejected": -7.361602783203125, "loss": 0.4649, "rewards/accuracies": 0.75, "rewards/chosen": -6.30380916595459, "rewards/margins": 1.0577939748764038, "rewards/rejected": -7.361602783203125, "step": 3700 }, { "epoch": 1.982940291018565, "grad_norm": 21.640864308240587, "learning_rate": 3.1092676983940946e-07, "logits/chosen": 0.1762697398662567, "logits/rejected": 0.2139081209897995, "logps/chosen": -6.335051536560059, "logps/rejected": -7.490813255310059, "loss": 0.4523, "rewards/accuracies": 0.78125, "rewards/chosen": -6.335051536560059, "rewards/margins": 1.1557615995407104, "rewards/rejected": -7.490813255310059, "step": 3705 }, { "epoch": 1.9856163237999667, "grad_norm": 19.88164888724295, "learning_rate": 3.094859304170293e-07, "logits/chosen": 0.2889614701271057, "logits/rejected": 0.31625667214393616, "logps/chosen": -6.309903144836426, "logps/rejected": -7.208881378173828, "loss": 0.5375, "rewards/accuracies": 0.6875, "rewards/chosen": -6.309903144836426, "rewards/margins": 0.8989775776863098, "rewards/rejected": -7.208881378173828, "step": 3710 }, { "epoch": 1.9882923565813682, "grad_norm": 18.075732603840756, "learning_rate": 3.0804693936298795e-07, "logits/chosen": 0.2011052668094635, "logits/rejected": 0.2411438226699829, "logps/chosen": -6.4421281814575195, "logps/rejected": -7.668890476226807, "loss": 0.4427, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -6.4421281814575195, "rewards/margins": 1.2267627716064453, "rewards/rejected": -7.668890476226807, "step": 3715 }, { "epoch": 1.9909683893627697, "grad_norm": 20.34430988977955, "learning_rate": 3.066098106383826e-07, "logits/chosen": 0.16811896860599518, "logits/rejected": 0.24398362636566162, "logps/chosen": -6.354432582855225, "logps/rejected": -7.336121559143066, "loss": 0.4725, "rewards/accuracies": 0.75, "rewards/chosen": -6.354432582855225, "rewards/margins": 0.9816884994506836, "rewards/rejected": -7.336121559143066, "step": 3720 }, { "epoch": 1.9936444221441714, "grad_norm": 15.28419797826619, "learning_rate": 3.0517455818624263e-07, "logits/chosen": 0.11736585944890976, "logits/rejected": 0.19134798645973206, "logps/chosen": -6.4159440994262695, "logps/rejected": -7.565770149230957, "loss": 0.4288, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -6.4159440994262695, "rewards/margins": 1.1498258113861084, "rewards/rejected": -7.565770149230957, "step": 3725 }, { "epoch": 1.9963204549255729, "grad_norm": 19.90368557041769, "learning_rate": 3.037411959313936e-07, "logits/chosen": 0.18977683782577515, "logits/rejected": 0.2764529287815094, "logps/chosen": -6.380740165710449, "logps/rejected": -7.380450248718262, "loss": 0.4705, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -6.380740165710449, "rewards/margins": 0.9997096061706543, "rewards/rejected": -7.380450248718262, "step": 3730 }, { "epoch": 1.9989964877069744, "grad_norm": 24.512457428106682, "learning_rate": 3.023097377803224e-07, "logits/chosen": 0.2059437334537506, "logits/rejected": 0.26462262868881226, "logps/chosen": -6.545748233795166, "logps/rejected": -7.490653038024902, "loss": 0.5372, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -6.545748233795166, "rewards/margins": 0.9449056386947632, "rewards/rejected": -7.490653038024902, "step": 3735 }, { "epoch": 2.001672520488376, "grad_norm": 19.984404313535737, "learning_rate": 3.008801976210423e-07, "logits/chosen": 0.24294297397136688, "logits/rejected": 0.2845238149166107, "logps/chosen": -6.624716281890869, "logps/rejected": -7.566445827484131, "loss": 0.4681, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -6.624716281890869, "rewards/margins": 0.9417294263839722, "rewards/rejected": -7.566445827484131, "step": 3740 }, { "epoch": 2.0043485532697773, "grad_norm": 18.68971671325928, "learning_rate": 2.994525893229581e-07, "logits/chosen": 0.20279259979724884, "logits/rejected": 0.23716776072978973, "logps/chosen": -6.516075134277344, "logps/rejected": -7.833211421966553, "loss": 0.3724, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -6.516075134277344, "rewards/margins": 1.3171364068984985, "rewards/rejected": -7.833211421966553, "step": 3745 }, { "epoch": 2.007024586051179, "grad_norm": 15.919065890699846, "learning_rate": 2.98026926736732e-07, "logits/chosen": 0.12718412280082703, "logits/rejected": 0.19841906428337097, "logps/chosen": -6.290389537811279, "logps/rejected": -7.515704154968262, "loss": 0.4105, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -6.290389537811279, "rewards/margins": 1.2253153324127197, "rewards/rejected": -7.515704154968262, "step": 3750 }, { "epoch": 2.0097006188325808, "grad_norm": 13.545846052967303, "learning_rate": 2.9660322369414846e-07, "logits/chosen": 0.1779874563217163, "logits/rejected": 0.2717987596988678, "logps/chosen": -6.562331199645996, "logps/rejected": -7.84018087387085, "loss": 0.3834, "rewards/accuracies": 0.8125, "rewards/chosen": -6.562331199645996, "rewards/margins": 1.277849793434143, "rewards/rejected": -7.84018087387085, "step": 3755 }, { "epoch": 2.0123766516139825, "grad_norm": 14.044711718207182, "learning_rate": 2.9518149400798063e-07, "logits/chosen": 0.16214436292648315, "logits/rejected": 0.2133154571056366, "logps/chosen": -6.7123823165893555, "logps/rejected": -8.115591049194336, "loss": 0.3962, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -6.7123823165893555, "rewards/margins": 1.4032083749771118, "rewards/rejected": -8.115591049194336, "step": 3760 }, { "epoch": 2.0150526843953838, "grad_norm": 23.51849586834621, "learning_rate": 2.9376175147185633e-07, "logits/chosen": 0.15311595797538757, "logits/rejected": 0.27384883165359497, "logps/chosen": -6.929312705993652, "logps/rejected": -8.16248893737793, "loss": 0.4401, "rewards/accuracies": 0.78125, "rewards/chosen": -6.929312705993652, "rewards/margins": 1.2331759929656982, "rewards/rejected": -8.16248893737793, "step": 3765 }, { "epoch": 2.0177287171767855, "grad_norm": 28.312500035421433, "learning_rate": 2.9234400986012376e-07, "logits/chosen": 0.10491786897182465, "logits/rejected": 0.2064250409603119, "logps/chosen": -6.7369890213012695, "logps/rejected": -8.129039764404297, "loss": 0.4004, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -6.7369890213012695, "rewards/margins": 1.392051100730896, "rewards/rejected": -8.129039764404297, "step": 3770 }, { "epoch": 2.020404749958187, "grad_norm": 20.814762527741113, "learning_rate": 2.9092828292771817e-07, "logits/chosen": 0.1984149068593979, "logits/rejected": 0.22802993655204773, "logps/chosen": -6.864100456237793, "logps/rejected": -8.15636920928955, "loss": 0.4104, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -6.864100456237793, "rewards/margins": 1.2922685146331787, "rewards/rejected": -8.15636920928955, "step": 3775 }, { "epoch": 2.0230807827395885, "grad_norm": 15.692065540469166, "learning_rate": 2.8951458441002875e-07, "logits/chosen": 0.1788557469844818, "logits/rejected": 0.22720523178577423, "logps/chosen": -6.66799783706665, "logps/rejected": -7.976343631744385, "loss": 0.4052, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -6.66799783706665, "rewards/margins": 1.308345079421997, "rewards/rejected": -7.976343631744385, "step": 3780 }, { "epoch": 2.02575681552099, "grad_norm": 17.0978163933742, "learning_rate": 2.881029280227643e-07, "logits/chosen": 0.19646182656288147, "logits/rejected": 0.3076898157596588, "logps/chosen": -6.88651180267334, "logps/rejected": -8.155689239501953, "loss": 0.4185, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -6.88651180267334, "rewards/margins": 1.269178032875061, "rewards/rejected": -8.155689239501953, "step": 3785 }, { "epoch": 2.028432848302392, "grad_norm": 13.744591472796394, "learning_rate": 2.8669332746182177e-07, "logits/chosen": 0.13107994198799133, "logits/rejected": 0.23754458129405975, "logps/chosen": -6.7966156005859375, "logps/rejected": -8.194231033325195, "loss": 0.3921, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -6.7966156005859375, "rewards/margins": 1.3976160287857056, "rewards/rejected": -8.194231033325195, "step": 3790 }, { "epoch": 2.031108881083793, "grad_norm": 22.44972544980934, "learning_rate": 2.8528579640315156e-07, "logits/chosen": 0.2209732085466385, "logits/rejected": 0.260599821805954, "logps/chosen": -6.637648582458496, "logps/rejected": -7.802105903625488, "loss": 0.4355, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -6.637648582458496, "rewards/margins": 1.1644564867019653, "rewards/rejected": -7.802105903625488, "step": 3795 }, { "epoch": 2.033784913865195, "grad_norm": 22.40860826846662, "learning_rate": 2.8388034850262646e-07, "logits/chosen": 0.16404923796653748, "logits/rejected": 0.2670738101005554, "logps/chosen": -6.7091240882873535, "logps/rejected": -7.979203224182129, "loss": 0.4174, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -6.7091240882873535, "rewards/margins": 1.270079493522644, "rewards/rejected": -7.979203224182129, "step": 3800 }, { "epoch": 2.0364609466465966, "grad_norm": 25.789912094686024, "learning_rate": 2.824769973959079e-07, "logits/chosen": 0.21405735611915588, "logits/rejected": 0.32851654291152954, "logps/chosen": -6.871840000152588, "logps/rejected": -8.093363761901855, "loss": 0.4058, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -6.871840000152588, "rewards/margins": 1.2215242385864258, "rewards/rejected": -8.093363761901855, "step": 3805 }, { "epoch": 2.039136979427998, "grad_norm": 17.835773720459418, "learning_rate": 2.81075756698315e-07, "logits/chosen": 0.21477186679840088, "logits/rejected": 0.29332786798477173, "logps/chosen": -6.618260860443115, "logps/rejected": -7.978990077972412, "loss": 0.3812, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -6.618260860443115, "rewards/margins": 1.3607289791107178, "rewards/rejected": -7.978990077972412, "step": 3810 }, { "epoch": 2.0418130122093996, "grad_norm": 19.23496370990568, "learning_rate": 2.7967664000469035e-07, "logits/chosen": 0.14592570066452026, "logits/rejected": 0.22034184634685516, "logps/chosen": -6.82195520401001, "logps/rejected": -8.154292106628418, "loss": 0.3586, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -6.82195520401001, "rewards/margins": 1.3323371410369873, "rewards/rejected": -8.154292106628418, "step": 3815 }, { "epoch": 2.0444890449908013, "grad_norm": 17.7387053985562, "learning_rate": 2.7827966088927095e-07, "logits/chosen": 0.16201874613761902, "logits/rejected": 0.30555272102355957, "logps/chosen": -7.005523681640625, "logps/rejected": -8.340397834777832, "loss": 0.3854, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -7.005523681640625, "rewards/margins": 1.3348742723464966, "rewards/rejected": -8.340397834777832, "step": 3820 }, { "epoch": 2.0471650777722026, "grad_norm": 17.367201982175303, "learning_rate": 2.768848329055538e-07, "logits/chosen": 0.24481698870658875, "logits/rejected": 0.28893136978149414, "logps/chosen": -6.7548041343688965, "logps/rejected": -8.176729202270508, "loss": 0.3578, "rewards/accuracies": 0.875, "rewards/chosen": -6.7548041343688965, "rewards/margins": 1.4219236373901367, "rewards/rejected": -8.176729202270508, "step": 3825 }, { "epoch": 2.0498411105536043, "grad_norm": 21.375498447548186, "learning_rate": 2.7549216958616657e-07, "logits/chosen": 0.19555334746837616, "logits/rejected": 0.2984967827796936, "logps/chosen": -7.206672668457031, "logps/rejected": -8.610469818115234, "loss": 0.3848, "rewards/accuracies": 0.8125, "rewards/chosen": -7.206672668457031, "rewards/margins": 1.4037978649139404, "rewards/rejected": -8.610469818115234, "step": 3830 }, { "epoch": 2.052517143335006, "grad_norm": 13.954861673095726, "learning_rate": 2.741016844427344e-07, "logits/chosen": 0.21182803809642792, "logits/rejected": 0.3090684711933136, "logps/chosen": -7.080285549163818, "logps/rejected": -8.478662490844727, "loss": 0.3654, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -7.080285549163818, "rewards/margins": 1.3983768224716187, "rewards/rejected": -8.478662490844727, "step": 3835 }, { "epoch": 2.0551931761164073, "grad_norm": 19.286538122627856, "learning_rate": 2.7271339096575073e-07, "logits/chosen": 0.29053154587745667, "logits/rejected": 0.3705064356327057, "logps/chosen": -6.82970666885376, "logps/rejected": -8.157980918884277, "loss": 0.3894, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -6.82970666885376, "rewards/margins": 1.3282746076583862, "rewards/rejected": -8.157980918884277, "step": 3840 }, { "epoch": 2.057869208897809, "grad_norm": 16.198921324358647, "learning_rate": 2.713273026244446e-07, "logits/chosen": 0.24077287316322327, "logits/rejected": 0.3663024306297302, "logps/chosen": -7.2091779708862305, "logps/rejected": -8.676916122436523, "loss": 0.3511, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -7.2091779708862305, "rewards/margins": 1.4677376747131348, "rewards/rejected": -8.676916122436523, "step": 3845 }, { "epoch": 2.0605452416792107, "grad_norm": 19.462117094425622, "learning_rate": 2.6994343286665156e-07, "logits/chosen": 0.28788474202156067, "logits/rejected": 0.3955186903476715, "logps/chosen": -7.373469352722168, "logps/rejected": -8.527563095092773, "loss": 0.4325, "rewards/accuracies": 0.78125, "rewards/chosen": -7.373469352722168, "rewards/margins": 1.154093861579895, "rewards/rejected": -8.527563095092773, "step": 3850 }, { "epoch": 2.063221274460612, "grad_norm": 22.72330561061412, "learning_rate": 2.6856179511868156e-07, "logits/chosen": 0.3011389970779419, "logits/rejected": 0.3851096034049988, "logps/chosen": -7.197577476501465, "logps/rejected": -8.778141975402832, "loss": 0.4042, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -7.197577476501465, "rewards/margins": 1.5805647373199463, "rewards/rejected": -8.778141975402832, "step": 3855 }, { "epoch": 2.0658973072420137, "grad_norm": 20.93653362370402, "learning_rate": 2.6718240278519056e-07, "logits/chosen": 0.337589830160141, "logits/rejected": 0.3954477906227112, "logps/chosen": -7.3479905128479, "logps/rejected": -8.790702819824219, "loss": 0.3788, "rewards/accuracies": 0.84375, "rewards/chosen": -7.3479905128479, "rewards/margins": 1.4427111148834229, "rewards/rejected": -8.790702819824219, "step": 3860 }, { "epoch": 2.0685733400234154, "grad_norm": 21.059330404863864, "learning_rate": 2.6580526924904866e-07, "logits/chosen": 0.19513832032680511, "logits/rejected": 0.2786014676094055, "logps/chosen": -7.086444854736328, "logps/rejected": -8.478320121765137, "loss": 0.367, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -7.086444854736328, "rewards/margins": 1.3918753862380981, "rewards/rejected": -8.478320121765137, "step": 3865 }, { "epoch": 2.0712493728048167, "grad_norm": 22.22096002820016, "learning_rate": 2.6443040787121186e-07, "logits/chosen": 0.1702469438314438, "logits/rejected": 0.235189750790596, "logps/chosen": -7.0696868896484375, "logps/rejected": -8.350391387939453, "loss": 0.4177, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -7.0696868896484375, "rewards/margins": 1.2807053327560425, "rewards/rejected": -8.350391387939453, "step": 3870 }, { "epoch": 2.0739254055862184, "grad_norm": 21.70280832213852, "learning_rate": 2.6305783199059084e-07, "logits/chosen": 0.28480881452560425, "logits/rejected": 0.3594028353691101, "logps/chosen": -7.272819519042969, "logps/rejected": -8.633054733276367, "loss": 0.425, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -7.272819519042969, "rewards/margins": 1.3602343797683716, "rewards/rejected": -8.633054733276367, "step": 3875 }, { "epoch": 2.07660143836762, "grad_norm": 20.945527542249387, "learning_rate": 2.6168755492392324e-07, "logits/chosen": 0.29719698429107666, "logits/rejected": 0.3829507529735565, "logps/chosen": -6.966008186340332, "logps/rejected": -8.556936264038086, "loss": 0.331, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -6.966008186340332, "rewards/margins": 1.5909273624420166, "rewards/rejected": -8.556936264038086, "step": 3880 }, { "epoch": 2.0792774711490214, "grad_norm": 22.893723963065803, "learning_rate": 2.6031958996564274e-07, "logits/chosen": 0.29260966181755066, "logits/rejected": 0.3335273265838623, "logps/chosen": -7.07828426361084, "logps/rejected": -8.606851577758789, "loss": 0.38, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -7.07828426361084, "rewards/margins": 1.5285664796829224, "rewards/rejected": -8.606851577758789, "step": 3885 }, { "epoch": 2.081953503930423, "grad_norm": 28.65659063784423, "learning_rate": 2.589539503877518e-07, "logits/chosen": 0.29898813366889954, "logits/rejected": 0.35525014996528625, "logps/chosen": -7.289124965667725, "logps/rejected": -8.659504890441895, "loss": 0.4046, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -7.289124965667725, "rewards/margins": 1.3703796863555908, "rewards/rejected": -8.659504890441895, "step": 3890 }, { "epoch": 2.084629536711825, "grad_norm": 18.24665927335132, "learning_rate": 2.5759064943969125e-07, "logits/chosen": 0.278289258480072, "logits/rejected": 0.3985983729362488, "logps/chosen": -7.285027503967285, "logps/rejected": -8.726434707641602, "loss": 0.391, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -7.285027503967285, "rewards/margins": 1.4414079189300537, "rewards/rejected": -8.726434707641602, "step": 3895 }, { "epoch": 2.087305569493226, "grad_norm": 21.347201131452916, "learning_rate": 2.562297003482131e-07, "logits/chosen": 0.3559954762458801, "logits/rejected": 0.39463549852371216, "logps/chosen": -7.281728267669678, "logps/rejected": -8.676322937011719, "loss": 0.3807, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -7.281728267669678, "rewards/margins": 1.3945951461791992, "rewards/rejected": -8.676322937011719, "step": 3900 }, { "epoch": 2.089981602274628, "grad_norm": 19.013770103989557, "learning_rate": 2.548711163172512e-07, "logits/chosen": 0.3564327359199524, "logits/rejected": 0.42117422819137573, "logps/chosen": -7.483202934265137, "logps/rejected": -8.816009521484375, "loss": 0.4258, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -7.483202934265137, "rewards/margins": 1.3328053951263428, "rewards/rejected": -8.816009521484375, "step": 3905 }, { "epoch": 2.0926576350560295, "grad_norm": 17.59520920937043, "learning_rate": 2.53514910527794e-07, "logits/chosen": 0.3418017625808716, "logits/rejected": 0.40759143233299255, "logps/chosen": -7.094291687011719, "logps/rejected": -8.409260749816895, "loss": 0.3849, "rewards/accuracies": 0.8125, "rewards/chosen": -7.094291687011719, "rewards/margins": 1.3149694204330444, "rewards/rejected": -8.409260749816895, "step": 3910 }, { "epoch": 2.095333667837431, "grad_norm": 23.558597313073548, "learning_rate": 2.5216109613775573e-07, "logits/chosen": 0.3155134916305542, "logits/rejected": 0.40043672919273376, "logps/chosen": -7.457625389099121, "logps/rejected": -8.678041458129883, "loss": 0.4627, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -7.457625389099121, "rewards/margins": 1.22041654586792, "rewards/rejected": -8.678041458129883, "step": 3915 }, { "epoch": 2.0980097006188325, "grad_norm": 17.671149070952687, "learning_rate": 2.5080968628184993e-07, "logits/chosen": 0.2696431577205658, "logits/rejected": 0.3848080635070801, "logps/chosen": -7.120835781097412, "logps/rejected": -8.772043228149414, "loss": 0.3444, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -7.120835781097412, "rewards/margins": 1.6512069702148438, "rewards/rejected": -8.772043228149414, "step": 3920 }, { "epoch": 2.1006857334002342, "grad_norm": 16.865439519421884, "learning_rate": 2.494606940714605e-07, "logits/chosen": 0.31801673769950867, "logits/rejected": 0.3613842725753784, "logps/chosen": -7.077091217041016, "logps/rejected": -8.580062866210938, "loss": 0.3704, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -7.077091217041016, "rewards/margins": 1.5029720067977905, "rewards/rejected": -8.580062866210938, "step": 3925 }, { "epoch": 2.103361766181636, "grad_norm": 17.57535405790568, "learning_rate": 2.4811413259451625e-07, "logits/chosen": 0.2882600426673889, "logits/rejected": 0.4006773829460144, "logps/chosen": -7.268555641174316, "logps/rejected": -8.799579620361328, "loss": 0.3791, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -7.268555641174316, "rewards/margins": 1.5310240983963013, "rewards/rejected": -8.799579620361328, "step": 3930 }, { "epoch": 2.106037798963037, "grad_norm": 19.30653193933471, "learning_rate": 2.46770014915362e-07, "logits/chosen": 0.32759007811546326, "logits/rejected": 0.4152456820011139, "logps/chosen": -7.290339469909668, "logps/rejected": -8.706626892089844, "loss": 0.3997, "rewards/accuracies": 0.8125, "rewards/chosen": -7.290339469909668, "rewards/margins": 1.4162886142730713, "rewards/rejected": -8.706626892089844, "step": 3935 }, { "epoch": 2.108713831744439, "grad_norm": 21.902865993325936, "learning_rate": 2.45428354074634e-07, "logits/chosen": 0.3168705105781555, "logits/rejected": 0.3539208769798279, "logps/chosen": -7.406926155090332, "logps/rejected": -8.826581954956055, "loss": 0.4299, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -7.406926155090332, "rewards/margins": 1.419655203819275, "rewards/rejected": -8.826581954956055, "step": 3940 }, { "epoch": 2.1113898645258407, "grad_norm": 24.825128295083466, "learning_rate": 2.4408916308913105e-07, "logits/chosen": 0.2690446376800537, "logits/rejected": 0.34987881779670715, "logps/chosen": -7.617909908294678, "logps/rejected": -8.722278594970703, "loss": 0.4791, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -7.617909908294678, "rewards/margins": 1.1043680906295776, "rewards/rejected": -8.722278594970703, "step": 3945 }, { "epoch": 2.114065897307242, "grad_norm": 27.09576120150009, "learning_rate": 2.4275245495169025e-07, "logits/chosen": 0.3692486882209778, "logits/rejected": 0.47970151901245117, "logps/chosen": -7.289440155029297, "logps/rejected": -8.897256851196289, "loss": 0.3626, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -7.289440155029297, "rewards/margins": 1.6078166961669922, "rewards/rejected": -8.897256851196289, "step": 3950 }, { "epoch": 2.1167419300886436, "grad_norm": 24.196124522278275, "learning_rate": 2.414182426310597e-07, "logits/chosen": 0.26486772298812866, "logits/rejected": 0.30557340383529663, "logps/chosen": -7.209486484527588, "logps/rejected": -8.764165878295898, "loss": 0.3875, "rewards/accuracies": 0.84375, "rewards/chosen": -7.209486484527588, "rewards/margins": 1.554681420326233, "rewards/rejected": -8.764165878295898, "step": 3955 }, { "epoch": 2.1194179628700454, "grad_norm": 14.083708551265351, "learning_rate": 2.400865390717734e-07, "logits/chosen": 0.3150179386138916, "logits/rejected": 0.40604129433631897, "logps/chosen": -7.19879674911499, "logps/rejected": -8.958131790161133, "loss": 0.3384, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -7.19879674911499, "rewards/margins": 1.7593348026275635, "rewards/rejected": -8.958131790161133, "step": 3960 }, { "epoch": 2.1220939956514466, "grad_norm": 21.457101419089092, "learning_rate": 2.3875735719402475e-07, "logits/chosen": 0.3123451769351959, "logits/rejected": 0.4002881944179535, "logps/chosen": -7.5686140060424805, "logps/rejected": -9.067090034484863, "loss": 0.3824, "rewards/accuracies": 0.8125, "rewards/chosen": -7.5686140060424805, "rewards/margins": 1.4984757900238037, "rewards/rejected": -9.067090034484863, "step": 3965 }, { "epoch": 2.1247700284328483, "grad_norm": 18.8575231099115, "learning_rate": 2.3743070989354258e-07, "logits/chosen": 0.3650689721107483, "logits/rejected": 0.44167470932006836, "logps/chosen": -7.283629417419434, "logps/rejected": -8.816935539245605, "loss": 0.4128, "rewards/accuracies": 0.78125, "rewards/chosen": -7.283629417419434, "rewards/margins": 1.5333067178726196, "rewards/rejected": -8.816935539245605, "step": 3970 }, { "epoch": 2.12744606121425, "grad_norm": 23.031596627791238, "learning_rate": 2.3610661004146454e-07, "logits/chosen": 0.36233842372894287, "logits/rejected": 0.43186140060424805, "logps/chosen": -7.1493096351623535, "logps/rejected": -8.53859806060791, "loss": 0.3702, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -7.1493096351623535, "rewards/margins": 1.3892881870269775, "rewards/rejected": -8.53859806060791, "step": 3975 }, { "epoch": 2.1301220939956513, "grad_norm": 18.846719868182316, "learning_rate": 2.3478507048421314e-07, "logits/chosen": 0.28360021114349365, "logits/rejected": 0.3200862407684326, "logps/chosen": -7.145040035247803, "logps/rejected": -8.613717079162598, "loss": 0.4134, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -7.145040035247803, "rewards/margins": 1.468676209449768, "rewards/rejected": -8.613717079162598, "step": 3980 }, { "epoch": 2.132798126777053, "grad_norm": 28.616221223530843, "learning_rate": 2.334661040433713e-07, "logits/chosen": 0.23798814415931702, "logits/rejected": 0.3087126612663269, "logps/chosen": -7.063495635986328, "logps/rejected": -8.533388137817383, "loss": 0.3838, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -7.063495635986328, "rewards/margins": 1.4698925018310547, "rewards/rejected": -8.533388137817383, "step": 3985 }, { "epoch": 2.1354741595584548, "grad_norm": 18.303202545513322, "learning_rate": 2.321497235155568e-07, "logits/chosen": 0.24652013182640076, "logits/rejected": 0.31738215684890747, "logps/chosen": -7.087806701660156, "logps/rejected": -8.691329002380371, "loss": 0.342, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -7.087806701660156, "rewards/margins": 1.6035234928131104, "rewards/rejected": -8.691329002380371, "step": 3990 }, { "epoch": 2.138150192339856, "grad_norm": 36.62265891692267, "learning_rate": 2.3083594167229965e-07, "logits/chosen": 0.2635677754878998, "logits/rejected": 0.40870967507362366, "logps/chosen": -7.407968044281006, "logps/rejected": -8.812593460083008, "loss": 0.4257, "rewards/accuracies": 0.78125, "rewards/chosen": -7.407968044281006, "rewards/margins": 1.4046257734298706, "rewards/rejected": -8.812593460083008, "step": 3995 }, { "epoch": 2.1408262251212578, "grad_norm": 23.77792858859415, "learning_rate": 2.295247712599167e-07, "logits/chosen": 0.30186787247657776, "logits/rejected": 0.33435964584350586, "logps/chosen": -7.1621246337890625, "logps/rejected": -8.6127347946167, "loss": 0.3968, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -7.1621246337890625, "rewards/margins": 1.4506099224090576, "rewards/rejected": -8.6127347946167, "step": 4000 }, { "epoch": 2.1408262251212578, "eval_logits/chosen": 0.5128344893455505, "eval_logits/rejected": 0.5677589774131775, "eval_logps/chosen": -7.492998123168945, "eval_logps/rejected": -8.627630233764648, "eval_loss": 0.5234202742576599, "eval_rewards/accuracies": 0.7247774600982666, "eval_rewards/chosen": -7.492998123168945, "eval_rewards/margins": 1.1346323490142822, "eval_rewards/rejected": -8.627630233764648, "eval_runtime": 40.5062, "eval_samples_per_second": 33.205, "eval_steps_per_second": 8.32, "step": 4000 }, { "epoch": 2.1435022579026595, "grad_norm": 20.586128886952075, "learning_rate": 2.2821622499938948e-07, "logits/chosen": 0.3701968789100647, "logits/rejected": 0.46226710081100464, "logps/chosen": -7.652381896972656, "logps/rejected": -8.887216567993164, "loss": 0.4484, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -7.652381896972656, "rewards/margins": 1.2348355054855347, "rewards/rejected": -8.887216567993164, "step": 4005 }, { "epoch": 2.1461782906840607, "grad_norm": 24.821011620009557, "learning_rate": 2.269103155862391e-07, "logits/chosen": 0.28518763184547424, "logits/rejected": 0.3683343529701233, "logps/chosen": -7.406941890716553, "logps/rejected": -8.689632415771484, "loss": 0.4347, "rewards/accuracies": 0.78125, "rewards/chosen": -7.406941890716553, "rewards/margins": 1.2826902866363525, "rewards/rejected": -8.689632415771484, "step": 4010 }, { "epoch": 2.1488543234654625, "grad_norm": 21.428032428402247, "learning_rate": 2.2560705569040483e-07, "logits/chosen": 0.2812894284725189, "logits/rejected": 0.3923163414001465, "logps/chosen": -7.40716028213501, "logps/rejected": -8.658193588256836, "loss": 0.4301, "rewards/accuracies": 0.78125, "rewards/chosen": -7.40716028213501, "rewards/margins": 1.2510334253311157, "rewards/rejected": -8.658193588256836, "step": 4015 }, { "epoch": 2.151530356246864, "grad_norm": 17.795825647586035, "learning_rate": 2.2430645795611963e-07, "logits/chosen": 0.23884764313697815, "logits/rejected": 0.3389735817909241, "logps/chosen": -7.396805763244629, "logps/rejected": -8.817538261413574, "loss": 0.388, "rewards/accuracies": 0.8125, "rewards/chosen": -7.396805763244629, "rewards/margins": 1.4207324981689453, "rewards/rejected": -8.817538261413574, "step": 4020 }, { "epoch": 2.1542063890282654, "grad_norm": 27.083489061145542, "learning_rate": 2.230085350017884e-07, "logits/chosen": 0.2747723460197449, "logits/rejected": 0.3228207528591156, "logps/chosen": -7.0380988121032715, "logps/rejected": -8.227170944213867, "loss": 0.4598, "rewards/accuracies": 0.78125, "rewards/chosen": -7.0380988121032715, "rewards/margins": 1.1890724897384644, "rewards/rejected": -8.227170944213867, "step": 4025 }, { "epoch": 2.156882421809667, "grad_norm": 17.922502946123434, "learning_rate": 2.2171329941986554e-07, "logits/chosen": 0.2208956480026245, "logits/rejected": 0.29901427030563354, "logps/chosen": -6.916403770446777, "logps/rejected": -8.511252403259277, "loss": 0.3406, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -6.916403770446777, "rewards/margins": 1.5948477983474731, "rewards/rejected": -8.511252403259277, "step": 4030 }, { "epoch": 2.159558454591069, "grad_norm": 16.674011562377768, "learning_rate": 2.2042076377673202e-07, "logits/chosen": 0.2774532437324524, "logits/rejected": 0.2912542223930359, "logps/chosen": -6.946537971496582, "logps/rejected": -8.198385238647461, "loss": 0.4079, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -6.946537971496582, "rewards/margins": 1.2518484592437744, "rewards/rejected": -8.198385238647461, "step": 4035 }, { "epoch": 2.16223448737247, "grad_norm": 21.429529859461816, "learning_rate": 2.1913094061257476e-07, "logits/chosen": 0.2737177312374115, "logits/rejected": 0.25447797775268555, "logps/chosen": -7.059529781341553, "logps/rejected": -8.295661926269531, "loss": 0.4183, "rewards/accuracies": 0.84375, "rewards/chosen": -7.059529781341553, "rewards/margins": 1.2361325025558472, "rewards/rejected": -8.295661926269531, "step": 4040 }, { "epoch": 2.164910520153872, "grad_norm": 23.474484893331383, "learning_rate": 2.178438424412633e-07, "logits/chosen": 0.28978613018989563, "logits/rejected": 0.3645782470703125, "logps/chosen": -6.959961891174316, "logps/rejected": -8.186089515686035, "loss": 0.4244, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -6.959961891174316, "rewards/margins": 1.2261273860931396, "rewards/rejected": -8.186089515686035, "step": 4045 }, { "epoch": 2.1675865529352736, "grad_norm": 28.045042735429245, "learning_rate": 2.165594817502302e-07, "logits/chosen": 0.2586674988269806, "logits/rejected": 0.32604771852493286, "logps/chosen": -7.326498508453369, "logps/rejected": -8.368438720703125, "loss": 0.4911, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -7.326498508453369, "rewards/margins": 1.0419405698776245, "rewards/rejected": -8.368438720703125, "step": 4050 }, { "epoch": 2.170262585716675, "grad_norm": 20.05598382024633, "learning_rate": 2.1527787100034806e-07, "logits/chosen": 0.30128225684165955, "logits/rejected": 0.3545406758785248, "logps/chosen": -6.983913421630859, "logps/rejected": -8.093557357788086, "loss": 0.4342, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -6.983913421630859, "rewards/margins": 1.1096433401107788, "rewards/rejected": -8.093557357788086, "step": 4055 }, { "epoch": 2.1729386184980766, "grad_norm": 18.388401657822204, "learning_rate": 2.1399902262581037e-07, "logits/chosen": 0.359287291765213, "logits/rejected": 0.4251515865325928, "logps/chosen": -6.8913397789001465, "logps/rejected": -8.172552108764648, "loss": 0.432, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -6.8913397789001465, "rewards/margins": 1.2812113761901855, "rewards/rejected": -8.172552108764648, "step": 4060 }, { "epoch": 2.1756146512794783, "grad_norm": 21.255397831300876, "learning_rate": 2.127229490340094e-07, "logits/chosen": 0.20232252776622772, "logits/rejected": 0.23142214119434357, "logps/chosen": -6.989743709564209, "logps/rejected": -8.504972457885742, "loss": 0.3687, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -6.989743709564209, "rewards/margins": 1.5152287483215332, "rewards/rejected": -8.504972457885742, "step": 4065 }, { "epoch": 2.1782906840608796, "grad_norm": 24.966680137310647, "learning_rate": 2.1144966260541698e-07, "logits/chosen": 0.33680111169815063, "logits/rejected": 0.4156359136104584, "logps/chosen": -7.108415126800537, "logps/rejected": -8.596893310546875, "loss": 0.4105, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -7.108415126800537, "rewards/margins": 1.488478422164917, "rewards/rejected": -8.596893310546875, "step": 4070 }, { "epoch": 2.1809667168422813, "grad_norm": 17.83498550339734, "learning_rate": 2.1017917569346332e-07, "logits/chosen": 0.2590417265892029, "logits/rejected": 0.3429079055786133, "logps/chosen": -7.071276664733887, "logps/rejected": -8.44837760925293, "loss": 0.3902, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -7.071276664733887, "rewards/margins": 1.3771021366119385, "rewards/rejected": -8.44837760925293, "step": 4075 }, { "epoch": 2.183642749623683, "grad_norm": 19.426977353768425, "learning_rate": 2.0891150062441837e-07, "logits/chosen": 0.24587580561637878, "logits/rejected": 0.31630924344062805, "logps/chosen": -7.2105607986450195, "logps/rejected": -8.677949905395508, "loss": 0.3832, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -7.2105607986450195, "rewards/margins": 1.4673891067504883, "rewards/rejected": -8.677949905395508, "step": 4080 }, { "epoch": 2.1863187824050843, "grad_norm": 19.59563362089964, "learning_rate": 2.0764664969727086e-07, "logits/chosen": 0.2676798701286316, "logits/rejected": 0.3478240370750427, "logps/chosen": -6.980031490325928, "logps/rejected": -8.363582611083984, "loss": 0.3589, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -6.980031490325928, "rewards/margins": 1.3835513591766357, "rewards/rejected": -8.363582611083984, "step": 4085 }, { "epoch": 2.188994815186486, "grad_norm": 16.883645680479848, "learning_rate": 2.0638463518361033e-07, "logits/chosen": 0.21074536442756653, "logits/rejected": 0.326322466135025, "logps/chosen": -7.063347816467285, "logps/rejected": -8.50495433807373, "loss": 0.3716, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -7.063347816467285, "rewards/margins": 1.441607117652893, "rewards/rejected": -8.50495433807373, "step": 4090 }, { "epoch": 2.1916708479678877, "grad_norm": 23.133419023671703, "learning_rate": 2.0512546932750702e-07, "logits/chosen": 0.22777286171913147, "logits/rejected": 0.28445711731910706, "logps/chosen": -7.15268087387085, "logps/rejected": -8.482824325561523, "loss": 0.371, "rewards/accuracies": 0.84375, "rewards/chosen": -7.15268087387085, "rewards/margins": 1.3301441669464111, "rewards/rejected": -8.482824325561523, "step": 4095 }, { "epoch": 2.194346880749289, "grad_norm": 23.748512675706785, "learning_rate": 2.0386916434539343e-07, "logits/chosen": 0.24273300170898438, "logits/rejected": 0.31494542956352234, "logps/chosen": -6.956430912017822, "logps/rejected": -8.45906925201416, "loss": 0.3525, "rewards/accuracies": 0.90625, "rewards/chosen": -6.956430912017822, "rewards/margins": 1.5026376247406006, "rewards/rejected": -8.45906925201416, "step": 4100 }, { "epoch": 2.1970229135306907, "grad_norm": 22.20014668260631, "learning_rate": 2.0261573242594627e-07, "logits/chosen": 0.3220357894897461, "logits/rejected": 0.42692071199417114, "logps/chosen": -7.4511518478393555, "logps/rejected": -8.774401664733887, "loss": 0.4103, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -7.4511518478393555, "rewards/margins": 1.3232495784759521, "rewards/rejected": -8.774401664733887, "step": 4105 }, { "epoch": 2.1996989463120924, "grad_norm": 26.956902114121657, "learning_rate": 2.0136518572996724e-07, "logits/chosen": 0.26610884070396423, "logits/rejected": 0.39333176612854004, "logps/chosen": -7.101998805999756, "logps/rejected": -8.667492866516113, "loss": 0.3652, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -7.101998805999756, "rewards/margins": 1.565492868423462, "rewards/rejected": -8.667492866516113, "step": 4110 }, { "epoch": 2.202374979093494, "grad_norm": 21.288321171456026, "learning_rate": 2.0011753639026617e-07, "logits/chosen": 0.30251285433769226, "logits/rejected": 0.3677985966205597, "logps/chosen": -7.283047676086426, "logps/rejected": -8.59282398223877, "loss": 0.4031, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -7.283047676086426, "rewards/margins": 1.3097761869430542, "rewards/rejected": -8.59282398223877, "step": 4115 }, { "epoch": 2.2050510118748954, "grad_norm": 25.02677979129819, "learning_rate": 1.988727965115421e-07, "logits/chosen": 0.3021322190761566, "logits/rejected": 0.336160808801651, "logps/chosen": -7.058053493499756, "logps/rejected": -8.484304428100586, "loss": 0.3772, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -7.058053493499756, "rewards/margins": 1.4262504577636719, "rewards/rejected": -8.484304428100586, "step": 4120 }, { "epoch": 2.207727044656297, "grad_norm": 18.941052832747378, "learning_rate": 1.9763097817026713e-07, "logits/chosen": 0.28010308742523193, "logits/rejected": 0.37843847274780273, "logps/chosen": -7.177609443664551, "logps/rejected": -8.816247940063477, "loss": 0.3387, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -7.177609443664551, "rewards/margins": 1.6386394500732422, "rewards/rejected": -8.816247940063477, "step": 4125 }, { "epoch": 2.210403077437699, "grad_norm": 18.04830956633701, "learning_rate": 1.9639209341456796e-07, "logits/chosen": 0.28597843647003174, "logits/rejected": 0.3539894223213196, "logps/chosen": -7.2694411277771, "logps/rejected": -8.657193183898926, "loss": 0.3985, "rewards/accuracies": 0.8125, "rewards/chosen": -7.2694411277771, "rewards/margins": 1.387751817703247, "rewards/rejected": -8.657193183898926, "step": 4130 }, { "epoch": 2.2130791102191, "grad_norm": 16.911188776088043, "learning_rate": 1.951561542641102e-07, "logits/chosen": 0.3289716839790344, "logits/rejected": 0.3689533770084381, "logps/chosen": -7.2944016456604, "logps/rejected": -8.767526626586914, "loss": 0.4291, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -7.2944016456604, "rewards/margins": 1.4731240272521973, "rewards/rejected": -8.767526626586914, "step": 4135 }, { "epoch": 2.215755143000502, "grad_norm": 19.295852991335092, "learning_rate": 1.939231727099806e-07, "logits/chosen": 0.2441384494304657, "logits/rejected": 0.28995856642723083, "logps/chosen": -7.197137355804443, "logps/rejected": -8.512828826904297, "loss": 0.4274, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -7.197137355804443, "rewards/margins": 1.3156921863555908, "rewards/rejected": -8.512828826904297, "step": 4140 }, { "epoch": 2.2184311757819035, "grad_norm": 20.360327721490776, "learning_rate": 1.926931607145719e-07, "logits/chosen": 0.3853500783443451, "logits/rejected": 0.45198655128479004, "logps/chosen": -7.447711944580078, "logps/rejected": -8.777891159057617, "loss": 0.4072, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -7.447711944580078, "rewards/margins": 1.3301804065704346, "rewards/rejected": -8.777891159057617, "step": 4145 }, { "epoch": 2.221107208563305, "grad_norm": 20.93229375905249, "learning_rate": 1.9146613021146564e-07, "logits/chosen": 0.3252336084842682, "logits/rejected": 0.3746287226676941, "logps/chosen": -6.945518493652344, "logps/rejected": -8.274484634399414, "loss": 0.4146, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -6.945518493652344, "rewards/margins": 1.3289663791656494, "rewards/rejected": -8.274484634399414, "step": 4150 }, { "epoch": 2.2237832413447065, "grad_norm": 23.71715369546326, "learning_rate": 1.9024209310531736e-07, "logits/chosen": 0.40182560682296753, "logits/rejected": 0.42231717705726624, "logps/chosen": -7.349024772644043, "logps/rejected": -8.789191246032715, "loss": 0.399, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -7.349024772644043, "rewards/margins": 1.440166711807251, "rewards/rejected": -8.789191246032715, "step": 4155 }, { "epoch": 2.2264592741261082, "grad_norm": 25.223115545792268, "learning_rate": 1.890210612717401e-07, "logits/chosen": 0.31770116090774536, "logits/rejected": 0.3994571268558502, "logps/chosen": -7.256894111633301, "logps/rejected": -8.712347030639648, "loss": 0.3763, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -7.256894111633301, "rewards/margins": 1.4554524421691895, "rewards/rejected": -8.712347030639648, "step": 4160 }, { "epoch": 2.2291353069075095, "grad_norm": 24.744965290463323, "learning_rate": 1.8780304655719054e-07, "logits/chosen": 0.37890809774398804, "logits/rejected": 0.4597468972206116, "logps/chosen": -7.381892204284668, "logps/rejected": -8.970640182495117, "loss": 0.3708, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -7.381892204284668, "rewards/margins": 1.5887479782104492, "rewards/rejected": -8.970640182495117, "step": 4165 }, { "epoch": 2.231811339688911, "grad_norm": 35.51021704704494, "learning_rate": 1.865880607788523e-07, "logits/chosen": 0.4357808530330658, "logits/rejected": 0.4823951721191406, "logps/chosen": -7.215002536773682, "logps/rejected": -8.721412658691406, "loss": 0.3863, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -7.215002536773682, "rewards/margins": 1.5064113140106201, "rewards/rejected": -8.721412658691406, "step": 4170 }, { "epoch": 2.234487372470313, "grad_norm": 27.443746355209214, "learning_rate": 1.8537611572452316e-07, "logits/chosen": 0.37630370259284973, "logits/rejected": 0.41747456789016724, "logps/chosen": -7.618195533752441, "logps/rejected": -8.870119094848633, "loss": 0.4137, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -7.618195533752441, "rewards/margins": 1.2519233226776123, "rewards/rejected": -8.870119094848633, "step": 4175 }, { "epoch": 2.237163405251714, "grad_norm": 25.95475472076193, "learning_rate": 1.84167223152499e-07, "logits/chosen": 0.38739213347435, "logits/rejected": 0.469780832529068, "logps/chosen": -7.560072422027588, "logps/rejected": -8.987447738647461, "loss": 0.4048, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -7.560072422027588, "rewards/margins": 1.4273760318756104, "rewards/rejected": -8.987447738647461, "step": 4180 }, { "epoch": 2.239839438033116, "grad_norm": 26.93252270355457, "learning_rate": 1.8296139479146112e-07, "logits/chosen": 0.21999125182628632, "logits/rejected": 0.28495317697525024, "logps/chosen": -7.123342990875244, "logps/rejected": -8.598133087158203, "loss": 0.4058, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -7.123342990875244, "rewards/margins": 1.4747909307479858, "rewards/rejected": -8.598133087158203, "step": 4185 }, { "epoch": 2.2425154708145176, "grad_norm": 20.833666139652905, "learning_rate": 1.8175864234036132e-07, "logits/chosen": 0.36224880814552307, "logits/rejected": 0.3978690207004547, "logps/chosen": -7.167868137359619, "logps/rejected": -8.642248153686523, "loss": 0.4125, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -7.167868137359619, "rewards/margins": 1.4743801355361938, "rewards/rejected": -8.642248153686523, "step": 4190 }, { "epoch": 2.245191503595919, "grad_norm": 20.20560518364077, "learning_rate": 1.805589774683094e-07, "logits/chosen": 0.23001787066459656, "logits/rejected": 0.3107824921607971, "logps/chosen": -7.210622310638428, "logps/rejected": -8.481039047241211, "loss": 0.3957, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -7.210622310638428, "rewards/margins": 1.2704179286956787, "rewards/rejected": -8.481039047241211, "step": 4195 }, { "epoch": 2.2478675363773206, "grad_norm": 25.984367525325315, "learning_rate": 1.79362411814459e-07, "logits/chosen": 0.3601521849632263, "logits/rejected": 0.32500123977661133, "logps/chosen": -7.375657081604004, "logps/rejected": -8.538119316101074, "loss": 0.461, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -7.375657081604004, "rewards/margins": 1.1624622344970703, "rewards/rejected": -8.538119316101074, "step": 4200 }, { "epoch": 2.2505435691587223, "grad_norm": 21.95702061189765, "learning_rate": 1.7816895698789552e-07, "logits/chosen": 0.259951651096344, "logits/rejected": 0.3377164602279663, "logps/chosen": -7.27734375, "logps/rejected": -8.556171417236328, "loss": 0.4117, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -7.27734375, "rewards/margins": 1.2788277864456177, "rewards/rejected": -8.556171417236328, "step": 4205 }, { "epoch": 2.2532196019401236, "grad_norm": 20.350472367190825, "learning_rate": 1.7697862456752271e-07, "logits/chosen": 0.2638227939605713, "logits/rejected": 0.3472591042518616, "logps/chosen": -7.268867492675781, "logps/rejected": -8.953725814819336, "loss": 0.3475, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -7.268867492675781, "rewards/margins": 1.684857964515686, "rewards/rejected": -8.953725814819336, "step": 4210 }, { "epoch": 2.2558956347215253, "grad_norm": 20.775356220293407, "learning_rate": 1.7579142610195124e-07, "logits/chosen": 0.27310094237327576, "logits/rejected": 0.37697815895080566, "logps/chosen": -7.356204032897949, "logps/rejected": -8.73918342590332, "loss": 0.4079, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -7.356204032897949, "rewards/margins": 1.3829797506332397, "rewards/rejected": -8.73918342590332, "step": 4215 }, { "epoch": 2.258571667502927, "grad_norm": 17.770470598670098, "learning_rate": 1.7460737310938568e-07, "logits/chosen": 0.304316908121109, "logits/rejected": 0.4179569184780121, "logps/chosen": -7.217268943786621, "logps/rejected": -8.762430191040039, "loss": 0.3663, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -7.217268943786621, "rewards/margins": 1.5451613664627075, "rewards/rejected": -8.762430191040039, "step": 4220 }, { "epoch": 2.2612477002843283, "grad_norm": 20.569074020340857, "learning_rate": 1.734264770775133e-07, "logits/chosen": 0.2867479920387268, "logits/rejected": 0.40134191513061523, "logps/chosen": -7.286637306213379, "logps/rejected": -8.72738265991211, "loss": 0.4082, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -7.286637306213379, "rewards/margins": 1.4407459497451782, "rewards/rejected": -8.72738265991211, "step": 4225 }, { "epoch": 2.26392373306573, "grad_norm": 22.827176369568864, "learning_rate": 1.7224874946339241e-07, "logits/chosen": 0.3088391125202179, "logits/rejected": 0.34242871403694153, "logps/chosen": -7.229901313781738, "logps/rejected": -8.646784782409668, "loss": 0.4149, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -7.229901313781738, "rewards/margins": 1.416884183883667, "rewards/rejected": -8.646784782409668, "step": 4230 }, { "epoch": 2.2665997658471317, "grad_norm": 16.300498860409597, "learning_rate": 1.7107420169334186e-07, "logits/chosen": 0.29744452238082886, "logits/rejected": 0.3626137971878052, "logps/chosen": -7.269029140472412, "logps/rejected": -8.643649101257324, "loss": 0.3961, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -7.269029140472412, "rewards/margins": 1.3746190071105957, "rewards/rejected": -8.643649101257324, "step": 4235 }, { "epoch": 2.269275798628533, "grad_norm": 28.797855877693326, "learning_rate": 1.6990284516282893e-07, "logits/chosen": 0.3027147352695465, "logits/rejected": 0.3628007173538208, "logps/chosen": -7.095419406890869, "logps/rejected": -8.492265701293945, "loss": 0.382, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -7.095419406890869, "rewards/margins": 1.3968467712402344, "rewards/rejected": -8.492265701293945, "step": 4240 }, { "epoch": 2.2719518314099347, "grad_norm": 18.761089798409653, "learning_rate": 1.687346912363602e-07, "logits/chosen": 0.3126368224620819, "logits/rejected": 0.3793523609638214, "logps/chosen": -7.306955814361572, "logps/rejected": -8.739879608154297, "loss": 0.3653, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -7.306955814361572, "rewards/margins": 1.432924509048462, "rewards/rejected": -8.739879608154297, "step": 4245 }, { "epoch": 2.2746278641913364, "grad_norm": 17.981490596989516, "learning_rate": 1.675697512473697e-07, "logits/chosen": 0.29584017395973206, "logits/rejected": 0.42201533913612366, "logps/chosen": -7.214234352111816, "logps/rejected": -8.751830101013184, "loss": 0.3446, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -7.214234352111816, "rewards/margins": 1.5375958681106567, "rewards/rejected": -8.751830101013184, "step": 4250 }, { "epoch": 2.2773038969727377, "grad_norm": 22.388460030936407, "learning_rate": 1.6640803649811087e-07, "logits/chosen": 0.3105349838733673, "logits/rejected": 0.4311566948890686, "logps/chosen": -7.259086608886719, "logps/rejected": -8.898451805114746, "loss": 0.3451, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -7.259086608886719, "rewards/margins": 1.639365553855896, "rewards/rejected": -8.898451805114746, "step": 4255 }, { "epoch": 2.2799799297541394, "grad_norm": 27.42681556916781, "learning_rate": 1.6524955825954472e-07, "logits/chosen": 0.35017019510269165, "logits/rejected": 0.40390628576278687, "logps/chosen": -7.213613986968994, "logps/rejected": -8.579636573791504, "loss": 0.3955, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -7.213613986968994, "rewards/margins": 1.3660229444503784, "rewards/rejected": -8.579636573791504, "step": 4260 }, { "epoch": 2.282655962535541, "grad_norm": 17.924632516668755, "learning_rate": 1.6409432777123277e-07, "logits/chosen": 0.34108155965805054, "logits/rejected": 0.4067533612251282, "logps/chosen": -7.477619171142578, "logps/rejected": -9.115545272827148, "loss": 0.378, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -7.477619171142578, "rewards/margins": 1.6379257440567017, "rewards/rejected": -9.115545272827148, "step": 4265 }, { "epoch": 2.285331995316943, "grad_norm": 19.36644897914002, "learning_rate": 1.6294235624122577e-07, "logits/chosen": 0.3796878755092621, "logits/rejected": 0.45275530219078064, "logps/chosen": -7.464455604553223, "logps/rejected": -8.9327392578125, "loss": 0.3927, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -7.464455604553223, "rewards/margins": 1.4682835340499878, "rewards/rejected": -8.9327392578125, "step": 4270 }, { "epoch": 2.288008028098344, "grad_norm": 19.73084918872749, "learning_rate": 1.6179365484595697e-07, "logits/chosen": 0.2804982662200928, "logits/rejected": 0.3383084237575531, "logps/chosen": -7.256462097167969, "logps/rejected": -8.667964935302734, "loss": 0.3945, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -7.256462097167969, "rewards/margins": 1.4115025997161865, "rewards/rejected": -8.667964935302734, "step": 4275 }, { "epoch": 2.290684060879746, "grad_norm": 23.868422297020413, "learning_rate": 1.60648234730132e-07, "logits/chosen": 0.3542520999908447, "logits/rejected": 0.3913983106613159, "logps/chosen": -7.277891635894775, "logps/rejected": -8.826860427856445, "loss": 0.3598, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -7.277891635894775, "rewards/margins": 1.5489692687988281, "rewards/rejected": -8.826860427856445, "step": 4280 }, { "epoch": 2.293360093661147, "grad_norm": 24.248560666148684, "learning_rate": 1.595061070066222e-07, "logits/chosen": 0.3257039189338684, "logits/rejected": 0.3559548556804657, "logps/chosen": -7.229429721832275, "logps/rejected": -8.743112564086914, "loss": 0.3646, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -7.229429721832275, "rewards/margins": 1.5136826038360596, "rewards/rejected": -8.743112564086914, "step": 4285 }, { "epoch": 2.296036126442549, "grad_norm": 31.536316085624016, "learning_rate": 1.5836728275635542e-07, "logits/chosen": 0.24791069328784943, "logits/rejected": 0.3158980906009674, "logps/chosen": -7.516233921051025, "logps/rejected": -8.740018844604492, "loss": 0.4598, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -7.516233921051025, "rewards/margins": 1.2237855195999146, "rewards/rejected": -8.740018844604492, "step": 4290 }, { "epoch": 2.2987121592239506, "grad_norm": 20.22048823513954, "learning_rate": 1.5723177302820984e-07, "logits/chosen": 0.2842888832092285, "logits/rejected": 0.3263532221317291, "logps/chosen": -7.354862213134766, "logps/rejected": -8.585168838500977, "loss": 0.4042, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -7.354862213134766, "rewards/margins": 1.230306625366211, "rewards/rejected": -8.585168838500977, "step": 4295 }, { "epoch": 2.3013881920053523, "grad_norm": 17.737283004428008, "learning_rate": 1.5609958883890544e-07, "logits/chosen": 0.3343093991279602, "logits/rejected": 0.41966503858566284, "logps/chosen": -7.334109306335449, "logps/rejected": -8.668596267700195, "loss": 0.395, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -7.334109306335449, "rewards/margins": 1.3344876766204834, "rewards/rejected": -8.668596267700195, "step": 4300 }, { "epoch": 2.3040642247867535, "grad_norm": 22.297546452541173, "learning_rate": 1.5497074117289865e-07, "logits/chosen": 0.2042149305343628, "logits/rejected": 0.27025216817855835, "logps/chosen": -7.032327175140381, "logps/rejected": -8.50899600982666, "loss": 0.3789, "rewards/accuracies": 0.84375, "rewards/chosen": -7.032327175140381, "rewards/margins": 1.4766688346862793, "rewards/rejected": -8.50899600982666, "step": 4305 }, { "epoch": 2.3067402575681553, "grad_norm": 21.876103446224324, "learning_rate": 1.5384524098227402e-07, "logits/chosen": 0.2956775724887848, "logits/rejected": 0.3782113492488861, "logps/chosen": -7.388510227203369, "logps/rejected": -9.00120735168457, "loss": 0.3359, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -7.388510227203369, "rewards/margins": 1.6126970052719116, "rewards/rejected": -9.00120735168457, "step": 4310 }, { "epoch": 2.3094162903495565, "grad_norm": 23.641547916696315, "learning_rate": 1.5272309918663974e-07, "logits/chosen": 0.3261813819408417, "logits/rejected": 0.4177629053592682, "logps/chosen": -7.414539337158203, "logps/rejected": -8.666234970092773, "loss": 0.4494, "rewards/accuracies": 0.78125, "rewards/chosen": -7.414539337158203, "rewards/margins": 1.2516957521438599, "rewards/rejected": -8.666234970092773, "step": 4315 }, { "epoch": 2.3120923231309582, "grad_norm": 16.551447946883776, "learning_rate": 1.516043266730201e-07, "logits/chosen": 0.30112114548683167, "logits/rejected": 0.3724484443664551, "logps/chosen": -7.437216281890869, "logps/rejected": -8.909406661987305, "loss": 0.3747, "rewards/accuracies": 0.84375, "rewards/chosen": -7.437216281890869, "rewards/margins": 1.4721910953521729, "rewards/rejected": -8.909406661987305, "step": 4320 }, { "epoch": 2.31476835591236, "grad_norm": 31.91018645501002, "learning_rate": 1.504889342957512e-07, "logits/chosen": 0.31613850593566895, "logits/rejected": 0.3930799961090088, "logps/chosen": -7.30059814453125, "logps/rejected": -8.66826057434082, "loss": 0.4532, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -7.30059814453125, "rewards/margins": 1.3676625490188599, "rewards/rejected": -8.66826057434082, "step": 4325 }, { "epoch": 2.3174443886937617, "grad_norm": 23.71485349334059, "learning_rate": 1.4937693287637453e-07, "logits/chosen": 0.3149440884590149, "logits/rejected": 0.41017061471939087, "logps/chosen": -7.462906837463379, "logps/rejected": -8.79146671295166, "loss": 0.4168, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -7.462906837463379, "rewards/margins": 1.3285603523254395, "rewards/rejected": -8.79146671295166, "step": 4330 }, { "epoch": 2.320120421475163, "grad_norm": 20.863497798866717, "learning_rate": 1.4826833320353305e-07, "logits/chosen": 0.2932584583759308, "logits/rejected": 0.3568701148033142, "logps/chosen": -7.259173393249512, "logps/rejected": -8.74791145324707, "loss": 0.366, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -7.259173393249512, "rewards/margins": 1.4887363910675049, "rewards/rejected": -8.74791145324707, "step": 4335 }, { "epoch": 2.3227964542565647, "grad_norm": 25.295891216814542, "learning_rate": 1.4716314603286528e-07, "logits/chosen": 0.29353460669517517, "logits/rejected": 0.37524330615997314, "logps/chosen": -7.257359981536865, "logps/rejected": -8.782527923583984, "loss": 0.3912, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -7.257359981536865, "rewards/margins": 1.5251675844192505, "rewards/rejected": -8.782527923583984, "step": 4340 }, { "epoch": 2.3254724870379664, "grad_norm": 35.13295363357051, "learning_rate": 1.4606138208690233e-07, "logits/chosen": 0.2928846478462219, "logits/rejected": 0.36363643407821655, "logps/chosen": -7.482172966003418, "logps/rejected": -8.83690071105957, "loss": 0.4278, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -7.482172966003418, "rewards/margins": 1.3547272682189941, "rewards/rejected": -8.83690071105957, "step": 4345 }, { "epoch": 2.3281485198193677, "grad_norm": 20.015304026039523, "learning_rate": 1.4496305205496251e-07, "logits/chosen": 0.2880600094795227, "logits/rejected": 0.3562549948692322, "logps/chosen": -7.411035060882568, "logps/rejected": -8.877151489257812, "loss": 0.3777, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -7.411035060882568, "rewards/margins": 1.4661157131195068, "rewards/rejected": -8.877151489257812, "step": 4350 }, { "epoch": 2.3308245526007694, "grad_norm": 14.828793144485612, "learning_rate": 1.4386816659304895e-07, "logits/chosen": 0.24500772356987, "logits/rejected": 0.3184560239315033, "logps/chosen": -7.2300567626953125, "logps/rejected": -8.655884742736816, "loss": 0.3616, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -7.2300567626953125, "rewards/margins": 1.4258277416229248, "rewards/rejected": -8.655884742736816, "step": 4355 }, { "epoch": 2.333500585382171, "grad_norm": 25.715305223480616, "learning_rate": 1.4277673632374492e-07, "logits/chosen": 0.24100224673748016, "logits/rejected": 0.3240317106246948, "logps/chosen": -7.463017463684082, "logps/rejected": -8.840161323547363, "loss": 0.3996, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -7.463017463684082, "rewards/margins": 1.3771435022354126, "rewards/rejected": -8.840161323547363, "step": 4360 }, { "epoch": 2.3361766181635724, "grad_norm": 19.24236205080274, "learning_rate": 1.416887718361119e-07, "logits/chosen": 0.3612043261528015, "logits/rejected": 0.38172999024391174, "logps/chosen": -7.479483604431152, "logps/rejected": -8.840932846069336, "loss": 0.4064, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -7.479483604431152, "rewards/margins": 1.3614503145217896, "rewards/rejected": -8.840932846069336, "step": 4365 }, { "epoch": 2.338852650944974, "grad_norm": 28.102356427895096, "learning_rate": 1.406042836855859e-07, "logits/chosen": 0.3336125314235687, "logits/rejected": 0.3777526021003723, "logps/chosen": -7.087874412536621, "logps/rejected": -8.605081558227539, "loss": 0.3609, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -7.087874412536621, "rewards/margins": 1.5172059535980225, "rewards/rejected": -8.605081558227539, "step": 4370 }, { "epoch": 2.341528683726376, "grad_norm": 22.512826697257275, "learning_rate": 1.3952328239387595e-07, "logits/chosen": 0.22621822357177734, "logits/rejected": 0.3450269103050232, "logps/chosen": -7.238430023193359, "logps/rejected": -8.828360557556152, "loss": 0.3573, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -7.238430023193359, "rewards/margins": 1.5899317264556885, "rewards/rejected": -8.828360557556152, "step": 4375 }, { "epoch": 2.344204716507777, "grad_norm": 21.737630478491848, "learning_rate": 1.3844577844886109e-07, "logits/chosen": 0.3239154517650604, "logits/rejected": 0.44966164231300354, "logps/chosen": -7.449564456939697, "logps/rejected": -8.967256546020508, "loss": 0.381, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -7.449564456939697, "rewards/margins": 1.5176920890808105, "rewards/rejected": -8.967256546020508, "step": 4380 }, { "epoch": 2.346880749289179, "grad_norm": 26.932502127585135, "learning_rate": 1.3737178230448955e-07, "logits/chosen": 0.2672556936740875, "logits/rejected": 0.3381572961807251, "logps/chosen": -7.6018218994140625, "logps/rejected": -8.910539627075195, "loss": 0.4227, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -7.6018218994140625, "rewards/margins": 1.3087173700332642, "rewards/rejected": -8.910539627075195, "step": 4385 }, { "epoch": 2.3495567820705805, "grad_norm": 18.207888806046267, "learning_rate": 1.363013043806764e-07, "logits/chosen": 0.33152395486831665, "logits/rejected": 0.3967929482460022, "logps/chosen": -7.299252986907959, "logps/rejected": -8.655993461608887, "loss": 0.3887, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -7.299252986907959, "rewards/margins": 1.3567407131195068, "rewards/rejected": -8.655993461608887, "step": 4390 }, { "epoch": 2.3522328148519818, "grad_norm": 19.40885039829476, "learning_rate": 1.352343550632034e-07, "logits/chosen": 0.3127058148384094, "logits/rejected": 0.3579404950141907, "logps/chosen": -7.280699253082275, "logps/rejected": -8.803844451904297, "loss": 0.4202, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -7.280699253082275, "rewards/margins": 1.5231444835662842, "rewards/rejected": -8.803844451904297, "step": 4395 }, { "epoch": 2.3549088476333835, "grad_norm": 20.168820398714175, "learning_rate": 1.3417094470361722e-07, "logits/chosen": 0.26944494247436523, "logits/rejected": 0.33124467730522156, "logps/chosen": -7.332216739654541, "logps/rejected": -8.636500358581543, "loss": 0.4135, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -7.332216739654541, "rewards/margins": 1.3042832612991333, "rewards/rejected": -8.636500358581543, "step": 4400 }, { "epoch": 2.3549088476333835, "eval_logits/chosen": 0.4203110933303833, "eval_logits/rejected": 0.4661051630973816, "eval_logps/chosen": -7.495187282562256, "eval_logps/rejected": -8.656486511230469, "eval_loss": 0.5202957987785339, "eval_rewards/accuracies": 0.7240356206893921, "eval_rewards/chosen": -7.495187282562256, "eval_rewards/margins": 1.161300539970398, "eval_rewards/rejected": -8.656486511230469, "eval_runtime": 40.549, "eval_samples_per_second": 33.17, "eval_steps_per_second": 8.311, "step": 4400 }, { "epoch": 2.357584880414785, "grad_norm": 22.721588310501875, "learning_rate": 1.3311108361913015e-07, "logits/chosen": 0.2115565836429596, "logits/rejected": 0.26245272159576416, "logps/chosen": -7.2965850830078125, "logps/rejected": -8.726715087890625, "loss": 0.3683, "rewards/accuracies": 0.84375, "rewards/chosen": -7.2965850830078125, "rewards/margins": 1.430129051208496, "rewards/rejected": -8.726715087890625, "step": 4405 }, { "epoch": 2.3602609131961865, "grad_norm": 17.212506282642266, "learning_rate": 1.3205478209251874e-07, "logits/chosen": 0.3105309009552002, "logits/rejected": 0.3998563289642334, "logps/chosen": -7.399471282958984, "logps/rejected": -8.931981086730957, "loss": 0.3844, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -7.399471282958984, "rewards/margins": 1.5325100421905518, "rewards/rejected": -8.931981086730957, "step": 4410 }, { "epoch": 2.362936945977588, "grad_norm": 20.71913946056699, "learning_rate": 1.310020503720254e-07, "logits/chosen": 0.3022151589393616, "logits/rejected": 0.3662305772304535, "logps/chosen": -7.508314609527588, "logps/rejected": -8.977043151855469, "loss": 0.3805, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -7.508314609527588, "rewards/margins": 1.46872878074646, "rewards/rejected": -8.977043151855469, "step": 4415 }, { "epoch": 2.36561297875899, "grad_norm": 23.931625155503884, "learning_rate": 1.2995289867125752e-07, "logits/chosen": 0.3012247085571289, "logits/rejected": 0.35043641924858093, "logps/chosen": -7.3162407875061035, "logps/rejected": -8.485893249511719, "loss": 0.4433, "rewards/accuracies": 0.78125, "rewards/chosen": -7.3162407875061035, "rewards/margins": 1.1696536540985107, "rewards/rejected": -8.485893249511719, "step": 4420 }, { "epoch": 2.368289011540391, "grad_norm": 18.134213033668022, "learning_rate": 1.2890733716908986e-07, "logits/chosen": 0.2679382860660553, "logits/rejected": 0.34424129128456116, "logps/chosen": -7.173986911773682, "logps/rejected": -8.630284309387207, "loss": 0.3223, "rewards/accuracies": 0.875, "rewards/chosen": -7.173986911773682, "rewards/margins": 1.4562976360321045, "rewards/rejected": -8.630284309387207, "step": 4425 }, { "epoch": 2.370965044321793, "grad_norm": 23.096316646452998, "learning_rate": 1.2786537600956454e-07, "logits/chosen": 0.29924339056015015, "logits/rejected": 0.36566609144210815, "logps/chosen": -7.378855228424072, "logps/rejected": -8.845715522766113, "loss": 0.4021, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -7.378855228424072, "rewards/margins": 1.4668595790863037, "rewards/rejected": -8.845715522766113, "step": 4430 }, { "epoch": 2.3736410771031946, "grad_norm": 14.87355352652969, "learning_rate": 1.268270253017933e-07, "logits/chosen": 0.30705955624580383, "logits/rejected": 0.38139989972114563, "logps/chosen": -7.284056663513184, "logps/rejected": -8.749556541442871, "loss": 0.3923, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -7.284056663513184, "rewards/margins": 1.4655005931854248, "rewards/rejected": -8.749556541442871, "step": 4435 }, { "epoch": 2.376317109884596, "grad_norm": 21.194205662044002, "learning_rate": 1.257922951198591e-07, "logits/chosen": 0.2085656374692917, "logits/rejected": 0.34717991948127747, "logps/chosen": -7.309917449951172, "logps/rejected": -8.645244598388672, "loss": 0.4084, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -7.309917449951172, "rewards/margins": 1.3353265523910522, "rewards/rejected": -8.645244598388672, "step": 4440 }, { "epoch": 2.3789931426659976, "grad_norm": 21.3454847784539, "learning_rate": 1.24761195502719e-07, "logits/chosen": 0.25661319494247437, "logits/rejected": 0.35344451665878296, "logps/chosen": -7.232792854309082, "logps/rejected": -8.466408729553223, "loss": 0.4517, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -7.232792854309082, "rewards/margins": 1.2336152791976929, "rewards/rejected": -8.466408729553223, "step": 4445 }, { "epoch": 2.3816691754473993, "grad_norm": 24.74493920199024, "learning_rate": 1.2373373645410573e-07, "logits/chosen": 0.2743425965309143, "logits/rejected": 0.3470795750617981, "logps/chosen": -7.310993194580078, "logps/rejected": -8.837270736694336, "loss": 0.4048, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -7.310993194580078, "rewards/margins": 1.5262773036956787, "rewards/rejected": -8.837270736694336, "step": 4450 }, { "epoch": 2.384345208228801, "grad_norm": 23.41399705027671, "learning_rate": 1.2270992794243175e-07, "logits/chosen": 0.21707472205162048, "logits/rejected": 0.2889866530895233, "logps/chosen": -7.1582136154174805, "logps/rejected": -8.628338813781738, "loss": 0.3866, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -7.1582136154174805, "rewards/margins": 1.47012460231781, "rewards/rejected": -8.628338813781738, "step": 4455 }, { "epoch": 2.3870212410102023, "grad_norm": 17.755480497028685, "learning_rate": 1.2168977990069147e-07, "logits/chosen": 0.27614498138427734, "logits/rejected": 0.3800671398639679, "logps/chosen": -7.185783386230469, "logps/rejected": -8.564518928527832, "loss": 0.4089, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -7.185783386230469, "rewards/margins": 1.3787345886230469, "rewards/rejected": -8.564518928527832, "step": 4460 }, { "epoch": 2.389697273791604, "grad_norm": 21.863083875237013, "learning_rate": 1.206733022263659e-07, "logits/chosen": 0.29333335161209106, "logits/rejected": 0.42292237281799316, "logps/chosen": -7.549624443054199, "logps/rejected": -8.923303604125977, "loss": 0.4268, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -7.549624443054199, "rewards/margins": 1.3736793994903564, "rewards/rejected": -8.923303604125977, "step": 4465 }, { "epoch": 2.3923733065730053, "grad_norm": 18.113480104663488, "learning_rate": 1.1966050478132572e-07, "logits/chosen": 0.30930572748184204, "logits/rejected": 0.3778761625289917, "logps/chosen": -7.155702114105225, "logps/rejected": -8.622213363647461, "loss": 0.4011, "rewards/accuracies": 0.84375, "rewards/chosen": -7.155702114105225, "rewards/margins": 1.4665101766586304, "rewards/rejected": -8.622213363647461, "step": 4470 }, { "epoch": 2.395049339354407, "grad_norm": 21.57515129017574, "learning_rate": 1.1865139739173635e-07, "logits/chosen": 0.27360352873802185, "logits/rejected": 0.3828732371330261, "logps/chosen": -7.331957817077637, "logps/rejected": -8.700000762939453, "loss": 0.3798, "rewards/accuracies": 0.84375, "rewards/chosen": -7.331957817077637, "rewards/margins": 1.3680423498153687, "rewards/rejected": -8.700000762939453, "step": 4475 }, { "epoch": 2.3977253721358087, "grad_norm": 27.638107951464175, "learning_rate": 1.1764598984796187e-07, "logits/chosen": 0.3031611442565918, "logits/rejected": 0.39114993810653687, "logps/chosen": -7.41256856918335, "logps/rejected": -8.725973129272461, "loss": 0.3882, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -7.41256856918335, "rewards/margins": 1.3134046792984009, "rewards/rejected": -8.725973129272461, "step": 4480 }, { "epoch": 2.4004014049172104, "grad_norm": 29.66576038832304, "learning_rate": 1.1664429190447095e-07, "logits/chosen": 0.30261072516441345, "logits/rejected": 0.3615226745605469, "logps/chosen": -7.265384674072266, "logps/rejected": -8.756854057312012, "loss": 0.3614, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -7.265384674072266, "rewards/margins": 1.4914695024490356, "rewards/rejected": -8.756854057312012, "step": 4485 }, { "epoch": 2.4030774376986117, "grad_norm": 34.89398676673244, "learning_rate": 1.1564631327974122e-07, "logits/chosen": 0.2688743472099304, "logits/rejected": 0.36438971757888794, "logps/chosen": -7.421568870544434, "logps/rejected": -8.833404541015625, "loss": 0.4158, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -7.421568870544434, "rewards/margins": 1.4118359088897705, "rewards/rejected": -8.833404541015625, "step": 4490 }, { "epoch": 2.4057534704800134, "grad_norm": 20.783491921939074, "learning_rate": 1.1465206365616587e-07, "logits/chosen": 0.22252492606639862, "logits/rejected": 0.34680983424186707, "logps/chosen": -7.3934760093688965, "logps/rejected": -8.616147994995117, "loss": 0.4361, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -7.3934760093688965, "rewards/margins": 1.2226709127426147, "rewards/rejected": -8.616147994995117, "step": 4495 }, { "epoch": 2.408429503261415, "grad_norm": 19.23313500712169, "learning_rate": 1.1366155267995887e-07, "logits/chosen": 0.2753832936286926, "logits/rejected": 0.2971547245979309, "logps/chosen": -7.22985315322876, "logps/rejected": -8.663167953491211, "loss": 0.3838, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -7.22985315322876, "rewards/margins": 1.4333144426345825, "rewards/rejected": -8.663167953491211, "step": 4500 }, { "epoch": 2.4111055360428164, "grad_norm": 22.77792337691314, "learning_rate": 1.1267478996106228e-07, "logits/chosen": 0.30802375078201294, "logits/rejected": 0.417527437210083, "logps/chosen": -7.260392665863037, "logps/rejected": -8.597624778747559, "loss": 0.4143, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -7.260392665863037, "rewards/margins": 1.3372321128845215, "rewards/rejected": -8.597624778747559, "step": 4505 }, { "epoch": 2.413781568824218, "grad_norm": 25.56250080708749, "learning_rate": 1.116917850730521e-07, "logits/chosen": 0.29951125383377075, "logits/rejected": 0.35530391335487366, "logps/chosen": -7.400708198547363, "logps/rejected": -8.616018295288086, "loss": 0.4868, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -7.400708198547363, "rewards/margins": 1.2153102159500122, "rewards/rejected": -8.616018295288086, "step": 4510 }, { "epoch": 2.41645760160562, "grad_norm": 22.44311106222233, "learning_rate": 1.1071254755304637e-07, "logits/chosen": 0.28395557403564453, "logits/rejected": 0.31452617049217224, "logps/chosen": -7.259737968444824, "logps/rejected": -8.608312606811523, "loss": 0.4158, "rewards/accuracies": 0.8125, "rewards/chosen": -7.259737968444824, "rewards/margins": 1.3485740423202515, "rewards/rejected": -8.608312606811523, "step": 4515 }, { "epoch": 2.419133634387021, "grad_norm": 22.640050175074183, "learning_rate": 1.0973708690161143e-07, "logits/chosen": 0.2774253189563751, "logits/rejected": 0.32397031784057617, "logps/chosen": -7.28174352645874, "logps/rejected": -8.699816703796387, "loss": 0.388, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -7.28174352645874, "rewards/margins": 1.4180728197097778, "rewards/rejected": -8.699816703796387, "step": 4520 }, { "epoch": 2.421809667168423, "grad_norm": 29.29024682145824, "learning_rate": 1.0876541258267119e-07, "logits/chosen": 0.28267163038253784, "logits/rejected": 0.39578986167907715, "logps/chosen": -7.4117841720581055, "logps/rejected": -8.875246047973633, "loss": 0.3938, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -7.4117841720581055, "rewards/margins": 1.4634608030319214, "rewards/rejected": -8.875246047973633, "step": 4525 }, { "epoch": 2.4244856999498245, "grad_norm": 22.792954737557718, "learning_rate": 1.0779753402341379e-07, "logits/chosen": 0.2850467562675476, "logits/rejected": 0.3397276699542999, "logps/chosen": -7.447117805480957, "logps/rejected": -8.629554748535156, "loss": 0.4523, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -7.447117805480957, "rewards/margins": 1.182437539100647, "rewards/rejected": -8.629554748535156, "step": 4530 }, { "epoch": 2.427161732731226, "grad_norm": 24.959516920448443, "learning_rate": 1.0683346061420157e-07, "logits/chosen": 0.34460899233818054, "logits/rejected": 0.3747154176235199, "logps/chosen": -7.047287940979004, "logps/rejected": -8.46027946472168, "loss": 0.4268, "rewards/accuracies": 0.78125, "rewards/chosen": -7.047287940979004, "rewards/margins": 1.412990927696228, "rewards/rejected": -8.46027946472168, "step": 4535 }, { "epoch": 2.4298377655126275, "grad_norm": 21.135249962138243, "learning_rate": 1.0587320170847874e-07, "logits/chosen": 0.3209449350833893, "logits/rejected": 0.3985745310783386, "logps/chosen": -7.121005058288574, "logps/rejected": -8.309045791625977, "loss": 0.4532, "rewards/accuracies": 0.75, "rewards/chosen": -7.121005058288574, "rewards/margins": 1.1880406141281128, "rewards/rejected": -8.309045791625977, "step": 4540 }, { "epoch": 2.4325137982940293, "grad_norm": 18.611424061764993, "learning_rate": 1.0491676662268156e-07, "logits/chosen": 0.3384827673435211, "logits/rejected": 0.4088340699672699, "logps/chosen": -7.141387939453125, "logps/rejected": -8.435712814331055, "loss": 0.4354, "rewards/accuracies": 0.78125, "rewards/chosen": -7.141387939453125, "rewards/margins": 1.2943230867385864, "rewards/rejected": -8.435712814331055, "step": 4545 }, { "epoch": 2.4351898310754305, "grad_norm": 25.550305393269934, "learning_rate": 1.0396416463614732e-07, "logits/chosen": 0.2555903494358063, "logits/rejected": 0.31796544790267944, "logps/chosen": -7.057486534118652, "logps/rejected": -8.49744987487793, "loss": 0.4027, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -7.057486534118652, "rewards/margins": 1.4399631023406982, "rewards/rejected": -8.49744987487793, "step": 4550 }, { "epoch": 2.4378658638568322, "grad_norm": 22.77595561657411, "learning_rate": 1.0301540499102479e-07, "logits/chosen": 0.2880895435810089, "logits/rejected": 0.3716292977333069, "logps/chosen": -7.48153018951416, "logps/rejected": -8.606226921081543, "loss": 0.4582, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -7.48153018951416, "rewards/margins": 1.1246974468231201, "rewards/rejected": -8.606226921081543, "step": 4555 }, { "epoch": 2.440541896638234, "grad_norm": 25.30797206810832, "learning_rate": 1.0207049689218405e-07, "logits/chosen": 0.2554571032524109, "logits/rejected": 0.34082508087158203, "logps/chosen": -7.374205589294434, "logps/rejected": -8.877347946166992, "loss": 0.3919, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -7.374205589294434, "rewards/margins": 1.5031421184539795, "rewards/rejected": -8.877347946166992, "step": 4560 }, { "epoch": 2.4432179294196352, "grad_norm": 18.580228083693463, "learning_rate": 1.0112944950712782e-07, "logits/chosen": 0.26179489493370056, "logits/rejected": 0.3440473675727844, "logps/chosen": -7.253432273864746, "logps/rejected": -8.755427360534668, "loss": 0.3627, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -7.253432273864746, "rewards/margins": 1.5019937753677368, "rewards/rejected": -8.755427360534668, "step": 4565 }, { "epoch": 2.445893962201037, "grad_norm": 28.124477685802923, "learning_rate": 1.0019227196590174e-07, "logits/chosen": 0.3475266396999359, "logits/rejected": 0.40890058875083923, "logps/chosen": -7.347159385681152, "logps/rejected": -8.64026927947998, "loss": 0.4694, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -7.347159385681152, "rewards/margins": 1.2931098937988281, "rewards/rejected": -8.64026927947998, "step": 4570 }, { "epoch": 2.4485699949824387, "grad_norm": 27.471955422300532, "learning_rate": 9.925897336100664e-08, "logits/chosen": 0.3238454759120941, "logits/rejected": 0.37783247232437134, "logps/chosen": -7.163313865661621, "logps/rejected": -8.712347984313965, "loss": 0.3578, "rewards/accuracies": 0.84375, "rewards/chosen": -7.163313865661621, "rewards/margins": 1.5490344762802124, "rewards/rejected": -8.712347984313965, "step": 4575 }, { "epoch": 2.45124602776384, "grad_norm": 27.052149121810967, "learning_rate": 9.832956274730946e-08, "logits/chosen": 0.25367921590805054, "logits/rejected": 0.2831543982028961, "logps/chosen": -7.057225704193115, "logps/rejected": -8.291794776916504, "loss": 0.4482, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -7.057225704193115, "rewards/margins": 1.23456871509552, "rewards/rejected": -8.291794776916504, "step": 4580 }, { "epoch": 2.4539220605452416, "grad_norm": 20.841176965595178, "learning_rate": 9.740404914195633e-08, "logits/chosen": 0.2802411913871765, "logits/rejected": 0.3875223696231842, "logps/chosen": -7.340878486633301, "logps/rejected": -8.686870574951172, "loss": 0.4088, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -7.340878486633301, "rewards/margins": 1.3459930419921875, "rewards/rejected": -8.686870574951172, "step": 4585 }, { "epoch": 2.4565980933266434, "grad_norm": 18.325825593023367, "learning_rate": 9.648244152428392e-08, "logits/chosen": 0.2515996992588043, "logits/rejected": 0.32943809032440186, "logps/chosen": -7.0436224937438965, "logps/rejected": -8.281435012817383, "loss": 0.4239, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -7.0436224937438965, "rewards/margins": 1.2378120422363281, "rewards/rejected": -8.281435012817383, "step": 4590 }, { "epoch": 2.4592741261080446, "grad_norm": 20.827255857115514, "learning_rate": 9.556474883573379e-08, "logits/chosen": 0.22962871193885803, "logits/rejected": 0.31234246492385864, "logps/chosen": -7.045889377593994, "logps/rejected": -8.489694595336914, "loss": 0.4205, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -7.045889377593994, "rewards/margins": 1.44380521774292, "rewards/rejected": -8.489694595336914, "step": 4595 }, { "epoch": 2.4619501588894463, "grad_norm": 16.44053363812745, "learning_rate": 9.465097997976412e-08, "logits/chosen": 0.2996979057788849, "logits/rejected": 0.4060366749763489, "logps/chosen": -7.245124816894531, "logps/rejected": -8.871626853942871, "loss": 0.3435, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -7.245124816894531, "rewards/margins": 1.6265023946762085, "rewards/rejected": -8.871626853942871, "step": 4600 }, { "epoch": 2.464626191670848, "grad_norm": 19.122442978553945, "learning_rate": 9.374114382176457e-08, "logits/chosen": 0.3058284521102905, "logits/rejected": 0.38038885593414307, "logps/chosen": -7.241418361663818, "logps/rejected": -8.686393737792969, "loss": 0.3933, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -7.241418361663818, "rewards/margins": 1.4449747800827026, "rewards/rejected": -8.686393737792969, "step": 4605 }, { "epoch": 2.46730222445225, "grad_norm": 21.337178201629882, "learning_rate": 9.283524918896945e-08, "logits/chosen": 0.2980613112449646, "logits/rejected": 0.3538917005062103, "logps/chosen": -7.2838239669799805, "logps/rejected": -8.70457649230957, "loss": 0.4162, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -7.2838239669799805, "rewards/margins": 1.4207518100738525, "rewards/rejected": -8.70457649230957, "step": 4610 }, { "epoch": 2.469978257233651, "grad_norm": 20.199527402933203, "learning_rate": 9.193330487037232e-08, "logits/chosen": 0.3371972441673279, "logits/rejected": 0.43850836157798767, "logps/chosen": -7.391890048980713, "logps/rejected": -8.830408096313477, "loss": 0.3925, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -7.391890048980713, "rewards/margins": 1.4385178089141846, "rewards/rejected": -8.830408096313477, "step": 4615 }, { "epoch": 2.4726542900150528, "grad_norm": 17.301860357531563, "learning_rate": 9.103531961664118e-08, "logits/chosen": 0.30749082565307617, "logits/rejected": 0.40894001722335815, "logps/chosen": -7.0129899978637695, "logps/rejected": -8.4142427444458, "loss": 0.3552, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -7.0129899978637695, "rewards/margins": 1.4012525081634521, "rewards/rejected": -8.4142427444458, "step": 4620 }, { "epoch": 2.475330322796454, "grad_norm": 20.192569144236582, "learning_rate": 9.014130214003269e-08, "logits/chosen": 0.24297723174095154, "logits/rejected": 0.25429749488830566, "logps/chosen": -7.187660217285156, "logps/rejected": -8.685514450073242, "loss": 0.3749, "rewards/accuracies": 0.84375, "rewards/chosen": -7.187660217285156, "rewards/margins": 1.4978554248809814, "rewards/rejected": -8.685514450073242, "step": 4625 }, { "epoch": 2.4780063555778558, "grad_norm": 24.155271494870153, "learning_rate": 8.925126111430848e-08, "logits/chosen": 0.27042657136917114, "logits/rejected": 0.3160789906978607, "logps/chosen": -6.964795112609863, "logps/rejected": -8.435829162597656, "loss": 0.3874, "rewards/accuracies": 0.8125, "rewards/chosen": -6.964795112609863, "rewards/margins": 1.4710357189178467, "rewards/rejected": -8.435829162597656, "step": 4630 }, { "epoch": 2.4806823883592575, "grad_norm": 30.714720542213904, "learning_rate": 8.83652051746504e-08, "logits/chosen": 0.40431347489356995, "logits/rejected": 0.4494917392730713, "logps/chosen": -7.201512336730957, "logps/rejected": -8.718502044677734, "loss": 0.3811, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -7.201512336730957, "rewards/margins": 1.5169906616210938, "rewards/rejected": -8.718502044677734, "step": 4635 }, { "epoch": 2.483358421140659, "grad_norm": 23.95311002380526, "learning_rate": 8.748314291757696e-08, "logits/chosen": 0.3205835223197937, "logits/rejected": 0.4018063545227051, "logps/chosen": -7.078852653503418, "logps/rejected": -8.378152847290039, "loss": 0.4036, "rewards/accuracies": 0.8125, "rewards/chosen": -7.078852653503418, "rewards/margins": 1.299300193786621, "rewards/rejected": -8.378152847290039, "step": 4640 }, { "epoch": 2.4860344539220605, "grad_norm": 20.653388607351502, "learning_rate": 8.660508290086032e-08, "logits/chosen": 0.3385287821292877, "logits/rejected": 0.43104809522628784, "logps/chosen": -7.1990461349487305, "logps/rejected": -8.687498092651367, "loss": 0.3893, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -7.1990461349487305, "rewards/margins": 1.4884520769119263, "rewards/rejected": -8.687498092651367, "step": 4645 }, { "epoch": 2.488710486703462, "grad_norm": 23.702997512278735, "learning_rate": 8.573103364344231e-08, "logits/chosen": 0.29973700642585754, "logits/rejected": 0.4090765416622162, "logps/chosen": -7.032199859619141, "logps/rejected": -8.50168228149414, "loss": 0.3859, "rewards/accuracies": 0.8125, "rewards/chosen": -7.032199859619141, "rewards/margins": 1.469482183456421, "rewards/rejected": -8.50168228149414, "step": 4650 }, { "epoch": 2.4913865194848634, "grad_norm": 24.475565470706503, "learning_rate": 8.486100362535292e-08, "logits/chosen": 0.2720106840133667, "logits/rejected": 0.36632654070854187, "logps/chosen": -7.305517673492432, "logps/rejected": -8.448290824890137, "loss": 0.4488, "rewards/accuracies": 0.78125, "rewards/chosen": -7.305517673492432, "rewards/margins": 1.1427732706069946, "rewards/rejected": -8.448290824890137, "step": 4655 }, { "epoch": 2.494062552266265, "grad_norm": 17.372575337716505, "learning_rate": 8.399500128762693e-08, "logits/chosen": 0.27360981702804565, "logits/rejected": 0.356884241104126, "logps/chosen": -7.3060736656188965, "logps/rejected": -8.687190055847168, "loss": 0.3878, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -7.3060736656188965, "rewards/margins": 1.3811149597167969, "rewards/rejected": -8.687190055847168, "step": 4660 }, { "epoch": 2.496738585047667, "grad_norm": 23.12243716745422, "learning_rate": 8.313303503222313e-08, "logits/chosen": 0.31145209074020386, "logits/rejected": 0.3661433756351471, "logps/chosen": -7.007277488708496, "logps/rejected": -8.304855346679688, "loss": 0.4099, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -7.007277488708496, "rewards/margins": 1.2975780963897705, "rewards/rejected": -8.304855346679688, "step": 4665 }, { "epoch": 2.4994146178290686, "grad_norm": 25.569925133743286, "learning_rate": 8.227511322194164e-08, "logits/chosen": 0.3236026167869568, "logits/rejected": 0.4079325795173645, "logps/chosen": -7.037171363830566, "logps/rejected": -8.340476989746094, "loss": 0.411, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -7.037171363830566, "rewards/margins": 1.303305745124817, "rewards/rejected": -8.340476989746094, "step": 4670 }, { "epoch": 2.50209065061047, "grad_norm": 24.8385013680145, "learning_rate": 8.142124418034385e-08, "logits/chosen": 0.33507323265075684, "logits/rejected": 0.42742854356765747, "logps/chosen": -7.0142998695373535, "logps/rejected": -8.384051322937012, "loss": 0.4379, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -7.0142998695373535, "rewards/margins": 1.3697519302368164, "rewards/rejected": -8.384051322937012, "step": 4675 }, { "epoch": 2.5047666833918716, "grad_norm": 27.996629827100776, "learning_rate": 8.057143619167073e-08, "logits/chosen": 0.30457979440689087, "logits/rejected": 0.3588828444480896, "logps/chosen": -6.839517116546631, "logps/rejected": -8.201641082763672, "loss": 0.4152, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -6.839517116546631, "rewards/margins": 1.3621232509613037, "rewards/rejected": -8.201641082763672, "step": 4680 }, { "epoch": 2.507442716173273, "grad_norm": 17.61120966970223, "learning_rate": 7.97256975007633e-08, "logits/chosen": 0.29396334290504456, "logits/rejected": 0.42836183309555054, "logps/chosen": -7.061775207519531, "logps/rejected": -8.464418411254883, "loss": 0.3928, "rewards/accuracies": 0.8125, "rewards/chosen": -7.061775207519531, "rewards/margins": 1.4026434421539307, "rewards/rejected": -8.464418411254883, "step": 4685 }, { "epoch": 2.5101187489546746, "grad_norm": 23.661973620730006, "learning_rate": 7.888403631298186e-08, "logits/chosen": 0.30718234181404114, "logits/rejected": 0.36082449555397034, "logps/chosen": -6.999554634094238, "logps/rejected": -8.362503051757812, "loss": 0.4171, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -6.999554634094238, "rewards/margins": 1.3629480600357056, "rewards/rejected": -8.362503051757812, "step": 4690 }, { "epoch": 2.5127947817360763, "grad_norm": 20.745248993221384, "learning_rate": 7.804646079412719e-08, "logits/chosen": 0.3332816958427429, "logits/rejected": 0.42536455392837524, "logps/chosen": -7.182110786437988, "logps/rejected": -8.579402923583984, "loss": 0.4004, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -7.182110786437988, "rewards/margins": 1.3972933292388916, "rewards/rejected": -8.579402923583984, "step": 4695 }, { "epoch": 2.515470814517478, "grad_norm": 23.63824050056205, "learning_rate": 7.72129790703604e-08, "logits/chosen": 0.2427501380443573, "logits/rejected": 0.3272876739501953, "logps/chosen": -7.0366315841674805, "logps/rejected": -8.329340934753418, "loss": 0.4191, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -7.0366315841674805, "rewards/margins": 1.2927104234695435, "rewards/rejected": -8.329340934753418, "step": 4700 }, { "epoch": 2.5181468472988793, "grad_norm": 23.323551422848947, "learning_rate": 7.638359922812504e-08, "logits/chosen": 0.2943052649497986, "logits/rejected": 0.34262460470199585, "logps/chosen": -6.966763973236084, "logps/rejected": -8.309457778930664, "loss": 0.4112, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -6.966763973236084, "rewards/margins": 1.3426947593688965, "rewards/rejected": -8.309457778930664, "step": 4705 }, { "epoch": 2.520822880080281, "grad_norm": 28.914111568133343, "learning_rate": 7.555832931406774e-08, "logits/chosen": 0.26896438002586365, "logits/rejected": 0.37224045395851135, "logps/chosen": -7.18133020401001, "logps/rejected": -8.617822647094727, "loss": 0.3977, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -7.18133020401001, "rewards/margins": 1.4364928007125854, "rewards/rejected": -8.617822647094727, "step": 4710 }, { "epoch": 2.5234989128616827, "grad_norm": 18.779354896943886, "learning_rate": 7.47371773349611e-08, "logits/chosen": 0.3385409414768219, "logits/rejected": 0.37672966718673706, "logps/chosen": -7.1719160079956055, "logps/rejected": -8.793294906616211, "loss": 0.329, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -7.1719160079956055, "rewards/margins": 1.6213775873184204, "rewards/rejected": -8.793294906616211, "step": 4715 }, { "epoch": 2.526174945643084, "grad_norm": 28.12587859347306, "learning_rate": 7.392015125762496e-08, "logits/chosen": 0.27432897686958313, "logits/rejected": 0.3799535036087036, "logps/chosen": -7.190088748931885, "logps/rejected": -8.653684616088867, "loss": 0.3755, "rewards/accuracies": 0.84375, "rewards/chosen": -7.190088748931885, "rewards/margins": 1.463595986366272, "rewards/rejected": -8.653684616088867, "step": 4720 }, { "epoch": 2.5288509784244857, "grad_norm": 21.82885512828381, "learning_rate": 7.310725900885018e-08, "logits/chosen": 0.24644596874713898, "logits/rejected": 0.3031538426876068, "logps/chosen": -7.093823432922363, "logps/rejected": -8.433025360107422, "loss": 0.4411, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -7.093823432922363, "rewards/margins": 1.339200496673584, "rewards/rejected": -8.433025360107422, "step": 4725 }, { "epoch": 2.5315270112058874, "grad_norm": 27.85694527808639, "learning_rate": 7.229850847532076e-08, "logits/chosen": 0.31186577677726746, "logits/rejected": 0.42418089509010315, "logps/chosen": -6.986525058746338, "logps/rejected": -8.633869171142578, "loss": 0.3238, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -6.986525058746338, "rewards/margins": 1.6473438739776611, "rewards/rejected": -8.633869171142578, "step": 4730 }, { "epoch": 2.5342030439872887, "grad_norm": 20.87903572643391, "learning_rate": 7.149390750353779e-08, "logits/chosen": 0.37633460760116577, "logits/rejected": 0.39099740982055664, "logps/chosen": -7.451706886291504, "logps/rejected": -8.744372367858887, "loss": 0.3871, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -7.451706886291504, "rewards/margins": 1.2926665544509888, "rewards/rejected": -8.744372367858887, "step": 4735 }, { "epoch": 2.5368790767686904, "grad_norm": 18.77330851618906, "learning_rate": 7.069346389974374e-08, "logits/chosen": 0.30393606424331665, "logits/rejected": 0.37179672718048096, "logps/chosen": -7.338356018066406, "logps/rejected": -8.634533882141113, "loss": 0.4028, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -7.338356018066406, "rewards/margins": 1.2961766719818115, "rewards/rejected": -8.634533882141113, "step": 4740 }, { "epoch": 2.539555109550092, "grad_norm": 23.59712413480673, "learning_rate": 6.989718542984563e-08, "logits/chosen": 0.3044324219226837, "logits/rejected": 0.3388569951057434, "logps/chosen": -7.444338321685791, "logps/rejected": -8.859186172485352, "loss": 0.3957, "rewards/accuracies": 0.8125, "rewards/chosen": -7.444338321685791, "rewards/margins": 1.4148471355438232, "rewards/rejected": -8.859186172485352, "step": 4745 }, { "epoch": 2.5422311423314934, "grad_norm": 19.646990543464785, "learning_rate": 6.9105079819341e-08, "logits/chosen": 0.33015909790992737, "logits/rejected": 0.4234229028224945, "logps/chosen": -7.158675193786621, "logps/rejected": -8.765896797180176, "loss": 0.3458, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -7.158675193786621, "rewards/margins": 1.6072216033935547, "rewards/rejected": -8.765896797180176, "step": 4750 }, { "epoch": 2.544907175112895, "grad_norm": 20.656463904382207, "learning_rate": 6.831715475324163e-08, "logits/chosen": 0.30524563789367676, "logits/rejected": 0.37293511629104614, "logps/chosen": -7.39168643951416, "logps/rejected": -8.968939781188965, "loss": 0.3735, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -7.39168643951416, "rewards/margins": 1.5772539377212524, "rewards/rejected": -8.968939781188965, "step": 4755 }, { "epoch": 2.547583207894297, "grad_norm": 19.234832714590443, "learning_rate": 6.753341787600026e-08, "logits/chosen": 0.29548680782318115, "logits/rejected": 0.34894752502441406, "logps/chosen": -7.09867000579834, "logps/rejected": -8.6600923538208, "loss": 0.3461, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -7.09867000579834, "rewards/margins": 1.5614221096038818, "rewards/rejected": -8.6600923538208, "step": 4760 }, { "epoch": 2.5502592406756985, "grad_norm": 24.052654052164538, "learning_rate": 6.67538767914353e-08, "logits/chosen": 0.25670138001441956, "logits/rejected": 0.3477746248245239, "logps/chosen": -7.168509006500244, "logps/rejected": -8.433395385742188, "loss": 0.445, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -7.168509006500244, "rewards/margins": 1.2648859024047852, "rewards/rejected": -8.433395385742188, "step": 4765 }, { "epoch": 2.5529352734571, "grad_norm": 25.44833497922028, "learning_rate": 6.597853906265793e-08, "logits/chosen": 0.3342982232570648, "logits/rejected": 0.39628344774246216, "logps/chosen": -7.300291538238525, "logps/rejected": -9.000889778137207, "loss": 0.3682, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -7.300291538238525, "rewards/margins": 1.700596809387207, "rewards/rejected": -9.000889778137207, "step": 4770 }, { "epoch": 2.5556113062385015, "grad_norm": 26.609098043431516, "learning_rate": 6.5207412211998e-08, "logits/chosen": 0.4072763919830322, "logits/rejected": 0.44860178232192993, "logps/chosen": -7.407323360443115, "logps/rejected": -8.868807792663574, "loss": 0.4339, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -7.407323360443115, "rewards/margins": 1.4614832401275635, "rewards/rejected": -8.868807792663574, "step": 4775 }, { "epoch": 2.558287339019903, "grad_norm": 21.999463868769016, "learning_rate": 6.444050372093186e-08, "logits/chosen": 0.281717449426651, "logits/rejected": 0.38245004415512085, "logps/chosen": -7.271409511566162, "logps/rejected": -8.580997467041016, "loss": 0.3977, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -7.271409511566162, "rewards/margins": 1.3095887899398804, "rewards/rejected": -8.580997467041016, "step": 4780 }, { "epoch": 2.5609633718013045, "grad_norm": 28.29891197246065, "learning_rate": 6.367782103000873e-08, "logits/chosen": 0.30486860871315, "logits/rejected": 0.3355456292629242, "logps/chosen": -7.189188480377197, "logps/rejected": -8.31476879119873, "loss": 0.4576, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -7.189188480377197, "rewards/margins": 1.1255815029144287, "rewards/rejected": -8.31476879119873, "step": 4785 }, { "epoch": 2.5636394045827062, "grad_norm": 24.31115334141798, "learning_rate": 6.29193715387798e-08, "logits/chosen": 0.29903197288513184, "logits/rejected": 0.366224467754364, "logps/chosen": -7.279676914215088, "logps/rejected": -8.806459426879883, "loss": 0.393, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -7.279676914215088, "rewards/margins": 1.5267828702926636, "rewards/rejected": -8.806459426879883, "step": 4790 }, { "epoch": 2.566315437364108, "grad_norm": 26.720788491942123, "learning_rate": 6.216516260572502e-08, "logits/chosen": 0.2929922342300415, "logits/rejected": 0.35973021388053894, "logps/chosen": -7.407988548278809, "logps/rejected": -8.832331657409668, "loss": 0.4076, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -7.407988548278809, "rewards/margins": 1.4243441820144653, "rewards/rejected": -8.832331657409668, "step": 4795 }, { "epoch": 2.568991470145509, "grad_norm": 18.49776775362057, "learning_rate": 6.141520154818297e-08, "logits/chosen": 0.31417933106422424, "logits/rejected": 0.35339245200157166, "logps/chosen": -7.163817405700684, "logps/rejected": -8.42949390411377, "loss": 0.4277, "rewards/accuracies": 0.78125, "rewards/chosen": -7.163817405700684, "rewards/margins": 1.2656759023666382, "rewards/rejected": -8.42949390411377, "step": 4800 }, { "epoch": 2.568991470145509, "eval_logits/chosen": 0.5142617225646973, "eval_logits/rejected": 0.5700955986976624, "eval_logps/chosen": -7.352376937866211, "eval_logps/rejected": -8.500706672668457, "eval_loss": 0.5188721418380737, "eval_rewards/accuracies": 0.7270029783248901, "eval_rewards/chosen": -7.352376937866211, "eval_rewards/margins": 1.148330569267273, "eval_rewards/rejected": -8.500706672668457, "eval_runtime": 40.7161, "eval_samples_per_second": 33.034, "eval_steps_per_second": 8.277, "step": 4800 }, { "epoch": 2.571667502926911, "grad_norm": 27.22546495159263, "learning_rate": 6.066949564227897e-08, "logits/chosen": 0.32233959436416626, "logits/rejected": 0.3736671507358551, "logps/chosen": -7.1079864501953125, "logps/rejected": -8.541547775268555, "loss": 0.4143, "rewards/accuracies": 0.78125, "rewards/chosen": -7.1079864501953125, "rewards/margins": 1.4335615634918213, "rewards/rejected": -8.541547775268555, "step": 4805 }, { "epoch": 2.574343535708312, "grad_norm": 21.138383102269678, "learning_rate": 5.992805212285523e-08, "logits/chosen": 0.32077568769454956, "logits/rejected": 0.36533135175704956, "logps/chosen": -7.15933084487915, "logps/rejected": -8.624617576599121, "loss": 0.3975, "rewards/accuracies": 0.8125, "rewards/chosen": -7.15933084487915, "rewards/margins": 1.4652879238128662, "rewards/rejected": -8.624617576599121, "step": 4810 }, { "epoch": 2.577019568489714, "grad_norm": 28.935094504472683, "learning_rate": 5.9190878183399684e-08, "logits/chosen": 0.3448185324668884, "logits/rejected": 0.3879143297672272, "logps/chosen": -7.008177757263184, "logps/rejected": -8.45622730255127, "loss": 0.4519, "rewards/accuracies": 0.78125, "rewards/chosen": -7.008177757263184, "rewards/margins": 1.4480500221252441, "rewards/rejected": -8.45622730255127, "step": 4815 }, { "epoch": 2.5796956012711156, "grad_norm": 25.942056855756686, "learning_rate": 5.845798097597748e-08, "logits/chosen": 0.326662540435791, "logits/rejected": 0.40040117502212524, "logps/chosen": -7.173343658447266, "logps/rejected": -8.433788299560547, "loss": 0.4308, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -7.173343658447266, "rewards/margins": 1.2604436874389648, "rewards/rejected": -8.433788299560547, "step": 4820 }, { "epoch": 2.5823716340525174, "grad_norm": 25.701774917079668, "learning_rate": 5.772936761116026e-08, "logits/chosen": 0.3510200083255768, "logits/rejected": 0.4387829303741455, "logps/chosen": -7.210471153259277, "logps/rejected": -8.626684188842773, "loss": 0.3976, "rewards/accuracies": 0.8125, "rewards/chosen": -7.210471153259277, "rewards/margins": 1.4162123203277588, "rewards/rejected": -8.626684188842773, "step": 4825 }, { "epoch": 2.5850476668339186, "grad_norm": 24.96598203097229, "learning_rate": 5.700504515795829e-08, "logits/chosen": 0.31674087047576904, "logits/rejected": 0.40286388993263245, "logps/chosen": -7.32675313949585, "logps/rejected": -8.66277027130127, "loss": 0.3997, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -7.32675313949585, "rewards/margins": 1.3360170125961304, "rewards/rejected": -8.66277027130127, "step": 4830 }, { "epoch": 2.5877236996153203, "grad_norm": 20.6215292472409, "learning_rate": 5.628502064375101e-08, "logits/chosen": 0.18289585411548615, "logits/rejected": 0.27206552028656006, "logps/chosen": -6.9815826416015625, "logps/rejected": -8.546882629394531, "loss": 0.3207, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -6.9815826416015625, "rewards/margins": 1.5653002262115479, "rewards/rejected": -8.546882629394531, "step": 4835 }, { "epoch": 2.5903997323967216, "grad_norm": 28.235651406248117, "learning_rate": 5.55693010542197e-08, "logits/chosen": 0.24837911128997803, "logits/rejected": 0.3524007797241211, "logps/chosen": -7.089817047119141, "logps/rejected": -8.576709747314453, "loss": 0.3611, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -7.089817047119141, "rewards/margins": 1.4868923425674438, "rewards/rejected": -8.576709747314453, "step": 4840 }, { "epoch": 2.5930757651781233, "grad_norm": 22.312101562846937, "learning_rate": 5.485789333327856e-08, "logits/chosen": 0.2977980375289917, "logits/rejected": 0.32573074102401733, "logps/chosen": -7.186008453369141, "logps/rejected": -8.48531723022461, "loss": 0.4043, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -7.186008453369141, "rewards/margins": 1.2993084192276, "rewards/rejected": -8.48531723022461, "step": 4845 }, { "epoch": 2.595751797959525, "grad_norm": 22.989257050188918, "learning_rate": 5.4150804383008675e-08, "logits/chosen": 0.243485689163208, "logits/rejected": 0.32594019174575806, "logps/chosen": -7.310733795166016, "logps/rejected": -8.759309768676758, "loss": 0.3977, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -7.310733795166016, "rewards/margins": 1.4485762119293213, "rewards/rejected": -8.759309768676758, "step": 4850 }, { "epoch": 2.5984278307409268, "grad_norm": 24.537077300732616, "learning_rate": 5.344804106359002e-08, "logits/chosen": 0.331149160861969, "logits/rejected": 0.4025397300720215, "logps/chosen": -6.973557949066162, "logps/rejected": -8.444169998168945, "loss": 0.3929, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -6.973557949066162, "rewards/margins": 1.470611572265625, "rewards/rejected": -8.444169998168945, "step": 4855 }, { "epoch": 2.601103863522328, "grad_norm": 23.454798925936434, "learning_rate": 5.274961019323559e-08, "logits/chosen": 0.23005084693431854, "logits/rejected": 0.27359023690223694, "logps/chosen": -6.997138023376465, "logps/rejected": -8.317842483520508, "loss": 0.4147, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -6.997138023376465, "rewards/margins": 1.320704698562622, "rewards/rejected": -8.317842483520508, "step": 4860 }, { "epoch": 2.6037798963037297, "grad_norm": 16.678527362647745, "learning_rate": 5.205551854812451e-08, "logits/chosen": 0.28112632036209106, "logits/rejected": 0.30581679940223694, "logps/chosen": -7.450681209564209, "logps/rejected": -8.888608932495117, "loss": 0.3963, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -7.450681209564209, "rewards/margins": 1.4379276037216187, "rewards/rejected": -8.888608932495117, "step": 4865 }, { "epoch": 2.606455929085131, "grad_norm": 19.853396077997953, "learning_rate": 5.1365772862337177e-08, "logits/chosen": 0.32495415210723877, "logits/rejected": 0.42128506302833557, "logps/chosen": -6.994874477386475, "logps/rejected": -8.742433547973633, "loss": 0.3018, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -6.994874477386475, "rewards/margins": 1.7475589513778687, "rewards/rejected": -8.742433547973633, "step": 4870 }, { "epoch": 2.6091319618665327, "grad_norm": 24.638355992950682, "learning_rate": 5.068037982778905e-08, "logits/chosen": 0.2811067998409271, "logits/rejected": 0.3503668010234833, "logps/chosen": -6.903725624084473, "logps/rejected": -8.329036712646484, "loss": 0.4218, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -6.903725624084473, "rewards/margins": 1.4253112077713013, "rewards/rejected": -8.329036712646484, "step": 4875 }, { "epoch": 2.6118079946479344, "grad_norm": 15.927123900195017, "learning_rate": 4.999934609416656e-08, "logits/chosen": 0.3937390446662903, "logits/rejected": 0.43355339765548706, "logps/chosen": -7.153407096862793, "logps/rejected": -8.826255798339844, "loss": 0.3563, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -7.153407096862793, "rewards/margins": 1.6728490591049194, "rewards/rejected": -8.826255798339844, "step": 4880 }, { "epoch": 2.614484027429336, "grad_norm": 18.668512874480736, "learning_rate": 4.932267826886183e-08, "logits/chosen": 0.32629144191741943, "logits/rejected": 0.38562673330307007, "logps/chosen": -7.244387149810791, "logps/rejected": -8.755474090576172, "loss": 0.3896, "rewards/accuracies": 0.8125, "rewards/chosen": -7.244387149810791, "rewards/margins": 1.5110862255096436, "rewards/rejected": -8.755474090576172, "step": 4885 }, { "epoch": 2.6171600602107374, "grad_norm": 26.204824180252302, "learning_rate": 4.8650382916909206e-08, "logits/chosen": 0.2891615331172943, "logits/rejected": 0.3577572703361511, "logps/chosen": -7.293282508850098, "logps/rejected": -8.706974983215332, "loss": 0.4249, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -7.293282508850098, "rewards/margins": 1.4136929512023926, "rewards/rejected": -8.706974983215332, "step": 4890 }, { "epoch": 2.619836092992139, "grad_norm": 19.313725443744854, "learning_rate": 4.7982466560920976e-08, "logits/chosen": 0.3168511688709259, "logits/rejected": 0.3902406692504883, "logps/chosen": -7.3736467361450195, "logps/rejected": -8.621143341064453, "loss": 0.4193, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -7.3736467361450195, "rewards/margins": 1.247496485710144, "rewards/rejected": -8.621143341064453, "step": 4895 }, { "epoch": 2.622512125773541, "grad_norm": 20.073515788529196, "learning_rate": 4.7318935681024685e-08, "logits/chosen": 0.30994993448257446, "logits/rejected": 0.4362601637840271, "logps/chosen": -7.3219313621521, "logps/rejected": -8.836444854736328, "loss": 0.3702, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -7.3219313621521, "rewards/margins": 1.5145145654678345, "rewards/rejected": -8.836444854736328, "step": 4900 }, { "epoch": 2.625188158554942, "grad_norm": 19.193194845738887, "learning_rate": 4.6659796714799745e-08, "logits/chosen": 0.30282360315322876, "logits/rejected": 0.4043620526790619, "logps/chosen": -7.17397928237915, "logps/rejected": -8.738012313842773, "loss": 0.3579, "rewards/accuracies": 0.84375, "rewards/chosen": -7.17397928237915, "rewards/margins": 1.5640318393707275, "rewards/rejected": -8.738012313842773, "step": 4905 }, { "epoch": 2.627864191336344, "grad_norm": 19.91668735706689, "learning_rate": 4.60050560572155e-08, "logits/chosen": 0.28438445925712585, "logits/rejected": 0.2902686595916748, "logps/chosen": -7.133607387542725, "logps/rejected": -8.729795455932617, "loss": 0.3925, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -7.133607387542725, "rewards/margins": 1.596187949180603, "rewards/rejected": -8.729795455932617, "step": 4910 }, { "epoch": 2.6305402241177456, "grad_norm": 26.620865121812862, "learning_rate": 4.535472006056834e-08, "logits/chosen": 0.2999909818172455, "logits/rejected": 0.37691730260849, "logps/chosen": -7.246764183044434, "logps/rejected": -8.570967674255371, "loss": 0.4258, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -7.246764183044434, "rewards/margins": 1.3242039680480957, "rewards/rejected": -8.570967674255371, "step": 4915 }, { "epoch": 2.6332162568991473, "grad_norm": 25.888492091913584, "learning_rate": 4.470879503442132e-08, "logits/chosen": 0.3285333812236786, "logits/rejected": 0.4007868766784668, "logps/chosen": -7.300829887390137, "logps/rejected": -8.755602836608887, "loss": 0.39, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -7.300829887390137, "rewards/margins": 1.4547721147537231, "rewards/rejected": -8.755602836608887, "step": 4920 }, { "epoch": 2.6358922896805486, "grad_norm": 19.733005871137284, "learning_rate": 4.406728724554154e-08, "logits/chosen": 0.24100670218467712, "logits/rejected": 0.3914957344532013, "logps/chosen": -7.3828444480896, "logps/rejected": -8.916400909423828, "loss": 0.3895, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -7.3828444480896, "rewards/margins": 1.5335559844970703, "rewards/rejected": -8.916400909423828, "step": 4925 }, { "epoch": 2.6385683224619503, "grad_norm": 20.52262689766509, "learning_rate": 4.3430202917840664e-08, "logits/chosen": 0.3017696738243103, "logits/rejected": 0.4047134518623352, "logps/chosen": -7.158440589904785, "logps/rejected": -8.729731559753418, "loss": 0.3838, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -7.158440589904785, "rewards/margins": 1.5712920427322388, "rewards/rejected": -8.729731559753418, "step": 4930 }, { "epoch": 2.6412443552433515, "grad_norm": 29.34076633960613, "learning_rate": 4.279754823231346e-08, "logits/chosen": 0.27591472864151, "logits/rejected": 0.38238704204559326, "logps/chosen": -7.194726467132568, "logps/rejected": -8.602941513061523, "loss": 0.4107, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -7.194726467132568, "rewards/margins": 1.4082155227661133, "rewards/rejected": -8.602941513061523, "step": 4935 }, { "epoch": 2.6439203880247533, "grad_norm": 20.38945462228945, "learning_rate": 4.216932932697859e-08, "logits/chosen": 0.284008651971817, "logits/rejected": 0.347798615694046, "logps/chosen": -7.161848545074463, "logps/rejected": -8.407276153564453, "loss": 0.4147, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -7.161848545074463, "rewards/margins": 1.2454277276992798, "rewards/rejected": -8.407276153564453, "step": 4940 }, { "epoch": 2.646596420806155, "grad_norm": 24.560300078420465, "learning_rate": 4.154555229681844e-08, "logits/chosen": 0.295249879360199, "logits/rejected": 0.41257724165916443, "logps/chosen": -7.251509189605713, "logps/rejected": -8.809806823730469, "loss": 0.3574, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -7.251509189605713, "rewards/margins": 1.5582975149154663, "rewards/rejected": -8.809806823730469, "step": 4945 }, { "epoch": 2.6492724535875567, "grad_norm": 27.75137398413972, "learning_rate": 4.092622319372069e-08, "logits/chosen": 0.3213416635990143, "logits/rejected": 0.39722609519958496, "logps/chosen": -7.246720790863037, "logps/rejected": -8.634997367858887, "loss": 0.4206, "rewards/accuracies": 0.8125, "rewards/chosen": -7.246720790863037, "rewards/margins": 1.3882768154144287, "rewards/rejected": -8.634997367858887, "step": 4950 }, { "epoch": 2.651948486368958, "grad_norm": 21.77932002210548, "learning_rate": 4.031134802641889e-08, "logits/chosen": 0.2759356200695038, "logits/rejected": 0.32287460565567017, "logps/chosen": -7.177241325378418, "logps/rejected": -8.51970100402832, "loss": 0.3792, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -7.177241325378418, "rewards/margins": 1.342458963394165, "rewards/rejected": -8.51970100402832, "step": 4955 }, { "epoch": 2.6546245191503597, "grad_norm": 18.220861137083197, "learning_rate": 3.970093276043468e-08, "logits/chosen": 0.3211168646812439, "logits/rejected": 0.40248903632164, "logps/chosen": -7.083096504211426, "logps/rejected": -8.582834243774414, "loss": 0.3675, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -7.083096504211426, "rewards/margins": 1.4997375011444092, "rewards/rejected": -8.582834243774414, "step": 4960 }, { "epoch": 2.657300551931761, "grad_norm": 30.878471639428035, "learning_rate": 3.9094983318019584e-08, "logits/chosen": 0.2704783082008362, "logits/rejected": 0.33513563871383667, "logps/chosen": -7.248069763183594, "logps/rejected": -8.768553733825684, "loss": 0.3721, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -7.248069763183594, "rewards/margins": 1.5204837322235107, "rewards/rejected": -8.768553733825684, "step": 4965 }, { "epoch": 2.6599765847131627, "grad_norm": 18.410544355665298, "learning_rate": 3.849350557809789e-08, "logits/chosen": 0.35101503133773804, "logits/rejected": 0.40395402908325195, "logps/chosen": -6.919222831726074, "logps/rejected": -8.395438194274902, "loss": 0.3548, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -6.919222831726074, "rewards/margins": 1.4762163162231445, "rewards/rejected": -8.395438194274902, "step": 4970 }, { "epoch": 2.6626526174945644, "grad_norm": 22.423297979060788, "learning_rate": 3.789650537620903e-08, "logits/chosen": 0.3210376799106598, "logits/rejected": 0.3646882474422455, "logps/chosen": -7.2892279624938965, "logps/rejected": -8.680535316467285, "loss": 0.3806, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -7.2892279624938965, "rewards/margins": 1.3913065195083618, "rewards/rejected": -8.680535316467285, "step": 4975 }, { "epoch": 2.665328650275966, "grad_norm": 19.500622646649507, "learning_rate": 3.730398850445182e-08, "logits/chosen": 0.36859241127967834, "logits/rejected": 0.38761404156684875, "logps/chosen": -7.493990898132324, "logps/rejected": -8.856212615966797, "loss": 0.4361, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -7.493990898132324, "rewards/margins": 1.3622210025787354, "rewards/rejected": -8.856212615966797, "step": 4980 }, { "epoch": 2.6680046830573674, "grad_norm": 24.202809757637297, "learning_rate": 3.671596071142735e-08, "logits/chosen": 0.3260084092617035, "logits/rejected": 0.41234469413757324, "logps/chosen": -7.222275733947754, "logps/rejected": -8.670363426208496, "loss": 0.4514, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -7.222275733947754, "rewards/margins": 1.4480884075164795, "rewards/rejected": -8.670363426208496, "step": 4985 }, { "epoch": 2.670680715838769, "grad_norm": 22.978865221738406, "learning_rate": 3.6132427702183996e-08, "logits/chosen": 0.2606358528137207, "logits/rejected": 0.3371516466140747, "logps/chosen": -7.061984062194824, "logps/rejected": -8.64496898651123, "loss": 0.3355, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -7.061984062194824, "rewards/margins": 1.5829856395721436, "rewards/rejected": -8.64496898651123, "step": 4990 }, { "epoch": 2.6733567486201704, "grad_norm": 22.476469339821513, "learning_rate": 3.555339513816147e-08, "logits/chosen": 0.30862298607826233, "logits/rejected": 0.33659136295318604, "logps/chosen": -7.384900093078613, "logps/rejected": -8.592907905578613, "loss": 0.457, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -7.384900093078613, "rewards/margins": 1.2080084085464478, "rewards/rejected": -8.592907905578613, "step": 4995 }, { "epoch": 2.676032781401572, "grad_norm": 18.040141519690703, "learning_rate": 3.497886863713639e-08, "logits/chosen": 0.2701326906681061, "logits/rejected": 0.2998841404914856, "logps/chosen": -7.201764106750488, "logps/rejected": -8.64393424987793, "loss": 0.4191, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -7.201764106750488, "rewards/margins": 1.4421710968017578, "rewards/rejected": -8.64393424987793, "step": 5000 }, { "epoch": 2.678708814182974, "grad_norm": 31.325861492511365, "learning_rate": 3.440885377316721e-08, "logits/chosen": 0.3548194169998169, "logits/rejected": 0.39042147994041443, "logps/chosen": -7.335655212402344, "logps/rejected": -8.545886039733887, "loss": 0.4371, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -7.335655212402344, "rewards/margins": 1.210229754447937, "rewards/rejected": -8.545886039733887, "step": 5005 }, { "epoch": 2.6813848469643755, "grad_norm": 26.934069228577144, "learning_rate": 3.384335607654082e-08, "logits/chosen": 0.2751379609107971, "logits/rejected": 0.3402464985847473, "logps/chosen": -7.133413791656494, "logps/rejected": -8.54039192199707, "loss": 0.3628, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -7.133413791656494, "rewards/margins": 1.4069788455963135, "rewards/rejected": -8.54039192199707, "step": 5010 }, { "epoch": 2.684060879745777, "grad_norm": 22.039623140353005, "learning_rate": 3.328238103371811e-08, "logits/chosen": 0.30774179100990295, "logits/rejected": 0.3615073561668396, "logps/chosen": -7.2347002029418945, "logps/rejected": -8.706754684448242, "loss": 0.3638, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -7.2347002029418945, "rewards/margins": 1.4720547199249268, "rewards/rejected": -8.706754684448242, "step": 5015 }, { "epoch": 2.6867369125271785, "grad_norm": 27.933589503194142, "learning_rate": 3.272593408728169e-08, "logits/chosen": 0.2784278988838196, "logits/rejected": 0.3821938633918762, "logps/chosen": -7.128647804260254, "logps/rejected": -8.447057723999023, "loss": 0.4108, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -7.128647804260254, "rewards/margins": 1.3184096813201904, "rewards/rejected": -8.447057723999023, "step": 5020 }, { "epoch": 2.6894129453085798, "grad_norm": 19.219139574723425, "learning_rate": 3.217402063588204e-08, "logits/chosen": 0.2328338921070099, "logits/rejected": 0.3208829462528229, "logps/chosen": -7.203301429748535, "logps/rejected": -8.570844650268555, "loss": 0.4089, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -7.203301429748535, "rewards/margins": 1.3675434589385986, "rewards/rejected": -8.570844650268555, "step": 5025 }, { "epoch": 2.6920889780899815, "grad_norm": 20.339232276905847, "learning_rate": 3.162664603418608e-08, "logits/chosen": 0.3106628656387329, "logits/rejected": 0.36122971773147583, "logps/chosen": -7.118054389953613, "logps/rejected": -8.627669334411621, "loss": 0.3861, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -7.118054389953613, "rewards/margins": 1.5096149444580078, "rewards/rejected": -8.627669334411621, "step": 5030 }, { "epoch": 2.694765010871383, "grad_norm": 33.21487833526446, "learning_rate": 3.1083815592824416e-08, "logits/chosen": 0.26000967621803284, "logits/rejected": 0.34797805547714233, "logps/chosen": -7.431216239929199, "logps/rejected": -8.74619197845459, "loss": 0.4082, "rewards/accuracies": 0.8125, "rewards/chosen": -7.431216239929199, "rewards/margins": 1.3149755001068115, "rewards/rejected": -8.74619197845459, "step": 5035 }, { "epoch": 2.697441043652785, "grad_norm": 21.86444112381381, "learning_rate": 3.054553457834053e-08, "logits/chosen": 0.4372014105319977, "logits/rejected": 0.4418373107910156, "logps/chosen": -7.361603736877441, "logps/rejected": -8.666631698608398, "loss": 0.3984, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -7.361603736877441, "rewards/margins": 1.3050282001495361, "rewards/rejected": -8.666631698608398, "step": 5040 }, { "epoch": 2.700117076434186, "grad_norm": 27.53360194840201, "learning_rate": 3.0011808213139036e-08, "logits/chosen": 0.3539176881313324, "logits/rejected": 0.3691009283065796, "logps/chosen": -7.237766265869141, "logps/rejected": -8.570918083190918, "loss": 0.4013, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -7.237766265869141, "rewards/margins": 1.3331520557403564, "rewards/rejected": -8.570918083190918, "step": 5045 }, { "epoch": 2.702793109215588, "grad_norm": 22.36973704960858, "learning_rate": 2.948264167543568e-08, "logits/chosen": 0.282431423664093, "logits/rejected": 0.3295731246471405, "logps/chosen": -6.9825944900512695, "logps/rejected": -8.267919540405273, "loss": 0.4033, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -6.9825944900512695, "rewards/margins": 1.2853238582611084, "rewards/rejected": -8.267919540405273, "step": 5050 }, { "epoch": 2.7054691419969896, "grad_norm": 21.352961619351646, "learning_rate": 2.8958040099206216e-08, "logits/chosen": 0.21210503578186035, "logits/rejected": 0.2868809401988983, "logps/chosen": -6.9668779373168945, "logps/rejected": -8.4452486038208, "loss": 0.3604, "rewards/accuracies": 0.84375, "rewards/chosen": -6.9668779373168945, "rewards/margins": 1.4783704280853271, "rewards/rejected": -8.4452486038208, "step": 5055 }, { "epoch": 2.708145174778391, "grad_norm": 23.923043011176013, "learning_rate": 2.843800857413775e-08, "logits/chosen": 0.2966536581516266, "logits/rejected": 0.3478633463382721, "logps/chosen": -7.118401527404785, "logps/rejected": -8.413106918334961, "loss": 0.4516, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -7.118401527404785, "rewards/margins": 1.294704794883728, "rewards/rejected": -8.413106918334961, "step": 5060 }, { "epoch": 2.7108212075597926, "grad_norm": 28.680252569240125, "learning_rate": 2.7922552145578203e-08, "logits/chosen": 0.3281368613243103, "logits/rejected": 0.42749959230422974, "logps/chosen": -6.957740783691406, "logps/rejected": -8.415166854858398, "loss": 0.3917, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -6.957740783691406, "rewards/margins": 1.45742666721344, "rewards/rejected": -8.415166854858398, "step": 5065 }, { "epoch": 2.7134972403411943, "grad_norm": 33.957208247121216, "learning_rate": 2.7411675814488277e-08, "logits/chosen": 0.3737146258354187, "logits/rejected": 0.4640532433986664, "logps/chosen": -7.112711429595947, "logps/rejected": -8.404443740844727, "loss": 0.3852, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -7.112711429595947, "rewards/margins": 1.2917325496673584, "rewards/rejected": -8.404443740844727, "step": 5070 }, { "epoch": 2.7161732731225956, "grad_norm": 28.088078068674758, "learning_rate": 2.690538453739216e-08, "logits/chosen": 0.34561339020729065, "logits/rejected": 0.3797042965888977, "logps/chosen": -7.065415382385254, "logps/rejected": -8.171833038330078, "loss": 0.4853, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -7.065415382385254, "rewards/margins": 1.106418251991272, "rewards/rejected": -8.171833038330078, "step": 5075 }, { "epoch": 2.7188493059039973, "grad_norm": 20.793908315976935, "learning_rate": 2.6403683226330298e-08, "logits/chosen": 0.25794026255607605, "logits/rejected": 0.34034374356269836, "logps/chosen": -7.146594047546387, "logps/rejected": -8.529902458190918, "loss": 0.4069, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -7.146594047546387, "rewards/margins": 1.3833087682724, "rewards/rejected": -8.529902458190918, "step": 5080 }, { "epoch": 2.721525338685399, "grad_norm": 28.081692750716353, "learning_rate": 2.5906576748810804e-08, "logits/chosen": 0.2502826452255249, "logits/rejected": 0.3049445152282715, "logps/chosen": -7.016318321228027, "logps/rejected": -8.592795372009277, "loss": 0.3467, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -7.016318321228027, "rewards/margins": 1.5764776468276978, "rewards/rejected": -8.592795372009277, "step": 5085 }, { "epoch": 2.7242013714668003, "grad_norm": 23.910675449437182, "learning_rate": 2.5414069927763016e-08, "logits/chosen": 0.30048057436943054, "logits/rejected": 0.3917124271392822, "logps/chosen": -7.372959136962891, "logps/rejected": -8.837319374084473, "loss": 0.3729, "rewards/accuracies": 0.84375, "rewards/chosen": -7.372959136962891, "rewards/margins": 1.4643598794937134, "rewards/rejected": -8.837319374084473, "step": 5090 }, { "epoch": 2.726877404248202, "grad_norm": 21.833051982511005, "learning_rate": 2.4926167541490185e-08, "logits/chosen": 0.17824986577033997, "logits/rejected": 0.29352328181266785, "logps/chosen": -7.115069389343262, "logps/rejected": -8.654746055603027, "loss": 0.3981, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -7.115069389343262, "rewards/margins": 1.5396769046783447, "rewards/rejected": -8.654746055603027, "step": 5095 }, { "epoch": 2.7295534370296037, "grad_norm": 17.571881814287618, "learning_rate": 2.4442874323623574e-08, "logits/chosen": 0.34478524327278137, "logits/rejected": 0.39487963914871216, "logps/chosen": -7.267444610595703, "logps/rejected": -8.685300827026367, "loss": 0.4282, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -7.267444610595703, "rewards/margins": 1.4178565740585327, "rewards/rejected": -8.685300827026367, "step": 5100 }, { "epoch": 2.7322294698110055, "grad_norm": 18.5131885704943, "learning_rate": 2.396419496307589e-08, "logits/chosen": 0.2899494767189026, "logits/rejected": 0.3831739127635956, "logps/chosen": -7.339287757873535, "logps/rejected": -8.778003692626953, "loss": 0.3914, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -7.339287757873535, "rewards/margins": 1.4387154579162598, "rewards/rejected": -8.778003692626953, "step": 5105 }, { "epoch": 2.7349055025924067, "grad_norm": 20.745330700870873, "learning_rate": 2.349013410399653e-08, "logits/chosen": 0.2776733338832855, "logits/rejected": 0.3346291184425354, "logps/chosen": -7.195461273193359, "logps/rejected": -8.564038276672363, "loss": 0.4493, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -7.195461273193359, "rewards/margins": 1.368577241897583, "rewards/rejected": -8.564038276672363, "step": 5110 }, { "epoch": 2.7375815353738084, "grad_norm": 21.679365768702265, "learning_rate": 2.3020696345725954e-08, "logits/chosen": 0.2596895098686218, "logits/rejected": 0.36548155546188354, "logps/chosen": -7.302142143249512, "logps/rejected": -8.857519149780273, "loss": 0.3325, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -7.302142143249512, "rewards/margins": 1.5553773641586304, "rewards/rejected": -8.857519149780273, "step": 5115 }, { "epoch": 2.7402575681552097, "grad_norm": 24.408239632096212, "learning_rate": 2.2555886242751398e-08, "logits/chosen": 0.30755752325057983, "logits/rejected": 0.36671942472457886, "logps/chosen": -7.259624481201172, "logps/rejected": -8.605215072631836, "loss": 0.3796, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -7.259624481201172, "rewards/margins": 1.3455908298492432, "rewards/rejected": -8.605215072631836, "step": 5120 }, { "epoch": 2.7429336009366114, "grad_norm": 32.28298872859025, "learning_rate": 2.2095708304662453e-08, "logits/chosen": 0.2339930534362793, "logits/rejected": 0.37787026166915894, "logps/chosen": -7.102840423583984, "logps/rejected": -8.613138198852539, "loss": 0.3756, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -7.102840423583984, "rewards/margins": 1.5102989673614502, "rewards/rejected": -8.613138198852539, "step": 5125 }, { "epoch": 2.745609633718013, "grad_norm": 27.25234540171007, "learning_rate": 2.16401669961076e-08, "logits/chosen": 0.23947136104106903, "logits/rejected": 0.337904155254364, "logps/chosen": -7.167047023773193, "logps/rejected": -8.601274490356445, "loss": 0.405, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -7.167047023773193, "rewards/margins": 1.4342281818389893, "rewards/rejected": -8.601274490356445, "step": 5130 }, { "epoch": 2.748285666499415, "grad_norm": 30.282701033883782, "learning_rate": 2.1189266736750532e-08, "logits/chosen": 0.3645736575126648, "logits/rejected": 0.42372456192970276, "logps/chosen": -7.2429399490356445, "logps/rejected": -8.54863166809082, "loss": 0.4022, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -7.2429399490356445, "rewards/margins": 1.3056929111480713, "rewards/rejected": -8.54863166809082, "step": 5135 }, { "epoch": 2.750961699280816, "grad_norm": 20.757990695671722, "learning_rate": 2.0743011901227623e-08, "logits/chosen": 0.3731573522090912, "logits/rejected": 0.48451048135757446, "logps/chosen": -7.3839921951293945, "logps/rejected": -8.832746505737305, "loss": 0.3719, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -7.3839921951293945, "rewards/margins": 1.448754906654358, "rewards/rejected": -8.832746505737305, "step": 5140 }, { "epoch": 2.753637732062218, "grad_norm": 35.05864234841248, "learning_rate": 2.030140681910508e-08, "logits/chosen": 0.36153721809387207, "logits/rejected": 0.4401648938655853, "logps/chosen": -7.37774133682251, "logps/rejected": -8.738704681396484, "loss": 0.4173, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -7.37774133682251, "rewards/margins": 1.3609635829925537, "rewards/rejected": -8.738704681396484, "step": 5145 }, { "epoch": 2.756313764843619, "grad_norm": 25.34270308338845, "learning_rate": 1.986445577483753e-08, "logits/chosen": 0.2994186580181122, "logits/rejected": 0.35864385962486267, "logps/chosen": -7.163787841796875, "logps/rejected": -8.589695930480957, "loss": 0.4125, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -7.163787841796875, "rewards/margins": 1.4259079694747925, "rewards/rejected": -8.589695930480957, "step": 5150 }, { "epoch": 2.758989797625021, "grad_norm": 20.14550022690791, "learning_rate": 1.9432163007725765e-08, "logits/chosen": 0.26159173250198364, "logits/rejected": 0.31221261620521545, "logps/chosen": -7.068418979644775, "logps/rejected": -8.480939865112305, "loss": 0.4001, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -7.068418979644775, "rewards/margins": 1.4125207662582397, "rewards/rejected": -8.480939865112305, "step": 5155 }, { "epoch": 2.7616658304064226, "grad_norm": 18.241250850095277, "learning_rate": 1.9004532711876297e-08, "logits/chosen": 0.26252037286758423, "logits/rejected": 0.2940872311592102, "logps/chosen": -6.948674201965332, "logps/rejected": -8.454702377319336, "loss": 0.3775, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -6.948674201965332, "rewards/margins": 1.5060292482376099, "rewards/rejected": -8.454702377319336, "step": 5160 }, { "epoch": 2.7643418631878243, "grad_norm": 23.50662860562689, "learning_rate": 1.8581569036159928e-08, "logits/chosen": 0.2692853808403015, "logits/rejected": 0.3535127341747284, "logps/chosen": -7.194221496582031, "logps/rejected": -8.616966247558594, "loss": 0.4245, "rewards/accuracies": 0.8125, "rewards/chosen": -7.194221496582031, "rewards/margins": 1.4227447509765625, "rewards/rejected": -8.616966247558594, "step": 5165 }, { "epoch": 2.7670178959692255, "grad_norm": 23.615204575666915, "learning_rate": 1.8163276084172285e-08, "logits/chosen": 0.32568153738975525, "logits/rejected": 0.41400790214538574, "logps/chosen": -7.383831977844238, "logps/rejected": -8.821314811706543, "loss": 0.376, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -7.383831977844238, "rewards/margins": 1.4374825954437256, "rewards/rejected": -8.821314811706543, "step": 5170 }, { "epoch": 2.7696939287506273, "grad_norm": 22.220878556127776, "learning_rate": 1.7749657914193194e-08, "logits/chosen": 0.310195654630661, "logits/rejected": 0.3940275311470032, "logps/chosen": -7.422992706298828, "logps/rejected": -9.003042221069336, "loss": 0.3443, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -7.422992706298828, "rewards/margins": 1.5800496339797974, "rewards/rejected": -9.003042221069336, "step": 5175 }, { "epoch": 2.7723699615320285, "grad_norm": 30.95902756631437, "learning_rate": 1.7340718539148203e-08, "logits/chosen": 0.3520205318927765, "logits/rejected": 0.3604137897491455, "logps/chosen": -7.535077095031738, "logps/rejected": -8.801849365234375, "loss": 0.4171, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -7.535077095031738, "rewards/margins": 1.2667723894119263, "rewards/rejected": -8.801849365234375, "step": 5180 }, { "epoch": 2.7750459943134302, "grad_norm": 22.62233399647259, "learning_rate": 1.6936461926568724e-08, "logits/chosen": 0.3163068890571594, "logits/rejected": 0.36980319023132324, "logps/chosen": -7.072035789489746, "logps/rejected": -8.552950859069824, "loss": 0.4274, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -7.072035789489746, "rewards/margins": 1.4809151887893677, "rewards/rejected": -8.552950859069824, "step": 5185 }, { "epoch": 2.777722027094832, "grad_norm": 21.637116411562364, "learning_rate": 1.6536891998554346e-08, "logits/chosen": 0.25053638219833374, "logits/rejected": 0.3379058241844177, "logps/chosen": -7.1809587478637695, "logps/rejected": -8.58250904083252, "loss": 0.3729, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -7.1809587478637695, "rewards/margins": 1.4015495777130127, "rewards/rejected": -8.58250904083252, "step": 5190 }, { "epoch": 2.7803980598762337, "grad_norm": 26.567198421529813, "learning_rate": 1.6142012631734093e-08, "logits/chosen": 0.3163151144981384, "logits/rejected": 0.4019327759742737, "logps/chosen": -7.117714881896973, "logps/rejected": -8.575161933898926, "loss": 0.3823, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -7.117714881896973, "rewards/margins": 1.4574459791183472, "rewards/rejected": -8.575161933898926, "step": 5195 }, { "epoch": 2.783074092657635, "grad_norm": 28.255256602319154, "learning_rate": 1.575182765722949e-08, "logits/chosen": 0.2213568240404129, "logits/rejected": 0.3056579530239105, "logps/chosen": -7.2496819496154785, "logps/rejected": -8.641031265258789, "loss": 0.3999, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -7.2496819496154785, "rewards/margins": 1.3913477659225464, "rewards/rejected": -8.641031265258789, "step": 5200 }, { "epoch": 2.783074092657635, "eval_logits/chosen": 0.4985702931880951, "eval_logits/rejected": 0.5521857142448425, "eval_logps/chosen": -7.4281415939331055, "eval_logps/rejected": -8.578861236572266, "eval_loss": 0.5186934471130371, "eval_rewards/accuracies": 0.7292284965515137, "eval_rewards/chosen": -7.4281415939331055, "eval_rewards/margins": 1.1507208347320557, "eval_rewards/rejected": -8.578861236572266, "eval_runtime": 40.6346, "eval_samples_per_second": 33.1, "eval_steps_per_second": 8.293, "step": 5200 }, { "epoch": 2.7857501254390367, "grad_norm": 15.32657315622752, "learning_rate": 1.536634086061672e-08, "logits/chosen": 0.3475949168205261, "logits/rejected": 0.38415712118148804, "logps/chosen": -7.22928524017334, "logps/rejected": -8.614801406860352, "loss": 0.4015, "rewards/accuracies": 0.8125, "rewards/chosen": -7.22928524017334, "rewards/margins": 1.3855170011520386, "rewards/rejected": -8.614801406860352, "step": 5205 }, { "epoch": 2.788426158220438, "grad_norm": 25.5224557239706, "learning_rate": 1.4985555981890495e-08, "logits/chosen": 0.3383334279060364, "logits/rejected": 0.3926554322242737, "logps/chosen": -7.228939056396484, "logps/rejected": -8.658843994140625, "loss": 0.4083, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -7.228939056396484, "rewards/margins": 1.4299055337905884, "rewards/rejected": -8.658843994140625, "step": 5210 }, { "epoch": 2.7911021910018396, "grad_norm": 20.220988915641463, "learning_rate": 1.4609476715427226e-08, "logits/chosen": 0.337196409702301, "logits/rejected": 0.3915210962295532, "logps/chosen": -6.936718940734863, "logps/rejected": -8.387724876403809, "loss": 0.377, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -6.936718940734863, "rewards/margins": 1.4510059356689453, "rewards/rejected": -8.387724876403809, "step": 5215 }, { "epoch": 2.7937782237832414, "grad_norm": 20.69651339040928, "learning_rate": 1.4238106709949792e-08, "logits/chosen": 0.23444592952728271, "logits/rejected": 0.3056947886943817, "logps/chosen": -7.1570000648498535, "logps/rejected": -8.737861633300781, "loss": 0.3399, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -7.1570000648498535, "rewards/margins": 1.5808613300323486, "rewards/rejected": -8.737861633300781, "step": 5220 }, { "epoch": 2.796454256564643, "grad_norm": 31.463176088927963, "learning_rate": 1.3871449568491511e-08, "logits/chosen": 0.27287358045578003, "logits/rejected": 0.3815121054649353, "logps/chosen": -7.238094329833984, "logps/rejected": -8.578378677368164, "loss": 0.407, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -7.238094329833984, "rewards/margins": 1.3402849435806274, "rewards/rejected": -8.578378677368164, "step": 5225 }, { "epoch": 2.7991302893460444, "grad_norm": 15.823718720506902, "learning_rate": 1.3509508848361606e-08, "logits/chosen": 0.24542060494422913, "logits/rejected": 0.31007397174835205, "logps/chosen": -7.2425384521484375, "logps/rejected": -8.627290725708008, "loss": 0.3869, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -7.2425384521484375, "rewards/margins": 1.3847519159317017, "rewards/rejected": -8.627290725708008, "step": 5230 }, { "epoch": 2.801806322127446, "grad_norm": 16.347088854499393, "learning_rate": 1.3152288061110517e-08, "logits/chosen": 0.23816080391407013, "logits/rejected": 0.32516372203826904, "logps/chosen": -7.032590389251709, "logps/rejected": -8.535174369812012, "loss": 0.3637, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -7.032590389251709, "rewards/margins": 1.502583622932434, "rewards/rejected": -8.535174369812012, "step": 5235 }, { "epoch": 2.804482354908848, "grad_norm": 24.2897104027007, "learning_rate": 1.2799790672495814e-08, "logits/chosen": 0.28313878178596497, "logits/rejected": 0.38212865591049194, "logps/chosen": -7.175711154937744, "logps/rejected": -8.717257499694824, "loss": 0.3709, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -7.175711154937744, "rewards/margins": 1.5415469408035278, "rewards/rejected": -8.717257499694824, "step": 5240 }, { "epoch": 2.807158387690249, "grad_norm": 23.74238248175098, "learning_rate": 1.2452020102448835e-08, "logits/chosen": 0.2967221438884735, "logits/rejected": 0.3341226577758789, "logps/chosen": -7.1786603927612305, "logps/rejected": -8.562417984008789, "loss": 0.3904, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -7.1786603927612305, "rewards/margins": 1.3837578296661377, "rewards/rejected": -8.562417984008789, "step": 5245 }, { "epoch": 2.8098344204716508, "grad_norm": 27.349159997965547, "learning_rate": 1.2108979725041103e-08, "logits/chosen": 0.22132794559001923, "logits/rejected": 0.3450946509838104, "logps/chosen": -7.286431789398193, "logps/rejected": -8.768219947814941, "loss": 0.3948, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -7.286431789398193, "rewards/margins": 1.4817878007888794, "rewards/rejected": -8.768219947814941, "step": 5250 }, { "epoch": 2.8125104532530525, "grad_norm": 28.123630565180832, "learning_rate": 1.1770672868451958e-08, "logits/chosen": 0.31627506017684937, "logits/rejected": 0.4306565821170807, "logps/chosen": -7.582350730895996, "logps/rejected": -8.978840827941895, "loss": 0.3782, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -7.582350730895996, "rewards/margins": 1.3964893817901611, "rewards/rejected": -8.978840827941895, "step": 5255 }, { "epoch": 2.8151864860344538, "grad_norm": 30.84440787136363, "learning_rate": 1.1437102814935872e-08, "logits/chosen": 0.28158897161483765, "logits/rejected": 0.3064124584197998, "logps/chosen": -7.2543745040893555, "logps/rejected": -8.558185577392578, "loss": 0.4547, "rewards/accuracies": 0.8125, "rewards/chosen": -7.2543745040893555, "rewards/margins": 1.303810954093933, "rewards/rejected": -8.558185577392578, "step": 5260 }, { "epoch": 2.8178625188158555, "grad_norm": 19.607178220249814, "learning_rate": 1.1108272800791018e-08, "logits/chosen": 0.27760350704193115, "logits/rejected": 0.36912697553634644, "logps/chosen": -7.3151702880859375, "logps/rejected": -8.707416534423828, "loss": 0.3825, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -7.3151702880859375, "rewards/margins": 1.392246961593628, "rewards/rejected": -8.707416534423828, "step": 5265 }, { "epoch": 2.820538551597257, "grad_norm": 23.23363895287951, "learning_rate": 1.078418601632769e-08, "logits/chosen": 0.37443819642066956, "logits/rejected": 0.42989540100097656, "logps/chosen": -7.3462653160095215, "logps/rejected": -8.788736343383789, "loss": 0.361, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -7.3462653160095215, "rewards/margins": 1.4424704313278198, "rewards/rejected": -8.788736343383789, "step": 5270 }, { "epoch": 2.8232145843786585, "grad_norm": 17.801517904386742, "learning_rate": 1.0464845605837159e-08, "logits/chosen": 0.2990570664405823, "logits/rejected": 0.388136088848114, "logps/chosen": -7.282112121582031, "logps/rejected": -8.657365798950195, "loss": 0.3602, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -7.282112121582031, "rewards/margins": 1.375253438949585, "rewards/rejected": -8.657365798950195, "step": 5275 }, { "epoch": 2.82589061716006, "grad_norm": 17.822232994593257, "learning_rate": 1.0150254667561642e-08, "logits/chosen": 0.2961115837097168, "logits/rejected": 0.3711056411266327, "logps/chosen": -7.567373752593994, "logps/rejected": -9.089851379394531, "loss": 0.3727, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -7.567373752593994, "rewards/margins": 1.5224769115447998, "rewards/rejected": -9.089851379394531, "step": 5280 }, { "epoch": 2.828566649941462, "grad_norm": 24.539190069467047, "learning_rate": 9.840416253663719e-09, "logits/chosen": 0.2580451965332031, "logits/rejected": 0.3410447835922241, "logps/chosen": -7.294611930847168, "logps/rejected": -8.85181713104248, "loss": 0.3599, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -7.294611930847168, "rewards/margins": 1.5572036504745483, "rewards/rejected": -8.85181713104248, "step": 5285 }, { "epoch": 2.8312426827228636, "grad_norm": 23.968199198485376, "learning_rate": 9.535333370197074e-09, "logits/chosen": 0.31158167123794556, "logits/rejected": 0.39575880765914917, "logps/chosen": -7.369777679443359, "logps/rejected": -8.826897621154785, "loss": 0.3733, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -7.369777679443359, "rewards/margins": 1.4571194648742676, "rewards/rejected": -8.826897621154785, "step": 5290 }, { "epoch": 2.833918715504265, "grad_norm": 20.893021665662875, "learning_rate": 9.23500897707713e-09, "logits/chosen": 0.2715266942977905, "logits/rejected": 0.37386825680732727, "logps/chosen": -7.475455284118652, "logps/rejected": -8.934734344482422, "loss": 0.3954, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -7.475455284118652, "rewards/margins": 1.4592790603637695, "rewards/rejected": -8.934734344482422, "step": 5295 }, { "epoch": 2.8365947482856666, "grad_norm": 23.044043166401494, "learning_rate": 8.939445988052574e-09, "logits/chosen": 0.28303730487823486, "logits/rejected": 0.33819717168807983, "logps/chosen": -7.185553073883057, "logps/rejected": -8.694276809692383, "loss": 0.3673, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -7.185553073883057, "rewards/margins": 1.5087233781814575, "rewards/rejected": -8.694276809692383, "step": 5300 }, { "epoch": 2.839270781067068, "grad_norm": 28.463542985017366, "learning_rate": 8.648647270676656e-09, "logits/chosen": 0.34945443272590637, "logits/rejected": 0.3732437789440155, "logps/chosen": -7.386940002441406, "logps/rejected": -8.709967613220215, "loss": 0.4293, "rewards/accuracies": 0.78125, "rewards/chosen": -7.386940002441406, "rewards/margins": 1.3230271339416504, "rewards/rejected": -8.709967613220215, "step": 5305 }, { "epoch": 2.8419468138484696, "grad_norm": 18.763175516605287, "learning_rate": 8.362615646279991e-09, "logits/chosen": 0.2666058838367462, "logits/rejected": 0.361070454120636, "logps/chosen": -7.223726749420166, "logps/rejected": -8.906436920166016, "loss": 0.3823, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -7.223726749420166, "rewards/margins": 1.6827106475830078, "rewards/rejected": -8.906436920166016, "step": 5310 }, { "epoch": 2.8446228466298713, "grad_norm": 30.209035784604183, "learning_rate": 8.081353889942466e-09, "logits/chosen": 0.39111408591270447, "logits/rejected": 0.4997267723083496, "logps/chosen": -7.301754951477051, "logps/rejected": -8.610307693481445, "loss": 0.4132, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -7.301754951477051, "rewards/margins": 1.3085535764694214, "rewards/rejected": -8.610307693481445, "step": 5315 }, { "epoch": 2.847298879411273, "grad_norm": 23.010072750215983, "learning_rate": 7.804864730467042e-09, "logits/chosen": 0.37200552225112915, "logits/rejected": 0.4101128578186035, "logps/chosen": -7.388136386871338, "logps/rejected": -8.815114974975586, "loss": 0.369, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -7.388136386871338, "rewards/margins": 1.4269793033599854, "rewards/rejected": -8.815114974975586, "step": 5320 }, { "epoch": 2.8499749121926743, "grad_norm": 21.52321885906159, "learning_rate": 7.533150850352665e-09, "logits/chosen": 0.29464906454086304, "logits/rejected": 0.40461841225624084, "logps/chosen": -7.250817775726318, "logps/rejected": -8.884664535522461, "loss": 0.349, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -7.250817775726318, "rewards/margins": 1.6338462829589844, "rewards/rejected": -8.884664535522461, "step": 5325 }, { "epoch": 2.852650944974076, "grad_norm": 27.332720861161917, "learning_rate": 7.2662148857686175e-09, "logits/chosen": 0.3459104895591736, "logits/rejected": 0.393253892660141, "logps/chosen": -7.330377101898193, "logps/rejected": -8.819860458374023, "loss": 0.4193, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -7.330377101898193, "rewards/margins": 1.4894819259643555, "rewards/rejected": -8.819860458374023, "step": 5330 }, { "epoch": 2.8553269777554773, "grad_norm": 22.516439965147775, "learning_rate": 7.0040594265287635e-09, "logits/chosen": 0.3407040238380432, "logits/rejected": 0.33243387937545776, "logps/chosen": -7.146331787109375, "logps/rejected": -8.360511779785156, "loss": 0.4369, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -7.146331787109375, "rewards/margins": 1.2141797542572021, "rewards/rejected": -8.360511779785156, "step": 5335 }, { "epoch": 2.858003010536879, "grad_norm": 20.88719412023568, "learning_rate": 6.746687016066566e-09, "logits/chosen": 0.33555251359939575, "logits/rejected": 0.3980465233325958, "logps/chosen": -7.211967468261719, "logps/rejected": -8.739933013916016, "loss": 0.374, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -7.211967468261719, "rewards/margins": 1.5279653072357178, "rewards/rejected": -8.739933013916016, "step": 5340 }, { "epoch": 2.8606790433182807, "grad_norm": 24.42773754717571, "learning_rate": 6.494100151410276e-09, "logits/chosen": 0.2551911473274231, "logits/rejected": 0.33627766370773315, "logps/chosen": -7.327950954437256, "logps/rejected": -8.691459655761719, "loss": 0.3813, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -7.327950954437256, "rewards/margins": 1.3635084629058838, "rewards/rejected": -8.691459655761719, "step": 5345 }, { "epoch": 2.8633550760996824, "grad_norm": 20.306382709025538, "learning_rate": 6.246301283158728e-09, "logits/chosen": 0.34484735131263733, "logits/rejected": 0.3669079542160034, "logps/chosen": -7.169123649597168, "logps/rejected": -8.392450332641602, "loss": 0.4706, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -7.169123649597168, "rewards/margins": 1.2233270406723022, "rewards/rejected": -8.392450332641602, "step": 5350 }, { "epoch": 2.8660311088810837, "grad_norm": 20.593025785635714, "learning_rate": 6.0032928154576944e-09, "logits/chosen": 0.3364306688308716, "logits/rejected": 0.3957788348197937, "logps/chosen": -7.352032661437988, "logps/rejected": -8.578702926635742, "loss": 0.4222, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -7.352032661437988, "rewards/margins": 1.2266703844070435, "rewards/rejected": -8.578702926635742, "step": 5355 }, { "epoch": 2.8687071416624854, "grad_norm": 30.104065152713716, "learning_rate": 5.76507710597629e-09, "logits/chosen": 0.3110693097114563, "logits/rejected": 0.3900325298309326, "logps/chosen": -7.348735809326172, "logps/rejected": -8.616621971130371, "loss": 0.4356, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -7.348735809326172, "rewards/margins": 1.267886757850647, "rewards/rejected": -8.616621971130371, "step": 5360 }, { "epoch": 2.8713831744438867, "grad_norm": 16.680882206070773, "learning_rate": 5.531656465884438e-09, "logits/chosen": 0.2769559621810913, "logits/rejected": 0.33080360293388367, "logps/chosen": -7.2665114402771, "logps/rejected": -8.79356861114502, "loss": 0.3822, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -7.2665114402771, "rewards/margins": 1.5270572900772095, "rewards/rejected": -8.79356861114502, "step": 5365 }, { "epoch": 2.8740592072252884, "grad_norm": 33.512056687353265, "learning_rate": 5.303033159830217e-09, "logits/chosen": 0.4004074037075043, "logits/rejected": 0.4319838583469391, "logps/chosen": -7.44448709487915, "logps/rejected": -8.595792770385742, "loss": 0.4611, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -7.44448709487915, "rewards/margins": 1.1513049602508545, "rewards/rejected": -8.595792770385742, "step": 5370 }, { "epoch": 2.87673524000669, "grad_norm": 20.250493713729256, "learning_rate": 5.079209405917939e-09, "logits/chosen": 0.293439120054245, "logits/rejected": 0.3513754606246948, "logps/chosen": -7.047325134277344, "logps/rejected": -8.779550552368164, "loss": 0.35, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -7.047325134277344, "rewards/margins": 1.7322266101837158, "rewards/rejected": -8.779550552368164, "step": 5375 }, { "epoch": 2.879411272788092, "grad_norm": 20.851854370512623, "learning_rate": 4.860187375686664e-09, "logits/chosen": 0.27034562826156616, "logits/rejected": 0.3860381543636322, "logps/chosen": -7.286364555358887, "logps/rejected": -8.74521255493164, "loss": 0.3612, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -7.286364555358887, "rewards/margins": 1.4588474035263062, "rewards/rejected": -8.74521255493164, "step": 5380 }, { "epoch": 2.882087305569493, "grad_norm": 15.41840469676924, "learning_rate": 4.64596919408905e-09, "logits/chosen": 0.38776499032974243, "logits/rejected": 0.427722692489624, "logps/chosen": -7.104072570800781, "logps/rejected": -8.54166030883789, "loss": 0.3935, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -7.104072570800781, "rewards/margins": 1.437588095664978, "rewards/rejected": -8.54166030883789, "step": 5385 }, { "epoch": 2.884763338350895, "grad_norm": 21.804404625569394, "learning_rate": 4.436556939470814e-09, "logits/chosen": 0.3041074275970459, "logits/rejected": 0.39138466119766235, "logps/chosen": -7.551170349121094, "logps/rejected": -8.749353408813477, "loss": 0.4449, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -7.551170349121094, "rewards/margins": 1.198183298110962, "rewards/rejected": -8.749353408813477, "step": 5390 }, { "epoch": 2.887439371132296, "grad_norm": 20.580354121725325, "learning_rate": 4.23195264355064e-09, "logits/chosen": 0.17509128153324127, "logits/rejected": 0.30415108799934387, "logps/chosen": -7.1259660720825195, "logps/rejected": -8.546727180480957, "loss": 0.4011, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -7.1259660720825195, "rewards/margins": 1.4207613468170166, "rewards/rejected": -8.546727180480957, "step": 5395 }, { "epoch": 2.890115403913698, "grad_norm": 23.294164243711002, "learning_rate": 4.032158291400245e-09, "logits/chosen": 0.28611665964126587, "logits/rejected": 0.4263533651828766, "logps/chosen": -7.054600715637207, "logps/rejected": -8.823493957519531, "loss": 0.3228, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -7.054600715637207, "rewards/margins": 1.7688941955566406, "rewards/rejected": -8.823493957519531, "step": 5400 }, { "epoch": 2.8927914366950995, "grad_norm": 19.299170874793624, "learning_rate": 3.837175821425398e-09, "logits/chosen": 0.29695925116539, "logits/rejected": 0.3492467999458313, "logps/chosen": -7.200922966003418, "logps/rejected": -8.514185905456543, "loss": 0.436, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -7.200922966003418, "rewards/margins": 1.3132617473602295, "rewards/rejected": -8.514185905456543, "step": 5405 }, { "epoch": 2.8954674694765012, "grad_norm": 16.29802896136597, "learning_rate": 3.6470071253467683e-09, "logits/chosen": 0.3573339283466339, "logits/rejected": 0.4042547643184662, "logps/chosen": -7.51315975189209, "logps/rejected": -9.13004207611084, "loss": 0.3902, "rewards/accuracies": 0.8125, "rewards/chosen": -7.51315975189209, "rewards/margins": 1.6168826818466187, "rewards/rejected": -9.13004207611084, "step": 5410 }, { "epoch": 2.8981435022579025, "grad_norm": 17.885590774258812, "learning_rate": 3.461654048181939e-09, "logits/chosen": 0.31921032071113586, "logits/rejected": 0.4436897337436676, "logps/chosen": -7.5214738845825195, "logps/rejected": -8.74337100982666, "loss": 0.4325, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -7.5214738845825195, "rewards/margins": 1.2218959331512451, "rewards/rejected": -8.74337100982666, "step": 5415 }, { "epoch": 2.9008195350393042, "grad_norm": 18.02959237640329, "learning_rate": 3.281118388227255e-09, "logits/chosen": 0.3429179787635803, "logits/rejected": 0.3885026276111603, "logps/chosen": -7.318105220794678, "logps/rejected": -8.550865173339844, "loss": 0.4515, "rewards/accuracies": 0.8125, "rewards/chosen": -7.318105220794678, "rewards/margins": 1.232759714126587, "rewards/rejected": -8.550865173339844, "step": 5420 }, { "epoch": 2.903495567820706, "grad_norm": 18.34654628892494, "learning_rate": 3.1054018970405048e-09, "logits/chosen": 0.3382863700389862, "logits/rejected": 0.37192660570144653, "logps/chosen": -7.285592555999756, "logps/rejected": -8.900105476379395, "loss": 0.3615, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -7.285592555999756, "rewards/margins": 1.6145131587982178, "rewards/rejected": -8.900105476379395, "step": 5425 }, { "epoch": 2.906171600602107, "grad_norm": 22.15263683014796, "learning_rate": 2.9345062794238207e-09, "logits/chosen": 0.32305216789245605, "logits/rejected": 0.43725156784057617, "logps/chosen": -7.2798662185668945, "logps/rejected": -8.818778991699219, "loss": 0.3424, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -7.2798662185668945, "rewards/margins": 1.5389130115509033, "rewards/rejected": -8.818778991699219, "step": 5430 }, { "epoch": 2.908847633383509, "grad_norm": 23.025938828789805, "learning_rate": 2.7684331934072492e-09, "logits/chosen": 0.2248649299144745, "logits/rejected": 0.2711140513420105, "logps/chosen": -7.142079830169678, "logps/rejected": -8.689891815185547, "loss": 0.3605, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -7.142079830169678, "rewards/margins": 1.5478121042251587, "rewards/rejected": -8.689891815185547, "step": 5435 }, { "epoch": 2.9115236661649107, "grad_norm": 20.29832599070399, "learning_rate": 2.6071842502326526e-09, "logits/chosen": 0.2866033613681793, "logits/rejected": 0.35560205578804016, "logps/chosen": -7.302939414978027, "logps/rejected": -8.542236328125, "loss": 0.4082, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -7.302939414978027, "rewards/margins": 1.2392966747283936, "rewards/rejected": -8.542236328125, "step": 5440 }, { "epoch": 2.9141996989463124, "grad_norm": 27.652008034671443, "learning_rate": 2.450761014337888e-09, "logits/chosen": 0.3976050019264221, "logits/rejected": 0.4410485625267029, "logps/chosen": -7.016195774078369, "logps/rejected": -8.602128982543945, "loss": 0.426, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -7.016195774078369, "rewards/margins": 1.5859334468841553, "rewards/rejected": -8.602128982543945, "step": 5445 }, { "epoch": 2.9168757317277136, "grad_norm": 25.659090467281686, "learning_rate": 2.299165003341985e-09, "logits/chosen": 0.4138898253440857, "logits/rejected": 0.4521142840385437, "logps/chosen": -7.281416416168213, "logps/rejected": -8.658967971801758, "loss": 0.4223, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -7.281416416168213, "rewards/margins": 1.3775522708892822, "rewards/rejected": -8.658967971801758, "step": 5450 }, { "epoch": 2.9195517645091154, "grad_norm": 20.685960614166202, "learning_rate": 2.1523976880299945e-09, "logits/chosen": 0.25372791290283203, "logits/rejected": 0.369385302066803, "logps/chosen": -7.375463962554932, "logps/rejected": -8.575654983520508, "loss": 0.4367, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -7.375463962554932, "rewards/margins": 1.2001909017562866, "rewards/rejected": -8.575654983520508, "step": 5455 }, { "epoch": 2.9222277972905166, "grad_norm": 14.672376428450754, "learning_rate": 2.010460492339161e-09, "logits/chosen": 0.27492180466651917, "logits/rejected": 0.3557998538017273, "logps/chosen": -7.125840663909912, "logps/rejected": -8.567343711853027, "loss": 0.3868, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -7.125840663909912, "rewards/margins": 1.4415031671524048, "rewards/rejected": -8.567343711853027, "step": 5460 }, { "epoch": 2.9249038300719183, "grad_norm": 15.320747926318706, "learning_rate": 1.8733547933446614e-09, "logits/chosen": 0.274699866771698, "logits/rejected": 0.39291685819625854, "logps/chosen": -7.3683762550354, "logps/rejected": -8.641883850097656, "loss": 0.4193, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -7.3683762550354, "rewards/margins": 1.273508071899414, "rewards/rejected": -8.641883850097656, "step": 5465 }, { "epoch": 2.92757986285332, "grad_norm": 41.160069139744266, "learning_rate": 1.7410819212467231e-09, "logits/chosen": 0.30718690156936646, "logits/rejected": 0.3702242970466614, "logps/chosen": -7.338247776031494, "logps/rejected": -8.609526634216309, "loss": 0.4331, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -7.338247776031494, "rewards/margins": 1.2712781429290771, "rewards/rejected": -8.609526634216309, "step": 5470 }, { "epoch": 2.9302558956347218, "grad_norm": 21.857520994985457, "learning_rate": 1.613643159357192e-09, "logits/chosen": 0.3399103283882141, "logits/rejected": 0.3038916289806366, "logps/chosen": -7.159180641174316, "logps/rejected": -8.42628288269043, "loss": 0.4275, "rewards/accuracies": 0.78125, "rewards/chosen": -7.159180641174316, "rewards/margins": 1.267102837562561, "rewards/rejected": -8.42628288269043, "step": 5475 }, { "epoch": 2.932931928416123, "grad_norm": 21.21868121904039, "learning_rate": 1.4910397440875967e-09, "logits/chosen": 0.323896586894989, "logits/rejected": 0.385012149810791, "logps/chosen": -7.383314609527588, "logps/rejected": -8.851823806762695, "loss": 0.3794, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -7.383314609527588, "rewards/margins": 1.4685090780258179, "rewards/rejected": -8.851823806762695, "step": 5480 }, { "epoch": 2.9356079611975248, "grad_norm": 19.9898083790725, "learning_rate": 1.3732728649368253e-09, "logits/chosen": 0.3619735836982727, "logits/rejected": 0.4459814131259918, "logps/chosen": -7.1718621253967285, "logps/rejected": -8.443506240844727, "loss": 0.4015, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -7.1718621253967285, "rewards/margins": 1.2716439962387085, "rewards/rejected": -8.443506240844727, "step": 5485 }, { "epoch": 2.938283993978926, "grad_norm": 22.88849198505473, "learning_rate": 1.260343664479524e-09, "logits/chosen": 0.24851293861865997, "logits/rejected": 0.29774126410484314, "logps/chosen": -7.188299655914307, "logps/rejected": -8.597356796264648, "loss": 0.406, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -7.188299655914307, "rewards/margins": 1.4090566635131836, "rewards/rejected": -8.597356796264648, "step": 5490 }, { "epoch": 2.9409600267603278, "grad_norm": 17.053118528635586, "learning_rate": 1.1522532383554384e-09, "logits/chosen": 0.29698696732521057, "logits/rejected": 0.3932635486125946, "logps/chosen": -7.141233921051025, "logps/rejected": -8.782320022583008, "loss": 0.3386, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -7.141233921051025, "rewards/margins": 1.6410869359970093, "rewards/rejected": -8.782320022583008, "step": 5495 }, { "epoch": 2.9436360595417295, "grad_norm": 19.651338748272615, "learning_rate": 1.049002635258256e-09, "logits/chosen": 0.3427634835243225, "logits/rejected": 0.4012731909751892, "logps/chosen": -7.4074907302856445, "logps/rejected": -8.760231971740723, "loss": 0.4021, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -7.4074907302856445, "rewards/margins": 1.3527414798736572, "rewards/rejected": -8.760231971740723, "step": 5500 }, { "epoch": 2.946312092323131, "grad_norm": 28.861964570770848, "learning_rate": 9.505928569258358e-10, "logits/chosen": 0.2894745469093323, "logits/rejected": 0.31022459268569946, "logps/chosen": -7.188068389892578, "logps/rejected": -8.540335655212402, "loss": 0.4135, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -7.188068389892578, "rewards/margins": 1.3522676229476929, "rewards/rejected": -8.540335655212402, "step": 5505 }, { "epoch": 2.9489881251045325, "grad_norm": 21.72983790722157, "learning_rate": 8.57024858130273e-10, "logits/chosen": 0.2915140688419342, "logits/rejected": 0.3793867826461792, "logps/chosen": -7.252325534820557, "logps/rejected": -8.96684455871582, "loss": 0.3345, "rewards/accuracies": 0.875, "rewards/chosen": -7.252325534820557, "rewards/margins": 1.7145202159881592, "rewards/rejected": -8.96684455871582, "step": 5510 }, { "epoch": 2.951664157885934, "grad_norm": 29.771949260507018, "learning_rate": 7.682995466686826e-10, "logits/chosen": 0.252725750207901, "logits/rejected": 0.33126434683799744, "logps/chosen": -7.264952182769775, "logps/rejected": -8.645537376403809, "loss": 0.4117, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -7.264952182769775, "rewards/margins": 1.3805863857269287, "rewards/rejected": -8.645537376403809, "step": 5515 }, { "epoch": 2.9543401906673354, "grad_norm": 27.660196639837128, "learning_rate": 6.844177833543741e-10, "logits/chosen": 0.31331852078437805, "logits/rejected": 0.34796732664108276, "logps/chosen": -7.2345781326293945, "logps/rejected": -8.596330642700195, "loss": 0.3791, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -7.2345781326293945, "rewards/margins": 1.3617534637451172, "rewards/rejected": -8.596330642700195, "step": 5520 }, { "epoch": 2.957016223448737, "grad_norm": 23.042857949994694, "learning_rate": 6.053803820087467e-10, "logits/chosen": 0.3006105422973633, "logits/rejected": 0.40464895963668823, "logps/chosen": -7.379230499267578, "logps/rejected": -8.816555976867676, "loss": 0.404, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -7.379230499267578, "rewards/margins": 1.4373241662979126, "rewards/rejected": -8.816555976867676, "step": 5525 }, { "epoch": 2.959692256230139, "grad_norm": 19.244148609629136, "learning_rate": 5.311881094528514e-10, "logits/chosen": 0.22791461646556854, "logits/rejected": 0.32278043031692505, "logps/chosen": -7.511324882507324, "logps/rejected": -8.766143798828125, "loss": 0.4307, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -7.511324882507324, "rewards/margins": 1.2548186779022217, "rewards/rejected": -8.766143798828125, "step": 5530 }, { "epoch": 2.9623682890115406, "grad_norm": 24.892160291596507, "learning_rate": 4.6184168550050806e-10, "logits/chosen": 0.24193692207336426, "logits/rejected": 0.28786149621009827, "logps/chosen": -7.362066745758057, "logps/rejected": -8.70634937286377, "loss": 0.4217, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -7.362066745758057, "rewards/margins": 1.3442833423614502, "rewards/rejected": -8.70634937286377, "step": 5535 }, { "epoch": 2.965044321792942, "grad_norm": 24.16859919110654, "learning_rate": 3.973417829510328e-10, "logits/chosen": 0.26713812351226807, "logits/rejected": 0.32389068603515625, "logps/chosen": -7.500788688659668, "logps/rejected": -8.870849609375, "loss": 0.4187, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -7.500788688659668, "rewards/margins": 1.370060682296753, "rewards/rejected": -8.870849609375, "step": 5540 }, { "epoch": 2.9677203545743436, "grad_norm": 22.227875539441303, "learning_rate": 3.3768902758274377e-10, "logits/chosen": 0.3273284137248993, "logits/rejected": 0.3817601203918457, "logps/chosen": -7.42513370513916, "logps/rejected": -8.734639167785645, "loss": 0.4099, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -7.42513370513916, "rewards/margins": 1.3095057010650635, "rewards/rejected": -8.734639167785645, "step": 5545 }, { "epoch": 2.970396387355745, "grad_norm": 17.99627660100524, "learning_rate": 2.8288399814691e-10, "logits/chosen": 0.331220805644989, "logits/rejected": 0.3743188977241516, "logps/chosen": -7.079068183898926, "logps/rejected": -8.399408340454102, "loss": 0.388, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -7.079068183898926, "rewards/margins": 1.3203405141830444, "rewards/rejected": -8.399408340454102, "step": 5550 }, { "epoch": 2.9730724201371466, "grad_norm": 24.534847907180502, "learning_rate": 2.3292722636220066e-10, "logits/chosen": 0.27975600957870483, "logits/rejected": 0.37564751505851746, "logps/chosen": -7.207106113433838, "logps/rejected": -8.848150253295898, "loss": 0.3572, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -7.207106113433838, "rewards/margins": 1.64104425907135, "rewards/rejected": -8.848150253295898, "step": 5555 }, { "epoch": 2.9757484529185483, "grad_norm": 25.176730431822243, "learning_rate": 1.8781919690946668e-10, "logits/chosen": 0.33012354373931885, "logits/rejected": 0.3701064884662628, "logps/chosen": -7.320425987243652, "logps/rejected": -8.515074729919434, "loss": 0.4453, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -7.320425987243652, "rewards/margins": 1.1946496963500977, "rewards/rejected": -8.515074729919434, "step": 5560 }, { "epoch": 2.97842448569995, "grad_norm": 31.314284963711998, "learning_rate": 1.4756034742696711e-10, "logits/chosen": 0.30056458711624146, "logits/rejected": 0.38470593094825745, "logps/chosen": -7.4208083152771, "logps/rejected": -8.832112312316895, "loss": 0.3936, "rewards/accuracies": 0.84375, "rewards/chosen": -7.4208083152771, "rewards/margins": 1.411303162574768, "rewards/rejected": -8.832112312316895, "step": 5565 }, { "epoch": 2.9811005184813513, "grad_norm": 16.942458017593896, "learning_rate": 1.12151068506261e-10, "logits/chosen": 0.3220537304878235, "logits/rejected": 0.4005371928215027, "logps/chosen": -7.145326137542725, "logps/rejected": -8.870862007141113, "loss": 0.3542, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -7.145326137542725, "rewards/margins": 1.7255351543426514, "rewards/rejected": -8.870862007141113, "step": 5570 }, { "epoch": 2.983776551262753, "grad_norm": 20.839489835965495, "learning_rate": 8.159170368826629e-11, "logits/chosen": 0.27218252420425415, "logits/rejected": 0.35972651839256287, "logps/chosen": -6.80571985244751, "logps/rejected": -8.25713062286377, "loss": 0.4102, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -6.80571985244751, "rewards/margins": 1.4514100551605225, "rewards/rejected": -8.25713062286377, "step": 5575 }, { "epoch": 2.9864525840441547, "grad_norm": 26.637599808009313, "learning_rate": 5.588254946015114e-11, "logits/chosen": 0.25911322236061096, "logits/rejected": 0.3960227370262146, "logps/chosen": -7.185223579406738, "logps/rejected": -8.654568672180176, "loss": 0.3962, "rewards/accuracies": 0.8125, "rewards/chosen": -7.185223579406738, "rewards/margins": 1.4693444967269897, "rewards/rejected": -8.654568672180176, "step": 5580 }, { "epoch": 2.989128616825556, "grad_norm": 17.149293602865423, "learning_rate": 3.502385525216978e-11, "logits/chosen": 0.2490883320569992, "logits/rejected": 0.34305018186569214, "logps/chosen": -7.224591255187988, "logps/rejected": -8.672290802001953, "loss": 0.3835, "rewards/accuracies": 0.8125, "rewards/chosen": -7.224591255187988, "rewards/margins": 1.4476995468139648, "rewards/rejected": -8.672290802001953, "step": 5585 }, { "epoch": 2.9918046496069577, "grad_norm": 20.59972458952927, "learning_rate": 1.901582343555308e-11, "logits/chosen": 0.3119482696056366, "logits/rejected": 0.38122907280921936, "logps/chosen": -7.362497806549072, "logps/rejected": -8.697230339050293, "loss": 0.424, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -7.362497806549072, "rewards/margins": 1.3347325325012207, "rewards/rejected": -8.697230339050293, "step": 5590 }, { "epoch": 2.9944806823883594, "grad_norm": 27.059897607022943, "learning_rate": 7.858609320232634e-12, "logits/chosen": 0.28058987855911255, "logits/rejected": 0.383955180644989, "logps/chosen": -7.132127285003662, "logps/rejected": -8.563615798950195, "loss": 0.397, "rewards/accuracies": 0.78125, "rewards/chosen": -7.132127285003662, "rewards/margins": 1.4314875602722168, "rewards/rejected": -8.563615798950195, "step": 5595 }, { "epoch": 2.9971567151697607, "grad_norm": 22.374656736366514, "learning_rate": 1.5523211535639624e-12, "logits/chosen": 0.3075283169746399, "logits/rejected": 0.3662322461605072, "logps/chosen": -7.1898980140686035, "logps/rejected": -8.871538162231445, "loss": 0.3855, "rewards/accuracies": 0.8125, "rewards/chosen": -7.1898980140686035, "rewards/margins": 1.6816394329071045, "rewards/rejected": -8.871538162231445, "step": 5600 }, { "epoch": 2.9971567151697607, "eval_logits/chosen": 0.491445928812027, "eval_logits/rejected": 0.5435217618942261, "eval_logps/chosen": -7.457206726074219, "eval_logps/rejected": -8.611745834350586, "eval_loss": 0.5195400714874268, "eval_rewards/accuracies": 0.7284866571426392, "eval_rewards/chosen": -7.457206726074219, "eval_rewards/margins": 1.1545389890670776, "eval_rewards/rejected": -8.611745834350586, "eval_runtime": 40.2382, "eval_samples_per_second": 33.426, "eval_steps_per_second": 8.375, "step": 5600 }, { "epoch": 2.999297541394882, "step": 5604, "total_flos": 0.0, "train_loss": 0.5211759163684967, "train_runtime": 30222.1049, "train_samples_per_second": 5.935, "train_steps_per_second": 0.185 } ], "logging_steps": 5, "max_steps": 5604, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 1000000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }