{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 100, "global_step": 11608, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 2.184279469343464, "learning_rate": 4.3066322136089575e-10, "logits/chosen": -2.9685676097869873, "logits/rejected": -2.926340103149414, "logps/chosen": -44.04426574707031, "logps/rejected": -41.580841064453125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0, "grad_norm": 2.397164451396864, "learning_rate": 4.306632213608958e-09, "logits/chosen": -3.057889223098755, "logits/rejected": -3.028320550918579, "logps/chosen": -50.45764923095703, "logps/rejected": -49.59663391113281, "loss": 0.6931, "rewards/accuracies": 0.3611111044883728, "rewards/chosen": 4.533848914434202e-05, "rewards/margins": 1.5664114471292123e-05, "rewards/rejected": 2.9674369216081686e-05, "step": 10 }, { "epoch": 0.0, "grad_norm": 2.2428396092279437, "learning_rate": 8.613264427217916e-09, "logits/chosen": -3.1213667392730713, "logits/rejected": -3.113072633743286, "logps/chosen": -52.6474494934082, "logps/rejected": -52.98405075073242, "loss": 0.6931, "rewards/accuracies": 0.53125, "rewards/chosen": -2.8674810891970992e-05, "rewards/margins": 0.00011320582416374236, "rewards/rejected": -0.00014188062050379813, "step": 20 }, { "epoch": 0.01, "grad_norm": 2.5739690837982745, "learning_rate": 1.2919896640826872e-08, "logits/chosen": -3.093750476837158, "logits/rejected": -3.0699524879455566, "logps/chosen": -56.7930793762207, "logps/rejected": -58.43015670776367, "loss": 0.6932, "rewards/accuracies": 0.46875, "rewards/chosen": -0.00020466512069106102, "rewards/margins": -1.0724004823714495e-05, "rewards/rejected": -0.00019394111586734653, "step": 30 }, { "epoch": 0.01, "grad_norm": 2.0121230094500575, "learning_rate": 1.722652885443583e-08, "logits/chosen": -3.107394218444824, "logits/rejected": -3.075824499130249, "logps/chosen": -55.259185791015625, "logps/rejected": -50.681114196777344, "loss": 0.693, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 2.6132946004508995e-05, "rewards/margins": 0.0003762342967092991, "rewards/rejected": -0.00035010138526558876, "step": 40 }, { "epoch": 0.01, "grad_norm": 2.387552940684203, "learning_rate": 2.153316106804479e-08, "logits/chosen": -3.1034653186798096, "logits/rejected": -3.0867769718170166, "logps/chosen": -53.10588455200195, "logps/rejected": -51.49999237060547, "loss": 0.6932, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.00013910401321481913, "rewards/margins": -8.180685108527541e-05, "rewards/rejected": -5.7297169405501336e-05, "step": 50 }, { "epoch": 0.01, "grad_norm": 2.7970003076901, "learning_rate": 2.5839793281653743e-08, "logits/chosen": -3.156252384185791, "logits/rejected": -3.1266000270843506, "logps/chosen": -57.58796310424805, "logps/rejected": -54.14855194091797, "loss": 0.6931, "rewards/accuracies": 0.53125, "rewards/chosen": -4.136812640354037e-05, "rewards/margins": 5.838483411935158e-05, "rewards/rejected": -9.975295688491315e-05, "step": 60 }, { "epoch": 0.01, "grad_norm": 2.204322757523193, "learning_rate": 3.01464254952627e-08, "logits/chosen": -3.0535032749176025, "logits/rejected": -3.033651828765869, "logps/chosen": -53.7407112121582, "logps/rejected": -53.21503448486328, "loss": 0.693, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.0001532899623271078, "rewards/margins": 0.0002148848434444517, "rewards/rejected": -6.159489566925913e-05, "step": 70 }, { "epoch": 0.01, "grad_norm": 2.4363897145491564, "learning_rate": 3.445305770887166e-08, "logits/chosen": -3.1622116565704346, "logits/rejected": -3.1288113594055176, "logps/chosen": -59.07722091674805, "logps/rejected": -54.100318908691406, "loss": 0.693, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 6.512943946290761e-05, "rewards/margins": 0.0002318086044397205, "rewards/rejected": -0.00016667917952872813, "step": 80 }, { "epoch": 0.02, "grad_norm": 2.4857228513755465, "learning_rate": 3.8759689922480615e-08, "logits/chosen": -2.996279239654541, "logits/rejected": -2.9815406799316406, "logps/chosen": -53.46660232543945, "logps/rejected": -52.83372116088867, "loss": 0.693, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.0002413403708487749, "rewards/margins": 0.0003210466238670051, "rewards/rejected": -7.97062530182302e-05, "step": 90 }, { "epoch": 0.02, "grad_norm": 2.4854058944857753, "learning_rate": 4.306632213608958e-08, "logits/chosen": -3.1720452308654785, "logits/rejected": -3.109947681427002, "logps/chosen": -55.90839385986328, "logps/rejected": -49.635841369628906, "loss": 0.6929, "rewards/accuracies": 0.59375, "rewards/chosen": 0.0004091753507964313, "rewards/margins": 0.0005016516079194844, "rewards/rejected": -9.247624257113785e-05, "step": 100 }, { "epoch": 0.02, "eval_logits/chosen": -3.165482521057129, "eval_logits/rejected": -3.1598188877105713, "eval_logps/chosen": -58.70554733276367, "eval_logps/rejected": -63.15681457519531, "eval_loss": 0.6931592226028442, "eval_rewards/accuracies": 0.4986059367656708, "eval_rewards/chosen": -1.7028520232997835e-05, "eval_rewards/margins": -2.261956069560256e-05, "eval_rewards/rejected": 5.5910377341206186e-06, "eval_runtime": 356.9348, "eval_samples_per_second": 12.058, "eval_steps_per_second": 1.507, "step": 100 }, { "epoch": 0.02, "grad_norm": 2.5379181619698423, "learning_rate": 4.7372954349698534e-08, "logits/chosen": -3.12424898147583, "logits/rejected": -3.1003119945526123, "logps/chosen": -55.57979202270508, "logps/rejected": -52.30139923095703, "loss": 0.6932, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -0.00025836736313067377, "rewards/margins": -0.0001904887321870774, "rewards/rejected": -6.787859456380829e-05, "step": 110 }, { "epoch": 0.02, "grad_norm": 2.561368552467561, "learning_rate": 5.1679586563307486e-08, "logits/chosen": -3.0679683685302734, "logits/rejected": -3.0525035858154297, "logps/chosen": -53.182281494140625, "logps/rejected": -55.54204177856445, "loss": 0.6932, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.00013645360013470054, "rewards/margins": -7.856530282879248e-05, "rewards/rejected": -5.78882682020776e-05, "step": 120 }, { "epoch": 0.02, "grad_norm": 2.138984879227857, "learning_rate": 5.598621877691645e-08, "logits/chosen": -3.10345458984375, "logits/rejected": -3.089416980743408, "logps/chosen": -55.18548583984375, "logps/rejected": -53.74910354614258, "loss": 0.6931, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -3.565276711015031e-05, "rewards/margins": 0.00010266680328641087, "rewards/rejected": -0.0001383195340167731, "step": 130 }, { "epoch": 0.02, "grad_norm": 2.4347210527199588, "learning_rate": 6.02928509905254e-08, "logits/chosen": -3.1250388622283936, "logits/rejected": -3.106936454772949, "logps/chosen": -54.17211151123047, "logps/rejected": -53.7529296875, "loss": 0.6931, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": 1.1661546523100697e-05, "rewards/margins": -4.082123723492259e-06, "rewards/rejected": 1.5743673429824412e-05, "step": 140 }, { "epoch": 0.03, "grad_norm": 2.213735398044619, "learning_rate": 6.459948320413436e-08, "logits/chosen": -3.0300798416137695, "logits/rejected": -3.0123374462127686, "logps/chosen": -52.598976135253906, "logps/rejected": -52.38323211669922, "loss": 0.6932, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -6.447284249588847e-05, "rewards/margins": -5.51502344023902e-05, "rewards/rejected": -9.322635378339328e-06, "step": 150 }, { "epoch": 0.03, "grad_norm": 2.156655887327036, "learning_rate": 6.890611541774332e-08, "logits/chosen": -3.0911943912506104, "logits/rejected": -3.070504665374756, "logps/chosen": -53.4869499206543, "logps/rejected": -54.68552780151367, "loss": 0.6933, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": -5.7128123444272205e-05, "rewards/margins": -0.00023163272999227047, "rewards/rejected": 0.00017450464656576514, "step": 160 }, { "epoch": 0.03, "grad_norm": 2.3532949837745685, "learning_rate": 7.321274763135228e-08, "logits/chosen": -3.0787293910980225, "logits/rejected": -3.0594067573547363, "logps/chosen": -56.2595100402832, "logps/rejected": -51.335472106933594, "loss": 0.6931, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": 0.00012248748680576682, "rewards/margins": 0.00017826970724854618, "rewards/rejected": -5.5782216804800555e-05, "step": 170 }, { "epoch": 0.03, "grad_norm": 2.6258607223080777, "learning_rate": 7.751937984496123e-08, "logits/chosen": -3.0651602745056152, "logits/rejected": -3.0461204051971436, "logps/chosen": -56.38677215576172, "logps/rejected": -53.772865295410156, "loss": 0.6931, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 8.890104800229892e-05, "rewards/margins": 6.191270949784666e-05, "rewards/rejected": 2.6988331228494644e-05, "step": 180 }, { "epoch": 0.03, "grad_norm": 2.636977530600279, "learning_rate": 8.18260120585702e-08, "logits/chosen": -3.126418352127075, "logits/rejected": -3.0830445289611816, "logps/chosen": -58.16786575317383, "logps/rejected": -52.552574157714844, "loss": 0.6929, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 8.153868111548945e-05, "rewards/margins": 0.00046413601376116276, "rewards/rejected": -0.0003825973253697157, "step": 190 }, { "epoch": 0.03, "grad_norm": 2.580185714051456, "learning_rate": 8.613264427217916e-08, "logits/chosen": -3.0618324279785156, "logits/rejected": -3.04618239402771, "logps/chosen": -54.1072998046875, "logps/rejected": -54.72209548950195, "loss": 0.693, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.7029391276300885e-05, "rewards/margins": 0.0002754017186816782, "rewards/rejected": -0.00030243111541494727, "step": 200 }, { "epoch": 0.03, "eval_logits/chosen": -3.165257692337036, "eval_logits/rejected": -3.159618616104126, "eval_logps/chosen": -58.68031311035156, "eval_logps/rejected": -63.137481689453125, "eval_loss": 0.69312983751297, "eval_rewards/accuracies": 0.5127788186073303, "eval_rewards/chosen": 0.0002353396121179685, "eval_rewards/margins": 3.6308691051090136e-05, "eval_rewards/rejected": 0.00019903088104911149, "eval_runtime": 355.2015, "eval_samples_per_second": 12.117, "eval_steps_per_second": 1.515, "step": 200 }, { "epoch": 0.04, "grad_norm": 2.28669707979838, "learning_rate": 9.043927648578811e-08, "logits/chosen": -3.0168232917785645, "logits/rejected": -3.008084535598755, "logps/chosen": -53.26890182495117, "logps/rejected": -57.292236328125, "loss": 0.6932, "rewards/accuracies": 0.4312500059604645, "rewards/chosen": -0.00018195889424532652, "rewards/margins": -8.943781722337008e-05, "rewards/rejected": -9.252109157387167e-05, "step": 210 }, { "epoch": 0.04, "grad_norm": 2.336438367711948, "learning_rate": 9.474590869939707e-08, "logits/chosen": -3.053389072418213, "logits/rejected": -3.022315502166748, "logps/chosen": -52.211769104003906, "logps/rejected": -51.38096237182617, "loss": 0.6928, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.0002459119423292577, "rewards/margins": 0.0007187900482676923, "rewards/rejected": -0.00096470199059695, "step": 220 }, { "epoch": 0.04, "grad_norm": 2.394447162636319, "learning_rate": 9.905254091300602e-08, "logits/chosen": -3.0536513328552246, "logits/rejected": -3.0352864265441895, "logps/chosen": -48.92305374145508, "logps/rejected": -50.00139617919922, "loss": 0.693, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.000386159576009959, "rewards/margins": 0.00027917162515223026, "rewards/rejected": -0.0006653312011621892, "step": 230 }, { "epoch": 0.04, "grad_norm": 2.25083824617627, "learning_rate": 1.0335917312661497e-07, "logits/chosen": -3.026599168777466, "logits/rejected": -2.9841794967651367, "logps/chosen": -55.9691276550293, "logps/rejected": -52.21491622924805, "loss": 0.6931, "rewards/accuracies": 0.53125, "rewards/chosen": -0.0005459034582599998, "rewards/margins": 9.160184708889574e-05, "rewards/rejected": -0.0006375053199008107, "step": 240 }, { "epoch": 0.04, "grad_norm": 2.323445591258243, "learning_rate": 1.0766580534022394e-07, "logits/chosen": -3.1199052333831787, "logits/rejected": -3.0994296073913574, "logps/chosen": -52.30159378051758, "logps/rejected": -51.17644119262695, "loss": 0.6928, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.00036477210232988, "rewards/margins": 0.000624177569989115, "rewards/rejected": -0.000988949672318995, "step": 250 }, { "epoch": 0.04, "grad_norm": 2.316340782882154, "learning_rate": 1.119724375538329e-07, "logits/chosen": -3.0962424278259277, "logits/rejected": -3.0838680267333984, "logps/chosen": -54.875404357910156, "logps/rejected": -56.73250198364258, "loss": 0.6927, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.00046339546679519117, "rewards/margins": 0.000800677458755672, "rewards/rejected": -0.0012640730710700154, "step": 260 }, { "epoch": 0.05, "grad_norm": 2.212868394326073, "learning_rate": 1.1627906976744186e-07, "logits/chosen": -3.034665584564209, "logits/rejected": -3.0166120529174805, "logps/chosen": -53.17912673950195, "logps/rejected": -54.439247131347656, "loss": 0.6927, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.00068662193370983, "rewards/margins": 0.000886773515958339, "rewards/rejected": -0.0015733955660834908, "step": 270 }, { "epoch": 0.05, "grad_norm": 2.4332688553771162, "learning_rate": 1.205857019810508e-07, "logits/chosen": -3.125800609588623, "logits/rejected": -3.0919315814971924, "logps/chosen": -57.64659881591797, "logps/rejected": -53.57320022583008, "loss": 0.6925, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.00048610559315420687, "rewards/margins": 0.0012957851868122816, "rewards/rejected": -0.0017818908672779799, "step": 280 }, { "epoch": 0.05, "grad_norm": 2.249917169819848, "learning_rate": 1.2489233419465976e-07, "logits/chosen": -3.048657178878784, "logits/rejected": -3.034323215484619, "logps/chosen": -55.451141357421875, "logps/rejected": -54.447296142578125, "loss": 0.6926, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.0007702001603320241, "rewards/margins": 0.0010641098488122225, "rewards/rejected": -0.0018343102419748902, "step": 290 }, { "epoch": 0.05, "grad_norm": 2.369298746667673, "learning_rate": 1.2919896640826872e-07, "logits/chosen": -3.004129409790039, "logits/rejected": -2.9955711364746094, "logps/chosen": -52.908668518066406, "logps/rejected": -54.089874267578125, "loss": 0.6926, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.0007483543595299125, "rewards/margins": 0.0010376429418101907, "rewards/rejected": -0.0017859973013401031, "step": 300 }, { "epoch": 0.05, "eval_logits/chosen": -3.163788318634033, "eval_logits/rejected": -3.158111572265625, "eval_logps/chosen": -58.64423751831055, "eval_logps/rejected": -63.127235412597656, "eval_loss": 0.6930013298988342, "eval_rewards/accuracies": 0.5394981503486633, "eval_rewards/chosen": 0.0005960779963061213, "eval_rewards/margins": 0.00029463876853697, "eval_rewards/rejected": 0.0003014392568729818, "eval_runtime": 356.1408, "eval_samples_per_second": 12.085, "eval_steps_per_second": 1.511, "step": 300 }, { "epoch": 0.05, "grad_norm": 2.4743047933409654, "learning_rate": 1.335055986218777e-07, "logits/chosen": -3.0664687156677246, "logits/rejected": -3.060901641845703, "logps/chosen": -53.61384201049805, "logps/rejected": -53.52678298950195, "loss": 0.6926, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.0011877021752297878, "rewards/margins": 0.0011907459702342749, "rewards/rejected": -0.0023784481454640627, "step": 310 }, { "epoch": 0.06, "grad_norm": 2.4049747365414973, "learning_rate": 1.3781223083548665e-07, "logits/chosen": -3.023968458175659, "logits/rejected": -2.9977526664733887, "logps/chosen": -54.628395080566406, "logps/rejected": -49.533180236816406, "loss": 0.6923, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.0013292916119098663, "rewards/margins": 0.0016255916561931372, "rewards/rejected": -0.0029548832681030035, "step": 320 }, { "epoch": 0.06, "grad_norm": 2.3870544128030664, "learning_rate": 1.421188630490956e-07, "logits/chosen": -3.0835556983947754, "logits/rejected": -3.0599796772003174, "logps/chosen": -55.1311149597168, "logps/rejected": -52.4721794128418, "loss": 0.6923, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.0011483869748190045, "rewards/margins": 0.0016829262021929026, "rewards/rejected": -0.0028313130605965853, "step": 330 }, { "epoch": 0.06, "grad_norm": 2.177069307915396, "learning_rate": 1.4642549526270455e-07, "logits/chosen": -3.0061721801757812, "logits/rejected": -2.9844064712524414, "logps/chosen": -52.63057327270508, "logps/rejected": -52.16088104248047, "loss": 0.6926, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.001195924007333815, "rewards/margins": 0.0012026333715766668, "rewards/rejected": -0.0023985574953258038, "step": 340 }, { "epoch": 0.06, "grad_norm": 2.3234378454912106, "learning_rate": 1.507321274763135e-07, "logits/chosen": -2.978062391281128, "logits/rejected": -2.9385359287261963, "logps/chosen": -56.37324142456055, "logps/rejected": -53.88068389892578, "loss": 0.6923, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.0015299760270863771, "rewards/margins": 0.0017499777022749186, "rewards/rejected": -0.003279953496530652, "step": 350 }, { "epoch": 0.06, "grad_norm": 2.4262238382471386, "learning_rate": 1.5503875968992246e-07, "logits/chosen": -3.1277754306793213, "logits/rejected": -3.1047608852386475, "logps/chosen": -54.798912048339844, "logps/rejected": -50.93855667114258, "loss": 0.6921, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.0022310030180960894, "rewards/margins": 0.00208022678270936, "rewards/rejected": -0.004311230033636093, "step": 360 }, { "epoch": 0.06, "grad_norm": 2.349381595760878, "learning_rate": 1.5934539190353144e-07, "logits/chosen": -3.1027891635894775, "logits/rejected": -3.0734617710113525, "logps/chosen": -52.51411819458008, "logps/rejected": -51.80864715576172, "loss": 0.6915, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.0014876796631142497, "rewards/margins": 0.0032487933058291674, "rewards/rejected": -0.0047364733181893826, "step": 370 }, { "epoch": 0.07, "grad_norm": 2.097053054398486, "learning_rate": 1.636520241171404e-07, "logits/chosen": -3.201812744140625, "logits/rejected": -3.177008867263794, "logps/chosen": -53.772377014160156, "logps/rejected": -52.72692108154297, "loss": 0.6919, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.0027772397734224796, "rewards/margins": 0.002589695155620575, "rewards/rejected": -0.005366935394704342, "step": 380 }, { "epoch": 0.07, "grad_norm": 2.410804418588189, "learning_rate": 1.6795865633074934e-07, "logits/chosen": -3.0987088680267334, "logits/rejected": -3.0732438564300537, "logps/chosen": -56.3135986328125, "logps/rejected": -55.72515106201172, "loss": 0.6917, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.0026503982953727245, "rewards/margins": 0.0030277767218649387, "rewards/rejected": -0.005678174551576376, "step": 390 }, { "epoch": 0.07, "grad_norm": 2.178080396872868, "learning_rate": 1.7226528854435832e-07, "logits/chosen": -3.0680224895477295, "logits/rejected": -3.0521273612976074, "logps/chosen": -52.94443893432617, "logps/rejected": -53.484153747558594, "loss": 0.691, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.0031809869688004255, "rewards/margins": 0.0043879905715584755, "rewards/rejected": -0.00756897684186697, "step": 400 }, { "epoch": 0.07, "eval_logits/chosen": -3.1602914333343506, "eval_logits/rejected": -3.1546647548675537, "eval_logps/chosen": -58.560550689697266, "eval_logps/rejected": -63.11561965942383, "eval_loss": 0.6926479339599609, "eval_rewards/accuracies": 0.5611059665679932, "eval_rewards/chosen": 0.00143293512519449, "eval_rewards/margins": 0.0010153905022889376, "eval_rewards/rejected": 0.00041754471021704376, "eval_runtime": 357.1134, "eval_samples_per_second": 12.052, "eval_steps_per_second": 1.507, "step": 400 }, { "epoch": 0.07, "grad_norm": 2.1621440173846986, "learning_rate": 1.7657192075796725e-07, "logits/chosen": -3.071235179901123, "logits/rejected": -3.0671350955963135, "logps/chosen": -51.14980697631836, "logps/rejected": -56.166534423828125, "loss": 0.6921, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.004188390448689461, "rewards/margins": 0.002166205085813999, "rewards/rejected": -0.00635459553450346, "step": 410 }, { "epoch": 0.07, "grad_norm": 2.497135126567034, "learning_rate": 1.8087855297157623e-07, "logits/chosen": -3.0559000968933105, "logits/rejected": -3.0479495525360107, "logps/chosen": -54.61994552612305, "logps/rejected": -54.487693786621094, "loss": 0.6925, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.004710297100245953, "rewards/margins": 0.0013622719561681151, "rewards/rejected": -0.006072568707168102, "step": 420 }, { "epoch": 0.07, "grad_norm": 2.277715100196342, "learning_rate": 1.8518518518518516e-07, "logits/chosen": -3.083691358566284, "logits/rejected": -3.069835662841797, "logps/chosen": -53.39365768432617, "logps/rejected": -54.8032341003418, "loss": 0.6915, "rewards/accuracies": 0.625, "rewards/chosen": -0.0031691633630543947, "rewards/margins": 0.0033808585721999407, "rewards/rejected": -0.006550021469593048, "step": 430 }, { "epoch": 0.08, "grad_norm": 2.5466826911783222, "learning_rate": 1.8949181739879413e-07, "logits/chosen": -3.128629684448242, "logits/rejected": -3.093276262283325, "logps/chosen": -54.6483268737793, "logps/rejected": -54.21075439453125, "loss": 0.6901, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.0036215814761817455, "rewards/margins": 0.006118671037256718, "rewards/rejected": -0.009740252047777176, "step": 440 }, { "epoch": 0.08, "grad_norm": 2.2918968289848176, "learning_rate": 1.9379844961240311e-07, "logits/chosen": -3.050204038619995, "logits/rejected": -3.0178327560424805, "logps/chosen": -56.4046516418457, "logps/rejected": -55.3991813659668, "loss": 0.6907, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.0031705971341580153, "rewards/margins": 0.004987316206097603, "rewards/rejected": -0.008157914504408836, "step": 450 }, { "epoch": 0.08, "grad_norm": 2.446433174118867, "learning_rate": 1.9810508182601204e-07, "logits/chosen": -3.0185251235961914, "logits/rejected": -2.998788833618164, "logps/chosen": -56.675636291503906, "logps/rejected": -53.9906005859375, "loss": 0.6905, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.004722902551293373, "rewards/margins": 0.005447733215987682, "rewards/rejected": -0.010170635767281055, "step": 460 }, { "epoch": 0.08, "grad_norm": 2.260629002268937, "learning_rate": 2.0241171403962102e-07, "logits/chosen": -3.0403809547424316, "logits/rejected": -3.0088233947753906, "logps/chosen": -53.7476692199707, "logps/rejected": -52.34978103637695, "loss": 0.6906, "rewards/accuracies": 0.59375, "rewards/chosen": -0.004685810301452875, "rewards/margins": 0.005127486772835255, "rewards/rejected": -0.009813296608626842, "step": 470 }, { "epoch": 0.08, "grad_norm": 2.4519536559144295, "learning_rate": 2.0671834625322995e-07, "logits/chosen": -3.0377235412597656, "logits/rejected": -3.032456398010254, "logps/chosen": -54.986839294433594, "logps/rejected": -60.05756378173828, "loss": 0.6914, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.007612565997987986, "rewards/margins": 0.003608607454225421, "rewards/rejected": -0.011221175082027912, "step": 480 }, { "epoch": 0.08, "grad_norm": 2.4663820792794677, "learning_rate": 2.1102497846683892e-07, "logits/chosen": -2.9463233947753906, "logits/rejected": -2.8967654705047607, "logps/chosen": -61.15422439575195, "logps/rejected": -52.75993728637695, "loss": 0.6889, "rewards/accuracies": 0.65625, "rewards/chosen": -0.005517776124179363, "rewards/margins": 0.008782900869846344, "rewards/rejected": -0.014300678856670856, "step": 490 }, { "epoch": 0.09, "grad_norm": 2.330775006199537, "learning_rate": 2.1533161068044788e-07, "logits/chosen": -3.0073325634002686, "logits/rejected": -2.978564500808716, "logps/chosen": -55.71733474731445, "logps/rejected": -52.86774826049805, "loss": 0.6907, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.007723568938672543, "rewards/margins": 0.005096676293760538, "rewards/rejected": -0.012820245698094368, "step": 500 }, { "epoch": 0.09, "eval_logits/chosen": -3.1515376567840576, "eval_logits/rejected": -3.1458804607391357, "eval_logps/chosen": -58.49737548828125, "eval_logps/rejected": -63.162086486816406, "eval_loss": 0.6921212077140808, "eval_rewards/accuracies": 0.5755111575126648, "eval_rewards/chosen": 0.0020647228229790926, "eval_rewards/margins": 0.0021117778960615396, "eval_rewards/rejected": -4.7055131290107965e-05, "eval_runtime": 357.3148, "eval_samples_per_second": 12.045, "eval_steps_per_second": 1.506, "step": 500 }, { "epoch": 0.09, "grad_norm": 2.29181189171612, "learning_rate": 2.1963824289405683e-07, "logits/chosen": -2.9982523918151855, "logits/rejected": -2.977153778076172, "logps/chosen": -59.12430953979492, "logps/rejected": -53.17649459838867, "loss": 0.6916, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.008605951443314552, "rewards/margins": 0.0032728458754718304, "rewards/rejected": -0.01187879778444767, "step": 510 }, { "epoch": 0.09, "grad_norm": 2.083836403157807, "learning_rate": 2.239448751076658e-07, "logits/chosen": -3.045750141143799, "logits/rejected": -3.0202910900115967, "logps/chosen": -57.38309860229492, "logps/rejected": -53.21696090698242, "loss": 0.6902, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.009720906615257263, "rewards/margins": 0.006151780020445585, "rewards/rejected": -0.015872687101364136, "step": 520 }, { "epoch": 0.09, "grad_norm": 2.0972964611747837, "learning_rate": 2.2825150732127476e-07, "logits/chosen": -3.0414490699768066, "logits/rejected": -3.0001041889190674, "logps/chosen": -56.354469299316406, "logps/rejected": -53.02845001220703, "loss": 0.6874, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.006593731231987476, "rewards/margins": 0.01189809013158083, "rewards/rejected": -0.018491821363568306, "step": 530 }, { "epoch": 0.09, "grad_norm": 2.264333945297249, "learning_rate": 2.3255813953488372e-07, "logits/chosen": -3.0273146629333496, "logits/rejected": -3.0108981132507324, "logps/chosen": -54.00878143310547, "logps/rejected": -54.52630615234375, "loss": 0.6909, "rewards/accuracies": 0.5625, "rewards/chosen": -0.011568492278456688, "rewards/margins": 0.004814336076378822, "rewards/rejected": -0.01638282835483551, "step": 540 }, { "epoch": 0.09, "grad_norm": 2.2643011063724696, "learning_rate": 2.3686477174849267e-07, "logits/chosen": -3.086310863494873, "logits/rejected": -3.0684800148010254, "logps/chosen": -54.64630126953125, "logps/rejected": -53.90349197387695, "loss": 0.689, "rewards/accuracies": 0.65625, "rewards/chosen": -0.010920585133135319, "rewards/margins": 0.008641783148050308, "rewards/rejected": -0.0195623692125082, "step": 550 }, { "epoch": 0.1, "grad_norm": 2.6473142726505916, "learning_rate": 2.411714039621016e-07, "logits/chosen": -3.0569510459899902, "logits/rejected": -3.0491485595703125, "logps/chosen": -53.267906188964844, "logps/rejected": -57.03251266479492, "loss": 0.6887, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.011830061674118042, "rewards/margins": 0.009134244173765182, "rewards/rejected": -0.020964305847883224, "step": 560 }, { "epoch": 0.1, "grad_norm": 2.2526602237955093, "learning_rate": 2.454780361757106e-07, "logits/chosen": -3.0276739597320557, "logits/rejected": -3.019141435623169, "logps/chosen": -52.507972717285156, "logps/rejected": -55.79932403564453, "loss": 0.6892, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.011621413752436638, "rewards/margins": 0.008228513412177563, "rewards/rejected": -0.019849926233291626, "step": 570 }, { "epoch": 0.1, "grad_norm": 1.8450642814148608, "learning_rate": 2.4978466838931953e-07, "logits/chosen": -3.0327651500701904, "logits/rejected": -3.0267834663391113, "logps/chosen": -52.5694580078125, "logps/rejected": -55.16640090942383, "loss": 0.691, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.014748876914381981, "rewards/margins": 0.004590337164700031, "rewards/rejected": -0.019339213147759438, "step": 580 }, { "epoch": 0.1, "grad_norm": 2.0917320499094676, "learning_rate": 2.540913006029285e-07, "logits/chosen": -3.029935359954834, "logits/rejected": -3.0082030296325684, "logps/chosen": -56.06707763671875, "logps/rejected": -56.61497116088867, "loss": 0.6903, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.012053562328219414, "rewards/margins": 0.006142253987491131, "rewards/rejected": -0.01819581724703312, "step": 590 }, { "epoch": 0.1, "grad_norm": 2.3704490382647596, "learning_rate": 2.5839793281653743e-07, "logits/chosen": -3.001335382461548, "logits/rejected": -2.9773948192596436, "logps/chosen": -54.8140869140625, "logps/rejected": -59.22527313232422, "loss": 0.6852, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.009530474431812763, "rewards/margins": 0.01643664576113224, "rewards/rejected": -0.025967121124267578, "step": 600 }, { "epoch": 0.1, "eval_logits/chosen": -3.1388049125671387, "eval_logits/rejected": -3.1331238746643066, "eval_logps/chosen": -58.60028839111328, "eval_logps/rejected": -63.40556716918945, "eval_loss": 0.6914582848548889, "eval_rewards/accuracies": 0.5822490453720093, "eval_rewards/chosen": 0.0010355679551139474, "eval_rewards/margins": 0.0035174190998077393, "eval_rewards/rejected": -0.0024818514939397573, "eval_runtime": 357.2236, "eval_samples_per_second": 12.048, "eval_steps_per_second": 1.506, "step": 600 }, { "epoch": 0.11, "grad_norm": 2.356076010409613, "learning_rate": 2.627045650301464e-07, "logits/chosen": -2.964993715286255, "logits/rejected": -2.9622960090637207, "logps/chosen": -54.120277404785156, "logps/rejected": -55.163963317871094, "loss": 0.6917, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.015915414318442345, "rewards/margins": 0.0033319643698632717, "rewards/rejected": -0.019247379153966904, "step": 610 }, { "epoch": 0.11, "grad_norm": 2.474319380074331, "learning_rate": 2.670111972437554e-07, "logits/chosen": -3.120192289352417, "logits/rejected": -3.0936880111694336, "logps/chosen": -56.2474479675293, "logps/rejected": -56.004188537597656, "loss": 0.6857, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.012355372309684753, "rewards/margins": 0.015266923233866692, "rewards/rejected": -0.027622297406196594, "step": 620 }, { "epoch": 0.11, "grad_norm": 2.5148230940892784, "learning_rate": 2.713178294573643e-07, "logits/chosen": -3.101238250732422, "logits/rejected": -3.0744516849517822, "logps/chosen": -55.61579513549805, "logps/rejected": -53.38257598876953, "loss": 0.6895, "rewards/accuracies": 0.625, "rewards/chosen": -0.01770562306046486, "rewards/margins": 0.007805233355611563, "rewards/rejected": -0.02551085688173771, "step": 630 }, { "epoch": 0.11, "grad_norm": 2.4893970821776743, "learning_rate": 2.756244616709733e-07, "logits/chosen": -3.067873239517212, "logits/rejected": -3.056884527206421, "logps/chosen": -54.79213333129883, "logps/rejected": -56.961280822753906, "loss": 0.6896, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.018890097737312317, "rewards/margins": 0.007660907693207264, "rewards/rejected": -0.026551008224487305, "step": 640 }, { "epoch": 0.11, "grad_norm": 2.8078860081688024, "learning_rate": 2.799310938845822e-07, "logits/chosen": -3.078247308731079, "logits/rejected": -3.0796327590942383, "logps/chosen": -53.70562744140625, "logps/rejected": -57.23451614379883, "loss": 0.6925, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.02245044708251953, "rewards/margins": 0.0018996421713382006, "rewards/rejected": -0.024350086227059364, "step": 650 }, { "epoch": 0.11, "grad_norm": 2.4337428834992307, "learning_rate": 2.842377260981912e-07, "logits/chosen": -2.967741012573242, "logits/rejected": -2.9617857933044434, "logps/chosen": -56.363494873046875, "logps/rejected": -54.866737365722656, "loss": 0.689, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.017431819811463356, "rewards/margins": 0.008792152628302574, "rewards/rejected": -0.02622397243976593, "step": 660 }, { "epoch": 0.12, "grad_norm": 2.339727211875298, "learning_rate": 2.885443583118002e-07, "logits/chosen": -2.9894890785217285, "logits/rejected": -2.984511137008667, "logps/chosen": -54.77692794799805, "logps/rejected": -60.03133010864258, "loss": 0.6897, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.01770101860165596, "rewards/margins": 0.007546558044850826, "rewards/rejected": -0.02524757757782936, "step": 670 }, { "epoch": 0.12, "grad_norm": 2.5872483817238128, "learning_rate": 2.928509905254091e-07, "logits/chosen": -2.9396934509277344, "logits/rejected": -2.914903163909912, "logps/chosen": -55.78126907348633, "logps/rejected": -54.39397430419922, "loss": 0.685, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.018597902730107307, "rewards/margins": 0.016813453286886215, "rewards/rejected": -0.035411350429058075, "step": 680 }, { "epoch": 0.12, "grad_norm": 2.7292903036637064, "learning_rate": 2.971576227390181e-07, "logits/chosen": -3.085705518722534, "logits/rejected": -3.056213617324829, "logps/chosen": -60.99599075317383, "logps/rejected": -53.571380615234375, "loss": 0.6877, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.019351843744516373, "rewards/margins": 0.011478090658783913, "rewards/rejected": -0.030829936265945435, "step": 690 }, { "epoch": 0.12, "grad_norm": 2.389276468868878, "learning_rate": 3.01464254952627e-07, "logits/chosen": -3.0419516563415527, "logits/rejected": -3.0135397911071777, "logps/chosen": -57.74323272705078, "logps/rejected": -56.58147430419922, "loss": 0.6854, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.02020222134888172, "rewards/margins": 0.016081832349300385, "rewards/rejected": -0.03628405183553696, "step": 700 }, { "epoch": 0.12, "eval_logits/chosen": -3.1206588745117188, "eval_logits/rejected": -3.114993095397949, "eval_logps/chosen": -58.9453010559082, "eval_logps/rejected": -63.95465087890625, "eval_loss": 0.6904971599578857, "eval_rewards/accuracies": 0.5894516706466675, "eval_rewards/chosen": -0.0024145517963916063, "eval_rewards/margins": 0.005558097269386053, "eval_rewards/rejected": -0.007972650229930878, "eval_runtime": 357.4068, "eval_samples_per_second": 12.042, "eval_steps_per_second": 1.505, "step": 700 }, { "epoch": 0.12, "grad_norm": 2.586197787105945, "learning_rate": 3.05770887166236e-07, "logits/chosen": -3.0198841094970703, "logits/rejected": -2.9914581775665283, "logps/chosen": -56.7116584777832, "logps/rejected": -58.61717987060547, "loss": 0.6843, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.020994270220398903, "rewards/margins": 0.018473895266652107, "rewards/rejected": -0.03946816921234131, "step": 710 }, { "epoch": 0.12, "grad_norm": 2.702057793681809, "learning_rate": 3.100775193798449e-07, "logits/chosen": -2.9804978370666504, "logits/rejected": -2.9764490127563477, "logps/chosen": -55.76905059814453, "logps/rejected": -58.02165603637695, "loss": 0.6876, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.024686764925718307, "rewards/margins": 0.011716444976627827, "rewards/rejected": -0.03640320897102356, "step": 720 }, { "epoch": 0.13, "grad_norm": 2.6466880554114547, "learning_rate": 3.143841515934539e-07, "logits/chosen": -3.0941200256347656, "logits/rejected": -3.069159746170044, "logps/chosen": -59.358177185058594, "logps/rejected": -57.388641357421875, "loss": 0.6844, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.03032388910651207, "rewards/margins": 0.01852940209209919, "rewards/rejected": -0.04885329678654671, "step": 730 }, { "epoch": 0.13, "grad_norm": 2.3238465611991073, "learning_rate": 3.186907838070629e-07, "logits/chosen": -2.9740123748779297, "logits/rejected": -2.9482741355895996, "logps/chosen": -57.704978942871094, "logps/rejected": -58.32112503051758, "loss": 0.6846, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.0279629435390234, "rewards/margins": 0.01816297508776188, "rewards/rejected": -0.04612591490149498, "step": 740 }, { "epoch": 0.13, "grad_norm": 2.60752160244168, "learning_rate": 3.229974160206718e-07, "logits/chosen": -3.142927646636963, "logits/rejected": -3.115029811859131, "logps/chosen": -58.75443649291992, "logps/rejected": -59.41529083251953, "loss": 0.6801, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.027507567778229713, "rewards/margins": 0.026881281286478043, "rewards/rejected": -0.05438884347677231, "step": 750 }, { "epoch": 0.13, "grad_norm": 3.113387241673442, "learning_rate": 3.273040482342808e-07, "logits/chosen": -2.9908509254455566, "logits/rejected": -2.9526984691619873, "logps/chosen": -57.714439392089844, "logps/rejected": -54.94502639770508, "loss": 0.683, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.03286290913820267, "rewards/margins": 0.02157803811132908, "rewards/rejected": -0.0544409453868866, "step": 760 }, { "epoch": 0.13, "grad_norm": 2.4453618798570953, "learning_rate": 3.3161068044788976e-07, "logits/chosen": -3.038949489593506, "logits/rejected": -3.015986680984497, "logps/chosen": -56.661033630371094, "logps/rejected": -57.70270538330078, "loss": 0.685, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.037630461156368256, "rewards/margins": 0.017523907124996185, "rewards/rejected": -0.05515437200665474, "step": 770 }, { "epoch": 0.13, "grad_norm": 2.7995118829195893, "learning_rate": 3.359173126614987e-07, "logits/chosen": -3.0295486450195312, "logits/rejected": -2.9988982677459717, "logps/chosen": -57.22774124145508, "logps/rejected": -57.87713623046875, "loss": 0.6804, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.04078695923089981, "rewards/margins": 0.026560068130493164, "rewards/rejected": -0.06734703481197357, "step": 780 }, { "epoch": 0.14, "grad_norm": 2.59814741855216, "learning_rate": 3.402239448751076e-07, "logits/chosen": -3.0351967811584473, "logits/rejected": -3.001878499984741, "logps/chosen": -57.94755172729492, "logps/rejected": -59.64392852783203, "loss": 0.6835, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.04035332426428795, "rewards/margins": 0.02070373296737671, "rewards/rejected": -0.06105704978108406, "step": 790 }, { "epoch": 0.14, "grad_norm": 2.979271826495132, "learning_rate": 3.4453057708871665e-07, "logits/chosen": -2.9210307598114014, "logits/rejected": -2.8951001167297363, "logps/chosen": -59.6242790222168, "logps/rejected": -61.24828338623047, "loss": 0.6829, "rewards/accuracies": 0.625, "rewards/chosen": -0.04646407067775726, "rewards/margins": 0.021958164870738983, "rewards/rejected": -0.06842224299907684, "step": 800 }, { "epoch": 0.14, "eval_logits/chosen": -3.094379186630249, "eval_logits/rejected": -3.0886640548706055, "eval_logps/chosen": -60.67955017089844, "eval_logps/rejected": -66.09896087646484, "eval_loss": 0.6886637806892395, "eval_rewards/accuracies": 0.5734200477600098, "eval_rewards/chosen": -0.019756997004151344, "eval_rewards/margins": 0.009658826515078545, "eval_rewards/rejected": -0.029415827244520187, "eval_runtime": 356.2401, "eval_samples_per_second": 12.082, "eval_steps_per_second": 1.51, "step": 800 }, { "epoch": 0.14, "grad_norm": 2.4246944915810724, "learning_rate": 3.4883720930232557e-07, "logits/chosen": -2.9902844429016113, "logits/rejected": -2.9628844261169434, "logps/chosen": -61.16777801513672, "logps/rejected": -62.579185485839844, "loss": 0.6814, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.045093193650245667, "rewards/margins": 0.025155682116746902, "rewards/rejected": -0.07024887204170227, "step": 810 }, { "epoch": 0.14, "grad_norm": 2.572085405803537, "learning_rate": 3.531438415159345e-07, "logits/chosen": -3.0369210243225098, "logits/rejected": -3.0116257667541504, "logps/chosen": -56.83648681640625, "logps/rejected": -58.9111328125, "loss": 0.6793, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.05299956351518631, "rewards/margins": 0.028944198042154312, "rewards/rejected": -0.08194376528263092, "step": 820 }, { "epoch": 0.14, "grad_norm": 2.8751549171801125, "learning_rate": 3.574504737295435e-07, "logits/chosen": -2.96337628364563, "logits/rejected": -2.948317050933838, "logps/chosen": -61.24248504638672, "logps/rejected": -63.26741409301758, "loss": 0.6811, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.06662433594465256, "rewards/margins": 0.02630782127380371, "rewards/rejected": -0.09293216466903687, "step": 830 }, { "epoch": 0.14, "grad_norm": 2.78034944593138, "learning_rate": 3.6175710594315246e-07, "logits/chosen": -3.060987949371338, "logits/rejected": -3.034540891647339, "logps/chosen": -60.72172164916992, "logps/rejected": -59.53022003173828, "loss": 0.6774, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.06628672778606415, "rewards/margins": 0.033673472702503204, "rewards/rejected": -0.09996020048856735, "step": 840 }, { "epoch": 0.15, "grad_norm": 2.6269588611544923, "learning_rate": 3.660637381567614e-07, "logits/chosen": -2.93640398979187, "logits/rejected": -2.9257328510284424, "logps/chosen": -58.38752365112305, "logps/rejected": -64.44731140136719, "loss": 0.6844, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.07559017837047577, "rewards/margins": 0.01969515159726143, "rewards/rejected": -0.0952853411436081, "step": 850 }, { "epoch": 0.15, "grad_norm": 2.5848797454702646, "learning_rate": 3.703703703703703e-07, "logits/chosen": -2.9593284130096436, "logits/rejected": -2.9393258094787598, "logps/chosen": -60.31854248046875, "logps/rejected": -63.26483154296875, "loss": 0.6769, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.07502218335866928, "rewards/margins": 0.03479871153831482, "rewards/rejected": -0.1098209023475647, "step": 860 }, { "epoch": 0.15, "grad_norm": 2.5705323083064373, "learning_rate": 3.7467700258397934e-07, "logits/chosen": -3.0270254611968994, "logits/rejected": -3.0232787132263184, "logps/chosen": -58.79963302612305, "logps/rejected": -64.54035949707031, "loss": 0.6772, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.07506883889436722, "rewards/margins": 0.033983923494815826, "rewards/rejected": -0.10905275493860245, "step": 870 }, { "epoch": 0.15, "grad_norm": 2.689358158624382, "learning_rate": 3.7898363479758827e-07, "logits/chosen": -2.9357428550720215, "logits/rejected": -2.9134020805358887, "logps/chosen": -59.15460205078125, "logps/rejected": -62.313621520996094, "loss": 0.6765, "rewards/accuracies": 0.65625, "rewards/chosen": -0.07416196167469025, "rewards/margins": 0.035411037504673004, "rewards/rejected": -0.10957300662994385, "step": 880 }, { "epoch": 0.15, "grad_norm": 2.6438277738000293, "learning_rate": 3.832902670111972e-07, "logits/chosen": -2.9593563079833984, "logits/rejected": -2.9229862689971924, "logps/chosen": -64.84220123291016, "logps/rejected": -63.8851203918457, "loss": 0.6778, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.06395251303911209, "rewards/margins": 0.03320237249135971, "rewards/rejected": -0.0971548929810524, "step": 890 }, { "epoch": 0.16, "grad_norm": 2.832318924397106, "learning_rate": 3.8759689922480623e-07, "logits/chosen": -2.977108955383301, "logits/rejected": -2.9669320583343506, "logps/chosen": -62.267547607421875, "logps/rejected": -63.87932586669922, "loss": 0.6773, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.08133289217948914, "rewards/margins": 0.033999234437942505, "rewards/rejected": -0.11533211171627045, "step": 900 }, { "epoch": 0.16, "eval_logits/chosen": -3.057013750076294, "eval_logits/rejected": -3.051332473754883, "eval_logps/chosen": -63.69248962402344, "eval_logps/rejected": -69.66419982910156, "eval_loss": 0.6862910985946655, "eval_rewards/accuracies": 0.5929368138313293, "eval_rewards/chosen": -0.049886368215084076, "eval_rewards/margins": 0.01518191583454609, "eval_rewards/rejected": -0.06506828218698502, "eval_runtime": 356.6845, "eval_samples_per_second": 12.067, "eval_steps_per_second": 1.508, "step": 900 }, { "epoch": 0.16, "grad_norm": 2.791646492165207, "learning_rate": 3.9190353143841515e-07, "logits/chosen": -2.9454283714294434, "logits/rejected": -2.9370040893554688, "logps/chosen": -60.4368896484375, "logps/rejected": -65.48957061767578, "loss": 0.676, "rewards/accuracies": 0.65625, "rewards/chosen": -0.09426609426736832, "rewards/margins": 0.04028692469000816, "rewards/rejected": -0.13455303013324738, "step": 910 }, { "epoch": 0.16, "grad_norm": 3.011151931674002, "learning_rate": 3.962101636520241e-07, "logits/chosen": -2.9765427112579346, "logits/rejected": -2.934263229370117, "logps/chosen": -64.45548248291016, "logps/rejected": -62.556602478027344, "loss": 0.6796, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.10165347158908844, "rewards/margins": 0.029758507385849953, "rewards/rejected": -0.13141196966171265, "step": 920 }, { "epoch": 0.16, "grad_norm": 3.2118502689123245, "learning_rate": 4.0051679586563306e-07, "logits/chosen": -3.031832456588745, "logits/rejected": -3.0159494876861572, "logps/chosen": -62.47021484375, "logps/rejected": -68.52490234375, "loss": 0.6746, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.10014158487319946, "rewards/margins": 0.04036015272140503, "rewards/rejected": -0.14050175249576569, "step": 930 }, { "epoch": 0.16, "grad_norm": 2.9839009058497106, "learning_rate": 4.0482342807924204e-07, "logits/chosen": -3.00956392288208, "logits/rejected": -2.973090410232544, "logps/chosen": -71.03943634033203, "logps/rejected": -69.27840423583984, "loss": 0.6769, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.10842480510473251, "rewards/margins": 0.03579176589846611, "rewards/rejected": -0.14421656727790833, "step": 940 }, { "epoch": 0.16, "grad_norm": 2.966178612417894, "learning_rate": 4.0913006029285096e-07, "logits/chosen": -2.802929639816284, "logits/rejected": -2.785623073577881, "logps/chosen": -67.41767883300781, "logps/rejected": -71.18104553222656, "loss": 0.6779, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.1217707023024559, "rewards/margins": 0.033875979483127594, "rewards/rejected": -0.1556466817855835, "step": 950 }, { "epoch": 0.17, "grad_norm": 3.310978864654213, "learning_rate": 4.134366925064599e-07, "logits/chosen": -2.7671194076538086, "logits/rejected": -2.770169734954834, "logps/chosen": -63.908164978027344, "logps/rejected": -69.78746032714844, "loss": 0.6894, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.1316661685705185, "rewards/margins": 0.010264839045703411, "rewards/rejected": -0.14193101227283478, "step": 960 }, { "epoch": 0.17, "grad_norm": 3.407472849470465, "learning_rate": 4.177433247200689e-07, "logits/chosen": -2.93009877204895, "logits/rejected": -2.8950753211975098, "logps/chosen": -72.93366241455078, "logps/rejected": -67.29154968261719, "loss": 0.6821, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.1230693906545639, "rewards/margins": 0.025729473680257797, "rewards/rejected": -0.148798868060112, "step": 970 }, { "epoch": 0.17, "grad_norm": 3.6748252700178887, "learning_rate": 4.2204995693367785e-07, "logits/chosen": -3.024207353591919, "logits/rejected": -3.0045340061187744, "logps/chosen": -67.85172271728516, "logps/rejected": -72.01612854003906, "loss": 0.6733, "rewards/accuracies": 0.625, "rewards/chosen": -0.11790040880441666, "rewards/margins": 0.05413081496953964, "rewards/rejected": -0.1720312237739563, "step": 980 }, { "epoch": 0.17, "grad_norm": 3.726003522783883, "learning_rate": 4.263565891472868e-07, "logits/chosen": -2.889413833618164, "logits/rejected": -2.8664448261260986, "logps/chosen": -66.66841125488281, "logps/rejected": -68.96918487548828, "loss": 0.6738, "rewards/accuracies": 0.625, "rewards/chosen": -0.12137794494628906, "rewards/margins": 0.04220535233616829, "rewards/rejected": -0.16358330845832825, "step": 990 }, { "epoch": 0.17, "grad_norm": 3.397153414453954, "learning_rate": 4.3066322136089576e-07, "logits/chosen": -2.836636543273926, "logits/rejected": -2.8090505599975586, "logps/chosen": -71.08350372314453, "logps/rejected": -68.55287170410156, "loss": 0.6818, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.14227214455604553, "rewards/margins": 0.02618454024195671, "rewards/rejected": -0.16845668852329254, "step": 1000 }, { "epoch": 0.17, "eval_logits/chosen": -3.0087461471557617, "eval_logits/rejected": -3.003145694732666, "eval_logps/chosen": -67.30126190185547, "eval_logps/rejected": -73.87139892578125, "eval_loss": 0.6837059855461121, "eval_rewards/accuracies": 0.5971189737319946, "eval_rewards/chosen": -0.08597413450479507, "eval_rewards/margins": 0.02116604894399643, "eval_rewards/rejected": -0.1071401834487915, "eval_runtime": 356.1874, "eval_samples_per_second": 12.084, "eval_steps_per_second": 1.51, "step": 1000 }, { "epoch": 0.17, "grad_norm": 3.4839564072636575, "learning_rate": 4.3496985357450473e-07, "logits/chosen": -2.771763324737549, "logits/rejected": -2.781416416168213, "logps/chosen": -67.20372009277344, "logps/rejected": -78.4428939819336, "loss": 0.6666, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.13454464077949524, "rewards/margins": 0.07122045755386353, "rewards/rejected": -0.20576509833335876, "step": 1010 }, { "epoch": 0.18, "grad_norm": 3.456440043201763, "learning_rate": 4.3927648578811366e-07, "logits/chosen": -2.9379830360412598, "logits/rejected": -2.9037129878997803, "logps/chosen": -70.02944946289062, "logps/rejected": -73.0363540649414, "loss": 0.6657, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.1292036920785904, "rewards/margins": 0.05881650000810623, "rewards/rejected": -0.18802018463611603, "step": 1020 }, { "epoch": 0.18, "grad_norm": 3.573048388794818, "learning_rate": 4.4358311800172264e-07, "logits/chosen": -2.9771835803985596, "logits/rejected": -2.9505438804626465, "logps/chosen": -67.1936264038086, "logps/rejected": -70.21983337402344, "loss": 0.6709, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.1349172741174698, "rewards/margins": 0.048518020659685135, "rewards/rejected": -0.18343529105186462, "step": 1030 }, { "epoch": 0.18, "grad_norm": 4.0876459743090034, "learning_rate": 4.478897502153316e-07, "logits/chosen": -2.893394947052002, "logits/rejected": -2.8834469318389893, "logps/chosen": -68.99354553222656, "logps/rejected": -74.55607604980469, "loss": 0.6785, "rewards/accuracies": 0.625, "rewards/chosen": -0.1559665948152542, "rewards/margins": 0.03430451080203056, "rewards/rejected": -0.19027109444141388, "step": 1040 }, { "epoch": 0.18, "grad_norm": 3.9493370777335084, "learning_rate": 4.5219638242894055e-07, "logits/chosen": -2.8083250522613525, "logits/rejected": -2.773916721343994, "logps/chosen": -73.37397766113281, "logps/rejected": -71.31639862060547, "loss": 0.6732, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.16260358691215515, "rewards/margins": 0.04476263374090195, "rewards/rejected": -0.2073661983013153, "step": 1050 }, { "epoch": 0.18, "grad_norm": 3.7318845219809025, "learning_rate": 4.565030146425495e-07, "logits/chosen": -2.8137454986572266, "logits/rejected": -2.7951653003692627, "logps/chosen": -73.75227355957031, "logps/rejected": -76.26736450195312, "loss": 0.6754, "rewards/accuracies": 0.65625, "rewards/chosen": -0.1725139617919922, "rewards/margins": 0.04043154790997505, "rewards/rejected": -0.21294550597667694, "step": 1060 }, { "epoch": 0.18, "grad_norm": 3.921970666271597, "learning_rate": 4.6080964685615845e-07, "logits/chosen": -2.9450836181640625, "logits/rejected": -2.912147045135498, "logps/chosen": -74.61820220947266, "logps/rejected": -76.5606689453125, "loss": 0.6729, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.18662679195404053, "rewards/margins": 0.04664309695363045, "rewards/rejected": -0.23326988518238068, "step": 1070 }, { "epoch": 0.19, "grad_norm": 3.8287702054233956, "learning_rate": 4.6511627906976743e-07, "logits/chosen": -2.9213509559631348, "logits/rejected": -2.8993239402770996, "logps/chosen": -76.99789428710938, "logps/rejected": -75.40824127197266, "loss": 0.6781, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.21221812069416046, "rewards/margins": 0.03467785567045212, "rewards/rejected": -0.2468959540128708, "step": 1080 }, { "epoch": 0.19, "grad_norm": 3.9047242283272396, "learning_rate": 4.6942291128337636e-07, "logits/chosen": -2.797940731048584, "logits/rejected": -2.7896194458007812, "logps/chosen": -71.64787292480469, "logps/rejected": -77.1730728149414, "loss": 0.6704, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.19434688985347748, "rewards/margins": 0.051201529800891876, "rewards/rejected": -0.24554841220378876, "step": 1090 }, { "epoch": 0.19, "grad_norm": 3.742135088088785, "learning_rate": 4.7372954349698534e-07, "logits/chosen": -2.8556008338928223, "logits/rejected": -2.8531699180603027, "logps/chosen": -69.83229064941406, "logps/rejected": -78.55072784423828, "loss": 0.6715, "rewards/accuracies": 0.65625, "rewards/chosen": -0.18923941254615784, "rewards/margins": 0.050502412021160126, "rewards/rejected": -0.23974183201789856, "step": 1100 }, { "epoch": 0.19, "eval_logits/chosen": -2.946117401123047, "eval_logits/rejected": -2.94049072265625, "eval_logps/chosen": -71.7703857421875, "eval_logps/rejected": -79.22164916992188, "eval_loss": 0.679982602596283, "eval_rewards/accuracies": 0.6057156324386597, "eval_rewards/chosen": -0.13066548109054565, "eval_rewards/margins": 0.029977135360240936, "eval_rewards/rejected": -0.1606426239013672, "eval_runtime": 356.7556, "eval_samples_per_second": 12.064, "eval_steps_per_second": 1.508, "step": 1100 }, { "epoch": 0.19, "grad_norm": 4.753611130936478, "learning_rate": 4.780361757105943e-07, "logits/chosen": -2.840238571166992, "logits/rejected": -2.8512330055236816, "logps/chosen": -71.57658386230469, "logps/rejected": -81.72315979003906, "loss": 0.6636, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.18691562116146088, "rewards/margins": 0.0658998116850853, "rewards/rejected": -0.2528154253959656, "step": 1110 }, { "epoch": 0.19, "grad_norm": 4.200014824991902, "learning_rate": 4.823428079242032e-07, "logits/chosen": -2.8601462841033936, "logits/rejected": -2.83921480178833, "logps/chosen": -75.09476470947266, "logps/rejected": -77.8160400390625, "loss": 0.6691, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.1874438375234604, "rewards/margins": 0.05362669751048088, "rewards/rejected": -0.24107055366039276, "step": 1120 }, { "epoch": 0.19, "grad_norm": 4.069383490654212, "learning_rate": 4.866494401378123e-07, "logits/chosen": -2.903735399246216, "logits/rejected": -2.8804819583892822, "logps/chosen": -72.0593490600586, "logps/rejected": -81.51217651367188, "loss": 0.6601, "rewards/accuracies": 0.6875, "rewards/chosen": -0.20027342438697815, "rewards/margins": 0.07303432375192642, "rewards/rejected": -0.273307740688324, "step": 1130 }, { "epoch": 0.2, "grad_norm": 4.350410897251021, "learning_rate": 4.909560723514212e-07, "logits/chosen": -2.808011293411255, "logits/rejected": -2.7773683071136475, "logps/chosen": -77.37016296386719, "logps/rejected": -78.98043060302734, "loss": 0.6698, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.2049834281206131, "rewards/margins": 0.055016178637742996, "rewards/rejected": -0.2599996030330658, "step": 1140 }, { "epoch": 0.2, "grad_norm": 4.680052319466735, "learning_rate": 4.952627045650301e-07, "logits/chosen": -2.7504663467407227, "logits/rejected": -2.7321338653564453, "logps/chosen": -74.69068908691406, "logps/rejected": -83.3788833618164, "loss": 0.6601, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.21349653601646423, "rewards/margins": 0.0752733126282692, "rewards/rejected": -0.28876984119415283, "step": 1150 }, { "epoch": 0.2, "grad_norm": 4.383585223712436, "learning_rate": 4.995693367786391e-07, "logits/chosen": -2.8323190212249756, "logits/rejected": -2.823431968688965, "logps/chosen": -74.20658874511719, "logps/rejected": -82.88806915283203, "loss": 0.6568, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.20177213847637177, "rewards/margins": 0.08152016252279282, "rewards/rejected": -0.283292293548584, "step": 1160 }, { "epoch": 0.2, "grad_norm": 4.7924512014269425, "learning_rate": 4.999990843883228e-07, "logits/chosen": -2.706714630126953, "logits/rejected": -2.694248914718628, "logps/chosen": -75.1012191772461, "logps/rejected": -86.26618194580078, "loss": 0.6645, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.2305610179901123, "rewards/margins": 0.08545393496751785, "rewards/rejected": -0.31601497530937195, "step": 1170 }, { "epoch": 0.2, "grad_norm": 4.531230658306969, "learning_rate": 4.999959193195308e-07, "logits/chosen": -2.665681838989258, "logits/rejected": -2.6355409622192383, "logps/chosen": -78.78900146484375, "logps/rejected": -83.79048156738281, "loss": 0.6643, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.2599770426750183, "rewards/margins": 0.06831606477499008, "rewards/rejected": -0.3282931447029114, "step": 1180 }, { "epoch": 0.21, "grad_norm": 4.6260170747775105, "learning_rate": 4.999904935183911e-07, "logits/chosen": -2.841900587081909, "logits/rejected": -2.8088276386260986, "logps/chosen": -83.71769714355469, "logps/rejected": -83.9743423461914, "loss": 0.6567, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.25023940205574036, "rewards/margins": 0.08566378057003021, "rewards/rejected": -0.33590319752693176, "step": 1190 }, { "epoch": 0.21, "grad_norm": 4.214091282801035, "learning_rate": 4.999828070339698e-07, "logits/chosen": -2.669875144958496, "logits/rejected": -2.655302047729492, "logps/chosen": -79.1661605834961, "logps/rejected": -83.97240447998047, "loss": 0.6651, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.2603180706501007, "rewards/margins": 0.0669156163930893, "rewards/rejected": -0.3272337019443512, "step": 1200 }, { "epoch": 0.21, "eval_logits/chosen": -2.85164475440979, "eval_logits/rejected": -2.84596586227417, "eval_logps/chosen": -78.02967834472656, "eval_logps/rejected": -86.59571075439453, "eval_loss": 0.6755677461624146, "eval_rewards/accuracies": 0.5996747016906738, "eval_rewards/chosen": -0.1932583451271057, "eval_rewards/margins": 0.04112492874264717, "eval_rewards/rejected": -0.23438328504562378, "eval_runtime": 357.4222, "eval_samples_per_second": 12.042, "eval_steps_per_second": 1.505, "step": 1200 }, { "epoch": 0.21, "grad_norm": 5.729626186243229, "learning_rate": 4.999728599357762e-07, "logits/chosen": -2.7580645084381104, "logits/rejected": -2.725999355316162, "logps/chosen": -82.05293273925781, "logps/rejected": -90.14490509033203, "loss": 0.6559, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.2827844023704529, "rewards/margins": 0.08879880607128143, "rewards/rejected": -0.37158316373825073, "step": 1210 }, { "epoch": 0.21, "grad_norm": 4.843172675927166, "learning_rate": 4.999606523137628e-07, "logits/chosen": -2.7558417320251465, "logits/rejected": -2.730149745941162, "logps/chosen": -82.96326446533203, "logps/rejected": -92.13468933105469, "loss": 0.6487, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.2960751950740814, "rewards/margins": 0.10249904543161392, "rewards/rejected": -0.39857420325279236, "step": 1220 }, { "epoch": 0.21, "grad_norm": 5.963904626730247, "learning_rate": 4.99946184278324e-07, "logits/chosen": -2.815377950668335, "logits/rejected": -2.7770168781280518, "logps/chosen": -87.67040252685547, "logps/rejected": -92.19978332519531, "loss": 0.6591, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.31936585903167725, "rewards/margins": 0.08062832802534103, "rewards/rejected": -0.3999941945075989, "step": 1230 }, { "epoch": 0.21, "grad_norm": 5.432890340977972, "learning_rate": 4.999294559602954e-07, "logits/chosen": -2.681164264678955, "logits/rejected": -2.666093349456787, "logps/chosen": -85.52889251708984, "logps/rejected": -93.53931427001953, "loss": 0.6638, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.3310098946094513, "rewards/margins": 0.07634725421667099, "rewards/rejected": -0.4073571562767029, "step": 1240 }, { "epoch": 0.22, "grad_norm": 5.250007678464064, "learning_rate": 4.999104675109525e-07, "logits/chosen": -2.787619113922119, "logits/rejected": -2.7531590461730957, "logps/chosen": -84.2263412475586, "logps/rejected": -89.0356216430664, "loss": 0.6633, "rewards/accuracies": 0.5625, "rewards/chosen": -0.3051467835903168, "rewards/margins": 0.07372823357582092, "rewards/rejected": -0.3788750171661377, "step": 1250 }, { "epoch": 0.22, "grad_norm": 5.437906354386665, "learning_rate": 4.998892191020092e-07, "logits/chosen": -2.6413798332214355, "logits/rejected": -2.6131081581115723, "logps/chosen": -83.180419921875, "logps/rejected": -89.31295776367188, "loss": 0.6567, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.30223608016967773, "rewards/margins": 0.08826258033514023, "rewards/rejected": -0.39049869775772095, "step": 1260 }, { "epoch": 0.22, "grad_norm": 7.461728534142161, "learning_rate": 4.998657109256166e-07, "logits/chosen": -2.6996243000030518, "logits/rejected": -2.6940901279449463, "logps/chosen": -85.5318374633789, "logps/rejected": -94.53944396972656, "loss": 0.6692, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.33671867847442627, "rewards/margins": 0.06170588731765747, "rewards/rejected": -0.39842456579208374, "step": 1270 }, { "epoch": 0.22, "grad_norm": 5.558513126380609, "learning_rate": 4.998399431943609e-07, "logits/chosen": -2.768416166305542, "logits/rejected": -2.7740864753723145, "logps/chosen": -79.00863647460938, "logps/rejected": -98.24131774902344, "loss": 0.6481, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.2783285081386566, "rewards/margins": 0.10511146485805511, "rewards/rejected": -0.38343995809555054, "step": 1280 }, { "epoch": 0.22, "grad_norm": 5.865934178946898, "learning_rate": 4.998119161412618e-07, "logits/chosen": -2.6547913551330566, "logits/rejected": -2.62509822845459, "logps/chosen": -86.21808624267578, "logps/rejected": -90.81287384033203, "loss": 0.6581, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.3163338005542755, "rewards/margins": 0.08770157396793365, "rewards/rejected": -0.40403538942337036, "step": 1290 }, { "epoch": 0.22, "grad_norm": 6.062029915276689, "learning_rate": 4.997816300197699e-07, "logits/chosen": -2.7270829677581787, "logits/rejected": -2.714017391204834, "logps/chosen": -87.53861999511719, "logps/rejected": -99.14437866210938, "loss": 0.663, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.37334591150283813, "rewards/margins": 0.0809113010764122, "rewards/rejected": -0.4542572498321533, "step": 1300 }, { "epoch": 0.22, "eval_logits/chosen": -2.7855560779571533, "eval_logits/rejected": -2.7796361446380615, "eval_logps/chosen": -86.98535919189453, "eval_logps/rejected": -97.24429321289062, "eval_loss": 0.6691488027572632, "eval_rewards/accuracies": 0.6171003580093384, "eval_rewards/chosen": -0.2828150987625122, "eval_rewards/margins": 0.05805408954620361, "eval_rewards/rejected": -0.3408692181110382, "eval_runtime": 357.0192, "eval_samples_per_second": 12.055, "eval_steps_per_second": 1.507, "step": 1300 }, { "epoch": 0.23, "grad_norm": 7.2374921490781166, "learning_rate": 4.997490851037651e-07, "logits/chosen": -2.7199060916900635, "logits/rejected": -2.685650110244751, "logps/chosen": -90.32498931884766, "logps/rejected": -97.50151824951172, "loss": 0.6474, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.3698830008506775, "rewards/margins": 0.11114762723445892, "rewards/rejected": -0.4810306429862976, "step": 1310 }, { "epoch": 0.23, "grad_norm": 7.329198026466427, "learning_rate": 4.997142816875534e-07, "logits/chosen": -2.6866555213928223, "logits/rejected": -2.6637563705444336, "logps/chosen": -93.89530944824219, "logps/rejected": -97.65375518798828, "loss": 0.6605, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.38488394021987915, "rewards/margins": 0.0846201553940773, "rewards/rejected": -0.469504177570343, "step": 1320 }, { "epoch": 0.23, "grad_norm": 6.989316967263134, "learning_rate": 4.996772200858648e-07, "logits/chosen": -2.759702682495117, "logits/rejected": -2.731628894805908, "logps/chosen": -94.14637756347656, "logps/rejected": -99.45263671875, "loss": 0.6529, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.3895763158798218, "rewards/margins": 0.10352253913879395, "rewards/rejected": -0.4930989146232605, "step": 1330 }, { "epoch": 0.23, "grad_norm": 5.339352998534476, "learning_rate": 4.996379006338504e-07, "logits/chosen": -2.6027302742004395, "logits/rejected": -2.582573890686035, "logps/chosen": -86.88957977294922, "logps/rejected": -96.32715606689453, "loss": 0.6431, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.3176007866859436, "rewards/margins": 0.11718092858791351, "rewards/rejected": -0.4347817003726959, "step": 1340 }, { "epoch": 0.23, "grad_norm": 6.251815390342403, "learning_rate": 4.99596323687079e-07, "logits/chosen": -2.6558520793914795, "logits/rejected": -2.632688522338867, "logps/chosen": -94.7468490600586, "logps/rejected": -102.03514099121094, "loss": 0.666, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.40661248564720154, "rewards/margins": 0.08162738382816315, "rewards/rejected": -0.4882398247718811, "step": 1350 }, { "epoch": 0.23, "grad_norm": 6.6807835207225725, "learning_rate": 4.995524896215339e-07, "logits/chosen": -2.606091022491455, "logits/rejected": -2.593371868133545, "logps/chosen": -95.50038146972656, "logps/rejected": -104.9891586303711, "loss": 0.6603, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.40270328521728516, "rewards/margins": 0.08724579960107803, "rewards/rejected": -0.48994913697242737, "step": 1360 }, { "epoch": 0.24, "grad_norm": 7.719117110945399, "learning_rate": 4.995063988336101e-07, "logits/chosen": -2.6957902908325195, "logits/rejected": -2.67728328704834, "logps/chosen": -93.07683563232422, "logps/rejected": -106.47142028808594, "loss": 0.6415, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4041716456413269, "rewards/margins": 0.13314835727214813, "rewards/rejected": -0.5373200178146362, "step": 1370 }, { "epoch": 0.24, "grad_norm": 7.523503188478382, "learning_rate": 4.994580517401102e-07, "logits/chosen": -2.5843305587768555, "logits/rejected": -2.56766414642334, "logps/chosen": -97.11516571044922, "logps/rejected": -107.70014953613281, "loss": 0.6448, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.42785486578941345, "rewards/margins": 0.12621551752090454, "rewards/rejected": -0.5540703535079956, "step": 1380 }, { "epoch": 0.24, "grad_norm": 7.994616924999172, "learning_rate": 4.994074487782406e-07, "logits/chosen": -2.7036585807800293, "logits/rejected": -2.6787309646606445, "logps/chosen": -103.4170913696289, "logps/rejected": -113.5262222290039, "loss": 0.6457, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.46558231115341187, "rewards/margins": 0.1351221799850464, "rewards/rejected": -0.6007044315338135, "step": 1390 }, { "epoch": 0.24, "grad_norm": 7.076195209744298, "learning_rate": 4.993545904056078e-07, "logits/chosen": -2.5222525596618652, "logits/rejected": -2.4996466636657715, "logps/chosen": -100.6828384399414, "logps/rejected": -113.02195739746094, "loss": 0.6329, "rewards/accuracies": 0.65625, "rewards/chosen": -0.448734849691391, "rewards/margins": 0.15365351736545563, "rewards/rejected": -0.6023883819580078, "step": 1400 }, { "epoch": 0.24, "eval_logits/chosen": -2.6804823875427246, "eval_logits/rejected": -2.674381732940674, "eval_logps/chosen": -96.39348602294922, "eval_logps/rejected": -108.98139190673828, "eval_loss": 0.6609914302825928, "eval_rewards/accuracies": 0.61849445104599, "eval_rewards/chosen": -0.37689635157585144, "eval_rewards/margins": 0.08134372532367706, "eval_rewards/rejected": -0.4582400619983673, "eval_runtime": 356.0921, "eval_samples_per_second": 12.087, "eval_steps_per_second": 1.511, "step": 1400 }, { "epoch": 0.24, "grad_norm": 8.108597626289049, "learning_rate": 4.992994771002141e-07, "logits/chosen": -2.5680909156799316, "logits/rejected": -2.5558865070343018, "logps/chosen": -102.64105224609375, "logps/rejected": -119.11705017089844, "loss": 0.6367, "rewards/accuracies": 0.6875, "rewards/chosen": -0.48273658752441406, "rewards/margins": 0.17660747468471527, "rewards/rejected": -0.6593440175056458, "step": 1410 }, { "epoch": 0.24, "grad_norm": 7.858931662859935, "learning_rate": 4.992421093604534e-07, "logits/chosen": -2.4751877784729004, "logits/rejected": -2.4777843952178955, "logps/chosen": -101.93944549560547, "logps/rejected": -124.42276763916016, "loss": 0.6348, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.512130856513977, "rewards/margins": 0.1852533519268036, "rewards/rejected": -0.697384238243103, "step": 1420 }, { "epoch": 0.25, "grad_norm": 9.45227562873637, "learning_rate": 4.991824877051067e-07, "logits/chosen": -2.561638832092285, "logits/rejected": -2.54856276512146, "logps/chosen": -108.27215576171875, "logps/rejected": -134.86288452148438, "loss": 0.601, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.5470194816589355, "rewards/margins": 0.2372448891401291, "rewards/rejected": -0.784264326095581, "step": 1430 }, { "epoch": 0.25, "grad_norm": 10.22819119060325, "learning_rate": 4.991206126733369e-07, "logits/chosen": -2.448366403579712, "logits/rejected": -2.420719623565674, "logps/chosen": -108.7235107421875, "logps/rejected": -120.07295989990234, "loss": 0.6473, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.5824471712112427, "rewards/margins": 0.13498175144195557, "rewards/rejected": -0.7174289226531982, "step": 1440 }, { "epoch": 0.25, "grad_norm": 12.725503598146807, "learning_rate": 4.990564848246851e-07, "logits/chosen": -2.4409327507019043, "logits/rejected": -2.408658504486084, "logps/chosen": -114.9705810546875, "logps/rejected": -128.18980407714844, "loss": 0.6348, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.5951242446899414, "rewards/margins": 0.16781339049339294, "rewards/rejected": -0.7629376649856567, "step": 1450 }, { "epoch": 0.25, "grad_norm": 9.436937177439919, "learning_rate": 4.98990104739064e-07, "logits/chosen": -2.4494917392730713, "logits/rejected": -2.4222424030303955, "logps/chosen": -109.37040710449219, "logps/rejected": -126.34139251708984, "loss": 0.6382, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.5768964290618896, "rewards/margins": 0.18195541203022003, "rewards/rejected": -0.7588518261909485, "step": 1460 }, { "epoch": 0.25, "grad_norm": 8.955583438989118, "learning_rate": 4.989214730167541e-07, "logits/chosen": -2.622709035873413, "logits/rejected": -2.5908420085906982, "logps/chosen": -110.5583267211914, "logps/rejected": -124.64057922363281, "loss": 0.6329, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.535399854183197, "rewards/margins": 0.16410811245441437, "rewards/rejected": -0.6995079517364502, "step": 1470 }, { "epoch": 0.25, "grad_norm": 9.49330055426053, "learning_rate": 4.988505902783971e-07, "logits/chosen": -2.590567111968994, "logits/rejected": -2.556976795196533, "logps/chosen": -105.81478118896484, "logps/rejected": -119.9277114868164, "loss": 0.6432, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.5450218915939331, "rewards/margins": 0.13719643652439117, "rewards/rejected": -0.6822183728218079, "step": 1480 }, { "epoch": 0.26, "grad_norm": 9.98525040448124, "learning_rate": 4.987774571649912e-07, "logits/chosen": -2.4983878135681152, "logits/rejected": -2.4753427505493164, "logps/chosen": -115.1557846069336, "logps/rejected": -128.9449005126953, "loss": 0.6331, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.5778855085372925, "rewards/margins": 0.17497901618480682, "rewards/rejected": -0.7528645992279053, "step": 1490 }, { "epoch": 0.26, "grad_norm": 10.26303189770449, "learning_rate": 4.987020743378848e-07, "logits/chosen": -2.385967969894409, "logits/rejected": -2.383463144302368, "logps/chosen": -110.9022445678711, "logps/rejected": -130.71217346191406, "loss": 0.6356, "rewards/accuracies": 0.65625, "rewards/chosen": -0.609332263469696, "rewards/margins": 0.1715681552886963, "rewards/rejected": -0.7809004187583923, "step": 1500 }, { "epoch": 0.26, "eval_logits/chosen": -2.517664909362793, "eval_logits/rejected": -2.5109217166900635, "eval_logps/chosen": -107.28179168701172, "eval_logps/rejected": -122.3668212890625, "eval_loss": 0.6536844968795776, "eval_rewards/accuracies": 0.6380111575126648, "eval_rewards/chosen": -0.48577937483787537, "eval_rewards/margins": 0.10631493479013443, "eval_rewards/rejected": -0.5920943021774292, "eval_runtime": 356.9471, "eval_samples_per_second": 12.058, "eval_steps_per_second": 1.507, "step": 1500 }, { "epoch": 0.26, "grad_norm": 9.106501586861713, "learning_rate": 4.986244424787706e-07, "logits/chosen": -2.322202205657959, "logits/rejected": -2.2912230491638184, "logps/chosen": -118.87747955322266, "logps/rejected": -133.88900756835938, "loss": 0.6146, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6029187440872192, "rewards/margins": 0.2079295665025711, "rewards/rejected": -0.8108483552932739, "step": 1510 }, { "epoch": 0.26, "grad_norm": 10.760728694678805, "learning_rate": 4.985445622896794e-07, "logits/chosen": -2.387296676635742, "logits/rejected": -2.379225015640259, "logps/chosen": -118.9466781616211, "logps/rejected": -134.18948364257812, "loss": 0.6424, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.6671319007873535, "rewards/margins": 0.16024193167686462, "rewards/rejected": -0.8273738026618958, "step": 1520 }, { "epoch": 0.26, "grad_norm": 15.020346958333217, "learning_rate": 4.98462434492974e-07, "logits/chosen": -2.2380728721618652, "logits/rejected": -2.2234339714050293, "logps/chosen": -127.2374038696289, "logps/rejected": -143.69729614257812, "loss": 0.6451, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.7556307315826416, "rewards/margins": 0.1741071343421936, "rewards/rejected": -0.9297378659248352, "step": 1530 }, { "epoch": 0.27, "grad_norm": 10.57370425758323, "learning_rate": 4.983780598313423e-07, "logits/chosen": -2.3825461864471436, "logits/rejected": -2.3513596057891846, "logps/chosen": -120.11723327636719, "logps/rejected": -140.9315185546875, "loss": 0.6034, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.6420670747756958, "rewards/margins": 0.2574729919433594, "rewards/rejected": -0.8995401263237, "step": 1540 }, { "epoch": 0.27, "grad_norm": 11.337001973825672, "learning_rate": 4.982914390677909e-07, "logits/chosen": -2.2892661094665527, "logits/rejected": -2.2704126834869385, "logps/chosen": -114.27877044677734, "logps/rejected": -134.2589111328125, "loss": 0.6133, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.6090589165687561, "rewards/margins": 0.2282298356294632, "rewards/rejected": -0.8372887372970581, "step": 1550 }, { "epoch": 0.27, "grad_norm": 13.598814466918178, "learning_rate": 4.982025729856381e-07, "logits/chosen": -2.273789882659912, "logits/rejected": -2.252927780151367, "logps/chosen": -123.75514221191406, "logps/rejected": -144.1699676513672, "loss": 0.6334, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7292818427085876, "rewards/margins": 0.20613010227680206, "rewards/rejected": -0.935411810874939, "step": 1560 }, { "epoch": 0.27, "grad_norm": 13.054000500346573, "learning_rate": 4.981114623885066e-07, "logits/chosen": -2.305429697036743, "logits/rejected": -2.304576873779297, "logps/chosen": -123.65141296386719, "logps/rejected": -149.49166870117188, "loss": 0.6306, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7176939845085144, "rewards/margins": 0.21510076522827148, "rewards/rejected": -0.9327947497367859, "step": 1570 }, { "epoch": 0.27, "grad_norm": 11.391373748499909, "learning_rate": 4.980181081003167e-07, "logits/chosen": -2.2610230445861816, "logits/rejected": -2.248826265335083, "logps/chosen": -120.3755874633789, "logps/rejected": -140.72938537597656, "loss": 0.6399, "rewards/accuracies": 0.625, "rewards/chosen": -0.6746999621391296, "rewards/margins": 0.19383227825164795, "rewards/rejected": -0.8685322999954224, "step": 1580 }, { "epoch": 0.27, "grad_norm": 11.558255813156592, "learning_rate": 4.979225109652783e-07, "logits/chosen": -2.317185878753662, "logits/rejected": -2.3010520935058594, "logps/chosen": -120.50982666015625, "logps/rejected": -136.13148498535156, "loss": 0.6499, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.6638567447662354, "rewards/margins": 0.15114296972751617, "rewards/rejected": -0.8149996995925903, "step": 1590 }, { "epoch": 0.28, "grad_norm": 10.010530526041645, "learning_rate": 4.978246718478835e-07, "logits/chosen": -2.298884630203247, "logits/rejected": -2.2639718055725098, "logps/chosen": -114.62701416015625, "logps/rejected": -132.1314239501953, "loss": 0.6275, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.6248496770858765, "rewards/margins": 0.18698535859584808, "rewards/rejected": -0.8118351101875305, "step": 1600 }, { "epoch": 0.28, "eval_logits/chosen": -2.417057991027832, "eval_logits/rejected": -2.4086451530456543, "eval_logps/chosen": -116.99667358398438, "eval_logps/rejected": -135.21180725097656, "eval_loss": 0.6452447175979614, "eval_rewards/accuracies": 0.6363847851753235, "eval_rewards/chosen": -0.5829283595085144, "eval_rewards/margins": 0.13761593401432037, "eval_rewards/rejected": -0.7205442786216736, "eval_runtime": 357.5084, "eval_samples_per_second": 12.039, "eval_steps_per_second": 1.505, "step": 1600 }, { "epoch": 0.28, "grad_norm": 10.584365804460724, "learning_rate": 4.977245916328994e-07, "logits/chosen": -2.3447985649108887, "logits/rejected": -2.3194570541381836, "logps/chosen": -130.62841796875, "logps/rejected": -153.30650329589844, "loss": 0.6354, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.7475208044052124, "rewards/margins": 0.20843036472797394, "rewards/rejected": -0.9559510946273804, "step": 1610 }, { "epoch": 0.28, "grad_norm": 12.935555369642161, "learning_rate": 4.976222712253587e-07, "logits/chosen": -2.2747273445129395, "logits/rejected": -2.251038074493408, "logps/chosen": -124.53253173828125, "logps/rejected": -160.12515258789062, "loss": 0.6036, "rewards/accuracies": 0.65625, "rewards/chosen": -0.7311369180679321, "rewards/margins": 0.3294447958469391, "rewards/rejected": -1.0605818033218384, "step": 1620 }, { "epoch": 0.28, "grad_norm": 12.670402097997451, "learning_rate": 4.97517711550553e-07, "logits/chosen": -2.334963083267212, "logits/rejected": -2.3116507530212402, "logps/chosen": -132.87025451660156, "logps/rejected": -149.6600799560547, "loss": 0.6289, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.7737977504730225, "rewards/margins": 0.2057873010635376, "rewards/rejected": -0.9795848727226257, "step": 1630 }, { "epoch": 0.28, "grad_norm": 12.932831879229832, "learning_rate": 4.974109135540232e-07, "logits/chosen": -2.379924774169922, "logits/rejected": -2.3459315299987793, "logps/chosen": -137.20947265625, "logps/rejected": -144.1864013671875, "loss": 0.6681, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.803007960319519, "rewards/margins": 0.1256239265203476, "rewards/rejected": -0.9286319017410278, "step": 1640 }, { "epoch": 0.28, "grad_norm": 10.93104733261659, "learning_rate": 4.97301878201552e-07, "logits/chosen": -2.3800089359283447, "logits/rejected": -2.353868246078491, "logps/chosen": -118.6199951171875, "logps/rejected": -142.3135223388672, "loss": 0.6032, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.613827109336853, "rewards/margins": 0.2595062851905823, "rewards/rejected": -0.8733335733413696, "step": 1650 }, { "epoch": 0.29, "grad_norm": 8.435075129874804, "learning_rate": 4.971906064791545e-07, "logits/chosen": -2.4072935581207275, "logits/rejected": -2.3678243160247803, "logps/chosen": -116.49778747558594, "logps/rejected": -127.7277603149414, "loss": 0.6444, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.6152034401893616, "rewards/margins": 0.16506488621234894, "rewards/rejected": -0.7802683115005493, "step": 1660 }, { "epoch": 0.29, "grad_norm": 10.473049120892489, "learning_rate": 4.970770993930693e-07, "logits/chosen": -2.3916454315185547, "logits/rejected": -2.366729259490967, "logps/chosen": -112.58506774902344, "logps/rejected": -138.53781127929688, "loss": 0.6101, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.5803548693656921, "rewards/margins": 0.25304341316223145, "rewards/rejected": -0.8333982229232788, "step": 1670 }, { "epoch": 0.29, "grad_norm": 11.92770345279179, "learning_rate": 4.969613579697499e-07, "logits/chosen": -2.329380989074707, "logits/rejected": -2.303520679473877, "logps/chosen": -119.73587799072266, "logps/rejected": -142.68923950195312, "loss": 0.6175, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.6415343284606934, "rewards/margins": 0.24146585166454315, "rewards/rejected": -0.8830000758171082, "step": 1680 }, { "epoch": 0.29, "grad_norm": 10.415977139141571, "learning_rate": 4.968433832558549e-07, "logits/chosen": -2.2939274311065674, "logits/rejected": -2.2756576538085938, "logps/chosen": -115.64154052734375, "logps/rejected": -131.75743103027344, "loss": 0.637, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.601462185382843, "rewards/margins": 0.19359458982944489, "rewards/rejected": -0.7950568199157715, "step": 1690 }, { "epoch": 0.29, "grad_norm": 12.213920640091061, "learning_rate": 4.967231763182385e-07, "logits/chosen": -2.169027805328369, "logits/rejected": -2.16825795173645, "logps/chosen": -112.47358703613281, "logps/rejected": -138.60568237304688, "loss": 0.6315, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.6292012333869934, "rewards/margins": 0.2038397490978241, "rewards/rejected": -0.8330410122871399, "step": 1700 }, { "epoch": 0.29, "eval_logits/chosen": -2.3369693756103516, "eval_logits/rejected": -2.3275210857391357, "eval_logps/chosen": -117.66336822509766, "eval_logps/rejected": -136.6091766357422, "eval_loss": 0.6433987021446228, "eval_rewards/accuracies": 0.633596658706665, "eval_rewards/chosen": -0.5895951986312866, "eval_rewards/margins": 0.14492255449295044, "eval_rewards/rejected": -0.7345177531242371, "eval_runtime": 357.4789, "eval_samples_per_second": 12.04, "eval_steps_per_second": 1.505, "step": 1700 }, { "epoch": 0.29, "grad_norm": 17.554636232799652, "learning_rate": 4.966007382439414e-07, "logits/chosen": -2.2377054691314697, "logits/rejected": -2.196046829223633, "logps/chosen": -134.55264282226562, "logps/rejected": -155.48165893554688, "loss": 0.6201, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.761473536491394, "rewards/margins": 0.24851946532726288, "rewards/rejected": -1.0099929571151733, "step": 1710 }, { "epoch": 0.3, "grad_norm": 12.864744504235462, "learning_rate": 4.964760701401807e-07, "logits/chosen": -2.2469406127929688, "logits/rejected": -2.2177302837371826, "logps/chosen": -136.5878143310547, "logps/rejected": -150.06259155273438, "loss": 0.6428, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.801825225353241, "rewards/margins": 0.18852399289608002, "rewards/rejected": -0.9903491735458374, "step": 1720 }, { "epoch": 0.3, "grad_norm": 15.115706747879441, "learning_rate": 4.963491731343395e-07, "logits/chosen": -2.2426817417144775, "logits/rejected": -2.225494146347046, "logps/chosen": -133.5530548095703, "logps/rejected": -154.15370178222656, "loss": 0.629, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.7963297367095947, "rewards/margins": 0.2201063185930252, "rewards/rejected": -1.016435980796814, "step": 1730 }, { "epoch": 0.3, "grad_norm": 10.577013192942609, "learning_rate": 4.962200483739572e-07, "logits/chosen": -2.205991268157959, "logits/rejected": -2.1916627883911133, "logps/chosen": -137.78909301757812, "logps/rejected": -165.40756225585938, "loss": 0.6415, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.856925368309021, "rewards/margins": 0.25689417123794556, "rewards/rejected": -1.1138197183609009, "step": 1740 }, { "epoch": 0.3, "grad_norm": 14.966499781106553, "learning_rate": 4.96088697026719e-07, "logits/chosen": -2.2428221702575684, "logits/rejected": -2.2297050952911377, "logps/chosen": -130.33145141601562, "logps/rejected": -156.0831756591797, "loss": 0.6123, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.7514396905899048, "rewards/margins": 0.2490360289812088, "rewards/rejected": -1.0004757642745972, "step": 1750 }, { "epoch": 0.3, "grad_norm": 14.761249007069992, "learning_rate": 4.959551202804452e-07, "logits/chosen": -2.2175586223602295, "logits/rejected": -2.1803672313690186, "logps/chosen": -129.0546417236328, "logps/rejected": -156.67083740234375, "loss": 0.5915, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.7302212119102478, "rewards/margins": 0.3069346249103546, "rewards/rejected": -1.0371558666229248, "step": 1760 }, { "epoch": 0.3, "grad_norm": 13.94600835463878, "learning_rate": 4.958193193430807e-07, "logits/chosen": -2.2072737216949463, "logits/rejected": -2.1699469089508057, "logps/chosen": -136.83575439453125, "logps/rejected": -160.46617126464844, "loss": 0.5962, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.8186962008476257, "rewards/margins": 0.2965734004974365, "rewards/rejected": -1.115269660949707, "step": 1770 }, { "epoch": 0.31, "grad_norm": 14.814181161859743, "learning_rate": 4.956812954426837e-07, "logits/chosen": -2.0803823471069336, "logits/rejected": -2.0697758197784424, "logps/chosen": -145.62057495117188, "logps/rejected": -193.1177520751953, "loss": 0.5567, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.9111822843551636, "rewards/margins": 0.43550905585289, "rewards/rejected": -1.3466914892196655, "step": 1780 }, { "epoch": 0.31, "grad_norm": 12.92886463604504, "learning_rate": 4.95541049827415e-07, "logits/chosen": -2.077265739440918, "logits/rejected": -2.0515055656433105, "logps/chosen": -154.9197998046875, "logps/rejected": -191.34097290039062, "loss": 0.5844, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.029468297958374, "rewards/margins": 0.35887202620506287, "rewards/rejected": -1.3883404731750488, "step": 1790 }, { "epoch": 0.31, "grad_norm": 14.455065715370587, "learning_rate": 4.953985837655266e-07, "logits/chosen": -2.03164005279541, "logits/rejected": -2.004000186920166, "logps/chosen": -154.93240356445312, "logps/rejected": -186.32785034179688, "loss": 0.6166, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.0314334630966187, "rewards/margins": 0.31214088201522827, "rewards/rejected": -1.3435744047164917, "step": 1800 }, { "epoch": 0.31, "eval_logits/chosen": -2.099355936050415, "eval_logits/rejected": -2.087456226348877, "eval_logps/chosen": -137.8539276123047, "eval_logps/rejected": -159.61842346191406, "eval_loss": 0.6393665075302124, "eval_rewards/accuracies": 0.6289498209953308, "eval_rewards/chosen": -0.7915008664131165, "eval_rewards/margins": 0.1731095165014267, "eval_rewards/rejected": -0.9646103978157043, "eval_runtime": 357.2087, "eval_samples_per_second": 12.049, "eval_steps_per_second": 1.506, "step": 1800 }, { "epoch": 0.31, "grad_norm": 13.045686157199322, "learning_rate": 4.952538985453499e-07, "logits/chosen": -2.0923218727111816, "logits/rejected": -2.058093547821045, "logps/chosen": -148.35519409179688, "logps/rejected": -166.8310089111328, "loss": 0.6642, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.9175241589546204, "rewards/margins": 0.21410436928272247, "rewards/rejected": -1.131628394126892, "step": 1810 }, { "epoch": 0.31, "grad_norm": 14.746527696680772, "learning_rate": 4.951069954752846e-07, "logits/chosen": -2.104447841644287, "logits/rejected": -2.0722100734710693, "logps/chosen": -138.68417358398438, "logps/rejected": -155.63540649414062, "loss": 0.6363, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.8351799249649048, "rewards/margins": 0.2144734412431717, "rewards/rejected": -1.0496532917022705, "step": 1820 }, { "epoch": 0.32, "grad_norm": 14.068599458347373, "learning_rate": 4.949578758837864e-07, "logits/chosen": -2.0577917098999023, "logits/rejected": -2.040351390838623, "logps/chosen": -126.67193603515625, "logps/rejected": -151.28140258789062, "loss": 0.6184, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.711285412311554, "rewards/margins": 0.254142701625824, "rewards/rejected": -0.9654279947280884, "step": 1830 }, { "epoch": 0.32, "grad_norm": 16.186612704580565, "learning_rate": 4.948065411193554e-07, "logits/chosen": -2.2264585494995117, "logits/rejected": -2.2193264961242676, "logps/chosen": -132.20895385742188, "logps/rejected": -154.45802307128906, "loss": 0.6388, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.7740861177444458, "rewards/margins": 0.22031357884407043, "rewards/rejected": -0.9943998456001282, "step": 1840 }, { "epoch": 0.32, "grad_norm": 13.631222489310312, "learning_rate": 4.946529925505233e-07, "logits/chosen": -2.104651927947998, "logits/rejected": -2.099863052368164, "logps/chosen": -124.88291931152344, "logps/rejected": -150.3798828125, "loss": 0.6244, "rewards/accuracies": 0.65625, "rewards/chosen": -0.7169033288955688, "rewards/margins": 0.24011361598968506, "rewards/rejected": -0.9570168256759644, "step": 1850 }, { "epoch": 0.32, "grad_norm": 11.597593527632558, "learning_rate": 4.944972315658417e-07, "logits/chosen": -2.038820505142212, "logits/rejected": -2.0067200660705566, "logps/chosen": -129.09518432617188, "logps/rejected": -153.1046142578125, "loss": 0.601, "rewards/accuracies": 0.71875, "rewards/chosen": -0.7181129455566406, "rewards/margins": 0.27581366896629333, "rewards/rejected": -0.9939267039299011, "step": 1860 }, { "epoch": 0.32, "grad_norm": 14.814299163074143, "learning_rate": 4.943392595738695e-07, "logits/chosen": -2.0475425720214844, "logits/rejected": -2.018345832824707, "logps/chosen": -130.18441772460938, "logps/rejected": -163.41127014160156, "loss": 0.5883, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.7427932024002075, "rewards/margins": 0.35372671484947205, "rewards/rejected": -1.096519947052002, "step": 1870 }, { "epoch": 0.32, "grad_norm": 13.085964342637993, "learning_rate": 4.941790780031591e-07, "logits/chosen": -2.040121555328369, "logits/rejected": -2.0052008628845215, "logps/chosen": -139.4851531982422, "logps/rejected": -169.9237518310547, "loss": 0.5986, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.8679348230361938, "rewards/margins": 0.3169488310813904, "rewards/rejected": -1.184883713722229, "step": 1880 }, { "epoch": 0.33, "grad_norm": 17.97087896244849, "learning_rate": 4.94016688302245e-07, "logits/chosen": -2.036181688308716, "logits/rejected": -2.021777629852295, "logps/chosen": -137.5182647705078, "logps/rejected": -178.22129821777344, "loss": 0.561, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.8445706367492676, "rewards/margins": 0.39340075850486755, "rewards/rejected": -1.237971544265747, "step": 1890 }, { "epoch": 0.33, "grad_norm": 15.473904552778107, "learning_rate": 4.938520919396297e-07, "logits/chosen": -1.9097896814346313, "logits/rejected": -1.878089189529419, "logps/chosen": -160.5067901611328, "logps/rejected": -182.10073852539062, "loss": 0.6238, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.030932068824768, "rewards/margins": 0.26739898324012756, "rewards/rejected": -1.2983310222625732, "step": 1900 }, { "epoch": 0.33, "eval_logits/chosen": -1.9768445491790771, "eval_logits/rejected": -1.964641809463501, "eval_logps/chosen": -151.84054565429688, "eval_logps/rejected": -174.0358428955078, "eval_loss": 0.6393516659736633, "eval_rewards/accuracies": 0.6280204653739929, "eval_rewards/chosen": -0.9313669800758362, "eval_rewards/margins": 0.1774175763130188, "eval_rewards/rejected": -1.108784556388855, "eval_runtime": 356.9465, "eval_samples_per_second": 12.058, "eval_steps_per_second": 1.507, "step": 1900 }, { "epoch": 0.33, "grad_norm": 18.295705836142915, "learning_rate": 4.936852904037709e-07, "logits/chosen": -1.8353763818740845, "logits/rejected": -1.7998859882354736, "logps/chosen": -162.01815795898438, "logps/rejected": -199.1243438720703, "loss": 0.5856, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.0756003856658936, "rewards/margins": 0.3827807605266571, "rewards/rejected": -1.458381175994873, "step": 1910 }, { "epoch": 0.33, "grad_norm": 15.587066902188072, "learning_rate": 4.935162852030678e-07, "logits/chosen": -1.9671123027801514, "logits/rejected": -1.9385311603546143, "logps/chosen": -158.43060302734375, "logps/rejected": -187.6251678466797, "loss": 0.611, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.030988097190857, "rewards/margins": 0.3086529076099396, "rewards/rejected": -1.3396410942077637, "step": 1920 }, { "epoch": 0.33, "grad_norm": 16.64428631174434, "learning_rate": 4.933450778658472e-07, "logits/chosen": -1.9721879959106445, "logits/rejected": -1.9367185831069946, "logps/chosen": -145.00579833984375, "logps/rejected": -175.52078247070312, "loss": 0.6052, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.9111844897270203, "rewards/margins": 0.30568939447402954, "rewards/rejected": -1.2168738842010498, "step": 1930 }, { "epoch": 0.33, "grad_norm": 13.008355304833884, "learning_rate": 4.931716699403504e-07, "logits/chosen": -2.0365664958953857, "logits/rejected": -2.016010046005249, "logps/chosen": -130.88787841796875, "logps/rejected": -154.35255432128906, "loss": 0.6209, "rewards/accuracies": 0.625, "rewards/chosen": -0.7993988990783691, "rewards/margins": 0.24416379630565643, "rewards/rejected": -1.043562650680542, "step": 1940 }, { "epoch": 0.34, "grad_norm": 11.221840915928341, "learning_rate": 4.929960629947185e-07, "logits/chosen": -2.021613597869873, "logits/rejected": -2.012424945831299, "logps/chosen": -137.0205535888672, "logps/rejected": -171.9865264892578, "loss": 0.6066, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.8452763557434082, "rewards/margins": 0.3268183469772339, "rewards/rejected": -1.172094702720642, "step": 1950 }, { "epoch": 0.34, "grad_norm": 12.459203609632565, "learning_rate": 4.928182586169787e-07, "logits/chosen": -2.0483787059783936, "logits/rejected": -2.024353504180908, "logps/chosen": -136.3292999267578, "logps/rejected": -166.4497833251953, "loss": 0.6003, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.8198174238204956, "rewards/margins": 0.2998635768890381, "rewards/rejected": -1.1196808815002441, "step": 1960 }, { "epoch": 0.34, "grad_norm": 16.3262276995175, "learning_rate": 4.926382584150298e-07, "logits/chosen": -2.052652359008789, "logits/rejected": -2.0237042903900146, "logps/chosen": -133.0852813720703, "logps/rejected": -154.27850341796875, "loss": 0.6179, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.7694526314735413, "rewards/margins": 0.25520798563957214, "rewards/rejected": -1.0246607065200806, "step": 1970 }, { "epoch": 0.34, "grad_norm": 13.309990683269428, "learning_rate": 4.924560640166273e-07, "logits/chosen": -1.9702781438827515, "logits/rejected": -1.955529808998108, "logps/chosen": -143.92767333984375, "logps/rejected": -171.1623077392578, "loss": 0.6143, "rewards/accuracies": 0.65625, "rewards/chosen": -0.8834748268127441, "rewards/margins": 0.28803762793540955, "rewards/rejected": -1.1715123653411865, "step": 1980 }, { "epoch": 0.34, "grad_norm": 17.690808430077606, "learning_rate": 4.922716770693691e-07, "logits/chosen": -2.02256441116333, "logits/rejected": -1.9881162643432617, "logps/chosen": -148.55735778808594, "logps/rejected": -185.8997344970703, "loss": 0.5663, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.9775354266166687, "rewards/margins": 0.3999343514442444, "rewards/rejected": -1.377469778060913, "step": 1990 }, { "epoch": 0.34, "grad_norm": 20.779887890208492, "learning_rate": 4.920850992406809e-07, "logits/chosen": -1.9081655740737915, "logits/rejected": -1.9007370471954346, "logps/chosen": -167.69796752929688, "logps/rejected": -216.0054168701172, "loss": 0.5824, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.1772109270095825, "rewards/margins": 0.4069501757621765, "rewards/rejected": -1.5841610431671143, "step": 2000 }, { "epoch": 0.34, "eval_logits/chosen": -1.988376498222351, "eval_logits/rejected": -1.9742034673690796, "eval_logps/chosen": -156.2569122314453, "eval_logps/rejected": -181.40647888183594, "eval_loss": 0.6345042586326599, "eval_rewards/accuracies": 0.6338289976119995, "eval_rewards/chosen": -0.9755305647850037, "eval_rewards/margins": 0.2069605439901352, "eval_rewards/rejected": -1.1824910640716553, "eval_runtime": 356.8317, "eval_samples_per_second": 12.062, "eval_steps_per_second": 1.508, "step": 2000 }, { "epoch": 0.35, "grad_norm": 13.969050635341127, "learning_rate": 4.918963322178002e-07, "logits/chosen": -1.8815292119979858, "logits/rejected": -1.8513492345809937, "logps/chosen": -167.8777313232422, "logps/rejected": -195.77195739746094, "loss": 0.6169, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.1228792667388916, "rewards/margins": 0.3168966770172119, "rewards/rejected": -1.439776062965393, "step": 2010 }, { "epoch": 0.35, "grad_norm": 15.797574816697441, "learning_rate": 4.917053777077616e-07, "logits/chosen": -1.8998935222625732, "logits/rejected": -1.8719685077667236, "logps/chosen": -154.22711181640625, "logps/rejected": -197.74649047851562, "loss": 0.5839, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.0441197156906128, "rewards/margins": 0.38671204447746277, "rewards/rejected": -1.4308319091796875, "step": 2020 }, { "epoch": 0.35, "grad_norm": 14.833429475519068, "learning_rate": 4.915122374373815e-07, "logits/chosen": -1.9642966985702515, "logits/rejected": -1.9340064525604248, "logps/chosen": -159.75148010253906, "logps/rejected": -195.52828979492188, "loss": 0.5832, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.0272386074066162, "rewards/margins": 0.3762350082397461, "rewards/rejected": -1.4034736156463623, "step": 2030 }, { "epoch": 0.35, "grad_norm": 14.865053760007001, "learning_rate": 4.913169131532422e-07, "logits/chosen": -1.820640206336975, "logits/rejected": -1.7987966537475586, "logps/chosen": -145.8691864013672, "logps/rejected": -193.00489807128906, "loss": 0.5621, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.9316226243972778, "rewards/margins": 0.45906609296798706, "rewards/rejected": -1.3906886577606201, "step": 2040 }, { "epoch": 0.35, "grad_norm": 20.45821384576311, "learning_rate": 4.911194066216765e-07, "logits/chosen": -1.864013671875, "logits/rejected": -1.8330237865447998, "logps/chosen": -153.375732421875, "logps/rejected": -193.52369689941406, "loss": 0.5926, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.0217927694320679, "rewards/margins": 0.36956191062927246, "rewards/rejected": -1.3913547992706299, "step": 2050 }, { "epoch": 0.35, "grad_norm": 15.765551507413845, "learning_rate": 4.909197196287509e-07, "logits/chosen": -1.8556013107299805, "logits/rejected": -1.8133299350738525, "logps/chosen": -158.96043395996094, "logps/rejected": -183.34689331054688, "loss": 0.6223, "rewards/accuracies": 0.59375, "rewards/chosen": -1.036510705947876, "rewards/margins": 0.2811738848686218, "rewards/rejected": -1.3176846504211426, "step": 2060 }, { "epoch": 0.36, "grad_norm": 16.728188759969367, "learning_rate": 4.907178539802502e-07, "logits/chosen": -1.8902781009674072, "logits/rejected": -1.8563499450683594, "logps/chosen": -154.345947265625, "logps/rejected": -198.3380889892578, "loss": 0.5807, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.9906846284866333, "rewards/margins": 0.44513338804244995, "rewards/rejected": -1.4358179569244385, "step": 2070 }, { "epoch": 0.36, "grad_norm": 17.234432946942448, "learning_rate": 4.905138115016614e-07, "logits/chosen": -1.8345119953155518, "logits/rejected": -1.7954838275909424, "logps/chosen": -153.2652587890625, "logps/rejected": -191.7286376953125, "loss": 0.5862, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.0140693187713623, "rewards/margins": 0.39727577567100525, "rewards/rejected": -1.41134512424469, "step": 2080 }, { "epoch": 0.36, "grad_norm": 17.27758613167349, "learning_rate": 4.903075940381559e-07, "logits/chosen": -1.8448431491851807, "logits/rejected": -1.8325908184051514, "logps/chosen": -147.30392456054688, "logps/rejected": -176.21583557128906, "loss": 0.6304, "rewards/accuracies": 0.625, "rewards/chosen": -0.9291483163833618, "rewards/margins": 0.2863280475139618, "rewards/rejected": -1.2154762744903564, "step": 2090 }, { "epoch": 0.36, "grad_norm": 13.545506652655037, "learning_rate": 4.900992034545744e-07, "logits/chosen": -1.8317134380340576, "logits/rejected": -1.801325798034668, "logps/chosen": -134.27406311035156, "logps/rejected": -163.0066680908203, "loss": 0.5895, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.7893965840339661, "rewards/margins": 0.32822003960609436, "rewards/rejected": -1.1176166534423828, "step": 2100 }, { "epoch": 0.36, "eval_logits/chosen": -1.95328950881958, "eval_logits/rejected": -1.9400743246078491, "eval_logps/chosen": -124.55523681640625, "eval_logps/rejected": -143.9415740966797, "eval_loss": 0.6449150443077087, "eval_rewards/accuracies": 0.6338289976119995, "eval_rewards/chosen": -0.6585139632225037, "eval_rewards/margins": 0.14932793378829956, "eval_rewards/rejected": -0.8078420162200928, "eval_runtime": 356.7749, "eval_samples_per_second": 12.064, "eval_steps_per_second": 1.508, "step": 2100 }, { "epoch": 0.36, "grad_norm": 20.3896857533217, "learning_rate": 4.898886416354088e-07, "logits/chosen": -1.8545172214508057, "logits/rejected": -1.8379218578338623, "logps/chosen": -136.37522888183594, "logps/rejected": -184.2274932861328, "loss": 0.5751, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.8746525049209595, "rewards/margins": 0.41598910093307495, "rewards/rejected": -1.2906416654586792, "step": 2110 }, { "epoch": 0.37, "grad_norm": 19.966726504918753, "learning_rate": 4.896759104847859e-07, "logits/chosen": -1.6481273174285889, "logits/rejected": -1.6026216745376587, "logps/chosen": -143.7649688720703, "logps/rejected": -195.6068115234375, "loss": 0.549, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.90205317735672, "rewards/margins": 0.5448054075241089, "rewards/rejected": -1.4468586444854736, "step": 2120 }, { "epoch": 0.37, "grad_norm": 22.5331311441494, "learning_rate": 4.8946101192645e-07, "logits/chosen": -1.5899341106414795, "logits/rejected": -1.5583152770996094, "logps/chosen": -178.53843688964844, "logps/rejected": -229.2800750732422, "loss": 0.5623, "rewards/accuracies": 0.71875, "rewards/chosen": -1.2409361600875854, "rewards/margins": 0.5267373323440552, "rewards/rejected": -1.7676734924316406, "step": 2130 }, { "epoch": 0.37, "grad_norm": 19.39894132054709, "learning_rate": 4.892439479037451e-07, "logits/chosen": -1.6246334314346313, "logits/rejected": -1.6016099452972412, "logps/chosen": -173.16412353515625, "logps/rejected": -215.16110229492188, "loss": 0.6048, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.245426058769226, "rewards/margins": 0.37089425325393677, "rewards/rejected": -1.6163203716278076, "step": 2140 }, { "epoch": 0.37, "grad_norm": 18.938179724571942, "learning_rate": 4.89024720379598e-07, "logits/chosen": -1.6600227355957031, "logits/rejected": -1.6034603118896484, "logps/chosen": -167.83346557617188, "logps/rejected": -214.2350616455078, "loss": 0.5413, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.1314681768417358, "rewards/margins": 0.531185507774353, "rewards/rejected": -1.6626536846160889, "step": 2150 }, { "epoch": 0.37, "grad_norm": 17.819138078463755, "learning_rate": 4.888033313365001e-07, "logits/chosen": -1.5937135219573975, "logits/rejected": -1.5616223812103271, "logps/chosen": -189.21583557128906, "logps/rejected": -235.78518676757812, "loss": 0.5612, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.344967007637024, "rewards/margins": 0.5175682902336121, "rewards/rejected": -1.8625354766845703, "step": 2160 }, { "epoch": 0.37, "grad_norm": 28.899143582146355, "learning_rate": 4.885797827764895e-07, "logits/chosen": -1.6774377822875977, "logits/rejected": -1.6321741342544556, "logps/chosen": -193.11886596679688, "logps/rejected": -255.240234375, "loss": 0.543, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.3880548477172852, "rewards/margins": 0.6497060656547546, "rewards/rejected": -2.0377612113952637, "step": 2170 }, { "epoch": 0.38, "grad_norm": 20.347133369564297, "learning_rate": 4.88354076721133e-07, "logits/chosen": -1.7743288278579712, "logits/rejected": -1.7314989566802979, "logps/chosen": -204.22975158691406, "logps/rejected": -241.63369750976562, "loss": 0.6385, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.4623057842254639, "rewards/margins": 0.42681413888931274, "rewards/rejected": -1.889120101928711, "step": 2180 }, { "epoch": 0.38, "grad_norm": 11.864063871740855, "learning_rate": 4.88126215211508e-07, "logits/chosen": -2.0105056762695312, "logits/rejected": -1.9917558431625366, "logps/chosen": -134.98178100585938, "logps/rejected": -173.46408081054688, "loss": 0.5908, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.8213205337524414, "rewards/margins": 0.37510326504707336, "rewards/rejected": -1.1964237689971924, "step": 2190 }, { "epoch": 0.38, "grad_norm": 12.665212840871442, "learning_rate": 4.878962003081834e-07, "logits/chosen": -1.8419253826141357, "logits/rejected": -1.8090530633926392, "logps/chosen": -125.55128479003906, "logps/rejected": -168.30105590820312, "loss": 0.5633, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.7324917912483215, "rewards/margins": 0.4029674530029297, "rewards/rejected": -1.1354591846466064, "step": 2200 }, { "epoch": 0.38, "eval_logits/chosen": -1.9549309015274048, "eval_logits/rejected": -1.9415898323059082, "eval_logps/chosen": -122.187744140625, "eval_logps/rejected": -142.1007080078125, "eval_loss": 0.6433526277542114, "eval_rewards/accuracies": 0.6247676610946655, "eval_rewards/chosen": -0.6348390579223633, "eval_rewards/margins": 0.1545940786600113, "eval_rewards/rejected": -0.7894331216812134, "eval_runtime": 356.7846, "eval_samples_per_second": 12.063, "eval_steps_per_second": 1.508, "step": 2200 }, { "epoch": 0.38, "grad_norm": 15.566693864294429, "learning_rate": 4.87664034091202e-07, "logits/chosen": -1.864985466003418, "logits/rejected": -1.842546820640564, "logps/chosen": -135.0320587158203, "logps/rejected": -167.46334838867188, "loss": 0.6055, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.8033797144889832, "rewards/margins": 0.33440515398979187, "rewards/rejected": -1.1377849578857422, "step": 2210 }, { "epoch": 0.38, "grad_norm": 14.08070748751659, "learning_rate": 4.874297186600607e-07, "logits/chosen": -1.6942613124847412, "logits/rejected": -1.6759631633758545, "logps/chosen": -136.83392333984375, "logps/rejected": -170.51962280273438, "loss": 0.5989, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.8297648429870605, "rewards/margins": 0.33708181977272034, "rewards/rejected": -1.1668468713760376, "step": 2220 }, { "epoch": 0.38, "grad_norm": 13.765120270646621, "learning_rate": 4.871932561336917e-07, "logits/chosen": -1.7974563837051392, "logits/rejected": -1.7594830989837646, "logps/chosen": -157.6973114013672, "logps/rejected": -191.2778778076172, "loss": 0.6047, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.0398889780044556, "rewards/margins": 0.3657877445220947, "rewards/rejected": -1.4056766033172607, "step": 2230 }, { "epoch": 0.39, "grad_norm": 14.959197166232116, "learning_rate": 4.869546486504443e-07, "logits/chosen": -1.7539308071136475, "logits/rejected": -1.715118408203125, "logps/chosen": -154.1725311279297, "logps/rejected": -178.8619384765625, "loss": 0.6287, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.9734878540039062, "rewards/margins": 0.27934250235557556, "rewards/rejected": -1.2528302669525146, "step": 2240 }, { "epoch": 0.39, "grad_norm": 30.679025462387873, "learning_rate": 4.867138983680639e-07, "logits/chosen": -1.7157443761825562, "logits/rejected": -1.6704628467559814, "logps/chosen": -154.58035278320312, "logps/rejected": -192.14291381835938, "loss": 0.5929, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.9923057556152344, "rewards/margins": 0.3957834541797638, "rewards/rejected": -1.3880890607833862, "step": 2250 }, { "epoch": 0.39, "grad_norm": 13.975707619834106, "learning_rate": 4.864710074636742e-07, "logits/chosen": -1.6998507976531982, "logits/rejected": -1.6631402969360352, "logps/chosen": -162.00726318359375, "logps/rejected": -190.08140563964844, "loss": 0.6257, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.0117926597595215, "rewards/margins": 0.3156191408634186, "rewards/rejected": -1.3274118900299072, "step": 2260 }, { "epoch": 0.39, "grad_norm": 16.6560589968763, "learning_rate": 4.862259781337561e-07, "logits/chosen": -1.7075884342193604, "logits/rejected": -1.665636420249939, "logps/chosen": -147.17477416992188, "logps/rejected": -179.50704956054688, "loss": 0.6146, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.9310780763626099, "rewards/margins": 0.34703630208969116, "rewards/rejected": -1.2781143188476562, "step": 2270 }, { "epoch": 0.39, "grad_norm": 14.830061652144124, "learning_rate": 4.859788125941288e-07, "logits/chosen": -1.791953444480896, "logits/rejected": -1.7653782367706299, "logps/chosen": -126.0338134765625, "logps/rejected": -163.85520935058594, "loss": 0.5711, "rewards/accuracies": 0.71875, "rewards/chosen": -0.7260233163833618, "rewards/margins": 0.3571609556674957, "rewards/rejected": -1.0831841230392456, "step": 2280 }, { "epoch": 0.39, "grad_norm": 14.350187386244336, "learning_rate": 4.857295130799293e-07, "logits/chosen": -1.6346839666366577, "logits/rejected": -1.5958842039108276, "logps/chosen": -142.57481384277344, "logps/rejected": -190.54708862304688, "loss": 0.5495, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.8971936106681824, "rewards/margins": 0.4708176255226135, "rewards/rejected": -1.368011236190796, "step": 2290 }, { "epoch": 0.4, "grad_norm": 21.919618404427325, "learning_rate": 4.854780818455922e-07, "logits/chosen": -1.7218765020370483, "logits/rejected": -1.671383261680603, "logps/chosen": -162.91741943359375, "logps/rejected": -212.5228729248047, "loss": 0.5459, "rewards/accuracies": 0.71875, "rewards/chosen": -1.0696805715560913, "rewards/margins": 0.5191500782966614, "rewards/rejected": -1.5888304710388184, "step": 2300 }, { "epoch": 0.4, "eval_logits/chosen": -1.7182925939559937, "eval_logits/rejected": -1.699223279953003, "eval_logps/chosen": -166.81613159179688, "eval_logps/rejected": -196.93434143066406, "eval_loss": 0.6319848895072937, "eval_rewards/accuracies": 0.6301115155220032, "eval_rewards/chosen": -1.0811227560043335, "eval_rewards/margins": 0.25664687156677246, "eval_rewards/rejected": -1.3377697467803955, "eval_runtime": 355.9925, "eval_samples_per_second": 12.09, "eval_steps_per_second": 1.511, "step": 2300 }, { "epoch": 0.4, "grad_norm": 25.808234378973566, "learning_rate": 4.852245211648297e-07, "logits/chosen": -1.5000966787338257, "logits/rejected": -1.466485619544983, "logps/chosen": -192.41690063476562, "logps/rejected": -250.5450439453125, "loss": 0.5307, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.413515329360962, "rewards/margins": 0.577852725982666, "rewards/rejected": -1.991368055343628, "step": 2310 }, { "epoch": 0.4, "grad_norm": 29.80630825333485, "learning_rate": 4.849688333306104e-07, "logits/chosen": -1.5186668634414673, "logits/rejected": -1.4686113595962524, "logps/chosen": -204.86245727539062, "logps/rejected": -261.40045166015625, "loss": 0.5375, "rewards/accuracies": 0.71875, "rewards/chosen": -1.474421501159668, "rewards/margins": 0.6283503770828247, "rewards/rejected": -2.1027719974517822, "step": 2320 }, { "epoch": 0.4, "grad_norm": 22.33412954401948, "learning_rate": 4.847110206551393e-07, "logits/chosen": -1.4632006883621216, "logits/rejected": -1.4255832433700562, "logps/chosen": -201.6676483154297, "logps/rejected": -270.60858154296875, "loss": 0.5217, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.4586617946624756, "rewards/margins": 0.7248638868331909, "rewards/rejected": -2.183525562286377, "step": 2330 }, { "epoch": 0.4, "grad_norm": 20.61698070907491, "learning_rate": 4.844510854698359e-07, "logits/chosen": -1.5553325414657593, "logits/rejected": -1.5306508541107178, "logps/chosen": -203.3428497314453, "logps/rejected": -246.02001953125, "loss": 0.6196, "rewards/accuracies": 0.65625, "rewards/chosen": -1.5106031894683838, "rewards/margins": 0.3960232734680176, "rewards/rejected": -1.9066263437271118, "step": 2340 }, { "epoch": 0.4, "grad_norm": 24.7881803852688, "learning_rate": 4.841890301253145e-07, "logits/chosen": -1.548393726348877, "logits/rejected": -1.509019136428833, "logps/chosen": -186.4705047607422, "logps/rejected": -242.9622039794922, "loss": 0.544, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.2822117805480957, "rewards/margins": 0.5945440530776978, "rewards/rejected": -1.876755952835083, "step": 2350 }, { "epoch": 0.41, "grad_norm": 21.155630137442166, "learning_rate": 4.839248569913614e-07, "logits/chosen": -1.4889419078826904, "logits/rejected": -1.452192783355713, "logps/chosen": -189.53225708007812, "logps/rejected": -255.5863800048828, "loss": 0.5421, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.3918462991714478, "rewards/margins": 0.6527556777000427, "rewards/rejected": -2.044602155685425, "step": 2360 }, { "epoch": 0.41, "grad_norm": 34.58209504713677, "learning_rate": 4.836585684569147e-07, "logits/chosen": -1.4630403518676758, "logits/rejected": -1.430633544921875, "logps/chosen": -206.46591186523438, "logps/rejected": -273.29779052734375, "loss": 0.5551, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.527071237564087, "rewards/margins": 0.6676559448242188, "rewards/rejected": -2.1947274208068848, "step": 2370 }, { "epoch": 0.41, "grad_norm": 26.061151506389557, "learning_rate": 4.833901669300424e-07, "logits/chosen": -1.4684240818023682, "logits/rejected": -1.4264377355575562, "logps/chosen": -186.2281494140625, "logps/rejected": -233.110107421875, "loss": 0.6138, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.3187849521636963, "rewards/margins": 0.4836592674255371, "rewards/rejected": -1.8024442195892334, "step": 2380 }, { "epoch": 0.41, "grad_norm": 18.221595816735224, "learning_rate": 4.831196548379198e-07, "logits/chosen": -1.5969889163970947, "logits/rejected": -1.5512523651123047, "logps/chosen": -178.01239013671875, "logps/rejected": -238.6937255859375, "loss": 0.5315, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.2178453207015991, "rewards/margins": 0.6382966041564941, "rewards/rejected": -1.8561418056488037, "step": 2390 }, { "epoch": 0.41, "grad_norm": 15.589350815937946, "learning_rate": 4.828470346268088e-07, "logits/chosen": -1.6465771198272705, "logits/rejected": -1.6099990606307983, "logps/chosen": -179.09376525878906, "logps/rejected": -228.93905639648438, "loss": 0.5786, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.2678877115249634, "rewards/margins": 0.4891243577003479, "rewards/rejected": -1.7570120096206665, "step": 2400 }, { "epoch": 0.41, "eval_logits/chosen": -1.6363126039505005, "eval_logits/rejected": -1.6167610883712769, "eval_logps/chosen": -178.53884887695312, "eval_logps/rejected": -209.47793579101562, "eval_loss": 0.6305522918701172, "eval_rewards/accuracies": 0.6291821599006653, "eval_rewards/chosen": -1.1983500719070435, "eval_rewards/margins": 0.2648555040359497, "eval_rewards/rejected": -1.4632055759429932, "eval_runtime": 356.7532, "eval_samples_per_second": 12.064, "eval_steps_per_second": 1.508, "step": 2400 }, { "epoch": 0.42, "grad_norm": 21.985975720515064, "learning_rate": 4.82572308762035e-07, "logits/chosen": -1.5702491998672485, "logits/rejected": -1.5282782316207886, "logps/chosen": -188.3826904296875, "logps/rejected": -229.83132934570312, "loss": 0.5676, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.3452413082122803, "rewards/margins": 0.4605236053466797, "rewards/rejected": -1.8057647943496704, "step": 2410 }, { "epoch": 0.42, "grad_norm": 26.14144209889029, "learning_rate": 4.822954797279652e-07, "logits/chosen": -1.5276035070419312, "logits/rejected": -1.4836372137069702, "logps/chosen": -208.26181030273438, "logps/rejected": -264.6378479003906, "loss": 0.5711, "rewards/accuracies": 0.78125, "rewards/chosen": -1.5207151174545288, "rewards/margins": 0.5766458511352539, "rewards/rejected": -2.0973610877990723, "step": 2420 }, { "epoch": 0.42, "grad_norm": 17.28711843102643, "learning_rate": 4.82016550027986e-07, "logits/chosen": -1.5296419858932495, "logits/rejected": -1.4949567317962646, "logps/chosen": -192.01145935058594, "logps/rejected": -231.2733154296875, "loss": 0.5997, "rewards/accuracies": 0.65625, "rewards/chosen": -1.371919870376587, "rewards/margins": 0.4235268533229828, "rewards/rejected": -1.7954469919204712, "step": 2430 }, { "epoch": 0.42, "grad_norm": 15.776095113993128, "learning_rate": 4.817355221844802e-07, "logits/chosen": -1.5643110275268555, "logits/rejected": -1.5382698774337769, "logps/chosen": -172.5228729248047, "logps/rejected": -228.47189331054688, "loss": 0.5417, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.1747311353683472, "rewards/margins": 0.5653839707374573, "rewards/rejected": -1.7401151657104492, "step": 2440 }, { "epoch": 0.42, "grad_norm": 19.47187684087894, "learning_rate": 4.814523987388038e-07, "logits/chosen": -1.5278120040893555, "logits/rejected": -1.490755319595337, "logps/chosen": -177.8535919189453, "logps/rejected": -222.85791015625, "loss": 0.5882, "rewards/accuracies": 0.65625, "rewards/chosen": -1.2479978799819946, "rewards/margins": 0.45816653966903687, "rewards/rejected": -1.7061645984649658, "step": 2450 }, { "epoch": 0.42, "grad_norm": 14.220492265970561, "learning_rate": 4.811671822512644e-07, "logits/chosen": -1.5169602632522583, "logits/rejected": -1.4735338687896729, "logps/chosen": -167.81532287597656, "logps/rejected": -209.4271240234375, "loss": 0.586, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.1169971227645874, "rewards/margins": 0.45197534561157227, "rewards/rejected": -1.5689725875854492, "step": 2460 }, { "epoch": 0.43, "grad_norm": 24.10851119267855, "learning_rate": 4.808798753010965e-07, "logits/chosen": -1.648048758506775, "logits/rejected": -1.6161121129989624, "logps/chosen": -171.47268676757812, "logps/rejected": -209.07357788085938, "loss": 0.5908, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.1645221710205078, "rewards/margins": 0.39932164549827576, "rewards/rejected": -1.5638437271118164, "step": 2470 }, { "epoch": 0.43, "grad_norm": 15.012483229366685, "learning_rate": 4.805904804864388e-07, "logits/chosen": -1.6050293445587158, "logits/rejected": -1.5672911405563354, "logps/chosen": -169.68240356445312, "logps/rejected": -201.05831909179688, "loss": 0.6088, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.153080701828003, "rewards/margins": 0.35358747839927673, "rewards/rejected": -1.506668210029602, "step": 2480 }, { "epoch": 0.43, "grad_norm": 17.517728604597213, "learning_rate": 4.802990004243112e-07, "logits/chosen": -1.6790720224380493, "logits/rejected": -1.6492221355438232, "logps/chosen": -141.60067749023438, "logps/rejected": -183.9330291748047, "loss": 0.581, "rewards/accuracies": 0.71875, "rewards/chosen": -0.9159472584724426, "rewards/margins": 0.4076360762119293, "rewards/rejected": -1.3235833644866943, "step": 2490 }, { "epoch": 0.43, "grad_norm": 19.29815730049706, "learning_rate": 4.800054377505901e-07, "logits/chosen": -1.7170441150665283, "logits/rejected": -1.6744773387908936, "logps/chosen": -157.9257354736328, "logps/rejected": -203.2161102294922, "loss": 0.5679, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.0103013515472412, "rewards/margins": 0.4565104842185974, "rewards/rejected": -1.4668117761611938, "step": 2500 }, { "epoch": 0.43, "eval_logits/chosen": -1.7219594717025757, "eval_logits/rejected": -1.7044074535369873, "eval_logps/chosen": -148.90235900878906, "eval_logps/rejected": -175.4527587890625, "eval_loss": 0.6329796314239502, "eval_rewards/accuracies": 0.6345260143280029, "eval_rewards/chosen": -0.9019851088523865, "eval_rewards/margins": 0.22096872329711914, "eval_rewards/rejected": -1.1229537725448608, "eval_runtime": 356.8285, "eval_samples_per_second": 12.062, "eval_steps_per_second": 1.508, "step": 2500 }, { "epoch": 0.43, "grad_norm": 20.098541263273624, "learning_rate": 4.797097951199854e-07, "logits/chosen": -1.5535961389541626, "logits/rejected": -1.5248453617095947, "logps/chosen": -159.08462524414062, "logps/rejected": -213.17837524414062, "loss": 0.546, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.067018985748291, "rewards/margins": 0.5451322793960571, "rewards/rejected": -1.6121511459350586, "step": 2510 }, { "epoch": 0.43, "grad_norm": 15.302909435904388, "learning_rate": 4.794120752060162e-07, "logits/chosen": -1.5149682760238647, "logits/rejected": -1.4745677709579468, "logps/chosen": -166.39657592773438, "logps/rejected": -208.57785034179688, "loss": 0.5939, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.1393781900405884, "rewards/margins": 0.4565187990665436, "rewards/rejected": -1.5958969593048096, "step": 2520 }, { "epoch": 0.44, "grad_norm": 25.687681170750803, "learning_rate": 4.791122807009866e-07, "logits/chosen": -1.568881869316101, "logits/rejected": -1.552473783493042, "logps/chosen": -177.05374145507812, "logps/rejected": -220.03857421875, "loss": 0.6046, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2589091062545776, "rewards/margins": 0.40004104375839233, "rewards/rejected": -1.6589502096176147, "step": 2530 }, { "epoch": 0.44, "grad_norm": 17.330151466831865, "learning_rate": 4.788104143159616e-07, "logits/chosen": -1.6212892532348633, "logits/rejected": -1.5904419422149658, "logps/chosen": -177.31298828125, "logps/rejected": -227.2390899658203, "loss": 0.6146, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2378965616226196, "rewards/margins": 0.4961935579776764, "rewards/rejected": -1.7340900897979736, "step": 2540 }, { "epoch": 0.44, "grad_norm": 17.990325513608543, "learning_rate": 4.785064787807418e-07, "logits/chosen": -1.6909650564193726, "logits/rejected": -1.6431467533111572, "logps/chosen": -151.2425537109375, "logps/rejected": -197.7589569091797, "loss": 0.5459, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.9256092309951782, "rewards/margins": 0.5123879313468933, "rewards/rejected": -1.4379972219467163, "step": 2550 }, { "epoch": 0.44, "grad_norm": 14.629576613571512, "learning_rate": 4.782004768438399e-07, "logits/chosen": -1.7803840637207031, "logits/rejected": -1.744502305984497, "logps/chosen": -138.01266479492188, "logps/rejected": -177.8213348388672, "loss": 0.5766, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.8278089761734009, "rewards/margins": 0.41043511033058167, "rewards/rejected": -1.2382439374923706, "step": 2560 }, { "epoch": 0.44, "grad_norm": 14.783534214563243, "learning_rate": 4.778924112724548e-07, "logits/chosen": -1.6910631656646729, "logits/rejected": -1.6597950458526611, "logps/chosen": -161.07774353027344, "logps/rejected": -207.1680145263672, "loss": 0.5656, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.02474045753479, "rewards/margins": 0.4819648861885071, "rewards/rejected": -1.506705403327942, "step": 2570 }, { "epoch": 0.44, "grad_norm": 24.195144500390953, "learning_rate": 4.775822848524474e-07, "logits/chosen": -1.65180242061615, "logits/rejected": -1.6205854415893555, "logps/chosen": -174.69419860839844, "logps/rejected": -216.5833740234375, "loss": 0.5993, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.2136080265045166, "rewards/margins": 0.4207339286804199, "rewards/rejected": -1.634341835975647, "step": 2580 }, { "epoch": 0.45, "grad_norm": 24.07431376599948, "learning_rate": 4.772701003883146e-07, "logits/chosen": -1.6589524745941162, "logits/rejected": -1.6199334859848022, "logps/chosen": -160.41822814941406, "logps/rejected": -193.14602661132812, "loss": 0.6035, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.0372803211212158, "rewards/margins": 0.38218361139297485, "rewards/rejected": -1.419463872909546, "step": 2590 }, { "epoch": 0.45, "grad_norm": 17.260199729003336, "learning_rate": 4.769558607031646e-07, "logits/chosen": -1.6966606378555298, "logits/rejected": -1.6404602527618408, "logps/chosen": -153.80836486816406, "logps/rejected": -197.2882080078125, "loss": 0.5426, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.9672321081161499, "rewards/margins": 0.5213042497634888, "rewards/rejected": -1.4885362386703491, "step": 2600 }, { "epoch": 0.45, "eval_logits/chosen": -1.7993457317352295, "eval_logits/rejected": -1.7825459241867065, "eval_logps/chosen": -147.43885803222656, "eval_logps/rejected": -172.26231384277344, "eval_loss": 0.6352224946022034, "eval_rewards/accuracies": 0.6354553699493408, "eval_rewards/chosen": -0.8873502016067505, "eval_rewards/margins": 0.20369918644428253, "eval_rewards/rejected": -1.0910491943359375, "eval_runtime": 356.7035, "eval_samples_per_second": 12.066, "eval_steps_per_second": 1.508, "step": 2600 }, { "epoch": 0.45, "grad_norm": 16.168191035992383, "learning_rate": 4.7663956863869114e-07, "logits/chosen": -1.646691918373108, "logits/rejected": -1.5923856496810913, "logps/chosen": -167.03103637695312, "logps/rejected": -211.1219482421875, "loss": 0.5665, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.0866007804870605, "rewards/margins": 0.5080444812774658, "rewards/rejected": -1.5946451425552368, "step": 2610 }, { "epoch": 0.45, "grad_norm": 21.121725151266354, "learning_rate": 4.7632122705514764e-07, "logits/chosen": -1.6836843490600586, "logits/rejected": -1.6414811611175537, "logps/chosen": -174.40000915527344, "logps/rejected": -224.135009765625, "loss": 0.5734, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.2318413257598877, "rewards/margins": 0.5052552223205566, "rewards/rejected": -1.7370964288711548, "step": 2620 }, { "epoch": 0.45, "grad_norm": 20.850911780664795, "learning_rate": 4.760008388313216e-07, "logits/chosen": -1.5688848495483398, "logits/rejected": -1.5264674425125122, "logps/chosen": -175.73428344726562, "logps/rejected": -224.6223907470703, "loss": 0.5769, "rewards/accuracies": 0.71875, "rewards/chosen": -1.209410548210144, "rewards/margins": 0.5050948262214661, "rewards/rejected": -1.7145051956176758, "step": 2630 }, { "epoch": 0.45, "grad_norm": 19.418048097017046, "learning_rate": 4.756784068645083e-07, "logits/chosen": -1.5928579568862915, "logits/rejected": -1.553302526473999, "logps/chosen": -167.90257263183594, "logps/rejected": -221.66702270507812, "loss": 0.5415, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.178271770477295, "rewards/margins": 0.5487642288208008, "rewards/rejected": -1.7270358800888062, "step": 2640 }, { "epoch": 0.46, "grad_norm": 21.88823441031475, "learning_rate": 4.75353934070485e-07, "logits/chosen": -1.5368947982788086, "logits/rejected": -1.5017019510269165, "logps/chosen": -185.3848419189453, "logps/rejected": -244.77587890625, "loss": 0.5833, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.3293806314468384, "rewards/margins": 0.5755869746208191, "rewards/rejected": -1.9049675464630127, "step": 2650 }, { "epoch": 0.46, "grad_norm": 22.610484118287804, "learning_rate": 4.7502742338348406e-07, "logits/chosen": -1.5877610445022583, "logits/rejected": -1.5406101942062378, "logps/chosen": -193.39242553710938, "logps/rejected": -222.32186889648438, "loss": 0.6741, "rewards/accuracies": 0.625, "rewards/chosen": -1.351151943206787, "rewards/margins": 0.34743010997772217, "rewards/rejected": -1.6985820531845093, "step": 2660 }, { "epoch": 0.46, "grad_norm": 13.819491655406011, "learning_rate": 4.746988777561668e-07, "logits/chosen": -1.6597168445587158, "logits/rejected": -1.6143728494644165, "logps/chosen": -164.24771118164062, "logps/rejected": -210.0986328125, "loss": 0.584, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.1030311584472656, "rewards/margins": 0.47755417227745056, "rewards/rejected": -1.5805851221084595, "step": 2670 }, { "epoch": 0.46, "grad_norm": 19.318025617063718, "learning_rate": 4.743683001595965e-07, "logits/chosen": -1.7418750524520874, "logits/rejected": -1.7099230289459229, "logps/chosen": -164.66427612304688, "logps/rejected": -190.61524963378906, "loss": 0.6165, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.0642914772033691, "rewards/margins": 0.30566078424453735, "rewards/rejected": -1.3699522018432617, "step": 2680 }, { "epoch": 0.46, "grad_norm": 12.859233397896235, "learning_rate": 4.7403569358321206e-07, "logits/chosen": -1.7552549839019775, "logits/rejected": -1.7229642868041992, "logps/chosen": -147.30789184570312, "logps/rejected": -191.17984008789062, "loss": 0.5505, "rewards/accuracies": 0.78125, "rewards/chosen": -0.9669073820114136, "rewards/margins": 0.44619670510292053, "rewards/rejected": -1.4131041765213013, "step": 2690 }, { "epoch": 0.47, "grad_norm": 15.446403175724251, "learning_rate": 4.7370106103480013e-07, "logits/chosen": -1.7358205318450928, "logits/rejected": -1.6975898742675781, "logps/chosen": -156.6081085205078, "logps/rejected": -196.40362548828125, "loss": 0.5888, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.033973217010498, "rewards/margins": 0.3954750895500183, "rewards/rejected": -1.4294483661651611, "step": 2700 }, { "epoch": 0.47, "eval_logits/chosen": -1.8467351198196411, "eval_logits/rejected": -1.8293933868408203, "eval_logps/chosen": -149.63987731933594, "eval_logps/rejected": -176.10572814941406, "eval_loss": 0.6302607655525208, "eval_rewards/accuracies": 0.645213782787323, "eval_rewards/chosen": -0.9093602895736694, "eval_rewards/margins": 0.2201230674982071, "eval_rewards/rejected": -1.1294833421707153, "eval_runtime": 356.6899, "eval_samples_per_second": 12.067, "eval_steps_per_second": 1.508, "step": 2700 }, { "epoch": 0.47, "grad_norm": 16.878338793520253, "learning_rate": 4.733644055404687e-07, "logits/chosen": -1.7432657480239868, "logits/rejected": -1.7117999792099, "logps/chosen": -164.71484375, "logps/rejected": -207.14260864257812, "loss": 0.5639, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.052567481994629, "rewards/margins": 0.4653971791267395, "rewards/rejected": -1.5179646015167236, "step": 2710 }, { "epoch": 0.47, "grad_norm": 20.497517981633035, "learning_rate": 4.7302573014461935e-07, "logits/chosen": -1.7307226657867432, "logits/rejected": -1.7192401885986328, "logps/chosen": -163.8205108642578, "logps/rejected": -208.58010864257812, "loss": 0.5986, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.1179062128067017, "rewards/margins": 0.430508553981781, "rewards/rejected": -1.548414707183838, "step": 2720 }, { "epoch": 0.47, "grad_norm": 19.634240873533788, "learning_rate": 4.7268503790991977e-07, "logits/chosen": -1.760005235671997, "logits/rejected": -1.728356957435608, "logps/chosen": -156.45030212402344, "logps/rejected": -194.1329803466797, "loss": 0.5988, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.9790525436401367, "rewards/margins": 0.4097796380519867, "rewards/rejected": -1.3888323307037354, "step": 2730 }, { "epoch": 0.47, "grad_norm": 15.780929737120895, "learning_rate": 4.72342331917276e-07, "logits/chosen": -1.7603209018707275, "logits/rejected": -1.730158805847168, "logps/chosen": -138.56451416015625, "logps/rejected": -176.93080139160156, "loss": 0.5707, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.8573589324951172, "rewards/margins": 0.41820549964904785, "rewards/rejected": -1.275564432144165, "step": 2740 }, { "epoch": 0.47, "grad_norm": 20.00693735071868, "learning_rate": 4.7199761526580484e-07, "logits/chosen": -1.6731714010238647, "logits/rejected": -1.6483711004257202, "logps/chosen": -145.03150939941406, "logps/rejected": -201.181640625, "loss": 0.5471, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.9579359292984009, "rewards/margins": 0.5196166038513184, "rewards/rejected": -1.4775525331497192, "step": 2750 }, { "epoch": 0.48, "grad_norm": 17.11025850674512, "learning_rate": 4.7165089107280536e-07, "logits/chosen": -1.6770479679107666, "logits/rejected": -1.643370270729065, "logps/chosen": -151.8813018798828, "logps/rejected": -205.90744018554688, "loss": 0.5535, "rewards/accuracies": 0.71875, "rewards/chosen": -0.9763303995132446, "rewards/margins": 0.5348490476608276, "rewards/rejected": -1.5111793279647827, "step": 2760 }, { "epoch": 0.48, "grad_norm": 19.624859017470257, "learning_rate": 4.7130216247373123e-07, "logits/chosen": -1.7125215530395508, "logits/rejected": -1.6670726537704468, "logps/chosen": -179.84864807128906, "logps/rejected": -230.32839965820312, "loss": 0.5671, "rewards/accuracies": 0.75, "rewards/chosen": -1.2465837001800537, "rewards/margins": 0.5265911817550659, "rewards/rejected": -1.7731748819351196, "step": 2770 }, { "epoch": 0.48, "grad_norm": 18.39672002807499, "learning_rate": 4.7095143262216203e-07, "logits/chosen": -1.5359172821044922, "logits/rejected": -1.4871833324432373, "logps/chosen": -188.5769805908203, "logps/rejected": -238.5513153076172, "loss": 0.5609, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.3573224544525146, "rewards/margins": 0.5221356749534607, "rewards/rejected": -1.8794580698013306, "step": 2780 }, { "epoch": 0.48, "grad_norm": 25.940828538741872, "learning_rate": 4.705987046897748e-07, "logits/chosen": -1.6233654022216797, "logits/rejected": -1.58616042137146, "logps/chosen": -184.6060028076172, "logps/rejected": -230.30361938476562, "loss": 0.578, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.3283016681671143, "rewards/margins": 0.4737378656864166, "rewards/rejected": -1.8020395040512085, "step": 2790 }, { "epoch": 0.48, "grad_norm": 17.60526676910467, "learning_rate": 4.7024398186631533e-07, "logits/chosen": -1.6539256572723389, "logits/rejected": -1.6196858882904053, "logps/chosen": -191.15408325195312, "logps/rejected": -228.4556121826172, "loss": 0.6328, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.3112226724624634, "rewards/margins": 0.4050907492637634, "rewards/rejected": -1.7163136005401611, "step": 2800 }, { "epoch": 0.48, "eval_logits/chosen": -1.8423043489456177, "eval_logits/rejected": -1.8252124786376953, "eval_logps/chosen": -142.36798095703125, "eval_logps/rejected": -167.40048217773438, "eval_loss": 0.6315863728523254, "eval_rewards/accuracies": 0.6419609785079956, "eval_rewards/chosen": -0.8366413712501526, "eval_rewards/margins": 0.20578964054584503, "eval_rewards/rejected": -1.042431116104126, "eval_runtime": 356.6408, "eval_samples_per_second": 12.068, "eval_steps_per_second": 1.509, "step": 2800 }, { "epoch": 0.48, "grad_norm": 17.01308659310898, "learning_rate": 4.6988726735956953e-07, "logits/chosen": -1.6734424829483032, "logits/rejected": -1.633302092552185, "logps/chosen": -146.92413330078125, "logps/rejected": -195.38314819335938, "loss": 0.5504, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.9160858392715454, "rewards/margins": 0.48056167364120483, "rewards/rejected": -1.3966474533081055, "step": 2810 }, { "epoch": 0.49, "grad_norm": 15.867862572496643, "learning_rate": 4.69528564395334e-07, "logits/chosen": -1.8120372295379639, "logits/rejected": -1.792083740234375, "logps/chosen": -149.9076385498047, "logps/rejected": -176.41195678710938, "loss": 0.6392, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.9560129046440125, "rewards/margins": 0.27826496958732605, "rewards/rejected": -1.2342779636383057, "step": 2820 }, { "epoch": 0.49, "grad_norm": 13.828136544925801, "learning_rate": 4.691678762173874e-07, "logits/chosen": -1.6588958501815796, "logits/rejected": -1.6307001113891602, "logps/chosen": -137.5013427734375, "logps/rejected": -186.1872100830078, "loss": 0.5436, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.8202708959579468, "rewards/margins": 0.49207648634910583, "rewards/rejected": -1.3123472929000854, "step": 2830 }, { "epoch": 0.49, "grad_norm": 12.998742695216233, "learning_rate": 4.6880520608746065e-07, "logits/chosen": -1.766371726989746, "logits/rejected": -1.7423028945922852, "logps/chosen": -146.57913208007812, "logps/rejected": -185.55625915527344, "loss": 0.5919, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.9395810961723328, "rewards/margins": 0.4023555815219879, "rewards/rejected": -1.341936707496643, "step": 2840 }, { "epoch": 0.49, "grad_norm": 16.674011214819668, "learning_rate": 4.684405572852077e-07, "logits/chosen": -1.6769888401031494, "logits/rejected": -1.6428205966949463, "logps/chosen": -159.1728515625, "logps/rejected": -215.3779754638672, "loss": 0.5433, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.0694429874420166, "rewards/margins": 0.5360538363456726, "rewards/rejected": -1.6054970026016235, "step": 2850 }, { "epoch": 0.49, "grad_norm": 16.33436683362672, "learning_rate": 4.680739331081757e-07, "logits/chosen": -1.662724494934082, "logits/rejected": -1.6217238903045654, "logps/chosen": -157.78176879882812, "logps/rejected": -207.1402130126953, "loss": 0.5372, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.0284638404846191, "rewards/margins": 0.5234465003013611, "rewards/rejected": -1.5519102811813354, "step": 2860 }, { "epoch": 0.49, "grad_norm": 20.916547924866293, "learning_rate": 4.677053368717754e-07, "logits/chosen": -1.682941198348999, "logits/rejected": -1.6458778381347656, "logps/chosen": -167.31607055664062, "logps/rejected": -218.1786346435547, "loss": 0.5674, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.1120381355285645, "rewards/margins": 0.539524257183075, "rewards/rejected": -1.6515624523162842, "step": 2870 }, { "epoch": 0.5, "grad_norm": 18.373461916648946, "learning_rate": 4.6733477190925073e-07, "logits/chosen": -1.7388041019439697, "logits/rejected": -1.6905145645141602, "logps/chosen": -175.05714416503906, "logps/rejected": -221.7596435546875, "loss": 0.5676, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.1971012353897095, "rewards/margins": 0.5112650394439697, "rewards/rejected": -1.7083663940429688, "step": 2880 }, { "epoch": 0.5, "grad_norm": 22.994357222490983, "learning_rate": 4.6696224157164943e-07, "logits/chosen": -1.7159115076065063, "logits/rejected": -1.690751075744629, "logps/chosen": -173.77462768554688, "logps/rejected": -222.635986328125, "loss": 0.5746, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.2083570957183838, "rewards/margins": 0.49303531646728516, "rewards/rejected": -1.701392412185669, "step": 2890 }, { "epoch": 0.5, "grad_norm": 23.78726232737787, "learning_rate": 4.6658774922779187e-07, "logits/chosen": -1.6340763568878174, "logits/rejected": -1.6105928421020508, "logps/chosen": -169.45535278320312, "logps/rejected": -216.67333984375, "loss": 0.5746, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.16982901096344, "rewards/margins": 0.48190441727638245, "rewards/rejected": -1.6517333984375, "step": 2900 }, { "epoch": 0.5, "eval_logits/chosen": -1.7276008129119873, "eval_logits/rejected": -1.707594871520996, "eval_logps/chosen": -164.17117309570312, "eval_logps/rejected": -193.61111450195312, "eval_loss": 0.6267496943473816, "eval_rewards/accuracies": 0.6442843675613403, "eval_rewards/chosen": -1.054673433303833, "eval_rewards/margins": 0.24986399710178375, "eval_rewards/rejected": -1.304537296295166, "eval_runtime": 356.9094, "eval_samples_per_second": 12.059, "eval_steps_per_second": 1.507, "step": 2900 }, { "epoch": 0.5, "grad_norm": 22.570926073633988, "learning_rate": 4.662112982642412e-07, "logits/chosen": -1.6592012643814087, "logits/rejected": -1.6184587478637695, "logps/chosen": -178.86044311523438, "logps/rejected": -250.51681518554688, "loss": 0.5009, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.2277532815933228, "rewards/margins": 0.7185968160629272, "rewards/rejected": -1.94635009765625, "step": 2910 }, { "epoch": 0.5, "grad_norm": 25.29506569233524, "learning_rate": 4.6583289208527244e-07, "logits/chosen": -1.5599522590637207, "logits/rejected": -1.5313317775726318, "logps/chosen": -197.64329528808594, "logps/rejected": -261.9999084472656, "loss": 0.5768, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.4407055377960205, "rewards/margins": 0.6243780851364136, "rewards/rejected": -2.0650835037231445, "step": 2920 }, { "epoch": 0.5, "grad_norm": 17.04379311188254, "learning_rate": 4.654525341128418e-07, "logits/chosen": -1.5148179531097412, "logits/rejected": -1.468490481376648, "logps/chosen": -188.95974731445312, "logps/rejected": -253.90164184570312, "loss": 0.5079, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.355076789855957, "rewards/margins": 0.6675424575805664, "rewards/rejected": -2.0226194858551025, "step": 2930 }, { "epoch": 0.51, "grad_norm": 14.086402307191449, "learning_rate": 4.650702277865558e-07, "logits/chosen": -1.5800873041152954, "logits/rejected": -1.5371129512786865, "logps/chosen": -182.9086456298828, "logps/rejected": -230.81295776367188, "loss": 0.5955, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.3146532773971558, "rewards/margins": 0.48588043451309204, "rewards/rejected": -1.8005338907241821, "step": 2940 }, { "epoch": 0.51, "grad_norm": 21.69271778749331, "learning_rate": 4.6468597656363994e-07, "logits/chosen": -1.6005538702011108, "logits/rejected": -1.566699743270874, "logps/chosen": -182.33128356933594, "logps/rejected": -239.34439086914062, "loss": 0.5601, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.2800660133361816, "rewards/margins": 0.56809401512146, "rewards/rejected": -1.8481600284576416, "step": 2950 }, { "epoch": 0.51, "grad_norm": 19.85110567457654, "learning_rate": 4.6429978391890756e-07, "logits/chosen": -1.5460537672042847, "logits/rejected": -1.498718500137329, "logps/chosen": -180.05801391601562, "logps/rejected": -235.87344360351562, "loss": 0.5485, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.2613499164581299, "rewards/margins": 0.5788403749465942, "rewards/rejected": -1.8401902914047241, "step": 2960 }, { "epoch": 0.51, "grad_norm": 23.495999485202848, "learning_rate": 4.639116533447286e-07, "logits/chosen": -1.4766029119491577, "logits/rejected": -1.4375579357147217, "logps/chosen": -196.3876190185547, "logps/rejected": -245.5087432861328, "loss": 0.582, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.4116392135620117, "rewards/margins": 0.519368588924408, "rewards/rejected": -1.931007742881775, "step": 2970 }, { "epoch": 0.51, "grad_norm": 22.165196277613042, "learning_rate": 4.635215883509976e-07, "logits/chosen": -1.5197012424468994, "logits/rejected": -1.47576105594635, "logps/chosen": -182.31118774414062, "logps/rejected": -240.2295379638672, "loss": 0.5411, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.2836403846740723, "rewards/margins": 0.5962404012680054, "rewards/rejected": -1.8798809051513672, "step": 2980 }, { "epoch": 0.52, "grad_norm": 18.23553989348222, "learning_rate": 4.6312959246510234e-07, "logits/chosen": -1.6128461360931396, "logits/rejected": -1.5682920217514038, "logps/chosen": -174.17776489257812, "logps/rejected": -225.26742553710938, "loss": 0.5519, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.1757910251617432, "rewards/margins": 0.5307731032371521, "rewards/rejected": -1.70656418800354, "step": 2990 }, { "epoch": 0.52, "grad_norm": 13.778675539788516, "learning_rate": 4.627356692318919e-07, "logits/chosen": -1.6555538177490234, "logits/rejected": -1.6289546489715576, "logps/chosen": -151.74960327148438, "logps/rejected": -207.7194061279297, "loss": 0.5452, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.9974144697189331, "rewards/margins": 0.5410576462745667, "rewards/rejected": -1.5384724140167236, "step": 3000 }, { "epoch": 0.52, "eval_logits/chosen": -1.7558156251907349, "eval_logits/rejected": -1.7363479137420654, "eval_logps/chosen": -150.7609405517578, "eval_logps/rejected": -178.5692138671875, "eval_loss": 0.6288471221923828, "eval_rewards/accuracies": 0.6463754773139954, "eval_rewards/chosen": -0.9205708503723145, "eval_rewards/margins": 0.23354758322238922, "eval_rewards/rejected": -1.1541184186935425, "eval_runtime": 356.7483, "eval_samples_per_second": 12.065, "eval_steps_per_second": 1.508, "step": 3000 }, { "epoch": 0.52, "grad_norm": 16.90078566654223, "learning_rate": 4.623398222136443e-07, "logits/chosen": -1.6691395044326782, "logits/rejected": -1.6284288167953491, "logps/chosen": -161.9709930419922, "logps/rejected": -210.35195922851562, "loss": 0.5663, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.0677728652954102, "rewards/margins": 0.5049013495445251, "rewards/rejected": -1.57267427444458, "step": 3010 }, { "epoch": 0.52, "grad_norm": 21.480798300954746, "learning_rate": 4.6194205499003467e-07, "logits/chosen": -1.7338823080062866, "logits/rejected": -1.6823310852050781, "logps/chosen": -166.0416259765625, "logps/rejected": -225.45706176757812, "loss": 0.5425, "rewards/accuracies": 0.71875, "rewards/chosen": -1.1181974411010742, "rewards/margins": 0.6333866119384766, "rewards/rejected": -1.7515838146209717, "step": 3020 }, { "epoch": 0.52, "grad_norm": 17.722022537225527, "learning_rate": 4.615423711581027e-07, "logits/chosen": -1.6567986011505127, "logits/rejected": -1.6222938299179077, "logps/chosen": -170.3144989013672, "logps/rejected": -208.64501953125, "loss": 0.6058, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.1464793682098389, "rewards/margins": 0.3912624716758728, "rewards/rejected": -1.5377418994903564, "step": 3030 }, { "epoch": 0.52, "grad_norm": 15.214602160313902, "learning_rate": 4.6114077433221994e-07, "logits/chosen": -1.7444251775741577, "logits/rejected": -1.715855598449707, "logps/chosen": -158.28477478027344, "logps/rejected": -213.8212432861328, "loss": 0.5529, "rewards/accuracies": 0.71875, "rewards/chosen": -1.0712274312973022, "rewards/margins": 0.5239061713218689, "rewards/rejected": -1.5951335430145264, "step": 3040 }, { "epoch": 0.53, "grad_norm": 22.969445403757415, "learning_rate": 4.6073726814405746e-07, "logits/chosen": -1.6354888677597046, "logits/rejected": -1.6061290502548218, "logps/chosen": -153.43170166015625, "logps/rejected": -197.95614624023438, "loss": 0.5946, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.9988822937011719, "rewards/margins": 0.4272375702857971, "rewards/rejected": -1.4261198043823242, "step": 3050 }, { "epoch": 0.53, "grad_norm": 28.86788571866789, "learning_rate": 4.6033185624255276e-07, "logits/chosen": -1.6350476741790771, "logits/rejected": -1.6002562046051025, "logps/chosen": -153.01419067382812, "logps/rejected": -200.2187042236328, "loss": 0.5541, "rewards/accuracies": 0.71875, "rewards/chosen": -0.9734545946121216, "rewards/margins": 0.48509782552719116, "rewards/rejected": -1.458552360534668, "step": 3060 }, { "epoch": 0.53, "grad_norm": 21.065042836310123, "learning_rate": 4.5992454229387693e-07, "logits/chosen": -1.5526440143585205, "logits/rejected": -1.5073213577270508, "logps/chosen": -173.8745574951172, "logps/rejected": -220.6032257080078, "loss": 0.5811, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.1593778133392334, "rewards/margins": 0.5076309442520142, "rewards/rejected": -1.6670089960098267, "step": 3070 }, { "epoch": 0.53, "grad_norm": 19.310546719568105, "learning_rate": 4.5951532998140136e-07, "logits/chosen": -1.4362452030181885, "logits/rejected": -1.399596929550171, "logps/chosen": -182.4234161376953, "logps/rejected": -246.11245727539062, "loss": 0.5594, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3157217502593994, "rewards/margins": 0.5977397561073303, "rewards/rejected": -1.913461685180664, "step": 3080 }, { "epoch": 0.53, "grad_norm": 20.275727141362044, "learning_rate": 4.591042230056644e-07, "logits/chosen": -1.5431610345840454, "logits/rejected": -1.5028966665267944, "logps/chosen": -162.43585205078125, "logps/rejected": -222.4371337890625, "loss": 0.5203, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.0885474681854248, "rewards/margins": 0.5989712476730347, "rewards/rejected": -1.6875184774398804, "step": 3090 }, { "epoch": 0.53, "grad_norm": 22.85769406936058, "learning_rate": 4.586912250843383e-07, "logits/chosen": -1.49831223487854, "logits/rejected": -1.4446604251861572, "logps/chosen": -172.95877075195312, "logps/rejected": -234.62548828125, "loss": 0.5525, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.1904902458190918, "rewards/margins": 0.6400495767593384, "rewards/rejected": -1.8305397033691406, "step": 3100 }, { "epoch": 0.53, "eval_logits/chosen": -1.6323180198669434, "eval_logits/rejected": -1.6100775003433228, "eval_logps/chosen": -161.87399291992188, "eval_logps/rejected": -193.96153259277344, "eval_loss": 0.623075008392334, "eval_rewards/accuracies": 0.6563661694526672, "eval_rewards/chosen": -1.0317014455795288, "eval_rewards/margins": 0.2763398587703705, "eval_rewards/rejected": -1.3080412149429321, "eval_runtime": 356.7275, "eval_samples_per_second": 12.065, "eval_steps_per_second": 1.508, "step": 3100 }, { "epoch": 0.54, "grad_norm": 25.719416790446477, "learning_rate": 4.5827633995219485e-07, "logits/chosen": -1.4610720872879028, "logits/rejected": -1.4539538621902466, "logps/chosen": -183.6023406982422, "logps/rejected": -237.76895141601562, "loss": 0.5884, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.3302621841430664, "rewards/margins": 0.498201847076416, "rewards/rejected": -1.8284639120101929, "step": 3110 }, { "epoch": 0.54, "grad_norm": 14.309987948640194, "learning_rate": 4.5785957136107234e-07, "logits/chosen": -1.544480323791504, "logits/rejected": -1.5000821352005005, "logps/chosen": -165.1011505126953, "logps/rejected": -232.72195434570312, "loss": 0.5266, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.0910911560058594, "rewards/margins": 0.678336501121521, "rewards/rejected": -1.7694276571273804, "step": 3120 }, { "epoch": 0.54, "grad_norm": 18.893819003746106, "learning_rate": 4.574409230798413e-07, "logits/chosen": -1.4636805057525635, "logits/rejected": -1.4383834600448608, "logps/chosen": -155.36705017089844, "logps/rejected": -204.67672729492188, "loss": 0.5856, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.032982587814331, "rewards/margins": 0.48075515031814575, "rewards/rejected": -1.513737678527832, "step": 3130 }, { "epoch": 0.54, "grad_norm": 19.750368207437877, "learning_rate": 4.5702039889437014e-07, "logits/chosen": -1.5176935195922852, "logits/rejected": -1.4778989553451538, "logps/chosen": -171.33694458007812, "logps/rejected": -242.3556365966797, "loss": 0.5396, "rewards/accuracies": 0.71875, "rewards/chosen": -1.1966036558151245, "rewards/margins": 0.689578115940094, "rewards/rejected": -1.8861818313598633, "step": 3140 }, { "epoch": 0.54, "grad_norm": 15.672784519353224, "learning_rate": 4.565980026074917e-07, "logits/chosen": -1.4829928874969482, "logits/rejected": -1.4322001934051514, "logps/chosen": -172.95135498046875, "logps/rejected": -237.19912719726562, "loss": 0.5323, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.1858676671981812, "rewards/margins": 0.6504721641540527, "rewards/rejected": -1.8363399505615234, "step": 3150 }, { "epoch": 0.54, "grad_norm": 18.037452777993565, "learning_rate": 4.5617373803896796e-07, "logits/chosen": -1.3555725812911987, "logits/rejected": -1.3147733211517334, "logps/chosen": -188.40936279296875, "logps/rejected": -247.400634765625, "loss": 0.559, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3425967693328857, "rewards/margins": 0.605254054069519, "rewards/rejected": -1.9478508234024048, "step": 3160 }, { "epoch": 0.55, "grad_norm": 15.619953452388868, "learning_rate": 4.5574760902545625e-07, "logits/chosen": -1.4381481409072876, "logits/rejected": -1.391213059425354, "logps/chosen": -183.0797882080078, "logps/rejected": -240.10757446289062, "loss": 0.5234, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.272312045097351, "rewards/margins": 0.6338873505592346, "rewards/rejected": -1.9061992168426514, "step": 3170 }, { "epoch": 0.55, "grad_norm": 24.556199495358904, "learning_rate": 4.5531961942047385e-07, "logits/chosen": -1.521206021308899, "logits/rejected": -1.4590338468551636, "logps/chosen": -183.3689727783203, "logps/rejected": -249.7210693359375, "loss": 0.5345, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.3013378381729126, "rewards/margins": 0.681814968585968, "rewards/rejected": -1.9831526279449463, "step": 3180 }, { "epoch": 0.55, "grad_norm": 18.24766746095848, "learning_rate": 4.548897730943638e-07, "logits/chosen": -1.5017660856246948, "logits/rejected": -1.4614744186401367, "logps/chosen": -174.06478881835938, "logps/rejected": -262.4342041015625, "loss": 0.481, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.2178510427474976, "rewards/margins": 0.839970588684082, "rewards/rejected": -2.057821750640869, "step": 3190 }, { "epoch": 0.55, "grad_norm": 19.455426316676963, "learning_rate": 4.544580739342596e-07, "logits/chosen": -1.406374216079712, "logits/rejected": -1.3839681148529053, "logps/chosen": -190.57859802246094, "logps/rejected": -231.16159057617188, "loss": 0.6097, "rewards/accuracies": 0.65625, "rewards/chosen": -1.3527233600616455, "rewards/margins": 0.4378505349159241, "rewards/rejected": -1.7905738353729248, "step": 3200 }, { "epoch": 0.55, "eval_logits/chosen": -1.6120717525482178, "eval_logits/rejected": -1.5902533531188965, "eval_logps/chosen": -167.82127380371094, "eval_logps/rejected": -200.23843383789062, "eval_loss": 0.6200531721115112, "eval_rewards/accuracies": 0.6554368138313293, "eval_rewards/chosen": -1.0911740064620972, "eval_rewards/margins": 0.2796363830566406, "eval_rewards/rejected": -1.3708105087280273, "eval_runtime": 356.6653, "eval_samples_per_second": 12.067, "eval_steps_per_second": 1.508, "step": 3200 }, { "epoch": 0.55, "grad_norm": 19.796269236135114, "learning_rate": 4.5402452584404995e-07, "logits/chosen": -1.411024808883667, "logits/rejected": -1.3637266159057617, "logps/chosen": -168.30789184570312, "logps/rejected": -231.79910278320312, "loss": 0.5188, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.1436474323272705, "rewards/margins": 0.6496217250823975, "rewards/rejected": -1.793269157409668, "step": 3210 }, { "epoch": 0.55, "grad_norm": 15.230980934048251, "learning_rate": 4.535891327443435e-07, "logits/chosen": -1.4088395833969116, "logits/rejected": -1.3787903785705566, "logps/chosen": -172.39651489257812, "logps/rejected": -239.85952758789062, "loss": 0.5494, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.2206478118896484, "rewards/margins": 0.6492413282394409, "rewards/rejected": -1.8698889017105103, "step": 3220 }, { "epoch": 0.56, "grad_norm": 30.32211749720397, "learning_rate": 4.5315189857243377e-07, "logits/chosen": -1.4493725299835205, "logits/rejected": -1.413207769393921, "logps/chosen": -175.58511352539062, "logps/rejected": -230.5104522705078, "loss": 0.5586, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.214536428451538, "rewards/margins": 0.5267941355705261, "rewards/rejected": -1.7413305044174194, "step": 3230 }, { "epoch": 0.56, "grad_norm": 16.643885483015545, "learning_rate": 4.527128272822629e-07, "logits/chosen": -1.621273398399353, "logits/rejected": -1.58579421043396, "logps/chosen": -170.71180725097656, "logps/rejected": -214.2522735595703, "loss": 0.6022, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.1446621417999268, "rewards/margins": 0.47269731760025024, "rewards/rejected": -1.6173597574234009, "step": 3240 }, { "epoch": 0.56, "grad_norm": 21.109399435647408, "learning_rate": 4.522719228443864e-07, "logits/chosen": -1.5881023406982422, "logits/rejected": -1.5520793199539185, "logps/chosen": -142.5547332763672, "logps/rejected": -191.86538696289062, "loss": 0.5607, "rewards/accuracies": 0.71875, "rewards/chosen": -0.8997921943664551, "rewards/margins": 0.4858148992061615, "rewards/rejected": -1.3856070041656494, "step": 3250 }, { "epoch": 0.56, "grad_norm": 22.057929518873816, "learning_rate": 4.5182918924593703e-07, "logits/chosen": -1.607410192489624, "logits/rejected": -1.5681886672973633, "logps/chosen": -147.30438232421875, "logps/rejected": -203.17489624023438, "loss": 0.5476, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.9579440355300903, "rewards/margins": 0.5465749502182007, "rewards/rejected": -1.504518747329712, "step": 3260 }, { "epoch": 0.56, "grad_norm": 18.19721714439992, "learning_rate": 4.5138463049058885e-07, "logits/chosen": -1.6494948863983154, "logits/rejected": -1.625372290611267, "logps/chosen": -166.00369262695312, "logps/rejected": -213.57943725585938, "loss": 0.5827, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.1090967655181885, "rewards/margins": 0.4481208920478821, "rewards/rejected": -1.5572177171707153, "step": 3270 }, { "epoch": 0.57, "grad_norm": 20.95007962777713, "learning_rate": 4.50938250598521e-07, "logits/chosen": -1.6200672388076782, "logits/rejected": -1.5890979766845703, "logps/chosen": -150.30422973632812, "logps/rejected": -204.21400451660156, "loss": 0.5402, "rewards/accuracies": 0.75, "rewards/chosen": -1.0092462301254272, "rewards/margins": 0.509067714214325, "rewards/rejected": -1.5183137655258179, "step": 3280 }, { "epoch": 0.57, "grad_norm": 18.41298820642052, "learning_rate": 4.5049005360638103e-07, "logits/chosen": -1.5941425561904907, "logits/rejected": -1.538638710975647, "logps/chosen": -169.52369689941406, "logps/rejected": -231.6940460205078, "loss": 0.5708, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.1636857986450195, "rewards/margins": 0.6202095150947571, "rewards/rejected": -1.7838952541351318, "step": 3290 }, { "epoch": 0.57, "grad_norm": 22.05304883452244, "learning_rate": 4.5004004356724893e-07, "logits/chosen": -1.455288290977478, "logits/rejected": -1.414819598197937, "logps/chosen": -181.70596313476562, "logps/rejected": -236.2588653564453, "loss": 0.5807, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.2678734064102173, "rewards/margins": 0.5589209794998169, "rewards/rejected": -1.8267943859100342, "step": 3300 }, { "epoch": 0.57, "eval_logits/chosen": -1.5502673387527466, "eval_logits/rejected": -1.5291829109191895, "eval_logps/chosen": -168.8760528564453, "eval_logps/rejected": -199.7250213623047, "eval_loss": 0.6238878965377808, "eval_rewards/accuracies": 0.6505576372146606, "eval_rewards/chosen": -1.1017221212387085, "eval_rewards/margins": 0.26395440101623535, "eval_rewards/rejected": -1.3656764030456543, "eval_runtime": 357.2068, "eval_samples_per_second": 12.049, "eval_steps_per_second": 1.506, "step": 3300 }, { "epoch": 0.57, "grad_norm": 18.786920967242654, "learning_rate": 4.4958822455060017e-07, "logits/chosen": -1.3820545673370361, "logits/rejected": -1.3281322717666626, "logps/chosen": -169.11795043945312, "logps/rejected": -231.81167602539062, "loss": 0.5407, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.1603150367736816, "rewards/margins": 0.6508747339248657, "rewards/rejected": -1.8111896514892578, "step": 3310 }, { "epoch": 0.57, "grad_norm": 21.45405748820459, "learning_rate": 4.4913460064226894e-07, "logits/chosen": -1.44109308719635, "logits/rejected": -1.3888362646102905, "logps/chosen": -179.4849395751953, "logps/rejected": -232.6226043701172, "loss": 0.562, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.257912278175354, "rewards/margins": 0.5788989067077637, "rewards/rejected": -1.8368113040924072, "step": 3320 }, { "epoch": 0.57, "grad_norm": 16.508118858701444, "learning_rate": 4.486791759444111e-07, "logits/chosen": -1.5882141590118408, "logits/rejected": -1.5403480529785156, "logps/chosen": -164.25289916992188, "logps/rejected": -231.80007934570312, "loss": 0.5175, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.1004276275634766, "rewards/margins": 0.680879533290863, "rewards/rejected": -1.7813072204589844, "step": 3330 }, { "epoch": 0.58, "grad_norm": 33.55292099789505, "learning_rate": 4.4822195457546716e-07, "logits/chosen": -1.5143282413482666, "logits/rejected": -1.4674466848373413, "logps/chosen": -189.68515014648438, "logps/rejected": -263.664794921875, "loss": 0.5363, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.3378245830535889, "rewards/margins": 0.75914067029953, "rewards/rejected": -2.0969653129577637, "step": 3340 }, { "epoch": 0.58, "grad_norm": 16.44122753789477, "learning_rate": 4.477629406701254e-07, "logits/chosen": -1.427293300628662, "logits/rejected": -1.3901170492172241, "logps/chosen": -177.4530792236328, "logps/rejected": -247.17105102539062, "loss": 0.53, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.216886281967163, "rewards/margins": 0.6818975806236267, "rewards/rejected": -1.8987839221954346, "step": 3350 }, { "epoch": 0.58, "grad_norm": 14.401579786445982, "learning_rate": 4.473021383792838e-07, "logits/chosen": -1.5537811517715454, "logits/rejected": -1.5063152313232422, "logps/chosen": -168.55690002441406, "logps/rejected": -219.3855743408203, "loss": 0.5712, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.1374642848968506, "rewards/margins": 0.5342391133308411, "rewards/rejected": -1.6717033386230469, "step": 3360 }, { "epoch": 0.58, "grad_norm": 19.904447545777934, "learning_rate": 4.4683955187001285e-07, "logits/chosen": -1.5263116359710693, "logits/rejected": -1.4976154565811157, "logps/chosen": -163.7618865966797, "logps/rejected": -228.2670440673828, "loss": 0.5506, "rewards/accuracies": 0.71875, "rewards/chosen": -1.1279897689819336, "rewards/margins": 0.6148195862770081, "rewards/rejected": -1.7428092956542969, "step": 3370 }, { "epoch": 0.58, "grad_norm": 23.89725054685839, "learning_rate": 4.463751853255182e-07, "logits/chosen": -1.6531779766082764, "logits/rejected": -1.6117451190948486, "logps/chosen": -161.826171875, "logps/rejected": -209.54940795898438, "loss": 0.5573, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.0634934902191162, "rewards/margins": 0.5306968688964844, "rewards/rejected": -1.5941904783248901, "step": 3380 }, { "epoch": 0.58, "grad_norm": 16.812554502274565, "learning_rate": 4.45909042945102e-07, "logits/chosen": -1.5942082405090332, "logits/rejected": -1.553095817565918, "logps/chosen": -158.5015869140625, "logps/rejected": -206.106201171875, "loss": 0.5908, "rewards/accuracies": 0.65625, "rewards/chosen": -1.0478193759918213, "rewards/margins": 0.4990636706352234, "rewards/rejected": -1.5468828678131104, "step": 3390 }, { "epoch": 0.59, "grad_norm": 16.25733459636065, "learning_rate": 4.454411289441259e-07, "logits/chosen": -1.650813102722168, "logits/rejected": -1.589519739151001, "logps/chosen": -148.92491149902344, "logps/rejected": -204.60989379882812, "loss": 0.536, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.9608653783798218, "rewards/margins": 0.5713566541671753, "rewards/rejected": -1.532222032546997, "step": 3400 }, { "epoch": 0.59, "eval_logits/chosen": -1.723968505859375, "eval_logits/rejected": -1.705629825592041, "eval_logps/chosen": -141.45721435546875, "eval_logps/rejected": -167.2509307861328, "eval_loss": 0.63118976354599, "eval_rewards/accuracies": 0.6466078162193298, "eval_rewards/chosen": -0.8275338411331177, "eval_rewards/margins": 0.21340180933475494, "eval_rewards/rejected": -1.0409355163574219, "eval_runtime": 357.3396, "eval_samples_per_second": 12.045, "eval_steps_per_second": 1.506, "step": 3400 }, { "epoch": 0.59, "grad_norm": 17.44200872783744, "learning_rate": 4.4497144755397215e-07, "logits/chosen": -1.5299510955810547, "logits/rejected": -1.4821765422821045, "logps/chosen": -140.80323791503906, "logps/rejected": -188.71884155273438, "loss": 0.5417, "rewards/accuracies": 0.75, "rewards/chosen": -0.9123737215995789, "rewards/margins": 0.5039972066879272, "rewards/rejected": -1.4163707494735718, "step": 3410 }, { "epoch": 0.59, "grad_norm": 20.94711202688562, "learning_rate": 4.4450000302200576e-07, "logits/chosen": -1.5101244449615479, "logits/rejected": -1.4615298509597778, "logps/chosen": -156.17779541015625, "logps/rejected": -221.5508270263672, "loss": 0.52, "rewards/accuracies": 0.71875, "rewards/chosen": -1.0341514348983765, "rewards/margins": 0.6485717296600342, "rewards/rejected": -1.6827232837677002, "step": 3420 }, { "epoch": 0.59, "grad_norm": 17.697196074003802, "learning_rate": 4.440267996115359e-07, "logits/chosen": -1.5161569118499756, "logits/rejected": -1.4690425395965576, "logps/chosen": -188.15164184570312, "logps/rejected": -251.8218231201172, "loss": 0.5671, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.3682286739349365, "rewards/margins": 0.6038091778755188, "rewards/rejected": -1.9720379114151, "step": 3430 }, { "epoch": 0.59, "grad_norm": 23.878652372078566, "learning_rate": 4.435518416017774e-07, "logits/chosen": -1.4505062103271484, "logits/rejected": -1.4057317972183228, "logps/chosen": -191.19534301757812, "logps/rejected": -253.8204345703125, "loss": 0.5548, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.386473298072815, "rewards/margins": 0.6239285469055176, "rewards/rejected": -2.010401964187622, "step": 3440 }, { "epoch": 0.59, "grad_norm": 24.14251784705596, "learning_rate": 4.430751332878122e-07, "logits/chosen": -1.6515562534332275, "logits/rejected": -1.5952726602554321, "logps/chosen": -197.3381805419922, "logps/rejected": -256.8216857910156, "loss": 0.5492, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.3796002864837646, "rewards/margins": 0.6393512487411499, "rewards/rejected": -2.018951654434204, "step": 3450 }, { "epoch": 0.6, "grad_norm": 24.13758333534331, "learning_rate": 4.425966789805503e-07, "logits/chosen": -1.499289631843567, "logits/rejected": -1.4667627811431885, "logps/chosen": -164.79124450683594, "logps/rejected": -216.9873504638672, "loss": 0.5621, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.1262753009796143, "rewards/margins": 0.5078203678131104, "rewards/rejected": -1.6340957880020142, "step": 3460 }, { "epoch": 0.6, "grad_norm": 18.681968022257387, "learning_rate": 4.4211648300669076e-07, "logits/chosen": -1.597586989402771, "logits/rejected": -1.5641849040985107, "logps/chosen": -169.10386657714844, "logps/rejected": -226.5546112060547, "loss": 0.547, "rewards/accuracies": 0.71875, "rewards/chosen": -1.133301854133606, "rewards/margins": 0.5889450311660767, "rewards/rejected": -1.7222468852996826, "step": 3470 }, { "epoch": 0.6, "grad_norm": 29.803622122615664, "learning_rate": 4.4163454970868277e-07, "logits/chosen": -1.5102007389068604, "logits/rejected": -1.4543273448944092, "logps/chosen": -181.89114379882812, "logps/rejected": -245.5972900390625, "loss": 0.5329, "rewards/accuracies": 0.71875, "rewards/chosen": -1.2680021524429321, "rewards/margins": 0.6691805124282837, "rewards/rejected": -1.9371826648712158, "step": 3480 }, { "epoch": 0.6, "grad_norm": 28.83453327688963, "learning_rate": 4.411508834446863e-07, "logits/chosen": -1.5323913097381592, "logits/rejected": -1.4874933958053589, "logps/chosen": -182.50387573242188, "logps/rejected": -241.5607452392578, "loss": 0.5529, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.282502293586731, "rewards/margins": 0.587981104850769, "rewards/rejected": -1.8704833984375, "step": 3490 }, { "epoch": 0.6, "grad_norm": 16.032941250777014, "learning_rate": 4.406654885885326e-07, "logits/chosen": -1.4855334758758545, "logits/rejected": -1.4571826457977295, "logps/chosen": -178.00135803222656, "logps/rejected": -237.3003387451172, "loss": 0.5392, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.259861707687378, "rewards/margins": 0.5666104555130005, "rewards/rejected": -1.8264720439910889, "step": 3500 }, { "epoch": 0.6, "eval_logits/chosen": -1.6595714092254639, "eval_logits/rejected": -1.6385571956634521, "eval_logps/chosen": -161.32484436035156, "eval_logps/rejected": -191.19439697265625, "eval_loss": 0.6286602020263672, "eval_rewards/accuracies": 0.6466078162193298, "eval_rewards/chosen": -1.0262099504470825, "eval_rewards/margins": 0.2541602849960327, "eval_rewards/rejected": -1.2803701162338257, "eval_runtime": 357.0552, "eval_samples_per_second": 12.054, "eval_steps_per_second": 1.507, "step": 3500 }, { "epoch": 0.6, "grad_norm": 16.07998691134206, "learning_rate": 4.4017836952968467e-07, "logits/chosen": -1.4526565074920654, "logits/rejected": -1.4062235355377197, "logps/chosen": -173.96885681152344, "logps/rejected": -226.445068359375, "loss": 0.5686, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.2041919231414795, "rewards/margins": 0.5490579009056091, "rewards/rejected": -1.7532498836517334, "step": 3510 }, { "epoch": 0.61, "grad_norm": 19.834365533950795, "learning_rate": 4.396895306731977e-07, "logits/chosen": -1.5146148204803467, "logits/rejected": -1.4718494415283203, "logps/chosen": -160.6667938232422, "logps/rejected": -208.8634490966797, "loss": 0.5754, "rewards/accuracies": 0.6875, "rewards/chosen": -1.053438425064087, "rewards/margins": 0.5146963000297546, "rewards/rejected": -1.5681347846984863, "step": 3520 }, { "epoch": 0.61, "grad_norm": 23.90518563871216, "learning_rate": 4.391989764396792e-07, "logits/chosen": -1.6393533945083618, "logits/rejected": -1.577980637550354, "logps/chosen": -166.45315551757812, "logps/rejected": -219.417724609375, "loss": 0.5593, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.1050881147384644, "rewards/margins": 0.5800349116325378, "rewards/rejected": -1.6851232051849365, "step": 3530 }, { "epoch": 0.61, "grad_norm": 20.26503213849771, "learning_rate": 4.387067112652487e-07, "logits/chosen": -1.5266510248184204, "logits/rejected": -1.4859250783920288, "logps/chosen": -157.66932678222656, "logps/rejected": -214.6486053466797, "loss": 0.5651, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.0376538038253784, "rewards/margins": 0.5733412504196167, "rewards/rejected": -1.6109952926635742, "step": 3540 }, { "epoch": 0.61, "grad_norm": 18.788930669523673, "learning_rate": 4.382127396014982e-07, "logits/chosen": -1.6274988651275635, "logits/rejected": -1.6048628091812134, "logps/chosen": -166.0504608154297, "logps/rejected": -205.7928466796875, "loss": 0.609, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.1151145696640015, "rewards/margins": 0.41729575395584106, "rewards/rejected": -1.5324103832244873, "step": 3550 }, { "epoch": 0.61, "grad_norm": 18.6411865873627, "learning_rate": 4.377170659154514e-07, "logits/chosen": -1.5456907749176025, "logits/rejected": -1.506981611251831, "logps/chosen": -159.85513305664062, "logps/rejected": -213.0148468017578, "loss": 0.5651, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.069687843322754, "rewards/margins": 0.535394549369812, "rewards/rejected": -1.6050825119018555, "step": 3560 }, { "epoch": 0.62, "grad_norm": 26.755951307532122, "learning_rate": 4.372196946895238e-07, "logits/chosen": -1.6680046319961548, "logits/rejected": -1.6189712285995483, "logps/chosen": -177.2747344970703, "logps/rejected": -216.9444580078125, "loss": 0.609, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.2041199207305908, "rewards/margins": 0.43309324979782104, "rewards/rejected": -1.637213110923767, "step": 3570 }, { "epoch": 0.62, "grad_norm": 16.327880126029793, "learning_rate": 4.367206304214815e-07, "logits/chosen": -1.6215425729751587, "logits/rejected": -1.5847231149673462, "logps/chosen": -168.4742431640625, "logps/rejected": -224.7816162109375, "loss": 0.5332, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.1029232740402222, "rewards/margins": 0.570185661315918, "rewards/rejected": -1.6731090545654297, "step": 3580 }, { "epoch": 0.62, "grad_norm": 17.82230607590664, "learning_rate": 4.3621987762440114e-07, "logits/chosen": -1.582554578781128, "logits/rejected": -1.5418357849121094, "logps/chosen": -181.93031311035156, "logps/rejected": -246.87838745117188, "loss": 0.5296, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.27500319480896, "rewards/margins": 0.654120922088623, "rewards/rejected": -1.929124116897583, "step": 3590 }, { "epoch": 0.62, "grad_norm": 27.151158453154128, "learning_rate": 4.357174408266289e-07, "logits/chosen": -1.5266609191894531, "logits/rejected": -1.4800150394439697, "logps/chosen": -184.74020385742188, "logps/rejected": -237.02490234375, "loss": 0.5689, "rewards/accuracies": 0.71875, "rewards/chosen": -1.3012017011642456, "rewards/margins": 0.5533289909362793, "rewards/rejected": -1.854530692100525, "step": 3600 }, { "epoch": 0.62, "eval_logits/chosen": -1.6493839025497437, "eval_logits/rejected": -1.6286251544952393, "eval_logps/chosen": -170.8087158203125, "eval_logps/rejected": -201.0063018798828, "eval_loss": 0.627535879611969, "eval_rewards/accuracies": 0.6486988663673401, "eval_rewards/chosen": -1.1210483312606812, "eval_rewards/margins": 0.2574405074119568, "eval_rewards/rejected": -1.3784890174865723, "eval_runtime": 356.9383, "eval_samples_per_second": 12.058, "eval_steps_per_second": 1.507, "step": 3600 }, { "epoch": 0.62, "grad_norm": 16.53409464026598, "learning_rate": 4.3521332457173933e-07, "logits/chosen": -1.4792962074279785, "logits/rejected": -1.4342132806777954, "logps/chosen": -188.6980438232422, "logps/rejected": -251.24951171875, "loss": 0.5333, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.3721535205841064, "rewards/margins": 0.6460615396499634, "rewards/rejected": -2.018214702606201, "step": 3610 }, { "epoch": 0.62, "grad_norm": 24.356843194385412, "learning_rate": 4.347075334184946e-07, "logits/chosen": -1.389676809310913, "logits/rejected": -1.3410922288894653, "logps/chosen": -182.9585418701172, "logps/rejected": -253.10958862304688, "loss": 0.5018, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.2746689319610596, "rewards/margins": 0.7207802534103394, "rewards/rejected": -1.9954490661621094, "step": 3620 }, { "epoch": 0.63, "grad_norm": 28.04057531766567, "learning_rate": 4.34200071940803e-07, "logits/chosen": -1.4672437906265259, "logits/rejected": -1.4280986785888672, "logps/chosen": -215.80764770507812, "logps/rejected": -300.14385986328125, "loss": 0.5108, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.6468420028686523, "rewards/margins": 0.7980831861495972, "rewards/rejected": -2.44492506980896, "step": 3630 }, { "epoch": 0.63, "grad_norm": 31.68658499963128, "learning_rate": 4.3369094472767785e-07, "logits/chosen": -1.3977959156036377, "logits/rejected": -1.3606897592544556, "logps/chosen": -217.1880645751953, "logps/rejected": -290.8738708496094, "loss": 0.5379, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.6216189861297607, "rewards/margins": 0.7258588671684265, "rewards/rejected": -2.347477674484253, "step": 3640 }, { "epoch": 0.63, "grad_norm": 22.54081035634695, "learning_rate": 4.331801563831956e-07, "logits/chosen": -1.3711670637130737, "logits/rejected": -1.350187063217163, "logps/chosen": -206.61978149414062, "logps/rejected": -269.8750915527344, "loss": 0.544, "rewards/accuracies": 0.71875, "rewards/chosen": -1.555546522140503, "rewards/margins": 0.5982221961021423, "rewards/rejected": -2.15376877784729, "step": 3650 }, { "epoch": 0.63, "grad_norm": 22.175111801909775, "learning_rate": 4.326677115264547e-07, "logits/chosen": -1.3863328695297241, "logits/rejected": -1.3272384405136108, "logps/chosen": -210.5821075439453, "logps/rejected": -292.0908203125, "loss": 0.5043, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.5674175024032593, "rewards/margins": 0.8180697560310364, "rewards/rejected": -2.3854870796203613, "step": 3660 }, { "epoch": 0.63, "grad_norm": 18.961212344873417, "learning_rate": 4.321536147915334e-07, "logits/chosen": -1.3621985912322998, "logits/rejected": -1.3097569942474365, "logps/chosen": -205.28738403320312, "logps/rejected": -273.30584716796875, "loss": 0.5708, "rewards/accuracies": 0.71875, "rewards/chosen": -1.5246150493621826, "rewards/margins": 0.6830393671989441, "rewards/rejected": -2.2076547145843506, "step": 3670 }, { "epoch": 0.63, "grad_norm": 17.75818169123661, "learning_rate": 4.316378708274481e-07, "logits/chosen": -1.4744240045547485, "logits/rejected": -1.422086477279663, "logps/chosen": -186.6185302734375, "logps/rejected": -245.1935577392578, "loss": 0.5536, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.3194071054458618, "rewards/margins": 0.6318386197090149, "rewards/rejected": -1.951245665550232, "step": 3680 }, { "epoch": 0.64, "grad_norm": 25.3589027579314, "learning_rate": 4.31120484298111e-07, "logits/chosen": -1.4429172277450562, "logits/rejected": -1.4147446155548096, "logps/chosen": -174.2320098876953, "logps/rejected": -255.7716064453125, "loss": 0.5238, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.2215301990509033, "rewards/margins": 0.7453585863113403, "rewards/rejected": -1.9668890237808228, "step": 3690 }, { "epoch": 0.64, "grad_norm": 17.358115744138235, "learning_rate": 4.306014598822886e-07, "logits/chosen": -1.4474033117294312, "logits/rejected": -1.394345998764038, "logps/chosen": -179.29293823242188, "logps/rejected": -256.70098876953125, "loss": 0.517, "rewards/accuracies": 0.75, "rewards/chosen": -1.2170681953430176, "rewards/margins": 0.7563605904579163, "rewards/rejected": -1.9734289646148682, "step": 3700 }, { "epoch": 0.64, "eval_logits/chosen": -1.523759365081787, "eval_logits/rejected": -1.4999202489852905, "eval_logps/chosen": -181.3194580078125, "eval_logps/rejected": -215.5612030029297, "eval_loss": 0.6243796944618225, "eval_rewards/accuracies": 0.6565985083580017, "eval_rewards/chosen": -1.2261559963226318, "eval_rewards/margins": 0.2978822588920593, "eval_rewards/rejected": -1.5240384340286255, "eval_runtime": 357.0346, "eval_samples_per_second": 12.055, "eval_steps_per_second": 1.507, "step": 3700 }, { "epoch": 0.64, "grad_norm": 21.218376082289595, "learning_rate": 4.3008080227355844e-07, "logits/chosen": -1.4100111722946167, "logits/rejected": -1.3635252714157104, "logps/chosen": -195.98471069335938, "logps/rejected": -256.88739013671875, "loss": 0.5513, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.4212100505828857, "rewards/margins": 0.6375278234481812, "rewards/rejected": -2.0587379932403564, "step": 3710 }, { "epoch": 0.64, "grad_norm": 21.355833105859, "learning_rate": 4.295585161802674e-07, "logits/chosen": -1.4289751052856445, "logits/rejected": -1.3828635215759277, "logps/chosen": -182.1897735595703, "logps/rejected": -262.51226806640625, "loss": 0.4968, "rewards/accuracies": 0.75, "rewards/chosen": -1.303205132484436, "rewards/margins": 0.7883247137069702, "rewards/rejected": -2.0915298461914062, "step": 3720 }, { "epoch": 0.64, "grad_norm": 22.252174921593365, "learning_rate": 4.2903460632548893e-07, "logits/chosen": -1.3439371585845947, "logits/rejected": -1.2857837677001953, "logps/chosen": -212.6474151611328, "logps/rejected": -296.428466796875, "loss": 0.4997, "rewards/accuracies": 0.75, "rewards/chosen": -1.581465721130371, "rewards/margins": 0.8757398724555969, "rewards/rejected": -2.4572055339813232, "step": 3730 }, { "epoch": 0.64, "grad_norm": 22.471686436011503, "learning_rate": 4.285090774469802e-07, "logits/chosen": -1.3240846395492554, "logits/rejected": -1.2739444971084595, "logps/chosen": -212.66140747070312, "logps/rejected": -290.9830017089844, "loss": 0.5364, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.592313528060913, "rewards/margins": 0.7535529136657715, "rewards/rejected": -2.3458666801452637, "step": 3740 }, { "epoch": 0.65, "grad_norm": 17.899544534984106, "learning_rate": 4.2798193429713913e-07, "logits/chosen": -1.440411925315857, "logits/rejected": -1.3944687843322754, "logps/chosen": -198.22142028808594, "logps/rejected": -263.0658874511719, "loss": 0.5618, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.44929838180542, "rewards/margins": 0.650551974773407, "rewards/rejected": -2.0998501777648926, "step": 3750 }, { "epoch": 0.65, "grad_norm": 25.746429517526668, "learning_rate": 4.27453181642962e-07, "logits/chosen": -1.4367876052856445, "logits/rejected": -1.4018447399139404, "logps/chosen": -195.23745727539062, "logps/rejected": -260.03192138671875, "loss": 0.5471, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.3940891027450562, "rewards/margins": 0.6460259556770325, "rewards/rejected": -2.0401148796081543, "step": 3760 }, { "epoch": 0.65, "grad_norm": 22.141823531733728, "learning_rate": 4.2692282426599967e-07, "logits/chosen": -1.4208014011383057, "logits/rejected": -1.3772103786468506, "logps/chosen": -181.8473663330078, "logps/rejected": -244.9152374267578, "loss": 0.5249, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.2831165790557861, "rewards/margins": 0.6210489273071289, "rewards/rejected": -1.904165506362915, "step": 3770 }, { "epoch": 0.65, "grad_norm": 25.749235391096974, "learning_rate": 4.2639086696231483e-07, "logits/chosen": -1.3430489301681519, "logits/rejected": -1.2899630069732666, "logps/chosen": -210.1292266845703, "logps/rejected": -266.73431396484375, "loss": 0.552, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.5607130527496338, "rewards/margins": 0.6126449704170227, "rewards/rejected": -2.1733579635620117, "step": 3780 }, { "epoch": 0.65, "grad_norm": 17.91537947782448, "learning_rate": 4.2585731454243834e-07, "logits/chosen": -1.347544550895691, "logits/rejected": -1.2992546558380127, "logps/chosen": -203.87083435058594, "logps/rejected": -270.9369812011719, "loss": 0.5513, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.498838186264038, "rewards/margins": 0.7007363438606262, "rewards/rejected": -2.1995744705200195, "step": 3790 }, { "epoch": 0.65, "grad_norm": 20.642915135629426, "learning_rate": 4.2532217183132566e-07, "logits/chosen": -1.4202806949615479, "logits/rejected": -1.3704365491867065, "logps/chosen": -190.59255981445312, "logps/rejected": -250.588134765625, "loss": 0.5368, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.3290106058120728, "rewards/margins": 0.6480494737625122, "rewards/rejected": -1.977060317993164, "step": 3800 }, { "epoch": 0.65, "eval_logits/chosen": -1.523742437362671, "eval_logits/rejected": -1.5010066032409668, "eval_logps/chosen": -182.3809356689453, "eval_logps/rejected": -216.2484893798828, "eval_loss": 0.6206509470939636, "eval_rewards/accuracies": 0.6579925417900085, "eval_rewards/chosen": -1.2367708683013916, "eval_rewards/margins": 0.29413995146751404, "eval_rewards/rejected": -1.530910849571228, "eval_runtime": 357.0405, "eval_samples_per_second": 12.055, "eval_steps_per_second": 1.507, "step": 3800 }, { "epoch": 0.66, "grad_norm": 31.94932671255532, "learning_rate": 4.2478544366831373e-07, "logits/chosen": -1.4317169189453125, "logits/rejected": -1.3770487308502197, "logps/chosen": -202.6186981201172, "logps/rejected": -254.42404174804688, "loss": 0.5594, "rewards/accuracies": 0.71875, "rewards/chosen": -1.4515069723129272, "rewards/margins": 0.57206791639328, "rewards/rejected": -2.0235750675201416, "step": 3810 }, { "epoch": 0.66, "grad_norm": 26.840179888734244, "learning_rate": 4.242471349070765e-07, "logits/chosen": -1.430687665939331, "logits/rejected": -1.3825973272323608, "logps/chosen": -182.3372039794922, "logps/rejected": -254.831787109375, "loss": 0.5042, "rewards/accuracies": 0.75, "rewards/chosen": -1.2825406789779663, "rewards/margins": 0.7350937128067017, "rewards/rejected": -2.017634630203247, "step": 3820 }, { "epoch": 0.66, "grad_norm": 20.713279754546374, "learning_rate": 4.2370725041558163e-07, "logits/chosen": -1.4622533321380615, "logits/rejected": -1.3965160846710205, "logps/chosen": -194.24440002441406, "logps/rejected": -252.4665985107422, "loss": 0.5156, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.3557844161987305, "rewards/margins": 0.6555383205413818, "rewards/rejected": -2.0113227367401123, "step": 3830 }, { "epoch": 0.66, "grad_norm": 22.738897007983, "learning_rate": 4.2316579507604613e-07, "logits/chosen": -1.3598577976226807, "logits/rejected": -1.3157683610916138, "logps/chosen": -200.95361328125, "logps/rejected": -288.90191650390625, "loss": 0.5188, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.468830943107605, "rewards/margins": 0.8298671841621399, "rewards/rejected": -2.2986984252929688, "step": 3840 }, { "epoch": 0.66, "grad_norm": 26.405390100521963, "learning_rate": 4.2262277378489224e-07, "logits/chosen": -1.427339792251587, "logits/rejected": -1.385075330734253, "logps/chosen": -227.91049194335938, "logps/rejected": -302.1413879394531, "loss": 0.5196, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.706134557723999, "rewards/margins": 0.7777893543243408, "rewards/rejected": -2.483924150466919, "step": 3850 }, { "epoch": 0.67, "grad_norm": 28.541254516094085, "learning_rate": 4.2207819145270346e-07, "logits/chosen": -1.4458119869232178, "logits/rejected": -1.3982911109924316, "logps/chosen": -232.8706512451172, "logps/rejected": -297.7009582519531, "loss": 0.568, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.7871061563491821, "rewards/margins": 0.6608562469482422, "rewards/rejected": -2.4479622840881348, "step": 3860 }, { "epoch": 0.67, "grad_norm": 20.798655132332478, "learning_rate": 4.2153205300417966e-07, "logits/chosen": -1.4056997299194336, "logits/rejected": -1.3534657955169678, "logps/chosen": -214.6895751953125, "logps/rejected": -290.5361633300781, "loss": 0.5187, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.5584853887557983, "rewards/margins": 0.7904712557792664, "rewards/rejected": -2.34895658493042, "step": 3870 }, { "epoch": 0.67, "grad_norm": 25.018107512387246, "learning_rate": 4.209843633780929e-07, "logits/chosen": -1.5281155109405518, "logits/rejected": -1.5098029375076294, "logps/chosen": -187.2782440185547, "logps/rejected": -250.95950317382812, "loss": 0.5438, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.3641104698181152, "rewards/margins": 0.6108844876289368, "rewards/rejected": -1.9749950170516968, "step": 3880 }, { "epoch": 0.67, "grad_norm": 17.201393329622153, "learning_rate": 4.204351275272426e-07, "logits/chosen": -1.5760449171066284, "logits/rejected": -1.5332744121551514, "logps/chosen": -177.4027862548828, "logps/rejected": -236.5082244873047, "loss": 0.5651, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.2320191860198975, "rewards/margins": 0.6117894053459167, "rewards/rejected": -1.8438085317611694, "step": 3890 }, { "epoch": 0.67, "grad_norm": 18.018343000765952, "learning_rate": 4.1988435041841096e-07, "logits/chosen": -1.5944218635559082, "logits/rejected": -1.5262387990951538, "logps/chosen": -170.49252319335938, "logps/rejected": -219.80899047851562, "loss": 0.5382, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.1367781162261963, "rewards/margins": 0.5586223006248474, "rewards/rejected": -1.6954004764556885, "step": 3900 }, { "epoch": 0.67, "eval_logits/chosen": -1.6579508781433105, "eval_logits/rejected": -1.6361998319625854, "eval_logps/chosen": -160.2046661376953, "eval_logps/rejected": -190.85934448242188, "eval_loss": 0.6221497654914856, "eval_rewards/accuracies": 0.6596189737319946, "eval_rewards/chosen": -1.0150080919265747, "eval_rewards/margins": 0.26201140880584717, "eval_rewards/rejected": -1.2770196199417114, "eval_runtime": 357.0359, "eval_samples_per_second": 12.055, "eval_steps_per_second": 1.507, "step": 3900 }, { "epoch": 0.67, "grad_norm": 15.180474012277164, "learning_rate": 4.1933203703231766e-07, "logits/chosen": -1.584212303161621, "logits/rejected": -1.5458735227584839, "logps/chosen": -177.2735595703125, "logps/rejected": -240.51168823242188, "loss": 0.5167, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.2165385484695435, "rewards/margins": 0.6424941420555115, "rewards/rejected": -1.8590329885482788, "step": 3910 }, { "epoch": 0.68, "grad_norm": 20.98038793058905, "learning_rate": 4.1877819236357524e-07, "logits/chosen": -1.5897353887557983, "logits/rejected": -1.5237689018249512, "logps/chosen": -172.70350646972656, "logps/rejected": -239.7980499267578, "loss": 0.4887, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.1406900882720947, "rewards/margins": 0.7530602216720581, "rewards/rejected": -1.8937504291534424, "step": 3920 }, { "epoch": 0.68, "grad_norm": 29.051405565574303, "learning_rate": 4.182228214206437e-07, "logits/chosen": -1.5160815715789795, "logits/rejected": -1.48716139793396, "logps/chosen": -189.08810424804688, "logps/rejected": -257.77593994140625, "loss": 0.5336, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.3786773681640625, "rewards/margins": 0.6711302399635315, "rewards/rejected": -2.0498077869415283, "step": 3930 }, { "epoch": 0.68, "grad_norm": 32.250601718030964, "learning_rate": 4.1766592922578527e-07, "logits/chosen": -1.3783949613571167, "logits/rejected": -1.3409314155578613, "logps/chosen": -183.18594360351562, "logps/rejected": -250.7453155517578, "loss": 0.5505, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.297050952911377, "rewards/margins": 0.6737505793571472, "rewards/rejected": -1.9708013534545898, "step": 3940 }, { "epoch": 0.68, "grad_norm": 19.935483029280153, "learning_rate": 4.1710752081501877e-07, "logits/chosen": -1.3798249959945679, "logits/rejected": -1.311702847480774, "logps/chosen": -178.89566040039062, "logps/rejected": -250.24734497070312, "loss": 0.4886, "rewards/accuracies": 0.8125, "rewards/chosen": -1.2341785430908203, "rewards/margins": 0.7588900327682495, "rewards/rejected": -1.9930686950683594, "step": 3950 }, { "epoch": 0.68, "grad_norm": 33.05051638682265, "learning_rate": 4.1654760123807464e-07, "logits/chosen": -1.4223079681396484, "logits/rejected": -1.3881456851959229, "logps/chosen": -198.07431030273438, "logps/rejected": -285.7303771972656, "loss": 0.4943, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.4536970853805542, "rewards/margins": 0.8098018765449524, "rewards/rejected": -2.2634987831115723, "step": 3960 }, { "epoch": 0.68, "grad_norm": 27.584592344930318, "learning_rate": 4.159861755583487e-07, "logits/chosen": -1.3134465217590332, "logits/rejected": -1.2678642272949219, "logps/chosen": -234.09814453125, "logps/rejected": -308.8127746582031, "loss": 0.549, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.800374984741211, "rewards/margins": 0.7578364610671997, "rewards/rejected": -2.558211326599121, "step": 3970 }, { "epoch": 0.69, "grad_norm": 29.469171946700417, "learning_rate": 4.154232488528566e-07, "logits/chosen": -1.1992053985595703, "logits/rejected": -1.1396461725234985, "logps/chosen": -219.92236328125, "logps/rejected": -315.6834411621094, "loss": 0.4724, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.6659343242645264, "rewards/margins": 0.9590142965316772, "rewards/rejected": -2.624948501586914, "step": 3980 }, { "epoch": 0.69, "grad_norm": 17.97839097611244, "learning_rate": 4.148588262121877e-07, "logits/chosen": -1.3333715200424194, "logits/rejected": -1.299889326095581, "logps/chosen": -211.5640106201172, "logps/rejected": -273.6368408203125, "loss": 0.5775, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5835427045822144, "rewards/margins": 0.6002888083457947, "rewards/rejected": -2.183831214904785, "step": 3990 }, { "epoch": 0.69, "grad_norm": 29.741620640645017, "learning_rate": 4.1429291274045965e-07, "logits/chosen": -1.5011112689971924, "logits/rejected": -1.4447122812271118, "logps/chosen": -197.10702514648438, "logps/rejected": -261.1085205078125, "loss": 0.5399, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.399017572402954, "rewards/margins": 0.6844178438186646, "rewards/rejected": -2.083435535430908, "step": 4000 }, { "epoch": 0.69, "eval_logits/chosen": -1.5105490684509277, "eval_logits/rejected": -1.486973524093628, "eval_logps/chosen": -175.73806762695312, "eval_logps/rejected": -209.60133361816406, "eval_loss": 0.6212473511695862, "eval_rewards/accuracies": 0.6598513126373291, "eval_rewards/chosen": -1.170341968536377, "eval_rewards/margins": 0.29409757256507874, "eval_rewards/rejected": -1.4644395112991333, "eval_runtime": 356.9871, "eval_samples_per_second": 12.056, "eval_steps_per_second": 1.507, "step": 4000 }, { "epoch": 0.69, "grad_norm": 21.179313854062823, "learning_rate": 4.137255135552714e-07, "logits/chosen": -1.3642061948776245, "logits/rejected": -1.3185532093048096, "logps/chosen": -176.0836944580078, "logps/rejected": -256.7080078125, "loss": 0.5005, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.2807254791259766, "rewards/margins": 0.7561134099960327, "rewards/rejected": -2.036839008331299, "step": 4010 }, { "epoch": 0.69, "grad_norm": 28.693872888159124, "learning_rate": 4.131566337876575e-07, "logits/chosen": -1.3393471240997314, "logits/rejected": -1.3077666759490967, "logps/chosen": -198.69711303710938, "logps/rejected": -269.0760192871094, "loss": 0.5463, "rewards/accuracies": 0.71875, "rewards/chosen": -1.4653034210205078, "rewards/margins": 0.694530189037323, "rewards/rejected": -2.1598334312438965, "step": 4020 }, { "epoch": 0.69, "grad_norm": 20.870224461785025, "learning_rate": 4.125862785820416e-07, "logits/chosen": -1.3702881336212158, "logits/rejected": -1.319515585899353, "logps/chosen": -193.2422637939453, "logps/rejected": -268.23193359375, "loss": 0.5069, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.3722060918807983, "rewards/margins": 0.752329409122467, "rewards/rejected": -2.12453556060791, "step": 4030 }, { "epoch": 0.7, "grad_norm": 24.444442071597063, "learning_rate": 4.1201445309618954e-07, "logits/chosen": -1.4431445598602295, "logits/rejected": -1.3992760181427002, "logps/chosen": -196.61453247070312, "logps/rejected": -275.2479553222656, "loss": 0.5035, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.4041612148284912, "rewards/margins": 0.8080703020095825, "rewards/rejected": -2.212231397628784, "step": 4040 }, { "epoch": 0.7, "grad_norm": 20.703451812620813, "learning_rate": 4.114411625011634e-07, "logits/chosen": -1.3789803981781006, "logits/rejected": -1.3350975513458252, "logps/chosen": -179.94940185546875, "logps/rejected": -255.5587615966797, "loss": 0.525, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.2938952445983887, "rewards/margins": 0.7335586547851562, "rewards/rejected": -2.027453899383545, "step": 4050 }, { "epoch": 0.7, "grad_norm": 18.961262754259714, "learning_rate": 4.1086641198127404e-07, "logits/chosen": -1.392407774925232, "logits/rejected": -1.3395566940307617, "logps/chosen": -198.05589294433594, "logps/rejected": -261.8578796386719, "loss": 0.5609, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.417249321937561, "rewards/margins": 0.6800889372825623, "rewards/rejected": -2.0973381996154785, "step": 4060 }, { "epoch": 0.7, "grad_norm": 18.757428674130104, "learning_rate": 4.102902067340348e-07, "logits/chosen": -1.3935401439666748, "logits/rejected": -1.3446584939956665, "logps/chosen": -188.22537231445312, "logps/rejected": -258.7428283691406, "loss": 0.5224, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.3439836502075195, "rewards/margins": 0.7147015333175659, "rewards/rejected": -2.058685302734375, "step": 4070 }, { "epoch": 0.7, "grad_norm": 19.87740623644791, "learning_rate": 4.0971255197011395e-07, "logits/chosen": -1.3319361209869385, "logits/rejected": -1.294301986694336, "logps/chosen": -182.7200469970703, "logps/rejected": -261.05523681640625, "loss": 0.5126, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.3081274032592773, "rewards/margins": 0.7623058557510376, "rewards/rejected": -2.0704331398010254, "step": 4080 }, { "epoch": 0.7, "grad_norm": 22.371735739500583, "learning_rate": 4.091334529132881e-07, "logits/chosen": -1.4664791822433472, "logits/rejected": -1.404679536819458, "logps/chosen": -177.88174438476562, "logps/rejected": -248.9783935546875, "loss": 0.5139, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.2242028713226318, "rewards/margins": 0.7204464673995972, "rewards/rejected": -1.944649338722229, "step": 4090 }, { "epoch": 0.71, "grad_norm": 24.372455930646307, "learning_rate": 4.0855291480039454e-07, "logits/chosen": -1.3770744800567627, "logits/rejected": -1.329611897468567, "logps/chosen": -182.24974060058594, "logps/rejected": -250.8807830810547, "loss": 0.5175, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.2940999269485474, "rewards/margins": 0.722284197807312, "rewards/rejected": -2.0163843631744385, "step": 4100 }, { "epoch": 0.71, "eval_logits/chosen": -1.472186803817749, "eval_logits/rejected": -1.4476103782653809, "eval_logps/chosen": -186.34982299804688, "eval_logps/rejected": -222.20494079589844, "eval_loss": 0.6203304529190063, "eval_rewards/accuracies": 0.6554368138313293, "eval_rewards/chosen": -1.276459813117981, "eval_rewards/margins": 0.31401583552360535, "eval_rewards/rejected": -1.5904756784439087, "eval_runtime": 356.6055, "eval_samples_per_second": 12.069, "eval_steps_per_second": 1.509, "step": 4100 }, { "epoch": 0.71, "grad_norm": 17.294444254537726, "learning_rate": 4.079709428812842e-07, "logits/chosen": -1.3422235250473022, "logits/rejected": -1.3077037334442139, "logps/chosen": -201.4602813720703, "logps/rejected": -255.19070434570312, "loss": 0.5744, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.4541057348251343, "rewards/margins": 0.5830581188201904, "rewards/rejected": -2.0371639728546143, "step": 4110 }, { "epoch": 0.71, "grad_norm": 19.72533891211532, "learning_rate": 4.073875424187739e-07, "logits/chosen": -1.3486844301223755, "logits/rejected": -1.3319542407989502, "logps/chosen": -187.834228515625, "logps/rejected": -242.30770874023438, "loss": 0.583, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.3786789178848267, "rewards/margins": 0.49663081765174866, "rewards/rejected": -1.875309944152832, "step": 4120 }, { "epoch": 0.71, "grad_norm": 17.2804912644492, "learning_rate": 4.0680271868859906e-07, "logits/chosen": -1.4753568172454834, "logits/rejected": -1.4285588264465332, "logps/chosen": -163.55975341796875, "logps/rejected": -233.02902221679688, "loss": 0.4851, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.0960824489593506, "rewards/margins": 0.7092905640602112, "rewards/rejected": -1.805373191833496, "step": 4130 }, { "epoch": 0.71, "grad_norm": 21.81704208943039, "learning_rate": 4.0621647697936556e-07, "logits/chosen": -1.4139468669891357, "logits/rejected": -1.3735511302947998, "logps/chosen": -193.77230834960938, "logps/rejected": -239.8427734375, "loss": 0.5885, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.3823630809783936, "rewards/margins": 0.5091473460197449, "rewards/rejected": -1.8915106058120728, "step": 4140 }, { "epoch": 0.72, "grad_norm": 18.457503619554558, "learning_rate": 4.0562882259250233e-07, "logits/chosen": -1.4741637706756592, "logits/rejected": -1.4252352714538574, "logps/chosen": -182.50404357910156, "logps/rejected": -247.427734375, "loss": 0.5226, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.2463964223861694, "rewards/margins": 0.709705114364624, "rewards/rejected": -1.956101417541504, "step": 4150 }, { "epoch": 0.72, "grad_norm": 19.03179504667782, "learning_rate": 4.0503976084221323e-07, "logits/chosen": -1.3726146221160889, "logits/rejected": -1.3159904479980469, "logps/chosen": -179.83924865722656, "logps/rejected": -258.4535217285156, "loss": 0.4878, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.2819244861602783, "rewards/margins": 0.8116732835769653, "rewards/rejected": -2.093597888946533, "step": 4160 }, { "epoch": 0.72, "grad_norm": 23.70588198146383, "learning_rate": 4.044492970554292e-07, "logits/chosen": -1.374589443206787, "logits/rejected": -1.3363924026489258, "logps/chosen": -193.84341430664062, "logps/rejected": -267.19268798828125, "loss": 0.5472, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.4245638847351074, "rewards/margins": 0.711793065071106, "rewards/rejected": -2.136356830596924, "step": 4170 }, { "epoch": 0.72, "grad_norm": 20.355963174810125, "learning_rate": 4.038574365717594e-07, "logits/chosen": -1.3285168409347534, "logits/rejected": -1.2805362939834595, "logps/chosen": -200.12326049804688, "logps/rejected": -274.0704650878906, "loss": 0.5344, "rewards/accuracies": 0.75, "rewards/chosen": -1.447908878326416, "rewards/margins": 0.7298901677131653, "rewards/rejected": -2.1777987480163574, "step": 4180 }, { "epoch": 0.72, "grad_norm": 30.67163449647573, "learning_rate": 4.0326418474344416e-07, "logits/chosen": -1.3149698972702026, "logits/rejected": -1.2749181985855103, "logps/chosen": -206.71963500976562, "logps/rejected": -285.58819580078125, "loss": 0.5258, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.5566787719726562, "rewards/margins": 0.7740581631660461, "rewards/rejected": -2.3307368755340576, "step": 4190 }, { "epoch": 0.72, "grad_norm": 23.189476713803757, "learning_rate": 4.0266954693530515e-07, "logits/chosen": -1.3780596256256104, "logits/rejected": -1.3424698114395142, "logps/chosen": -209.67904663085938, "logps/rejected": -262.9185791015625, "loss": 0.5803, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.5398088693618774, "rewards/margins": 0.5638743042945862, "rewards/rejected": -2.1036829948425293, "step": 4200 }, { "epoch": 0.72, "eval_logits/chosen": -1.4580851793289185, "eval_logits/rejected": -1.4321988821029663, "eval_logps/chosen": -193.99774169921875, "eval_logps/rejected": -231.7759552001953, "eval_loss": 0.6207540035247803, "eval_rewards/accuracies": 0.6624070405960083, "eval_rewards/chosen": -1.3529391288757324, "eval_rewards/margins": 0.33324676752090454, "eval_rewards/rejected": -1.6861858367919922, "eval_runtime": 356.9885, "eval_samples_per_second": 12.056, "eval_steps_per_second": 1.507, "step": 4200 }, { "epoch": 0.73, "grad_norm": 32.38460221933683, "learning_rate": 4.020735285246979e-07, "logits/chosen": -1.3975965976715088, "logits/rejected": -1.355668306350708, "logps/chosen": -212.5668182373047, "logps/rejected": -264.63543701171875, "loss": 0.6133, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.5729354619979858, "rewards/margins": 0.5456880331039429, "rewards/rejected": -2.1186232566833496, "step": 4210 }, { "epoch": 0.73, "grad_norm": 16.885675503765714, "learning_rate": 4.014761349014629e-07, "logits/chosen": -1.3606762886047363, "logits/rejected": -1.3178844451904297, "logps/chosen": -178.82691955566406, "logps/rejected": -241.67929077148438, "loss": 0.5612, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.2662688493728638, "rewards/margins": 0.6238974332809448, "rewards/rejected": -1.8901660442352295, "step": 4220 }, { "epoch": 0.73, "grad_norm": 29.792336168407434, "learning_rate": 4.0087737146787656e-07, "logits/chosen": -1.587550401687622, "logits/rejected": -1.5437839031219482, "logps/chosen": -163.56658935546875, "logps/rejected": -229.27224731445312, "loss": 0.5343, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.082904577255249, "rewards/margins": 0.6688292622566223, "rewards/rejected": -1.7517340183258057, "step": 4230 }, { "epoch": 0.73, "grad_norm": 20.765107873436467, "learning_rate": 4.002772436386027e-07, "logits/chosen": -1.5118169784545898, "logits/rejected": -1.4638663530349731, "logps/chosen": -155.6704559326172, "logps/rejected": -229.45620727539062, "loss": 0.518, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.009352207183838, "rewards/margins": 0.7259235382080078, "rewards/rejected": -1.7352758646011353, "step": 4240 }, { "epoch": 0.73, "grad_norm": 23.96242082772077, "learning_rate": 3.9967575684064367e-07, "logits/chosen": -1.4785500764846802, "logits/rejected": -1.4373469352722168, "logps/chosen": -159.1673126220703, "logps/rejected": -217.58468627929688, "loss": 0.5303, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.0413461923599243, "rewards/margins": 0.6081751585006714, "rewards/rejected": -1.6495214700698853, "step": 4250 }, { "epoch": 0.73, "grad_norm": 24.144969976617194, "learning_rate": 3.990729165132907e-07, "logits/chosen": -1.4406192302703857, "logits/rejected": -1.4052913188934326, "logps/chosen": -160.1890106201172, "logps/rejected": -228.3748779296875, "loss": 0.544, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.0565812587738037, "rewards/margins": 0.679029643535614, "rewards/rejected": -1.7356109619140625, "step": 4260 }, { "epoch": 0.74, "grad_norm": 24.358604007282633, "learning_rate": 3.984687281080754e-07, "logits/chosen": -1.3951603174209595, "logits/rejected": -1.3441218137741089, "logps/chosen": -164.6576690673828, "logps/rejected": -230.2593536376953, "loss": 0.5264, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.1306527853012085, "rewards/margins": 0.6664345860481262, "rewards/rejected": -1.79708731174469, "step": 4270 }, { "epoch": 0.74, "grad_norm": 24.378376134460726, "learning_rate": 3.978631970887201e-07, "logits/chosen": -1.4013197422027588, "logits/rejected": -1.3541449308395386, "logps/chosen": -162.7399139404297, "logps/rejected": -237.9474639892578, "loss": 0.5066, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.1103148460388184, "rewards/margins": 0.7485235333442688, "rewards/rejected": -1.858838677406311, "step": 4280 }, { "epoch": 0.74, "grad_norm": 26.74832111588032, "learning_rate": 3.972563289310882e-07, "logits/chosen": -1.3995485305786133, "logits/rejected": -1.348487138748169, "logps/chosen": -180.16856384277344, "logps/rejected": -255.4312286376953, "loss": 0.5467, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.2493109703063965, "rewards/margins": 0.7890298962593079, "rewards/rejected": -2.0383410453796387, "step": 4290 }, { "epoch": 0.74, "grad_norm": 15.729787495443901, "learning_rate": 3.9664812912313533e-07, "logits/chosen": -1.4865190982818604, "logits/rejected": -1.4452247619628906, "logps/chosen": -155.62908935546875, "logps/rejected": -227.11245727539062, "loss": 0.507, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.0309059619903564, "rewards/margins": 0.7097223997116089, "rewards/rejected": -1.7406282424926758, "step": 4300 }, { "epoch": 0.74, "eval_logits/chosen": -1.5954647064208984, "eval_logits/rejected": -1.5738048553466797, "eval_logps/chosen": -152.3179931640625, "eval_logps/rejected": -181.7826385498047, "eval_loss": 0.6264519095420837, "eval_rewards/accuracies": 0.6624070405960083, "eval_rewards/chosen": -0.9361413717269897, "eval_rewards/margins": 0.25011131167411804, "eval_rewards/rejected": -1.1862527132034302, "eval_runtime": 356.8364, "eval_samples_per_second": 12.062, "eval_steps_per_second": 1.508, "step": 4300 }, { "epoch": 0.74, "grad_norm": 13.768891932897395, "learning_rate": 3.9603860316485925e-07, "logits/chosen": -1.418806791305542, "logits/rejected": -1.3766006231307983, "logps/chosen": -162.57728576660156, "logps/rejected": -218.0735321044922, "loss": 0.5448, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.0851390361785889, "rewards/margins": 0.5797218084335327, "rewards/rejected": -1.664860725402832, "step": 4310 }, { "epoch": 0.74, "grad_norm": 14.715075236823548, "learning_rate": 3.9542775656825e-07, "logits/chosen": -1.4987797737121582, "logits/rejected": -1.4415086507797241, "logps/chosen": -172.37828063964844, "logps/rejected": -245.30239868164062, "loss": 0.4709, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.1376218795776367, "rewards/margins": 0.7687402963638306, "rewards/rejected": -1.9063619375228882, "step": 4320 }, { "epoch": 0.75, "grad_norm": 28.706374504499472, "learning_rate": 3.948155948572405e-07, "logits/chosen": -1.3579802513122559, "logits/rejected": -1.3000389337539673, "logps/chosen": -183.6754150390625, "logps/rejected": -248.2251739501953, "loss": 0.5221, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.2819044589996338, "rewards/margins": 0.702656626701355, "rewards/rejected": -1.9845609664916992, "step": 4330 }, { "epoch": 0.75, "grad_norm": 21.368421423428487, "learning_rate": 3.9420212356765606e-07, "logits/chosen": -1.3122832775115967, "logits/rejected": -1.2653281688690186, "logps/chosen": -180.68899536132812, "logps/rejected": -259.26025390625, "loss": 0.5476, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.3169108629226685, "rewards/margins": 0.7710382342338562, "rewards/rejected": -2.08794903755188, "step": 4340 }, { "epoch": 0.75, "grad_norm": 21.538559584335715, "learning_rate": 3.93587348247165e-07, "logits/chosen": -1.3758046627044678, "logits/rejected": -1.337914228439331, "logps/chosen": -167.84469604492188, "logps/rejected": -237.70242309570312, "loss": 0.5158, "rewards/accuracies": 0.78125, "rewards/chosen": -1.1684411764144897, "rewards/margins": 0.6767427921295166, "rewards/rejected": -1.8451837301254272, "step": 4350 }, { "epoch": 0.75, "grad_norm": 16.11283805818009, "learning_rate": 3.929712744552278e-07, "logits/chosen": -1.412389874458313, "logits/rejected": -1.356400728225708, "logps/chosen": -176.62753295898438, "logps/rejected": -245.2499542236328, "loss": 0.5299, "rewards/accuracies": 0.75, "rewards/chosen": -1.2295823097229004, "rewards/margins": 0.7066112160682678, "rewards/rejected": -1.9361934661865234, "step": 4360 }, { "epoch": 0.75, "grad_norm": 30.246128714363632, "learning_rate": 3.923539077630471e-07, "logits/chosen": -1.3993642330169678, "logits/rejected": -1.3563659191131592, "logps/chosen": -184.28158569335938, "logps/rejected": -247.9432373046875, "loss": 0.5544, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.293311357498169, "rewards/margins": 0.6382697820663452, "rewards/rejected": -1.9315814971923828, "step": 4370 }, { "epoch": 0.75, "grad_norm": 25.81059610250568, "learning_rate": 3.917352537535176e-07, "logits/chosen": -1.4071307182312012, "logits/rejected": -1.356684923171997, "logps/chosen": -182.4822998046875, "logps/rejected": -258.318359375, "loss": 0.5202, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.281593918800354, "rewards/margins": 0.7798604369163513, "rewards/rejected": -2.0614542961120605, "step": 4380 }, { "epoch": 0.76, "grad_norm": 21.811436240135972, "learning_rate": 3.91115318021175e-07, "logits/chosen": -1.336089849472046, "logits/rejected": -1.291550874710083, "logps/chosen": -188.70346069335938, "logps/rejected": -266.1550598144531, "loss": 0.5067, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.3547266721725464, "rewards/margins": 0.7720333337783813, "rewards/rejected": -2.126760244369507, "step": 4390 }, { "epoch": 0.76, "grad_norm": 25.759431483166495, "learning_rate": 3.9049410617214607e-07, "logits/chosen": -1.3443093299865723, "logits/rejected": -1.2999963760375977, "logps/chosen": -194.36892700195312, "logps/rejected": -274.6505432128906, "loss": 0.5273, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.3747164011001587, "rewards/margins": 0.8176537752151489, "rewards/rejected": -2.1923701763153076, "step": 4400 }, { "epoch": 0.76, "eval_logits/chosen": -1.4307539463043213, "eval_logits/rejected": -1.404834270477295, "eval_logps/chosen": -185.88987731933594, "eval_logps/rejected": -224.0266876220703, "eval_loss": 0.6210964918136597, "eval_rewards/accuracies": 0.6686803102493286, "eval_rewards/chosen": -1.2718603610992432, "eval_rewards/margins": 0.3368328809738159, "eval_rewards/rejected": -1.608693242073059, "eval_runtime": 356.7436, "eval_samples_per_second": 12.065, "eval_steps_per_second": 1.508, "step": 4400 }, { "epoch": 0.76, "grad_norm": 19.840813891153072, "learning_rate": 3.898716238240971e-07, "logits/chosen": -1.3299553394317627, "logits/rejected": -1.289876103401184, "logps/chosen": -192.49099731445312, "logps/rejected": -250.6361541748047, "loss": 0.5987, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.3760064840316772, "rewards/margins": 0.5802738666534424, "rewards/rejected": -1.9562803506851196, "step": 4410 }, { "epoch": 0.76, "grad_norm": 26.035856678795522, "learning_rate": 3.892478766061841e-07, "logits/chosen": -1.489180564880371, "logits/rejected": -1.4286963939666748, "logps/chosen": -172.96762084960938, "logps/rejected": -225.233154296875, "loss": 0.5652, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.1818801164627075, "rewards/margins": 0.5658958554267883, "rewards/rejected": -1.7477757930755615, "step": 4420 }, { "epoch": 0.76, "grad_norm": 25.167741330404056, "learning_rate": 3.886228701590011e-07, "logits/chosen": -1.4246338605880737, "logits/rejected": -1.3719749450683594, "logps/chosen": -154.06051635742188, "logps/rejected": -209.6873321533203, "loss": 0.5631, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.0029816627502441, "rewards/margins": 0.5808738470077515, "rewards/rejected": -1.583855390548706, "step": 4430 }, { "epoch": 0.76, "grad_norm": 17.10714671684354, "learning_rate": 3.8799661013452955e-07, "logits/chosen": -1.485050916671753, "logits/rejected": -1.4327274560928345, "logps/chosen": -173.7892608642578, "logps/rejected": -240.88101196289062, "loss": 0.5115, "rewards/accuracies": 0.78125, "rewards/chosen": -1.1438651084899902, "rewards/margins": 0.7148648500442505, "rewards/rejected": -1.8587299585342407, "step": 4440 }, { "epoch": 0.77, "grad_norm": 19.220029089043173, "learning_rate": 3.8736910219608705e-07, "logits/chosen": -1.3361194133758545, "logits/rejected": -1.2997193336486816, "logps/chosen": -164.47987365722656, "logps/rejected": -231.0243682861328, "loss": 0.5257, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.0879555940628052, "rewards/margins": 0.6804584264755249, "rewards/rejected": -1.7684139013290405, "step": 4450 }, { "epoch": 0.77, "grad_norm": 21.93245110911694, "learning_rate": 3.8674035201827626e-07, "logits/chosen": -1.4222412109375, "logits/rejected": -1.387459635734558, "logps/chosen": -174.14901733398438, "logps/rejected": -240.2296600341797, "loss": 0.5475, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.2142760753631592, "rewards/margins": 0.6618901491165161, "rewards/rejected": -1.8761663436889648, "step": 4460 }, { "epoch": 0.77, "grad_norm": 26.334535864969112, "learning_rate": 3.861103652869334e-07, "logits/chosen": -1.4492603540420532, "logits/rejected": -1.3905606269836426, "logps/chosen": -184.28909301757812, "logps/rejected": -254.66085815429688, "loss": 0.5201, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.2838647365570068, "rewards/margins": 0.7604522109031677, "rewards/rejected": -2.0443172454833984, "step": 4470 }, { "epoch": 0.77, "grad_norm": 41.57134948033417, "learning_rate": 3.8547914769907705e-07, "logits/chosen": -1.4375700950622559, "logits/rejected": -1.3990795612335205, "logps/chosen": -193.02252197265625, "logps/rejected": -266.1149597167969, "loss": 0.5628, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.4204905033111572, "rewards/margins": 0.7187623977661133, "rewards/rejected": -2.1392529010772705, "step": 4480 }, { "epoch": 0.77, "grad_norm": 28.64256495751411, "learning_rate": 3.848467049628564e-07, "logits/chosen": -1.317628264427185, "logits/rejected": -1.2681446075439453, "logps/chosen": -187.17874145507812, "logps/rejected": -251.03970336914062, "loss": 0.531, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.3606078624725342, "rewards/margins": 0.6513209939002991, "rewards/rejected": -2.0119290351867676, "step": 4490 }, { "epoch": 0.78, "grad_norm": 12.948645388897265, "learning_rate": 3.8421304279749983e-07, "logits/chosen": -1.3421502113342285, "logits/rejected": -1.2936899662017822, "logps/chosen": -180.89065551757812, "logps/rejected": -251.9108428955078, "loss": 0.5574, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.2455495595932007, "rewards/margins": 0.7440658211708069, "rewards/rejected": -1.9896152019500732, "step": 4500 }, { "epoch": 0.78, "eval_logits/chosen": -1.4964402914047241, "eval_logits/rejected": -1.472887396812439, "eval_logps/chosen": -169.3536376953125, "eval_logps/rejected": -203.17874145507812, "eval_loss": 0.6233484745025635, "eval_rewards/accuracies": 0.6670538783073425, "eval_rewards/chosen": -1.1064980030059814, "eval_rewards/margins": 0.29371556639671326, "eval_rewards/rejected": -1.400213599205017, "eval_runtime": 356.7428, "eval_samples_per_second": 12.065, "eval_steps_per_second": 1.508, "step": 4500 }, { "epoch": 0.78, "grad_norm": 17.509248366329857, "learning_rate": 3.8357816693326314e-07, "logits/chosen": -1.487713098526001, "logits/rejected": -1.4337613582611084, "logps/chosen": -171.7799835205078, "logps/rejected": -247.6623992919922, "loss": 0.5203, "rewards/accuracies": 0.75, "rewards/chosen": -1.1836849451065063, "rewards/margins": 0.7367376089096069, "rewards/rejected": -1.9204223155975342, "step": 4510 }, { "epoch": 0.78, "grad_norm": 21.475335389561995, "learning_rate": 3.829420831113775e-07, "logits/chosen": -1.4706519842147827, "logits/rejected": -1.422728180885315, "logps/chosen": -176.04486083984375, "logps/rejected": -242.8605499267578, "loss": 0.5317, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.234581708908081, "rewards/margins": 0.6933325529098511, "rewards/rejected": -1.9279142618179321, "step": 4520 }, { "epoch": 0.78, "grad_norm": 30.044462081498253, "learning_rate": 3.823047970839981e-07, "logits/chosen": -1.4337480068206787, "logits/rejected": -1.4001357555389404, "logps/chosen": -168.0045623779297, "logps/rejected": -224.1116943359375, "loss": 0.5591, "rewards/accuracies": 0.71875, "rewards/chosen": -1.1422946453094482, "rewards/margins": 0.564059317111969, "rewards/rejected": -1.7063539028167725, "step": 4530 }, { "epoch": 0.78, "grad_norm": 32.08416286753465, "learning_rate": 3.816663146141514e-07, "logits/chosen": -1.321825623512268, "logits/rejected": -1.2757227420806885, "logps/chosen": -175.95849609375, "logps/rejected": -248.03097534179688, "loss": 0.5095, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.232386827468872, "rewards/margins": 0.7383901476860046, "rewards/rejected": -1.970776915550232, "step": 4540 }, { "epoch": 0.78, "grad_norm": 21.22323203264765, "learning_rate": 3.810266414756836e-07, "logits/chosen": -1.3958414793014526, "logits/rejected": -1.344481110572815, "logps/chosen": -177.6934051513672, "logps/rejected": -246.7960205078125, "loss": 0.5158, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.2130842208862305, "rewards/margins": 0.7209616899490356, "rewards/rejected": -1.9340457916259766, "step": 4550 }, { "epoch": 0.79, "grad_norm": 21.245086858763795, "learning_rate": 3.803857834532081e-07, "logits/chosen": -1.2998394966125488, "logits/rejected": -1.2284823656082153, "logps/chosen": -185.28271484375, "logps/rejected": -263.3417663574219, "loss": 0.5029, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.2907075881958008, "rewards/margins": 0.8120514154434204, "rewards/rejected": -2.1027588844299316, "step": 4560 }, { "epoch": 0.79, "grad_norm": 30.48524469504423, "learning_rate": 3.797437463420534e-07, "logits/chosen": -1.3093476295471191, "logits/rejected": -1.259817361831665, "logps/chosen": -194.9139862060547, "logps/rejected": -268.6426696777344, "loss": 0.5509, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.4381399154663086, "rewards/margins": 0.7308866381645203, "rewards/rejected": -2.1690266132354736, "step": 4570 }, { "epoch": 0.79, "grad_norm": 20.59369730212467, "learning_rate": 3.791005359482106e-07, "logits/chosen": -1.3296152353286743, "logits/rejected": -1.28184175491333, "logps/chosen": -157.950439453125, "logps/rejected": -213.4839630126953, "loss": 0.5544, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.037023901939392, "rewards/margins": 0.5802245736122131, "rewards/rejected": -1.617248296737671, "step": 4580 }, { "epoch": 0.79, "grad_norm": 21.05628740715068, "learning_rate": 3.784561580882806e-07, "logits/chosen": -1.4657633304595947, "logits/rejected": -1.4152452945709229, "logps/chosen": -169.1389617919922, "logps/rejected": -219.05465698242188, "loss": 0.6103, "rewards/accuracies": 0.65625, "rewards/chosen": -1.1427037715911865, "rewards/margins": 0.5260920524597168, "rewards/rejected": -1.6687958240509033, "step": 4590 }, { "epoch": 0.79, "grad_norm": 22.60134182003666, "learning_rate": 3.778106185894221e-07, "logits/chosen": -1.3957931995391846, "logits/rejected": -1.3415305614471436, "logps/chosen": -159.36795043945312, "logps/rejected": -236.89773559570312, "loss": 0.4819, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.0599912405014038, "rewards/margins": 0.7656328082084656, "rewards/rejected": -1.8256241083145142, "step": 4600 }, { "epoch": 0.79, "eval_logits/chosen": -1.525081992149353, "eval_logits/rejected": -1.501688003540039, "eval_logps/chosen": -169.0588836669922, "eval_logps/rejected": -203.32528686523438, "eval_loss": 0.6219184994697571, "eval_rewards/accuracies": 0.6642658114433289, "eval_rewards/chosen": -1.1035504341125488, "eval_rewards/margins": 0.298128604888916, "eval_rewards/rejected": -1.4016790390014648, "eval_runtime": 356.8595, "eval_samples_per_second": 12.061, "eval_steps_per_second": 1.508, "step": 4600 }, { "epoch": 0.79, "grad_norm": 25.140522968467, "learning_rate": 3.771639232892986e-07, "logits/chosen": -1.3437252044677734, "logits/rejected": -1.3191344738006592, "logps/chosen": -189.2852325439453, "logps/rejected": -238.8382110595703, "loss": 0.6115, "rewards/accuracies": 0.625, "rewards/chosen": -1.3813140392303467, "rewards/margins": 0.4978283941745758, "rewards/rejected": -1.8791425228118896, "step": 4610 }, { "epoch": 0.8, "grad_norm": 21.229639253980007, "learning_rate": 3.765160780360254e-07, "logits/chosen": -1.3881046772003174, "logits/rejected": -1.3339178562164307, "logps/chosen": -180.4025421142578, "logps/rejected": -259.890380859375, "loss": 0.5085, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.2468607425689697, "rewards/margins": 0.7971670627593994, "rewards/rejected": -2.0440280437469482, "step": 4620 }, { "epoch": 0.8, "grad_norm": 30.53629505024118, "learning_rate": 3.75867088688117e-07, "logits/chosen": -1.3791451454162598, "logits/rejected": -1.3137580156326294, "logps/chosen": -209.33139038085938, "logps/rejected": -286.5825500488281, "loss": 0.5159, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.5205762386322021, "rewards/margins": 0.7794795036315918, "rewards/rejected": -2.300055742263794, "step": 4630 }, { "epoch": 0.8, "grad_norm": 26.67530435659893, "learning_rate": 3.7521696111443413e-07, "logits/chosen": -1.3778386116027832, "logits/rejected": -1.341675043106079, "logps/chosen": -217.1299285888672, "logps/rejected": -284.71978759765625, "loss": 0.5664, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.6237319707870483, "rewards/margins": 0.6682808995246887, "rewards/rejected": -2.2920126914978027, "step": 4640 }, { "epoch": 0.8, "grad_norm": 36.07331886816096, "learning_rate": 3.7456570119413034e-07, "logits/chosen": -1.413480520248413, "logits/rejected": -1.3600969314575195, "logps/chosen": -189.5355987548828, "logps/rejected": -251.88204956054688, "loss": 0.5552, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.3538799285888672, "rewards/margins": 0.6571952700614929, "rewards/rejected": -2.011075496673584, "step": 4650 }, { "epoch": 0.8, "grad_norm": 19.411023629464214, "learning_rate": 3.739133148165994e-07, "logits/chosen": -1.4477910995483398, "logits/rejected": -1.4069766998291016, "logps/chosen": -176.53195190429688, "logps/rejected": -239.30844116210938, "loss": 0.5337, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.2224671840667725, "rewards/margins": 0.62762051820755, "rewards/rejected": -1.8500875234603882, "step": 4660 }, { "epoch": 0.8, "grad_norm": 22.362534471168633, "learning_rate": 3.7325980788142146e-07, "logits/chosen": -1.4072405099868774, "logits/rejected": -1.357230305671692, "logps/chosen": -165.07376098632812, "logps/rejected": -237.02816772460938, "loss": 0.4906, "rewards/accuracies": 0.78125, "rewards/chosen": -1.1267932653427124, "rewards/margins": 0.7528173327445984, "rewards/rejected": -1.8796107769012451, "step": 4670 }, { "epoch": 0.81, "grad_norm": 25.270149260749562, "learning_rate": 3.726051862983101e-07, "logits/chosen": -1.3556668758392334, "logits/rejected": -1.3043700456619263, "logps/chosen": -190.7840576171875, "logps/rejected": -255.9459686279297, "loss": 0.5537, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.3830840587615967, "rewards/margins": 0.6596297025680542, "rewards/rejected": -2.0427136421203613, "step": 4680 }, { "epoch": 0.81, "grad_norm": 24.777840014526955, "learning_rate": 3.7194945598705864e-07, "logits/chosen": -1.3643500804901123, "logits/rejected": -1.3103562593460083, "logps/chosen": -213.9306182861328, "logps/rejected": -301.82757568359375, "loss": 0.5075, "rewards/accuracies": 0.78125, "rewards/chosen": -1.5816562175750732, "rewards/margins": 0.8936996459960938, "rewards/rejected": -2.475355625152588, "step": 4690 }, { "epoch": 0.81, "grad_norm": 35.9367234479366, "learning_rate": 3.712926228774868e-07, "logits/chosen": -1.298680067062378, "logits/rejected": -1.252151608467102, "logps/chosen": -218.3412322998047, "logps/rejected": -308.64752197265625, "loss": 0.5187, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.6659730672836304, "rewards/margins": 0.8636065721511841, "rewards/rejected": -2.5295798778533936, "step": 4700 }, { "epoch": 0.81, "eval_logits/chosen": -1.3934706449508667, "eval_logits/rejected": -1.3670498132705688, "eval_logps/chosen": -205.2917938232422, "eval_logps/rejected": -246.5410614013672, "eval_loss": 0.6171659231185913, "eval_rewards/accuracies": 0.6654275059700012, "eval_rewards/chosen": -1.4658793210983276, "eval_rewards/margins": 0.367957204580307, "eval_rewards/rejected": -1.833836555480957, "eval_runtime": 356.7771, "eval_samples_per_second": 12.064, "eval_steps_per_second": 1.508, "step": 4700 }, { "epoch": 0.81, "grad_norm": 32.393157890217395, "learning_rate": 3.7063469290938696e-07, "logits/chosen": -1.3531776666641235, "logits/rejected": -1.3033009767532349, "logps/chosen": -212.64096069335938, "logps/rejected": -282.6040954589844, "loss": 0.5413, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.546820044517517, "rewards/margins": 0.7553335428237915, "rewards/rejected": -2.3021538257598877, "step": 4710 }, { "epoch": 0.81, "grad_norm": 21.02352487673719, "learning_rate": 3.699756720324706e-07, "logits/chosen": -1.2925106287002563, "logits/rejected": -1.2351093292236328, "logps/chosen": -197.33441162109375, "logps/rejected": -281.5450439453125, "loss": 0.4884, "rewards/accuracies": 0.78125, "rewards/chosen": -1.405700922012329, "rewards/margins": 0.8855878710746765, "rewards/rejected": -2.2912888526916504, "step": 4720 }, { "epoch": 0.81, "grad_norm": 26.71400422066647, "learning_rate": 3.693155662063141e-07, "logits/chosen": -1.2681843042373657, "logits/rejected": -1.219074010848999, "logps/chosen": -199.86378479003906, "logps/rejected": -269.1332092285156, "loss": 0.5673, "rewards/accuracies": 0.71875, "rewards/chosen": -1.4612171649932861, "rewards/margins": 0.7179456353187561, "rewards/rejected": -2.1791629791259766, "step": 4730 }, { "epoch": 0.82, "grad_norm": 16.242282256503056, "learning_rate": 3.686543814003053e-07, "logits/chosen": -1.3467975854873657, "logits/rejected": -1.3040544986724854, "logps/chosen": -180.89808654785156, "logps/rejected": -271.9189453125, "loss": 0.4748, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.2646945714950562, "rewards/margins": 0.9159406423568726, "rewards/rejected": -2.1806349754333496, "step": 4740 }, { "epoch": 0.82, "grad_norm": 22.35210107859672, "learning_rate": 3.6799212359358933e-07, "logits/chosen": -1.2919436693191528, "logits/rejected": -1.2553608417510986, "logps/chosen": -206.98501586914062, "logps/rejected": -273.5370788574219, "loss": 0.54, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.5159308910369873, "rewards/margins": 0.673417866230011, "rewards/rejected": -2.1893489360809326, "step": 4750 }, { "epoch": 0.82, "grad_norm": 29.45848931001129, "learning_rate": 3.6732879877501453e-07, "logits/chosen": -1.2655035257339478, "logits/rejected": -1.2089664936065674, "logps/chosen": -215.909912109375, "logps/rejected": -307.30560302734375, "loss": 0.4823, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.6345545053482056, "rewards/margins": 0.9228925704956055, "rewards/rejected": -2.5574469566345215, "step": 4760 }, { "epoch": 0.82, "grad_norm": 19.064566618433858, "learning_rate": 3.666644129430784e-07, "logits/chosen": -1.3485455513000488, "logits/rejected": -1.3007423877716064, "logps/chosen": -227.8409423828125, "logps/rejected": -299.4037170410156, "loss": 0.5629, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.6823813915252686, "rewards/margins": 0.7599068284034729, "rewards/rejected": -2.4422881603240967, "step": 4770 }, { "epoch": 0.82, "grad_norm": 22.624642669624826, "learning_rate": 3.65998972105873e-07, "logits/chosen": -1.280133605003357, "logits/rejected": -1.228562593460083, "logps/chosen": -196.01754760742188, "logps/rejected": -289.71990966796875, "loss": 0.461, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.4455887079238892, "rewards/margins": 0.912562370300293, "rewards/rejected": -2.3581509590148926, "step": 4780 }, { "epoch": 0.83, "grad_norm": 17.630300335409032, "learning_rate": 3.6533248228103114e-07, "logits/chosen": -1.3750900030136108, "logits/rejected": -1.324573278427124, "logps/chosen": -203.55032348632812, "logps/rejected": -268.24688720703125, "loss": 0.528, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.4577271938323975, "rewards/margins": 0.6970139741897583, "rewards/rejected": -2.154741048812866, "step": 4790 }, { "epoch": 0.83, "grad_norm": 22.323338607470507, "learning_rate": 3.646649494956717e-07, "logits/chosen": -1.3112070560455322, "logits/rejected": -1.2691413164138794, "logps/chosen": -205.2784881591797, "logps/rejected": -268.1100769042969, "loss": 0.5805, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.5329288244247437, "rewards/margins": 0.6451320648193359, "rewards/rejected": -2.178061008453369, "step": 4800 }, { "epoch": 0.83, "eval_logits/chosen": -1.445318341255188, "eval_logits/rejected": -1.4196213483810425, "eval_logps/chosen": -201.05026245117188, "eval_logps/rejected": -241.25576782226562, "eval_loss": 0.6145854592323303, "eval_rewards/accuracies": 0.6619423627853394, "eval_rewards/chosen": -1.423464059829712, "eval_rewards/margins": 0.35751983523368835, "eval_rewards/rejected": -1.780983805656433, "eval_runtime": 356.6955, "eval_samples_per_second": 12.066, "eval_steps_per_second": 1.508, "step": 4800 }, { "epoch": 0.83, "grad_norm": 23.1783795567982, "learning_rate": 3.6399637978634497e-07, "logits/chosen": -1.2973178625106812, "logits/rejected": -1.234431505203247, "logps/chosen": -206.18838500976562, "logps/rejected": -280.7757263183594, "loss": 0.5075, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.4898689985275269, "rewards/margins": 0.7940649390220642, "rewards/rejected": -2.2839341163635254, "step": 4810 }, { "epoch": 0.83, "grad_norm": 20.941205872084087, "learning_rate": 3.6332677919897823e-07, "logits/chosen": -1.330582857131958, "logits/rejected": -1.2920982837677002, "logps/chosen": -202.64508056640625, "logps/rejected": -277.89532470703125, "loss": 0.5228, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.514899492263794, "rewards/margins": 0.7458511590957642, "rewards/rejected": -2.2607505321502686, "step": 4820 }, { "epoch": 0.83, "grad_norm": 20.30033885452288, "learning_rate": 3.626561537888214e-07, "logits/chosen": -1.3852955102920532, "logits/rejected": -1.3380589485168457, "logps/chosen": -196.2224884033203, "logps/rejected": -265.86505126953125, "loss": 0.5619, "rewards/accuracies": 0.71875, "rewards/chosen": -1.426672101020813, "rewards/margins": 0.7057501673698425, "rewards/rejected": -2.1324222087860107, "step": 4830 }, { "epoch": 0.83, "grad_norm": 28.339668690193722, "learning_rate": 3.6198450962039146e-07, "logits/chosen": -1.3548475503921509, "logits/rejected": -1.2958180904388428, "logps/chosen": -199.49864196777344, "logps/rejected": -276.6757507324219, "loss": 0.5025, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.4332863092422485, "rewards/margins": 0.809424102306366, "rewards/rejected": -2.2427103519439697, "step": 4840 }, { "epoch": 0.84, "grad_norm": 24.489009584175815, "learning_rate": 3.6131185276741846e-07, "logits/chosen": -1.4219049215316772, "logits/rejected": -1.377803087234497, "logps/chosen": -193.35806274414062, "logps/rejected": -261.9024353027344, "loss": 0.5377, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.392596960067749, "rewards/margins": 0.6902645826339722, "rewards/rejected": -2.0828614234924316, "step": 4850 }, { "epoch": 0.84, "grad_norm": 23.049416952706352, "learning_rate": 3.6063818931278997e-07, "logits/chosen": -1.438050627708435, "logits/rejected": -1.3838953971862793, "logps/chosen": -196.03994750976562, "logps/rejected": -255.8987579345703, "loss": 0.5318, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.3760112524032593, "rewards/margins": 0.6772761344909668, "rewards/rejected": -2.0532872676849365, "step": 4860 }, { "epoch": 0.84, "grad_norm": 27.95119723700728, "learning_rate": 3.599635253484967e-07, "logits/chosen": -1.458106279373169, "logits/rejected": -1.4050066471099854, "logps/chosen": -196.61280822753906, "logps/rejected": -271.3414611816406, "loss": 0.5218, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.3915178775787354, "rewards/margins": 0.8070972561836243, "rewards/rejected": -2.1986148357391357, "step": 4870 }, { "epoch": 0.84, "grad_norm": 23.334821262369495, "learning_rate": 3.592878669755767e-07, "logits/chosen": -1.3905763626098633, "logits/rejected": -1.33687424659729, "logps/chosen": -179.8762664794922, "logps/rejected": -240.846923828125, "loss": 0.5348, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.26702082157135, "rewards/margins": 0.6355078220367432, "rewards/rejected": -1.9025285243988037, "step": 4880 }, { "epoch": 0.84, "grad_norm": 20.651381347651178, "learning_rate": 3.586112203040607e-07, "logits/chosen": -1.4436790943145752, "logits/rejected": -1.3931138515472412, "logps/chosen": -185.76181030273438, "logps/rejected": -262.454345703125, "loss": 0.5051, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.3033283948898315, "rewards/margins": 0.785824179649353, "rewards/rejected": -2.0891528129577637, "step": 4890 }, { "epoch": 0.84, "grad_norm": 14.620800977419114, "learning_rate": 3.5793359145291665e-07, "logits/chosen": -1.4301611185073853, "logits/rejected": -1.3711490631103516, "logps/chosen": -181.84730529785156, "logps/rejected": -252.3798065185547, "loss": 0.537, "rewards/accuracies": 0.71875, "rewards/chosen": -1.2864367961883545, "rewards/margins": 0.7350460886955261, "rewards/rejected": -2.0214829444885254, "step": 4900 }, { "epoch": 0.84, "eval_logits/chosen": -1.5460282564163208, "eval_logits/rejected": -1.5222222805023193, "eval_logps/chosen": -179.59288024902344, "eval_logps/rejected": -214.940185546875, "eval_loss": 0.6194379925727844, "eval_rewards/accuracies": 0.6556691527366638, "eval_rewards/chosen": -1.2088903188705444, "eval_rewards/margins": 0.30893754959106445, "eval_rewards/rejected": -1.517828106880188, "eval_runtime": 356.6033, "eval_samples_per_second": 12.069, "eval_steps_per_second": 1.509, "step": 4900 }, { "epoch": 0.85, "grad_norm": 18.773623561195404, "learning_rate": 3.5725498654999436e-07, "logits/chosen": -1.572040319442749, "logits/rejected": -1.511036992073059, "logps/chosen": -181.8535919189453, "logps/rejected": -260.3033142089844, "loss": 0.5114, "rewards/accuracies": 0.75, "rewards/chosen": -1.2552320957183838, "rewards/margins": 0.8234399557113647, "rewards/rejected": -2.078671932220459, "step": 4910 }, { "epoch": 0.85, "grad_norm": 25.741760898047566, "learning_rate": 3.5657541173197025e-07, "logits/chosen": -1.3761519193649292, "logits/rejected": -1.3280622959136963, "logps/chosen": -193.3008575439453, "logps/rejected": -278.3187561035156, "loss": 0.494, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.394696593284607, "rewards/margins": 0.8541049957275391, "rewards/rejected": -2.2488017082214355, "step": 4920 }, { "epoch": 0.85, "grad_norm": 27.217333252692796, "learning_rate": 3.558948731442918e-07, "logits/chosen": -1.5090538263320923, "logits/rejected": -1.461111307144165, "logps/chosen": -210.60617065429688, "logps/rejected": -291.8194885253906, "loss": 0.5621, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.5748202800750732, "rewards/margins": 0.7634907960891724, "rewards/rejected": -2.338311195373535, "step": 4930 }, { "epoch": 0.85, "grad_norm": 22.65567566910284, "learning_rate": 3.5521337694112177e-07, "logits/chosen": -1.4714148044586182, "logits/rejected": -1.4120241403579712, "logps/chosen": -215.1482391357422, "logps/rejected": -304.5157775878906, "loss": 0.4672, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.592576265335083, "rewards/margins": 0.9204828143119812, "rewards/rejected": -2.513059139251709, "step": 4940 }, { "epoch": 0.85, "grad_norm": 18.92994086706776, "learning_rate": 3.5453092928528283e-07, "logits/chosen": -1.2949804067611694, "logits/rejected": -1.252745270729065, "logps/chosen": -194.73434448242188, "logps/rejected": -266.51446533203125, "loss": 0.5488, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.3993715047836304, "rewards/margins": 0.7247661352157593, "rewards/rejected": -2.1241374015808105, "step": 4950 }, { "epoch": 0.85, "grad_norm": 29.05218033955657, "learning_rate": 3.538475363482017e-07, "logits/chosen": -1.4200494289398193, "logits/rejected": -1.379931926727295, "logps/chosen": -191.14431762695312, "logps/rejected": -275.96734619140625, "loss": 0.4846, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.3891611099243164, "rewards/margins": 0.8065220713615417, "rewards/rejected": -2.195683240890503, "step": 4960 }, { "epoch": 0.86, "grad_norm": 25.53487607610273, "learning_rate": 3.531632043098533e-07, "logits/chosen": -1.3623136281967163, "logits/rejected": -1.316384196281433, "logps/chosen": -199.2927703857422, "logps/rejected": -290.8288269042969, "loss": 0.4895, "rewards/accuracies": 0.75, "rewards/chosen": -1.4792590141296387, "rewards/margins": 0.9106336832046509, "rewards/rejected": -2.389892578125, "step": 4970 }, { "epoch": 0.86, "grad_norm": 18.067653883232882, "learning_rate": 3.5247793935870493e-07, "logits/chosen": -1.3661185503005981, "logits/rejected": -1.318273663520813, "logps/chosen": -203.88742065429688, "logps/rejected": -304.10009765625, "loss": 0.4466, "rewards/accuracies": 0.78125, "rewards/chosen": -1.510157823562622, "rewards/margins": 0.9821771383285522, "rewards/rejected": -2.492335081100464, "step": 4980 }, { "epoch": 0.86, "grad_norm": 25.76809031221678, "learning_rate": 3.5179174769166036e-07, "logits/chosen": -1.2775933742523193, "logits/rejected": -1.2376461029052734, "logps/chosen": -229.0320587158203, "logps/rejected": -312.44122314453125, "loss": 0.5803, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.7728198766708374, "rewards/margins": 0.8028677701950073, "rewards/rejected": -2.575687885284424, "step": 4990 }, { "epoch": 0.86, "grad_norm": 24.778358441979794, "learning_rate": 3.511046355140036e-07, "logits/chosen": -1.2975661754608154, "logits/rejected": -1.2417397499084473, "logps/chosen": -212.7232666015625, "logps/rejected": -299.6105041503906, "loss": 0.5112, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.5695029497146606, "rewards/margins": 0.8998052477836609, "rewards/rejected": -2.4693078994750977, "step": 5000 }, { "epoch": 0.86, "eval_logits/chosen": -1.427557110786438, "eval_logits/rejected": -1.4012691974639893, "eval_logps/chosen": -209.61801147460938, "eval_logps/rejected": -250.4540252685547, "eval_loss": 0.6177005171775818, "eval_rewards/accuracies": 0.6579925417900085, "eval_rewards/chosen": -1.5091416835784912, "eval_rewards/margins": 0.3638246953487396, "eval_rewards/rejected": -1.8729661703109741, "eval_runtime": 357.0332, "eval_samples_per_second": 12.055, "eval_steps_per_second": 1.507, "step": 5000 }, { "epoch": 0.86, "grad_norm": 23.044456102420437, "learning_rate": 3.5041660903934306e-07, "logits/chosen": -1.334160566329956, "logits/rejected": -1.2778995037078857, "logps/chosen": -218.86984252929688, "logps/rejected": -302.5597229003906, "loss": 0.4935, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.6585071086883545, "rewards/margins": 0.8564162254333496, "rewards/rejected": -2.514923572540283, "step": 5010 }, { "epoch": 0.86, "grad_norm": 31.632139485329414, "learning_rate": 3.4972767448955516e-07, "logits/chosen": -1.3136205673217773, "logits/rejected": -1.2596207857131958, "logps/chosen": -208.8480682373047, "logps/rejected": -286.65570068359375, "loss": 0.5453, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.5537054538726807, "rewards/margins": 0.7929602265357971, "rewards/rejected": -2.346665859222412, "step": 5020 }, { "epoch": 0.87, "grad_norm": 28.18486571020162, "learning_rate": 3.4903783809472793e-07, "logits/chosen": -1.2829835414886475, "logits/rejected": -1.240122675895691, "logps/chosen": -198.78387451171875, "logps/rejected": -277.5005187988281, "loss": 0.5452, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.4618786573410034, "rewards/margins": 0.7737663984298706, "rewards/rejected": -2.235645055770874, "step": 5030 }, { "epoch": 0.87, "grad_norm": 24.06279497070965, "learning_rate": 3.483471060931051e-07, "logits/chosen": -1.50538170337677, "logits/rejected": -1.4423930644989014, "logps/chosen": -200.87606811523438, "logps/rejected": -264.9050598144531, "loss": 0.5298, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.4371707439422607, "rewards/margins": 0.7078632116317749, "rewards/rejected": -2.145033836364746, "step": 5040 }, { "epoch": 0.87, "grad_norm": 22.362866544811133, "learning_rate": 3.4765548473102936e-07, "logits/chosen": -1.3779505491256714, "logits/rejected": -1.329679250717163, "logps/chosen": -202.9115753173828, "logps/rejected": -277.89459228515625, "loss": 0.534, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.4728914499282837, "rewards/margins": 0.7591592073440552, "rewards/rejected": -2.232050895690918, "step": 5050 }, { "epoch": 0.87, "grad_norm": 23.051911448956087, "learning_rate": 3.469629802628858e-07, "logits/chosen": -1.3045955896377563, "logits/rejected": -1.2757608890533447, "logps/chosen": -192.4661865234375, "logps/rejected": -255.61367797851562, "loss": 0.5956, "rewards/accuracies": 0.65625, "rewards/chosen": -1.4202029705047607, "rewards/margins": 0.6157991290092468, "rewards/rejected": -2.0360023975372314, "step": 5060 }, { "epoch": 0.87, "grad_norm": 24.88380616274711, "learning_rate": 3.4626959895104585e-07, "logits/chosen": -1.445326566696167, "logits/rejected": -1.3971054553985596, "logps/chosen": -176.08921813964844, "logps/rejected": -235.4571990966797, "loss": 0.5525, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.2336560487747192, "rewards/margins": 0.6401635408401489, "rewards/rejected": -1.8738195896148682, "step": 5070 }, { "epoch": 0.88, "grad_norm": 16.10357940315526, "learning_rate": 3.4557534706580997e-07, "logits/chosen": -1.690610647201538, "logits/rejected": -1.6249040365219116, "logps/chosen": -159.53741455078125, "logps/rejected": -228.35623168945312, "loss": 0.5054, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.020334005355835, "rewards/margins": 0.7543589472770691, "rewards/rejected": -1.7746931314468384, "step": 5080 }, { "epoch": 0.88, "grad_norm": 22.84540156375547, "learning_rate": 3.4488023088535144e-07, "logits/chosen": -1.5469788312911987, "logits/rejected": -1.4869216680526733, "logps/chosen": -163.7722625732422, "logps/rejected": -236.5236053466797, "loss": 0.4949, "rewards/accuracies": 0.78125, "rewards/chosen": -1.09481680393219, "rewards/margins": 0.7558841109275818, "rewards/rejected": -1.8507009744644165, "step": 5090 }, { "epoch": 0.88, "grad_norm": 24.117584748609755, "learning_rate": 3.4418425669565946e-07, "logits/chosen": -1.3648254871368408, "logits/rejected": -1.3106800317764282, "logps/chosen": -192.01718139648438, "logps/rejected": -246.06515502929688, "loss": 0.5746, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.3694689273834229, "rewards/margins": 0.5682665109634399, "rewards/rejected": -1.9377353191375732, "step": 5100 }, { "epoch": 0.88, "eval_logits/chosen": -1.5572093725204468, "eval_logits/rejected": -1.532787561416626, "eval_logps/chosen": -180.94764709472656, "eval_logps/rejected": -217.08363342285156, "eval_loss": 0.6200332641601562, "eval_rewards/accuracies": 0.6654275059700012, "eval_rewards/chosen": -1.2224379777908325, "eval_rewards/margins": 0.3168245851993561, "eval_rewards/rejected": -1.5392626523971558, "eval_runtime": 356.479, "eval_samples_per_second": 12.074, "eval_steps_per_second": 1.509, "step": 5100 }, { "epoch": 0.88, "grad_norm": 28.379638712637114, "learning_rate": 3.434874307904822e-07, "logits/chosen": -1.4629642963409424, "logits/rejected": -1.4055429697036743, "logps/chosen": -198.49514770507812, "logps/rejected": -260.3693542480469, "loss": 0.5568, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.4061576128005981, "rewards/margins": 0.6592377424240112, "rewards/rejected": -2.0653955936431885, "step": 5110 }, { "epoch": 0.88, "grad_norm": 33.84680091265571, "learning_rate": 3.427897594712699e-07, "logits/chosen": -1.5411012172698975, "logits/rejected": -1.4923017024993896, "logps/chosen": -190.82667541503906, "logps/rejected": -243.14395141601562, "loss": 0.5811, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.369838833808899, "rewards/margins": 0.558884859085083, "rewards/rejected": -1.928723931312561, "step": 5120 }, { "epoch": 0.88, "grad_norm": 21.2178772791051, "learning_rate": 3.4209124904711805e-07, "logits/chosen": -1.5400969982147217, "logits/rejected": -1.481069803237915, "logps/chosen": -191.25750732421875, "logps/rejected": -274.0238037109375, "loss": 0.4782, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.3769677877426147, "rewards/margins": 0.8635452389717102, "rewards/rejected": -2.2405130863189697, "step": 5130 }, { "epoch": 0.89, "grad_norm": 27.959322283199082, "learning_rate": 3.4139190583471025e-07, "logits/chosen": -1.50569748878479, "logits/rejected": -1.4533193111419678, "logps/chosen": -185.509521484375, "logps/rejected": -234.0798797607422, "loss": 0.5815, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2769523859024048, "rewards/margins": 0.5343953967094421, "rewards/rejected": -1.8113473653793335, "step": 5140 }, { "epoch": 0.89, "grad_norm": 20.960498750902655, "learning_rate": 3.4069173615826097e-07, "logits/chosen": -1.5694390535354614, "logits/rejected": -1.5354506969451904, "logps/chosen": -168.58926391601562, "logps/rejected": -225.9666748046875, "loss": 0.5677, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.154682993888855, "rewards/margins": 0.5564135313034058, "rewards/rejected": -1.7110967636108398, "step": 5150 }, { "epoch": 0.89, "grad_norm": 20.888551855019205, "learning_rate": 3.399907463494585e-07, "logits/chosen": -1.553257703781128, "logits/rejected": -1.5001866817474365, "logps/chosen": -163.6346435546875, "logps/rejected": -215.9080810546875, "loss": 0.5523, "rewards/accuracies": 0.71875, "rewards/chosen": -1.1223015785217285, "rewards/margins": 0.5567636489868164, "rewards/rejected": -1.6790653467178345, "step": 5160 }, { "epoch": 0.89, "grad_norm": 22.44450995498008, "learning_rate": 3.3928894274740773e-07, "logits/chosen": -1.5365890264511108, "logits/rejected": -1.4780817031860352, "logps/chosen": -159.61709594726562, "logps/rejected": -242.06234741210938, "loss": 0.4865, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.0430219173431396, "rewards/margins": 0.8591874837875366, "rewards/rejected": -1.9022095203399658, "step": 5170 }, { "epoch": 0.89, "grad_norm": 32.65259207620487, "learning_rate": 3.385863316985726e-07, "logits/chosen": -1.5846903324127197, "logits/rejected": -1.5513648986816406, "logps/chosen": -196.44918823242188, "logps/rejected": -248.993408203125, "loss": 0.5864, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.4105911254882812, "rewards/margins": 0.5344855785369873, "rewards/rejected": -1.9450767040252686, "step": 5180 }, { "epoch": 0.89, "grad_norm": 20.160509769449938, "learning_rate": 3.3788291955671887e-07, "logits/chosen": -1.4820839166641235, "logits/rejected": -1.4463526010513306, "logps/chosen": -182.4502716064453, "logps/rejected": -242.9739990234375, "loss": 0.5733, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.2962095737457275, "rewards/margins": 0.5948411226272583, "rewards/rejected": -1.8910505771636963, "step": 5190 }, { "epoch": 0.9, "grad_norm": 18.895506656949056, "learning_rate": 3.371787126828568e-07, "logits/chosen": -1.5754809379577637, "logits/rejected": -1.5293941497802734, "logps/chosen": -168.30630493164062, "logps/rejected": -236.78439331054688, "loss": 0.5138, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.1625381708145142, "rewards/margins": 0.6649240255355835, "rewards/rejected": -1.8274621963500977, "step": 5200 }, { "epoch": 0.9, "eval_logits/chosen": -1.6232234239578247, "eval_logits/rejected": -1.6006020307540894, "eval_logps/chosen": -162.8901824951172, "eval_logps/rejected": -195.02578735351562, "eval_loss": 0.6237266063690186, "eval_rewards/accuracies": 0.6605483293533325, "eval_rewards/chosen": -1.0418633222579956, "eval_rewards/margins": 0.2768208086490631, "eval_rewards/rejected": -1.3186841011047363, "eval_runtime": 357.1428, "eval_samples_per_second": 12.051, "eval_steps_per_second": 1.506, "step": 5200 }, { "epoch": 0.9, "grad_norm": 20.636102998390292, "learning_rate": 3.364737174451834e-07, "logits/chosen": -1.5026520490646362, "logits/rejected": -1.4632505178451538, "logps/chosen": -176.74197387695312, "logps/rejected": -229.8460693359375, "loss": 0.5734, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.25114107131958, "rewards/margins": 0.5474826097488403, "rewards/rejected": -1.7986234426498413, "step": 5210 }, { "epoch": 0.9, "grad_norm": 18.20246477650609, "learning_rate": 3.3576794021902476e-07, "logits/chosen": -1.5258533954620361, "logits/rejected": -1.4866435527801514, "logps/chosen": -168.38894653320312, "logps/rejected": -228.6901092529297, "loss": 0.5603, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.1579110622406006, "rewards/margins": 0.5693751573562622, "rewards/rejected": -1.7272861003875732, "step": 5220 }, { "epoch": 0.9, "grad_norm": 25.25518148404087, "learning_rate": 3.350613873867788e-07, "logits/chosen": -1.4658780097961426, "logits/rejected": -1.4225355386734009, "logps/chosen": -174.45970153808594, "logps/rejected": -261.13702392578125, "loss": 0.4966, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.1921513080596924, "rewards/margins": 0.858329176902771, "rewards/rejected": -2.050480365753174, "step": 5230 }, { "epoch": 0.9, "grad_norm": 22.578680932349204, "learning_rate": 3.343540653378571e-07, "logits/chosen": -1.4708452224731445, "logits/rejected": -1.409401535987854, "logps/chosen": -177.78860473632812, "logps/rejected": -269.25457763671875, "loss": 0.4701, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.2503529787063599, "rewards/margins": 0.9091068506240845, "rewards/rejected": -2.1594595909118652, "step": 5240 }, { "epoch": 0.9, "grad_norm": 21.94896818155034, "learning_rate": 3.3364598046862754e-07, "logits/chosen": -1.3917882442474365, "logits/rejected": -1.3478825092315674, "logps/chosen": -180.23175048828125, "logps/rejected": -260.63482666015625, "loss": 0.4965, "rewards/accuracies": 0.75, "rewards/chosen": -1.2926623821258545, "rewards/margins": 0.8049648404121399, "rewards/rejected": -2.0976271629333496, "step": 5250 }, { "epoch": 0.91, "grad_norm": 42.50229156823177, "learning_rate": 3.3293713918235594e-07, "logits/chosen": -1.4157629013061523, "logits/rejected": -1.3547483682632446, "logps/chosen": -195.6414337158203, "logps/rejected": -264.51416015625, "loss": 0.5465, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.3889554738998413, "rewards/margins": 0.741489827632904, "rewards/rejected": -2.1304454803466797, "step": 5260 }, { "epoch": 0.91, "grad_norm": 20.377525883555307, "learning_rate": 3.3222754788914875e-07, "logits/chosen": -1.5662615299224854, "logits/rejected": -1.526829719543457, "logps/chosen": -177.85626220703125, "logps/rejected": -250.97933959960938, "loss": 0.5173, "rewards/accuracies": 0.75, "rewards/chosen": -1.263085961341858, "rewards/margins": 0.735329270362854, "rewards/rejected": -1.998415231704712, "step": 5270 }, { "epoch": 0.91, "grad_norm": 24.732098744846958, "learning_rate": 3.315172130058946e-07, "logits/chosen": -1.4817497730255127, "logits/rejected": -1.4205673933029175, "logps/chosen": -187.76739501953125, "logps/rejected": -252.81613159179688, "loss": 0.5296, "rewards/accuracies": 0.71875, "rewards/chosen": -1.3341633081436157, "rewards/margins": 0.6874901056289673, "rewards/rejected": -2.021653413772583, "step": 5280 }, { "epoch": 0.91, "grad_norm": 26.624712530739988, "learning_rate": 3.308061409562065e-07, "logits/chosen": -1.4430485963821411, "logits/rejected": -1.378722906112671, "logps/chosen": -176.76901245117188, "logps/rejected": -246.05923461914062, "loss": 0.5194, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.209058165550232, "rewards/margins": 0.7315285801887512, "rewards/rejected": -1.940587043762207, "step": 5290 }, { "epoch": 0.91, "grad_norm": 16.5970454890233, "learning_rate": 3.300943381703639e-07, "logits/chosen": -1.4298001527786255, "logits/rejected": -1.3858981132507324, "logps/chosen": -189.7223358154297, "logps/rejected": -266.050537109375, "loss": 0.5094, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.3463001251220703, "rewards/margins": 0.7629293203353882, "rewards/rejected": -2.109229564666748, "step": 5300 }, { "epoch": 0.91, "eval_logits/chosen": -1.5428049564361572, "eval_logits/rejected": -1.5180394649505615, "eval_logps/chosen": -187.3815460205078, "eval_logps/rejected": -224.76116943359375, "eval_loss": 0.6180873513221741, "eval_rewards/accuracies": 0.6598513126373291, "eval_rewards/chosen": -1.2867772579193115, "eval_rewards/margins": 0.32926076650619507, "eval_rewards/rejected": -1.6160376071929932, "eval_runtime": 357.239, "eval_samples_per_second": 12.048, "eval_steps_per_second": 1.506, "step": 5300 }, { "epoch": 0.91, "grad_norm": 24.46549230872405, "learning_rate": 3.293818110852541e-07, "logits/chosen": -1.5138168334960938, "logits/rejected": -1.472394347190857, "logps/chosen": -205.1785430908203, "logps/rejected": -269.4486389160156, "loss": 0.5451, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.4704174995422363, "rewards/margins": 0.7003182172775269, "rewards/rejected": -2.1707355976104736, "step": 5310 }, { "epoch": 0.92, "grad_norm": 28.551168644730765, "learning_rate": 3.286685661443144e-07, "logits/chosen": -1.4450080394744873, "logits/rejected": -1.3684725761413574, "logps/chosen": -212.12887573242188, "logps/rejected": -286.6734924316406, "loss": 0.5018, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.5459760427474976, "rewards/margins": 0.8185034990310669, "rewards/rejected": -2.3644795417785645, "step": 5320 }, { "epoch": 0.92, "grad_norm": 21.620111584157225, "learning_rate": 3.2795460979747375e-07, "logits/chosen": -1.3988605737686157, "logits/rejected": -1.3602981567382812, "logps/chosen": -198.0777130126953, "logps/rejected": -299.89276123046875, "loss": 0.4887, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.4722723960876465, "rewards/margins": 0.9927380681037903, "rewards/rejected": -2.465010643005371, "step": 5330 }, { "epoch": 0.92, "grad_norm": 25.423142113773764, "learning_rate": 3.272399485010943e-07, "logits/chosen": -1.431849479675293, "logits/rejected": -1.3641244173049927, "logps/chosen": -201.8954620361328, "logps/rejected": -272.422607421875, "loss": 0.5224, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.4554879665374756, "rewards/margins": 0.7551096081733704, "rewards/rejected": -2.210597515106201, "step": 5340 }, { "epoch": 0.92, "grad_norm": 21.415159819448192, "learning_rate": 3.2652458871791326e-07, "logits/chosen": -1.4087716341018677, "logits/rejected": -1.361433982849121, "logps/chosen": -191.4141082763672, "logps/rejected": -257.26812744140625, "loss": 0.5519, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.3878710269927979, "rewards/margins": 0.6652384996414185, "rewards/rejected": -2.053109645843506, "step": 5350 }, { "epoch": 0.92, "grad_norm": 23.352557781202382, "learning_rate": 3.2580853691698417e-07, "logits/chosen": -1.5152844190597534, "logits/rejected": -1.4675962924957275, "logps/chosen": -186.24412536621094, "logps/rejected": -270.291015625, "loss": 0.5293, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.3303719758987427, "rewards/margins": 0.8078699111938477, "rewards/rejected": -2.138241767883301, "step": 5360 }, { "epoch": 0.93, "grad_norm": 27.08219440604054, "learning_rate": 3.250917995736187e-07, "logits/chosen": -1.4008272886276245, "logits/rejected": -1.349577784538269, "logps/chosen": -190.25677490234375, "logps/rejected": -279.8938293457031, "loss": 0.4707, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.383306622505188, "rewards/margins": 0.8817272186279297, "rewards/rejected": -2.2650339603424072, "step": 5370 }, { "epoch": 0.93, "grad_norm": 21.444506787758893, "learning_rate": 3.2437438316932766e-07, "logits/chosen": -1.4608103036880493, "logits/rejected": -1.4044318199157715, "logps/chosen": -212.57666015625, "logps/rejected": -279.16912841796875, "loss": 0.535, "rewards/accuracies": 0.71875, "rewards/chosen": -1.5518763065338135, "rewards/margins": 0.7239239811897278, "rewards/rejected": -2.2758002281188965, "step": 5380 }, { "epoch": 0.93, "grad_norm": 22.582981599970143, "learning_rate": 3.2365629419176294e-07, "logits/chosen": -1.422620415687561, "logits/rejected": -1.3589953184127808, "logps/chosen": -210.6852569580078, "logps/rejected": -277.79864501953125, "loss": 0.5383, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.5212723016738892, "rewards/margins": 0.7407953143119812, "rewards/rejected": -2.2620673179626465, "step": 5390 }, { "epoch": 0.93, "grad_norm": 24.413482194523574, "learning_rate": 3.229375391346585e-07, "logits/chosen": -1.4233802556991577, "logits/rejected": -1.3726685047149658, "logps/chosen": -182.7910919189453, "logps/rejected": -270.41156005859375, "loss": 0.4865, "rewards/accuracies": 0.75, "rewards/chosen": -1.3170902729034424, "rewards/margins": 0.8481825590133667, "rewards/rejected": -2.1652729511260986, "step": 5400 }, { "epoch": 0.93, "eval_logits/chosen": -1.5443414449691772, "eval_logits/rejected": -1.5196946859359741, "eval_logps/chosen": -181.3465576171875, "eval_logps/rejected": -217.53018188476562, "eval_loss": 0.6221857070922852, "eval_rewards/accuracies": 0.669842004776001, "eval_rewards/chosen": -1.2264270782470703, "eval_rewards/margins": 0.31730079650878906, "eval_rewards/rejected": -1.543727993965149, "eval_runtime": 357.1726, "eval_samples_per_second": 12.05, "eval_steps_per_second": 1.506, "step": 5400 }, { "epoch": 0.93, "grad_norm": 33.90762588243634, "learning_rate": 3.222181244977716e-07, "logits/chosen": -1.4560267925262451, "logits/rejected": -1.4195563793182373, "logps/chosen": -192.35723876953125, "logps/rejected": -252.6515350341797, "loss": 0.5492, "rewards/accuracies": 0.71875, "rewards/chosen": -1.3573839664459229, "rewards/margins": 0.6285881996154785, "rewards/rejected": -1.9859724044799805, "step": 5410 }, { "epoch": 0.93, "grad_norm": 25.43643547302594, "learning_rate": 3.2149805678682415e-07, "logits/chosen": -1.5208218097686768, "logits/rejected": -1.4746044874191284, "logps/chosen": -182.27589416503906, "logps/rejected": -257.0049743652344, "loss": 0.5247, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.2857364416122437, "rewards/margins": 0.7472572326660156, "rewards/rejected": -2.032993793487549, "step": 5420 }, { "epoch": 0.94, "grad_norm": 28.226137721023306, "learning_rate": 3.207773425134441e-07, "logits/chosen": -1.4794824123382568, "logits/rejected": -1.4373642206192017, "logps/chosen": -185.75901794433594, "logps/rejected": -257.13983154296875, "loss": 0.532, "rewards/accuracies": 0.75, "rewards/chosen": -1.3217830657958984, "rewards/margins": 0.7321761846542358, "rewards/rejected": -2.0539591312408447, "step": 5430 }, { "epoch": 0.94, "grad_norm": 21.150277719094746, "learning_rate": 3.2005598819510586e-07, "logits/chosen": -1.4646375179290771, "logits/rejected": -1.4225250482559204, "logps/chosen": -189.71932983398438, "logps/rejected": -262.24505615234375, "loss": 0.5265, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.3352781534194946, "rewards/margins": 0.7349643111228943, "rewards/rejected": -2.070242404937744, "step": 5440 }, { "epoch": 0.94, "grad_norm": 40.77880858071023, "learning_rate": 3.193340003550722e-07, "logits/chosen": -1.3812825679779053, "logits/rejected": -1.338728427886963, "logps/chosen": -192.5988006591797, "logps/rejected": -267.5910949707031, "loss": 0.525, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.3704335689544678, "rewards/margins": 0.7710081338882446, "rewards/rejected": -2.141441822052002, "step": 5450 }, { "epoch": 0.94, "grad_norm": 30.470434371984815, "learning_rate": 3.186113855223348e-07, "logits/chosen": -1.4694117307662964, "logits/rejected": -1.4298455715179443, "logps/chosen": -192.1426239013672, "logps/rejected": -244.8093719482422, "loss": 0.6017, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.3830455541610718, "rewards/margins": 0.544926106929779, "rewards/rejected": -1.9279718399047852, "step": 5460 }, { "epoch": 0.94, "grad_norm": 19.744210255270794, "learning_rate": 3.178881502315552e-07, "logits/chosen": -1.469347357749939, "logits/rejected": -1.4357693195343018, "logps/chosen": -171.6509246826172, "logps/rejected": -222.62744140625, "loss": 0.5921, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.2030577659606934, "rewards/margins": 0.5184643268585205, "rewards/rejected": -1.7215220928192139, "step": 5470 }, { "epoch": 0.94, "grad_norm": 21.30606288485756, "learning_rate": 3.1716430102300573e-07, "logits/chosen": -1.5191564559936523, "logits/rejected": -1.4697355031967163, "logps/chosen": -164.0375518798828, "logps/rejected": -234.484619140625, "loss": 0.5209, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.1111133098602295, "rewards/margins": 0.7175213694572449, "rewards/rejected": -1.8286346197128296, "step": 5480 }, { "epoch": 0.95, "grad_norm": 19.015649280887946, "learning_rate": 3.164398444425106e-07, "logits/chosen": -1.4912570714950562, "logits/rejected": -1.4432531595230103, "logps/chosen": -171.32119750976562, "logps/rejected": -226.35067749023438, "loss": 0.5431, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.1578484773635864, "rewards/margins": 0.5836814641952515, "rewards/rejected": -1.7415298223495483, "step": 5490 }, { "epoch": 0.95, "grad_norm": 25.985898470229376, "learning_rate": 3.157147870413864e-07, "logits/chosen": -1.5515010356903076, "logits/rejected": -1.512731909751892, "logps/chosen": -169.95750427246094, "logps/rejected": -240.7258758544922, "loss": 0.513, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.1616063117980957, "rewards/margins": 0.6991836428642273, "rewards/rejected": -1.8607898950576782, "step": 5500 }, { "epoch": 0.95, "eval_logits/chosen": -1.5876429080963135, "eval_logits/rejected": -1.5650734901428223, "eval_logps/chosen": -172.41822814941406, "eval_logps/rejected": -205.8068389892578, "eval_loss": 0.6214230060577393, "eval_rewards/accuracies": 0.6721654534339905, "eval_rewards/chosen": -1.137143850326538, "eval_rewards/margins": 0.289350688457489, "eval_rewards/rejected": -1.4264944791793823, "eval_runtime": 357.0862, "eval_samples_per_second": 12.053, "eval_steps_per_second": 1.507, "step": 5500 }, { "epoch": 0.95, "grad_norm": 23.17137980170567, "learning_rate": 3.1498913537638314e-07, "logits/chosen": -1.471665620803833, "logits/rejected": -1.4335734844207764, "logps/chosen": -186.6552734375, "logps/rejected": -242.85983276367188, "loss": 0.5717, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.3344801664352417, "rewards/margins": 0.566752552986145, "rewards/rejected": -1.9012327194213867, "step": 5510 }, { "epoch": 0.95, "grad_norm": 20.934842998393453, "learning_rate": 3.142628960096246e-07, "logits/chosen": -1.4280526638031006, "logits/rejected": -1.3803586959838867, "logps/chosen": -180.4522247314453, "logps/rejected": -248.35739135742188, "loss": 0.5239, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.246476411819458, "rewards/margins": 0.7218848466873169, "rewards/rejected": -1.9683609008789062, "step": 5520 }, { "epoch": 0.95, "grad_norm": 20.694293406108667, "learning_rate": 3.135360755085493e-07, "logits/chosen": -1.4679347276687622, "logits/rejected": -1.4120423793792725, "logps/chosen": -185.0923309326172, "logps/rejected": -251.6171875, "loss": 0.5145, "rewards/accuracies": 0.75, "rewards/chosen": -1.2520725727081299, "rewards/margins": 0.7337436079978943, "rewards/rejected": -1.9858160018920898, "step": 5530 }, { "epoch": 0.95, "grad_norm": 22.80776688821052, "learning_rate": 3.12808680445851e-07, "logits/chosen": -1.4971723556518555, "logits/rejected": -1.477506399154663, "logps/chosen": -170.65281677246094, "logps/rejected": -243.3797149658203, "loss": 0.5055, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.1831166744232178, "rewards/margins": 0.683652400970459, "rewards/rejected": -1.8667690753936768, "step": 5540 }, { "epoch": 0.96, "grad_norm": 21.44049349168216, "learning_rate": 3.1208071739941937e-07, "logits/chosen": -1.3374189138412476, "logits/rejected": -1.3035809993743896, "logps/chosen": -188.96694946289062, "logps/rejected": -238.453125, "loss": 0.6158, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.3638770580291748, "rewards/margins": 0.4801939129829407, "rewards/rejected": -1.8440710306167603, "step": 5550 }, { "epoch": 0.96, "grad_norm": 20.558896998020238, "learning_rate": 3.113521929522802e-07, "logits/chosen": -1.4649537801742554, "logits/rejected": -1.4129371643066406, "logps/chosen": -168.60670471191406, "logps/rejected": -243.099365234375, "loss": 0.5099, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.1673256158828735, "rewards/margins": 0.7692097425460815, "rewards/rejected": -1.9365352392196655, "step": 5560 }, { "epoch": 0.96, "grad_norm": 23.194806036236645, "learning_rate": 3.10623113692536e-07, "logits/chosen": -1.5026556253433228, "logits/rejected": -1.4706186056137085, "logps/chosen": -169.34744262695312, "logps/rejected": -237.5296630859375, "loss": 0.5547, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.1870052814483643, "rewards/margins": 0.6190119385719299, "rewards/rejected": -1.8060171604156494, "step": 5570 }, { "epoch": 0.96, "grad_norm": 17.077550864060385, "learning_rate": 3.0989348621330695e-07, "logits/chosen": -1.4042866230010986, "logits/rejected": -1.3619954586029053, "logps/chosen": -170.88133239746094, "logps/rejected": -240.86929321289062, "loss": 0.5208, "rewards/accuracies": 0.78125, "rewards/chosen": -1.1771609783172607, "rewards/margins": 0.6939356923103333, "rewards/rejected": -1.8710966110229492, "step": 5580 }, { "epoch": 0.96, "grad_norm": 41.03513118380479, "learning_rate": 3.091633171126704e-07, "logits/chosen": -1.5033903121948242, "logits/rejected": -1.4395246505737305, "logps/chosen": -188.55575561523438, "logps/rejected": -257.30426025390625, "loss": 0.5208, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.3172476291656494, "rewards/margins": 0.7042714953422546, "rewards/rejected": -2.021519184112549, "step": 5590 }, { "epoch": 0.96, "grad_norm": 33.09744249323354, "learning_rate": 3.0843261299360164e-07, "logits/chosen": -1.4485256671905518, "logits/rejected": -1.4189153909683228, "logps/chosen": -189.03610229492188, "logps/rejected": -262.5235595703125, "loss": 0.5474, "rewards/accuracies": 0.75, "rewards/chosen": -1.3653209209442139, "rewards/margins": 0.6869848966598511, "rewards/rejected": -2.0523059368133545, "step": 5600 }, { "epoch": 0.96, "eval_logits/chosen": -1.5346873998641968, "eval_logits/rejected": -1.5108610391616821, "eval_logps/chosen": -177.24856567382812, "eval_logps/rejected": -212.6680450439453, "eval_loss": 0.6200674772262573, "eval_rewards/accuracies": 0.6689126491546631, "eval_rewards/chosen": -1.185447096824646, "eval_rewards/margins": 0.30965960025787354, "eval_rewards/rejected": -1.49510657787323, "eval_runtime": 357.0504, "eval_samples_per_second": 12.054, "eval_steps_per_second": 1.507, "step": 5600 }, { "epoch": 0.97, "grad_norm": 28.45755729486924, "learning_rate": 3.077013804639144e-07, "logits/chosen": -1.4699697494506836, "logits/rejected": -1.4311306476593018, "logps/chosen": -185.23690795898438, "logps/rejected": -261.2013244628906, "loss": 0.5124, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.328597068786621, "rewards/margins": 0.7179481983184814, "rewards/rejected": -2.0465452671051025, "step": 5610 }, { "epoch": 0.97, "grad_norm": 28.676584357098996, "learning_rate": 3.069696261362008e-07, "logits/chosen": -1.3878097534179688, "logits/rejected": -1.3469122648239136, "logps/chosen": -203.050537109375, "logps/rejected": -258.06475830078125, "loss": 0.5663, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.4804455041885376, "rewards/margins": 0.5948622822761536, "rewards/rejected": -2.075307607650757, "step": 5620 }, { "epoch": 0.97, "grad_norm": 31.893058353519617, "learning_rate": 3.062373566277715e-07, "logits/chosen": -1.441892385482788, "logits/rejected": -1.3970489501953125, "logps/chosen": -198.53103637695312, "logps/rejected": -243.0982208251953, "loss": 0.5993, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.4262107610702515, "rewards/margins": 0.4929355978965759, "rewards/rejected": -1.9191462993621826, "step": 5630 }, { "epoch": 0.97, "grad_norm": 32.04723618634895, "learning_rate": 3.0550457856059596e-07, "logits/chosen": -1.449190616607666, "logits/rejected": -1.4080677032470703, "logps/chosen": -168.21890258789062, "logps/rejected": -238.8227081298828, "loss": 0.5373, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.1427868604660034, "rewards/margins": 0.6845923662185669, "rewards/rejected": -1.8273794651031494, "step": 5640 }, { "epoch": 0.97, "grad_norm": 18.712266282878826, "learning_rate": 3.047712985612428e-07, "logits/chosen": -1.3978092670440674, "logits/rejected": -1.3562982082366943, "logps/chosen": -176.45343017578125, "logps/rejected": -245.86154174804688, "loss": 0.5429, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.2446025609970093, "rewards/margins": 0.6920903921127319, "rewards/rejected": -1.9366929531097412, "step": 5650 }, { "epoch": 0.98, "grad_norm": 20.19600716165965, "learning_rate": 3.040375232608194e-07, "logits/chosen": -1.3913816213607788, "logits/rejected": -1.3549675941467285, "logps/chosen": -181.96463012695312, "logps/rejected": -260.1955261230469, "loss": 0.5016, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.3012346029281616, "rewards/margins": 0.7908525466918945, "rewards/rejected": -2.0920872688293457, "step": 5660 }, { "epoch": 0.98, "grad_norm": 21.953309399233106, "learning_rate": 3.0330325929491245e-07, "logits/chosen": -1.3647847175598145, "logits/rejected": -1.3191179037094116, "logps/chosen": -187.47390747070312, "logps/rejected": -257.4412841796875, "loss": 0.5044, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.3347058296203613, "rewards/margins": 0.7116448879241943, "rewards/rejected": -2.0463504791259766, "step": 5670 }, { "epoch": 0.98, "grad_norm": 40.5185950103975, "learning_rate": 3.0256851330352753e-07, "logits/chosen": -1.3821312189102173, "logits/rejected": -1.322158932685852, "logps/chosen": -210.1196746826172, "logps/rejected": -284.6582946777344, "loss": 0.5185, "rewards/accuracies": 0.71875, "rewards/chosen": -1.5572584867477417, "rewards/margins": 0.7730095982551575, "rewards/rejected": -2.330268383026123, "step": 5680 }, { "epoch": 0.98, "grad_norm": 17.952104758420326, "learning_rate": 3.0183329193102894e-07, "logits/chosen": -1.4393055438995361, "logits/rejected": -1.3816049098968506, "logps/chosen": -200.3620147705078, "logps/rejected": -273.0220642089844, "loss": 0.5014, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.423686146736145, "rewards/margins": 0.7898932695388794, "rewards/rejected": -2.2135794162750244, "step": 5690 }, { "epoch": 0.98, "grad_norm": 33.97947710759319, "learning_rate": 3.010976018260805e-07, "logits/chosen": -1.289398431777954, "logits/rejected": -1.246914267539978, "logps/chosen": -189.9264678955078, "logps/rejected": -259.13726806640625, "loss": 0.5291, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.3215687274932861, "rewards/margins": 0.7366959452629089, "rewards/rejected": -2.05826473236084, "step": 5700 }, { "epoch": 0.98, "eval_logits/chosen": -1.5449095964431763, "eval_logits/rejected": -1.5208981037139893, "eval_logps/chosen": -175.29298400878906, "eval_logps/rejected": -211.0419921875, "eval_loss": 0.6191110610961914, "eval_rewards/accuracies": 0.6696096658706665, "eval_rewards/chosen": -1.165891408920288, "eval_rewards/margins": 0.3129545748233795, "eval_rewards/rejected": -1.4788459539413452, "eval_runtime": 356.9489, "eval_samples_per_second": 12.058, "eval_steps_per_second": 1.507, "step": 5700 }, { "epoch": 0.98, "grad_norm": 24.082058815741895, "learning_rate": 3.003614496415843e-07, "logits/chosen": -1.501319169998169, "logits/rejected": -1.4549624919891357, "logps/chosen": -180.39666748046875, "logps/rejected": -245.5010223388672, "loss": 0.5398, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.2527682781219482, "rewards/margins": 0.6765493154525757, "rewards/rejected": -1.9293174743652344, "step": 5710 }, { "epoch": 0.99, "grad_norm": 18.19243173514077, "learning_rate": 2.996248420346211e-07, "logits/chosen": -1.4630482196807861, "logits/rejected": -1.40970778465271, "logps/chosen": -167.19842529296875, "logps/rejected": -250.3085479736328, "loss": 0.467, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.1519994735717773, "rewards/margins": 0.8528478741645813, "rewards/rejected": -2.004847288131714, "step": 5720 }, { "epoch": 0.99, "grad_norm": 33.44207769272372, "learning_rate": 2.988877856663905e-07, "logits/chosen": -1.5095856189727783, "logits/rejected": -1.4771087169647217, "logps/chosen": -190.52589416503906, "logps/rejected": -250.38720703125, "loss": 0.582, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.3880624771118164, "rewards/margins": 0.597137987613678, "rewards/rejected": -1.9852005243301392, "step": 5730 }, { "epoch": 0.99, "grad_norm": 23.692568889129813, "learning_rate": 2.9815028720214985e-07, "logits/chosen": -1.4424539804458618, "logits/rejected": -1.3772521018981934, "logps/chosen": -192.9495849609375, "logps/rejected": -276.883056640625, "loss": 0.4764, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.3374172449111938, "rewards/margins": 0.8872700929641724, "rewards/rejected": -2.224687099456787, "step": 5740 }, { "epoch": 0.99, "grad_norm": 17.064609183815648, "learning_rate": 2.974123533111545e-07, "logits/chosen": -1.580055594444275, "logits/rejected": -1.540281057357788, "logps/chosen": -191.0297393798828, "logps/rejected": -233.5084991455078, "loss": 0.5938, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.3438113927841187, "rewards/margins": 0.48881006240844727, "rewards/rejected": -1.8326218128204346, "step": 5750 }, { "epoch": 0.99, "grad_norm": 17.358709255280868, "learning_rate": 2.9667399066659756e-07, "logits/chosen": -1.5095783472061157, "logits/rejected": -1.4547650814056396, "logps/chosen": -172.166015625, "logps/rejected": -241.0043182373047, "loss": 0.511, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.1578861474990845, "rewards/margins": 0.6968024373054504, "rewards/rejected": -1.8546886444091797, "step": 5760 }, { "epoch": 0.99, "grad_norm": 20.70958250669779, "learning_rate": 2.959352059455492e-07, "logits/chosen": -1.4510507583618164, "logits/rejected": -1.3988720178604126, "logps/chosen": -165.02476501464844, "logps/rejected": -238.8711395263672, "loss": 0.5043, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.1318988800048828, "rewards/margins": 0.7165284156799316, "rewards/rejected": -1.848427176475525, "step": 5770 }, { "epoch": 1.0, "grad_norm": 26.960927481343827, "learning_rate": 2.9519600582889655e-07, "logits/chosen": -1.4297640323638916, "logits/rejected": -1.378154993057251, "logps/chosen": -178.6942138671875, "logps/rejected": -261.24639892578125, "loss": 0.4882, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.2625802755355835, "rewards/margins": 0.8104287385940552, "rewards/rejected": -2.0730090141296387, "step": 5780 }, { "epoch": 1.0, "grad_norm": 19.048196919185166, "learning_rate": 2.944563970012831e-07, "logits/chosen": -1.2947901487350464, "logits/rejected": -1.236037254333496, "logps/chosen": -189.5282440185547, "logps/rejected": -269.74005126953125, "loss": 0.4999, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.380592703819275, "rewards/margins": 0.7776483297348022, "rewards/rejected": -2.158240795135498, "step": 5790 }, { "epoch": 1.0, "grad_norm": 24.450393947587344, "learning_rate": 2.937163861510486e-07, "logits/chosen": -1.3695513010025024, "logits/rejected": -1.3113354444503784, "logps/chosen": -209.8120880126953, "logps/rejected": -304.4366149902344, "loss": 0.496, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.5908751487731934, "rewards/margins": 0.9317790865898132, "rewards/rejected": -2.5226542949676514, "step": 5800 }, { "epoch": 1.0, "eval_logits/chosen": -1.4435439109802246, "eval_logits/rejected": -1.41628098487854, "eval_logps/chosen": -210.4264678955078, "eval_logps/rejected": -253.47520446777344, "eval_loss": 0.6148089170455933, "eval_rewards/accuracies": 0.6679832935333252, "eval_rewards/chosen": -1.5172260999679565, "eval_rewards/margins": 0.3859521150588989, "eval_rewards/rejected": -1.903178334236145, "eval_runtime": 356.8738, "eval_samples_per_second": 12.06, "eval_steps_per_second": 1.508, "step": 5800 }, { "epoch": 1.0, "grad_norm": 18.008006484022793, "learning_rate": 2.9297597997016797e-07, "logits/chosen": -1.4246017932891846, "logits/rejected": -1.3686187267303467, "logps/chosen": -203.05320739746094, "logps/rejected": -305.4484558105469, "loss": 0.4592, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.490220308303833, "rewards/margins": 1.0287506580352783, "rewards/rejected": -2.5189712047576904, "step": 5810 }, { "epoch": 1.0, "grad_norm": 24.397505186303498, "learning_rate": 2.922351851541915e-07, "logits/chosen": -1.4257746934890747, "logits/rejected": -1.3525466918945312, "logps/chosen": -195.74807739257812, "logps/rejected": -306.99359130859375, "loss": 0.3977, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -1.4273375272750854, "rewards/margins": 1.1495184898376465, "rewards/rejected": -2.5768561363220215, "step": 5820 }, { "epoch": 1.0, "grad_norm": 34.799268956724895, "learning_rate": 2.914940084021836e-07, "logits/chosen": -1.297031283378601, "logits/rejected": -1.231827974319458, "logps/chosen": -198.88839721679688, "logps/rejected": -312.55792236328125, "loss": 0.442, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.4539258480072021, "rewards/margins": 1.143164873123169, "rewards/rejected": -2.59709095954895, "step": 5830 }, { "epoch": 1.01, "grad_norm": 16.077132102945612, "learning_rate": 2.907524564166628e-07, "logits/chosen": -1.3520994186401367, "logits/rejected": -1.2968170642852783, "logps/chosen": -193.8484344482422, "logps/rejected": -302.53302001953125, "loss": 0.4416, "rewards/accuracies": 0.8125, "rewards/chosen": -1.414527177810669, "rewards/margins": 1.065403699874878, "rewards/rejected": -2.4799306392669678, "step": 5840 }, { "epoch": 1.01, "grad_norm": 27.25246319010426, "learning_rate": 2.9001053590354076e-07, "logits/chosen": -1.410636067390442, "logits/rejected": -1.3458845615386963, "logps/chosen": -188.6303253173828, "logps/rejected": -307.2320861816406, "loss": 0.3943, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -1.3775842189788818, "rewards/margins": 1.1499364376068115, "rewards/rejected": -2.5275206565856934, "step": 5850 }, { "epoch": 1.01, "grad_norm": 18.14664982804058, "learning_rate": 2.8926825357206176e-07, "logits/chosen": -1.2408941984176636, "logits/rejected": -1.1875782012939453, "logps/chosen": -207.2787628173828, "logps/rejected": -331.25421142578125, "loss": 0.4206, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.5550321340560913, "rewards/margins": 1.2251521348953247, "rewards/rejected": -2.780184268951416, "step": 5860 }, { "epoch": 1.01, "grad_norm": 33.09906725415787, "learning_rate": 2.885256161347421e-07, "logits/chosen": -1.236800193786621, "logits/rejected": -1.174392580986023, "logps/chosen": -234.08859252929688, "logps/rejected": -356.24066162109375, "loss": 0.3899, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.7791268825531006, "rewards/margins": 1.2331987619400024, "rewards/rejected": -3.0123260021209717, "step": 5870 }, { "epoch": 1.01, "grad_norm": 29.54869321861051, "learning_rate": 2.877826303073094e-07, "logits/chosen": -1.2946747541427612, "logits/rejected": -1.2476125955581665, "logps/chosen": -216.5059814453125, "logps/rejected": -321.3334655761719, "loss": 0.4526, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.6175800561904907, "rewards/margins": 1.0643819570541382, "rewards/rejected": -2.681962251663208, "step": 5880 }, { "epoch": 1.01, "grad_norm": 23.87246950772011, "learning_rate": 2.870393028086416e-07, "logits/chosen": -1.3654061555862427, "logits/rejected": -1.3092124462127686, "logps/chosen": -201.2548828125, "logps/rejected": -316.09326171875, "loss": 0.4342, "rewards/accuracies": 0.8125, "rewards/chosen": -1.461004614830017, "rewards/margins": 1.118170142173767, "rewards/rejected": -2.579174518585205, "step": 5890 }, { "epoch": 1.02, "grad_norm": 22.072237513064117, "learning_rate": 2.8629564036070663e-07, "logits/chosen": -1.2765244245529175, "logits/rejected": -1.2124745845794678, "logps/chosen": -197.97320556640625, "logps/rejected": -324.934326171875, "loss": 0.3739, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.4230375289916992, "rewards/margins": 1.2795560359954834, "rewards/rejected": -2.7025935649871826, "step": 5900 }, { "epoch": 1.02, "eval_logits/chosen": -1.371607780456543, "eval_logits/rejected": -1.3429399728775024, "eval_logps/chosen": -213.2480010986328, "eval_logps/rejected": -259.2733459472656, "eval_loss": 0.621561586856842, "eval_rewards/accuracies": 0.6626393795013428, "eval_rewards/chosen": -1.5454415082931519, "eval_rewards/margins": 0.4157179594039917, "eval_rewards/rejected": -1.961159348487854, "eval_runtime": 356.8605, "eval_samples_per_second": 12.061, "eval_steps_per_second": 1.508, "step": 5900 }, { "epoch": 1.02, "grad_norm": 27.59715394921913, "learning_rate": 2.855516496885011e-07, "logits/chosen": -1.2443146705627441, "logits/rejected": -1.2075556516647339, "logps/chosen": -208.70321655273438, "logps/rejected": -310.6231689453125, "loss": 0.4883, "rewards/accuracies": 0.8125, "rewards/chosen": -1.5616544485092163, "rewards/margins": 0.9676151275634766, "rewards/rejected": -2.5292694568634033, "step": 5910 }, { "epoch": 1.02, "grad_norm": 24.263714171924033, "learning_rate": 2.848073375199901e-07, "logits/chosen": -1.2384252548217773, "logits/rejected": -1.181979775428772, "logps/chosen": -219.75973510742188, "logps/rejected": -325.8696594238281, "loss": 0.4625, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.6830384731292725, "rewards/margins": 1.0408580303192139, "rewards/rejected": -2.7238965034484863, "step": 5920 }, { "epoch": 1.02, "grad_norm": 20.723910025082144, "learning_rate": 2.8406271058604574e-07, "logits/chosen": -1.3165338039398193, "logits/rejected": -1.2699096202850342, "logps/chosen": -209.8754425048828, "logps/rejected": -316.329833984375, "loss": 0.4768, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.5963435173034668, "rewards/margins": 1.05552077293396, "rewards/rejected": -2.6518642902374268, "step": 5930 }, { "epoch": 1.02, "grad_norm": 35.79174059848847, "learning_rate": 2.833177756203868e-07, "logits/chosen": -1.3231611251831055, "logits/rejected": -1.2533804178237915, "logps/chosen": -185.56277465820312, "logps/rejected": -296.2825622558594, "loss": 0.4249, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.3131976127624512, "rewards/margins": 1.1194097995758057, "rewards/rejected": -2.4326071739196777, "step": 5940 }, { "epoch": 1.03, "grad_norm": 17.76388707818442, "learning_rate": 2.8257253935951754e-07, "logits/chosen": -1.2369143962860107, "logits/rejected": -1.1907278299331665, "logps/chosen": -180.17445373535156, "logps/rejected": -300.03424072265625, "loss": 0.3913, "rewards/accuracies": 0.84375, "rewards/chosen": -1.2647721767425537, "rewards/margins": 1.1959367990493774, "rewards/rejected": -2.4607090950012207, "step": 5950 }, { "epoch": 1.03, "grad_norm": 16.94373019662342, "learning_rate": 2.818270085426668e-07, "logits/chosen": -1.252617597579956, "logits/rejected": -1.1776127815246582, "logps/chosen": -212.00613403320312, "logps/rejected": -300.90032958984375, "loss": 0.4698, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.5370571613311768, "rewards/margins": 0.9539899826049805, "rewards/rejected": -2.4910473823547363, "step": 5960 }, { "epoch": 1.03, "grad_norm": 18.34387313974363, "learning_rate": 2.8108118991172715e-07, "logits/chosen": -1.2002298831939697, "logits/rejected": -1.1465680599212646, "logps/chosen": -217.5402374267578, "logps/rejected": -327.88226318359375, "loss": 0.4467, "rewards/accuracies": 0.8125, "rewards/chosen": -1.656275987625122, "rewards/margins": 1.1236770153045654, "rewards/rejected": -2.7799527645111084, "step": 5970 }, { "epoch": 1.03, "grad_norm": 36.226753471187656, "learning_rate": 2.8033509021119396e-07, "logits/chosen": -1.1955822706222534, "logits/rejected": -1.1503514051437378, "logps/chosen": -217.57955932617188, "logps/rejected": -343.6628112792969, "loss": 0.4415, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.6500743627548218, "rewards/margins": 1.2213424444198608, "rewards/rejected": -2.8714168071746826, "step": 5980 }, { "epoch": 1.03, "grad_norm": 31.27940424068028, "learning_rate": 2.795887161881043e-07, "logits/chosen": -1.2698607444763184, "logits/rejected": -1.196852207183838, "logps/chosen": -225.42538452148438, "logps/rejected": -327.5019226074219, "loss": 0.4639, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.7037813663482666, "rewards/margins": 1.0513932704925537, "rewards/rejected": -2.7551746368408203, "step": 5990 }, { "epoch": 1.03, "grad_norm": 25.514076708010567, "learning_rate": 2.7884207459197585e-07, "logits/chosen": -1.23202383518219, "logits/rejected": -1.166017770767212, "logps/chosen": -224.9745635986328, "logps/rejected": -359.5087585449219, "loss": 0.3835, "rewards/accuracies": 0.84375, "rewards/chosen": -1.7289674282073975, "rewards/margins": 1.3435934782028198, "rewards/rejected": -3.0725607872009277, "step": 6000 }, { "epoch": 1.03, "eval_logits/chosen": -1.3176610469818115, "eval_logits/rejected": -1.2868660688400269, "eval_logps/chosen": -241.43719482421875, "eval_logps/rejected": -294.40496826171875, "eval_loss": 0.6213955879211426, "eval_rewards/accuracies": 0.6670538783073425, "eval_rewards/chosen": -1.8273334503173828, "eval_rewards/margins": 0.48514264822006226, "eval_rewards/rejected": -2.3124759197235107, "eval_runtime": 356.8191, "eval_samples_per_second": 12.062, "eval_steps_per_second": 1.508, "step": 6000 }, { "epoch": 1.04, "grad_norm": 23.69085635543719, "learning_rate": 2.780951721747461e-07, "logits/chosen": -1.243060827255249, "logits/rejected": -1.194278359413147, "logps/chosen": -225.7394561767578, "logps/rejected": -336.02508544921875, "loss": 0.4742, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.72724187374115, "rewards/margins": 1.119065284729004, "rewards/rejected": -2.8463072776794434, "step": 6010 }, { "epoch": 1.04, "grad_norm": 19.041771832370905, "learning_rate": 2.7734801569071104e-07, "logits/chosen": -1.4446563720703125, "logits/rejected": -1.3703842163085938, "logps/chosen": -204.45094299316406, "logps/rejected": -318.0191345214844, "loss": 0.4287, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.50087308883667, "rewards/margins": 1.1805269718170166, "rewards/rejected": -2.6814000606536865, "step": 6020 }, { "epoch": 1.04, "grad_norm": 22.204161933617108, "learning_rate": 2.766006118964644e-07, "logits/chosen": -1.1446921825408936, "logits/rejected": -1.0945428609848022, "logps/chosen": -206.8660125732422, "logps/rejected": -310.853759765625, "loss": 0.451, "rewards/accuracies": 0.8125, "rewards/chosen": -1.5705788135528564, "rewards/margins": 1.0349514484405518, "rewards/rejected": -2.605530261993408, "step": 6030 }, { "epoch": 1.04, "grad_norm": 27.633296335888442, "learning_rate": 2.7585296755083615e-07, "logits/chosen": -1.3180968761444092, "logits/rejected": -1.2671663761138916, "logps/chosen": -198.31124877929688, "logps/rejected": -304.19488525390625, "loss": 0.43, "rewards/accuracies": 0.8125, "rewards/chosen": -1.4460715055465698, "rewards/margins": 1.0792269706726074, "rewards/rejected": -2.525298595428467, "step": 6040 }, { "epoch": 1.04, "grad_norm": 19.23129899129365, "learning_rate": 2.751050894148317e-07, "logits/chosen": -1.235442876815796, "logits/rejected": -1.174726963043213, "logps/chosen": -212.7599334716797, "logps/rejected": -319.74639892578125, "loss": 0.4224, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.5342434644699097, "rewards/margins": 1.1264307498931885, "rewards/rejected": -2.6606743335723877, "step": 6050 }, { "epoch": 1.04, "grad_norm": 21.993509562945054, "learning_rate": 2.743569842515707e-07, "logits/chosen": -1.2447845935821533, "logits/rejected": -1.1827826499938965, "logps/chosen": -215.64529418945312, "logps/rejected": -319.7334899902344, "loss": 0.4931, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.6215600967407227, "rewards/margins": 1.0563386678695679, "rewards/rejected": -2.67789888381958, "step": 6060 }, { "epoch": 1.05, "grad_norm": 31.01800262084724, "learning_rate": 2.7360865882622556e-07, "logits/chosen": -1.2382781505584717, "logits/rejected": -1.1739325523376465, "logps/chosen": -226.4764404296875, "logps/rejected": -340.14373779296875, "loss": 0.4489, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.7254207134246826, "rewards/margins": 1.1591459512710571, "rewards/rejected": -2.8845667839050293, "step": 6070 }, { "epoch": 1.05, "grad_norm": 23.645653500298494, "learning_rate": 2.728601199059609e-07, "logits/chosen": -1.2225624322891235, "logits/rejected": -1.1666558980941772, "logps/chosen": -230.73385620117188, "logps/rejected": -352.7666320800781, "loss": 0.4109, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.7167644500732422, "rewards/margins": 1.275223970413208, "rewards/rejected": -2.9919886589050293, "step": 6080 }, { "epoch": 1.05, "grad_norm": 26.05274957772241, "learning_rate": 2.7211137425987175e-07, "logits/chosen": -1.2456872463226318, "logits/rejected": -1.1820614337921143, "logps/chosen": -225.69967651367188, "logps/rejected": -366.41064453125, "loss": 0.3763, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.7430721521377563, "rewards/margins": 1.407641887664795, "rewards/rejected": -3.150714159011841, "step": 6090 }, { "epoch": 1.05, "grad_norm": 23.260564144653117, "learning_rate": 2.713624286589227e-07, "logits/chosen": -1.1914881467819214, "logits/rejected": -1.1363308429718018, "logps/chosen": -247.3178253173828, "logps/rejected": -392.1295471191406, "loss": 0.3822, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.905500054359436, "rewards/margins": 1.4772017002105713, "rewards/rejected": -3.382701873779297, "step": 6100 }, { "epoch": 1.05, "eval_logits/chosen": -1.247104287147522, "eval_logits/rejected": -1.2163046598434448, "eval_logps/chosen": -258.7976379394531, "eval_logps/rejected": -313.2447509765625, "eval_loss": 0.6230133771896362, "eval_rewards/accuracies": 0.6710036993026733, "eval_rewards/chosen": -2.0009379386901855, "eval_rewards/margins": 0.49993589520454407, "eval_rewards/rejected": -2.5008738040924072, "eval_runtime": 356.9611, "eval_samples_per_second": 12.057, "eval_steps_per_second": 1.507, "step": 6100 }, { "epoch": 1.05, "grad_norm": 20.111788335085762, "learning_rate": 2.7061328987588626e-07, "logits/chosen": -1.1539726257324219, "logits/rejected": -1.087749719619751, "logps/chosen": -263.44146728515625, "logps/rejected": -398.18914794921875, "loss": 0.4009, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.117790699005127, "rewards/margins": 1.3292558193206787, "rewards/rejected": -3.4470467567443848, "step": 6110 }, { "epoch": 1.05, "grad_norm": 39.46202657579613, "learning_rate": 2.6986396468528236e-07, "logits/chosen": -1.2154873609542847, "logits/rejected": -1.1202675104141235, "logps/chosen": -255.0636444091797, "logps/rejected": -415.2186584472656, "loss": 0.3857, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.007509469985962, "rewards/margins": 1.6099491119384766, "rewards/rejected": -3.6174583435058594, "step": 6120 }, { "epoch": 1.06, "grad_norm": 16.111988424606817, "learning_rate": 2.6911445986331634e-07, "logits/chosen": -1.1826080083847046, "logits/rejected": -1.1169893741607666, "logps/chosen": -239.3343048095703, "logps/rejected": -376.82037353515625, "loss": 0.3942, "rewards/accuracies": 0.8125, "rewards/chosen": -1.849416971206665, "rewards/margins": 1.3795721530914307, "rewards/rejected": -3.2289886474609375, "step": 6130 }, { "epoch": 1.06, "grad_norm": 27.334786962517168, "learning_rate": 2.68364782187818e-07, "logits/chosen": -1.2877238988876343, "logits/rejected": -1.229827642440796, "logps/chosen": -210.6566619873047, "logps/rejected": -323.015869140625, "loss": 0.4396, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.5760023593902588, "rewards/margins": 1.1295536756515503, "rewards/rejected": -2.7055559158325195, "step": 6140 }, { "epoch": 1.06, "grad_norm": 23.13483950909933, "learning_rate": 2.6761493843818027e-07, "logits/chosen": -1.24057936668396, "logits/rejected": -1.1909449100494385, "logps/chosen": -211.8172607421875, "logps/rejected": -319.30926513671875, "loss": 0.4605, "rewards/accuracies": 0.78125, "rewards/chosen": -1.5882833003997803, "rewards/margins": 1.1051770448684692, "rewards/rejected": -2.693460464477539, "step": 6150 }, { "epoch": 1.06, "grad_norm": 24.089174586100004, "learning_rate": 2.66864935395298e-07, "logits/chosen": -1.1712977886199951, "logits/rejected": -1.1290233135223389, "logps/chosen": -205.40530395507812, "logps/rejected": -303.6624450683594, "loss": 0.4751, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.5445663928985596, "rewards/margins": 0.9666827321052551, "rewards/rejected": -2.51124906539917, "step": 6160 }, { "epoch": 1.06, "grad_norm": 23.745589448999045, "learning_rate": 2.661147798415063e-07, "logits/chosen": -1.3031284809112549, "logits/rejected": -1.2508373260498047, "logps/chosen": -230.56103515625, "logps/rejected": -357.1623840332031, "loss": 0.4073, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.7110252380371094, "rewards/margins": 1.2960346937179565, "rewards/rejected": -3.0070605278015137, "step": 6170 }, { "epoch": 1.06, "grad_norm": 23.45984720617789, "learning_rate": 2.6536447856051964e-07, "logits/chosen": -1.2978737354278564, "logits/rejected": -1.2452610731124878, "logps/chosen": -238.743408203125, "logps/rejected": -339.7936706542969, "loss": 0.4798, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.805580735206604, "rewards/margins": 1.062530517578125, "rewards/rejected": -2.8681111335754395, "step": 6180 }, { "epoch": 1.07, "grad_norm": 24.825020766575623, "learning_rate": 2.646140383373704e-07, "logits/chosen": -1.304811716079712, "logits/rejected": -1.2447645664215088, "logps/chosen": -218.7591552734375, "logps/rejected": -337.76025390625, "loss": 0.394, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.6060594320297241, "rewards/margins": 1.2416613101959229, "rewards/rejected": -2.8477206230163574, "step": 6190 }, { "epoch": 1.07, "grad_norm": 22.348892489607106, "learning_rate": 2.6386346595834716e-07, "logits/chosen": -1.2410696744918823, "logits/rejected": -1.1765029430389404, "logps/chosen": -213.6040802001953, "logps/rejected": -335.80865478515625, "loss": 0.4249, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.5910637378692627, "rewards/margins": 1.2148936986923218, "rewards/rejected": -2.805957317352295, "step": 6200 }, { "epoch": 1.07, "eval_logits/chosen": -1.446341872215271, "eval_logits/rejected": -1.418765902519226, "eval_logps/chosen": -210.3596954345703, "eval_logps/rejected": -255.79803466796875, "eval_loss": 0.6216332912445068, "eval_rewards/accuracies": 0.6656598448753357, "eval_rewards/chosen": -1.5165584087371826, "eval_rewards/margins": 0.40984830260276794, "eval_rewards/rejected": -1.9264066219329834, "eval_runtime": 356.8025, "eval_samples_per_second": 12.063, "eval_steps_per_second": 1.508, "step": 6200 }, { "epoch": 1.07, "grad_norm": 34.90631179192167, "learning_rate": 2.631127682109338e-07, "logits/chosen": -1.3385263681411743, "logits/rejected": -1.277630090713501, "logps/chosen": -212.1560516357422, "logps/rejected": -317.8212890625, "loss": 0.4505, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.5736181735992432, "rewards/margins": 1.0757099390029907, "rewards/rejected": -2.6493279933929443, "step": 6210 }, { "epoch": 1.07, "grad_norm": 28.897418625202086, "learning_rate": 2.6236195188374797e-07, "logits/chosen": -1.3002517223358154, "logits/rejected": -1.2492867708206177, "logps/chosen": -213.46377563476562, "logps/rejected": -318.5871887207031, "loss": 0.4586, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.6133520603179932, "rewards/margins": 1.047262191772461, "rewards/rejected": -2.660614252090454, "step": 6220 }, { "epoch": 1.07, "grad_norm": 31.35968444075051, "learning_rate": 2.616110237664793e-07, "logits/chosen": -1.427841067314148, "logits/rejected": -1.3521463871002197, "logps/chosen": -208.4129180908203, "logps/rejected": -360.6506042480469, "loss": 0.3774, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.495847225189209, "rewards/margins": 1.5452907085418701, "rewards/rejected": -3.041138172149658, "step": 6230 }, { "epoch": 1.08, "grad_norm": 29.527066343948846, "learning_rate": 2.6085999064982873e-07, "logits/chosen": -1.2126820087432861, "logits/rejected": -1.143293023109436, "logps/chosen": -223.4793701171875, "logps/rejected": -341.6191711425781, "loss": 0.4529, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.6712300777435303, "rewards/margins": 1.2077951431274414, "rewards/rejected": -2.879025459289551, "step": 6240 }, { "epoch": 1.08, "grad_norm": 19.581487431832702, "learning_rate": 2.601088593254465e-07, "logits/chosen": -1.3335120677947998, "logits/rejected": -1.2712721824645996, "logps/chosen": -218.8231658935547, "logps/rejected": -323.0040283203125, "loss": 0.4925, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.6310733556747437, "rewards/margins": 1.0603678226470947, "rewards/rejected": -2.691441059112549, "step": 6250 }, { "epoch": 1.08, "grad_norm": 36.934927120147826, "learning_rate": 2.59357636585871e-07, "logits/chosen": -1.232089877128601, "logits/rejected": -1.1848324537277222, "logps/chosen": -211.7543487548828, "logps/rejected": -297.58697509765625, "loss": 0.4911, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.578609824180603, "rewards/margins": 0.8933493494987488, "rewards/rejected": -2.471959352493286, "step": 6260 }, { "epoch": 1.08, "grad_norm": 22.671481469710727, "learning_rate": 2.5860632922446737e-07, "logits/chosen": -1.5191317796707153, "logits/rejected": -1.4832508563995361, "logps/chosen": -200.9024658203125, "logps/rejected": -307.0449523925781, "loss": 0.4744, "rewards/accuracies": 0.78125, "rewards/chosen": -1.479931116104126, "rewards/margins": 1.0443861484527588, "rewards/rejected": -2.524317502975464, "step": 6270 }, { "epoch": 1.08, "grad_norm": 22.47288317464536, "learning_rate": 2.578549440353659e-07, "logits/chosen": -1.2445075511932373, "logits/rejected": -1.1983692646026611, "logps/chosen": -185.7981414794922, "logps/rejected": -284.7179260253906, "loss": 0.4304, "rewards/accuracies": 0.84375, "rewards/chosen": -1.3140432834625244, "rewards/margins": 1.0069011449813843, "rewards/rejected": -2.320944309234619, "step": 6280 }, { "epoch": 1.08, "grad_norm": 26.874037522701656, "learning_rate": 2.571034878134007e-07, "logits/chosen": -1.3063162565231323, "logits/rejected": -1.2508251667022705, "logps/chosen": -197.89691162109375, "logps/rejected": -304.0697937011719, "loss": 0.4268, "rewards/accuracies": 0.8125, "rewards/chosen": -1.465261459350586, "rewards/margins": 1.0520577430725098, "rewards/rejected": -2.5173192024230957, "step": 6290 }, { "epoch": 1.09, "grad_norm": 34.25929825482152, "learning_rate": 2.5635196735404816e-07, "logits/chosen": -1.327014446258545, "logits/rejected": -1.273252248764038, "logps/chosen": -216.51626586914062, "logps/rejected": -310.141845703125, "loss": 0.4731, "rewards/accuracies": 0.78125, "rewards/chosen": -1.640974760055542, "rewards/margins": 0.9403258562088013, "rewards/rejected": -2.581300973892212, "step": 6300 }, { "epoch": 1.09, "eval_logits/chosen": -1.4054533243179321, "eval_logits/rejected": -1.3767914772033691, "eval_logps/chosen": -229.14906311035156, "eval_logps/rejected": -278.4627685546875, "eval_loss": 0.6205867528915405, "eval_rewards/accuracies": 0.6654275059700012, "eval_rewards/chosen": -1.7044522762298584, "eval_rewards/margins": 0.4486016631126404, "eval_rewards/rejected": -2.1530539989471436, "eval_runtime": 356.7832, "eval_samples_per_second": 12.063, "eval_steps_per_second": 1.508, "step": 6300 }, { "epoch": 1.09, "grad_norm": 30.63136108992965, "learning_rate": 2.5560038945336583e-07, "logits/chosen": -1.2807663679122925, "logits/rejected": -1.213196039199829, "logps/chosen": -209.709716796875, "logps/rejected": -319.9820861816406, "loss": 0.4534, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.5926458835601807, "rewards/margins": 1.0916332006454468, "rewards/rejected": -2.684279203414917, "step": 6310 }, { "epoch": 1.09, "grad_norm": 18.736489627828945, "learning_rate": 2.548487609079305e-07, "logits/chosen": -1.2793110609054565, "logits/rejected": -1.2298452854156494, "logps/chosen": -228.3084716796875, "logps/rejected": -330.50390625, "loss": 0.4929, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.7359955310821533, "rewards/margins": 1.0360920429229736, "rewards/rejected": -2.772087574005127, "step": 6320 }, { "epoch": 1.09, "grad_norm": 24.122550114272617, "learning_rate": 2.5409708851477687e-07, "logits/chosen": -1.316935658454895, "logits/rejected": -1.2505112886428833, "logps/chosen": -206.8124237060547, "logps/rejected": -340.0397644042969, "loss": 0.3752, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.5354225635528564, "rewards/margins": 1.3291146755218506, "rewards/rejected": -2.864537477493286, "step": 6330 }, { "epoch": 1.09, "grad_norm": 24.00191172899993, "learning_rate": 2.533453790713363e-07, "logits/chosen": -1.3309152126312256, "logits/rejected": -1.2744200229644775, "logps/chosen": -205.3987274169922, "logps/rejected": -318.1818542480469, "loss": 0.4432, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.4982860088348389, "rewards/margins": 1.1549437046051025, "rewards/rejected": -2.6532297134399414, "step": 6340 }, { "epoch": 1.09, "grad_norm": 37.256774507075455, "learning_rate": 2.5259363937537523e-07, "logits/chosen": -1.2830774784088135, "logits/rejected": -1.2399280071258545, "logps/chosen": -215.0625, "logps/rejected": -328.13604736328125, "loss": 0.4306, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.5956099033355713, "rewards/margins": 1.1229556798934937, "rewards/rejected": -2.7185654640197754, "step": 6350 }, { "epoch": 1.1, "grad_norm": 24.16860767130257, "learning_rate": 2.5184187622493356e-07, "logits/chosen": -1.2492659091949463, "logits/rejected": -1.1924443244934082, "logps/chosen": -213.53012084960938, "logps/rejected": -354.2420959472656, "loss": 0.3785, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.6007035970687866, "rewards/margins": 1.3961331844329834, "rewards/rejected": -2.9968369007110596, "step": 6360 }, { "epoch": 1.1, "grad_norm": 30.09884792748387, "learning_rate": 2.510900964182635e-07, "logits/chosen": -1.2614082098007202, "logits/rejected": -1.2243916988372803, "logps/chosen": -221.001708984375, "logps/rejected": -329.3047180175781, "loss": 0.4579, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.6786903142929077, "rewards/margins": 1.0699204206466675, "rewards/rejected": -2.7486109733581543, "step": 6370 }, { "epoch": 1.1, "grad_norm": 26.54672845989167, "learning_rate": 2.503383067537674e-07, "logits/chosen": -1.3264938592910767, "logits/rejected": -1.2688075304031372, "logps/chosen": -207.5281524658203, "logps/rejected": -336.74346923828125, "loss": 0.3866, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -1.5481336116790771, "rewards/margins": 1.2540295124053955, "rewards/rejected": -2.8021631240844727, "step": 6380 }, { "epoch": 1.1, "grad_norm": 26.92277033670455, "learning_rate": 2.495865140299374e-07, "logits/chosen": -1.364383578300476, "logits/rejected": -1.2942759990692139, "logps/chosen": -213.89453125, "logps/rejected": -337.72857666015625, "loss": 0.4155, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.5782573223114014, "rewards/margins": 1.2891353368759155, "rewards/rejected": -2.8673927783966064, "step": 6390 }, { "epoch": 1.1, "grad_norm": 26.35121867365253, "learning_rate": 2.4883472504529284e-07, "logits/chosen": -1.2807561159133911, "logits/rejected": -1.2216075658798218, "logps/chosen": -221.13613891601562, "logps/rejected": -342.654296875, "loss": 0.4089, "rewards/accuracies": 0.8125, "rewards/chosen": -1.6344629526138306, "rewards/margins": 1.2270433902740479, "rewards/rejected": -2.8615059852600098, "step": 6400 }, { "epoch": 1.1, "eval_logits/chosen": -1.32829749584198, "eval_logits/rejected": -1.2985448837280273, "eval_logps/chosen": -253.03562927246094, "eval_logps/rejected": -306.45611572265625, "eval_loss": 0.6263204216957092, "eval_rewards/accuracies": 0.6642658114433289, "eval_rewards/chosen": -1.9433181285858154, "eval_rewards/margins": 0.48966917395591736, "eval_rewards/rejected": -2.4329869747161865, "eval_runtime": 356.7188, "eval_samples_per_second": 12.066, "eval_steps_per_second": 1.508, "step": 6400 }, { "epoch": 1.1, "grad_norm": 35.49752834329857, "learning_rate": 2.480829465983194e-07, "logits/chosen": -1.3197977542877197, "logits/rejected": -1.2659125328063965, "logps/chosen": -263.07965087890625, "logps/rejected": -380.5269470214844, "loss": 0.4862, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.0871684551239014, "rewards/margins": 1.2009137868881226, "rewards/rejected": -3.2880821228027344, "step": 6410 }, { "epoch": 1.11, "grad_norm": 32.67436318673992, "learning_rate": 2.473311854874075e-07, "logits/chosen": -1.321010947227478, "logits/rejected": -1.269235372543335, "logps/chosen": -245.9833984375, "logps/rejected": -353.6158142089844, "loss": 0.4987, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.912111520767212, "rewards/margins": 1.085077166557312, "rewards/rejected": -2.9971888065338135, "step": 6420 }, { "epoch": 1.11, "grad_norm": 23.752696598571593, "learning_rate": 2.4657944851079076e-07, "logits/chosen": -1.2947794198989868, "logits/rejected": -1.2474058866500854, "logps/chosen": -207.3226776123047, "logps/rejected": -308.37408447265625, "loss": 0.4638, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.5694684982299805, "rewards/margins": 1.0322576761245728, "rewards/rejected": -2.6017260551452637, "step": 6430 }, { "epoch": 1.11, "grad_norm": 23.030575066990085, "learning_rate": 2.458277424664845e-07, "logits/chosen": -1.339413046836853, "logits/rejected": -1.2803711891174316, "logps/chosen": -209.6329345703125, "logps/rejected": -334.5424499511719, "loss": 0.3994, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.5639805793762207, "rewards/margins": 1.2495237588882446, "rewards/rejected": -2.813504219055176, "step": 6440 }, { "epoch": 1.11, "grad_norm": 28.704711169531485, "learning_rate": 2.450760741522244e-07, "logits/chosen": -1.3053383827209473, "logits/rejected": -1.2392146587371826, "logps/chosen": -219.56298828125, "logps/rejected": -331.5205078125, "loss": 0.4508, "rewards/accuracies": 0.75, "rewards/chosen": -1.632362723350525, "rewards/margins": 1.160873532295227, "rewards/rejected": -2.793236255645752, "step": 6450 }, { "epoch": 1.11, "grad_norm": 32.40806253150715, "learning_rate": 2.443244503654047e-07, "logits/chosen": -1.2578837871551514, "logits/rejected": -1.2153687477111816, "logps/chosen": -222.15573120117188, "logps/rejected": -370.3708190917969, "loss": 0.3719, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.7027851343154907, "rewards/margins": 1.4216923713684082, "rewards/rejected": -3.1244776248931885, "step": 6460 }, { "epoch": 1.11, "grad_norm": 27.914141548982084, "learning_rate": 2.4357287790301755e-07, "logits/chosen": -1.2337547540664673, "logits/rejected": -1.178056001663208, "logps/chosen": -217.49169921875, "logps/rejected": -328.74554443359375, "loss": 0.4362, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.6474285125732422, "rewards/margins": 1.122659683227539, "rewards/rejected": -2.7700884342193604, "step": 6470 }, { "epoch": 1.12, "grad_norm": 31.48390506247595, "learning_rate": 2.428213635615902e-07, "logits/chosen": -1.3232189416885376, "logits/rejected": -1.2600330114364624, "logps/chosen": -222.95278930664062, "logps/rejected": -329.962158203125, "loss": 0.4351, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.6635528802871704, "rewards/margins": 1.1314866542816162, "rewards/rejected": -2.795039415359497, "step": 6480 }, { "epoch": 1.12, "grad_norm": 20.030188531568935, "learning_rate": 2.420699141371251e-07, "logits/chosen": -1.4895018339157104, "logits/rejected": -1.4343178272247314, "logps/chosen": -234.48953247070312, "logps/rejected": -364.4090881347656, "loss": 0.4536, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.8089115619659424, "rewards/margins": 1.2942219972610474, "rewards/rejected": -3.1031336784362793, "step": 6490 }, { "epoch": 1.12, "grad_norm": 27.831603619813574, "learning_rate": 2.41318536425037e-07, "logits/chosen": -1.3722602128982544, "logits/rejected": -1.3266656398773193, "logps/chosen": -211.3208770751953, "logps/rejected": -318.9163818359375, "loss": 0.4055, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.551661491394043, "rewards/margins": 1.1104365587234497, "rewards/rejected": -2.6620981693267822, "step": 6500 }, { "epoch": 1.12, "eval_logits/chosen": -1.4495809078216553, "eval_logits/rejected": -1.4227662086486816, "eval_logps/chosen": -220.26852416992188, "eval_logps/rejected": -266.0024108886719, "eval_loss": 0.6262578964233398, "eval_rewards/accuracies": 0.6656598448753357, "eval_rewards/chosen": -1.6156466007232666, "eval_rewards/margins": 0.4128037095069885, "eval_rewards/rejected": -2.0284502506256104, "eval_runtime": 356.7396, "eval_samples_per_second": 12.065, "eval_steps_per_second": 1.508, "step": 6500 }, { "epoch": 1.12, "grad_norm": 30.446262958118353, "learning_rate": 2.4056723722009243e-07, "logits/chosen": -1.3711057901382446, "logits/rejected": -1.291212797164917, "logps/chosen": -215.440673828125, "logps/rejected": -331.94805908203125, "loss": 0.4238, "rewards/accuracies": 0.8125, "rewards/chosen": -1.5865962505340576, "rewards/margins": 1.203033447265625, "rewards/rejected": -2.7896294593811035, "step": 6510 }, { "epoch": 1.12, "grad_norm": 22.937766573953056, "learning_rate": 2.39816023316348e-07, "logits/chosen": -1.3640129566192627, "logits/rejected": -1.3016841411590576, "logps/chosen": -203.5388641357422, "logps/rejected": -329.3097839355469, "loss": 0.3885, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.499146819114685, "rewards/margins": 1.2792606353759766, "rewards/rejected": -2.778407573699951, "step": 6520 }, { "epoch": 1.13, "grad_norm": 20.155288956338, "learning_rate": 2.3906490150708894e-07, "logits/chosen": -1.3035330772399902, "logits/rejected": -1.2258248329162598, "logps/chosen": -207.00711059570312, "logps/rejected": -357.16375732421875, "loss": 0.3594, "rewards/accuracies": 0.84375, "rewards/chosen": -1.533954381942749, "rewards/margins": 1.5234779119491577, "rewards/rejected": -3.057432174682617, "step": 6530 }, { "epoch": 1.13, "grad_norm": 38.1595216556655, "learning_rate": 2.3831387858476739e-07, "logits/chosen": -1.3005788326263428, "logits/rejected": -1.2370128631591797, "logps/chosen": -241.73861694335938, "logps/rejected": -354.9060363769531, "loss": 0.4724, "rewards/accuracies": 0.71875, "rewards/chosen": -1.8435420989990234, "rewards/margins": 1.1428686380386353, "rewards/rejected": -2.986410617828369, "step": 6540 }, { "epoch": 1.13, "grad_norm": 32.72488280276923, "learning_rate": 2.3756296134094176e-07, "logits/chosen": -1.2309355735778809, "logits/rejected": -1.1715677976608276, "logps/chosen": -238.2613067626953, "logps/rejected": -352.8631286621094, "loss": 0.45, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.8673267364501953, "rewards/margins": 1.1446090936660767, "rewards/rejected": -3.0119359493255615, "step": 6550 }, { "epoch": 1.13, "grad_norm": 21.509539455997288, "learning_rate": 2.368121565662142e-07, "logits/chosen": -1.372521162033081, "logits/rejected": -1.3001985549926758, "logps/chosen": -221.03701782226562, "logps/rejected": -343.4317626953125, "loss": 0.4243, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.6504312753677368, "rewards/margins": 1.2656335830688477, "rewards/rejected": -2.916064739227295, "step": 6560 }, { "epoch": 1.13, "grad_norm": 26.05014427851963, "learning_rate": 2.3606147105017037e-07, "logits/chosen": -1.3940800428390503, "logits/rejected": -1.323072910308838, "logps/chosen": -222.31124877929688, "logps/rejected": -340.05926513671875, "loss": 0.4147, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.6877048015594482, "rewards/margins": 1.1861732006072998, "rewards/rejected": -2.873878002166748, "step": 6570 }, { "epoch": 1.13, "grad_norm": 27.019728234474087, "learning_rate": 2.3531091158131702e-07, "logits/chosen": -1.4203673601150513, "logits/rejected": -1.349675178527832, "logps/chosen": -210.45614624023438, "logps/rejected": -316.55548095703125, "loss": 0.4356, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.5541622638702393, "rewards/margins": 1.1116034984588623, "rewards/rejected": -2.6657655239105225, "step": 6580 }, { "epoch": 1.14, "grad_norm": 20.654019350018757, "learning_rate": 2.3456048494702133e-07, "logits/chosen": -1.360848069190979, "logits/rejected": -1.2936543226242065, "logps/chosen": -213.0439453125, "logps/rejected": -346.81463623046875, "loss": 0.4131, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.6021496057510376, "rewards/margins": 1.3312013149261475, "rewards/rejected": -2.9333510398864746, "step": 6590 }, { "epoch": 1.14, "grad_norm": 39.76165527082194, "learning_rate": 2.3381019793344897e-07, "logits/chosen": -1.4293988943099976, "logits/rejected": -1.3686877489089966, "logps/chosen": -219.72946166992188, "logps/rejected": -342.87799072265625, "loss": 0.4373, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.6410694122314453, "rewards/margins": 1.252076268196106, "rewards/rejected": -2.893145799636841, "step": 6600 }, { "epoch": 1.14, "eval_logits/chosen": -1.4153751134872437, "eval_logits/rejected": -1.3869836330413818, "eval_logps/chosen": -250.33335876464844, "eval_logps/rejected": -302.05145263671875, "eval_loss": 0.6318928003311157, "eval_rewards/accuracies": 0.6614776849746704, "eval_rewards/chosen": -1.9162949323654175, "eval_rewards/margins": 0.4726457893848419, "eval_rewards/rejected": -2.3889405727386475, "eval_runtime": 356.6448, "eval_samples_per_second": 12.068, "eval_steps_per_second": 1.509, "step": 6600 }, { "epoch": 1.14, "grad_norm": 18.9597145646942, "learning_rate": 2.3306005732550337e-07, "logits/chosen": -1.3483235836029053, "logits/rejected": -1.287246584892273, "logps/chosen": -246.5947265625, "logps/rejected": -377.0596618652344, "loss": 0.3971, "rewards/accuracies": 0.8125, "rewards/chosen": -1.9461562633514404, "rewards/margins": 1.3069889545440674, "rewards/rejected": -3.253145217895508, "step": 6610 }, { "epoch": 1.14, "grad_norm": 34.30473163237968, "learning_rate": 2.3231006990676365e-07, "logits/chosen": -1.3247897624969482, "logits/rejected": -1.2637712955474854, "logps/chosen": -252.6153106689453, "logps/rejected": -367.00933837890625, "loss": 0.4838, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.0022428035736084, "rewards/margins": 1.1463488340377808, "rewards/rejected": -3.148591995239258, "step": 6620 }, { "epoch": 1.14, "grad_norm": 27.70016442005097, "learning_rate": 2.3156024245942394e-07, "logits/chosen": -1.3318690061569214, "logits/rejected": -1.2696187496185303, "logps/chosen": -205.23562622070312, "logps/rejected": -317.63372802734375, "loss": 0.4009, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.5194008350372314, "rewards/margins": 1.1651160717010498, "rewards/rejected": -2.6845173835754395, "step": 6630 }, { "epoch": 1.14, "grad_norm": 19.533684280783113, "learning_rate": 2.3081058176423148e-07, "logits/chosen": -1.4036105871200562, "logits/rejected": -1.3376753330230713, "logps/chosen": -224.6147003173828, "logps/rejected": -337.85577392578125, "loss": 0.4359, "rewards/accuracies": 0.84375, "rewards/chosen": -1.7151901721954346, "rewards/margins": 1.1676080226898193, "rewards/rejected": -2.8827977180480957, "step": 6640 }, { "epoch": 1.15, "grad_norm": 25.725355566297086, "learning_rate": 2.300610946004256e-07, "logits/chosen": -1.449748158454895, "logits/rejected": -1.3810780048370361, "logps/chosen": -218.1935577392578, "logps/rejected": -359.31268310546875, "loss": 0.3906, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.6608730554580688, "rewards/margins": 1.3988673686981201, "rewards/rejected": -3.0597405433654785, "step": 6650 }, { "epoch": 1.15, "grad_norm": 22.25653908814933, "learning_rate": 2.2931178774567662e-07, "logits/chosen": -1.4511274099349976, "logits/rejected": -1.389211654663086, "logps/chosen": -204.99295043945312, "logps/rejected": -340.06597900390625, "loss": 0.4019, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.5325791835784912, "rewards/margins": 1.326608657836914, "rewards/rejected": -2.859187602996826, "step": 6660 }, { "epoch": 1.15, "grad_norm": 25.109705271958312, "learning_rate": 2.285626679760239e-07, "logits/chosen": -1.3574293851852417, "logits/rejected": -1.300843596458435, "logps/chosen": -223.3512420654297, "logps/rejected": -382.12548828125, "loss": 0.3859, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -1.7345237731933594, "rewards/margins": 1.5338653326034546, "rewards/rejected": -3.2683892250061035, "step": 6670 }, { "epoch": 1.15, "grad_norm": 36.664785759024156, "learning_rate": 2.278137420658154e-07, "logits/chosen": -1.3482401371002197, "logits/rejected": -1.2863205671310425, "logps/chosen": -229.06997680664062, "logps/rejected": -332.17108154296875, "loss": 0.4906, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.7523906230926514, "rewards/margins": 1.0489892959594727, "rewards/rejected": -2.8013803958892822, "step": 6680 }, { "epoch": 1.15, "grad_norm": 21.93167754697323, "learning_rate": 2.270650167876456e-07, "logits/chosen": -1.3556606769561768, "logits/rejected": -1.294721245765686, "logps/chosen": -201.72213745117188, "logps/rejected": -337.44561767578125, "loss": 0.3821, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.4457823038101196, "rewards/margins": 1.3707009553909302, "rewards/rejected": -2.81648325920105, "step": 6690 }, { "epoch": 1.15, "grad_norm": 45.51069055649989, "learning_rate": 2.2631649891229502e-07, "logits/chosen": -1.3424584865570068, "logits/rejected": -1.2963857650756836, "logps/chosen": -230.5055694580078, "logps/rejected": -343.5107421875, "loss": 0.4568, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.7437101602554321, "rewards/margins": 1.1325562000274658, "rewards/rejected": -2.8762667179107666, "step": 6700 }, { "epoch": 1.15, "eval_logits/chosen": -1.4418696165084839, "eval_logits/rejected": -1.4138153791427612, "eval_logps/chosen": -229.56253051757812, "eval_logps/rejected": -278.3695983886719, "eval_loss": 0.6346877813339233, "eval_rewards/accuracies": 0.6575278639793396, "eval_rewards/chosen": -1.7085868120193481, "eval_rewards/margins": 0.44353532791137695, "eval_rewards/rejected": -2.1521220207214355, "eval_runtime": 356.6768, "eval_samples_per_second": 12.067, "eval_steps_per_second": 1.508, "step": 6700 }, { "epoch": 1.16, "grad_norm": 25.6076080920141, "learning_rate": 2.2556819520866828e-07, "logits/chosen": -1.3505706787109375, "logits/rejected": -1.2714554071426392, "logps/chosen": -215.65658569335938, "logps/rejected": -377.618408203125, "loss": 0.3473, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.5876433849334717, "rewards/margins": 1.653158187866211, "rewards/rejected": -3.2408013343811035, "step": 6710 }, { "epoch": 1.16, "grad_norm": 25.373104312017844, "learning_rate": 2.2482011244373357e-07, "logits/chosen": -1.3233754634857178, "logits/rejected": -1.2596690654754639, "logps/chosen": -218.81887817382812, "logps/rejected": -363.94854736328125, "loss": 0.3888, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.674155592918396, "rewards/margins": 1.4498602151870728, "rewards/rejected": -3.1240158081054688, "step": 6720 }, { "epoch": 1.16, "grad_norm": 36.35012540803045, "learning_rate": 2.2407225738246074e-07, "logits/chosen": -1.2628940343856812, "logits/rejected": -1.2141722440719604, "logps/chosen": -245.4576416015625, "logps/rejected": -356.5597229003906, "loss": 0.4927, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.8878835439682007, "rewards/margins": 1.1251258850097656, "rewards/rejected": -3.013009548187256, "step": 6730 }, { "epoch": 1.16, "grad_norm": 30.43443111435983, "learning_rate": 2.233246367877609e-07, "logits/chosen": -1.3312593698501587, "logits/rejected": -1.2748968601226807, "logps/chosen": -217.5796356201172, "logps/rejected": -356.3081970214844, "loss": 0.4072, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.6314865350723267, "rewards/margins": 1.3706390857696533, "rewards/rejected": -3.0021252632141113, "step": 6740 }, { "epoch": 1.16, "grad_norm": 19.110570274965703, "learning_rate": 2.2257725742042438e-07, "logits/chosen": -1.3627091646194458, "logits/rejected": -1.3030933141708374, "logps/chosen": -224.2344512939453, "logps/rejected": -367.8368225097656, "loss": 0.4003, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.7180770635604858, "rewards/margins": 1.4236048460006714, "rewards/rejected": -3.1416821479797363, "step": 6750 }, { "epoch": 1.16, "grad_norm": 48.40608306760337, "learning_rate": 2.2183012603906066e-07, "logits/chosen": -1.312281608581543, "logits/rejected": -1.2430318593978882, "logps/chosen": -216.72750854492188, "logps/rejected": -338.98541259765625, "loss": 0.4699, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.6467088460922241, "rewards/margins": 1.255919337272644, "rewards/rejected": -2.902627944946289, "step": 6760 }, { "epoch": 1.17, "grad_norm": 41.406179970761336, "learning_rate": 2.2108324940003606e-07, "logits/chosen": -1.3574762344360352, "logits/rejected": -1.3120397329330444, "logps/chosen": -211.9196014404297, "logps/rejected": -334.21173095703125, "loss": 0.433, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.5811679363250732, "rewards/margins": 1.2045514583587646, "rewards/rejected": -2.7857189178466797, "step": 6770 }, { "epoch": 1.17, "grad_norm": 35.196901180748505, "learning_rate": 2.2033663425741378e-07, "logits/chosen": -1.3661503791809082, "logits/rejected": -1.2911349534988403, "logps/chosen": -219.1353302001953, "logps/rejected": -340.84149169921875, "loss": 0.4354, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.6264568567276, "rewards/margins": 1.2487623691558838, "rewards/rejected": -2.8752193450927734, "step": 6780 }, { "epoch": 1.17, "grad_norm": 30.125071463453715, "learning_rate": 2.1959028736289184e-07, "logits/chosen": -1.3736763000488281, "logits/rejected": -1.3135449886322021, "logps/chosen": -205.2973175048828, "logps/rejected": -336.26531982421875, "loss": 0.4184, "rewards/accuracies": 0.8125, "rewards/chosen": -1.5477122068405151, "rewards/margins": 1.2838819026947021, "rewards/rejected": -2.8315939903259277, "step": 6790 }, { "epoch": 1.17, "grad_norm": 20.154181246091905, "learning_rate": 2.1884421546574288e-07, "logits/chosen": -1.2408316135406494, "logits/rejected": -1.166013240814209, "logps/chosen": -215.8163299560547, "logps/rejected": -352.4253234863281, "loss": 0.396, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -1.634212851524353, "rewards/margins": 1.3892707824707031, "rewards/rejected": -3.0234837532043457, "step": 6800 }, { "epoch": 1.17, "eval_logits/chosen": -1.4073745012283325, "eval_logits/rejected": -1.3791805505752563, "eval_logps/chosen": -242.52587890625, "eval_logps/rejected": -293.1243896484375, "eval_loss": 0.6304371356964111, "eval_rewards/accuracies": 0.669377326965332, "eval_rewards/chosen": -1.8382201194763184, "eval_rewards/margins": 0.4614499807357788, "eval_rewards/rejected": -2.2996702194213867, "eval_runtime": 356.7418, "eval_samples_per_second": 12.065, "eval_steps_per_second": 1.508, "step": 6800 }, { "epoch": 1.17, "grad_norm": 40.48439094660076, "learning_rate": 2.1809842531275234e-07, "logits/chosen": -1.3060812950134277, "logits/rejected": -1.2478914260864258, "logps/chosen": -250.30685424804688, "logps/rejected": -360.6299133300781, "loss": 0.458, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.9459367990493774, "rewards/margins": 1.1430813074111938, "rewards/rejected": -3.0890181064605713, "step": 6810 }, { "epoch": 1.18, "grad_norm": 22.745532083063072, "learning_rate": 2.173529236481581e-07, "logits/chosen": -1.3810464143753052, "logits/rejected": -1.3168919086456299, "logps/chosen": -248.2955322265625, "logps/rejected": -380.9482421875, "loss": 0.4335, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.9328635931015015, "rewards/margins": 1.3468307256698608, "rewards/rejected": -3.279694080352783, "step": 6820 }, { "epoch": 1.18, "grad_norm": 29.05046249095613, "learning_rate": 2.1660771721358898e-07, "logits/chosen": -1.4409806728363037, "logits/rejected": -1.3872400522232056, "logps/chosen": -220.9757080078125, "logps/rejected": -354.67041015625, "loss": 0.4077, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.6707779169082642, "rewards/margins": 1.3462539911270142, "rewards/rejected": -3.0170319080352783, "step": 6830 }, { "epoch": 1.18, "grad_norm": 26.092849893502574, "learning_rate": 2.1586281274800433e-07, "logits/chosen": -1.4010366201400757, "logits/rejected": -1.3389381170272827, "logps/chosen": -229.3477020263672, "logps/rejected": -351.7111511230469, "loss": 0.4488, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.7241207361221313, "rewards/margins": 1.239141583442688, "rewards/rejected": -2.9632620811462402, "step": 6840 }, { "epoch": 1.18, "grad_norm": 31.755734616992513, "learning_rate": 2.151182169876325e-07, "logits/chosen": -1.3103221654891968, "logits/rejected": -1.243399739265442, "logps/chosen": -210.047607421875, "logps/rejected": -330.6673278808594, "loss": 0.438, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.559027075767517, "rewards/margins": 1.2114872932434082, "rewards/rejected": -2.7705142498016357, "step": 6850 }, { "epoch": 1.18, "grad_norm": 23.745751034555475, "learning_rate": 2.143739366659102e-07, "logits/chosen": -1.4120018482208252, "logits/rejected": -1.3412996530532837, "logps/chosen": -229.89297485351562, "logps/rejected": -333.34490966796875, "loss": 0.4456, "rewards/accuracies": 0.8125, "rewards/chosen": -1.707727074623108, "rewards/margins": 1.1025744676589966, "rewards/rejected": -2.8103013038635254, "step": 6860 }, { "epoch": 1.18, "grad_norm": 24.98697965563167, "learning_rate": 2.1362997851342186e-07, "logits/chosen": -1.300405502319336, "logits/rejected": -1.2553117275238037, "logps/chosen": -233.50424194335938, "logps/rejected": -346.513671875, "loss": 0.4593, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.8242870569229126, "rewards/margins": 1.1232497692108154, "rewards/rejected": -2.9475369453430176, "step": 6870 }, { "epoch": 1.19, "grad_norm": 28.460821723801782, "learning_rate": 2.1288634925783817e-07, "logits/chosen": -1.3697658777236938, "logits/rejected": -1.2964236736297607, "logps/chosen": -220.921630859375, "logps/rejected": -353.2587890625, "loss": 0.3797, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.6666065454483032, "rewards/margins": 1.3543717861175537, "rewards/rejected": -3.0209782123565674, "step": 6880 }, { "epoch": 1.19, "grad_norm": 28.677786254784447, "learning_rate": 2.121430556238559e-07, "logits/chosen": -1.3057619333267212, "logits/rejected": -1.2380374670028687, "logps/chosen": -215.5825958251953, "logps/rejected": -367.3506774902344, "loss": 0.3409, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -1.6328455209732056, "rewards/margins": 1.516847014427185, "rewards/rejected": -3.1496922969818115, "step": 6890 }, { "epoch": 1.19, "grad_norm": 43.60415706308666, "learning_rate": 2.1140010433313642e-07, "logits/chosen": -1.3161351680755615, "logits/rejected": -1.2549692392349243, "logps/chosen": -244.4730682373047, "logps/rejected": -367.92681884765625, "loss": 0.4312, "rewards/accuracies": 0.84375, "rewards/chosen": -1.9093977212905884, "rewards/margins": 1.235694169998169, "rewards/rejected": -3.1450917720794678, "step": 6900 }, { "epoch": 1.19, "eval_logits/chosen": -1.3852744102478027, "eval_logits/rejected": -1.3564972877502441, "eval_logps/chosen": -266.2965393066406, "eval_logps/rejected": -320.25164794921875, "eval_loss": 0.6330453157424927, "eval_rewards/accuracies": 0.6644981503486633, "eval_rewards/chosen": -2.0759267807006836, "eval_rewards/margins": 0.4950157105922699, "eval_rewards/rejected": -2.5709426403045654, "eval_runtime": 356.8834, "eval_samples_per_second": 12.06, "eval_steps_per_second": 1.507, "step": 6900 }, { "epoch": 1.19, "grad_norm": 33.10823667565031, "learning_rate": 2.1065750210424572e-07, "logits/chosen": -1.3516546487808228, "logits/rejected": -1.280491828918457, "logps/chosen": -246.9800262451172, "logps/rejected": -387.73577880859375, "loss": 0.4107, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.9011551141738892, "rewards/margins": 1.4241554737091064, "rewards/rejected": -3.3253109455108643, "step": 6910 }, { "epoch": 1.19, "grad_norm": 31.592188794374742, "learning_rate": 2.099152556525926e-07, "logits/chosen": -1.4136111736297607, "logits/rejected": -1.3561625480651855, "logps/chosen": -256.72418212890625, "logps/rejected": -363.986083984375, "loss": 0.4662, "rewards/accuracies": 0.78125, "rewards/chosen": -2.0063765048980713, "rewards/margins": 1.0958820581436157, "rewards/rejected": -3.1022586822509766, "step": 6920 }, { "epoch": 1.19, "grad_norm": 33.58926677055865, "learning_rate": 2.0917337169036924e-07, "logits/chosen": -1.2991350889205933, "logits/rejected": -1.224484920501709, "logps/chosen": -217.6683807373047, "logps/rejected": -366.7851867675781, "loss": 0.3657, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -1.6003036499023438, "rewards/margins": 1.525007724761963, "rewards/rejected": -3.1253113746643066, "step": 6930 }, { "epoch": 1.2, "grad_norm": 40.25747613620437, "learning_rate": 2.0843185692648911e-07, "logits/chosen": -1.3118326663970947, "logits/rejected": -1.2168563604354858, "logps/chosen": -208.8804168701172, "logps/rejected": -361.11444091796875, "loss": 0.3828, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.509263038635254, "rewards/margins": 1.596047043800354, "rewards/rejected": -3.1053099632263184, "step": 6940 }, { "epoch": 1.2, "grad_norm": 24.84940318081975, "learning_rate": 2.076907180665276e-07, "logits/chosen": -1.3450183868408203, "logits/rejected": -1.2781970500946045, "logps/chosen": -219.59927368164062, "logps/rejected": -362.3084411621094, "loss": 0.3846, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.657480239868164, "rewards/margins": 1.4361209869384766, "rewards/rejected": -3.0936012268066406, "step": 6950 }, { "epoch": 1.2, "grad_norm": 30.803786620950167, "learning_rate": 2.0694996181266027e-07, "logits/chosen": -1.5233880281448364, "logits/rejected": -1.4538953304290771, "logps/chosen": -233.41616821289062, "logps/rejected": -332.0858459472656, "loss": 0.4962, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.781266450881958, "rewards/margins": 1.062042474746704, "rewards/rejected": -2.843308925628662, "step": 6960 }, { "epoch": 1.2, "grad_norm": 27.683191395647043, "learning_rate": 2.062095948636031e-07, "logits/chosen": -1.4839107990264893, "logits/rejected": -1.41178297996521, "logps/chosen": -189.777587890625, "logps/rejected": -317.9434509277344, "loss": 0.3784, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -1.3524045944213867, "rewards/margins": 1.3231462240219116, "rewards/rejected": -2.675550937652588, "step": 6970 }, { "epoch": 1.2, "grad_norm": 22.494868613952, "learning_rate": 2.0546962391455128e-07, "logits/chosen": -1.4198600053787231, "logits/rejected": -1.3551172018051147, "logps/chosen": -198.3609161376953, "logps/rejected": -321.55157470703125, "loss": 0.4203, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -1.4413096904754639, "rewards/margins": 1.244518518447876, "rewards/rejected": -2.685828447341919, "step": 6980 }, { "epoch": 1.2, "grad_norm": 36.82272039938233, "learning_rate": 2.0473005565711924e-07, "logits/chosen": -1.335599422454834, "logits/rejected": -1.2768608331680298, "logps/chosen": -214.334716796875, "logps/rejected": -331.76312255859375, "loss": 0.4589, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.5734632015228271, "rewards/margins": 1.201757550239563, "rewards/rejected": -2.7752208709716797, "step": 6990 }, { "epoch": 1.21, "grad_norm": 28.270864329924283, "learning_rate": 2.039908967792795e-07, "logits/chosen": -1.5961410999298096, "logits/rejected": -1.521150827407837, "logps/chosen": -228.6524200439453, "logps/rejected": -364.3278503417969, "loss": 0.4144, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.7237600088119507, "rewards/margins": 1.3727467060089111, "rewards/rejected": -3.0965065956115723, "step": 7000 }, { "epoch": 1.21, "eval_logits/chosen": -1.5385133028030396, "eval_logits/rejected": -1.5128390789031982, "eval_logps/chosen": -213.44802856445312, "eval_logps/rejected": -257.9127502441406, "eval_loss": 0.630026638507843, "eval_rewards/accuracies": 0.6586896181106567, "eval_rewards/chosen": -1.547441840171814, "eval_rewards/margins": 0.40011176466941833, "eval_rewards/rejected": -1.9475535154342651, "eval_runtime": 356.7968, "eval_samples_per_second": 12.063, "eval_steps_per_second": 1.508, "step": 7000 }, { "epoch": 1.21, "grad_norm": 19.18466438304857, "learning_rate": 2.0325215396530289e-07, "logits/chosen": -1.4519102573394775, "logits/rejected": -1.3836629390716553, "logps/chosen": -213.21701049804688, "logps/rejected": -340.19964599609375, "loss": 0.4318, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.5678694248199463, "rewards/margins": 1.305743932723999, "rewards/rejected": -2.8736133575439453, "step": 7010 }, { "epoch": 1.21, "grad_norm": 27.94478473053083, "learning_rate": 2.025138338956974e-07, "logits/chosen": -1.4114625453948975, "logits/rejected": -1.349818229675293, "logps/chosen": -198.73695373535156, "logps/rejected": -307.34136962890625, "loss": 0.445, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.4743351936340332, "rewards/margins": 1.0755380392074585, "rewards/rejected": -2.549873113632202, "step": 7020 }, { "epoch": 1.21, "grad_norm": 41.933851784209644, "learning_rate": 2.0177594324714838e-07, "logits/chosen": -1.4608399868011475, "logits/rejected": -1.397789716720581, "logps/chosen": -205.079345703125, "logps/rejected": -330.39312744140625, "loss": 0.4361, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.519981026649475, "rewards/margins": 1.2328134775161743, "rewards/rejected": -2.7527945041656494, "step": 7030 }, { "epoch": 1.21, "grad_norm": 27.891320371971226, "learning_rate": 2.0103848869245764e-07, "logits/chosen": -1.3869388103485107, "logits/rejected": -1.3218698501586914, "logps/chosen": -200.9859619140625, "logps/rejected": -331.20611572265625, "loss": 0.3926, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.4491612911224365, "rewards/margins": 1.297973871231079, "rewards/rejected": -2.7471349239349365, "step": 7040 }, { "epoch": 1.21, "grad_norm": 24.036317739572887, "learning_rate": 2.0030147690048374e-07, "logits/chosen": -1.3576328754425049, "logits/rejected": -1.3013206720352173, "logps/chosen": -198.55532836914062, "logps/rejected": -319.38226318359375, "loss": 0.4474, "rewards/accuracies": 0.8125, "rewards/chosen": -1.4711755514144897, "rewards/margins": 1.1891216039657593, "rewards/rejected": -2.660297155380249, "step": 7050 }, { "epoch": 1.22, "grad_norm": 36.02994677091849, "learning_rate": 1.995649145360809e-07, "logits/chosen": -1.4678010940551758, "logits/rejected": -1.4199771881103516, "logps/chosen": -212.34457397460938, "logps/rejected": -316.0308532714844, "loss": 0.4659, "rewards/accuracies": 0.8125, "rewards/chosen": -1.5913760662078857, "rewards/margins": 1.0398099422454834, "rewards/rejected": -2.631186008453369, "step": 7060 }, { "epoch": 1.22, "grad_norm": 65.30902966804867, "learning_rate": 1.988288082600392e-07, "logits/chosen": -1.3991708755493164, "logits/rejected": -1.337200403213501, "logps/chosen": -218.6618194580078, "logps/rejected": -320.5107421875, "loss": 0.5368, "rewards/accuracies": 0.75, "rewards/chosen": -1.640601396560669, "rewards/margins": 1.0321669578552246, "rewards/rejected": -2.6727681159973145, "step": 7070 }, { "epoch": 1.22, "grad_norm": 29.064832805753056, "learning_rate": 1.980931647290246e-07, "logits/chosen": -1.4547747373580933, "logits/rejected": -1.3819966316223145, "logps/chosen": -202.14923095703125, "logps/rejected": -324.9191589355469, "loss": 0.4112, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.4868186712265015, "rewards/margins": 1.2228963375091553, "rewards/rejected": -2.709714889526367, "step": 7080 }, { "epoch": 1.22, "grad_norm": 21.70626173606062, "learning_rate": 1.97357990595518e-07, "logits/chosen": -1.5178253650665283, "logits/rejected": -1.4573280811309814, "logps/chosen": -204.9860382080078, "logps/rejected": -340.4731750488281, "loss": 0.4111, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.5147212743759155, "rewards/margins": 1.3682196140289307, "rewards/rejected": -2.8829410076141357, "step": 7090 }, { "epoch": 1.22, "grad_norm": 38.87350809965092, "learning_rate": 1.9662329250775586e-07, "logits/chosen": -1.3815619945526123, "logits/rejected": -1.3237214088439941, "logps/chosen": -205.8621826171875, "logps/rejected": -319.4546203613281, "loss": 0.4501, "rewards/accuracies": 0.78125, "rewards/chosen": -1.4879212379455566, "rewards/margins": 1.1380326747894287, "rewards/rejected": -2.6259539127349854, "step": 7100 }, { "epoch": 1.22, "eval_logits/chosen": -1.4833621978759766, "eval_logits/rejected": -1.4578860998153687, "eval_logps/chosen": -215.61434936523438, "eval_logps/rejected": -259.6932373046875, "eval_loss": 0.6319575309753418, "eval_rewards/accuracies": 0.6510223150253296, "eval_rewards/chosen": -1.5691050291061401, "eval_rewards/margins": 0.39625340700149536, "eval_rewards/rejected": -1.9653586149215698, "eval_runtime": 356.9358, "eval_samples_per_second": 12.058, "eval_steps_per_second": 1.507, "step": 7100 }, { "epoch": 1.23, "grad_norm": 30.114994381758486, "learning_rate": 1.9588907710966943e-07, "logits/chosen": -1.3856322765350342, "logits/rejected": -1.3147612810134888, "logps/chosen": -195.54681396484375, "logps/rejected": -312.0347595214844, "loss": 0.4228, "rewards/accuracies": 0.84375, "rewards/chosen": -1.4208539724349976, "rewards/margins": 1.1898590326309204, "rewards/rejected": -2.610713243484497, "step": 7110 }, { "epoch": 1.23, "grad_norm": 44.292714805439296, "learning_rate": 1.951553510408252e-07, "logits/chosen": -1.3800169229507446, "logits/rejected": -1.3161985874176025, "logps/chosen": -223.2345428466797, "logps/rejected": -308.5866394042969, "loss": 0.5205, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.612497329711914, "rewards/margins": 0.9479106068611145, "rewards/rejected": -2.560408115386963, "step": 7120 }, { "epoch": 1.23, "grad_norm": 31.181918195201007, "learning_rate": 1.944221209363643e-07, "logits/chosen": -1.300041913986206, "logits/rejected": -1.2450910806655884, "logps/chosen": -205.53591918945312, "logps/rejected": -322.80950927734375, "loss": 0.4277, "rewards/accuracies": 0.8125, "rewards/chosen": -1.4920589923858643, "rewards/margins": 1.2042802572250366, "rewards/rejected": -2.6963393688201904, "step": 7130 }, { "epoch": 1.23, "grad_norm": 33.82376669358963, "learning_rate": 1.9368939342694328e-07, "logits/chosen": -1.4221440553665161, "logits/rejected": -1.382880449295044, "logps/chosen": -190.2220458984375, "logps/rejected": -294.0387878417969, "loss": 0.4669, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.38909912109375, "rewards/margins": 1.0321005582809448, "rewards/rejected": -2.421199321746826, "step": 7140 }, { "epoch": 1.23, "grad_norm": 41.46398078974056, "learning_rate": 1.9295717513867324e-07, "logits/chosen": -1.5011231899261475, "logits/rejected": -1.4463145732879639, "logps/chosen": -224.45938110351562, "logps/rejected": -333.4562683105469, "loss": 0.4627, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.6809113025665283, "rewards/margins": 1.1032178401947021, "rewards/rejected": -2.7841289043426514, "step": 7150 }, { "epoch": 1.23, "grad_norm": 43.52157134513151, "learning_rate": 1.9222547269306068e-07, "logits/chosen": -1.415351152420044, "logits/rejected": -1.3425204753875732, "logps/chosen": -192.447021484375, "logps/rejected": -316.6680908203125, "loss": 0.4306, "rewards/accuracies": 0.78125, "rewards/chosen": -1.3883023262023926, "rewards/margins": 1.2378036975860596, "rewards/rejected": -2.6261062622070312, "step": 7160 }, { "epoch": 1.24, "grad_norm": 28.198520340260067, "learning_rate": 1.9149429270694705e-07, "logits/chosen": -1.4002097845077515, "logits/rejected": -1.3377676010131836, "logps/chosen": -201.3988494873047, "logps/rejected": -307.12530517578125, "loss": 0.4632, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.4582065343856812, "rewards/margins": 1.0752780437469482, "rewards/rejected": -2.533484697341919, "step": 7170 }, { "epoch": 1.24, "grad_norm": 36.940975044182615, "learning_rate": 1.9076364179244937e-07, "logits/chosen": -1.519090175628662, "logits/rejected": -1.4556185007095337, "logps/chosen": -192.38104248046875, "logps/rejected": -318.6177673339844, "loss": 0.3814, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.3707835674285889, "rewards/margins": 1.2708237171173096, "rewards/rejected": -2.6416070461273193, "step": 7180 }, { "epoch": 1.24, "grad_norm": 29.51979419688918, "learning_rate": 1.900335265568999e-07, "logits/chosen": -1.2953803539276123, "logits/rejected": -1.22904372215271, "logps/chosen": -216.4315643310547, "logps/rejected": -340.22052001953125, "loss": 0.4171, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.637787103652954, "rewards/margins": 1.2283713817596436, "rewards/rejected": -2.8661584854125977, "step": 7190 }, { "epoch": 1.24, "grad_norm": 43.92936652185145, "learning_rate": 1.893039536027872e-07, "logits/chosen": -1.2936763763427734, "logits/rejected": -1.2286694049835205, "logps/chosen": -226.12228393554688, "logps/rejected": -365.41058349609375, "loss": 0.4303, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.7303447723388672, "rewards/margins": 1.3623626232147217, "rewards/rejected": -3.092707395553589, "step": 7200 }, { "epoch": 1.24, "eval_logits/chosen": -1.436883568763733, "eval_logits/rejected": -1.4103857278823853, "eval_logps/chosen": -236.11033630371094, "eval_logps/rejected": -283.7571105957031, "eval_loss": 0.632332980632782, "eval_rewards/accuracies": 0.6538103818893433, "eval_rewards/chosen": -1.7740648984909058, "eval_rewards/margins": 0.431932270526886, "eval_rewards/rejected": -2.2059972286224365, "eval_runtime": 356.7528, "eval_samples_per_second": 12.064, "eval_steps_per_second": 1.508, "step": 7200 }, { "epoch": 1.24, "grad_norm": 35.504557595434676, "learning_rate": 1.885749295276955e-07, "logits/chosen": -1.4118075370788574, "logits/rejected": -1.362586498260498, "logps/chosen": -243.8560028076172, "logps/rejected": -341.7215576171875, "loss": 0.4936, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.8682628870010376, "rewards/margins": 1.0187652111053467, "rewards/rejected": -2.887028217315674, "step": 7210 }, { "epoch": 1.24, "grad_norm": 38.49322970172763, "learning_rate": 1.8784646092424572e-07, "logits/chosen": -1.2949811220169067, "logits/rejected": -1.219310998916626, "logps/chosen": -222.8941650390625, "logps/rejected": -352.58172607421875, "loss": 0.4373, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.6838912963867188, "rewards/margins": 1.325777292251587, "rewards/rejected": -3.0096685886383057, "step": 7220 }, { "epoch": 1.25, "grad_norm": 28.566213465067236, "learning_rate": 1.8711855438003543e-07, "logits/chosen": -1.3604927062988281, "logits/rejected": -1.2994263172149658, "logps/chosen": -208.6522216796875, "logps/rejected": -332.7591857910156, "loss": 0.4031, "rewards/accuracies": 0.84375, "rewards/chosen": -1.5445417165756226, "rewards/margins": 1.2535761594772339, "rewards/rejected": -2.7981178760528564, "step": 7230 }, { "epoch": 1.25, "grad_norm": 24.11769012327145, "learning_rate": 1.8639121647757976e-07, "logits/chosen": -1.3791191577911377, "logits/rejected": -1.3320530652999878, "logps/chosen": -218.4418182373047, "logps/rejected": -334.22833251953125, "loss": 0.4437, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.609986662864685, "rewards/margins": 1.184805154800415, "rewards/rejected": -2.7947916984558105, "step": 7240 }, { "epoch": 1.25, "grad_norm": 28.06311795428957, "learning_rate": 1.8566445379425116e-07, "logits/chosen": -1.4544193744659424, "logits/rejected": -1.3801645040512085, "logps/chosen": -202.21969604492188, "logps/rejected": -322.6747131347656, "loss": 0.4007, "rewards/accuracies": 0.8125, "rewards/chosen": -1.4765223264694214, "rewards/margins": 1.2214938402175903, "rewards/rejected": -2.69801664352417, "step": 7250 }, { "epoch": 1.25, "grad_norm": 28.029045217779828, "learning_rate": 1.8493827290222068e-07, "logits/chosen": -1.4240261316299438, "logits/rejected": -1.3594194650650024, "logps/chosen": -222.39035034179688, "logps/rejected": -343.7870788574219, "loss": 0.446, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.6872167587280273, "rewards/margins": 1.2270071506500244, "rewards/rejected": -2.9142239093780518, "step": 7260 }, { "epoch": 1.25, "grad_norm": 33.35599192177956, "learning_rate": 1.84212680368398e-07, "logits/chosen": -1.4141993522644043, "logits/rejected": -1.3509438037872314, "logps/chosen": -217.01632690429688, "logps/rejected": -327.6526794433594, "loss": 0.4629, "rewards/accuracies": 0.8125, "rewards/chosen": -1.6272687911987305, "rewards/margins": 1.1230875253677368, "rewards/rejected": -2.7503561973571777, "step": 7270 }, { "epoch": 1.25, "grad_norm": 38.40058672670466, "learning_rate": 1.834876827543721e-07, "logits/chosen": -1.4696061611175537, "logits/rejected": -1.3933039903640747, "logps/chosen": -214.4224090576172, "logps/rejected": -348.7910461425781, "loss": 0.4019, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.5783621072769165, "rewards/margins": 1.3805334568023682, "rewards/rejected": -2.958895444869995, "step": 7280 }, { "epoch": 1.26, "grad_norm": 46.97048710720416, "learning_rate": 1.8276328661635248e-07, "logits/chosen": -1.2667840719223022, "logits/rejected": -1.2175432443618774, "logps/chosen": -230.58251953125, "logps/rejected": -343.7873840332031, "loss": 0.4282, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.757728934288025, "rewards/margins": 1.1217601299285889, "rewards/rejected": -2.879488945007324, "step": 7290 }, { "epoch": 1.26, "grad_norm": 23.37837488393363, "learning_rate": 1.8203949850510903e-07, "logits/chosen": -1.1985424757003784, "logits/rejected": -1.151474952697754, "logps/chosen": -231.79354858398438, "logps/rejected": -341.9132080078125, "loss": 0.4717, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.8153043985366821, "rewards/margins": 1.083820104598999, "rewards/rejected": -2.8991243839263916, "step": 7300 }, { "epoch": 1.26, "eval_logits/chosen": -1.4253735542297363, "eval_logits/rejected": -1.3984841108322144, "eval_logps/chosen": -244.4295196533203, "eval_logps/rejected": -294.37445068359375, "eval_loss": 0.6293808221817017, "eval_rewards/accuracies": 0.6668215394020081, "eval_rewards/chosen": -1.857256531715393, "eval_rewards/margins": 0.45491406321525574, "eval_rewards/rejected": -2.3121707439422607, "eval_runtime": 357.0293, "eval_samples_per_second": 12.055, "eval_steps_per_second": 1.507, "step": 7300 }, { "epoch": 1.26, "grad_norm": 32.28579120461566, "learning_rate": 1.8131632496591348e-07, "logits/chosen": -1.354773759841919, "logits/rejected": -1.288698673248291, "logps/chosen": -231.0813446044922, "logps/rejected": -361.4246520996094, "loss": 0.4156, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.7601534128189087, "rewards/margins": 1.3353766202926636, "rewards/rejected": -3.0955300331115723, "step": 7310 }, { "epoch": 1.26, "grad_norm": 26.521159677348862, "learning_rate": 1.8059377253847973e-07, "logits/chosen": -1.374133825302124, "logits/rejected": -1.314866304397583, "logps/chosen": -226.6112823486328, "logps/rejected": -341.32647705078125, "loss": 0.478, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.72268545627594, "rewards/margins": 1.179290533065796, "rewards/rejected": -2.9019761085510254, "step": 7320 }, { "epoch": 1.26, "grad_norm": 42.16253795931932, "learning_rate": 1.7987184775690508e-07, "logits/chosen": -1.2531036138534546, "logits/rejected": -1.1840673685073853, "logps/chosen": -223.2836151123047, "logps/rejected": -365.69842529296875, "loss": 0.3926, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.7025184631347656, "rewards/margins": 1.438976526260376, "rewards/rejected": -3.1414952278137207, "step": 7330 }, { "epoch": 1.26, "grad_norm": 24.413820614134853, "learning_rate": 1.7915055714961092e-07, "logits/chosen": -1.3367866277694702, "logits/rejected": -1.274552822113037, "logps/chosen": -241.989501953125, "logps/rejected": -350.8983459472656, "loss": 0.463, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.8661409616470337, "rewards/margins": 1.1037722826004028, "rewards/rejected": -2.9699134826660156, "step": 7340 }, { "epoch": 1.27, "grad_norm": 27.114867030708584, "learning_rate": 1.7842990723928376e-07, "logits/chosen": -1.4280154705047607, "logits/rejected": -1.3533068895339966, "logps/chosen": -203.71188354492188, "logps/rejected": -342.64697265625, "loss": 0.3656, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -1.4681717157363892, "rewards/margins": 1.4406144618988037, "rewards/rejected": -2.9087860584259033, "step": 7350 }, { "epoch": 1.27, "grad_norm": 43.36655567777902, "learning_rate": 1.7770990454281605e-07, "logits/chosen": -1.3013639450073242, "logits/rejected": -1.2412099838256836, "logps/chosen": -235.2455291748047, "logps/rejected": -366.5183410644531, "loss": 0.4174, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.8094427585601807, "rewards/margins": 1.334092140197754, "rewards/rejected": -3.1435346603393555, "step": 7360 }, { "epoch": 1.27, "grad_norm": 43.02629441915015, "learning_rate": 1.7699055557124791e-07, "logits/chosen": -1.2064440250396729, "logits/rejected": -1.1509660482406616, "logps/chosen": -230.65185546875, "logps/rejected": -360.70745849609375, "loss": 0.4244, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.7785717248916626, "rewards/margins": 1.316173791885376, "rewards/rejected": -3.094745635986328, "step": 7370 }, { "epoch": 1.27, "grad_norm": 44.50263470615816, "learning_rate": 1.7627186682970723e-07, "logits/chosen": -1.269676923751831, "logits/rejected": -1.2101144790649414, "logps/chosen": -239.9287109375, "logps/rejected": -365.6208190917969, "loss": 0.4291, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.8677568435668945, "rewards/margins": 1.2519527673721313, "rewards/rejected": -3.1197097301483154, "step": 7380 }, { "epoch": 1.27, "grad_norm": 58.270680983383755, "learning_rate": 1.755538448173518e-07, "logits/chosen": -1.2635023593902588, "logits/rejected": -1.208660364151001, "logps/chosen": -237.5902557373047, "logps/rejected": -356.22052001953125, "loss": 0.4469, "rewards/accuracies": 0.8125, "rewards/chosen": -1.8537395000457764, "rewards/margins": 1.17844557762146, "rewards/rejected": -3.0321850776672363, "step": 7390 }, { "epoch": 1.27, "grad_norm": 19.321168110421592, "learning_rate": 1.7483649602730987e-07, "logits/chosen": -1.2944018840789795, "logits/rejected": -1.2126576900482178, "logps/chosen": -228.0514373779297, "logps/rejected": -363.4169006347656, "loss": 0.3908, "rewards/accuracies": 0.84375, "rewards/chosen": -1.7260242700576782, "rewards/margins": 1.4026000499725342, "rewards/rejected": -3.128624439239502, "step": 7400 }, { "epoch": 1.27, "eval_logits/chosen": -1.4501019716262817, "eval_logits/rejected": -1.423465609550476, "eval_logps/chosen": -227.0261688232422, "eval_logps/rejected": -274.0572204589844, "eval_loss": 0.630664587020874, "eval_rewards/accuracies": 0.6568308472633362, "eval_rewards/chosen": -1.683223009109497, "eval_rewards/margins": 0.4257754683494568, "eval_rewards/rejected": -2.1089982986450195, "eval_runtime": 357.4983, "eval_samples_per_second": 12.039, "eval_steps_per_second": 1.505, "step": 7400 }, { "epoch": 1.28, "grad_norm": 45.11638691089592, "learning_rate": 1.741198269466219e-07, "logits/chosen": -1.2776044607162476, "logits/rejected": -1.2083661556243896, "logps/chosen": -218.230712890625, "logps/rejected": -343.81890869140625, "loss": 0.4103, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.6449800729751587, "rewards/margins": 1.2632148265838623, "rewards/rejected": -2.9081950187683105, "step": 7410 }, { "epoch": 1.28, "grad_norm": 42.030823379925565, "learning_rate": 1.7340384405618134e-07, "logits/chosen": -1.2458035945892334, "logits/rejected": -1.1925244331359863, "logps/chosen": -207.9458770751953, "logps/rejected": -318.7289123535156, "loss": 0.4746, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.5179194211959839, "rewards/margins": 1.1385493278503418, "rewards/rejected": -2.6564688682556152, "step": 7420 }, { "epoch": 1.28, "grad_norm": 31.232966428180802, "learning_rate": 1.7268855383067683e-07, "logits/chosen": -1.2855768203735352, "logits/rejected": -1.2198007106781006, "logps/chosen": -232.55398559570312, "logps/rejected": -353.21661376953125, "loss": 0.4445, "rewards/accuracies": 0.78125, "rewards/chosen": -1.7915750741958618, "rewards/margins": 1.2219661474227905, "rewards/rejected": -3.0135409832000732, "step": 7430 }, { "epoch": 1.28, "grad_norm": 34.93396652920407, "learning_rate": 1.7197396273853276e-07, "logits/chosen": -1.4023360013961792, "logits/rejected": -1.343386173248291, "logps/chosen": -240.6739044189453, "logps/rejected": -337.22491455078125, "loss": 0.5101, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.8467438220977783, "rewards/margins": 0.9919818043708801, "rewards/rejected": -2.8387253284454346, "step": 7440 }, { "epoch": 1.28, "grad_norm": 27.19873025751527, "learning_rate": 1.7126007724185165e-07, "logits/chosen": -1.5503208637237549, "logits/rejected": -1.4830740690231323, "logps/chosen": -199.37518310546875, "logps/rejected": -305.76214599609375, "loss": 0.4474, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.4345240592956543, "rewards/margins": 1.0972298383712769, "rewards/rejected": -2.5317540168762207, "step": 7450 }, { "epoch": 1.29, "grad_norm": 27.77498676919186, "learning_rate": 1.7054690379635477e-07, "logits/chosen": -1.3472172021865845, "logits/rejected": -1.3040322065353394, "logps/chosen": -191.5806427001953, "logps/rejected": -319.0374755859375, "loss": 0.4022, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.4145309925079346, "rewards/margins": 1.2461668252944946, "rewards/rejected": -2.6606979370117188, "step": 7460 }, { "epoch": 1.29, "grad_norm": 33.696980703475546, "learning_rate": 1.698344488513247e-07, "logits/chosen": -1.4441345930099487, "logits/rejected": -1.3981006145477295, "logps/chosen": -196.69949340820312, "logps/rejected": -293.2108459472656, "loss": 0.4734, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.42051100730896, "rewards/margins": 0.998548150062561, "rewards/rejected": -2.4190590381622314, "step": 7470 }, { "epoch": 1.29, "grad_norm": 35.61398182570765, "learning_rate": 1.691227188495461e-07, "logits/chosen": -1.3656269311904907, "logits/rejected": -1.313783884048462, "logps/chosen": -199.72702026367188, "logps/rejected": -283.9599609375, "loss": 0.5024, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.4509938955307007, "rewards/margins": 0.8966015577316284, "rewards/rejected": -2.3475959300994873, "step": 7480 }, { "epoch": 1.29, "grad_norm": 27.73975502601125, "learning_rate": 1.684117202272485e-07, "logits/chosen": -1.3349004983901978, "logits/rejected": -1.287638545036316, "logps/chosen": -206.85690307617188, "logps/rejected": -317.2720031738281, "loss": 0.4389, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.5430989265441895, "rewards/margins": 1.098320484161377, "rewards/rejected": -2.6414191722869873, "step": 7490 }, { "epoch": 1.29, "grad_norm": 28.229250472829, "learning_rate": 1.6770145941404696e-07, "logits/chosen": -1.3574326038360596, "logits/rejected": -1.2926125526428223, "logps/chosen": -197.47386169433594, "logps/rejected": -314.25897216796875, "loss": 0.4618, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.4602476358413696, "rewards/margins": 1.162480115890503, "rewards/rejected": -2.622727632522583, "step": 7500 }, { "epoch": 1.29, "eval_logits/chosen": -1.5060161352157593, "eval_logits/rejected": -1.481170654296875, "eval_logps/chosen": -211.69110107421875, "eval_logps/rejected": -254.75897216796875, "eval_loss": 0.627605676651001, "eval_rewards/accuracies": 0.6531133651733398, "eval_rewards/chosen": -1.5298728942871094, "eval_rewards/margins": 0.38614320755004883, "eval_rewards/rejected": -1.916015863418579, "eval_runtime": 357.4881, "eval_samples_per_second": 12.04, "eval_steps_per_second": 1.505, "step": 7500 }, { "epoch": 1.29, "grad_norm": 29.587971860400398, "learning_rate": 1.669919428328847e-07, "logits/chosen": -1.394683599472046, "logits/rejected": -1.3351951837539673, "logps/chosen": -214.37551879882812, "logps/rejected": -310.74652099609375, "loss": 0.4476, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.5710610151290894, "rewards/margins": 1.0244272947311401, "rewards/rejected": -2.5954883098602295, "step": 7510 }, { "epoch": 1.3, "grad_norm": 25.2120659878252, "learning_rate": 1.6628317689997498e-07, "logits/chosen": -1.3550820350646973, "logits/rejected": -1.3013880252838135, "logps/chosen": -199.8297119140625, "logps/rejected": -322.51141357421875, "loss": 0.4, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -1.4692803621292114, "rewards/margins": 1.217524766921997, "rewards/rejected": -2.686805248260498, "step": 7520 }, { "epoch": 1.3, "grad_norm": 17.580126618538177, "learning_rate": 1.6557516802474247e-07, "logits/chosen": -1.2875080108642578, "logits/rejected": -1.237430453300476, "logps/chosen": -204.1475830078125, "logps/rejected": -334.02349853515625, "loss": 0.4147, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.5195860862731934, "rewards/margins": 1.2862619161605835, "rewards/rejected": -2.8058478832244873, "step": 7530 }, { "epoch": 1.3, "grad_norm": 24.405271336303738, "learning_rate": 1.6486792260976618e-07, "logits/chosen": -1.4056943655014038, "logits/rejected": -1.3522775173187256, "logps/chosen": -208.80911254882812, "logps/rejected": -347.8929138183594, "loss": 0.3834, "rewards/accuracies": 0.84375, "rewards/chosen": -1.5729515552520752, "rewards/margins": 1.3793232440948486, "rewards/rejected": -2.952274799346924, "step": 7540 }, { "epoch": 1.3, "grad_norm": 29.108508549211408, "learning_rate": 1.6416144705072072e-07, "logits/chosen": -1.2879887819290161, "logits/rejected": -1.2317047119140625, "logps/chosen": -233.01010131835938, "logps/rejected": -367.2439270019531, "loss": 0.4572, "rewards/accuracies": 0.78125, "rewards/chosen": -1.8120092153549194, "rewards/margins": 1.3423044681549072, "rewards/rejected": -3.154313564300537, "step": 7550 }, { "epoch": 1.3, "grad_norm": 42.99955943761263, "learning_rate": 1.6345574773631898e-07, "logits/chosen": -1.388718843460083, "logits/rejected": -1.3253867626190186, "logps/chosen": -227.2410888671875, "logps/rejected": -355.32391357421875, "loss": 0.4386, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.7468681335449219, "rewards/margins": 1.2529178857803345, "rewards/rejected": -2.999785900115967, "step": 7560 }, { "epoch": 1.3, "grad_norm": 38.1555696234109, "learning_rate": 1.6275083104825414e-07, "logits/chosen": -1.3410319089889526, "logits/rejected": -1.2803980112075806, "logps/chosen": -247.3603515625, "logps/rejected": -375.22528076171875, "loss": 0.4186, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.9059759378433228, "rewards/margins": 1.3241702318191528, "rewards/rejected": -3.2301464080810547, "step": 7570 }, { "epoch": 1.31, "grad_norm": 41.061661152022474, "learning_rate": 1.6204670336114224e-07, "logits/chosen": -1.2776286602020264, "logits/rejected": -1.2231152057647705, "logps/chosen": -242.25015258789062, "logps/rejected": -361.8536682128906, "loss": 0.4535, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.8624213933944702, "rewards/margins": 1.2097980976104736, "rewards/rejected": -3.0722193717956543, "step": 7580 }, { "epoch": 1.31, "grad_norm": 19.57994826653458, "learning_rate": 1.6134337104246395e-07, "logits/chosen": -1.3166749477386475, "logits/rejected": -1.225110650062561, "logps/chosen": -244.9367218017578, "logps/rejected": -403.8605041503906, "loss": 0.3392, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -1.8664249181747437, "rewards/margins": 1.658355474472046, "rewards/rejected": -3.524780750274658, "step": 7590 }, { "epoch": 1.31, "grad_norm": 46.40887691060314, "learning_rate": 1.6064084045250786e-07, "logits/chosen": -1.3110687732696533, "logits/rejected": -1.2509706020355225, "logps/chosen": -266.95465087890625, "logps/rejected": -382.99432373046875, "loss": 0.5019, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.144411563873291, "rewards/margins": 1.1999857425689697, "rewards/rejected": -3.3443970680236816, "step": 7600 }, { "epoch": 1.31, "eval_logits/chosen": -1.4277472496032715, "eval_logits/rejected": -1.4007766246795654, "eval_logps/chosen": -242.9215087890625, "eval_logps/rejected": -292.66485595703125, "eval_loss": 0.6300765872001648, "eval_rewards/accuracies": 0.6624070405960083, "eval_rewards/chosen": -1.8421767950057983, "eval_rewards/margins": 0.4528978765010834, "eval_rewards/rejected": -2.295074462890625, "eval_runtime": 357.4474, "eval_samples_per_second": 12.041, "eval_steps_per_second": 1.505, "step": 7600 }, { "epoch": 1.31, "grad_norm": 27.39761729744123, "learning_rate": 1.5993911794431197e-07, "logits/chosen": -1.3395607471466064, "logits/rejected": -1.275943398475647, "logps/chosen": -216.5625457763672, "logps/rejected": -342.125244140625, "loss": 0.4343, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.63640558719635, "rewards/margins": 1.2829220294952393, "rewards/rejected": -2.9193274974823, "step": 7610 }, { "epoch": 1.31, "grad_norm": 28.677466981159814, "learning_rate": 1.5923820986360703e-07, "logits/chosen": -1.4301960468292236, "logits/rejected": -1.3750264644622803, "logps/chosen": -204.70155334472656, "logps/rejected": -306.2242431640625, "loss": 0.4617, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.4719994068145752, "rewards/margins": 1.0525176525115967, "rewards/rejected": -2.524517059326172, "step": 7620 }, { "epoch": 1.31, "grad_norm": 30.824980394711297, "learning_rate": 1.585381225487588e-07, "logits/chosen": -1.3620095252990723, "logits/rejected": -1.3211814165115356, "logps/chosen": -199.6609344482422, "logps/rejected": -317.6810607910156, "loss": 0.4326, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.49845290184021, "rewards/margins": 1.1381967067718506, "rewards/rejected": -2.6366496086120605, "step": 7630 }, { "epoch": 1.32, "grad_norm": 40.129932359839145, "learning_rate": 1.5783886233071074e-07, "logits/chosen": -1.281798243522644, "logits/rejected": -1.2172118425369263, "logps/chosen": -226.0537872314453, "logps/rejected": -354.72576904296875, "loss": 0.423, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.6815427541732788, "rewards/margins": 1.3418070077896118, "rewards/rejected": -3.0233497619628906, "step": 7640 }, { "epoch": 1.32, "grad_norm": 34.0153156124325, "learning_rate": 1.5714043553292683e-07, "logits/chosen": -1.3627344369888306, "logits/rejected": -1.304088830947876, "logps/chosen": -245.76119995117188, "logps/rejected": -363.3336486816406, "loss": 0.4816, "rewards/accuracies": 0.78125, "rewards/chosen": -1.8740276098251343, "rewards/margins": 1.2018343210220337, "rewards/rejected": -3.075861692428589, "step": 7650 }, { "epoch": 1.32, "grad_norm": 27.126137701003756, "learning_rate": 1.564428484713345e-07, "logits/chosen": -1.36992609500885, "logits/rejected": -1.2936866283416748, "logps/chosen": -223.11538696289062, "logps/rejected": -359.9754333496094, "loss": 0.3757, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.6609264612197876, "rewards/margins": 1.3914331197738647, "rewards/rejected": -3.0523598194122314, "step": 7660 }, { "epoch": 1.32, "grad_norm": 34.38929359333108, "learning_rate": 1.5574610745426704e-07, "logits/chosen": -1.3428263664245605, "logits/rejected": -1.283569097518921, "logps/chosen": -209.7373046875, "logps/rejected": -315.8505859375, "loss": 0.5005, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.5220474004745483, "rewards/margins": 1.107168436050415, "rewards/rejected": -2.629215955734253, "step": 7670 }, { "epoch": 1.32, "grad_norm": 26.0642265781016, "learning_rate": 1.5505021878240732e-07, "logits/chosen": -1.413971185684204, "logits/rejected": -1.3582481145858765, "logps/chosen": -209.92529296875, "logps/rejected": -324.2261047363281, "loss": 0.4226, "rewards/accuracies": 0.8125, "rewards/chosen": -1.5928831100463867, "rewards/margins": 1.1526020765304565, "rewards/rejected": -2.745485305786133, "step": 7680 }, { "epoch": 1.32, "grad_norm": 21.73140455009811, "learning_rate": 1.543551887487301e-07, "logits/chosen": -1.5031044483184814, "logits/rejected": -1.4207048416137695, "logps/chosen": -188.10305786132812, "logps/rejected": -301.28973388671875, "loss": 0.3942, "rewards/accuracies": 0.84375, "rewards/chosen": -1.3312842845916748, "rewards/margins": 1.1913520097732544, "rewards/rejected": -2.5226359367370605, "step": 7690 }, { "epoch": 1.33, "grad_norm": 21.539140223042764, "learning_rate": 1.5366102363844552e-07, "logits/chosen": -1.389103889465332, "logits/rejected": -1.3187581300735474, "logps/chosen": -202.11195373535156, "logps/rejected": -320.7894592285156, "loss": 0.4239, "rewards/accuracies": 0.8125, "rewards/chosen": -1.4840246438980103, "rewards/margins": 1.1871830224990845, "rewards/rejected": -2.6712074279785156, "step": 7700 }, { "epoch": 1.33, "eval_logits/chosen": -1.4800820350646973, "eval_logits/rejected": -1.4540327787399292, "eval_logps/chosen": -219.68116760253906, "eval_logps/rejected": -265.55712890625, "eval_loss": 0.6266021728515625, "eval_rewards/accuracies": 0.663336455821991, "eval_rewards/chosen": -1.6097729206085205, "eval_rewards/margins": 0.414224237203598, "eval_rewards/rejected": -2.0239975452423096, "eval_runtime": 357.1774, "eval_samples_per_second": 12.05, "eval_steps_per_second": 1.506, "step": 7700 }, { "epoch": 1.33, "grad_norm": 30.088144432586212, "learning_rate": 1.5296772972894212e-07, "logits/chosen": -1.4096615314483643, "logits/rejected": -1.3569542169570923, "logps/chosen": -206.3859100341797, "logps/rejected": -320.184326171875, "loss": 0.4, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -1.5484488010406494, "rewards/margins": 1.1439803838729858, "rewards/rejected": -2.6924290657043457, "step": 7710 }, { "epoch": 1.33, "grad_norm": 40.080885647002745, "learning_rate": 1.5227531328972995e-07, "logits/chosen": -1.3759911060333252, "logits/rejected": -1.3137165307998657, "logps/chosen": -219.1600799560547, "logps/rejected": -328.873046875, "loss": 0.4476, "rewards/accuracies": 0.8125, "rewards/chosen": -1.6303611993789673, "rewards/margins": 1.1502439975738525, "rewards/rejected": -2.7806053161621094, "step": 7720 }, { "epoch": 1.33, "grad_norm": 35.80033306298759, "learning_rate": 1.5158378058238442e-07, "logits/chosen": -1.3037515878677368, "logits/rejected": -1.245792031288147, "logps/chosen": -219.6901397705078, "logps/rejected": -339.2084655761719, "loss": 0.4158, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.657705545425415, "rewards/margins": 1.2117184400558472, "rewards/rejected": -2.8694233894348145, "step": 7730 }, { "epoch": 1.33, "grad_norm": 33.772757774329904, "learning_rate": 1.5089313786048885e-07, "logits/chosen": -1.282684564590454, "logits/rejected": -1.222401738166809, "logps/chosen": -228.92276000976562, "logps/rejected": -378.69622802734375, "loss": 0.3819, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.7724063396453857, "rewards/margins": 1.4769179821014404, "rewards/rejected": -3.2493247985839844, "step": 7740 }, { "epoch": 1.34, "grad_norm": 28.25568491451203, "learning_rate": 1.5020339136957877e-07, "logits/chosen": -1.3118457794189453, "logits/rejected": -1.2334351539611816, "logps/chosen": -243.1823272705078, "logps/rejected": -393.16705322265625, "loss": 0.3772, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.892716407775879, "rewards/margins": 1.5171799659729004, "rewards/rejected": -3.4098963737487793, "step": 7750 }, { "epoch": 1.34, "grad_norm": 34.5208633023336, "learning_rate": 1.4951454734708458e-07, "logits/chosen": -1.2015626430511475, "logits/rejected": -1.1355557441711426, "logps/chosen": -221.8160400390625, "logps/rejected": -374.62139892578125, "loss": 0.3608, "rewards/accuracies": 0.84375, "rewards/chosen": -1.7132511138916016, "rewards/margins": 1.516405701637268, "rewards/rejected": -3.229656934738159, "step": 7760 }, { "epoch": 1.34, "grad_norm": 32.49208419213271, "learning_rate": 1.4882661202227597e-07, "logits/chosen": -1.256168007850647, "logits/rejected": -1.1954753398895264, "logps/chosen": -244.3036346435547, "logps/rejected": -356.46368408203125, "loss": 0.4625, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.9218246936798096, "rewards/margins": 1.1179141998291016, "rewards/rejected": -3.039738655090332, "step": 7770 }, { "epoch": 1.34, "grad_norm": 33.49580205257945, "learning_rate": 1.48139591616205e-07, "logits/chosen": -1.3774831295013428, "logits/rejected": -1.3217271566390991, "logps/chosen": -245.13168334960938, "logps/rejected": -392.8816223144531, "loss": 0.3819, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.9360601902008057, "rewards/margins": 1.4542224407196045, "rewards/rejected": -3.390282392501831, "step": 7780 }, { "epoch": 1.34, "grad_norm": 31.546470528457153, "learning_rate": 1.4745349234165016e-07, "logits/chosen": -1.318555235862732, "logits/rejected": -1.2555335760116577, "logps/chosen": -245.47787475585938, "logps/rejected": -402.24993896484375, "loss": 0.36, "rewards/accuracies": 0.84375, "rewards/chosen": -1.9392179250717163, "rewards/margins": 1.5643069744110107, "rewards/rejected": -3.5035252571105957, "step": 7790 }, { "epoch": 1.34, "grad_norm": 33.94781782883591, "learning_rate": 1.4676832040305984e-07, "logits/chosen": -1.3638694286346436, "logits/rejected": -1.3124583959579468, "logps/chosen": -240.8196258544922, "logps/rejected": -377.0190124511719, "loss": 0.4156, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.868431806564331, "rewards/margins": 1.3547109365463257, "rewards/rejected": -3.223142623901367, "step": 7800 }, { "epoch": 1.34, "eval_logits/chosen": -1.3900244235992432, "eval_logits/rejected": -1.3618897199630737, "eval_logps/chosen": -258.3906555175781, "eval_logps/rejected": -311.4806823730469, "eval_loss": 0.6327470541000366, "eval_rewards/accuracies": 0.6638011336326599, "eval_rewards/chosen": -1.9968681335449219, "eval_rewards/margins": 0.4863649308681488, "eval_rewards/rejected": -2.4832329750061035, "eval_runtime": 355.9596, "eval_samples_per_second": 12.091, "eval_steps_per_second": 1.511, "step": 7800 }, { "epoch": 1.35, "grad_norm": 53.82974487314594, "learning_rate": 1.4608408199649686e-07, "logits/chosen": -1.3559496402740479, "logits/rejected": -1.285172462463379, "logps/chosen": -246.6414031982422, "logps/rejected": -362.58270263671875, "loss": 0.4629, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.8763376474380493, "rewards/margins": 1.2023292779922485, "rewards/rejected": -3.078667163848877, "step": 7810 }, { "epoch": 1.35, "grad_norm": 40.516669727296595, "learning_rate": 1.4540078330958167e-07, "logits/chosen": -1.336315393447876, "logits/rejected": -1.2665674686431885, "logps/chosen": -243.50302124023438, "logps/rejected": -392.2691650390625, "loss": 0.4179, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.8713070154190063, "rewards/margins": 1.4982094764709473, "rewards/rejected": -3.369516372680664, "step": 7820 }, { "epoch": 1.35, "grad_norm": 32.90176835421052, "learning_rate": 1.4471843052143696e-07, "logits/chosen": -1.3154162168502808, "logits/rejected": -1.2652655839920044, "logps/chosen": -231.88034057617188, "logps/rejected": -358.99169921875, "loss": 0.4525, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.821041464805603, "rewards/margins": 1.234797716140747, "rewards/rejected": -3.0558390617370605, "step": 7830 }, { "epoch": 1.35, "grad_norm": 28.22131039387213, "learning_rate": 1.440370298026315e-07, "logits/chosen": -1.2927907705307007, "logits/rejected": -1.23340904712677, "logps/chosen": -216.0460662841797, "logps/rejected": -339.61273193359375, "loss": 0.412, "rewards/accuracies": 0.84375, "rewards/chosen": -1.6249713897705078, "rewards/margins": 1.2290581464767456, "rewards/rejected": -2.8540291786193848, "step": 7840 }, { "epoch": 1.35, "grad_norm": 36.753366417631874, "learning_rate": 1.4335658731512451e-07, "logits/chosen": -1.301358699798584, "logits/rejected": -1.2169835567474365, "logps/chosen": -216.15402221679688, "logps/rejected": -345.2431945800781, "loss": 0.3962, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.6059744358062744, "rewards/margins": 1.3626439571380615, "rewards/rejected": -2.968618392944336, "step": 7850 }, { "epoch": 1.35, "grad_norm": 25.514074332943455, "learning_rate": 1.4267710921220973e-07, "logits/chosen": -1.3115109205245972, "logits/rejected": -1.2281205654144287, "logps/chosen": -219.056396484375, "logps/rejected": -366.49932861328125, "loss": 0.3504, "rewards/accuracies": 0.84375, "rewards/chosen": -1.6618436574935913, "rewards/margins": 1.4767673015594482, "rewards/rejected": -3.138611078262329, "step": 7860 }, { "epoch": 1.36, "grad_norm": 51.13665015949875, "learning_rate": 1.4199860163846007e-07, "logits/chosen": -1.3125016689300537, "logits/rejected": -1.251068353652954, "logps/chosen": -239.65835571289062, "logps/rejected": -367.1151428222656, "loss": 0.4608, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.8398945331573486, "rewards/margins": 1.2877471446990967, "rewards/rejected": -3.1276419162750244, "step": 7870 }, { "epoch": 1.36, "grad_norm": 46.23192498635243, "learning_rate": 1.4132107072967165e-07, "logits/chosen": -1.3768285512924194, "logits/rejected": -1.3229854106903076, "logps/chosen": -240.4175567626953, "logps/rejected": -354.0284118652344, "loss": 0.4709, "rewards/accuracies": 0.78125, "rewards/chosen": -1.8730905055999756, "rewards/margins": 1.1423285007476807, "rewards/rejected": -3.0154192447662354, "step": 7880 }, { "epoch": 1.36, "grad_norm": 32.40033693942687, "learning_rate": 1.406445226128088e-07, "logits/chosen": -1.340899109840393, "logits/rejected": -1.2813217639923096, "logps/chosen": -226.41830444335938, "logps/rejected": -349.4949645996094, "loss": 0.4501, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.720557451248169, "rewards/margins": 1.221872329711914, "rewards/rejected": -2.942429542541504, "step": 7890 }, { "epoch": 1.36, "grad_norm": 38.482928631188805, "learning_rate": 1.399689634059479e-07, "logits/chosen": -1.3165191411972046, "logits/rejected": -1.2694923877716064, "logps/chosen": -227.16726684570312, "logps/rejected": -357.72381591796875, "loss": 0.418, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.7499094009399414, "rewards/margins": 1.291032075881958, "rewards/rejected": -3.0409417152404785, "step": 7900 }, { "epoch": 1.36, "eval_logits/chosen": -1.4474836587905884, "eval_logits/rejected": -1.420722484588623, "eval_logps/chosen": -235.39988708496094, "eval_logps/rejected": -283.75970458984375, "eval_loss": 0.6320576071739197, "eval_rewards/accuracies": 0.6577602028846741, "eval_rewards/chosen": -1.7669605016708374, "eval_rewards/margins": 0.4390629529953003, "eval_rewards/rejected": -2.206023693084717, "eval_runtime": 357.7699, "eval_samples_per_second": 12.03, "eval_steps_per_second": 1.504, "step": 7900 }, { "epoch": 1.36, "grad_norm": 43.10989046415876, "learning_rate": 1.3929439921822334e-07, "logits/chosen": -1.3463201522827148, "logits/rejected": -1.282036542892456, "logps/chosen": -232.250732421875, "logps/rejected": -347.2474060058594, "loss": 0.4703, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.7544885873794556, "rewards/margins": 1.1816383600234985, "rewards/rejected": -2.936127185821533, "step": 7910 }, { "epoch": 1.36, "grad_norm": 22.98150967323039, "learning_rate": 1.3862083614977067e-07, "logits/chosen": -1.3695622682571411, "logits/rejected": -1.3146297931671143, "logps/chosen": -206.7174072265625, "logps/rejected": -311.8243103027344, "loss": 0.4652, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.519517421722412, "rewards/margins": 1.0840694904327393, "rewards/rejected": -2.6035869121551514, "step": 7920 }, { "epoch": 1.37, "grad_norm": 26.732976356570454, "learning_rate": 1.3794828029167267e-07, "logits/chosen": -1.4295904636383057, "logits/rejected": -1.3580360412597656, "logps/chosen": -213.1838836669922, "logps/rejected": -333.4698181152344, "loss": 0.4051, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.5487329959869385, "rewards/margins": 1.2622630596160889, "rewards/rejected": -2.8109960556030273, "step": 7930 }, { "epoch": 1.37, "grad_norm": 39.35835071533213, "learning_rate": 1.3727673772590376e-07, "logits/chosen": -1.3716719150543213, "logits/rejected": -1.3115837574005127, "logps/chosen": -209.64309692382812, "logps/rejected": -334.11761474609375, "loss": 0.4165, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.5110173225402832, "rewards/margins": 1.2777800559997559, "rewards/rejected": -2.788797378540039, "step": 7940 }, { "epoch": 1.37, "grad_norm": 23.363749701913548, "learning_rate": 1.3660621452527505e-07, "logits/chosen": -1.308699369430542, "logits/rejected": -1.2606542110443115, "logps/chosen": -190.32920837402344, "logps/rejected": -315.5460205078125, "loss": 0.4323, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.3881752490997314, "rewards/margins": 1.234622597694397, "rewards/rejected": -2.6227974891662598, "step": 7950 }, { "epoch": 1.37, "grad_norm": 30.69306170978758, "learning_rate": 1.3593671675337954e-07, "logits/chosen": -1.335451364517212, "logits/rejected": -1.2736941576004028, "logps/chosen": -200.57716369628906, "logps/rejected": -317.3564453125, "loss": 0.4216, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.5011732578277588, "rewards/margins": 1.1477513313293457, "rewards/rejected": -2.6489245891571045, "step": 7960 }, { "epoch": 1.37, "grad_norm": 30.826633710533333, "learning_rate": 1.3526825046453706e-07, "logits/chosen": -1.3753823041915894, "logits/rejected": -1.3085488080978394, "logps/chosen": -217.94619750976562, "logps/rejected": -333.4029846191406, "loss": 0.4507, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.6257708072662354, "rewards/margins": 1.1876569986343384, "rewards/rejected": -2.8134281635284424, "step": 7970 }, { "epoch": 1.37, "grad_norm": 32.72955968672792, "learning_rate": 1.3460082170373987e-07, "logits/chosen": -1.398342490196228, "logits/rejected": -1.3425318002700806, "logps/chosen": -230.95849609375, "logps/rejected": -356.80609130859375, "loss": 0.4128, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.740744948387146, "rewards/margins": 1.2664308547973633, "rewards/rejected": -3.0071756839752197, "step": 7980 }, { "epoch": 1.38, "grad_norm": 37.50249892999267, "learning_rate": 1.339344365065973e-07, "logits/chosen": -1.3826172351837158, "logits/rejected": -1.3275426626205444, "logps/chosen": -234.38742065429688, "logps/rejected": -364.55828857421875, "loss": 0.4381, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.8373771905899048, "rewards/margins": 1.2917721271514893, "rewards/rejected": -3.1291489601135254, "step": 7990 }, { "epoch": 1.38, "grad_norm": 35.27460341174213, "learning_rate": 1.3326910089928246e-07, "logits/chosen": -1.2450647354125977, "logits/rejected": -1.1912448406219482, "logps/chosen": -227.9076385498047, "logps/rejected": -362.02349853515625, "loss": 0.4084, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.7342026233673096, "rewards/margins": 1.3236587047576904, "rewards/rejected": -3.057861804962158, "step": 8000 }, { "epoch": 1.38, "eval_logits/chosen": -1.4088190793991089, "eval_logits/rejected": -1.381564974784851, "eval_logps/chosen": -247.23068237304688, "eval_logps/rejected": -297.6674499511719, "eval_loss": 0.631807804107666, "eval_rewards/accuracies": 0.6638011336326599, "eval_rewards/chosen": -1.8852684497833252, "eval_rewards/margins": 0.45983266830444336, "eval_rewards/rejected": -2.3451011180877686, "eval_runtime": 357.6561, "eval_samples_per_second": 12.034, "eval_steps_per_second": 1.504, "step": 8000 }, { "epoch": 1.38, "grad_norm": 33.54268692406238, "learning_rate": 1.3260482089847603e-07, "logits/chosen": -1.2896820306777954, "logits/rejected": -1.2158801555633545, "logps/chosen": -234.7473907470703, "logps/rejected": -369.5241394042969, "loss": 0.4207, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.7761294841766357, "rewards/margins": 1.3950544595718384, "rewards/rejected": -3.1711838245391846, "step": 8010 }, { "epoch": 1.38, "grad_norm": 28.405016653483855, "learning_rate": 1.3194160251131365e-07, "logits/chosen": -1.3419923782348633, "logits/rejected": -1.257868766784668, "logps/chosen": -242.85787963867188, "logps/rejected": -379.0827331542969, "loss": 0.4081, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.826235055923462, "rewards/margins": 1.417443037033081, "rewards/rejected": -3.243677854537964, "step": 8020 }, { "epoch": 1.38, "grad_norm": 39.15936163505855, "learning_rate": 1.3127945173532988e-07, "logits/chosen": -1.3448692560195923, "logits/rejected": -1.284053087234497, "logps/chosen": -212.48580932617188, "logps/rejected": -355.14935302734375, "loss": 0.4275, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.6137079000473022, "rewards/margins": 1.4035985469818115, "rewards/rejected": -3.017306327819824, "step": 8030 }, { "epoch": 1.39, "grad_norm": 33.82521625803319, "learning_rate": 1.3061837455840538e-07, "logits/chosen": -1.3016248941421509, "logits/rejected": -1.2252373695373535, "logps/chosen": -226.1641082763672, "logps/rejected": -372.94171142578125, "loss": 0.3642, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.7177082300186157, "rewards/margins": 1.4918298721313477, "rewards/rejected": -3.209538221359253, "step": 8040 }, { "epoch": 1.39, "grad_norm": 23.926808236247748, "learning_rate": 1.2995837695871188e-07, "logits/chosen": -1.3715155124664307, "logits/rejected": -1.3059993982315063, "logps/chosen": -211.1027374267578, "logps/rejected": -356.53533935546875, "loss": 0.4153, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.5601820945739746, "rewards/margins": 1.4470088481903076, "rewards/rejected": -3.0071911811828613, "step": 8050 }, { "epoch": 1.39, "grad_norm": 31.557939346164492, "learning_rate": 1.2929946490465855e-07, "logits/chosen": -1.4260159730911255, "logits/rejected": -1.3587584495544434, "logps/chosen": -217.4468536376953, "logps/rejected": -329.12548828125, "loss": 0.4882, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.6211131811141968, "rewards/margins": 1.1693998575210571, "rewards/rejected": -2.7905125617980957, "step": 8060 }, { "epoch": 1.39, "grad_norm": 40.190393995057335, "learning_rate": 1.2864164435483777e-07, "logits/chosen": -1.354252815246582, "logits/rejected": -1.2852472066879272, "logps/chosen": -220.60400390625, "logps/rejected": -329.6341857910156, "loss": 0.4492, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.6716521978378296, "rewards/margins": 1.1254643201828003, "rewards/rejected": -2.79711651802063, "step": 8070 }, { "epoch": 1.39, "grad_norm": 32.42169318224056, "learning_rate": 1.2798492125797145e-07, "logits/chosen": -1.3571466207504272, "logits/rejected": -1.3098504543304443, "logps/chosen": -203.21746826171875, "logps/rejected": -327.30889892578125, "loss": 0.4336, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.5024254322052002, "rewards/margins": 1.1913418769836426, "rewards/rejected": -2.6937670707702637, "step": 8080 }, { "epoch": 1.39, "grad_norm": 23.167056574146898, "learning_rate": 1.273293015528571e-07, "logits/chosen": -1.3101780414581299, "logits/rejected": -1.2438642978668213, "logps/chosen": -202.09756469726562, "logps/rejected": -327.48089599609375, "loss": 0.4286, "rewards/accuracies": 0.8125, "rewards/chosen": -1.490017056465149, "rewards/margins": 1.2703895568847656, "rewards/rejected": -2.760406732559204, "step": 8090 }, { "epoch": 1.4, "grad_norm": 41.21644559684058, "learning_rate": 1.2667479116831436e-07, "logits/chosen": -1.3472046852111816, "logits/rejected": -1.3043591976165771, "logps/chosen": -227.7296905517578, "logps/rejected": -334.98541259765625, "loss": 0.4616, "rewards/accuracies": 0.78125, "rewards/chosen": -1.736731767654419, "rewards/margins": 1.0323158502578735, "rewards/rejected": -2.769047498703003, "step": 8100 }, { "epoch": 1.4, "eval_logits/chosen": -1.4580535888671875, "eval_logits/rejected": -1.4319345951080322, "eval_logps/chosen": -226.49224853515625, "eval_logps/rejected": -272.92999267578125, "eval_loss": 0.6337063908576965, "eval_rewards/accuracies": 0.6563661694526672, "eval_rewards/chosen": -1.6778842210769653, "eval_rewards/margins": 0.41984203457832336, "eval_rewards/rejected": -2.097726345062256, "eval_runtime": 356.5742, "eval_samples_per_second": 12.07, "eval_steps_per_second": 1.509, "step": 8100 }, { "epoch": 1.4, "grad_norm": 43.64122789141382, "learning_rate": 1.2602139602313066e-07, "logits/chosen": -1.3520846366882324, "logits/rejected": -1.2789603471755981, "logps/chosen": -220.7334442138672, "logps/rejected": -337.8800354003906, "loss": 0.4345, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.6468912363052368, "rewards/margins": 1.221639633178711, "rewards/rejected": -2.8685309886932373, "step": 8110 }, { "epoch": 1.4, "grad_norm": 46.7301047333346, "learning_rate": 1.2536912202600908e-07, "logits/chosen": -1.3374398946762085, "logits/rejected": -1.2761024236679077, "logps/chosen": -215.7982940673828, "logps/rejected": -333.050048828125, "loss": 0.4249, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.617179274559021, "rewards/margins": 1.1847528219223022, "rewards/rejected": -2.8019323348999023, "step": 8120 }, { "epoch": 1.4, "grad_norm": 42.0819810676, "learning_rate": 1.2471797507551323e-07, "logits/chosen": -1.3652501106262207, "logits/rejected": -1.310935139656067, "logps/chosen": -208.43954467773438, "logps/rejected": -316.9969787597656, "loss": 0.4354, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.536388635635376, "rewards/margins": 1.1277352571487427, "rewards/rejected": -2.664124011993408, "step": 8130 }, { "epoch": 1.4, "grad_norm": 32.76814476973973, "learning_rate": 1.2406796106001526e-07, "logits/chosen": -1.3102951049804688, "logits/rejected": -1.2481873035430908, "logps/chosen": -213.14266967773438, "logps/rejected": -348.0755615234375, "loss": 0.4184, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.6040756702423096, "rewards/margins": 1.3488930463790894, "rewards/rejected": -2.9529685974121094, "step": 8140 }, { "epoch": 1.4, "grad_norm": 20.86333145951887, "learning_rate": 1.2341908585764197e-07, "logits/chosen": -1.3613207340240479, "logits/rejected": -1.2992537021636963, "logps/chosen": -226.6147918701172, "logps/rejected": -361.0736083984375, "loss": 0.4225, "rewards/accuracies": 0.8125, "rewards/chosen": -1.7447595596313477, "rewards/margins": 1.3353623151779175, "rewards/rejected": -3.0801219940185547, "step": 8150 }, { "epoch": 1.41, "grad_norm": 33.69365894701024, "learning_rate": 1.2277135533622173e-07, "logits/chosen": -1.3138097524642944, "logits/rejected": -1.241779088973999, "logps/chosen": -218.9955291748047, "logps/rejected": -367.7317810058594, "loss": 0.3696, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.6299861669540405, "rewards/margins": 1.524584174156189, "rewards/rejected": -3.1545703411102295, "step": 8160 }, { "epoch": 1.41, "grad_norm": 41.9578809865992, "learning_rate": 1.2212477535323158e-07, "logits/chosen": -1.3314152956008911, "logits/rejected": -1.261887550354004, "logps/chosen": -240.8853759765625, "logps/rejected": -356.1999816894531, "loss": 0.4335, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.8119796514511108, "rewards/margins": 1.2307933568954468, "rewards/rejected": -3.0427732467651367, "step": 8170 }, { "epoch": 1.41, "grad_norm": 37.15151945333557, "learning_rate": 1.2147935175574403e-07, "logits/chosen": -1.336161732673645, "logits/rejected": -1.2775371074676514, "logps/chosen": -241.3358917236328, "logps/rejected": -364.7397766113281, "loss": 0.4214, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.850393295288086, "rewards/margins": 1.2469345331192017, "rewards/rejected": -3.0973281860351562, "step": 8180 }, { "epoch": 1.41, "grad_norm": 35.78807424327242, "learning_rate": 1.208350903803745e-07, "logits/chosen": -1.290093183517456, "logits/rejected": -1.2244975566864014, "logps/chosen": -240.7860870361328, "logps/rejected": -369.93792724609375, "loss": 0.4527, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.8283048868179321, "rewards/margins": 1.3327312469482422, "rewards/rejected": -3.1610360145568848, "step": 8190 }, { "epoch": 1.41, "grad_norm": 36.986247487672124, "learning_rate": 1.2019199705322793e-07, "logits/chosen": -1.3099769353866577, "logits/rejected": -1.244363784790039, "logps/chosen": -241.85092163085938, "logps/rejected": -364.93157958984375, "loss": 0.4033, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.868650197982788, "rewards/margins": 1.2554208040237427, "rewards/rejected": -3.1240711212158203, "step": 8200 }, { "epoch": 1.41, "eval_logits/chosen": -1.4116390943527222, "eval_logits/rejected": -1.3845247030258179, "eval_logps/chosen": -245.81504821777344, "eval_logps/rejected": -296.2736511230469, "eval_loss": 0.6331018805503845, "eval_rewards/accuracies": 0.6638011336326599, "eval_rewards/chosen": -1.8711119890213013, "eval_rewards/margins": 0.460050493478775, "eval_rewards/rejected": -2.331162452697754, "eval_runtime": 357.1978, "eval_samples_per_second": 12.049, "eval_steps_per_second": 1.506, "step": 8200 }, { "epoch": 1.41, "grad_norm": 29.593005359208316, "learning_rate": 1.1955007758984717e-07, "logits/chosen": -1.2003768682479858, "logits/rejected": -1.1418938636779785, "logps/chosen": -232.4004669189453, "logps/rejected": -356.89031982421875, "loss": 0.4207, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.7730633020401, "rewards/margins": 1.2342889308929443, "rewards/rejected": -3.007352113723755, "step": 8210 }, { "epoch": 1.42, "grad_norm": 28.630103927734417, "learning_rate": 1.1890933779515897e-07, "logits/chosen": -1.2915620803833008, "logits/rejected": -1.2165160179138184, "logps/chosen": -236.4958953857422, "logps/rejected": -377.47528076171875, "loss": 0.3975, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.8341341018676758, "rewards/margins": 1.4102541208267212, "rewards/rejected": -3.2443878650665283, "step": 8220 }, { "epoch": 1.42, "grad_norm": 19.561231715377982, "learning_rate": 1.1826978346342301e-07, "logits/chosen": -1.2921059131622314, "logits/rejected": -1.229309320449829, "logps/chosen": -233.7902069091797, "logps/rejected": -382.6428527832031, "loss": 0.3569, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -1.7854340076446533, "rewards/margins": 1.527852177619934, "rewards/rejected": -3.313286304473877, "step": 8230 }, { "epoch": 1.42, "grad_norm": 41.33964598646989, "learning_rate": 1.1763142037817805e-07, "logits/chosen": -1.3490978479385376, "logits/rejected": -1.2747749090194702, "logps/chosen": -261.16094970703125, "logps/rejected": -395.5751953125, "loss": 0.3843, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.049976348876953, "rewards/margins": 1.3874568939208984, "rewards/rejected": -3.4374337196350098, "step": 8240 }, { "epoch": 1.42, "grad_norm": 27.38502220196457, "learning_rate": 1.1699425431219079e-07, "logits/chosen": -1.2738348245620728, "logits/rejected": -1.215987205505371, "logps/chosen": -255.5185546875, "logps/rejected": -400.17083740234375, "loss": 0.4124, "rewards/accuracies": 0.8125, "rewards/chosen": -1.9787349700927734, "rewards/margins": 1.4798691272735596, "rewards/rejected": -3.458604335784912, "step": 8250 }, { "epoch": 1.42, "grad_norm": 35.93771365818128, "learning_rate": 1.1635829102740294e-07, "logits/chosen": -1.3693095445632935, "logits/rejected": -1.3095340728759766, "logps/chosen": -248.7408905029297, "logps/rejected": -385.82574462890625, "loss": 0.4401, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.9232066869735718, "rewards/margins": 1.382420539855957, "rewards/rejected": -3.3056271076202393, "step": 8260 }, { "epoch": 1.42, "grad_norm": 32.99199105515522, "learning_rate": 1.1572353627487948e-07, "logits/chosen": -1.3601871728897095, "logits/rejected": -1.3054234981536865, "logps/chosen": -243.6525421142578, "logps/rejected": -381.771240234375, "loss": 0.4241, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.9026378393173218, "rewards/margins": 1.3616206645965576, "rewards/rejected": -3.264258623123169, "step": 8270 }, { "epoch": 1.43, "grad_norm": 40.84568027678274, "learning_rate": 1.1508999579475654e-07, "logits/chosen": -1.321771502494812, "logits/rejected": -1.2765603065490723, "logps/chosen": -239.63668823242188, "logps/rejected": -357.2843933105469, "loss": 0.453, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.8728437423706055, "rewards/margins": 1.1715965270996094, "rewards/rejected": -3.044440269470215, "step": 8280 }, { "epoch": 1.43, "grad_norm": 27.9600215465676, "learning_rate": 1.1445767531618944e-07, "logits/chosen": -1.2803277969360352, "logits/rejected": -1.1913877725601196, "logps/chosen": -233.3045196533203, "logps/rejected": -352.5013732910156, "loss": 0.4225, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.7336339950561523, "rewards/margins": 1.2495338916778564, "rewards/rejected": -2.983167886734009, "step": 8290 }, { "epoch": 1.43, "grad_norm": 25.002008446127316, "learning_rate": 1.1382658055730096e-07, "logits/chosen": -1.419141173362732, "logits/rejected": -1.3513821363449097, "logps/chosen": -243.5513458251953, "logps/rejected": -379.9726867675781, "loss": 0.4659, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.8599157333374023, "rewards/margins": 1.3946608304977417, "rewards/rejected": -3.2545769214630127, "step": 8300 }, { "epoch": 1.43, "eval_logits/chosen": -1.4014118909835815, "eval_logits/rejected": -1.3744704723358154, "eval_logps/chosen": -253.2737579345703, "eval_logps/rejected": -304.1915588378906, "eval_loss": 0.6337563991546631, "eval_rewards/accuracies": 0.6642658114433289, "eval_rewards/chosen": -1.9456990957260132, "eval_rewards/margins": 0.4646424651145935, "eval_rewards/rejected": -2.410341739654541, "eval_runtime": 357.7816, "eval_samples_per_second": 12.03, "eval_steps_per_second": 1.504, "step": 8300 }, { "epoch": 1.43, "grad_norm": 42.89349489877897, "learning_rate": 1.1319671722512958e-07, "logits/chosen": -1.2304198741912842, "logits/rejected": -1.1626794338226318, "logps/chosen": -231.46615600585938, "logps/rejected": -345.99493408203125, "loss": 0.4309, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.7518718242645264, "rewards/margins": 1.1967473030090332, "rewards/rejected": -2.9486191272735596, "step": 8310 }, { "epoch": 1.43, "grad_norm": 42.24768978841262, "learning_rate": 1.1256809101557793e-07, "logits/chosen": -1.3194677829742432, "logits/rejected": -1.263414740562439, "logps/chosen": -216.904052734375, "logps/rejected": -355.9466552734375, "loss": 0.4015, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.644235372543335, "rewards/margins": 1.3747762441635132, "rewards/rejected": -3.0190117359161377, "step": 8320 }, { "epoch": 1.44, "grad_norm": 24.459920685245006, "learning_rate": 1.1194070761336133e-07, "logits/chosen": -1.3198477029800415, "logits/rejected": -1.2680397033691406, "logps/chosen": -233.9806365966797, "logps/rejected": -365.51239013671875, "loss": 0.4175, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.8538386821746826, "rewards/margins": 1.262743592262268, "rewards/rejected": -3.1165823936462402, "step": 8330 }, { "epoch": 1.44, "grad_norm": 44.12413238893927, "learning_rate": 1.1131457269195598e-07, "logits/chosen": -1.3826789855957031, "logits/rejected": -1.3336975574493408, "logps/chosen": -241.165283203125, "logps/rejected": -356.74481201171875, "loss": 0.4828, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.8674249649047852, "rewards/margins": 1.1834924221038818, "rewards/rejected": -3.050917387008667, "step": 8340 }, { "epoch": 1.44, "grad_norm": 27.88048778386721, "learning_rate": 1.106896919135483e-07, "logits/chosen": -1.212838888168335, "logits/rejected": -1.1588201522827148, "logps/chosen": -237.91293334960938, "logps/rejected": -359.9263000488281, "loss": 0.4398, "rewards/accuracies": 0.78125, "rewards/chosen": -1.847447395324707, "rewards/margins": 1.225775122642517, "rewards/rejected": -3.0732228755950928, "step": 8350 }, { "epoch": 1.44, "grad_norm": 41.137677010372954, "learning_rate": 1.1006607092898326e-07, "logits/chosen": -1.2542845010757446, "logits/rejected": -1.1702654361724854, "logps/chosen": -207.97622680664062, "logps/rejected": -355.0627746582031, "loss": 0.3657, "rewards/accuracies": 0.84375, "rewards/chosen": -1.5343948602676392, "rewards/margins": 1.4972972869873047, "rewards/rejected": -3.0316920280456543, "step": 8360 }, { "epoch": 1.44, "grad_norm": 30.172641899374565, "learning_rate": 1.0944371537771347e-07, "logits/chosen": -1.3405089378356934, "logits/rejected": -1.2776302099227905, "logps/chosen": -218.90786743164062, "logps/rejected": -363.3168640136719, "loss": 0.3823, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.6786830425262451, "rewards/margins": 1.4186073541641235, "rewards/rejected": -3.097290277481079, "step": 8370 }, { "epoch": 1.44, "grad_norm": 26.55031255402569, "learning_rate": 1.0882263088774809e-07, "logits/chosen": -1.4416921138763428, "logits/rejected": -1.3774442672729492, "logps/chosen": -203.69728088378906, "logps/rejected": -338.6905212402344, "loss": 0.4202, "rewards/accuracies": 0.78125, "rewards/chosen": -1.5246427059173584, "rewards/margins": 1.3209424018859863, "rewards/rejected": -2.845585346221924, "step": 8380 }, { "epoch": 1.45, "grad_norm": 35.13759701459188, "learning_rate": 1.0820282307560196e-07, "logits/chosen": -1.4239284992218018, "logits/rejected": -1.3506910800933838, "logps/chosen": -219.5550537109375, "logps/rejected": -363.826171875, "loss": 0.3804, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.6286277770996094, "rewards/margins": 1.4488070011138916, "rewards/rejected": -3.077434778213501, "step": 8390 }, { "epoch": 1.45, "grad_norm": 28.883468549799368, "learning_rate": 1.075842975462449e-07, "logits/chosen": -1.3790721893310547, "logits/rejected": -1.3156163692474365, "logps/chosen": -207.9996337890625, "logps/rejected": -343.63677978515625, "loss": 0.4254, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.5770851373672485, "rewards/margins": 1.351677656173706, "rewards/rejected": -2.928762912750244, "step": 8400 }, { "epoch": 1.45, "eval_logits/chosen": -1.4531240463256836, "eval_logits/rejected": -1.4271684885025024, "eval_logps/chosen": -233.58177185058594, "eval_logps/rejected": -281.2073974609375, "eval_loss": 0.6341521143913269, "eval_rewards/accuracies": 0.6589219570159912, "eval_rewards/chosen": -1.7487791776657104, "eval_rewards/margins": 0.4317210614681244, "eval_rewards/rejected": -2.1805002689361572, "eval_runtime": 357.7383, "eval_samples_per_second": 12.031, "eval_steps_per_second": 1.504, "step": 8400 }, { "epoch": 1.45, "grad_norm": 26.87906657018289, "learning_rate": 1.0696705989305085e-07, "logits/chosen": -1.3252408504486084, "logits/rejected": -1.2531208992004395, "logps/chosen": -228.47628784179688, "logps/rejected": -369.2504577636719, "loss": 0.4096, "rewards/accuracies": 0.84375, "rewards/chosen": -1.7083253860473633, "rewards/margins": 1.466355323791504, "rewards/rejected": -3.1746809482574463, "step": 8410 }, { "epoch": 1.45, "grad_norm": 29.144216830031528, "learning_rate": 1.0635111569774755e-07, "logits/chosen": -1.2288157939910889, "logits/rejected": -1.1764132976531982, "logps/chosen": -192.9091796875, "logps/rejected": -327.469482421875, "loss": 0.3615, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -1.412838339805603, "rewards/margins": 1.3490186929702759, "rewards/rejected": -2.761857032775879, "step": 8420 }, { "epoch": 1.45, "grad_norm": 29.070566266372918, "learning_rate": 1.0573647053036552e-07, "logits/chosen": -1.3514432907104492, "logits/rejected": -1.294538974761963, "logps/chosen": -215.29830932617188, "logps/rejected": -337.3027038574219, "loss": 0.4437, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.6178497076034546, "rewards/margins": 1.188742995262146, "rewards/rejected": -2.8065924644470215, "step": 8430 }, { "epoch": 1.45, "grad_norm": 31.754197528307863, "learning_rate": 1.0512312994918865e-07, "logits/chosen": -1.387795329093933, "logits/rejected": -1.3368542194366455, "logps/chosen": -220.80947875976562, "logps/rejected": -341.77264404296875, "loss": 0.4488, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.6601117849349976, "rewards/margins": 1.242388129234314, "rewards/rejected": -2.9024999141693115, "step": 8440 }, { "epoch": 1.46, "grad_norm": 25.669279884647963, "learning_rate": 1.0451109950070275e-07, "logits/chosen": -1.2506482601165771, "logits/rejected": -1.2008402347564697, "logps/chosen": -225.53250122070312, "logps/rejected": -366.031982421875, "loss": 0.4143, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.7753582000732422, "rewards/margins": 1.3562324047088623, "rewards/rejected": -3.1315910816192627, "step": 8450 }, { "epoch": 1.46, "grad_norm": 25.414444784217327, "learning_rate": 1.039003847195466e-07, "logits/chosen": -1.3718957901000977, "logits/rejected": -1.3084933757781982, "logps/chosen": -226.52572631835938, "logps/rejected": -355.208251953125, "loss": 0.378, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -1.7372535467147827, "rewards/margins": 1.277478814125061, "rewards/rejected": -3.0147323608398438, "step": 8460 }, { "epoch": 1.46, "grad_norm": 47.176323143328105, "learning_rate": 1.0329099112846071e-07, "logits/chosen": -1.3328293561935425, "logits/rejected": -1.2753039598464966, "logps/chosen": -257.13079833984375, "logps/rejected": -379.00323486328125, "loss": 0.4915, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.010730504989624, "rewards/margins": 1.2479288578033447, "rewards/rejected": -3.2586593627929688, "step": 8470 }, { "epoch": 1.46, "grad_norm": 32.56367895113138, "learning_rate": 1.0268292423823838e-07, "logits/chosen": -1.3693746328353882, "logits/rejected": -1.2976529598236084, "logps/chosen": -223.7007598876953, "logps/rejected": -357.4267272949219, "loss": 0.439, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.7069003582000732, "rewards/margins": 1.3514857292175293, "rewards/rejected": -3.0583860874176025, "step": 8480 }, { "epoch": 1.46, "grad_norm": 27.548404093695904, "learning_rate": 1.020761895476753e-07, "logits/chosen": -1.4502463340759277, "logits/rejected": -1.3987720012664795, "logps/chosen": -210.2566375732422, "logps/rejected": -340.35089111328125, "loss": 0.4054, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.5552849769592285, "rewards/margins": 1.2762036323547363, "rewards/rejected": -2.831489086151123, "step": 8490 }, { "epoch": 1.46, "grad_norm": 25.509139166950433, "learning_rate": 1.0147079254352001e-07, "logits/chosen": -1.3013901710510254, "logits/rejected": -1.2501300573349, "logps/chosen": -205.6046142578125, "logps/rejected": -333.3544921875, "loss": 0.4177, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.5129337310791016, "rewards/margins": 1.2862192392349243, "rewards/rejected": -2.7991526126861572, "step": 8500 }, { "epoch": 1.46, "eval_logits/chosen": -1.4731091260910034, "eval_logits/rejected": -1.4476723670959473, "eval_logps/chosen": -229.227783203125, "eval_logps/rejected": -275.5844421386719, "eval_loss": 0.6337706446647644, "eval_rewards/accuracies": 0.6589219570159912, "eval_rewards/chosen": -1.705239176750183, "eval_rewards/margins": 0.4190312325954437, "eval_rewards/rejected": -2.124270439147949, "eval_runtime": 357.6856, "eval_samples_per_second": 12.033, "eval_steps_per_second": 1.504, "step": 8500 }, { "epoch": 1.47, "grad_norm": 42.02138934399011, "learning_rate": 1.008667387004242e-07, "logits/chosen": -1.3346723318099976, "logits/rejected": -1.2678366899490356, "logps/chosen": -221.626953125, "logps/rejected": -353.46221923828125, "loss": 0.3852, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -1.6379005908966064, "rewards/margins": 1.381575345993042, "rewards/rejected": -3.0194761753082275, "step": 8510 }, { "epoch": 1.47, "grad_norm": 31.896788173795407, "learning_rate": 1.002640334808933e-07, "logits/chosen": -1.3286793231964111, "logits/rejected": -1.2712717056274414, "logps/chosen": -226.07302856445312, "logps/rejected": -338.1138000488281, "loss": 0.4501, "rewards/accuracies": 0.75, "rewards/chosen": -1.7251167297363281, "rewards/margins": 1.1692787408828735, "rewards/rejected": -2.8943958282470703, "step": 8520 }, { "epoch": 1.47, "grad_norm": 30.87059976350335, "learning_rate": 9.9662682335237e-08, "logits/chosen": -1.3089802265167236, "logits/rejected": -1.2482343912124634, "logps/chosen": -220.247314453125, "logps/rejected": -340.78228759765625, "loss": 0.4195, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.6637243032455444, "rewards/margins": 1.2299330234527588, "rewards/rejected": -2.8936572074890137, "step": 8530 }, { "epoch": 1.47, "grad_norm": 29.57486542104792, "learning_rate": 9.906269070152004e-08, "logits/chosen": -1.4429051876068115, "logits/rejected": -1.3985341787338257, "logps/chosen": -219.606689453125, "logps/rejected": -326.71759033203125, "loss": 0.4908, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.6491502523422241, "rewards/margins": 1.0737760066986084, "rewards/rejected": -2.722926378250122, "step": 8540 }, { "epoch": 1.47, "grad_norm": 25.587032990412716, "learning_rate": 9.846406400551308e-08, "logits/chosen": -1.3666541576385498, "logits/rejected": -1.3007842302322388, "logps/chosen": -233.0788116455078, "logps/rejected": -375.75469970703125, "loss": 0.3971, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.7926390171051025, "rewards/margins": 1.4118077754974365, "rewards/rejected": -3.204446792602539, "step": 8550 }, { "epoch": 1.47, "grad_norm": 29.854174805438845, "learning_rate": 9.786680766064318e-08, "logits/chosen": -1.4583765268325806, "logits/rejected": -1.3929063081741333, "logps/chosen": -231.9521942138672, "logps/rejected": -365.8257751464844, "loss": 0.4261, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.7775615453720093, "rewards/margins": 1.3575212955474854, "rewards/rejected": -3.135082721710205, "step": 8560 }, { "epoch": 1.48, "grad_norm": 31.030686033718272, "learning_rate": 9.727092706794554e-08, "logits/chosen": -1.3249984979629517, "logits/rejected": -1.2678711414337158, "logps/chosen": -229.1342010498047, "logps/rejected": -341.3080749511719, "loss": 0.4511, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.7480465173721313, "rewards/margins": 1.145034670829773, "rewards/rejected": -2.8930811882019043, "step": 8570 }, { "epoch": 1.48, "grad_norm": 32.81944513147602, "learning_rate": 9.667642761601433e-08, "logits/chosen": -1.418872594833374, "logits/rejected": -1.3541626930236816, "logps/chosen": -211.55941772460938, "logps/rejected": -351.56304931640625, "loss": 0.3834, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.5743210315704346, "rewards/margins": 1.3962310552597046, "rewards/rejected": -2.9705519676208496, "step": 8580 }, { "epoch": 1.48, "grad_norm": 28.302147205613945, "learning_rate": 9.608331468095377e-08, "logits/chosen": -1.3961126804351807, "logits/rejected": -1.319726586341858, "logps/chosen": -209.43771362304688, "logps/rejected": -345.10186767578125, "loss": 0.358, "rewards/accuracies": 0.84375, "rewards/chosen": -1.524304986000061, "rewards/margins": 1.3930083513259888, "rewards/rejected": -2.91731333732605, "step": 8590 }, { "epoch": 1.48, "grad_norm": 22.424385790342782, "learning_rate": 9.549159362632986e-08, "logits/chosen": -1.3214257955551147, "logits/rejected": -1.26907217502594, "logps/chosen": -230.94613647460938, "logps/rejected": -337.48309326171875, "loss": 0.4537, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.7750024795532227, "rewards/margins": 1.082525372505188, "rewards/rejected": -2.8575279712677, "step": 8600 }, { "epoch": 1.48, "eval_logits/chosen": -1.4457244873046875, "eval_logits/rejected": -1.4196789264678955, "eval_logps/chosen": -243.827392578125, "eval_logps/rejected": -292.89404296875, "eval_loss": 0.63252854347229, "eval_rewards/accuracies": 0.6677509546279907, "eval_rewards/chosen": -1.8512355089187622, "eval_rewards/margins": 0.4461313486099243, "eval_rewards/rejected": -2.2973668575286865, "eval_runtime": 357.6199, "eval_samples_per_second": 12.035, "eval_steps_per_second": 1.504, "step": 8600 }, { "epoch": 1.48, "grad_norm": 33.10659637068871, "learning_rate": 9.490126980312165e-08, "logits/chosen": -1.3417989015579224, "logits/rejected": -1.2814117670059204, "logps/chosen": -232.3809814453125, "logps/rejected": -360.255615234375, "loss": 0.4184, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.7825310230255127, "rewards/margins": 1.3146696090698242, "rewards/rejected": -3.097200632095337, "step": 8610 }, { "epoch": 1.49, "grad_norm": 29.42647146776832, "learning_rate": 9.431234854967291e-08, "logits/chosen": -1.2606632709503174, "logits/rejected": -1.2125203609466553, "logps/chosen": -236.4669189453125, "logps/rejected": -357.4127502441406, "loss": 0.4326, "rewards/accuracies": 0.84375, "rewards/chosen": -1.8169389963150024, "rewards/margins": 1.2449721097946167, "rewards/rejected": -3.061911106109619, "step": 8620 }, { "epoch": 1.49, "grad_norm": 23.619846728441956, "learning_rate": 9.372483519164398e-08, "logits/chosen": -1.2358766794204712, "logits/rejected": -1.1763203144073486, "logps/chosen": -209.49526977539062, "logps/rejected": -350.7640075683594, "loss": 0.3756, "rewards/accuracies": 0.84375, "rewards/chosen": -1.5778772830963135, "rewards/margins": 1.4093445539474487, "rewards/rejected": -2.9872217178344727, "step": 8630 }, { "epoch": 1.49, "grad_norm": 23.10386102267089, "learning_rate": 9.313873504196313e-08, "logits/chosen": -1.4057111740112305, "logits/rejected": -1.3474836349487305, "logps/chosen": -225.91744995117188, "logps/rejected": -339.12139892578125, "loss": 0.4632, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.7019745111465454, "rewards/margins": 1.161685585975647, "rewards/rejected": -2.8636600971221924, "step": 8640 }, { "epoch": 1.49, "grad_norm": 51.94922788167167, "learning_rate": 9.255405340077949e-08, "logits/chosen": -1.321274757385254, "logits/rejected": -1.256255865097046, "logps/chosen": -225.003662109375, "logps/rejected": -348.2788391113281, "loss": 0.458, "rewards/accuracies": 0.78125, "rewards/chosen": -1.700404405593872, "rewards/margins": 1.241997480392456, "rewards/rejected": -2.9424021244049072, "step": 8650 }, { "epoch": 1.49, "grad_norm": 32.91948860177697, "learning_rate": 9.197079555541379e-08, "logits/chosen": -1.3428630828857422, "logits/rejected": -1.2893078327178955, "logps/chosen": -228.0025177001953, "logps/rejected": -354.33404541015625, "loss": 0.46, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.7465832233428955, "rewards/margins": 1.2794687747955322, "rewards/rejected": -3.0260515213012695, "step": 8660 }, { "epoch": 1.49, "grad_norm": 23.89924519243948, "learning_rate": 9.138896678031202e-08, "logits/chosen": -1.4497371912002563, "logits/rejected": -1.3866420984268188, "logps/chosen": -211.95297241210938, "logps/rejected": -345.5428466796875, "loss": 0.4208, "rewards/accuracies": 0.8125, "rewards/chosen": -1.5996501445770264, "rewards/margins": 1.3280017375946045, "rewards/rejected": -2.927651882171631, "step": 8670 }, { "epoch": 1.5, "grad_norm": 25.326870727709025, "learning_rate": 9.080857233699624e-08, "logits/chosen": -1.364079236984253, "logits/rejected": -1.323317289352417, "logps/chosen": -223.8002166748047, "logps/rejected": -334.2272644042969, "loss": 0.4608, "rewards/accuracies": 0.75, "rewards/chosen": -1.7312272787094116, "rewards/margins": 1.0833412408828735, "rewards/rejected": -2.814568281173706, "step": 8680 }, { "epoch": 1.5, "grad_norm": 28.8350727770196, "learning_rate": 9.022961747401841e-08, "logits/chosen": -1.385801911354065, "logits/rejected": -1.323319911956787, "logps/chosen": -221.8018341064453, "logps/rejected": -329.33428955078125, "loss": 0.4513, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.66269850730896, "rewards/margins": 1.1479780673980713, "rewards/rejected": -2.8106765747070312, "step": 8690 }, { "epoch": 1.5, "grad_norm": 27.89499138401969, "learning_rate": 8.96521074269117e-08, "logits/chosen": -1.3936054706573486, "logits/rejected": -1.3266212940216064, "logps/chosen": -221.9488067626953, "logps/rejected": -332.57354736328125, "loss": 0.4176, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.654510498046875, "rewards/margins": 1.1451376676559448, "rewards/rejected": -2.7996482849121094, "step": 8700 }, { "epoch": 1.5, "eval_logits/chosen": -1.4750956296920776, "eval_logits/rejected": -1.4490503072738647, "eval_logps/chosen": -231.7505340576172, "eval_logps/rejected": -279.62408447265625, "eval_loss": 0.6307649612426758, "eval_rewards/accuracies": 0.6654275059700012, "eval_rewards/chosen": -1.7304668426513672, "eval_rewards/margins": 0.4342002868652344, "eval_rewards/rejected": -2.1646668910980225, "eval_runtime": 356.447, "eval_samples_per_second": 12.075, "eval_steps_per_second": 1.509, "step": 8700 }, { "epoch": 1.5, "grad_norm": 44.13104620276346, "learning_rate": 8.907604741814403e-08, "logits/chosen": -1.3480675220489502, "logits/rejected": -1.3072960376739502, "logps/chosen": -227.20175170898438, "logps/rejected": -328.07940673828125, "loss": 0.4921, "rewards/accuracies": 0.75, "rewards/chosen": -1.7436609268188477, "rewards/margins": 1.0057998895645142, "rewards/rejected": -2.7494606971740723, "step": 8710 }, { "epoch": 1.5, "grad_norm": 40.22151350765789, "learning_rate": 8.850144265707039e-08, "logits/chosen": -1.3768417835235596, "logits/rejected": -1.312811255455017, "logps/chosen": -228.0986328125, "logps/rejected": -353.222412109375, "loss": 0.3894, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.717865228652954, "rewards/margins": 1.2698724269866943, "rewards/rejected": -2.9877376556396484, "step": 8720 }, { "epoch": 1.5, "grad_norm": 34.383596796071096, "learning_rate": 8.792829833988588e-08, "logits/chosen": -1.3603075742721558, "logits/rejected": -1.2962515354156494, "logps/chosen": -222.20852661132812, "logps/rejected": -350.58489990234375, "loss": 0.4555, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.699462890625, "rewards/margins": 1.288184404373169, "rewards/rejected": -2.987647533416748, "step": 8730 }, { "epoch": 1.51, "grad_norm": 43.757133699793904, "learning_rate": 8.735661964957869e-08, "logits/chosen": -1.3438574075698853, "logits/rejected": -1.2943146228790283, "logps/chosen": -223.5640411376953, "logps/rejected": -365.6803894042969, "loss": 0.3991, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.721771001815796, "rewards/margins": 1.4051529169082642, "rewards/rejected": -3.1269240379333496, "step": 8740 }, { "epoch": 1.51, "grad_norm": 33.49609695850668, "learning_rate": 8.678641175588324e-08, "logits/chosen": -1.3823951482772827, "logits/rejected": -1.3145440816879272, "logps/chosen": -226.49917602539062, "logps/rejected": -367.0007019042969, "loss": 0.4057, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.7283353805541992, "rewards/margins": 1.4096132516860962, "rewards/rejected": -3.137948513031006, "step": 8750 }, { "epoch": 1.51, "grad_norm": 24.899312092376913, "learning_rate": 8.62176798152335e-08, "logits/chosen": -1.3334381580352783, "logits/rejected": -1.2971229553222656, "logps/chosen": -219.86520385742188, "logps/rejected": -323.74310302734375, "loss": 0.4992, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.6835765838623047, "rewards/margins": 1.002506971359253, "rewards/rejected": -2.6860833168029785, "step": 8760 }, { "epoch": 1.51, "grad_norm": 24.892210563053087, "learning_rate": 8.565042897071606e-08, "logits/chosen": -1.3791553974151611, "logits/rejected": -1.3129332065582275, "logps/chosen": -221.7880096435547, "logps/rejected": -350.95306396484375, "loss": 0.4061, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.6673253774642944, "rewards/margins": 1.3446027040481567, "rewards/rejected": -3.011927843093872, "step": 8770 }, { "epoch": 1.51, "grad_norm": 29.916856004909253, "learning_rate": 8.508466435202402e-08, "logits/chosen": -1.413527488708496, "logits/rejected": -1.3698149919509888, "logps/chosen": -221.784912109375, "logps/rejected": -348.02508544921875, "loss": 0.4088, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.7269346714019775, "rewards/margins": 1.2188551425933838, "rewards/rejected": -2.9457898139953613, "step": 8780 }, { "epoch": 1.51, "grad_norm": 33.97524926089677, "learning_rate": 8.452039107541042e-08, "logits/chosen": -1.402840256690979, "logits/rejected": -1.333287000656128, "logps/chosen": -228.7657928466797, "logps/rejected": -360.1128845214844, "loss": 0.4378, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.7385919094085693, "rewards/margins": 1.3424417972564697, "rewards/rejected": -3.08103346824646, "step": 8790 }, { "epoch": 1.52, "grad_norm": 35.70383300311999, "learning_rate": 8.395761424364193e-08, "logits/chosen": -1.3199231624603271, "logits/rejected": -1.2492494583129883, "logps/chosen": -216.8665771484375, "logps/rejected": -342.27081298828125, "loss": 0.4486, "rewards/accuracies": 0.8125, "rewards/chosen": -1.6186439990997314, "rewards/margins": 1.2790682315826416, "rewards/rejected": -2.897711992263794, "step": 8800 }, { "epoch": 1.52, "eval_logits/chosen": -1.4813001155853271, "eval_logits/rejected": -1.4554513692855835, "eval_logps/chosen": -232.9863739013672, "eval_logps/rejected": -280.9822082519531, "eval_loss": 0.629108190536499, "eval_rewards/accuracies": 0.669377326965332, "eval_rewards/chosen": -1.7428252696990967, "eval_rewards/margins": 0.43542277812957764, "eval_rewards/rejected": -2.178248167037964, "eval_runtime": 357.8908, "eval_samples_per_second": 12.026, "eval_steps_per_second": 1.503, "step": 8800 }, { "epoch": 1.52, "grad_norm": 25.157636621740963, "learning_rate": 8.33963389459528e-08, "logits/chosen": -1.4286869764328003, "logits/rejected": -1.3668185472488403, "logps/chosen": -215.29214477539062, "logps/rejected": -350.935546875, "loss": 0.3864, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.6436516046524048, "rewards/margins": 1.3397597074508667, "rewards/rejected": -2.9834113121032715, "step": 8810 }, { "epoch": 1.52, "grad_norm": 30.0040629228507, "learning_rate": 8.283657025799872e-08, "logits/chosen": -1.40675950050354, "logits/rejected": -1.3426892757415771, "logps/chosen": -213.4414825439453, "logps/rejected": -352.6174621582031, "loss": 0.3977, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.630210518836975, "rewards/margins": 1.3984111547470093, "rewards/rejected": -3.0286216735839844, "step": 8820 }, { "epoch": 1.52, "grad_norm": 44.391862230544554, "learning_rate": 8.227831324181109e-08, "logits/chosen": -1.2691117525100708, "logits/rejected": -1.2064220905303955, "logps/chosen": -220.8523406982422, "logps/rejected": -337.88763427734375, "loss": 0.5033, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.6626207828521729, "rewards/margins": 1.188899278640747, "rewards/rejected": -2.85152006149292, "step": 8830 }, { "epoch": 1.52, "grad_norm": 35.6035223426018, "learning_rate": 8.172157294575108e-08, "logits/chosen": -1.3077764511108398, "logits/rejected": -1.2591360807418823, "logps/chosen": -206.6147003173828, "logps/rejected": -329.2171936035156, "loss": 0.4314, "rewards/accuracies": 0.8125, "rewards/chosen": -1.5428450107574463, "rewards/margins": 1.211372971534729, "rewards/rejected": -2.7542176246643066, "step": 8840 }, { "epoch": 1.52, "grad_norm": 24.29335767133043, "learning_rate": 8.116635440446402e-08, "logits/chosen": -1.461669921875, "logits/rejected": -1.3966032266616821, "logps/chosen": -200.99661254882812, "logps/rejected": -347.1763000488281, "loss": 0.3793, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.4875738620758057, "rewards/margins": 1.4415907859802246, "rewards/rejected": -2.9291648864746094, "step": 8850 }, { "epoch": 1.53, "grad_norm": 28.563826235603777, "learning_rate": 8.061266263883404e-08, "logits/chosen": -1.3678812980651855, "logits/rejected": -1.3074369430541992, "logps/chosen": -219.76559448242188, "logps/rejected": -344.3088684082031, "loss": 0.4045, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -1.665743112564087, "rewards/margins": 1.2486860752105713, "rewards/rejected": -2.914429187774658, "step": 8860 }, { "epoch": 1.53, "grad_norm": 35.12276578609268, "learning_rate": 8.006050265593814e-08, "logits/chosen": -1.510338544845581, "logits/rejected": -1.4287976026535034, "logps/chosen": -221.04647827148438, "logps/rejected": -359.68280029296875, "loss": 0.3904, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.6428911685943604, "rewards/margins": 1.4038238525390625, "rewards/rejected": -3.046715021133423, "step": 8870 }, { "epoch": 1.53, "grad_norm": 44.16416214094416, "learning_rate": 7.950987944900192e-08, "logits/chosen": -1.3029206991195679, "logits/rejected": -1.2382522821426392, "logps/chosen": -218.285888671875, "logps/rejected": -348.65289306640625, "loss": 0.4284, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.64852774143219, "rewards/margins": 1.3227118253707886, "rewards/rejected": -2.9712395668029785, "step": 8880 }, { "epoch": 1.53, "grad_norm": 37.57144300162699, "learning_rate": 7.896079799735308e-08, "logits/chosen": -1.3296968936920166, "logits/rejected": -1.262048602104187, "logps/chosen": -228.5214080810547, "logps/rejected": -362.320556640625, "loss": 0.3695, "rewards/accuracies": 0.84375, "rewards/chosen": -1.7403507232666016, "rewards/margins": 1.3753221035003662, "rewards/rejected": -3.115673065185547, "step": 8890 }, { "epoch": 1.53, "grad_norm": 26.182342901511827, "learning_rate": 7.841326326637781e-08, "logits/chosen": -1.3689161539077759, "logits/rejected": -1.2965118885040283, "logps/chosen": -223.8673858642578, "logps/rejected": -368.6548156738281, "loss": 0.3594, "rewards/accuracies": 0.84375, "rewards/chosen": -1.6899131536483765, "rewards/margins": 1.467007040977478, "rewards/rejected": -3.1569199562072754, "step": 8900 }, { "epoch": 1.53, "eval_logits/chosen": -1.4271249771118164, "eval_logits/rejected": -1.4001696109771729, "eval_logps/chosen": -251.50247192382812, "eval_logps/rejected": -303.11505126953125, "eval_loss": 0.6299463510513306, "eval_rewards/accuracies": 0.6675186157226562, "eval_rewards/chosen": -1.9279862642288208, "eval_rewards/margins": 0.47159045934677124, "eval_rewards/rejected": -2.3995769023895264, "eval_runtime": 357.9404, "eval_samples_per_second": 12.024, "eval_steps_per_second": 1.503, "step": 8900 }, { "epoch": 1.54, "grad_norm": 33.45762067690765, "learning_rate": 7.786728020747463e-08, "logits/chosen": -1.3184845447540283, "logits/rejected": -1.262537956237793, "logps/chosen": -243.18051147460938, "logps/rejected": -373.2486572265625, "loss": 0.4348, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.8748201131820679, "rewards/margins": 1.3269211053848267, "rewards/rejected": -3.2017414569854736, "step": 8910 }, { "epoch": 1.54, "grad_norm": 17.15569346859002, "learning_rate": 7.73228537580104e-08, "logits/chosen": -1.4410854578018188, "logits/rejected": -1.3533989191055298, "logps/chosen": -239.29391479492188, "logps/rejected": -403.9765625, "loss": 0.3353, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -1.8225845098495483, "rewards/margins": 1.6963199377059937, "rewards/rejected": -3.518904447555542, "step": 8920 }, { "epoch": 1.54, "grad_norm": 23.134592614889822, "learning_rate": 7.677998884127543e-08, "logits/chosen": -1.3612538576126099, "logits/rejected": -1.2906397581100464, "logps/chosen": -248.0911102294922, "logps/rejected": -387.23736572265625, "loss": 0.4191, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.9340145587921143, "rewards/margins": 1.4329915046691895, "rewards/rejected": -3.367006301879883, "step": 8930 }, { "epoch": 1.54, "grad_norm": 31.45965420355019, "learning_rate": 7.623869036643901e-08, "logits/chosen": -1.3538182973861694, "logits/rejected": -1.290165662765503, "logps/chosen": -234.25064086914062, "logps/rejected": -377.9019470214844, "loss": 0.3828, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.7864036560058594, "rewards/margins": 1.434646725654602, "rewards/rejected": -3.22105073928833, "step": 8940 }, { "epoch": 1.54, "grad_norm": 52.90203534371852, "learning_rate": 7.569896322850489e-08, "logits/chosen": -1.214430570602417, "logits/rejected": -1.1799967288970947, "logps/chosen": -237.46340942382812, "logps/rejected": -357.76739501953125, "loss": 0.4502, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.8337962627410889, "rewards/margins": 1.172048807144165, "rewards/rejected": -3.005844831466675, "step": 8950 }, { "epoch": 1.54, "grad_norm": 44.77979199965103, "learning_rate": 7.516081230826715e-08, "logits/chosen": -1.329178810119629, "logits/rejected": -1.259275197982788, "logps/chosen": -257.3124694824219, "logps/rejected": -397.51519775390625, "loss": 0.4137, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.0291686058044434, "rewards/margins": 1.4289162158966064, "rewards/rejected": -3.4580845832824707, "step": 8960 }, { "epoch": 1.55, "grad_norm": 30.44657329134242, "learning_rate": 7.462424247226606e-08, "logits/chosen": -1.3297767639160156, "logits/rejected": -1.2500559091567993, "logps/chosen": -235.6514434814453, "logps/rejected": -384.52142333984375, "loss": 0.3625, "rewards/accuracies": 0.875, "rewards/chosen": -1.8238303661346436, "rewards/margins": 1.510573148727417, "rewards/rejected": -3.3344035148620605, "step": 8970 }, { "epoch": 1.55, "grad_norm": 27.607114771049687, "learning_rate": 7.408925857274373e-08, "logits/chosen": -1.4012901782989502, "logits/rejected": -1.3390867710113525, "logps/chosen": -257.0660705566406, "logps/rejected": -369.2894592285156, "loss": 0.5045, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.0260767936706543, "rewards/margins": 1.1399564743041992, "rewards/rejected": -3.1660332679748535, "step": 8980 }, { "epoch": 1.55, "grad_norm": 24.870179348536638, "learning_rate": 7.355586544760109e-08, "logits/chosen": -1.2825881242752075, "logits/rejected": -1.2176916599273682, "logps/chosen": -230.96267700195312, "logps/rejected": -376.7090148925781, "loss": 0.3662, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.7600934505462646, "rewards/margins": 1.4670263528823853, "rewards/rejected": -3.2271199226379395, "step": 8990 }, { "epoch": 1.55, "grad_norm": 36.79345395085888, "learning_rate": 7.302406792035298e-08, "logits/chosen": -1.369960069656372, "logits/rejected": -1.2962627410888672, "logps/chosen": -246.2954559326172, "logps/rejected": -387.07904052734375, "loss": 0.4428, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.8888282775878906, "rewards/margins": 1.4562675952911377, "rewards/rejected": -3.3450961112976074, "step": 9000 }, { "epoch": 1.55, "eval_logits/chosen": -1.436100721359253, "eval_logits/rejected": -1.4093130826950073, "eval_logps/chosen": -247.8895263671875, "eval_logps/rejected": -298.9695739746094, "eval_loss": 0.631913423538208, "eval_rewards/accuracies": 0.6642658114433289, "eval_rewards/chosen": -1.8918566703796387, "eval_rewards/margins": 0.46626490354537964, "eval_rewards/rejected": -2.358121633529663, "eval_runtime": 357.5482, "eval_samples_per_second": 12.038, "eval_steps_per_second": 1.505, "step": 9000 }, { "epoch": 1.55, "grad_norm": 23.705026745341147, "learning_rate": 7.249387080008552e-08, "logits/chosen": -1.3333415985107422, "logits/rejected": -1.2726539373397827, "logps/chosen": -234.00778198242188, "logps/rejected": -349.1484069824219, "loss": 0.4458, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.7839086055755615, "rewards/margins": 1.1792595386505127, "rewards/rejected": -2.963167905807495, "step": 9010 }, { "epoch": 1.55, "grad_norm": 38.89598953145415, "learning_rate": 7.196527888141199e-08, "logits/chosen": -1.2887022495269775, "logits/rejected": -1.2170953750610352, "logps/chosen": -209.22061157226562, "logps/rejected": -373.1058044433594, "loss": 0.3497, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -1.5529781579971313, "rewards/margins": 1.6689532995224, "rewards/rejected": -3.2219314575195312, "step": 9020 }, { "epoch": 1.56, "grad_norm": 28.117977627167093, "learning_rate": 7.14382969444299e-08, "logits/chosen": -1.3042352199554443, "logits/rejected": -1.266124963760376, "logps/chosen": -225.1204833984375, "logps/rejected": -359.4920959472656, "loss": 0.4126, "rewards/accuracies": 0.78125, "rewards/chosen": -1.7398335933685303, "rewards/margins": 1.3171305656433105, "rewards/rejected": -3.0569639205932617, "step": 9030 }, { "epoch": 1.56, "grad_norm": 33.729240485438446, "learning_rate": 7.091292975467744e-08, "logits/chosen": -1.2989321947097778, "logits/rejected": -1.2422449588775635, "logps/chosen": -219.76953125, "logps/rejected": -347.19561767578125, "loss": 0.4361, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.6861400604248047, "rewards/margins": 1.2657876014709473, "rewards/rejected": -2.951927661895752, "step": 9040 }, { "epoch": 1.56, "grad_norm": 43.84683137314539, "learning_rate": 7.038918206309061e-08, "logits/chosen": -1.363384485244751, "logits/rejected": -1.299889087677002, "logps/chosen": -237.06698608398438, "logps/rejected": -374.26708984375, "loss": 0.4154, "rewards/accuracies": 0.84375, "rewards/chosen": -1.8334167003631592, "rewards/margins": 1.3851557970046997, "rewards/rejected": -3.2185721397399902, "step": 9050 }, { "epoch": 1.56, "grad_norm": 36.66512791686662, "learning_rate": 6.986705860596004e-08, "logits/chosen": -1.3877991437911987, "logits/rejected": -1.3261516094207764, "logps/chosen": -228.3108367919922, "logps/rejected": -349.1913146972656, "loss": 0.4466, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.7361310720443726, "rewards/margins": 1.224022626876831, "rewards/rejected": -2.960153579711914, "step": 9060 }, { "epoch": 1.56, "grad_norm": 29.90532654515578, "learning_rate": 6.934656410488849e-08, "logits/chosen": -1.3249752521514893, "logits/rejected": -1.256667971611023, "logps/chosen": -210.0726776123047, "logps/rejected": -357.83990478515625, "loss": 0.3591, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.5548808574676514, "rewards/margins": 1.4754278659820557, "rewards/rejected": -3.030308246612549, "step": 9070 }, { "epoch": 1.56, "grad_norm": 33.369257953836836, "learning_rate": 6.882770326674753e-08, "logits/chosen": -1.3675148487091064, "logits/rejected": -1.3241257667541504, "logps/chosen": -205.6577606201172, "logps/rejected": -325.0508117675781, "loss": 0.4562, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.536978840827942, "rewards/margins": 1.1593728065490723, "rewards/rejected": -2.6963515281677246, "step": 9080 }, { "epoch": 1.57, "grad_norm": 27.478791555939875, "learning_rate": 6.831048078363603e-08, "logits/chosen": -1.340841293334961, "logits/rejected": -1.2636873722076416, "logps/chosen": -221.9270477294922, "logps/rejected": -348.5683898925781, "loss": 0.3936, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.666637659072876, "rewards/margins": 1.3149703741073608, "rewards/rejected": -2.9816081523895264, "step": 9090 }, { "epoch": 1.57, "grad_norm": 39.68968478397206, "learning_rate": 6.779490133283639e-08, "logits/chosen": -1.3765848875045776, "logits/rejected": -1.3157910108566284, "logps/chosen": -231.72561645507812, "logps/rejected": -339.90594482421875, "loss": 0.4441, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.7491241693496704, "rewards/margins": 1.115006685256958, "rewards/rejected": -2.864131212234497, "step": 9100 }, { "epoch": 1.57, "eval_logits/chosen": -1.4595789909362793, "eval_logits/rejected": -1.433526635169983, "eval_logps/chosen": -236.91993713378906, "eval_logps/rejected": -285.5493469238281, "eval_loss": 0.6315240859985352, "eval_rewards/accuracies": 0.6670538783073425, "eval_rewards/chosen": -1.7821608781814575, "eval_rewards/margins": 0.4417589604854584, "eval_rewards/rejected": -2.2239201068878174, "eval_runtime": 357.8152, "eval_samples_per_second": 12.029, "eval_steps_per_second": 1.504, "step": 9100 }, { "epoch": 1.57, "grad_norm": 26.17971404373787, "learning_rate": 6.72809695767736e-08, "logits/chosen": -1.3913519382476807, "logits/rejected": -1.3288047313690186, "logps/chosen": -211.9567413330078, "logps/rejected": -343.7449951171875, "loss": 0.4005, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.567034125328064, "rewards/margins": 1.3445512056350708, "rewards/rejected": -2.9115850925445557, "step": 9110 }, { "epoch": 1.57, "grad_norm": 36.967070697201144, "learning_rate": 6.67686901629718e-08, "logits/chosen": -1.3988851308822632, "logits/rejected": -1.3274810314178467, "logps/chosen": -217.08779907226562, "logps/rejected": -345.1493225097656, "loss": 0.4303, "rewards/accuracies": 0.84375, "rewards/chosen": -1.5681654214859009, "rewards/margins": 1.332497000694275, "rewards/rejected": -2.9006621837615967, "step": 9120 }, { "epoch": 1.57, "grad_norm": 31.635032954440245, "learning_rate": 6.625806772401346e-08, "logits/chosen": -1.323700189590454, "logits/rejected": -1.2679407596588135, "logps/chosen": -219.18026733398438, "logps/rejected": -335.81927490234375, "loss": 0.4429, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.641919732093811, "rewards/margins": 1.1755802631378174, "rewards/rejected": -2.817499876022339, "step": 9130 }, { "epoch": 1.57, "grad_norm": 18.580130895157108, "learning_rate": 6.574910687749641e-08, "logits/chosen": -1.3721438646316528, "logits/rejected": -1.284090280532837, "logps/chosen": -218.16458129882812, "logps/rejected": -355.3311462402344, "loss": 0.3806, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -1.608903169631958, "rewards/margins": 1.4527578353881836, "rewards/rejected": -3.0616610050201416, "step": 9140 }, { "epoch": 1.58, "grad_norm": 35.67358647499166, "learning_rate": 6.524181222599281e-08, "logits/chosen": -1.3545089960098267, "logits/rejected": -1.2818124294281006, "logps/chosen": -233.56680297851562, "logps/rejected": -374.37420654296875, "loss": 0.4067, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.7525733709335327, "rewards/margins": 1.4464915990829468, "rewards/rejected": -3.1990647315979004, "step": 9150 }, { "epoch": 1.58, "grad_norm": 25.211132645134647, "learning_rate": 6.473618835700731e-08, "logits/chosen": -1.3555432558059692, "logits/rejected": -1.2993581295013428, "logps/chosen": -213.2099151611328, "logps/rejected": -361.63824462890625, "loss": 0.3679, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.6408780813217163, "rewards/margins": 1.444949984550476, "rewards/rejected": -3.0858283042907715, "step": 9160 }, { "epoch": 1.58, "grad_norm": 29.097063689803438, "learning_rate": 6.423223984293543e-08, "logits/chosen": -1.4018914699554443, "logits/rejected": -1.3220356702804565, "logps/chosen": -224.4427947998047, "logps/rejected": -370.28741455078125, "loss": 0.3913, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.6960633993148804, "rewards/margins": 1.5122019052505493, "rewards/rejected": -3.208265781402588, "step": 9170 }, { "epoch": 1.58, "grad_norm": 26.255457005191428, "learning_rate": 6.372997124102245e-08, "logits/chosen": -1.3606441020965576, "logits/rejected": -1.3003352880477905, "logps/chosen": -224.84521484375, "logps/rejected": -351.32806396484375, "loss": 0.4274, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.7013254165649414, "rewards/margins": 1.290497899055481, "rewards/rejected": -2.991823434829712, "step": 9180 }, { "epoch": 1.58, "grad_norm": 27.40777133201824, "learning_rate": 6.322938709332195e-08, "logits/chosen": -1.4560340642929077, "logits/rejected": -1.4088115692138672, "logps/chosen": -246.0127716064453, "logps/rejected": -384.2671203613281, "loss": 0.3953, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.918752670288086, "rewards/margins": 1.3545637130737305, "rewards/rejected": -3.2733166217803955, "step": 9190 }, { "epoch": 1.59, "grad_norm": 29.72804041222211, "learning_rate": 6.273049192665502e-08, "logits/chosen": -1.3812012672424316, "logits/rejected": -1.3190171718597412, "logps/chosen": -222.0596466064453, "logps/rejected": -362.23779296875, "loss": 0.3898, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.6922333240509033, "rewards/margins": 1.4142590761184692, "rewards/rejected": -3.106492280960083, "step": 9200 }, { "epoch": 1.59, "eval_logits/chosen": -1.4436826705932617, "eval_logits/rejected": -1.41750967502594, "eval_logps/chosen": -235.59715270996094, "eval_logps/rejected": -284.1919250488281, "eval_loss": 0.6316264271736145, "eval_rewards/accuracies": 0.6656598448753357, "eval_rewards/chosen": -1.7689329385757446, "eval_rewards/margins": 0.4414127469062805, "eval_rewards/rejected": -2.21034574508667, "eval_runtime": 358.0647, "eval_samples_per_second": 12.02, "eval_steps_per_second": 1.503, "step": 9200 }, { "epoch": 1.59, "grad_norm": 37.6892193633804, "learning_rate": 6.223329025256896e-08, "logits/chosen": -1.2596355676651, "logits/rejected": -1.189841866493225, "logps/chosen": -223.52413940429688, "logps/rejected": -360.711669921875, "loss": 0.4099, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.6768831014633179, "rewards/margins": 1.412092924118042, "rewards/rejected": -3.0889759063720703, "step": 9210 }, { "epoch": 1.59, "grad_norm": 56.42606826064067, "learning_rate": 6.173778656729678e-08, "logits/chosen": -1.3393886089324951, "logits/rejected": -1.2738230228424072, "logps/chosen": -217.1497344970703, "logps/rejected": -360.0973205566406, "loss": 0.4035, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.671485185623169, "rewards/margins": 1.4341779947280884, "rewards/rejected": -3.1056630611419678, "step": 9220 }, { "epoch": 1.59, "grad_norm": 41.20515118078039, "learning_rate": 6.124398535171655e-08, "logits/chosen": -1.2532026767730713, "logits/rejected": -1.1974518299102783, "logps/chosen": -219.7674102783203, "logps/rejected": -351.6419372558594, "loss": 0.4204, "rewards/accuracies": 0.8125, "rewards/chosen": -1.7048327922821045, "rewards/margins": 1.2831361293792725, "rewards/rejected": -2.987969160079956, "step": 9230 }, { "epoch": 1.59, "grad_norm": 37.10435338672497, "learning_rate": 6.07518910713106e-08, "logits/chosen": -1.3163211345672607, "logits/rejected": -1.269641399383545, "logps/chosen": -227.19949340820312, "logps/rejected": -363.2820739746094, "loss": 0.3956, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.710626244544983, "rewards/margins": 1.3764336109161377, "rewards/rejected": -3.087059736251831, "step": 9240 }, { "epoch": 1.59, "grad_norm": 38.05805409886092, "learning_rate": 6.026150817612544e-08, "logits/chosen": -1.2923637628555298, "logits/rejected": -1.2226426601409912, "logps/chosen": -215.64614868164062, "logps/rejected": -348.65020751953125, "loss": 0.4343, "rewards/accuracies": 0.84375, "rewards/chosen": -1.6089054346084595, "rewards/margins": 1.3319495916366577, "rewards/rejected": -2.940855026245117, "step": 9250 }, { "epoch": 1.6, "grad_norm": 31.84368181089064, "learning_rate": 5.977284110073136e-08, "logits/chosen": -1.3127715587615967, "logits/rejected": -1.2576462030410767, "logps/chosen": -220.4276885986328, "logps/rejected": -353.9443054199219, "loss": 0.4051, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.7238426208496094, "rewards/margins": 1.3259329795837402, "rewards/rejected": -3.0497756004333496, "step": 9260 }, { "epoch": 1.6, "grad_norm": 23.613823280583112, "learning_rate": 5.928589426418235e-08, "logits/chosen": -1.416325330734253, "logits/rejected": -1.3402214050292969, "logps/chosen": -227.9755859375, "logps/rejected": -368.1861572265625, "loss": 0.391, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.7309129238128662, "rewards/margins": 1.4332636594772339, "rewards/rejected": -3.1641767024993896, "step": 9270 }, { "epoch": 1.6, "grad_norm": 25.039505768647498, "learning_rate": 5.8800672069976105e-08, "logits/chosen": -1.3524010181427002, "logits/rejected": -1.2967437505722046, "logps/chosen": -217.47488403320312, "logps/rejected": -346.5975341796875, "loss": 0.4171, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.639002799987793, "rewards/margins": 1.2891342639923096, "rewards/rejected": -2.9281370639801025, "step": 9280 }, { "epoch": 1.6, "grad_norm": 29.09884422283532, "learning_rate": 5.831717890601434e-08, "logits/chosen": -1.2608332633972168, "logits/rejected": -1.2108803987503052, "logps/chosen": -222.2425079345703, "logps/rejected": -329.8652038574219, "loss": 0.4687, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.6758387088775635, "rewards/margins": 1.1141695976257324, "rewards/rejected": -2.790008068084717, "step": 9290 }, { "epoch": 1.6, "grad_norm": 39.97591942100439, "learning_rate": 5.7835419144563e-08, "logits/chosen": -1.3308337926864624, "logits/rejected": -1.2767422199249268, "logps/chosen": -234.8633270263672, "logps/rejected": -378.3427429199219, "loss": 0.3657, "rewards/accuracies": 0.84375, "rewards/chosen": -1.812017798423767, "rewards/margins": 1.413147211074829, "rewards/rejected": -3.2251651287078857, "step": 9300 }, { "epoch": 1.6, "eval_logits/chosen": -1.4361413717269897, "eval_logits/rejected": -1.409943699836731, "eval_logps/chosen": -239.39939880371094, "eval_logps/rejected": -288.6492614746094, "eval_loss": 0.6325801014900208, "eval_rewards/accuracies": 0.6638011336326599, "eval_rewards/chosen": -1.806955337524414, "eval_rewards/margins": 0.4479631185531616, "eval_rewards/rejected": -2.2549185752868652, "eval_runtime": 358.0378, "eval_samples_per_second": 12.021, "eval_steps_per_second": 1.503, "step": 9300 }, { "epoch": 1.6, "grad_norm": 23.678957806069693, "learning_rate": 5.7355397142212495e-08, "logits/chosen": -1.3521010875701904, "logits/rejected": -1.2914403676986694, "logps/chosen": -218.249267578125, "logps/rejected": -341.4629211425781, "loss": 0.4569, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.6451466083526611, "rewards/margins": 1.2658127546310425, "rewards/rejected": -2.910959482192993, "step": 9310 }, { "epoch": 1.61, "grad_norm": 35.87762222953201, "learning_rate": 5.687711723983907e-08, "logits/chosen": -1.4106115102767944, "logits/rejected": -1.3428720235824585, "logps/chosen": -235.66696166992188, "logps/rejected": -378.86505126953125, "loss": 0.4091, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.8487489223480225, "rewards/margins": 1.3959980010986328, "rewards/rejected": -3.2447471618652344, "step": 9320 }, { "epoch": 1.61, "grad_norm": 36.79795244053593, "learning_rate": 5.640058376256437e-08, "logits/chosen": -1.3952717781066895, "logits/rejected": -1.3357038497924805, "logps/chosen": -222.98147583007812, "logps/rejected": -338.1329650878906, "loss": 0.458, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.6642459630966187, "rewards/margins": 1.166606068611145, "rewards/rejected": -2.8308520317077637, "step": 9330 }, { "epoch": 1.61, "grad_norm": 21.105194172978344, "learning_rate": 5.5925801019717637e-08, "logits/chosen": -1.308809518814087, "logits/rejected": -1.2506635189056396, "logps/chosen": -229.06600952148438, "logps/rejected": -373.9830627441406, "loss": 0.4046, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.7726303339004517, "rewards/margins": 1.4514251947402954, "rewards/rejected": -3.224055528640747, "step": 9340 }, { "epoch": 1.61, "grad_norm": 41.165260710581435, "learning_rate": 5.5452773304795585e-08, "logits/chosen": -1.3799827098846436, "logits/rejected": -1.31058669090271, "logps/chosen": -214.2169647216797, "logps/rejected": -341.6388854980469, "loss": 0.4018, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.6169312000274658, "rewards/margins": 1.2802019119262695, "rewards/rejected": -2.8971328735351562, "step": 9350 }, { "epoch": 1.61, "grad_norm": 30.89004835348826, "learning_rate": 5.4981504895424273e-08, "logits/chosen": -1.4255657196044922, "logits/rejected": -1.349281668663025, "logps/chosen": -211.1226043701172, "logps/rejected": -347.12811279296875, "loss": 0.3748, "rewards/accuracies": 0.84375, "rewards/chosen": -1.5445703268051147, "rewards/margins": 1.4008324146270752, "rewards/rejected": -2.9454026222229004, "step": 9360 }, { "epoch": 1.61, "grad_norm": 26.504198753445007, "learning_rate": 5.4512000053320266e-08, "logits/chosen": -1.4140576124191284, "logits/rejected": -1.333478569984436, "logps/chosen": -233.5621795654297, "logps/rejected": -371.4867248535156, "loss": 0.3992, "rewards/accuracies": 0.84375, "rewards/chosen": -1.7974094152450562, "rewards/margins": 1.403607726097107, "rewards/rejected": -3.201017379760742, "step": 9370 }, { "epoch": 1.62, "grad_norm": 22.785892226259428, "learning_rate": 5.4044263024251994e-08, "logits/chosen": -1.3954850435256958, "logits/rejected": -1.3420671224594116, "logps/chosen": -226.4377899169922, "logps/rejected": -347.89373779296875, "loss": 0.4574, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.7203410863876343, "rewards/margins": 1.2211310863494873, "rewards/rejected": -2.941472291946411, "step": 9380 }, { "epoch": 1.62, "grad_norm": 31.550228794525225, "learning_rate": 5.357829803800137e-08, "logits/chosen": -1.2319252490997314, "logits/rejected": -1.172555923461914, "logps/chosen": -237.8747100830078, "logps/rejected": -377.8671875, "loss": 0.409, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.8478126525878906, "rewards/margins": 1.3827223777770996, "rewards/rejected": -3.2305350303649902, "step": 9390 }, { "epoch": 1.62, "grad_norm": 27.350825817958476, "learning_rate": 5.3114109308325743e-08, "logits/chosen": -1.2861920595169067, "logits/rejected": -1.2316094636917114, "logps/chosen": -221.30990600585938, "logps/rejected": -339.2297058105469, "loss": 0.4666, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.6726821660995483, "rewards/margins": 1.1769441366195679, "rewards/rejected": -2.849626302719116, "step": 9400 }, { "epoch": 1.62, "eval_logits/chosen": -1.437654733657837, "eval_logits/rejected": -1.4113147258758545, "eval_logps/chosen": -238.54745483398438, "eval_logps/rejected": -287.8304138183594, "eval_loss": 0.6324562430381775, "eval_rewards/accuracies": 0.6631041169166565, "eval_rewards/chosen": -1.7984360456466675, "eval_rewards/margins": 0.4482942521572113, "eval_rewards/rejected": -2.246730327606201, "eval_runtime": 358.2323, "eval_samples_per_second": 12.015, "eval_steps_per_second": 1.502, "step": 9400 }, { "epoch": 1.62, "grad_norm": 37.163291490549675, "learning_rate": 5.265170103291952e-08, "logits/chosen": -1.3120262622833252, "logits/rejected": -1.2514145374298096, "logps/chosen": -221.3708038330078, "logps/rejected": -354.61456298828125, "loss": 0.4083, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.6749778985977173, "rewards/margins": 1.3499835729599, "rewards/rejected": -3.0249617099761963, "step": 9410 }, { "epoch": 1.62, "grad_norm": 38.28562781683107, "learning_rate": 5.2191077393376165e-08, "logits/chosen": -1.3560113906860352, "logits/rejected": -1.298722267150879, "logps/chosen": -231.69131469726562, "logps/rejected": -346.3023376464844, "loss": 0.4541, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.7664096355438232, "rewards/margins": 1.1748406887054443, "rewards/rejected": -2.9412503242492676, "step": 9420 }, { "epoch": 1.62, "grad_norm": 31.533083950987773, "learning_rate": 5.173224255515099e-08, "logits/chosen": -1.3096221685409546, "logits/rejected": -1.2421365976333618, "logps/chosen": -223.90087890625, "logps/rejected": -373.5675048828125, "loss": 0.4027, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.7149451971054077, "rewards/margins": 1.5314754247665405, "rewards/rejected": -3.2464206218719482, "step": 9430 }, { "epoch": 1.63, "grad_norm": 40.702906044337006, "learning_rate": 5.127520066752256e-08, "logits/chosen": -1.2992658615112305, "logits/rejected": -1.2521995306015015, "logps/chosen": -227.48019409179688, "logps/rejected": -347.92181396484375, "loss": 0.4109, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -1.7786552906036377, "rewards/margins": 1.176113247871399, "rewards/rejected": -2.954768419265747, "step": 9440 }, { "epoch": 1.63, "grad_norm": 32.090498966901635, "learning_rate": 5.0819955863555916e-08, "logits/chosen": -1.4480046033859253, "logits/rejected": -1.3969746828079224, "logps/chosen": -240.8600616455078, "logps/rejected": -356.9917297363281, "loss": 0.4612, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.874746561050415, "rewards/margins": 1.189202070236206, "rewards/rejected": -3.0639488697052, "step": 9450 }, { "epoch": 1.63, "grad_norm": 17.790347252816513, "learning_rate": 5.0366512260064883e-08, "logits/chosen": -1.310302734375, "logits/rejected": -1.2530990839004517, "logps/chosen": -209.5319061279297, "logps/rejected": -382.5646667480469, "loss": 0.3101, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -1.5730886459350586, "rewards/margins": 1.7125349044799805, "rewards/rejected": -3.285623550415039, "step": 9460 }, { "epoch": 1.63, "grad_norm": 38.95147289526329, "learning_rate": 4.9914873957574906e-08, "logits/chosen": -1.1751810312271118, "logits/rejected": -1.1032475233078003, "logps/chosen": -228.77847290039062, "logps/rejected": -351.155517578125, "loss": 0.4307, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.7615420818328857, "rewards/margins": 1.2454551458358765, "rewards/rejected": -3.006997585296631, "step": 9470 }, { "epoch": 1.63, "grad_norm": 26.885837020564978, "learning_rate": 4.94650450402859e-08, "logits/chosen": -1.3068211078643799, "logits/rejected": -1.22744882106781, "logps/chosen": -227.61032104492188, "logps/rejected": -368.6788330078125, "loss": 0.3906, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -1.7363221645355225, "rewards/margins": 1.4241564273834229, "rewards/rejected": -3.1604788303375244, "step": 9480 }, { "epoch": 1.64, "grad_norm": 30.073094764522576, "learning_rate": 4.9017029576035404e-08, "logits/chosen": -1.2682311534881592, "logits/rejected": -1.2089643478393555, "logps/chosen": -231.9444580078125, "logps/rejected": -363.248291015625, "loss": 0.3954, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.7784042358398438, "rewards/margins": 1.3314237594604492, "rewards/rejected": -3.109828233718872, "step": 9490 }, { "epoch": 1.64, "grad_norm": 23.54399691970181, "learning_rate": 4.857083161626174e-08, "logits/chosen": -1.3256847858428955, "logits/rejected": -1.2643065452575684, "logps/chosen": -226.8311004638672, "logps/rejected": -383.41436767578125, "loss": 0.3503, "rewards/accuracies": 0.875, "rewards/chosen": -1.7317718267440796, "rewards/margins": 1.551546335220337, "rewards/rejected": -3.283318042755127, "step": 9500 }, { "epoch": 1.64, "eval_logits/chosen": -1.402784824371338, "eval_logits/rejected": -1.3757189512252808, "eval_logps/chosen": -252.00526428222656, "eval_logps/rejected": -304.04388427734375, "eval_loss": 0.6339713931083679, "eval_rewards/accuracies": 0.6586896181106567, "eval_rewards/chosen": -1.9330142736434937, "eval_rewards/margins": 0.4758506715297699, "eval_rewards/rejected": -2.408864736557007, "eval_runtime": 358.2372, "eval_samples_per_second": 12.014, "eval_steps_per_second": 1.502, "step": 9500 }, { "epoch": 1.64, "grad_norm": 33.834138719354485, "learning_rate": 4.812645519596748e-08, "logits/chosen": -1.2126704454421997, "logits/rejected": -1.152756929397583, "logps/chosen": -234.50845336914062, "logps/rejected": -376.29010009765625, "loss": 0.3692, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.8366127014160156, "rewards/margins": 1.3869558572769165, "rewards/rejected": -3.2235684394836426, "step": 9510 }, { "epoch": 1.64, "grad_norm": 24.273551744010256, "learning_rate": 4.7683904333682715e-08, "logits/chosen": -1.439879059791565, "logits/rejected": -1.3854401111602783, "logps/chosen": -251.1964874267578, "logps/rejected": -393.2129821777344, "loss": 0.426, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.9885809421539307, "rewards/margins": 1.3651524782180786, "rewards/rejected": -3.3537330627441406, "step": 9520 }, { "epoch": 1.64, "grad_norm": 34.99697034368316, "learning_rate": 4.72431830314291e-08, "logits/chosen": -1.3440197706222534, "logits/rejected": -1.2706291675567627, "logps/chosen": -233.45089721679688, "logps/rejected": -384.12225341796875, "loss": 0.368, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.8057721853256226, "rewards/margins": 1.5267770290374756, "rewards/rejected": -3.3325493335723877, "step": 9530 }, { "epoch": 1.64, "grad_norm": 41.925642700921735, "learning_rate": 4.68042952746831e-08, "logits/chosen": -1.247374415397644, "logits/rejected": -1.1915947198867798, "logps/chosen": -240.2723388671875, "logps/rejected": -373.23394775390625, "loss": 0.4017, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.8617527484893799, "rewards/margins": 1.3366732597351074, "rewards/rejected": -3.1984262466430664, "step": 9540 }, { "epoch": 1.65, "grad_norm": 33.59274463489468, "learning_rate": 4.636724503234074e-08, "logits/chosen": -1.3158290386199951, "logits/rejected": -1.2710721492767334, "logps/chosen": -234.71505737304688, "logps/rejected": -374.32415771484375, "loss": 0.4214, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.8152387142181396, "rewards/margins": 1.3604776859283447, "rewards/rejected": -3.1757161617279053, "step": 9550 }, { "epoch": 1.65, "grad_norm": 34.17989472034478, "learning_rate": 4.593203625668077e-08, "logits/chosen": -1.4079006910324097, "logits/rejected": -1.3559339046478271, "logps/chosen": -226.49813842773438, "logps/rejected": -359.3128967285156, "loss": 0.3967, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.728776216506958, "rewards/margins": 1.3338409662246704, "rewards/rejected": -3.0626168251037598, "step": 9560 }, { "epoch": 1.65, "grad_norm": 31.348331592601255, "learning_rate": 4.549867288332987e-08, "logits/chosen": -1.2812343835830688, "logits/rejected": -1.2303446531295776, "logps/chosen": -227.1624298095703, "logps/rejected": -361.9521789550781, "loss": 0.4189, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.7691608667373657, "rewards/margins": 1.3438093662261963, "rewards/rejected": -3.1129703521728516, "step": 9570 }, { "epoch": 1.65, "grad_norm": 39.69075540606707, "learning_rate": 4.5067158831226273e-08, "logits/chosen": -1.3360213041305542, "logits/rejected": -1.2739002704620361, "logps/chosen": -245.7711639404297, "logps/rejected": -386.2242126464844, "loss": 0.4138, "rewards/accuracies": 0.8125, "rewards/chosen": -1.9129165410995483, "rewards/margins": 1.3799970149993896, "rewards/rejected": -3.2929134368896484, "step": 9580 }, { "epoch": 1.65, "grad_norm": 33.115164323400506, "learning_rate": 4.463749800258479e-08, "logits/chosen": -1.444392442703247, "logits/rejected": -1.3800554275512695, "logps/chosen": -227.8526153564453, "logps/rejected": -368.66693115234375, "loss": 0.3906, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.7177770137786865, "rewards/margins": 1.4058539867401123, "rewards/rejected": -3.123631000518799, "step": 9590 }, { "epoch": 1.65, "grad_norm": 41.69230357375958, "learning_rate": 4.420969428286139e-08, "logits/chosen": -1.2808119058609009, "logits/rejected": -1.2005847692489624, "logps/chosen": -230.84872436523438, "logps/rejected": -385.7454528808594, "loss": 0.3729, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.7579460144042969, "rewards/margins": 1.5761973857879639, "rewards/rejected": -3.334143877029419, "step": 9600 }, { "epoch": 1.65, "eval_logits/chosen": -1.391427993774414, "eval_logits/rejected": -1.3641211986541748, "eval_logps/chosen": -252.29434204101562, "eval_logps/rejected": -304.6582946777344, "eval_loss": 0.6356525421142578, "eval_rewards/accuracies": 0.6563661694526672, "eval_rewards/chosen": -1.935904860496521, "eval_rewards/margins": 0.4791041910648346, "eval_rewards/rejected": -2.415009021759033, "eval_runtime": 357.9055, "eval_samples_per_second": 12.026, "eval_steps_per_second": 1.503, "step": 9600 }, { "epoch": 1.66, "grad_norm": 38.45195266525548, "learning_rate": 4.378375154071806e-08, "logits/chosen": -1.2705994844436646, "logits/rejected": -1.2090730667114258, "logps/chosen": -229.653076171875, "logps/rejected": -377.4794006347656, "loss": 0.3925, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.7335882186889648, "rewards/margins": 1.491674780845642, "rewards/rejected": -3.2252631187438965, "step": 9610 }, { "epoch": 1.66, "grad_norm": 34.39068811627909, "learning_rate": 4.335967362798787e-08, "logits/chosen": -1.4060554504394531, "logits/rejected": -1.3566725254058838, "logps/chosen": -241.97201538085938, "logps/rejected": -348.53228759765625, "loss": 0.4833, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.8736858367919922, "rewards/margins": 1.0578181743621826, "rewards/rejected": -2.9315037727355957, "step": 9620 }, { "epoch": 1.66, "grad_norm": 41.46128098759825, "learning_rate": 4.293746437963983e-08, "logits/chosen": -1.3315681219100952, "logits/rejected": -1.2634027004241943, "logps/chosen": -258.986328125, "logps/rejected": -377.8593444824219, "loss": 0.4636, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.0355629920959473, "rewards/margins": 1.2129532098770142, "rewards/rejected": -3.248516082763672, "step": 9630 }, { "epoch": 1.66, "grad_norm": 43.601466824424364, "learning_rate": 4.2517127613744986e-08, "logits/chosen": -1.3833634853363037, "logits/rejected": -1.3209584951400757, "logps/chosen": -235.45803833007812, "logps/rejected": -356.86627197265625, "loss": 0.4311, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.7948541641235352, "rewards/margins": 1.2689039707183838, "rewards/rejected": -3.063758134841919, "step": 9640 }, { "epoch": 1.66, "grad_norm": 29.63300631760297, "learning_rate": 4.209866713144078e-08, "logits/chosen": -1.2863609790802002, "logits/rejected": -1.2273352146148682, "logps/chosen": -234.44369506835938, "logps/rejected": -349.785888671875, "loss": 0.4813, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.7898155450820923, "rewards/margins": 1.1682324409484863, "rewards/rejected": -2.958048105239868, "step": 9650 }, { "epoch": 1.66, "grad_norm": 28.816878634667912, "learning_rate": 4.1682086716897826e-08, "logits/chosen": -1.3099644184112549, "logits/rejected": -1.264509916305542, "logps/chosen": -219.1770477294922, "logps/rejected": -347.0464172363281, "loss": 0.4171, "rewards/accuracies": 0.84375, "rewards/chosen": -1.6862246990203857, "rewards/margins": 1.2479627132415771, "rewards/rejected": -2.934187412261963, "step": 9660 }, { "epoch": 1.67, "grad_norm": 29.767921273426662, "learning_rate": 4.1267390137284725e-08, "logits/chosen": -1.3837589025497437, "logits/rejected": -1.307064414024353, "logps/chosen": -233.31289672851562, "logps/rejected": -396.89337158203125, "loss": 0.3643, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.7717788219451904, "rewards/margins": 1.6641696691513062, "rewards/rejected": -3.435948610305786, "step": 9670 }, { "epoch": 1.67, "grad_norm": 35.63166890222917, "learning_rate": 4.085458114273463e-08, "logits/chosen": -1.3208513259887695, "logits/rejected": -1.2681634426116943, "logps/chosen": -229.1538848876953, "logps/rejected": -348.280029296875, "loss": 0.4797, "rewards/accuracies": 0.78125, "rewards/chosen": -1.7467445135116577, "rewards/margins": 1.2142714262008667, "rewards/rejected": -2.9610159397125244, "step": 9680 }, { "epoch": 1.67, "grad_norm": 37.72367340978869, "learning_rate": 4.044366346631106e-08, "logits/chosen": -1.248106598854065, "logits/rejected": -1.1902921199798584, "logps/chosen": -231.75619506835938, "logps/rejected": -361.8299255371094, "loss": 0.4176, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.7594753503799438, "rewards/margins": 1.3130239248275757, "rewards/rejected": -3.0724992752075195, "step": 9690 }, { "epoch": 1.67, "grad_norm": 29.824443223289826, "learning_rate": 4.00346408239742e-08, "logits/chosen": -1.2740222215652466, "logits/rejected": -1.2087305784225464, "logps/chosen": -237.3057403564453, "logps/rejected": -371.45361328125, "loss": 0.4403, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.8458435535430908, "rewards/margins": 1.3461719751358032, "rewards/rejected": -3.1920151710510254, "step": 9700 }, { "epoch": 1.67, "eval_logits/chosen": -1.417214274406433, "eval_logits/rejected": -1.390296220779419, "eval_logps/chosen": -244.72193908691406, "eval_logps/rejected": -295.69439697265625, "eval_loss": 0.6342071294784546, "eval_rewards/accuracies": 0.6624070405960083, "eval_rewards/chosen": -1.8601804971694946, "eval_rewards/margins": 0.46518951654434204, "eval_rewards/rejected": -2.3253698348999023, "eval_runtime": 357.4423, "eval_samples_per_second": 12.041, "eval_steps_per_second": 1.505, "step": 9700 }, { "epoch": 1.67, "grad_norm": 37.595203889217025, "learning_rate": 3.96275169145473e-08, "logits/chosen": -1.16835618019104, "logits/rejected": -1.1231775283813477, "logps/chosen": -235.44985961914062, "logps/rejected": -346.90167236328125, "loss": 0.4504, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.8184646368026733, "rewards/margins": 1.131596326828003, "rewards/rejected": -2.950061082839966, "step": 9710 }, { "epoch": 1.67, "grad_norm": 45.4457406884452, "learning_rate": 3.922229541968322e-08, "logits/chosen": -1.3615391254425049, "logits/rejected": -1.312709927558899, "logps/chosen": -238.9713134765625, "logps/rejected": -351.1409912109375, "loss": 0.5068, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.8562841415405273, "rewards/margins": 1.1417663097381592, "rewards/rejected": -2.9980504512786865, "step": 9720 }, { "epoch": 1.68, "grad_norm": 33.15996891543965, "learning_rate": 3.881898000383116e-08, "logits/chosen": -1.3564598560333252, "logits/rejected": -1.3007166385650635, "logps/chosen": -208.45571899414062, "logps/rejected": -348.89300537109375, "loss": 0.4111, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -1.5474106073379517, "rewards/margins": 1.3974087238311768, "rewards/rejected": -2.944819450378418, "step": 9730 }, { "epoch": 1.68, "grad_norm": 32.7011999016843, "learning_rate": 3.841757431420351e-08, "logits/chosen": -1.3424409627914429, "logits/rejected": -1.2776859998703003, "logps/chosen": -232.28494262695312, "logps/rejected": -366.96038818359375, "loss": 0.4073, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -1.7781145572662354, "rewards/margins": 1.3590996265411377, "rewards/rejected": -3.137214183807373, "step": 9740 }, { "epoch": 1.68, "grad_norm": 26.93270991808977, "learning_rate": 3.801808198074266e-08, "logits/chosen": -1.367996096611023, "logits/rejected": -1.290379285812378, "logps/chosen": -232.1277313232422, "logps/rejected": -353.48175048828125, "loss": 0.3957, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.7703111171722412, "rewards/margins": 1.271135926246643, "rewards/rejected": -3.0414466857910156, "step": 9750 }, { "epoch": 1.68, "grad_norm": 19.984914469702172, "learning_rate": 3.7620506616088817e-08, "logits/chosen": -1.3691097497940063, "logits/rejected": -1.306137204170227, "logps/chosen": -238.61715698242188, "logps/rejected": -359.2353210449219, "loss": 0.4193, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.8544162511825562, "rewards/margins": 1.225228190422058, "rewards/rejected": -3.079644203186035, "step": 9760 }, { "epoch": 1.68, "grad_norm": 28.256879995074712, "learning_rate": 3.72248518155463e-08, "logits/chosen": -1.2638490200042725, "logits/rejected": -1.198878526687622, "logps/chosen": -214.2969512939453, "logps/rejected": -354.8340148925781, "loss": 0.3748, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.6020538806915283, "rewards/margins": 1.4068208932876587, "rewards/rejected": -3.0088746547698975, "step": 9770 }, { "epoch": 1.69, "grad_norm": 34.80859221520256, "learning_rate": 3.683112115705225e-08, "logits/chosen": -1.3776103258132935, "logits/rejected": -1.2958616018295288, "logps/chosen": -220.2239227294922, "logps/rejected": -346.7090759277344, "loss": 0.4194, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.5932377576828003, "rewards/margins": 1.3576481342315674, "rewards/rejected": -2.9508860111236572, "step": 9780 }, { "epoch": 1.69, "grad_norm": 49.153516151462675, "learning_rate": 3.6439318201143096e-08, "logits/chosen": -1.336469054222107, "logits/rejected": -1.2982733249664307, "logps/chosen": -235.4860382080078, "logps/rejected": -368.72442626953125, "loss": 0.4339, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.8210595846176147, "rewards/margins": 1.2906904220581055, "rewards/rejected": -3.1117501258850098, "step": 9790 }, { "epoch": 1.69, "grad_norm": 45.583738179863275, "learning_rate": 3.604944649092323e-08, "logits/chosen": -1.3702198266983032, "logits/rejected": -1.2832942008972168, "logps/chosen": -227.76315307617188, "logps/rejected": -392.9060363769531, "loss": 0.3633, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.7098026275634766, "rewards/margins": 1.682472586631775, "rewards/rejected": -3.392275333404541, "step": 9800 }, { "epoch": 1.69, "eval_logits/chosen": -1.4198648929595947, "eval_logits/rejected": -1.3927710056304932, "eval_logps/chosen": -244.33860778808594, "eval_logps/rejected": -295.2367248535156, "eval_loss": 0.6346299648284912, "eval_rewards/accuracies": 0.6589219570159912, "eval_rewards/chosen": -1.856347680091858, "eval_rewards/margins": 0.4644457995891571, "eval_rewards/rejected": -2.320793390274048, "eval_runtime": 357.9375, "eval_samples_per_second": 12.024, "eval_steps_per_second": 1.503, "step": 9800 }, { "epoch": 1.69, "grad_norm": 27.982644219340326, "learning_rate": 3.566150955203251e-08, "logits/chosen": -1.2991117238998413, "logits/rejected": -1.2290585041046143, "logps/chosen": -232.1947784423828, "logps/rejected": -362.37738037109375, "loss": 0.438, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.767719030380249, "rewards/margins": 1.3174628019332886, "rewards/rejected": -3.085181951522827, "step": 9810 }, { "epoch": 1.69, "grad_norm": 62.700137314497454, "learning_rate": 3.52755108926146e-08, "logits/chosen": -1.3249415159225464, "logits/rejected": -1.2635711431503296, "logps/chosen": -226.71157836914062, "logps/rejected": -362.6283874511719, "loss": 0.4126, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.7259670495986938, "rewards/margins": 1.360840916633606, "rewards/rejected": -3.0868077278137207, "step": 9820 }, { "epoch": 1.69, "grad_norm": 31.049190619621527, "learning_rate": 3.489145400328511e-08, "logits/chosen": -1.3645232915878296, "logits/rejected": -1.3135316371917725, "logps/chosen": -240.0381317138672, "logps/rejected": -366.198486328125, "loss": 0.4544, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.820874810218811, "rewards/margins": 1.2675881385803223, "rewards/rejected": -3.088463068008423, "step": 9830 }, { "epoch": 1.7, "grad_norm": 31.393346729361205, "learning_rate": 3.4509342357099904e-08, "logits/chosen": -1.3261866569519043, "logits/rejected": -1.248307704925537, "logps/chosen": -227.368896484375, "logps/rejected": -373.2166442871094, "loss": 0.4319, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.7153198719024658, "rewards/margins": 1.4764636754989624, "rewards/rejected": -3.1917834281921387, "step": 9840 }, { "epoch": 1.7, "grad_norm": 28.920399059721735, "learning_rate": 3.4129179409524225e-08, "logits/chosen": -1.3411327600479126, "logits/rejected": -1.2954285144805908, "logps/chosen": -218.49609375, "logps/rejected": -335.6559753417969, "loss": 0.4306, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.639061689376831, "rewards/margins": 1.2008213996887207, "rewards/rejected": -2.8398830890655518, "step": 9850 }, { "epoch": 1.7, "grad_norm": 42.06449748795488, "learning_rate": 3.375096859840071e-08, "logits/chosen": -1.4100219011306763, "logits/rejected": -1.3593881130218506, "logps/chosen": -245.61587524414062, "logps/rejected": -357.762939453125, "loss": 0.4891, "rewards/accuracies": 0.78125, "rewards/chosen": -1.89302659034729, "rewards/margins": 1.1459534168243408, "rewards/rejected": -3.038980007171631, "step": 9860 }, { "epoch": 1.7, "grad_norm": 39.095333220619935, "learning_rate": 3.337471334391903e-08, "logits/chosen": -1.3635808229446411, "logits/rejected": -1.2952406406402588, "logps/chosen": -218.5416717529297, "logps/rejected": -349.49041748046875, "loss": 0.4054, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.658063530921936, "rewards/margins": 1.3279998302459717, "rewards/rejected": -2.9860637187957764, "step": 9870 }, { "epoch": 1.7, "grad_norm": 26.222603081146218, "learning_rate": 3.300041704858425e-08, "logits/chosen": -1.2638031244277954, "logits/rejected": -1.2089149951934814, "logps/chosen": -224.624755859375, "logps/rejected": -374.73345947265625, "loss": 0.3794, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.7256309986114502, "rewards/margins": 1.4671021699905396, "rewards/rejected": -3.1927330493927, "step": 9880 }, { "epoch": 1.7, "grad_norm": 32.364868235731464, "learning_rate": 3.262808309718668e-08, "logits/chosen": -1.2537232637405396, "logits/rejected": -1.204134225845337, "logps/chosen": -237.5514373779297, "logps/rejected": -368.892578125, "loss": 0.4235, "rewards/accuracies": 0.78125, "rewards/chosen": -1.8114869594573975, "rewards/margins": 1.3386226892471313, "rewards/rejected": -3.1501097679138184, "step": 9890 }, { "epoch": 1.71, "grad_norm": 42.03184187301272, "learning_rate": 3.2257714856770866e-08, "logits/chosen": -1.3525625467300415, "logits/rejected": -1.2814313173294067, "logps/chosen": -209.0051727294922, "logps/rejected": -360.9290466308594, "loss": 0.3727, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.5728161334991455, "rewards/margins": 1.538694977760315, "rewards/rejected": -3.111510992050171, "step": 9900 }, { "epoch": 1.71, "eval_logits/chosen": -1.424879550933838, "eval_logits/rejected": -1.397822618484497, "eval_logps/chosen": -246.3584747314453, "eval_logps/rejected": -297.6012878417969, "eval_loss": 0.6336408853530884, "eval_rewards/accuracies": 0.6556691527366638, "eval_rewards/chosen": -1.876546025276184, "eval_rewards/margins": 0.46789297461509705, "eval_rewards/rejected": -2.3444390296936035, "eval_runtime": 357.1672, "eval_samples_per_second": 12.05, "eval_steps_per_second": 1.506, "step": 9900 }, { "epoch": 1.71, "grad_norm": 26.80328363192874, "learning_rate": 3.1889315676605325e-08, "logits/chosen": -1.4107153415679932, "logits/rejected": -1.322643756866455, "logps/chosen": -216.0201873779297, "logps/rejected": -352.2988586425781, "loss": 0.4129, "rewards/accuracies": 0.8125, "rewards/chosen": -1.5847244262695312, "rewards/margins": 1.4318273067474365, "rewards/rejected": -3.0165517330169678, "step": 9910 }, { "epoch": 1.71, "grad_norm": 31.18899577015784, "learning_rate": 3.152288888815227e-08, "logits/chosen": -1.3761640787124634, "logits/rejected": -1.3048861026763916, "logps/chosen": -222.0412139892578, "logps/rejected": -367.63140869140625, "loss": 0.3517, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -1.6936910152435303, "rewards/margins": 1.4894893169403076, "rewards/rejected": -3.183180332183838, "step": 9920 }, { "epoch": 1.71, "grad_norm": 35.38732271471543, "learning_rate": 3.1158437805037296e-08, "logits/chosen": -1.3241338729858398, "logits/rejected": -1.276207685470581, "logps/chosen": -219.0576171875, "logps/rejected": -356.7098693847656, "loss": 0.4241, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.671561598777771, "rewards/margins": 1.347208023071289, "rewards/rejected": -3.0187695026397705, "step": 9930 }, { "epoch": 1.71, "grad_norm": 24.595744118468183, "learning_rate": 3.079596572301965e-08, "logits/chosen": -1.4158904552459717, "logits/rejected": -1.3684117794036865, "logps/chosen": -236.92745971679688, "logps/rejected": -355.1565246582031, "loss": 0.4652, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.8187148571014404, "rewards/margins": 1.125727653503418, "rewards/rejected": -2.9444425106048584, "step": 9940 }, { "epoch": 1.71, "grad_norm": 59.09049440288645, "learning_rate": 3.043547591996226e-08, "logits/chosen": -1.3634991645812988, "logits/rejected": -1.2813036441802979, "logps/chosen": -228.6792755126953, "logps/rejected": -375.29547119140625, "loss": 0.3893, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.7471336126327515, "rewards/margins": 1.4925024509429932, "rewards/rejected": -3.239635944366455, "step": 9950 }, { "epoch": 1.72, "grad_norm": 50.2107128049137, "learning_rate": 3.0076971655802196e-08, "logits/chosen": -1.3980926275253296, "logits/rejected": -1.3432905673980713, "logps/chosen": -247.3916015625, "logps/rejected": -370.38458251953125, "loss": 0.4546, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.9397674798965454, "rewards/margins": 1.2183306217193604, "rewards/rejected": -3.158097743988037, "step": 9960 }, { "epoch": 1.72, "grad_norm": 31.69304656950342, "learning_rate": 2.972045617252114e-08, "logits/chosen": -1.37814462184906, "logits/rejected": -1.328137755393982, "logps/chosen": -225.2104949951172, "logps/rejected": -350.4515075683594, "loss": 0.4552, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.7387615442276, "rewards/margins": 1.2483638525009155, "rewards/rejected": -2.9871251583099365, "step": 9970 }, { "epoch": 1.72, "grad_norm": 18.89889146936782, "learning_rate": 2.9365932694115913e-08, "logits/chosen": -1.2746312618255615, "logits/rejected": -1.2151672840118408, "logps/chosen": -238.1916046142578, "logps/rejected": -384.50164794921875, "loss": 0.4013, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.8541069030761719, "rewards/margins": 1.459223747253418, "rewards/rejected": -3.3133304119110107, "step": 9980 }, { "epoch": 1.72, "grad_norm": 42.47918178591677, "learning_rate": 2.9013404426569855e-08, "logits/chosen": -1.3667715787887573, "logits/rejected": -1.2965288162231445, "logps/chosen": -238.6469268798828, "logps/rejected": -354.388427734375, "loss": 0.4658, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.8494361639022827, "rewards/margins": 1.195669412612915, "rewards/rejected": -3.045105457305908, "step": 9990 }, { "epoch": 1.72, "grad_norm": 21.723980366758628, "learning_rate": 2.8662874557823013e-08, "logits/chosen": -1.37138831615448, "logits/rejected": -1.3194690942764282, "logps/chosen": -232.44027709960938, "logps/rejected": -357.5736389160156, "loss": 0.424, "rewards/accuracies": 0.78125, "rewards/chosen": -1.7671289443969727, "rewards/margins": 1.239452600479126, "rewards/rejected": -3.0065817832946777, "step": 10000 }, { "epoch": 1.72, "eval_logits/chosen": -1.4226433038711548, "eval_logits/rejected": -1.3957637548446655, "eval_logps/chosen": -245.6855010986328, "eval_logps/rejected": -296.6435852050781, "eval_loss": 0.6344332098960876, "eval_rewards/accuracies": 0.6514869928359985, "eval_rewards/chosen": -1.8698163032531738, "eval_rewards/margins": 0.4650455117225647, "eval_rewards/rejected": -2.334861993789673, "eval_runtime": 357.1222, "eval_samples_per_second": 12.052, "eval_steps_per_second": 1.506, "step": 10000 }, { "epoch": 1.72, "grad_norm": 19.257035851897566, "learning_rate": 2.8314346257744177e-08, "logits/chosen": -1.3713723421096802, "logits/rejected": -1.3086481094360352, "logps/chosen": -225.0717010498047, "logps/rejected": -362.225830078125, "loss": 0.3966, "rewards/accuracies": 0.8125, "rewards/chosen": -1.7269824743270874, "rewards/margins": 1.3675779104232788, "rewards/rejected": -3.094560146331787, "step": 10010 }, { "epoch": 1.73, "grad_norm": 33.76415450194936, "learning_rate": 2.7967822678101466e-08, "logits/chosen": -1.3136873245239258, "logits/rejected": -1.243048071861267, "logps/chosen": -228.7400360107422, "logps/rejected": -360.3517761230469, "loss": 0.411, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.75080144405365, "rewards/margins": 1.3429168462753296, "rewards/rejected": -3.0937180519104004, "step": 10020 }, { "epoch": 1.73, "grad_norm": 30.936869629766193, "learning_rate": 2.7623306952534316e-08, "logits/chosen": -1.3288711309432983, "logits/rejected": -1.260801076889038, "logps/chosen": -239.26663208007812, "logps/rejected": -358.018798828125, "loss": 0.4206, "rewards/accuracies": 0.78125, "rewards/chosen": -1.8187427520751953, "rewards/margins": 1.2403138875961304, "rewards/rejected": -3.0590567588806152, "step": 10030 }, { "epoch": 1.73, "grad_norm": 23.395127833799734, "learning_rate": 2.728080219652504e-08, "logits/chosen": -1.509854793548584, "logits/rejected": -1.4493193626403809, "logps/chosen": -229.5972900390625, "logps/rejected": -358.5816345214844, "loss": 0.4241, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.7475786209106445, "rewards/margins": 1.2939976453781128, "rewards/rejected": -3.0415761470794678, "step": 10040 }, { "epoch": 1.73, "grad_norm": 36.993355629776445, "learning_rate": 2.694031150737036e-08, "logits/chosen": -1.3353779315948486, "logits/rejected": -1.2893092632293701, "logps/chosen": -227.55111694335938, "logps/rejected": -348.4396057128906, "loss": 0.4256, "rewards/accuracies": 0.8125, "rewards/chosen": -1.7357978820800781, "rewards/margins": 1.1867949962615967, "rewards/rejected": -2.922593355178833, "step": 10050 }, { "epoch": 1.73, "grad_norm": 40.38498271374321, "learning_rate": 2.6601837964153996e-08, "logits/chosen": -1.2823264598846436, "logits/rejected": -1.2305335998535156, "logps/chosen": -225.500732421875, "logps/rejected": -361.58197021484375, "loss": 0.4373, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.7677650451660156, "rewards/margins": 1.3243902921676636, "rewards/rejected": -3.0921549797058105, "step": 10060 }, { "epoch": 1.74, "grad_norm": 28.019376221143933, "learning_rate": 2.6265384627718046e-08, "logits/chosen": -1.2700541019439697, "logits/rejected": -1.215319037437439, "logps/chosen": -226.7916717529297, "logps/rejected": -368.22149658203125, "loss": 0.3848, "rewards/accuracies": 0.84375, "rewards/chosen": -1.7437423467636108, "rewards/margins": 1.4138736724853516, "rewards/rejected": -3.157615900039673, "step": 10070 }, { "epoch": 1.74, "grad_norm": 37.67411721104956, "learning_rate": 2.593095454063615e-08, "logits/chosen": -1.3826894760131836, "logits/rejected": -1.3214690685272217, "logps/chosen": -222.34335327148438, "logps/rejected": -359.19488525390625, "loss": 0.4212, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.6584384441375732, "rewards/margins": 1.3830474615097046, "rewards/rejected": -3.0414860248565674, "step": 10080 }, { "epoch": 1.74, "grad_norm": 24.463559826579306, "learning_rate": 2.5598550727185142e-08, "logits/chosen": -1.3830634355545044, "logits/rejected": -1.3127849102020264, "logps/chosen": -223.80313110351562, "logps/rejected": -375.8487548828125, "loss": 0.39, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.6944698095321655, "rewards/margins": 1.4869707822799683, "rewards/rejected": -3.181440591812134, "step": 10090 }, { "epoch": 1.74, "grad_norm": 35.57829724815369, "learning_rate": 2.5268176193318473e-08, "logits/chosen": -1.349346399307251, "logits/rejected": -1.2962137460708618, "logps/chosen": -229.2544403076172, "logps/rejected": -371.0001525878906, "loss": 0.3867, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.742475152015686, "rewards/margins": 1.4224704504013062, "rewards/rejected": -3.164945602416992, "step": 10100 }, { "epoch": 1.74, "eval_logits/chosen": -1.4281973838806152, "eval_logits/rejected": -1.401401162147522, "eval_logps/chosen": -242.6608123779297, "eval_logps/rejected": -292.8902587890625, "eval_loss": 0.6348100900650024, "eval_rewards/accuracies": 0.6610130071640015, "eval_rewards/chosen": -1.8395695686340332, "eval_rewards/margins": 0.4577590227127075, "eval_rewards/rejected": -2.297328472137451, "eval_runtime": 357.2066, "eval_samples_per_second": 12.049, "eval_steps_per_second": 1.506, "step": 10100 }, { "epoch": 1.74, "grad_norm": 41.259295901796634, "learning_rate": 2.4939833926638397e-08, "logits/chosen": -1.379417896270752, "logits/rejected": -1.333963394165039, "logps/chosen": -243.1998748779297, "logps/rejected": -391.07293701171875, "loss": 0.4012, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.9264335632324219, "rewards/margins": 1.4382061958312988, "rewards/rejected": -3.3646399974823, "step": 10110 }, { "epoch": 1.74, "grad_norm": 26.28585974209364, "learning_rate": 2.4613526896369307e-08, "logits/chosen": -1.3697757720947266, "logits/rejected": -1.3050405979156494, "logps/chosen": -231.44277954101562, "logps/rejected": -369.87445068359375, "loss": 0.3817, "rewards/accuracies": 0.8125, "rewards/chosen": -1.777197241783142, "rewards/margins": 1.4158128499984741, "rewards/rejected": -3.193009853363037, "step": 10120 }, { "epoch": 1.75, "grad_norm": 30.6614965261688, "learning_rate": 2.428925805333082e-08, "logits/chosen": -1.3869436979293823, "logits/rejected": -1.3147681951522827, "logps/chosen": -221.4283905029297, "logps/rejected": -376.77764892578125, "loss": 0.3521, "rewards/accuracies": 0.84375, "rewards/chosen": -1.6809848546981812, "rewards/margins": 1.5506279468536377, "rewards/rejected": -3.2316126823425293, "step": 10130 }, { "epoch": 1.75, "grad_norm": 40.486241150607576, "learning_rate": 2.396703032991107e-08, "logits/chosen": -1.3461048603057861, "logits/rejected": -1.2657787799835205, "logps/chosen": -243.0239715576172, "logps/rejected": -370.98052978515625, "loss": 0.4374, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.8258941173553467, "rewards/margins": 1.3444820642471313, "rewards/rejected": -3.1703763008117676, "step": 10140 }, { "epoch": 1.75, "grad_norm": 31.540425471498317, "learning_rate": 2.3646846640040158e-08, "logits/chosen": -1.2644100189208984, "logits/rejected": -1.1996811628341675, "logps/chosen": -238.3133087158203, "logps/rejected": -371.70947265625, "loss": 0.4298, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.8294227123260498, "rewards/margins": 1.333860158920288, "rewards/rejected": -3.163282871246338, "step": 10150 }, { "epoch": 1.75, "grad_norm": 29.9318411012178, "learning_rate": 2.332870987916383e-08, "logits/chosen": -1.3238608837127686, "logits/rejected": -1.2526966333389282, "logps/chosen": -222.04052734375, "logps/rejected": -380.37548828125, "loss": 0.3546, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -1.6999727487564087, "rewards/margins": 1.5818251371383667, "rewards/rejected": -3.2817981243133545, "step": 10160 }, { "epoch": 1.75, "grad_norm": 34.151684842456746, "learning_rate": 2.3012622924217323e-08, "logits/chosen": -1.3320282697677612, "logits/rejected": -1.272220253944397, "logps/chosen": -232.2387237548828, "logps/rejected": -378.3623046875, "loss": 0.4196, "rewards/accuracies": 0.84375, "rewards/chosen": -1.7736480236053467, "rewards/margins": 1.466052770614624, "rewards/rejected": -3.2397007942199707, "step": 10170 }, { "epoch": 1.75, "grad_norm": 21.211664057036547, "learning_rate": 2.2698588633599357e-08, "logits/chosen": -1.2500728368759155, "logits/rejected": -1.1695213317871094, "logps/chosen": -226.1921844482422, "logps/rejected": -385.45855712890625, "loss": 0.3777, "rewards/accuracies": 0.84375, "rewards/chosen": -1.737788438796997, "rewards/margins": 1.583823800086975, "rewards/rejected": -3.321612596511841, "step": 10180 }, { "epoch": 1.76, "grad_norm": 56.5968336109649, "learning_rate": 2.2386609847146077e-08, "logits/chosen": -1.2704510688781738, "logits/rejected": -1.2027629613876343, "logps/chosen": -225.5328826904297, "logps/rejected": -359.3175354003906, "loss": 0.4261, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.7456719875335693, "rewards/margins": 1.333605408668518, "rewards/rejected": -3.079277515411377, "step": 10190 }, { "epoch": 1.76, "grad_norm": 24.63754465967521, "learning_rate": 2.2076689386105824e-08, "logits/chosen": -1.3379052877426147, "logits/rejected": -1.2737504243850708, "logps/chosen": -230.8819580078125, "logps/rejected": -371.2557067871094, "loss": 0.3851, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.7848135232925415, "rewards/margins": 1.4187109470367432, "rewards/rejected": -3.203524351119995, "step": 10200 }, { "epoch": 1.76, "eval_logits/chosen": -1.3974082469940186, "eval_logits/rejected": -1.369729995727539, "eval_logps/chosen": -254.59268188476562, "eval_logps/rejected": -307.6221618652344, "eval_loss": 0.6357866525650024, "eval_rewards/accuracies": 0.660780668258667, "eval_rewards/chosen": -1.9588884115219116, "eval_rewards/margins": 0.4857591688632965, "eval_rewards/rejected": -2.444647789001465, "eval_runtime": 356.793, "eval_samples_per_second": 12.063, "eval_steps_per_second": 1.508, "step": 10200 }, { "epoch": 1.76, "grad_norm": 54.095086047288085, "learning_rate": 2.176883005311303e-08, "logits/chosen": -1.361449122428894, "logits/rejected": -1.3111120462417603, "logps/chosen": -245.9110565185547, "logps/rejected": -408.2243957519531, "loss": 0.3661, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.9192225933074951, "rewards/margins": 1.5922331809997559, "rewards/rejected": -3.511455535888672, "step": 10210 }, { "epoch": 1.76, "grad_norm": 47.443592896329925, "learning_rate": 2.1463034632163535e-08, "logits/chosen": -1.2670648097991943, "logits/rejected": -1.2201905250549316, "logps/chosen": -241.36117553710938, "logps/rejected": -377.2726135253906, "loss": 0.4243, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.9298385381698608, "rewards/margins": 1.3317430019378662, "rewards/rejected": -3.2615818977355957, "step": 10220 }, { "epoch": 1.76, "grad_norm": 27.354616295829352, "learning_rate": 2.1159305888588664e-08, "logits/chosen": -1.2255061864852905, "logits/rejected": -1.15509033203125, "logps/chosen": -232.21240234375, "logps/rejected": -373.4509582519531, "loss": 0.446, "rewards/accuracies": 0.78125, "rewards/chosen": -1.7877719402313232, "rewards/margins": 1.4283082485198975, "rewards/rejected": -3.2160804271698, "step": 10230 }, { "epoch": 1.76, "grad_norm": 18.82682331923304, "learning_rate": 2.085764656903105e-08, "logits/chosen": -1.308882236480713, "logits/rejected": -1.2245006561279297, "logps/chosen": -224.0420684814453, "logps/rejected": -410.17626953125, "loss": 0.3153, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -1.7259820699691772, "rewards/margins": 1.8521201610565186, "rewards/rejected": -3.5781021118164062, "step": 10240 }, { "epoch": 1.77, "grad_norm": 47.39267774633412, "learning_rate": 2.055805940141897e-08, "logits/chosen": -1.2967278957366943, "logits/rejected": -1.2207520008087158, "logps/chosen": -248.3931427001953, "logps/rejected": -379.16094970703125, "loss": 0.3673, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.9161218404769897, "rewards/margins": 1.378807783126831, "rewards/rejected": -3.2949295043945312, "step": 10250 }, { "epoch": 1.77, "grad_norm": 29.996102972446575, "learning_rate": 2.0260547094942348e-08, "logits/chosen": -1.2790513038635254, "logits/rejected": -1.2392971515655518, "logps/chosen": -238.34115600585938, "logps/rejected": -378.8802185058594, "loss": 0.4199, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.8511323928833008, "rewards/margins": 1.3689521551132202, "rewards/rejected": -3.2200846672058105, "step": 10260 }, { "epoch": 1.77, "grad_norm": 40.28205321720298, "learning_rate": 1.9965112340027874e-08, "logits/chosen": -1.300189733505249, "logits/rejected": -1.2424399852752686, "logps/chosen": -245.79641723632812, "logps/rejected": -382.1834411621094, "loss": 0.409, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.9040024280548096, "rewards/margins": 1.361567497253418, "rewards/rejected": -3.2655701637268066, "step": 10270 }, { "epoch": 1.77, "grad_norm": 30.493851403701377, "learning_rate": 1.9671757808314675e-08, "logits/chosen": -1.2651712894439697, "logits/rejected": -1.2117892503738403, "logps/chosen": -252.64151000976562, "logps/rejected": -374.02337646484375, "loss": 0.4544, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.9832038879394531, "rewards/margins": 1.2268593311309814, "rewards/rejected": -3.2100627422332764, "step": 10280 }, { "epoch": 1.77, "grad_norm": 32.40691710505685, "learning_rate": 1.9380486152630548e-08, "logits/chosen": -1.2505433559417725, "logits/rejected": -1.1942849159240723, "logps/chosen": -229.8437042236328, "logps/rejected": -385.4642639160156, "loss": 0.4206, "rewards/accuracies": 0.8125, "rewards/chosen": -1.7842499017715454, "rewards/margins": 1.5160019397735596, "rewards/rejected": -3.3002521991729736, "step": 10290 }, { "epoch": 1.77, "grad_norm": 39.84324911925736, "learning_rate": 1.909130000696732e-08, "logits/chosen": -1.325438380241394, "logits/rejected": -1.2682257890701294, "logps/chosen": -235.05142211914062, "logps/rejected": -361.3345031738281, "loss": 0.4322, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.8042240142822266, "rewards/margins": 1.2643927335739136, "rewards/rejected": -3.068617105484009, "step": 10300 }, { "epoch": 1.77, "eval_logits/chosen": -1.4002227783203125, "eval_logits/rejected": -1.3728564977645874, "eval_logps/chosen": -252.0375518798828, "eval_logps/rejected": -304.3727722167969, "eval_loss": 0.6351932287216187, "eval_rewards/accuracies": 0.6584572196006775, "eval_rewards/chosen": -1.9333373308181763, "eval_rewards/margins": 0.4788166582584381, "eval_rewards/rejected": -2.412153959274292, "eval_runtime": 356.5282, "eval_samples_per_second": 12.072, "eval_steps_per_second": 1.509, "step": 10300 }, { "epoch": 1.78, "grad_norm": 22.524340645441857, "learning_rate": 1.8804201986457742e-08, "logits/chosen": -1.261060357093811, "logits/rejected": -1.1999661922454834, "logps/chosen": -247.80233764648438, "logps/rejected": -386.2306213378906, "loss": 0.4087, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.9156711101531982, "rewards/margins": 1.4324285984039307, "rewards/rejected": -3.34809947013855, "step": 10310 }, { "epoch": 1.78, "grad_norm": 46.482526842999754, "learning_rate": 1.851919468735119e-08, "logits/chosen": -1.315914511680603, "logits/rejected": -1.2488597631454468, "logps/chosen": -235.482666015625, "logps/rejected": -366.1687316894531, "loss": 0.4226, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.7874408960342407, "rewards/margins": 1.3392503261566162, "rewards/rejected": -3.1266913414001465, "step": 10320 }, { "epoch": 1.78, "grad_norm": 55.06877120295062, "learning_rate": 1.8236280686990653e-08, "logits/chosen": -1.3398211002349854, "logits/rejected": -1.2758935689926147, "logps/chosen": -230.93673706054688, "logps/rejected": -373.24432373046875, "loss": 0.3857, "rewards/accuracies": 0.84375, "rewards/chosen": -1.7879480123519897, "rewards/margins": 1.3995113372802734, "rewards/rejected": -3.1874594688415527, "step": 10330 }, { "epoch": 1.78, "grad_norm": 31.59403091184685, "learning_rate": 1.795546254378927e-08, "logits/chosen": -1.3418684005737305, "logits/rejected": -1.2638609409332275, "logps/chosen": -231.3434295654297, "logps/rejected": -384.9586181640625, "loss": 0.3837, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.7527868747711182, "rewards/margins": 1.5512384176254272, "rewards/rejected": -3.304025173187256, "step": 10340 }, { "epoch": 1.78, "grad_norm": 44.48450974185094, "learning_rate": 1.7676742797207045e-08, "logits/chosen": -1.4229285717010498, "logits/rejected": -1.3561543226242065, "logps/chosen": -241.94143676757812, "logps/rejected": -376.6054992675781, "loss": 0.4181, "rewards/accuracies": 0.78125, "rewards/chosen": -1.878838300704956, "rewards/margins": 1.376997470855713, "rewards/rejected": -3.255835771560669, "step": 10350 }, { "epoch": 1.78, "grad_norm": 40.416266152745024, "learning_rate": 1.740012396772819e-08, "logits/chosen": -1.2647850513458252, "logits/rejected": -1.1986753940582275, "logps/chosen": -245.0532684326172, "logps/rejected": -359.286865234375, "loss": 0.4955, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.891425371170044, "rewards/margins": 1.173485279083252, "rewards/rejected": -3.064910411834717, "step": 10360 }, { "epoch": 1.79, "grad_norm": 29.777426788141348, "learning_rate": 1.7125608556838035e-08, "logits/chosen": -1.1616214513778687, "logits/rejected": -1.0960752964019775, "logps/chosen": -223.65115356445312, "logps/rejected": -352.9761047363281, "loss": 0.4088, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.7168071269989014, "rewards/margins": 1.2844127416610718, "rewards/rejected": -3.001220226287842, "step": 10370 }, { "epoch": 1.79, "grad_norm": 51.70883721503306, "learning_rate": 1.6853199047000584e-08, "logits/chosen": -1.3133689165115356, "logits/rejected": -1.2742842435836792, "logps/chosen": -249.2805633544922, "logps/rejected": -336.363037109375, "loss": 0.5641, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.974237084388733, "rewards/margins": 0.8665148019790649, "rewards/rejected": -2.840752124786377, "step": 10380 }, { "epoch": 1.79, "grad_norm": 21.280760614032122, "learning_rate": 1.6582897901636027e-08, "logits/chosen": -1.3474326133728027, "logits/rejected": -1.2722728252410889, "logps/chosen": -228.17819213867188, "logps/rejected": -372.76544189453125, "loss": 0.391, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.7238601446151733, "rewards/margins": 1.463079571723938, "rewards/rejected": -3.1869397163391113, "step": 10390 }, { "epoch": 1.79, "grad_norm": 23.04577762515264, "learning_rate": 1.6314707565098395e-08, "logits/chosen": -1.2872307300567627, "logits/rejected": -1.2240254878997803, "logps/chosen": -246.9097900390625, "logps/rejected": -409.812255859375, "loss": 0.3405, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.9479862451553345, "rewards/margins": 1.5971990823745728, "rewards/rejected": -3.5451855659484863, "step": 10400 }, { "epoch": 1.79, "eval_logits/chosen": -1.411521315574646, "eval_logits/rejected": -1.384405493736267, "eval_logps/chosen": -247.26951599121094, "eval_logps/rejected": -298.5337219238281, "eval_loss": 0.6351563334465027, "eval_rewards/accuracies": 0.660780668258667, "eval_rewards/chosen": -1.885656714439392, "eval_rewards/margins": 0.46810635924339294, "eval_rewards/rejected": -2.3537631034851074, "eval_runtime": 356.5783, "eval_samples_per_second": 12.07, "eval_steps_per_second": 1.509, "step": 10400 }, { "epoch": 1.79, "grad_norm": 29.30147034557462, "learning_rate": 1.6048630462653616e-08, "logits/chosen": -1.2767484188079834, "logits/rejected": -1.2087651491165161, "logps/chosen": -246.88467407226562, "logps/rejected": -367.8938293457031, "loss": 0.4398, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.9034740924835205, "rewards/margins": 1.2447469234466553, "rewards/rejected": -3.1482207775115967, "step": 10410 }, { "epoch": 1.8, "grad_norm": 50.80138734136231, "learning_rate": 1.578466900045733e-08, "logits/chosen": -1.3154891729354858, "logits/rejected": -1.2465871572494507, "logps/chosen": -237.7150421142578, "logps/rejected": -368.3978576660156, "loss": 0.4055, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.8362977504730225, "rewards/margins": 1.3066072463989258, "rewards/rejected": -3.142904758453369, "step": 10420 }, { "epoch": 1.8, "grad_norm": 28.18485959040931, "learning_rate": 1.5522825565533442e-08, "logits/chosen": -1.3946081399917603, "logits/rejected": -1.3362939357757568, "logps/chosen": -232.037109375, "logps/rejected": -357.0254211425781, "loss": 0.4289, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.7716169357299805, "rewards/margins": 1.2683614492416382, "rewards/rejected": -3.039978504180908, "step": 10430 }, { "epoch": 1.8, "grad_norm": 27.574863749179777, "learning_rate": 1.526310252575222e-08, "logits/chosen": -1.4324071407318115, "logits/rejected": -1.380183219909668, "logps/chosen": -236.1354217529297, "logps/rejected": -360.7831115722656, "loss": 0.4444, "rewards/accuracies": 0.84375, "rewards/chosen": -1.7827268838882446, "rewards/margins": 1.2522282600402832, "rewards/rejected": -3.0349552631378174, "step": 10440 }, { "epoch": 1.8, "grad_norm": 34.86665149520393, "learning_rate": 1.500550222980923e-08, "logits/chosen": -1.3539096117019653, "logits/rejected": -1.3092883825302124, "logps/chosen": -238.3639678955078, "logps/rejected": -363.4995422363281, "loss": 0.4144, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.8473567962646484, "rewards/margins": 1.211411476135254, "rewards/rejected": -3.0587682723999023, "step": 10450 }, { "epoch": 1.8, "grad_norm": 32.44019457079582, "learning_rate": 1.4750027007203653e-08, "logits/chosen": -1.3425335884094238, "logits/rejected": -1.2749333381652832, "logps/chosen": -229.01583862304688, "logps/rejected": -354.47314453125, "loss": 0.423, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.7443774938583374, "rewards/margins": 1.2702993154525757, "rewards/rejected": -3.014676809310913, "step": 10460 }, { "epoch": 1.8, "grad_norm": 57.913238056684506, "learning_rate": 1.4496679168217646e-08, "logits/chosen": -1.1968591213226318, "logits/rejected": -1.1376616954803467, "logps/chosen": -241.31689453125, "logps/rejected": -365.15399169921875, "loss": 0.4734, "rewards/accuracies": 0.75, "rewards/chosen": -1.8941717147827148, "rewards/margins": 1.2496927976608276, "rewards/rejected": -3.143864393234253, "step": 10470 }, { "epoch": 1.81, "grad_norm": 26.776034814613833, "learning_rate": 1.4245461003895232e-08, "logits/chosen": -1.3491319417953491, "logits/rejected": -1.2737585306167603, "logps/chosen": -229.96826171875, "logps/rejected": -374.3705749511719, "loss": 0.4425, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.740369439125061, "rewards/margins": 1.4796626567840576, "rewards/rejected": -3.220032215118408, "step": 10480 }, { "epoch": 1.81, "grad_norm": 27.83311511088631, "learning_rate": 1.3996374786021642e-08, "logits/chosen": -1.3122376203536987, "logits/rejected": -1.241257905960083, "logps/chosen": -222.9138641357422, "logps/rejected": -362.92169189453125, "loss": 0.4151, "rewards/accuracies": 0.78125, "rewards/chosen": -1.691291093826294, "rewards/margins": 1.4125699996948242, "rewards/rejected": -3.1038613319396973, "step": 10490 }, { "epoch": 1.81, "grad_norm": 34.16101354006198, "learning_rate": 1.3749422767102698e-08, "logits/chosen": -1.3112982511520386, "logits/rejected": -1.2444483041763306, "logps/chosen": -234.7496337890625, "logps/rejected": -384.12420654296875, "loss": 0.424, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.820643424987793, "rewards/margins": 1.4697020053863525, "rewards/rejected": -3.2903454303741455, "step": 10500 }, { "epoch": 1.81, "eval_logits/chosen": -1.411332368850708, "eval_logits/rejected": -1.3842883110046387, "eval_logps/chosen": -246.45022583007812, "eval_logps/rejected": -297.5495300292969, "eval_loss": 0.6351029276847839, "eval_rewards/accuracies": 0.6598513126373291, "eval_rewards/chosen": -1.8774638175964355, "eval_rewards/margins": 0.46645745635032654, "eval_rewards/rejected": -2.343921184539795, "eval_runtime": 356.9292, "eval_samples_per_second": 12.058, "eval_steps_per_second": 1.507, "step": 10500 }, { "epoch": 1.81, "grad_norm": 36.69075640202483, "learning_rate": 1.3504607180344463e-08, "logits/chosen": -1.3228697776794434, "logits/rejected": -1.2543773651123047, "logps/chosen": -228.5086669921875, "logps/rejected": -364.3096923828125, "loss": 0.415, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.7290817499160767, "rewards/margins": 1.3821680545806885, "rewards/rejected": -3.1112494468688965, "step": 10510 }, { "epoch": 1.81, "grad_norm": 46.69963744425329, "learning_rate": 1.3261930239633261e-08, "logits/chosen": -1.4053773880004883, "logits/rejected": -1.3577836751937866, "logps/chosen": -221.27139282226562, "logps/rejected": -369.21697998046875, "loss": 0.4035, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.7077957391738892, "rewards/margins": 1.4389338493347168, "rewards/rejected": -3.1467297077178955, "step": 10520 }, { "epoch": 1.81, "grad_norm": 35.672432590977046, "learning_rate": 1.3021394139515197e-08, "logits/chosen": -1.2971795797348022, "logits/rejected": -1.2416961193084717, "logps/chosen": -233.1261444091797, "logps/rejected": -350.49969482421875, "loss": 0.4311, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.7978441715240479, "rewards/margins": 1.185850977897644, "rewards/rejected": -2.9836955070495605, "step": 10530 }, { "epoch": 1.82, "grad_norm": 25.051590405247744, "learning_rate": 1.2783001055176907e-08, "logits/chosen": -1.2430702447891235, "logits/rejected": -1.1768423318862915, "logps/chosen": -227.86788940429688, "logps/rejected": -368.8069152832031, "loss": 0.3771, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.7555568218231201, "rewards/margins": 1.4385837316513062, "rewards/rejected": -3.194140672683716, "step": 10540 }, { "epoch": 1.82, "grad_norm": 48.523638164139314, "learning_rate": 1.2546753142425315e-08, "logits/chosen": -1.4035842418670654, "logits/rejected": -1.3484973907470703, "logps/chosen": -241.98580932617188, "logps/rejected": -391.2903137207031, "loss": 0.3919, "rewards/accuracies": 0.875, "rewards/chosen": -1.8824710845947266, "rewards/margins": 1.472904920578003, "rewards/rejected": -3.3553764820098877, "step": 10550 }, { "epoch": 1.82, "grad_norm": 31.996639068258908, "learning_rate": 1.2312652537668499e-08, "logits/chosen": -1.2917983531951904, "logits/rejected": -1.2220714092254639, "logps/chosen": -226.38491821289062, "logps/rejected": -382.4876403808594, "loss": 0.4064, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.7078090906143188, "rewards/margins": 1.5710179805755615, "rewards/rejected": -3.278826951980591, "step": 10560 }, { "epoch": 1.82, "grad_norm": 39.94758110559314, "learning_rate": 1.2080701357896267e-08, "logits/chosen": -1.361748456954956, "logits/rejected": -1.3035575151443481, "logps/chosen": -242.5690460205078, "logps/rejected": -395.84844970703125, "loss": 0.3614, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.866334319114685, "rewards/margins": 1.5249965190887451, "rewards/rejected": -3.3913307189941406, "step": 10570 }, { "epoch": 1.82, "grad_norm": 34.466730835739696, "learning_rate": 1.185090170066097e-08, "logits/chosen": -1.3756376504898071, "logits/rejected": -1.310418963432312, "logps/chosen": -224.53564453125, "logps/rejected": -363.5881652832031, "loss": 0.4041, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.711679220199585, "rewards/margins": 1.3960368633270264, "rewards/rejected": -3.1077163219451904, "step": 10580 }, { "epoch": 1.82, "grad_norm": 25.78656181126498, "learning_rate": 1.1623255644058638e-08, "logits/chosen": -1.3188053369522095, "logits/rejected": -1.2425676584243774, "logps/chosen": -226.51016235351562, "logps/rejected": -357.7701721191406, "loss": 0.4113, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.7062441110610962, "rewards/margins": 1.3722339868545532, "rewards/rejected": -3.0784783363342285, "step": 10590 }, { "epoch": 1.83, "grad_norm": 34.38272700142523, "learning_rate": 1.1397765246710072e-08, "logits/chosen": -1.3734514713287354, "logits/rejected": -1.3211066722869873, "logps/chosen": -227.74270629882812, "logps/rejected": -366.80364990234375, "loss": 0.4396, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.7487207651138306, "rewards/margins": 1.372809648513794, "rewards/rejected": -3.121530294418335, "step": 10600 }, { "epoch": 1.83, "eval_logits/chosen": -1.4146060943603516, "eval_logits/rejected": -1.3876330852508545, "eval_logps/chosen": -246.19650268554688, "eval_logps/rejected": -297.2034912109375, "eval_loss": 0.6350103616714478, "eval_rewards/accuracies": 0.6568308472633362, "eval_rewards/chosen": -1.8749263286590576, "eval_rewards/margins": 0.4655349552631378, "eval_rewards/rejected": -2.340461254119873, "eval_runtime": 357.4305, "eval_samples_per_second": 12.042, "eval_steps_per_second": 1.505, "step": 10600 }, { "epoch": 1.83, "grad_norm": 36.19363313018661, "learning_rate": 1.1174432547742308e-08, "logits/chosen": -1.3016915321350098, "logits/rejected": -1.2503132820129395, "logps/chosen": -239.76791381835938, "logps/rejected": -367.1284484863281, "loss": 0.4453, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.8869917392730713, "rewards/margins": 1.2658240795135498, "rewards/rejected": -3.152815580368042, "step": 10610 }, { "epoch": 1.83, "grad_norm": 41.29882501819702, "learning_rate": 1.095325956677015e-08, "logits/chosen": -1.2536303997039795, "logits/rejected": -1.1866223812103271, "logps/chosen": -238.30874633789062, "logps/rejected": -374.2566833496094, "loss": 0.4032, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.8451499938964844, "rewards/margins": 1.374023675918579, "rewards/rejected": -3.2191734313964844, "step": 10620 }, { "epoch": 1.83, "grad_norm": 22.72297730303355, "learning_rate": 1.0734248303877813e-08, "logits/chosen": -1.3555238246917725, "logits/rejected": -1.2904984951019287, "logps/chosen": -231.20010375976562, "logps/rejected": -360.0879211425781, "loss": 0.4694, "rewards/accuracies": 0.78125, "rewards/chosen": -1.7696985006332397, "rewards/margins": 1.2896592617034912, "rewards/rejected": -3.0593578815460205, "step": 10630 }, { "epoch": 1.83, "grad_norm": 34.92246125378767, "learning_rate": 1.051740073960114e-08, "logits/chosen": -1.340003252029419, "logits/rejected": -1.2763203382492065, "logps/chosen": -236.17941284179688, "logps/rejected": -368.7391052246094, "loss": 0.4742, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.8113651275634766, "rewards/margins": 1.350304365158081, "rewards/rejected": -3.1616694927215576, "step": 10640 }, { "epoch": 1.83, "grad_norm": 26.61094450928108, "learning_rate": 1.0302718834909213e-08, "logits/chosen": -1.3977024555206299, "logits/rejected": -1.3331856727600098, "logps/chosen": -238.84042358398438, "logps/rejected": -391.08477783203125, "loss": 0.4176, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -1.8521630764007568, "rewards/margins": 1.520591139793396, "rewards/rejected": -3.3727545738220215, "step": 10650 }, { "epoch": 1.84, "grad_norm": 33.27724860143072, "learning_rate": 1.0090204531187168e-08, "logits/chosen": -1.2856873273849487, "logits/rejected": -1.2244932651519775, "logps/chosen": -238.58139038085938, "logps/rejected": -374.8933410644531, "loss": 0.4089, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.839249849319458, "rewards/margins": 1.358032464981079, "rewards/rejected": -3.197282314300537, "step": 10660 }, { "epoch": 1.84, "grad_norm": 44.68715039350637, "learning_rate": 9.8798597502181e-09, "logits/chosen": -1.3030592203140259, "logits/rejected": -1.2441256046295166, "logps/chosen": -246.3955841064453, "logps/rejected": -373.30462646484375, "loss": 0.4701, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.9426281452178955, "rewards/margins": 1.2749965190887451, "rewards/rejected": -3.2176246643066406, "step": 10670 }, { "epoch": 1.84, "grad_norm": 26.493605739622634, "learning_rate": 9.671686394166156e-09, "logits/chosen": -1.3562123775482178, "logits/rejected": -1.2756215333938599, "logps/chosen": -221.86074829101562, "logps/rejected": -360.75762939453125, "loss": 0.3826, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.6560554504394531, "rewards/margins": 1.4439318180084229, "rewards/rejected": -3.099987506866455, "step": 10680 }, { "epoch": 1.84, "grad_norm": 30.331480121505034, "learning_rate": 9.465686345558944e-09, "logits/chosen": -1.3282508850097656, "logits/rejected": -1.2702124118804932, "logps/chosen": -226.7456817626953, "logps/rejected": -377.0152282714844, "loss": 0.4397, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.7300522327423096, "rewards/margins": 1.4764258861541748, "rewards/rejected": -3.2064781188964844, "step": 10690 }, { "epoch": 1.84, "grad_norm": 37.77942293381782, "learning_rate": 9.261861467270787e-09, "logits/chosen": -1.3761959075927734, "logits/rejected": -1.3014271259307861, "logps/chosen": -222.33877563476562, "logps/rejected": -349.0968322753906, "loss": 0.3908, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.6618179082870483, "rewards/margins": 1.3258897066116333, "rewards/rejected": -2.9877076148986816, "step": 10700 }, { "epoch": 1.84, "eval_logits/chosen": -1.4212182760238647, "eval_logits/rejected": -1.3943599462509155, "eval_logps/chosen": -243.04237365722656, "eval_logps/rejected": -293.6067810058594, "eval_loss": 0.6334287524223328, "eval_rewards/accuracies": 0.6563661694526672, "eval_rewards/chosen": -1.8433852195739746, "eval_rewards/margins": 0.46110865473747253, "eval_rewards/rejected": -2.3044939041137695, "eval_runtime": 356.8228, "eval_samples_per_second": 12.062, "eval_steps_per_second": 1.508, "step": 10700 }, { "epoch": 1.85, "grad_norm": 54.759408136897285, "learning_rate": 9.060213602505778e-09, "logits/chosen": -1.304149866104126, "logits/rejected": -1.239341139793396, "logps/chosen": -225.63449096679688, "logps/rejected": -350.32049560546875, "loss": 0.4472, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.7219674587249756, "rewards/margins": 1.2871848344802856, "rewards/rejected": -3.0091521739959717, "step": 10710 }, { "epoch": 1.85, "grad_norm": 57.3062622613245, "learning_rate": 8.860744574781032e-09, "logits/chosen": -1.338438630104065, "logits/rejected": -1.269768476486206, "logps/chosen": -236.2624053955078, "logps/rejected": -356.3697509765625, "loss": 0.4911, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.8369505405426025, "rewards/margins": 1.204301118850708, "rewards/rejected": -3.0412516593933105, "step": 10720 }, { "epoch": 1.85, "grad_norm": 28.412674272208022, "learning_rate": 8.663456187910422e-09, "logits/chosen": -1.4263569116592407, "logits/rejected": -1.3510792255401611, "logps/chosen": -229.66720581054688, "logps/rejected": -364.2427673339844, "loss": 0.3543, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -1.7573401927947998, "rewards/margins": 1.3864922523498535, "rewards/rejected": -3.1438326835632324, "step": 10730 }, { "epoch": 1.85, "grad_norm": 41.706917137582344, "learning_rate": 8.468350225987908e-09, "logits/chosen": -1.2837555408477783, "logits/rejected": -1.222891926765442, "logps/chosen": -246.54638671875, "logps/rejected": -371.4312744140625, "loss": 0.4706, "rewards/accuracies": 0.75, "rewards/chosen": -1.9186769723892212, "rewards/margins": 1.2438628673553467, "rewards/rejected": -3.1625399589538574, "step": 10740 }, { "epoch": 1.85, "grad_norm": 31.262809555566417, "learning_rate": 8.275428453371813e-09, "logits/chosen": -1.248228907585144, "logits/rejected": -1.1745529174804688, "logps/chosen": -237.2993621826172, "logps/rejected": -385.1238098144531, "loss": 0.4173, "rewards/accuracies": 0.8125, "rewards/chosen": -1.8524090051651, "rewards/margins": 1.4723732471466064, "rewards/rejected": -3.324782609939575, "step": 10750 }, { "epoch": 1.85, "grad_norm": 35.16663063428534, "learning_rate": 8.084692614668542e-09, "logits/chosen": -1.3016769886016846, "logits/rejected": -1.248867392539978, "logps/chosen": -224.76113891601562, "logps/rejected": -346.5380859375, "loss": 0.4184, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -1.7105858325958252, "rewards/margins": 1.22708261013031, "rewards/rejected": -2.937668561935425, "step": 10760 }, { "epoch": 1.86, "grad_norm": 31.86195155798675, "learning_rate": 7.89614443471695e-09, "logits/chosen": -1.3076452016830444, "logits/rejected": -1.2525701522827148, "logps/chosen": -217.1156463623047, "logps/rejected": -352.6279296875, "loss": 0.3762, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.6004784107208252, "rewards/margins": 1.3801997900009155, "rewards/rejected": -2.9806783199310303, "step": 10770 }, { "epoch": 1.86, "grad_norm": 25.654760643763883, "learning_rate": 7.7097856185728e-09, "logits/chosen": -1.418269395828247, "logits/rejected": -1.3548392057418823, "logps/chosen": -222.4542999267578, "logps/rejected": -363.5215148925781, "loss": 0.4058, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.6930938959121704, "rewards/margins": 1.3915579319000244, "rewards/rejected": -3.084651470184326, "step": 10780 }, { "epoch": 1.86, "grad_norm": 40.803942439334094, "learning_rate": 7.525617851493166e-09, "logits/chosen": -1.4488575458526611, "logits/rejected": -1.3801032304763794, "logps/chosen": -207.01974487304688, "logps/rejected": -345.1376647949219, "loss": 0.3895, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.5470707416534424, "rewards/margins": 1.3702001571655273, "rewards/rejected": -2.917271137237549, "step": 10790 }, { "epoch": 1.86, "grad_norm": 22.89870220316856, "learning_rate": 7.343642798921384e-09, "logits/chosen": -1.4306409358978271, "logits/rejected": -1.3741027116775513, "logps/chosen": -220.6624755859375, "logps/rejected": -359.7532043457031, "loss": 0.4273, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.6872755289077759, "rewards/margins": 1.371055006980896, "rewards/rejected": -3.058330774307251, "step": 10800 }, { "epoch": 1.86, "eval_logits/chosen": -1.419447422027588, "eval_logits/rejected": -1.3925857543945312, "eval_logps/chosen": -244.09783935546875, "eval_logps/rejected": -294.6657409667969, "eval_loss": 0.6341846585273743, "eval_rewards/accuracies": 0.6624070405960083, "eval_rewards/chosen": -1.8539396524429321, "eval_rewards/margins": 0.4611437916755676, "eval_rewards/rejected": -2.3150837421417236, "eval_runtime": 357.4896, "eval_samples_per_second": 12.04, "eval_steps_per_second": 1.505, "step": 10800 }, { "epoch": 1.86, "grad_norm": 24.19588838169064, "learning_rate": 7.1638621064718516e-09, "logits/chosen": -1.373991847038269, "logits/rejected": -1.2945966720581055, "logps/chosen": -227.33596801757812, "logps/rejected": -364.17059326171875, "loss": 0.3831, "rewards/accuracies": 0.78125, "rewards/chosen": -1.699730634689331, "rewards/margins": 1.4409074783325195, "rewards/rejected": -3.140638589859009, "step": 10810 }, { "epoch": 1.86, "grad_norm": 50.27316401128035, "learning_rate": 6.986277399915197e-09, "logits/chosen": -1.2879558801651, "logits/rejected": -1.2268191576004028, "logps/chosen": -209.937744140625, "logps/rejected": -343.2181701660156, "loss": 0.4086, "rewards/accuracies": 0.8125, "rewards/chosen": -1.5688936710357666, "rewards/margins": 1.349342703819275, "rewards/rejected": -2.918236255645752, "step": 10820 }, { "epoch": 1.87, "grad_norm": 50.904093818767734, "learning_rate": 6.8108902851636285e-09, "logits/chosen": -1.3199676275253296, "logits/rejected": -1.2508794069290161, "logps/chosen": -232.073486328125, "logps/rejected": -368.0849914550781, "loss": 0.399, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.8025493621826172, "rewards/margins": 1.3453489542007446, "rewards/rejected": -3.147897958755493, "step": 10830 }, { "epoch": 1.87, "grad_norm": 27.336450230863502, "learning_rate": 6.637702348256308e-09, "logits/chosen": -1.3644187450408936, "logits/rejected": -1.307117223739624, "logps/chosen": -227.4879150390625, "logps/rejected": -349.18182373046875, "loss": 0.4494, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.74507737159729, "rewards/margins": 1.237743854522705, "rewards/rejected": -2.982821226119995, "step": 10840 }, { "epoch": 1.87, "grad_norm": 33.93760320273209, "learning_rate": 6.466715155345109e-09, "logits/chosen": -1.2493406534194946, "logits/rejected": -1.199512243270874, "logps/chosen": -229.41708374023438, "logps/rejected": -345.4873046875, "loss": 0.4573, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.7922929525375366, "rewards/margins": 1.118058443069458, "rewards/rejected": -2.910351276397705, "step": 10850 }, { "epoch": 1.87, "grad_norm": 31.190541875354757, "learning_rate": 6.2979302526803006e-09, "logits/chosen": -1.4172183275222778, "logits/rejected": -1.3366795778274536, "logps/chosen": -223.34884643554688, "logps/rejected": -358.8973693847656, "loss": 0.4234, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.6810195446014404, "rewards/margins": 1.3840689659118652, "rewards/rejected": -3.0650882720947266, "step": 10860 }, { "epoch": 1.87, "grad_norm": 19.263570637063424, "learning_rate": 6.131349166596883e-09, "logits/chosen": -1.2681770324707031, "logits/rejected": -1.21076500415802, "logps/chosen": -206.9633026123047, "logps/rejected": -360.4692077636719, "loss": 0.4066, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.5650874376296997, "rewards/margins": 1.4790990352630615, "rewards/rejected": -3.0441863536834717, "step": 10870 }, { "epoch": 1.87, "grad_norm": 33.21398609679741, "learning_rate": 5.966973403500303e-09, "logits/chosen": -1.3271772861480713, "logits/rejected": -1.2587413787841797, "logps/chosen": -232.65988159179688, "logps/rejected": -375.89111328125, "loss": 0.3756, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.7844676971435547, "rewards/margins": 1.4478579759597778, "rewards/rejected": -3.232325315475464, "step": 10880 }, { "epoch": 1.88, "grad_norm": 37.56955046527683, "learning_rate": 5.804804449853401e-09, "logits/chosen": -1.3854949474334717, "logits/rejected": -1.3297450542449951, "logps/chosen": -223.22830200195312, "logps/rejected": -355.37030029296875, "loss": 0.4186, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.7010080814361572, "rewards/margins": 1.30489182472229, "rewards/rejected": -3.0058999061584473, "step": 10890 }, { "epoch": 1.88, "grad_norm": 37.057241152716664, "learning_rate": 5.644843772162372e-09, "logits/chosen": -1.434251308441162, "logits/rejected": -1.3531643152236938, "logps/chosen": -212.3564453125, "logps/rejected": -349.568603515625, "loss": 0.3762, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -1.5736942291259766, "rewards/margins": 1.4162404537200928, "rewards/rejected": -2.9899344444274902, "step": 10900 }, { "epoch": 1.88, "eval_logits/chosen": -1.4172533750534058, "eval_logits/rejected": -1.3904321193695068, "eval_logps/chosen": -244.67039489746094, "eval_logps/rejected": -295.28729248046875, "eval_loss": 0.6345546245574951, "eval_rewards/accuracies": 0.6565985083580017, "eval_rewards/chosen": -1.8596652746200562, "eval_rewards/margins": 0.4616338312625885, "eval_rewards/rejected": -2.3212990760803223, "eval_runtime": 357.2508, "eval_samples_per_second": 12.048, "eval_steps_per_second": 1.506, "step": 10900 }, { "epoch": 1.88, "grad_norm": 37.413849937107756, "learning_rate": 5.487092816963995e-09, "logits/chosen": -1.3338580131530762, "logits/rejected": -1.2663573026657104, "logps/chosen": -217.49801635742188, "logps/rejected": -344.0356750488281, "loss": 0.4173, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.6183340549468994, "rewards/margins": 1.2802609205245972, "rewards/rejected": -2.898594856262207, "step": 10910 }, { "epoch": 1.88, "grad_norm": 42.748889303543294, "learning_rate": 5.331553010812312e-09, "logits/chosen": -1.3081706762313843, "logits/rejected": -1.2376407384872437, "logps/chosen": -229.73422241210938, "logps/rejected": -366.51495361328125, "loss": 0.3763, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.7389205694198608, "rewards/margins": 1.3969751596450806, "rewards/rejected": -3.1358957290649414, "step": 10920 }, { "epoch": 1.88, "grad_norm": 29.30790533708006, "learning_rate": 5.1782257602657756e-09, "logits/chosen": -1.257922887802124, "logits/rejected": -1.1985923051834106, "logps/chosen": -233.80294799804688, "logps/rejected": -352.77984619140625, "loss": 0.4355, "rewards/accuracies": 0.75, "rewards/chosen": -1.795925498008728, "rewards/margins": 1.217332363128662, "rewards/rejected": -3.0132579803466797, "step": 10930 }, { "epoch": 1.88, "grad_norm": 54.99655407488525, "learning_rate": 5.027112451874483e-09, "logits/chosen": -1.2420815229415894, "logits/rejected": -1.1881957054138184, "logps/chosen": -241.045166015625, "logps/rejected": -369.63568115234375, "loss": 0.4248, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.8460403680801392, "rewards/margins": 1.2986847162246704, "rewards/rejected": -3.1447253227233887, "step": 10940 }, { "epoch": 1.89, "grad_norm": 45.54073464601158, "learning_rate": 4.878214452167739e-09, "logits/chosen": -1.3072357177734375, "logits/rejected": -1.2408037185668945, "logps/chosen": -238.0092315673828, "logps/rejected": -381.5384826660156, "loss": 0.3866, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.834251046180725, "rewards/margins": 1.4345420598983765, "rewards/rejected": -3.2687935829162598, "step": 10950 }, { "epoch": 1.89, "grad_norm": 30.095545129155003, "learning_rate": 4.7315331076416275e-09, "logits/chosen": -1.3551833629608154, "logits/rejected": -1.293670654296875, "logps/chosen": -236.3451385498047, "logps/rejected": -368.11126708984375, "loss": 0.4367, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.8255088329315186, "rewards/margins": 1.3299205303192139, "rewards/rejected": -3.1554293632507324, "step": 10960 }, { "epoch": 1.89, "grad_norm": 45.41850107802761, "learning_rate": 4.587069744746791e-09, "logits/chosen": -1.3423527479171753, "logits/rejected": -1.2766934633255005, "logps/chosen": -233.58682250976562, "logps/rejected": -358.17694091796875, "loss": 0.4907, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.7673285007476807, "rewards/margins": 1.2734668254852295, "rewards/rejected": -3.04079532623291, "step": 10970 }, { "epoch": 1.89, "grad_norm": 24.319102255891416, "learning_rate": 4.44482566987664e-09, "logits/chosen": -1.3505980968475342, "logits/rejected": -1.2955100536346436, "logps/chosen": -245.52920532226562, "logps/rejected": -382.1690368652344, "loss": 0.4328, "rewards/accuracies": 0.78125, "rewards/chosen": -1.9095827341079712, "rewards/margins": 1.3507945537567139, "rewards/rejected": -3.2603771686553955, "step": 10980 }, { "epoch": 1.89, "grad_norm": 28.168547715893087, "learning_rate": 4.304802169355221e-09, "logits/chosen": -1.2861645221710205, "logits/rejected": -1.22446608543396, "logps/chosen": -218.29025268554688, "logps/rejected": -352.78509521484375, "loss": 0.4101, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.6582218408584595, "rewards/margins": 1.3449079990386963, "rewards/rejected": -3.0031299591064453, "step": 10990 }, { "epoch": 1.9, "grad_norm": 39.270052414600016, "learning_rate": 4.167000509425811e-09, "logits/chosen": -1.4578297138214111, "logits/rejected": -1.4120782613754272, "logps/chosen": -241.8314208984375, "logps/rejected": -367.2518005371094, "loss": 0.4734, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.8894517421722412, "rewards/margins": 1.2133595943450928, "rewards/rejected": -3.102811336517334, "step": 11000 }, { "epoch": 1.9, "eval_logits/chosen": -1.4190027713775635, "eval_logits/rejected": -1.392012357711792, "eval_logps/chosen": -243.87950134277344, "eval_logps/rejected": -294.5247802734375, "eval_loss": 0.6338525414466858, "eval_rewards/accuracies": 0.6628717184066772, "eval_rewards/chosen": -1.8517564535140991, "eval_rewards/margins": 0.4619174599647522, "eval_rewards/rejected": -2.313674211502075, "eval_runtime": 357.5201, "eval_samples_per_second": 12.038, "eval_steps_per_second": 1.505, "step": 11000 }, { "epoch": 1.9, "grad_norm": 33.16784261064953, "learning_rate": 4.03142193623951e-09, "logits/chosen": -1.3858684301376343, "logits/rejected": -1.305426001548767, "logps/chosen": -235.13864135742188, "logps/rejected": -388.290771484375, "loss": 0.3688, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.8195550441741943, "rewards/margins": 1.5488157272338867, "rewards/rejected": -3.368370771408081, "step": 11010 }, { "epoch": 1.9, "grad_norm": 19.279870183301984, "learning_rate": 3.898067675843747e-09, "logits/chosen": -1.435046911239624, "logits/rejected": -1.3683885335922241, "logps/chosen": -224.517578125, "logps/rejected": -371.97283935546875, "loss": 0.3641, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.6953538656234741, "rewards/margins": 1.4940444231033325, "rewards/rejected": -3.1893982887268066, "step": 11020 }, { "epoch": 1.9, "grad_norm": 26.617574306250333, "learning_rate": 3.766938934171348e-09, "logits/chosen": -1.3704140186309814, "logits/rejected": -1.3155790567398071, "logps/chosen": -236.8312225341797, "logps/rejected": -383.66754150390625, "loss": 0.4191, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.8204883337020874, "rewards/margins": 1.4713939428329468, "rewards/rejected": -3.2918827533721924, "step": 11030 }, { "epoch": 1.9, "grad_norm": 32.25461324792829, "learning_rate": 3.6380368970296836e-09, "logits/chosen": -1.4112730026245117, "logits/rejected": -1.351285696029663, "logps/chosen": -239.5612030029297, "logps/rejected": -364.81146240234375, "loss": 0.4416, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.831099271774292, "rewards/margins": 1.2617604732513428, "rewards/rejected": -3.0928597450256348, "step": 11040 }, { "epoch": 1.9, "grad_norm": 35.4050705396756, "learning_rate": 3.5113627300897285e-09, "logits/chosen": -1.310435175895691, "logits/rejected": -1.2423975467681885, "logps/chosen": -222.96923828125, "logps/rejected": -379.7068786621094, "loss": 0.3806, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.7100273370742798, "rewards/margins": 1.534369707107544, "rewards/rejected": -3.244396924972534, "step": 11050 }, { "epoch": 1.91, "grad_norm": 29.633120494533653, "learning_rate": 3.38691757887577e-09, "logits/chosen": -1.3639460802078247, "logits/rejected": -1.271759271621704, "logps/chosen": -234.57968139648438, "logps/rejected": -376.5106506347656, "loss": 0.4119, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.8031505346298218, "rewards/margins": 1.444676399230957, "rewards/rejected": -3.2478268146514893, "step": 11060 }, { "epoch": 1.91, "grad_norm": 27.809450621921286, "learning_rate": 3.2647025687549122e-09, "logits/chosen": -1.3753821849822998, "logits/rejected": -1.2898151874542236, "logps/chosen": -224.57437133789062, "logps/rejected": -369.9324645996094, "loss": 0.4233, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.714037537574768, "rewards/margins": 1.4897834062576294, "rewards/rejected": -3.2038207054138184, "step": 11070 }, { "epoch": 1.91, "grad_norm": 25.21379163567548, "learning_rate": 3.144718804926866e-09, "logits/chosen": -1.3679758310317993, "logits/rejected": -1.3044774532318115, "logps/chosen": -238.9508819580078, "logps/rejected": -373.56060791015625, "loss": 0.417, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.8344926834106445, "rewards/margins": 1.3762904405593872, "rewards/rejected": -3.210782527923584, "step": 11080 }, { "epoch": 1.91, "grad_norm": 38.50752589918087, "learning_rate": 3.0269673724140356e-09, "logits/chosen": -1.3562889099121094, "logits/rejected": -1.297836184501648, "logps/chosen": -233.75924682617188, "logps/rejected": -358.478271484375, "loss": 0.4123, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.7592127323150635, "rewards/margins": 1.2987269163131714, "rewards/rejected": -3.0579395294189453, "step": 11090 }, { "epoch": 1.91, "grad_norm": 34.59360161848991, "learning_rate": 2.9114493360517245e-09, "logits/chosen": -1.2469245195388794, "logits/rejected": -1.1905173063278198, "logps/chosen": -209.7041015625, "logps/rejected": -341.9212646484375, "loss": 0.4333, "rewards/accuracies": 0.78125, "rewards/chosen": -1.5883010625839233, "rewards/margins": 1.2910573482513428, "rewards/rejected": -2.8793585300445557, "step": 11100 }, { "epoch": 1.91, "eval_logits/chosen": -1.4189980030059814, "eval_logits/rejected": -1.3920680284500122, "eval_logps/chosen": -244.1648712158203, "eval_logps/rejected": -294.9982604980469, "eval_loss": 0.6333078145980835, "eval_rewards/accuracies": 0.6598513126373291, "eval_rewards/chosen": -1.8546103239059448, "eval_rewards/margins": 0.46379825472831726, "eval_rewards/rejected": -2.318408489227295, "eval_runtime": 357.2276, "eval_samples_per_second": 12.048, "eval_steps_per_second": 1.506, "step": 11100 }, { "epoch": 1.91, "grad_norm": 15.157481934801936, "learning_rate": 2.79816574047842e-09, "logits/chosen": -1.3375742435455322, "logits/rejected": -1.2590216398239136, "logps/chosen": -242.087890625, "logps/rejected": -421.6622619628906, "loss": 0.3844, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.8821719884872437, "rewards/margins": 1.799425721168518, "rewards/rejected": -3.681597948074341, "step": 11110 }, { "epoch": 1.92, "grad_norm": 41.14442211032838, "learning_rate": 2.6871176101263825e-09, "logits/chosen": -1.4798122644424438, "logits/rejected": -1.4134316444396973, "logps/chosen": -232.623046875, "logps/rejected": -359.533935546875, "loss": 0.4216, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.730859398841858, "rewards/margins": 1.3277714252471924, "rewards/rejected": -3.05863094329834, "step": 11120 }, { "epoch": 1.92, "grad_norm": 41.710468519729744, "learning_rate": 2.578305949212434e-09, "logits/chosen": -1.273736834526062, "logits/rejected": -1.2053143978118896, "logps/chosen": -243.1206817626953, "logps/rejected": -371.94866943359375, "loss": 0.4025, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.8726370334625244, "rewards/margins": 1.3150079250335693, "rewards/rejected": -3.1876449584960938, "step": 11130 }, { "epoch": 1.92, "grad_norm": 41.66910869223657, "learning_rate": 2.4717317417287942e-09, "logits/chosen": -1.2594302892684937, "logits/rejected": -1.1979601383209229, "logps/chosen": -219.0839080810547, "logps/rejected": -353.35504150390625, "loss": 0.3744, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.6528394222259521, "rewards/margins": 1.3649063110351562, "rewards/rejected": -3.0177457332611084, "step": 11140 }, { "epoch": 1.92, "grad_norm": 38.68187084741804, "learning_rate": 2.3673959514342314e-09, "logits/chosen": -1.3535066843032837, "logits/rejected": -1.3013697862625122, "logps/chosen": -242.1443328857422, "logps/rejected": -369.8888244628906, "loss": 0.43, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.8408015966415405, "rewards/margins": 1.2930688858032227, "rewards/rejected": -3.1338706016540527, "step": 11150 }, { "epoch": 1.92, "grad_norm": 34.31089036359821, "learning_rate": 2.2652995218452877e-09, "logits/chosen": -1.4165607690811157, "logits/rejected": -1.3578795194625854, "logps/chosen": -216.8806610107422, "logps/rejected": -338.1597900390625, "loss": 0.4338, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.647953748703003, "rewards/margins": 1.2238695621490479, "rewards/rejected": -2.871823310852051, "step": 11160 }, { "epoch": 1.92, "grad_norm": 32.24674833849174, "learning_rate": 2.165443376227871e-09, "logits/chosen": -1.2586653232574463, "logits/rejected": -1.196004867553711, "logps/chosen": -232.2088165283203, "logps/rejected": -331.0061340332031, "loss": 0.4821, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.750544786453247, "rewards/margins": 1.0693135261535645, "rewards/rejected": -2.8198580741882324, "step": 11170 }, { "epoch": 1.93, "grad_norm": 57.55683167173714, "learning_rate": 2.0678284175887907e-09, "logits/chosen": -1.4004487991333008, "logits/rejected": -1.3373186588287354, "logps/chosen": -235.8096466064453, "logps/rejected": -374.24713134765625, "loss": 0.3973, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.8187503814697266, "rewards/margins": 1.4154729843139648, "rewards/rejected": -3.2342236042022705, "step": 11180 }, { "epoch": 1.93, "grad_norm": 25.781820157206223, "learning_rate": 1.972455528667677e-09, "logits/chosen": -1.3892544507980347, "logits/rejected": -1.3081748485565186, "logps/chosen": -220.63912963867188, "logps/rejected": -372.5516052246094, "loss": 0.3301, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -1.6577503681182861, "rewards/margins": 1.551475167274475, "rewards/rejected": -3.20922589302063, "step": 11190 }, { "epoch": 1.93, "grad_norm": 33.77308243819322, "learning_rate": 1.8793255719288246e-09, "logits/chosen": -1.3923676013946533, "logits/rejected": -1.3273457288742065, "logps/chosen": -210.1555633544922, "logps/rejected": -346.9501647949219, "loss": 0.4305, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.5684833526611328, "rewards/margins": 1.3637385368347168, "rewards/rejected": -2.9322218894958496, "step": 11200 }, { "epoch": 1.93, "eval_logits/chosen": -1.42206609249115, "eval_logits/rejected": -1.3953404426574707, "eval_logps/chosen": -243.3866424560547, "eval_logps/rejected": -293.89874267578125, "eval_loss": 0.6334691643714905, "eval_rewards/accuracies": 0.6563661694526672, "eval_rewards/chosen": -1.8468278646469116, "eval_rewards/margins": 0.46058568358421326, "eval_rewards/rejected": -2.307413339614868, "eval_runtime": 357.503, "eval_samples_per_second": 12.039, "eval_steps_per_second": 1.505, "step": 11200 }, { "epoch": 1.93, "grad_norm": 46.567374288537444, "learning_rate": 1.7884393895536697e-09, "logits/chosen": -1.2399379014968872, "logits/rejected": -1.1777968406677246, "logps/chosen": -229.732666015625, "logps/rejected": -374.9830017089844, "loss": 0.4382, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.793482780456543, "rewards/margins": 1.417189121246338, "rewards/rejected": -3.2106716632843018, "step": 11210 }, { "epoch": 1.93, "grad_norm": 42.21368814153827, "learning_rate": 1.6997978034329342e-09, "logits/chosen": -1.3409579992294312, "logits/rejected": -1.2815383672714233, "logps/chosen": -217.97921752929688, "logps/rejected": -357.2254943847656, "loss": 0.4403, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.6595194339752197, "rewards/margins": 1.3560142517089844, "rewards/rejected": -3.015533447265625, "step": 11220 }, { "epoch": 1.93, "grad_norm": 39.048264387033385, "learning_rate": 1.613401615159299e-09, "logits/chosen": -1.3046488761901855, "logits/rejected": -1.2500449419021606, "logps/chosen": -243.40029907226562, "logps/rejected": -376.0064697265625, "loss": 0.4023, "rewards/accuracies": 0.8125, "rewards/chosen": -1.8764787912368774, "rewards/margins": 1.3685901165008545, "rewards/rejected": -3.2450687885284424, "step": 11230 }, { "epoch": 1.94, "grad_norm": 23.205283737644205, "learning_rate": 1.5292516060201599e-09, "logits/chosen": -1.3030173778533936, "logits/rejected": -1.2475535869598389, "logps/chosen": -230.6564483642578, "logps/rejected": -360.25225830078125, "loss": 0.4305, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.7603286504745483, "rewards/margins": 1.3079051971435547, "rewards/rejected": -3.0682339668273926, "step": 11240 }, { "epoch": 1.94, "grad_norm": 27.435468950638114, "learning_rate": 1.4473485369905224e-09, "logits/chosen": -1.3240660429000854, "logits/rejected": -1.261580228805542, "logps/chosen": -228.3889923095703, "logps/rejected": -360.775634765625, "loss": 0.4024, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.7416374683380127, "rewards/margins": 1.3298765420913696, "rewards/rejected": -3.071514129638672, "step": 11250 }, { "epoch": 1.94, "grad_norm": 19.467311006382324, "learning_rate": 1.3676931487261456e-09, "logits/chosen": -1.2779386043548584, "logits/rejected": -1.209826111793518, "logps/chosen": -221.6068115234375, "logps/rejected": -340.8597717285156, "loss": 0.4412, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.637843370437622, "rewards/margins": 1.2554326057434082, "rewards/rejected": -2.8932759761810303, "step": 11260 }, { "epoch": 1.94, "grad_norm": 41.60730906780407, "learning_rate": 1.2902861615568527e-09, "logits/chosen": -1.3289337158203125, "logits/rejected": -1.2553983926773071, "logps/chosen": -224.650390625, "logps/rejected": -358.2517395019531, "loss": 0.391, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.6910244226455688, "rewards/margins": 1.3813101053237915, "rewards/rejected": -3.0723345279693604, "step": 11270 }, { "epoch": 1.94, "grad_norm": 32.1598075986716, "learning_rate": 1.2151282754799542e-09, "logits/chosen": -1.3617111444473267, "logits/rejected": -1.293905258178711, "logps/chosen": -228.1493377685547, "logps/rejected": -348.77618408203125, "loss": 0.4462, "rewards/accuracies": 0.8125, "rewards/chosen": -1.7183462381362915, "rewards/margins": 1.2455885410308838, "rewards/rejected": -2.9639346599578857, "step": 11280 }, { "epoch": 1.95, "grad_norm": 36.803056421177146, "learning_rate": 1.1422201701540567e-09, "logits/chosen": -1.3835200071334839, "logits/rejected": -1.3248523473739624, "logps/chosen": -217.21005249023438, "logps/rejected": -349.54583740234375, "loss": 0.4033, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.5726218223571777, "rewards/margins": 1.3615262508392334, "rewards/rejected": -2.9341483116149902, "step": 11290 }, { "epoch": 1.95, "grad_norm": 27.85129396845735, "learning_rate": 1.0715625048927092e-09, "logits/chosen": -1.309777021408081, "logits/rejected": -1.2495863437652588, "logps/chosen": -242.6230926513672, "logps/rejected": -351.98077392578125, "loss": 0.4817, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.8370163440704346, "rewards/margins": 1.1458604335784912, "rewards/rejected": -2.9828765392303467, "step": 11300 }, { "epoch": 1.95, "eval_logits/chosen": -1.4203462600708008, "eval_logits/rejected": -1.393379807472229, "eval_logps/chosen": -244.32652282714844, "eval_logps/rejected": -295.0477294921875, "eval_loss": 0.6342768669128418, "eval_rewards/accuracies": 0.6572955250740051, "eval_rewards/chosen": -1.8562268018722534, "eval_rewards/margins": 0.46267637610435486, "eval_rewards/rejected": -2.3189032077789307, "eval_runtime": 356.9343, "eval_samples_per_second": 12.058, "eval_steps_per_second": 1.507, "step": 11300 }, { "epoch": 1.95, "grad_norm": 33.21894208609608, "learning_rate": 1.0031559186586825e-09, "logits/chosen": -1.4185220003128052, "logits/rejected": -1.3642728328704834, "logps/chosen": -219.1095428466797, "logps/rejected": -368.1912841796875, "loss": 0.3543, "rewards/accuracies": 0.875, "rewards/chosen": -1.686640739440918, "rewards/margins": 1.4802013635635376, "rewards/rejected": -3.166841983795166, "step": 11310 }, { "epoch": 1.95, "grad_norm": 24.242052165940123, "learning_rate": 9.370010300579213e-10, "logits/chosen": -1.350401520729065, "logits/rejected": -1.2788327932357788, "logps/chosen": -226.5559539794922, "logps/rejected": -364.3349914550781, "loss": 0.4343, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.7163565158843994, "rewards/margins": 1.3881375789642334, "rewards/rejected": -3.104494094848633, "step": 11320 }, { "epoch": 1.95, "grad_norm": 49.892604398278046, "learning_rate": 8.730984373342409e-10, "logits/chosen": -1.3533201217651367, "logits/rejected": -1.2792202234268188, "logps/chosen": -225.33120727539062, "logps/rejected": -375.6051330566406, "loss": 0.3547, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.7277292013168335, "rewards/margins": 1.547004222869873, "rewards/rejected": -3.274733304977417, "step": 11330 }, { "epoch": 1.95, "grad_norm": 44.808939079357344, "learning_rate": 8.114487183636942e-10, "logits/chosen": -1.2439250946044922, "logits/rejected": -1.1751753091812134, "logps/chosen": -238.3853759765625, "logps/rejected": -387.01287841796875, "loss": 0.3996, "rewards/accuracies": 0.84375, "rewards/chosen": -1.8302602767944336, "rewards/margins": 1.4923627376556396, "rewards/rejected": -3.3226234912872314, "step": 11340 }, { "epoch": 1.96, "grad_norm": 24.43792912942958, "learning_rate": 7.520524306494358e-10, "logits/chosen": -1.3848811388015747, "logits/rejected": -1.3261343240737915, "logps/chosen": -250.03085327148438, "logps/rejected": -366.9549255371094, "loss": 0.4724, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.9313633441925049, "rewards/margins": 1.177704095840454, "rewards/rejected": -3.109067440032959, "step": 11350 }, { "epoch": 1.96, "grad_norm": 31.83383405122785, "learning_rate": 6.949101113166711e-10, "logits/chosen": -1.330773949623108, "logits/rejected": -1.2642290592193604, "logps/chosen": -235.5715789794922, "logps/rejected": -365.17340087890625, "loss": 0.4185, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.8331444263458252, "rewards/margins": 1.3331224918365479, "rewards/rejected": -3.166267156600952, "step": 11360 }, { "epoch": 1.96, "grad_norm": 29.61007825180616, "learning_rate": 6.40022277107799e-10, "logits/chosen": -1.3195604085922241, "logits/rejected": -1.2653484344482422, "logps/chosen": -229.4175567626953, "logps/rejected": -352.00555419921875, "loss": 0.4398, "rewards/accuracies": 0.75, "rewards/chosen": -1.7311922311782837, "rewards/margins": 1.2529064416885376, "rewards/rejected": -2.9840986728668213, "step": 11370 }, { "epoch": 1.96, "grad_norm": 44.883310537971305, "learning_rate": 5.873894243776933e-10, "logits/chosen": -1.2741248607635498, "logits/rejected": -1.2058513164520264, "logps/chosen": -221.5206298828125, "logps/rejected": -360.199951171875, "loss": 0.4065, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.6969740390777588, "rewards/margins": 1.4046361446380615, "rewards/rejected": -3.1016104221343994, "step": 11380 }, { "epoch": 1.96, "grad_norm": 36.423896869469424, "learning_rate": 5.370120290893176e-10, "logits/chosen": -1.4164044857025146, "logits/rejected": -1.340454339981079, "logps/chosen": -214.89956665039062, "logps/rejected": -366.628662109375, "loss": 0.4007, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.625978708267212, "rewards/margins": 1.4872604608535767, "rewards/rejected": -3.113239288330078, "step": 11390 }, { "epoch": 1.96, "grad_norm": 28.749205056855722, "learning_rate": 4.888905468093673e-10, "logits/chosen": -1.3569167852401733, "logits/rejected": -1.291182041168213, "logps/chosen": -212.83273315429688, "logps/rejected": -341.9942626953125, "loss": 0.4146, "rewards/accuracies": 0.8125, "rewards/chosen": -1.5691052675247192, "rewards/margins": 1.31778883934021, "rewards/rejected": -2.8868937492370605, "step": 11400 }, { "epoch": 1.96, "eval_logits/chosen": -1.41786789894104, "eval_logits/rejected": -1.3909337520599365, "eval_logps/chosen": -244.43157958984375, "eval_logps/rejected": -295.2254638671875, "eval_loss": 0.63393235206604, "eval_rewards/accuracies": 0.6559014916419983, "eval_rewards/chosen": -1.857277512550354, "eval_rewards/margins": 0.4634034037590027, "eval_rewards/rejected": -2.320681095123291, "eval_runtime": 357.3673, "eval_samples_per_second": 12.044, "eval_steps_per_second": 1.505, "step": 11400 }, { "epoch": 1.97, "grad_norm": 39.086102079615756, "learning_rate": 4.430254127040789e-10, "logits/chosen": -1.3270251750946045, "logits/rejected": -1.2652333974838257, "logps/chosen": -229.4177703857422, "logps/rejected": -348.8304748535156, "loss": 0.4342, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.7375816106796265, "rewards/margins": 1.199951171875, "rewards/rejected": -2.937532901763916, "step": 11410 }, { "epoch": 1.97, "grad_norm": 30.366456037929765, "learning_rate": 3.994170415353715e-10, "logits/chosen": -1.3328666687011719, "logits/rejected": -1.2708826065063477, "logps/chosen": -233.0135498046875, "logps/rejected": -347.16339111328125, "loss": 0.4453, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.793405532836914, "rewards/margins": 1.175714135169983, "rewards/rejected": -2.9691193103790283, "step": 11420 }, { "epoch": 1.97, "grad_norm": 23.516266855640406, "learning_rate": 3.5806582765715574e-10, "logits/chosen": -1.2737079858779907, "logits/rejected": -1.215456247329712, "logps/chosen": -230.99423217773438, "logps/rejected": -346.54241943359375, "loss": 0.4635, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.7601213455200195, "rewards/margins": 1.1789329051971436, "rewards/rejected": -2.939054012298584, "step": 11430 }, { "epoch": 1.97, "grad_norm": 32.39137150045276, "learning_rate": 3.189721450116145e-10, "logits/chosen": -1.340698003768921, "logits/rejected": -1.2886393070220947, "logps/chosen": -225.9718780517578, "logps/rejected": -343.7590637207031, "loss": 0.4333, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.7345173358917236, "rewards/margins": 1.1770877838134766, "rewards/rejected": -2.9116053581237793, "step": 11440 }, { "epoch": 1.97, "grad_norm": 47.231248686402424, "learning_rate": 2.821363471259275e-10, "logits/chosen": -1.2820355892181396, "logits/rejected": -1.2136328220367432, "logps/chosen": -230.76296997070312, "logps/rejected": -371.7513122558594, "loss": 0.4059, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.780504822731018, "rewards/margins": 1.4201017618179321, "rewards/rejected": -3.20060658454895, "step": 11450 }, { "epoch": 1.97, "grad_norm": 28.594067560080052, "learning_rate": 2.4755876710905176e-10, "logits/chosen": -1.3040361404418945, "logits/rejected": -1.2424486875534058, "logps/chosen": -228.9510498046875, "logps/rejected": -367.95770263671875, "loss": 0.3752, "rewards/accuracies": 0.84375, "rewards/chosen": -1.7342042922973633, "rewards/margins": 1.4023338556289673, "rewards/rejected": -3.136538028717041, "step": 11460 }, { "epoch": 1.98, "grad_norm": 46.590396282617746, "learning_rate": 2.1523971764869642e-10, "logits/chosen": -1.3816394805908203, "logits/rejected": -1.3001785278320312, "logps/chosen": -231.14248657226562, "logps/rejected": -376.31292724609375, "loss": 0.3637, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.7429434061050415, "rewards/margins": 1.5104016065597534, "rewards/rejected": -3.253345012664795, "step": 11470 }, { "epoch": 1.98, "grad_norm": 43.745231317437764, "learning_rate": 1.8517949100854692e-10, "logits/chosen": -1.3776280879974365, "logits/rejected": -1.3017531633377075, "logps/chosen": -220.01980590820312, "logps/rejected": -345.5537414550781, "loss": 0.4132, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.6392385959625244, "rewards/margins": 1.3121652603149414, "rewards/rejected": -2.9514036178588867, "step": 11480 }, { "epoch": 1.98, "grad_norm": 23.908719760101306, "learning_rate": 1.5737835902551733e-10, "logits/chosen": -1.3258212804794312, "logits/rejected": -1.2626917362213135, "logps/chosen": -230.5511474609375, "logps/rejected": -344.8309631347656, "loss": 0.4828, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.758782982826233, "rewards/margins": 1.1783698797225952, "rewards/rejected": -2.937152862548828, "step": 11490 }, { "epoch": 1.98, "grad_norm": 31.08219846166836, "learning_rate": 1.318365731074189e-10, "logits/chosen": -1.3679813146591187, "logits/rejected": -1.315306305885315, "logps/chosen": -218.0261688232422, "logps/rejected": -330.38092041015625, "loss": 0.432, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.6775197982788086, "rewards/margins": 1.118550419807434, "rewards/rejected": -2.796069860458374, "step": 11500 }, { "epoch": 1.98, "eval_logits/chosen": -1.4183286428451538, "eval_logits/rejected": -1.391395092010498, "eval_logps/chosen": -244.1782684326172, "eval_logps/rejected": -295.0009765625, "eval_loss": 0.6337403059005737, "eval_rewards/accuracies": 0.6535780429840088, "eval_rewards/chosen": -1.8547443151474, "eval_rewards/margins": 0.4636920690536499, "eval_rewards/rejected": -2.31843638420105, "eval_runtime": 364.0419, "eval_samples_per_second": 11.823, "eval_steps_per_second": 1.478, "step": 11500 }, { "epoch": 1.98, "grad_norm": 25.52111148377799, "learning_rate": 1.0855436423054532e-10, "logits/chosen": -1.2700302600860596, "logits/rejected": -1.2187522649765015, "logps/chosen": -233.6306610107422, "logps/rejected": -368.69073486328125, "loss": 0.4327, "rewards/accuracies": 0.8125, "rewards/chosen": -1.7816520929336548, "rewards/margins": 1.3728293180465698, "rewards/rejected": -3.1544814109802246, "step": 11510 }, { "epoch": 1.98, "grad_norm": 32.012807334453846, "learning_rate": 8.753194293770194e-11, "logits/chosen": -1.3286519050598145, "logits/rejected": -1.2325983047485352, "logps/chosen": -229.2296905517578, "logps/rejected": -372.2969970703125, "loss": 0.3641, "rewards/accuracies": 0.84375, "rewards/chosen": -1.694801688194275, "rewards/margins": 1.5285245180130005, "rewards/rejected": -3.2233262062072754, "step": 11520 }, { "epoch": 1.99, "grad_norm": 30.754768134489776, "learning_rate": 6.87694993363186e-11, "logits/chosen": -1.3455301523208618, "logits/rejected": -1.2808736562728882, "logps/chosen": -218.40139770507812, "logps/rejected": -335.3419494628906, "loss": 0.4166, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.6494470834732056, "rewards/margins": 1.2140170335769653, "rewards/rejected": -2.863463878631592, "step": 11530 }, { "epoch": 1.99, "grad_norm": 52.90046362094887, "learning_rate": 5.226720309656207e-11, "logits/chosen": -1.3888723850250244, "logits/rejected": -1.3319811820983887, "logps/chosen": -217.9016876220703, "logps/rejected": -374.586669921875, "loss": 0.3692, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.6716915369033813, "rewards/margins": 1.5221760272979736, "rewards/rejected": -3.1938672065734863, "step": 11540 }, { "epoch": 1.99, "grad_norm": 32.18141803466796, "learning_rate": 3.802520345000393e-11, "logits/chosen": -1.3312755823135376, "logits/rejected": -1.2780405282974243, "logps/chosen": -224.06698608398438, "logps/rejected": -344.835205078125, "loss": 0.46, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.7274624109268188, "rewards/margins": 1.1927831172943115, "rewards/rejected": -2.92024564743042, "step": 11550 }, { "epoch": 1.99, "grad_norm": 43.51210987286007, "learning_rate": 2.604362918812164e-11, "logits/chosen": -1.3421592712402344, "logits/rejected": -1.2694909572601318, "logps/chosen": -227.5470733642578, "logps/rejected": -358.80706787109375, "loss": 0.4156, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.7258684635162354, "rewards/margins": 1.3443853855133057, "rewards/rejected": -3.070253849029541, "step": 11560 }, { "epoch": 1.99, "grad_norm": 38.872585179691995, "learning_rate": 1.6322588661216163e-11, "logits/chosen": -1.3375303745269775, "logits/rejected": -1.271527647972107, "logps/chosen": -235.52359008789062, "logps/rejected": -379.27459716796875, "loss": 0.3802, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.8141777515411377, "rewards/margins": 1.421951413154602, "rewards/rejected": -3.23612904548645, "step": 11570 }, { "epoch": 2.0, "grad_norm": 42.37337871627236, "learning_rate": 8.862169777440475e-12, "logits/chosen": -1.3970595598220825, "logits/rejected": -1.3340885639190674, "logps/chosen": -227.24227905273438, "logps/rejected": -373.0314636230469, "loss": 0.3938, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.7100292444229126, "rewards/margins": 1.4638845920562744, "rewards/rejected": -3.1739137172698975, "step": 11580 }, { "epoch": 2.0, "grad_norm": 37.01271407525554, "learning_rate": 3.6624400018836485e-12, "logits/chosen": -1.2879482507705688, "logits/rejected": -1.215693712234497, "logps/chosen": -228.08352661132812, "logps/rejected": -351.55242919921875, "loss": 0.4093, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.6859127283096313, "rewards/margins": 1.3195241689682007, "rewards/rejected": -3.005436420440674, "step": 11590 }, { "epoch": 2.0, "grad_norm": 33.774484526273135, "learning_rate": 7.234463561267557e-13, "logits/chosen": -1.341506004333496, "logits/rejected": -1.2840282917022705, "logps/chosen": -213.89993286132812, "logps/rejected": -343.8310852050781, "loss": 0.429, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.634570837020874, "rewards/margins": 1.2662864923477173, "rewards/rejected": -2.900857448577881, "step": 11600 }, { "epoch": 2.0, "eval_logits/chosen": -1.4189956188201904, "eval_logits/rejected": -1.3920025825500488, "eval_logps/chosen": -244.38746643066406, "eval_logps/rejected": -295.19293212890625, "eval_loss": 0.6342188119888306, "eval_rewards/accuracies": 0.6579925417900085, "eval_rewards/chosen": -1.8568360805511475, "eval_rewards/margins": 0.4635196328163147, "eval_rewards/rejected": -2.3203558921813965, "eval_runtime": 356.6898, "eval_samples_per_second": 12.067, "eval_steps_per_second": 1.508, "step": 11600 }, { "epoch": 2.0, "step": 11608, "total_flos": 0.0, "train_loss": 0.5042109644922366, "train_runtime": 89019.0317, "train_samples_per_second": 2.086, "train_steps_per_second": 0.13 } ], "logging_steps": 10, "max_steps": 11608, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }