diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,19301 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.0, + "eval_steps": 100, + "global_step": 11608, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 2.184279469343464, + "learning_rate": 4.3066322136089575e-10, + "logits/chosen": -2.9685676097869873, + "logits/rejected": -2.926340103149414, + "logps/chosen": -44.04426574707031, + "logps/rejected": -41.580841064453125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 2.397164451396864, + "learning_rate": 4.306632213608958e-09, + "logits/chosen": -3.057889223098755, + "logits/rejected": -3.028320550918579, + "logps/chosen": -50.45764923095703, + "logps/rejected": -49.59663391113281, + "loss": 0.6931, + "rewards/accuracies": 0.3611111044883728, + "rewards/chosen": 4.533848914434202e-05, + "rewards/margins": 1.5664114471292123e-05, + "rewards/rejected": 2.9674369216081686e-05, + "step": 10 + }, + { + "epoch": 0.0, + "grad_norm": 2.2428396092279437, + "learning_rate": 8.613264427217916e-09, + "logits/chosen": -3.1213667392730713, + "logits/rejected": -3.113072633743286, + "logps/chosen": -52.6474494934082, + "logps/rejected": -52.98405075073242, + "loss": 0.6931, + "rewards/accuracies": 0.53125, + "rewards/chosen": -2.8674810891970992e-05, + "rewards/margins": 0.00011320582416374236, + "rewards/rejected": -0.00014188062050379813, + "step": 20 + }, + { + "epoch": 0.01, + "grad_norm": 2.5739690837982745, + "learning_rate": 1.2919896640826872e-08, + "logits/chosen": -3.093750476837158, + "logits/rejected": -3.0699524879455566, + "logps/chosen": -56.7930793762207, + "logps/rejected": -58.43015670776367, + "loss": 0.6932, + "rewards/accuracies": 0.46875, + "rewards/chosen": -0.00020466512069106102, + "rewards/margins": -1.0724004823714495e-05, + "rewards/rejected": -0.00019394111586734653, + "step": 30 + }, + { + "epoch": 0.01, + "grad_norm": 2.0121230094500575, + "learning_rate": 1.722652885443583e-08, + "logits/chosen": -3.107394218444824, + "logits/rejected": -3.075824499130249, + "logps/chosen": -55.259185791015625, + "logps/rejected": -50.681114196777344, + "loss": 0.693, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": 2.6132946004508995e-05, + "rewards/margins": 0.0003762342967092991, + "rewards/rejected": -0.00035010138526558876, + "step": 40 + }, + { + "epoch": 0.01, + "grad_norm": 2.387552940684203, + "learning_rate": 2.153316106804479e-08, + "logits/chosen": -3.1034653186798096, + "logits/rejected": -3.0867769718170166, + "logps/chosen": -53.10588455200195, + "logps/rejected": -51.49999237060547, + "loss": 0.6932, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -0.00013910401321481913, + "rewards/margins": -8.180685108527541e-05, + "rewards/rejected": -5.7297169405501336e-05, + "step": 50 + }, + { + "epoch": 0.01, + "grad_norm": 2.7970003076901, + "learning_rate": 2.5839793281653743e-08, + "logits/chosen": -3.156252384185791, + "logits/rejected": -3.1266000270843506, + "logps/chosen": -57.58796310424805, + "logps/rejected": -54.14855194091797, + "loss": 0.6931, + "rewards/accuracies": 0.53125, + "rewards/chosen": -4.136812640354037e-05, + "rewards/margins": 5.838483411935158e-05, + "rewards/rejected": -9.975295688491315e-05, + "step": 60 + }, + { + "epoch": 0.01, + "grad_norm": 2.204322757523193, + "learning_rate": 3.01464254952627e-08, + "logits/chosen": -3.0535032749176025, + "logits/rejected": -3.033651828765869, + "logps/chosen": -53.7407112121582, + "logps/rejected": -53.21503448486328, + "loss": 0.693, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.0001532899623271078, + "rewards/margins": 0.0002148848434444517, + "rewards/rejected": -6.159489566925913e-05, + "step": 70 + }, + { + "epoch": 0.01, + "grad_norm": 2.4363897145491564, + "learning_rate": 3.445305770887166e-08, + "logits/chosen": -3.1622116565704346, + "logits/rejected": -3.1288113594055176, + "logps/chosen": -59.07722091674805, + "logps/rejected": -54.100318908691406, + "loss": 0.693, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 6.512943946290761e-05, + "rewards/margins": 0.0002318086044397205, + "rewards/rejected": -0.00016667917952872813, + "step": 80 + }, + { + "epoch": 0.02, + "grad_norm": 2.4857228513755465, + "learning_rate": 3.8759689922480615e-08, + "logits/chosen": -2.996279239654541, + "logits/rejected": -2.9815406799316406, + "logps/chosen": -53.46660232543945, + "logps/rejected": -52.83372116088867, + "loss": 0.693, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.0002413403708487749, + "rewards/margins": 0.0003210466238670051, + "rewards/rejected": -7.97062530182302e-05, + "step": 90 + }, + { + "epoch": 0.02, + "grad_norm": 2.4854058944857753, + "learning_rate": 4.306632213608958e-08, + "logits/chosen": -3.1720452308654785, + "logits/rejected": -3.109947681427002, + "logps/chosen": -55.90839385986328, + "logps/rejected": -49.635841369628906, + "loss": 0.6929, + "rewards/accuracies": 0.59375, + "rewards/chosen": 0.0004091753507964313, + "rewards/margins": 0.0005016516079194844, + "rewards/rejected": -9.247624257113785e-05, + "step": 100 + }, + { + "epoch": 0.02, + "eval_logits/chosen": -3.165482521057129, + "eval_logits/rejected": -3.1598188877105713, + "eval_logps/chosen": -58.70554733276367, + "eval_logps/rejected": -63.15681457519531, + "eval_loss": 0.6931592226028442, + "eval_rewards/accuracies": 0.4986059367656708, + "eval_rewards/chosen": -1.7028520232997835e-05, + "eval_rewards/margins": -2.261956069560256e-05, + "eval_rewards/rejected": 5.5910377341206186e-06, + "eval_runtime": 356.9348, + "eval_samples_per_second": 12.058, + "eval_steps_per_second": 1.507, + "step": 100 + }, + { + "epoch": 0.02, + "grad_norm": 2.5379181619698423, + "learning_rate": 4.7372954349698534e-08, + "logits/chosen": -3.12424898147583, + "logits/rejected": -3.1003119945526123, + "logps/chosen": -55.57979202270508, + "logps/rejected": -52.30139923095703, + "loss": 0.6932, + "rewards/accuracies": 0.45625001192092896, + "rewards/chosen": -0.00025836736313067377, + "rewards/margins": -0.0001904887321870774, + "rewards/rejected": -6.787859456380829e-05, + "step": 110 + }, + { + "epoch": 0.02, + "grad_norm": 2.561368552467561, + "learning_rate": 5.1679586563307486e-08, + "logits/chosen": -3.0679683685302734, + "logits/rejected": -3.0525035858154297, + "logps/chosen": -53.182281494140625, + "logps/rejected": -55.54204177856445, + "loss": 0.6932, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.00013645360013470054, + "rewards/margins": -7.856530282879248e-05, + "rewards/rejected": -5.78882682020776e-05, + "step": 120 + }, + { + "epoch": 0.02, + "grad_norm": 2.138984879227857, + "learning_rate": 5.598621877691645e-08, + "logits/chosen": -3.10345458984375, + "logits/rejected": -3.089416980743408, + "logps/chosen": -55.18548583984375, + "logps/rejected": -53.74910354614258, + "loss": 0.6931, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -3.565276711015031e-05, + "rewards/margins": 0.00010266680328641087, + "rewards/rejected": -0.0001383195340167731, + "step": 130 + }, + { + "epoch": 0.02, + "grad_norm": 2.4347210527199588, + "learning_rate": 6.02928509905254e-08, + "logits/chosen": -3.1250388622283936, + "logits/rejected": -3.106936454772949, + "logps/chosen": -54.17211151123047, + "logps/rejected": -53.7529296875, + "loss": 0.6931, + "rewards/accuracies": 0.48124998807907104, + "rewards/chosen": 1.1661546523100697e-05, + "rewards/margins": -4.082123723492259e-06, + "rewards/rejected": 1.5743673429824412e-05, + "step": 140 + }, + { + "epoch": 0.03, + "grad_norm": 2.213735398044619, + "learning_rate": 6.459948320413436e-08, + "logits/chosen": -3.0300798416137695, + "logits/rejected": -3.0123374462127686, + "logps/chosen": -52.598976135253906, + "logps/rejected": -52.38323211669922, + "loss": 0.6932, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -6.447284249588847e-05, + "rewards/margins": -5.51502344023902e-05, + "rewards/rejected": -9.322635378339328e-06, + "step": 150 + }, + { + "epoch": 0.03, + "grad_norm": 2.156655887327036, + "learning_rate": 6.890611541774332e-08, + "logits/chosen": -3.0911943912506104, + "logits/rejected": -3.070504665374756, + "logps/chosen": -53.4869499206543, + "logps/rejected": -54.68552780151367, + "loss": 0.6933, + "rewards/accuracies": 0.4437499940395355, + "rewards/chosen": -5.7128123444272205e-05, + "rewards/margins": -0.00023163272999227047, + "rewards/rejected": 0.00017450464656576514, + "step": 160 + }, + { + "epoch": 0.03, + "grad_norm": 2.3532949837745685, + "learning_rate": 7.321274763135228e-08, + "logits/chosen": -3.0787293910980225, + "logits/rejected": -3.0594067573547363, + "logps/chosen": -56.2595100402832, + "logps/rejected": -51.335472106933594, + "loss": 0.6931, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": 0.00012248748680576682, + "rewards/margins": 0.00017826970724854618, + "rewards/rejected": -5.5782216804800555e-05, + "step": 170 + }, + { + "epoch": 0.03, + "grad_norm": 2.6258607223080777, + "learning_rate": 7.751937984496123e-08, + "logits/chosen": -3.0651602745056152, + "logits/rejected": -3.0461204051971436, + "logps/chosen": -56.38677215576172, + "logps/rejected": -53.772865295410156, + "loss": 0.6931, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 8.890104800229892e-05, + "rewards/margins": 6.191270949784666e-05, + "rewards/rejected": 2.6988331228494644e-05, + "step": 180 + }, + { + "epoch": 0.03, + "grad_norm": 2.636977530600279, + "learning_rate": 8.18260120585702e-08, + "logits/chosen": -3.126418352127075, + "logits/rejected": -3.0830445289611816, + "logps/chosen": -58.16786575317383, + "logps/rejected": -52.552574157714844, + "loss": 0.6929, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": 8.153868111548945e-05, + "rewards/margins": 0.00046413601376116276, + "rewards/rejected": -0.0003825973253697157, + "step": 190 + }, + { + "epoch": 0.03, + "grad_norm": 2.580185714051456, + "learning_rate": 8.613264427217916e-08, + "logits/chosen": -3.0618324279785156, + "logits/rejected": -3.04618239402771, + "logps/chosen": -54.1072998046875, + "logps/rejected": -54.72209548950195, + "loss": 0.693, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -2.7029391276300885e-05, + "rewards/margins": 0.0002754017186816782, + "rewards/rejected": -0.00030243111541494727, + "step": 200 + }, + { + "epoch": 0.03, + "eval_logits/chosen": -3.165257692337036, + "eval_logits/rejected": -3.159618616104126, + "eval_logps/chosen": -58.68031311035156, + "eval_logps/rejected": -63.137481689453125, + "eval_loss": 0.69312983751297, + "eval_rewards/accuracies": 0.5127788186073303, + "eval_rewards/chosen": 0.0002353396121179685, + "eval_rewards/margins": 3.6308691051090136e-05, + "eval_rewards/rejected": 0.00019903088104911149, + "eval_runtime": 355.2015, + "eval_samples_per_second": 12.117, + "eval_steps_per_second": 1.515, + "step": 200 + }, + { + "epoch": 0.04, + "grad_norm": 2.28669707979838, + "learning_rate": 9.043927648578811e-08, + "logits/chosen": -3.0168232917785645, + "logits/rejected": -3.008084535598755, + "logps/chosen": -53.26890182495117, + "logps/rejected": -57.292236328125, + "loss": 0.6932, + "rewards/accuracies": 0.4312500059604645, + "rewards/chosen": -0.00018195889424532652, + "rewards/margins": -8.943781722337008e-05, + "rewards/rejected": -9.252109157387167e-05, + "step": 210 + }, + { + "epoch": 0.04, + "grad_norm": 2.336438367711948, + "learning_rate": 9.474590869939707e-08, + "logits/chosen": -3.053389072418213, + "logits/rejected": -3.022315502166748, + "logps/chosen": -52.211769104003906, + "logps/rejected": -51.38096237182617, + "loss": 0.6928, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.0002459119423292577, + "rewards/margins": 0.0007187900482676923, + "rewards/rejected": -0.00096470199059695, + "step": 220 + }, + { + "epoch": 0.04, + "grad_norm": 2.394447162636319, + "learning_rate": 9.905254091300602e-08, + "logits/chosen": -3.0536513328552246, + "logits/rejected": -3.0352864265441895, + "logps/chosen": -48.92305374145508, + "logps/rejected": -50.00139617919922, + "loss": 0.693, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.000386159576009959, + "rewards/margins": 0.00027917162515223026, + "rewards/rejected": -0.0006653312011621892, + "step": 230 + }, + { + "epoch": 0.04, + "grad_norm": 2.25083824617627, + "learning_rate": 1.0335917312661497e-07, + "logits/chosen": -3.026599168777466, + "logits/rejected": -2.9841794967651367, + "logps/chosen": -55.9691276550293, + "logps/rejected": -52.21491622924805, + "loss": 0.6931, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.0005459034582599998, + "rewards/margins": 9.160184708889574e-05, + "rewards/rejected": -0.0006375053199008107, + "step": 240 + }, + { + "epoch": 0.04, + "grad_norm": 2.323445591258243, + "learning_rate": 1.0766580534022394e-07, + "logits/chosen": -3.1199052333831787, + "logits/rejected": -3.0994296073913574, + "logps/chosen": -52.30159378051758, + "logps/rejected": -51.17644119262695, + "loss": 0.6928, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.00036477210232988, + "rewards/margins": 0.000624177569989115, + "rewards/rejected": -0.000988949672318995, + "step": 250 + }, + { + "epoch": 0.04, + "grad_norm": 2.316340782882154, + "learning_rate": 1.119724375538329e-07, + "logits/chosen": -3.0962424278259277, + "logits/rejected": -3.0838680267333984, + "logps/chosen": -54.875404357910156, + "logps/rejected": -56.73250198364258, + "loss": 0.6927, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.00046339546679519117, + "rewards/margins": 0.000800677458755672, + "rewards/rejected": -0.0012640730710700154, + "step": 260 + }, + { + "epoch": 0.05, + "grad_norm": 2.212868394326073, + "learning_rate": 1.1627906976744186e-07, + "logits/chosen": -3.034665584564209, + "logits/rejected": -3.0166120529174805, + "logps/chosen": -53.17912673950195, + "logps/rejected": -54.439247131347656, + "loss": 0.6927, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.00068662193370983, + "rewards/margins": 0.000886773515958339, + "rewards/rejected": -0.0015733955660834908, + "step": 270 + }, + { + "epoch": 0.05, + "grad_norm": 2.4332688553771162, + "learning_rate": 1.205857019810508e-07, + "logits/chosen": -3.125800609588623, + "logits/rejected": -3.0919315814971924, + "logps/chosen": -57.64659881591797, + "logps/rejected": -53.57320022583008, + "loss": 0.6925, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.00048610559315420687, + "rewards/margins": 0.0012957851868122816, + "rewards/rejected": -0.0017818908672779799, + "step": 280 + }, + { + "epoch": 0.05, + "grad_norm": 2.249917169819848, + "learning_rate": 1.2489233419465976e-07, + "logits/chosen": -3.048657178878784, + "logits/rejected": -3.034323215484619, + "logps/chosen": -55.451141357421875, + "logps/rejected": -54.447296142578125, + "loss": 0.6926, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.0007702001603320241, + "rewards/margins": 0.0010641098488122225, + "rewards/rejected": -0.0018343102419748902, + "step": 290 + }, + { + "epoch": 0.05, + "grad_norm": 2.369298746667673, + "learning_rate": 1.2919896640826872e-07, + "logits/chosen": -3.004129409790039, + "logits/rejected": -2.9955711364746094, + "logps/chosen": -52.908668518066406, + "logps/rejected": -54.089874267578125, + "loss": 0.6926, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.0007483543595299125, + "rewards/margins": 0.0010376429418101907, + "rewards/rejected": -0.0017859973013401031, + "step": 300 + }, + { + "epoch": 0.05, + "eval_logits/chosen": -3.163788318634033, + "eval_logits/rejected": -3.158111572265625, + "eval_logps/chosen": -58.64423751831055, + "eval_logps/rejected": -63.127235412597656, + "eval_loss": 0.6930013298988342, + "eval_rewards/accuracies": 0.5394981503486633, + "eval_rewards/chosen": 0.0005960779963061213, + "eval_rewards/margins": 0.00029463876853697, + "eval_rewards/rejected": 0.0003014392568729818, + "eval_runtime": 356.1408, + "eval_samples_per_second": 12.085, + "eval_steps_per_second": 1.511, + "step": 300 + }, + { + "epoch": 0.05, + "grad_norm": 2.4743047933409654, + "learning_rate": 1.335055986218777e-07, + "logits/chosen": -3.0664687156677246, + "logits/rejected": -3.060901641845703, + "logps/chosen": -53.61384201049805, + "logps/rejected": -53.52678298950195, + "loss": 0.6926, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.0011877021752297878, + "rewards/margins": 0.0011907459702342749, + "rewards/rejected": -0.0023784481454640627, + "step": 310 + }, + { + "epoch": 0.06, + "grad_norm": 2.4049747365414973, + "learning_rate": 1.3781223083548665e-07, + "logits/chosen": -3.023968458175659, + "logits/rejected": -2.9977526664733887, + "logps/chosen": -54.628395080566406, + "logps/rejected": -49.533180236816406, + "loss": 0.6923, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.0013292916119098663, + "rewards/margins": 0.0016255916561931372, + "rewards/rejected": -0.0029548832681030035, + "step": 320 + }, + { + "epoch": 0.06, + "grad_norm": 2.3870544128030664, + "learning_rate": 1.421188630490956e-07, + "logits/chosen": -3.0835556983947754, + "logits/rejected": -3.0599796772003174, + "logps/chosen": -55.1311149597168, + "logps/rejected": -52.4721794128418, + "loss": 0.6923, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.0011483869748190045, + "rewards/margins": 0.0016829262021929026, + "rewards/rejected": -0.0028313130605965853, + "step": 330 + }, + { + "epoch": 0.06, + "grad_norm": 2.177069307915396, + "learning_rate": 1.4642549526270455e-07, + "logits/chosen": -3.0061721801757812, + "logits/rejected": -2.9844064712524414, + "logps/chosen": -52.63057327270508, + "logps/rejected": -52.16088104248047, + "loss": 0.6926, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.001195924007333815, + "rewards/margins": 0.0012026333715766668, + "rewards/rejected": -0.0023985574953258038, + "step": 340 + }, + { + "epoch": 0.06, + "grad_norm": 2.3234378454912106, + "learning_rate": 1.507321274763135e-07, + "logits/chosen": -2.978062391281128, + "logits/rejected": -2.9385359287261963, + "logps/chosen": -56.37324142456055, + "logps/rejected": -53.88068389892578, + "loss": 0.6923, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.0015299760270863771, + "rewards/margins": 0.0017499777022749186, + "rewards/rejected": -0.003279953496530652, + "step": 350 + }, + { + "epoch": 0.06, + "grad_norm": 2.4262238382471386, + "learning_rate": 1.5503875968992246e-07, + "logits/chosen": -3.1277754306793213, + "logits/rejected": -3.1047608852386475, + "logps/chosen": -54.798912048339844, + "logps/rejected": -50.93855667114258, + "loss": 0.6921, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.0022310030180960894, + "rewards/margins": 0.00208022678270936, + "rewards/rejected": -0.004311230033636093, + "step": 360 + }, + { + "epoch": 0.06, + "grad_norm": 2.349381595760878, + "learning_rate": 1.5934539190353144e-07, + "logits/chosen": -3.1027891635894775, + "logits/rejected": -3.0734617710113525, + "logps/chosen": -52.51411819458008, + "logps/rejected": -51.80864715576172, + "loss": 0.6915, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.0014876796631142497, + "rewards/margins": 0.0032487933058291674, + "rewards/rejected": -0.0047364733181893826, + "step": 370 + }, + { + "epoch": 0.07, + "grad_norm": 2.097053054398486, + "learning_rate": 1.636520241171404e-07, + "logits/chosen": -3.201812744140625, + "logits/rejected": -3.177008867263794, + "logps/chosen": -53.772377014160156, + "logps/rejected": -52.72692108154297, + "loss": 0.6919, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.0027772397734224796, + "rewards/margins": 0.002589695155620575, + "rewards/rejected": -0.005366935394704342, + "step": 380 + }, + { + "epoch": 0.07, + "grad_norm": 2.410804418588189, + "learning_rate": 1.6795865633074934e-07, + "logits/chosen": -3.0987088680267334, + "logits/rejected": -3.0732438564300537, + "logps/chosen": -56.3135986328125, + "logps/rejected": -55.72515106201172, + "loss": 0.6917, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.0026503982953727245, + "rewards/margins": 0.0030277767218649387, + "rewards/rejected": -0.005678174551576376, + "step": 390 + }, + { + "epoch": 0.07, + "grad_norm": 2.178080396872868, + "learning_rate": 1.7226528854435832e-07, + "logits/chosen": -3.0680224895477295, + "logits/rejected": -3.0521273612976074, + "logps/chosen": -52.94443893432617, + "logps/rejected": -53.484153747558594, + "loss": 0.691, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.0031809869688004255, + "rewards/margins": 0.0043879905715584755, + "rewards/rejected": -0.00756897684186697, + "step": 400 + }, + { + "epoch": 0.07, + "eval_logits/chosen": -3.1602914333343506, + "eval_logits/rejected": -3.1546647548675537, + "eval_logps/chosen": -58.560550689697266, + "eval_logps/rejected": -63.11561965942383, + "eval_loss": 0.6926479339599609, + "eval_rewards/accuracies": 0.5611059665679932, + "eval_rewards/chosen": 0.00143293512519449, + "eval_rewards/margins": 0.0010153905022889376, + "eval_rewards/rejected": 0.00041754471021704376, + "eval_runtime": 357.1134, + "eval_samples_per_second": 12.052, + "eval_steps_per_second": 1.507, + "step": 400 + }, + { + "epoch": 0.07, + "grad_norm": 2.1621440173846986, + "learning_rate": 1.7657192075796725e-07, + "logits/chosen": -3.071235179901123, + "logits/rejected": -3.0671350955963135, + "logps/chosen": -51.14980697631836, + "logps/rejected": -56.166534423828125, + "loss": 0.6921, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.004188390448689461, + "rewards/margins": 0.002166205085813999, + "rewards/rejected": -0.00635459553450346, + "step": 410 + }, + { + "epoch": 0.07, + "grad_norm": 2.497135126567034, + "learning_rate": 1.8087855297157623e-07, + "logits/chosen": -3.0559000968933105, + "logits/rejected": -3.0479495525360107, + "logps/chosen": -54.61994552612305, + "logps/rejected": -54.487693786621094, + "loss": 0.6925, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.004710297100245953, + "rewards/margins": 0.0013622719561681151, + "rewards/rejected": -0.006072568707168102, + "step": 420 + }, + { + "epoch": 0.07, + "grad_norm": 2.277715100196342, + "learning_rate": 1.8518518518518516e-07, + "logits/chosen": -3.083691358566284, + "logits/rejected": -3.069835662841797, + "logps/chosen": -53.39365768432617, + "logps/rejected": -54.8032341003418, + "loss": 0.6915, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0031691633630543947, + "rewards/margins": 0.0033808585721999407, + "rewards/rejected": -0.006550021469593048, + "step": 430 + }, + { + "epoch": 0.08, + "grad_norm": 2.5466826911783222, + "learning_rate": 1.8949181739879413e-07, + "logits/chosen": -3.128629684448242, + "logits/rejected": -3.093276262283325, + "logps/chosen": -54.6483268737793, + "logps/rejected": -54.21075439453125, + "loss": 0.6901, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.0036215814761817455, + "rewards/margins": 0.006118671037256718, + "rewards/rejected": -0.009740252047777176, + "step": 440 + }, + { + "epoch": 0.08, + "grad_norm": 2.2918968289848176, + "learning_rate": 1.9379844961240311e-07, + "logits/chosen": -3.050204038619995, + "logits/rejected": -3.0178327560424805, + "logps/chosen": -56.4046516418457, + "logps/rejected": -55.3991813659668, + "loss": 0.6907, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.0031705971341580153, + "rewards/margins": 0.004987316206097603, + "rewards/rejected": -0.008157914504408836, + "step": 450 + }, + { + "epoch": 0.08, + "grad_norm": 2.446433174118867, + "learning_rate": 1.9810508182601204e-07, + "logits/chosen": -3.0185251235961914, + "logits/rejected": -2.998788833618164, + "logps/chosen": -56.675636291503906, + "logps/rejected": -53.9906005859375, + "loss": 0.6905, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.004722902551293373, + "rewards/margins": 0.005447733215987682, + "rewards/rejected": -0.010170635767281055, + "step": 460 + }, + { + "epoch": 0.08, + "grad_norm": 2.260629002268937, + "learning_rate": 2.0241171403962102e-07, + "logits/chosen": -3.0403809547424316, + "logits/rejected": -3.0088233947753906, + "logps/chosen": -53.7476692199707, + "logps/rejected": -52.34978103637695, + "loss": 0.6906, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.004685810301452875, + "rewards/margins": 0.005127486772835255, + "rewards/rejected": -0.009813296608626842, + "step": 470 + }, + { + "epoch": 0.08, + "grad_norm": 2.4519536559144295, + "learning_rate": 2.0671834625322995e-07, + "logits/chosen": -3.0377235412597656, + "logits/rejected": -3.032456398010254, + "logps/chosen": -54.986839294433594, + "logps/rejected": -60.05756378173828, + "loss": 0.6914, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.007612565997987986, + "rewards/margins": 0.003608607454225421, + "rewards/rejected": -0.011221175082027912, + "step": 480 + }, + { + "epoch": 0.08, + "grad_norm": 2.4663820792794677, + "learning_rate": 2.1102497846683892e-07, + "logits/chosen": -2.9463233947753906, + "logits/rejected": -2.8967654705047607, + "logps/chosen": -61.15422439575195, + "logps/rejected": -52.75993728637695, + "loss": 0.6889, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.005517776124179363, + "rewards/margins": 0.008782900869846344, + "rewards/rejected": -0.014300678856670856, + "step": 490 + }, + { + "epoch": 0.09, + "grad_norm": 2.330775006199537, + "learning_rate": 2.1533161068044788e-07, + "logits/chosen": -3.0073325634002686, + "logits/rejected": -2.978564500808716, + "logps/chosen": -55.71733474731445, + "logps/rejected": -52.86774826049805, + "loss": 0.6907, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.007723568938672543, + "rewards/margins": 0.005096676293760538, + "rewards/rejected": -0.012820245698094368, + "step": 500 + }, + { + "epoch": 0.09, + "eval_logits/chosen": -3.1515376567840576, + "eval_logits/rejected": -3.1458804607391357, + "eval_logps/chosen": -58.49737548828125, + "eval_logps/rejected": -63.162086486816406, + "eval_loss": 0.6921212077140808, + "eval_rewards/accuracies": 0.5755111575126648, + "eval_rewards/chosen": 0.0020647228229790926, + "eval_rewards/margins": 0.0021117778960615396, + "eval_rewards/rejected": -4.7055131290107965e-05, + "eval_runtime": 357.3148, + "eval_samples_per_second": 12.045, + "eval_steps_per_second": 1.506, + "step": 500 + }, + { + "epoch": 0.09, + "grad_norm": 2.29181189171612, + "learning_rate": 2.1963824289405683e-07, + "logits/chosen": -2.9982523918151855, + "logits/rejected": -2.977153778076172, + "logps/chosen": -59.12430953979492, + "logps/rejected": -53.17649459838867, + "loss": 0.6916, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.008605951443314552, + "rewards/margins": 0.0032728458754718304, + "rewards/rejected": -0.01187879778444767, + "step": 510 + }, + { + "epoch": 0.09, + "grad_norm": 2.083836403157807, + "learning_rate": 2.239448751076658e-07, + "logits/chosen": -3.045750141143799, + "logits/rejected": -3.0202910900115967, + "logps/chosen": -57.38309860229492, + "logps/rejected": -53.21696090698242, + "loss": 0.6902, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.009720906615257263, + "rewards/margins": 0.006151780020445585, + "rewards/rejected": -0.015872687101364136, + "step": 520 + }, + { + "epoch": 0.09, + "grad_norm": 2.0972964611747837, + "learning_rate": 2.2825150732127476e-07, + "logits/chosen": -3.0414490699768066, + "logits/rejected": -3.0001041889190674, + "logps/chosen": -56.354469299316406, + "logps/rejected": -53.02845001220703, + "loss": 0.6874, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.006593731231987476, + "rewards/margins": 0.01189809013158083, + "rewards/rejected": -0.018491821363568306, + "step": 530 + }, + { + "epoch": 0.09, + "grad_norm": 2.264333945297249, + "learning_rate": 2.3255813953488372e-07, + "logits/chosen": -3.0273146629333496, + "logits/rejected": -3.0108981132507324, + "logps/chosen": -54.00878143310547, + "logps/rejected": -54.52630615234375, + "loss": 0.6909, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.011568492278456688, + "rewards/margins": 0.004814336076378822, + "rewards/rejected": -0.01638282835483551, + "step": 540 + }, + { + "epoch": 0.09, + "grad_norm": 2.2643011063724696, + "learning_rate": 2.3686477174849267e-07, + "logits/chosen": -3.086310863494873, + "logits/rejected": -3.0684800148010254, + "logps/chosen": -54.64630126953125, + "logps/rejected": -53.90349197387695, + "loss": 0.689, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.010920585133135319, + "rewards/margins": 0.008641783148050308, + "rewards/rejected": -0.0195623692125082, + "step": 550 + }, + { + "epoch": 0.1, + "grad_norm": 2.6473142726505916, + "learning_rate": 2.411714039621016e-07, + "logits/chosen": -3.0569510459899902, + "logits/rejected": -3.0491485595703125, + "logps/chosen": -53.267906188964844, + "logps/rejected": -57.03251266479492, + "loss": 0.6887, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.011830061674118042, + "rewards/margins": 0.009134244173765182, + "rewards/rejected": -0.020964305847883224, + "step": 560 + }, + { + "epoch": 0.1, + "grad_norm": 2.2526602237955093, + "learning_rate": 2.454780361757106e-07, + "logits/chosen": -3.0276739597320557, + "logits/rejected": -3.019141435623169, + "logps/chosen": -52.507972717285156, + "logps/rejected": -55.79932403564453, + "loss": 0.6892, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.011621413752436638, + "rewards/margins": 0.008228513412177563, + "rewards/rejected": -0.019849926233291626, + "step": 570 + }, + { + "epoch": 0.1, + "grad_norm": 1.8450642814148608, + "learning_rate": 2.4978466838931953e-07, + "logits/chosen": -3.0327651500701904, + "logits/rejected": -3.0267834663391113, + "logps/chosen": -52.5694580078125, + "logps/rejected": -55.16640090942383, + "loss": 0.691, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.014748876914381981, + "rewards/margins": 0.004590337164700031, + "rewards/rejected": -0.019339213147759438, + "step": 580 + }, + { + "epoch": 0.1, + "grad_norm": 2.0917320499094676, + "learning_rate": 2.540913006029285e-07, + "logits/chosen": -3.029935359954834, + "logits/rejected": -3.0082030296325684, + "logps/chosen": -56.06707763671875, + "logps/rejected": -56.61497116088867, + "loss": 0.6903, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.012053562328219414, + "rewards/margins": 0.006142253987491131, + "rewards/rejected": -0.01819581724703312, + "step": 590 + }, + { + "epoch": 0.1, + "grad_norm": 2.3704490382647596, + "learning_rate": 2.5839793281653743e-07, + "logits/chosen": -3.001335382461548, + "logits/rejected": -2.9773948192596436, + "logps/chosen": -54.8140869140625, + "logps/rejected": -59.22527313232422, + "loss": 0.6852, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.009530474431812763, + "rewards/margins": 0.01643664576113224, + "rewards/rejected": -0.025967121124267578, + "step": 600 + }, + { + "epoch": 0.1, + "eval_logits/chosen": -3.1388049125671387, + "eval_logits/rejected": -3.1331238746643066, + "eval_logps/chosen": -58.60028839111328, + "eval_logps/rejected": -63.40556716918945, + "eval_loss": 0.6914582848548889, + "eval_rewards/accuracies": 0.5822490453720093, + "eval_rewards/chosen": 0.0010355679551139474, + "eval_rewards/margins": 0.0035174190998077393, + "eval_rewards/rejected": -0.0024818514939397573, + "eval_runtime": 357.2236, + "eval_samples_per_second": 12.048, + "eval_steps_per_second": 1.506, + "step": 600 + }, + { + "epoch": 0.11, + "grad_norm": 2.356076010409613, + "learning_rate": 2.627045650301464e-07, + "logits/chosen": -2.964993715286255, + "logits/rejected": -2.9622960090637207, + "logps/chosen": -54.120277404785156, + "logps/rejected": -55.163963317871094, + "loss": 0.6917, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.015915414318442345, + "rewards/margins": 0.0033319643698632717, + "rewards/rejected": -0.019247379153966904, + "step": 610 + }, + { + "epoch": 0.11, + "grad_norm": 2.474319380074331, + "learning_rate": 2.670111972437554e-07, + "logits/chosen": -3.120192289352417, + "logits/rejected": -3.0936880111694336, + "logps/chosen": -56.2474479675293, + "logps/rejected": -56.004188537597656, + "loss": 0.6857, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.012355372309684753, + "rewards/margins": 0.015266923233866692, + "rewards/rejected": -0.027622297406196594, + "step": 620 + }, + { + "epoch": 0.11, + "grad_norm": 2.5148230940892784, + "learning_rate": 2.713178294573643e-07, + "logits/chosen": -3.101238250732422, + "logits/rejected": -3.0744516849517822, + "logps/chosen": -55.61579513549805, + "logps/rejected": -53.38257598876953, + "loss": 0.6895, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.01770562306046486, + "rewards/margins": 0.007805233355611563, + "rewards/rejected": -0.02551085688173771, + "step": 630 + }, + { + "epoch": 0.11, + "grad_norm": 2.4893970821776743, + "learning_rate": 2.756244616709733e-07, + "logits/chosen": -3.067873239517212, + "logits/rejected": -3.056884527206421, + "logps/chosen": -54.79213333129883, + "logps/rejected": -56.961280822753906, + "loss": 0.6896, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.018890097737312317, + "rewards/margins": 0.007660907693207264, + "rewards/rejected": -0.026551008224487305, + "step": 640 + }, + { + "epoch": 0.11, + "grad_norm": 2.8078860081688024, + "learning_rate": 2.799310938845822e-07, + "logits/chosen": -3.078247308731079, + "logits/rejected": -3.0796327590942383, + "logps/chosen": -53.70562744140625, + "logps/rejected": -57.23451614379883, + "loss": 0.6925, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -0.02245044708251953, + "rewards/margins": 0.0018996421713382006, + "rewards/rejected": -0.024350086227059364, + "step": 650 + }, + { + "epoch": 0.11, + "grad_norm": 2.4337428834992307, + "learning_rate": 2.842377260981912e-07, + "logits/chosen": -2.967741012573242, + "logits/rejected": -2.9617857933044434, + "logps/chosen": -56.363494873046875, + "logps/rejected": -54.866737365722656, + "loss": 0.689, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.017431819811463356, + "rewards/margins": 0.008792152628302574, + "rewards/rejected": -0.02622397243976593, + "step": 660 + }, + { + "epoch": 0.12, + "grad_norm": 2.339727211875298, + "learning_rate": 2.885443583118002e-07, + "logits/chosen": -2.9894890785217285, + "logits/rejected": -2.984511137008667, + "logps/chosen": -54.77692794799805, + "logps/rejected": -60.03133010864258, + "loss": 0.6897, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.01770101860165596, + "rewards/margins": 0.007546558044850826, + "rewards/rejected": -0.02524757757782936, + "step": 670 + }, + { + "epoch": 0.12, + "grad_norm": 2.5872483817238128, + "learning_rate": 2.928509905254091e-07, + "logits/chosen": -2.9396934509277344, + "logits/rejected": -2.914903163909912, + "logps/chosen": -55.78126907348633, + "logps/rejected": -54.39397430419922, + "loss": 0.685, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.018597902730107307, + "rewards/margins": 0.016813453286886215, + "rewards/rejected": -0.035411350429058075, + "step": 680 + }, + { + "epoch": 0.12, + "grad_norm": 2.7292903036637064, + "learning_rate": 2.971576227390181e-07, + "logits/chosen": -3.085705518722534, + "logits/rejected": -3.056213617324829, + "logps/chosen": -60.99599075317383, + "logps/rejected": -53.571380615234375, + "loss": 0.6877, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.019351843744516373, + "rewards/margins": 0.011478090658783913, + "rewards/rejected": -0.030829936265945435, + "step": 690 + }, + { + "epoch": 0.12, + "grad_norm": 2.389276468868878, + "learning_rate": 3.01464254952627e-07, + "logits/chosen": -3.0419516563415527, + "logits/rejected": -3.0135397911071777, + "logps/chosen": -57.74323272705078, + "logps/rejected": -56.58147430419922, + "loss": 0.6854, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.02020222134888172, + "rewards/margins": 0.016081832349300385, + "rewards/rejected": -0.03628405183553696, + "step": 700 + }, + { + "epoch": 0.12, + "eval_logits/chosen": -3.1206588745117188, + "eval_logits/rejected": -3.114993095397949, + "eval_logps/chosen": -58.9453010559082, + "eval_logps/rejected": -63.95465087890625, + "eval_loss": 0.6904971599578857, + "eval_rewards/accuracies": 0.5894516706466675, + "eval_rewards/chosen": -0.0024145517963916063, + "eval_rewards/margins": 0.005558097269386053, + "eval_rewards/rejected": -0.007972650229930878, + "eval_runtime": 357.4068, + "eval_samples_per_second": 12.042, + "eval_steps_per_second": 1.505, + "step": 700 + }, + { + "epoch": 0.12, + "grad_norm": 2.586197787105945, + "learning_rate": 3.05770887166236e-07, + "logits/chosen": -3.0198841094970703, + "logits/rejected": -2.9914581775665283, + "logps/chosen": -56.7116584777832, + "logps/rejected": -58.61717987060547, + "loss": 0.6843, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.020994270220398903, + "rewards/margins": 0.018473895266652107, + "rewards/rejected": -0.03946816921234131, + "step": 710 + }, + { + "epoch": 0.12, + "grad_norm": 2.702057793681809, + "learning_rate": 3.100775193798449e-07, + "logits/chosen": -2.9804978370666504, + "logits/rejected": -2.9764490127563477, + "logps/chosen": -55.76905059814453, + "logps/rejected": -58.02165603637695, + "loss": 0.6876, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.024686764925718307, + "rewards/margins": 0.011716444976627827, + "rewards/rejected": -0.03640320897102356, + "step": 720 + }, + { + "epoch": 0.13, + "grad_norm": 2.6466880554114547, + "learning_rate": 3.143841515934539e-07, + "logits/chosen": -3.0941200256347656, + "logits/rejected": -3.069159746170044, + "logps/chosen": -59.358177185058594, + "logps/rejected": -57.388641357421875, + "loss": 0.6844, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.03032388910651207, + "rewards/margins": 0.01852940209209919, + "rewards/rejected": -0.04885329678654671, + "step": 730 + }, + { + "epoch": 0.13, + "grad_norm": 2.3238465611991073, + "learning_rate": 3.186907838070629e-07, + "logits/chosen": -2.9740123748779297, + "logits/rejected": -2.9482741355895996, + "logps/chosen": -57.704978942871094, + "logps/rejected": -58.32112503051758, + "loss": 0.6846, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.0279629435390234, + "rewards/margins": 0.01816297508776188, + "rewards/rejected": -0.04612591490149498, + "step": 740 + }, + { + "epoch": 0.13, + "grad_norm": 2.60752160244168, + "learning_rate": 3.229974160206718e-07, + "logits/chosen": -3.142927646636963, + "logits/rejected": -3.115029811859131, + "logps/chosen": -58.75443649291992, + "logps/rejected": -59.41529083251953, + "loss": 0.6801, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.027507567778229713, + "rewards/margins": 0.026881281286478043, + "rewards/rejected": -0.05438884347677231, + "step": 750 + }, + { + "epoch": 0.13, + "grad_norm": 3.113387241673442, + "learning_rate": 3.273040482342808e-07, + "logits/chosen": -2.9908509254455566, + "logits/rejected": -2.9526984691619873, + "logps/chosen": -57.714439392089844, + "logps/rejected": -54.94502639770508, + "loss": 0.683, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.03286290913820267, + "rewards/margins": 0.02157803811132908, + "rewards/rejected": -0.0544409453868866, + "step": 760 + }, + { + "epoch": 0.13, + "grad_norm": 2.4453618798570953, + "learning_rate": 3.3161068044788976e-07, + "logits/chosen": -3.038949489593506, + "logits/rejected": -3.015986680984497, + "logps/chosen": -56.661033630371094, + "logps/rejected": -57.70270538330078, + "loss": 0.685, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.037630461156368256, + "rewards/margins": 0.017523907124996185, + "rewards/rejected": -0.05515437200665474, + "step": 770 + }, + { + "epoch": 0.13, + "grad_norm": 2.7995118829195893, + "learning_rate": 3.359173126614987e-07, + "logits/chosen": -3.0295486450195312, + "logits/rejected": -2.9988982677459717, + "logps/chosen": -57.22774124145508, + "logps/rejected": -57.87713623046875, + "loss": 0.6804, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.04078695923089981, + "rewards/margins": 0.026560068130493164, + "rewards/rejected": -0.06734703481197357, + "step": 780 + }, + { + "epoch": 0.14, + "grad_norm": 2.59814741855216, + "learning_rate": 3.402239448751076e-07, + "logits/chosen": -3.0351967811584473, + "logits/rejected": -3.001878499984741, + "logps/chosen": -57.94755172729492, + "logps/rejected": -59.64392852783203, + "loss": 0.6835, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.04035332426428795, + "rewards/margins": 0.02070373296737671, + "rewards/rejected": -0.06105704978108406, + "step": 790 + }, + { + "epoch": 0.14, + "grad_norm": 2.979271826495132, + "learning_rate": 3.4453057708871665e-07, + "logits/chosen": -2.9210307598114014, + "logits/rejected": -2.8951001167297363, + "logps/chosen": -59.6242790222168, + "logps/rejected": -61.24828338623047, + "loss": 0.6829, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.04646407067775726, + "rewards/margins": 0.021958164870738983, + "rewards/rejected": -0.06842224299907684, + "step": 800 + }, + { + "epoch": 0.14, + "eval_logits/chosen": -3.094379186630249, + "eval_logits/rejected": -3.0886640548706055, + "eval_logps/chosen": -60.67955017089844, + "eval_logps/rejected": -66.09896087646484, + "eval_loss": 0.6886637806892395, + "eval_rewards/accuracies": 0.5734200477600098, + "eval_rewards/chosen": -0.019756997004151344, + "eval_rewards/margins": 0.009658826515078545, + "eval_rewards/rejected": -0.029415827244520187, + "eval_runtime": 356.2401, + "eval_samples_per_second": 12.082, + "eval_steps_per_second": 1.51, + "step": 800 + }, + { + "epoch": 0.14, + "grad_norm": 2.4246944915810724, + "learning_rate": 3.4883720930232557e-07, + "logits/chosen": -2.9902844429016113, + "logits/rejected": -2.9628844261169434, + "logps/chosen": -61.16777801513672, + "logps/rejected": -62.579185485839844, + "loss": 0.6814, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.045093193650245667, + "rewards/margins": 0.025155682116746902, + "rewards/rejected": -0.07024887204170227, + "step": 810 + }, + { + "epoch": 0.14, + "grad_norm": 2.572085405803537, + "learning_rate": 3.531438415159345e-07, + "logits/chosen": -3.0369210243225098, + "logits/rejected": -3.0116257667541504, + "logps/chosen": -56.83648681640625, + "logps/rejected": -58.9111328125, + "loss": 0.6793, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.05299956351518631, + "rewards/margins": 0.028944198042154312, + "rewards/rejected": -0.08194376528263092, + "step": 820 + }, + { + "epoch": 0.14, + "grad_norm": 2.8751549171801125, + "learning_rate": 3.574504737295435e-07, + "logits/chosen": -2.96337628364563, + "logits/rejected": -2.948317050933838, + "logps/chosen": -61.24248504638672, + "logps/rejected": -63.26741409301758, + "loss": 0.6811, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.06662433594465256, + "rewards/margins": 0.02630782127380371, + "rewards/rejected": -0.09293216466903687, + "step": 830 + }, + { + "epoch": 0.14, + "grad_norm": 2.78034944593138, + "learning_rate": 3.6175710594315246e-07, + "logits/chosen": -3.060987949371338, + "logits/rejected": -3.034540891647339, + "logps/chosen": -60.72172164916992, + "logps/rejected": -59.53022003173828, + "loss": 0.6774, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.06628672778606415, + "rewards/margins": 0.033673472702503204, + "rewards/rejected": -0.09996020048856735, + "step": 840 + }, + { + "epoch": 0.15, + "grad_norm": 2.6269588611544923, + "learning_rate": 3.660637381567614e-07, + "logits/chosen": -2.93640398979187, + "logits/rejected": -2.9257328510284424, + "logps/chosen": -58.38752365112305, + "logps/rejected": -64.44731140136719, + "loss": 0.6844, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.07559017837047577, + "rewards/margins": 0.01969515159726143, + "rewards/rejected": -0.0952853411436081, + "step": 850 + }, + { + "epoch": 0.15, + "grad_norm": 2.5848797454702646, + "learning_rate": 3.703703703703703e-07, + "logits/chosen": -2.9593284130096436, + "logits/rejected": -2.9393258094787598, + "logps/chosen": -60.31854248046875, + "logps/rejected": -63.26483154296875, + "loss": 0.6769, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.07502218335866928, + "rewards/margins": 0.03479871153831482, + "rewards/rejected": -0.1098209023475647, + "step": 860 + }, + { + "epoch": 0.15, + "grad_norm": 2.5705323083064373, + "learning_rate": 3.7467700258397934e-07, + "logits/chosen": -3.0270254611968994, + "logits/rejected": -3.0232787132263184, + "logps/chosen": -58.79963302612305, + "logps/rejected": -64.54035949707031, + "loss": 0.6772, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.07506883889436722, + "rewards/margins": 0.033983923494815826, + "rewards/rejected": -0.10905275493860245, + "step": 870 + }, + { + "epoch": 0.15, + "grad_norm": 2.689358158624382, + "learning_rate": 3.7898363479758827e-07, + "logits/chosen": -2.9357428550720215, + "logits/rejected": -2.9134020805358887, + "logps/chosen": -59.15460205078125, + "logps/rejected": -62.313621520996094, + "loss": 0.6765, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.07416196167469025, + "rewards/margins": 0.035411037504673004, + "rewards/rejected": -0.10957300662994385, + "step": 880 + }, + { + "epoch": 0.15, + "grad_norm": 2.6438277738000293, + "learning_rate": 3.832902670111972e-07, + "logits/chosen": -2.9593563079833984, + "logits/rejected": -2.9229862689971924, + "logps/chosen": -64.84220123291016, + "logps/rejected": -63.8851203918457, + "loss": 0.6778, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.06395251303911209, + "rewards/margins": 0.03320237249135971, + "rewards/rejected": -0.0971548929810524, + "step": 890 + }, + { + "epoch": 0.16, + "grad_norm": 2.832318924397106, + "learning_rate": 3.8759689922480623e-07, + "logits/chosen": -2.977108955383301, + "logits/rejected": -2.9669320583343506, + "logps/chosen": -62.267547607421875, + "logps/rejected": -63.87932586669922, + "loss": 0.6773, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.08133289217948914, + "rewards/margins": 0.033999234437942505, + "rewards/rejected": -0.11533211171627045, + "step": 900 + }, + { + "epoch": 0.16, + "eval_logits/chosen": -3.057013750076294, + "eval_logits/rejected": -3.051332473754883, + "eval_logps/chosen": -63.69248962402344, + "eval_logps/rejected": -69.66419982910156, + "eval_loss": 0.6862910985946655, + "eval_rewards/accuracies": 0.5929368138313293, + "eval_rewards/chosen": -0.049886368215084076, + "eval_rewards/margins": 0.01518191583454609, + "eval_rewards/rejected": -0.06506828218698502, + "eval_runtime": 356.6845, + "eval_samples_per_second": 12.067, + "eval_steps_per_second": 1.508, + "step": 900 + }, + { + "epoch": 0.16, + "grad_norm": 2.791646492165207, + "learning_rate": 3.9190353143841515e-07, + "logits/chosen": -2.9454283714294434, + "logits/rejected": -2.9370040893554688, + "logps/chosen": -60.4368896484375, + "logps/rejected": -65.48957061767578, + "loss": 0.676, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.09426609426736832, + "rewards/margins": 0.04028692469000816, + "rewards/rejected": -0.13455303013324738, + "step": 910 + }, + { + "epoch": 0.16, + "grad_norm": 3.011151931674002, + "learning_rate": 3.962101636520241e-07, + "logits/chosen": -2.9765427112579346, + "logits/rejected": -2.934263229370117, + "logps/chosen": -64.45548248291016, + "logps/rejected": -62.556602478027344, + "loss": 0.6796, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.10165347158908844, + "rewards/margins": 0.029758507385849953, + "rewards/rejected": -0.13141196966171265, + "step": 920 + }, + { + "epoch": 0.16, + "grad_norm": 3.2118502689123245, + "learning_rate": 4.0051679586563306e-07, + "logits/chosen": -3.031832456588745, + "logits/rejected": -3.0159494876861572, + "logps/chosen": -62.47021484375, + "logps/rejected": -68.52490234375, + "loss": 0.6746, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.10014158487319946, + "rewards/margins": 0.04036015272140503, + "rewards/rejected": -0.14050175249576569, + "step": 930 + }, + { + "epoch": 0.16, + "grad_norm": 2.9839009058497106, + "learning_rate": 4.0482342807924204e-07, + "logits/chosen": -3.00956392288208, + "logits/rejected": -2.973090410232544, + "logps/chosen": -71.03943634033203, + "logps/rejected": -69.27840423583984, + "loss": 0.6769, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.10842480510473251, + "rewards/margins": 0.03579176589846611, + "rewards/rejected": -0.14421656727790833, + "step": 940 + }, + { + "epoch": 0.16, + "grad_norm": 2.966178612417894, + "learning_rate": 4.0913006029285096e-07, + "logits/chosen": -2.802929639816284, + "logits/rejected": -2.785623073577881, + "logps/chosen": -67.41767883300781, + "logps/rejected": -71.18104553222656, + "loss": 0.6779, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.1217707023024559, + "rewards/margins": 0.033875979483127594, + "rewards/rejected": -0.1556466817855835, + "step": 950 + }, + { + "epoch": 0.17, + "grad_norm": 3.310978864654213, + "learning_rate": 4.134366925064599e-07, + "logits/chosen": -2.7671194076538086, + "logits/rejected": -2.770169734954834, + "logps/chosen": -63.908164978027344, + "logps/rejected": -69.78746032714844, + "loss": 0.6894, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.1316661685705185, + "rewards/margins": 0.010264839045703411, + "rewards/rejected": -0.14193101227283478, + "step": 960 + }, + { + "epoch": 0.17, + "grad_norm": 3.407472849470465, + "learning_rate": 4.177433247200689e-07, + "logits/chosen": -2.93009877204895, + "logits/rejected": -2.8950753211975098, + "logps/chosen": -72.93366241455078, + "logps/rejected": -67.29154968261719, + "loss": 0.6821, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.1230693906545639, + "rewards/margins": 0.025729473680257797, + "rewards/rejected": -0.148798868060112, + "step": 970 + }, + { + "epoch": 0.17, + "grad_norm": 3.6748252700178887, + "learning_rate": 4.2204995693367785e-07, + "logits/chosen": -3.024207353591919, + "logits/rejected": -3.0045340061187744, + "logps/chosen": -67.85172271728516, + "logps/rejected": -72.01612854003906, + "loss": 0.6733, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.11790040880441666, + "rewards/margins": 0.05413081496953964, + "rewards/rejected": -0.1720312237739563, + "step": 980 + }, + { + "epoch": 0.17, + "grad_norm": 3.726003522783883, + "learning_rate": 4.263565891472868e-07, + "logits/chosen": -2.889413833618164, + "logits/rejected": -2.8664448261260986, + "logps/chosen": -66.66841125488281, + "logps/rejected": -68.96918487548828, + "loss": 0.6738, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.12137794494628906, + "rewards/margins": 0.04220535233616829, + "rewards/rejected": -0.16358330845832825, + "step": 990 + }, + { + "epoch": 0.17, + "grad_norm": 3.397153414453954, + "learning_rate": 4.3066322136089576e-07, + "logits/chosen": -2.836636543273926, + "logits/rejected": -2.8090505599975586, + "logps/chosen": -71.08350372314453, + "logps/rejected": -68.55287170410156, + "loss": 0.6818, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.14227214455604553, + "rewards/margins": 0.02618454024195671, + "rewards/rejected": -0.16845668852329254, + "step": 1000 + }, + { + "epoch": 0.17, + "eval_logits/chosen": -3.0087461471557617, + "eval_logits/rejected": -3.003145694732666, + "eval_logps/chosen": -67.30126190185547, + "eval_logps/rejected": -73.87139892578125, + "eval_loss": 0.6837059855461121, + "eval_rewards/accuracies": 0.5971189737319946, + "eval_rewards/chosen": -0.08597413450479507, + "eval_rewards/margins": 0.02116604894399643, + "eval_rewards/rejected": -0.1071401834487915, + "eval_runtime": 356.1874, + "eval_samples_per_second": 12.084, + "eval_steps_per_second": 1.51, + "step": 1000 + }, + { + "epoch": 0.17, + "grad_norm": 3.4839564072636575, + "learning_rate": 4.3496985357450473e-07, + "logits/chosen": -2.771763324737549, + "logits/rejected": -2.781416416168213, + "logps/chosen": -67.20372009277344, + "logps/rejected": -78.4428939819336, + "loss": 0.6666, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.13454464077949524, + "rewards/margins": 0.07122045755386353, + "rewards/rejected": -0.20576509833335876, + "step": 1010 + }, + { + "epoch": 0.18, + "grad_norm": 3.456440043201763, + "learning_rate": 4.3927648578811366e-07, + "logits/chosen": -2.9379830360412598, + "logits/rejected": -2.9037129878997803, + "logps/chosen": -70.02944946289062, + "logps/rejected": -73.0363540649414, + "loss": 0.6657, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.1292036920785904, + "rewards/margins": 0.05881650000810623, + "rewards/rejected": -0.18802018463611603, + "step": 1020 + }, + { + "epoch": 0.18, + "grad_norm": 3.573048388794818, + "learning_rate": 4.4358311800172264e-07, + "logits/chosen": -2.9771835803985596, + "logits/rejected": -2.9505438804626465, + "logps/chosen": -67.1936264038086, + "logps/rejected": -70.21983337402344, + "loss": 0.6709, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.1349172741174698, + "rewards/margins": 0.048518020659685135, + "rewards/rejected": -0.18343529105186462, + "step": 1030 + }, + { + "epoch": 0.18, + "grad_norm": 4.0876459743090034, + "learning_rate": 4.478897502153316e-07, + "logits/chosen": -2.893394947052002, + "logits/rejected": -2.8834469318389893, + "logps/chosen": -68.99354553222656, + "logps/rejected": -74.55607604980469, + "loss": 0.6785, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1559665948152542, + "rewards/margins": 0.03430451080203056, + "rewards/rejected": -0.19027109444141388, + "step": 1040 + }, + { + "epoch": 0.18, + "grad_norm": 3.9493370777335084, + "learning_rate": 4.5219638242894055e-07, + "logits/chosen": -2.8083250522613525, + "logits/rejected": -2.773916721343994, + "logps/chosen": -73.37397766113281, + "logps/rejected": -71.31639862060547, + "loss": 0.6732, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.16260358691215515, + "rewards/margins": 0.04476263374090195, + "rewards/rejected": -0.2073661983013153, + "step": 1050 + }, + { + "epoch": 0.18, + "grad_norm": 3.7318845219809025, + "learning_rate": 4.565030146425495e-07, + "logits/chosen": -2.8137454986572266, + "logits/rejected": -2.7951653003692627, + "logps/chosen": -73.75227355957031, + "logps/rejected": -76.26736450195312, + "loss": 0.6754, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.1725139617919922, + "rewards/margins": 0.04043154790997505, + "rewards/rejected": -0.21294550597667694, + "step": 1060 + }, + { + "epoch": 0.18, + "grad_norm": 3.921970666271597, + "learning_rate": 4.6080964685615845e-07, + "logits/chosen": -2.9450836181640625, + "logits/rejected": -2.912147045135498, + "logps/chosen": -74.61820220947266, + "logps/rejected": -76.5606689453125, + "loss": 0.6729, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.18662679195404053, + "rewards/margins": 0.04664309695363045, + "rewards/rejected": -0.23326988518238068, + "step": 1070 + }, + { + "epoch": 0.19, + "grad_norm": 3.8287702054233956, + "learning_rate": 4.6511627906976743e-07, + "logits/chosen": -2.9213509559631348, + "logits/rejected": -2.8993239402770996, + "logps/chosen": -76.99789428710938, + "logps/rejected": -75.40824127197266, + "loss": 0.6781, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.21221812069416046, + "rewards/margins": 0.03467785567045212, + "rewards/rejected": -0.2468959540128708, + "step": 1080 + }, + { + "epoch": 0.19, + "grad_norm": 3.9047242283272396, + "learning_rate": 4.6942291128337636e-07, + "logits/chosen": -2.797940731048584, + "logits/rejected": -2.7896194458007812, + "logps/chosen": -71.64787292480469, + "logps/rejected": -77.1730728149414, + "loss": 0.6704, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.19434688985347748, + "rewards/margins": 0.051201529800891876, + "rewards/rejected": -0.24554841220378876, + "step": 1090 + }, + { + "epoch": 0.19, + "grad_norm": 3.742135088088785, + "learning_rate": 4.7372954349698534e-07, + "logits/chosen": -2.8556008338928223, + "logits/rejected": -2.8531699180603027, + "logps/chosen": -69.83229064941406, + "logps/rejected": -78.55072784423828, + "loss": 0.6715, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.18923941254615784, + "rewards/margins": 0.050502412021160126, + "rewards/rejected": -0.23974183201789856, + "step": 1100 + }, + { + "epoch": 0.19, + "eval_logits/chosen": -2.946117401123047, + "eval_logits/rejected": -2.94049072265625, + "eval_logps/chosen": -71.7703857421875, + "eval_logps/rejected": -79.22164916992188, + "eval_loss": 0.679982602596283, + "eval_rewards/accuracies": 0.6057156324386597, + "eval_rewards/chosen": -0.13066548109054565, + "eval_rewards/margins": 0.029977135360240936, + "eval_rewards/rejected": -0.1606426239013672, + "eval_runtime": 356.7556, + "eval_samples_per_second": 12.064, + "eval_steps_per_second": 1.508, + "step": 1100 + }, + { + "epoch": 0.19, + "grad_norm": 4.753611130936478, + "learning_rate": 4.780361757105943e-07, + "logits/chosen": -2.840238571166992, + "logits/rejected": -2.8512330055236816, + "logps/chosen": -71.57658386230469, + "logps/rejected": -81.72315979003906, + "loss": 0.6636, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.18691562116146088, + "rewards/margins": 0.0658998116850853, + "rewards/rejected": -0.2528154253959656, + "step": 1110 + }, + { + "epoch": 0.19, + "grad_norm": 4.200014824991902, + "learning_rate": 4.823428079242032e-07, + "logits/chosen": -2.8601462841033936, + "logits/rejected": -2.83921480178833, + "logps/chosen": -75.09476470947266, + "logps/rejected": -77.8160400390625, + "loss": 0.6691, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.1874438375234604, + "rewards/margins": 0.05362669751048088, + "rewards/rejected": -0.24107055366039276, + "step": 1120 + }, + { + "epoch": 0.19, + "grad_norm": 4.069383490654212, + "learning_rate": 4.866494401378123e-07, + "logits/chosen": -2.903735399246216, + "logits/rejected": -2.8804819583892822, + "logps/chosen": -72.0593490600586, + "logps/rejected": -81.51217651367188, + "loss": 0.6601, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.20027342438697815, + "rewards/margins": 0.07303432375192642, + "rewards/rejected": -0.273307740688324, + "step": 1130 + }, + { + "epoch": 0.2, + "grad_norm": 4.350410897251021, + "learning_rate": 4.909560723514212e-07, + "logits/chosen": -2.808011293411255, + "logits/rejected": -2.7773683071136475, + "logps/chosen": -77.37016296386719, + "logps/rejected": -78.98043060302734, + "loss": 0.6698, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.2049834281206131, + "rewards/margins": 0.055016178637742996, + "rewards/rejected": -0.2599996030330658, + "step": 1140 + }, + { + "epoch": 0.2, + "grad_norm": 4.680052319466735, + "learning_rate": 4.952627045650301e-07, + "logits/chosen": -2.7504663467407227, + "logits/rejected": -2.7321338653564453, + "logps/chosen": -74.69068908691406, + "logps/rejected": -83.3788833618164, + "loss": 0.6601, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.21349653601646423, + "rewards/margins": 0.0752733126282692, + "rewards/rejected": -0.28876984119415283, + "step": 1150 + }, + { + "epoch": 0.2, + "grad_norm": 4.383585223712436, + "learning_rate": 4.995693367786391e-07, + "logits/chosen": -2.8323190212249756, + "logits/rejected": -2.823431968688965, + "logps/chosen": -74.20658874511719, + "logps/rejected": -82.88806915283203, + "loss": 0.6568, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.20177213847637177, + "rewards/margins": 0.08152016252279282, + "rewards/rejected": -0.283292293548584, + "step": 1160 + }, + { + "epoch": 0.2, + "grad_norm": 4.7924512014269425, + "learning_rate": 4.999990843883228e-07, + "logits/chosen": -2.706714630126953, + "logits/rejected": -2.694248914718628, + "logps/chosen": -75.1012191772461, + "logps/rejected": -86.26618194580078, + "loss": 0.6645, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.2305610179901123, + "rewards/margins": 0.08545393496751785, + "rewards/rejected": -0.31601497530937195, + "step": 1170 + }, + { + "epoch": 0.2, + "grad_norm": 4.531230658306969, + "learning_rate": 4.999959193195308e-07, + "logits/chosen": -2.665681838989258, + "logits/rejected": -2.6355409622192383, + "logps/chosen": -78.78900146484375, + "logps/rejected": -83.79048156738281, + "loss": 0.6643, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.2599770426750183, + "rewards/margins": 0.06831606477499008, + "rewards/rejected": -0.3282931447029114, + "step": 1180 + }, + { + "epoch": 0.21, + "grad_norm": 4.6260170747775105, + "learning_rate": 4.999904935183911e-07, + "logits/chosen": -2.841900587081909, + "logits/rejected": -2.8088276386260986, + "logps/chosen": -83.71769714355469, + "logps/rejected": -83.9743423461914, + "loss": 0.6567, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.25023940205574036, + "rewards/margins": 0.08566378057003021, + "rewards/rejected": -0.33590319752693176, + "step": 1190 + }, + { + "epoch": 0.21, + "grad_norm": 4.214091282801035, + "learning_rate": 4.999828070339698e-07, + "logits/chosen": -2.669875144958496, + "logits/rejected": -2.655302047729492, + "logps/chosen": -79.1661605834961, + "logps/rejected": -83.97240447998047, + "loss": 0.6651, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.2603180706501007, + "rewards/margins": 0.0669156163930893, + "rewards/rejected": -0.3272337019443512, + "step": 1200 + }, + { + "epoch": 0.21, + "eval_logits/chosen": -2.85164475440979, + "eval_logits/rejected": -2.84596586227417, + "eval_logps/chosen": -78.02967834472656, + "eval_logps/rejected": -86.59571075439453, + "eval_loss": 0.6755677461624146, + "eval_rewards/accuracies": 0.5996747016906738, + "eval_rewards/chosen": -0.1932583451271057, + "eval_rewards/margins": 0.04112492874264717, + "eval_rewards/rejected": -0.23438328504562378, + "eval_runtime": 357.4222, + "eval_samples_per_second": 12.042, + "eval_steps_per_second": 1.505, + "step": 1200 + }, + { + "epoch": 0.21, + "grad_norm": 5.729626186243229, + "learning_rate": 4.999728599357762e-07, + "logits/chosen": -2.7580645084381104, + "logits/rejected": -2.725999355316162, + "logps/chosen": -82.05293273925781, + "logps/rejected": -90.14490509033203, + "loss": 0.6559, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.2827844023704529, + "rewards/margins": 0.08879880607128143, + "rewards/rejected": -0.37158316373825073, + "step": 1210 + }, + { + "epoch": 0.21, + "grad_norm": 4.843172675927166, + "learning_rate": 4.999606523137628e-07, + "logits/chosen": -2.7558417320251465, + "logits/rejected": -2.730149745941162, + "logps/chosen": -82.96326446533203, + "logps/rejected": -92.13468933105469, + "loss": 0.6487, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.2960751950740814, + "rewards/margins": 0.10249904543161392, + "rewards/rejected": -0.39857420325279236, + "step": 1220 + }, + { + "epoch": 0.21, + "grad_norm": 5.963904626730247, + "learning_rate": 4.99946184278324e-07, + "logits/chosen": -2.815377950668335, + "logits/rejected": -2.7770168781280518, + "logps/chosen": -87.67040252685547, + "logps/rejected": -92.19978332519531, + "loss": 0.6591, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.31936585903167725, + "rewards/margins": 0.08062832802534103, + "rewards/rejected": -0.3999941945075989, + "step": 1230 + }, + { + "epoch": 0.21, + "grad_norm": 5.432890340977972, + "learning_rate": 4.999294559602954e-07, + "logits/chosen": -2.681164264678955, + "logits/rejected": -2.666093349456787, + "logps/chosen": -85.52889251708984, + "logps/rejected": -93.53931427001953, + "loss": 0.6638, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.3310098946094513, + "rewards/margins": 0.07634725421667099, + "rewards/rejected": -0.4073571562767029, + "step": 1240 + }, + { + "epoch": 0.22, + "grad_norm": 5.250007678464064, + "learning_rate": 4.999104675109525e-07, + "logits/chosen": -2.787619113922119, + "logits/rejected": -2.7531590461730957, + "logps/chosen": -84.2263412475586, + "logps/rejected": -89.0356216430664, + "loss": 0.6633, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.3051467835903168, + "rewards/margins": 0.07372823357582092, + "rewards/rejected": -0.3788750171661377, + "step": 1250 + }, + { + "epoch": 0.22, + "grad_norm": 5.437906354386665, + "learning_rate": 4.998892191020092e-07, + "logits/chosen": -2.6413798332214355, + "logits/rejected": -2.6131081581115723, + "logps/chosen": -83.180419921875, + "logps/rejected": -89.31295776367188, + "loss": 0.6567, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.30223608016967773, + "rewards/margins": 0.08826258033514023, + "rewards/rejected": -0.39049869775772095, + "step": 1260 + }, + { + "epoch": 0.22, + "grad_norm": 7.461728534142161, + "learning_rate": 4.998657109256166e-07, + "logits/chosen": -2.6996243000030518, + "logits/rejected": -2.6940901279449463, + "logps/chosen": -85.5318374633789, + "logps/rejected": -94.53944396972656, + "loss": 0.6692, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.33671867847442627, + "rewards/margins": 0.06170588731765747, + "rewards/rejected": -0.39842456579208374, + "step": 1270 + }, + { + "epoch": 0.22, + "grad_norm": 5.558513126380609, + "learning_rate": 4.998399431943609e-07, + "logits/chosen": -2.768416166305542, + "logits/rejected": -2.7740864753723145, + "logps/chosen": -79.00863647460938, + "logps/rejected": -98.24131774902344, + "loss": 0.6481, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.2783285081386566, + "rewards/margins": 0.10511146485805511, + "rewards/rejected": -0.38343995809555054, + "step": 1280 + }, + { + "epoch": 0.22, + "grad_norm": 5.865934178946898, + "learning_rate": 4.998119161412618e-07, + "logits/chosen": -2.6547913551330566, + "logits/rejected": -2.62509822845459, + "logps/chosen": -86.21808624267578, + "logps/rejected": -90.81287384033203, + "loss": 0.6581, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.3163338005542755, + "rewards/margins": 0.08770157396793365, + "rewards/rejected": -0.40403538942337036, + "step": 1290 + }, + { + "epoch": 0.22, + "grad_norm": 6.062029915276689, + "learning_rate": 4.997816300197699e-07, + "logits/chosen": -2.7270829677581787, + "logits/rejected": -2.714017391204834, + "logps/chosen": -87.53861999511719, + "logps/rejected": -99.14437866210938, + "loss": 0.663, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.37334591150283813, + "rewards/margins": 0.0809113010764122, + "rewards/rejected": -0.4542572498321533, + "step": 1300 + }, + { + "epoch": 0.22, + "eval_logits/chosen": -2.7855560779571533, + "eval_logits/rejected": -2.7796361446380615, + "eval_logps/chosen": -86.98535919189453, + "eval_logps/rejected": -97.24429321289062, + "eval_loss": 0.6691488027572632, + "eval_rewards/accuracies": 0.6171003580093384, + "eval_rewards/chosen": -0.2828150987625122, + "eval_rewards/margins": 0.05805408954620361, + "eval_rewards/rejected": -0.3408692181110382, + "eval_runtime": 357.0192, + "eval_samples_per_second": 12.055, + "eval_steps_per_second": 1.507, + "step": 1300 + }, + { + "epoch": 0.23, + "grad_norm": 7.2374921490781166, + "learning_rate": 4.997490851037651e-07, + "logits/chosen": -2.7199060916900635, + "logits/rejected": -2.685650110244751, + "logps/chosen": -90.32498931884766, + "logps/rejected": -97.50151824951172, + "loss": 0.6474, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.3698830008506775, + "rewards/margins": 0.11114762723445892, + "rewards/rejected": -0.4810306429862976, + "step": 1310 + }, + { + "epoch": 0.23, + "grad_norm": 7.329198026466427, + "learning_rate": 4.997142816875534e-07, + "logits/chosen": -2.6866555213928223, + "logits/rejected": -2.6637563705444336, + "logps/chosen": -93.89530944824219, + "logps/rejected": -97.65375518798828, + "loss": 0.6605, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.38488394021987915, + "rewards/margins": 0.0846201553940773, + "rewards/rejected": -0.469504177570343, + "step": 1320 + }, + { + "epoch": 0.23, + "grad_norm": 6.989316967263134, + "learning_rate": 4.996772200858648e-07, + "logits/chosen": -2.759702682495117, + "logits/rejected": -2.731628894805908, + "logps/chosen": -94.14637756347656, + "logps/rejected": -99.45263671875, + "loss": 0.6529, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.3895763158798218, + "rewards/margins": 0.10352253913879395, + "rewards/rejected": -0.4930989146232605, + "step": 1330 + }, + { + "epoch": 0.23, + "grad_norm": 5.339352998534476, + "learning_rate": 4.996379006338504e-07, + "logits/chosen": -2.6027302742004395, + "logits/rejected": -2.582573890686035, + "logps/chosen": -86.88957977294922, + "logps/rejected": -96.32715606689453, + "loss": 0.6431, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.3176007866859436, + "rewards/margins": 0.11718092858791351, + "rewards/rejected": -0.4347817003726959, + "step": 1340 + }, + { + "epoch": 0.23, + "grad_norm": 6.251815390342403, + "learning_rate": 4.99596323687079e-07, + "logits/chosen": -2.6558520793914795, + "logits/rejected": -2.632688522338867, + "logps/chosen": -94.7468490600586, + "logps/rejected": -102.03514099121094, + "loss": 0.666, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.40661248564720154, + "rewards/margins": 0.08162738382816315, + "rewards/rejected": -0.4882398247718811, + "step": 1350 + }, + { + "epoch": 0.23, + "grad_norm": 6.6807835207225725, + "learning_rate": 4.995524896215339e-07, + "logits/chosen": -2.606091022491455, + "logits/rejected": -2.593371868133545, + "logps/chosen": -95.50038146972656, + "logps/rejected": -104.9891586303711, + "loss": 0.6603, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.40270328521728516, + "rewards/margins": 0.08724579960107803, + "rewards/rejected": -0.48994913697242737, + "step": 1360 + }, + { + "epoch": 0.24, + "grad_norm": 7.719117110945399, + "learning_rate": 4.995063988336101e-07, + "logits/chosen": -2.6957902908325195, + "logits/rejected": -2.67728328704834, + "logps/chosen": -93.07683563232422, + "logps/rejected": -106.47142028808594, + "loss": 0.6415, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.4041716456413269, + "rewards/margins": 0.13314835727214813, + "rewards/rejected": -0.5373200178146362, + "step": 1370 + }, + { + "epoch": 0.24, + "grad_norm": 7.523503188478382, + "learning_rate": 4.994580517401102e-07, + "logits/chosen": -2.5843305587768555, + "logits/rejected": -2.56766414642334, + "logps/chosen": -97.11516571044922, + "logps/rejected": -107.70014953613281, + "loss": 0.6448, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.42785486578941345, + "rewards/margins": 0.12621551752090454, + "rewards/rejected": -0.5540703535079956, + "step": 1380 + }, + { + "epoch": 0.24, + "grad_norm": 7.994616924999172, + "learning_rate": 4.994074487782406e-07, + "logits/chosen": -2.7036585807800293, + "logits/rejected": -2.6787309646606445, + "logps/chosen": -103.4170913696289, + "logps/rejected": -113.5262222290039, + "loss": 0.6457, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.46558231115341187, + "rewards/margins": 0.1351221799850464, + "rewards/rejected": -0.6007044315338135, + "step": 1390 + }, + { + "epoch": 0.24, + "grad_norm": 7.076195209744298, + "learning_rate": 4.993545904056078e-07, + "logits/chosen": -2.5222525596618652, + "logits/rejected": -2.4996466636657715, + "logps/chosen": -100.6828384399414, + "logps/rejected": -113.02195739746094, + "loss": 0.6329, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.448734849691391, + "rewards/margins": 0.15365351736545563, + "rewards/rejected": -0.6023883819580078, + "step": 1400 + }, + { + "epoch": 0.24, + "eval_logits/chosen": -2.6804823875427246, + "eval_logits/rejected": -2.674381732940674, + "eval_logps/chosen": -96.39348602294922, + "eval_logps/rejected": -108.98139190673828, + "eval_loss": 0.6609914302825928, + "eval_rewards/accuracies": 0.61849445104599, + "eval_rewards/chosen": -0.37689635157585144, + "eval_rewards/margins": 0.08134372532367706, + "eval_rewards/rejected": -0.4582400619983673, + "eval_runtime": 356.0921, + "eval_samples_per_second": 12.087, + "eval_steps_per_second": 1.511, + "step": 1400 + }, + { + "epoch": 0.24, + "grad_norm": 8.108597626289049, + "learning_rate": 4.992994771002141e-07, + "logits/chosen": -2.5680909156799316, + "logits/rejected": -2.5558865070343018, + "logps/chosen": -102.64105224609375, + "logps/rejected": -119.11705017089844, + "loss": 0.6367, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.48273658752441406, + "rewards/margins": 0.17660747468471527, + "rewards/rejected": -0.6593440175056458, + "step": 1410 + }, + { + "epoch": 0.24, + "grad_norm": 7.858931662859935, + "learning_rate": 4.992421093604534e-07, + "logits/chosen": -2.4751877784729004, + "logits/rejected": -2.4777843952178955, + "logps/chosen": -101.93944549560547, + "logps/rejected": -124.42276763916016, + "loss": 0.6348, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.512130856513977, + "rewards/margins": 0.1852533519268036, + "rewards/rejected": -0.697384238243103, + "step": 1420 + }, + { + "epoch": 0.25, + "grad_norm": 9.45227562873637, + "learning_rate": 4.991824877051067e-07, + "logits/chosen": -2.561638832092285, + "logits/rejected": -2.54856276512146, + "logps/chosen": -108.27215576171875, + "logps/rejected": -134.86288452148438, + "loss": 0.601, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.5470194816589355, + "rewards/margins": 0.2372448891401291, + "rewards/rejected": -0.784264326095581, + "step": 1430 + }, + { + "epoch": 0.25, + "grad_norm": 10.22819119060325, + "learning_rate": 4.991206126733369e-07, + "logits/chosen": -2.448366403579712, + "logits/rejected": -2.420719623565674, + "logps/chosen": -108.7235107421875, + "logps/rejected": -120.07295989990234, + "loss": 0.6473, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.5824471712112427, + "rewards/margins": 0.13498175144195557, + "rewards/rejected": -0.7174289226531982, + "step": 1440 + }, + { + "epoch": 0.25, + "grad_norm": 12.725503598146807, + "learning_rate": 4.990564848246851e-07, + "logits/chosen": -2.4409327507019043, + "logits/rejected": -2.408658504486084, + "logps/chosen": -114.9705810546875, + "logps/rejected": -128.18980407714844, + "loss": 0.6348, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.5951242446899414, + "rewards/margins": 0.16781339049339294, + "rewards/rejected": -0.7629376649856567, + "step": 1450 + }, + { + "epoch": 0.25, + "grad_norm": 9.436937177439919, + "learning_rate": 4.98990104739064e-07, + "logits/chosen": -2.4494917392730713, + "logits/rejected": -2.4222424030303955, + "logps/chosen": -109.37040710449219, + "logps/rejected": -126.34139251708984, + "loss": 0.6382, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.5768964290618896, + "rewards/margins": 0.18195541203022003, + "rewards/rejected": -0.7588518261909485, + "step": 1460 + }, + { + "epoch": 0.25, + "grad_norm": 8.955583438989118, + "learning_rate": 4.989214730167541e-07, + "logits/chosen": -2.622709035873413, + "logits/rejected": -2.5908420085906982, + "logps/chosen": -110.5583267211914, + "logps/rejected": -124.64057922363281, + "loss": 0.6329, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.535399854183197, + "rewards/margins": 0.16410811245441437, + "rewards/rejected": -0.6995079517364502, + "step": 1470 + }, + { + "epoch": 0.25, + "grad_norm": 9.49330055426053, + "learning_rate": 4.988505902783971e-07, + "logits/chosen": -2.590567111968994, + "logits/rejected": -2.556976795196533, + "logps/chosen": -105.81478118896484, + "logps/rejected": -119.9277114868164, + "loss": 0.6432, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.5450218915939331, + "rewards/margins": 0.13719643652439117, + "rewards/rejected": -0.6822183728218079, + "step": 1480 + }, + { + "epoch": 0.26, + "grad_norm": 9.98525040448124, + "learning_rate": 4.987774571649912e-07, + "logits/chosen": -2.4983878135681152, + "logits/rejected": -2.4753427505493164, + "logps/chosen": -115.1557846069336, + "logps/rejected": -128.9449005126953, + "loss": 0.6331, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5778855085372925, + "rewards/margins": 0.17497901618480682, + "rewards/rejected": -0.7528645992279053, + "step": 1490 + }, + { + "epoch": 0.26, + "grad_norm": 10.26303189770449, + "learning_rate": 4.987020743378848e-07, + "logits/chosen": -2.385967969894409, + "logits/rejected": -2.383463144302368, + "logps/chosen": -110.9022445678711, + "logps/rejected": -130.71217346191406, + "loss": 0.6356, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.609332263469696, + "rewards/margins": 0.1715681552886963, + "rewards/rejected": -0.7809004187583923, + "step": 1500 + }, + { + "epoch": 0.26, + "eval_logits/chosen": -2.517664909362793, + "eval_logits/rejected": -2.5109217166900635, + "eval_logps/chosen": -107.28179168701172, + "eval_logps/rejected": -122.3668212890625, + "eval_loss": 0.6536844968795776, + "eval_rewards/accuracies": 0.6380111575126648, + "eval_rewards/chosen": -0.48577937483787537, + "eval_rewards/margins": 0.10631493479013443, + "eval_rewards/rejected": -0.5920943021774292, + "eval_runtime": 356.9471, + "eval_samples_per_second": 12.058, + "eval_steps_per_second": 1.507, + "step": 1500 + }, + { + "epoch": 0.26, + "grad_norm": 9.106501586861713, + "learning_rate": 4.986244424787706e-07, + "logits/chosen": -2.322202205657959, + "logits/rejected": -2.2912230491638184, + "logps/chosen": -118.87747955322266, + "logps/rejected": -133.88900756835938, + "loss": 0.6146, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6029187440872192, + "rewards/margins": 0.2079295665025711, + "rewards/rejected": -0.8108483552932739, + "step": 1510 + }, + { + "epoch": 0.26, + "grad_norm": 10.760728694678805, + "learning_rate": 4.985445622896794e-07, + "logits/chosen": -2.387296676635742, + "logits/rejected": -2.379225015640259, + "logps/chosen": -118.9466781616211, + "logps/rejected": -134.18948364257812, + "loss": 0.6424, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.6671319007873535, + "rewards/margins": 0.16024193167686462, + "rewards/rejected": -0.8273738026618958, + "step": 1520 + }, + { + "epoch": 0.26, + "grad_norm": 15.020346958333217, + "learning_rate": 4.98462434492974e-07, + "logits/chosen": -2.2380728721618652, + "logits/rejected": -2.2234339714050293, + "logps/chosen": -127.2374038696289, + "logps/rejected": -143.69729614257812, + "loss": 0.6451, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.7556307315826416, + "rewards/margins": 0.1741071343421936, + "rewards/rejected": -0.9297378659248352, + "step": 1530 + }, + { + "epoch": 0.27, + "grad_norm": 10.57370425758323, + "learning_rate": 4.983780598313423e-07, + "logits/chosen": -2.3825461864471436, + "logits/rejected": -2.3513596057891846, + "logps/chosen": -120.11723327636719, + "logps/rejected": -140.9315185546875, + "loss": 0.6034, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.6420670747756958, + "rewards/margins": 0.2574729919433594, + "rewards/rejected": -0.8995401263237, + "step": 1540 + }, + { + "epoch": 0.27, + "grad_norm": 11.337001973825672, + "learning_rate": 4.982914390677909e-07, + "logits/chosen": -2.2892661094665527, + "logits/rejected": -2.2704126834869385, + "logps/chosen": -114.27877044677734, + "logps/rejected": -134.2589111328125, + "loss": 0.6133, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.6090589165687561, + "rewards/margins": 0.2282298356294632, + "rewards/rejected": -0.8372887372970581, + "step": 1550 + }, + { + "epoch": 0.27, + "grad_norm": 13.598814466918178, + "learning_rate": 4.982025729856381e-07, + "logits/chosen": -2.273789882659912, + "logits/rejected": -2.252927780151367, + "logps/chosen": -123.75514221191406, + "logps/rejected": -144.1699676513672, + "loss": 0.6334, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7292818427085876, + "rewards/margins": 0.20613010227680206, + "rewards/rejected": -0.935411810874939, + "step": 1560 + }, + { + "epoch": 0.27, + "grad_norm": 13.054000500346573, + "learning_rate": 4.981114623885066e-07, + "logits/chosen": -2.305429697036743, + "logits/rejected": -2.304576873779297, + "logps/chosen": -123.65141296386719, + "logps/rejected": -149.49166870117188, + "loss": 0.6306, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7176939845085144, + "rewards/margins": 0.21510076522827148, + "rewards/rejected": -0.9327947497367859, + "step": 1570 + }, + { + "epoch": 0.27, + "grad_norm": 11.391373748499909, + "learning_rate": 4.980181081003167e-07, + "logits/chosen": -2.2610230445861816, + "logits/rejected": -2.248826265335083, + "logps/chosen": -120.3755874633789, + "logps/rejected": -140.72938537597656, + "loss": 0.6399, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6746999621391296, + "rewards/margins": 0.19383227825164795, + "rewards/rejected": -0.8685322999954224, + "step": 1580 + }, + { + "epoch": 0.27, + "grad_norm": 11.558255813156592, + "learning_rate": 4.979225109652783e-07, + "logits/chosen": -2.317185878753662, + "logits/rejected": -2.3010520935058594, + "logps/chosen": -120.50982666015625, + "logps/rejected": -136.13148498535156, + "loss": 0.6499, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.6638567447662354, + "rewards/margins": 0.15114296972751617, + "rewards/rejected": -0.8149996995925903, + "step": 1590 + }, + { + "epoch": 0.28, + "grad_norm": 10.010530526041645, + "learning_rate": 4.978246718478835e-07, + "logits/chosen": -2.298884630203247, + "logits/rejected": -2.2639718055725098, + "logps/chosen": -114.62701416015625, + "logps/rejected": -132.1314239501953, + "loss": 0.6275, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.6248496770858765, + "rewards/margins": 0.18698535859584808, + "rewards/rejected": -0.8118351101875305, + "step": 1600 + }, + { + "epoch": 0.28, + "eval_logits/chosen": -2.417057991027832, + "eval_logits/rejected": -2.4086451530456543, + "eval_logps/chosen": -116.99667358398438, + "eval_logps/rejected": -135.21180725097656, + "eval_loss": 0.6452447175979614, + "eval_rewards/accuracies": 0.6363847851753235, + "eval_rewards/chosen": -0.5829283595085144, + "eval_rewards/margins": 0.13761593401432037, + "eval_rewards/rejected": -0.7205442786216736, + "eval_runtime": 357.5084, + "eval_samples_per_second": 12.039, + "eval_steps_per_second": 1.505, + "step": 1600 + }, + { + "epoch": 0.28, + "grad_norm": 10.584365804460724, + "learning_rate": 4.977245916328994e-07, + "logits/chosen": -2.3447985649108887, + "logits/rejected": -2.3194570541381836, + "logps/chosen": -130.62841796875, + "logps/rejected": -153.30650329589844, + "loss": 0.6354, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.7475208044052124, + "rewards/margins": 0.20843036472797394, + "rewards/rejected": -0.9559510946273804, + "step": 1610 + }, + { + "epoch": 0.28, + "grad_norm": 12.935555369642161, + "learning_rate": 4.976222712253587e-07, + "logits/chosen": -2.2747273445129395, + "logits/rejected": -2.251038074493408, + "logps/chosen": -124.53253173828125, + "logps/rejected": -160.12515258789062, + "loss": 0.6036, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.7311369180679321, + "rewards/margins": 0.3294447958469391, + "rewards/rejected": -1.0605818033218384, + "step": 1620 + }, + { + "epoch": 0.28, + "grad_norm": 12.670402097997451, + "learning_rate": 4.97517711550553e-07, + "logits/chosen": -2.334963083267212, + "logits/rejected": -2.3116507530212402, + "logps/chosen": -132.87025451660156, + "logps/rejected": -149.6600799560547, + "loss": 0.6289, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.7737977504730225, + "rewards/margins": 0.2057873010635376, + "rewards/rejected": -0.9795848727226257, + "step": 1630 + }, + { + "epoch": 0.28, + "grad_norm": 12.932831879229832, + "learning_rate": 4.974109135540232e-07, + "logits/chosen": -2.379924774169922, + "logits/rejected": -2.3459315299987793, + "logps/chosen": -137.20947265625, + "logps/rejected": -144.1864013671875, + "loss": 0.6681, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.803007960319519, + "rewards/margins": 0.1256239265203476, + "rewards/rejected": -0.9286319017410278, + "step": 1640 + }, + { + "epoch": 0.28, + "grad_norm": 10.93104733261659, + "learning_rate": 4.97301878201552e-07, + "logits/chosen": -2.3800089359283447, + "logits/rejected": -2.353868246078491, + "logps/chosen": -118.6199951171875, + "logps/rejected": -142.3135223388672, + "loss": 0.6032, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.613827109336853, + "rewards/margins": 0.2595062851905823, + "rewards/rejected": -0.8733335733413696, + "step": 1650 + }, + { + "epoch": 0.29, + "grad_norm": 8.435075129874804, + "learning_rate": 4.971906064791545e-07, + "logits/chosen": -2.4072935581207275, + "logits/rejected": -2.3678243160247803, + "logps/chosen": -116.49778747558594, + "logps/rejected": -127.7277603149414, + "loss": 0.6444, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.6152034401893616, + "rewards/margins": 0.16506488621234894, + "rewards/rejected": -0.7802683115005493, + "step": 1660 + }, + { + "epoch": 0.29, + "grad_norm": 10.473049120892489, + "learning_rate": 4.970770993930693e-07, + "logits/chosen": -2.3916454315185547, + "logits/rejected": -2.366729259490967, + "logps/chosen": -112.58506774902344, + "logps/rejected": -138.53781127929688, + "loss": 0.6101, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.5803548693656921, + "rewards/margins": 0.25304341316223145, + "rewards/rejected": -0.8333982229232788, + "step": 1670 + }, + { + "epoch": 0.29, + "grad_norm": 11.92770345279179, + "learning_rate": 4.969613579697499e-07, + "logits/chosen": -2.329380989074707, + "logits/rejected": -2.303520679473877, + "logps/chosen": -119.73587799072266, + "logps/rejected": -142.68923950195312, + "loss": 0.6175, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.6415343284606934, + "rewards/margins": 0.24146585166454315, + "rewards/rejected": -0.8830000758171082, + "step": 1680 + }, + { + "epoch": 0.29, + "grad_norm": 10.415977139141571, + "learning_rate": 4.968433832558549e-07, + "logits/chosen": -2.2939274311065674, + "logits/rejected": -2.2756576538085938, + "logps/chosen": -115.64154052734375, + "logps/rejected": -131.75743103027344, + "loss": 0.637, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.601462185382843, + "rewards/margins": 0.19359458982944489, + "rewards/rejected": -0.7950568199157715, + "step": 1690 + }, + { + "epoch": 0.29, + "grad_norm": 12.213920640091061, + "learning_rate": 4.967231763182385e-07, + "logits/chosen": -2.169027805328369, + "logits/rejected": -2.16825795173645, + "logps/chosen": -112.47358703613281, + "logps/rejected": -138.60568237304688, + "loss": 0.6315, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.6292012333869934, + "rewards/margins": 0.2038397490978241, + "rewards/rejected": -0.8330410122871399, + "step": 1700 + }, + { + "epoch": 0.29, + "eval_logits/chosen": -2.3369693756103516, + "eval_logits/rejected": -2.3275210857391357, + "eval_logps/chosen": -117.66336822509766, + "eval_logps/rejected": -136.6091766357422, + "eval_loss": 0.6433987021446228, + "eval_rewards/accuracies": 0.633596658706665, + "eval_rewards/chosen": -0.5895951986312866, + "eval_rewards/margins": 0.14492255449295044, + "eval_rewards/rejected": -0.7345177531242371, + "eval_runtime": 357.4789, + "eval_samples_per_second": 12.04, + "eval_steps_per_second": 1.505, + "step": 1700 + }, + { + "epoch": 0.29, + "grad_norm": 17.554636232799652, + "learning_rate": 4.966007382439414e-07, + "logits/chosen": -2.2377054691314697, + "logits/rejected": -2.196046829223633, + "logps/chosen": -134.55264282226562, + "logps/rejected": -155.48165893554688, + "loss": 0.6201, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.761473536491394, + "rewards/margins": 0.24851946532726288, + "rewards/rejected": -1.0099929571151733, + "step": 1710 + }, + { + "epoch": 0.3, + "grad_norm": 12.864744504235462, + "learning_rate": 4.964760701401807e-07, + "logits/chosen": -2.2469406127929688, + "logits/rejected": -2.2177302837371826, + "logps/chosen": -136.5878143310547, + "logps/rejected": -150.06259155273438, + "loss": 0.6428, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.801825225353241, + "rewards/margins": 0.18852399289608002, + "rewards/rejected": -0.9903491735458374, + "step": 1720 + }, + { + "epoch": 0.3, + "grad_norm": 15.115706747879441, + "learning_rate": 4.963491731343395e-07, + "logits/chosen": -2.2426817417144775, + "logits/rejected": -2.225494146347046, + "logps/chosen": -133.5530548095703, + "logps/rejected": -154.15370178222656, + "loss": 0.629, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.7963297367095947, + "rewards/margins": 0.2201063185930252, + "rewards/rejected": -1.016435980796814, + "step": 1730 + }, + { + "epoch": 0.3, + "grad_norm": 10.577013192942609, + "learning_rate": 4.962200483739572e-07, + "logits/chosen": -2.205991268157959, + "logits/rejected": -2.1916627883911133, + "logps/chosen": -137.78909301757812, + "logps/rejected": -165.40756225585938, + "loss": 0.6415, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.856925368309021, + "rewards/margins": 0.25689417123794556, + "rewards/rejected": -1.1138197183609009, + "step": 1740 + }, + { + "epoch": 0.3, + "grad_norm": 14.966499781106553, + "learning_rate": 4.96088697026719e-07, + "logits/chosen": -2.2428221702575684, + "logits/rejected": -2.2297050952911377, + "logps/chosen": -130.33145141601562, + "logps/rejected": -156.0831756591797, + "loss": 0.6123, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.7514396905899048, + "rewards/margins": 0.2490360289812088, + "rewards/rejected": -1.0004757642745972, + "step": 1750 + }, + { + "epoch": 0.3, + "grad_norm": 14.761249007069992, + "learning_rate": 4.959551202804452e-07, + "logits/chosen": -2.2175586223602295, + "logits/rejected": -2.1803672313690186, + "logps/chosen": -129.0546417236328, + "logps/rejected": -156.67083740234375, + "loss": 0.5915, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.7302212119102478, + "rewards/margins": 0.3069346249103546, + "rewards/rejected": -1.0371558666229248, + "step": 1760 + }, + { + "epoch": 0.3, + "grad_norm": 13.94600835463878, + "learning_rate": 4.958193193430807e-07, + "logits/chosen": -2.2072737216949463, + "logits/rejected": -2.1699469089508057, + "logps/chosen": -136.83575439453125, + "logps/rejected": -160.46617126464844, + "loss": 0.5962, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.8186962008476257, + "rewards/margins": 0.2965734004974365, + "rewards/rejected": -1.115269660949707, + "step": 1770 + }, + { + "epoch": 0.31, + "grad_norm": 14.814181161859743, + "learning_rate": 4.956812954426837e-07, + "logits/chosen": -2.0803823471069336, + "logits/rejected": -2.0697758197784424, + "logps/chosen": -145.62057495117188, + "logps/rejected": -193.1177520751953, + "loss": 0.5567, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.9111822843551636, + "rewards/margins": 0.43550905585289, + "rewards/rejected": -1.3466914892196655, + "step": 1780 + }, + { + "epoch": 0.31, + "grad_norm": 12.92886463604504, + "learning_rate": 4.95541049827415e-07, + "logits/chosen": -2.077265739440918, + "logits/rejected": -2.0515055656433105, + "logps/chosen": -154.9197998046875, + "logps/rejected": -191.34097290039062, + "loss": 0.5844, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.029468297958374, + "rewards/margins": 0.35887202620506287, + "rewards/rejected": -1.3883404731750488, + "step": 1790 + }, + { + "epoch": 0.31, + "grad_norm": 14.455065715370587, + "learning_rate": 4.953985837655266e-07, + "logits/chosen": -2.03164005279541, + "logits/rejected": -2.004000186920166, + "logps/chosen": -154.93240356445312, + "logps/rejected": -186.32785034179688, + "loss": 0.6166, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.0314334630966187, + "rewards/margins": 0.31214088201522827, + "rewards/rejected": -1.3435744047164917, + "step": 1800 + }, + { + "epoch": 0.31, + "eval_logits/chosen": -2.099355936050415, + "eval_logits/rejected": -2.087456226348877, + "eval_logps/chosen": -137.8539276123047, + "eval_logps/rejected": -159.61842346191406, + "eval_loss": 0.6393665075302124, + "eval_rewards/accuracies": 0.6289498209953308, + "eval_rewards/chosen": -0.7915008664131165, + "eval_rewards/margins": 0.1731095165014267, + "eval_rewards/rejected": -0.9646103978157043, + "eval_runtime": 357.2087, + "eval_samples_per_second": 12.049, + "eval_steps_per_second": 1.506, + "step": 1800 + }, + { + "epoch": 0.31, + "grad_norm": 13.045686157199322, + "learning_rate": 4.952538985453499e-07, + "logits/chosen": -2.0923218727111816, + "logits/rejected": -2.058093547821045, + "logps/chosen": -148.35519409179688, + "logps/rejected": -166.8310089111328, + "loss": 0.6642, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.9175241589546204, + "rewards/margins": 0.21410436928272247, + "rewards/rejected": -1.131628394126892, + "step": 1810 + }, + { + "epoch": 0.31, + "grad_norm": 14.746527696680772, + "learning_rate": 4.951069954752846e-07, + "logits/chosen": -2.104447841644287, + "logits/rejected": -2.0722100734710693, + "logps/chosen": -138.68417358398438, + "logps/rejected": -155.63540649414062, + "loss": 0.6363, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.8351799249649048, + "rewards/margins": 0.2144734412431717, + "rewards/rejected": -1.0496532917022705, + "step": 1820 + }, + { + "epoch": 0.32, + "grad_norm": 14.068599458347373, + "learning_rate": 4.949578758837864e-07, + "logits/chosen": -2.0577917098999023, + "logits/rejected": -2.040351390838623, + "logps/chosen": -126.67193603515625, + "logps/rejected": -151.28140258789062, + "loss": 0.6184, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.711285412311554, + "rewards/margins": 0.254142701625824, + "rewards/rejected": -0.9654279947280884, + "step": 1830 + }, + { + "epoch": 0.32, + "grad_norm": 16.186612704580565, + "learning_rate": 4.948065411193554e-07, + "logits/chosen": -2.2264585494995117, + "logits/rejected": -2.2193264961242676, + "logps/chosen": -132.20895385742188, + "logps/rejected": -154.45802307128906, + "loss": 0.6388, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.7740861177444458, + "rewards/margins": 0.22031357884407043, + "rewards/rejected": -0.9943998456001282, + "step": 1840 + }, + { + "epoch": 0.32, + "grad_norm": 13.631222489310312, + "learning_rate": 4.946529925505233e-07, + "logits/chosen": -2.104651927947998, + "logits/rejected": -2.099863052368164, + "logps/chosen": -124.88291931152344, + "logps/rejected": -150.3798828125, + "loss": 0.6244, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.7169033288955688, + "rewards/margins": 0.24011361598968506, + "rewards/rejected": -0.9570168256759644, + "step": 1850 + }, + { + "epoch": 0.32, + "grad_norm": 11.597593527632558, + "learning_rate": 4.944972315658417e-07, + "logits/chosen": -2.038820505142212, + "logits/rejected": -2.0067200660705566, + "logps/chosen": -129.09518432617188, + "logps/rejected": -153.1046142578125, + "loss": 0.601, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.7181129455566406, + "rewards/margins": 0.27581366896629333, + "rewards/rejected": -0.9939267039299011, + "step": 1860 + }, + { + "epoch": 0.32, + "grad_norm": 14.814299163074143, + "learning_rate": 4.943392595738695e-07, + "logits/chosen": -2.0475425720214844, + "logits/rejected": -2.018345832824707, + "logps/chosen": -130.18441772460938, + "logps/rejected": -163.41127014160156, + "loss": 0.5883, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.7427932024002075, + "rewards/margins": 0.35372671484947205, + "rewards/rejected": -1.096519947052002, + "step": 1870 + }, + { + "epoch": 0.32, + "grad_norm": 13.085964342637993, + "learning_rate": 4.941790780031591e-07, + "logits/chosen": -2.040121555328369, + "logits/rejected": -2.0052008628845215, + "logps/chosen": -139.4851531982422, + "logps/rejected": -169.9237518310547, + "loss": 0.5986, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.8679348230361938, + "rewards/margins": 0.3169488310813904, + "rewards/rejected": -1.184883713722229, + "step": 1880 + }, + { + "epoch": 0.33, + "grad_norm": 17.97087896244849, + "learning_rate": 4.94016688302245e-07, + "logits/chosen": -2.036181688308716, + "logits/rejected": -2.021777629852295, + "logps/chosen": -137.5182647705078, + "logps/rejected": -178.22129821777344, + "loss": 0.561, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.8445706367492676, + "rewards/margins": 0.39340075850486755, + "rewards/rejected": -1.237971544265747, + "step": 1890 + }, + { + "epoch": 0.33, + "grad_norm": 15.473904552778107, + "learning_rate": 4.938520919396297e-07, + "logits/chosen": -1.9097896814346313, + "logits/rejected": -1.878089189529419, + "logps/chosen": -160.5067901611328, + "logps/rejected": -182.10073852539062, + "loss": 0.6238, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.030932068824768, + "rewards/margins": 0.26739898324012756, + "rewards/rejected": -1.2983310222625732, + "step": 1900 + }, + { + "epoch": 0.33, + "eval_logits/chosen": -1.9768445491790771, + "eval_logits/rejected": -1.964641809463501, + "eval_logps/chosen": -151.84054565429688, + "eval_logps/rejected": -174.0358428955078, + "eval_loss": 0.6393516659736633, + "eval_rewards/accuracies": 0.6280204653739929, + "eval_rewards/chosen": -0.9313669800758362, + "eval_rewards/margins": 0.1774175763130188, + "eval_rewards/rejected": -1.108784556388855, + "eval_runtime": 356.9465, + "eval_samples_per_second": 12.058, + "eval_steps_per_second": 1.507, + "step": 1900 + }, + { + "epoch": 0.33, + "grad_norm": 18.295705836142915, + "learning_rate": 4.936852904037709e-07, + "logits/chosen": -1.8353763818740845, + "logits/rejected": -1.7998859882354736, + "logps/chosen": -162.01815795898438, + "logps/rejected": -199.1243438720703, + "loss": 0.5856, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.0756003856658936, + "rewards/margins": 0.3827807605266571, + "rewards/rejected": -1.458381175994873, + "step": 1910 + }, + { + "epoch": 0.33, + "grad_norm": 15.587066902188072, + "learning_rate": 4.935162852030678e-07, + "logits/chosen": -1.9671123027801514, + "logits/rejected": -1.9385311603546143, + "logps/chosen": -158.43060302734375, + "logps/rejected": -187.6251678466797, + "loss": 0.611, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.030988097190857, + "rewards/margins": 0.3086529076099396, + "rewards/rejected": -1.3396410942077637, + "step": 1920 + }, + { + "epoch": 0.33, + "grad_norm": 16.64428631174434, + "learning_rate": 4.933450778658472e-07, + "logits/chosen": -1.9721879959106445, + "logits/rejected": -1.9367185831069946, + "logps/chosen": -145.00579833984375, + "logps/rejected": -175.52078247070312, + "loss": 0.6052, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.9111844897270203, + "rewards/margins": 0.30568939447402954, + "rewards/rejected": -1.2168738842010498, + "step": 1930 + }, + { + "epoch": 0.33, + "grad_norm": 13.008355304833884, + "learning_rate": 4.931716699403504e-07, + "logits/chosen": -2.0365664958953857, + "logits/rejected": -2.016010046005249, + "logps/chosen": -130.88787841796875, + "logps/rejected": -154.35255432128906, + "loss": 0.6209, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7993988990783691, + "rewards/margins": 0.24416379630565643, + "rewards/rejected": -1.043562650680542, + "step": 1940 + }, + { + "epoch": 0.34, + "grad_norm": 11.221840915928341, + "learning_rate": 4.929960629947185e-07, + "logits/chosen": -2.021613597869873, + "logits/rejected": -2.012424945831299, + "logps/chosen": -137.0205535888672, + "logps/rejected": -171.9865264892578, + "loss": 0.6066, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.8452763557434082, + "rewards/margins": 0.3268183469772339, + "rewards/rejected": -1.172094702720642, + "step": 1950 + }, + { + "epoch": 0.34, + "grad_norm": 12.459203609632565, + "learning_rate": 4.928182586169787e-07, + "logits/chosen": -2.0483787059783936, + "logits/rejected": -2.024353504180908, + "logps/chosen": -136.3292999267578, + "logps/rejected": -166.4497833251953, + "loss": 0.6003, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.8198174238204956, + "rewards/margins": 0.2998635768890381, + "rewards/rejected": -1.1196808815002441, + "step": 1960 + }, + { + "epoch": 0.34, + "grad_norm": 16.3262276995175, + "learning_rate": 4.926382584150298e-07, + "logits/chosen": -2.052652359008789, + "logits/rejected": -2.0237042903900146, + "logps/chosen": -133.0852813720703, + "logps/rejected": -154.27850341796875, + "loss": 0.6179, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.7694526314735413, + "rewards/margins": 0.25520798563957214, + "rewards/rejected": -1.0246607065200806, + "step": 1970 + }, + { + "epoch": 0.34, + "grad_norm": 13.309990683269428, + "learning_rate": 4.924560640166273e-07, + "logits/chosen": -1.9702781438827515, + "logits/rejected": -1.955529808998108, + "logps/chosen": -143.92767333984375, + "logps/rejected": -171.1623077392578, + "loss": 0.6143, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.8834748268127441, + "rewards/margins": 0.28803762793540955, + "rewards/rejected": -1.1715123653411865, + "step": 1980 + }, + { + "epoch": 0.34, + "grad_norm": 17.690808430077606, + "learning_rate": 4.922716770693691e-07, + "logits/chosen": -2.02256441116333, + "logits/rejected": -1.9881162643432617, + "logps/chosen": -148.55735778808594, + "logps/rejected": -185.8997344970703, + "loss": 0.5663, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.9775354266166687, + "rewards/margins": 0.3999343514442444, + "rewards/rejected": -1.377469778060913, + "step": 1990 + }, + { + "epoch": 0.34, + "grad_norm": 20.779887890208492, + "learning_rate": 4.920850992406809e-07, + "logits/chosen": -1.9081655740737915, + "logits/rejected": -1.9007370471954346, + "logps/chosen": -167.69796752929688, + "logps/rejected": -216.0054168701172, + "loss": 0.5824, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.1772109270095825, + "rewards/margins": 0.4069501757621765, + "rewards/rejected": -1.5841610431671143, + "step": 2000 + }, + { + "epoch": 0.34, + "eval_logits/chosen": -1.988376498222351, + "eval_logits/rejected": -1.9742034673690796, + "eval_logps/chosen": -156.2569122314453, + "eval_logps/rejected": -181.40647888183594, + "eval_loss": 0.6345042586326599, + "eval_rewards/accuracies": 0.6338289976119995, + "eval_rewards/chosen": -0.9755305647850037, + "eval_rewards/margins": 0.2069605439901352, + "eval_rewards/rejected": -1.1824910640716553, + "eval_runtime": 356.8317, + "eval_samples_per_second": 12.062, + "eval_steps_per_second": 1.508, + "step": 2000 + }, + { + "epoch": 0.35, + "grad_norm": 13.969050635341127, + "learning_rate": 4.918963322178002e-07, + "logits/chosen": -1.8815292119979858, + "logits/rejected": -1.8513492345809937, + "logps/chosen": -167.8777313232422, + "logps/rejected": -195.77195739746094, + "loss": 0.6169, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.1228792667388916, + "rewards/margins": 0.3168966770172119, + "rewards/rejected": -1.439776062965393, + "step": 2010 + }, + { + "epoch": 0.35, + "grad_norm": 15.797574816697441, + "learning_rate": 4.917053777077616e-07, + "logits/chosen": -1.8998935222625732, + "logits/rejected": -1.8719685077667236, + "logps/chosen": -154.22711181640625, + "logps/rejected": -197.74649047851562, + "loss": 0.5839, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.0441197156906128, + "rewards/margins": 0.38671204447746277, + "rewards/rejected": -1.4308319091796875, + "step": 2020 + }, + { + "epoch": 0.35, + "grad_norm": 14.833429475519068, + "learning_rate": 4.915122374373815e-07, + "logits/chosen": -1.9642966985702515, + "logits/rejected": -1.9340064525604248, + "logps/chosen": -159.75148010253906, + "logps/rejected": -195.52828979492188, + "loss": 0.5832, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.0272386074066162, + "rewards/margins": 0.3762350082397461, + "rewards/rejected": -1.4034736156463623, + "step": 2030 + }, + { + "epoch": 0.35, + "grad_norm": 14.865053760007001, + "learning_rate": 4.913169131532422e-07, + "logits/chosen": -1.820640206336975, + "logits/rejected": -1.7987966537475586, + "logps/chosen": -145.8691864013672, + "logps/rejected": -193.00489807128906, + "loss": 0.5621, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.9316226243972778, + "rewards/margins": 0.45906609296798706, + "rewards/rejected": -1.3906886577606201, + "step": 2040 + }, + { + "epoch": 0.35, + "grad_norm": 20.45821384576311, + "learning_rate": 4.911194066216765e-07, + "logits/chosen": -1.864013671875, + "logits/rejected": -1.8330237865447998, + "logps/chosen": -153.375732421875, + "logps/rejected": -193.52369689941406, + "loss": 0.5926, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.0217927694320679, + "rewards/margins": 0.36956191062927246, + "rewards/rejected": -1.3913547992706299, + "step": 2050 + }, + { + "epoch": 0.35, + "grad_norm": 15.765551507413845, + "learning_rate": 4.909197196287509e-07, + "logits/chosen": -1.8556013107299805, + "logits/rejected": -1.8133299350738525, + "logps/chosen": -158.96043395996094, + "logps/rejected": -183.34689331054688, + "loss": 0.6223, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.036510705947876, + "rewards/margins": 0.2811738848686218, + "rewards/rejected": -1.3176846504211426, + "step": 2060 + }, + { + "epoch": 0.36, + "grad_norm": 16.728188759969367, + "learning_rate": 4.907178539802502e-07, + "logits/chosen": -1.8902781009674072, + "logits/rejected": -1.8563499450683594, + "logps/chosen": -154.345947265625, + "logps/rejected": -198.3380889892578, + "loss": 0.5807, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.9906846284866333, + "rewards/margins": 0.44513338804244995, + "rewards/rejected": -1.4358179569244385, + "step": 2070 + }, + { + "epoch": 0.36, + "grad_norm": 17.234432946942448, + "learning_rate": 4.905138115016614e-07, + "logits/chosen": -1.8345119953155518, + "logits/rejected": -1.7954838275909424, + "logps/chosen": -153.2652587890625, + "logps/rejected": -191.7286376953125, + "loss": 0.5862, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.0140693187713623, + "rewards/margins": 0.39727577567100525, + "rewards/rejected": -1.41134512424469, + "step": 2080 + }, + { + "epoch": 0.36, + "grad_norm": 17.27758613167349, + "learning_rate": 4.903075940381559e-07, + "logits/chosen": -1.8448431491851807, + "logits/rejected": -1.8325908184051514, + "logps/chosen": -147.30392456054688, + "logps/rejected": -176.21583557128906, + "loss": 0.6304, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9291483163833618, + "rewards/margins": 0.2863280475139618, + "rewards/rejected": -1.2154762744903564, + "step": 2090 + }, + { + "epoch": 0.36, + "grad_norm": 13.545506652655037, + "learning_rate": 4.900992034545744e-07, + "logits/chosen": -1.8317134380340576, + "logits/rejected": -1.801325798034668, + "logps/chosen": -134.27406311035156, + "logps/rejected": -163.0066680908203, + "loss": 0.5895, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.7893965840339661, + "rewards/margins": 0.32822003960609436, + "rewards/rejected": -1.1176166534423828, + "step": 2100 + }, + { + "epoch": 0.36, + "eval_logits/chosen": -1.95328950881958, + "eval_logits/rejected": -1.9400743246078491, + "eval_logps/chosen": -124.55523681640625, + "eval_logps/rejected": -143.9415740966797, + "eval_loss": 0.6449150443077087, + "eval_rewards/accuracies": 0.6338289976119995, + "eval_rewards/chosen": -0.6585139632225037, + "eval_rewards/margins": 0.14932793378829956, + "eval_rewards/rejected": -0.8078420162200928, + "eval_runtime": 356.7749, + "eval_samples_per_second": 12.064, + "eval_steps_per_second": 1.508, + "step": 2100 + }, + { + "epoch": 0.36, + "grad_norm": 20.3896857533217, + "learning_rate": 4.898886416354088e-07, + "logits/chosen": -1.8545172214508057, + "logits/rejected": -1.8379218578338623, + "logps/chosen": -136.37522888183594, + "logps/rejected": -184.2274932861328, + "loss": 0.5751, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.8746525049209595, + "rewards/margins": 0.41598910093307495, + "rewards/rejected": -1.2906416654586792, + "step": 2110 + }, + { + "epoch": 0.37, + "grad_norm": 19.966726504918753, + "learning_rate": 4.896759104847859e-07, + "logits/chosen": -1.6481273174285889, + "logits/rejected": -1.6026216745376587, + "logps/chosen": -143.7649688720703, + "logps/rejected": -195.6068115234375, + "loss": 0.549, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.90205317735672, + "rewards/margins": 0.5448054075241089, + "rewards/rejected": -1.4468586444854736, + "step": 2120 + }, + { + "epoch": 0.37, + "grad_norm": 22.5331311441494, + "learning_rate": 4.8946101192645e-07, + "logits/chosen": -1.5899341106414795, + "logits/rejected": -1.5583152770996094, + "logps/chosen": -178.53843688964844, + "logps/rejected": -229.2800750732422, + "loss": 0.5623, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.2409361600875854, + "rewards/margins": 0.5267373323440552, + "rewards/rejected": -1.7676734924316406, + "step": 2130 + }, + { + "epoch": 0.37, + "grad_norm": 19.39894132054709, + "learning_rate": 4.892439479037451e-07, + "logits/chosen": -1.6246334314346313, + "logits/rejected": -1.6016099452972412, + "logps/chosen": -173.16412353515625, + "logps/rejected": -215.16110229492188, + "loss": 0.6048, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.245426058769226, + "rewards/margins": 0.37089425325393677, + "rewards/rejected": -1.6163203716278076, + "step": 2140 + }, + { + "epoch": 0.37, + "grad_norm": 18.938179724571942, + "learning_rate": 4.89024720379598e-07, + "logits/chosen": -1.6600227355957031, + "logits/rejected": -1.6034603118896484, + "logps/chosen": -167.83346557617188, + "logps/rejected": -214.2350616455078, + "loss": 0.5413, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.1314681768417358, + "rewards/margins": 0.531185507774353, + "rewards/rejected": -1.6626536846160889, + "step": 2150 + }, + { + "epoch": 0.37, + "grad_norm": 17.819138078463755, + "learning_rate": 4.888033313365001e-07, + "logits/chosen": -1.5937135219573975, + "logits/rejected": -1.5616223812103271, + "logps/chosen": -189.21583557128906, + "logps/rejected": -235.78518676757812, + "loss": 0.5612, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.344967007637024, + "rewards/margins": 0.5175682902336121, + "rewards/rejected": -1.8625354766845703, + "step": 2160 + }, + { + "epoch": 0.37, + "grad_norm": 28.899143582146355, + "learning_rate": 4.885797827764895e-07, + "logits/chosen": -1.6774377822875977, + "logits/rejected": -1.6321741342544556, + "logps/chosen": -193.11886596679688, + "logps/rejected": -255.240234375, + "loss": 0.543, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.3880548477172852, + "rewards/margins": 0.6497060656547546, + "rewards/rejected": -2.0377612113952637, + "step": 2170 + }, + { + "epoch": 0.38, + "grad_norm": 20.347133369564297, + "learning_rate": 4.88354076721133e-07, + "logits/chosen": -1.7743288278579712, + "logits/rejected": -1.7314989566802979, + "logps/chosen": -204.22975158691406, + "logps/rejected": -241.63369750976562, + "loss": 0.6385, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.4623057842254639, + "rewards/margins": 0.42681413888931274, + "rewards/rejected": -1.889120101928711, + "step": 2180 + }, + { + "epoch": 0.38, + "grad_norm": 11.864063871740855, + "learning_rate": 4.88126215211508e-07, + "logits/chosen": -2.0105056762695312, + "logits/rejected": -1.9917558431625366, + "logps/chosen": -134.98178100585938, + "logps/rejected": -173.46408081054688, + "loss": 0.5908, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.8213205337524414, + "rewards/margins": 0.37510326504707336, + "rewards/rejected": -1.1964237689971924, + "step": 2190 + }, + { + "epoch": 0.38, + "grad_norm": 12.665212840871442, + "learning_rate": 4.878962003081834e-07, + "logits/chosen": -1.8419253826141357, + "logits/rejected": -1.8090530633926392, + "logps/chosen": -125.55128479003906, + "logps/rejected": -168.30105590820312, + "loss": 0.5633, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.7324917912483215, + "rewards/margins": 0.4029674530029297, + "rewards/rejected": -1.1354591846466064, + "step": 2200 + }, + { + "epoch": 0.38, + "eval_logits/chosen": -1.9549309015274048, + "eval_logits/rejected": -1.9415898323059082, + "eval_logps/chosen": -122.187744140625, + "eval_logps/rejected": -142.1007080078125, + "eval_loss": 0.6433526277542114, + "eval_rewards/accuracies": 0.6247676610946655, + "eval_rewards/chosen": -0.6348390579223633, + "eval_rewards/margins": 0.1545940786600113, + "eval_rewards/rejected": -0.7894331216812134, + "eval_runtime": 356.7846, + "eval_samples_per_second": 12.063, + "eval_steps_per_second": 1.508, + "step": 2200 + }, + { + "epoch": 0.38, + "grad_norm": 15.566693864294429, + "learning_rate": 4.87664034091202e-07, + "logits/chosen": -1.864985466003418, + "logits/rejected": -1.842546820640564, + "logps/chosen": -135.0320587158203, + "logps/rejected": -167.46334838867188, + "loss": 0.6055, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.8033797144889832, + "rewards/margins": 0.33440515398979187, + "rewards/rejected": -1.1377849578857422, + "step": 2210 + }, + { + "epoch": 0.38, + "grad_norm": 14.08070748751659, + "learning_rate": 4.874297186600607e-07, + "logits/chosen": -1.6942613124847412, + "logits/rejected": -1.6759631633758545, + "logps/chosen": -136.83392333984375, + "logps/rejected": -170.51962280273438, + "loss": 0.5989, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.8297648429870605, + "rewards/margins": 0.33708181977272034, + "rewards/rejected": -1.1668468713760376, + "step": 2220 + }, + { + "epoch": 0.38, + "grad_norm": 13.765120270646621, + "learning_rate": 4.871932561336917e-07, + "logits/chosen": -1.7974563837051392, + "logits/rejected": -1.7594830989837646, + "logps/chosen": -157.6973114013672, + "logps/rejected": -191.2778778076172, + "loss": 0.6047, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.0398889780044556, + "rewards/margins": 0.3657877445220947, + "rewards/rejected": -1.4056766033172607, + "step": 2230 + }, + { + "epoch": 0.39, + "grad_norm": 14.959197166232116, + "learning_rate": 4.869546486504443e-07, + "logits/chosen": -1.7539308071136475, + "logits/rejected": -1.715118408203125, + "logps/chosen": -154.1725311279297, + "logps/rejected": -178.8619384765625, + "loss": 0.6287, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.9734878540039062, + "rewards/margins": 0.27934250235557556, + "rewards/rejected": -1.2528302669525146, + "step": 2240 + }, + { + "epoch": 0.39, + "grad_norm": 30.679025462387873, + "learning_rate": 4.867138983680639e-07, + "logits/chosen": -1.7157443761825562, + "logits/rejected": -1.6704628467559814, + "logps/chosen": -154.58035278320312, + "logps/rejected": -192.14291381835938, + "loss": 0.5929, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.9923057556152344, + "rewards/margins": 0.3957834541797638, + "rewards/rejected": -1.3880890607833862, + "step": 2250 + }, + { + "epoch": 0.39, + "grad_norm": 13.975707619834106, + "learning_rate": 4.864710074636742e-07, + "logits/chosen": -1.6998507976531982, + "logits/rejected": -1.6631402969360352, + "logps/chosen": -162.00726318359375, + "logps/rejected": -190.08140563964844, + "loss": 0.6257, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.0117926597595215, + "rewards/margins": 0.3156191408634186, + "rewards/rejected": -1.3274118900299072, + "step": 2260 + }, + { + "epoch": 0.39, + "grad_norm": 16.6560589968763, + "learning_rate": 4.862259781337561e-07, + "logits/chosen": -1.7075884342193604, + "logits/rejected": -1.665636420249939, + "logps/chosen": -147.17477416992188, + "logps/rejected": -179.50704956054688, + "loss": 0.6146, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.9310780763626099, + "rewards/margins": 0.34703630208969116, + "rewards/rejected": -1.2781143188476562, + "step": 2270 + }, + { + "epoch": 0.39, + "grad_norm": 14.830061652144124, + "learning_rate": 4.859788125941288e-07, + "logits/chosen": -1.791953444480896, + "logits/rejected": -1.7653782367706299, + "logps/chosen": -126.0338134765625, + "logps/rejected": -163.85520935058594, + "loss": 0.5711, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.7260233163833618, + "rewards/margins": 0.3571609556674957, + "rewards/rejected": -1.0831841230392456, + "step": 2280 + }, + { + "epoch": 0.39, + "grad_norm": 14.350187386244336, + "learning_rate": 4.857295130799293e-07, + "logits/chosen": -1.6346839666366577, + "logits/rejected": -1.5958842039108276, + "logps/chosen": -142.57481384277344, + "logps/rejected": -190.54708862304688, + "loss": 0.5495, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.8971936106681824, + "rewards/margins": 0.4708176255226135, + "rewards/rejected": -1.368011236190796, + "step": 2290 + }, + { + "epoch": 0.4, + "grad_norm": 21.919618404427325, + "learning_rate": 4.854780818455922e-07, + "logits/chosen": -1.7218765020370483, + "logits/rejected": -1.671383261680603, + "logps/chosen": -162.91741943359375, + "logps/rejected": -212.5228729248047, + "loss": 0.5459, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.0696805715560913, + "rewards/margins": 0.5191500782966614, + "rewards/rejected": -1.5888304710388184, + "step": 2300 + }, + { + "epoch": 0.4, + "eval_logits/chosen": -1.7182925939559937, + "eval_logits/rejected": -1.699223279953003, + "eval_logps/chosen": -166.81613159179688, + "eval_logps/rejected": -196.93434143066406, + "eval_loss": 0.6319848895072937, + "eval_rewards/accuracies": 0.6301115155220032, + "eval_rewards/chosen": -1.0811227560043335, + "eval_rewards/margins": 0.25664687156677246, + "eval_rewards/rejected": -1.3377697467803955, + "eval_runtime": 355.9925, + "eval_samples_per_second": 12.09, + "eval_steps_per_second": 1.511, + "step": 2300 + }, + { + "epoch": 0.4, + "grad_norm": 25.808234378973566, + "learning_rate": 4.852245211648297e-07, + "logits/chosen": -1.5000966787338257, + "logits/rejected": -1.466485619544983, + "logps/chosen": -192.41690063476562, + "logps/rejected": -250.5450439453125, + "loss": 0.5307, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.413515329360962, + "rewards/margins": 0.577852725982666, + "rewards/rejected": -1.991368055343628, + "step": 2310 + }, + { + "epoch": 0.4, + "grad_norm": 29.80630825333485, + "learning_rate": 4.849688333306104e-07, + "logits/chosen": -1.5186668634414673, + "logits/rejected": -1.4686113595962524, + "logps/chosen": -204.86245727539062, + "logps/rejected": -261.40045166015625, + "loss": 0.5375, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.474421501159668, + "rewards/margins": 0.6283503770828247, + "rewards/rejected": -2.1027719974517822, + "step": 2320 + }, + { + "epoch": 0.4, + "grad_norm": 22.33412954401948, + "learning_rate": 4.847110206551393e-07, + "logits/chosen": -1.4632006883621216, + "logits/rejected": -1.4255832433700562, + "logps/chosen": -201.6676483154297, + "logps/rejected": -270.60858154296875, + "loss": 0.5217, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.4586617946624756, + "rewards/margins": 0.7248638868331909, + "rewards/rejected": -2.183525562286377, + "step": 2330 + }, + { + "epoch": 0.4, + "grad_norm": 20.61698070907491, + "learning_rate": 4.844510854698359e-07, + "logits/chosen": -1.5553325414657593, + "logits/rejected": -1.5306508541107178, + "logps/chosen": -203.3428497314453, + "logps/rejected": -246.02001953125, + "loss": 0.6196, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.5106031894683838, + "rewards/margins": 0.3960232734680176, + "rewards/rejected": -1.9066263437271118, + "step": 2340 + }, + { + "epoch": 0.4, + "grad_norm": 24.7881803852688, + "learning_rate": 4.841890301253145e-07, + "logits/chosen": -1.548393726348877, + "logits/rejected": -1.509019136428833, + "logps/chosen": -186.4705047607422, + "logps/rejected": -242.9622039794922, + "loss": 0.544, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.2822117805480957, + "rewards/margins": 0.5945440530776978, + "rewards/rejected": -1.876755952835083, + "step": 2350 + }, + { + "epoch": 0.41, + "grad_norm": 21.155630137442166, + "learning_rate": 4.839248569913614e-07, + "logits/chosen": -1.4889419078826904, + "logits/rejected": -1.452192783355713, + "logps/chosen": -189.53225708007812, + "logps/rejected": -255.5863800048828, + "loss": 0.5421, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.3918462991714478, + "rewards/margins": 0.6527556777000427, + "rewards/rejected": -2.044602155685425, + "step": 2360 + }, + { + "epoch": 0.41, + "grad_norm": 34.58209504713677, + "learning_rate": 4.836585684569147e-07, + "logits/chosen": -1.4630403518676758, + "logits/rejected": -1.430633544921875, + "logps/chosen": -206.46591186523438, + "logps/rejected": -273.29779052734375, + "loss": 0.5551, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.527071237564087, + "rewards/margins": 0.6676559448242188, + "rewards/rejected": -2.1947274208068848, + "step": 2370 + }, + { + "epoch": 0.41, + "grad_norm": 26.061151506389557, + "learning_rate": 4.833901669300424e-07, + "logits/chosen": -1.4684240818023682, + "logits/rejected": -1.4264377355575562, + "logps/chosen": -186.2281494140625, + "logps/rejected": -233.110107421875, + "loss": 0.6138, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.3187849521636963, + "rewards/margins": 0.4836592674255371, + "rewards/rejected": -1.8024442195892334, + "step": 2380 + }, + { + "epoch": 0.41, + "grad_norm": 18.221595816735224, + "learning_rate": 4.831196548379198e-07, + "logits/chosen": -1.5969889163970947, + "logits/rejected": -1.5512523651123047, + "logps/chosen": -178.01239013671875, + "logps/rejected": -238.6937255859375, + "loss": 0.5315, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.2178453207015991, + "rewards/margins": 0.6382966041564941, + "rewards/rejected": -1.8561418056488037, + "step": 2390 + }, + { + "epoch": 0.41, + "grad_norm": 15.589350815937946, + "learning_rate": 4.828470346268088e-07, + "logits/chosen": -1.6465771198272705, + "logits/rejected": -1.6099990606307983, + "logps/chosen": -179.09376525878906, + "logps/rejected": -228.93905639648438, + "loss": 0.5786, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.2678877115249634, + "rewards/margins": 0.4891243577003479, + "rewards/rejected": -1.7570120096206665, + "step": 2400 + }, + { + "epoch": 0.41, + "eval_logits/chosen": -1.6363126039505005, + "eval_logits/rejected": -1.6167610883712769, + "eval_logps/chosen": -178.53884887695312, + "eval_logps/rejected": -209.47793579101562, + "eval_loss": 0.6305522918701172, + "eval_rewards/accuracies": 0.6291821599006653, + "eval_rewards/chosen": -1.1983500719070435, + "eval_rewards/margins": 0.2648555040359497, + "eval_rewards/rejected": -1.4632055759429932, + "eval_runtime": 356.7532, + "eval_samples_per_second": 12.064, + "eval_steps_per_second": 1.508, + "step": 2400 + }, + { + "epoch": 0.42, + "grad_norm": 21.985975720515064, + "learning_rate": 4.82572308762035e-07, + "logits/chosen": -1.5702491998672485, + "logits/rejected": -1.5282782316207886, + "logps/chosen": -188.3826904296875, + "logps/rejected": -229.83132934570312, + "loss": 0.5676, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.3452413082122803, + "rewards/margins": 0.4605236053466797, + "rewards/rejected": -1.8057647943496704, + "step": 2410 + }, + { + "epoch": 0.42, + "grad_norm": 26.14144209889029, + "learning_rate": 4.822954797279652e-07, + "logits/chosen": -1.5276035070419312, + "logits/rejected": -1.4836372137069702, + "logps/chosen": -208.26181030273438, + "logps/rejected": -264.6378479003906, + "loss": 0.5711, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.5207151174545288, + "rewards/margins": 0.5766458511352539, + "rewards/rejected": -2.0973610877990723, + "step": 2420 + }, + { + "epoch": 0.42, + "grad_norm": 17.28711843102643, + "learning_rate": 4.82016550027986e-07, + "logits/chosen": -1.5296419858932495, + "logits/rejected": -1.4949567317962646, + "logps/chosen": -192.01145935058594, + "logps/rejected": -231.2733154296875, + "loss": 0.5997, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.371919870376587, + "rewards/margins": 0.4235268533229828, + "rewards/rejected": -1.7954469919204712, + "step": 2430 + }, + { + "epoch": 0.42, + "grad_norm": 15.776095113993128, + "learning_rate": 4.817355221844802e-07, + "logits/chosen": -1.5643110275268555, + "logits/rejected": -1.5382698774337769, + "logps/chosen": -172.5228729248047, + "logps/rejected": -228.47189331054688, + "loss": 0.5417, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.1747311353683472, + "rewards/margins": 0.5653839707374573, + "rewards/rejected": -1.7401151657104492, + "step": 2440 + }, + { + "epoch": 0.42, + "grad_norm": 19.47187684087894, + "learning_rate": 4.814523987388038e-07, + "logits/chosen": -1.5278120040893555, + "logits/rejected": -1.490755319595337, + "logps/chosen": -177.8535919189453, + "logps/rejected": -222.85791015625, + "loss": 0.5882, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.2479978799819946, + "rewards/margins": 0.45816653966903687, + "rewards/rejected": -1.7061645984649658, + "step": 2450 + }, + { + "epoch": 0.42, + "grad_norm": 14.220492265970561, + "learning_rate": 4.811671822512644e-07, + "logits/chosen": -1.5169602632522583, + "logits/rejected": -1.4735338687896729, + "logps/chosen": -167.81532287597656, + "logps/rejected": -209.4271240234375, + "loss": 0.586, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.1169971227645874, + "rewards/margins": 0.45197534561157227, + "rewards/rejected": -1.5689725875854492, + "step": 2460 + }, + { + "epoch": 0.43, + "grad_norm": 24.10851119267855, + "learning_rate": 4.808798753010965e-07, + "logits/chosen": -1.648048758506775, + "logits/rejected": -1.6161121129989624, + "logps/chosen": -171.47268676757812, + "logps/rejected": -209.07357788085938, + "loss": 0.5908, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.1645221710205078, + "rewards/margins": 0.39932164549827576, + "rewards/rejected": -1.5638437271118164, + "step": 2470 + }, + { + "epoch": 0.43, + "grad_norm": 15.012483229366685, + "learning_rate": 4.805904804864388e-07, + "logits/chosen": -1.6050293445587158, + "logits/rejected": -1.5672911405563354, + "logps/chosen": -169.68240356445312, + "logps/rejected": -201.05831909179688, + "loss": 0.6088, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.153080701828003, + "rewards/margins": 0.35358747839927673, + "rewards/rejected": -1.506668210029602, + "step": 2480 + }, + { + "epoch": 0.43, + "grad_norm": 17.517728604597213, + "learning_rate": 4.802990004243112e-07, + "logits/chosen": -1.6790720224380493, + "logits/rejected": -1.6492221355438232, + "logps/chosen": -141.60067749023438, + "logps/rejected": -183.9330291748047, + "loss": 0.581, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.9159472584724426, + "rewards/margins": 0.4076360762119293, + "rewards/rejected": -1.3235833644866943, + "step": 2490 + }, + { + "epoch": 0.43, + "grad_norm": 19.29815730049706, + "learning_rate": 4.800054377505901e-07, + "logits/chosen": -1.7170441150665283, + "logits/rejected": -1.6744773387908936, + "logps/chosen": -157.9257354736328, + "logps/rejected": -203.2161102294922, + "loss": 0.5679, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.0103013515472412, + "rewards/margins": 0.4565104842185974, + "rewards/rejected": -1.4668117761611938, + "step": 2500 + }, + { + "epoch": 0.43, + "eval_logits/chosen": -1.7219594717025757, + "eval_logits/rejected": -1.7044074535369873, + "eval_logps/chosen": -148.90235900878906, + "eval_logps/rejected": -175.4527587890625, + "eval_loss": 0.6329796314239502, + "eval_rewards/accuracies": 0.6345260143280029, + "eval_rewards/chosen": -0.9019851088523865, + "eval_rewards/margins": 0.22096872329711914, + "eval_rewards/rejected": -1.1229537725448608, + "eval_runtime": 356.8285, + "eval_samples_per_second": 12.062, + "eval_steps_per_second": 1.508, + "step": 2500 + }, + { + "epoch": 0.43, + "grad_norm": 20.098541263273624, + "learning_rate": 4.797097951199854e-07, + "logits/chosen": -1.5535961389541626, + "logits/rejected": -1.5248453617095947, + "logps/chosen": -159.08462524414062, + "logps/rejected": -213.17837524414062, + "loss": 0.546, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.067018985748291, + "rewards/margins": 0.5451322793960571, + "rewards/rejected": -1.6121511459350586, + "step": 2510 + }, + { + "epoch": 0.43, + "grad_norm": 15.302909435904388, + "learning_rate": 4.794120752060162e-07, + "logits/chosen": -1.5149682760238647, + "logits/rejected": -1.4745677709579468, + "logps/chosen": -166.39657592773438, + "logps/rejected": -208.57785034179688, + "loss": 0.5939, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.1393781900405884, + "rewards/margins": 0.4565187990665436, + "rewards/rejected": -1.5958969593048096, + "step": 2520 + }, + { + "epoch": 0.44, + "grad_norm": 25.687681170750803, + "learning_rate": 4.791122807009866e-07, + "logits/chosen": -1.568881869316101, + "logits/rejected": -1.552473783493042, + "logps/chosen": -177.05374145507812, + "logps/rejected": -220.03857421875, + "loss": 0.6046, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2589091062545776, + "rewards/margins": 0.40004104375839233, + "rewards/rejected": -1.6589502096176147, + "step": 2530 + }, + { + "epoch": 0.44, + "grad_norm": 17.330151466831865, + "learning_rate": 4.788104143159616e-07, + "logits/chosen": -1.6212892532348633, + "logits/rejected": -1.5904419422149658, + "logps/chosen": -177.31298828125, + "logps/rejected": -227.2390899658203, + "loss": 0.6146, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2378965616226196, + "rewards/margins": 0.4961935579776764, + "rewards/rejected": -1.7340900897979736, + "step": 2540 + }, + { + "epoch": 0.44, + "grad_norm": 17.990325513608543, + "learning_rate": 4.785064787807418e-07, + "logits/chosen": -1.6909650564193726, + "logits/rejected": -1.6431467533111572, + "logps/chosen": -151.2425537109375, + "logps/rejected": -197.7589569091797, + "loss": 0.5459, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.9256092309951782, + "rewards/margins": 0.5123879313468933, + "rewards/rejected": -1.4379972219467163, + "step": 2550 + }, + { + "epoch": 0.44, + "grad_norm": 14.629576613571512, + "learning_rate": 4.782004768438399e-07, + "logits/chosen": -1.7803840637207031, + "logits/rejected": -1.744502305984497, + "logps/chosen": -138.01266479492188, + "logps/rejected": -177.8213348388672, + "loss": 0.5766, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.8278089761734009, + "rewards/margins": 0.41043511033058167, + "rewards/rejected": -1.2382439374923706, + "step": 2560 + }, + { + "epoch": 0.44, + "grad_norm": 14.783534214563243, + "learning_rate": 4.778924112724548e-07, + "logits/chosen": -1.6910631656646729, + "logits/rejected": -1.6597950458526611, + "logps/chosen": -161.07774353027344, + "logps/rejected": -207.1680145263672, + "loss": 0.5656, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.02474045753479, + "rewards/margins": 0.4819648861885071, + "rewards/rejected": -1.506705403327942, + "step": 2570 + }, + { + "epoch": 0.44, + "grad_norm": 24.195144500390953, + "learning_rate": 4.775822848524474e-07, + "logits/chosen": -1.65180242061615, + "logits/rejected": -1.6205854415893555, + "logps/chosen": -174.69419860839844, + "logps/rejected": -216.5833740234375, + "loss": 0.5993, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.2136080265045166, + "rewards/margins": 0.4207339286804199, + "rewards/rejected": -1.634341835975647, + "step": 2580 + }, + { + "epoch": 0.45, + "grad_norm": 24.07431376599948, + "learning_rate": 4.772701003883146e-07, + "logits/chosen": -1.6589524745941162, + "logits/rejected": -1.6199334859848022, + "logps/chosen": -160.41822814941406, + "logps/rejected": -193.14602661132812, + "loss": 0.6035, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.0372803211212158, + "rewards/margins": 0.38218361139297485, + "rewards/rejected": -1.419463872909546, + "step": 2590 + }, + { + "epoch": 0.45, + "grad_norm": 17.260199729003336, + "learning_rate": 4.769558607031646e-07, + "logits/chosen": -1.6966606378555298, + "logits/rejected": -1.6404602527618408, + "logps/chosen": -153.80836486816406, + "logps/rejected": -197.2882080078125, + "loss": 0.5426, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -0.9672321081161499, + "rewards/margins": 0.5213042497634888, + "rewards/rejected": -1.4885362386703491, + "step": 2600 + }, + { + "epoch": 0.45, + "eval_logits/chosen": -1.7993457317352295, + "eval_logits/rejected": -1.7825459241867065, + "eval_logps/chosen": -147.43885803222656, + "eval_logps/rejected": -172.26231384277344, + "eval_loss": 0.6352224946022034, + "eval_rewards/accuracies": 0.6354553699493408, + "eval_rewards/chosen": -0.8873502016067505, + "eval_rewards/margins": 0.20369918644428253, + "eval_rewards/rejected": -1.0910491943359375, + "eval_runtime": 356.7035, + "eval_samples_per_second": 12.066, + "eval_steps_per_second": 1.508, + "step": 2600 + }, + { + "epoch": 0.45, + "grad_norm": 16.168191035992383, + "learning_rate": 4.7663956863869114e-07, + "logits/chosen": -1.646691918373108, + "logits/rejected": -1.5923856496810913, + "logps/chosen": -167.03103637695312, + "logps/rejected": -211.1219482421875, + "loss": 0.5665, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.0866007804870605, + "rewards/margins": 0.5080444812774658, + "rewards/rejected": -1.5946451425552368, + "step": 2610 + }, + { + "epoch": 0.45, + "grad_norm": 21.121725151266354, + "learning_rate": 4.7632122705514764e-07, + "logits/chosen": -1.6836843490600586, + "logits/rejected": -1.6414811611175537, + "logps/chosen": -174.40000915527344, + "logps/rejected": -224.135009765625, + "loss": 0.5734, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.2318413257598877, + "rewards/margins": 0.5052552223205566, + "rewards/rejected": -1.7370964288711548, + "step": 2620 + }, + { + "epoch": 0.45, + "grad_norm": 20.850911780664795, + "learning_rate": 4.760008388313216e-07, + "logits/chosen": -1.5688848495483398, + "logits/rejected": -1.5264674425125122, + "logps/chosen": -175.73428344726562, + "logps/rejected": -224.6223907470703, + "loss": 0.5769, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.209410548210144, + "rewards/margins": 0.5050948262214661, + "rewards/rejected": -1.7145051956176758, + "step": 2630 + }, + { + "epoch": 0.45, + "grad_norm": 19.418048097017046, + "learning_rate": 4.756784068645083e-07, + "logits/chosen": -1.5928579568862915, + "logits/rejected": -1.553302526473999, + "logps/chosen": -167.90257263183594, + "logps/rejected": -221.66702270507812, + "loss": 0.5415, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.178271770477295, + "rewards/margins": 0.5487642288208008, + "rewards/rejected": -1.7270358800888062, + "step": 2640 + }, + { + "epoch": 0.46, + "grad_norm": 21.88823441031475, + "learning_rate": 4.75353934070485e-07, + "logits/chosen": -1.5368947982788086, + "logits/rejected": -1.5017019510269165, + "logps/chosen": -185.3848419189453, + "logps/rejected": -244.77587890625, + "loss": 0.5833, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.3293806314468384, + "rewards/margins": 0.5755869746208191, + "rewards/rejected": -1.9049675464630127, + "step": 2650 + }, + { + "epoch": 0.46, + "grad_norm": 22.610484118287804, + "learning_rate": 4.7502742338348406e-07, + "logits/chosen": -1.5877610445022583, + "logits/rejected": -1.5406101942062378, + "logps/chosen": -193.39242553710938, + "logps/rejected": -222.32186889648438, + "loss": 0.6741, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.351151943206787, + "rewards/margins": 0.34743010997772217, + "rewards/rejected": -1.6985820531845093, + "step": 2660 + }, + { + "epoch": 0.46, + "grad_norm": 13.819491655406011, + "learning_rate": 4.746988777561668e-07, + "logits/chosen": -1.6597168445587158, + "logits/rejected": -1.6143728494644165, + "logps/chosen": -164.24771118164062, + "logps/rejected": -210.0986328125, + "loss": 0.584, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.1030311584472656, + "rewards/margins": 0.47755417227745056, + "rewards/rejected": -1.5805851221084595, + "step": 2670 + }, + { + "epoch": 0.46, + "grad_norm": 19.318025617063718, + "learning_rate": 4.743683001595965e-07, + "logits/chosen": -1.7418750524520874, + "logits/rejected": -1.7099230289459229, + "logps/chosen": -164.66427612304688, + "logps/rejected": -190.61524963378906, + "loss": 0.6165, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.0642914772033691, + "rewards/margins": 0.30566078424453735, + "rewards/rejected": -1.3699522018432617, + "step": 2680 + }, + { + "epoch": 0.46, + "grad_norm": 12.859233397896235, + "learning_rate": 4.7403569358321206e-07, + "logits/chosen": -1.7552549839019775, + "logits/rejected": -1.7229642868041992, + "logps/chosen": -147.30789184570312, + "logps/rejected": -191.17984008789062, + "loss": 0.5505, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.9669073820114136, + "rewards/margins": 0.44619670510292053, + "rewards/rejected": -1.4131041765213013, + "step": 2690 + }, + { + "epoch": 0.47, + "grad_norm": 15.446403175724251, + "learning_rate": 4.7370106103480013e-07, + "logits/chosen": -1.7358205318450928, + "logits/rejected": -1.6975898742675781, + "logps/chosen": -156.6081085205078, + "logps/rejected": -196.40362548828125, + "loss": 0.5888, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.033973217010498, + "rewards/margins": 0.3954750895500183, + "rewards/rejected": -1.4294483661651611, + "step": 2700 + }, + { + "epoch": 0.47, + "eval_logits/chosen": -1.8467351198196411, + "eval_logits/rejected": -1.8293933868408203, + "eval_logps/chosen": -149.63987731933594, + "eval_logps/rejected": -176.10572814941406, + "eval_loss": 0.6302607655525208, + "eval_rewards/accuracies": 0.645213782787323, + "eval_rewards/chosen": -0.9093602895736694, + "eval_rewards/margins": 0.2201230674982071, + "eval_rewards/rejected": -1.1294833421707153, + "eval_runtime": 356.6899, + "eval_samples_per_second": 12.067, + "eval_steps_per_second": 1.508, + "step": 2700 + }, + { + "epoch": 0.47, + "grad_norm": 16.878338793520253, + "learning_rate": 4.733644055404687e-07, + "logits/chosen": -1.7432657480239868, + "logits/rejected": -1.7117999792099, + "logps/chosen": -164.71484375, + "logps/rejected": -207.14260864257812, + "loss": 0.5639, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.052567481994629, + "rewards/margins": 0.4653971791267395, + "rewards/rejected": -1.5179646015167236, + "step": 2710 + }, + { + "epoch": 0.47, + "grad_norm": 20.497517981633035, + "learning_rate": 4.7302573014461935e-07, + "logits/chosen": -1.7307226657867432, + "logits/rejected": -1.7192401885986328, + "logps/chosen": -163.8205108642578, + "logps/rejected": -208.58010864257812, + "loss": 0.5986, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.1179062128067017, + "rewards/margins": 0.430508553981781, + "rewards/rejected": -1.548414707183838, + "step": 2720 + }, + { + "epoch": 0.47, + "grad_norm": 19.634240873533788, + "learning_rate": 4.7268503790991977e-07, + "logits/chosen": -1.760005235671997, + "logits/rejected": -1.728356957435608, + "logps/chosen": -156.45030212402344, + "logps/rejected": -194.1329803466797, + "loss": 0.5988, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.9790525436401367, + "rewards/margins": 0.4097796380519867, + "rewards/rejected": -1.3888323307037354, + "step": 2730 + }, + { + "epoch": 0.47, + "grad_norm": 15.780929737120895, + "learning_rate": 4.72342331917276e-07, + "logits/chosen": -1.7603209018707275, + "logits/rejected": -1.730158805847168, + "logps/chosen": -138.56451416015625, + "logps/rejected": -176.93080139160156, + "loss": 0.5707, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.8573589324951172, + "rewards/margins": 0.41820549964904785, + "rewards/rejected": -1.275564432144165, + "step": 2740 + }, + { + "epoch": 0.47, + "grad_norm": 20.00693735071868, + "learning_rate": 4.7199761526580484e-07, + "logits/chosen": -1.6731714010238647, + "logits/rejected": -1.6483711004257202, + "logps/chosen": -145.03150939941406, + "logps/rejected": -201.181640625, + "loss": 0.5471, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.9579359292984009, + "rewards/margins": 0.5196166038513184, + "rewards/rejected": -1.4775525331497192, + "step": 2750 + }, + { + "epoch": 0.48, + "grad_norm": 17.11025850674512, + "learning_rate": 4.7165089107280536e-07, + "logits/chosen": -1.6770479679107666, + "logits/rejected": -1.643370270729065, + "logps/chosen": -151.8813018798828, + "logps/rejected": -205.90744018554688, + "loss": 0.5535, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.9763303995132446, + "rewards/margins": 0.5348490476608276, + "rewards/rejected": -1.5111793279647827, + "step": 2760 + }, + { + "epoch": 0.48, + "grad_norm": 19.624859017470257, + "learning_rate": 4.7130216247373123e-07, + "logits/chosen": -1.7125215530395508, + "logits/rejected": -1.6670726537704468, + "logps/chosen": -179.84864807128906, + "logps/rejected": -230.32839965820312, + "loss": 0.5671, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2465837001800537, + "rewards/margins": 0.5265911817550659, + "rewards/rejected": -1.7731748819351196, + "step": 2770 + }, + { + "epoch": 0.48, + "grad_norm": 18.39672002807499, + "learning_rate": 4.7095143262216203e-07, + "logits/chosen": -1.5359172821044922, + "logits/rejected": -1.4871833324432373, + "logps/chosen": -188.5769805908203, + "logps/rejected": -238.5513153076172, + "loss": 0.5609, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.3573224544525146, + "rewards/margins": 0.5221356749534607, + "rewards/rejected": -1.8794580698013306, + "step": 2780 + }, + { + "epoch": 0.48, + "grad_norm": 25.940828538741872, + "learning_rate": 4.705987046897748e-07, + "logits/chosen": -1.6233654022216797, + "logits/rejected": -1.58616042137146, + "logps/chosen": -184.6060028076172, + "logps/rejected": -230.30361938476562, + "loss": 0.578, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.3283016681671143, + "rewards/margins": 0.4737378656864166, + "rewards/rejected": -1.8020395040512085, + "step": 2790 + }, + { + "epoch": 0.48, + "grad_norm": 17.60526676910467, + "learning_rate": 4.7024398186631533e-07, + "logits/chosen": -1.6539256572723389, + "logits/rejected": -1.6196858882904053, + "logps/chosen": -191.15408325195312, + "logps/rejected": -228.4556121826172, + "loss": 0.6328, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.3112226724624634, + "rewards/margins": 0.4050907492637634, + "rewards/rejected": -1.7163136005401611, + "step": 2800 + }, + { + "epoch": 0.48, + "eval_logits/chosen": -1.8423043489456177, + "eval_logits/rejected": -1.8252124786376953, + "eval_logps/chosen": -142.36798095703125, + "eval_logps/rejected": -167.40048217773438, + "eval_loss": 0.6315863728523254, + "eval_rewards/accuracies": 0.6419609785079956, + "eval_rewards/chosen": -0.8366413712501526, + "eval_rewards/margins": 0.20578964054584503, + "eval_rewards/rejected": -1.042431116104126, + "eval_runtime": 356.6408, + "eval_samples_per_second": 12.068, + "eval_steps_per_second": 1.509, + "step": 2800 + }, + { + "epoch": 0.48, + "grad_norm": 17.01308659310898, + "learning_rate": 4.6988726735956953e-07, + "logits/chosen": -1.6734424829483032, + "logits/rejected": -1.633302092552185, + "logps/chosen": -146.92413330078125, + "logps/rejected": -195.38314819335938, + "loss": 0.5504, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.9160858392715454, + "rewards/margins": 0.48056167364120483, + "rewards/rejected": -1.3966474533081055, + "step": 2810 + }, + { + "epoch": 0.49, + "grad_norm": 15.867862572496643, + "learning_rate": 4.69528564395334e-07, + "logits/chosen": -1.8120372295379639, + "logits/rejected": -1.792083740234375, + "logps/chosen": -149.9076385498047, + "logps/rejected": -176.41195678710938, + "loss": 0.6392, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.9560129046440125, + "rewards/margins": 0.27826496958732605, + "rewards/rejected": -1.2342779636383057, + "step": 2820 + }, + { + "epoch": 0.49, + "grad_norm": 13.828136544925801, + "learning_rate": 4.691678762173874e-07, + "logits/chosen": -1.6588958501815796, + "logits/rejected": -1.6307001113891602, + "logps/chosen": -137.5013427734375, + "logps/rejected": -186.1872100830078, + "loss": 0.5436, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.8202708959579468, + "rewards/margins": 0.49207648634910583, + "rewards/rejected": -1.3123472929000854, + "step": 2830 + }, + { + "epoch": 0.49, + "grad_norm": 12.998742695216233, + "learning_rate": 4.6880520608746065e-07, + "logits/chosen": -1.766371726989746, + "logits/rejected": -1.7423028945922852, + "logps/chosen": -146.57913208007812, + "logps/rejected": -185.55625915527344, + "loss": 0.5919, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.9395810961723328, + "rewards/margins": 0.4023555815219879, + "rewards/rejected": -1.341936707496643, + "step": 2840 + }, + { + "epoch": 0.49, + "grad_norm": 16.674011214819668, + "learning_rate": 4.684405572852077e-07, + "logits/chosen": -1.6769888401031494, + "logits/rejected": -1.6428205966949463, + "logps/chosen": -159.1728515625, + "logps/rejected": -215.3779754638672, + "loss": 0.5433, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.0694429874420166, + "rewards/margins": 0.5360538363456726, + "rewards/rejected": -1.6054970026016235, + "step": 2850 + }, + { + "epoch": 0.49, + "grad_norm": 16.33436683362672, + "learning_rate": 4.680739331081757e-07, + "logits/chosen": -1.662724494934082, + "logits/rejected": -1.6217238903045654, + "logps/chosen": -157.78176879882812, + "logps/rejected": -207.1402130126953, + "loss": 0.5372, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.0284638404846191, + "rewards/margins": 0.5234465003013611, + "rewards/rejected": -1.5519102811813354, + "step": 2860 + }, + { + "epoch": 0.49, + "grad_norm": 20.916547924866293, + "learning_rate": 4.677053368717754e-07, + "logits/chosen": -1.682941198348999, + "logits/rejected": -1.6458778381347656, + "logps/chosen": -167.31607055664062, + "logps/rejected": -218.1786346435547, + "loss": 0.5674, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.1120381355285645, + "rewards/margins": 0.539524257183075, + "rewards/rejected": -1.6515624523162842, + "step": 2870 + }, + { + "epoch": 0.5, + "grad_norm": 18.373461916648946, + "learning_rate": 4.6733477190925073e-07, + "logits/chosen": -1.7388041019439697, + "logits/rejected": -1.6905145645141602, + "logps/chosen": -175.05714416503906, + "logps/rejected": -221.7596435546875, + "loss": 0.5676, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.1971012353897095, + "rewards/margins": 0.5112650394439697, + "rewards/rejected": -1.7083663940429688, + "step": 2880 + }, + { + "epoch": 0.5, + "grad_norm": 22.994357222490983, + "learning_rate": 4.6696224157164943e-07, + "logits/chosen": -1.7159115076065063, + "logits/rejected": -1.690751075744629, + "logps/chosen": -173.77462768554688, + "logps/rejected": -222.635986328125, + "loss": 0.5746, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.2083570957183838, + "rewards/margins": 0.49303531646728516, + "rewards/rejected": -1.701392412185669, + "step": 2890 + }, + { + "epoch": 0.5, + "grad_norm": 23.78726232737787, + "learning_rate": 4.6658774922779187e-07, + "logits/chosen": -1.6340763568878174, + "logits/rejected": -1.6105928421020508, + "logps/chosen": -169.45535278320312, + "logps/rejected": -216.67333984375, + "loss": 0.5746, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.16982901096344, + "rewards/margins": 0.48190441727638245, + "rewards/rejected": -1.6517333984375, + "step": 2900 + }, + { + "epoch": 0.5, + "eval_logits/chosen": -1.7276008129119873, + "eval_logits/rejected": -1.707594871520996, + "eval_logps/chosen": -164.17117309570312, + "eval_logps/rejected": -193.61111450195312, + "eval_loss": 0.6267496943473816, + "eval_rewards/accuracies": 0.6442843675613403, + "eval_rewards/chosen": -1.054673433303833, + "eval_rewards/margins": 0.24986399710178375, + "eval_rewards/rejected": -1.304537296295166, + "eval_runtime": 356.9094, + "eval_samples_per_second": 12.059, + "eval_steps_per_second": 1.507, + "step": 2900 + }, + { + "epoch": 0.5, + "grad_norm": 22.570926073633988, + "learning_rate": 4.662112982642412e-07, + "logits/chosen": -1.6592012643814087, + "logits/rejected": -1.6184587478637695, + "logps/chosen": -178.86044311523438, + "logps/rejected": -250.51681518554688, + "loss": 0.5009, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.2277532815933228, + "rewards/margins": 0.7185968160629272, + "rewards/rejected": -1.94635009765625, + "step": 2910 + }, + { + "epoch": 0.5, + "grad_norm": 25.29506569233524, + "learning_rate": 4.6583289208527244e-07, + "logits/chosen": -1.5599522590637207, + "logits/rejected": -1.5313317775726318, + "logps/chosen": -197.64329528808594, + "logps/rejected": -261.9999084472656, + "loss": 0.5768, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.4407055377960205, + "rewards/margins": 0.6243780851364136, + "rewards/rejected": -2.0650835037231445, + "step": 2920 + }, + { + "epoch": 0.5, + "grad_norm": 17.04379311188254, + "learning_rate": 4.654525341128418e-07, + "logits/chosen": -1.5148179531097412, + "logits/rejected": -1.468490481376648, + "logps/chosen": -188.95974731445312, + "logps/rejected": -253.90164184570312, + "loss": 0.5079, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.355076789855957, + "rewards/margins": 0.6675424575805664, + "rewards/rejected": -2.0226194858551025, + "step": 2930 + }, + { + "epoch": 0.51, + "grad_norm": 14.086402307191449, + "learning_rate": 4.650702277865558e-07, + "logits/chosen": -1.5800873041152954, + "logits/rejected": -1.5371129512786865, + "logps/chosen": -182.9086456298828, + "logps/rejected": -230.81295776367188, + "loss": 0.5955, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.3146532773971558, + "rewards/margins": 0.48588043451309204, + "rewards/rejected": -1.8005338907241821, + "step": 2940 + }, + { + "epoch": 0.51, + "grad_norm": 21.69271778749331, + "learning_rate": 4.6468597656363994e-07, + "logits/chosen": -1.6005538702011108, + "logits/rejected": -1.566699743270874, + "logps/chosen": -182.33128356933594, + "logps/rejected": -239.34439086914062, + "loss": 0.5601, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.2800660133361816, + "rewards/margins": 0.56809401512146, + "rewards/rejected": -1.8481600284576416, + "step": 2950 + }, + { + "epoch": 0.51, + "grad_norm": 19.85110567457654, + "learning_rate": 4.6429978391890756e-07, + "logits/chosen": -1.5460537672042847, + "logits/rejected": -1.498718500137329, + "logps/chosen": -180.05801391601562, + "logps/rejected": -235.87344360351562, + "loss": 0.5485, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.2613499164581299, + "rewards/margins": 0.5788403749465942, + "rewards/rejected": -1.8401902914047241, + "step": 2960 + }, + { + "epoch": 0.51, + "grad_norm": 23.495999485202848, + "learning_rate": 4.639116533447286e-07, + "logits/chosen": -1.4766029119491577, + "logits/rejected": -1.4375579357147217, + "logps/chosen": -196.3876190185547, + "logps/rejected": -245.5087432861328, + "loss": 0.582, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.4116392135620117, + "rewards/margins": 0.519368588924408, + "rewards/rejected": -1.931007742881775, + "step": 2970 + }, + { + "epoch": 0.51, + "grad_norm": 22.165196277613042, + "learning_rate": 4.635215883509976e-07, + "logits/chosen": -1.5197012424468994, + "logits/rejected": -1.47576105594635, + "logps/chosen": -182.31118774414062, + "logps/rejected": -240.2295379638672, + "loss": 0.5411, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.2836403846740723, + "rewards/margins": 0.5962404012680054, + "rewards/rejected": -1.8798809051513672, + "step": 2980 + }, + { + "epoch": 0.52, + "grad_norm": 18.23553989348222, + "learning_rate": 4.6312959246510234e-07, + "logits/chosen": -1.6128461360931396, + "logits/rejected": -1.5682920217514038, + "logps/chosen": -174.17776489257812, + "logps/rejected": -225.26742553710938, + "loss": 0.5519, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.1757910251617432, + "rewards/margins": 0.5307731032371521, + "rewards/rejected": -1.70656418800354, + "step": 2990 + }, + { + "epoch": 0.52, + "grad_norm": 13.778675539788516, + "learning_rate": 4.627356692318919e-07, + "logits/chosen": -1.6555538177490234, + "logits/rejected": -1.6289546489715576, + "logps/chosen": -151.74960327148438, + "logps/rejected": -207.7194061279297, + "loss": 0.5452, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.9974144697189331, + "rewards/margins": 0.5410576462745667, + "rewards/rejected": -1.5384724140167236, + "step": 3000 + }, + { + "epoch": 0.52, + "eval_logits/chosen": -1.7558156251907349, + "eval_logits/rejected": -1.7363479137420654, + "eval_logps/chosen": -150.7609405517578, + "eval_logps/rejected": -178.5692138671875, + "eval_loss": 0.6288471221923828, + "eval_rewards/accuracies": 0.6463754773139954, + "eval_rewards/chosen": -0.9205708503723145, + "eval_rewards/margins": 0.23354758322238922, + "eval_rewards/rejected": -1.1541184186935425, + "eval_runtime": 356.7483, + "eval_samples_per_second": 12.065, + "eval_steps_per_second": 1.508, + "step": 3000 + }, + { + "epoch": 0.52, + "grad_norm": 16.90078566654223, + "learning_rate": 4.623398222136443e-07, + "logits/chosen": -1.6691395044326782, + "logits/rejected": -1.6284288167953491, + "logps/chosen": -161.9709930419922, + "logps/rejected": -210.35195922851562, + "loss": 0.5663, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.0677728652954102, + "rewards/margins": 0.5049013495445251, + "rewards/rejected": -1.57267427444458, + "step": 3010 + }, + { + "epoch": 0.52, + "grad_norm": 21.480798300954746, + "learning_rate": 4.6194205499003467e-07, + "logits/chosen": -1.7338823080062866, + "logits/rejected": -1.6823310852050781, + "logps/chosen": -166.0416259765625, + "logps/rejected": -225.45706176757812, + "loss": 0.5425, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.1181974411010742, + "rewards/margins": 0.6333866119384766, + "rewards/rejected": -1.7515838146209717, + "step": 3020 + }, + { + "epoch": 0.52, + "grad_norm": 17.722022537225527, + "learning_rate": 4.615423711581027e-07, + "logits/chosen": -1.6567986011505127, + "logits/rejected": -1.6222938299179077, + "logps/chosen": -170.3144989013672, + "logps/rejected": -208.64501953125, + "loss": 0.6058, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.1464793682098389, + "rewards/margins": 0.3912624716758728, + "rewards/rejected": -1.5377418994903564, + "step": 3030 + }, + { + "epoch": 0.52, + "grad_norm": 15.214602160313902, + "learning_rate": 4.6114077433221994e-07, + "logits/chosen": -1.7444251775741577, + "logits/rejected": -1.715855598449707, + "logps/chosen": -158.28477478027344, + "logps/rejected": -213.8212432861328, + "loss": 0.5529, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.0712274312973022, + "rewards/margins": 0.5239061713218689, + "rewards/rejected": -1.5951335430145264, + "step": 3040 + }, + { + "epoch": 0.53, + "grad_norm": 22.969445403757415, + "learning_rate": 4.6073726814405746e-07, + "logits/chosen": -1.6354888677597046, + "logits/rejected": -1.6061290502548218, + "logps/chosen": -153.43170166015625, + "logps/rejected": -197.95614624023438, + "loss": 0.5946, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.9988822937011719, + "rewards/margins": 0.4272375702857971, + "rewards/rejected": -1.4261198043823242, + "step": 3050 + }, + { + "epoch": 0.53, + "grad_norm": 28.86788571866789, + "learning_rate": 4.6033185624255276e-07, + "logits/chosen": -1.6350476741790771, + "logits/rejected": -1.6002562046051025, + "logps/chosen": -153.01419067382812, + "logps/rejected": -200.2187042236328, + "loss": 0.5541, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.9734545946121216, + "rewards/margins": 0.48509782552719116, + "rewards/rejected": -1.458552360534668, + "step": 3060 + }, + { + "epoch": 0.53, + "grad_norm": 21.065042836310123, + "learning_rate": 4.5992454229387693e-07, + "logits/chosen": -1.5526440143585205, + "logits/rejected": -1.5073213577270508, + "logps/chosen": -173.8745574951172, + "logps/rejected": -220.6032257080078, + "loss": 0.5811, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.1593778133392334, + "rewards/margins": 0.5076309442520142, + "rewards/rejected": -1.6670089960098267, + "step": 3070 + }, + { + "epoch": 0.53, + "grad_norm": 19.310546719568105, + "learning_rate": 4.5951532998140136e-07, + "logits/chosen": -1.4362452030181885, + "logits/rejected": -1.399596929550171, + "logps/chosen": -182.4234161376953, + "logps/rejected": -246.11245727539062, + "loss": 0.5594, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3157217502593994, + "rewards/margins": 0.5977397561073303, + "rewards/rejected": -1.913461685180664, + "step": 3080 + }, + { + "epoch": 0.53, + "grad_norm": 20.275727141362044, + "learning_rate": 4.591042230056644e-07, + "logits/chosen": -1.5431610345840454, + "logits/rejected": -1.5028966665267944, + "logps/chosen": -162.43585205078125, + "logps/rejected": -222.4371337890625, + "loss": 0.5203, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.0885474681854248, + "rewards/margins": 0.5989712476730347, + "rewards/rejected": -1.6875184774398804, + "step": 3090 + }, + { + "epoch": 0.53, + "grad_norm": 22.85769406936058, + "learning_rate": 4.586912250843383e-07, + "logits/chosen": -1.49831223487854, + "logits/rejected": -1.4446604251861572, + "logps/chosen": -172.95877075195312, + "logps/rejected": -234.62548828125, + "loss": 0.5525, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.1904902458190918, + "rewards/margins": 0.6400495767593384, + "rewards/rejected": -1.8305397033691406, + "step": 3100 + }, + { + "epoch": 0.53, + "eval_logits/chosen": -1.6323180198669434, + "eval_logits/rejected": -1.6100775003433228, + "eval_logps/chosen": -161.87399291992188, + "eval_logps/rejected": -193.96153259277344, + "eval_loss": 0.623075008392334, + "eval_rewards/accuracies": 0.6563661694526672, + "eval_rewards/chosen": -1.0317014455795288, + "eval_rewards/margins": 0.2763398587703705, + "eval_rewards/rejected": -1.3080412149429321, + "eval_runtime": 356.7275, + "eval_samples_per_second": 12.065, + "eval_steps_per_second": 1.508, + "step": 3100 + }, + { + "epoch": 0.54, + "grad_norm": 25.719416790446477, + "learning_rate": 4.5827633995219485e-07, + "logits/chosen": -1.4610720872879028, + "logits/rejected": -1.4539538621902466, + "logps/chosen": -183.6023406982422, + "logps/rejected": -237.76895141601562, + "loss": 0.5884, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.3302621841430664, + "rewards/margins": 0.498201847076416, + "rewards/rejected": -1.8284639120101929, + "step": 3110 + }, + { + "epoch": 0.54, + "grad_norm": 14.309987948640194, + "learning_rate": 4.5785957136107234e-07, + "logits/chosen": -1.544480323791504, + "logits/rejected": -1.5000821352005005, + "logps/chosen": -165.1011505126953, + "logps/rejected": -232.72195434570312, + "loss": 0.5266, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.0910911560058594, + "rewards/margins": 0.678336501121521, + "rewards/rejected": -1.7694276571273804, + "step": 3120 + }, + { + "epoch": 0.54, + "grad_norm": 18.893819003746106, + "learning_rate": 4.574409230798413e-07, + "logits/chosen": -1.4636805057525635, + "logits/rejected": -1.4383834600448608, + "logps/chosen": -155.36705017089844, + "logps/rejected": -204.67672729492188, + "loss": 0.5856, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.032982587814331, + "rewards/margins": 0.48075515031814575, + "rewards/rejected": -1.513737678527832, + "step": 3130 + }, + { + "epoch": 0.54, + "grad_norm": 19.750368207437877, + "learning_rate": 4.5702039889437014e-07, + "logits/chosen": -1.5176935195922852, + "logits/rejected": -1.4778989553451538, + "logps/chosen": -171.33694458007812, + "logps/rejected": -242.3556365966797, + "loss": 0.5396, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.1966036558151245, + "rewards/margins": 0.689578115940094, + "rewards/rejected": -1.8861818313598633, + "step": 3140 + }, + { + "epoch": 0.54, + "grad_norm": 15.672784519353224, + "learning_rate": 4.565980026074917e-07, + "logits/chosen": -1.4829928874969482, + "logits/rejected": -1.4322001934051514, + "logps/chosen": -172.95135498046875, + "logps/rejected": -237.19912719726562, + "loss": 0.5323, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.1858676671981812, + "rewards/margins": 0.6504721641540527, + "rewards/rejected": -1.8363399505615234, + "step": 3150 + }, + { + "epoch": 0.54, + "grad_norm": 18.037452777993565, + "learning_rate": 4.5617373803896796e-07, + "logits/chosen": -1.3555725812911987, + "logits/rejected": -1.3147733211517334, + "logps/chosen": -188.40936279296875, + "logps/rejected": -247.400634765625, + "loss": 0.559, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3425967693328857, + "rewards/margins": 0.605254054069519, + "rewards/rejected": -1.9478508234024048, + "step": 3160 + }, + { + "epoch": 0.55, + "grad_norm": 15.619953452388868, + "learning_rate": 4.5574760902545625e-07, + "logits/chosen": -1.4381481409072876, + "logits/rejected": -1.391213059425354, + "logps/chosen": -183.0797882080078, + "logps/rejected": -240.10757446289062, + "loss": 0.5234, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.272312045097351, + "rewards/margins": 0.6338873505592346, + "rewards/rejected": -1.9061992168426514, + "step": 3170 + }, + { + "epoch": 0.55, + "grad_norm": 24.556199495358904, + "learning_rate": 4.5531961942047385e-07, + "logits/chosen": -1.521206021308899, + "logits/rejected": -1.4590338468551636, + "logps/chosen": -183.3689727783203, + "logps/rejected": -249.7210693359375, + "loss": 0.5345, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.3013378381729126, + "rewards/margins": 0.681814968585968, + "rewards/rejected": -1.9831526279449463, + "step": 3180 + }, + { + "epoch": 0.55, + "grad_norm": 18.24766746095848, + "learning_rate": 4.548897730943638e-07, + "logits/chosen": -1.5017660856246948, + "logits/rejected": -1.4614744186401367, + "logps/chosen": -174.06478881835938, + "logps/rejected": -262.4342041015625, + "loss": 0.481, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.2178510427474976, + "rewards/margins": 0.839970588684082, + "rewards/rejected": -2.057821750640869, + "step": 3190 + }, + { + "epoch": 0.55, + "grad_norm": 19.455426316676963, + "learning_rate": 4.544580739342596e-07, + "logits/chosen": -1.406374216079712, + "logits/rejected": -1.3839681148529053, + "logps/chosen": -190.57859802246094, + "logps/rejected": -231.16159057617188, + "loss": 0.6097, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.3527233600616455, + "rewards/margins": 0.4378505349159241, + "rewards/rejected": -1.7905738353729248, + "step": 3200 + }, + { + "epoch": 0.55, + "eval_logits/chosen": -1.6120717525482178, + "eval_logits/rejected": -1.5902533531188965, + "eval_logps/chosen": -167.82127380371094, + "eval_logps/rejected": -200.23843383789062, + "eval_loss": 0.6200531721115112, + "eval_rewards/accuracies": 0.6554368138313293, + "eval_rewards/chosen": -1.0911740064620972, + "eval_rewards/margins": 0.2796363830566406, + "eval_rewards/rejected": -1.3708105087280273, + "eval_runtime": 356.6653, + "eval_samples_per_second": 12.067, + "eval_steps_per_second": 1.508, + "step": 3200 + }, + { + "epoch": 0.55, + "grad_norm": 19.796269236135114, + "learning_rate": 4.5402452584404995e-07, + "logits/chosen": -1.411024808883667, + "logits/rejected": -1.3637266159057617, + "logps/chosen": -168.30789184570312, + "logps/rejected": -231.79910278320312, + "loss": 0.5188, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.1436474323272705, + "rewards/margins": 0.6496217250823975, + "rewards/rejected": -1.793269157409668, + "step": 3210 + }, + { + "epoch": 0.55, + "grad_norm": 15.230980934048251, + "learning_rate": 4.535891327443435e-07, + "logits/chosen": -1.4088395833969116, + "logits/rejected": -1.3787903785705566, + "logps/chosen": -172.39651489257812, + "logps/rejected": -239.85952758789062, + "loss": 0.5494, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.2206478118896484, + "rewards/margins": 0.6492413282394409, + "rewards/rejected": -1.8698889017105103, + "step": 3220 + }, + { + "epoch": 0.56, + "grad_norm": 30.32211749720397, + "learning_rate": 4.5315189857243377e-07, + "logits/chosen": -1.4493725299835205, + "logits/rejected": -1.413207769393921, + "logps/chosen": -175.58511352539062, + "logps/rejected": -230.5104522705078, + "loss": 0.5586, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.214536428451538, + "rewards/margins": 0.5267941355705261, + "rewards/rejected": -1.7413305044174194, + "step": 3230 + }, + { + "epoch": 0.56, + "grad_norm": 16.643885483015545, + "learning_rate": 4.527128272822629e-07, + "logits/chosen": -1.621273398399353, + "logits/rejected": -1.58579421043396, + "logps/chosen": -170.71180725097656, + "logps/rejected": -214.2522735595703, + "loss": 0.6022, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.1446621417999268, + "rewards/margins": 0.47269731760025024, + "rewards/rejected": -1.6173597574234009, + "step": 3240 + }, + { + "epoch": 0.56, + "grad_norm": 21.109399435647408, + "learning_rate": 4.522719228443864e-07, + "logits/chosen": -1.5881023406982422, + "logits/rejected": -1.5520793199539185, + "logps/chosen": -142.5547332763672, + "logps/rejected": -191.86538696289062, + "loss": 0.5607, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.8997921943664551, + "rewards/margins": 0.4858148992061615, + "rewards/rejected": -1.3856070041656494, + "step": 3250 + }, + { + "epoch": 0.56, + "grad_norm": 22.057929518873816, + "learning_rate": 4.5182918924593703e-07, + "logits/chosen": -1.607410192489624, + "logits/rejected": -1.5681886672973633, + "logps/chosen": -147.30438232421875, + "logps/rejected": -203.17489624023438, + "loss": 0.5476, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.9579440355300903, + "rewards/margins": 0.5465749502182007, + "rewards/rejected": -1.504518747329712, + "step": 3260 + }, + { + "epoch": 0.56, + "grad_norm": 18.19721714439992, + "learning_rate": 4.5138463049058885e-07, + "logits/chosen": -1.6494948863983154, + "logits/rejected": -1.625372290611267, + "logps/chosen": -166.00369262695312, + "logps/rejected": -213.57943725585938, + "loss": 0.5827, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.1090967655181885, + "rewards/margins": 0.4481208920478821, + "rewards/rejected": -1.5572177171707153, + "step": 3270 + }, + { + "epoch": 0.57, + "grad_norm": 20.95007962777713, + "learning_rate": 4.50938250598521e-07, + "logits/chosen": -1.6200672388076782, + "logits/rejected": -1.5890979766845703, + "logps/chosen": -150.30422973632812, + "logps/rejected": -204.21400451660156, + "loss": 0.5402, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0092462301254272, + "rewards/margins": 0.509067714214325, + "rewards/rejected": -1.5183137655258179, + "step": 3280 + }, + { + "epoch": 0.57, + "grad_norm": 18.41298820642052, + "learning_rate": 4.5049005360638103e-07, + "logits/chosen": -1.5941425561904907, + "logits/rejected": -1.538638710975647, + "logps/chosen": -169.52369689941406, + "logps/rejected": -231.6940460205078, + "loss": 0.5708, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.1636857986450195, + "rewards/margins": 0.6202095150947571, + "rewards/rejected": -1.7838952541351318, + "step": 3290 + }, + { + "epoch": 0.57, + "grad_norm": 22.05304883452244, + "learning_rate": 4.5004004356724893e-07, + "logits/chosen": -1.455288290977478, + "logits/rejected": -1.414819598197937, + "logps/chosen": -181.70596313476562, + "logps/rejected": -236.2588653564453, + "loss": 0.5807, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.2678734064102173, + "rewards/margins": 0.5589209794998169, + "rewards/rejected": -1.8267943859100342, + "step": 3300 + }, + { + "epoch": 0.57, + "eval_logits/chosen": -1.5502673387527466, + "eval_logits/rejected": -1.5291829109191895, + "eval_logps/chosen": -168.8760528564453, + "eval_logps/rejected": -199.7250213623047, + "eval_loss": 0.6238878965377808, + "eval_rewards/accuracies": 0.6505576372146606, + "eval_rewards/chosen": -1.1017221212387085, + "eval_rewards/margins": 0.26395440101623535, + "eval_rewards/rejected": -1.3656764030456543, + "eval_runtime": 357.2068, + "eval_samples_per_second": 12.049, + "eval_steps_per_second": 1.506, + "step": 3300 + }, + { + "epoch": 0.57, + "grad_norm": 18.786920967242654, + "learning_rate": 4.4958822455060017e-07, + "logits/chosen": -1.3820545673370361, + "logits/rejected": -1.3281322717666626, + "logps/chosen": -169.11795043945312, + "logps/rejected": -231.81167602539062, + "loss": 0.5407, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.1603150367736816, + "rewards/margins": 0.6508747339248657, + "rewards/rejected": -1.8111896514892578, + "step": 3310 + }, + { + "epoch": 0.57, + "grad_norm": 21.45405748820459, + "learning_rate": 4.4913460064226894e-07, + "logits/chosen": -1.44109308719635, + "logits/rejected": -1.3888362646102905, + "logps/chosen": -179.4849395751953, + "logps/rejected": -232.6226043701172, + "loss": 0.562, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.257912278175354, + "rewards/margins": 0.5788989067077637, + "rewards/rejected": -1.8368113040924072, + "step": 3320 + }, + { + "epoch": 0.57, + "grad_norm": 16.508118858701444, + "learning_rate": 4.486791759444111e-07, + "logits/chosen": -1.5882141590118408, + "logits/rejected": -1.5403480529785156, + "logps/chosen": -164.25289916992188, + "logps/rejected": -231.80007934570312, + "loss": 0.5175, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.1004276275634766, + "rewards/margins": 0.680879533290863, + "rewards/rejected": -1.7813072204589844, + "step": 3330 + }, + { + "epoch": 0.58, + "grad_norm": 33.55292099789505, + "learning_rate": 4.4822195457546716e-07, + "logits/chosen": -1.5143282413482666, + "logits/rejected": -1.4674466848373413, + "logps/chosen": -189.68515014648438, + "logps/rejected": -263.664794921875, + "loss": 0.5363, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.3378245830535889, + "rewards/margins": 0.75914067029953, + "rewards/rejected": -2.0969653129577637, + "step": 3340 + }, + { + "epoch": 0.58, + "grad_norm": 16.44122753789477, + "learning_rate": 4.477629406701254e-07, + "logits/chosen": -1.427293300628662, + "logits/rejected": -1.3901170492172241, + "logps/chosen": -177.4530792236328, + "logps/rejected": -247.17105102539062, + "loss": 0.53, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.216886281967163, + "rewards/margins": 0.6818975806236267, + "rewards/rejected": -1.8987839221954346, + "step": 3350 + }, + { + "epoch": 0.58, + "grad_norm": 14.401579786445982, + "learning_rate": 4.473021383792838e-07, + "logits/chosen": -1.5537811517715454, + "logits/rejected": -1.5063152313232422, + "logps/chosen": -168.55690002441406, + "logps/rejected": -219.3855743408203, + "loss": 0.5712, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.1374642848968506, + "rewards/margins": 0.5342391133308411, + "rewards/rejected": -1.6717033386230469, + "step": 3360 + }, + { + "epoch": 0.58, + "grad_norm": 19.904447545777934, + "learning_rate": 4.4683955187001285e-07, + "logits/chosen": -1.5263116359710693, + "logits/rejected": -1.4976154565811157, + "logps/chosen": -163.7618865966797, + "logps/rejected": -228.2670440673828, + "loss": 0.5506, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.1279897689819336, + "rewards/margins": 0.6148195862770081, + "rewards/rejected": -1.7428092956542969, + "step": 3370 + }, + { + "epoch": 0.58, + "grad_norm": 23.89725054685839, + "learning_rate": 4.463751853255182e-07, + "logits/chosen": -1.6531779766082764, + "logits/rejected": -1.6117451190948486, + "logps/chosen": -161.826171875, + "logps/rejected": -209.54940795898438, + "loss": 0.5573, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.0634934902191162, + "rewards/margins": 0.5306968688964844, + "rewards/rejected": -1.5941904783248901, + "step": 3380 + }, + { + "epoch": 0.58, + "grad_norm": 16.812554502274565, + "learning_rate": 4.45909042945102e-07, + "logits/chosen": -1.5942082405090332, + "logits/rejected": -1.553095817565918, + "logps/chosen": -158.5015869140625, + "logps/rejected": -206.106201171875, + "loss": 0.5908, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.0478193759918213, + "rewards/margins": 0.4990636706352234, + "rewards/rejected": -1.5468828678131104, + "step": 3390 + }, + { + "epoch": 0.59, + "grad_norm": 16.25733459636065, + "learning_rate": 4.454411289441259e-07, + "logits/chosen": -1.650813102722168, + "logits/rejected": -1.589519739151001, + "logps/chosen": -148.92491149902344, + "logps/rejected": -204.60989379882812, + "loss": 0.536, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.9608653783798218, + "rewards/margins": 0.5713566541671753, + "rewards/rejected": -1.532222032546997, + "step": 3400 + }, + { + "epoch": 0.59, + "eval_logits/chosen": -1.723968505859375, + "eval_logits/rejected": -1.705629825592041, + "eval_logps/chosen": -141.45721435546875, + "eval_logps/rejected": -167.2509307861328, + "eval_loss": 0.63118976354599, + "eval_rewards/accuracies": 0.6466078162193298, + "eval_rewards/chosen": -0.8275338411331177, + "eval_rewards/margins": 0.21340180933475494, + "eval_rewards/rejected": -1.0409355163574219, + "eval_runtime": 357.3396, + "eval_samples_per_second": 12.045, + "eval_steps_per_second": 1.506, + "step": 3400 + }, + { + "epoch": 0.59, + "grad_norm": 17.44200872783744, + "learning_rate": 4.4497144755397215e-07, + "logits/chosen": -1.5299510955810547, + "logits/rejected": -1.4821765422821045, + "logps/chosen": -140.80323791503906, + "logps/rejected": -188.71884155273438, + "loss": 0.5417, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9123737215995789, + "rewards/margins": 0.5039972066879272, + "rewards/rejected": -1.4163707494735718, + "step": 3410 + }, + { + "epoch": 0.59, + "grad_norm": 20.94711202688562, + "learning_rate": 4.4450000302200576e-07, + "logits/chosen": -1.5101244449615479, + "logits/rejected": -1.4615298509597778, + "logps/chosen": -156.17779541015625, + "logps/rejected": -221.5508270263672, + "loss": 0.52, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.0341514348983765, + "rewards/margins": 0.6485717296600342, + "rewards/rejected": -1.6827232837677002, + "step": 3420 + }, + { + "epoch": 0.59, + "grad_norm": 17.697196074003802, + "learning_rate": 4.440267996115359e-07, + "logits/chosen": -1.5161569118499756, + "logits/rejected": -1.4690425395965576, + "logps/chosen": -188.15164184570312, + "logps/rejected": -251.8218231201172, + "loss": 0.5671, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.3682286739349365, + "rewards/margins": 0.6038091778755188, + "rewards/rejected": -1.9720379114151, + "step": 3430 + }, + { + "epoch": 0.59, + "grad_norm": 23.878652372078566, + "learning_rate": 4.435518416017774e-07, + "logits/chosen": -1.4505062103271484, + "logits/rejected": -1.4057317972183228, + "logps/chosen": -191.19534301757812, + "logps/rejected": -253.8204345703125, + "loss": 0.5548, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.386473298072815, + "rewards/margins": 0.6239285469055176, + "rewards/rejected": -2.010401964187622, + "step": 3440 + }, + { + "epoch": 0.59, + "grad_norm": 24.14251784705596, + "learning_rate": 4.430751332878122e-07, + "logits/chosen": -1.6515562534332275, + "logits/rejected": -1.5952726602554321, + "logps/chosen": -197.3381805419922, + "logps/rejected": -256.8216857910156, + "loss": 0.5492, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.3796002864837646, + "rewards/margins": 0.6393512487411499, + "rewards/rejected": -2.018951654434204, + "step": 3450 + }, + { + "epoch": 0.6, + "grad_norm": 24.13758333534331, + "learning_rate": 4.425966789805503e-07, + "logits/chosen": -1.499289631843567, + "logits/rejected": -1.4667627811431885, + "logps/chosen": -164.79124450683594, + "logps/rejected": -216.9873504638672, + "loss": 0.5621, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.1262753009796143, + "rewards/margins": 0.5078203678131104, + "rewards/rejected": -1.6340957880020142, + "step": 3460 + }, + { + "epoch": 0.6, + "grad_norm": 18.681968022257387, + "learning_rate": 4.4211648300669076e-07, + "logits/chosen": -1.597586989402771, + "logits/rejected": -1.5641849040985107, + "logps/chosen": -169.10386657714844, + "logps/rejected": -226.5546112060547, + "loss": 0.547, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.133301854133606, + "rewards/margins": 0.5889450311660767, + "rewards/rejected": -1.7222468852996826, + "step": 3470 + }, + { + "epoch": 0.6, + "grad_norm": 29.803622122615664, + "learning_rate": 4.4163454970868277e-07, + "logits/chosen": -1.5102007389068604, + "logits/rejected": -1.4543273448944092, + "logps/chosen": -181.89114379882812, + "logps/rejected": -245.5972900390625, + "loss": 0.5329, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.2680021524429321, + "rewards/margins": 0.6691805124282837, + "rewards/rejected": -1.9371826648712158, + "step": 3480 + }, + { + "epoch": 0.6, + "grad_norm": 28.83453327688963, + "learning_rate": 4.411508834446863e-07, + "logits/chosen": -1.5323913097381592, + "logits/rejected": -1.4874933958053589, + "logps/chosen": -182.50387573242188, + "logps/rejected": -241.5607452392578, + "loss": 0.5529, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.282502293586731, + "rewards/margins": 0.587981104850769, + "rewards/rejected": -1.8704833984375, + "step": 3490 + }, + { + "epoch": 0.6, + "grad_norm": 16.032941250777014, + "learning_rate": 4.406654885885326e-07, + "logits/chosen": -1.4855334758758545, + "logits/rejected": -1.4571826457977295, + "logps/chosen": -178.00135803222656, + "logps/rejected": -237.3003387451172, + "loss": 0.5392, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.259861707687378, + "rewards/margins": 0.5666104555130005, + "rewards/rejected": -1.8264720439910889, + "step": 3500 + }, + { + "epoch": 0.6, + "eval_logits/chosen": -1.6595714092254639, + "eval_logits/rejected": -1.6385571956634521, + "eval_logps/chosen": -161.32484436035156, + "eval_logps/rejected": -191.19439697265625, + "eval_loss": 0.6286602020263672, + "eval_rewards/accuracies": 0.6466078162193298, + "eval_rewards/chosen": -1.0262099504470825, + "eval_rewards/margins": 0.2541602849960327, + "eval_rewards/rejected": -1.2803701162338257, + "eval_runtime": 357.0552, + "eval_samples_per_second": 12.054, + "eval_steps_per_second": 1.507, + "step": 3500 + }, + { + "epoch": 0.6, + "grad_norm": 16.07998691134206, + "learning_rate": 4.4017836952968467e-07, + "logits/chosen": -1.4526565074920654, + "logits/rejected": -1.4062235355377197, + "logps/chosen": -173.96885681152344, + "logps/rejected": -226.445068359375, + "loss": 0.5686, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.2041919231414795, + "rewards/margins": 0.5490579009056091, + "rewards/rejected": -1.7532498836517334, + "step": 3510 + }, + { + "epoch": 0.61, + "grad_norm": 19.834365533950795, + "learning_rate": 4.396895306731977e-07, + "logits/chosen": -1.5146148204803467, + "logits/rejected": -1.4718494415283203, + "logps/chosen": -160.6667938232422, + "logps/rejected": -208.8634490966797, + "loss": 0.5754, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.053438425064087, + "rewards/margins": 0.5146963000297546, + "rewards/rejected": -1.5681347846984863, + "step": 3520 + }, + { + "epoch": 0.61, + "grad_norm": 23.90518563871216, + "learning_rate": 4.391989764396792e-07, + "logits/chosen": -1.6393533945083618, + "logits/rejected": -1.577980637550354, + "logps/chosen": -166.45315551757812, + "logps/rejected": -219.417724609375, + "loss": 0.5593, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.1050881147384644, + "rewards/margins": 0.5800349116325378, + "rewards/rejected": -1.6851232051849365, + "step": 3530 + }, + { + "epoch": 0.61, + "grad_norm": 20.26503213849771, + "learning_rate": 4.387067112652487e-07, + "logits/chosen": -1.5266510248184204, + "logits/rejected": -1.4859250783920288, + "logps/chosen": -157.66932678222656, + "logps/rejected": -214.6486053466797, + "loss": 0.5651, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.0376538038253784, + "rewards/margins": 0.5733412504196167, + "rewards/rejected": -1.6109952926635742, + "step": 3540 + }, + { + "epoch": 0.61, + "grad_norm": 18.788930669523673, + "learning_rate": 4.382127396014982e-07, + "logits/chosen": -1.6274988651275635, + "logits/rejected": -1.6048628091812134, + "logps/chosen": -166.0504608154297, + "logps/rejected": -205.7928466796875, + "loss": 0.609, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.1151145696640015, + "rewards/margins": 0.41729575395584106, + "rewards/rejected": -1.5324103832244873, + "step": 3550 + }, + { + "epoch": 0.61, + "grad_norm": 18.6411865873627, + "learning_rate": 4.377170659154514e-07, + "logits/chosen": -1.5456907749176025, + "logits/rejected": -1.506981611251831, + "logps/chosen": -159.85513305664062, + "logps/rejected": -213.0148468017578, + "loss": 0.5651, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.069687843322754, + "rewards/margins": 0.535394549369812, + "rewards/rejected": -1.6050825119018555, + "step": 3560 + }, + { + "epoch": 0.62, + "grad_norm": 26.755951307532122, + "learning_rate": 4.372196946895238e-07, + "logits/chosen": -1.6680046319961548, + "logits/rejected": -1.6189712285995483, + "logps/chosen": -177.2747344970703, + "logps/rejected": -216.9444580078125, + "loss": 0.609, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.2041199207305908, + "rewards/margins": 0.43309324979782104, + "rewards/rejected": -1.637213110923767, + "step": 3570 + }, + { + "epoch": 0.62, + "grad_norm": 16.327880126029793, + "learning_rate": 4.367206304214815e-07, + "logits/chosen": -1.6215425729751587, + "logits/rejected": -1.5847231149673462, + "logps/chosen": -168.4742431640625, + "logps/rejected": -224.7816162109375, + "loss": 0.5332, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.1029232740402222, + "rewards/margins": 0.570185661315918, + "rewards/rejected": -1.6731090545654297, + "step": 3580 + }, + { + "epoch": 0.62, + "grad_norm": 17.82230607590664, + "learning_rate": 4.3621987762440114e-07, + "logits/chosen": -1.582554578781128, + "logits/rejected": -1.5418357849121094, + "logps/chosen": -181.93031311035156, + "logps/rejected": -246.87838745117188, + "loss": 0.5296, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.27500319480896, + "rewards/margins": 0.654120922088623, + "rewards/rejected": -1.929124116897583, + "step": 3590 + }, + { + "epoch": 0.62, + "grad_norm": 27.151158453154128, + "learning_rate": 4.357174408266289e-07, + "logits/chosen": -1.5266609191894531, + "logits/rejected": -1.4800150394439697, + "logps/chosen": -184.74020385742188, + "logps/rejected": -237.02490234375, + "loss": 0.5689, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.3012017011642456, + "rewards/margins": 0.5533289909362793, + "rewards/rejected": -1.854530692100525, + "step": 3600 + }, + { + "epoch": 0.62, + "eval_logits/chosen": -1.6493839025497437, + "eval_logits/rejected": -1.6286251544952393, + "eval_logps/chosen": -170.8087158203125, + "eval_logps/rejected": -201.0063018798828, + "eval_loss": 0.627535879611969, + "eval_rewards/accuracies": 0.6486988663673401, + "eval_rewards/chosen": -1.1210483312606812, + "eval_rewards/margins": 0.2574405074119568, + "eval_rewards/rejected": -1.3784890174865723, + "eval_runtime": 356.9383, + "eval_samples_per_second": 12.058, + "eval_steps_per_second": 1.507, + "step": 3600 + }, + { + "epoch": 0.62, + "grad_norm": 16.53409464026598, + "learning_rate": 4.3521332457173933e-07, + "logits/chosen": -1.4792962074279785, + "logits/rejected": -1.4342132806777954, + "logps/chosen": -188.6980438232422, + "logps/rejected": -251.24951171875, + "loss": 0.5333, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.3721535205841064, + "rewards/margins": 0.6460615396499634, + "rewards/rejected": -2.018214702606201, + "step": 3610 + }, + { + "epoch": 0.62, + "grad_norm": 24.356843194385412, + "learning_rate": 4.347075334184946e-07, + "logits/chosen": -1.389676809310913, + "logits/rejected": -1.3410922288894653, + "logps/chosen": -182.9585418701172, + "logps/rejected": -253.10958862304688, + "loss": 0.5018, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.2746689319610596, + "rewards/margins": 0.7207802534103394, + "rewards/rejected": -1.9954490661621094, + "step": 3620 + }, + { + "epoch": 0.63, + "grad_norm": 28.04057531766567, + "learning_rate": 4.34200071940803e-07, + "logits/chosen": -1.4672437906265259, + "logits/rejected": -1.4280986785888672, + "logps/chosen": -215.80764770507812, + "logps/rejected": -300.14385986328125, + "loss": 0.5108, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.6468420028686523, + "rewards/margins": 0.7980831861495972, + "rewards/rejected": -2.44492506980896, + "step": 3630 + }, + { + "epoch": 0.63, + "grad_norm": 31.68658499963128, + "learning_rate": 4.3369094472767785e-07, + "logits/chosen": -1.3977959156036377, + "logits/rejected": -1.3606897592544556, + "logps/chosen": -217.1880645751953, + "logps/rejected": -290.8738708496094, + "loss": 0.5379, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.6216189861297607, + "rewards/margins": 0.7258588671684265, + "rewards/rejected": -2.347477674484253, + "step": 3640 + }, + { + "epoch": 0.63, + "grad_norm": 22.54081035634695, + "learning_rate": 4.331801563831956e-07, + "logits/chosen": -1.3711670637130737, + "logits/rejected": -1.350187063217163, + "logps/chosen": -206.61978149414062, + "logps/rejected": -269.8750915527344, + "loss": 0.544, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.555546522140503, + "rewards/margins": 0.5982221961021423, + "rewards/rejected": -2.15376877784729, + "step": 3650 + }, + { + "epoch": 0.63, + "grad_norm": 22.175111801909775, + "learning_rate": 4.326677115264547e-07, + "logits/chosen": -1.3863328695297241, + "logits/rejected": -1.3272384405136108, + "logps/chosen": -210.5821075439453, + "logps/rejected": -292.0908203125, + "loss": 0.5043, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.5674175024032593, + "rewards/margins": 0.8180697560310364, + "rewards/rejected": -2.3854870796203613, + "step": 3660 + }, + { + "epoch": 0.63, + "grad_norm": 18.961212344873417, + "learning_rate": 4.321536147915334e-07, + "logits/chosen": -1.3621985912322998, + "logits/rejected": -1.3097569942474365, + "logps/chosen": -205.28738403320312, + "logps/rejected": -273.30584716796875, + "loss": 0.5708, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.5246150493621826, + "rewards/margins": 0.6830393671989441, + "rewards/rejected": -2.2076547145843506, + "step": 3670 + }, + { + "epoch": 0.63, + "grad_norm": 17.75818169123661, + "learning_rate": 4.316378708274481e-07, + "logits/chosen": -1.4744240045547485, + "logits/rejected": -1.422086477279663, + "logps/chosen": -186.6185302734375, + "logps/rejected": -245.1935577392578, + "loss": 0.5536, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.3194071054458618, + "rewards/margins": 0.6318386197090149, + "rewards/rejected": -1.951245665550232, + "step": 3680 + }, + { + "epoch": 0.64, + "grad_norm": 25.3589027579314, + "learning_rate": 4.31120484298111e-07, + "logits/chosen": -1.4429172277450562, + "logits/rejected": -1.4147446155548096, + "logps/chosen": -174.2320098876953, + "logps/rejected": -255.7716064453125, + "loss": 0.5238, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.2215301990509033, + "rewards/margins": 0.7453585863113403, + "rewards/rejected": -1.9668890237808228, + "step": 3690 + }, + { + "epoch": 0.64, + "grad_norm": 17.358115744138235, + "learning_rate": 4.306014598822886e-07, + "logits/chosen": -1.4474033117294312, + "logits/rejected": -1.394345998764038, + "logps/chosen": -179.29293823242188, + "logps/rejected": -256.70098876953125, + "loss": 0.517, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2170681953430176, + "rewards/margins": 0.7563605904579163, + "rewards/rejected": -1.9734289646148682, + "step": 3700 + }, + { + "epoch": 0.64, + "eval_logits/chosen": -1.523759365081787, + "eval_logits/rejected": -1.4999202489852905, + "eval_logps/chosen": -181.3194580078125, + "eval_logps/rejected": -215.5612030029297, + "eval_loss": 0.6243796944618225, + "eval_rewards/accuracies": 0.6565985083580017, + "eval_rewards/chosen": -1.2261559963226318, + "eval_rewards/margins": 0.2978822588920593, + "eval_rewards/rejected": -1.5240384340286255, + "eval_runtime": 357.0346, + "eval_samples_per_second": 12.055, + "eval_steps_per_second": 1.507, + "step": 3700 + }, + { + "epoch": 0.64, + "grad_norm": 21.218376082289595, + "learning_rate": 4.3008080227355844e-07, + "logits/chosen": -1.4100111722946167, + "logits/rejected": -1.3635252714157104, + "logps/chosen": -195.98471069335938, + "logps/rejected": -256.88739013671875, + "loss": 0.5513, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.4212100505828857, + "rewards/margins": 0.6375278234481812, + "rewards/rejected": -2.0587379932403564, + "step": 3710 + }, + { + "epoch": 0.64, + "grad_norm": 21.355833105859, + "learning_rate": 4.295585161802674e-07, + "logits/chosen": -1.4289751052856445, + "logits/rejected": -1.3828635215759277, + "logps/chosen": -182.1897735595703, + "logps/rejected": -262.51226806640625, + "loss": 0.4968, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.303205132484436, + "rewards/margins": 0.7883247137069702, + "rewards/rejected": -2.0915298461914062, + "step": 3720 + }, + { + "epoch": 0.64, + "grad_norm": 22.252174921593365, + "learning_rate": 4.2903460632548893e-07, + "logits/chosen": -1.3439371585845947, + "logits/rejected": -1.2857837677001953, + "logps/chosen": -212.6474151611328, + "logps/rejected": -296.428466796875, + "loss": 0.4997, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.581465721130371, + "rewards/margins": 0.8757398724555969, + "rewards/rejected": -2.4572055339813232, + "step": 3730 + }, + { + "epoch": 0.64, + "grad_norm": 22.471686436011503, + "learning_rate": 4.285090774469802e-07, + "logits/chosen": -1.3240846395492554, + "logits/rejected": -1.2739444971084595, + "logps/chosen": -212.66140747070312, + "logps/rejected": -290.9830017089844, + "loss": 0.5364, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.592313528060913, + "rewards/margins": 0.7535529136657715, + "rewards/rejected": -2.3458666801452637, + "step": 3740 + }, + { + "epoch": 0.65, + "grad_norm": 17.899544534984106, + "learning_rate": 4.2798193429713913e-07, + "logits/chosen": -1.440411925315857, + "logits/rejected": -1.3944687843322754, + "logps/chosen": -198.22142028808594, + "logps/rejected": -263.0658874511719, + "loss": 0.5618, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.44929838180542, + "rewards/margins": 0.650551974773407, + "rewards/rejected": -2.0998501777648926, + "step": 3750 + }, + { + "epoch": 0.65, + "grad_norm": 25.746429517526668, + "learning_rate": 4.27453181642962e-07, + "logits/chosen": -1.4367876052856445, + "logits/rejected": -1.4018447399139404, + "logps/chosen": -195.23745727539062, + "logps/rejected": -260.03192138671875, + "loss": 0.5471, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.3940891027450562, + "rewards/margins": 0.6460259556770325, + "rewards/rejected": -2.0401148796081543, + "step": 3760 + }, + { + "epoch": 0.65, + "grad_norm": 22.141823531733728, + "learning_rate": 4.2692282426599967e-07, + "logits/chosen": -1.4208014011383057, + "logits/rejected": -1.3772103786468506, + "logps/chosen": -181.8473663330078, + "logps/rejected": -244.9152374267578, + "loss": 0.5249, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.2831165790557861, + "rewards/margins": 0.6210489273071289, + "rewards/rejected": -1.904165506362915, + "step": 3770 + }, + { + "epoch": 0.65, + "grad_norm": 25.749235391096974, + "learning_rate": 4.2639086696231483e-07, + "logits/chosen": -1.3430489301681519, + "logits/rejected": -1.2899630069732666, + "logps/chosen": -210.1292266845703, + "logps/rejected": -266.73431396484375, + "loss": 0.552, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.5607130527496338, + "rewards/margins": 0.6126449704170227, + "rewards/rejected": -2.1733579635620117, + "step": 3780 + }, + { + "epoch": 0.65, + "grad_norm": 17.91537947782448, + "learning_rate": 4.2585731454243834e-07, + "logits/chosen": -1.347544550895691, + "logits/rejected": -1.2992546558380127, + "logps/chosen": -203.87083435058594, + "logps/rejected": -270.9369812011719, + "loss": 0.5513, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.498838186264038, + "rewards/margins": 0.7007363438606262, + "rewards/rejected": -2.1995744705200195, + "step": 3790 + }, + { + "epoch": 0.65, + "grad_norm": 20.642915135629426, + "learning_rate": 4.2532217183132566e-07, + "logits/chosen": -1.4202806949615479, + "logits/rejected": -1.3704365491867065, + "logps/chosen": -190.59255981445312, + "logps/rejected": -250.588134765625, + "loss": 0.5368, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.3290106058120728, + "rewards/margins": 0.6480494737625122, + "rewards/rejected": -1.977060317993164, + "step": 3800 + }, + { + "epoch": 0.65, + "eval_logits/chosen": -1.523742437362671, + "eval_logits/rejected": -1.5010066032409668, + "eval_logps/chosen": -182.3809356689453, + "eval_logps/rejected": -216.2484893798828, + "eval_loss": 0.6206509470939636, + "eval_rewards/accuracies": 0.6579925417900085, + "eval_rewards/chosen": -1.2367708683013916, + "eval_rewards/margins": 0.29413995146751404, + "eval_rewards/rejected": -1.530910849571228, + "eval_runtime": 357.0405, + "eval_samples_per_second": 12.055, + "eval_steps_per_second": 1.507, + "step": 3800 + }, + { + "epoch": 0.66, + "grad_norm": 31.94932671255532, + "learning_rate": 4.2478544366831373e-07, + "logits/chosen": -1.4317169189453125, + "logits/rejected": -1.3770487308502197, + "logps/chosen": -202.6186981201172, + "logps/rejected": -254.42404174804688, + "loss": 0.5594, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.4515069723129272, + "rewards/margins": 0.57206791639328, + "rewards/rejected": -2.0235750675201416, + "step": 3810 + }, + { + "epoch": 0.66, + "grad_norm": 26.840179888734244, + "learning_rate": 4.242471349070765e-07, + "logits/chosen": -1.430687665939331, + "logits/rejected": -1.3825973272323608, + "logps/chosen": -182.3372039794922, + "logps/rejected": -254.831787109375, + "loss": 0.5042, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2825406789779663, + "rewards/margins": 0.7350937128067017, + "rewards/rejected": -2.017634630203247, + "step": 3820 + }, + { + "epoch": 0.66, + "grad_norm": 20.713279754546374, + "learning_rate": 4.2370725041558163e-07, + "logits/chosen": -1.4622533321380615, + "logits/rejected": -1.3965160846710205, + "logps/chosen": -194.24440002441406, + "logps/rejected": -252.4665985107422, + "loss": 0.5156, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.3557844161987305, + "rewards/margins": 0.6555383205413818, + "rewards/rejected": -2.0113227367401123, + "step": 3830 + }, + { + "epoch": 0.66, + "grad_norm": 22.738897007983, + "learning_rate": 4.2316579507604613e-07, + "logits/chosen": -1.3598577976226807, + "logits/rejected": -1.3157683610916138, + "logps/chosen": -200.95361328125, + "logps/rejected": -288.90191650390625, + "loss": 0.5188, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.468830943107605, + "rewards/margins": 0.8298671841621399, + "rewards/rejected": -2.2986984252929688, + "step": 3840 + }, + { + "epoch": 0.66, + "grad_norm": 26.405390100521963, + "learning_rate": 4.2262277378489224e-07, + "logits/chosen": -1.427339792251587, + "logits/rejected": -1.385075330734253, + "logps/chosen": -227.91049194335938, + "logps/rejected": -302.1413879394531, + "loss": 0.5196, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.706134557723999, + "rewards/margins": 0.7777893543243408, + "rewards/rejected": -2.483924150466919, + "step": 3850 + }, + { + "epoch": 0.67, + "grad_norm": 28.541254516094085, + "learning_rate": 4.2207819145270346e-07, + "logits/chosen": -1.4458119869232178, + "logits/rejected": -1.3982911109924316, + "logps/chosen": -232.8706512451172, + "logps/rejected": -297.7009582519531, + "loss": 0.568, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.7871061563491821, + "rewards/margins": 0.6608562469482422, + "rewards/rejected": -2.4479622840881348, + "step": 3860 + }, + { + "epoch": 0.67, + "grad_norm": 20.798655132332478, + "learning_rate": 4.2153205300417966e-07, + "logits/chosen": -1.4056997299194336, + "logits/rejected": -1.3534657955169678, + "logps/chosen": -214.6895751953125, + "logps/rejected": -290.5361633300781, + "loss": 0.5187, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.5584853887557983, + "rewards/margins": 0.7904712557792664, + "rewards/rejected": -2.34895658493042, + "step": 3870 + }, + { + "epoch": 0.67, + "grad_norm": 25.018107512387246, + "learning_rate": 4.209843633780929e-07, + "logits/chosen": -1.5281155109405518, + "logits/rejected": -1.5098029375076294, + "logps/chosen": -187.2782440185547, + "logps/rejected": -250.95950317382812, + "loss": 0.5438, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.3641104698181152, + "rewards/margins": 0.6108844876289368, + "rewards/rejected": -1.9749950170516968, + "step": 3880 + }, + { + "epoch": 0.67, + "grad_norm": 17.201393329622153, + "learning_rate": 4.204351275272426e-07, + "logits/chosen": -1.5760449171066284, + "logits/rejected": -1.5332744121551514, + "logps/chosen": -177.4027862548828, + "logps/rejected": -236.5082244873047, + "loss": 0.5651, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.2320191860198975, + "rewards/margins": 0.6117894053459167, + "rewards/rejected": -1.8438085317611694, + "step": 3890 + }, + { + "epoch": 0.67, + "grad_norm": 18.018343000765952, + "learning_rate": 4.1988435041841096e-07, + "logits/chosen": -1.5944218635559082, + "logits/rejected": -1.5262387990951538, + "logps/chosen": -170.49252319335938, + "logps/rejected": -219.80899047851562, + "loss": 0.5382, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.1367781162261963, + "rewards/margins": 0.5586223006248474, + "rewards/rejected": -1.6954004764556885, + "step": 3900 + }, + { + "epoch": 0.67, + "eval_logits/chosen": -1.6579508781433105, + "eval_logits/rejected": -1.6361998319625854, + "eval_logps/chosen": -160.2046661376953, + "eval_logps/rejected": -190.85934448242188, + "eval_loss": 0.6221497654914856, + "eval_rewards/accuracies": 0.6596189737319946, + "eval_rewards/chosen": -1.0150080919265747, + "eval_rewards/margins": 0.26201140880584717, + "eval_rewards/rejected": -1.2770196199417114, + "eval_runtime": 357.0359, + "eval_samples_per_second": 12.055, + "eval_steps_per_second": 1.507, + "step": 3900 + }, + { + "epoch": 0.67, + "grad_norm": 15.180474012277164, + "learning_rate": 4.1933203703231766e-07, + "logits/chosen": -1.584212303161621, + "logits/rejected": -1.5458735227584839, + "logps/chosen": -177.2735595703125, + "logps/rejected": -240.51168823242188, + "loss": 0.5167, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.2165385484695435, + "rewards/margins": 0.6424941420555115, + "rewards/rejected": -1.8590329885482788, + "step": 3910 + }, + { + "epoch": 0.68, + "grad_norm": 20.98038793058905, + "learning_rate": 4.1877819236357524e-07, + "logits/chosen": -1.5897353887557983, + "logits/rejected": -1.5237689018249512, + "logps/chosen": -172.70350646972656, + "logps/rejected": -239.7980499267578, + "loss": 0.4887, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.1406900882720947, + "rewards/margins": 0.7530602216720581, + "rewards/rejected": -1.8937504291534424, + "step": 3920 + }, + { + "epoch": 0.68, + "grad_norm": 29.051405565574303, + "learning_rate": 4.182228214206437e-07, + "logits/chosen": -1.5160815715789795, + "logits/rejected": -1.48716139793396, + "logps/chosen": -189.08810424804688, + "logps/rejected": -257.77593994140625, + "loss": 0.5336, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.3786773681640625, + "rewards/margins": 0.6711302399635315, + "rewards/rejected": -2.0498077869415283, + "step": 3930 + }, + { + "epoch": 0.68, + "grad_norm": 32.250601718030964, + "learning_rate": 4.1766592922578527e-07, + "logits/chosen": -1.3783949613571167, + "logits/rejected": -1.3409314155578613, + "logps/chosen": -183.18594360351562, + "logps/rejected": -250.7453155517578, + "loss": 0.5505, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.297050952911377, + "rewards/margins": 0.6737505793571472, + "rewards/rejected": -1.9708013534545898, + "step": 3940 + }, + { + "epoch": 0.68, + "grad_norm": 19.935483029280153, + "learning_rate": 4.1710752081501877e-07, + "logits/chosen": -1.3798249959945679, + "logits/rejected": -1.311702847480774, + "logps/chosen": -178.89566040039062, + "logps/rejected": -250.24734497070312, + "loss": 0.4886, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2341785430908203, + "rewards/margins": 0.7588900327682495, + "rewards/rejected": -1.9930686950683594, + "step": 3950 + }, + { + "epoch": 0.68, + "grad_norm": 33.05051638682265, + "learning_rate": 4.1654760123807464e-07, + "logits/chosen": -1.4223079681396484, + "logits/rejected": -1.3881456851959229, + "logps/chosen": -198.07431030273438, + "logps/rejected": -285.7303771972656, + "loss": 0.4943, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.4536970853805542, + "rewards/margins": 0.8098018765449524, + "rewards/rejected": -2.2634987831115723, + "step": 3960 + }, + { + "epoch": 0.68, + "grad_norm": 27.584592344930318, + "learning_rate": 4.159861755583487e-07, + "logits/chosen": -1.3134465217590332, + "logits/rejected": -1.2678642272949219, + "logps/chosen": -234.09814453125, + "logps/rejected": -308.8127746582031, + "loss": 0.549, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.800374984741211, + "rewards/margins": 0.7578364610671997, + "rewards/rejected": -2.558211326599121, + "step": 3970 + }, + { + "epoch": 0.69, + "grad_norm": 29.469171946700417, + "learning_rate": 4.154232488528566e-07, + "logits/chosen": -1.1992053985595703, + "logits/rejected": -1.1396461725234985, + "logps/chosen": -219.92236328125, + "logps/rejected": -315.6834411621094, + "loss": 0.4724, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.6659343242645264, + "rewards/margins": 0.9590142965316772, + "rewards/rejected": -2.624948501586914, + "step": 3980 + }, + { + "epoch": 0.69, + "grad_norm": 17.97839097611244, + "learning_rate": 4.148588262121877e-07, + "logits/chosen": -1.3333715200424194, + "logits/rejected": -1.299889326095581, + "logps/chosen": -211.5640106201172, + "logps/rejected": -273.6368408203125, + "loss": 0.5775, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.5835427045822144, + "rewards/margins": 0.6002888083457947, + "rewards/rejected": -2.183831214904785, + "step": 3990 + }, + { + "epoch": 0.69, + "grad_norm": 29.741620640645017, + "learning_rate": 4.1429291274045965e-07, + "logits/chosen": -1.5011112689971924, + "logits/rejected": -1.4447122812271118, + "logps/chosen": -197.10702514648438, + "logps/rejected": -261.1085205078125, + "loss": 0.5399, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.399017572402954, + "rewards/margins": 0.6844178438186646, + "rewards/rejected": -2.083435535430908, + "step": 4000 + }, + { + "epoch": 0.69, + "eval_logits/chosen": -1.5105490684509277, + "eval_logits/rejected": -1.486973524093628, + "eval_logps/chosen": -175.73806762695312, + "eval_logps/rejected": -209.60133361816406, + "eval_loss": 0.6212473511695862, + "eval_rewards/accuracies": 0.6598513126373291, + "eval_rewards/chosen": -1.170341968536377, + "eval_rewards/margins": 0.29409757256507874, + "eval_rewards/rejected": -1.4644395112991333, + "eval_runtime": 356.9871, + "eval_samples_per_second": 12.056, + "eval_steps_per_second": 1.507, + "step": 4000 + }, + { + "epoch": 0.69, + "grad_norm": 21.179313854062823, + "learning_rate": 4.137255135552714e-07, + "logits/chosen": -1.3642061948776245, + "logits/rejected": -1.3185532093048096, + "logps/chosen": -176.0836944580078, + "logps/rejected": -256.7080078125, + "loss": 0.5005, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.2807254791259766, + "rewards/margins": 0.7561134099960327, + "rewards/rejected": -2.036839008331299, + "step": 4010 + }, + { + "epoch": 0.69, + "grad_norm": 28.693872888159124, + "learning_rate": 4.131566337876575e-07, + "logits/chosen": -1.3393471240997314, + "logits/rejected": -1.3077666759490967, + "logps/chosen": -198.69711303710938, + "logps/rejected": -269.0760192871094, + "loss": 0.5463, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.4653034210205078, + "rewards/margins": 0.694530189037323, + "rewards/rejected": -2.1598334312438965, + "step": 4020 + }, + { + "epoch": 0.69, + "grad_norm": 20.870224461785025, + "learning_rate": 4.125862785820416e-07, + "logits/chosen": -1.3702881336212158, + "logits/rejected": -1.319515585899353, + "logps/chosen": -193.2422637939453, + "logps/rejected": -268.23193359375, + "loss": 0.5069, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.3722060918807983, + "rewards/margins": 0.752329409122467, + "rewards/rejected": -2.12453556060791, + "step": 4030 + }, + { + "epoch": 0.7, + "grad_norm": 24.444442071597063, + "learning_rate": 4.1201445309618954e-07, + "logits/chosen": -1.4431445598602295, + "logits/rejected": -1.3992760181427002, + "logps/chosen": -196.61453247070312, + "logps/rejected": -275.2479553222656, + "loss": 0.5035, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.4041612148284912, + "rewards/margins": 0.8080703020095825, + "rewards/rejected": -2.212231397628784, + "step": 4040 + }, + { + "epoch": 0.7, + "grad_norm": 20.703451812620813, + "learning_rate": 4.114411625011634e-07, + "logits/chosen": -1.3789803981781006, + "logits/rejected": -1.3350975513458252, + "logps/chosen": -179.94940185546875, + "logps/rejected": -255.5587615966797, + "loss": 0.525, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.2938952445983887, + "rewards/margins": 0.7335586547851562, + "rewards/rejected": -2.027453899383545, + "step": 4050 + }, + { + "epoch": 0.7, + "grad_norm": 18.961262754259714, + "learning_rate": 4.1086641198127404e-07, + "logits/chosen": -1.392407774925232, + "logits/rejected": -1.3395566940307617, + "logps/chosen": -198.05589294433594, + "logps/rejected": -261.8578796386719, + "loss": 0.5609, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.417249321937561, + "rewards/margins": 0.6800889372825623, + "rewards/rejected": -2.0973381996154785, + "step": 4060 + }, + { + "epoch": 0.7, + "grad_norm": 18.757428674130104, + "learning_rate": 4.102902067340348e-07, + "logits/chosen": -1.3935401439666748, + "logits/rejected": -1.3446584939956665, + "logps/chosen": -188.22537231445312, + "logps/rejected": -258.7428283691406, + "loss": 0.5224, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.3439836502075195, + "rewards/margins": 0.7147015333175659, + "rewards/rejected": -2.058685302734375, + "step": 4070 + }, + { + "epoch": 0.7, + "grad_norm": 19.87740623644791, + "learning_rate": 4.0971255197011395e-07, + "logits/chosen": -1.3319361209869385, + "logits/rejected": -1.294301986694336, + "logps/chosen": -182.7200469970703, + "logps/rejected": -261.05523681640625, + "loss": 0.5126, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.3081274032592773, + "rewards/margins": 0.7623058557510376, + "rewards/rejected": -2.0704331398010254, + "step": 4080 + }, + { + "epoch": 0.7, + "grad_norm": 22.371735739500583, + "learning_rate": 4.091334529132881e-07, + "logits/chosen": -1.4664791822433472, + "logits/rejected": -1.404679536819458, + "logps/chosen": -177.88174438476562, + "logps/rejected": -248.9783935546875, + "loss": 0.5139, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.2242028713226318, + "rewards/margins": 0.7204464673995972, + "rewards/rejected": -1.944649338722229, + "step": 4090 + }, + { + "epoch": 0.71, + "grad_norm": 24.372455930646307, + "learning_rate": 4.0855291480039454e-07, + "logits/chosen": -1.3770744800567627, + "logits/rejected": -1.329611897468567, + "logps/chosen": -182.24974060058594, + "logps/rejected": -250.8807830810547, + "loss": 0.5175, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.2940999269485474, + "rewards/margins": 0.722284197807312, + "rewards/rejected": -2.0163843631744385, + "step": 4100 + }, + { + "epoch": 0.71, + "eval_logits/chosen": -1.472186803817749, + "eval_logits/rejected": -1.4476103782653809, + "eval_logps/chosen": -186.34982299804688, + "eval_logps/rejected": -222.20494079589844, + "eval_loss": 0.6203304529190063, + "eval_rewards/accuracies": 0.6554368138313293, + "eval_rewards/chosen": -1.276459813117981, + "eval_rewards/margins": 0.31401583552360535, + "eval_rewards/rejected": -1.5904756784439087, + "eval_runtime": 356.6055, + "eval_samples_per_second": 12.069, + "eval_steps_per_second": 1.509, + "step": 4100 + }, + { + "epoch": 0.71, + "grad_norm": 17.294444254537726, + "learning_rate": 4.079709428812842e-07, + "logits/chosen": -1.3422235250473022, + "logits/rejected": -1.3077037334442139, + "logps/chosen": -201.4602813720703, + "logps/rejected": -255.19070434570312, + "loss": 0.5744, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.4541057348251343, + "rewards/margins": 0.5830581188201904, + "rewards/rejected": -2.0371639728546143, + "step": 4110 + }, + { + "epoch": 0.71, + "grad_norm": 19.72533891211532, + "learning_rate": 4.073875424187739e-07, + "logits/chosen": -1.3486844301223755, + "logits/rejected": -1.3319542407989502, + "logps/chosen": -187.834228515625, + "logps/rejected": -242.30770874023438, + "loss": 0.583, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.3786789178848267, + "rewards/margins": 0.49663081765174866, + "rewards/rejected": -1.875309944152832, + "step": 4120 + }, + { + "epoch": 0.71, + "grad_norm": 17.2804912644492, + "learning_rate": 4.0680271868859906e-07, + "logits/chosen": -1.4753568172454834, + "logits/rejected": -1.4285588264465332, + "logps/chosen": -163.55975341796875, + "logps/rejected": -233.02902221679688, + "loss": 0.4851, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.0960824489593506, + "rewards/margins": 0.7092905640602112, + "rewards/rejected": -1.805373191833496, + "step": 4130 + }, + { + "epoch": 0.71, + "grad_norm": 21.81704208943039, + "learning_rate": 4.0621647697936556e-07, + "logits/chosen": -1.4139468669891357, + "logits/rejected": -1.3735511302947998, + "logps/chosen": -193.77230834960938, + "logps/rejected": -239.8427734375, + "loss": 0.5885, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.3823630809783936, + "rewards/margins": 0.5091473460197449, + "rewards/rejected": -1.8915106058120728, + "step": 4140 + }, + { + "epoch": 0.72, + "grad_norm": 18.457503619554558, + "learning_rate": 4.0562882259250233e-07, + "logits/chosen": -1.4741637706756592, + "logits/rejected": -1.4252352714538574, + "logps/chosen": -182.50404357910156, + "logps/rejected": -247.427734375, + "loss": 0.5226, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.2463964223861694, + "rewards/margins": 0.709705114364624, + "rewards/rejected": -1.956101417541504, + "step": 4150 + }, + { + "epoch": 0.72, + "grad_norm": 19.03179504667782, + "learning_rate": 4.0503976084221323e-07, + "logits/chosen": -1.3726146221160889, + "logits/rejected": -1.3159904479980469, + "logps/chosen": -179.83924865722656, + "logps/rejected": -258.4535217285156, + "loss": 0.4878, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.2819244861602783, + "rewards/margins": 0.8116732835769653, + "rewards/rejected": -2.093597888946533, + "step": 4160 + }, + { + "epoch": 0.72, + "grad_norm": 23.70588198146383, + "learning_rate": 4.044492970554292e-07, + "logits/chosen": -1.374589443206787, + "logits/rejected": -1.3363924026489258, + "logps/chosen": -193.84341430664062, + "logps/rejected": -267.19268798828125, + "loss": 0.5472, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.4245638847351074, + "rewards/margins": 0.711793065071106, + "rewards/rejected": -2.136356830596924, + "step": 4170 + }, + { + "epoch": 0.72, + "grad_norm": 20.355963174810125, + "learning_rate": 4.038574365717594e-07, + "logits/chosen": -1.3285168409347534, + "logits/rejected": -1.2805362939834595, + "logps/chosen": -200.12326049804688, + "logps/rejected": -274.0704650878906, + "loss": 0.5344, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.447908878326416, + "rewards/margins": 0.7298901677131653, + "rewards/rejected": -2.1777987480163574, + "step": 4180 + }, + { + "epoch": 0.72, + "grad_norm": 30.67163449647573, + "learning_rate": 4.0326418474344416e-07, + "logits/chosen": -1.3149698972702026, + "logits/rejected": -1.2749181985855103, + "logps/chosen": -206.71963500976562, + "logps/rejected": -285.58819580078125, + "loss": 0.5258, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.5566787719726562, + "rewards/margins": 0.7740581631660461, + "rewards/rejected": -2.3307368755340576, + "step": 4190 + }, + { + "epoch": 0.72, + "grad_norm": 23.189476713803757, + "learning_rate": 4.0266954693530515e-07, + "logits/chosen": -1.3780596256256104, + "logits/rejected": -1.3424698114395142, + "logps/chosen": -209.67904663085938, + "logps/rejected": -262.9185791015625, + "loss": 0.5803, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.5398088693618774, + "rewards/margins": 0.5638743042945862, + "rewards/rejected": -2.1036829948425293, + "step": 4200 + }, + { + "epoch": 0.72, + "eval_logits/chosen": -1.4580851793289185, + "eval_logits/rejected": -1.4321988821029663, + "eval_logps/chosen": -193.99774169921875, + "eval_logps/rejected": -231.7759552001953, + "eval_loss": 0.6207540035247803, + "eval_rewards/accuracies": 0.6624070405960083, + "eval_rewards/chosen": -1.3529391288757324, + "eval_rewards/margins": 0.33324676752090454, + "eval_rewards/rejected": -1.6861858367919922, + "eval_runtime": 356.9885, + "eval_samples_per_second": 12.056, + "eval_steps_per_second": 1.507, + "step": 4200 + }, + { + "epoch": 0.73, + "grad_norm": 32.38460221933683, + "learning_rate": 4.020735285246979e-07, + "logits/chosen": -1.3975965976715088, + "logits/rejected": -1.355668306350708, + "logps/chosen": -212.5668182373047, + "logps/rejected": -264.63543701171875, + "loss": 0.6133, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.5729354619979858, + "rewards/margins": 0.5456880331039429, + "rewards/rejected": -2.1186232566833496, + "step": 4210 + }, + { + "epoch": 0.73, + "grad_norm": 16.885675503765714, + "learning_rate": 4.014761349014629e-07, + "logits/chosen": -1.3606762886047363, + "logits/rejected": -1.3178844451904297, + "logps/chosen": -178.82691955566406, + "logps/rejected": -241.67929077148438, + "loss": 0.5612, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.2662688493728638, + "rewards/margins": 0.6238974332809448, + "rewards/rejected": -1.8901660442352295, + "step": 4220 + }, + { + "epoch": 0.73, + "grad_norm": 29.792336168407434, + "learning_rate": 4.0087737146787656e-07, + "logits/chosen": -1.587550401687622, + "logits/rejected": -1.5437839031219482, + "logps/chosen": -163.56658935546875, + "logps/rejected": -229.27224731445312, + "loss": 0.5343, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.082904577255249, + "rewards/margins": 0.6688292622566223, + "rewards/rejected": -1.7517340183258057, + "step": 4230 + }, + { + "epoch": 0.73, + "grad_norm": 20.765107873436467, + "learning_rate": 4.002772436386027e-07, + "logits/chosen": -1.5118169784545898, + "logits/rejected": -1.4638663530349731, + "logps/chosen": -155.6704559326172, + "logps/rejected": -229.45620727539062, + "loss": 0.518, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.009352207183838, + "rewards/margins": 0.7259235382080078, + "rewards/rejected": -1.7352758646011353, + "step": 4240 + }, + { + "epoch": 0.73, + "grad_norm": 23.96242082772077, + "learning_rate": 3.9967575684064367e-07, + "logits/chosen": -1.4785500764846802, + "logits/rejected": -1.4373469352722168, + "logps/chosen": -159.1673126220703, + "logps/rejected": -217.58468627929688, + "loss": 0.5303, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.0413461923599243, + "rewards/margins": 0.6081751585006714, + "rewards/rejected": -1.6495214700698853, + "step": 4250 + }, + { + "epoch": 0.73, + "grad_norm": 24.144969976617194, + "learning_rate": 3.990729165132907e-07, + "logits/chosen": -1.4406192302703857, + "logits/rejected": -1.4052913188934326, + "logps/chosen": -160.1890106201172, + "logps/rejected": -228.3748779296875, + "loss": 0.544, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.0565812587738037, + "rewards/margins": 0.679029643535614, + "rewards/rejected": -1.7356109619140625, + "step": 4260 + }, + { + "epoch": 0.74, + "grad_norm": 24.358604007282633, + "learning_rate": 3.984687281080754e-07, + "logits/chosen": -1.3951603174209595, + "logits/rejected": -1.3441218137741089, + "logps/chosen": -164.6576690673828, + "logps/rejected": -230.2593536376953, + "loss": 0.5264, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.1306527853012085, + "rewards/margins": 0.6664345860481262, + "rewards/rejected": -1.79708731174469, + "step": 4270 + }, + { + "epoch": 0.74, + "grad_norm": 24.378376134460726, + "learning_rate": 3.978631970887201e-07, + "logits/chosen": -1.4013197422027588, + "logits/rejected": -1.3541449308395386, + "logps/chosen": -162.7399139404297, + "logps/rejected": -237.9474639892578, + "loss": 0.5066, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.1103148460388184, + "rewards/margins": 0.7485235333442688, + "rewards/rejected": -1.858838677406311, + "step": 4280 + }, + { + "epoch": 0.74, + "grad_norm": 26.74832111588032, + "learning_rate": 3.972563289310882e-07, + "logits/chosen": -1.3995485305786133, + "logits/rejected": -1.348487138748169, + "logps/chosen": -180.16856384277344, + "logps/rejected": -255.4312286376953, + "loss": 0.5467, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.2493109703063965, + "rewards/margins": 0.7890298962593079, + "rewards/rejected": -2.0383410453796387, + "step": 4290 + }, + { + "epoch": 0.74, + "grad_norm": 15.729787495443901, + "learning_rate": 3.9664812912313533e-07, + "logits/chosen": -1.4865190982818604, + "logits/rejected": -1.4452247619628906, + "logps/chosen": -155.62908935546875, + "logps/rejected": -227.11245727539062, + "loss": 0.507, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.0309059619903564, + "rewards/margins": 0.7097223997116089, + "rewards/rejected": -1.7406282424926758, + "step": 4300 + }, + { + "epoch": 0.74, + "eval_logits/chosen": -1.5954647064208984, + "eval_logits/rejected": -1.5738048553466797, + "eval_logps/chosen": -152.3179931640625, + "eval_logps/rejected": -181.7826385498047, + "eval_loss": 0.6264519095420837, + "eval_rewards/accuracies": 0.6624070405960083, + "eval_rewards/chosen": -0.9361413717269897, + "eval_rewards/margins": 0.25011131167411804, + "eval_rewards/rejected": -1.1862527132034302, + "eval_runtime": 356.8364, + "eval_samples_per_second": 12.062, + "eval_steps_per_second": 1.508, + "step": 4300 + }, + { + "epoch": 0.74, + "grad_norm": 13.768891932897395, + "learning_rate": 3.9603860316485925e-07, + "logits/chosen": -1.418806791305542, + "logits/rejected": -1.3766006231307983, + "logps/chosen": -162.57728576660156, + "logps/rejected": -218.0735321044922, + "loss": 0.5448, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.0851390361785889, + "rewards/margins": 0.5797218084335327, + "rewards/rejected": -1.664860725402832, + "step": 4310 + }, + { + "epoch": 0.74, + "grad_norm": 14.715075236823548, + "learning_rate": 3.9542775656825e-07, + "logits/chosen": -1.4987797737121582, + "logits/rejected": -1.4415086507797241, + "logps/chosen": -172.37828063964844, + "logps/rejected": -245.30239868164062, + "loss": 0.4709, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.1376218795776367, + "rewards/margins": 0.7687402963638306, + "rewards/rejected": -1.9063619375228882, + "step": 4320 + }, + { + "epoch": 0.75, + "grad_norm": 28.706374504499472, + "learning_rate": 3.948155948572405e-07, + "logits/chosen": -1.3579802513122559, + "logits/rejected": -1.3000389337539673, + "logps/chosen": -183.6754150390625, + "logps/rejected": -248.2251739501953, + "loss": 0.5221, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.2819044589996338, + "rewards/margins": 0.702656626701355, + "rewards/rejected": -1.9845609664916992, + "step": 4330 + }, + { + "epoch": 0.75, + "grad_norm": 21.368421423428487, + "learning_rate": 3.9420212356765606e-07, + "logits/chosen": -1.3122832775115967, + "logits/rejected": -1.2653281688690186, + "logps/chosen": -180.68899536132812, + "logps/rejected": -259.26025390625, + "loss": 0.5476, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.3169108629226685, + "rewards/margins": 0.7710382342338562, + "rewards/rejected": -2.08794903755188, + "step": 4340 + }, + { + "epoch": 0.75, + "grad_norm": 21.538559584335715, + "learning_rate": 3.93587348247165e-07, + "logits/chosen": -1.3758046627044678, + "logits/rejected": -1.337914228439331, + "logps/chosen": -167.84469604492188, + "logps/rejected": -237.70242309570312, + "loss": 0.5158, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.1684411764144897, + "rewards/margins": 0.6767427921295166, + "rewards/rejected": -1.8451837301254272, + "step": 4350 + }, + { + "epoch": 0.75, + "grad_norm": 16.11283805818009, + "learning_rate": 3.929712744552278e-07, + "logits/chosen": -1.412389874458313, + "logits/rejected": -1.356400728225708, + "logps/chosen": -176.62753295898438, + "logps/rejected": -245.2499542236328, + "loss": 0.5299, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2295823097229004, + "rewards/margins": 0.7066112160682678, + "rewards/rejected": -1.9361934661865234, + "step": 4360 + }, + { + "epoch": 0.75, + "grad_norm": 30.246128714363632, + "learning_rate": 3.923539077630471e-07, + "logits/chosen": -1.3993642330169678, + "logits/rejected": -1.3563659191131592, + "logps/chosen": -184.28158569335938, + "logps/rejected": -247.9432373046875, + "loss": 0.5544, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.293311357498169, + "rewards/margins": 0.6382697820663452, + "rewards/rejected": -1.9315814971923828, + "step": 4370 + }, + { + "epoch": 0.75, + "grad_norm": 25.81059610250568, + "learning_rate": 3.917352537535176e-07, + "logits/chosen": -1.4071307182312012, + "logits/rejected": -1.356684923171997, + "logps/chosen": -182.4822998046875, + "logps/rejected": -258.318359375, + "loss": 0.5202, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.281593918800354, + "rewards/margins": 0.7798604369163513, + "rewards/rejected": -2.0614542961120605, + "step": 4380 + }, + { + "epoch": 0.76, + "grad_norm": 21.811436240135972, + "learning_rate": 3.91115318021175e-07, + "logits/chosen": -1.336089849472046, + "logits/rejected": -1.291550874710083, + "logps/chosen": -188.70346069335938, + "logps/rejected": -266.1550598144531, + "loss": 0.5067, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.3547266721725464, + "rewards/margins": 0.7720333337783813, + "rewards/rejected": -2.126760244369507, + "step": 4390 + }, + { + "epoch": 0.76, + "grad_norm": 25.759431483166495, + "learning_rate": 3.9049410617214607e-07, + "logits/chosen": -1.3443093299865723, + "logits/rejected": -1.2999963760375977, + "logps/chosen": -194.36892700195312, + "logps/rejected": -274.6505432128906, + "loss": 0.5273, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.3747164011001587, + "rewards/margins": 0.8176537752151489, + "rewards/rejected": -2.1923701763153076, + "step": 4400 + }, + { + "epoch": 0.76, + "eval_logits/chosen": -1.4307539463043213, + "eval_logits/rejected": -1.404834270477295, + "eval_logps/chosen": -185.88987731933594, + "eval_logps/rejected": -224.0266876220703, + "eval_loss": 0.6210964918136597, + "eval_rewards/accuracies": 0.6686803102493286, + "eval_rewards/chosen": -1.2718603610992432, + "eval_rewards/margins": 0.3368328809738159, + "eval_rewards/rejected": -1.608693242073059, + "eval_runtime": 356.7436, + "eval_samples_per_second": 12.065, + "eval_steps_per_second": 1.508, + "step": 4400 + }, + { + "epoch": 0.76, + "grad_norm": 19.840813891153072, + "learning_rate": 3.898716238240971e-07, + "logits/chosen": -1.3299553394317627, + "logits/rejected": -1.289876103401184, + "logps/chosen": -192.49099731445312, + "logps/rejected": -250.6361541748047, + "loss": 0.5987, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.3760064840316772, + "rewards/margins": 0.5802738666534424, + "rewards/rejected": -1.9562803506851196, + "step": 4410 + }, + { + "epoch": 0.76, + "grad_norm": 26.035856678795522, + "learning_rate": 3.892478766061841e-07, + "logits/chosen": -1.489180564880371, + "logits/rejected": -1.4286963939666748, + "logps/chosen": -172.96762084960938, + "logps/rejected": -225.233154296875, + "loss": 0.5652, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.1818801164627075, + "rewards/margins": 0.5658958554267883, + "rewards/rejected": -1.7477757930755615, + "step": 4420 + }, + { + "epoch": 0.76, + "grad_norm": 25.167741330404056, + "learning_rate": 3.886228701590011e-07, + "logits/chosen": -1.4246338605880737, + "logits/rejected": -1.3719749450683594, + "logps/chosen": -154.06051635742188, + "logps/rejected": -209.6873321533203, + "loss": 0.5631, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.0029816627502441, + "rewards/margins": 0.5808738470077515, + "rewards/rejected": -1.583855390548706, + "step": 4430 + }, + { + "epoch": 0.76, + "grad_norm": 17.10714671684354, + "learning_rate": 3.8799661013452955e-07, + "logits/chosen": -1.485050916671753, + "logits/rejected": -1.4327274560928345, + "logps/chosen": -173.7892608642578, + "logps/rejected": -240.88101196289062, + "loss": 0.5115, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.1438651084899902, + "rewards/margins": 0.7148648500442505, + "rewards/rejected": -1.8587299585342407, + "step": 4440 + }, + { + "epoch": 0.77, + "grad_norm": 19.220029089043173, + "learning_rate": 3.8736910219608705e-07, + "logits/chosen": -1.3361194133758545, + "logits/rejected": -1.2997193336486816, + "logps/chosen": -164.47987365722656, + "logps/rejected": -231.0243682861328, + "loss": 0.5257, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.0879555940628052, + "rewards/margins": 0.6804584264755249, + "rewards/rejected": -1.7684139013290405, + "step": 4450 + }, + { + "epoch": 0.77, + "grad_norm": 21.93245110911694, + "learning_rate": 3.8674035201827626e-07, + "logits/chosen": -1.4222412109375, + "logits/rejected": -1.387459635734558, + "logps/chosen": -174.14901733398438, + "logps/rejected": -240.2296600341797, + "loss": 0.5475, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.2142760753631592, + "rewards/margins": 0.6618901491165161, + "rewards/rejected": -1.8761663436889648, + "step": 4460 + }, + { + "epoch": 0.77, + "grad_norm": 26.334535864969112, + "learning_rate": 3.861103652869334e-07, + "logits/chosen": -1.4492603540420532, + "logits/rejected": -1.3905606269836426, + "logps/chosen": -184.28909301757812, + "logps/rejected": -254.66085815429688, + "loss": 0.5201, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.2838647365570068, + "rewards/margins": 0.7604522109031677, + "rewards/rejected": -2.0443172454833984, + "step": 4470 + }, + { + "epoch": 0.77, + "grad_norm": 41.57134948033417, + "learning_rate": 3.8547914769907705e-07, + "logits/chosen": -1.4375700950622559, + "logits/rejected": -1.3990795612335205, + "logps/chosen": -193.02252197265625, + "logps/rejected": -266.1149597167969, + "loss": 0.5628, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.4204905033111572, + "rewards/margins": 0.7187623977661133, + "rewards/rejected": -2.1392529010772705, + "step": 4480 + }, + { + "epoch": 0.77, + "grad_norm": 28.64256495751411, + "learning_rate": 3.848467049628564e-07, + "logits/chosen": -1.317628264427185, + "logits/rejected": -1.2681446075439453, + "logps/chosen": -187.17874145507812, + "logps/rejected": -251.03970336914062, + "loss": 0.531, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.3606078624725342, + "rewards/margins": 0.6513209939002991, + "rewards/rejected": -2.0119290351867676, + "step": 4490 + }, + { + "epoch": 0.78, + "grad_norm": 12.948645388897265, + "learning_rate": 3.8421304279749983e-07, + "logits/chosen": -1.3421502113342285, + "logits/rejected": -1.2936899662017822, + "logps/chosen": -180.89065551757812, + "logps/rejected": -251.9108428955078, + "loss": 0.5574, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.2455495595932007, + "rewards/margins": 0.7440658211708069, + "rewards/rejected": -1.9896152019500732, + "step": 4500 + }, + { + "epoch": 0.78, + "eval_logits/chosen": -1.4964402914047241, + "eval_logits/rejected": -1.472887396812439, + "eval_logps/chosen": -169.3536376953125, + "eval_logps/rejected": -203.17874145507812, + "eval_loss": 0.6233484745025635, + "eval_rewards/accuracies": 0.6670538783073425, + "eval_rewards/chosen": -1.1064980030059814, + "eval_rewards/margins": 0.29371556639671326, + "eval_rewards/rejected": -1.400213599205017, + "eval_runtime": 356.7428, + "eval_samples_per_second": 12.065, + "eval_steps_per_second": 1.508, + "step": 4500 + }, + { + "epoch": 0.78, + "grad_norm": 17.509248366329857, + "learning_rate": 3.8357816693326314e-07, + "logits/chosen": -1.487713098526001, + "logits/rejected": -1.4337613582611084, + "logps/chosen": -171.7799835205078, + "logps/rejected": -247.6623992919922, + "loss": 0.5203, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1836849451065063, + "rewards/margins": 0.7367376089096069, + "rewards/rejected": -1.9204223155975342, + "step": 4510 + }, + { + "epoch": 0.78, + "grad_norm": 21.475335389561995, + "learning_rate": 3.829420831113775e-07, + "logits/chosen": -1.4706519842147827, + "logits/rejected": -1.422728180885315, + "logps/chosen": -176.04486083984375, + "logps/rejected": -242.8605499267578, + "loss": 0.5317, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.234581708908081, + "rewards/margins": 0.6933325529098511, + "rewards/rejected": -1.9279142618179321, + "step": 4520 + }, + { + "epoch": 0.78, + "grad_norm": 30.044462081498253, + "learning_rate": 3.823047970839981e-07, + "logits/chosen": -1.4337480068206787, + "logits/rejected": -1.4001357555389404, + "logps/chosen": -168.0045623779297, + "logps/rejected": -224.1116943359375, + "loss": 0.5591, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.1422946453094482, + "rewards/margins": 0.564059317111969, + "rewards/rejected": -1.7063539028167725, + "step": 4530 + }, + { + "epoch": 0.78, + "grad_norm": 32.08416286753465, + "learning_rate": 3.816663146141514e-07, + "logits/chosen": -1.321825623512268, + "logits/rejected": -1.2757227420806885, + "logps/chosen": -175.95849609375, + "logps/rejected": -248.03097534179688, + "loss": 0.5095, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.232386827468872, + "rewards/margins": 0.7383901476860046, + "rewards/rejected": -1.970776915550232, + "step": 4540 + }, + { + "epoch": 0.78, + "grad_norm": 21.22323203264765, + "learning_rate": 3.810266414756836e-07, + "logits/chosen": -1.3958414793014526, + "logits/rejected": -1.344481110572815, + "logps/chosen": -177.6934051513672, + "logps/rejected": -246.7960205078125, + "loss": 0.5158, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.2130842208862305, + "rewards/margins": 0.7209616899490356, + "rewards/rejected": -1.9340457916259766, + "step": 4550 + }, + { + "epoch": 0.79, + "grad_norm": 21.245086858763795, + "learning_rate": 3.803857834532081e-07, + "logits/chosen": -1.2998394966125488, + "logits/rejected": -1.2284823656082153, + "logps/chosen": -185.28271484375, + "logps/rejected": -263.3417663574219, + "loss": 0.5029, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.2907075881958008, + "rewards/margins": 0.8120514154434204, + "rewards/rejected": -2.1027588844299316, + "step": 4560 + }, + { + "epoch": 0.79, + "grad_norm": 30.48524469504423, + "learning_rate": 3.797437463420534e-07, + "logits/chosen": -1.3093476295471191, + "logits/rejected": -1.259817361831665, + "logps/chosen": -194.9139862060547, + "logps/rejected": -268.6426696777344, + "loss": 0.5509, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.4381399154663086, + "rewards/margins": 0.7308866381645203, + "rewards/rejected": -2.1690266132354736, + "step": 4570 + }, + { + "epoch": 0.79, + "grad_norm": 20.59369730212467, + "learning_rate": 3.791005359482106e-07, + "logits/chosen": -1.3296152353286743, + "logits/rejected": -1.28184175491333, + "logps/chosen": -157.950439453125, + "logps/rejected": -213.4839630126953, + "loss": 0.5544, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.037023901939392, + "rewards/margins": 0.5802245736122131, + "rewards/rejected": -1.617248296737671, + "step": 4580 + }, + { + "epoch": 0.79, + "grad_norm": 21.05628740715068, + "learning_rate": 3.784561580882806e-07, + "logits/chosen": -1.4657633304595947, + "logits/rejected": -1.4152452945709229, + "logps/chosen": -169.1389617919922, + "logps/rejected": -219.05465698242188, + "loss": 0.6103, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.1427037715911865, + "rewards/margins": 0.5260920524597168, + "rewards/rejected": -1.6687958240509033, + "step": 4590 + }, + { + "epoch": 0.79, + "grad_norm": 22.60134182003666, + "learning_rate": 3.778106185894221e-07, + "logits/chosen": -1.3957931995391846, + "logits/rejected": -1.3415305614471436, + "logps/chosen": -159.36795043945312, + "logps/rejected": -236.89773559570312, + "loss": 0.4819, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.0599912405014038, + "rewards/margins": 0.7656328082084656, + "rewards/rejected": -1.8256241083145142, + "step": 4600 + }, + { + "epoch": 0.79, + "eval_logits/chosen": -1.525081992149353, + "eval_logits/rejected": -1.501688003540039, + "eval_logps/chosen": -169.0588836669922, + "eval_logps/rejected": -203.32528686523438, + "eval_loss": 0.6219184994697571, + "eval_rewards/accuracies": 0.6642658114433289, + "eval_rewards/chosen": -1.1035504341125488, + "eval_rewards/margins": 0.298128604888916, + "eval_rewards/rejected": -1.4016790390014648, + "eval_runtime": 356.8595, + "eval_samples_per_second": 12.061, + "eval_steps_per_second": 1.508, + "step": 4600 + }, + { + "epoch": 0.79, + "grad_norm": 25.140522968467, + "learning_rate": 3.771639232892986e-07, + "logits/chosen": -1.3437252044677734, + "logits/rejected": -1.3191344738006592, + "logps/chosen": -189.2852325439453, + "logps/rejected": -238.8382110595703, + "loss": 0.6115, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3813140392303467, + "rewards/margins": 0.4978283941745758, + "rewards/rejected": -1.8791425228118896, + "step": 4610 + }, + { + "epoch": 0.8, + "grad_norm": 21.229639253980007, + "learning_rate": 3.765160780360254e-07, + "logits/chosen": -1.3881046772003174, + "logits/rejected": -1.3339178562164307, + "logps/chosen": -180.4025421142578, + "logps/rejected": -259.890380859375, + "loss": 0.5085, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.2468607425689697, + "rewards/margins": 0.7971670627593994, + "rewards/rejected": -2.0440280437469482, + "step": 4620 + }, + { + "epoch": 0.8, + "grad_norm": 30.53629505024118, + "learning_rate": 3.75867088688117e-07, + "logits/chosen": -1.3791451454162598, + "logits/rejected": -1.3137580156326294, + "logps/chosen": -209.33139038085938, + "logps/rejected": -286.5825500488281, + "loss": 0.5159, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.5205762386322021, + "rewards/margins": 0.7794795036315918, + "rewards/rejected": -2.300055742263794, + "step": 4630 + }, + { + "epoch": 0.8, + "grad_norm": 26.67530435659893, + "learning_rate": 3.7521696111443413e-07, + "logits/chosen": -1.3778386116027832, + "logits/rejected": -1.341675043106079, + "logps/chosen": -217.1299285888672, + "logps/rejected": -284.71978759765625, + "loss": 0.5664, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.6237319707870483, + "rewards/margins": 0.6682808995246887, + "rewards/rejected": -2.2920126914978027, + "step": 4640 + }, + { + "epoch": 0.8, + "grad_norm": 36.07331886816096, + "learning_rate": 3.7456570119413034e-07, + "logits/chosen": -1.413480520248413, + "logits/rejected": -1.3600969314575195, + "logps/chosen": -189.5355987548828, + "logps/rejected": -251.88204956054688, + "loss": 0.5552, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.3538799285888672, + "rewards/margins": 0.6571952700614929, + "rewards/rejected": -2.011075496673584, + "step": 4650 + }, + { + "epoch": 0.8, + "grad_norm": 19.411023629464214, + "learning_rate": 3.739133148165994e-07, + "logits/chosen": -1.4477910995483398, + "logits/rejected": -1.4069766998291016, + "logps/chosen": -176.53195190429688, + "logps/rejected": -239.30844116210938, + "loss": 0.5337, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.2224671840667725, + "rewards/margins": 0.62762051820755, + "rewards/rejected": -1.8500875234603882, + "step": 4660 + }, + { + "epoch": 0.8, + "grad_norm": 22.362534471168633, + "learning_rate": 3.7325980788142146e-07, + "logits/chosen": -1.4072405099868774, + "logits/rejected": -1.357230305671692, + "logps/chosen": -165.07376098632812, + "logps/rejected": -237.02816772460938, + "loss": 0.4906, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.1267932653427124, + "rewards/margins": 0.7528173327445984, + "rewards/rejected": -1.8796107769012451, + "step": 4670 + }, + { + "epoch": 0.81, + "grad_norm": 25.270149260749562, + "learning_rate": 3.726051862983101e-07, + "logits/chosen": -1.3556668758392334, + "logits/rejected": -1.3043700456619263, + "logps/chosen": -190.7840576171875, + "logps/rejected": -255.9459686279297, + "loss": 0.5537, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.3830840587615967, + "rewards/margins": 0.6596297025680542, + "rewards/rejected": -2.0427136421203613, + "step": 4680 + }, + { + "epoch": 0.81, + "grad_norm": 24.777840014526955, + "learning_rate": 3.7194945598705864e-07, + "logits/chosen": -1.3643500804901123, + "logits/rejected": -1.3103562593460083, + "logps/chosen": -213.9306182861328, + "logps/rejected": -301.82757568359375, + "loss": 0.5075, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.5816562175750732, + "rewards/margins": 0.8936996459960938, + "rewards/rejected": -2.475355625152588, + "step": 4690 + }, + { + "epoch": 0.81, + "grad_norm": 35.9367234479366, + "learning_rate": 3.712926228774868e-07, + "logits/chosen": -1.298680067062378, + "logits/rejected": -1.252151608467102, + "logps/chosen": -218.3412322998047, + "logps/rejected": -308.64752197265625, + "loss": 0.5187, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.6659730672836304, + "rewards/margins": 0.8636065721511841, + "rewards/rejected": -2.5295798778533936, + "step": 4700 + }, + { + "epoch": 0.81, + "eval_logits/chosen": -1.3934706449508667, + "eval_logits/rejected": -1.3670498132705688, + "eval_logps/chosen": -205.2917938232422, + "eval_logps/rejected": -246.5410614013672, + "eval_loss": 0.6171659231185913, + "eval_rewards/accuracies": 0.6654275059700012, + "eval_rewards/chosen": -1.4658793210983276, + "eval_rewards/margins": 0.367957204580307, + "eval_rewards/rejected": -1.833836555480957, + "eval_runtime": 356.7771, + "eval_samples_per_second": 12.064, + "eval_steps_per_second": 1.508, + "step": 4700 + }, + { + "epoch": 0.81, + "grad_norm": 32.393157890217395, + "learning_rate": 3.7063469290938696e-07, + "logits/chosen": -1.3531776666641235, + "logits/rejected": -1.3033009767532349, + "logps/chosen": -212.64096069335938, + "logps/rejected": -282.6040954589844, + "loss": 0.5413, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.546820044517517, + "rewards/margins": 0.7553335428237915, + "rewards/rejected": -2.3021538257598877, + "step": 4710 + }, + { + "epoch": 0.81, + "grad_norm": 21.02352487673719, + "learning_rate": 3.699756720324706e-07, + "logits/chosen": -1.2925106287002563, + "logits/rejected": -1.2351093292236328, + "logps/chosen": -197.33441162109375, + "logps/rejected": -281.5450439453125, + "loss": 0.4884, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.405700922012329, + "rewards/margins": 0.8855878710746765, + "rewards/rejected": -2.2912888526916504, + "step": 4720 + }, + { + "epoch": 0.81, + "grad_norm": 26.71400422066647, + "learning_rate": 3.693155662063141e-07, + "logits/chosen": -1.2681843042373657, + "logits/rejected": -1.219074010848999, + "logps/chosen": -199.86378479003906, + "logps/rejected": -269.1332092285156, + "loss": 0.5673, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.4612171649932861, + "rewards/margins": 0.7179456353187561, + "rewards/rejected": -2.1791629791259766, + "step": 4730 + }, + { + "epoch": 0.82, + "grad_norm": 16.242282256503056, + "learning_rate": 3.686543814003053e-07, + "logits/chosen": -1.3467975854873657, + "logits/rejected": -1.3040544986724854, + "logps/chosen": -180.89808654785156, + "logps/rejected": -271.9189453125, + "loss": 0.4748, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.2646945714950562, + "rewards/margins": 0.9159406423568726, + "rewards/rejected": -2.1806349754333496, + "step": 4740 + }, + { + "epoch": 0.82, + "grad_norm": 22.35210107859672, + "learning_rate": 3.6799212359358933e-07, + "logits/chosen": -1.2919436693191528, + "logits/rejected": -1.2553608417510986, + "logps/chosen": -206.98501586914062, + "logps/rejected": -273.5370788574219, + "loss": 0.54, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.5159308910369873, + "rewards/margins": 0.673417866230011, + "rewards/rejected": -2.1893489360809326, + "step": 4750 + }, + { + "epoch": 0.82, + "grad_norm": 29.45848931001129, + "learning_rate": 3.6732879877501453e-07, + "logits/chosen": -1.2655035257339478, + "logits/rejected": -1.2089664936065674, + "logps/chosen": -215.909912109375, + "logps/rejected": -307.30560302734375, + "loss": 0.4823, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.6345545053482056, + "rewards/margins": 0.9228925704956055, + "rewards/rejected": -2.5574469566345215, + "step": 4760 + }, + { + "epoch": 0.82, + "grad_norm": 19.064566618433858, + "learning_rate": 3.666644129430784e-07, + "logits/chosen": -1.3485455513000488, + "logits/rejected": -1.3007423877716064, + "logps/chosen": -227.8409423828125, + "logps/rejected": -299.4037170410156, + "loss": 0.5629, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.6823813915252686, + "rewards/margins": 0.7599068284034729, + "rewards/rejected": -2.4422881603240967, + "step": 4770 + }, + { + "epoch": 0.82, + "grad_norm": 22.624642669624826, + "learning_rate": 3.65998972105873e-07, + "logits/chosen": -1.280133605003357, + "logits/rejected": -1.228562593460083, + "logps/chosen": -196.01754760742188, + "logps/rejected": -289.71990966796875, + "loss": 0.461, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.4455887079238892, + "rewards/margins": 0.912562370300293, + "rewards/rejected": -2.3581509590148926, + "step": 4780 + }, + { + "epoch": 0.83, + "grad_norm": 17.630300335409032, + "learning_rate": 3.6533248228103114e-07, + "logits/chosen": -1.3750900030136108, + "logits/rejected": -1.324573278427124, + "logps/chosen": -203.55032348632812, + "logps/rejected": -268.24688720703125, + "loss": 0.528, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.4577271938323975, + "rewards/margins": 0.6970139741897583, + "rewards/rejected": -2.154741048812866, + "step": 4790 + }, + { + "epoch": 0.83, + "grad_norm": 22.323338607470507, + "learning_rate": 3.646649494956717e-07, + "logits/chosen": -1.3112070560455322, + "logits/rejected": -1.2691413164138794, + "logps/chosen": -205.2784881591797, + "logps/rejected": -268.1100769042969, + "loss": 0.5805, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.5329288244247437, + "rewards/margins": 0.6451320648193359, + "rewards/rejected": -2.178061008453369, + "step": 4800 + }, + { + "epoch": 0.83, + "eval_logits/chosen": -1.445318341255188, + "eval_logits/rejected": -1.4196213483810425, + "eval_logps/chosen": -201.05026245117188, + "eval_logps/rejected": -241.25576782226562, + "eval_loss": 0.6145854592323303, + "eval_rewards/accuracies": 0.6619423627853394, + "eval_rewards/chosen": -1.423464059829712, + "eval_rewards/margins": 0.35751983523368835, + "eval_rewards/rejected": -1.780983805656433, + "eval_runtime": 356.6955, + "eval_samples_per_second": 12.066, + "eval_steps_per_second": 1.508, + "step": 4800 + }, + { + "epoch": 0.83, + "grad_norm": 23.1783795567982, + "learning_rate": 3.6399637978634497e-07, + "logits/chosen": -1.2973178625106812, + "logits/rejected": -1.234431505203247, + "logps/chosen": -206.18838500976562, + "logps/rejected": -280.7757263183594, + "loss": 0.5075, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.4898689985275269, + "rewards/margins": 0.7940649390220642, + "rewards/rejected": -2.2839341163635254, + "step": 4810 + }, + { + "epoch": 0.83, + "grad_norm": 20.941205872084087, + "learning_rate": 3.6332677919897823e-07, + "logits/chosen": -1.330582857131958, + "logits/rejected": -1.2920982837677002, + "logps/chosen": -202.64508056640625, + "logps/rejected": -277.89532470703125, + "loss": 0.5228, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.514899492263794, + "rewards/margins": 0.7458511590957642, + "rewards/rejected": -2.2607505321502686, + "step": 4820 + }, + { + "epoch": 0.83, + "grad_norm": 20.30033885452288, + "learning_rate": 3.626561537888214e-07, + "logits/chosen": -1.3852955102920532, + "logits/rejected": -1.3380589485168457, + "logps/chosen": -196.2224884033203, + "logps/rejected": -265.86505126953125, + "loss": 0.5619, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.426672101020813, + "rewards/margins": 0.7057501673698425, + "rewards/rejected": -2.1324222087860107, + "step": 4830 + }, + { + "epoch": 0.83, + "grad_norm": 28.339668690193722, + "learning_rate": 3.6198450962039146e-07, + "logits/chosen": -1.3548475503921509, + "logits/rejected": -1.2958180904388428, + "logps/chosen": -199.49864196777344, + "logps/rejected": -276.6757507324219, + "loss": 0.5025, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.4332863092422485, + "rewards/margins": 0.809424102306366, + "rewards/rejected": -2.2427103519439697, + "step": 4840 + }, + { + "epoch": 0.84, + "grad_norm": 24.489009584175815, + "learning_rate": 3.6131185276741846e-07, + "logits/chosen": -1.4219049215316772, + "logits/rejected": -1.377803087234497, + "logps/chosen": -193.35806274414062, + "logps/rejected": -261.9024353027344, + "loss": 0.5377, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.392596960067749, + "rewards/margins": 0.6902645826339722, + "rewards/rejected": -2.0828614234924316, + "step": 4850 + }, + { + "epoch": 0.84, + "grad_norm": 23.049416952706352, + "learning_rate": 3.6063818931278997e-07, + "logits/chosen": -1.438050627708435, + "logits/rejected": -1.3838953971862793, + "logps/chosen": -196.03994750976562, + "logps/rejected": -255.8987579345703, + "loss": 0.5318, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.3760112524032593, + "rewards/margins": 0.6772761344909668, + "rewards/rejected": -2.0532872676849365, + "step": 4860 + }, + { + "epoch": 0.84, + "grad_norm": 27.95119723700728, + "learning_rate": 3.599635253484967e-07, + "logits/chosen": -1.458106279373169, + "logits/rejected": -1.4050066471099854, + "logps/chosen": -196.61280822753906, + "logps/rejected": -271.3414611816406, + "loss": 0.5218, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.3915178775787354, + "rewards/margins": 0.8070972561836243, + "rewards/rejected": -2.1986148357391357, + "step": 4870 + }, + { + "epoch": 0.84, + "grad_norm": 23.334821262369495, + "learning_rate": 3.592878669755767e-07, + "logits/chosen": -1.3905763626098633, + "logits/rejected": -1.33687424659729, + "logps/chosen": -179.8762664794922, + "logps/rejected": -240.846923828125, + "loss": 0.5348, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.26702082157135, + "rewards/margins": 0.6355078220367432, + "rewards/rejected": -1.9025285243988037, + "step": 4880 + }, + { + "epoch": 0.84, + "grad_norm": 20.651381347651178, + "learning_rate": 3.586112203040607e-07, + "logits/chosen": -1.4436790943145752, + "logits/rejected": -1.3931138515472412, + "logps/chosen": -185.76181030273438, + "logps/rejected": -262.454345703125, + "loss": 0.5051, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.3033283948898315, + "rewards/margins": 0.785824179649353, + "rewards/rejected": -2.0891528129577637, + "step": 4890 + }, + { + "epoch": 0.84, + "grad_norm": 14.620800977419114, + "learning_rate": 3.5793359145291665e-07, + "logits/chosen": -1.4301611185073853, + "logits/rejected": -1.3711490631103516, + "logps/chosen": -181.84730529785156, + "logps/rejected": -252.3798065185547, + "loss": 0.537, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.2864367961883545, + "rewards/margins": 0.7350460886955261, + "rewards/rejected": -2.0214829444885254, + "step": 4900 + }, + { + "epoch": 0.84, + "eval_logits/chosen": -1.5460282564163208, + "eval_logits/rejected": -1.5222222805023193, + "eval_logps/chosen": -179.59288024902344, + "eval_logps/rejected": -214.940185546875, + "eval_loss": 0.6194379925727844, + "eval_rewards/accuracies": 0.6556691527366638, + "eval_rewards/chosen": -1.2088903188705444, + "eval_rewards/margins": 0.30893754959106445, + "eval_rewards/rejected": -1.517828106880188, + "eval_runtime": 356.6033, + "eval_samples_per_second": 12.069, + "eval_steps_per_second": 1.509, + "step": 4900 + }, + { + "epoch": 0.85, + "grad_norm": 18.773623561195404, + "learning_rate": 3.5725498654999436e-07, + "logits/chosen": -1.572040319442749, + "logits/rejected": -1.511036992073059, + "logps/chosen": -181.8535919189453, + "logps/rejected": -260.3033142089844, + "loss": 0.5114, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2552320957183838, + "rewards/margins": 0.8234399557113647, + "rewards/rejected": -2.078671932220459, + "step": 4910 + }, + { + "epoch": 0.85, + "grad_norm": 25.741760898047566, + "learning_rate": 3.5657541173197025e-07, + "logits/chosen": -1.3761519193649292, + "logits/rejected": -1.3280622959136963, + "logps/chosen": -193.3008575439453, + "logps/rejected": -278.3187561035156, + "loss": 0.494, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.394696593284607, + "rewards/margins": 0.8541049957275391, + "rewards/rejected": -2.2488017082214355, + "step": 4920 + }, + { + "epoch": 0.85, + "grad_norm": 27.217333252692796, + "learning_rate": 3.558948731442918e-07, + "logits/chosen": -1.5090538263320923, + "logits/rejected": -1.461111307144165, + "logps/chosen": -210.60617065429688, + "logps/rejected": -291.8194885253906, + "loss": 0.5621, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.5748202800750732, + "rewards/margins": 0.7634907960891724, + "rewards/rejected": -2.338311195373535, + "step": 4930 + }, + { + "epoch": 0.85, + "grad_norm": 22.65567566910284, + "learning_rate": 3.5521337694112177e-07, + "logits/chosen": -1.4714148044586182, + "logits/rejected": -1.4120241403579712, + "logps/chosen": -215.1482391357422, + "logps/rejected": -304.5157775878906, + "loss": 0.4672, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.592576265335083, + "rewards/margins": 0.9204828143119812, + "rewards/rejected": -2.513059139251709, + "step": 4940 + }, + { + "epoch": 0.85, + "grad_norm": 18.92994086706776, + "learning_rate": 3.5453092928528283e-07, + "logits/chosen": -1.2949804067611694, + "logits/rejected": -1.252745270729065, + "logps/chosen": -194.73434448242188, + "logps/rejected": -266.51446533203125, + "loss": 0.5488, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.3993715047836304, + "rewards/margins": 0.7247661352157593, + "rewards/rejected": -2.1241374015808105, + "step": 4950 + }, + { + "epoch": 0.85, + "grad_norm": 29.05218033955657, + "learning_rate": 3.538475363482017e-07, + "logits/chosen": -1.4200494289398193, + "logits/rejected": -1.379931926727295, + "logps/chosen": -191.14431762695312, + "logps/rejected": -275.96734619140625, + "loss": 0.4846, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.3891611099243164, + "rewards/margins": 0.8065220713615417, + "rewards/rejected": -2.195683240890503, + "step": 4960 + }, + { + "epoch": 0.86, + "grad_norm": 25.53487607610273, + "learning_rate": 3.531632043098533e-07, + "logits/chosen": -1.3623136281967163, + "logits/rejected": -1.316384196281433, + "logps/chosen": -199.2927703857422, + "logps/rejected": -290.8288269042969, + "loss": 0.4895, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4792590141296387, + "rewards/margins": 0.9106336832046509, + "rewards/rejected": -2.389892578125, + "step": 4970 + }, + { + "epoch": 0.86, + "grad_norm": 18.067653883232882, + "learning_rate": 3.5247793935870493e-07, + "logits/chosen": -1.3661185503005981, + "logits/rejected": -1.318273663520813, + "logps/chosen": -203.88742065429688, + "logps/rejected": -304.10009765625, + "loss": 0.4466, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.510157823562622, + "rewards/margins": 0.9821771383285522, + "rewards/rejected": -2.492335081100464, + "step": 4980 + }, + { + "epoch": 0.86, + "grad_norm": 25.76809031221678, + "learning_rate": 3.5179174769166036e-07, + "logits/chosen": -1.2775933742523193, + "logits/rejected": -1.2376461029052734, + "logps/chosen": -229.0320587158203, + "logps/rejected": -312.44122314453125, + "loss": 0.5803, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.7728198766708374, + "rewards/margins": 0.8028677701950073, + "rewards/rejected": -2.575687885284424, + "step": 4990 + }, + { + "epoch": 0.86, + "grad_norm": 24.778358441979794, + "learning_rate": 3.511046355140036e-07, + "logits/chosen": -1.2975661754608154, + "logits/rejected": -1.2417397499084473, + "logps/chosen": -212.7232666015625, + "logps/rejected": -299.6105041503906, + "loss": 0.5112, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.5695029497146606, + "rewards/margins": 0.8998052477836609, + "rewards/rejected": -2.4693078994750977, + "step": 5000 + }, + { + "epoch": 0.86, + "eval_logits/chosen": -1.427557110786438, + "eval_logits/rejected": -1.4012691974639893, + "eval_logps/chosen": -209.61801147460938, + "eval_logps/rejected": -250.4540252685547, + "eval_loss": 0.6177005171775818, + "eval_rewards/accuracies": 0.6579925417900085, + "eval_rewards/chosen": -1.5091416835784912, + "eval_rewards/margins": 0.3638246953487396, + "eval_rewards/rejected": -1.8729661703109741, + "eval_runtime": 357.0332, + "eval_samples_per_second": 12.055, + "eval_steps_per_second": 1.507, + "step": 5000 + }, + { + "epoch": 0.86, + "grad_norm": 23.044456102420437, + "learning_rate": 3.5041660903934306e-07, + "logits/chosen": -1.334160566329956, + "logits/rejected": -1.2778995037078857, + "logps/chosen": -218.86984252929688, + "logps/rejected": -302.5597229003906, + "loss": 0.4935, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.6585071086883545, + "rewards/margins": 0.8564162254333496, + "rewards/rejected": -2.514923572540283, + "step": 5010 + }, + { + "epoch": 0.86, + "grad_norm": 31.632139485329414, + "learning_rate": 3.4972767448955516e-07, + "logits/chosen": -1.3136205673217773, + "logits/rejected": -1.2596207857131958, + "logps/chosen": -208.8480682373047, + "logps/rejected": -286.65570068359375, + "loss": 0.5453, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.5537054538726807, + "rewards/margins": 0.7929602265357971, + "rewards/rejected": -2.346665859222412, + "step": 5020 + }, + { + "epoch": 0.87, + "grad_norm": 28.18486571020162, + "learning_rate": 3.4903783809472793e-07, + "logits/chosen": -1.2829835414886475, + "logits/rejected": -1.240122675895691, + "logps/chosen": -198.78387451171875, + "logps/rejected": -277.5005187988281, + "loss": 0.5452, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.4618786573410034, + "rewards/margins": 0.7737663984298706, + "rewards/rejected": -2.235645055770874, + "step": 5030 + }, + { + "epoch": 0.87, + "grad_norm": 24.06279497070965, + "learning_rate": 3.483471060931051e-07, + "logits/chosen": -1.50538170337677, + "logits/rejected": -1.4423930644989014, + "logps/chosen": -200.87606811523438, + "logps/rejected": -264.9050598144531, + "loss": 0.5298, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.4371707439422607, + "rewards/margins": 0.7078632116317749, + "rewards/rejected": -2.145033836364746, + "step": 5040 + }, + { + "epoch": 0.87, + "grad_norm": 22.362866544811133, + "learning_rate": 3.4765548473102936e-07, + "logits/chosen": -1.3779505491256714, + "logits/rejected": -1.329679250717163, + "logps/chosen": -202.9115753173828, + "logps/rejected": -277.89459228515625, + "loss": 0.534, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.4728914499282837, + "rewards/margins": 0.7591592073440552, + "rewards/rejected": -2.232050895690918, + "step": 5050 + }, + { + "epoch": 0.87, + "grad_norm": 23.051911448956087, + "learning_rate": 3.469629802628858e-07, + "logits/chosen": -1.3045955896377563, + "logits/rejected": -1.2757608890533447, + "logps/chosen": -192.4661865234375, + "logps/rejected": -255.61367797851562, + "loss": 0.5956, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.4202029705047607, + "rewards/margins": 0.6157991290092468, + "rewards/rejected": -2.0360023975372314, + "step": 5060 + }, + { + "epoch": 0.87, + "grad_norm": 24.88380616274711, + "learning_rate": 3.4626959895104585e-07, + "logits/chosen": -1.445326566696167, + "logits/rejected": -1.3971054553985596, + "logps/chosen": -176.08921813964844, + "logps/rejected": -235.4571990966797, + "loss": 0.5525, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.2336560487747192, + "rewards/margins": 0.6401635408401489, + "rewards/rejected": -1.8738195896148682, + "step": 5070 + }, + { + "epoch": 0.88, + "grad_norm": 16.10357940315526, + "learning_rate": 3.4557534706580997e-07, + "logits/chosen": -1.690610647201538, + "logits/rejected": -1.6249040365219116, + "logps/chosen": -159.53741455078125, + "logps/rejected": -228.35623168945312, + "loss": 0.5054, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.020334005355835, + "rewards/margins": 0.7543589472770691, + "rewards/rejected": -1.7746931314468384, + "step": 5080 + }, + { + "epoch": 0.88, + "grad_norm": 22.84540156375547, + "learning_rate": 3.4488023088535144e-07, + "logits/chosen": -1.5469788312911987, + "logits/rejected": -1.4869216680526733, + "logps/chosen": -163.7722625732422, + "logps/rejected": -236.5236053466797, + "loss": 0.4949, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.09481680393219, + "rewards/margins": 0.7558841109275818, + "rewards/rejected": -1.8507009744644165, + "step": 5090 + }, + { + "epoch": 0.88, + "grad_norm": 24.117584748609755, + "learning_rate": 3.4418425669565946e-07, + "logits/chosen": -1.3648254871368408, + "logits/rejected": -1.3106800317764282, + "logps/chosen": -192.01718139648438, + "logps/rejected": -246.06515502929688, + "loss": 0.5746, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.3694689273834229, + "rewards/margins": 0.5682665109634399, + "rewards/rejected": -1.9377353191375732, + "step": 5100 + }, + { + "epoch": 0.88, + "eval_logits/chosen": -1.5572093725204468, + "eval_logits/rejected": -1.532787561416626, + "eval_logps/chosen": -180.94764709472656, + "eval_logps/rejected": -217.08363342285156, + "eval_loss": 0.6200332641601562, + "eval_rewards/accuracies": 0.6654275059700012, + "eval_rewards/chosen": -1.2224379777908325, + "eval_rewards/margins": 0.3168245851993561, + "eval_rewards/rejected": -1.5392626523971558, + "eval_runtime": 356.479, + "eval_samples_per_second": 12.074, + "eval_steps_per_second": 1.509, + "step": 5100 + }, + { + "epoch": 0.88, + "grad_norm": 28.379638712637114, + "learning_rate": 3.434874307904822e-07, + "logits/chosen": -1.4629642963409424, + "logits/rejected": -1.4055429697036743, + "logps/chosen": -198.49514770507812, + "logps/rejected": -260.3693542480469, + "loss": 0.5568, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.4061576128005981, + "rewards/margins": 0.6592377424240112, + "rewards/rejected": -2.0653955936431885, + "step": 5110 + }, + { + "epoch": 0.88, + "grad_norm": 33.84680091265571, + "learning_rate": 3.427897594712699e-07, + "logits/chosen": -1.5411012172698975, + "logits/rejected": -1.4923017024993896, + "logps/chosen": -190.82667541503906, + "logps/rejected": -243.14395141601562, + "loss": 0.5811, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.369838833808899, + "rewards/margins": 0.558884859085083, + "rewards/rejected": -1.928723931312561, + "step": 5120 + }, + { + "epoch": 0.88, + "grad_norm": 21.2178772791051, + "learning_rate": 3.4209124904711805e-07, + "logits/chosen": -1.5400969982147217, + "logits/rejected": -1.481069803237915, + "logps/chosen": -191.25750732421875, + "logps/rejected": -274.0238037109375, + "loss": 0.4782, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.3769677877426147, + "rewards/margins": 0.8635452389717102, + "rewards/rejected": -2.2405130863189697, + "step": 5130 + }, + { + "epoch": 0.89, + "grad_norm": 27.959322283199082, + "learning_rate": 3.4139190583471025e-07, + "logits/chosen": -1.50569748878479, + "logits/rejected": -1.4533193111419678, + "logps/chosen": -185.509521484375, + "logps/rejected": -234.0798797607422, + "loss": 0.5815, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2769523859024048, + "rewards/margins": 0.5343953967094421, + "rewards/rejected": -1.8113473653793335, + "step": 5140 + }, + { + "epoch": 0.89, + "grad_norm": 20.960498750902655, + "learning_rate": 3.4069173615826097e-07, + "logits/chosen": -1.5694390535354614, + "logits/rejected": -1.5354506969451904, + "logps/chosen": -168.58926391601562, + "logps/rejected": -225.9666748046875, + "loss": 0.5677, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.154682993888855, + "rewards/margins": 0.5564135313034058, + "rewards/rejected": -1.7110967636108398, + "step": 5150 + }, + { + "epoch": 0.89, + "grad_norm": 20.888551855019205, + "learning_rate": 3.399907463494585e-07, + "logits/chosen": -1.553257703781128, + "logits/rejected": -1.5001866817474365, + "logps/chosen": -163.6346435546875, + "logps/rejected": -215.9080810546875, + "loss": 0.5523, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.1223015785217285, + "rewards/margins": 0.5567636489868164, + "rewards/rejected": -1.6790653467178345, + "step": 5160 + }, + { + "epoch": 0.89, + "grad_norm": 22.44450995498008, + "learning_rate": 3.3928894274740773e-07, + "logits/chosen": -1.5365890264511108, + "logits/rejected": -1.4780817031860352, + "logps/chosen": -159.61709594726562, + "logps/rejected": -242.06234741210938, + "loss": 0.4865, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.0430219173431396, + "rewards/margins": 0.8591874837875366, + "rewards/rejected": -1.9022095203399658, + "step": 5170 + }, + { + "epoch": 0.89, + "grad_norm": 32.65259207620487, + "learning_rate": 3.385863316985726e-07, + "logits/chosen": -1.5846903324127197, + "logits/rejected": -1.5513648986816406, + "logps/chosen": -196.44918823242188, + "logps/rejected": -248.993408203125, + "loss": 0.5864, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.4105911254882812, + "rewards/margins": 0.5344855785369873, + "rewards/rejected": -1.9450767040252686, + "step": 5180 + }, + { + "epoch": 0.89, + "grad_norm": 20.160509769449938, + "learning_rate": 3.3788291955671887e-07, + "logits/chosen": -1.4820839166641235, + "logits/rejected": -1.4463526010513306, + "logps/chosen": -182.4502716064453, + "logps/rejected": -242.9739990234375, + "loss": 0.5733, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.2962095737457275, + "rewards/margins": 0.5948411226272583, + "rewards/rejected": -1.8910505771636963, + "step": 5190 + }, + { + "epoch": 0.9, + "grad_norm": 18.895506656949056, + "learning_rate": 3.371787126828568e-07, + "logits/chosen": -1.5754809379577637, + "logits/rejected": -1.5293941497802734, + "logps/chosen": -168.30630493164062, + "logps/rejected": -236.78439331054688, + "loss": 0.5138, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.1625381708145142, + "rewards/margins": 0.6649240255355835, + "rewards/rejected": -1.8274621963500977, + "step": 5200 + }, + { + "epoch": 0.9, + "eval_logits/chosen": -1.6232234239578247, + "eval_logits/rejected": -1.6006020307540894, + "eval_logps/chosen": -162.8901824951172, + "eval_logps/rejected": -195.02578735351562, + "eval_loss": 0.6237266063690186, + "eval_rewards/accuracies": 0.6605483293533325, + "eval_rewards/chosen": -1.0418633222579956, + "eval_rewards/margins": 0.2768208086490631, + "eval_rewards/rejected": -1.3186841011047363, + "eval_runtime": 357.1428, + "eval_samples_per_second": 12.051, + "eval_steps_per_second": 1.506, + "step": 5200 + }, + { + "epoch": 0.9, + "grad_norm": 20.636102998390292, + "learning_rate": 3.364737174451834e-07, + "logits/chosen": -1.5026520490646362, + "logits/rejected": -1.4632505178451538, + "logps/chosen": -176.74197387695312, + "logps/rejected": -229.8460693359375, + "loss": 0.5734, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.25114107131958, + "rewards/margins": 0.5474826097488403, + "rewards/rejected": -1.7986234426498413, + "step": 5210 + }, + { + "epoch": 0.9, + "grad_norm": 18.20246477650609, + "learning_rate": 3.3576794021902476e-07, + "logits/chosen": -1.5258533954620361, + "logits/rejected": -1.4866435527801514, + "logps/chosen": -168.38894653320312, + "logps/rejected": -228.6901092529297, + "loss": 0.5603, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.1579110622406006, + "rewards/margins": 0.5693751573562622, + "rewards/rejected": -1.7272861003875732, + "step": 5220 + }, + { + "epoch": 0.9, + "grad_norm": 25.25518148404087, + "learning_rate": 3.350613873867788e-07, + "logits/chosen": -1.4658780097961426, + "logits/rejected": -1.4225355386734009, + "logps/chosen": -174.45970153808594, + "logps/rejected": -261.13702392578125, + "loss": 0.4966, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.1921513080596924, + "rewards/margins": 0.858329176902771, + "rewards/rejected": -2.050480365753174, + "step": 5230 + }, + { + "epoch": 0.9, + "grad_norm": 22.578680932349204, + "learning_rate": 3.343540653378571e-07, + "logits/chosen": -1.4708452224731445, + "logits/rejected": -1.409401535987854, + "logps/chosen": -177.78860473632812, + "logps/rejected": -269.25457763671875, + "loss": 0.4701, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.2503529787063599, + "rewards/margins": 0.9091068506240845, + "rewards/rejected": -2.1594595909118652, + "step": 5240 + }, + { + "epoch": 0.9, + "grad_norm": 21.94896818155034, + "learning_rate": 3.3364598046862754e-07, + "logits/chosen": -1.3917882442474365, + "logits/rejected": -1.3478825092315674, + "logps/chosen": -180.23175048828125, + "logps/rejected": -260.63482666015625, + "loss": 0.4965, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2926623821258545, + "rewards/margins": 0.8049648404121399, + "rewards/rejected": -2.0976271629333496, + "step": 5250 + }, + { + "epoch": 0.91, + "grad_norm": 42.50229156823177, + "learning_rate": 3.3293713918235594e-07, + "logits/chosen": -1.4157629013061523, + "logits/rejected": -1.3547483682632446, + "logps/chosen": -195.6414337158203, + "logps/rejected": -264.51416015625, + "loss": 0.5465, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.3889554738998413, + "rewards/margins": 0.741489827632904, + "rewards/rejected": -2.1304454803466797, + "step": 5260 + }, + { + "epoch": 0.91, + "grad_norm": 20.377525883555307, + "learning_rate": 3.3222754788914875e-07, + "logits/chosen": -1.5662615299224854, + "logits/rejected": -1.526829719543457, + "logps/chosen": -177.85626220703125, + "logps/rejected": -250.97933959960938, + "loss": 0.5173, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.263085961341858, + "rewards/margins": 0.735329270362854, + "rewards/rejected": -1.998415231704712, + "step": 5270 + }, + { + "epoch": 0.91, + "grad_norm": 24.732098744846958, + "learning_rate": 3.315172130058946e-07, + "logits/chosen": -1.4817497730255127, + "logits/rejected": -1.4205673933029175, + "logps/chosen": -187.76739501953125, + "logps/rejected": -252.81613159179688, + "loss": 0.5296, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.3341633081436157, + "rewards/margins": 0.6874901056289673, + "rewards/rejected": -2.021653413772583, + "step": 5280 + }, + { + "epoch": 0.91, + "grad_norm": 26.624712530739988, + "learning_rate": 3.308061409562065e-07, + "logits/chosen": -1.4430485963821411, + "logits/rejected": -1.378722906112671, + "logps/chosen": -176.76901245117188, + "logps/rejected": -246.05923461914062, + "loss": 0.5194, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.209058165550232, + "rewards/margins": 0.7315285801887512, + "rewards/rejected": -1.940587043762207, + "step": 5290 + }, + { + "epoch": 0.91, + "grad_norm": 16.5970454890233, + "learning_rate": 3.300943381703639e-07, + "logits/chosen": -1.4298001527786255, + "logits/rejected": -1.3858981132507324, + "logps/chosen": -189.7223358154297, + "logps/rejected": -266.050537109375, + "loss": 0.5094, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.3463001251220703, + "rewards/margins": 0.7629293203353882, + "rewards/rejected": -2.109229564666748, + "step": 5300 + }, + { + "epoch": 0.91, + "eval_logits/chosen": -1.5428049564361572, + "eval_logits/rejected": -1.5180394649505615, + "eval_logps/chosen": -187.3815460205078, + "eval_logps/rejected": -224.76116943359375, + "eval_loss": 0.6180873513221741, + "eval_rewards/accuracies": 0.6598513126373291, + "eval_rewards/chosen": -1.2867772579193115, + "eval_rewards/margins": 0.32926076650619507, + "eval_rewards/rejected": -1.6160376071929932, + "eval_runtime": 357.239, + "eval_samples_per_second": 12.048, + "eval_steps_per_second": 1.506, + "step": 5300 + }, + { + "epoch": 0.91, + "grad_norm": 24.46549230872405, + "learning_rate": 3.293818110852541e-07, + "logits/chosen": -1.5138168334960938, + "logits/rejected": -1.472394347190857, + "logps/chosen": -205.1785430908203, + "logps/rejected": -269.4486389160156, + "loss": 0.5451, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.4704174995422363, + "rewards/margins": 0.7003182172775269, + "rewards/rejected": -2.1707355976104736, + "step": 5310 + }, + { + "epoch": 0.92, + "grad_norm": 28.551168644730765, + "learning_rate": 3.286685661443144e-07, + "logits/chosen": -1.4450080394744873, + "logits/rejected": -1.3684725761413574, + "logps/chosen": -212.12887573242188, + "logps/rejected": -286.6734924316406, + "loss": 0.5018, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.5459760427474976, + "rewards/margins": 0.8185034990310669, + "rewards/rejected": -2.3644795417785645, + "step": 5320 + }, + { + "epoch": 0.92, + "grad_norm": 21.620111584157225, + "learning_rate": 3.2795460979747375e-07, + "logits/chosen": -1.3988605737686157, + "logits/rejected": -1.3602981567382812, + "logps/chosen": -198.0777130126953, + "logps/rejected": -299.89276123046875, + "loss": 0.4887, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.4722723960876465, + "rewards/margins": 0.9927380681037903, + "rewards/rejected": -2.465010643005371, + "step": 5330 + }, + { + "epoch": 0.92, + "grad_norm": 25.423142113773764, + "learning_rate": 3.272399485010943e-07, + "logits/chosen": -1.431849479675293, + "logits/rejected": -1.3641244173049927, + "logps/chosen": -201.8954620361328, + "logps/rejected": -272.422607421875, + "loss": 0.5224, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.4554879665374756, + "rewards/margins": 0.7551096081733704, + "rewards/rejected": -2.210597515106201, + "step": 5340 + }, + { + "epoch": 0.92, + "grad_norm": 21.415159819448192, + "learning_rate": 3.2652458871791326e-07, + "logits/chosen": -1.4087716341018677, + "logits/rejected": -1.361433982849121, + "logps/chosen": -191.4141082763672, + "logps/rejected": -257.26812744140625, + "loss": 0.5519, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.3878710269927979, + "rewards/margins": 0.6652384996414185, + "rewards/rejected": -2.053109645843506, + "step": 5350 + }, + { + "epoch": 0.92, + "grad_norm": 23.352557781202382, + "learning_rate": 3.2580853691698417e-07, + "logits/chosen": -1.5152844190597534, + "logits/rejected": -1.4675962924957275, + "logps/chosen": -186.24412536621094, + "logps/rejected": -270.291015625, + "loss": 0.5293, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.3303719758987427, + "rewards/margins": 0.8078699111938477, + "rewards/rejected": -2.138241767883301, + "step": 5360 + }, + { + "epoch": 0.93, + "grad_norm": 27.08219440604054, + "learning_rate": 3.250917995736187e-07, + "logits/chosen": -1.4008272886276245, + "logits/rejected": -1.349577784538269, + "logps/chosen": -190.25677490234375, + "logps/rejected": -279.8938293457031, + "loss": 0.4707, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.383306622505188, + "rewards/margins": 0.8817272186279297, + "rewards/rejected": -2.2650339603424072, + "step": 5370 + }, + { + "epoch": 0.93, + "grad_norm": 21.444506787758893, + "learning_rate": 3.2437438316932766e-07, + "logits/chosen": -1.4608103036880493, + "logits/rejected": -1.4044318199157715, + "logps/chosen": -212.57666015625, + "logps/rejected": -279.16912841796875, + "loss": 0.535, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.5518763065338135, + "rewards/margins": 0.7239239811897278, + "rewards/rejected": -2.2758002281188965, + "step": 5380 + }, + { + "epoch": 0.93, + "grad_norm": 22.582981599970143, + "learning_rate": 3.2365629419176294e-07, + "logits/chosen": -1.422620415687561, + "logits/rejected": -1.3589953184127808, + "logps/chosen": -210.6852569580078, + "logps/rejected": -277.79864501953125, + "loss": 0.5383, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.5212723016738892, + "rewards/margins": 0.7407953143119812, + "rewards/rejected": -2.2620673179626465, + "step": 5390 + }, + { + "epoch": 0.93, + "grad_norm": 24.413482194523574, + "learning_rate": 3.229375391346585e-07, + "logits/chosen": -1.4233802556991577, + "logits/rejected": -1.3726685047149658, + "logps/chosen": -182.7910919189453, + "logps/rejected": -270.41156005859375, + "loss": 0.4865, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3170902729034424, + "rewards/margins": 0.8481825590133667, + "rewards/rejected": -2.1652729511260986, + "step": 5400 + }, + { + "epoch": 0.93, + "eval_logits/chosen": -1.5443414449691772, + "eval_logits/rejected": -1.5196946859359741, + "eval_logps/chosen": -181.3465576171875, + "eval_logps/rejected": -217.53018188476562, + "eval_loss": 0.6221857070922852, + "eval_rewards/accuracies": 0.669842004776001, + "eval_rewards/chosen": -1.2264270782470703, + "eval_rewards/margins": 0.31730079650878906, + "eval_rewards/rejected": -1.543727993965149, + "eval_runtime": 357.1726, + "eval_samples_per_second": 12.05, + "eval_steps_per_second": 1.506, + "step": 5400 + }, + { + "epoch": 0.93, + "grad_norm": 33.90762588243634, + "learning_rate": 3.222181244977716e-07, + "logits/chosen": -1.4560267925262451, + "logits/rejected": -1.4195563793182373, + "logps/chosen": -192.35723876953125, + "logps/rejected": -252.6515350341797, + "loss": 0.5492, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.3573839664459229, + "rewards/margins": 0.6285881996154785, + "rewards/rejected": -1.9859724044799805, + "step": 5410 + }, + { + "epoch": 0.93, + "grad_norm": 25.43643547302594, + "learning_rate": 3.2149805678682415e-07, + "logits/chosen": -1.5208218097686768, + "logits/rejected": -1.4746044874191284, + "logps/chosen": -182.27589416503906, + "logps/rejected": -257.0049743652344, + "loss": 0.5247, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.2857364416122437, + "rewards/margins": 0.7472572326660156, + "rewards/rejected": -2.032993793487549, + "step": 5420 + }, + { + "epoch": 0.94, + "grad_norm": 28.226137721023306, + "learning_rate": 3.207773425134441e-07, + "logits/chosen": -1.4794824123382568, + "logits/rejected": -1.4373642206192017, + "logps/chosen": -185.75901794433594, + "logps/rejected": -257.13983154296875, + "loss": 0.532, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3217830657958984, + "rewards/margins": 0.7321761846542358, + "rewards/rejected": -2.0539591312408447, + "step": 5430 + }, + { + "epoch": 0.94, + "grad_norm": 21.150277719094746, + "learning_rate": 3.2005598819510586e-07, + "logits/chosen": -1.4646375179290771, + "logits/rejected": -1.4225250482559204, + "logps/chosen": -189.71932983398438, + "logps/rejected": -262.24505615234375, + "loss": 0.5265, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.3352781534194946, + "rewards/margins": 0.7349643111228943, + "rewards/rejected": -2.070242404937744, + "step": 5440 + }, + { + "epoch": 0.94, + "grad_norm": 40.77880858071023, + "learning_rate": 3.193340003550722e-07, + "logits/chosen": -1.3812825679779053, + "logits/rejected": -1.338728427886963, + "logps/chosen": -192.5988006591797, + "logps/rejected": -267.5910949707031, + "loss": 0.525, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.3704335689544678, + "rewards/margins": 0.7710081338882446, + "rewards/rejected": -2.141441822052002, + "step": 5450 + }, + { + "epoch": 0.94, + "grad_norm": 30.470434371984815, + "learning_rate": 3.186113855223348e-07, + "logits/chosen": -1.4694117307662964, + "logits/rejected": -1.4298455715179443, + "logps/chosen": -192.1426239013672, + "logps/rejected": -244.8093719482422, + "loss": 0.6017, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.3830455541610718, + "rewards/margins": 0.544926106929779, + "rewards/rejected": -1.9279718399047852, + "step": 5460 + }, + { + "epoch": 0.94, + "grad_norm": 19.744210255270794, + "learning_rate": 3.178881502315552e-07, + "logits/chosen": -1.469347357749939, + "logits/rejected": -1.4357693195343018, + "logps/chosen": -171.6509246826172, + "logps/rejected": -222.62744140625, + "loss": 0.5921, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.2030577659606934, + "rewards/margins": 0.5184643268585205, + "rewards/rejected": -1.7215220928192139, + "step": 5470 + }, + { + "epoch": 0.94, + "grad_norm": 21.30606288485756, + "learning_rate": 3.1716430102300573e-07, + "logits/chosen": -1.5191564559936523, + "logits/rejected": -1.4697355031967163, + "logps/chosen": -164.0375518798828, + "logps/rejected": -234.484619140625, + "loss": 0.5209, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.1111133098602295, + "rewards/margins": 0.7175213694572449, + "rewards/rejected": -1.8286346197128296, + "step": 5480 + }, + { + "epoch": 0.95, + "grad_norm": 19.015649280887946, + "learning_rate": 3.164398444425106e-07, + "logits/chosen": -1.4912570714950562, + "logits/rejected": -1.4432531595230103, + "logps/chosen": -171.32119750976562, + "logps/rejected": -226.35067749023438, + "loss": 0.5431, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.1578484773635864, + "rewards/margins": 0.5836814641952515, + "rewards/rejected": -1.7415298223495483, + "step": 5490 + }, + { + "epoch": 0.95, + "grad_norm": 25.985898470229376, + "learning_rate": 3.157147870413864e-07, + "logits/chosen": -1.5515010356903076, + "logits/rejected": -1.512731909751892, + "logps/chosen": -169.95750427246094, + "logps/rejected": -240.7258758544922, + "loss": 0.513, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.1616063117980957, + "rewards/margins": 0.6991836428642273, + "rewards/rejected": -1.8607898950576782, + "step": 5500 + }, + { + "epoch": 0.95, + "eval_logits/chosen": -1.5876429080963135, + "eval_logits/rejected": -1.5650734901428223, + "eval_logps/chosen": -172.41822814941406, + "eval_logps/rejected": -205.8068389892578, + "eval_loss": 0.6214230060577393, + "eval_rewards/accuracies": 0.6721654534339905, + "eval_rewards/chosen": -1.137143850326538, + "eval_rewards/margins": 0.289350688457489, + "eval_rewards/rejected": -1.4264944791793823, + "eval_runtime": 357.0862, + "eval_samples_per_second": 12.053, + "eval_steps_per_second": 1.507, + "step": 5500 + }, + { + "epoch": 0.95, + "grad_norm": 23.17137980170567, + "learning_rate": 3.1498913537638314e-07, + "logits/chosen": -1.471665620803833, + "logits/rejected": -1.4335734844207764, + "logps/chosen": -186.6552734375, + "logps/rejected": -242.85983276367188, + "loss": 0.5717, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.3344801664352417, + "rewards/margins": 0.566752552986145, + "rewards/rejected": -1.9012327194213867, + "step": 5510 + }, + { + "epoch": 0.95, + "grad_norm": 20.934842998393453, + "learning_rate": 3.142628960096246e-07, + "logits/chosen": -1.4280526638031006, + "logits/rejected": -1.3803586959838867, + "logps/chosen": -180.4522247314453, + "logps/rejected": -248.35739135742188, + "loss": 0.5239, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.246476411819458, + "rewards/margins": 0.7218848466873169, + "rewards/rejected": -1.9683609008789062, + "step": 5520 + }, + { + "epoch": 0.95, + "grad_norm": 20.694293406108667, + "learning_rate": 3.135360755085493e-07, + "logits/chosen": -1.4679347276687622, + "logits/rejected": -1.4120423793792725, + "logps/chosen": -185.0923309326172, + "logps/rejected": -251.6171875, + "loss": 0.5145, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2520725727081299, + "rewards/margins": 0.7337436079978943, + "rewards/rejected": -1.9858160018920898, + "step": 5530 + }, + { + "epoch": 0.95, + "grad_norm": 22.80776688821052, + "learning_rate": 3.12808680445851e-07, + "logits/chosen": -1.4971723556518555, + "logits/rejected": -1.477506399154663, + "logps/chosen": -170.65281677246094, + "logps/rejected": -243.3797149658203, + "loss": 0.5055, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.1831166744232178, + "rewards/margins": 0.683652400970459, + "rewards/rejected": -1.8667690753936768, + "step": 5540 + }, + { + "epoch": 0.96, + "grad_norm": 21.44049349168216, + "learning_rate": 3.1208071739941937e-07, + "logits/chosen": -1.3374189138412476, + "logits/rejected": -1.3035809993743896, + "logps/chosen": -188.96694946289062, + "logps/rejected": -238.453125, + "loss": 0.6158, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.3638770580291748, + "rewards/margins": 0.4801939129829407, + "rewards/rejected": -1.8440710306167603, + "step": 5550 + }, + { + "epoch": 0.96, + "grad_norm": 20.558896998020238, + "learning_rate": 3.113521929522802e-07, + "logits/chosen": -1.4649537801742554, + "logits/rejected": -1.4129371643066406, + "logps/chosen": -168.60670471191406, + "logps/rejected": -243.099365234375, + "loss": 0.5099, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.1673256158828735, + "rewards/margins": 0.7692097425460815, + "rewards/rejected": -1.9365352392196655, + "step": 5560 + }, + { + "epoch": 0.96, + "grad_norm": 23.194806036236645, + "learning_rate": 3.10623113692536e-07, + "logits/chosen": -1.5026556253433228, + "logits/rejected": -1.4706186056137085, + "logps/chosen": -169.34744262695312, + "logps/rejected": -237.5296630859375, + "loss": 0.5547, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.1870052814483643, + "rewards/margins": 0.6190119385719299, + "rewards/rejected": -1.8060171604156494, + "step": 5570 + }, + { + "epoch": 0.96, + "grad_norm": 17.077550864060385, + "learning_rate": 3.0989348621330695e-07, + "logits/chosen": -1.4042866230010986, + "logits/rejected": -1.3619954586029053, + "logps/chosen": -170.88133239746094, + "logps/rejected": -240.86929321289062, + "loss": 0.5208, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.1771609783172607, + "rewards/margins": 0.6939356923103333, + "rewards/rejected": -1.8710966110229492, + "step": 5580 + }, + { + "epoch": 0.96, + "grad_norm": 41.03513118380479, + "learning_rate": 3.091633171126704e-07, + "logits/chosen": -1.5033903121948242, + "logits/rejected": -1.4395246505737305, + "logps/chosen": -188.55575561523438, + "logps/rejected": -257.30426025390625, + "loss": 0.5208, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.3172476291656494, + "rewards/margins": 0.7042714953422546, + "rewards/rejected": -2.021519184112549, + "step": 5590 + }, + { + "epoch": 0.96, + "grad_norm": 33.09744249323354, + "learning_rate": 3.0843261299360164e-07, + "logits/chosen": -1.4485256671905518, + "logits/rejected": -1.4189153909683228, + "logps/chosen": -189.03610229492188, + "logps/rejected": -262.5235595703125, + "loss": 0.5474, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3653209209442139, + "rewards/margins": 0.6869848966598511, + "rewards/rejected": -2.0523059368133545, + "step": 5600 + }, + { + "epoch": 0.96, + "eval_logits/chosen": -1.5346873998641968, + "eval_logits/rejected": -1.5108610391616821, + "eval_logps/chosen": -177.24856567382812, + "eval_logps/rejected": -212.6680450439453, + "eval_loss": 0.6200674772262573, + "eval_rewards/accuracies": 0.6689126491546631, + "eval_rewards/chosen": -1.185447096824646, + "eval_rewards/margins": 0.30965960025787354, + "eval_rewards/rejected": -1.49510657787323, + "eval_runtime": 357.0504, + "eval_samples_per_second": 12.054, + "eval_steps_per_second": 1.507, + "step": 5600 + }, + { + "epoch": 0.97, + "grad_norm": 28.45755729486924, + "learning_rate": 3.077013804639144e-07, + "logits/chosen": -1.4699697494506836, + "logits/rejected": -1.4311306476593018, + "logps/chosen": -185.23690795898438, + "logps/rejected": -261.2013244628906, + "loss": 0.5124, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.328597068786621, + "rewards/margins": 0.7179481983184814, + "rewards/rejected": -2.0465452671051025, + "step": 5610 + }, + { + "epoch": 0.97, + "grad_norm": 28.676584357098996, + "learning_rate": 3.069696261362008e-07, + "logits/chosen": -1.3878097534179688, + "logits/rejected": -1.3469122648239136, + "logps/chosen": -203.050537109375, + "logps/rejected": -258.06475830078125, + "loss": 0.5663, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.4804455041885376, + "rewards/margins": 0.5948622822761536, + "rewards/rejected": -2.075307607650757, + "step": 5620 + }, + { + "epoch": 0.97, + "grad_norm": 31.893058353519617, + "learning_rate": 3.062373566277715e-07, + "logits/chosen": -1.441892385482788, + "logits/rejected": -1.3970489501953125, + "logps/chosen": -198.53103637695312, + "logps/rejected": -243.0982208251953, + "loss": 0.5993, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.4262107610702515, + "rewards/margins": 0.4929355978965759, + "rewards/rejected": -1.9191462993621826, + "step": 5630 + }, + { + "epoch": 0.97, + "grad_norm": 32.04723618634895, + "learning_rate": 3.0550457856059596e-07, + "logits/chosen": -1.449190616607666, + "logits/rejected": -1.4080677032470703, + "logps/chosen": -168.21890258789062, + "logps/rejected": -238.8227081298828, + "loss": 0.5373, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.1427868604660034, + "rewards/margins": 0.6845923662185669, + "rewards/rejected": -1.8273794651031494, + "step": 5640 + }, + { + "epoch": 0.97, + "grad_norm": 18.712266282878826, + "learning_rate": 3.047712985612428e-07, + "logits/chosen": -1.3978092670440674, + "logits/rejected": -1.3562982082366943, + "logps/chosen": -176.45343017578125, + "logps/rejected": -245.86154174804688, + "loss": 0.5429, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.2446025609970093, + "rewards/margins": 0.6920903921127319, + "rewards/rejected": -1.9366929531097412, + "step": 5650 + }, + { + "epoch": 0.98, + "grad_norm": 20.19600716165965, + "learning_rate": 3.040375232608194e-07, + "logits/chosen": -1.3913816213607788, + "logits/rejected": -1.3549675941467285, + "logps/chosen": -181.96463012695312, + "logps/rejected": -260.1955261230469, + "loss": 0.5016, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.3012346029281616, + "rewards/margins": 0.7908525466918945, + "rewards/rejected": -2.0920872688293457, + "step": 5660 + }, + { + "epoch": 0.98, + "grad_norm": 21.953309399233106, + "learning_rate": 3.0330325929491245e-07, + "logits/chosen": -1.3647847175598145, + "logits/rejected": -1.3191179037094116, + "logps/chosen": -187.47390747070312, + "logps/rejected": -257.4412841796875, + "loss": 0.5044, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.3347058296203613, + "rewards/margins": 0.7116448879241943, + "rewards/rejected": -2.0463504791259766, + "step": 5670 + }, + { + "epoch": 0.98, + "grad_norm": 40.5185950103975, + "learning_rate": 3.0256851330352753e-07, + "logits/chosen": -1.3821312189102173, + "logits/rejected": -1.322158932685852, + "logps/chosen": -210.1196746826172, + "logps/rejected": -284.6582946777344, + "loss": 0.5185, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.5572584867477417, + "rewards/margins": 0.7730095982551575, + "rewards/rejected": -2.330268383026123, + "step": 5680 + }, + { + "epoch": 0.98, + "grad_norm": 17.952104758420326, + "learning_rate": 3.0183329193102894e-07, + "logits/chosen": -1.4393055438995361, + "logits/rejected": -1.3816049098968506, + "logps/chosen": -200.3620147705078, + "logps/rejected": -273.0220642089844, + "loss": 0.5014, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.423686146736145, + "rewards/margins": 0.7898932695388794, + "rewards/rejected": -2.2135794162750244, + "step": 5690 + }, + { + "epoch": 0.98, + "grad_norm": 33.97947710759319, + "learning_rate": 3.010976018260805e-07, + "logits/chosen": -1.289398431777954, + "logits/rejected": -1.246914267539978, + "logps/chosen": -189.9264678955078, + "logps/rejected": -259.13726806640625, + "loss": 0.5291, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.3215687274932861, + "rewards/margins": 0.7366959452629089, + "rewards/rejected": -2.05826473236084, + "step": 5700 + }, + { + "epoch": 0.98, + "eval_logits/chosen": -1.5449095964431763, + "eval_logits/rejected": -1.5208981037139893, + "eval_logps/chosen": -175.29298400878906, + "eval_logps/rejected": -211.0419921875, + "eval_loss": 0.6191110610961914, + "eval_rewards/accuracies": 0.6696096658706665, + "eval_rewards/chosen": -1.165891408920288, + "eval_rewards/margins": 0.3129545748233795, + "eval_rewards/rejected": -1.4788459539413452, + "eval_runtime": 356.9489, + "eval_samples_per_second": 12.058, + "eval_steps_per_second": 1.507, + "step": 5700 + }, + { + "epoch": 0.98, + "grad_norm": 24.082058815741895, + "learning_rate": 3.003614496415843e-07, + "logits/chosen": -1.501319169998169, + "logits/rejected": -1.4549624919891357, + "logps/chosen": -180.39666748046875, + "logps/rejected": -245.5010223388672, + "loss": 0.5398, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.2527682781219482, + "rewards/margins": 0.6765493154525757, + "rewards/rejected": -1.9293174743652344, + "step": 5710 + }, + { + "epoch": 0.99, + "grad_norm": 18.19243173514077, + "learning_rate": 2.996248420346211e-07, + "logits/chosen": -1.4630482196807861, + "logits/rejected": -1.40970778465271, + "logps/chosen": -167.19842529296875, + "logps/rejected": -250.3085479736328, + "loss": 0.467, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.1519994735717773, + "rewards/margins": 0.8528478741645813, + "rewards/rejected": -2.004847288131714, + "step": 5720 + }, + { + "epoch": 0.99, + "grad_norm": 33.44207769272372, + "learning_rate": 2.988877856663905e-07, + "logits/chosen": -1.5095856189727783, + "logits/rejected": -1.4771087169647217, + "logps/chosen": -190.52589416503906, + "logps/rejected": -250.38720703125, + "loss": 0.582, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.3880624771118164, + "rewards/margins": 0.597137987613678, + "rewards/rejected": -1.9852005243301392, + "step": 5730 + }, + { + "epoch": 0.99, + "grad_norm": 23.692568889129813, + "learning_rate": 2.9815028720214985e-07, + "logits/chosen": -1.4424539804458618, + "logits/rejected": -1.3772521018981934, + "logps/chosen": -192.9495849609375, + "logps/rejected": -276.883056640625, + "loss": 0.4764, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.3374172449111938, + "rewards/margins": 0.8872700929641724, + "rewards/rejected": -2.224687099456787, + "step": 5740 + }, + { + "epoch": 0.99, + "grad_norm": 17.064609183815648, + "learning_rate": 2.974123533111545e-07, + "logits/chosen": -1.580055594444275, + "logits/rejected": -1.540281057357788, + "logps/chosen": -191.0297393798828, + "logps/rejected": -233.5084991455078, + "loss": 0.5938, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.3438113927841187, + "rewards/margins": 0.48881006240844727, + "rewards/rejected": -1.8326218128204346, + "step": 5750 + }, + { + "epoch": 0.99, + "grad_norm": 17.358709255280868, + "learning_rate": 2.9667399066659756e-07, + "logits/chosen": -1.5095783472061157, + "logits/rejected": -1.4547650814056396, + "logps/chosen": -172.166015625, + "logps/rejected": -241.0043182373047, + "loss": 0.511, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.1578861474990845, + "rewards/margins": 0.6968024373054504, + "rewards/rejected": -1.8546886444091797, + "step": 5760 + }, + { + "epoch": 0.99, + "grad_norm": 20.70958250669779, + "learning_rate": 2.959352059455492e-07, + "logits/chosen": -1.4510507583618164, + "logits/rejected": -1.3988720178604126, + "logps/chosen": -165.02476501464844, + "logps/rejected": -238.8711395263672, + "loss": 0.5043, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.1318988800048828, + "rewards/margins": 0.7165284156799316, + "rewards/rejected": -1.848427176475525, + "step": 5770 + }, + { + "epoch": 1.0, + "grad_norm": 26.960927481343827, + "learning_rate": 2.9519600582889655e-07, + "logits/chosen": -1.4297640323638916, + "logits/rejected": -1.378154993057251, + "logps/chosen": -178.6942138671875, + "logps/rejected": -261.24639892578125, + "loss": 0.4882, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.2625802755355835, + "rewards/margins": 0.8104287385940552, + "rewards/rejected": -2.0730090141296387, + "step": 5780 + }, + { + "epoch": 1.0, + "grad_norm": 19.048196919185166, + "learning_rate": 2.944563970012831e-07, + "logits/chosen": -1.2947901487350464, + "logits/rejected": -1.236037254333496, + "logps/chosen": -189.5282440185547, + "logps/rejected": -269.74005126953125, + "loss": 0.4999, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.380592703819275, + "rewards/margins": 0.7776483297348022, + "rewards/rejected": -2.158240795135498, + "step": 5790 + }, + { + "epoch": 1.0, + "grad_norm": 24.450393947587344, + "learning_rate": 2.937163861510486e-07, + "logits/chosen": -1.3695513010025024, + "logits/rejected": -1.3113354444503784, + "logps/chosen": -209.8120880126953, + "logps/rejected": -304.4366149902344, + "loss": 0.496, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.5908751487731934, + "rewards/margins": 0.9317790865898132, + "rewards/rejected": -2.5226542949676514, + "step": 5800 + }, + { + "epoch": 1.0, + "eval_logits/chosen": -1.4435439109802246, + "eval_logits/rejected": -1.41628098487854, + "eval_logps/chosen": -210.4264678955078, + "eval_logps/rejected": -253.47520446777344, + "eval_loss": 0.6148089170455933, + "eval_rewards/accuracies": 0.6679832935333252, + "eval_rewards/chosen": -1.5172260999679565, + "eval_rewards/margins": 0.3859521150588989, + "eval_rewards/rejected": -1.903178334236145, + "eval_runtime": 356.8738, + "eval_samples_per_second": 12.06, + "eval_steps_per_second": 1.508, + "step": 5800 + }, + { + "epoch": 1.0, + "grad_norm": 18.008006484022793, + "learning_rate": 2.9297597997016797e-07, + "logits/chosen": -1.4246017932891846, + "logits/rejected": -1.3686187267303467, + "logps/chosen": -203.05320739746094, + "logps/rejected": -305.4484558105469, + "loss": 0.4592, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.490220308303833, + "rewards/margins": 1.0287506580352783, + "rewards/rejected": -2.5189712047576904, + "step": 5810 + }, + { + "epoch": 1.0, + "grad_norm": 24.397505186303498, + "learning_rate": 2.922351851541915e-07, + "logits/chosen": -1.4257746934890747, + "logits/rejected": -1.3525466918945312, + "logps/chosen": -195.74807739257812, + "logps/rejected": -306.99359130859375, + "loss": 0.3977, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -1.4273375272750854, + "rewards/margins": 1.1495184898376465, + "rewards/rejected": -2.5768561363220215, + "step": 5820 + }, + { + "epoch": 1.0, + "grad_norm": 34.799268956724895, + "learning_rate": 2.914940084021836e-07, + "logits/chosen": -1.297031283378601, + "logits/rejected": -1.231827974319458, + "logps/chosen": -198.88839721679688, + "logps/rejected": -312.55792236328125, + "loss": 0.442, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.4539258480072021, + "rewards/margins": 1.143164873123169, + "rewards/rejected": -2.59709095954895, + "step": 5830 + }, + { + "epoch": 1.01, + "grad_norm": 16.077132102945612, + "learning_rate": 2.907524564166628e-07, + "logits/chosen": -1.3520994186401367, + "logits/rejected": -1.2968170642852783, + "logps/chosen": -193.8484344482422, + "logps/rejected": -302.53302001953125, + "loss": 0.4416, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.414527177810669, + "rewards/margins": 1.065403699874878, + "rewards/rejected": -2.4799306392669678, + "step": 5840 + }, + { + "epoch": 1.01, + "grad_norm": 27.25246319010426, + "learning_rate": 2.9001053590354076e-07, + "logits/chosen": -1.410636067390442, + "logits/rejected": -1.3458845615386963, + "logps/chosen": -188.6303253173828, + "logps/rejected": -307.2320861816406, + "loss": 0.3943, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -1.3775842189788818, + "rewards/margins": 1.1499364376068115, + "rewards/rejected": -2.5275206565856934, + "step": 5850 + }, + { + "epoch": 1.01, + "grad_norm": 18.14664982804058, + "learning_rate": 2.8926825357206176e-07, + "logits/chosen": -1.2408941984176636, + "logits/rejected": -1.1875782012939453, + "logps/chosen": -207.2787628173828, + "logps/rejected": -331.25421142578125, + "loss": 0.4206, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.5550321340560913, + "rewards/margins": 1.2251521348953247, + "rewards/rejected": -2.780184268951416, + "step": 5860 + }, + { + "epoch": 1.01, + "grad_norm": 33.09906725415787, + "learning_rate": 2.885256161347421e-07, + "logits/chosen": -1.236800193786621, + "logits/rejected": -1.174392580986023, + "logps/chosen": -234.08859252929688, + "logps/rejected": -356.24066162109375, + "loss": 0.3899, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.7791268825531006, + "rewards/margins": 1.2331987619400024, + "rewards/rejected": -3.0123260021209717, + "step": 5870 + }, + { + "epoch": 1.01, + "grad_norm": 29.54869321861051, + "learning_rate": 2.877826303073094e-07, + "logits/chosen": -1.2946747541427612, + "logits/rejected": -1.2476125955581665, + "logps/chosen": -216.5059814453125, + "logps/rejected": -321.3334655761719, + "loss": 0.4526, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.6175800561904907, + "rewards/margins": 1.0643819570541382, + "rewards/rejected": -2.681962251663208, + "step": 5880 + }, + { + "epoch": 1.01, + "grad_norm": 23.87246950772011, + "learning_rate": 2.870393028086416e-07, + "logits/chosen": -1.3654061555862427, + "logits/rejected": -1.3092124462127686, + "logps/chosen": -201.2548828125, + "logps/rejected": -316.09326171875, + "loss": 0.4342, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.461004614830017, + "rewards/margins": 1.118170142173767, + "rewards/rejected": -2.579174518585205, + "step": 5890 + }, + { + "epoch": 1.02, + "grad_norm": 22.072237513064117, + "learning_rate": 2.8629564036070663e-07, + "logits/chosen": -1.2765244245529175, + "logits/rejected": -1.2124745845794678, + "logps/chosen": -197.97320556640625, + "logps/rejected": -324.934326171875, + "loss": 0.3739, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.4230375289916992, + "rewards/margins": 1.2795560359954834, + "rewards/rejected": -2.7025935649871826, + "step": 5900 + }, + { + "epoch": 1.02, + "eval_logits/chosen": -1.371607780456543, + "eval_logits/rejected": -1.3429399728775024, + "eval_logps/chosen": -213.2480010986328, + "eval_logps/rejected": -259.2733459472656, + "eval_loss": 0.621561586856842, + "eval_rewards/accuracies": 0.6626393795013428, + "eval_rewards/chosen": -1.5454415082931519, + "eval_rewards/margins": 0.4157179594039917, + "eval_rewards/rejected": -1.961159348487854, + "eval_runtime": 356.8605, + "eval_samples_per_second": 12.061, + "eval_steps_per_second": 1.508, + "step": 5900 + }, + { + "epoch": 1.02, + "grad_norm": 27.59715394921913, + "learning_rate": 2.855516496885011e-07, + "logits/chosen": -1.2443146705627441, + "logits/rejected": -1.2075556516647339, + "logps/chosen": -208.70321655273438, + "logps/rejected": -310.6231689453125, + "loss": 0.4883, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.5616544485092163, + "rewards/margins": 0.9676151275634766, + "rewards/rejected": -2.5292694568634033, + "step": 5910 + }, + { + "epoch": 1.02, + "grad_norm": 24.263714171924033, + "learning_rate": 2.848073375199901e-07, + "logits/chosen": -1.2384252548217773, + "logits/rejected": -1.181979775428772, + "logps/chosen": -219.75973510742188, + "logps/rejected": -325.8696594238281, + "loss": 0.4625, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.6830384731292725, + "rewards/margins": 1.0408580303192139, + "rewards/rejected": -2.7238965034484863, + "step": 5920 + }, + { + "epoch": 1.02, + "grad_norm": 20.723910025082144, + "learning_rate": 2.8406271058604574e-07, + "logits/chosen": -1.3165338039398193, + "logits/rejected": -1.2699096202850342, + "logps/chosen": -209.8754425048828, + "logps/rejected": -316.329833984375, + "loss": 0.4768, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.5963435173034668, + "rewards/margins": 1.05552077293396, + "rewards/rejected": -2.6518642902374268, + "step": 5930 + }, + { + "epoch": 1.02, + "grad_norm": 35.79174059848847, + "learning_rate": 2.833177756203868e-07, + "logits/chosen": -1.3231611251831055, + "logits/rejected": -1.2533804178237915, + "logps/chosen": -185.56277465820312, + "logps/rejected": -296.2825622558594, + "loss": 0.4249, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.3131976127624512, + "rewards/margins": 1.1194097995758057, + "rewards/rejected": -2.4326071739196777, + "step": 5940 + }, + { + "epoch": 1.03, + "grad_norm": 17.76388707818442, + "learning_rate": 2.8257253935951754e-07, + "logits/chosen": -1.2369143962860107, + "logits/rejected": -1.1907278299331665, + "logps/chosen": -180.17445373535156, + "logps/rejected": -300.03424072265625, + "loss": 0.3913, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.2647721767425537, + "rewards/margins": 1.1959367990493774, + "rewards/rejected": -2.4607090950012207, + "step": 5950 + }, + { + "epoch": 1.03, + "grad_norm": 16.94373019662342, + "learning_rate": 2.818270085426668e-07, + "logits/chosen": -1.252617597579956, + "logits/rejected": -1.1776127815246582, + "logps/chosen": -212.00613403320312, + "logps/rejected": -300.90032958984375, + "loss": 0.4698, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.5370571613311768, + "rewards/margins": 0.9539899826049805, + "rewards/rejected": -2.4910473823547363, + "step": 5960 + }, + { + "epoch": 1.03, + "grad_norm": 18.34387313974363, + "learning_rate": 2.8108118991172715e-07, + "logits/chosen": -1.2002298831939697, + "logits/rejected": -1.1465680599212646, + "logps/chosen": -217.5402374267578, + "logps/rejected": -327.88226318359375, + "loss": 0.4467, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.656275987625122, + "rewards/margins": 1.1236770153045654, + "rewards/rejected": -2.7799527645111084, + "step": 5970 + }, + { + "epoch": 1.03, + "grad_norm": 36.226753471187656, + "learning_rate": 2.8033509021119396e-07, + "logits/chosen": -1.1955822706222534, + "logits/rejected": -1.1503514051437378, + "logps/chosen": -217.57955932617188, + "logps/rejected": -343.6628112792969, + "loss": 0.4415, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.6500743627548218, + "rewards/margins": 1.2213424444198608, + "rewards/rejected": -2.8714168071746826, + "step": 5980 + }, + { + "epoch": 1.03, + "grad_norm": 31.27940424068028, + "learning_rate": 2.795887161881043e-07, + "logits/chosen": -1.2698607444763184, + "logits/rejected": -1.196852207183838, + "logps/chosen": -225.42538452148438, + "logps/rejected": -327.5019226074219, + "loss": 0.4639, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.7037813663482666, + "rewards/margins": 1.0513932704925537, + "rewards/rejected": -2.7551746368408203, + "step": 5990 + }, + { + "epoch": 1.03, + "grad_norm": 25.514076708010567, + "learning_rate": 2.7884207459197585e-07, + "logits/chosen": -1.23202383518219, + "logits/rejected": -1.166017770767212, + "logps/chosen": -224.9745635986328, + "logps/rejected": -359.5087585449219, + "loss": 0.3835, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.7289674282073975, + "rewards/margins": 1.3435934782028198, + "rewards/rejected": -3.0725607872009277, + "step": 6000 + }, + { + "epoch": 1.03, + "eval_logits/chosen": -1.3176610469818115, + "eval_logits/rejected": -1.2868660688400269, + "eval_logps/chosen": -241.43719482421875, + "eval_logps/rejected": -294.40496826171875, + "eval_loss": 0.6213955879211426, + "eval_rewards/accuracies": 0.6670538783073425, + "eval_rewards/chosen": -1.8273334503173828, + "eval_rewards/margins": 0.48514264822006226, + "eval_rewards/rejected": -2.3124759197235107, + "eval_runtime": 356.8191, + "eval_samples_per_second": 12.062, + "eval_steps_per_second": 1.508, + "step": 6000 + }, + { + "epoch": 1.04, + "grad_norm": 23.69085635543719, + "learning_rate": 2.780951721747461e-07, + "logits/chosen": -1.243060827255249, + "logits/rejected": -1.194278359413147, + "logps/chosen": -225.7394561767578, + "logps/rejected": -336.02508544921875, + "loss": 0.4742, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.72724187374115, + "rewards/margins": 1.119065284729004, + "rewards/rejected": -2.8463072776794434, + "step": 6010 + }, + { + "epoch": 1.04, + "grad_norm": 19.041771832370905, + "learning_rate": 2.7734801569071104e-07, + "logits/chosen": -1.4446563720703125, + "logits/rejected": -1.3703842163085938, + "logps/chosen": -204.45094299316406, + "logps/rejected": -318.0191345214844, + "loss": 0.4287, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.50087308883667, + "rewards/margins": 1.1805269718170166, + "rewards/rejected": -2.6814000606536865, + "step": 6020 + }, + { + "epoch": 1.04, + "grad_norm": 22.204161933617108, + "learning_rate": 2.766006118964644e-07, + "logits/chosen": -1.1446921825408936, + "logits/rejected": -1.0945428609848022, + "logps/chosen": -206.8660125732422, + "logps/rejected": -310.853759765625, + "loss": 0.451, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.5705788135528564, + "rewards/margins": 1.0349514484405518, + "rewards/rejected": -2.605530261993408, + "step": 6030 + }, + { + "epoch": 1.04, + "grad_norm": 27.633296335888442, + "learning_rate": 2.7585296755083615e-07, + "logits/chosen": -1.3180968761444092, + "logits/rejected": -1.2671663761138916, + "logps/chosen": -198.31124877929688, + "logps/rejected": -304.19488525390625, + "loss": 0.43, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.4460715055465698, + "rewards/margins": 1.0792269706726074, + "rewards/rejected": -2.525298595428467, + "step": 6040 + }, + { + "epoch": 1.04, + "grad_norm": 19.23129899129365, + "learning_rate": 2.751050894148317e-07, + "logits/chosen": -1.235442876815796, + "logits/rejected": -1.174726963043213, + "logps/chosen": -212.7599334716797, + "logps/rejected": -319.74639892578125, + "loss": 0.4224, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.5342434644699097, + "rewards/margins": 1.1264307498931885, + "rewards/rejected": -2.6606743335723877, + "step": 6050 + }, + { + "epoch": 1.04, + "grad_norm": 21.993509562945054, + "learning_rate": 2.743569842515707e-07, + "logits/chosen": -1.2447845935821533, + "logits/rejected": -1.1827826499938965, + "logps/chosen": -215.64529418945312, + "logps/rejected": -319.7334899902344, + "loss": 0.4931, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.6215600967407227, + "rewards/margins": 1.0563386678695679, + "rewards/rejected": -2.67789888381958, + "step": 6060 + }, + { + "epoch": 1.05, + "grad_norm": 31.01800262084724, + "learning_rate": 2.7360865882622556e-07, + "logits/chosen": -1.2382781505584717, + "logits/rejected": -1.1739325523376465, + "logps/chosen": -226.4764404296875, + "logps/rejected": -340.14373779296875, + "loss": 0.4489, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.7254207134246826, + "rewards/margins": 1.1591459512710571, + "rewards/rejected": -2.8845667839050293, + "step": 6070 + }, + { + "epoch": 1.05, + "grad_norm": 23.645653500298494, + "learning_rate": 2.728601199059609e-07, + "logits/chosen": -1.2225624322891235, + "logits/rejected": -1.1666558980941772, + "logps/chosen": -230.73385620117188, + "logps/rejected": -352.7666320800781, + "loss": 0.4109, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -1.7167644500732422, + "rewards/margins": 1.275223970413208, + "rewards/rejected": -2.9919886589050293, + "step": 6080 + }, + { + "epoch": 1.05, + "grad_norm": 26.05274957772241, + "learning_rate": 2.7211137425987175e-07, + "logits/chosen": -1.2456872463226318, + "logits/rejected": -1.1820614337921143, + "logps/chosen": -225.69967651367188, + "logps/rejected": -366.41064453125, + "loss": 0.3763, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.7430721521377563, + "rewards/margins": 1.407641887664795, + "rewards/rejected": -3.150714159011841, + "step": 6090 + }, + { + "epoch": 1.05, + "grad_norm": 23.260564144653117, + "learning_rate": 2.713624286589227e-07, + "logits/chosen": -1.1914881467819214, + "logits/rejected": -1.1363308429718018, + "logps/chosen": -247.3178253173828, + "logps/rejected": -392.1295471191406, + "loss": 0.3822, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -1.905500054359436, + "rewards/margins": 1.4772017002105713, + "rewards/rejected": -3.382701873779297, + "step": 6100 + }, + { + "epoch": 1.05, + "eval_logits/chosen": -1.247104287147522, + "eval_logits/rejected": -1.2163046598434448, + "eval_logps/chosen": -258.7976379394531, + "eval_logps/rejected": -313.2447509765625, + "eval_loss": 0.6230133771896362, + "eval_rewards/accuracies": 0.6710036993026733, + "eval_rewards/chosen": -2.0009379386901855, + "eval_rewards/margins": 0.49993589520454407, + "eval_rewards/rejected": -2.5008738040924072, + "eval_runtime": 356.9611, + "eval_samples_per_second": 12.057, + "eval_steps_per_second": 1.507, + "step": 6100 + }, + { + "epoch": 1.05, + "grad_norm": 20.111788335085762, + "learning_rate": 2.7061328987588626e-07, + "logits/chosen": -1.1539726257324219, + "logits/rejected": -1.087749719619751, + "logps/chosen": -263.44146728515625, + "logps/rejected": -398.18914794921875, + "loss": 0.4009, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -2.117790699005127, + "rewards/margins": 1.3292558193206787, + "rewards/rejected": -3.4470467567443848, + "step": 6110 + }, + { + "epoch": 1.05, + "grad_norm": 39.46202657579613, + "learning_rate": 2.6986396468528236e-07, + "logits/chosen": -1.2154873609542847, + "logits/rejected": -1.1202675104141235, + "logps/chosen": -255.0636444091797, + "logps/rejected": -415.2186584472656, + "loss": 0.3857, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.007509469985962, + "rewards/margins": 1.6099491119384766, + "rewards/rejected": -3.6174583435058594, + "step": 6120 + }, + { + "epoch": 1.06, + "grad_norm": 16.111988424606817, + "learning_rate": 2.6911445986331634e-07, + "logits/chosen": -1.1826080083847046, + "logits/rejected": -1.1169893741607666, + "logps/chosen": -239.3343048095703, + "logps/rejected": -376.82037353515625, + "loss": 0.3942, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.849416971206665, + "rewards/margins": 1.3795721530914307, + "rewards/rejected": -3.2289886474609375, + "step": 6130 + }, + { + "epoch": 1.06, + "grad_norm": 27.334786962517168, + "learning_rate": 2.68364782187818e-07, + "logits/chosen": -1.2877238988876343, + "logits/rejected": -1.229827642440796, + "logps/chosen": -210.6566619873047, + "logps/rejected": -323.015869140625, + "loss": 0.4396, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.5760023593902588, + "rewards/margins": 1.1295536756515503, + "rewards/rejected": -2.7055559158325195, + "step": 6140 + }, + { + "epoch": 1.06, + "grad_norm": 23.13483950909933, + "learning_rate": 2.6761493843818027e-07, + "logits/chosen": -1.24057936668396, + "logits/rejected": -1.1909449100494385, + "logps/chosen": -211.8172607421875, + "logps/rejected": -319.30926513671875, + "loss": 0.4605, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.5882833003997803, + "rewards/margins": 1.1051770448684692, + "rewards/rejected": -2.693460464477539, + "step": 6150 + }, + { + "epoch": 1.06, + "grad_norm": 24.089174586100004, + "learning_rate": 2.66864935395298e-07, + "logits/chosen": -1.1712977886199951, + "logits/rejected": -1.1290233135223389, + "logps/chosen": -205.40530395507812, + "logps/rejected": -303.6624450683594, + "loss": 0.4751, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.5445663928985596, + "rewards/margins": 0.9666827321052551, + "rewards/rejected": -2.51124906539917, + "step": 6160 + }, + { + "epoch": 1.06, + "grad_norm": 23.745589448999045, + "learning_rate": 2.661147798415063e-07, + "logits/chosen": -1.3031284809112549, + "logits/rejected": -1.2508373260498047, + "logps/chosen": -230.56103515625, + "logps/rejected": -357.1623840332031, + "loss": 0.4073, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.7110252380371094, + "rewards/margins": 1.2960346937179565, + "rewards/rejected": -3.0070605278015137, + "step": 6170 + }, + { + "epoch": 1.06, + "grad_norm": 23.45984720617789, + "learning_rate": 2.6536447856051964e-07, + "logits/chosen": -1.2978737354278564, + "logits/rejected": -1.2452610731124878, + "logps/chosen": -238.743408203125, + "logps/rejected": -339.7936706542969, + "loss": 0.4798, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -1.805580735206604, + "rewards/margins": 1.062530517578125, + "rewards/rejected": -2.8681111335754395, + "step": 6180 + }, + { + "epoch": 1.07, + "grad_norm": 24.825020766575623, + "learning_rate": 2.646140383373704e-07, + "logits/chosen": -1.304811716079712, + "logits/rejected": -1.2447645664215088, + "logps/chosen": -218.7591552734375, + "logps/rejected": -337.76025390625, + "loss": 0.394, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.6060594320297241, + "rewards/margins": 1.2416613101959229, + "rewards/rejected": -2.8477206230163574, + "step": 6190 + }, + { + "epoch": 1.07, + "grad_norm": 22.348892489607106, + "learning_rate": 2.6386346595834716e-07, + "logits/chosen": -1.2410696744918823, + "logits/rejected": -1.1765029430389404, + "logps/chosen": -213.6040802001953, + "logps/rejected": -335.80865478515625, + "loss": 0.4249, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.5910637378692627, + "rewards/margins": 1.2148936986923218, + "rewards/rejected": -2.805957317352295, + "step": 6200 + }, + { + "epoch": 1.07, + "eval_logits/chosen": -1.446341872215271, + "eval_logits/rejected": -1.418765902519226, + "eval_logps/chosen": -210.3596954345703, + "eval_logps/rejected": -255.79803466796875, + "eval_loss": 0.6216332912445068, + "eval_rewards/accuracies": 0.6656598448753357, + "eval_rewards/chosen": -1.5165584087371826, + "eval_rewards/margins": 0.40984830260276794, + "eval_rewards/rejected": -1.9264066219329834, + "eval_runtime": 356.8025, + "eval_samples_per_second": 12.063, + "eval_steps_per_second": 1.508, + "step": 6200 + }, + { + "epoch": 1.07, + "grad_norm": 34.90631179192167, + "learning_rate": 2.631127682109338e-07, + "logits/chosen": -1.3385263681411743, + "logits/rejected": -1.277630090713501, + "logps/chosen": -212.1560516357422, + "logps/rejected": -317.8212890625, + "loss": 0.4505, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.5736181735992432, + "rewards/margins": 1.0757099390029907, + "rewards/rejected": -2.6493279933929443, + "step": 6210 + }, + { + "epoch": 1.07, + "grad_norm": 28.897418625202086, + "learning_rate": 2.6236195188374797e-07, + "logits/chosen": -1.3002517223358154, + "logits/rejected": -1.2492867708206177, + "logps/chosen": -213.46377563476562, + "logps/rejected": -318.5871887207031, + "loss": 0.4586, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.6133520603179932, + "rewards/margins": 1.047262191772461, + "rewards/rejected": -2.660614252090454, + "step": 6220 + }, + { + "epoch": 1.07, + "grad_norm": 31.35968444075051, + "learning_rate": 2.616110237664793e-07, + "logits/chosen": -1.427841067314148, + "logits/rejected": -1.3521463871002197, + "logps/chosen": -208.4129180908203, + "logps/rejected": -360.6506042480469, + "loss": 0.3774, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.495847225189209, + "rewards/margins": 1.5452907085418701, + "rewards/rejected": -3.041138172149658, + "step": 6230 + }, + { + "epoch": 1.08, + "grad_norm": 29.527066343948846, + "learning_rate": 2.6085999064982873e-07, + "logits/chosen": -1.2126820087432861, + "logits/rejected": -1.143293023109436, + "logps/chosen": -223.4793701171875, + "logps/rejected": -341.6191711425781, + "loss": 0.4529, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.6712300777435303, + "rewards/margins": 1.2077951431274414, + "rewards/rejected": -2.879025459289551, + "step": 6240 + }, + { + "epoch": 1.08, + "grad_norm": 19.581487431832702, + "learning_rate": 2.601088593254465e-07, + "logits/chosen": -1.3335120677947998, + "logits/rejected": -1.2712721824645996, + "logps/chosen": -218.8231658935547, + "logps/rejected": -323.0040283203125, + "loss": 0.4925, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.6310733556747437, + "rewards/margins": 1.0603678226470947, + "rewards/rejected": -2.691441059112549, + "step": 6250 + }, + { + "epoch": 1.08, + "grad_norm": 36.934927120147826, + "learning_rate": 2.59357636585871e-07, + "logits/chosen": -1.232089877128601, + "logits/rejected": -1.1848324537277222, + "logps/chosen": -211.7543487548828, + "logps/rejected": -297.58697509765625, + "loss": 0.4911, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.578609824180603, + "rewards/margins": 0.8933493494987488, + "rewards/rejected": -2.471959352493286, + "step": 6260 + }, + { + "epoch": 1.08, + "grad_norm": 22.671481469710727, + "learning_rate": 2.5860632922446737e-07, + "logits/chosen": -1.5191317796707153, + "logits/rejected": -1.4832508563995361, + "logps/chosen": -200.9024658203125, + "logps/rejected": -307.0449523925781, + "loss": 0.4744, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.479931116104126, + "rewards/margins": 1.0443861484527588, + "rewards/rejected": -2.524317502975464, + "step": 6270 + }, + { + "epoch": 1.08, + "grad_norm": 22.47288317464536, + "learning_rate": 2.578549440353659e-07, + "logits/chosen": -1.2445075511932373, + "logits/rejected": -1.1983692646026611, + "logps/chosen": -185.7981414794922, + "logps/rejected": -284.7179260253906, + "loss": 0.4304, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.3140432834625244, + "rewards/margins": 1.0069011449813843, + "rewards/rejected": -2.320944309234619, + "step": 6280 + }, + { + "epoch": 1.08, + "grad_norm": 26.874037522701656, + "learning_rate": 2.571034878134007e-07, + "logits/chosen": -1.3063162565231323, + "logits/rejected": -1.2508251667022705, + "logps/chosen": -197.89691162109375, + "logps/rejected": -304.0697937011719, + "loss": 0.4268, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.465261459350586, + "rewards/margins": 1.0520577430725098, + "rewards/rejected": -2.5173192024230957, + "step": 6290 + }, + { + "epoch": 1.09, + "grad_norm": 34.25929825482152, + "learning_rate": 2.5635196735404816e-07, + "logits/chosen": -1.327014446258545, + "logits/rejected": -1.273252248764038, + "logps/chosen": -216.51626586914062, + "logps/rejected": -310.141845703125, + "loss": 0.4731, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.640974760055542, + "rewards/margins": 0.9403258562088013, + "rewards/rejected": -2.581300973892212, + "step": 6300 + }, + { + "epoch": 1.09, + "eval_logits/chosen": -1.4054533243179321, + "eval_logits/rejected": -1.3767914772033691, + "eval_logps/chosen": -229.14906311035156, + "eval_logps/rejected": -278.4627685546875, + "eval_loss": 0.6205867528915405, + "eval_rewards/accuracies": 0.6654275059700012, + "eval_rewards/chosen": -1.7044522762298584, + "eval_rewards/margins": 0.4486016631126404, + "eval_rewards/rejected": -2.1530539989471436, + "eval_runtime": 356.7832, + "eval_samples_per_second": 12.063, + "eval_steps_per_second": 1.508, + "step": 6300 + }, + { + "epoch": 1.09, + "grad_norm": 30.63136108992965, + "learning_rate": 2.5560038945336583e-07, + "logits/chosen": -1.2807663679122925, + "logits/rejected": -1.213196039199829, + "logps/chosen": -209.709716796875, + "logps/rejected": -319.9820861816406, + "loss": 0.4534, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.5926458835601807, + "rewards/margins": 1.0916332006454468, + "rewards/rejected": -2.684279203414917, + "step": 6310 + }, + { + "epoch": 1.09, + "grad_norm": 18.736489627828945, + "learning_rate": 2.548487609079305e-07, + "logits/chosen": -1.2793110609054565, + "logits/rejected": -1.2298452854156494, + "logps/chosen": -228.3084716796875, + "logps/rejected": -330.50390625, + "loss": 0.4929, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.7359955310821533, + "rewards/margins": 1.0360920429229736, + "rewards/rejected": -2.772087574005127, + "step": 6320 + }, + { + "epoch": 1.09, + "grad_norm": 24.122550114272617, + "learning_rate": 2.5409708851477687e-07, + "logits/chosen": -1.316935658454895, + "logits/rejected": -1.2505112886428833, + "logps/chosen": -206.8124237060547, + "logps/rejected": -340.0397644042969, + "loss": 0.3752, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.5354225635528564, + "rewards/margins": 1.3291146755218506, + "rewards/rejected": -2.864537477493286, + "step": 6330 + }, + { + "epoch": 1.09, + "grad_norm": 24.00191172899993, + "learning_rate": 2.533453790713363e-07, + "logits/chosen": -1.3309152126312256, + "logits/rejected": -1.2744200229644775, + "logps/chosen": -205.3987274169922, + "logps/rejected": -318.1818542480469, + "loss": 0.4432, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.4982860088348389, + "rewards/margins": 1.1549437046051025, + "rewards/rejected": -2.6532297134399414, + "step": 6340 + }, + { + "epoch": 1.09, + "grad_norm": 37.256774507075455, + "learning_rate": 2.5259363937537523e-07, + "logits/chosen": -1.2830774784088135, + "logits/rejected": -1.2399280071258545, + "logps/chosen": -215.0625, + "logps/rejected": -328.13604736328125, + "loss": 0.4306, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.5956099033355713, + "rewards/margins": 1.1229556798934937, + "rewards/rejected": -2.7185654640197754, + "step": 6350 + }, + { + "epoch": 1.1, + "grad_norm": 24.16860767130257, + "learning_rate": 2.5184187622493356e-07, + "logits/chosen": -1.2492659091949463, + "logits/rejected": -1.1924443244934082, + "logps/chosen": -213.53012084960938, + "logps/rejected": -354.2420959472656, + "loss": 0.3785, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.6007035970687866, + "rewards/margins": 1.3961331844329834, + "rewards/rejected": -2.9968369007110596, + "step": 6360 + }, + { + "epoch": 1.1, + "grad_norm": 30.09884792748387, + "learning_rate": 2.510900964182635e-07, + "logits/chosen": -1.2614082098007202, + "logits/rejected": -1.2243916988372803, + "logps/chosen": -221.001708984375, + "logps/rejected": -329.3047180175781, + "loss": 0.4579, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.6786903142929077, + "rewards/margins": 1.0699204206466675, + "rewards/rejected": -2.7486109733581543, + "step": 6370 + }, + { + "epoch": 1.1, + "grad_norm": 26.54672845989167, + "learning_rate": 2.503383067537674e-07, + "logits/chosen": -1.3264938592910767, + "logits/rejected": -1.2688075304031372, + "logps/chosen": -207.5281524658203, + "logps/rejected": -336.74346923828125, + "loss": 0.3866, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -1.5481336116790771, + "rewards/margins": 1.2540295124053955, + "rewards/rejected": -2.8021631240844727, + "step": 6380 + }, + { + "epoch": 1.1, + "grad_norm": 26.92277033670455, + "learning_rate": 2.495865140299374e-07, + "logits/chosen": -1.364383578300476, + "logits/rejected": -1.2942759990692139, + "logps/chosen": -213.89453125, + "logps/rejected": -337.72857666015625, + "loss": 0.4155, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.5782573223114014, + "rewards/margins": 1.2891353368759155, + "rewards/rejected": -2.8673927783966064, + "step": 6390 + }, + { + "epoch": 1.1, + "grad_norm": 26.35121867365253, + "learning_rate": 2.4883472504529284e-07, + "logits/chosen": -1.2807561159133911, + "logits/rejected": -1.2216075658798218, + "logps/chosen": -221.13613891601562, + "logps/rejected": -342.654296875, + "loss": 0.4089, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.6344629526138306, + "rewards/margins": 1.2270433902740479, + "rewards/rejected": -2.8615059852600098, + "step": 6400 + }, + { + "epoch": 1.1, + "eval_logits/chosen": -1.32829749584198, + "eval_logits/rejected": -1.2985448837280273, + "eval_logps/chosen": -253.03562927246094, + "eval_logps/rejected": -306.45611572265625, + "eval_loss": 0.6263204216957092, + "eval_rewards/accuracies": 0.6642658114433289, + "eval_rewards/chosen": -1.9433181285858154, + "eval_rewards/margins": 0.48966917395591736, + "eval_rewards/rejected": -2.4329869747161865, + "eval_runtime": 356.7188, + "eval_samples_per_second": 12.066, + "eval_steps_per_second": 1.508, + "step": 6400 + }, + { + "epoch": 1.1, + "grad_norm": 35.49752834329857, + "learning_rate": 2.480829465983194e-07, + "logits/chosen": -1.3197977542877197, + "logits/rejected": -1.2659125328063965, + "logps/chosen": -263.07965087890625, + "logps/rejected": -380.5269470214844, + "loss": 0.4862, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -2.0871684551239014, + "rewards/margins": 1.2009137868881226, + "rewards/rejected": -3.2880821228027344, + "step": 6410 + }, + { + "epoch": 1.11, + "grad_norm": 32.67436318673992, + "learning_rate": 2.473311854874075e-07, + "logits/chosen": -1.321010947227478, + "logits/rejected": -1.269235372543335, + "logps/chosen": -245.9833984375, + "logps/rejected": -353.6158142089844, + "loss": 0.4987, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.912111520767212, + "rewards/margins": 1.085077166557312, + "rewards/rejected": -2.9971888065338135, + "step": 6420 + }, + { + "epoch": 1.11, + "grad_norm": 23.752696598571593, + "learning_rate": 2.4657944851079076e-07, + "logits/chosen": -1.2947794198989868, + "logits/rejected": -1.2474058866500854, + "logps/chosen": -207.3226776123047, + "logps/rejected": -308.37408447265625, + "loss": 0.4638, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.5694684982299805, + "rewards/margins": 1.0322576761245728, + "rewards/rejected": -2.6017260551452637, + "step": 6430 + }, + { + "epoch": 1.11, + "grad_norm": 23.030575066990085, + "learning_rate": 2.458277424664845e-07, + "logits/chosen": -1.339413046836853, + "logits/rejected": -1.2803711891174316, + "logps/chosen": -209.6329345703125, + "logps/rejected": -334.5424499511719, + "loss": 0.3994, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.5639805793762207, + "rewards/margins": 1.2495237588882446, + "rewards/rejected": -2.813504219055176, + "step": 6440 + }, + { + "epoch": 1.11, + "grad_norm": 28.704711169531485, + "learning_rate": 2.450760741522244e-07, + "logits/chosen": -1.3053383827209473, + "logits/rejected": -1.2392146587371826, + "logps/chosen": -219.56298828125, + "logps/rejected": -331.5205078125, + "loss": 0.4508, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.632362723350525, + "rewards/margins": 1.160873532295227, + "rewards/rejected": -2.793236255645752, + "step": 6450 + }, + { + "epoch": 1.11, + "grad_norm": 32.40806253150715, + "learning_rate": 2.443244503654047e-07, + "logits/chosen": -1.2578837871551514, + "logits/rejected": -1.2153687477111816, + "logps/chosen": -222.15573120117188, + "logps/rejected": -370.3708190917969, + "loss": 0.3719, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.7027851343154907, + "rewards/margins": 1.4216923713684082, + "rewards/rejected": -3.1244776248931885, + "step": 6460 + }, + { + "epoch": 1.11, + "grad_norm": 27.914141548982084, + "learning_rate": 2.4357287790301755e-07, + "logits/chosen": -1.2337547540664673, + "logits/rejected": -1.178056001663208, + "logps/chosen": -217.49169921875, + "logps/rejected": -328.74554443359375, + "loss": 0.4362, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.6474285125732422, + "rewards/margins": 1.122659683227539, + "rewards/rejected": -2.7700884342193604, + "step": 6470 + }, + { + "epoch": 1.12, + "grad_norm": 31.48390506247595, + "learning_rate": 2.428213635615902e-07, + "logits/chosen": -1.3232189416885376, + "logits/rejected": -1.2600330114364624, + "logps/chosen": -222.95278930664062, + "logps/rejected": -329.962158203125, + "loss": 0.4351, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.6635528802871704, + "rewards/margins": 1.1314866542816162, + "rewards/rejected": -2.795039415359497, + "step": 6480 + }, + { + "epoch": 1.12, + "grad_norm": 20.030188531568935, + "learning_rate": 2.420699141371251e-07, + "logits/chosen": -1.4895018339157104, + "logits/rejected": -1.4343178272247314, + "logps/chosen": -234.48953247070312, + "logps/rejected": -364.4090881347656, + "loss": 0.4536, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.8089115619659424, + "rewards/margins": 1.2942219972610474, + "rewards/rejected": -3.1031336784362793, + "step": 6490 + }, + { + "epoch": 1.12, + "grad_norm": 27.831603619813574, + "learning_rate": 2.41318536425037e-07, + "logits/chosen": -1.3722602128982544, + "logits/rejected": -1.3266656398773193, + "logps/chosen": -211.3208770751953, + "logps/rejected": -318.9163818359375, + "loss": 0.4055, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.551661491394043, + "rewards/margins": 1.1104365587234497, + "rewards/rejected": -2.6620981693267822, + "step": 6500 + }, + { + "epoch": 1.12, + "eval_logits/chosen": -1.4495809078216553, + "eval_logits/rejected": -1.4227662086486816, + "eval_logps/chosen": -220.26852416992188, + "eval_logps/rejected": -266.0024108886719, + "eval_loss": 0.6262578964233398, + "eval_rewards/accuracies": 0.6656598448753357, + "eval_rewards/chosen": -1.6156466007232666, + "eval_rewards/margins": 0.4128037095069885, + "eval_rewards/rejected": -2.0284502506256104, + "eval_runtime": 356.7396, + "eval_samples_per_second": 12.065, + "eval_steps_per_second": 1.508, + "step": 6500 + }, + { + "epoch": 1.12, + "grad_norm": 30.446262958118353, + "learning_rate": 2.4056723722009243e-07, + "logits/chosen": -1.3711057901382446, + "logits/rejected": -1.291212797164917, + "logps/chosen": -215.440673828125, + "logps/rejected": -331.94805908203125, + "loss": 0.4238, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.5865962505340576, + "rewards/margins": 1.203033447265625, + "rewards/rejected": -2.7896294593811035, + "step": 6510 + }, + { + "epoch": 1.12, + "grad_norm": 22.937766573953056, + "learning_rate": 2.39816023316348e-07, + "logits/chosen": -1.3640129566192627, + "logits/rejected": -1.3016841411590576, + "logps/chosen": -203.5388641357422, + "logps/rejected": -329.3097839355469, + "loss": 0.3885, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.499146819114685, + "rewards/margins": 1.2792606353759766, + "rewards/rejected": -2.778407573699951, + "step": 6520 + }, + { + "epoch": 1.13, + "grad_norm": 20.155288956338, + "learning_rate": 2.3906490150708894e-07, + "logits/chosen": -1.3035330772399902, + "logits/rejected": -1.2258248329162598, + "logps/chosen": -207.00711059570312, + "logps/rejected": -357.16375732421875, + "loss": 0.3594, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.533954381942749, + "rewards/margins": 1.5234779119491577, + "rewards/rejected": -3.057432174682617, + "step": 6530 + }, + { + "epoch": 1.13, + "grad_norm": 38.1595216556655, + "learning_rate": 2.3831387858476739e-07, + "logits/chosen": -1.3005788326263428, + "logits/rejected": -1.2370128631591797, + "logps/chosen": -241.73861694335938, + "logps/rejected": -354.9060363769531, + "loss": 0.4724, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.8435420989990234, + "rewards/margins": 1.1428686380386353, + "rewards/rejected": -2.986410617828369, + "step": 6540 + }, + { + "epoch": 1.13, + "grad_norm": 32.72488280276923, + "learning_rate": 2.3756296134094176e-07, + "logits/chosen": -1.2309355735778809, + "logits/rejected": -1.1715677976608276, + "logps/chosen": -238.2613067626953, + "logps/rejected": -352.8631286621094, + "loss": 0.45, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.8673267364501953, + "rewards/margins": 1.1446090936660767, + "rewards/rejected": -3.0119359493255615, + "step": 6550 + }, + { + "epoch": 1.13, + "grad_norm": 21.509539455997288, + "learning_rate": 2.368121565662142e-07, + "logits/chosen": -1.372521162033081, + "logits/rejected": -1.3001985549926758, + "logps/chosen": -221.03701782226562, + "logps/rejected": -343.4317626953125, + "loss": 0.4243, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.6504312753677368, + "rewards/margins": 1.2656335830688477, + "rewards/rejected": -2.916064739227295, + "step": 6560 + }, + { + "epoch": 1.13, + "grad_norm": 26.05014427851963, + "learning_rate": 2.3606147105017037e-07, + "logits/chosen": -1.3940800428390503, + "logits/rejected": -1.323072910308838, + "logps/chosen": -222.31124877929688, + "logps/rejected": -340.05926513671875, + "loss": 0.4147, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.6877048015594482, + "rewards/margins": 1.1861732006072998, + "rewards/rejected": -2.873878002166748, + "step": 6570 + }, + { + "epoch": 1.13, + "grad_norm": 27.019728234474087, + "learning_rate": 2.3531091158131702e-07, + "logits/chosen": -1.4203673601150513, + "logits/rejected": -1.349675178527832, + "logps/chosen": -210.45614624023438, + "logps/rejected": -316.55548095703125, + "loss": 0.4356, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.5541622638702393, + "rewards/margins": 1.1116034984588623, + "rewards/rejected": -2.6657655239105225, + "step": 6580 + }, + { + "epoch": 1.14, + "grad_norm": 20.654019350018757, + "learning_rate": 2.3456048494702133e-07, + "logits/chosen": -1.360848069190979, + "logits/rejected": -1.2936543226242065, + "logps/chosen": -213.0439453125, + "logps/rejected": -346.81463623046875, + "loss": 0.4131, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.6021496057510376, + "rewards/margins": 1.3312013149261475, + "rewards/rejected": -2.9333510398864746, + "step": 6590 + }, + { + "epoch": 1.14, + "grad_norm": 39.76165527082194, + "learning_rate": 2.3381019793344897e-07, + "logits/chosen": -1.4293988943099976, + "logits/rejected": -1.3686877489089966, + "logps/chosen": -219.72946166992188, + "logps/rejected": -342.87799072265625, + "loss": 0.4373, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.6410694122314453, + "rewards/margins": 1.252076268196106, + "rewards/rejected": -2.893145799636841, + "step": 6600 + }, + { + "epoch": 1.14, + "eval_logits/chosen": -1.4153751134872437, + "eval_logits/rejected": -1.3869836330413818, + "eval_logps/chosen": -250.33335876464844, + "eval_logps/rejected": -302.05145263671875, + "eval_loss": 0.6318928003311157, + "eval_rewards/accuracies": 0.6614776849746704, + "eval_rewards/chosen": -1.9162949323654175, + "eval_rewards/margins": 0.4726457893848419, + "eval_rewards/rejected": -2.3889405727386475, + "eval_runtime": 356.6448, + "eval_samples_per_second": 12.068, + "eval_steps_per_second": 1.509, + "step": 6600 + }, + { + "epoch": 1.14, + "grad_norm": 18.9597145646942, + "learning_rate": 2.3306005732550337e-07, + "logits/chosen": -1.3483235836029053, + "logits/rejected": -1.287246584892273, + "logps/chosen": -246.5947265625, + "logps/rejected": -377.0596618652344, + "loss": 0.3971, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.9461562633514404, + "rewards/margins": 1.3069889545440674, + "rewards/rejected": -3.253145217895508, + "step": 6610 + }, + { + "epoch": 1.14, + "grad_norm": 34.30473163237968, + "learning_rate": 2.3231006990676365e-07, + "logits/chosen": -1.3247897624969482, + "logits/rejected": -1.2637712955474854, + "logps/chosen": -252.6153106689453, + "logps/rejected": -367.00933837890625, + "loss": 0.4838, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.0022428035736084, + "rewards/margins": 1.1463488340377808, + "rewards/rejected": -3.148591995239258, + "step": 6620 + }, + { + "epoch": 1.14, + "grad_norm": 27.70016442005097, + "learning_rate": 2.3156024245942394e-07, + "logits/chosen": -1.3318690061569214, + "logits/rejected": -1.2696187496185303, + "logps/chosen": -205.23562622070312, + "logps/rejected": -317.63372802734375, + "loss": 0.4009, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.5194008350372314, + "rewards/margins": 1.1651160717010498, + "rewards/rejected": -2.6845173835754395, + "step": 6630 + }, + { + "epoch": 1.14, + "grad_norm": 19.533684280783113, + "learning_rate": 2.3081058176423148e-07, + "logits/chosen": -1.4036105871200562, + "logits/rejected": -1.3376753330230713, + "logps/chosen": -224.6147003173828, + "logps/rejected": -337.85577392578125, + "loss": 0.4359, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.7151901721954346, + "rewards/margins": 1.1676080226898193, + "rewards/rejected": -2.8827977180480957, + "step": 6640 + }, + { + "epoch": 1.15, + "grad_norm": 25.725355566297086, + "learning_rate": 2.300610946004256e-07, + "logits/chosen": -1.449748158454895, + "logits/rejected": -1.3810780048370361, + "logps/chosen": -218.1935577392578, + "logps/rejected": -359.31268310546875, + "loss": 0.3906, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.6608730554580688, + "rewards/margins": 1.3988673686981201, + "rewards/rejected": -3.0597405433654785, + "step": 6650 + }, + { + "epoch": 1.15, + "grad_norm": 22.25653908814933, + "learning_rate": 2.2931178774567662e-07, + "logits/chosen": -1.4511274099349976, + "logits/rejected": -1.389211654663086, + "logps/chosen": -204.99295043945312, + "logps/rejected": -340.06597900390625, + "loss": 0.4019, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.5325791835784912, + "rewards/margins": 1.326608657836914, + "rewards/rejected": -2.859187602996826, + "step": 6660 + }, + { + "epoch": 1.15, + "grad_norm": 25.109705271958312, + "learning_rate": 2.285626679760239e-07, + "logits/chosen": -1.3574293851852417, + "logits/rejected": -1.300843596458435, + "logps/chosen": -223.3512420654297, + "logps/rejected": -382.12548828125, + "loss": 0.3859, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -1.7345237731933594, + "rewards/margins": 1.5338653326034546, + "rewards/rejected": -3.2683892250061035, + "step": 6670 + }, + { + "epoch": 1.15, + "grad_norm": 36.664785759024156, + "learning_rate": 2.278137420658154e-07, + "logits/chosen": -1.3482401371002197, + "logits/rejected": -1.2863205671310425, + "logps/chosen": -229.06997680664062, + "logps/rejected": -332.17108154296875, + "loss": 0.4906, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.7523906230926514, + "rewards/margins": 1.0489892959594727, + "rewards/rejected": -2.8013803958892822, + "step": 6680 + }, + { + "epoch": 1.15, + "grad_norm": 21.93167754697323, + "learning_rate": 2.270650167876456e-07, + "logits/chosen": -1.3556606769561768, + "logits/rejected": -1.294721245765686, + "logps/chosen": -201.72213745117188, + "logps/rejected": -337.44561767578125, + "loss": 0.3821, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.4457823038101196, + "rewards/margins": 1.3707009553909302, + "rewards/rejected": -2.81648325920105, + "step": 6690 + }, + { + "epoch": 1.15, + "grad_norm": 45.51069055649989, + "learning_rate": 2.2631649891229502e-07, + "logits/chosen": -1.3424584865570068, + "logits/rejected": -1.2963857650756836, + "logps/chosen": -230.5055694580078, + "logps/rejected": -343.5107421875, + "loss": 0.4568, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.7437101602554321, + "rewards/margins": 1.1325562000274658, + "rewards/rejected": -2.8762667179107666, + "step": 6700 + }, + { + "epoch": 1.15, + "eval_logits/chosen": -1.4418696165084839, + "eval_logits/rejected": -1.4138153791427612, + "eval_logps/chosen": -229.56253051757812, + "eval_logps/rejected": -278.3695983886719, + "eval_loss": 0.6346877813339233, + "eval_rewards/accuracies": 0.6575278639793396, + "eval_rewards/chosen": -1.7085868120193481, + "eval_rewards/margins": 0.44353532791137695, + "eval_rewards/rejected": -2.1521220207214355, + "eval_runtime": 356.6768, + "eval_samples_per_second": 12.067, + "eval_steps_per_second": 1.508, + "step": 6700 + }, + { + "epoch": 1.16, + "grad_norm": 25.6076080920141, + "learning_rate": 2.2556819520866828e-07, + "logits/chosen": -1.3505706787109375, + "logits/rejected": -1.2714554071426392, + "logps/chosen": -215.65658569335938, + "logps/rejected": -377.618408203125, + "loss": 0.3473, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.5876433849334717, + "rewards/margins": 1.653158187866211, + "rewards/rejected": -3.2408013343811035, + "step": 6710 + }, + { + "epoch": 1.16, + "grad_norm": 25.373104312017844, + "learning_rate": 2.2482011244373357e-07, + "logits/chosen": -1.3233754634857178, + "logits/rejected": -1.2596690654754639, + "logps/chosen": -218.81887817382812, + "logps/rejected": -363.94854736328125, + "loss": 0.3888, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.674155592918396, + "rewards/margins": 1.4498602151870728, + "rewards/rejected": -3.1240158081054688, + "step": 6720 + }, + { + "epoch": 1.16, + "grad_norm": 36.35012540803045, + "learning_rate": 2.2407225738246074e-07, + "logits/chosen": -1.2628940343856812, + "logits/rejected": -1.2141722440719604, + "logps/chosen": -245.4576416015625, + "logps/rejected": -356.5597229003906, + "loss": 0.4927, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.8878835439682007, + "rewards/margins": 1.1251258850097656, + "rewards/rejected": -3.013009548187256, + "step": 6730 + }, + { + "epoch": 1.16, + "grad_norm": 30.43443111435983, + "learning_rate": 2.233246367877609e-07, + "logits/chosen": -1.3312593698501587, + "logits/rejected": -1.2748968601226807, + "logps/chosen": -217.5796356201172, + "logps/rejected": -356.3081970214844, + "loss": 0.4072, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -1.6314865350723267, + "rewards/margins": 1.3706390857696533, + "rewards/rejected": -3.0021252632141113, + "step": 6740 + }, + { + "epoch": 1.16, + "grad_norm": 19.110570274965703, + "learning_rate": 2.2257725742042438e-07, + "logits/chosen": -1.3627091646194458, + "logits/rejected": -1.3030933141708374, + "logps/chosen": -224.2344512939453, + "logps/rejected": -367.8368225097656, + "loss": 0.4003, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.7180770635604858, + "rewards/margins": 1.4236048460006714, + "rewards/rejected": -3.1416821479797363, + "step": 6750 + }, + { + "epoch": 1.16, + "grad_norm": 48.40608306760337, + "learning_rate": 2.2183012603906066e-07, + "logits/chosen": -1.312281608581543, + "logits/rejected": -1.2430318593978882, + "logps/chosen": -216.72750854492188, + "logps/rejected": -338.98541259765625, + "loss": 0.4699, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.6467088460922241, + "rewards/margins": 1.255919337272644, + "rewards/rejected": -2.902627944946289, + "step": 6760 + }, + { + "epoch": 1.17, + "grad_norm": 41.406179970761336, + "learning_rate": 2.2108324940003606e-07, + "logits/chosen": -1.3574762344360352, + "logits/rejected": -1.3120397329330444, + "logps/chosen": -211.9196014404297, + "logps/rejected": -334.21173095703125, + "loss": 0.433, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.5811679363250732, + "rewards/margins": 1.2045514583587646, + "rewards/rejected": -2.7857189178466797, + "step": 6770 + }, + { + "epoch": 1.17, + "grad_norm": 35.196901180748505, + "learning_rate": 2.2033663425741378e-07, + "logits/chosen": -1.3661503791809082, + "logits/rejected": -1.2911349534988403, + "logps/chosen": -219.1353302001953, + "logps/rejected": -340.84149169921875, + "loss": 0.4354, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.6264568567276, + "rewards/margins": 1.2487623691558838, + "rewards/rejected": -2.8752193450927734, + "step": 6780 + }, + { + "epoch": 1.17, + "grad_norm": 30.125071463453715, + "learning_rate": 2.1959028736289184e-07, + "logits/chosen": -1.3736763000488281, + "logits/rejected": -1.3135449886322021, + "logps/chosen": -205.2973175048828, + "logps/rejected": -336.26531982421875, + "loss": 0.4184, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.5477122068405151, + "rewards/margins": 1.2838819026947021, + "rewards/rejected": -2.8315939903259277, + "step": 6790 + }, + { + "epoch": 1.17, + "grad_norm": 20.154181246091905, + "learning_rate": 2.1884421546574288e-07, + "logits/chosen": -1.2408316135406494, + "logits/rejected": -1.166013240814209, + "logps/chosen": -215.8163299560547, + "logps/rejected": -352.4253234863281, + "loss": 0.396, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -1.634212851524353, + "rewards/margins": 1.3892707824707031, + "rewards/rejected": -3.0234837532043457, + "step": 6800 + }, + { + "epoch": 1.17, + "eval_logits/chosen": -1.4073745012283325, + "eval_logits/rejected": -1.3791805505752563, + "eval_logps/chosen": -242.52587890625, + "eval_logps/rejected": -293.1243896484375, + "eval_loss": 0.6304371356964111, + "eval_rewards/accuracies": 0.669377326965332, + "eval_rewards/chosen": -1.8382201194763184, + "eval_rewards/margins": 0.4614499807357788, + "eval_rewards/rejected": -2.2996702194213867, + "eval_runtime": 356.7418, + "eval_samples_per_second": 12.065, + "eval_steps_per_second": 1.508, + "step": 6800 + }, + { + "epoch": 1.17, + "grad_norm": 40.48439094660076, + "learning_rate": 2.1809842531275234e-07, + "logits/chosen": -1.3060812950134277, + "logits/rejected": -1.2478914260864258, + "logps/chosen": -250.30685424804688, + "logps/rejected": -360.6299133300781, + "loss": 0.458, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.9459367990493774, + "rewards/margins": 1.1430813074111938, + "rewards/rejected": -3.0890181064605713, + "step": 6810 + }, + { + "epoch": 1.18, + "grad_norm": 22.745532083063072, + "learning_rate": 2.173529236481581e-07, + "logits/chosen": -1.3810464143753052, + "logits/rejected": -1.3168919086456299, + "logps/chosen": -248.2955322265625, + "logps/rejected": -380.9482421875, + "loss": 0.4335, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.9328635931015015, + "rewards/margins": 1.3468307256698608, + "rewards/rejected": -3.279694080352783, + "step": 6820 + }, + { + "epoch": 1.18, + "grad_norm": 29.05046249095613, + "learning_rate": 2.1660771721358898e-07, + "logits/chosen": -1.4409806728363037, + "logits/rejected": -1.3872400522232056, + "logps/chosen": -220.9757080078125, + "logps/rejected": -354.67041015625, + "loss": 0.4077, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.6707779169082642, + "rewards/margins": 1.3462539911270142, + "rewards/rejected": -3.0170319080352783, + "step": 6830 + }, + { + "epoch": 1.18, + "grad_norm": 26.092849893502574, + "learning_rate": 2.1586281274800433e-07, + "logits/chosen": -1.4010366201400757, + "logits/rejected": -1.3389381170272827, + "logps/chosen": -229.3477020263672, + "logps/rejected": -351.7111511230469, + "loss": 0.4488, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.7241207361221313, + "rewards/margins": 1.239141583442688, + "rewards/rejected": -2.9632620811462402, + "step": 6840 + }, + { + "epoch": 1.18, + "grad_norm": 31.755734616992513, + "learning_rate": 2.151182169876325e-07, + "logits/chosen": -1.3103221654891968, + "logits/rejected": -1.243399739265442, + "logps/chosen": -210.047607421875, + "logps/rejected": -330.6673278808594, + "loss": 0.438, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.559027075767517, + "rewards/margins": 1.2114872932434082, + "rewards/rejected": -2.7705142498016357, + "step": 6850 + }, + { + "epoch": 1.18, + "grad_norm": 23.745751034555475, + "learning_rate": 2.143739366659102e-07, + "logits/chosen": -1.4120018482208252, + "logits/rejected": -1.3412996530532837, + "logps/chosen": -229.89297485351562, + "logps/rejected": -333.34490966796875, + "loss": 0.4456, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.707727074623108, + "rewards/margins": 1.1025744676589966, + "rewards/rejected": -2.8103013038635254, + "step": 6860 + }, + { + "epoch": 1.18, + "grad_norm": 24.98697965563167, + "learning_rate": 2.1362997851342186e-07, + "logits/chosen": -1.300405502319336, + "logits/rejected": -1.2553117275238037, + "logps/chosen": -233.50424194335938, + "logps/rejected": -346.513671875, + "loss": 0.4593, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.8242870569229126, + "rewards/margins": 1.1232497692108154, + "rewards/rejected": -2.9475369453430176, + "step": 6870 + }, + { + "epoch": 1.19, + "grad_norm": 28.460821723801782, + "learning_rate": 2.1288634925783817e-07, + "logits/chosen": -1.3697658777236938, + "logits/rejected": -1.2964236736297607, + "logps/chosen": -220.921630859375, + "logps/rejected": -353.2587890625, + "loss": 0.3797, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -1.6666065454483032, + "rewards/margins": 1.3543717861175537, + "rewards/rejected": -3.0209782123565674, + "step": 6880 + }, + { + "epoch": 1.19, + "grad_norm": 28.677786254784447, + "learning_rate": 2.121430556238559e-07, + "logits/chosen": -1.3057619333267212, + "logits/rejected": -1.2380374670028687, + "logps/chosen": -215.5825958251953, + "logps/rejected": -367.3506774902344, + "loss": 0.3409, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -1.6328455209732056, + "rewards/margins": 1.516847014427185, + "rewards/rejected": -3.1496922969818115, + "step": 6890 + }, + { + "epoch": 1.19, + "grad_norm": 43.60415706308666, + "learning_rate": 2.1140010433313642e-07, + "logits/chosen": -1.3161351680755615, + "logits/rejected": -1.2549692392349243, + "logps/chosen": -244.4730682373047, + "logps/rejected": -367.92681884765625, + "loss": 0.4312, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.9093977212905884, + "rewards/margins": 1.235694169998169, + "rewards/rejected": -3.1450917720794678, + "step": 6900 + }, + { + "epoch": 1.19, + "eval_logits/chosen": -1.3852744102478027, + "eval_logits/rejected": -1.3564972877502441, + "eval_logps/chosen": -266.2965393066406, + "eval_logps/rejected": -320.25164794921875, + "eval_loss": 0.6330453157424927, + "eval_rewards/accuracies": 0.6644981503486633, + "eval_rewards/chosen": -2.0759267807006836, + "eval_rewards/margins": 0.4950157105922699, + "eval_rewards/rejected": -2.5709426403045654, + "eval_runtime": 356.8834, + "eval_samples_per_second": 12.06, + "eval_steps_per_second": 1.507, + "step": 6900 + }, + { + "epoch": 1.19, + "grad_norm": 33.10823667565031, + "learning_rate": 2.1065750210424572e-07, + "logits/chosen": -1.3516546487808228, + "logits/rejected": -1.280491828918457, + "logps/chosen": -246.9800262451172, + "logps/rejected": -387.73577880859375, + "loss": 0.4107, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.9011551141738892, + "rewards/margins": 1.4241554737091064, + "rewards/rejected": -3.3253109455108643, + "step": 6910 + }, + { + "epoch": 1.19, + "grad_norm": 31.592188794374742, + "learning_rate": 2.099152556525926e-07, + "logits/chosen": -1.4136111736297607, + "logits/rejected": -1.3561625480651855, + "logps/chosen": -256.72418212890625, + "logps/rejected": -363.986083984375, + "loss": 0.4662, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.0063765048980713, + "rewards/margins": 1.0958820581436157, + "rewards/rejected": -3.1022586822509766, + "step": 6920 + }, + { + "epoch": 1.19, + "grad_norm": 33.58926677055865, + "learning_rate": 2.0917337169036924e-07, + "logits/chosen": -1.2991350889205933, + "logits/rejected": -1.224484920501709, + "logps/chosen": -217.6683807373047, + "logps/rejected": -366.7851867675781, + "loss": 0.3657, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -1.6003036499023438, + "rewards/margins": 1.525007724761963, + "rewards/rejected": -3.1253113746643066, + "step": 6930 + }, + { + "epoch": 1.2, + "grad_norm": 40.25747613620437, + "learning_rate": 2.0843185692648911e-07, + "logits/chosen": -1.3118326663970947, + "logits/rejected": -1.2168563604354858, + "logps/chosen": -208.8804168701172, + "logps/rejected": -361.11444091796875, + "loss": 0.3828, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.509263038635254, + "rewards/margins": 1.596047043800354, + "rewards/rejected": -3.1053099632263184, + "step": 6940 + }, + { + "epoch": 1.2, + "grad_norm": 24.84940318081975, + "learning_rate": 2.076907180665276e-07, + "logits/chosen": -1.3450183868408203, + "logits/rejected": -1.2781970500946045, + "logps/chosen": -219.59927368164062, + "logps/rejected": -362.3084411621094, + "loss": 0.3846, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.657480239868164, + "rewards/margins": 1.4361209869384766, + "rewards/rejected": -3.0936012268066406, + "step": 6950 + }, + { + "epoch": 1.2, + "grad_norm": 30.803786620950167, + "learning_rate": 2.0694996181266027e-07, + "logits/chosen": -1.5233880281448364, + "logits/rejected": -1.4538953304290771, + "logps/chosen": -233.41616821289062, + "logps/rejected": -332.0858459472656, + "loss": 0.4962, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.781266450881958, + "rewards/margins": 1.062042474746704, + "rewards/rejected": -2.843308925628662, + "step": 6960 + }, + { + "epoch": 1.2, + "grad_norm": 27.683191395647043, + "learning_rate": 2.062095948636031e-07, + "logits/chosen": -1.4839107990264893, + "logits/rejected": -1.41178297996521, + "logps/chosen": -189.777587890625, + "logps/rejected": -317.9434509277344, + "loss": 0.3784, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -1.3524045944213867, + "rewards/margins": 1.3231462240219116, + "rewards/rejected": -2.675550937652588, + "step": 6970 + }, + { + "epoch": 1.2, + "grad_norm": 22.494868613952, + "learning_rate": 2.0546962391455128e-07, + "logits/chosen": -1.4198600053787231, + "logits/rejected": -1.3551172018051147, + "logps/chosen": -198.3609161376953, + "logps/rejected": -321.55157470703125, + "loss": 0.4203, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -1.4413096904754639, + "rewards/margins": 1.244518518447876, + "rewards/rejected": -2.685828447341919, + "step": 6980 + }, + { + "epoch": 1.2, + "grad_norm": 36.82272039938233, + "learning_rate": 2.0473005565711924e-07, + "logits/chosen": -1.335599422454834, + "logits/rejected": -1.2768608331680298, + "logps/chosen": -214.334716796875, + "logps/rejected": -331.76312255859375, + "loss": 0.4589, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.5734632015228271, + "rewards/margins": 1.201757550239563, + "rewards/rejected": -2.7752208709716797, + "step": 6990 + }, + { + "epoch": 1.21, + "grad_norm": 28.270864329924283, + "learning_rate": 2.039908967792795e-07, + "logits/chosen": -1.5961410999298096, + "logits/rejected": -1.521150827407837, + "logps/chosen": -228.6524200439453, + "logps/rejected": -364.3278503417969, + "loss": 0.4144, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.7237600088119507, + "rewards/margins": 1.3727467060089111, + "rewards/rejected": -3.0965065956115723, + "step": 7000 + }, + { + "epoch": 1.21, + "eval_logits/chosen": -1.5385133028030396, + "eval_logits/rejected": -1.5128390789031982, + "eval_logps/chosen": -213.44802856445312, + "eval_logps/rejected": -257.9127502441406, + "eval_loss": 0.630026638507843, + "eval_rewards/accuracies": 0.6586896181106567, + "eval_rewards/chosen": -1.547441840171814, + "eval_rewards/margins": 0.40011176466941833, + "eval_rewards/rejected": -1.9475535154342651, + "eval_runtime": 356.7968, + "eval_samples_per_second": 12.063, + "eval_steps_per_second": 1.508, + "step": 7000 + }, + { + "epoch": 1.21, + "grad_norm": 19.18466438304857, + "learning_rate": 2.0325215396530289e-07, + "logits/chosen": -1.4519102573394775, + "logits/rejected": -1.3836629390716553, + "logps/chosen": -213.21701049804688, + "logps/rejected": -340.19964599609375, + "loss": 0.4318, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.5678694248199463, + "rewards/margins": 1.305743932723999, + "rewards/rejected": -2.8736133575439453, + "step": 7010 + }, + { + "epoch": 1.21, + "grad_norm": 27.94478473053083, + "learning_rate": 2.025138338956974e-07, + "logits/chosen": -1.4114625453948975, + "logits/rejected": -1.349818229675293, + "logps/chosen": -198.73695373535156, + "logps/rejected": -307.34136962890625, + "loss": 0.445, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.4743351936340332, + "rewards/margins": 1.0755380392074585, + "rewards/rejected": -2.549873113632202, + "step": 7020 + }, + { + "epoch": 1.21, + "grad_norm": 41.933851784209644, + "learning_rate": 2.0177594324714838e-07, + "logits/chosen": -1.4608399868011475, + "logits/rejected": -1.397789716720581, + "logps/chosen": -205.079345703125, + "logps/rejected": -330.39312744140625, + "loss": 0.4361, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.519981026649475, + "rewards/margins": 1.2328134775161743, + "rewards/rejected": -2.7527945041656494, + "step": 7030 + }, + { + "epoch": 1.21, + "grad_norm": 27.891320371971226, + "learning_rate": 2.0103848869245764e-07, + "logits/chosen": -1.3869388103485107, + "logits/rejected": -1.3218698501586914, + "logps/chosen": -200.9859619140625, + "logps/rejected": -331.20611572265625, + "loss": 0.3926, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.4491612911224365, + "rewards/margins": 1.297973871231079, + "rewards/rejected": -2.7471349239349365, + "step": 7040 + }, + { + "epoch": 1.21, + "grad_norm": 24.036317739572887, + "learning_rate": 2.0030147690048374e-07, + "logits/chosen": -1.3576328754425049, + "logits/rejected": -1.3013206720352173, + "logps/chosen": -198.55532836914062, + "logps/rejected": -319.38226318359375, + "loss": 0.4474, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.4711755514144897, + "rewards/margins": 1.1891216039657593, + "rewards/rejected": -2.660297155380249, + "step": 7050 + }, + { + "epoch": 1.22, + "grad_norm": 36.02994677091849, + "learning_rate": 1.995649145360809e-07, + "logits/chosen": -1.4678010940551758, + "logits/rejected": -1.4199771881103516, + "logps/chosen": -212.34457397460938, + "logps/rejected": -316.0308532714844, + "loss": 0.4659, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.5913760662078857, + "rewards/margins": 1.0398099422454834, + "rewards/rejected": -2.631186008453369, + "step": 7060 + }, + { + "epoch": 1.22, + "grad_norm": 65.30902966804867, + "learning_rate": 1.988288082600392e-07, + "logits/chosen": -1.3991708755493164, + "logits/rejected": -1.337200403213501, + "logps/chosen": -218.6618194580078, + "logps/rejected": -320.5107421875, + "loss": 0.5368, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.640601396560669, + "rewards/margins": 1.0321669578552246, + "rewards/rejected": -2.6727681159973145, + "step": 7070 + }, + { + "epoch": 1.22, + "grad_norm": 29.064832805753056, + "learning_rate": 1.980931647290246e-07, + "logits/chosen": -1.4547747373580933, + "logits/rejected": -1.3819966316223145, + "logps/chosen": -202.14923095703125, + "logps/rejected": -324.9191589355469, + "loss": 0.4112, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.4868186712265015, + "rewards/margins": 1.2228963375091553, + "rewards/rejected": -2.709714889526367, + "step": 7080 + }, + { + "epoch": 1.22, + "grad_norm": 21.70626173606062, + "learning_rate": 1.97357990595518e-07, + "logits/chosen": -1.5178253650665283, + "logits/rejected": -1.4573280811309814, + "logps/chosen": -204.9860382080078, + "logps/rejected": -340.4731750488281, + "loss": 0.4111, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.5147212743759155, + "rewards/margins": 1.3682196140289307, + "rewards/rejected": -2.8829410076141357, + "step": 7090 + }, + { + "epoch": 1.22, + "grad_norm": 38.87350809965092, + "learning_rate": 1.9662329250775586e-07, + "logits/chosen": -1.3815619945526123, + "logits/rejected": -1.3237214088439941, + "logps/chosen": -205.8621826171875, + "logps/rejected": -319.4546203613281, + "loss": 0.4501, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.4879212379455566, + "rewards/margins": 1.1380326747894287, + "rewards/rejected": -2.6259539127349854, + "step": 7100 + }, + { + "epoch": 1.22, + "eval_logits/chosen": -1.4833621978759766, + "eval_logits/rejected": -1.4578860998153687, + "eval_logps/chosen": -215.61434936523438, + "eval_logps/rejected": -259.6932373046875, + "eval_loss": 0.6319575309753418, + "eval_rewards/accuracies": 0.6510223150253296, + "eval_rewards/chosen": -1.5691050291061401, + "eval_rewards/margins": 0.39625340700149536, + "eval_rewards/rejected": -1.9653586149215698, + "eval_runtime": 356.9358, + "eval_samples_per_second": 12.058, + "eval_steps_per_second": 1.507, + "step": 7100 + }, + { + "epoch": 1.23, + "grad_norm": 30.114994381758486, + "learning_rate": 1.9588907710966943e-07, + "logits/chosen": -1.3856322765350342, + "logits/rejected": -1.3147612810134888, + "logps/chosen": -195.54681396484375, + "logps/rejected": -312.0347595214844, + "loss": 0.4228, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.4208539724349976, + "rewards/margins": 1.1898590326309204, + "rewards/rejected": -2.610713243484497, + "step": 7110 + }, + { + "epoch": 1.23, + "grad_norm": 44.292714805439296, + "learning_rate": 1.951553510408252e-07, + "logits/chosen": -1.3800169229507446, + "logits/rejected": -1.3161985874176025, + "logps/chosen": -223.2345428466797, + "logps/rejected": -308.5866394042969, + "loss": 0.5205, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.612497329711914, + "rewards/margins": 0.9479106068611145, + "rewards/rejected": -2.560408115386963, + "step": 7120 + }, + { + "epoch": 1.23, + "grad_norm": 31.181918195201007, + "learning_rate": 1.944221209363643e-07, + "logits/chosen": -1.300041913986206, + "logits/rejected": -1.2450910806655884, + "logps/chosen": -205.53591918945312, + "logps/rejected": -322.80950927734375, + "loss": 0.4277, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.4920589923858643, + "rewards/margins": 1.2042802572250366, + "rewards/rejected": -2.6963393688201904, + "step": 7130 + }, + { + "epoch": 1.23, + "grad_norm": 33.82376669358963, + "learning_rate": 1.9368939342694328e-07, + "logits/chosen": -1.4221440553665161, + "logits/rejected": -1.382880449295044, + "logps/chosen": -190.2220458984375, + "logps/rejected": -294.0387878417969, + "loss": 0.4669, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.38909912109375, + "rewards/margins": 1.0321005582809448, + "rewards/rejected": -2.421199321746826, + "step": 7140 + }, + { + "epoch": 1.23, + "grad_norm": 41.46398078974056, + "learning_rate": 1.9295717513867324e-07, + "logits/chosen": -1.5011231899261475, + "logits/rejected": -1.4463145732879639, + "logps/chosen": -224.45938110351562, + "logps/rejected": -333.4562683105469, + "loss": 0.4627, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.6809113025665283, + "rewards/margins": 1.1032178401947021, + "rewards/rejected": -2.7841289043426514, + "step": 7150 + }, + { + "epoch": 1.23, + "grad_norm": 43.52157134513151, + "learning_rate": 1.9222547269306068e-07, + "logits/chosen": -1.415351152420044, + "logits/rejected": -1.3425204753875732, + "logps/chosen": -192.447021484375, + "logps/rejected": -316.6680908203125, + "loss": 0.4306, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.3883023262023926, + "rewards/margins": 1.2378036975860596, + "rewards/rejected": -2.6261062622070312, + "step": 7160 + }, + { + "epoch": 1.24, + "grad_norm": 28.198520340260067, + "learning_rate": 1.9149429270694705e-07, + "logits/chosen": -1.4002097845077515, + "logits/rejected": -1.3377676010131836, + "logps/chosen": -201.3988494873047, + "logps/rejected": -307.12530517578125, + "loss": 0.4632, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.4582065343856812, + "rewards/margins": 1.0752780437469482, + "rewards/rejected": -2.533484697341919, + "step": 7170 + }, + { + "epoch": 1.24, + "grad_norm": 36.940975044182615, + "learning_rate": 1.9076364179244937e-07, + "logits/chosen": -1.519090175628662, + "logits/rejected": -1.4556185007095337, + "logps/chosen": -192.38104248046875, + "logps/rejected": -318.6177673339844, + "loss": 0.3814, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.3707835674285889, + "rewards/margins": 1.2708237171173096, + "rewards/rejected": -2.6416070461273193, + "step": 7180 + }, + { + "epoch": 1.24, + "grad_norm": 29.51979419688918, + "learning_rate": 1.900335265568999e-07, + "logits/chosen": -1.2953803539276123, + "logits/rejected": -1.22904372215271, + "logps/chosen": -216.4315643310547, + "logps/rejected": -340.22052001953125, + "loss": 0.4171, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.637787103652954, + "rewards/margins": 1.2283713817596436, + "rewards/rejected": -2.8661584854125977, + "step": 7190 + }, + { + "epoch": 1.24, + "grad_norm": 43.92936652185145, + "learning_rate": 1.893039536027872e-07, + "logits/chosen": -1.2936763763427734, + "logits/rejected": -1.2286694049835205, + "logps/chosen": -226.12228393554688, + "logps/rejected": -365.41058349609375, + "loss": 0.4303, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.7303447723388672, + "rewards/margins": 1.3623626232147217, + "rewards/rejected": -3.092707395553589, + "step": 7200 + }, + { + "epoch": 1.24, + "eval_logits/chosen": -1.436883568763733, + "eval_logits/rejected": -1.4103857278823853, + "eval_logps/chosen": -236.11033630371094, + "eval_logps/rejected": -283.7571105957031, + "eval_loss": 0.632332980632782, + "eval_rewards/accuracies": 0.6538103818893433, + "eval_rewards/chosen": -1.7740648984909058, + "eval_rewards/margins": 0.431932270526886, + "eval_rewards/rejected": -2.2059972286224365, + "eval_runtime": 356.7528, + "eval_samples_per_second": 12.064, + "eval_steps_per_second": 1.508, + "step": 7200 + }, + { + "epoch": 1.24, + "grad_norm": 35.504557595434676, + "learning_rate": 1.885749295276955e-07, + "logits/chosen": -1.4118075370788574, + "logits/rejected": -1.362586498260498, + "logps/chosen": -243.8560028076172, + "logps/rejected": -341.7215576171875, + "loss": 0.4936, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.8682628870010376, + "rewards/margins": 1.0187652111053467, + "rewards/rejected": -2.887028217315674, + "step": 7210 + }, + { + "epoch": 1.24, + "grad_norm": 38.49322970172763, + "learning_rate": 1.8784646092424572e-07, + "logits/chosen": -1.2949811220169067, + "logits/rejected": -1.219310998916626, + "logps/chosen": -222.8941650390625, + "logps/rejected": -352.58172607421875, + "loss": 0.4373, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.6838912963867188, + "rewards/margins": 1.325777292251587, + "rewards/rejected": -3.0096685886383057, + "step": 7220 + }, + { + "epoch": 1.25, + "grad_norm": 28.566213465067236, + "learning_rate": 1.8711855438003543e-07, + "logits/chosen": -1.3604927062988281, + "logits/rejected": -1.2994263172149658, + "logps/chosen": -208.6522216796875, + "logps/rejected": -332.7591857910156, + "loss": 0.4031, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.5445417165756226, + "rewards/margins": 1.2535761594772339, + "rewards/rejected": -2.7981178760528564, + "step": 7230 + }, + { + "epoch": 1.25, + "grad_norm": 24.11769012327145, + "learning_rate": 1.8639121647757976e-07, + "logits/chosen": -1.3791191577911377, + "logits/rejected": -1.3320530652999878, + "logps/chosen": -218.4418182373047, + "logps/rejected": -334.22833251953125, + "loss": 0.4437, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.609986662864685, + "rewards/margins": 1.184805154800415, + "rewards/rejected": -2.7947916984558105, + "step": 7240 + }, + { + "epoch": 1.25, + "grad_norm": 28.06311795428957, + "learning_rate": 1.8566445379425116e-07, + "logits/chosen": -1.4544193744659424, + "logits/rejected": -1.3801645040512085, + "logps/chosen": -202.21969604492188, + "logps/rejected": -322.6747131347656, + "loss": 0.4007, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.4765223264694214, + "rewards/margins": 1.2214938402175903, + "rewards/rejected": -2.69801664352417, + "step": 7250 + }, + { + "epoch": 1.25, + "grad_norm": 28.029045217779828, + "learning_rate": 1.8493827290222068e-07, + "logits/chosen": -1.4240261316299438, + "logits/rejected": -1.3594194650650024, + "logps/chosen": -222.39035034179688, + "logps/rejected": -343.7870788574219, + "loss": 0.446, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.6872167587280273, + "rewards/margins": 1.2270071506500244, + "rewards/rejected": -2.9142239093780518, + "step": 7260 + }, + { + "epoch": 1.25, + "grad_norm": 33.35599192177956, + "learning_rate": 1.84212680368398e-07, + "logits/chosen": -1.4141993522644043, + "logits/rejected": -1.3509438037872314, + "logps/chosen": -217.01632690429688, + "logps/rejected": -327.6526794433594, + "loss": 0.4629, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.6272687911987305, + "rewards/margins": 1.1230875253677368, + "rewards/rejected": -2.7503561973571777, + "step": 7270 + }, + { + "epoch": 1.25, + "grad_norm": 38.40058672670466, + "learning_rate": 1.834876827543721e-07, + "logits/chosen": -1.4696061611175537, + "logits/rejected": -1.3933039903640747, + "logps/chosen": -214.4224090576172, + "logps/rejected": -348.7910461425781, + "loss": 0.4019, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.5783621072769165, + "rewards/margins": 1.3805334568023682, + "rewards/rejected": -2.958895444869995, + "step": 7280 + }, + { + "epoch": 1.26, + "grad_norm": 46.97048710720416, + "learning_rate": 1.8276328661635248e-07, + "logits/chosen": -1.2667840719223022, + "logits/rejected": -1.2175432443618774, + "logps/chosen": -230.58251953125, + "logps/rejected": -343.7873840332031, + "loss": 0.4282, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.757728934288025, + "rewards/margins": 1.1217601299285889, + "rewards/rejected": -2.879488945007324, + "step": 7290 + }, + { + "epoch": 1.26, + "grad_norm": 23.37837488393363, + "learning_rate": 1.8203949850510903e-07, + "logits/chosen": -1.1985424757003784, + "logits/rejected": -1.151474952697754, + "logps/chosen": -231.79354858398438, + "logps/rejected": -341.9132080078125, + "loss": 0.4717, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.8153043985366821, + "rewards/margins": 1.083820104598999, + "rewards/rejected": -2.8991243839263916, + "step": 7300 + }, + { + "epoch": 1.26, + "eval_logits/chosen": -1.4253735542297363, + "eval_logits/rejected": -1.3984841108322144, + "eval_logps/chosen": -244.4295196533203, + "eval_logps/rejected": -294.37445068359375, + "eval_loss": 0.6293808221817017, + "eval_rewards/accuracies": 0.6668215394020081, + "eval_rewards/chosen": -1.857256531715393, + "eval_rewards/margins": 0.45491406321525574, + "eval_rewards/rejected": -2.3121707439422607, + "eval_runtime": 357.0293, + "eval_samples_per_second": 12.055, + "eval_steps_per_second": 1.507, + "step": 7300 + }, + { + "epoch": 1.26, + "grad_norm": 32.28579120461566, + "learning_rate": 1.8131632496591348e-07, + "logits/chosen": -1.354773759841919, + "logits/rejected": -1.288698673248291, + "logps/chosen": -231.0813446044922, + "logps/rejected": -361.4246520996094, + "loss": 0.4156, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.7601534128189087, + "rewards/margins": 1.3353766202926636, + "rewards/rejected": -3.0955300331115723, + "step": 7310 + }, + { + "epoch": 1.26, + "grad_norm": 26.521159677348862, + "learning_rate": 1.8059377253847973e-07, + "logits/chosen": -1.374133825302124, + "logits/rejected": -1.314866304397583, + "logps/chosen": -226.6112823486328, + "logps/rejected": -341.32647705078125, + "loss": 0.478, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.72268545627594, + "rewards/margins": 1.179290533065796, + "rewards/rejected": -2.9019761085510254, + "step": 7320 + }, + { + "epoch": 1.26, + "grad_norm": 42.16253795931932, + "learning_rate": 1.7987184775690508e-07, + "logits/chosen": -1.2531036138534546, + "logits/rejected": -1.1840673685073853, + "logps/chosen": -223.2836151123047, + "logps/rejected": -365.69842529296875, + "loss": 0.3926, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.7025184631347656, + "rewards/margins": 1.438976526260376, + "rewards/rejected": -3.1414952278137207, + "step": 7330 + }, + { + "epoch": 1.26, + "grad_norm": 24.413820614134853, + "learning_rate": 1.7915055714961092e-07, + "logits/chosen": -1.3367866277694702, + "logits/rejected": -1.274552822113037, + "logps/chosen": -241.989501953125, + "logps/rejected": -350.8983459472656, + "loss": 0.463, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.8661409616470337, + "rewards/margins": 1.1037722826004028, + "rewards/rejected": -2.9699134826660156, + "step": 7340 + }, + { + "epoch": 1.27, + "grad_norm": 27.114867030708584, + "learning_rate": 1.7842990723928376e-07, + "logits/chosen": -1.4280154705047607, + "logits/rejected": -1.3533068895339966, + "logps/chosen": -203.71188354492188, + "logps/rejected": -342.64697265625, + "loss": 0.3656, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -1.4681717157363892, + "rewards/margins": 1.4406144618988037, + "rewards/rejected": -2.9087860584259033, + "step": 7350 + }, + { + "epoch": 1.27, + "grad_norm": 43.36655567777902, + "learning_rate": 1.7770990454281605e-07, + "logits/chosen": -1.3013639450073242, + "logits/rejected": -1.2412099838256836, + "logps/chosen": -235.2455291748047, + "logps/rejected": -366.5183410644531, + "loss": 0.4174, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.8094427585601807, + "rewards/margins": 1.334092140197754, + "rewards/rejected": -3.1435346603393555, + "step": 7360 + }, + { + "epoch": 1.27, + "grad_norm": 43.02629441915015, + "learning_rate": 1.7699055557124791e-07, + "logits/chosen": -1.2064440250396729, + "logits/rejected": -1.1509660482406616, + "logps/chosen": -230.65185546875, + "logps/rejected": -360.70745849609375, + "loss": 0.4244, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.7785717248916626, + "rewards/margins": 1.316173791885376, + "rewards/rejected": -3.094745635986328, + "step": 7370 + }, + { + "epoch": 1.27, + "grad_norm": 44.50263470615816, + "learning_rate": 1.7627186682970723e-07, + "logits/chosen": -1.269676923751831, + "logits/rejected": -1.2101144790649414, + "logps/chosen": -239.9287109375, + "logps/rejected": -365.6208190917969, + "loss": 0.4291, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.8677568435668945, + "rewards/margins": 1.2519527673721313, + "rewards/rejected": -3.1197097301483154, + "step": 7380 + }, + { + "epoch": 1.27, + "grad_norm": 58.270680983383755, + "learning_rate": 1.755538448173518e-07, + "logits/chosen": -1.2635023593902588, + "logits/rejected": -1.208660364151001, + "logps/chosen": -237.5902557373047, + "logps/rejected": -356.22052001953125, + "loss": 0.4469, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.8537395000457764, + "rewards/margins": 1.17844557762146, + "rewards/rejected": -3.0321850776672363, + "step": 7390 + }, + { + "epoch": 1.27, + "grad_norm": 19.321168110421592, + "learning_rate": 1.7483649602730987e-07, + "logits/chosen": -1.2944018840789795, + "logits/rejected": -1.2126576900482178, + "logps/chosen": -228.0514373779297, + "logps/rejected": -363.4169006347656, + "loss": 0.3908, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.7260242700576782, + "rewards/margins": 1.4026000499725342, + "rewards/rejected": -3.128624439239502, + "step": 7400 + }, + { + "epoch": 1.27, + "eval_logits/chosen": -1.4501019716262817, + "eval_logits/rejected": -1.423465609550476, + "eval_logps/chosen": -227.0261688232422, + "eval_logps/rejected": -274.0572204589844, + "eval_loss": 0.630664587020874, + "eval_rewards/accuracies": 0.6568308472633362, + "eval_rewards/chosen": -1.683223009109497, + "eval_rewards/margins": 0.4257754683494568, + "eval_rewards/rejected": -2.1089982986450195, + "eval_runtime": 357.4983, + "eval_samples_per_second": 12.039, + "eval_steps_per_second": 1.505, + "step": 7400 + }, + { + "epoch": 1.28, + "grad_norm": 45.11638691089592, + "learning_rate": 1.741198269466219e-07, + "logits/chosen": -1.2776044607162476, + "logits/rejected": -1.2083661556243896, + "logps/chosen": -218.230712890625, + "logps/rejected": -343.81890869140625, + "loss": 0.4103, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.6449800729751587, + "rewards/margins": 1.2632148265838623, + "rewards/rejected": -2.9081950187683105, + "step": 7410 + }, + { + "epoch": 1.28, + "grad_norm": 42.030823379925565, + "learning_rate": 1.7340384405618134e-07, + "logits/chosen": -1.2458035945892334, + "logits/rejected": -1.1925244331359863, + "logps/chosen": -207.9458770751953, + "logps/rejected": -318.7289123535156, + "loss": 0.4746, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.5179194211959839, + "rewards/margins": 1.1385493278503418, + "rewards/rejected": -2.6564688682556152, + "step": 7420 + }, + { + "epoch": 1.28, + "grad_norm": 31.232966428180802, + "learning_rate": 1.7268855383067683e-07, + "logits/chosen": -1.2855768203735352, + "logits/rejected": -1.2198007106781006, + "logps/chosen": -232.55398559570312, + "logps/rejected": -353.21661376953125, + "loss": 0.4445, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.7915750741958618, + "rewards/margins": 1.2219661474227905, + "rewards/rejected": -3.0135409832000732, + "step": 7430 + }, + { + "epoch": 1.28, + "grad_norm": 34.93396652920407, + "learning_rate": 1.7197396273853276e-07, + "logits/chosen": -1.4023360013961792, + "logits/rejected": -1.343386173248291, + "logps/chosen": -240.6739044189453, + "logps/rejected": -337.22491455078125, + "loss": 0.5101, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.8467438220977783, + "rewards/margins": 0.9919818043708801, + "rewards/rejected": -2.8387253284454346, + "step": 7440 + }, + { + "epoch": 1.28, + "grad_norm": 27.19873025751527, + "learning_rate": 1.7126007724185165e-07, + "logits/chosen": -1.5503208637237549, + "logits/rejected": -1.4830740690231323, + "logps/chosen": -199.37518310546875, + "logps/rejected": -305.76214599609375, + "loss": 0.4474, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.4345240592956543, + "rewards/margins": 1.0972298383712769, + "rewards/rejected": -2.5317540168762207, + "step": 7450 + }, + { + "epoch": 1.29, + "grad_norm": 27.77498676919186, + "learning_rate": 1.7054690379635477e-07, + "logits/chosen": -1.3472172021865845, + "logits/rejected": -1.3040322065353394, + "logps/chosen": -191.5806427001953, + "logps/rejected": -319.0374755859375, + "loss": 0.4022, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.4145309925079346, + "rewards/margins": 1.2461668252944946, + "rewards/rejected": -2.6606979370117188, + "step": 7460 + }, + { + "epoch": 1.29, + "grad_norm": 33.696980703475546, + "learning_rate": 1.698344488513247e-07, + "logits/chosen": -1.4441345930099487, + "logits/rejected": -1.3981006145477295, + "logps/chosen": -196.69949340820312, + "logps/rejected": -293.2108459472656, + "loss": 0.4734, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.42051100730896, + "rewards/margins": 0.998548150062561, + "rewards/rejected": -2.4190590381622314, + "step": 7470 + }, + { + "epoch": 1.29, + "grad_norm": 35.61398182570765, + "learning_rate": 1.691227188495461e-07, + "logits/chosen": -1.3656269311904907, + "logits/rejected": -1.313783884048462, + "logps/chosen": -199.72702026367188, + "logps/rejected": -283.9599609375, + "loss": 0.5024, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.4509938955307007, + "rewards/margins": 0.8966015577316284, + "rewards/rejected": -2.3475959300994873, + "step": 7480 + }, + { + "epoch": 1.29, + "grad_norm": 27.73975502601125, + "learning_rate": 1.684117202272485e-07, + "logits/chosen": -1.3349004983901978, + "logits/rejected": -1.287638545036316, + "logps/chosen": -206.85690307617188, + "logps/rejected": -317.2720031738281, + "loss": 0.4389, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.5430989265441895, + "rewards/margins": 1.098320484161377, + "rewards/rejected": -2.6414191722869873, + "step": 7490 + }, + { + "epoch": 1.29, + "grad_norm": 28.229250472829, + "learning_rate": 1.6770145941404696e-07, + "logits/chosen": -1.3574326038360596, + "logits/rejected": -1.2926125526428223, + "logps/chosen": -197.47386169433594, + "logps/rejected": -314.25897216796875, + "loss": 0.4618, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.4602476358413696, + "rewards/margins": 1.162480115890503, + "rewards/rejected": -2.622727632522583, + "step": 7500 + }, + { + "epoch": 1.29, + "eval_logits/chosen": -1.5060161352157593, + "eval_logits/rejected": -1.481170654296875, + "eval_logps/chosen": -211.69110107421875, + "eval_logps/rejected": -254.75897216796875, + "eval_loss": 0.627605676651001, + "eval_rewards/accuracies": 0.6531133651733398, + "eval_rewards/chosen": -1.5298728942871094, + "eval_rewards/margins": 0.38614320755004883, + "eval_rewards/rejected": -1.916015863418579, + "eval_runtime": 357.4881, + "eval_samples_per_second": 12.04, + "eval_steps_per_second": 1.505, + "step": 7500 + }, + { + "epoch": 1.29, + "grad_norm": 29.587971860400398, + "learning_rate": 1.669919428328847e-07, + "logits/chosen": -1.394683599472046, + "logits/rejected": -1.3351951837539673, + "logps/chosen": -214.37551879882812, + "logps/rejected": -310.74652099609375, + "loss": 0.4476, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.5710610151290894, + "rewards/margins": 1.0244272947311401, + "rewards/rejected": -2.5954883098602295, + "step": 7510 + }, + { + "epoch": 1.3, + "grad_norm": 25.2120659878252, + "learning_rate": 1.6628317689997498e-07, + "logits/chosen": -1.3550820350646973, + "logits/rejected": -1.3013880252838135, + "logps/chosen": -199.8297119140625, + "logps/rejected": -322.51141357421875, + "loss": 0.4, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -1.4692803621292114, + "rewards/margins": 1.217524766921997, + "rewards/rejected": -2.686805248260498, + "step": 7520 + }, + { + "epoch": 1.3, + "grad_norm": 17.580126618538177, + "learning_rate": 1.6557516802474247e-07, + "logits/chosen": -1.2875080108642578, + "logits/rejected": -1.237430453300476, + "logps/chosen": -204.1475830078125, + "logps/rejected": -334.02349853515625, + "loss": 0.4147, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.5195860862731934, + "rewards/margins": 1.2862619161605835, + "rewards/rejected": -2.8058478832244873, + "step": 7530 + }, + { + "epoch": 1.3, + "grad_norm": 24.405271336303738, + "learning_rate": 1.6486792260976618e-07, + "logits/chosen": -1.4056943655014038, + "logits/rejected": -1.3522775173187256, + "logps/chosen": -208.80911254882812, + "logps/rejected": -347.8929138183594, + "loss": 0.3834, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.5729515552520752, + "rewards/margins": 1.3793232440948486, + "rewards/rejected": -2.952274799346924, + "step": 7540 + }, + { + "epoch": 1.3, + "grad_norm": 29.108508549211408, + "learning_rate": 1.6416144705072072e-07, + "logits/chosen": -1.2879887819290161, + "logits/rejected": -1.2317047119140625, + "logps/chosen": -233.01010131835938, + "logps/rejected": -367.2439270019531, + "loss": 0.4572, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.8120092153549194, + "rewards/margins": 1.3423044681549072, + "rewards/rejected": -3.154313564300537, + "step": 7550 + }, + { + "epoch": 1.3, + "grad_norm": 42.99955943761263, + "learning_rate": 1.6345574773631898e-07, + "logits/chosen": -1.388718843460083, + "logits/rejected": -1.3253867626190186, + "logps/chosen": -227.2410888671875, + "logps/rejected": -355.32391357421875, + "loss": 0.4386, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.7468681335449219, + "rewards/margins": 1.2529178857803345, + "rewards/rejected": -2.999785900115967, + "step": 7560 + }, + { + "epoch": 1.3, + "grad_norm": 38.1555696234109, + "learning_rate": 1.6275083104825414e-07, + "logits/chosen": -1.3410319089889526, + "logits/rejected": -1.2803980112075806, + "logps/chosen": -247.3603515625, + "logps/rejected": -375.22528076171875, + "loss": 0.4186, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.9059759378433228, + "rewards/margins": 1.3241702318191528, + "rewards/rejected": -3.2301464080810547, + "step": 7570 + }, + { + "epoch": 1.31, + "grad_norm": 41.061661152022474, + "learning_rate": 1.6204670336114224e-07, + "logits/chosen": -1.2776286602020264, + "logits/rejected": -1.2231152057647705, + "logps/chosen": -242.25015258789062, + "logps/rejected": -361.8536682128906, + "loss": 0.4535, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.8624213933944702, + "rewards/margins": 1.2097980976104736, + "rewards/rejected": -3.0722193717956543, + "step": 7580 + }, + { + "epoch": 1.31, + "grad_norm": 19.57994826653458, + "learning_rate": 1.6134337104246395e-07, + "logits/chosen": -1.3166749477386475, + "logits/rejected": -1.225110650062561, + "logps/chosen": -244.9367218017578, + "logps/rejected": -403.8605041503906, + "loss": 0.3392, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -1.8664249181747437, + "rewards/margins": 1.658355474472046, + "rewards/rejected": -3.524780750274658, + "step": 7590 + }, + { + "epoch": 1.31, + "grad_norm": 46.40887691060314, + "learning_rate": 1.6064084045250786e-07, + "logits/chosen": -1.3110687732696533, + "logits/rejected": -1.2509706020355225, + "logps/chosen": -266.95465087890625, + "logps/rejected": -382.99432373046875, + "loss": 0.5019, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.144411563873291, + "rewards/margins": 1.1999857425689697, + "rewards/rejected": -3.3443970680236816, + "step": 7600 + }, + { + "epoch": 1.31, + "eval_logits/chosen": -1.4277472496032715, + "eval_logits/rejected": -1.4007766246795654, + "eval_logps/chosen": -242.9215087890625, + "eval_logps/rejected": -292.66485595703125, + "eval_loss": 0.6300765872001648, + "eval_rewards/accuracies": 0.6624070405960083, + "eval_rewards/chosen": -1.8421767950057983, + "eval_rewards/margins": 0.4528978765010834, + "eval_rewards/rejected": -2.295074462890625, + "eval_runtime": 357.4474, + "eval_samples_per_second": 12.041, + "eval_steps_per_second": 1.505, + "step": 7600 + }, + { + "epoch": 1.31, + "grad_norm": 27.39761729744123, + "learning_rate": 1.5993911794431197e-07, + "logits/chosen": -1.3395607471466064, + "logits/rejected": -1.275943398475647, + "logps/chosen": -216.5625457763672, + "logps/rejected": -342.125244140625, + "loss": 0.4343, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.63640558719635, + "rewards/margins": 1.2829220294952393, + "rewards/rejected": -2.9193274974823, + "step": 7610 + }, + { + "epoch": 1.31, + "grad_norm": 28.677466981159814, + "learning_rate": 1.5923820986360703e-07, + "logits/chosen": -1.4301960468292236, + "logits/rejected": -1.3750264644622803, + "logps/chosen": -204.70155334472656, + "logps/rejected": -306.2242431640625, + "loss": 0.4617, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.4719994068145752, + "rewards/margins": 1.0525176525115967, + "rewards/rejected": -2.524517059326172, + "step": 7620 + }, + { + "epoch": 1.31, + "grad_norm": 30.824980394711297, + "learning_rate": 1.585381225487588e-07, + "logits/chosen": -1.3620095252990723, + "logits/rejected": -1.3211814165115356, + "logps/chosen": -199.6609344482422, + "logps/rejected": -317.6810607910156, + "loss": 0.4326, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.49845290184021, + "rewards/margins": 1.1381967067718506, + "rewards/rejected": -2.6366496086120605, + "step": 7630 + }, + { + "epoch": 1.32, + "grad_norm": 40.129932359839145, + "learning_rate": 1.5783886233071074e-07, + "logits/chosen": -1.281798243522644, + "logits/rejected": -1.2172118425369263, + "logps/chosen": -226.0537872314453, + "logps/rejected": -354.72576904296875, + "loss": 0.423, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.6815427541732788, + "rewards/margins": 1.3418070077896118, + "rewards/rejected": -3.0233497619628906, + "step": 7640 + }, + { + "epoch": 1.32, + "grad_norm": 34.0153156124325, + "learning_rate": 1.5714043553292683e-07, + "logits/chosen": -1.3627344369888306, + "logits/rejected": -1.304088830947876, + "logps/chosen": -245.76119995117188, + "logps/rejected": -363.3336486816406, + "loss": 0.4816, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.8740276098251343, + "rewards/margins": 1.2018343210220337, + "rewards/rejected": -3.075861692428589, + "step": 7650 + }, + { + "epoch": 1.32, + "grad_norm": 27.126137701003756, + "learning_rate": 1.564428484713345e-07, + "logits/chosen": -1.36992609500885, + "logits/rejected": -1.2936866283416748, + "logps/chosen": -223.11538696289062, + "logps/rejected": -359.9754333496094, + "loss": 0.3757, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.6609264612197876, + "rewards/margins": 1.3914331197738647, + "rewards/rejected": -3.0523598194122314, + "step": 7660 + }, + { + "epoch": 1.32, + "grad_norm": 34.38929359333108, + "learning_rate": 1.5574610745426704e-07, + "logits/chosen": -1.3428263664245605, + "logits/rejected": -1.283569097518921, + "logps/chosen": -209.7373046875, + "logps/rejected": -315.8505859375, + "loss": 0.5005, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.5220474004745483, + "rewards/margins": 1.107168436050415, + "rewards/rejected": -2.629215955734253, + "step": 7670 + }, + { + "epoch": 1.32, + "grad_norm": 26.0642265781016, + "learning_rate": 1.5505021878240732e-07, + "logits/chosen": -1.413971185684204, + "logits/rejected": -1.3582481145858765, + "logps/chosen": -209.92529296875, + "logps/rejected": -324.2261047363281, + "loss": 0.4226, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.5928831100463867, + "rewards/margins": 1.1526020765304565, + "rewards/rejected": -2.745485305786133, + "step": 7680 + }, + { + "epoch": 1.32, + "grad_norm": 21.73140455009811, + "learning_rate": 1.543551887487301e-07, + "logits/chosen": -1.5031044483184814, + "logits/rejected": -1.4207048416137695, + "logps/chosen": -188.10305786132812, + "logps/rejected": -301.28973388671875, + "loss": 0.3942, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.3312842845916748, + "rewards/margins": 1.1913520097732544, + "rewards/rejected": -2.5226359367370605, + "step": 7690 + }, + { + "epoch": 1.33, + "grad_norm": 21.539140223042764, + "learning_rate": 1.5366102363844552e-07, + "logits/chosen": -1.389103889465332, + "logits/rejected": -1.3187581300735474, + "logps/chosen": -202.11195373535156, + "logps/rejected": -320.7894592285156, + "loss": 0.4239, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.4840246438980103, + "rewards/margins": 1.1871830224990845, + "rewards/rejected": -2.6712074279785156, + "step": 7700 + }, + { + "epoch": 1.33, + "eval_logits/chosen": -1.4800820350646973, + "eval_logits/rejected": -1.4540327787399292, + "eval_logps/chosen": -219.68116760253906, + "eval_logps/rejected": -265.55712890625, + "eval_loss": 0.6266021728515625, + "eval_rewards/accuracies": 0.663336455821991, + "eval_rewards/chosen": -1.6097729206085205, + "eval_rewards/margins": 0.414224237203598, + "eval_rewards/rejected": -2.0239975452423096, + "eval_runtime": 357.1774, + "eval_samples_per_second": 12.05, + "eval_steps_per_second": 1.506, + "step": 7700 + }, + { + "epoch": 1.33, + "grad_norm": 30.088144432586212, + "learning_rate": 1.5296772972894212e-07, + "logits/chosen": -1.4096615314483643, + "logits/rejected": -1.3569542169570923, + "logps/chosen": -206.3859100341797, + "logps/rejected": -320.184326171875, + "loss": 0.4, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -1.5484488010406494, + "rewards/margins": 1.1439803838729858, + "rewards/rejected": -2.6924290657043457, + "step": 7710 + }, + { + "epoch": 1.33, + "grad_norm": 40.080885647002745, + "learning_rate": 1.5227531328972995e-07, + "logits/chosen": -1.3759911060333252, + "logits/rejected": -1.3137165307998657, + "logps/chosen": -219.1600799560547, + "logps/rejected": -328.873046875, + "loss": 0.4476, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.6303611993789673, + "rewards/margins": 1.1502439975738525, + "rewards/rejected": -2.7806053161621094, + "step": 7720 + }, + { + "epoch": 1.33, + "grad_norm": 35.80033306298759, + "learning_rate": 1.5158378058238442e-07, + "logits/chosen": -1.3037515878677368, + "logits/rejected": -1.245792031288147, + "logps/chosen": -219.6901397705078, + "logps/rejected": -339.2084655761719, + "loss": 0.4158, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.657705545425415, + "rewards/margins": 1.2117184400558472, + "rewards/rejected": -2.8694233894348145, + "step": 7730 + }, + { + "epoch": 1.33, + "grad_norm": 33.772757774329904, + "learning_rate": 1.5089313786048885e-07, + "logits/chosen": -1.282684564590454, + "logits/rejected": -1.222401738166809, + "logps/chosen": -228.92276000976562, + "logps/rejected": -378.69622802734375, + "loss": 0.3819, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.7724063396453857, + "rewards/margins": 1.4769179821014404, + "rewards/rejected": -3.2493247985839844, + "step": 7740 + }, + { + "epoch": 1.34, + "grad_norm": 28.25568491451203, + "learning_rate": 1.5020339136957877e-07, + "logits/chosen": -1.3118457794189453, + "logits/rejected": -1.2334351539611816, + "logps/chosen": -243.1823272705078, + "logps/rejected": -393.16705322265625, + "loss": 0.3772, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.892716407775879, + "rewards/margins": 1.5171799659729004, + "rewards/rejected": -3.4098963737487793, + "step": 7750 + }, + { + "epoch": 1.34, + "grad_norm": 34.5208633023336, + "learning_rate": 1.4951454734708458e-07, + "logits/chosen": -1.2015626430511475, + "logits/rejected": -1.1355557441711426, + "logps/chosen": -221.8160400390625, + "logps/rejected": -374.62139892578125, + "loss": 0.3608, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.7132511138916016, + "rewards/margins": 1.516405701637268, + "rewards/rejected": -3.229656934738159, + "step": 7760 + }, + { + "epoch": 1.34, + "grad_norm": 32.49208419213271, + "learning_rate": 1.4882661202227597e-07, + "logits/chosen": -1.256168007850647, + "logits/rejected": -1.1954753398895264, + "logps/chosen": -244.3036346435547, + "logps/rejected": -356.46368408203125, + "loss": 0.4625, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.9218246936798096, + "rewards/margins": 1.1179141998291016, + "rewards/rejected": -3.039738655090332, + "step": 7770 + }, + { + "epoch": 1.34, + "grad_norm": 33.49580205257945, + "learning_rate": 1.48139591616205e-07, + "logits/chosen": -1.3774831295013428, + "logits/rejected": -1.3217271566390991, + "logps/chosen": -245.13168334960938, + "logps/rejected": -392.8816223144531, + "loss": 0.3819, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -1.9360601902008057, + "rewards/margins": 1.4542224407196045, + "rewards/rejected": -3.390282392501831, + "step": 7780 + }, + { + "epoch": 1.34, + "grad_norm": 31.546470528457153, + "learning_rate": 1.4745349234165016e-07, + "logits/chosen": -1.318555235862732, + "logits/rejected": -1.2555335760116577, + "logps/chosen": -245.47787475585938, + "logps/rejected": -402.24993896484375, + "loss": 0.36, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.9392179250717163, + "rewards/margins": 1.5643069744110107, + "rewards/rejected": -3.5035252571105957, + "step": 7790 + }, + { + "epoch": 1.34, + "grad_norm": 33.94781782883591, + "learning_rate": 1.4676832040305984e-07, + "logits/chosen": -1.3638694286346436, + "logits/rejected": -1.3124583959579468, + "logps/chosen": -240.8196258544922, + "logps/rejected": -377.0190124511719, + "loss": 0.4156, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.868431806564331, + "rewards/margins": 1.3547109365463257, + "rewards/rejected": -3.223142623901367, + "step": 7800 + }, + { + "epoch": 1.34, + "eval_logits/chosen": -1.3900244235992432, + "eval_logits/rejected": -1.3618897199630737, + "eval_logps/chosen": -258.3906555175781, + "eval_logps/rejected": -311.4806823730469, + "eval_loss": 0.6327470541000366, + "eval_rewards/accuracies": 0.6638011336326599, + "eval_rewards/chosen": -1.9968681335449219, + "eval_rewards/margins": 0.4863649308681488, + "eval_rewards/rejected": -2.4832329750061035, + "eval_runtime": 355.9596, + "eval_samples_per_second": 12.091, + "eval_steps_per_second": 1.511, + "step": 7800 + }, + { + "epoch": 1.35, + "grad_norm": 53.82974487314594, + "learning_rate": 1.4608408199649686e-07, + "logits/chosen": -1.3559496402740479, + "logits/rejected": -1.285172462463379, + "logps/chosen": -246.6414031982422, + "logps/rejected": -362.58270263671875, + "loss": 0.4629, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.8763376474380493, + "rewards/margins": 1.2023292779922485, + "rewards/rejected": -3.078667163848877, + "step": 7810 + }, + { + "epoch": 1.35, + "grad_norm": 40.516669727296595, + "learning_rate": 1.4540078330958167e-07, + "logits/chosen": -1.336315393447876, + "logits/rejected": -1.2665674686431885, + "logps/chosen": -243.50302124023438, + "logps/rejected": -392.2691650390625, + "loss": 0.4179, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.8713070154190063, + "rewards/margins": 1.4982094764709473, + "rewards/rejected": -3.369516372680664, + "step": 7820 + }, + { + "epoch": 1.35, + "grad_norm": 32.90176835421052, + "learning_rate": 1.4471843052143696e-07, + "logits/chosen": -1.3154162168502808, + "logits/rejected": -1.2652655839920044, + "logps/chosen": -231.88034057617188, + "logps/rejected": -358.99169921875, + "loss": 0.4525, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.821041464805603, + "rewards/margins": 1.234797716140747, + "rewards/rejected": -3.0558390617370605, + "step": 7830 + }, + { + "epoch": 1.35, + "grad_norm": 28.22131039387213, + "learning_rate": 1.440370298026315e-07, + "logits/chosen": -1.2927907705307007, + "logits/rejected": -1.23340904712677, + "logps/chosen": -216.0460662841797, + "logps/rejected": -339.61273193359375, + "loss": 0.412, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.6249713897705078, + "rewards/margins": 1.2290581464767456, + "rewards/rejected": -2.8540291786193848, + "step": 7840 + }, + { + "epoch": 1.35, + "grad_norm": 36.753366417631874, + "learning_rate": 1.4335658731512451e-07, + "logits/chosen": -1.301358699798584, + "logits/rejected": -1.2169835567474365, + "logps/chosen": -216.15402221679688, + "logps/rejected": -345.2431945800781, + "loss": 0.3962, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.6059744358062744, + "rewards/margins": 1.3626439571380615, + "rewards/rejected": -2.968618392944336, + "step": 7850 + }, + { + "epoch": 1.35, + "grad_norm": 25.514074332943455, + "learning_rate": 1.4267710921220973e-07, + "logits/chosen": -1.3115109205245972, + "logits/rejected": -1.2281205654144287, + "logps/chosen": -219.056396484375, + "logps/rejected": -366.49932861328125, + "loss": 0.3504, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.6618436574935913, + "rewards/margins": 1.4767673015594482, + "rewards/rejected": -3.138611078262329, + "step": 7860 + }, + { + "epoch": 1.36, + "grad_norm": 51.13665015949875, + "learning_rate": 1.4199860163846007e-07, + "logits/chosen": -1.3125016689300537, + "logits/rejected": -1.251068353652954, + "logps/chosen": -239.65835571289062, + "logps/rejected": -367.1151428222656, + "loss": 0.4608, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.8398945331573486, + "rewards/margins": 1.2877471446990967, + "rewards/rejected": -3.1276419162750244, + "step": 7870 + }, + { + "epoch": 1.36, + "grad_norm": 46.23192498635243, + "learning_rate": 1.4132107072967165e-07, + "logits/chosen": -1.3768285512924194, + "logits/rejected": -1.3229854106903076, + "logps/chosen": -240.4175567626953, + "logps/rejected": -354.0284118652344, + "loss": 0.4709, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.8730905055999756, + "rewards/margins": 1.1423285007476807, + "rewards/rejected": -3.0154192447662354, + "step": 7880 + }, + { + "epoch": 1.36, + "grad_norm": 32.40033693942687, + "learning_rate": 1.406445226128088e-07, + "logits/chosen": -1.340899109840393, + "logits/rejected": -1.2813217639923096, + "logps/chosen": -226.41830444335938, + "logps/rejected": -349.4949645996094, + "loss": 0.4501, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.720557451248169, + "rewards/margins": 1.221872329711914, + "rewards/rejected": -2.942429542541504, + "step": 7890 + }, + { + "epoch": 1.36, + "grad_norm": 38.482928631188805, + "learning_rate": 1.399689634059479e-07, + "logits/chosen": -1.3165191411972046, + "logits/rejected": -1.2694923877716064, + "logps/chosen": -227.16726684570312, + "logps/rejected": -357.72381591796875, + "loss": 0.418, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.7499094009399414, + "rewards/margins": 1.291032075881958, + "rewards/rejected": -3.0409417152404785, + "step": 7900 + }, + { + "epoch": 1.36, + "eval_logits/chosen": -1.4474836587905884, + "eval_logits/rejected": -1.420722484588623, + "eval_logps/chosen": -235.39988708496094, + "eval_logps/rejected": -283.75970458984375, + "eval_loss": 0.6320576071739197, + "eval_rewards/accuracies": 0.6577602028846741, + "eval_rewards/chosen": -1.7669605016708374, + "eval_rewards/margins": 0.4390629529953003, + "eval_rewards/rejected": -2.206023693084717, + "eval_runtime": 357.7699, + "eval_samples_per_second": 12.03, + "eval_steps_per_second": 1.504, + "step": 7900 + }, + { + "epoch": 1.36, + "grad_norm": 43.10989046415876, + "learning_rate": 1.3929439921822334e-07, + "logits/chosen": -1.3463201522827148, + "logits/rejected": -1.282036542892456, + "logps/chosen": -232.250732421875, + "logps/rejected": -347.2474060058594, + "loss": 0.4703, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.7544885873794556, + "rewards/margins": 1.1816383600234985, + "rewards/rejected": -2.936127185821533, + "step": 7910 + }, + { + "epoch": 1.36, + "grad_norm": 22.98150967323039, + "learning_rate": 1.3862083614977067e-07, + "logits/chosen": -1.3695622682571411, + "logits/rejected": -1.3146297931671143, + "logps/chosen": -206.7174072265625, + "logps/rejected": -311.8243103027344, + "loss": 0.4652, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.519517421722412, + "rewards/margins": 1.0840694904327393, + "rewards/rejected": -2.6035869121551514, + "step": 7920 + }, + { + "epoch": 1.37, + "grad_norm": 26.732976356570454, + "learning_rate": 1.3794828029167267e-07, + "logits/chosen": -1.4295904636383057, + "logits/rejected": -1.3580360412597656, + "logps/chosen": -213.1838836669922, + "logps/rejected": -333.4698181152344, + "loss": 0.4051, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.5487329959869385, + "rewards/margins": 1.2622630596160889, + "rewards/rejected": -2.8109960556030273, + "step": 7930 + }, + { + "epoch": 1.37, + "grad_norm": 39.35835071533213, + "learning_rate": 1.3727673772590376e-07, + "logits/chosen": -1.3716719150543213, + "logits/rejected": -1.3115837574005127, + "logps/chosen": -209.64309692382812, + "logps/rejected": -334.11761474609375, + "loss": 0.4165, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.5110173225402832, + "rewards/margins": 1.2777800559997559, + "rewards/rejected": -2.788797378540039, + "step": 7940 + }, + { + "epoch": 1.37, + "grad_norm": 23.363749701913548, + "learning_rate": 1.3660621452527505e-07, + "logits/chosen": -1.308699369430542, + "logits/rejected": -1.2606542110443115, + "logps/chosen": -190.32920837402344, + "logps/rejected": -315.5460205078125, + "loss": 0.4323, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.3881752490997314, + "rewards/margins": 1.234622597694397, + "rewards/rejected": -2.6227974891662598, + "step": 7950 + }, + { + "epoch": 1.37, + "grad_norm": 30.69306170978758, + "learning_rate": 1.3593671675337954e-07, + "logits/chosen": -1.335451364517212, + "logits/rejected": -1.2736941576004028, + "logps/chosen": -200.57716369628906, + "logps/rejected": -317.3564453125, + "loss": 0.4216, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.5011732578277588, + "rewards/margins": 1.1477513313293457, + "rewards/rejected": -2.6489245891571045, + "step": 7960 + }, + { + "epoch": 1.37, + "grad_norm": 30.826633710533333, + "learning_rate": 1.3526825046453706e-07, + "logits/chosen": -1.3753823041915894, + "logits/rejected": -1.3085488080978394, + "logps/chosen": -217.94619750976562, + "logps/rejected": -333.4029846191406, + "loss": 0.4507, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.6257708072662354, + "rewards/margins": 1.1876569986343384, + "rewards/rejected": -2.8134281635284424, + "step": 7970 + }, + { + "epoch": 1.37, + "grad_norm": 32.72955968672792, + "learning_rate": 1.3460082170373987e-07, + "logits/chosen": -1.398342490196228, + "logits/rejected": -1.3425318002700806, + "logps/chosen": -230.95849609375, + "logps/rejected": -356.80609130859375, + "loss": 0.4128, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.740744948387146, + "rewards/margins": 1.2664308547973633, + "rewards/rejected": -3.0071756839752197, + "step": 7980 + }, + { + "epoch": 1.38, + "grad_norm": 37.50249892999267, + "learning_rate": 1.339344365065973e-07, + "logits/chosen": -1.3826172351837158, + "logits/rejected": -1.3275426626205444, + "logps/chosen": -234.38742065429688, + "logps/rejected": -364.55828857421875, + "loss": 0.4381, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.8373771905899048, + "rewards/margins": 1.2917721271514893, + "rewards/rejected": -3.1291489601135254, + "step": 7990 + }, + { + "epoch": 1.38, + "grad_norm": 35.27460341174213, + "learning_rate": 1.3326910089928246e-07, + "logits/chosen": -1.2450647354125977, + "logits/rejected": -1.1912448406219482, + "logps/chosen": -227.9076385498047, + "logps/rejected": -362.02349853515625, + "loss": 0.4084, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.7342026233673096, + "rewards/margins": 1.3236587047576904, + "rewards/rejected": -3.057861804962158, + "step": 8000 + }, + { + "epoch": 1.38, + "eval_logits/chosen": -1.4088190793991089, + "eval_logits/rejected": -1.381564974784851, + "eval_logps/chosen": -247.23068237304688, + "eval_logps/rejected": -297.6674499511719, + "eval_loss": 0.631807804107666, + "eval_rewards/accuracies": 0.6638011336326599, + "eval_rewards/chosen": -1.8852684497833252, + "eval_rewards/margins": 0.45983266830444336, + "eval_rewards/rejected": -2.3451011180877686, + "eval_runtime": 357.6561, + "eval_samples_per_second": 12.034, + "eval_steps_per_second": 1.504, + "step": 8000 + }, + { + "epoch": 1.38, + "grad_norm": 33.54268692406238, + "learning_rate": 1.3260482089847603e-07, + "logits/chosen": -1.2896820306777954, + "logits/rejected": -1.2158801555633545, + "logps/chosen": -234.7473907470703, + "logps/rejected": -369.5241394042969, + "loss": 0.4207, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.7761294841766357, + "rewards/margins": 1.3950544595718384, + "rewards/rejected": -3.1711838245391846, + "step": 8010 + }, + { + "epoch": 1.38, + "grad_norm": 28.405016653483855, + "learning_rate": 1.3194160251131365e-07, + "logits/chosen": -1.3419923782348633, + "logits/rejected": -1.257868766784668, + "logps/chosen": -242.85787963867188, + "logps/rejected": -379.0827331542969, + "loss": 0.4081, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.826235055923462, + "rewards/margins": 1.417443037033081, + "rewards/rejected": -3.243677854537964, + "step": 8020 + }, + { + "epoch": 1.38, + "grad_norm": 39.15936163505855, + "learning_rate": 1.3127945173532988e-07, + "logits/chosen": -1.3448692560195923, + "logits/rejected": -1.284053087234497, + "logps/chosen": -212.48580932617188, + "logps/rejected": -355.14935302734375, + "loss": 0.4275, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.6137079000473022, + "rewards/margins": 1.4035985469818115, + "rewards/rejected": -3.017306327819824, + "step": 8030 + }, + { + "epoch": 1.39, + "grad_norm": 33.82521625803319, + "learning_rate": 1.3061837455840538e-07, + "logits/chosen": -1.3016248941421509, + "logits/rejected": -1.2252373695373535, + "logps/chosen": -226.1641082763672, + "logps/rejected": -372.94171142578125, + "loss": 0.3642, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.7177082300186157, + "rewards/margins": 1.4918298721313477, + "rewards/rejected": -3.209538221359253, + "step": 8040 + }, + { + "epoch": 1.39, + "grad_norm": 23.926808236247748, + "learning_rate": 1.2995837695871188e-07, + "logits/chosen": -1.3715155124664307, + "logits/rejected": -1.3059993982315063, + "logps/chosen": -211.1027374267578, + "logps/rejected": -356.53533935546875, + "loss": 0.4153, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.5601820945739746, + "rewards/margins": 1.4470088481903076, + "rewards/rejected": -3.0071911811828613, + "step": 8050 + }, + { + "epoch": 1.39, + "grad_norm": 31.557939346164492, + "learning_rate": 1.2929946490465855e-07, + "logits/chosen": -1.4260159730911255, + "logits/rejected": -1.3587584495544434, + "logps/chosen": -217.4468536376953, + "logps/rejected": -329.12548828125, + "loss": 0.4882, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.6211131811141968, + "rewards/margins": 1.1693998575210571, + "rewards/rejected": -2.7905125617980957, + "step": 8060 + }, + { + "epoch": 1.39, + "grad_norm": 40.190393995057335, + "learning_rate": 1.2864164435483777e-07, + "logits/chosen": -1.354252815246582, + "logits/rejected": -1.2852472066879272, + "logps/chosen": -220.60400390625, + "logps/rejected": -329.6341857910156, + "loss": 0.4492, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.6716521978378296, + "rewards/margins": 1.1254643201828003, + "rewards/rejected": -2.79711651802063, + "step": 8070 + }, + { + "epoch": 1.39, + "grad_norm": 32.42169318224056, + "learning_rate": 1.2798492125797145e-07, + "logits/chosen": -1.3571466207504272, + "logits/rejected": -1.3098504543304443, + "logps/chosen": -203.21746826171875, + "logps/rejected": -327.30889892578125, + "loss": 0.4336, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -1.5024254322052002, + "rewards/margins": 1.1913418769836426, + "rewards/rejected": -2.6937670707702637, + "step": 8080 + }, + { + "epoch": 1.39, + "grad_norm": 23.167056574146898, + "learning_rate": 1.273293015528571e-07, + "logits/chosen": -1.3101780414581299, + "logits/rejected": -1.2438642978668213, + "logps/chosen": -202.09756469726562, + "logps/rejected": -327.48089599609375, + "loss": 0.4286, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.490017056465149, + "rewards/margins": 1.2703895568847656, + "rewards/rejected": -2.760406732559204, + "step": 8090 + }, + { + "epoch": 1.4, + "grad_norm": 41.21644559684058, + "learning_rate": 1.2667479116831436e-07, + "logits/chosen": -1.3472046852111816, + "logits/rejected": -1.3043591976165771, + "logps/chosen": -227.7296905517578, + "logps/rejected": -334.98541259765625, + "loss": 0.4616, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.736731767654419, + "rewards/margins": 1.0323158502578735, + "rewards/rejected": -2.769047498703003, + "step": 8100 + }, + { + "epoch": 1.4, + "eval_logits/chosen": -1.4580535888671875, + "eval_logits/rejected": -1.4319345951080322, + "eval_logps/chosen": -226.49224853515625, + "eval_logps/rejected": -272.92999267578125, + "eval_loss": 0.6337063908576965, + "eval_rewards/accuracies": 0.6563661694526672, + "eval_rewards/chosen": -1.6778842210769653, + "eval_rewards/margins": 0.41984203457832336, + "eval_rewards/rejected": -2.097726345062256, + "eval_runtime": 356.5742, + "eval_samples_per_second": 12.07, + "eval_steps_per_second": 1.509, + "step": 8100 + }, + { + "epoch": 1.4, + "grad_norm": 43.64122789141382, + "learning_rate": 1.2602139602313066e-07, + "logits/chosen": -1.3520846366882324, + "logits/rejected": -1.2789603471755981, + "logps/chosen": -220.7334442138672, + "logps/rejected": -337.8800354003906, + "loss": 0.4345, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.6468912363052368, + "rewards/margins": 1.221639633178711, + "rewards/rejected": -2.8685309886932373, + "step": 8110 + }, + { + "epoch": 1.4, + "grad_norm": 46.7301047333346, + "learning_rate": 1.2536912202600908e-07, + "logits/chosen": -1.3374398946762085, + "logits/rejected": -1.2761024236679077, + "logps/chosen": -215.7982940673828, + "logps/rejected": -333.050048828125, + "loss": 0.4249, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.617179274559021, + "rewards/margins": 1.1847528219223022, + "rewards/rejected": -2.8019323348999023, + "step": 8120 + }, + { + "epoch": 1.4, + "grad_norm": 42.0819810676, + "learning_rate": 1.2471797507551323e-07, + "logits/chosen": -1.3652501106262207, + "logits/rejected": -1.310935139656067, + "logps/chosen": -208.43954467773438, + "logps/rejected": -316.9969787597656, + "loss": 0.4354, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.536388635635376, + "rewards/margins": 1.1277352571487427, + "rewards/rejected": -2.664124011993408, + "step": 8130 + }, + { + "epoch": 1.4, + "grad_norm": 32.76814476973973, + "learning_rate": 1.2406796106001526e-07, + "logits/chosen": -1.3102951049804688, + "logits/rejected": -1.2481873035430908, + "logps/chosen": -213.14266967773438, + "logps/rejected": -348.0755615234375, + "loss": 0.4184, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.6040756702423096, + "rewards/margins": 1.3488930463790894, + "rewards/rejected": -2.9529685974121094, + "step": 8140 + }, + { + "epoch": 1.4, + "grad_norm": 20.86333145951887, + "learning_rate": 1.2341908585764197e-07, + "logits/chosen": -1.3613207340240479, + "logits/rejected": -1.2992537021636963, + "logps/chosen": -226.6147918701172, + "logps/rejected": -361.0736083984375, + "loss": 0.4225, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.7447595596313477, + "rewards/margins": 1.3353623151779175, + "rewards/rejected": -3.0801219940185547, + "step": 8150 + }, + { + "epoch": 1.41, + "grad_norm": 33.69365894701024, + "learning_rate": 1.2277135533622173e-07, + "logits/chosen": -1.3138097524642944, + "logits/rejected": -1.241779088973999, + "logps/chosen": -218.9955291748047, + "logps/rejected": -367.7317810058594, + "loss": 0.3696, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.6299861669540405, + "rewards/margins": 1.524584174156189, + "rewards/rejected": -3.1545703411102295, + "step": 8160 + }, + { + "epoch": 1.41, + "grad_norm": 41.9578809865992, + "learning_rate": 1.2212477535323158e-07, + "logits/chosen": -1.3314152956008911, + "logits/rejected": -1.261887550354004, + "logps/chosen": -240.8853759765625, + "logps/rejected": -356.1999816894531, + "loss": 0.4335, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.8119796514511108, + "rewards/margins": 1.2307933568954468, + "rewards/rejected": -3.0427732467651367, + "step": 8170 + }, + { + "epoch": 1.41, + "grad_norm": 37.15151945333557, + "learning_rate": 1.2147935175574403e-07, + "logits/chosen": -1.336161732673645, + "logits/rejected": -1.2775371074676514, + "logps/chosen": -241.3358917236328, + "logps/rejected": -364.7397766113281, + "loss": 0.4214, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.850393295288086, + "rewards/margins": 1.2469345331192017, + "rewards/rejected": -3.0973281860351562, + "step": 8180 + }, + { + "epoch": 1.41, + "grad_norm": 35.78807424327242, + "learning_rate": 1.208350903803745e-07, + "logits/chosen": -1.290093183517456, + "logits/rejected": -1.2244975566864014, + "logps/chosen": -240.7860870361328, + "logps/rejected": -369.93792724609375, + "loss": 0.4527, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.8283048868179321, + "rewards/margins": 1.3327312469482422, + "rewards/rejected": -3.1610360145568848, + "step": 8190 + }, + { + "epoch": 1.41, + "grad_norm": 36.986247487672124, + "learning_rate": 1.2019199705322793e-07, + "logits/chosen": -1.3099769353866577, + "logits/rejected": -1.244363784790039, + "logps/chosen": -241.85092163085938, + "logps/rejected": -364.93157958984375, + "loss": 0.4033, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.868650197982788, + "rewards/margins": 1.2554208040237427, + "rewards/rejected": -3.1240711212158203, + "step": 8200 + }, + { + "epoch": 1.41, + "eval_logits/chosen": -1.4116390943527222, + "eval_logits/rejected": -1.3845247030258179, + "eval_logps/chosen": -245.81504821777344, + "eval_logps/rejected": -296.2736511230469, + "eval_loss": 0.6331018805503845, + "eval_rewards/accuracies": 0.6638011336326599, + "eval_rewards/chosen": -1.8711119890213013, + "eval_rewards/margins": 0.460050493478775, + "eval_rewards/rejected": -2.331162452697754, + "eval_runtime": 357.1978, + "eval_samples_per_second": 12.049, + "eval_steps_per_second": 1.506, + "step": 8200 + }, + { + "epoch": 1.41, + "grad_norm": 29.593005359208316, + "learning_rate": 1.1955007758984717e-07, + "logits/chosen": -1.2003768682479858, + "logits/rejected": -1.1418938636779785, + "logps/chosen": -232.4004669189453, + "logps/rejected": -356.89031982421875, + "loss": 0.4207, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.7730633020401, + "rewards/margins": 1.2342889308929443, + "rewards/rejected": -3.007352113723755, + "step": 8210 + }, + { + "epoch": 1.42, + "grad_norm": 28.630103927734417, + "learning_rate": 1.1890933779515897e-07, + "logits/chosen": -1.2915620803833008, + "logits/rejected": -1.2165160179138184, + "logps/chosen": -236.4958953857422, + "logps/rejected": -377.47528076171875, + "loss": 0.3975, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.8341341018676758, + "rewards/margins": 1.4102541208267212, + "rewards/rejected": -3.2443878650665283, + "step": 8220 + }, + { + "epoch": 1.42, + "grad_norm": 19.561231715377982, + "learning_rate": 1.1826978346342301e-07, + "logits/chosen": -1.2921059131622314, + "logits/rejected": -1.229309320449829, + "logps/chosen": -233.7902069091797, + "logps/rejected": -382.6428527832031, + "loss": 0.3569, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -1.7854340076446533, + "rewards/margins": 1.527852177619934, + "rewards/rejected": -3.313286304473877, + "step": 8230 + }, + { + "epoch": 1.42, + "grad_norm": 41.33964598646989, + "learning_rate": 1.1763142037817805e-07, + "logits/chosen": -1.3490978479385376, + "logits/rejected": -1.2747749090194702, + "logps/chosen": -261.16094970703125, + "logps/rejected": -395.5751953125, + "loss": 0.3843, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -2.049976348876953, + "rewards/margins": 1.3874568939208984, + "rewards/rejected": -3.4374337196350098, + "step": 8240 + }, + { + "epoch": 1.42, + "grad_norm": 27.38502220196457, + "learning_rate": 1.1699425431219079e-07, + "logits/chosen": -1.2738348245620728, + "logits/rejected": -1.215987205505371, + "logps/chosen": -255.5185546875, + "logps/rejected": -400.17083740234375, + "loss": 0.4124, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.9787349700927734, + "rewards/margins": 1.4798691272735596, + "rewards/rejected": -3.458604335784912, + "step": 8250 + }, + { + "epoch": 1.42, + "grad_norm": 35.93771365818128, + "learning_rate": 1.1635829102740294e-07, + "logits/chosen": -1.3693095445632935, + "logits/rejected": -1.3095340728759766, + "logps/chosen": -248.7408905029297, + "logps/rejected": -385.82574462890625, + "loss": 0.4401, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.9232066869735718, + "rewards/margins": 1.382420539855957, + "rewards/rejected": -3.3056271076202393, + "step": 8260 + }, + { + "epoch": 1.42, + "grad_norm": 32.99199105515522, + "learning_rate": 1.1572353627487948e-07, + "logits/chosen": -1.3601871728897095, + "logits/rejected": -1.3054234981536865, + "logps/chosen": -243.6525421142578, + "logps/rejected": -381.771240234375, + "loss": 0.4241, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -1.9026378393173218, + "rewards/margins": 1.3616206645965576, + "rewards/rejected": -3.264258623123169, + "step": 8270 + }, + { + "epoch": 1.43, + "grad_norm": 40.84568027678274, + "learning_rate": 1.1508999579475654e-07, + "logits/chosen": -1.321771502494812, + "logits/rejected": -1.2765603065490723, + "logps/chosen": -239.63668823242188, + "logps/rejected": -357.2843933105469, + "loss": 0.453, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.8728437423706055, + "rewards/margins": 1.1715965270996094, + "rewards/rejected": -3.044440269470215, + "step": 8280 + }, + { + "epoch": 1.43, + "grad_norm": 27.9600215465676, + "learning_rate": 1.1445767531618944e-07, + "logits/chosen": -1.2803277969360352, + "logits/rejected": -1.1913877725601196, + "logps/chosen": -233.3045196533203, + "logps/rejected": -352.5013732910156, + "loss": 0.4225, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.7336339950561523, + "rewards/margins": 1.2495338916778564, + "rewards/rejected": -2.983167886734009, + "step": 8290 + }, + { + "epoch": 1.43, + "grad_norm": 25.002008446127316, + "learning_rate": 1.1382658055730096e-07, + "logits/chosen": -1.419141173362732, + "logits/rejected": -1.3513821363449097, + "logps/chosen": -243.5513458251953, + "logps/rejected": -379.9726867675781, + "loss": 0.4659, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.8599157333374023, + "rewards/margins": 1.3946608304977417, + "rewards/rejected": -3.2545769214630127, + "step": 8300 + }, + { + "epoch": 1.43, + "eval_logits/chosen": -1.4014118909835815, + "eval_logits/rejected": -1.3744704723358154, + "eval_logps/chosen": -253.2737579345703, + "eval_logps/rejected": -304.1915588378906, + "eval_loss": 0.6337563991546631, + "eval_rewards/accuracies": 0.6642658114433289, + "eval_rewards/chosen": -1.9456990957260132, + "eval_rewards/margins": 0.4646424651145935, + "eval_rewards/rejected": -2.410341739654541, + "eval_runtime": 357.7816, + "eval_samples_per_second": 12.03, + "eval_steps_per_second": 1.504, + "step": 8300 + }, + { + "epoch": 1.43, + "grad_norm": 42.89349489877897, + "learning_rate": 1.1319671722512958e-07, + "logits/chosen": -1.2304198741912842, + "logits/rejected": -1.1626794338226318, + "logps/chosen": -231.46615600585938, + "logps/rejected": -345.99493408203125, + "loss": 0.4309, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.7518718242645264, + "rewards/margins": 1.1967473030090332, + "rewards/rejected": -2.9486191272735596, + "step": 8310 + }, + { + "epoch": 1.43, + "grad_norm": 42.24768978841262, + "learning_rate": 1.1256809101557793e-07, + "logits/chosen": -1.3194677829742432, + "logits/rejected": -1.263414740562439, + "logps/chosen": -216.904052734375, + "logps/rejected": -355.9466552734375, + "loss": 0.4015, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.644235372543335, + "rewards/margins": 1.3747762441635132, + "rewards/rejected": -3.0190117359161377, + "step": 8320 + }, + { + "epoch": 1.44, + "grad_norm": 24.459920685245006, + "learning_rate": 1.1194070761336133e-07, + "logits/chosen": -1.3198477029800415, + "logits/rejected": -1.2680397033691406, + "logps/chosen": -233.9806365966797, + "logps/rejected": -365.51239013671875, + "loss": 0.4175, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.8538386821746826, + "rewards/margins": 1.262743592262268, + "rewards/rejected": -3.1165823936462402, + "step": 8330 + }, + { + "epoch": 1.44, + "grad_norm": 44.12413238893927, + "learning_rate": 1.1131457269195598e-07, + "logits/chosen": -1.3826789855957031, + "logits/rejected": -1.3336975574493408, + "logps/chosen": -241.165283203125, + "logps/rejected": -356.74481201171875, + "loss": 0.4828, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.8674249649047852, + "rewards/margins": 1.1834924221038818, + "rewards/rejected": -3.050917387008667, + "step": 8340 + }, + { + "epoch": 1.44, + "grad_norm": 27.88048778386721, + "learning_rate": 1.106896919135483e-07, + "logits/chosen": -1.212838888168335, + "logits/rejected": -1.1588201522827148, + "logps/chosen": -237.91293334960938, + "logps/rejected": -359.9263000488281, + "loss": 0.4398, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.847447395324707, + "rewards/margins": 1.225775122642517, + "rewards/rejected": -3.0732228755950928, + "step": 8350 + }, + { + "epoch": 1.44, + "grad_norm": 41.137677010372954, + "learning_rate": 1.1006607092898326e-07, + "logits/chosen": -1.2542845010757446, + "logits/rejected": -1.1702654361724854, + "logps/chosen": -207.97622680664062, + "logps/rejected": -355.0627746582031, + "loss": 0.3657, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.5343948602676392, + "rewards/margins": 1.4972972869873047, + "rewards/rejected": -3.0316920280456543, + "step": 8360 + }, + { + "epoch": 1.44, + "grad_norm": 30.172641899374565, + "learning_rate": 1.0944371537771347e-07, + "logits/chosen": -1.3405089378356934, + "logits/rejected": -1.2776302099227905, + "logps/chosen": -218.90786743164062, + "logps/rejected": -363.3168640136719, + "loss": 0.3823, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.6786830425262451, + "rewards/margins": 1.4186073541641235, + "rewards/rejected": -3.097290277481079, + "step": 8370 + }, + { + "epoch": 1.44, + "grad_norm": 26.55031255402569, + "learning_rate": 1.0882263088774809e-07, + "logits/chosen": -1.4416921138763428, + "logits/rejected": -1.3774442672729492, + "logps/chosen": -203.69728088378906, + "logps/rejected": -338.6905212402344, + "loss": 0.4202, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.5246427059173584, + "rewards/margins": 1.3209424018859863, + "rewards/rejected": -2.845585346221924, + "step": 8380 + }, + { + "epoch": 1.45, + "grad_norm": 35.13759701459188, + "learning_rate": 1.0820282307560196e-07, + "logits/chosen": -1.4239284992218018, + "logits/rejected": -1.3506910800933838, + "logps/chosen": -219.5550537109375, + "logps/rejected": -363.826171875, + "loss": 0.3804, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -1.6286277770996094, + "rewards/margins": 1.4488070011138916, + "rewards/rejected": -3.077434778213501, + "step": 8390 + }, + { + "epoch": 1.45, + "grad_norm": 28.883468549799368, + "learning_rate": 1.075842975462449e-07, + "logits/chosen": -1.3790721893310547, + "logits/rejected": -1.3156163692474365, + "logps/chosen": -207.9996337890625, + "logps/rejected": -343.63677978515625, + "loss": 0.4254, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -1.5770851373672485, + "rewards/margins": 1.351677656173706, + "rewards/rejected": -2.928762912750244, + "step": 8400 + }, + { + "epoch": 1.45, + "eval_logits/chosen": -1.4531240463256836, + "eval_logits/rejected": -1.4271684885025024, + "eval_logps/chosen": -233.58177185058594, + "eval_logps/rejected": -281.2073974609375, + "eval_loss": 0.6341521143913269, + "eval_rewards/accuracies": 0.6589219570159912, + "eval_rewards/chosen": -1.7487791776657104, + "eval_rewards/margins": 0.4317210614681244, + "eval_rewards/rejected": -2.1805002689361572, + "eval_runtime": 357.7383, + "eval_samples_per_second": 12.031, + "eval_steps_per_second": 1.504, + "step": 8400 + }, + { + "epoch": 1.45, + "grad_norm": 26.87906657018289, + "learning_rate": 1.0696705989305085e-07, + "logits/chosen": -1.3252408504486084, + "logits/rejected": -1.2531208992004395, + "logps/chosen": -228.47628784179688, + "logps/rejected": -369.2504577636719, + "loss": 0.4096, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.7083253860473633, + "rewards/margins": 1.466355323791504, + "rewards/rejected": -3.1746809482574463, + "step": 8410 + }, + { + "epoch": 1.45, + "grad_norm": 29.144216830031528, + "learning_rate": 1.0635111569774755e-07, + "logits/chosen": -1.2288157939910889, + "logits/rejected": -1.1764132976531982, + "logps/chosen": -192.9091796875, + "logps/rejected": -327.469482421875, + "loss": 0.3615, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -1.412838339805603, + "rewards/margins": 1.3490186929702759, + "rewards/rejected": -2.761857032775879, + "step": 8420 + }, + { + "epoch": 1.45, + "grad_norm": 29.070566266372918, + "learning_rate": 1.0573647053036552e-07, + "logits/chosen": -1.3514432907104492, + "logits/rejected": -1.294538974761963, + "logps/chosen": -215.29830932617188, + "logps/rejected": -337.3027038574219, + "loss": 0.4437, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.6178497076034546, + "rewards/margins": 1.188742995262146, + "rewards/rejected": -2.8065924644470215, + "step": 8430 + }, + { + "epoch": 1.45, + "grad_norm": 31.754197528307863, + "learning_rate": 1.0512312994918865e-07, + "logits/chosen": -1.387795329093933, + "logits/rejected": -1.3368542194366455, + "logps/chosen": -220.80947875976562, + "logps/rejected": -341.77264404296875, + "loss": 0.4488, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.6601117849349976, + "rewards/margins": 1.242388129234314, + "rewards/rejected": -2.9024999141693115, + "step": 8440 + }, + { + "epoch": 1.46, + "grad_norm": 25.669279884647963, + "learning_rate": 1.0451109950070275e-07, + "logits/chosen": -1.2506482601165771, + "logits/rejected": -1.2008402347564697, + "logps/chosen": -225.53250122070312, + "logps/rejected": -366.031982421875, + "loss": 0.4143, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.7753582000732422, + "rewards/margins": 1.3562324047088623, + "rewards/rejected": -3.1315910816192627, + "step": 8450 + }, + { + "epoch": 1.46, + "grad_norm": 25.414444784217327, + "learning_rate": 1.039003847195466e-07, + "logits/chosen": -1.3718957901000977, + "logits/rejected": -1.3084933757781982, + "logps/chosen": -226.52572631835938, + "logps/rejected": -355.208251953125, + "loss": 0.378, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -1.7372535467147827, + "rewards/margins": 1.277478814125061, + "rewards/rejected": -3.0147323608398438, + "step": 8460 + }, + { + "epoch": 1.46, + "grad_norm": 47.176323143328105, + "learning_rate": 1.0329099112846071e-07, + "logits/chosen": -1.3328293561935425, + "logits/rejected": -1.2753039598464966, + "logps/chosen": -257.13079833984375, + "logps/rejected": -379.00323486328125, + "loss": 0.4915, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.010730504989624, + "rewards/margins": 1.2479288578033447, + "rewards/rejected": -3.2586593627929688, + "step": 8470 + }, + { + "epoch": 1.46, + "grad_norm": 32.56367895113138, + "learning_rate": 1.0268292423823838e-07, + "logits/chosen": -1.3693746328353882, + "logits/rejected": -1.2976529598236084, + "logps/chosen": -223.7007598876953, + "logps/rejected": -357.4267272949219, + "loss": 0.439, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.7069003582000732, + "rewards/margins": 1.3514857292175293, + "rewards/rejected": -3.0583860874176025, + "step": 8480 + }, + { + "epoch": 1.46, + "grad_norm": 27.548404093695904, + "learning_rate": 1.020761895476753e-07, + "logits/chosen": -1.4502463340759277, + "logits/rejected": -1.3987720012664795, + "logps/chosen": -210.2566375732422, + "logps/rejected": -340.35089111328125, + "loss": 0.4054, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.5552849769592285, + "rewards/margins": 1.2762036323547363, + "rewards/rejected": -2.831489086151123, + "step": 8490 + }, + { + "epoch": 1.46, + "grad_norm": 25.509139166950433, + "learning_rate": 1.0147079254352001e-07, + "logits/chosen": -1.3013901710510254, + "logits/rejected": -1.2501300573349, + "logps/chosen": -205.6046142578125, + "logps/rejected": -333.3544921875, + "loss": 0.4177, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.5129337310791016, + "rewards/margins": 1.2862192392349243, + "rewards/rejected": -2.7991526126861572, + "step": 8500 + }, + { + "epoch": 1.46, + "eval_logits/chosen": -1.4731091260910034, + "eval_logits/rejected": -1.4476723670959473, + "eval_logps/chosen": -229.227783203125, + "eval_logps/rejected": -275.5844421386719, + "eval_loss": 0.6337706446647644, + "eval_rewards/accuracies": 0.6589219570159912, + "eval_rewards/chosen": -1.705239176750183, + "eval_rewards/margins": 0.4190312325954437, + "eval_rewards/rejected": -2.124270439147949, + "eval_runtime": 357.6856, + "eval_samples_per_second": 12.033, + "eval_steps_per_second": 1.504, + "step": 8500 + }, + { + "epoch": 1.47, + "grad_norm": 42.02138934399011, + "learning_rate": 1.008667387004242e-07, + "logits/chosen": -1.3346723318099976, + "logits/rejected": -1.2678366899490356, + "logps/chosen": -221.626953125, + "logps/rejected": -353.46221923828125, + "loss": 0.3852, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -1.6379005908966064, + "rewards/margins": 1.381575345993042, + "rewards/rejected": -3.0194761753082275, + "step": 8510 + }, + { + "epoch": 1.47, + "grad_norm": 31.896788173795407, + "learning_rate": 1.002640334808933e-07, + "logits/chosen": -1.3286793231964111, + "logits/rejected": -1.2712717056274414, + "logps/chosen": -226.07302856445312, + "logps/rejected": -338.1138000488281, + "loss": 0.4501, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7251167297363281, + "rewards/margins": 1.1692787408828735, + "rewards/rejected": -2.8943958282470703, + "step": 8520 + }, + { + "epoch": 1.47, + "grad_norm": 30.87059976350335, + "learning_rate": 9.9662682335237e-08, + "logits/chosen": -1.3089802265167236, + "logits/rejected": -1.2482343912124634, + "logps/chosen": -220.247314453125, + "logps/rejected": -340.78228759765625, + "loss": 0.4195, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.6637243032455444, + "rewards/margins": 1.2299330234527588, + "rewards/rejected": -2.8936572074890137, + "step": 8530 + }, + { + "epoch": 1.47, + "grad_norm": 29.57486542104792, + "learning_rate": 9.906269070152004e-08, + "logits/chosen": -1.4429051876068115, + "logits/rejected": -1.3985341787338257, + "logps/chosen": -219.606689453125, + "logps/rejected": -326.71759033203125, + "loss": 0.4908, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.6491502523422241, + "rewards/margins": 1.0737760066986084, + "rewards/rejected": -2.722926378250122, + "step": 8540 + }, + { + "epoch": 1.47, + "grad_norm": 25.587032990412716, + "learning_rate": 9.846406400551308e-08, + "logits/chosen": -1.3666541576385498, + "logits/rejected": -1.3007842302322388, + "logps/chosen": -233.0788116455078, + "logps/rejected": -375.75469970703125, + "loss": 0.3971, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.7926390171051025, + "rewards/margins": 1.4118077754974365, + "rewards/rejected": -3.204446792602539, + "step": 8550 + }, + { + "epoch": 1.47, + "grad_norm": 29.854174805438845, + "learning_rate": 9.786680766064318e-08, + "logits/chosen": -1.4583765268325806, + "logits/rejected": -1.3929063081741333, + "logps/chosen": -231.9521942138672, + "logps/rejected": -365.8257751464844, + "loss": 0.4261, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.7775615453720093, + "rewards/margins": 1.3575212955474854, + "rewards/rejected": -3.135082721710205, + "step": 8560 + }, + { + "epoch": 1.48, + "grad_norm": 31.030686033718272, + "learning_rate": 9.727092706794554e-08, + "logits/chosen": -1.3249984979629517, + "logits/rejected": -1.2678711414337158, + "logps/chosen": -229.1342010498047, + "logps/rejected": -341.3080749511719, + "loss": 0.4511, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.7480465173721313, + "rewards/margins": 1.145034670829773, + "rewards/rejected": -2.8930811882019043, + "step": 8570 + }, + { + "epoch": 1.48, + "grad_norm": 32.81944513147602, + "learning_rate": 9.667642761601433e-08, + "logits/chosen": -1.418872594833374, + "logits/rejected": -1.3541626930236816, + "logps/chosen": -211.55941772460938, + "logps/rejected": -351.56304931640625, + "loss": 0.3834, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.5743210315704346, + "rewards/margins": 1.3962310552597046, + "rewards/rejected": -2.9705519676208496, + "step": 8580 + }, + { + "epoch": 1.48, + "grad_norm": 28.302147205613945, + "learning_rate": 9.608331468095377e-08, + "logits/chosen": -1.3961126804351807, + "logits/rejected": -1.319726586341858, + "logps/chosen": -209.43771362304688, + "logps/rejected": -345.10186767578125, + "loss": 0.358, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.524304986000061, + "rewards/margins": 1.3930083513259888, + "rewards/rejected": -2.91731333732605, + "step": 8590 + }, + { + "epoch": 1.48, + "grad_norm": 22.424385790342782, + "learning_rate": 9.549159362632986e-08, + "logits/chosen": -1.3214257955551147, + "logits/rejected": -1.26907217502594, + "logps/chosen": -230.94613647460938, + "logps/rejected": -337.48309326171875, + "loss": 0.4537, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.7750024795532227, + "rewards/margins": 1.082525372505188, + "rewards/rejected": -2.8575279712677, + "step": 8600 + }, + { + "epoch": 1.48, + "eval_logits/chosen": -1.4457244873046875, + "eval_logits/rejected": -1.4196789264678955, + "eval_logps/chosen": -243.827392578125, + "eval_logps/rejected": -292.89404296875, + "eval_loss": 0.63252854347229, + "eval_rewards/accuracies": 0.6677509546279907, + "eval_rewards/chosen": -1.8512355089187622, + "eval_rewards/margins": 0.4461313486099243, + "eval_rewards/rejected": -2.2973668575286865, + "eval_runtime": 357.6199, + "eval_samples_per_second": 12.035, + "eval_steps_per_second": 1.504, + "step": 8600 + }, + { + "epoch": 1.48, + "grad_norm": 33.10659637068871, + "learning_rate": 9.490126980312165e-08, + "logits/chosen": -1.3417989015579224, + "logits/rejected": -1.2814117670059204, + "logps/chosen": -232.3809814453125, + "logps/rejected": -360.255615234375, + "loss": 0.4184, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.7825310230255127, + "rewards/margins": 1.3146696090698242, + "rewards/rejected": -3.097200632095337, + "step": 8610 + }, + { + "epoch": 1.49, + "grad_norm": 29.42647146776832, + "learning_rate": 9.431234854967291e-08, + "logits/chosen": -1.2606632709503174, + "logits/rejected": -1.2125203609466553, + "logps/chosen": -236.4669189453125, + "logps/rejected": -357.4127502441406, + "loss": 0.4326, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.8169389963150024, + "rewards/margins": 1.2449721097946167, + "rewards/rejected": -3.061911106109619, + "step": 8620 + }, + { + "epoch": 1.49, + "grad_norm": 23.619846728441956, + "learning_rate": 9.372483519164398e-08, + "logits/chosen": -1.2358766794204712, + "logits/rejected": -1.1763203144073486, + "logps/chosen": -209.49526977539062, + "logps/rejected": -350.7640075683594, + "loss": 0.3756, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.5778772830963135, + "rewards/margins": 1.4093445539474487, + "rewards/rejected": -2.9872217178344727, + "step": 8630 + }, + { + "epoch": 1.49, + "grad_norm": 23.10386102267089, + "learning_rate": 9.313873504196313e-08, + "logits/chosen": -1.4057111740112305, + "logits/rejected": -1.3474836349487305, + "logps/chosen": -225.91744995117188, + "logps/rejected": -339.12139892578125, + "loss": 0.4632, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.7019745111465454, + "rewards/margins": 1.161685585975647, + "rewards/rejected": -2.8636600971221924, + "step": 8640 + }, + { + "epoch": 1.49, + "grad_norm": 51.94922788167167, + "learning_rate": 9.255405340077949e-08, + "logits/chosen": -1.321274757385254, + "logits/rejected": -1.256255865097046, + "logps/chosen": -225.003662109375, + "logps/rejected": -348.2788391113281, + "loss": 0.458, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.700404405593872, + "rewards/margins": 1.241997480392456, + "rewards/rejected": -2.9424021244049072, + "step": 8650 + }, + { + "epoch": 1.49, + "grad_norm": 32.91948860177697, + "learning_rate": 9.197079555541379e-08, + "logits/chosen": -1.3428630828857422, + "logits/rejected": -1.2893078327178955, + "logps/chosen": -228.0025177001953, + "logps/rejected": -354.33404541015625, + "loss": 0.46, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.7465832233428955, + "rewards/margins": 1.2794687747955322, + "rewards/rejected": -3.0260515213012695, + "step": 8660 + }, + { + "epoch": 1.49, + "grad_norm": 23.89924519243948, + "learning_rate": 9.138896678031202e-08, + "logits/chosen": -1.4497371912002563, + "logits/rejected": -1.3866420984268188, + "logps/chosen": -211.95297241210938, + "logps/rejected": -345.5428466796875, + "loss": 0.4208, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.5996501445770264, + "rewards/margins": 1.3280017375946045, + "rewards/rejected": -2.927651882171631, + "step": 8670 + }, + { + "epoch": 1.5, + "grad_norm": 25.326870727709025, + "learning_rate": 9.080857233699624e-08, + "logits/chosen": -1.364079236984253, + "logits/rejected": -1.323317289352417, + "logps/chosen": -223.8002166748047, + "logps/rejected": -334.2272644042969, + "loss": 0.4608, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7312272787094116, + "rewards/margins": 1.0833412408828735, + "rewards/rejected": -2.814568281173706, + "step": 8680 + }, + { + "epoch": 1.5, + "grad_norm": 28.8350727770196, + "learning_rate": 9.022961747401841e-08, + "logits/chosen": -1.385801911354065, + "logits/rejected": -1.323319911956787, + "logps/chosen": -221.8018341064453, + "logps/rejected": -329.33428955078125, + "loss": 0.4513, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.66269850730896, + "rewards/margins": 1.1479780673980713, + "rewards/rejected": -2.8106765747070312, + "step": 8690 + }, + { + "epoch": 1.5, + "grad_norm": 27.89499138401969, + "learning_rate": 8.96521074269117e-08, + "logits/chosen": -1.3936054706573486, + "logits/rejected": -1.3266212940216064, + "logps/chosen": -221.9488067626953, + "logps/rejected": -332.57354736328125, + "loss": 0.4176, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.654510498046875, + "rewards/margins": 1.1451376676559448, + "rewards/rejected": -2.7996482849121094, + "step": 8700 + }, + { + "epoch": 1.5, + "eval_logits/chosen": -1.4750956296920776, + "eval_logits/rejected": -1.4490503072738647, + "eval_logps/chosen": -231.7505340576172, + "eval_logps/rejected": -279.62408447265625, + "eval_loss": 0.6307649612426758, + "eval_rewards/accuracies": 0.6654275059700012, + "eval_rewards/chosen": -1.7304668426513672, + "eval_rewards/margins": 0.4342002868652344, + "eval_rewards/rejected": -2.1646668910980225, + "eval_runtime": 356.447, + "eval_samples_per_second": 12.075, + "eval_steps_per_second": 1.509, + "step": 8700 + }, + { + "epoch": 1.5, + "grad_norm": 44.13104620276346, + "learning_rate": 8.907604741814403e-08, + "logits/chosen": -1.3480675220489502, + "logits/rejected": -1.3072960376739502, + "logps/chosen": -227.20175170898438, + "logps/rejected": -328.07940673828125, + "loss": 0.4921, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7436609268188477, + "rewards/margins": 1.0057998895645142, + "rewards/rejected": -2.7494606971740723, + "step": 8710 + }, + { + "epoch": 1.5, + "grad_norm": 40.22151350765789, + "learning_rate": 8.850144265707039e-08, + "logits/chosen": -1.3768417835235596, + "logits/rejected": -1.312811255455017, + "logps/chosen": -228.0986328125, + "logps/rejected": -353.222412109375, + "loss": 0.3894, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.717865228652954, + "rewards/margins": 1.2698724269866943, + "rewards/rejected": -2.9877376556396484, + "step": 8720 + }, + { + "epoch": 1.5, + "grad_norm": 34.383596796071096, + "learning_rate": 8.792829833988588e-08, + "logits/chosen": -1.3603075742721558, + "logits/rejected": -1.2962515354156494, + "logps/chosen": -222.20852661132812, + "logps/rejected": -350.58489990234375, + "loss": 0.4555, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.699462890625, + "rewards/margins": 1.288184404373169, + "rewards/rejected": -2.987647533416748, + "step": 8730 + }, + { + "epoch": 1.51, + "grad_norm": 43.757133699793904, + "learning_rate": 8.735661964957869e-08, + "logits/chosen": -1.3438574075698853, + "logits/rejected": -1.2943146228790283, + "logps/chosen": -223.5640411376953, + "logps/rejected": -365.6803894042969, + "loss": 0.3991, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.721771001815796, + "rewards/margins": 1.4051529169082642, + "rewards/rejected": -3.1269240379333496, + "step": 8740 + }, + { + "epoch": 1.51, + "grad_norm": 33.49609695850668, + "learning_rate": 8.678641175588324e-08, + "logits/chosen": -1.3823951482772827, + "logits/rejected": -1.3145440816879272, + "logps/chosen": -226.49917602539062, + "logps/rejected": -367.0007019042969, + "loss": 0.4057, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.7283353805541992, + "rewards/margins": 1.4096132516860962, + "rewards/rejected": -3.137948513031006, + "step": 8750 + }, + { + "epoch": 1.51, + "grad_norm": 24.899312092376913, + "learning_rate": 8.62176798152335e-08, + "logits/chosen": -1.3334381580352783, + "logits/rejected": -1.2971229553222656, + "logps/chosen": -219.86520385742188, + "logps/rejected": -323.74310302734375, + "loss": 0.4992, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.6835765838623047, + "rewards/margins": 1.002506971359253, + "rewards/rejected": -2.6860833168029785, + "step": 8760 + }, + { + "epoch": 1.51, + "grad_norm": 24.892210563053087, + "learning_rate": 8.565042897071606e-08, + "logits/chosen": -1.3791553974151611, + "logits/rejected": -1.3129332065582275, + "logps/chosen": -221.7880096435547, + "logps/rejected": -350.95306396484375, + "loss": 0.4061, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.6673253774642944, + "rewards/margins": 1.3446027040481567, + "rewards/rejected": -3.011927843093872, + "step": 8770 + }, + { + "epoch": 1.51, + "grad_norm": 29.916856004909253, + "learning_rate": 8.508466435202402e-08, + "logits/chosen": -1.413527488708496, + "logits/rejected": -1.3698149919509888, + "logps/chosen": -221.784912109375, + "logps/rejected": -348.02508544921875, + "loss": 0.4088, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.7269346714019775, + "rewards/margins": 1.2188551425933838, + "rewards/rejected": -2.9457898139953613, + "step": 8780 + }, + { + "epoch": 1.51, + "grad_norm": 33.97524926089677, + "learning_rate": 8.452039107541042e-08, + "logits/chosen": -1.402840256690979, + "logits/rejected": -1.333287000656128, + "logps/chosen": -228.7657928466797, + "logps/rejected": -360.1128845214844, + "loss": 0.4378, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.7385919094085693, + "rewards/margins": 1.3424417972564697, + "rewards/rejected": -3.08103346824646, + "step": 8790 + }, + { + "epoch": 1.52, + "grad_norm": 35.70383300311999, + "learning_rate": 8.395761424364193e-08, + "logits/chosen": -1.3199231624603271, + "logits/rejected": -1.2492494583129883, + "logps/chosen": -216.8665771484375, + "logps/rejected": -342.27081298828125, + "loss": 0.4486, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.6186439990997314, + "rewards/margins": 1.2790682315826416, + "rewards/rejected": -2.897711992263794, + "step": 8800 + }, + { + "epoch": 1.52, + "eval_logits/chosen": -1.4813001155853271, + "eval_logits/rejected": -1.4554513692855835, + "eval_logps/chosen": -232.9863739013672, + "eval_logps/rejected": -280.9822082519531, + "eval_loss": 0.629108190536499, + "eval_rewards/accuracies": 0.669377326965332, + "eval_rewards/chosen": -1.7428252696990967, + "eval_rewards/margins": 0.43542277812957764, + "eval_rewards/rejected": -2.178248167037964, + "eval_runtime": 357.8908, + "eval_samples_per_second": 12.026, + "eval_steps_per_second": 1.503, + "step": 8800 + }, + { + "epoch": 1.52, + "grad_norm": 25.157636621740963, + "learning_rate": 8.33963389459528e-08, + "logits/chosen": -1.4286869764328003, + "logits/rejected": -1.3668185472488403, + "logps/chosen": -215.29214477539062, + "logps/rejected": -350.935546875, + "loss": 0.3864, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.6436516046524048, + "rewards/margins": 1.3397597074508667, + "rewards/rejected": -2.9834113121032715, + "step": 8810 + }, + { + "epoch": 1.52, + "grad_norm": 30.0040629228507, + "learning_rate": 8.283657025799872e-08, + "logits/chosen": -1.40675950050354, + "logits/rejected": -1.3426892757415771, + "logps/chosen": -213.4414825439453, + "logps/rejected": -352.6174621582031, + "loss": 0.3977, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.630210518836975, + "rewards/margins": 1.3984111547470093, + "rewards/rejected": -3.0286216735839844, + "step": 8820 + }, + { + "epoch": 1.52, + "grad_norm": 44.391862230544554, + "learning_rate": 8.227831324181109e-08, + "logits/chosen": -1.2691117525100708, + "logits/rejected": -1.2064220905303955, + "logps/chosen": -220.8523406982422, + "logps/rejected": -337.88763427734375, + "loss": 0.5033, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.6626207828521729, + "rewards/margins": 1.188899278640747, + "rewards/rejected": -2.85152006149292, + "step": 8830 + }, + { + "epoch": 1.52, + "grad_norm": 35.6035223426018, + "learning_rate": 8.172157294575108e-08, + "logits/chosen": -1.3077764511108398, + "logits/rejected": -1.2591360807418823, + "logps/chosen": -206.6147003173828, + "logps/rejected": -329.2171936035156, + "loss": 0.4314, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.5428450107574463, + "rewards/margins": 1.211372971534729, + "rewards/rejected": -2.7542176246643066, + "step": 8840 + }, + { + "epoch": 1.52, + "grad_norm": 24.29335767133043, + "learning_rate": 8.116635440446402e-08, + "logits/chosen": -1.461669921875, + "logits/rejected": -1.3966032266616821, + "logps/chosen": -200.99661254882812, + "logps/rejected": -347.1763000488281, + "loss": 0.3793, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.4875738620758057, + "rewards/margins": 1.4415907859802246, + "rewards/rejected": -2.9291648864746094, + "step": 8850 + }, + { + "epoch": 1.53, + "grad_norm": 28.563826235603777, + "learning_rate": 8.061266263883404e-08, + "logits/chosen": -1.3678812980651855, + "logits/rejected": -1.3074369430541992, + "logps/chosen": -219.76559448242188, + "logps/rejected": -344.3088684082031, + "loss": 0.4045, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -1.665743112564087, + "rewards/margins": 1.2486860752105713, + "rewards/rejected": -2.914429187774658, + "step": 8860 + }, + { + "epoch": 1.53, + "grad_norm": 35.12276578609268, + "learning_rate": 8.006050265593814e-08, + "logits/chosen": -1.510338544845581, + "logits/rejected": -1.4287976026535034, + "logps/chosen": -221.04647827148438, + "logps/rejected": -359.68280029296875, + "loss": 0.3904, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -1.6428911685943604, + "rewards/margins": 1.4038238525390625, + "rewards/rejected": -3.046715021133423, + "step": 8870 + }, + { + "epoch": 1.53, + "grad_norm": 44.16416214094416, + "learning_rate": 7.950987944900192e-08, + "logits/chosen": -1.3029206991195679, + "logits/rejected": -1.2382522821426392, + "logps/chosen": -218.285888671875, + "logps/rejected": -348.65289306640625, + "loss": 0.4284, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.64852774143219, + "rewards/margins": 1.3227118253707886, + "rewards/rejected": -2.9712395668029785, + "step": 8880 + }, + { + "epoch": 1.53, + "grad_norm": 37.57144300162699, + "learning_rate": 7.896079799735308e-08, + "logits/chosen": -1.3296968936920166, + "logits/rejected": -1.262048602104187, + "logps/chosen": -228.5214080810547, + "logps/rejected": -362.320556640625, + "loss": 0.3695, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.7403507232666016, + "rewards/margins": 1.3753221035003662, + "rewards/rejected": -3.115673065185547, + "step": 8890 + }, + { + "epoch": 1.53, + "grad_norm": 26.182342901511827, + "learning_rate": 7.841326326637781e-08, + "logits/chosen": -1.3689161539077759, + "logits/rejected": -1.2965118885040283, + "logps/chosen": -223.8673858642578, + "logps/rejected": -368.6548156738281, + "loss": 0.3594, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.6899131536483765, + "rewards/margins": 1.467007040977478, + "rewards/rejected": -3.1569199562072754, + "step": 8900 + }, + { + "epoch": 1.53, + "eval_logits/chosen": -1.4271249771118164, + "eval_logits/rejected": -1.4001696109771729, + "eval_logps/chosen": -251.50247192382812, + "eval_logps/rejected": -303.11505126953125, + "eval_loss": 0.6299463510513306, + "eval_rewards/accuracies": 0.6675186157226562, + "eval_rewards/chosen": -1.9279862642288208, + "eval_rewards/margins": 0.47159045934677124, + "eval_rewards/rejected": -2.3995769023895264, + "eval_runtime": 357.9404, + "eval_samples_per_second": 12.024, + "eval_steps_per_second": 1.503, + "step": 8900 + }, + { + "epoch": 1.54, + "grad_norm": 33.45762067690765, + "learning_rate": 7.786728020747463e-08, + "logits/chosen": -1.3184845447540283, + "logits/rejected": -1.262537956237793, + "logps/chosen": -243.18051147460938, + "logps/rejected": -373.2486572265625, + "loss": 0.4348, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.8748201131820679, + "rewards/margins": 1.3269211053848267, + "rewards/rejected": -3.2017414569854736, + "step": 8910 + }, + { + "epoch": 1.54, + "grad_norm": 17.15569346859002, + "learning_rate": 7.73228537580104e-08, + "logits/chosen": -1.4410854578018188, + "logits/rejected": -1.3533989191055298, + "logps/chosen": -239.29391479492188, + "logps/rejected": -403.9765625, + "loss": 0.3353, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -1.8225845098495483, + "rewards/margins": 1.6963199377059937, + "rewards/rejected": -3.518904447555542, + "step": 8920 + }, + { + "epoch": 1.54, + "grad_norm": 23.134592614889822, + "learning_rate": 7.677998884127543e-08, + "logits/chosen": -1.3612538576126099, + "logits/rejected": -1.2906397581100464, + "logps/chosen": -248.0911102294922, + "logps/rejected": -387.23736572265625, + "loss": 0.4191, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.9340145587921143, + "rewards/margins": 1.4329915046691895, + "rewards/rejected": -3.367006301879883, + "step": 8930 + }, + { + "epoch": 1.54, + "grad_norm": 31.45965420355019, + "learning_rate": 7.623869036643901e-08, + "logits/chosen": -1.3538182973861694, + "logits/rejected": -1.290165662765503, + "logps/chosen": -234.25064086914062, + "logps/rejected": -377.9019470214844, + "loss": 0.3828, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -1.7864036560058594, + "rewards/margins": 1.434646725654602, + "rewards/rejected": -3.22105073928833, + "step": 8940 + }, + { + "epoch": 1.54, + "grad_norm": 52.90203534371852, + "learning_rate": 7.569896322850489e-08, + "logits/chosen": -1.214430570602417, + "logits/rejected": -1.1799967288970947, + "logps/chosen": -237.46340942382812, + "logps/rejected": -357.76739501953125, + "loss": 0.4502, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.8337962627410889, + "rewards/margins": 1.172048807144165, + "rewards/rejected": -3.005844831466675, + "step": 8950 + }, + { + "epoch": 1.54, + "grad_norm": 44.77979199965103, + "learning_rate": 7.516081230826715e-08, + "logits/chosen": -1.329178810119629, + "logits/rejected": -1.259275197982788, + "logps/chosen": -257.3124694824219, + "logps/rejected": -397.51519775390625, + "loss": 0.4137, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.0291686058044434, + "rewards/margins": 1.4289162158966064, + "rewards/rejected": -3.4580845832824707, + "step": 8960 + }, + { + "epoch": 1.55, + "grad_norm": 30.44657329134242, + "learning_rate": 7.462424247226606e-08, + "logits/chosen": -1.3297767639160156, + "logits/rejected": -1.2500559091567993, + "logps/chosen": -235.6514434814453, + "logps/rejected": -384.52142333984375, + "loss": 0.3625, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8238303661346436, + "rewards/margins": 1.510573148727417, + "rewards/rejected": -3.3344035148620605, + "step": 8970 + }, + { + "epoch": 1.55, + "grad_norm": 27.607114771049687, + "learning_rate": 7.408925857274373e-08, + "logits/chosen": -1.4012901782989502, + "logits/rejected": -1.3390867710113525, + "logps/chosen": -257.0660705566406, + "logps/rejected": -369.2894592285156, + "loss": 0.5045, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.0260767936706543, + "rewards/margins": 1.1399564743041992, + "rewards/rejected": -3.1660332679748535, + "step": 8980 + }, + { + "epoch": 1.55, + "grad_norm": 24.870179348536638, + "learning_rate": 7.355586544760109e-08, + "logits/chosen": -1.2825881242752075, + "logits/rejected": -1.2176916599273682, + "logps/chosen": -230.96267700195312, + "logps/rejected": -376.7090148925781, + "loss": 0.3662, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.7600934505462646, + "rewards/margins": 1.4670263528823853, + "rewards/rejected": -3.2271199226379395, + "step": 8990 + }, + { + "epoch": 1.55, + "grad_norm": 36.79345395085888, + "learning_rate": 7.302406792035298e-08, + "logits/chosen": -1.369960069656372, + "logits/rejected": -1.2962627410888672, + "logps/chosen": -246.2954559326172, + "logps/rejected": -387.07904052734375, + "loss": 0.4428, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.8888282775878906, + "rewards/margins": 1.4562675952911377, + "rewards/rejected": -3.3450961112976074, + "step": 9000 + }, + { + "epoch": 1.55, + "eval_logits/chosen": -1.436100721359253, + "eval_logits/rejected": -1.4093130826950073, + "eval_logps/chosen": -247.8895263671875, + "eval_logps/rejected": -298.9695739746094, + "eval_loss": 0.631913423538208, + "eval_rewards/accuracies": 0.6642658114433289, + "eval_rewards/chosen": -1.8918566703796387, + "eval_rewards/margins": 0.46626490354537964, + "eval_rewards/rejected": -2.358121633529663, + "eval_runtime": 357.5482, + "eval_samples_per_second": 12.038, + "eval_steps_per_second": 1.505, + "step": 9000 + }, + { + "epoch": 1.55, + "grad_norm": 23.705026745341147, + "learning_rate": 7.249387080008552e-08, + "logits/chosen": -1.3333415985107422, + "logits/rejected": -1.2726539373397827, + "logps/chosen": -234.00778198242188, + "logps/rejected": -349.1484069824219, + "loss": 0.4458, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.7839086055755615, + "rewards/margins": 1.1792595386505127, + "rewards/rejected": -2.963167905807495, + "step": 9010 + }, + { + "epoch": 1.55, + "grad_norm": 38.89598953145415, + "learning_rate": 7.196527888141199e-08, + "logits/chosen": -1.2887022495269775, + "logits/rejected": -1.2170953750610352, + "logps/chosen": -209.22061157226562, + "logps/rejected": -373.1058044433594, + "loss": 0.3497, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -1.5529781579971313, + "rewards/margins": 1.6689532995224, + "rewards/rejected": -3.2219314575195312, + "step": 9020 + }, + { + "epoch": 1.56, + "grad_norm": 28.117977627167093, + "learning_rate": 7.14382969444299e-08, + "logits/chosen": -1.3042352199554443, + "logits/rejected": -1.266124963760376, + "logps/chosen": -225.1204833984375, + "logps/rejected": -359.4920959472656, + "loss": 0.4126, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.7398335933685303, + "rewards/margins": 1.3171305656433105, + "rewards/rejected": -3.0569639205932617, + "step": 9030 + }, + { + "epoch": 1.56, + "grad_norm": 33.729240485438446, + "learning_rate": 7.091292975467744e-08, + "logits/chosen": -1.2989321947097778, + "logits/rejected": -1.2422449588775635, + "logps/chosen": -219.76953125, + "logps/rejected": -347.19561767578125, + "loss": 0.4361, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.6861400604248047, + "rewards/margins": 1.2657876014709473, + "rewards/rejected": -2.951927661895752, + "step": 9040 + }, + { + "epoch": 1.56, + "grad_norm": 43.84683137314539, + "learning_rate": 7.038918206309061e-08, + "logits/chosen": -1.363384485244751, + "logits/rejected": -1.299889087677002, + "logps/chosen": -237.06698608398438, + "logps/rejected": -374.26708984375, + "loss": 0.4154, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.8334167003631592, + "rewards/margins": 1.3851557970046997, + "rewards/rejected": -3.2185721397399902, + "step": 9050 + }, + { + "epoch": 1.56, + "grad_norm": 36.66512791686662, + "learning_rate": 6.986705860596004e-08, + "logits/chosen": -1.3877991437911987, + "logits/rejected": -1.3261516094207764, + "logps/chosen": -228.3108367919922, + "logps/rejected": -349.1913146972656, + "loss": 0.4466, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.7361310720443726, + "rewards/margins": 1.224022626876831, + "rewards/rejected": -2.960153579711914, + "step": 9060 + }, + { + "epoch": 1.56, + "grad_norm": 29.90532654515578, + "learning_rate": 6.934656410488849e-08, + "logits/chosen": -1.3249752521514893, + "logits/rejected": -1.256667971611023, + "logps/chosen": -210.0726776123047, + "logps/rejected": -357.83990478515625, + "loss": 0.3591, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.5548808574676514, + "rewards/margins": 1.4754278659820557, + "rewards/rejected": -3.030308246612549, + "step": 9070 + }, + { + "epoch": 1.56, + "grad_norm": 33.369257953836836, + "learning_rate": 6.882770326674753e-08, + "logits/chosen": -1.3675148487091064, + "logits/rejected": -1.3241257667541504, + "logps/chosen": -205.6577606201172, + "logps/rejected": -325.0508117675781, + "loss": 0.4562, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.536978840827942, + "rewards/margins": 1.1593728065490723, + "rewards/rejected": -2.6963515281677246, + "step": 9080 + }, + { + "epoch": 1.57, + "grad_norm": 27.478791555939875, + "learning_rate": 6.831048078363603e-08, + "logits/chosen": -1.340841293334961, + "logits/rejected": -1.2636873722076416, + "logps/chosen": -221.9270477294922, + "logps/rejected": -348.5683898925781, + "loss": 0.3936, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.666637659072876, + "rewards/margins": 1.3149703741073608, + "rewards/rejected": -2.9816081523895264, + "step": 9090 + }, + { + "epoch": 1.57, + "grad_norm": 39.68968478397206, + "learning_rate": 6.779490133283639e-08, + "logits/chosen": -1.3765848875045776, + "logits/rejected": -1.3157910108566284, + "logps/chosen": -231.72561645507812, + "logps/rejected": -339.90594482421875, + "loss": 0.4441, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.7491241693496704, + "rewards/margins": 1.115006685256958, + "rewards/rejected": -2.864131212234497, + "step": 9100 + }, + { + "epoch": 1.57, + "eval_logits/chosen": -1.4595789909362793, + "eval_logits/rejected": -1.433526635169983, + "eval_logps/chosen": -236.91993713378906, + "eval_logps/rejected": -285.5493469238281, + "eval_loss": 0.6315240859985352, + "eval_rewards/accuracies": 0.6670538783073425, + "eval_rewards/chosen": -1.7821608781814575, + "eval_rewards/margins": 0.4417589604854584, + "eval_rewards/rejected": -2.2239201068878174, + "eval_runtime": 357.8152, + "eval_samples_per_second": 12.029, + "eval_steps_per_second": 1.504, + "step": 9100 + }, + { + "epoch": 1.57, + "grad_norm": 26.17971404373787, + "learning_rate": 6.72809695767736e-08, + "logits/chosen": -1.3913519382476807, + "logits/rejected": -1.3288047313690186, + "logps/chosen": -211.9567413330078, + "logps/rejected": -343.7449951171875, + "loss": 0.4005, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.567034125328064, + "rewards/margins": 1.3445512056350708, + "rewards/rejected": -2.9115850925445557, + "step": 9110 + }, + { + "epoch": 1.57, + "grad_norm": 36.967070697201144, + "learning_rate": 6.67686901629718e-08, + "logits/chosen": -1.3988851308822632, + "logits/rejected": -1.3274810314178467, + "logps/chosen": -217.08779907226562, + "logps/rejected": -345.1493225097656, + "loss": 0.4303, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.5681654214859009, + "rewards/margins": 1.332497000694275, + "rewards/rejected": -2.9006621837615967, + "step": 9120 + }, + { + "epoch": 1.57, + "grad_norm": 31.635032954440245, + "learning_rate": 6.625806772401346e-08, + "logits/chosen": -1.323700189590454, + "logits/rejected": -1.2679407596588135, + "logps/chosen": -219.18026733398438, + "logps/rejected": -335.81927490234375, + "loss": 0.4429, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.641919732093811, + "rewards/margins": 1.1755802631378174, + "rewards/rejected": -2.817499876022339, + "step": 9130 + }, + { + "epoch": 1.57, + "grad_norm": 18.580130895157108, + "learning_rate": 6.574910687749641e-08, + "logits/chosen": -1.3721438646316528, + "logits/rejected": -1.284090280532837, + "logps/chosen": -218.16458129882812, + "logps/rejected": -355.3311462402344, + "loss": 0.3806, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -1.608903169631958, + "rewards/margins": 1.4527578353881836, + "rewards/rejected": -3.0616610050201416, + "step": 9140 + }, + { + "epoch": 1.58, + "grad_norm": 35.67358647499166, + "learning_rate": 6.524181222599281e-08, + "logits/chosen": -1.3545089960098267, + "logits/rejected": -1.2818124294281006, + "logps/chosen": -233.56680297851562, + "logps/rejected": -374.37420654296875, + "loss": 0.4067, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.7525733709335327, + "rewards/margins": 1.4464915990829468, + "rewards/rejected": -3.1990647315979004, + "step": 9150 + }, + { + "epoch": 1.58, + "grad_norm": 25.211132645134647, + "learning_rate": 6.473618835700731e-08, + "logits/chosen": -1.3555432558059692, + "logits/rejected": -1.2993581295013428, + "logps/chosen": -213.2099151611328, + "logps/rejected": -361.63824462890625, + "loss": 0.3679, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.6408780813217163, + "rewards/margins": 1.444949984550476, + "rewards/rejected": -3.0858283042907715, + "step": 9160 + }, + { + "epoch": 1.58, + "grad_norm": 29.097063689803438, + "learning_rate": 6.423223984293543e-08, + "logits/chosen": -1.4018914699554443, + "logits/rejected": -1.3220356702804565, + "logps/chosen": -224.4427947998047, + "logps/rejected": -370.28741455078125, + "loss": 0.3913, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.6960633993148804, + "rewards/margins": 1.5122019052505493, + "rewards/rejected": -3.208265781402588, + "step": 9170 + }, + { + "epoch": 1.58, + "grad_norm": 26.255457005191428, + "learning_rate": 6.372997124102245e-08, + "logits/chosen": -1.3606441020965576, + "logits/rejected": -1.3003352880477905, + "logps/chosen": -224.84521484375, + "logps/rejected": -351.32806396484375, + "loss": 0.4274, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.7013254165649414, + "rewards/margins": 1.290497899055481, + "rewards/rejected": -2.991823434829712, + "step": 9180 + }, + { + "epoch": 1.58, + "grad_norm": 27.40777133201824, + "learning_rate": 6.322938709332195e-08, + "logits/chosen": -1.4560340642929077, + "logits/rejected": -1.4088115692138672, + "logps/chosen": -246.0127716064453, + "logps/rejected": -384.2671203613281, + "loss": 0.3953, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -1.918752670288086, + "rewards/margins": 1.3545637130737305, + "rewards/rejected": -3.2733166217803955, + "step": 9190 + }, + { + "epoch": 1.59, + "grad_norm": 29.72804041222211, + "learning_rate": 6.273049192665502e-08, + "logits/chosen": -1.3812012672424316, + "logits/rejected": -1.3190171718597412, + "logps/chosen": -222.0596466064453, + "logps/rejected": -362.23779296875, + "loss": 0.3898, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.6922333240509033, + "rewards/margins": 1.4142590761184692, + "rewards/rejected": -3.106492280960083, + "step": 9200 + }, + { + "epoch": 1.59, + "eval_logits/chosen": -1.4436826705932617, + "eval_logits/rejected": -1.41750967502594, + "eval_logps/chosen": -235.59715270996094, + "eval_logps/rejected": -284.1919250488281, + "eval_loss": 0.6316264271736145, + "eval_rewards/accuracies": 0.6656598448753357, + "eval_rewards/chosen": -1.7689329385757446, + "eval_rewards/margins": 0.4414127469062805, + "eval_rewards/rejected": -2.21034574508667, + "eval_runtime": 358.0647, + "eval_samples_per_second": 12.02, + "eval_steps_per_second": 1.503, + "step": 9200 + }, + { + "epoch": 1.59, + "grad_norm": 37.6892193633804, + "learning_rate": 6.223329025256896e-08, + "logits/chosen": -1.2596355676651, + "logits/rejected": -1.189841866493225, + "logps/chosen": -223.52413940429688, + "logps/rejected": -360.711669921875, + "loss": 0.4099, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.6768831014633179, + "rewards/margins": 1.412092924118042, + "rewards/rejected": -3.0889759063720703, + "step": 9210 + }, + { + "epoch": 1.59, + "grad_norm": 56.42606826064067, + "learning_rate": 6.173778656729678e-08, + "logits/chosen": -1.3393886089324951, + "logits/rejected": -1.2738230228424072, + "logps/chosen": -217.1497344970703, + "logps/rejected": -360.0973205566406, + "loss": 0.4035, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.671485185623169, + "rewards/margins": 1.4341779947280884, + "rewards/rejected": -3.1056630611419678, + "step": 9220 + }, + { + "epoch": 1.59, + "grad_norm": 41.20515118078039, + "learning_rate": 6.124398535171655e-08, + "logits/chosen": -1.2532026767730713, + "logits/rejected": -1.1974518299102783, + "logps/chosen": -219.7674102783203, + "logps/rejected": -351.6419372558594, + "loss": 0.4204, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.7048327922821045, + "rewards/margins": 1.2831361293792725, + "rewards/rejected": -2.987969160079956, + "step": 9230 + }, + { + "epoch": 1.59, + "grad_norm": 37.10435338672497, + "learning_rate": 6.07518910713106e-08, + "logits/chosen": -1.3163211345672607, + "logits/rejected": -1.269641399383545, + "logps/chosen": -227.19949340820312, + "logps/rejected": -363.2820739746094, + "loss": 0.3956, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.710626244544983, + "rewards/margins": 1.3764336109161377, + "rewards/rejected": -3.087059736251831, + "step": 9240 + }, + { + "epoch": 1.59, + "grad_norm": 38.05805409886092, + "learning_rate": 6.026150817612544e-08, + "logits/chosen": -1.2923637628555298, + "logits/rejected": -1.2226426601409912, + "logps/chosen": -215.64614868164062, + "logps/rejected": -348.65020751953125, + "loss": 0.4343, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.6089054346084595, + "rewards/margins": 1.3319495916366577, + "rewards/rejected": -2.940855026245117, + "step": 9250 + }, + { + "epoch": 1.6, + "grad_norm": 31.84368181089064, + "learning_rate": 5.977284110073136e-08, + "logits/chosen": -1.3127715587615967, + "logits/rejected": -1.2576462030410767, + "logps/chosen": -220.4276885986328, + "logps/rejected": -353.9443054199219, + "loss": 0.4051, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -1.7238426208496094, + "rewards/margins": 1.3259329795837402, + "rewards/rejected": -3.0497756004333496, + "step": 9260 + }, + { + "epoch": 1.6, + "grad_norm": 23.613823280583112, + "learning_rate": 5.928589426418235e-08, + "logits/chosen": -1.416325330734253, + "logits/rejected": -1.3402214050292969, + "logps/chosen": -227.9755859375, + "logps/rejected": -368.1861572265625, + "loss": 0.391, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.7309129238128662, + "rewards/margins": 1.4332636594772339, + "rewards/rejected": -3.1641767024993896, + "step": 9270 + }, + { + "epoch": 1.6, + "grad_norm": 25.039505768647498, + "learning_rate": 5.8800672069976105e-08, + "logits/chosen": -1.3524010181427002, + "logits/rejected": -1.2967437505722046, + "logps/chosen": -217.47488403320312, + "logps/rejected": -346.5975341796875, + "loss": 0.4171, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.639002799987793, + "rewards/margins": 1.2891342639923096, + "rewards/rejected": -2.9281370639801025, + "step": 9280 + }, + { + "epoch": 1.6, + "grad_norm": 29.09884422283532, + "learning_rate": 5.831717890601434e-08, + "logits/chosen": -1.2608332633972168, + "logits/rejected": -1.2108803987503052, + "logps/chosen": -222.2425079345703, + "logps/rejected": -329.8652038574219, + "loss": 0.4687, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.6758387088775635, + "rewards/margins": 1.1141695976257324, + "rewards/rejected": -2.790008068084717, + "step": 9290 + }, + { + "epoch": 1.6, + "grad_norm": 39.97591942100439, + "learning_rate": 5.7835419144563e-08, + "logits/chosen": -1.3308337926864624, + "logits/rejected": -1.2767422199249268, + "logps/chosen": -234.8633270263672, + "logps/rejected": -378.3427429199219, + "loss": 0.3657, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.812017798423767, + "rewards/margins": 1.413147211074829, + "rewards/rejected": -3.2251651287078857, + "step": 9300 + }, + { + "epoch": 1.6, + "eval_logits/chosen": -1.4361413717269897, + "eval_logits/rejected": -1.409943699836731, + "eval_logps/chosen": -239.39939880371094, + "eval_logps/rejected": -288.6492614746094, + "eval_loss": 0.6325801014900208, + "eval_rewards/accuracies": 0.6638011336326599, + "eval_rewards/chosen": -1.806955337524414, + "eval_rewards/margins": 0.4479631185531616, + "eval_rewards/rejected": -2.2549185752868652, + "eval_runtime": 358.0378, + "eval_samples_per_second": 12.021, + "eval_steps_per_second": 1.503, + "step": 9300 + }, + { + "epoch": 1.6, + "grad_norm": 23.678957806069693, + "learning_rate": 5.7355397142212495e-08, + "logits/chosen": -1.3521010875701904, + "logits/rejected": -1.2914403676986694, + "logps/chosen": -218.249267578125, + "logps/rejected": -341.4629211425781, + "loss": 0.4569, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.6451466083526611, + "rewards/margins": 1.2658127546310425, + "rewards/rejected": -2.910959482192993, + "step": 9310 + }, + { + "epoch": 1.61, + "grad_norm": 35.87762222953201, + "learning_rate": 5.687711723983907e-08, + "logits/chosen": -1.4106115102767944, + "logits/rejected": -1.3428720235824585, + "logps/chosen": -235.66696166992188, + "logps/rejected": -378.86505126953125, + "loss": 0.4091, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.8487489223480225, + "rewards/margins": 1.3959980010986328, + "rewards/rejected": -3.2447471618652344, + "step": 9320 + }, + { + "epoch": 1.61, + "grad_norm": 36.79795244053593, + "learning_rate": 5.640058376256437e-08, + "logits/chosen": -1.3952717781066895, + "logits/rejected": -1.3357038497924805, + "logps/chosen": -222.98147583007812, + "logps/rejected": -338.1329650878906, + "loss": 0.458, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.6642459630966187, + "rewards/margins": 1.166606068611145, + "rewards/rejected": -2.8308520317077637, + "step": 9330 + }, + { + "epoch": 1.61, + "grad_norm": 21.105194172978344, + "learning_rate": 5.5925801019717637e-08, + "logits/chosen": -1.308809518814087, + "logits/rejected": -1.2506635189056396, + "logps/chosen": -229.06600952148438, + "logps/rejected": -373.9830627441406, + "loss": 0.4046, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.7726303339004517, + "rewards/margins": 1.4514251947402954, + "rewards/rejected": -3.224055528640747, + "step": 9340 + }, + { + "epoch": 1.61, + "grad_norm": 41.165260710581435, + "learning_rate": 5.5452773304795585e-08, + "logits/chosen": -1.3799827098846436, + "logits/rejected": -1.31058669090271, + "logps/chosen": -214.2169647216797, + "logps/rejected": -341.6388854980469, + "loss": 0.4018, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.6169312000274658, + "rewards/margins": 1.2802019119262695, + "rewards/rejected": -2.8971328735351562, + "step": 9350 + }, + { + "epoch": 1.61, + "grad_norm": 30.89004835348826, + "learning_rate": 5.4981504895424273e-08, + "logits/chosen": -1.4255657196044922, + "logits/rejected": -1.349281668663025, + "logps/chosen": -211.1226043701172, + "logps/rejected": -347.12811279296875, + "loss": 0.3748, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.5445703268051147, + "rewards/margins": 1.4008324146270752, + "rewards/rejected": -2.9454026222229004, + "step": 9360 + }, + { + "epoch": 1.61, + "grad_norm": 26.504198753445007, + "learning_rate": 5.4512000053320266e-08, + "logits/chosen": -1.4140576124191284, + "logits/rejected": -1.333478569984436, + "logps/chosen": -233.5621795654297, + "logps/rejected": -371.4867248535156, + "loss": 0.3992, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.7974094152450562, + "rewards/margins": 1.403607726097107, + "rewards/rejected": -3.201017379760742, + "step": 9370 + }, + { + "epoch": 1.62, + "grad_norm": 22.785892226259428, + "learning_rate": 5.4044263024251994e-08, + "logits/chosen": -1.3954850435256958, + "logits/rejected": -1.3420671224594116, + "logps/chosen": -226.4377899169922, + "logps/rejected": -347.89373779296875, + "loss": 0.4574, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.7203410863876343, + "rewards/margins": 1.2211310863494873, + "rewards/rejected": -2.941472291946411, + "step": 9380 + }, + { + "epoch": 1.62, + "grad_norm": 31.550228794525225, + "learning_rate": 5.357829803800137e-08, + "logits/chosen": -1.2319252490997314, + "logits/rejected": -1.172555923461914, + "logps/chosen": -237.8747100830078, + "logps/rejected": -377.8671875, + "loss": 0.409, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.8478126525878906, + "rewards/margins": 1.3827223777770996, + "rewards/rejected": -3.2305350303649902, + "step": 9390 + }, + { + "epoch": 1.62, + "grad_norm": 27.350825817958476, + "learning_rate": 5.3114109308325743e-08, + "logits/chosen": -1.2861920595169067, + "logits/rejected": -1.2316094636917114, + "logps/chosen": -221.30990600585938, + "logps/rejected": -339.2297058105469, + "loss": 0.4666, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.6726821660995483, + "rewards/margins": 1.1769441366195679, + "rewards/rejected": -2.849626302719116, + "step": 9400 + }, + { + "epoch": 1.62, + "eval_logits/chosen": -1.437654733657837, + "eval_logits/rejected": -1.4113147258758545, + "eval_logps/chosen": -238.54745483398438, + "eval_logps/rejected": -287.8304138183594, + "eval_loss": 0.6324562430381775, + "eval_rewards/accuracies": 0.6631041169166565, + "eval_rewards/chosen": -1.7984360456466675, + "eval_rewards/margins": 0.4482942521572113, + "eval_rewards/rejected": -2.246730327606201, + "eval_runtime": 358.2323, + "eval_samples_per_second": 12.015, + "eval_steps_per_second": 1.502, + "step": 9400 + }, + { + "epoch": 1.62, + "grad_norm": 37.163291490549675, + "learning_rate": 5.265170103291952e-08, + "logits/chosen": -1.3120262622833252, + "logits/rejected": -1.2514145374298096, + "logps/chosen": -221.3708038330078, + "logps/rejected": -354.61456298828125, + "loss": 0.4083, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.6749778985977173, + "rewards/margins": 1.3499835729599, + "rewards/rejected": -3.0249617099761963, + "step": 9410 + }, + { + "epoch": 1.62, + "grad_norm": 38.28562781683107, + "learning_rate": 5.2191077393376165e-08, + "logits/chosen": -1.3560113906860352, + "logits/rejected": -1.298722267150879, + "logps/chosen": -231.69131469726562, + "logps/rejected": -346.3023376464844, + "loss": 0.4541, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.7664096355438232, + "rewards/margins": 1.1748406887054443, + "rewards/rejected": -2.9412503242492676, + "step": 9420 + }, + { + "epoch": 1.62, + "grad_norm": 31.533083950987773, + "learning_rate": 5.173224255515099e-08, + "logits/chosen": -1.3096221685409546, + "logits/rejected": -1.2421365976333618, + "logps/chosen": -223.90087890625, + "logps/rejected": -373.5675048828125, + "loss": 0.4027, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.7149451971054077, + "rewards/margins": 1.5314754247665405, + "rewards/rejected": -3.2464206218719482, + "step": 9430 + }, + { + "epoch": 1.63, + "grad_norm": 40.702906044337006, + "learning_rate": 5.127520066752256e-08, + "logits/chosen": -1.2992658615112305, + "logits/rejected": -1.2521995306015015, + "logps/chosen": -227.48019409179688, + "logps/rejected": -347.92181396484375, + "loss": 0.4109, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -1.7786552906036377, + "rewards/margins": 1.176113247871399, + "rewards/rejected": -2.954768419265747, + "step": 9440 + }, + { + "epoch": 1.63, + "grad_norm": 32.090498966901635, + "learning_rate": 5.0819955863555916e-08, + "logits/chosen": -1.4480046033859253, + "logits/rejected": -1.3969746828079224, + "logps/chosen": -240.8600616455078, + "logps/rejected": -356.9917297363281, + "loss": 0.4612, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.874746561050415, + "rewards/margins": 1.189202070236206, + "rewards/rejected": -3.0639488697052, + "step": 9450 + }, + { + "epoch": 1.63, + "grad_norm": 17.790347252816513, + "learning_rate": 5.0366512260064883e-08, + "logits/chosen": -1.310302734375, + "logits/rejected": -1.2530990839004517, + "logps/chosen": -209.5319061279297, + "logps/rejected": -382.5646667480469, + "loss": 0.3101, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -1.5730886459350586, + "rewards/margins": 1.7125349044799805, + "rewards/rejected": -3.285623550415039, + "step": 9460 + }, + { + "epoch": 1.63, + "grad_norm": 38.95147289526329, + "learning_rate": 4.9914873957574906e-08, + "logits/chosen": -1.1751810312271118, + "logits/rejected": -1.1032475233078003, + "logps/chosen": -228.77847290039062, + "logps/rejected": -351.155517578125, + "loss": 0.4307, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -1.7615420818328857, + "rewards/margins": 1.2454551458358765, + "rewards/rejected": -3.006997585296631, + "step": 9470 + }, + { + "epoch": 1.63, + "grad_norm": 26.885837020564978, + "learning_rate": 4.94650450402859e-08, + "logits/chosen": -1.3068211078643799, + "logits/rejected": -1.22744882106781, + "logps/chosen": -227.61032104492188, + "logps/rejected": -368.6788330078125, + "loss": 0.3906, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -1.7363221645355225, + "rewards/margins": 1.4241564273834229, + "rewards/rejected": -3.1604788303375244, + "step": 9480 + }, + { + "epoch": 1.64, + "grad_norm": 30.073094764522576, + "learning_rate": 4.9017029576035404e-08, + "logits/chosen": -1.2682311534881592, + "logits/rejected": -1.2089643478393555, + "logps/chosen": -231.9444580078125, + "logps/rejected": -363.248291015625, + "loss": 0.3954, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -1.7784042358398438, + "rewards/margins": 1.3314237594604492, + "rewards/rejected": -3.109828233718872, + "step": 9490 + }, + { + "epoch": 1.64, + "grad_norm": 23.54399691970181, + "learning_rate": 4.857083161626174e-08, + "logits/chosen": -1.3256847858428955, + "logits/rejected": -1.2643065452575684, + "logps/chosen": -226.8311004638672, + "logps/rejected": -383.41436767578125, + "loss": 0.3503, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7317718267440796, + "rewards/margins": 1.551546335220337, + "rewards/rejected": -3.283318042755127, + "step": 9500 + }, + { + "epoch": 1.64, + "eval_logits/chosen": -1.402784824371338, + "eval_logits/rejected": -1.3757189512252808, + "eval_logps/chosen": -252.00526428222656, + "eval_logps/rejected": -304.04388427734375, + "eval_loss": 0.6339713931083679, + "eval_rewards/accuracies": 0.6586896181106567, + "eval_rewards/chosen": -1.9330142736434937, + "eval_rewards/margins": 0.4758506715297699, + "eval_rewards/rejected": -2.408864736557007, + "eval_runtime": 358.2372, + "eval_samples_per_second": 12.014, + "eval_steps_per_second": 1.502, + "step": 9500 + }, + { + "epoch": 1.64, + "grad_norm": 33.834138719354485, + "learning_rate": 4.812645519596748e-08, + "logits/chosen": -1.2126704454421997, + "logits/rejected": -1.152756929397583, + "logps/chosen": -234.50845336914062, + "logps/rejected": -376.29010009765625, + "loss": 0.3692, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.8366127014160156, + "rewards/margins": 1.3869558572769165, + "rewards/rejected": -3.2235684394836426, + "step": 9510 + }, + { + "epoch": 1.64, + "grad_norm": 24.273551744010256, + "learning_rate": 4.7683904333682715e-08, + "logits/chosen": -1.439879059791565, + "logits/rejected": -1.3854401111602783, + "logps/chosen": -251.1964874267578, + "logps/rejected": -393.2129821777344, + "loss": 0.426, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.9885809421539307, + "rewards/margins": 1.3651524782180786, + "rewards/rejected": -3.3537330627441406, + "step": 9520 + }, + { + "epoch": 1.64, + "grad_norm": 34.99697034368316, + "learning_rate": 4.72431830314291e-08, + "logits/chosen": -1.3440197706222534, + "logits/rejected": -1.2706291675567627, + "logps/chosen": -233.45089721679688, + "logps/rejected": -384.12225341796875, + "loss": 0.368, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.8057721853256226, + "rewards/margins": 1.5267770290374756, + "rewards/rejected": -3.3325493335723877, + "step": 9530 + }, + { + "epoch": 1.64, + "grad_norm": 41.925642700921735, + "learning_rate": 4.68042952746831e-08, + "logits/chosen": -1.247374415397644, + "logits/rejected": -1.1915947198867798, + "logps/chosen": -240.2723388671875, + "logps/rejected": -373.23394775390625, + "loss": 0.4017, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.8617527484893799, + "rewards/margins": 1.3366732597351074, + "rewards/rejected": -3.1984262466430664, + "step": 9540 + }, + { + "epoch": 1.65, + "grad_norm": 33.59274463489468, + "learning_rate": 4.636724503234074e-08, + "logits/chosen": -1.3158290386199951, + "logits/rejected": -1.2710721492767334, + "logps/chosen": -234.71505737304688, + "logps/rejected": -374.32415771484375, + "loss": 0.4214, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.8152387142181396, + "rewards/margins": 1.3604776859283447, + "rewards/rejected": -3.1757161617279053, + "step": 9550 + }, + { + "epoch": 1.65, + "grad_norm": 34.17989472034478, + "learning_rate": 4.593203625668077e-08, + "logits/chosen": -1.4079006910324097, + "logits/rejected": -1.3559339046478271, + "logps/chosen": -226.49813842773438, + "logps/rejected": -359.3128967285156, + "loss": 0.3967, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.728776216506958, + "rewards/margins": 1.3338409662246704, + "rewards/rejected": -3.0626168251037598, + "step": 9560 + }, + { + "epoch": 1.65, + "grad_norm": 31.348331592601255, + "learning_rate": 4.549867288332987e-08, + "logits/chosen": -1.2812343835830688, + "logits/rejected": -1.2303446531295776, + "logps/chosen": -227.1624298095703, + "logps/rejected": -361.9521789550781, + "loss": 0.4189, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.7691608667373657, + "rewards/margins": 1.3438093662261963, + "rewards/rejected": -3.1129703521728516, + "step": 9570 + }, + { + "epoch": 1.65, + "grad_norm": 39.69075540606707, + "learning_rate": 4.5067158831226273e-08, + "logits/chosen": -1.3360213041305542, + "logits/rejected": -1.2739002704620361, + "logps/chosen": -245.7711639404297, + "logps/rejected": -386.2242126464844, + "loss": 0.4138, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.9129165410995483, + "rewards/margins": 1.3799970149993896, + "rewards/rejected": -3.2929134368896484, + "step": 9580 + }, + { + "epoch": 1.65, + "grad_norm": 33.115164323400506, + "learning_rate": 4.463749800258479e-08, + "logits/chosen": -1.444392442703247, + "logits/rejected": -1.3800554275512695, + "logps/chosen": -227.8526153564453, + "logps/rejected": -368.66693115234375, + "loss": 0.3906, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.7177770137786865, + "rewards/margins": 1.4058539867401123, + "rewards/rejected": -3.123631000518799, + "step": 9590 + }, + { + "epoch": 1.65, + "grad_norm": 41.69230357375958, + "learning_rate": 4.420969428286139e-08, + "logits/chosen": -1.2808119058609009, + "logits/rejected": -1.2005847692489624, + "logps/chosen": -230.84872436523438, + "logps/rejected": -385.7454528808594, + "loss": 0.3729, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -1.7579460144042969, + "rewards/margins": 1.5761973857879639, + "rewards/rejected": -3.334143877029419, + "step": 9600 + }, + { + "epoch": 1.65, + "eval_logits/chosen": -1.391427993774414, + "eval_logits/rejected": -1.3641211986541748, + "eval_logps/chosen": -252.29434204101562, + "eval_logps/rejected": -304.6582946777344, + "eval_loss": 0.6356525421142578, + "eval_rewards/accuracies": 0.6563661694526672, + "eval_rewards/chosen": -1.935904860496521, + "eval_rewards/margins": 0.4791041910648346, + "eval_rewards/rejected": -2.415009021759033, + "eval_runtime": 357.9055, + "eval_samples_per_second": 12.026, + "eval_steps_per_second": 1.503, + "step": 9600 + }, + { + "epoch": 1.66, + "grad_norm": 38.45195266525548, + "learning_rate": 4.378375154071806e-08, + "logits/chosen": -1.2705994844436646, + "logits/rejected": -1.2090730667114258, + "logps/chosen": -229.653076171875, + "logps/rejected": -377.4794006347656, + "loss": 0.3925, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.7335882186889648, + "rewards/margins": 1.491674780845642, + "rewards/rejected": -3.2252631187438965, + "step": 9610 + }, + { + "epoch": 1.66, + "grad_norm": 34.39068811627909, + "learning_rate": 4.335967362798787e-08, + "logits/chosen": -1.4060554504394531, + "logits/rejected": -1.3566725254058838, + "logps/chosen": -241.97201538085938, + "logps/rejected": -348.53228759765625, + "loss": 0.4833, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.8736858367919922, + "rewards/margins": 1.0578181743621826, + "rewards/rejected": -2.9315037727355957, + "step": 9620 + }, + { + "epoch": 1.66, + "grad_norm": 41.46128098759825, + "learning_rate": 4.293746437963983e-08, + "logits/chosen": -1.3315681219100952, + "logits/rejected": -1.2634027004241943, + "logps/chosen": -258.986328125, + "logps/rejected": -377.8593444824219, + "loss": 0.4636, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.0355629920959473, + "rewards/margins": 1.2129532098770142, + "rewards/rejected": -3.248516082763672, + "step": 9630 + }, + { + "epoch": 1.66, + "grad_norm": 43.601466824424364, + "learning_rate": 4.2517127613744986e-08, + "logits/chosen": -1.3833634853363037, + "logits/rejected": -1.3209584951400757, + "logps/chosen": -235.45803833007812, + "logps/rejected": -356.86627197265625, + "loss": 0.4311, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.7948541641235352, + "rewards/margins": 1.2689039707183838, + "rewards/rejected": -3.063758134841919, + "step": 9640 + }, + { + "epoch": 1.66, + "grad_norm": 29.63300631760297, + "learning_rate": 4.209866713144078e-08, + "logits/chosen": -1.2863609790802002, + "logits/rejected": -1.2273352146148682, + "logps/chosen": -234.44369506835938, + "logps/rejected": -349.785888671875, + "loss": 0.4813, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.7898155450820923, + "rewards/margins": 1.1682324409484863, + "rewards/rejected": -2.958048105239868, + "step": 9650 + }, + { + "epoch": 1.66, + "grad_norm": 28.816878634667912, + "learning_rate": 4.1682086716897826e-08, + "logits/chosen": -1.3099644184112549, + "logits/rejected": -1.264509916305542, + "logps/chosen": -219.1770477294922, + "logps/rejected": -347.0464172363281, + "loss": 0.4171, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.6862246990203857, + "rewards/margins": 1.2479627132415771, + "rewards/rejected": -2.934187412261963, + "step": 9660 + }, + { + "epoch": 1.67, + "grad_norm": 29.767921273426662, + "learning_rate": 4.1267390137284725e-08, + "logits/chosen": -1.3837589025497437, + "logits/rejected": -1.307064414024353, + "logps/chosen": -233.31289672851562, + "logps/rejected": -396.89337158203125, + "loss": 0.3643, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.7717788219451904, + "rewards/margins": 1.6641696691513062, + "rewards/rejected": -3.435948610305786, + "step": 9670 + }, + { + "epoch": 1.67, + "grad_norm": 35.63166890222917, + "learning_rate": 4.085458114273463e-08, + "logits/chosen": -1.3208513259887695, + "logits/rejected": -1.2681634426116943, + "logps/chosen": -229.1538848876953, + "logps/rejected": -348.280029296875, + "loss": 0.4797, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.7467445135116577, + "rewards/margins": 1.2142714262008667, + "rewards/rejected": -2.9610159397125244, + "step": 9680 + }, + { + "epoch": 1.67, + "grad_norm": 37.72367340978869, + "learning_rate": 4.044366346631106e-08, + "logits/chosen": -1.248106598854065, + "logits/rejected": -1.1902921199798584, + "logps/chosen": -231.75619506835938, + "logps/rejected": -361.8299255371094, + "loss": 0.4176, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -1.7594753503799438, + "rewards/margins": 1.3130239248275757, + "rewards/rejected": -3.0724992752075195, + "step": 9690 + }, + { + "epoch": 1.67, + "grad_norm": 29.824443223289826, + "learning_rate": 4.00346408239742e-08, + "logits/chosen": -1.2740222215652466, + "logits/rejected": -1.2087305784225464, + "logps/chosen": -237.3057403564453, + "logps/rejected": -371.45361328125, + "loss": 0.4403, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.8458435535430908, + "rewards/margins": 1.3461719751358032, + "rewards/rejected": -3.1920151710510254, + "step": 9700 + }, + { + "epoch": 1.67, + "eval_logits/chosen": -1.417214274406433, + "eval_logits/rejected": -1.390296220779419, + "eval_logps/chosen": -244.72193908691406, + "eval_logps/rejected": -295.69439697265625, + "eval_loss": 0.6342071294784546, + "eval_rewards/accuracies": 0.6624070405960083, + "eval_rewards/chosen": -1.8601804971694946, + "eval_rewards/margins": 0.46518951654434204, + "eval_rewards/rejected": -2.3253698348999023, + "eval_runtime": 357.4423, + "eval_samples_per_second": 12.041, + "eval_steps_per_second": 1.505, + "step": 9700 + }, + { + "epoch": 1.67, + "grad_norm": 37.595203889217025, + "learning_rate": 3.96275169145473e-08, + "logits/chosen": -1.16835618019104, + "logits/rejected": -1.1231775283813477, + "logps/chosen": -235.44985961914062, + "logps/rejected": -346.90167236328125, + "loss": 0.4504, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.8184646368026733, + "rewards/margins": 1.131596326828003, + "rewards/rejected": -2.950061082839966, + "step": 9710 + }, + { + "epoch": 1.67, + "grad_norm": 45.4457406884452, + "learning_rate": 3.922229541968322e-08, + "logits/chosen": -1.3615391254425049, + "logits/rejected": -1.312709927558899, + "logps/chosen": -238.9713134765625, + "logps/rejected": -351.1409912109375, + "loss": 0.5068, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.8562841415405273, + "rewards/margins": 1.1417663097381592, + "rewards/rejected": -2.9980504512786865, + "step": 9720 + }, + { + "epoch": 1.68, + "grad_norm": 33.15996891543965, + "learning_rate": 3.881898000383116e-08, + "logits/chosen": -1.3564598560333252, + "logits/rejected": -1.3007166385650635, + "logps/chosen": -208.45571899414062, + "logps/rejected": -348.89300537109375, + "loss": 0.4111, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -1.5474106073379517, + "rewards/margins": 1.3974087238311768, + "rewards/rejected": -2.944819450378418, + "step": 9730 + }, + { + "epoch": 1.68, + "grad_norm": 32.7011999016843, + "learning_rate": 3.841757431420351e-08, + "logits/chosen": -1.3424409627914429, + "logits/rejected": -1.2776859998703003, + "logps/chosen": -232.28494262695312, + "logps/rejected": -366.96038818359375, + "loss": 0.4073, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -1.7781145572662354, + "rewards/margins": 1.3590996265411377, + "rewards/rejected": -3.137214183807373, + "step": 9740 + }, + { + "epoch": 1.68, + "grad_norm": 26.93270991808977, + "learning_rate": 3.801808198074266e-08, + "logits/chosen": -1.367996096611023, + "logits/rejected": -1.290379285812378, + "logps/chosen": -232.1277313232422, + "logps/rejected": -353.48175048828125, + "loss": 0.3957, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.7703111171722412, + "rewards/margins": 1.271135926246643, + "rewards/rejected": -3.0414466857910156, + "step": 9750 + }, + { + "epoch": 1.68, + "grad_norm": 19.984914469702172, + "learning_rate": 3.7620506616088817e-08, + "logits/chosen": -1.3691097497940063, + "logits/rejected": -1.306137204170227, + "logps/chosen": -238.61715698242188, + "logps/rejected": -359.2353210449219, + "loss": 0.4193, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.8544162511825562, + "rewards/margins": 1.225228190422058, + "rewards/rejected": -3.079644203186035, + "step": 9760 + }, + { + "epoch": 1.68, + "grad_norm": 28.256879995074712, + "learning_rate": 3.72248518155463e-08, + "logits/chosen": -1.2638490200042725, + "logits/rejected": -1.198878526687622, + "logps/chosen": -214.2969512939453, + "logps/rejected": -354.8340148925781, + "loss": 0.3748, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.6020538806915283, + "rewards/margins": 1.4068208932876587, + "rewards/rejected": -3.0088746547698975, + "step": 9770 + }, + { + "epoch": 1.69, + "grad_norm": 34.80859221520256, + "learning_rate": 3.683112115705225e-08, + "logits/chosen": -1.3776103258132935, + "logits/rejected": -1.2958616018295288, + "logps/chosen": -220.2239227294922, + "logps/rejected": -346.7090759277344, + "loss": 0.4194, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.5932377576828003, + "rewards/margins": 1.3576481342315674, + "rewards/rejected": -2.9508860111236572, + "step": 9780 + }, + { + "epoch": 1.69, + "grad_norm": 49.153516151462675, + "learning_rate": 3.6439318201143096e-08, + "logits/chosen": -1.336469054222107, + "logits/rejected": -1.2982733249664307, + "logps/chosen": -235.4860382080078, + "logps/rejected": -368.72442626953125, + "loss": 0.4339, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.8210595846176147, + "rewards/margins": 1.2906904220581055, + "rewards/rejected": -3.1117501258850098, + "step": 9790 + }, + { + "epoch": 1.69, + "grad_norm": 45.583738179863275, + "learning_rate": 3.604944649092323e-08, + "logits/chosen": -1.3702198266983032, + "logits/rejected": -1.2832942008972168, + "logps/chosen": -227.76315307617188, + "logps/rejected": -392.9060363769531, + "loss": 0.3633, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.7098026275634766, + "rewards/margins": 1.682472586631775, + "rewards/rejected": -3.392275333404541, + "step": 9800 + }, + { + "epoch": 1.69, + "eval_logits/chosen": -1.4198648929595947, + "eval_logits/rejected": -1.3927710056304932, + "eval_logps/chosen": -244.33860778808594, + "eval_logps/rejected": -295.2367248535156, + "eval_loss": 0.6346299648284912, + "eval_rewards/accuracies": 0.6589219570159912, + "eval_rewards/chosen": -1.856347680091858, + "eval_rewards/margins": 0.4644457995891571, + "eval_rewards/rejected": -2.320793390274048, + "eval_runtime": 357.9375, + "eval_samples_per_second": 12.024, + "eval_steps_per_second": 1.503, + "step": 9800 + }, + { + "epoch": 1.69, + "grad_norm": 27.982644219340326, + "learning_rate": 3.566150955203251e-08, + "logits/chosen": -1.2991117238998413, + "logits/rejected": -1.2290585041046143, + "logps/chosen": -232.1947784423828, + "logps/rejected": -362.37738037109375, + "loss": 0.438, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.767719030380249, + "rewards/margins": 1.3174628019332886, + "rewards/rejected": -3.085181951522827, + "step": 9810 + }, + { + "epoch": 1.69, + "grad_norm": 62.700137314497454, + "learning_rate": 3.52755108926146e-08, + "logits/chosen": -1.3249415159225464, + "logits/rejected": -1.2635711431503296, + "logps/chosen": -226.71157836914062, + "logps/rejected": -362.6283874511719, + "loss": 0.4126, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.7259670495986938, + "rewards/margins": 1.360840916633606, + "rewards/rejected": -3.0868077278137207, + "step": 9820 + }, + { + "epoch": 1.69, + "grad_norm": 31.049190619621527, + "learning_rate": 3.489145400328511e-08, + "logits/chosen": -1.3645232915878296, + "logits/rejected": -1.3135316371917725, + "logps/chosen": -240.0381317138672, + "logps/rejected": -366.198486328125, + "loss": 0.4544, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.820874810218811, + "rewards/margins": 1.2675881385803223, + "rewards/rejected": -3.088463068008423, + "step": 9830 + }, + { + "epoch": 1.7, + "grad_norm": 31.393346729361205, + "learning_rate": 3.4509342357099904e-08, + "logits/chosen": -1.3261866569519043, + "logits/rejected": -1.248307704925537, + "logps/chosen": -227.368896484375, + "logps/rejected": -373.2166442871094, + "loss": 0.4319, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -1.7153198719024658, + "rewards/margins": 1.4764636754989624, + "rewards/rejected": -3.1917834281921387, + "step": 9840 + }, + { + "epoch": 1.7, + "grad_norm": 28.920399059721735, + "learning_rate": 3.4129179409524225e-08, + "logits/chosen": -1.3411327600479126, + "logits/rejected": -1.2954285144805908, + "logps/chosen": -218.49609375, + "logps/rejected": -335.6559753417969, + "loss": 0.4306, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -1.639061689376831, + "rewards/margins": 1.2008213996887207, + "rewards/rejected": -2.8398830890655518, + "step": 9850 + }, + { + "epoch": 1.7, + "grad_norm": 42.06449748795488, + "learning_rate": 3.375096859840071e-08, + "logits/chosen": -1.4100219011306763, + "logits/rejected": -1.3593881130218506, + "logps/chosen": -245.61587524414062, + "logps/rejected": -357.762939453125, + "loss": 0.4891, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.89302659034729, + "rewards/margins": 1.1459534168243408, + "rewards/rejected": -3.038980007171631, + "step": 9860 + }, + { + "epoch": 1.7, + "grad_norm": 39.095333220619935, + "learning_rate": 3.337471334391903e-08, + "logits/chosen": -1.3635808229446411, + "logits/rejected": -1.2952406406402588, + "logps/chosen": -218.5416717529297, + "logps/rejected": -349.49041748046875, + "loss": 0.4054, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.658063530921936, + "rewards/margins": 1.3279998302459717, + "rewards/rejected": -2.9860637187957764, + "step": 9870 + }, + { + "epoch": 1.7, + "grad_norm": 26.222603081146218, + "learning_rate": 3.300041704858425e-08, + "logits/chosen": -1.2638031244277954, + "logits/rejected": -1.2089149951934814, + "logps/chosen": -224.624755859375, + "logps/rejected": -374.73345947265625, + "loss": 0.3794, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.7256309986114502, + "rewards/margins": 1.4671021699905396, + "rewards/rejected": -3.1927330493927, + "step": 9880 + }, + { + "epoch": 1.7, + "grad_norm": 32.364868235731464, + "learning_rate": 3.262808309718668e-08, + "logits/chosen": -1.2537232637405396, + "logits/rejected": -1.204134225845337, + "logps/chosen": -237.5514373779297, + "logps/rejected": -368.892578125, + "loss": 0.4235, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.8114869594573975, + "rewards/margins": 1.3386226892471313, + "rewards/rejected": -3.1501097679138184, + "step": 9890 + }, + { + "epoch": 1.71, + "grad_norm": 42.03184187301272, + "learning_rate": 3.2257714856770866e-08, + "logits/chosen": -1.3525625467300415, + "logits/rejected": -1.2814313173294067, + "logps/chosen": -209.0051727294922, + "logps/rejected": -360.9290466308594, + "loss": 0.3727, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -1.5728161334991455, + "rewards/margins": 1.538694977760315, + "rewards/rejected": -3.111510992050171, + "step": 9900 + }, + { + "epoch": 1.71, + "eval_logits/chosen": -1.424879550933838, + "eval_logits/rejected": -1.397822618484497, + "eval_logps/chosen": -246.3584747314453, + "eval_logps/rejected": -297.6012878417969, + "eval_loss": 0.6336408853530884, + "eval_rewards/accuracies": 0.6556691527366638, + "eval_rewards/chosen": -1.876546025276184, + "eval_rewards/margins": 0.46789297461509705, + "eval_rewards/rejected": -2.3444390296936035, + "eval_runtime": 357.1672, + "eval_samples_per_second": 12.05, + "eval_steps_per_second": 1.506, + "step": 9900 + }, + { + "epoch": 1.71, + "grad_norm": 26.80328363192874, + "learning_rate": 3.1889315676605325e-08, + "logits/chosen": -1.4107153415679932, + "logits/rejected": -1.322643756866455, + "logps/chosen": -216.0201873779297, + "logps/rejected": -352.2988586425781, + "loss": 0.4129, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.5847244262695312, + "rewards/margins": 1.4318273067474365, + "rewards/rejected": -3.0165517330169678, + "step": 9910 + }, + { + "epoch": 1.71, + "grad_norm": 31.18899577015784, + "learning_rate": 3.152288888815227e-08, + "logits/chosen": -1.3761640787124634, + "logits/rejected": -1.3048861026763916, + "logps/chosen": -222.0412139892578, + "logps/rejected": -367.63140869140625, + "loss": 0.3517, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -1.6936910152435303, + "rewards/margins": 1.4894893169403076, + "rewards/rejected": -3.183180332183838, + "step": 9920 + }, + { + "epoch": 1.71, + "grad_norm": 35.38732271471543, + "learning_rate": 3.1158437805037296e-08, + "logits/chosen": -1.3241338729858398, + "logits/rejected": -1.276207685470581, + "logps/chosen": -219.0576171875, + "logps/rejected": -356.7098693847656, + "loss": 0.4241, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.671561598777771, + "rewards/margins": 1.347208023071289, + "rewards/rejected": -3.0187695026397705, + "step": 9930 + }, + { + "epoch": 1.71, + "grad_norm": 24.595744118468183, + "learning_rate": 3.079596572301965e-08, + "logits/chosen": -1.4158904552459717, + "logits/rejected": -1.3684117794036865, + "logps/chosen": -236.92745971679688, + "logps/rejected": -355.1565246582031, + "loss": 0.4652, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.8187148571014404, + "rewards/margins": 1.125727653503418, + "rewards/rejected": -2.9444425106048584, + "step": 9940 + }, + { + "epoch": 1.71, + "grad_norm": 59.09049440288645, + "learning_rate": 3.043547591996226e-08, + "logits/chosen": -1.3634991645812988, + "logits/rejected": -1.2813036441802979, + "logps/chosen": -228.6792755126953, + "logps/rejected": -375.29547119140625, + "loss": 0.3893, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.7471336126327515, + "rewards/margins": 1.4925024509429932, + "rewards/rejected": -3.239635944366455, + "step": 9950 + }, + { + "epoch": 1.72, + "grad_norm": 50.2107128049137, + "learning_rate": 3.0076971655802196e-08, + "logits/chosen": -1.3980926275253296, + "logits/rejected": -1.3432905673980713, + "logps/chosen": -247.3916015625, + "logps/rejected": -370.38458251953125, + "loss": 0.4546, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.9397674798965454, + "rewards/margins": 1.2183306217193604, + "rewards/rejected": -3.158097743988037, + "step": 9960 + }, + { + "epoch": 1.72, + "grad_norm": 31.69304656950342, + "learning_rate": 2.972045617252114e-08, + "logits/chosen": -1.37814462184906, + "logits/rejected": -1.328137755393982, + "logps/chosen": -225.2104949951172, + "logps/rejected": -350.4515075683594, + "loss": 0.4552, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.7387615442276, + "rewards/margins": 1.2483638525009155, + "rewards/rejected": -2.9871251583099365, + "step": 9970 + }, + { + "epoch": 1.72, + "grad_norm": 18.89889146936782, + "learning_rate": 2.9365932694115913e-08, + "logits/chosen": -1.2746312618255615, + "logits/rejected": -1.2151672840118408, + "logps/chosen": -238.1916046142578, + "logps/rejected": -384.50164794921875, + "loss": 0.4013, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.8541069030761719, + "rewards/margins": 1.459223747253418, + "rewards/rejected": -3.3133304119110107, + "step": 9980 + }, + { + "epoch": 1.72, + "grad_norm": 42.47918178591677, + "learning_rate": 2.9013404426569855e-08, + "logits/chosen": -1.3667715787887573, + "logits/rejected": -1.2965288162231445, + "logps/chosen": -238.6469268798828, + "logps/rejected": -354.388427734375, + "loss": 0.4658, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.8494361639022827, + "rewards/margins": 1.195669412612915, + "rewards/rejected": -3.045105457305908, + "step": 9990 + }, + { + "epoch": 1.72, + "grad_norm": 21.723980366758628, + "learning_rate": 2.8662874557823013e-08, + "logits/chosen": -1.37138831615448, + "logits/rejected": -1.3194690942764282, + "logps/chosen": -232.44027709960938, + "logps/rejected": -357.5736389160156, + "loss": 0.424, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.7671289443969727, + "rewards/margins": 1.239452600479126, + "rewards/rejected": -3.0065817832946777, + "step": 10000 + }, + { + "epoch": 1.72, + "eval_logits/chosen": -1.4226433038711548, + "eval_logits/rejected": -1.3957637548446655, + "eval_logps/chosen": -245.6855010986328, + "eval_logps/rejected": -296.6435852050781, + "eval_loss": 0.6344332098960876, + "eval_rewards/accuracies": 0.6514869928359985, + "eval_rewards/chosen": -1.8698163032531738, + "eval_rewards/margins": 0.4650455117225647, + "eval_rewards/rejected": -2.334861993789673, + "eval_runtime": 357.1222, + "eval_samples_per_second": 12.052, + "eval_steps_per_second": 1.506, + "step": 10000 + }, + { + "epoch": 1.72, + "grad_norm": 19.257035851897566, + "learning_rate": 2.8314346257744177e-08, + "logits/chosen": -1.3713723421096802, + "logits/rejected": -1.3086481094360352, + "logps/chosen": -225.0717010498047, + "logps/rejected": -362.225830078125, + "loss": 0.3966, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.7269824743270874, + "rewards/margins": 1.3675779104232788, + "rewards/rejected": -3.094560146331787, + "step": 10010 + }, + { + "epoch": 1.73, + "grad_norm": 33.76415450194936, + "learning_rate": 2.7967822678101466e-08, + "logits/chosen": -1.3136873245239258, + "logits/rejected": -1.243048071861267, + "logps/chosen": -228.7400360107422, + "logps/rejected": -360.3517761230469, + "loss": 0.411, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.75080144405365, + "rewards/margins": 1.3429168462753296, + "rewards/rejected": -3.0937180519104004, + "step": 10020 + }, + { + "epoch": 1.73, + "grad_norm": 30.936869629766193, + "learning_rate": 2.7623306952534316e-08, + "logits/chosen": -1.3288711309432983, + "logits/rejected": -1.260801076889038, + "logps/chosen": -239.26663208007812, + "logps/rejected": -358.018798828125, + "loss": 0.4206, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.8187427520751953, + "rewards/margins": 1.2403138875961304, + "rewards/rejected": -3.0590567588806152, + "step": 10030 + }, + { + "epoch": 1.73, + "grad_norm": 23.395127833799734, + "learning_rate": 2.728080219652504e-08, + "logits/chosen": -1.509854793548584, + "logits/rejected": -1.4493193626403809, + "logps/chosen": -229.5972900390625, + "logps/rejected": -358.5816345214844, + "loss": 0.4241, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.7475786209106445, + "rewards/margins": 1.2939976453781128, + "rewards/rejected": -3.0415761470794678, + "step": 10040 + }, + { + "epoch": 1.73, + "grad_norm": 36.993355629776445, + "learning_rate": 2.694031150737036e-08, + "logits/chosen": -1.3353779315948486, + "logits/rejected": -1.2893092632293701, + "logps/chosen": -227.55111694335938, + "logps/rejected": -348.4396057128906, + "loss": 0.4256, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.7357978820800781, + "rewards/margins": 1.1867949962615967, + "rewards/rejected": -2.922593355178833, + "step": 10050 + }, + { + "epoch": 1.73, + "grad_norm": 40.38498271374321, + "learning_rate": 2.6601837964153996e-08, + "logits/chosen": -1.2823264598846436, + "logits/rejected": -1.2305335998535156, + "logps/chosen": -225.500732421875, + "logps/rejected": -361.58197021484375, + "loss": 0.4373, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.7677650451660156, + "rewards/margins": 1.3243902921676636, + "rewards/rejected": -3.0921549797058105, + "step": 10060 + }, + { + "epoch": 1.74, + "grad_norm": 28.019376221143933, + "learning_rate": 2.6265384627718046e-08, + "logits/chosen": -1.2700541019439697, + "logits/rejected": -1.215319037437439, + "logps/chosen": -226.7916717529297, + "logps/rejected": -368.22149658203125, + "loss": 0.3848, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.7437423467636108, + "rewards/margins": 1.4138736724853516, + "rewards/rejected": -3.157615900039673, + "step": 10070 + }, + { + "epoch": 1.74, + "grad_norm": 37.67411721104956, + "learning_rate": 2.593095454063615e-08, + "logits/chosen": -1.3826894760131836, + "logits/rejected": -1.3214690685272217, + "logps/chosen": -222.34335327148438, + "logps/rejected": -359.19488525390625, + "loss": 0.4212, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.6584384441375732, + "rewards/margins": 1.3830474615097046, + "rewards/rejected": -3.0414860248565674, + "step": 10080 + }, + { + "epoch": 1.74, + "grad_norm": 24.463559826579306, + "learning_rate": 2.5598550727185142e-08, + "logits/chosen": -1.3830634355545044, + "logits/rejected": -1.3127849102020264, + "logps/chosen": -223.80313110351562, + "logps/rejected": -375.8487548828125, + "loss": 0.39, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -1.6944698095321655, + "rewards/margins": 1.4869707822799683, + "rewards/rejected": -3.181440591812134, + "step": 10090 + }, + { + "epoch": 1.74, + "grad_norm": 35.57829724815369, + "learning_rate": 2.5268176193318473e-08, + "logits/chosen": -1.349346399307251, + "logits/rejected": -1.2962137460708618, + "logps/chosen": -229.2544403076172, + "logps/rejected": -371.0001525878906, + "loss": 0.3867, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.742475152015686, + "rewards/margins": 1.4224704504013062, + "rewards/rejected": -3.164945602416992, + "step": 10100 + }, + { + "epoch": 1.74, + "eval_logits/chosen": -1.4281973838806152, + "eval_logits/rejected": -1.401401162147522, + "eval_logps/chosen": -242.6608123779297, + "eval_logps/rejected": -292.8902587890625, + "eval_loss": 0.6348100900650024, + "eval_rewards/accuracies": 0.6610130071640015, + "eval_rewards/chosen": -1.8395695686340332, + "eval_rewards/margins": 0.4577590227127075, + "eval_rewards/rejected": -2.297328472137451, + "eval_runtime": 357.2066, + "eval_samples_per_second": 12.049, + "eval_steps_per_second": 1.506, + "step": 10100 + }, + { + "epoch": 1.74, + "grad_norm": 41.259295901796634, + "learning_rate": 2.4939833926638397e-08, + "logits/chosen": -1.379417896270752, + "logits/rejected": -1.333963394165039, + "logps/chosen": -243.1998748779297, + "logps/rejected": -391.07293701171875, + "loss": 0.4012, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.9264335632324219, + "rewards/margins": 1.4382061958312988, + "rewards/rejected": -3.3646399974823, + "step": 10110 + }, + { + "epoch": 1.74, + "grad_norm": 26.28585974209364, + "learning_rate": 2.4613526896369307e-08, + "logits/chosen": -1.3697757720947266, + "logits/rejected": -1.3050405979156494, + "logps/chosen": -231.44277954101562, + "logps/rejected": -369.87445068359375, + "loss": 0.3817, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.777197241783142, + "rewards/margins": 1.4158128499984741, + "rewards/rejected": -3.193009853363037, + "step": 10120 + }, + { + "epoch": 1.75, + "grad_norm": 30.6614965261688, + "learning_rate": 2.428925805333082e-08, + "logits/chosen": -1.3869436979293823, + "logits/rejected": -1.3147681951522827, + "logps/chosen": -221.4283905029297, + "logps/rejected": -376.77764892578125, + "loss": 0.3521, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.6809848546981812, + "rewards/margins": 1.5506279468536377, + "rewards/rejected": -3.2316126823425293, + "step": 10130 + }, + { + "epoch": 1.75, + "grad_norm": 40.486241150607576, + "learning_rate": 2.396703032991107e-08, + "logits/chosen": -1.3461048603057861, + "logits/rejected": -1.2657787799835205, + "logps/chosen": -243.0239715576172, + "logps/rejected": -370.98052978515625, + "loss": 0.4374, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.8258941173553467, + "rewards/margins": 1.3444820642471313, + "rewards/rejected": -3.1703763008117676, + "step": 10140 + }, + { + "epoch": 1.75, + "grad_norm": 31.540425471498317, + "learning_rate": 2.3646846640040158e-08, + "logits/chosen": -1.2644100189208984, + "logits/rejected": -1.1996811628341675, + "logps/chosen": -238.3133087158203, + "logps/rejected": -371.70947265625, + "loss": 0.4298, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.8294227123260498, + "rewards/margins": 1.333860158920288, + "rewards/rejected": -3.163282871246338, + "step": 10150 + }, + { + "epoch": 1.75, + "grad_norm": 29.9318411012178, + "learning_rate": 2.332870987916383e-08, + "logits/chosen": -1.3238608837127686, + "logits/rejected": -1.2526966333389282, + "logps/chosen": -222.04052734375, + "logps/rejected": -380.37548828125, + "loss": 0.3546, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -1.6999727487564087, + "rewards/margins": 1.5818251371383667, + "rewards/rejected": -3.2817981243133545, + "step": 10160 + }, + { + "epoch": 1.75, + "grad_norm": 34.151684842456746, + "learning_rate": 2.3012622924217323e-08, + "logits/chosen": -1.3320282697677612, + "logits/rejected": -1.272220253944397, + "logps/chosen": -232.2387237548828, + "logps/rejected": -378.3623046875, + "loss": 0.4196, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.7736480236053467, + "rewards/margins": 1.466052770614624, + "rewards/rejected": -3.2397007942199707, + "step": 10170 + }, + { + "epoch": 1.75, + "grad_norm": 21.211664057036547, + "learning_rate": 2.2698588633599357e-08, + "logits/chosen": -1.2500728368759155, + "logits/rejected": -1.1695213317871094, + "logps/chosen": -226.1921844482422, + "logps/rejected": -385.45855712890625, + "loss": 0.3777, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.737788438796997, + "rewards/margins": 1.583823800086975, + "rewards/rejected": -3.321612596511841, + "step": 10180 + }, + { + "epoch": 1.76, + "grad_norm": 56.5968336109649, + "learning_rate": 2.2386609847146077e-08, + "logits/chosen": -1.2704510688781738, + "logits/rejected": -1.2027629613876343, + "logps/chosen": -225.5328826904297, + "logps/rejected": -359.3175354003906, + "loss": 0.4261, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -1.7456719875335693, + "rewards/margins": 1.333605408668518, + "rewards/rejected": -3.079277515411377, + "step": 10190 + }, + { + "epoch": 1.76, + "grad_norm": 24.63754465967521, + "learning_rate": 2.2076689386105824e-08, + "logits/chosen": -1.3379052877426147, + "logits/rejected": -1.2737504243850708, + "logps/chosen": -230.8819580078125, + "logps/rejected": -371.2557067871094, + "loss": 0.3851, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -1.7848135232925415, + "rewards/margins": 1.4187109470367432, + "rewards/rejected": -3.203524351119995, + "step": 10200 + }, + { + "epoch": 1.76, + "eval_logits/chosen": -1.3974082469940186, + "eval_logits/rejected": -1.369729995727539, + "eval_logps/chosen": -254.59268188476562, + "eval_logps/rejected": -307.6221618652344, + "eval_loss": 0.6357866525650024, + "eval_rewards/accuracies": 0.660780668258667, + "eval_rewards/chosen": -1.9588884115219116, + "eval_rewards/margins": 0.4857591688632965, + "eval_rewards/rejected": -2.444647789001465, + "eval_runtime": 356.793, + "eval_samples_per_second": 12.063, + "eval_steps_per_second": 1.508, + "step": 10200 + }, + { + "epoch": 1.76, + "grad_norm": 54.095086047288085, + "learning_rate": 2.176883005311303e-08, + "logits/chosen": -1.361449122428894, + "logits/rejected": -1.3111120462417603, + "logps/chosen": -245.9110565185547, + "logps/rejected": -408.2243957519531, + "loss": 0.3661, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.9192225933074951, + "rewards/margins": 1.5922331809997559, + "rewards/rejected": -3.511455535888672, + "step": 10210 + }, + { + "epoch": 1.76, + "grad_norm": 47.443592896329925, + "learning_rate": 2.1463034632163535e-08, + "logits/chosen": -1.2670648097991943, + "logits/rejected": -1.2201905250549316, + "logps/chosen": -241.36117553710938, + "logps/rejected": -377.2726135253906, + "loss": 0.4243, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.9298385381698608, + "rewards/margins": 1.3317430019378662, + "rewards/rejected": -3.2615818977355957, + "step": 10220 + }, + { + "epoch": 1.76, + "grad_norm": 27.354616295829352, + "learning_rate": 2.1159305888588664e-08, + "logits/chosen": -1.2255061864852905, + "logits/rejected": -1.15509033203125, + "logps/chosen": -232.21240234375, + "logps/rejected": -373.4509582519531, + "loss": 0.446, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.7877719402313232, + "rewards/margins": 1.4283082485198975, + "rewards/rejected": -3.2160804271698, + "step": 10230 + }, + { + "epoch": 1.76, + "grad_norm": 18.82682331923304, + "learning_rate": 2.085764656903105e-08, + "logits/chosen": -1.308882236480713, + "logits/rejected": -1.2245006561279297, + "logps/chosen": -224.0420684814453, + "logps/rejected": -410.17626953125, + "loss": 0.3153, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -1.7259820699691772, + "rewards/margins": 1.8521201610565186, + "rewards/rejected": -3.5781021118164062, + "step": 10240 + }, + { + "epoch": 1.77, + "grad_norm": 47.39267774633412, + "learning_rate": 2.055805940141897e-08, + "logits/chosen": -1.2967278957366943, + "logits/rejected": -1.2207520008087158, + "logps/chosen": -248.3931427001953, + "logps/rejected": -379.16094970703125, + "loss": 0.3673, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -1.9161218404769897, + "rewards/margins": 1.378807783126831, + "rewards/rejected": -3.2949295043945312, + "step": 10250 + }, + { + "epoch": 1.77, + "grad_norm": 29.996102972446575, + "learning_rate": 2.0260547094942348e-08, + "logits/chosen": -1.2790513038635254, + "logits/rejected": -1.2392971515655518, + "logps/chosen": -238.34115600585938, + "logps/rejected": -378.8802185058594, + "loss": 0.4199, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.8511323928833008, + "rewards/margins": 1.3689521551132202, + "rewards/rejected": -3.2200846672058105, + "step": 10260 + }, + { + "epoch": 1.77, + "grad_norm": 40.28205321720298, + "learning_rate": 1.9965112340027874e-08, + "logits/chosen": -1.300189733505249, + "logits/rejected": -1.2424399852752686, + "logps/chosen": -245.79641723632812, + "logps/rejected": -382.1834411621094, + "loss": 0.409, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -1.9040024280548096, + "rewards/margins": 1.361567497253418, + "rewards/rejected": -3.2655701637268066, + "step": 10270 + }, + { + "epoch": 1.77, + "grad_norm": 30.493851403701377, + "learning_rate": 1.9671757808314675e-08, + "logits/chosen": -1.2651712894439697, + "logits/rejected": -1.2117892503738403, + "logps/chosen": -252.64151000976562, + "logps/rejected": -374.02337646484375, + "loss": 0.4544, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.9832038879394531, + "rewards/margins": 1.2268593311309814, + "rewards/rejected": -3.2100627422332764, + "step": 10280 + }, + { + "epoch": 1.77, + "grad_norm": 32.40691710505685, + "learning_rate": 1.9380486152630548e-08, + "logits/chosen": -1.2505433559417725, + "logits/rejected": -1.1942849159240723, + "logps/chosen": -229.8437042236328, + "logps/rejected": -385.4642639160156, + "loss": 0.4206, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.7842499017715454, + "rewards/margins": 1.5160019397735596, + "rewards/rejected": -3.3002521991729736, + "step": 10290 + }, + { + "epoch": 1.77, + "grad_norm": 39.84324911925736, + "learning_rate": 1.909130000696732e-08, + "logits/chosen": -1.325438380241394, + "logits/rejected": -1.2682257890701294, + "logps/chosen": -235.05142211914062, + "logps/rejected": -361.3345031738281, + "loss": 0.4322, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.8042240142822266, + "rewards/margins": 1.2643927335739136, + "rewards/rejected": -3.068617105484009, + "step": 10300 + }, + { + "epoch": 1.77, + "eval_logits/chosen": -1.4002227783203125, + "eval_logits/rejected": -1.3728564977645874, + "eval_logps/chosen": -252.0375518798828, + "eval_logps/rejected": -304.3727722167969, + "eval_loss": 0.6351932287216187, + "eval_rewards/accuracies": 0.6584572196006775, + "eval_rewards/chosen": -1.9333373308181763, + "eval_rewards/margins": 0.4788166582584381, + "eval_rewards/rejected": -2.412153959274292, + "eval_runtime": 356.5282, + "eval_samples_per_second": 12.072, + "eval_steps_per_second": 1.509, + "step": 10300 + }, + { + "epoch": 1.78, + "grad_norm": 22.524340645441857, + "learning_rate": 1.8804201986457742e-08, + "logits/chosen": -1.261060357093811, + "logits/rejected": -1.1999661922454834, + "logps/chosen": -247.80233764648438, + "logps/rejected": -386.2306213378906, + "loss": 0.4087, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.9156711101531982, + "rewards/margins": 1.4324285984039307, + "rewards/rejected": -3.34809947013855, + "step": 10310 + }, + { + "epoch": 1.78, + "grad_norm": 46.482526842999754, + "learning_rate": 1.851919468735119e-08, + "logits/chosen": -1.315914511680603, + "logits/rejected": -1.2488597631454468, + "logps/chosen": -235.482666015625, + "logps/rejected": -366.1687316894531, + "loss": 0.4226, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.7874408960342407, + "rewards/margins": 1.3392503261566162, + "rewards/rejected": -3.1266913414001465, + "step": 10320 + }, + { + "epoch": 1.78, + "grad_norm": 55.06877120295062, + "learning_rate": 1.8236280686990653e-08, + "logits/chosen": -1.3398211002349854, + "logits/rejected": -1.2758935689926147, + "logps/chosen": -230.93673706054688, + "logps/rejected": -373.24432373046875, + "loss": 0.3857, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.7879480123519897, + "rewards/margins": 1.3995113372802734, + "rewards/rejected": -3.1874594688415527, + "step": 10330 + }, + { + "epoch": 1.78, + "grad_norm": 31.59403091184685, + "learning_rate": 1.795546254378927e-08, + "logits/chosen": -1.3418684005737305, + "logits/rejected": -1.2638609409332275, + "logps/chosen": -231.3434295654297, + "logps/rejected": -384.9586181640625, + "loss": 0.3837, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.7527868747711182, + "rewards/margins": 1.5512384176254272, + "rewards/rejected": -3.304025173187256, + "step": 10340 + }, + { + "epoch": 1.78, + "grad_norm": 44.48450974185094, + "learning_rate": 1.7676742797207045e-08, + "logits/chosen": -1.4229285717010498, + "logits/rejected": -1.3561543226242065, + "logps/chosen": -241.94143676757812, + "logps/rejected": -376.6054992675781, + "loss": 0.4181, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.878838300704956, + "rewards/margins": 1.376997470855713, + "rewards/rejected": -3.255835771560669, + "step": 10350 + }, + { + "epoch": 1.78, + "grad_norm": 40.416266152745024, + "learning_rate": 1.740012396772819e-08, + "logits/chosen": -1.2647850513458252, + "logits/rejected": -1.1986753940582275, + "logps/chosen": -245.0532684326172, + "logps/rejected": -359.286865234375, + "loss": 0.4955, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.891425371170044, + "rewards/margins": 1.173485279083252, + "rewards/rejected": -3.064910411834717, + "step": 10360 + }, + { + "epoch": 1.79, + "grad_norm": 29.777426788141348, + "learning_rate": 1.7125608556838035e-08, + "logits/chosen": -1.1616214513778687, + "logits/rejected": -1.0960752964019775, + "logps/chosen": -223.65115356445312, + "logps/rejected": -352.9761047363281, + "loss": 0.4088, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.7168071269989014, + "rewards/margins": 1.2844127416610718, + "rewards/rejected": -3.001220226287842, + "step": 10370 + }, + { + "epoch": 1.79, + "grad_norm": 51.70883721503306, + "learning_rate": 1.6853199047000584e-08, + "logits/chosen": -1.3133689165115356, + "logits/rejected": -1.2742842435836792, + "logps/chosen": -249.2805633544922, + "logps/rejected": -336.363037109375, + "loss": 0.5641, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.974237084388733, + "rewards/margins": 0.8665148019790649, + "rewards/rejected": -2.840752124786377, + "step": 10380 + }, + { + "epoch": 1.79, + "grad_norm": 21.280760614032122, + "learning_rate": 1.6582897901636027e-08, + "logits/chosen": -1.3474326133728027, + "logits/rejected": -1.2722728252410889, + "logps/chosen": -228.17819213867188, + "logps/rejected": -372.76544189453125, + "loss": 0.391, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.7238601446151733, + "rewards/margins": 1.463079571723938, + "rewards/rejected": -3.1869397163391113, + "step": 10390 + }, + { + "epoch": 1.79, + "grad_norm": 23.04577762515264, + "learning_rate": 1.6314707565098395e-08, + "logits/chosen": -1.2872307300567627, + "logits/rejected": -1.2240254878997803, + "logps/chosen": -246.9097900390625, + "logps/rejected": -409.812255859375, + "loss": 0.3405, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.9479862451553345, + "rewards/margins": 1.5971990823745728, + "rewards/rejected": -3.5451855659484863, + "step": 10400 + }, + { + "epoch": 1.79, + "eval_logits/chosen": -1.411521315574646, + "eval_logits/rejected": -1.384405493736267, + "eval_logps/chosen": -247.26951599121094, + "eval_logps/rejected": -298.5337219238281, + "eval_loss": 0.6351563334465027, + "eval_rewards/accuracies": 0.660780668258667, + "eval_rewards/chosen": -1.885656714439392, + "eval_rewards/margins": 0.46810635924339294, + "eval_rewards/rejected": -2.3537631034851074, + "eval_runtime": 356.5783, + "eval_samples_per_second": 12.07, + "eval_steps_per_second": 1.509, + "step": 10400 + }, + { + "epoch": 1.79, + "grad_norm": 29.30147034557462, + "learning_rate": 1.6048630462653616e-08, + "logits/chosen": -1.2767484188079834, + "logits/rejected": -1.2087651491165161, + "logps/chosen": -246.88467407226562, + "logps/rejected": -367.8938293457031, + "loss": 0.4398, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.9034740924835205, + "rewards/margins": 1.2447469234466553, + "rewards/rejected": -3.1482207775115967, + "step": 10410 + }, + { + "epoch": 1.8, + "grad_norm": 50.80138734136231, + "learning_rate": 1.578466900045733e-08, + "logits/chosen": -1.3154891729354858, + "logits/rejected": -1.2465871572494507, + "logps/chosen": -237.7150421142578, + "logps/rejected": -368.3978576660156, + "loss": 0.4055, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.8362977504730225, + "rewards/margins": 1.3066072463989258, + "rewards/rejected": -3.142904758453369, + "step": 10420 + }, + { + "epoch": 1.8, + "grad_norm": 28.18485959040931, + "learning_rate": 1.5522825565533442e-08, + "logits/chosen": -1.3946081399917603, + "logits/rejected": -1.3362939357757568, + "logps/chosen": -232.037109375, + "logps/rejected": -357.0254211425781, + "loss": 0.4289, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.7716169357299805, + "rewards/margins": 1.2683614492416382, + "rewards/rejected": -3.039978504180908, + "step": 10430 + }, + { + "epoch": 1.8, + "grad_norm": 27.574863749179777, + "learning_rate": 1.526310252575222e-08, + "logits/chosen": -1.4324071407318115, + "logits/rejected": -1.380183219909668, + "logps/chosen": -236.1354217529297, + "logps/rejected": -360.7831115722656, + "loss": 0.4444, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.7827268838882446, + "rewards/margins": 1.2522282600402832, + "rewards/rejected": -3.0349552631378174, + "step": 10440 + }, + { + "epoch": 1.8, + "grad_norm": 34.86665149520393, + "learning_rate": 1.500550222980923e-08, + "logits/chosen": -1.3539096117019653, + "logits/rejected": -1.3092883825302124, + "logps/chosen": -238.3639678955078, + "logps/rejected": -363.4995422363281, + "loss": 0.4144, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.8473567962646484, + "rewards/margins": 1.211411476135254, + "rewards/rejected": -3.0587682723999023, + "step": 10450 + }, + { + "epoch": 1.8, + "grad_norm": 32.44019457079582, + "learning_rate": 1.4750027007203653e-08, + "logits/chosen": -1.3425335884094238, + "logits/rejected": -1.2749333381652832, + "logps/chosen": -229.01583862304688, + "logps/rejected": -354.47314453125, + "loss": 0.423, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.7443774938583374, + "rewards/margins": 1.2702993154525757, + "rewards/rejected": -3.014676809310913, + "step": 10460 + }, + { + "epoch": 1.8, + "grad_norm": 57.913238056684506, + "learning_rate": 1.4496679168217646e-08, + "logits/chosen": -1.1968591213226318, + "logits/rejected": -1.1376616954803467, + "logps/chosen": -241.31689453125, + "logps/rejected": -365.15399169921875, + "loss": 0.4734, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8941717147827148, + "rewards/margins": 1.2496927976608276, + "rewards/rejected": -3.143864393234253, + "step": 10470 + }, + { + "epoch": 1.81, + "grad_norm": 26.776034814613833, + "learning_rate": 1.4245461003895232e-08, + "logits/chosen": -1.3491319417953491, + "logits/rejected": -1.2737585306167603, + "logps/chosen": -229.96826171875, + "logps/rejected": -374.3705749511719, + "loss": 0.4425, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.740369439125061, + "rewards/margins": 1.4796626567840576, + "rewards/rejected": -3.220032215118408, + "step": 10480 + }, + { + "epoch": 1.81, + "grad_norm": 27.83311511088631, + "learning_rate": 1.3996374786021642e-08, + "logits/chosen": -1.3122376203536987, + "logits/rejected": -1.241257905960083, + "logps/chosen": -222.9138641357422, + "logps/rejected": -362.92169189453125, + "loss": 0.4151, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.691291093826294, + "rewards/margins": 1.4125699996948242, + "rewards/rejected": -3.1038613319396973, + "step": 10490 + }, + { + "epoch": 1.81, + "grad_norm": 34.16101354006198, + "learning_rate": 1.3749422767102698e-08, + "logits/chosen": -1.3112982511520386, + "logits/rejected": -1.2444483041763306, + "logps/chosen": -234.7496337890625, + "logps/rejected": -384.12420654296875, + "loss": 0.424, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.820643424987793, + "rewards/margins": 1.4697020053863525, + "rewards/rejected": -3.2903454303741455, + "step": 10500 + }, + { + "epoch": 1.81, + "eval_logits/chosen": -1.411332368850708, + "eval_logits/rejected": -1.3842883110046387, + "eval_logps/chosen": -246.45022583007812, + "eval_logps/rejected": -297.5495300292969, + "eval_loss": 0.6351029276847839, + "eval_rewards/accuracies": 0.6598513126373291, + "eval_rewards/chosen": -1.8774638175964355, + "eval_rewards/margins": 0.46645745635032654, + "eval_rewards/rejected": -2.343921184539795, + "eval_runtime": 356.9292, + "eval_samples_per_second": 12.058, + "eval_steps_per_second": 1.507, + "step": 10500 + }, + { + "epoch": 1.81, + "grad_norm": 36.69075640202483, + "learning_rate": 1.3504607180344463e-08, + "logits/chosen": -1.3228697776794434, + "logits/rejected": -1.2543773651123047, + "logps/chosen": -228.5086669921875, + "logps/rejected": -364.3096923828125, + "loss": 0.415, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.7290817499160767, + "rewards/margins": 1.3821680545806885, + "rewards/rejected": -3.1112494468688965, + "step": 10510 + }, + { + "epoch": 1.81, + "grad_norm": 46.69963744425329, + "learning_rate": 1.3261930239633261e-08, + "logits/chosen": -1.4053773880004883, + "logits/rejected": -1.3577836751937866, + "logps/chosen": -221.27139282226562, + "logps/rejected": -369.21697998046875, + "loss": 0.4035, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.7077957391738892, + "rewards/margins": 1.4389338493347168, + "rewards/rejected": -3.1467297077178955, + "step": 10520 + }, + { + "epoch": 1.81, + "grad_norm": 35.672432590977046, + "learning_rate": 1.3021394139515197e-08, + "logits/chosen": -1.2971795797348022, + "logits/rejected": -1.2416961193084717, + "logps/chosen": -233.1261444091797, + "logps/rejected": -350.49969482421875, + "loss": 0.4311, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.7978441715240479, + "rewards/margins": 1.185850977897644, + "rewards/rejected": -2.9836955070495605, + "step": 10530 + }, + { + "epoch": 1.82, + "grad_norm": 25.051590405247744, + "learning_rate": 1.2783001055176907e-08, + "logits/chosen": -1.2430702447891235, + "logits/rejected": -1.1768423318862915, + "logps/chosen": -227.86788940429688, + "logps/rejected": -368.8069152832031, + "loss": 0.3771, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.7555568218231201, + "rewards/margins": 1.4385837316513062, + "rewards/rejected": -3.194140672683716, + "step": 10540 + }, + { + "epoch": 1.82, + "grad_norm": 48.523638164139314, + "learning_rate": 1.2546753142425315e-08, + "logits/chosen": -1.4035842418670654, + "logits/rejected": -1.3484973907470703, + "logps/chosen": -241.98580932617188, + "logps/rejected": -391.2903137207031, + "loss": 0.3919, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8824710845947266, + "rewards/margins": 1.472904920578003, + "rewards/rejected": -3.3553764820098877, + "step": 10550 + }, + { + "epoch": 1.82, + "grad_norm": 31.996639068258908, + "learning_rate": 1.2312652537668499e-08, + "logits/chosen": -1.2917983531951904, + "logits/rejected": -1.2220714092254639, + "logps/chosen": -226.38491821289062, + "logps/rejected": -382.4876403808594, + "loss": 0.4064, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.7078090906143188, + "rewards/margins": 1.5710179805755615, + "rewards/rejected": -3.278826951980591, + "step": 10560 + }, + { + "epoch": 1.82, + "grad_norm": 39.94758110559314, + "learning_rate": 1.2080701357896267e-08, + "logits/chosen": -1.361748456954956, + "logits/rejected": -1.3035575151443481, + "logps/chosen": -242.5690460205078, + "logps/rejected": -395.84844970703125, + "loss": 0.3614, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.866334319114685, + "rewards/margins": 1.5249965190887451, + "rewards/rejected": -3.3913307189941406, + "step": 10570 + }, + { + "epoch": 1.82, + "grad_norm": 34.466730835739696, + "learning_rate": 1.185090170066097e-08, + "logits/chosen": -1.3756376504898071, + "logits/rejected": -1.310418963432312, + "logps/chosen": -224.53564453125, + "logps/rejected": -363.5881652832031, + "loss": 0.4041, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.711679220199585, + "rewards/margins": 1.3960368633270264, + "rewards/rejected": -3.1077163219451904, + "step": 10580 + }, + { + "epoch": 1.82, + "grad_norm": 25.78656181126498, + "learning_rate": 1.1623255644058638e-08, + "logits/chosen": -1.3188053369522095, + "logits/rejected": -1.2425676584243774, + "logps/chosen": -226.51016235351562, + "logps/rejected": -357.7701721191406, + "loss": 0.4113, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.7062441110610962, + "rewards/margins": 1.3722339868545532, + "rewards/rejected": -3.0784783363342285, + "step": 10590 + }, + { + "epoch": 1.83, + "grad_norm": 34.38272700142523, + "learning_rate": 1.1397765246710072e-08, + "logits/chosen": -1.3734514713287354, + "logits/rejected": -1.3211066722869873, + "logps/chosen": -227.74270629882812, + "logps/rejected": -366.80364990234375, + "loss": 0.4396, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.7487207651138306, + "rewards/margins": 1.372809648513794, + "rewards/rejected": -3.121530294418335, + "step": 10600 + }, + { + "epoch": 1.83, + "eval_logits/chosen": -1.4146060943603516, + "eval_logits/rejected": -1.3876330852508545, + "eval_logps/chosen": -246.19650268554688, + "eval_logps/rejected": -297.2034912109375, + "eval_loss": 0.6350103616714478, + "eval_rewards/accuracies": 0.6568308472633362, + "eval_rewards/chosen": -1.8749263286590576, + "eval_rewards/margins": 0.4655349552631378, + "eval_rewards/rejected": -2.340461254119873, + "eval_runtime": 357.4305, + "eval_samples_per_second": 12.042, + "eval_steps_per_second": 1.505, + "step": 10600 + }, + { + "epoch": 1.83, + "grad_norm": 36.19363313018661, + "learning_rate": 1.1174432547742308e-08, + "logits/chosen": -1.3016915321350098, + "logits/rejected": -1.2503132820129395, + "logps/chosen": -239.76791381835938, + "logps/rejected": -367.1284484863281, + "loss": 0.4453, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.8869917392730713, + "rewards/margins": 1.2658240795135498, + "rewards/rejected": -3.152815580368042, + "step": 10610 + }, + { + "epoch": 1.83, + "grad_norm": 41.29882501819702, + "learning_rate": 1.095325956677015e-08, + "logits/chosen": -1.2536303997039795, + "logits/rejected": -1.1866223812103271, + "logps/chosen": -238.30874633789062, + "logps/rejected": -374.2566833496094, + "loss": 0.4032, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.8451499938964844, + "rewards/margins": 1.374023675918579, + "rewards/rejected": -3.2191734313964844, + "step": 10620 + }, + { + "epoch": 1.83, + "grad_norm": 22.72297730303355, + "learning_rate": 1.0734248303877813e-08, + "logits/chosen": -1.3555238246917725, + "logits/rejected": -1.2904984951019287, + "logps/chosen": -231.20010375976562, + "logps/rejected": -360.0879211425781, + "loss": 0.4694, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.7696985006332397, + "rewards/margins": 1.2896592617034912, + "rewards/rejected": -3.0593578815460205, + "step": 10630 + }, + { + "epoch": 1.83, + "grad_norm": 34.92246125378767, + "learning_rate": 1.051740073960114e-08, + "logits/chosen": -1.340003252029419, + "logits/rejected": -1.2763203382492065, + "logps/chosen": -236.17941284179688, + "logps/rejected": -368.7391052246094, + "loss": 0.4742, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.8113651275634766, + "rewards/margins": 1.350304365158081, + "rewards/rejected": -3.1616694927215576, + "step": 10640 + }, + { + "epoch": 1.83, + "grad_norm": 26.61094450928108, + "learning_rate": 1.0302718834909213e-08, + "logits/chosen": -1.3977024555206299, + "logits/rejected": -1.3331856727600098, + "logps/chosen": -238.84042358398438, + "logps/rejected": -391.08477783203125, + "loss": 0.4176, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -1.8521630764007568, + "rewards/margins": 1.520591139793396, + "rewards/rejected": -3.3727545738220215, + "step": 10650 + }, + { + "epoch": 1.84, + "grad_norm": 33.27724860143072, + "learning_rate": 1.0090204531187168e-08, + "logits/chosen": -1.2856873273849487, + "logits/rejected": -1.2244932651519775, + "logps/chosen": -238.58139038085938, + "logps/rejected": -374.8933410644531, + "loss": 0.4089, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.839249849319458, + "rewards/margins": 1.358032464981079, + "rewards/rejected": -3.197282314300537, + "step": 10660 + }, + { + "epoch": 1.84, + "grad_norm": 44.68715039350637, + "learning_rate": 9.8798597502181e-09, + "logits/chosen": -1.3030592203140259, + "logits/rejected": -1.2441256046295166, + "logps/chosen": -246.3955841064453, + "logps/rejected": -373.30462646484375, + "loss": 0.4701, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.9426281452178955, + "rewards/margins": 1.2749965190887451, + "rewards/rejected": -3.2176246643066406, + "step": 10670 + }, + { + "epoch": 1.84, + "grad_norm": 26.493605739622634, + "learning_rate": 9.671686394166156e-09, + "logits/chosen": -1.3562123775482178, + "logits/rejected": -1.2756215333938599, + "logps/chosen": -221.86074829101562, + "logps/rejected": -360.75762939453125, + "loss": 0.3826, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -1.6560554504394531, + "rewards/margins": 1.4439318180084229, + "rewards/rejected": -3.099987506866455, + "step": 10680 + }, + { + "epoch": 1.84, + "grad_norm": 30.331480121505034, + "learning_rate": 9.465686345558944e-09, + "logits/chosen": -1.3282508850097656, + "logits/rejected": -1.2702124118804932, + "logps/chosen": -226.7456817626953, + "logps/rejected": -377.0152282714844, + "loss": 0.4397, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.7300522327423096, + "rewards/margins": 1.4764258861541748, + "rewards/rejected": -3.2064781188964844, + "step": 10690 + }, + { + "epoch": 1.84, + "grad_norm": 37.77942293381782, + "learning_rate": 9.261861467270787e-09, + "logits/chosen": -1.3761959075927734, + "logits/rejected": -1.3014271259307861, + "logps/chosen": -222.33877563476562, + "logps/rejected": -349.0968322753906, + "loss": 0.3908, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.6618179082870483, + "rewards/margins": 1.3258897066116333, + "rewards/rejected": -2.9877076148986816, + "step": 10700 + }, + { + "epoch": 1.84, + "eval_logits/chosen": -1.4212182760238647, + "eval_logits/rejected": -1.3943599462509155, + "eval_logps/chosen": -243.04237365722656, + "eval_logps/rejected": -293.6067810058594, + "eval_loss": 0.6334287524223328, + "eval_rewards/accuracies": 0.6563661694526672, + "eval_rewards/chosen": -1.8433852195739746, + "eval_rewards/margins": 0.46110865473747253, + "eval_rewards/rejected": -2.3044939041137695, + "eval_runtime": 356.8228, + "eval_samples_per_second": 12.062, + "eval_steps_per_second": 1.508, + "step": 10700 + }, + { + "epoch": 1.85, + "grad_norm": 54.759408136897285, + "learning_rate": 9.060213602505778e-09, + "logits/chosen": -1.304149866104126, + "logits/rejected": -1.239341139793396, + "logps/chosen": -225.63449096679688, + "logps/rejected": -350.32049560546875, + "loss": 0.4472, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.7219674587249756, + "rewards/margins": 1.2871848344802856, + "rewards/rejected": -3.0091521739959717, + "step": 10710 + }, + { + "epoch": 1.85, + "grad_norm": 57.3062622613245, + "learning_rate": 8.860744574781032e-09, + "logits/chosen": -1.338438630104065, + "logits/rejected": -1.269768476486206, + "logps/chosen": -236.2624053955078, + "logps/rejected": -356.3697509765625, + "loss": 0.4911, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.8369505405426025, + "rewards/margins": 1.204301118850708, + "rewards/rejected": -3.0412516593933105, + "step": 10720 + }, + { + "epoch": 1.85, + "grad_norm": 28.412674272208022, + "learning_rate": 8.663456187910422e-09, + "logits/chosen": -1.4263569116592407, + "logits/rejected": -1.3510792255401611, + "logps/chosen": -229.66720581054688, + "logps/rejected": -364.2427673339844, + "loss": 0.3543, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -1.7573401927947998, + "rewards/margins": 1.3864922523498535, + "rewards/rejected": -3.1438326835632324, + "step": 10730 + }, + { + "epoch": 1.85, + "grad_norm": 41.706917137582344, + "learning_rate": 8.468350225987908e-09, + "logits/chosen": -1.2837555408477783, + "logits/rejected": -1.222891926765442, + "logps/chosen": -246.54638671875, + "logps/rejected": -371.4312744140625, + "loss": 0.4706, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9186769723892212, + "rewards/margins": 1.2438628673553467, + "rewards/rejected": -3.1625399589538574, + "step": 10740 + }, + { + "epoch": 1.85, + "grad_norm": 31.262809555566417, + "learning_rate": 8.275428453371813e-09, + "logits/chosen": -1.248228907585144, + "logits/rejected": -1.1745529174804688, + "logps/chosen": -237.2993621826172, + "logps/rejected": -385.1238098144531, + "loss": 0.4173, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.8524090051651, + "rewards/margins": 1.4723732471466064, + "rewards/rejected": -3.324782609939575, + "step": 10750 + }, + { + "epoch": 1.85, + "grad_norm": 35.16663063428534, + "learning_rate": 8.084692614668542e-09, + "logits/chosen": -1.3016769886016846, + "logits/rejected": -1.248867392539978, + "logps/chosen": -224.76113891601562, + "logps/rejected": -346.5380859375, + "loss": 0.4184, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -1.7105858325958252, + "rewards/margins": 1.22708261013031, + "rewards/rejected": -2.937668561935425, + "step": 10760 + }, + { + "epoch": 1.86, + "grad_norm": 31.86195155798675, + "learning_rate": 7.89614443471695e-09, + "logits/chosen": -1.3076452016830444, + "logits/rejected": -1.2525701522827148, + "logps/chosen": -217.1156463623047, + "logps/rejected": -352.6279296875, + "loss": 0.3762, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -1.6004784107208252, + "rewards/margins": 1.3801997900009155, + "rewards/rejected": -2.9806783199310303, + "step": 10770 + }, + { + "epoch": 1.86, + "grad_norm": 25.654760643763883, + "learning_rate": 7.7097856185728e-09, + "logits/chosen": -1.418269395828247, + "logits/rejected": -1.3548392057418823, + "logps/chosen": -222.4542999267578, + "logps/rejected": -363.5215148925781, + "loss": 0.4058, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.6930938959121704, + "rewards/margins": 1.3915579319000244, + "rewards/rejected": -3.084651470184326, + "step": 10780 + }, + { + "epoch": 1.86, + "grad_norm": 40.803942439334094, + "learning_rate": 7.525617851493166e-09, + "logits/chosen": -1.4488575458526611, + "logits/rejected": -1.3801032304763794, + "logps/chosen": -207.01974487304688, + "logps/rejected": -345.1376647949219, + "loss": 0.3895, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -1.5470707416534424, + "rewards/margins": 1.3702001571655273, + "rewards/rejected": -2.917271137237549, + "step": 10790 + }, + { + "epoch": 1.86, + "grad_norm": 22.89870220316856, + "learning_rate": 7.343642798921384e-09, + "logits/chosen": -1.4306409358978271, + "logits/rejected": -1.3741027116775513, + "logps/chosen": -220.6624755859375, + "logps/rejected": -359.7532043457031, + "loss": 0.4273, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.6872755289077759, + "rewards/margins": 1.371055006980896, + "rewards/rejected": -3.058330774307251, + "step": 10800 + }, + { + "epoch": 1.86, + "eval_logits/chosen": -1.419447422027588, + "eval_logits/rejected": -1.3925857543945312, + "eval_logps/chosen": -244.09783935546875, + "eval_logps/rejected": -294.6657409667969, + "eval_loss": 0.6341846585273743, + "eval_rewards/accuracies": 0.6624070405960083, + "eval_rewards/chosen": -1.8539396524429321, + "eval_rewards/margins": 0.4611437916755676, + "eval_rewards/rejected": -2.3150837421417236, + "eval_runtime": 357.4896, + "eval_samples_per_second": 12.04, + "eval_steps_per_second": 1.505, + "step": 10800 + }, + { + "epoch": 1.86, + "grad_norm": 24.19588838169064, + "learning_rate": 7.1638621064718516e-09, + "logits/chosen": -1.373991847038269, + "logits/rejected": -1.2945966720581055, + "logps/chosen": -227.33596801757812, + "logps/rejected": -364.17059326171875, + "loss": 0.3831, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.699730634689331, + "rewards/margins": 1.4409074783325195, + "rewards/rejected": -3.140638589859009, + "step": 10810 + }, + { + "epoch": 1.86, + "grad_norm": 50.27316401128035, + "learning_rate": 6.986277399915197e-09, + "logits/chosen": -1.2879558801651, + "logits/rejected": -1.2268191576004028, + "logps/chosen": -209.937744140625, + "logps/rejected": -343.2181701660156, + "loss": 0.4086, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.5688936710357666, + "rewards/margins": 1.349342703819275, + "rewards/rejected": -2.918236255645752, + "step": 10820 + }, + { + "epoch": 1.87, + "grad_norm": 50.904093818767734, + "learning_rate": 6.8108902851636285e-09, + "logits/chosen": -1.3199676275253296, + "logits/rejected": -1.2508794069290161, + "logps/chosen": -232.073486328125, + "logps/rejected": -368.0849914550781, + "loss": 0.399, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.8025493621826172, + "rewards/margins": 1.3453489542007446, + "rewards/rejected": -3.147897958755493, + "step": 10830 + }, + { + "epoch": 1.87, + "grad_norm": 27.336450230863502, + "learning_rate": 6.637702348256308e-09, + "logits/chosen": -1.3644187450408936, + "logits/rejected": -1.307117223739624, + "logps/chosen": -227.4879150390625, + "logps/rejected": -349.18182373046875, + "loss": 0.4494, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.74507737159729, + "rewards/margins": 1.237743854522705, + "rewards/rejected": -2.982821226119995, + "step": 10840 + }, + { + "epoch": 1.87, + "grad_norm": 33.93760320273209, + "learning_rate": 6.466715155345109e-09, + "logits/chosen": -1.2493406534194946, + "logits/rejected": -1.199512243270874, + "logps/chosen": -229.41708374023438, + "logps/rejected": -345.4873046875, + "loss": 0.4573, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.7922929525375366, + "rewards/margins": 1.118058443069458, + "rewards/rejected": -2.910351276397705, + "step": 10850 + }, + { + "epoch": 1.87, + "grad_norm": 31.190541875354757, + "learning_rate": 6.2979302526803006e-09, + "logits/chosen": -1.4172183275222778, + "logits/rejected": -1.3366795778274536, + "logps/chosen": -223.34884643554688, + "logps/rejected": -358.8973693847656, + "loss": 0.4234, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.6810195446014404, + "rewards/margins": 1.3840689659118652, + "rewards/rejected": -3.0650882720947266, + "step": 10860 + }, + { + "epoch": 1.87, + "grad_norm": 19.263570637063424, + "learning_rate": 6.131349166596883e-09, + "logits/chosen": -1.2681770324707031, + "logits/rejected": -1.21076500415802, + "logps/chosen": -206.9633026123047, + "logps/rejected": -360.4692077636719, + "loss": 0.4066, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -1.5650874376296997, + "rewards/margins": 1.4790990352630615, + "rewards/rejected": -3.0441863536834717, + "step": 10870 + }, + { + "epoch": 1.87, + "grad_norm": 33.21398609679741, + "learning_rate": 5.966973403500303e-09, + "logits/chosen": -1.3271772861480713, + "logits/rejected": -1.2587413787841797, + "logps/chosen": -232.65988159179688, + "logps/rejected": -375.89111328125, + "loss": 0.3756, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.7844676971435547, + "rewards/margins": 1.4478579759597778, + "rewards/rejected": -3.232325315475464, + "step": 10880 + }, + { + "epoch": 1.88, + "grad_norm": 37.56955046527683, + "learning_rate": 5.804804449853401e-09, + "logits/chosen": -1.3854949474334717, + "logits/rejected": -1.3297450542449951, + "logps/chosen": -223.22830200195312, + "logps/rejected": -355.37030029296875, + "loss": 0.4186, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.7010080814361572, + "rewards/margins": 1.30489182472229, + "rewards/rejected": -3.0058999061584473, + "step": 10890 + }, + { + "epoch": 1.88, + "grad_norm": 37.057241152716664, + "learning_rate": 5.644843772162372e-09, + "logits/chosen": -1.434251308441162, + "logits/rejected": -1.3531643152236938, + "logps/chosen": -212.3564453125, + "logps/rejected": -349.568603515625, + "loss": 0.3762, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -1.5736942291259766, + "rewards/margins": 1.4162404537200928, + "rewards/rejected": -2.9899344444274902, + "step": 10900 + }, + { + "epoch": 1.88, + "eval_logits/chosen": -1.4172533750534058, + "eval_logits/rejected": -1.3904321193695068, + "eval_logps/chosen": -244.67039489746094, + "eval_logps/rejected": -295.28729248046875, + "eval_loss": 0.6345546245574951, + "eval_rewards/accuracies": 0.6565985083580017, + "eval_rewards/chosen": -1.8596652746200562, + "eval_rewards/margins": 0.4616338312625885, + "eval_rewards/rejected": -2.3212990760803223, + "eval_runtime": 357.2508, + "eval_samples_per_second": 12.048, + "eval_steps_per_second": 1.506, + "step": 10900 + }, + { + "epoch": 1.88, + "grad_norm": 37.413849937107756, + "learning_rate": 5.487092816963995e-09, + "logits/chosen": -1.3338580131530762, + "logits/rejected": -1.2663573026657104, + "logps/chosen": -217.49801635742188, + "logps/rejected": -344.0356750488281, + "loss": 0.4173, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.6183340549468994, + "rewards/margins": 1.2802609205245972, + "rewards/rejected": -2.898594856262207, + "step": 10910 + }, + { + "epoch": 1.88, + "grad_norm": 42.748889303543294, + "learning_rate": 5.331553010812312e-09, + "logits/chosen": -1.3081706762313843, + "logits/rejected": -1.2376407384872437, + "logps/chosen": -229.73422241210938, + "logps/rejected": -366.51495361328125, + "loss": 0.3763, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.7389205694198608, + "rewards/margins": 1.3969751596450806, + "rewards/rejected": -3.1358957290649414, + "step": 10920 + }, + { + "epoch": 1.88, + "grad_norm": 29.30790533708006, + "learning_rate": 5.1782257602657756e-09, + "logits/chosen": -1.257922887802124, + "logits/rejected": -1.1985923051834106, + "logps/chosen": -233.80294799804688, + "logps/rejected": -352.77984619140625, + "loss": 0.4355, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.795925498008728, + "rewards/margins": 1.217332363128662, + "rewards/rejected": -3.0132579803466797, + "step": 10930 + }, + { + "epoch": 1.88, + "grad_norm": 54.99655407488525, + "learning_rate": 5.027112451874483e-09, + "logits/chosen": -1.2420815229415894, + "logits/rejected": -1.1881957054138184, + "logps/chosen": -241.045166015625, + "logps/rejected": -369.63568115234375, + "loss": 0.4248, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.8460403680801392, + "rewards/margins": 1.2986847162246704, + "rewards/rejected": -3.1447253227233887, + "step": 10940 + }, + { + "epoch": 1.89, + "grad_norm": 45.54073464601158, + "learning_rate": 4.878214452167739e-09, + "logits/chosen": -1.3072357177734375, + "logits/rejected": -1.2408037185668945, + "logps/chosen": -238.0092315673828, + "logps/rejected": -381.5384826660156, + "loss": 0.3866, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -1.834251046180725, + "rewards/margins": 1.4345420598983765, + "rewards/rejected": -3.2687935829162598, + "step": 10950 + }, + { + "epoch": 1.89, + "grad_norm": 30.095545129155003, + "learning_rate": 4.7315331076416275e-09, + "logits/chosen": -1.3551833629608154, + "logits/rejected": -1.293670654296875, + "logps/chosen": -236.3451385498047, + "logps/rejected": -368.11126708984375, + "loss": 0.4367, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.8255088329315186, + "rewards/margins": 1.3299205303192139, + "rewards/rejected": -3.1554293632507324, + "step": 10960 + }, + { + "epoch": 1.89, + "grad_norm": 45.41850107802761, + "learning_rate": 4.587069744746791e-09, + "logits/chosen": -1.3423527479171753, + "logits/rejected": -1.2766934633255005, + "logps/chosen": -233.58682250976562, + "logps/rejected": -358.17694091796875, + "loss": 0.4907, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.7673285007476807, + "rewards/margins": 1.2734668254852295, + "rewards/rejected": -3.04079532623291, + "step": 10970 + }, + { + "epoch": 1.89, + "grad_norm": 24.319102255891416, + "learning_rate": 4.44482566987664e-09, + "logits/chosen": -1.3505980968475342, + "logits/rejected": -1.2955100536346436, + "logps/chosen": -245.52920532226562, + "logps/rejected": -382.1690368652344, + "loss": 0.4328, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.9095827341079712, + "rewards/margins": 1.3507945537567139, + "rewards/rejected": -3.2603771686553955, + "step": 10980 + }, + { + "epoch": 1.89, + "grad_norm": 28.168547715893087, + "learning_rate": 4.304802169355221e-09, + "logits/chosen": -1.2861645221710205, + "logits/rejected": -1.22446608543396, + "logps/chosen": -218.29025268554688, + "logps/rejected": -352.78509521484375, + "loss": 0.4101, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.6582218408584595, + "rewards/margins": 1.3449079990386963, + "rewards/rejected": -3.0031299591064453, + "step": 10990 + }, + { + "epoch": 1.9, + "grad_norm": 39.270052414600016, + "learning_rate": 4.167000509425811e-09, + "logits/chosen": -1.4578297138214111, + "logits/rejected": -1.4120782613754272, + "logps/chosen": -241.8314208984375, + "logps/rejected": -367.2518005371094, + "loss": 0.4734, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.8894517421722412, + "rewards/margins": 1.2133595943450928, + "rewards/rejected": -3.102811336517334, + "step": 11000 + }, + { + "epoch": 1.9, + "eval_logits/chosen": -1.4190027713775635, + "eval_logits/rejected": -1.392012357711792, + "eval_logps/chosen": -243.87950134277344, + "eval_logps/rejected": -294.5247802734375, + "eval_loss": 0.6338525414466858, + "eval_rewards/accuracies": 0.6628717184066772, + "eval_rewards/chosen": -1.8517564535140991, + "eval_rewards/margins": 0.4619174599647522, + "eval_rewards/rejected": -2.313674211502075, + "eval_runtime": 357.5201, + "eval_samples_per_second": 12.038, + "eval_steps_per_second": 1.505, + "step": 11000 + }, + { + "epoch": 1.9, + "grad_norm": 33.16784261064953, + "learning_rate": 4.03142193623951e-09, + "logits/chosen": -1.3858684301376343, + "logits/rejected": -1.305426001548767, + "logps/chosen": -235.13864135742188, + "logps/rejected": -388.290771484375, + "loss": 0.3688, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.8195550441741943, + "rewards/margins": 1.5488157272338867, + "rewards/rejected": -3.368370771408081, + "step": 11010 + }, + { + "epoch": 1.9, + "grad_norm": 19.279870183301984, + "learning_rate": 3.898067675843747e-09, + "logits/chosen": -1.435046911239624, + "logits/rejected": -1.3683885335922241, + "logps/chosen": -224.517578125, + "logps/rejected": -371.97283935546875, + "loss": 0.3641, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.6953538656234741, + "rewards/margins": 1.4940444231033325, + "rewards/rejected": -3.1893982887268066, + "step": 11020 + }, + { + "epoch": 1.9, + "grad_norm": 26.617574306250333, + "learning_rate": 3.766938934171348e-09, + "logits/chosen": -1.3704140186309814, + "logits/rejected": -1.3155790567398071, + "logps/chosen": -236.8312225341797, + "logps/rejected": -383.66754150390625, + "loss": 0.4191, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.8204883337020874, + "rewards/margins": 1.4713939428329468, + "rewards/rejected": -3.2918827533721924, + "step": 11030 + }, + { + "epoch": 1.9, + "grad_norm": 32.25461324792829, + "learning_rate": 3.6380368970296836e-09, + "logits/chosen": -1.4112730026245117, + "logits/rejected": -1.351285696029663, + "logps/chosen": -239.5612030029297, + "logps/rejected": -364.81146240234375, + "loss": 0.4416, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.831099271774292, + "rewards/margins": 1.2617604732513428, + "rewards/rejected": -3.0928597450256348, + "step": 11040 + }, + { + "epoch": 1.9, + "grad_norm": 35.4050705396756, + "learning_rate": 3.5113627300897285e-09, + "logits/chosen": -1.310435175895691, + "logits/rejected": -1.2423975467681885, + "logps/chosen": -222.96923828125, + "logps/rejected": -379.7068786621094, + "loss": 0.3806, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.7100273370742798, + "rewards/margins": 1.534369707107544, + "rewards/rejected": -3.244396924972534, + "step": 11050 + }, + { + "epoch": 1.91, + "grad_norm": 29.633120494533653, + "learning_rate": 3.38691757887577e-09, + "logits/chosen": -1.3639460802078247, + "logits/rejected": -1.271759271621704, + "logps/chosen": -234.57968139648438, + "logps/rejected": -376.5106506347656, + "loss": 0.4119, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.8031505346298218, + "rewards/margins": 1.444676399230957, + "rewards/rejected": -3.2478268146514893, + "step": 11060 + }, + { + "epoch": 1.91, + "grad_norm": 27.809450621921286, + "learning_rate": 3.2647025687549122e-09, + "logits/chosen": -1.3753821849822998, + "logits/rejected": -1.2898151874542236, + "logps/chosen": -224.57437133789062, + "logps/rejected": -369.9324645996094, + "loss": 0.4233, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -1.714037537574768, + "rewards/margins": 1.4897834062576294, + "rewards/rejected": -3.2038207054138184, + "step": 11070 + }, + { + "epoch": 1.91, + "grad_norm": 25.21379163567548, + "learning_rate": 3.144718804926866e-09, + "logits/chosen": -1.3679758310317993, + "logits/rejected": -1.3044774532318115, + "logps/chosen": -238.9508819580078, + "logps/rejected": -373.56060791015625, + "loss": 0.417, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.8344926834106445, + "rewards/margins": 1.3762904405593872, + "rewards/rejected": -3.210782527923584, + "step": 11080 + }, + { + "epoch": 1.91, + "grad_norm": 38.50752589918087, + "learning_rate": 3.0269673724140356e-09, + "logits/chosen": -1.3562889099121094, + "logits/rejected": -1.297836184501648, + "logps/chosen": -233.75924682617188, + "logps/rejected": -358.478271484375, + "loss": 0.4123, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.7592127323150635, + "rewards/margins": 1.2987269163131714, + "rewards/rejected": -3.0579395294189453, + "step": 11090 + }, + { + "epoch": 1.91, + "grad_norm": 34.59360161848991, + "learning_rate": 2.9114493360517245e-09, + "logits/chosen": -1.2469245195388794, + "logits/rejected": -1.1905173063278198, + "logps/chosen": -209.7041015625, + "logps/rejected": -341.9212646484375, + "loss": 0.4333, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.5883010625839233, + "rewards/margins": 1.2910573482513428, + "rewards/rejected": -2.8793585300445557, + "step": 11100 + }, + { + "epoch": 1.91, + "eval_logits/chosen": -1.4189980030059814, + "eval_logits/rejected": -1.3920680284500122, + "eval_logps/chosen": -244.1648712158203, + "eval_logps/rejected": -294.9982604980469, + "eval_loss": 0.6333078145980835, + "eval_rewards/accuracies": 0.6598513126373291, + "eval_rewards/chosen": -1.8546103239059448, + "eval_rewards/margins": 0.46379825472831726, + "eval_rewards/rejected": -2.318408489227295, + "eval_runtime": 357.2276, + "eval_samples_per_second": 12.048, + "eval_steps_per_second": 1.506, + "step": 11100 + }, + { + "epoch": 1.91, + "grad_norm": 15.157481934801936, + "learning_rate": 2.79816574047842e-09, + "logits/chosen": -1.3375742435455322, + "logits/rejected": -1.2590216398239136, + "logps/chosen": -242.087890625, + "logps/rejected": -421.6622619628906, + "loss": 0.3844, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.8821719884872437, + "rewards/margins": 1.799425721168518, + "rewards/rejected": -3.681597948074341, + "step": 11110 + }, + { + "epoch": 1.92, + "grad_norm": 41.14442211032838, + "learning_rate": 2.6871176101263825e-09, + "logits/chosen": -1.4798122644424438, + "logits/rejected": -1.4134316444396973, + "logps/chosen": -232.623046875, + "logps/rejected": -359.533935546875, + "loss": 0.4216, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.730859398841858, + "rewards/margins": 1.3277714252471924, + "rewards/rejected": -3.05863094329834, + "step": 11120 + }, + { + "epoch": 1.92, + "grad_norm": 41.710468519729744, + "learning_rate": 2.578305949212434e-09, + "logits/chosen": -1.273736834526062, + "logits/rejected": -1.2053143978118896, + "logps/chosen": -243.1206817626953, + "logps/rejected": -371.94866943359375, + "loss": 0.4025, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.8726370334625244, + "rewards/margins": 1.3150079250335693, + "rewards/rejected": -3.1876449584960938, + "step": 11130 + }, + { + "epoch": 1.92, + "grad_norm": 41.66910869223657, + "learning_rate": 2.4717317417287942e-09, + "logits/chosen": -1.2594302892684937, + "logits/rejected": -1.1979601383209229, + "logps/chosen": -219.0839080810547, + "logps/rejected": -353.35504150390625, + "loss": 0.3744, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.6528394222259521, + "rewards/margins": 1.3649063110351562, + "rewards/rejected": -3.0177457332611084, + "step": 11140 + }, + { + "epoch": 1.92, + "grad_norm": 38.68187084741804, + "learning_rate": 2.3673959514342314e-09, + "logits/chosen": -1.3535066843032837, + "logits/rejected": -1.3013697862625122, + "logps/chosen": -242.1443328857422, + "logps/rejected": -369.8888244628906, + "loss": 0.43, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.8408015966415405, + "rewards/margins": 1.2930688858032227, + "rewards/rejected": -3.1338706016540527, + "step": 11150 + }, + { + "epoch": 1.92, + "grad_norm": 34.31089036359821, + "learning_rate": 2.2652995218452877e-09, + "logits/chosen": -1.4165607690811157, + "logits/rejected": -1.3578795194625854, + "logps/chosen": -216.8806610107422, + "logps/rejected": -338.1597900390625, + "loss": 0.4338, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.647953748703003, + "rewards/margins": 1.2238695621490479, + "rewards/rejected": -2.871823310852051, + "step": 11160 + }, + { + "epoch": 1.92, + "grad_norm": 32.24674833849174, + "learning_rate": 2.165443376227871e-09, + "logits/chosen": -1.2586653232574463, + "logits/rejected": -1.196004867553711, + "logps/chosen": -232.2088165283203, + "logps/rejected": -331.0061340332031, + "loss": 0.4821, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.750544786453247, + "rewards/margins": 1.0693135261535645, + "rewards/rejected": -2.8198580741882324, + "step": 11170 + }, + { + "epoch": 1.93, + "grad_norm": 57.55683167173714, + "learning_rate": 2.0678284175887907e-09, + "logits/chosen": -1.4004487991333008, + "logits/rejected": -1.3373186588287354, + "logps/chosen": -235.8096466064453, + "logps/rejected": -374.24713134765625, + "loss": 0.3973, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.8187503814697266, + "rewards/margins": 1.4154729843139648, + "rewards/rejected": -3.2342236042022705, + "step": 11180 + }, + { + "epoch": 1.93, + "grad_norm": 25.781820157206223, + "learning_rate": 1.972455528667677e-09, + "logits/chosen": -1.3892544507980347, + "logits/rejected": -1.3081748485565186, + "logps/chosen": -220.63912963867188, + "logps/rejected": -372.5516052246094, + "loss": 0.3301, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -1.6577503681182861, + "rewards/margins": 1.551475167274475, + "rewards/rejected": -3.20922589302063, + "step": 11190 + }, + { + "epoch": 1.93, + "grad_norm": 33.77308243819322, + "learning_rate": 1.8793255719288246e-09, + "logits/chosen": -1.3923676013946533, + "logits/rejected": -1.3273457288742065, + "logps/chosen": -210.1555633544922, + "logps/rejected": -346.9501647949219, + "loss": 0.4305, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.5684833526611328, + "rewards/margins": 1.3637385368347168, + "rewards/rejected": -2.9322218894958496, + "step": 11200 + }, + { + "epoch": 1.93, + "eval_logits/chosen": -1.42206609249115, + "eval_logits/rejected": -1.3953404426574707, + "eval_logps/chosen": -243.3866424560547, + "eval_logps/rejected": -293.89874267578125, + "eval_loss": 0.6334691643714905, + "eval_rewards/accuracies": 0.6563661694526672, + "eval_rewards/chosen": -1.8468278646469116, + "eval_rewards/margins": 0.46058568358421326, + "eval_rewards/rejected": -2.307413339614868, + "eval_runtime": 357.503, + "eval_samples_per_second": 12.039, + "eval_steps_per_second": 1.505, + "step": 11200 + }, + { + "epoch": 1.93, + "grad_norm": 46.567374288537444, + "learning_rate": 1.7884393895536697e-09, + "logits/chosen": -1.2399379014968872, + "logits/rejected": -1.1777968406677246, + "logps/chosen": -229.732666015625, + "logps/rejected": -374.9830017089844, + "loss": 0.4382, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.793482780456543, + "rewards/margins": 1.417189121246338, + "rewards/rejected": -3.2106716632843018, + "step": 11210 + }, + { + "epoch": 1.93, + "grad_norm": 42.21368814153827, + "learning_rate": 1.6997978034329342e-09, + "logits/chosen": -1.3409579992294312, + "logits/rejected": -1.2815383672714233, + "logps/chosen": -217.97921752929688, + "logps/rejected": -357.2254943847656, + "loss": 0.4403, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.6595194339752197, + "rewards/margins": 1.3560142517089844, + "rewards/rejected": -3.015533447265625, + "step": 11220 + }, + { + "epoch": 1.93, + "grad_norm": 39.048264387033385, + "learning_rate": 1.613401615159299e-09, + "logits/chosen": -1.3046488761901855, + "logits/rejected": -1.2500449419021606, + "logps/chosen": -243.40029907226562, + "logps/rejected": -376.0064697265625, + "loss": 0.4023, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.8764787912368774, + "rewards/margins": 1.3685901165008545, + "rewards/rejected": -3.2450687885284424, + "step": 11230 + }, + { + "epoch": 1.94, + "grad_norm": 23.205283737644205, + "learning_rate": 1.5292516060201599e-09, + "logits/chosen": -1.3030173778533936, + "logits/rejected": -1.2475535869598389, + "logps/chosen": -230.6564483642578, + "logps/rejected": -360.25225830078125, + "loss": 0.4305, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -1.7603286504745483, + "rewards/margins": 1.3079051971435547, + "rewards/rejected": -3.0682339668273926, + "step": 11240 + }, + { + "epoch": 1.94, + "grad_norm": 27.435468950638114, + "learning_rate": 1.4473485369905224e-09, + "logits/chosen": -1.3240660429000854, + "logits/rejected": -1.261580228805542, + "logps/chosen": -228.3889923095703, + "logps/rejected": -360.775634765625, + "loss": 0.4024, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.7416374683380127, + "rewards/margins": 1.3298765420913696, + "rewards/rejected": -3.071514129638672, + "step": 11250 + }, + { + "epoch": 1.94, + "grad_norm": 19.467311006382324, + "learning_rate": 1.3676931487261456e-09, + "logits/chosen": -1.2779386043548584, + "logits/rejected": -1.209826111793518, + "logps/chosen": -221.6068115234375, + "logps/rejected": -340.8597717285156, + "loss": 0.4412, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.637843370437622, + "rewards/margins": 1.2554326057434082, + "rewards/rejected": -2.8932759761810303, + "step": 11260 + }, + { + "epoch": 1.94, + "grad_norm": 41.60730906780407, + "learning_rate": 1.2902861615568527e-09, + "logits/chosen": -1.3289337158203125, + "logits/rejected": -1.2553983926773071, + "logps/chosen": -224.650390625, + "logps/rejected": -358.2517395019531, + "loss": 0.391, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.6910244226455688, + "rewards/margins": 1.3813101053237915, + "rewards/rejected": -3.0723345279693604, + "step": 11270 + }, + { + "epoch": 1.94, + "grad_norm": 32.1598075986716, + "learning_rate": 1.2151282754799542e-09, + "logits/chosen": -1.3617111444473267, + "logits/rejected": -1.293905258178711, + "logps/chosen": -228.1493377685547, + "logps/rejected": -348.77618408203125, + "loss": 0.4462, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.7183462381362915, + "rewards/margins": 1.2455885410308838, + "rewards/rejected": -2.9639346599578857, + "step": 11280 + }, + { + "epoch": 1.95, + "grad_norm": 36.803056421177146, + "learning_rate": 1.1422201701540567e-09, + "logits/chosen": -1.3835200071334839, + "logits/rejected": -1.3248523473739624, + "logps/chosen": -217.21005249023438, + "logps/rejected": -349.54583740234375, + "loss": 0.4033, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.5726218223571777, + "rewards/margins": 1.3615262508392334, + "rewards/rejected": -2.9341483116149902, + "step": 11290 + }, + { + "epoch": 1.95, + "grad_norm": 27.85129396845735, + "learning_rate": 1.0715625048927092e-09, + "logits/chosen": -1.309777021408081, + "logits/rejected": -1.2495863437652588, + "logps/chosen": -242.6230926513672, + "logps/rejected": -351.98077392578125, + "loss": 0.4817, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.8370163440704346, + "rewards/margins": 1.1458604335784912, + "rewards/rejected": -2.9828765392303467, + "step": 11300 + }, + { + "epoch": 1.95, + "eval_logits/chosen": -1.4203462600708008, + "eval_logits/rejected": -1.393379807472229, + "eval_logps/chosen": -244.32652282714844, + "eval_logps/rejected": -295.0477294921875, + "eval_loss": 0.6342768669128418, + "eval_rewards/accuracies": 0.6572955250740051, + "eval_rewards/chosen": -1.8562268018722534, + "eval_rewards/margins": 0.46267637610435486, + "eval_rewards/rejected": -2.3189032077789307, + "eval_runtime": 356.9343, + "eval_samples_per_second": 12.058, + "eval_steps_per_second": 1.507, + "step": 11300 + }, + { + "epoch": 1.95, + "grad_norm": 33.21894208609608, + "learning_rate": 1.0031559186586825e-09, + "logits/chosen": -1.4185220003128052, + "logits/rejected": -1.3642728328704834, + "logps/chosen": -219.1095428466797, + "logps/rejected": -368.1912841796875, + "loss": 0.3543, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.686640739440918, + "rewards/margins": 1.4802013635635376, + "rewards/rejected": -3.166841983795166, + "step": 11310 + }, + { + "epoch": 1.95, + "grad_norm": 24.242052165940123, + "learning_rate": 9.370010300579213e-10, + "logits/chosen": -1.350401520729065, + "logits/rejected": -1.2788327932357788, + "logps/chosen": -226.5559539794922, + "logps/rejected": -364.3349914550781, + "loss": 0.4343, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.7163565158843994, + "rewards/margins": 1.3881375789642334, + "rewards/rejected": -3.104494094848633, + "step": 11320 + }, + { + "epoch": 1.95, + "grad_norm": 49.892604398278046, + "learning_rate": 8.730984373342409e-10, + "logits/chosen": -1.3533201217651367, + "logits/rejected": -1.2792202234268188, + "logps/chosen": -225.33120727539062, + "logps/rejected": -375.6051330566406, + "loss": 0.3547, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.7277292013168335, + "rewards/margins": 1.547004222869873, + "rewards/rejected": -3.274733304977417, + "step": 11330 + }, + { + "epoch": 1.95, + "grad_norm": 44.808939079357344, + "learning_rate": 8.114487183636942e-10, + "logits/chosen": -1.2439250946044922, + "logits/rejected": -1.1751753091812134, + "logps/chosen": -238.3853759765625, + "logps/rejected": -387.01287841796875, + "loss": 0.3996, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.8302602767944336, + "rewards/margins": 1.4923627376556396, + "rewards/rejected": -3.3226234912872314, + "step": 11340 + }, + { + "epoch": 1.96, + "grad_norm": 24.43792912942958, + "learning_rate": 7.520524306494358e-10, + "logits/chosen": -1.3848811388015747, + "logits/rejected": -1.3261343240737915, + "logps/chosen": -250.03085327148438, + "logps/rejected": -366.9549255371094, + "loss": 0.4724, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.9313633441925049, + "rewards/margins": 1.177704095840454, + "rewards/rejected": -3.109067440032959, + "step": 11350 + }, + { + "epoch": 1.96, + "grad_norm": 31.83383405122785, + "learning_rate": 6.949101113166711e-10, + "logits/chosen": -1.330773949623108, + "logits/rejected": -1.2642290592193604, + "logps/chosen": -235.5715789794922, + "logps/rejected": -365.17340087890625, + "loss": 0.4185, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.8331444263458252, + "rewards/margins": 1.3331224918365479, + "rewards/rejected": -3.166267156600952, + "step": 11360 + }, + { + "epoch": 1.96, + "grad_norm": 29.61007825180616, + "learning_rate": 6.40022277107799e-10, + "logits/chosen": -1.3195604085922241, + "logits/rejected": -1.2653484344482422, + "logps/chosen": -229.4175567626953, + "logps/rejected": -352.00555419921875, + "loss": 0.4398, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7311922311782837, + "rewards/margins": 1.2529064416885376, + "rewards/rejected": -2.9840986728668213, + "step": 11370 + }, + { + "epoch": 1.96, + "grad_norm": 44.883310537971305, + "learning_rate": 5.873894243776933e-10, + "logits/chosen": -1.2741248607635498, + "logits/rejected": -1.2058513164520264, + "logps/chosen": -221.5206298828125, + "logps/rejected": -360.199951171875, + "loss": 0.4065, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.6969740390777588, + "rewards/margins": 1.4046361446380615, + "rewards/rejected": -3.1016104221343994, + "step": 11380 + }, + { + "epoch": 1.96, + "grad_norm": 36.423896869469424, + "learning_rate": 5.370120290893176e-10, + "logits/chosen": -1.4164044857025146, + "logits/rejected": -1.340454339981079, + "logps/chosen": -214.89956665039062, + "logps/rejected": -366.628662109375, + "loss": 0.4007, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.625978708267212, + "rewards/margins": 1.4872604608535767, + "rewards/rejected": -3.113239288330078, + "step": 11390 + }, + { + "epoch": 1.96, + "grad_norm": 28.749205056855722, + "learning_rate": 4.888905468093673e-10, + "logits/chosen": -1.3569167852401733, + "logits/rejected": -1.291182041168213, + "logps/chosen": -212.83273315429688, + "logps/rejected": -341.9942626953125, + "loss": 0.4146, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.5691052675247192, + "rewards/margins": 1.31778883934021, + "rewards/rejected": -2.8868937492370605, + "step": 11400 + }, + { + "epoch": 1.96, + "eval_logits/chosen": -1.41786789894104, + "eval_logits/rejected": -1.3909337520599365, + "eval_logps/chosen": -244.43157958984375, + "eval_logps/rejected": -295.2254638671875, + "eval_loss": 0.63393235206604, + "eval_rewards/accuracies": 0.6559014916419983, + "eval_rewards/chosen": -1.857277512550354, + "eval_rewards/margins": 0.4634034037590027, + "eval_rewards/rejected": -2.320681095123291, + "eval_runtime": 357.3673, + "eval_samples_per_second": 12.044, + "eval_steps_per_second": 1.505, + "step": 11400 + }, + { + "epoch": 1.97, + "grad_norm": 39.086102079615756, + "learning_rate": 4.430254127040789e-10, + "logits/chosen": -1.3270251750946045, + "logits/rejected": -1.2652333974838257, + "logps/chosen": -229.4177703857422, + "logps/rejected": -348.8304748535156, + "loss": 0.4342, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.7375816106796265, + "rewards/margins": 1.199951171875, + "rewards/rejected": -2.937532901763916, + "step": 11410 + }, + { + "epoch": 1.97, + "grad_norm": 30.366456037929765, + "learning_rate": 3.994170415353715e-10, + "logits/chosen": -1.3328666687011719, + "logits/rejected": -1.2708826065063477, + "logps/chosen": -233.0135498046875, + "logps/rejected": -347.16339111328125, + "loss": 0.4453, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.793405532836914, + "rewards/margins": 1.175714135169983, + "rewards/rejected": -2.9691193103790283, + "step": 11420 + }, + { + "epoch": 1.97, + "grad_norm": 23.516266855640406, + "learning_rate": 3.5806582765715574e-10, + "logits/chosen": -1.2737079858779907, + "logits/rejected": -1.215456247329712, + "logps/chosen": -230.99423217773438, + "logps/rejected": -346.54241943359375, + "loss": 0.4635, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.7601213455200195, + "rewards/margins": 1.1789329051971436, + "rewards/rejected": -2.939054012298584, + "step": 11430 + }, + { + "epoch": 1.97, + "grad_norm": 32.39137150045276, + "learning_rate": 3.189721450116145e-10, + "logits/chosen": -1.340698003768921, + "logits/rejected": -1.2886393070220947, + "logps/chosen": -225.9718780517578, + "logps/rejected": -343.7590637207031, + "loss": 0.4333, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.7345173358917236, + "rewards/margins": 1.1770877838134766, + "rewards/rejected": -2.9116053581237793, + "step": 11440 + }, + { + "epoch": 1.97, + "grad_norm": 47.231248686402424, + "learning_rate": 2.821363471259275e-10, + "logits/chosen": -1.2820355892181396, + "logits/rejected": -1.2136328220367432, + "logps/chosen": -230.76296997070312, + "logps/rejected": -371.7513122558594, + "loss": 0.4059, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.780504822731018, + "rewards/margins": 1.4201017618179321, + "rewards/rejected": -3.20060658454895, + "step": 11450 + }, + { + "epoch": 1.97, + "grad_norm": 28.594067560080052, + "learning_rate": 2.4755876710905176e-10, + "logits/chosen": -1.3040361404418945, + "logits/rejected": -1.2424486875534058, + "logps/chosen": -228.9510498046875, + "logps/rejected": -367.95770263671875, + "loss": 0.3752, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.7342042922973633, + "rewards/margins": 1.4023338556289673, + "rewards/rejected": -3.136538028717041, + "step": 11460 + }, + { + "epoch": 1.98, + "grad_norm": 46.590396282617746, + "learning_rate": 2.1523971764869642e-10, + "logits/chosen": -1.3816394805908203, + "logits/rejected": -1.3001785278320312, + "logps/chosen": -231.14248657226562, + "logps/rejected": -376.31292724609375, + "loss": 0.3637, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.7429434061050415, + "rewards/margins": 1.5104016065597534, + "rewards/rejected": -3.253345012664795, + "step": 11470 + }, + { + "epoch": 1.98, + "grad_norm": 43.745231317437764, + "learning_rate": 1.8517949100854692e-10, + "logits/chosen": -1.3776280879974365, + "logits/rejected": -1.3017531633377075, + "logps/chosen": -220.01980590820312, + "logps/rejected": -345.5537414550781, + "loss": 0.4132, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.6392385959625244, + "rewards/margins": 1.3121652603149414, + "rewards/rejected": -2.9514036178588867, + "step": 11480 + }, + { + "epoch": 1.98, + "grad_norm": 23.908719760101306, + "learning_rate": 1.5737835902551733e-10, + "logits/chosen": -1.3258212804794312, + "logits/rejected": -1.2626917362213135, + "logps/chosen": -230.5511474609375, + "logps/rejected": -344.8309631347656, + "loss": 0.4828, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.758782982826233, + "rewards/margins": 1.1783698797225952, + "rewards/rejected": -2.937152862548828, + "step": 11490 + }, + { + "epoch": 1.98, + "grad_norm": 31.08219846166836, + "learning_rate": 1.318365731074189e-10, + "logits/chosen": -1.3679813146591187, + "logits/rejected": -1.315306305885315, + "logps/chosen": -218.0261688232422, + "logps/rejected": -330.38092041015625, + "loss": 0.432, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.6775197982788086, + "rewards/margins": 1.118550419807434, + "rewards/rejected": -2.796069860458374, + "step": 11500 + }, + { + "epoch": 1.98, + "eval_logits/chosen": -1.4183286428451538, + "eval_logits/rejected": -1.391395092010498, + "eval_logps/chosen": -244.1782684326172, + "eval_logps/rejected": -295.0009765625, + "eval_loss": 0.6337403059005737, + "eval_rewards/accuracies": 0.6535780429840088, + "eval_rewards/chosen": -1.8547443151474, + "eval_rewards/margins": 0.4636920690536499, + "eval_rewards/rejected": -2.31843638420105, + "eval_runtime": 364.0419, + "eval_samples_per_second": 11.823, + "eval_steps_per_second": 1.478, + "step": 11500 + }, + { + "epoch": 1.98, + "grad_norm": 25.52111148377799, + "learning_rate": 1.0855436423054532e-10, + "logits/chosen": -1.2700302600860596, + "logits/rejected": -1.2187522649765015, + "logps/chosen": -233.6306610107422, + "logps/rejected": -368.69073486328125, + "loss": 0.4327, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.7816520929336548, + "rewards/margins": 1.3728293180465698, + "rewards/rejected": -3.1544814109802246, + "step": 11510 + }, + { + "epoch": 1.98, + "grad_norm": 32.012807334453846, + "learning_rate": 8.753194293770194e-11, + "logits/chosen": -1.3286519050598145, + "logits/rejected": -1.2325983047485352, + "logps/chosen": -229.2296905517578, + "logps/rejected": -372.2969970703125, + "loss": 0.3641, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.694801688194275, + "rewards/margins": 1.5285245180130005, + "rewards/rejected": -3.2233262062072754, + "step": 11520 + }, + { + "epoch": 1.99, + "grad_norm": 30.754768134489776, + "learning_rate": 6.87694993363186e-11, + "logits/chosen": -1.3455301523208618, + "logits/rejected": -1.2808736562728882, + "logps/chosen": -218.40139770507812, + "logps/rejected": -335.3419494628906, + "loss": 0.4166, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.6494470834732056, + "rewards/margins": 1.2140170335769653, + "rewards/rejected": -2.863463878631592, + "step": 11530 + }, + { + "epoch": 1.99, + "grad_norm": 52.90046362094887, + "learning_rate": 5.226720309656207e-11, + "logits/chosen": -1.3888723850250244, + "logits/rejected": -1.3319811820983887, + "logps/chosen": -217.9016876220703, + "logps/rejected": -374.586669921875, + "loss": 0.3692, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.6716915369033813, + "rewards/margins": 1.5221760272979736, + "rewards/rejected": -3.1938672065734863, + "step": 11540 + }, + { + "epoch": 1.99, + "grad_norm": 32.18141803466796, + "learning_rate": 3.802520345000393e-11, + "logits/chosen": -1.3312755823135376, + "logits/rejected": -1.2780405282974243, + "logps/chosen": -224.06698608398438, + "logps/rejected": -344.835205078125, + "loss": 0.46, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.7274624109268188, + "rewards/margins": 1.1927831172943115, + "rewards/rejected": -2.92024564743042, + "step": 11550 + }, + { + "epoch": 1.99, + "grad_norm": 43.51210987286007, + "learning_rate": 2.604362918812164e-11, + "logits/chosen": -1.3421592712402344, + "logits/rejected": -1.2694909572601318, + "logps/chosen": -227.5470733642578, + "logps/rejected": -358.80706787109375, + "loss": 0.4156, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.7258684635162354, + "rewards/margins": 1.3443853855133057, + "rewards/rejected": -3.070253849029541, + "step": 11560 + }, + { + "epoch": 1.99, + "grad_norm": 38.872585179691995, + "learning_rate": 1.6322588661216163e-11, + "logits/chosen": -1.3375303745269775, + "logits/rejected": -1.271527647972107, + "logps/chosen": -235.52359008789062, + "logps/rejected": -379.27459716796875, + "loss": 0.3802, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.8141777515411377, + "rewards/margins": 1.421951413154602, + "rewards/rejected": -3.23612904548645, + "step": 11570 + }, + { + "epoch": 2.0, + "grad_norm": 42.37337871627236, + "learning_rate": 8.862169777440475e-12, + "logits/chosen": -1.3970595598220825, + "logits/rejected": -1.3340885639190674, + "logps/chosen": -227.24227905273438, + "logps/rejected": -373.0314636230469, + "loss": 0.3938, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.7100292444229126, + "rewards/margins": 1.4638845920562744, + "rewards/rejected": -3.1739137172698975, + "step": 11580 + }, + { + "epoch": 2.0, + "grad_norm": 37.01271407525554, + "learning_rate": 3.6624400018836485e-12, + "logits/chosen": -1.2879482507705688, + "logits/rejected": -1.215693712234497, + "logps/chosen": -228.08352661132812, + "logps/rejected": -351.55242919921875, + "loss": 0.4093, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.6859127283096313, + "rewards/margins": 1.3195241689682007, + "rewards/rejected": -3.005436420440674, + "step": 11590 + }, + { + "epoch": 2.0, + "grad_norm": 33.774484526273135, + "learning_rate": 7.234463561267557e-13, + "logits/chosen": -1.341506004333496, + "logits/rejected": -1.2840282917022705, + "logps/chosen": -213.89993286132812, + "logps/rejected": -343.8310852050781, + "loss": 0.429, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.634570837020874, + "rewards/margins": 1.2662864923477173, + "rewards/rejected": -2.900857448577881, + "step": 11600 + }, + { + "epoch": 2.0, + "eval_logits/chosen": -1.4189956188201904, + "eval_logits/rejected": -1.3920025825500488, + "eval_logps/chosen": -244.38746643066406, + "eval_logps/rejected": -295.19293212890625, + "eval_loss": 0.6342188119888306, + "eval_rewards/accuracies": 0.6579925417900085, + "eval_rewards/chosen": -1.8568360805511475, + "eval_rewards/margins": 0.4635196328163147, + "eval_rewards/rejected": -2.3203558921813965, + "eval_runtime": 356.6898, + "eval_samples_per_second": 12.067, + "eval_steps_per_second": 1.508, + "step": 11600 + }, + { + "epoch": 2.0, + "step": 11608, + "total_flos": 0.0, + "train_loss": 0.5042109644922366, + "train_runtime": 89019.0317, + "train_samples_per_second": 2.086, + "train_steps_per_second": 0.13 + } + ], + "logging_steps": 10, + "max_steps": 11608, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "total_flos": 0.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}