{ "best_metric": 0.34911444783210754, "best_model_checkpoint": "./models/checkpoint-1350", "epoch": 1.5, "eval_steps": 270, "global_step": 1350, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0011111111111111111, "grad_norm": 0.964627206325531, "learning_rate": 3.703703703703704e-07, "logits/chosen": 1.587358832359314, "logits/rejected": 1.593401551246643, "logps/chosen": -74.5113754272461, "logps/rejected": -80.5272445678711, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0022222222222222222, "grad_norm": 1.1964473724365234, "learning_rate": 7.407407407407408e-07, "logits/chosen": 1.5702040195465088, "logits/rejected": 1.5829124450683594, "logps/chosen": -58.661991119384766, "logps/rejected": -78.643310546875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2 }, { "epoch": 0.0033333333333333335, "grad_norm": 1.1883488893508911, "learning_rate": 1.1111111111111112e-06, "logits/chosen": 1.7048407793045044, "logits/rejected": 1.69460928440094, "logps/chosen": -65.94305419921875, "logps/rejected": -83.93917846679688, "loss": 0.6737, "rewards/accuracies": 0.5, "rewards/chosen": -0.004763223230838776, "rewards/margins": 0.04160633310675621, "rewards/rejected": -0.04636955261230469, "step": 3 }, { "epoch": 0.0044444444444444444, "grad_norm": 3.2139475345611572, "learning_rate": 1.4814814814814817e-06, "logits/chosen": 2.1135401725769043, "logits/rejected": 2.1162350177764893, "logps/chosen": -115.66337585449219, "logps/rejected": -234.76425170898438, "loss": 0.6548, "rewards/accuracies": 0.5, "rewards/chosen": 0.013595961034297943, "rewards/margins": 0.08178672939538956, "rewards/rejected": -0.06819076836109161, "step": 4 }, { "epoch": 0.005555555555555556, "grad_norm": 1.5120346546173096, "learning_rate": 1.8518518518518519e-06, "logits/chosen": 1.7832422256469727, "logits/rejected": 1.761995792388916, "logps/chosen": -70.7474136352539, "logps/rejected": -79.55470275878906, "loss": 0.7354, "rewards/accuracies": 0.0, "rewards/chosen": -0.08225631713867188, "rewards/margins": -0.08251915127038956, "rewards/rejected": 0.00026283226907253265, "step": 5 }, { "epoch": 0.006666666666666667, "grad_norm": 1.6049247980117798, "learning_rate": 2.2222222222222225e-06, "logits/chosen": 1.952986240386963, "logits/rejected": 1.9634885787963867, "logps/chosen": -108.12012481689453, "logps/rejected": -115.03923034667969, "loss": 0.707, "rewards/accuracies": 0.0, "rewards/chosen": -0.09396743774414062, "rewards/margins": -0.027441028505563736, "rewards/rejected": -0.06652641296386719, "step": 6 }, { "epoch": 0.0077777777777777776, "grad_norm": 1.4904389381408691, "learning_rate": 2.5925925925925925e-06, "logits/chosen": 1.7306190729141235, "logits/rejected": 1.7248234748840332, "logps/chosen": -92.63914489746094, "logps/rejected": -117.31549072265625, "loss": 0.6686, "rewards/accuracies": 0.5, "rewards/chosen": -0.06744308024644852, "rewards/margins": 0.05123482272028923, "rewards/rejected": -0.11867789924144745, "step": 7 }, { "epoch": 0.008888888888888889, "grad_norm": 1.868098258972168, "learning_rate": 2.9629629629629633e-06, "logits/chosen": 1.2929918766021729, "logits/rejected": 1.5113823413848877, "logps/chosen": -53.52079391479492, "logps/rejected": -160.8567352294922, "loss": 0.6809, "rewards/accuracies": 1.0, "rewards/chosen": 0.03008861653506756, "rewards/margins": 0.02466411702334881, "rewards/rejected": 0.00542449951171875, "step": 8 }, { "epoch": 0.01, "grad_norm": 1.9787408113479614, "learning_rate": 3.3333333333333333e-06, "logits/chosen": 1.7580476999282837, "logits/rejected": 1.7341763973236084, "logps/chosen": -52.69316482543945, "logps/rejected": -124.8278579711914, "loss": 0.7208, "rewards/accuracies": 0.0, "rewards/chosen": -0.09640693664550781, "rewards/margins": -0.054576873779296875, "rewards/rejected": -0.04183006286621094, "step": 9 }, { "epoch": 0.011111111111111112, "grad_norm": 0.8551863431930542, "learning_rate": 3.7037037037037037e-06, "logits/chosen": 1.5674762725830078, "logits/rejected": 1.5638999938964844, "logps/chosen": -61.49891662597656, "logps/rejected": -70.89288330078125, "loss": 0.6829, "rewards/accuracies": 0.5, "rewards/chosen": 0.02249602973461151, "rewards/margins": 0.020903009921312332, "rewards/rejected": 0.001593019813299179, "step": 10 }, { "epoch": 0.012222222222222223, "grad_norm": 3.1162986755371094, "learning_rate": 4.074074074074075e-06, "logits/chosen": 1.8080631494522095, "logits/rejected": 1.7988734245300293, "logps/chosen": -98.58515930175781, "logps/rejected": -133.62159729003906, "loss": 0.6647, "rewards/accuracies": 0.5, "rewards/chosen": 0.0630260482430458, "rewards/margins": 0.06048622354865074, "rewards/rejected": 0.0025398246943950653, "step": 11 }, { "epoch": 0.013333333333333334, "grad_norm": 1.921143889427185, "learning_rate": 4.444444444444445e-06, "logits/chosen": 1.9271034002304077, "logits/rejected": 1.885392665863037, "logps/chosen": -100.290283203125, "logps/rejected": -77.11638641357422, "loss": 0.7805, "rewards/accuracies": 0.0, "rewards/chosen": -0.1513877958059311, "rewards/margins": -0.16757851839065552, "rewards/rejected": 0.016190720722079277, "step": 12 }, { "epoch": 0.014444444444444444, "grad_norm": 1.5794175863265991, "learning_rate": 4.814814814814815e-06, "logits/chosen": 1.6001924276351929, "logits/rejected": 1.5639276504516602, "logps/chosen": -76.5877914428711, "logps/rejected": -87.19918823242188, "loss": 0.6782, "rewards/accuracies": 1.0, "rewards/chosen": -0.04023399204015732, "rewards/margins": 0.0302963275462389, "rewards/rejected": -0.07053032517433167, "step": 13 }, { "epoch": 0.015555555555555555, "grad_norm": 2.2297215461730957, "learning_rate": 5.185185185185185e-06, "logits/chosen": 1.830111026763916, "logits/rejected": 1.7968027591705322, "logps/chosen": -122.8951644897461, "logps/rejected": -142.72314453125, "loss": 0.7043, "rewards/accuracies": 0.5, "rewards/chosen": -0.03818397596478462, "rewards/margins": -0.021287156268954277, "rewards/rejected": -0.016896819695830345, "step": 14 }, { "epoch": 0.016666666666666666, "grad_norm": 2.637876033782959, "learning_rate": 5.555555555555556e-06, "logits/chosen": 2.1385433673858643, "logits/rejected": 2.1026387214660645, "logps/chosen": -117.89601135253906, "logps/rejected": -143.9423370361328, "loss": 0.7163, "rewards/accuracies": 0.5, "rewards/chosen": -0.062108613550662994, "rewards/margins": -0.04160041734576225, "rewards/rejected": -0.020508192479610443, "step": 15 }, { "epoch": 0.017777777777777778, "grad_norm": 1.6890418529510498, "learning_rate": 5.925925925925927e-06, "logits/chosen": 1.925990104675293, "logits/rejected": 1.952521800994873, "logps/chosen": -86.40887451171875, "logps/rejected": -71.00070190429688, "loss": 0.6492, "rewards/accuracies": 1.0, "rewards/chosen": 0.12040863186120987, "rewards/margins": 0.0913364440202713, "rewards/rejected": 0.029072191566228867, "step": 16 }, { "epoch": 0.01888888888888889, "grad_norm": 2.029874801635742, "learning_rate": 6.296296296296296e-06, "logits/chosen": 1.675471305847168, "logits/rejected": 1.7311854362487793, "logps/chosen": -108.28228759765625, "logps/rejected": -173.41026306152344, "loss": 0.7394, "rewards/accuracies": 0.0, "rewards/chosen": -0.0845264419913292, "rewards/margins": -0.09012261033058167, "rewards/rejected": 0.005596160888671875, "step": 17 }, { "epoch": 0.02, "grad_norm": 0.9358479380607605, "learning_rate": 6.666666666666667e-06, "logits/chosen": 1.6838430166244507, "logits/rejected": 1.6848626136779785, "logps/chosen": -63.256282806396484, "logps/rejected": -65.94783020019531, "loss": 0.6639, "rewards/accuracies": 1.0, "rewards/chosen": 0.07935123145580292, "rewards/margins": 0.06027068942785263, "rewards/rejected": 0.019080543890595436, "step": 18 }, { "epoch": 0.021111111111111112, "grad_norm": 1.2506437301635742, "learning_rate": 7.0370370370370375e-06, "logits/chosen": 1.5521399974822998, "logits/rejected": 1.5590038299560547, "logps/chosen": -55.257930755615234, "logps/rejected": -44.40534210205078, "loss": 0.7306, "rewards/accuracies": 0.0, "rewards/chosen": -0.028900718316435814, "rewards/margins": -0.07338161766529083, "rewards/rejected": 0.04448089748620987, "step": 19 }, { "epoch": 0.022222222222222223, "grad_norm": 1.1717438697814941, "learning_rate": 7.4074074074074075e-06, "logits/chosen": 1.489425539970398, "logits/rejected": 1.4848383665084839, "logps/chosen": -76.05974578857422, "logps/rejected": -97.445556640625, "loss": 0.6193, "rewards/accuracies": 1.0, "rewards/chosen": 0.013974763453006744, "rewards/margins": 0.15377768874168396, "rewards/rejected": -0.1398029327392578, "step": 20 }, { "epoch": 0.023333333333333334, "grad_norm": 1.747902750968933, "learning_rate": 7.777777777777777e-06, "logits/chosen": 1.7915772199630737, "logits/rejected": 1.8315253257751465, "logps/chosen": -62.081932067871094, "logps/rejected": -87.96475982666016, "loss": 0.725, "rewards/accuracies": 0.5, "rewards/chosen": -0.03796558454632759, "rewards/margins": -0.06154727563261986, "rewards/rejected": 0.023581694811582565, "step": 21 }, { "epoch": 0.024444444444444446, "grad_norm": 1.7777291536331177, "learning_rate": 8.14814814814815e-06, "logits/chosen": 1.8696916103363037, "logits/rejected": 1.9995701313018799, "logps/chosen": -69.93438720703125, "logps/rejected": -60.69755172729492, "loss": 0.7253, "rewards/accuracies": 0.5, "rewards/chosen": -0.04954032972455025, "rewards/margins": -0.061872489750385284, "rewards/rejected": 0.01233215257525444, "step": 22 }, { "epoch": 0.025555555555555557, "grad_norm": 2.1952433586120605, "learning_rate": 8.518518518518519e-06, "logits/chosen": 1.6216216087341309, "logits/rejected": 1.6242220401763916, "logps/chosen": -88.7285385131836, "logps/rejected": -175.496337890625, "loss": 0.6481, "rewards/accuracies": 1.0, "rewards/chosen": -0.015279770828783512, "rewards/margins": 0.0923030823469162, "rewards/rejected": -0.10758285224437714, "step": 23 }, { "epoch": 0.02666666666666667, "grad_norm": 1.7182083129882812, "learning_rate": 8.88888888888889e-06, "logits/chosen": 1.8613630533218384, "logits/rejected": 1.8241791725158691, "logps/chosen": -80.31359100341797, "logps/rejected": -98.98373413085938, "loss": 0.7015, "rewards/accuracies": 0.5, "rewards/chosen": -0.06282711029052734, "rewards/margins": -0.01623706892132759, "rewards/rejected": -0.04659004136919975, "step": 24 }, { "epoch": 0.027777777777777776, "grad_norm": 0.15061946213245392, "learning_rate": 9.259259259259259e-06, "logits/chosen": 1.4815609455108643, "logits/rejected": 1.4915060997009277, "logps/chosen": -40.906986236572266, "logps/rejected": -38.79916763305664, "loss": 0.6902, "rewards/accuracies": 0.5, "rewards/chosen": 0.056847479194402695, "rewards/margins": 0.005861666053533554, "rewards/rejected": 0.05098581314086914, "step": 25 }, { "epoch": 0.028888888888888888, "grad_norm": 1.7389438152313232, "learning_rate": 9.62962962962963e-06, "logits/chosen": 1.846755027770996, "logits/rejected": 1.8342325687408447, "logps/chosen": -118.75205993652344, "logps/rejected": -113.50042724609375, "loss": 0.7545, "rewards/accuracies": 0.0, "rewards/chosen": -0.1549350768327713, "rewards/margins": -0.11907348781824112, "rewards/rejected": -0.03586158901453018, "step": 26 }, { "epoch": 0.03, "grad_norm": 1.849458932876587, "learning_rate": 1e-05, "logits/chosen": 2.124572277069092, "logits/rejected": 2.136536121368408, "logps/chosen": -107.97257232666016, "logps/rejected": -134.48846435546875, "loss": 0.7244, "rewards/accuracies": 0.0, "rewards/chosen": -0.03628692775964737, "rewards/margins": -0.0609130859375, "rewards/rejected": 0.02462616004049778, "step": 27 }, { "epoch": 0.03111111111111111, "grad_norm": 1.238383412361145, "learning_rate": 1.037037037037037e-05, "logits/chosen": 1.784015417098999, "logits/rejected": 1.809431791305542, "logps/chosen": -69.24337005615234, "logps/rejected": -81.53703308105469, "loss": 0.6601, "rewards/accuracies": 0.5, "rewards/chosen": -0.009550858289003372, "rewards/margins": 0.06917991489171982, "rewards/rejected": -0.07873077690601349, "step": 28 }, { "epoch": 0.03222222222222222, "grad_norm": 1.4611886739730835, "learning_rate": 1.074074074074074e-05, "logits/chosen": 1.9695029258728027, "logits/rejected": 1.9921891689300537, "logps/chosen": -77.09239959716797, "logps/rejected": -77.73757934570312, "loss": 0.6939, "rewards/accuracies": 0.5, "rewards/chosen": -0.05179538577795029, "rewards/margins": -0.0008993148803710938, "rewards/rejected": -0.05089607462286949, "step": 29 }, { "epoch": 0.03333333333333333, "grad_norm": 1.707033634185791, "learning_rate": 1.1111111111111112e-05, "logits/chosen": 1.4423596858978271, "logits/rejected": 1.4216914176940918, "logps/chosen": -52.29315185546875, "logps/rejected": -66.96943664550781, "loss": 0.676, "rewards/accuracies": 0.5, "rewards/chosen": -0.004590225405991077, "rewards/margins": 0.03599243611097336, "rewards/rejected": -0.04058266058564186, "step": 30 }, { "epoch": 0.034444444444444444, "grad_norm": 1.7463372945785522, "learning_rate": 1.1481481481481482e-05, "logits/chosen": 1.3726036548614502, "logits/rejected": 1.4458048343658447, "logps/chosen": -68.83973693847656, "logps/rejected": -127.02235412597656, "loss": 0.6815, "rewards/accuracies": 1.0, "rewards/chosen": -0.1643892228603363, "rewards/margins": 0.023491479456424713, "rewards/rejected": -0.1878807097673416, "step": 31 }, { "epoch": 0.035555555555555556, "grad_norm": 0.9763980507850647, "learning_rate": 1.1851851851851853e-05, "logits/chosen": 1.55757737159729, "logits/rejected": 1.5490975379943848, "logps/chosen": -85.28057861328125, "logps/rejected": -74.91813659667969, "loss": 0.7162, "rewards/accuracies": 0.0, "rewards/chosen": -0.15017110109329224, "rewards/margins": -0.04560985788702965, "rewards/rejected": -0.10456123948097229, "step": 32 }, { "epoch": 0.03666666666666667, "grad_norm": 0.8882907629013062, "learning_rate": 1.2222222222222222e-05, "logits/chosen": 1.6205241680145264, "logits/rejected": 1.648435354232788, "logps/chosen": -64.73739624023438, "logps/rejected": -61.94883728027344, "loss": 0.715, "rewards/accuracies": 0.0, "rewards/chosen": -0.06149311363697052, "rewards/margins": -0.042905811220407486, "rewards/rejected": -0.018587302416563034, "step": 33 }, { "epoch": 0.03777777777777778, "grad_norm": 2.4717092514038086, "learning_rate": 1.2592592592592592e-05, "logits/chosen": 1.6775972843170166, "logits/rejected": 1.637088418006897, "logps/chosen": -71.07632446289062, "logps/rejected": -164.83062744140625, "loss": 0.6142, "rewards/accuracies": 1.0, "rewards/chosen": 0.000835418701171875, "rewards/margins": 0.17076608538627625, "rewards/rejected": -0.16993065178394318, "step": 34 }, { "epoch": 0.03888888888888889, "grad_norm": 2.2960546016693115, "learning_rate": 1.2962962962962962e-05, "logits/chosen": 1.685824990272522, "logits/rejected": 1.6909576654434204, "logps/chosen": -82.73219299316406, "logps/rejected": -104.31394958496094, "loss": 0.6274, "rewards/accuracies": 1.0, "rewards/chosen": -0.023319626227021217, "rewards/margins": 0.139292910695076, "rewards/rejected": -0.16261254251003265, "step": 35 }, { "epoch": 0.04, "grad_norm": 3.587982416152954, "learning_rate": 1.3333333333333333e-05, "logits/chosen": 1.6964161396026611, "logits/rejected": 1.681493878364563, "logps/chosen": -80.79440307617188, "logps/rejected": -255.21246337890625, "loss": 0.58, "rewards/accuracies": 1.0, "rewards/chosen": -0.11494827270507812, "rewards/margins": 0.24139708280563354, "rewards/rejected": -0.35634535551071167, "step": 36 }, { "epoch": 0.04111111111111111, "grad_norm": 2.2253293991088867, "learning_rate": 1.3703703703703704e-05, "logits/chosen": 2.12424898147583, "logits/rejected": 2.1394336223602295, "logps/chosen": -115.24567413330078, "logps/rejected": -159.59548950195312, "loss": 0.7046, "rewards/accuracies": 0.5, "rewards/chosen": -0.05182495340704918, "rewards/margins": -0.019217681139707565, "rewards/rejected": -0.032607272267341614, "step": 37 }, { "epoch": 0.042222222222222223, "grad_norm": 1.2612190246582031, "learning_rate": 1.4074074074074075e-05, "logits/chosen": 1.6499874591827393, "logits/rejected": 1.6448873281478882, "logps/chosen": -85.41886138916016, "logps/rejected": -100.32022094726562, "loss": 0.643, "rewards/accuracies": 1.0, "rewards/chosen": -0.08860988914966583, "rewards/margins": 0.10447102040052414, "rewards/rejected": -0.19308090209960938, "step": 38 }, { "epoch": 0.043333333333333335, "grad_norm": 1.4537714719772339, "learning_rate": 1.4444444444444444e-05, "logits/chosen": 1.7761759757995605, "logits/rejected": 1.76588773727417, "logps/chosen": -98.62322998046875, "logps/rejected": -99.86741638183594, "loss": 0.6419, "rewards/accuracies": 1.0, "rewards/chosen": -0.06925506889820099, "rewards/margins": 0.10565796494483948, "rewards/rejected": -0.17491301894187927, "step": 39 }, { "epoch": 0.044444444444444446, "grad_norm": 1.4564423561096191, "learning_rate": 1.4814814814814815e-05, "logits/chosen": 1.981092929840088, "logits/rejected": 1.9781627655029297, "logps/chosen": -75.01533508300781, "logps/rejected": -94.89653015136719, "loss": 0.6892, "rewards/accuracies": 1.0, "rewards/chosen": -0.04973487928509712, "rewards/margins": 0.007932281121611595, "rewards/rejected": -0.05766715854406357, "step": 40 }, { "epoch": 0.04555555555555556, "grad_norm": 1.0509978532791138, "learning_rate": 1.5185185185185186e-05, "logits/chosen": 1.5225938558578491, "logits/rejected": 1.534175992012024, "logps/chosen": -49.76565170288086, "logps/rejected": -66.34207153320312, "loss": 0.6709, "rewards/accuracies": 1.0, "rewards/chosen": 0.001028250902891159, "rewards/margins": 0.04521980136632919, "rewards/rejected": -0.044191550463438034, "step": 41 }, { "epoch": 0.04666666666666667, "grad_norm": 1.711934208869934, "learning_rate": 1.5555555555555555e-05, "logits/chosen": 1.8962862491607666, "logits/rejected": 1.897247314453125, "logps/chosen": -122.20228576660156, "logps/rejected": -132.25216674804688, "loss": 0.62, "rewards/accuracies": 1.0, "rewards/chosen": -0.050075531005859375, "rewards/margins": 0.1530940979719162, "rewards/rejected": -0.20316962897777557, "step": 42 }, { "epoch": 0.04777777777777778, "grad_norm": 2.3543484210968018, "learning_rate": 1.5925925925925926e-05, "logits/chosen": 1.5449676513671875, "logits/rejected": 1.5653077363967896, "logps/chosen": -119.62660217285156, "logps/rejected": -246.40908813476562, "loss": 0.5988, "rewards/accuracies": 1.0, "rewards/chosen": -0.14300155639648438, "rewards/margins": 0.19946135580539703, "rewards/rejected": -0.3424629271030426, "step": 43 }, { "epoch": 0.04888888888888889, "grad_norm": 1.7880644798278809, "learning_rate": 1.62962962962963e-05, "logits/chosen": 1.7310125827789307, "logits/rejected": 1.7422864437103271, "logps/chosen": -99.23149108886719, "logps/rejected": -157.47816467285156, "loss": 0.6009, "rewards/accuracies": 1.0, "rewards/chosen": -0.023392483592033386, "rewards/margins": 0.19408226013183594, "rewards/rejected": -0.21747475862503052, "step": 44 }, { "epoch": 0.05, "grad_norm": 1.3956307172775269, "learning_rate": 1.6666666666666667e-05, "logits/chosen": 1.8954123258590698, "logits/rejected": 1.8871264457702637, "logps/chosen": -67.74177551269531, "logps/rejected": -82.30088806152344, "loss": 0.6792, "rewards/accuracies": 0.5, "rewards/chosen": -0.0016712192445993423, "rewards/margins": 0.029368208721280098, "rewards/rejected": -0.03103942796587944, "step": 45 }, { "epoch": 0.051111111111111114, "grad_norm": 2.003779172897339, "learning_rate": 1.7037037037037038e-05, "logits/chosen": 1.5863382816314697, "logits/rejected": 1.5584232807159424, "logps/chosen": -75.53849029541016, "logps/rejected": -145.7866668701172, "loss": 0.5855, "rewards/accuracies": 0.5, "rewards/chosen": -0.05518150329589844, "rewards/margins": 0.25419387221336365, "rewards/rejected": -0.3093753755092621, "step": 46 }, { "epoch": 0.052222222222222225, "grad_norm": 1.8270933628082275, "learning_rate": 1.740740740740741e-05, "logits/chosen": 1.884851336479187, "logits/rejected": 1.8724640607833862, "logps/chosen": -88.71607971191406, "logps/rejected": -98.39363098144531, "loss": 0.6689, "rewards/accuracies": 0.5, "rewards/chosen": -0.012431716546416283, "rewards/margins": 0.05319824069738388, "rewards/rejected": -0.06562995910644531, "step": 47 }, { "epoch": 0.05333333333333334, "grad_norm": 1.748655915260315, "learning_rate": 1.777777777777778e-05, "logits/chosen": 1.624474048614502, "logits/rejected": 1.623655915260315, "logps/chosen": -38.20254135131836, "logps/rejected": -114.11456298828125, "loss": 0.5958, "rewards/accuracies": 1.0, "rewards/chosen": -0.05160503461956978, "rewards/margins": 0.20960788428783417, "rewards/rejected": -0.26121291518211365, "step": 48 }, { "epoch": 0.05444444444444444, "grad_norm": 1.7473804950714111, "learning_rate": 1.814814814814815e-05, "logits/chosen": 1.4195319414138794, "logits/rejected": 1.436247706413269, "logps/chosen": -76.2438735961914, "logps/rejected": -100.14031982421875, "loss": 0.6842, "rewards/accuracies": 0.5, "rewards/chosen": -0.13276425004005432, "rewards/margins": 0.021105773746967316, "rewards/rejected": -0.15387001633644104, "step": 49 }, { "epoch": 0.05555555555555555, "grad_norm": 1.574964165687561, "learning_rate": 1.8518518518518518e-05, "logits/chosen": 1.791442632675171, "logits/rejected": 1.7936460971832275, "logps/chosen": -61.06804656982422, "logps/rejected": -86.86553192138672, "loss": 0.6509, "rewards/accuracies": 0.5, "rewards/chosen": -0.09531307220458984, "rewards/margins": 0.09109802544116974, "rewards/rejected": -0.18641109764575958, "step": 50 }, { "epoch": 0.056666666666666664, "grad_norm": 2.060523271560669, "learning_rate": 1.888888888888889e-05, "logits/chosen": 1.544954538345337, "logits/rejected": 1.553112268447876, "logps/chosen": -81.31719970703125, "logps/rejected": -117.30509948730469, "loss": 0.6519, "rewards/accuracies": 1.0, "rewards/chosen": -0.16208267211914062, "rewards/margins": 0.08607254922389984, "rewards/rejected": -0.24815522134304047, "step": 51 }, { "epoch": 0.057777777777777775, "grad_norm": 2.8339874744415283, "learning_rate": 1.925925925925926e-05, "logits/chosen": 1.7908909320831299, "logits/rejected": 1.7629308700561523, "logps/chosen": -111.61192321777344, "logps/rejected": -216.4823455810547, "loss": 0.5073, "rewards/accuracies": 1.0, "rewards/chosen": -0.12723807990550995, "rewards/margins": 0.47036972641944885, "rewards/rejected": -0.5976077914237976, "step": 52 }, { "epoch": 0.058888888888888886, "grad_norm": 2.0283331871032715, "learning_rate": 1.962962962962963e-05, "logits/chosen": 1.6896365880966187, "logits/rejected": 1.7048797607421875, "logps/chosen": -95.7759780883789, "logps/rejected": -137.47116088867188, "loss": 0.5939, "rewards/accuracies": 1.0, "rewards/chosen": -0.2904655635356903, "rewards/margins": 0.21527443826198578, "rewards/rejected": -0.5057399868965149, "step": 53 }, { "epoch": 0.06, "grad_norm": 1.7469266653060913, "learning_rate": 2e-05, "logits/chosen": 1.7697209119796753, "logits/rejected": 1.7386178970336914, "logps/chosen": -74.900390625, "logps/rejected": -109.59677124023438, "loss": 0.6283, "rewards/accuracies": 1.0, "rewards/chosen": -0.16575202345848083, "rewards/margins": 0.13905715942382812, "rewards/rejected": -0.30480918288230896, "step": 54 }, { "epoch": 0.06111111111111111, "grad_norm": 3.0066206455230713, "learning_rate": 2.037037037037037e-05, "logits/chosen": 1.7325794696807861, "logits/rejected": 1.732982873916626, "logps/chosen": -85.84654235839844, "logps/rejected": -228.00827026367188, "loss": 0.3864, "rewards/accuracies": 1.0, "rewards/chosen": -0.18922272324562073, "rewards/margins": 0.7647788524627686, "rewards/rejected": -0.9540016651153564, "step": 55 }, { "epoch": 0.06222222222222222, "grad_norm": 1.8644434213638306, "learning_rate": 2.074074074074074e-05, "logits/chosen": 1.8657515048980713, "logits/rejected": 1.8879640102386475, "logps/chosen": -78.72285461425781, "logps/rejected": -101.9678726196289, "loss": 0.6704, "rewards/accuracies": 1.0, "rewards/chosen": -0.07206115871667862, "rewards/margins": 0.04620666801929474, "rewards/rejected": -0.11826782673597336, "step": 56 }, { "epoch": 0.06333333333333334, "grad_norm": 1.9023540019989014, "learning_rate": 2.111111111111111e-05, "logits/chosen": 1.7792305946350098, "logits/rejected": 1.7769355773925781, "logps/chosen": -92.96107482910156, "logps/rejected": -117.98345947265625, "loss": 0.5819, "rewards/accuracies": 1.0, "rewards/chosen": -0.16865500807762146, "rewards/margins": 0.24228018522262573, "rewards/rejected": -0.4109352231025696, "step": 57 }, { "epoch": 0.06444444444444444, "grad_norm": 2.1627721786499023, "learning_rate": 2.148148148148148e-05, "logits/chosen": 1.7198245525360107, "logits/rejected": 1.723222017288208, "logps/chosen": -94.08353424072266, "logps/rejected": -178.54150390625, "loss": 0.4899, "rewards/accuracies": 0.5, "rewards/chosen": -0.2021978348493576, "rewards/margins": 0.6163568496704102, "rewards/rejected": -0.818554699420929, "step": 58 }, { "epoch": 0.06555555555555556, "grad_norm": 1.9864712953567505, "learning_rate": 2.1851851851851852e-05, "logits/chosen": 1.6879370212554932, "logits/rejected": 1.6846476793289185, "logps/chosen": -87.98388671875, "logps/rejected": -73.5925064086914, "loss": 0.6964, "rewards/accuracies": 0.5, "rewards/chosen": -0.2818569242954254, "rewards/margins": 0.0013933032751083374, "rewards/rejected": -0.28325024247169495, "step": 59 }, { "epoch": 0.06666666666666667, "grad_norm": 2.4778943061828613, "learning_rate": 2.2222222222222223e-05, "logits/chosen": 1.7185921669006348, "logits/rejected": 1.6852530241012573, "logps/chosen": -77.68290710449219, "logps/rejected": -116.40189361572266, "loss": 0.5175, "rewards/accuracies": 1.0, "rewards/chosen": -0.18148326873779297, "rewards/margins": 0.39011478424072266, "rewards/rejected": -0.5715980529785156, "step": 60 }, { "epoch": 0.06777777777777778, "grad_norm": 3.0705056190490723, "learning_rate": 2.2592592592592594e-05, "logits/chosen": 1.549826979637146, "logits/rejected": 1.552480936050415, "logps/chosen": -105.76761627197266, "logps/rejected": -115.78946685791016, "loss": 0.6661, "rewards/accuracies": 1.0, "rewards/chosen": -0.38608360290527344, "rewards/margins": 0.05504380166530609, "rewards/rejected": -0.4411274194717407, "step": 61 }, { "epoch": 0.06888888888888889, "grad_norm": 1.790187120437622, "learning_rate": 2.2962962962962965e-05, "logits/chosen": 1.7170157432556152, "logits/rejected": 1.7121881246566772, "logps/chosen": -85.77484130859375, "logps/rejected": -113.1250991821289, "loss": 0.625, "rewards/accuracies": 1.0, "rewards/chosen": -0.32881850004196167, "rewards/margins": 0.1448974758386612, "rewards/rejected": -0.47371599078178406, "step": 62 }, { "epoch": 0.07, "grad_norm": 1.897810459136963, "learning_rate": 2.3333333333333336e-05, "logits/chosen": 1.765171766281128, "logits/rejected": 1.7672276496887207, "logps/chosen": -80.08262634277344, "logps/rejected": -111.42768859863281, "loss": 0.5917, "rewards/accuracies": 1.0, "rewards/chosen": -0.06182899326086044, "rewards/margins": 0.21553078293800354, "rewards/rejected": -0.2773597836494446, "step": 63 }, { "epoch": 0.07111111111111111, "grad_norm": 1.2691526412963867, "learning_rate": 2.3703703703703707e-05, "logits/chosen": 1.4514224529266357, "logits/rejected": 1.4330427646636963, "logps/chosen": -65.18637084960938, "logps/rejected": -93.48057556152344, "loss": 0.5987, "rewards/accuracies": 1.0, "rewards/chosen": -0.24078866839408875, "rewards/margins": 0.19913369417190552, "rewards/rejected": -0.4399223327636719, "step": 64 }, { "epoch": 0.07222222222222222, "grad_norm": 2.2606098651885986, "learning_rate": 2.4074074074074074e-05, "logits/chosen": 2.0007028579711914, "logits/rejected": 1.9412254095077515, "logps/chosen": -116.66047668457031, "logps/rejected": -141.75929260253906, "loss": 0.5542, "rewards/accuracies": 1.0, "rewards/chosen": -0.649639904499054, "rewards/margins": 0.3094291687011719, "rewards/rejected": -0.9590690732002258, "step": 65 }, { "epoch": 0.07333333333333333, "grad_norm": 2.2353146076202393, "learning_rate": 2.4444444444444445e-05, "logits/chosen": 1.7158548831939697, "logits/rejected": 1.7214916944503784, "logps/chosen": -104.28157043457031, "logps/rejected": -128.12850952148438, "loss": 0.6088, "rewards/accuracies": 1.0, "rewards/chosen": -0.6537491083145142, "rewards/margins": 0.17663806676864624, "rewards/rejected": -0.8303871154785156, "step": 66 }, { "epoch": 0.07444444444444444, "grad_norm": 2.199907064437866, "learning_rate": 2.4814814814814816e-05, "logits/chosen": 1.6395995616912842, "logits/rejected": 1.6217764616012573, "logps/chosen": -94.70133972167969, "logps/rejected": -153.1549835205078, "loss": 0.3378, "rewards/accuracies": 1.0, "rewards/chosen": -0.3395376205444336, "rewards/margins": 0.929686963558197, "rewards/rejected": -1.2692246437072754, "step": 67 }, { "epoch": 0.07555555555555556, "grad_norm": 2.14762282371521, "learning_rate": 2.5185185185185183e-05, "logits/chosen": 1.9198837280273438, "logits/rejected": 1.899277925491333, "logps/chosen": -114.6630859375, "logps/rejected": -201.70877075195312, "loss": 0.2805, "rewards/accuracies": 1.0, "rewards/chosen": -0.6231940984725952, "rewards/margins": 1.2016448974609375, "rewards/rejected": -1.8248389959335327, "step": 68 }, { "epoch": 0.07666666666666666, "grad_norm": 1.5109450817108154, "learning_rate": 2.5555555555555554e-05, "logits/chosen": 1.5922627449035645, "logits/rejected": 1.611965298652649, "logps/chosen": -58.126338958740234, "logps/rejected": -67.91969299316406, "loss": 0.6805, "rewards/accuracies": 0.5, "rewards/chosen": -0.2385282665491104, "rewards/margins": 0.026756267994642258, "rewards/rejected": -0.26528453826904297, "step": 69 }, { "epoch": 0.07777777777777778, "grad_norm": 1.9288115501403809, "learning_rate": 2.5925925925925925e-05, "logits/chosen": 1.8296849727630615, "logits/rejected": 1.829742431640625, "logps/chosen": -95.935302734375, "logps/rejected": -198.1670379638672, "loss": 0.1968, "rewards/accuracies": 1.0, "rewards/chosen": -0.14311543107032776, "rewards/margins": 1.5504560470581055, "rewards/rejected": -1.6935715675354004, "step": 70 }, { "epoch": 0.07888888888888888, "grad_norm": 4.266966819763184, "learning_rate": 2.6296296296296296e-05, "logits/chosen": 1.6613762378692627, "logits/rejected": 1.6321707963943481, "logps/chosen": -140.92919921875, "logps/rejected": -194.3029022216797, "loss": 0.5384, "rewards/accuracies": 0.5, "rewards/chosen": -0.9971637725830078, "rewards/margins": 0.42647480964660645, "rewards/rejected": -1.4236385822296143, "step": 71 }, { "epoch": 0.08, "grad_norm": 1.7725245952606201, "learning_rate": 2.6666666666666667e-05, "logits/chosen": 1.6231601238250732, "logits/rejected": 1.5862537622451782, "logps/chosen": -57.777034759521484, "logps/rejected": -63.0678596496582, "loss": 0.6308, "rewards/accuracies": 0.5, "rewards/chosen": -0.15093021094799042, "rewards/margins": 0.15535412728786469, "rewards/rejected": -0.3062843382358551, "step": 72 }, { "epoch": 0.0811111111111111, "grad_norm": 2.8387224674224854, "learning_rate": 2.7037037037037037e-05, "logits/chosen": 1.9348162412643433, "logits/rejected": 1.9357737302780151, "logps/chosen": -106.52703857421875, "logps/rejected": -122.8449478149414, "loss": 0.6421, "rewards/accuracies": 0.5, "rewards/chosen": -0.27306097745895386, "rewards/margins": 0.1155468076467514, "rewards/rejected": -0.38860780000686646, "step": 73 }, { "epoch": 0.08222222222222222, "grad_norm": 2.1512317657470703, "learning_rate": 2.7407407407407408e-05, "logits/chosen": 1.599566102027893, "logits/rejected": 1.602142333984375, "logps/chosen": -127.11915588378906, "logps/rejected": -255.64981079101562, "loss": 0.4013, "rewards/accuracies": 0.5, "rewards/chosen": -1.485198736190796, "rewards/margins": 1.3596293926239014, "rewards/rejected": -2.8448281288146973, "step": 74 }, { "epoch": 0.08333333333333333, "grad_norm": 1.493711233139038, "learning_rate": 2.777777777777778e-05, "logits/chosen": 1.4468770027160645, "logits/rejected": 1.4404709339141846, "logps/chosen": -97.31271362304688, "logps/rejected": -157.94195556640625, "loss": 0.1339, "rewards/accuracies": 1.0, "rewards/chosen": -0.7884476184844971, "rewards/margins": 1.9556832313537598, "rewards/rejected": -2.7441306114196777, "step": 75 }, { "epoch": 0.08444444444444445, "grad_norm": 1.6876009702682495, "learning_rate": 2.814814814814815e-05, "logits/chosen": 1.7240617275238037, "logits/rejected": 1.737077236175537, "logps/chosen": -76.52220916748047, "logps/rejected": -91.74371337890625, "loss": 0.5912, "rewards/accuracies": 0.5, "rewards/chosen": -0.28651106357574463, "rewards/margins": 0.2854497730731964, "rewards/rejected": -0.5719608068466187, "step": 76 }, { "epoch": 0.08555555555555555, "grad_norm": 2.2114036083221436, "learning_rate": 2.851851851851852e-05, "logits/chosen": 1.5422091484069824, "logits/rejected": 1.5338011980056763, "logps/chosen": -93.317138671875, "logps/rejected": -101.80584716796875, "loss": 0.7367, "rewards/accuracies": 0.5, "rewards/chosen": -1.1777729988098145, "rewards/margins": -0.029868945479393005, "rewards/rejected": -1.1479040384292603, "step": 77 }, { "epoch": 0.08666666666666667, "grad_norm": 2.213303327560425, "learning_rate": 2.8888888888888888e-05, "logits/chosen": 1.3021442890167236, "logits/rejected": 1.3342745304107666, "logps/chosen": -66.1043701171875, "logps/rejected": -74.09657287597656, "loss": 0.6875, "rewards/accuracies": 0.5, "rewards/chosen": -0.45370960235595703, "rewards/margins": 0.028228551149368286, "rewards/rejected": -0.4819381833076477, "step": 78 }, { "epoch": 0.08777777777777777, "grad_norm": 1.2050501108169556, "learning_rate": 2.925925925925926e-05, "logits/chosen": 1.549387812614441, "logits/rejected": 1.5419864654541016, "logps/chosen": -53.489715576171875, "logps/rejected": -77.66796875, "loss": 0.414, "rewards/accuracies": 1.0, "rewards/chosen": -0.1888553649187088, "rewards/margins": 0.8734670877456665, "rewards/rejected": -1.0623224973678589, "step": 79 }, { "epoch": 0.08888888888888889, "grad_norm": 1.8362517356872559, "learning_rate": 2.962962962962963e-05, "logits/chosen": 1.4876477718353271, "logits/rejected": 1.4949002265930176, "logps/chosen": -94.93338775634766, "logps/rejected": -110.24267578125, "loss": 0.4786, "rewards/accuracies": 1.0, "rewards/chosen": -0.9818565249443054, "rewards/margins": 0.5171284079551697, "rewards/rejected": -1.498984932899475, "step": 80 }, { "epoch": 0.09, "grad_norm": 2.4406867027282715, "learning_rate": 3e-05, "logits/chosen": 1.787015438079834, "logits/rejected": 1.8846383094787598, "logps/chosen": -123.8663330078125, "logps/rejected": -155.14784240722656, "loss": 0.6004, "rewards/accuracies": 0.5, "rewards/chosen": -1.1795952320098877, "rewards/margins": 0.614324688911438, "rewards/rejected": -1.7939199209213257, "step": 81 }, { "epoch": 0.09111111111111111, "grad_norm": 1.6418097019195557, "learning_rate": 3.037037037037037e-05, "logits/chosen": 1.5709681510925293, "logits/rejected": 1.5515928268432617, "logps/chosen": -109.72415161132812, "logps/rejected": -201.74673461914062, "loss": 0.2112, "rewards/accuracies": 1.0, "rewards/chosen": -1.070885181427002, "rewards/margins": 2.5237319469451904, "rewards/rejected": -3.5946171283721924, "step": 82 }, { "epoch": 0.09222222222222222, "grad_norm": 4.790890216827393, "learning_rate": 3.074074074074074e-05, "logits/chosen": 1.2771456241607666, "logits/rejected": 1.2531366348266602, "logps/chosen": -105.06478118896484, "logps/rejected": -100.83853149414062, "loss": 0.7815, "rewards/accuracies": 0.5, "rewards/chosen": -1.5182292461395264, "rewards/margins": -0.06937482953071594, "rewards/rejected": -1.4488544464111328, "step": 83 }, { "epoch": 0.09333333333333334, "grad_norm": 1.794681429862976, "learning_rate": 3.111111111111111e-05, "logits/chosen": 1.3017021417617798, "logits/rejected": 1.3239482641220093, "logps/chosen": -74.48855590820312, "logps/rejected": -105.38568115234375, "loss": 0.3862, "rewards/accuracies": 1.0, "rewards/chosen": -0.40635186433792114, "rewards/margins": 0.8958755135536194, "rewards/rejected": -1.3022273778915405, "step": 84 }, { "epoch": 0.09444444444444444, "grad_norm": 1.2266799211502075, "learning_rate": 3.148148148148148e-05, "logits/chosen": 1.2236592769622803, "logits/rejected": 1.223386287689209, "logps/chosen": -91.18898010253906, "logps/rejected": -95.43978881835938, "loss": 0.6349, "rewards/accuracies": 0.5, "rewards/chosen": -1.050331473350525, "rewards/margins": 0.13474655151367188, "rewards/rejected": -1.1850780248641968, "step": 85 }, { "epoch": 0.09555555555555556, "grad_norm": 1.798143744468689, "learning_rate": 3.185185185185185e-05, "logits/chosen": 1.7932617664337158, "logits/rejected": 1.7851908206939697, "logps/chosen": -121.31637573242188, "logps/rejected": -161.26097106933594, "loss": 0.2435, "rewards/accuracies": 1.0, "rewards/chosen": -1.3449885845184326, "rewards/margins": 1.319892406463623, "rewards/rejected": -2.6648812294006348, "step": 86 }, { "epoch": 0.09666666666666666, "grad_norm": 4.006865501403809, "learning_rate": 3.222222222222223e-05, "logits/chosen": 1.53578782081604, "logits/rejected": 1.5837293863296509, "logps/chosen": -96.06893920898438, "logps/rejected": -68.64762878417969, "loss": 1.1278, "rewards/accuracies": 0.0, "rewards/chosen": -1.1463409662246704, "rewards/margins": -0.7286598682403564, "rewards/rejected": -0.41768109798431396, "step": 87 }, { "epoch": 0.09777777777777778, "grad_norm": 1.6974478960037231, "learning_rate": 3.25925925925926e-05, "logits/chosen": 1.7367286682128906, "logits/rejected": 1.737941026687622, "logps/chosen": -64.71845245361328, "logps/rejected": -106.40463256835938, "loss": 0.309, "rewards/accuracies": 1.0, "rewards/chosen": -0.5299625396728516, "rewards/margins": 1.020270586013794, "rewards/rejected": -1.5502331256866455, "step": 88 }, { "epoch": 0.09888888888888889, "grad_norm": 3.300274133682251, "learning_rate": 3.2962962962962964e-05, "logits/chosen": 1.7683470249176025, "logits/rejected": 1.7794362306594849, "logps/chosen": -105.37261199951172, "logps/rejected": -148.53079223632812, "loss": 0.4874, "rewards/accuracies": 1.0, "rewards/chosen": -1.327021837234497, "rewards/margins": 0.5162055492401123, "rewards/rejected": -1.8432273864746094, "step": 89 }, { "epoch": 0.1, "grad_norm": 1.47025728225708, "learning_rate": 3.3333333333333335e-05, "logits/chosen": 1.4212974309921265, "logits/rejected": 1.422567367553711, "logps/chosen": -48.77056121826172, "logps/rejected": -81.56902313232422, "loss": 0.385, "rewards/accuracies": 1.0, "rewards/chosen": -0.5088943839073181, "rewards/margins": 0.8691510558128357, "rewards/rejected": -1.3780454397201538, "step": 90 }, { "epoch": 0.10111111111111111, "grad_norm": 4.700313091278076, "learning_rate": 3.3703703703703706e-05, "logits/chosen": 1.5764694213867188, "logits/rejected": 1.5587718486785889, "logps/chosen": -136.34291076660156, "logps/rejected": -116.45772552490234, "loss": 0.8538, "rewards/accuracies": 0.5, "rewards/chosen": -1.6738388538360596, "rewards/margins": -0.27041375637054443, "rewards/rejected": -1.4034249782562256, "step": 91 }, { "epoch": 0.10222222222222223, "grad_norm": 1.2101768255233765, "learning_rate": 3.4074074074074077e-05, "logits/chosen": 1.479997158050537, "logits/rejected": 1.4712399244308472, "logps/chosen": -139.7488250732422, "logps/rejected": -201.36907958984375, "loss": 0.0945, "rewards/accuracies": 1.0, "rewards/chosen": -2.978419065475464, "rewards/margins": 2.327021837234497, "rewards/rejected": -5.305440902709961, "step": 92 }, { "epoch": 0.10333333333333333, "grad_norm": 1.2305712699890137, "learning_rate": 3.444444444444445e-05, "logits/chosen": 1.3023488521575928, "logits/rejected": 1.3083076477050781, "logps/chosen": -74.62271118164062, "logps/rejected": -122.06697082519531, "loss": 0.2447, "rewards/accuracies": 1.0, "rewards/chosen": -0.8665428161621094, "rewards/margins": 1.6845862865447998, "rewards/rejected": -2.551129102706909, "step": 93 }, { "epoch": 0.10444444444444445, "grad_norm": 1.858676791191101, "learning_rate": 3.481481481481482e-05, "logits/chosen": 1.1540656089782715, "logits/rejected": 1.1550817489624023, "logps/chosen": -96.77842712402344, "logps/rejected": -114.7625732421875, "loss": 0.3455, "rewards/accuracies": 1.0, "rewards/chosen": -2.3275973796844482, "rewards/margins": 0.9714933037757874, "rewards/rejected": -3.29909086227417, "step": 94 }, { "epoch": 0.10555555555555556, "grad_norm": 2.060750961303711, "learning_rate": 3.518518518518519e-05, "logits/chosen": 1.265845775604248, "logits/rejected": 1.2881114482879639, "logps/chosen": -66.60663604736328, "logps/rejected": -90.7275390625, "loss": 0.4054, "rewards/accuracies": 1.0, "rewards/chosen": -0.6439563632011414, "rewards/margins": 0.7417579889297485, "rewards/rejected": -1.3857142925262451, "step": 95 }, { "epoch": 0.10666666666666667, "grad_norm": 0.8952216506004333, "learning_rate": 3.555555555555556e-05, "logits/chosen": 1.465663194656372, "logits/rejected": 1.4667658805847168, "logps/chosen": -69.3805160522461, "logps/rejected": -116.96279907226562, "loss": 0.1023, "rewards/accuracies": 1.0, "rewards/chosen": -0.3900684416294098, "rewards/margins": 2.274697780609131, "rewards/rejected": -2.664766311645508, "step": 96 }, { "epoch": 0.10777777777777778, "grad_norm": 2.9724342823028564, "learning_rate": 3.592592592592593e-05, "logits/chosen": 1.611980676651001, "logits/rejected": 1.6153910160064697, "logps/chosen": -89.46023559570312, "logps/rejected": -112.84394836425781, "loss": 0.521, "rewards/accuracies": 0.5, "rewards/chosen": -1.141866683959961, "rewards/margins": 1.1625434160232544, "rewards/rejected": -2.304410219192505, "step": 97 }, { "epoch": 0.10888888888888888, "grad_norm": 0.8037980198860168, "learning_rate": 3.62962962962963e-05, "logits/chosen": 0.9927407503128052, "logits/rejected": 1.1304383277893066, "logps/chosen": -66.63685607910156, "logps/rejected": -189.22171020507812, "loss": 0.0694, "rewards/accuracies": 1.0, "rewards/chosen": -1.0246148109436035, "rewards/margins": 2.974991798400879, "rewards/rejected": -3.9996066093444824, "step": 98 }, { "epoch": 0.11, "grad_norm": 4.517455577850342, "learning_rate": 3.6666666666666666e-05, "logits/chosen": 1.5243792533874512, "logits/rejected": 1.5181734561920166, "logps/chosen": -137.09681701660156, "logps/rejected": -153.77027893066406, "loss": 0.8543, "rewards/accuracies": 0.5, "rewards/chosen": -2.6188578605651855, "rewards/margins": 0.978363573551178, "rewards/rejected": -3.597221612930298, "step": 99 }, { "epoch": 0.1111111111111111, "grad_norm": 1.5612226724624634, "learning_rate": 3.7037037037037037e-05, "logits/chosen": 1.312394142150879, "logits/rejected": 1.323418378829956, "logps/chosen": -62.346981048583984, "logps/rejected": -91.62457275390625, "loss": 0.4075, "rewards/accuracies": 1.0, "rewards/chosen": -0.8858341574668884, "rewards/margins": 0.9504798650741577, "rewards/rejected": -1.8363139629364014, "step": 100 }, { "epoch": 0.11222222222222222, "grad_norm": 1.9922595024108887, "learning_rate": 3.740740740740741e-05, "logits/chosen": 1.5976425409317017, "logits/rejected": 1.584465742111206, "logps/chosen": -57.799354553222656, "logps/rejected": -101.38355255126953, "loss": 0.3549, "rewards/accuracies": 1.0, "rewards/chosen": -0.49500980973243713, "rewards/margins": 1.4845099449157715, "rewards/rejected": -1.9795196056365967, "step": 101 }, { "epoch": 0.11333333333333333, "grad_norm": 0.9857215285301208, "learning_rate": 3.777777777777778e-05, "logits/chosen": 1.9635066986083984, "logits/rejected": 1.8774685859680176, "logps/chosen": -385.9969482421875, "logps/rejected": -533.520751953125, "loss": 0.0201, "rewards/accuracies": 1.0, "rewards/chosen": -10.580450057983398, "rewards/margins": 5.847229480743408, "rewards/rejected": -16.42768096923828, "step": 102 }, { "epoch": 0.11444444444444445, "grad_norm": 1.5797380208969116, "learning_rate": 3.814814814814815e-05, "logits/chosen": 1.3814945220947266, "logits/rejected": 1.3732110261917114, "logps/chosen": -65.51873779296875, "logps/rejected": -124.97151184082031, "loss": 0.226, "rewards/accuracies": 1.0, "rewards/chosen": -0.7811294794082642, "rewards/margins": 2.0249152183532715, "rewards/rejected": -2.806044578552246, "step": 103 }, { "epoch": 0.11555555555555555, "grad_norm": 32.19942092895508, "learning_rate": 3.851851851851852e-05, "logits/chosen": 1.9044795036315918, "logits/rejected": 1.9398353099822998, "logps/chosen": -392.8150939941406, "logps/rejected": -283.0675964355469, "loss": 7.0023, "rewards/accuracies": 0.5, "rewards/chosen": -11.779252052307129, "rewards/margins": -3.6223227977752686, "rewards/rejected": -8.156929016113281, "step": 104 }, { "epoch": 0.11666666666666667, "grad_norm": 9.056168556213379, "learning_rate": 3.888888888888889e-05, "logits/chosen": 1.4609990119934082, "logits/rejected": 1.4576020240783691, "logps/chosen": -174.0449981689453, "logps/rejected": -191.82113647460938, "loss": 0.7463, "rewards/accuracies": 0.5, "rewards/chosen": -5.855546951293945, "rewards/margins": 0.11034560203552246, "rewards/rejected": -5.965892791748047, "step": 105 }, { "epoch": 0.11777777777777777, "grad_norm": 0.22639404237270355, "learning_rate": 3.925925925925926e-05, "logits/chosen": 1.5053675174713135, "logits/rejected": 1.5287299156188965, "logps/chosen": -103.06468963623047, "logps/rejected": -192.09066772460938, "loss": 0.015, "rewards/accuracies": 1.0, "rewards/chosen": -1.786957025527954, "rewards/margins": 4.192770004272461, "rewards/rejected": -5.979726791381836, "step": 106 }, { "epoch": 0.11888888888888889, "grad_norm": 2.7650609016418457, "learning_rate": 3.962962962962963e-05, "logits/chosen": 0.875189483165741, "logits/rejected": 0.8775643110275269, "logps/chosen": -114.65999603271484, "logps/rejected": -123.29772186279297, "loss": 0.5428, "rewards/accuracies": 0.5, "rewards/chosen": -4.322582721710205, "rewards/margins": 0.3902958035469055, "rewards/rejected": -4.712878704071045, "step": 107 }, { "epoch": 0.12, "grad_norm": 2.326057195663452, "learning_rate": 4e-05, "logits/chosen": 1.2276611328125, "logits/rejected": 1.2140488624572754, "logps/chosen": -145.82791137695312, "logps/rejected": -198.41839599609375, "loss": 0.3238, "rewards/accuracies": 1.0, "rewards/chosen": -4.437378883361816, "rewards/margins": 2.5805184841156006, "rewards/rejected": -7.017897129058838, "step": 108 }, { "epoch": 0.12111111111111111, "grad_norm": 2.3543052673339844, "learning_rate": 4.0370370370370374e-05, "logits/chosen": 1.2371888160705566, "logits/rejected": 1.2405812740325928, "logps/chosen": -133.5179901123047, "logps/rejected": -167.326171875, "loss": 0.2671, "rewards/accuracies": 1.0, "rewards/chosen": -5.264418601989746, "rewards/margins": 1.3283827304840088, "rewards/rejected": -6.592801094055176, "step": 109 }, { "epoch": 0.12222222222222222, "grad_norm": 1.3328789472579956, "learning_rate": 4.074074074074074e-05, "logits/chosen": 1.0871175527572632, "logits/rejected": 1.1041843891143799, "logps/chosen": -66.52952575683594, "logps/rejected": -86.1075439453125, "loss": 0.5155, "rewards/accuracies": 0.5, "rewards/chosen": -0.9652903079986572, "rewards/margins": 0.7563764452934265, "rewards/rejected": -1.7216668128967285, "step": 110 }, { "epoch": 0.12333333333333334, "grad_norm": 1.572357177734375, "learning_rate": 4.111111111111111e-05, "logits/chosen": 0.9135056734085083, "logits/rejected": 0.8970118761062622, "logps/chosen": -70.09820556640625, "logps/rejected": -102.05812072753906, "loss": 0.2713, "rewards/accuracies": 1.0, "rewards/chosen": -1.448287010192871, "rewards/margins": 1.2904077768325806, "rewards/rejected": -2.738694667816162, "step": 111 }, { "epoch": 0.12444444444444444, "grad_norm": 2.505145788192749, "learning_rate": 4.148148148148148e-05, "logits/chosen": 1.7135692834854126, "logits/rejected": 1.7289738655090332, "logps/chosen": -156.36756896972656, "logps/rejected": -304.5160827636719, "loss": 0.1914, "rewards/accuracies": 1.0, "rewards/chosen": -3.172088146209717, "rewards/margins": 5.9899091720581055, "rewards/rejected": -9.16199779510498, "step": 112 }, { "epoch": 0.12555555555555556, "grad_norm": 1.7437183856964111, "learning_rate": 4.185185185185185e-05, "logits/chosen": 1.1896926164627075, "logits/rejected": 1.1929486989974976, "logps/chosen": -47.36754608154297, "logps/rejected": -66.55594635009766, "loss": 0.4095, "rewards/accuracies": 1.0, "rewards/chosen": -0.46504804491996765, "rewards/margins": 0.7185218930244446, "rewards/rejected": -1.1835699081420898, "step": 113 }, { "epoch": 0.12666666666666668, "grad_norm": 1.4554977416992188, "learning_rate": 4.222222222222222e-05, "logits/chosen": 1.5314819812774658, "logits/rejected": 1.5435881614685059, "logps/chosen": -163.78147888183594, "logps/rejected": -212.66212463378906, "loss": 0.1056, "rewards/accuracies": 1.0, "rewards/chosen": -3.663634777069092, "rewards/margins": 2.2370223999023438, "rewards/rejected": -5.900656700134277, "step": 114 }, { "epoch": 0.12777777777777777, "grad_norm": 1.5873533487319946, "learning_rate": 4.259259259259259e-05, "logits/chosen": 1.0804393291473389, "logits/rejected": 1.1274089813232422, "logps/chosen": -84.89960479736328, "logps/rejected": -153.0184783935547, "loss": 0.3814, "rewards/accuracies": 0.5, "rewards/chosen": -1.5335112810134888, "rewards/margins": 2.66007924079895, "rewards/rejected": -4.19359016418457, "step": 115 }, { "epoch": 0.1288888888888889, "grad_norm": 15.798798561096191, "learning_rate": 4.296296296296296e-05, "logits/chosen": 1.0453428030014038, "logits/rejected": 0.9885035753250122, "logps/chosen": -114.61045837402344, "logps/rejected": -57.198997497558594, "loss": 3.9676, "rewards/accuracies": 0.0, "rewards/chosen": -4.602366924285889, "rewards/margins": -3.6607542037963867, "rewards/rejected": -0.9416126608848572, "step": 116 }, { "epoch": 0.13, "grad_norm": 1.802281379699707, "learning_rate": 4.3333333333333334e-05, "logits/chosen": 1.052227258682251, "logits/rejected": 1.084881067276001, "logps/chosen": -160.79132080078125, "logps/rejected": -266.9136047363281, "loss": 0.1288, "rewards/accuracies": 1.0, "rewards/chosen": -6.280523300170898, "rewards/margins": 3.9985947608947754, "rewards/rejected": -10.279117584228516, "step": 117 }, { "epoch": 0.13111111111111112, "grad_norm": 12.939175605773926, "learning_rate": 4.3703703703703705e-05, "logits/chosen": 1.2408828735351562, "logits/rejected": 1.2029078006744385, "logps/chosen": -244.38262939453125, "logps/rejected": -216.74526977539062, "loss": 2.008, "rewards/accuracies": 0.5, "rewards/chosen": -8.37761402130127, "rewards/margins": -0.29110264778137207, "rewards/rejected": -8.086511611938477, "step": 118 }, { "epoch": 0.1322222222222222, "grad_norm": 16.59554100036621, "learning_rate": 4.4074074074074076e-05, "logits/chosen": 1.3328406810760498, "logits/rejected": 1.324540138244629, "logps/chosen": -166.54127502441406, "logps/rejected": -120.67322540283203, "loss": 2.0769, "rewards/accuracies": 0.0, "rewards/chosen": -5.704492568969727, "rewards/margins": -1.8372573852539062, "rewards/rejected": -3.867234945297241, "step": 119 }, { "epoch": 0.13333333333333333, "grad_norm": 13.450758934020996, "learning_rate": 4.4444444444444447e-05, "logits/chosen": 1.3500542640686035, "logits/rejected": 1.2090803384780884, "logps/chosen": -141.12115478515625, "logps/rejected": -148.06033325195312, "loss": 2.0517, "rewards/accuracies": 0.5, "rewards/chosen": -4.260946273803711, "rewards/margins": 0.9149446487426758, "rewards/rejected": -5.175890922546387, "step": 120 }, { "epoch": 0.13444444444444445, "grad_norm": 15.019047737121582, "learning_rate": 4.481481481481482e-05, "logits/chosen": 1.3904173374176025, "logits/rejected": 1.3981860876083374, "logps/chosen": -177.68191528320312, "logps/rejected": -101.91424560546875, "loss": 2.9378, "rewards/accuracies": 0.0, "rewards/chosen": -4.749836444854736, "rewards/margins": -2.877058982849121, "rewards/rejected": -1.8727777004241943, "step": 121 }, { "epoch": 0.13555555555555557, "grad_norm": 0.29529455304145813, "learning_rate": 4.518518518518519e-05, "logits/chosen": 1.5433100461959839, "logits/rejected": 1.5453602075576782, "logps/chosen": -145.46864318847656, "logps/rejected": -258.5157470703125, "loss": 0.0108, "rewards/accuracies": 1.0, "rewards/chosen": -2.745546817779541, "rewards/margins": 5.638494491577148, "rewards/rejected": -8.384040832519531, "step": 122 }, { "epoch": 0.13666666666666666, "grad_norm": 2.8554294109344482, "learning_rate": 4.555555555555556e-05, "logits/chosen": 0.989469587802887, "logits/rejected": 0.998894989490509, "logps/chosen": -67.05355834960938, "logps/rejected": -67.64825439453125, "loss": 0.7481, "rewards/accuracies": 0.5, "rewards/chosen": -1.4845073223114014, "rewards/margins": -0.0019320845603942871, "rewards/rejected": -1.4825752973556519, "step": 123 }, { "epoch": 0.13777777777777778, "grad_norm": 0.20606175065040588, "learning_rate": 4.592592592592593e-05, "logits/chosen": 1.1468162536621094, "logits/rejected": 1.1282365322113037, "logps/chosen": -89.57289123535156, "logps/rejected": -158.4973602294922, "loss": 0.0106, "rewards/accuracies": 1.0, "rewards/chosen": -1.388584852218628, "rewards/margins": 4.674880027770996, "rewards/rejected": -6.063465118408203, "step": 124 }, { "epoch": 0.1388888888888889, "grad_norm": 1.9783127307891846, "learning_rate": 4.62962962962963e-05, "logits/chosen": 1.1873929500579834, "logits/rejected": 1.163598895072937, "logps/chosen": -82.33207702636719, "logps/rejected": -136.6854248046875, "loss": 0.3511, "rewards/accuracies": 0.5, "rewards/chosen": -1.3722432851791382, "rewards/margins": 3.23903489112854, "rewards/rejected": -4.611278057098389, "step": 125 }, { "epoch": 0.14, "grad_norm": 1.5958621501922607, "learning_rate": 4.666666666666667e-05, "logits/chosen": 1.0048305988311768, "logits/rejected": 1.0061559677124023, "logps/chosen": -80.93070983886719, "logps/rejected": -138.30177307128906, "loss": 0.3133, "rewards/accuracies": 1.0, "rewards/chosen": -1.889986515045166, "rewards/margins": 2.4265527725219727, "rewards/rejected": -4.316539287567139, "step": 126 }, { "epoch": 0.1411111111111111, "grad_norm": 1.9847005605697632, "learning_rate": 4.703703703703704e-05, "logits/chosen": 1.0960899591445923, "logits/rejected": 1.143872857093811, "logps/chosen": -84.49383544921875, "logps/rejected": -103.65217590332031, "loss": 0.3109, "rewards/accuracies": 1.0, "rewards/chosen": -1.109788179397583, "rewards/margins": 1.0089964866638184, "rewards/rejected": -2.1187846660614014, "step": 127 }, { "epoch": 0.14222222222222222, "grad_norm": 1.1922110319137573, "learning_rate": 4.740740740740741e-05, "logits/chosen": 1.416774034500122, "logits/rejected": 1.4077658653259277, "logps/chosen": -69.85118865966797, "logps/rejected": -129.65106201171875, "loss": 0.1157, "rewards/accuracies": 1.0, "rewards/chosen": -0.963848888874054, "rewards/margins": 2.1057958602905273, "rewards/rejected": -3.0696449279785156, "step": 128 }, { "epoch": 0.14333333333333334, "grad_norm": 1.3719844818115234, "learning_rate": 4.7777777777777784e-05, "logits/chosen": 1.5415847301483154, "logits/rejected": 1.527397871017456, "logps/chosen": -76.88078308105469, "logps/rejected": -146.98727416992188, "loss": 0.1285, "rewards/accuracies": 1.0, "rewards/chosen": -0.5743347406387329, "rewards/margins": 3.2011055946350098, "rewards/rejected": -3.775440216064453, "step": 129 }, { "epoch": 0.14444444444444443, "grad_norm": 0.5151137709617615, "learning_rate": 4.814814814814815e-05, "logits/chosen": 1.499194860458374, "logits/rejected": 1.503077507019043, "logps/chosen": -146.52764892578125, "logps/rejected": -246.77291870117188, "loss": 0.0369, "rewards/accuracies": 1.0, "rewards/chosen": -3.5399162769317627, "rewards/margins": 5.0705180168151855, "rewards/rejected": -8.610434532165527, "step": 130 }, { "epoch": 0.14555555555555555, "grad_norm": 3.9016013145446777, "learning_rate": 4.851851851851852e-05, "logits/chosen": 1.3073267936706543, "logits/rejected": 1.3325181007385254, "logps/chosen": -133.61697387695312, "logps/rejected": -184.1026611328125, "loss": 0.2872, "rewards/accuracies": 1.0, "rewards/chosen": -3.253795862197876, "rewards/margins": 2.23928165435791, "rewards/rejected": -5.493077278137207, "step": 131 }, { "epoch": 0.14666666666666667, "grad_norm": 0.6159332394599915, "learning_rate": 4.888888888888889e-05, "logits/chosen": 1.3477895259857178, "logits/rejected": 1.301927089691162, "logps/chosen": -183.1279296875, "logps/rejected": -272.909912109375, "loss": 0.0493, "rewards/accuracies": 1.0, "rewards/chosen": -6.717911720275879, "rewards/margins": 3.984036445617676, "rewards/rejected": -10.701948165893555, "step": 132 }, { "epoch": 0.14777777777777779, "grad_norm": 0.9195361733436584, "learning_rate": 4.925925925925926e-05, "logits/chosen": 1.345101237297058, "logits/rejected": 1.3546230792999268, "logps/chosen": -69.36975860595703, "logps/rejected": -152.46054077148438, "loss": 0.1203, "rewards/accuracies": 1.0, "rewards/chosen": -0.6549488306045532, "rewards/margins": 2.902041435241699, "rewards/rejected": -3.556990146636963, "step": 133 }, { "epoch": 0.14888888888888888, "grad_norm": 0.10239502787590027, "learning_rate": 4.962962962962963e-05, "logits/chosen": 1.1393225193023682, "logits/rejected": 1.0948164463043213, "logps/chosen": -173.59759521484375, "logps/rejected": -286.6609191894531, "loss": 0.0039, "rewards/accuracies": 1.0, "rewards/chosen": -4.507652759552002, "rewards/margins": 5.980779647827148, "rewards/rejected": -10.488431930541992, "step": 134 }, { "epoch": 0.15, "grad_norm": 3.124730110168457, "learning_rate": 5e-05, "logits/chosen": 0.8166018724441528, "logits/rejected": 0.8005633354187012, "logps/chosen": -123.33694458007812, "logps/rejected": -208.25225830078125, "loss": 0.7827, "rewards/accuracies": 0.5, "rewards/chosen": -4.188698768615723, "rewards/margins": 6.018104553222656, "rewards/rejected": -10.206803321838379, "step": 135 }, { "epoch": 0.1511111111111111, "grad_norm": 1.5003418922424316, "learning_rate": 4.9999981248545295e-05, "logits/chosen": 1.2776434421539307, "logits/rejected": 1.289499282836914, "logps/chosen": -81.35440063476562, "logps/rejected": -122.35768127441406, "loss": 0.1588, "rewards/accuracies": 1.0, "rewards/chosen": -1.641808271408081, "rewards/margins": 1.9101128578186035, "rewards/rejected": -3.5519211292266846, "step": 136 }, { "epoch": 0.15222222222222223, "grad_norm": 0.09202127158641815, "learning_rate": 4.99999249942093e-05, "logits/chosen": 0.9005594849586487, "logits/rejected": 0.9221370220184326, "logps/chosen": -114.71597290039062, "logps/rejected": -262.2054443359375, "loss": 0.0058, "rewards/accuracies": 1.0, "rewards/chosen": -3.1650655269622803, "rewards/margins": 8.033636093139648, "rewards/rejected": -11.198701858520508, "step": 137 }, { "epoch": 0.15333333333333332, "grad_norm": 2.434523820877075, "learning_rate": 4.9999831237076424e-05, "logits/chosen": 1.2907774448394775, "logits/rejected": 1.2946574687957764, "logps/chosen": -179.28375244140625, "logps/rejected": -279.2033386230469, "loss": 0.1918, "rewards/accuracies": 1.0, "rewards/chosen": -6.696291923522949, "rewards/margins": 3.420769214630127, "rewards/rejected": -10.117060661315918, "step": 138 }, { "epoch": 0.15444444444444444, "grad_norm": 0.872352123260498, "learning_rate": 4.999969997728729e-05, "logits/chosen": 0.9462923407554626, "logits/rejected": 0.9417994618415833, "logps/chosen": -72.51927185058594, "logps/rejected": -128.62954711914062, "loss": 0.0517, "rewards/accuracies": 1.0, "rewards/chosen": -1.2891497611999512, "rewards/margins": 2.9601688385009766, "rewards/rejected": -4.249318599700928, "step": 139 }, { "epoch": 0.15555555555555556, "grad_norm": 0.3511127829551697, "learning_rate": 4.999953121503881e-05, "logits/chosen": 1.041064977645874, "logits/rejected": 1.0550111532211304, "logps/chosen": -156.4183349609375, "logps/rejected": -284.1146240234375, "loss": 0.0245, "rewards/accuracies": 1.0, "rewards/chosen": -5.098064422607422, "rewards/margins": 7.723272323608398, "rewards/rejected": -12.82133674621582, "step": 140 }, { "epoch": 0.15666666666666668, "grad_norm": 4.0040693283081055, "learning_rate": 4.999932495058415e-05, "logits/chosen": 0.7963643074035645, "logits/rejected": 0.7766878604888916, "logps/chosen": -78.50262451171875, "logps/rejected": -95.00994873046875, "loss": 0.6298, "rewards/accuracies": 0.5, "rewards/chosen": -2.114379405975342, "rewards/margins": 1.7242661714553833, "rewards/rejected": -3.8386454582214355, "step": 141 }, { "epoch": 0.15777777777777777, "grad_norm": 9.214180946350098, "learning_rate": 4.999908118423273e-05, "logits/chosen": 0.9585148096084595, "logits/rejected": 0.9418221116065979, "logps/chosen": -126.07600402832031, "logps/rejected": -160.81719970703125, "loss": 0.5181, "rewards/accuracies": 0.5, "rewards/chosen": -4.23268985748291, "rewards/margins": 1.4678010940551758, "rewards/rejected": -5.700490951538086, "step": 142 }, { "epoch": 0.15888888888888889, "grad_norm": 0.22340665757656097, "learning_rate": 4.9998799916350224e-05, "logits/chosen": 0.8733727931976318, "logits/rejected": 0.8818212747573853, "logps/chosen": -116.42982482910156, "logps/rejected": -226.25442504882812, "loss": 0.0121, "rewards/accuracies": 1.0, "rewards/chosen": -4.79374885559082, "rewards/margins": 4.4101409912109375, "rewards/rejected": -9.203889846801758, "step": 143 }, { "epoch": 0.16, "grad_norm": 5.794167995452881, "learning_rate": 4.999848114735858e-05, "logits/chosen": 0.8997300863265991, "logits/rejected": 0.9110772609710693, "logps/chosen": -175.35791015625, "logps/rejected": -206.57888793945312, "loss": 0.5119, "rewards/accuracies": 0.5, "rewards/chosen": -6.938982963562012, "rewards/margins": 2.4744555950164795, "rewards/rejected": -9.41343879699707, "step": 144 }, { "epoch": 0.16111111111111112, "grad_norm": 2.0388023853302, "learning_rate": 4.999812487773597e-05, "logits/chosen": 1.0242507457733154, "logits/rejected": 1.0349136590957642, "logps/chosen": -142.8397216796875, "logps/rejected": -175.53140258789062, "loss": 0.1935, "rewards/accuracies": 1.0, "rewards/chosen": -4.637690544128418, "rewards/margins": 1.643040657043457, "rewards/rejected": -6.280731201171875, "step": 145 }, { "epoch": 0.1622222222222222, "grad_norm": 2.915043592453003, "learning_rate": 4.9997731108016856e-05, "logits/chosen": 0.7855651378631592, "logits/rejected": 0.781745433807373, "logps/chosen": -103.03030395507812, "logps/rejected": -120.555419921875, "loss": 0.6662, "rewards/accuracies": 0.5, "rewards/chosen": -3.85825777053833, "rewards/margins": 1.147570252418518, "rewards/rejected": -5.005827903747559, "step": 146 }, { "epoch": 0.16333333333333333, "grad_norm": 3.4638118743896484, "learning_rate": 4.999729983879192e-05, "logits/chosen": 1.421783208847046, "logits/rejected": 1.4258348941802979, "logps/chosen": -122.23641967773438, "logps/rejected": -152.86712646484375, "loss": 0.5376, "rewards/accuracies": 0.5, "rewards/chosen": -4.872014999389648, "rewards/margins": 1.3686140775680542, "rewards/rejected": -6.240629196166992, "step": 147 }, { "epoch": 0.16444444444444445, "grad_norm": 18.839500427246094, "learning_rate": 4.9996831070708136e-05, "logits/chosen": 1.0232353210449219, "logits/rejected": 1.0181726217269897, "logps/chosen": -164.22381591796875, "logps/rejected": -90.71993255615234, "loss": 4.4787, "rewards/accuracies": 0.5, "rewards/chosen": -6.298289775848389, "rewards/margins": -4.094772815704346, "rewards/rejected": -2.203516960144043, "step": 148 }, { "epoch": 0.16555555555555557, "grad_norm": 0.009234304539859295, "learning_rate": 4.9996324804468695e-05, "logits/chosen": 1.081225037574768, "logits/rejected": 1.082531452178955, "logps/chosen": -79.72488403320312, "logps/rejected": -199.94778442382812, "loss": 0.3466, "rewards/accuracies": 1.0, "rewards/chosen": -1.1995495557785034, "rewards/margins": 6.442424774169922, "rewards/rejected": -7.641974449157715, "step": 149 }, { "epoch": 0.16666666666666666, "grad_norm": 0.060363467782735825, "learning_rate": 4.999578104083307e-05, "logits/chosen": 1.1977136135101318, "logits/rejected": 1.2284719944000244, "logps/chosen": -223.39620971679688, "logps/rejected": -393.90850830078125, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -7.425601959228516, "rewards/margins": 9.423572540283203, "rewards/rejected": -16.84917449951172, "step": 150 }, { "epoch": 0.16777777777777778, "grad_norm": 3.5991640090942383, "learning_rate": 4.999519978061696e-05, "logits/chosen": 0.9973795413970947, "logits/rejected": 1.0171294212341309, "logps/chosen": -116.09068298339844, "logps/rejected": -138.1101837158203, "loss": 0.2653, "rewards/accuracies": 1.0, "rewards/chosen": -3.8928585052490234, "rewards/margins": 1.3767584562301636, "rewards/rejected": -5.269617080688477, "step": 151 }, { "epoch": 0.1688888888888889, "grad_norm": 4.037815570831299, "learning_rate": 4.9994581024692324e-05, "logits/chosen": 1.1273605823516846, "logits/rejected": 1.109969973564148, "logps/chosen": -99.56399536132812, "logps/rejected": -93.7951889038086, "loss": 0.6762, "rewards/accuracies": 0.5, "rewards/chosen": -2.017375946044922, "rewards/margins": 0.09155121445655823, "rewards/rejected": -2.1089272499084473, "step": 152 }, { "epoch": 0.17, "grad_norm": 2.1894826889038086, "learning_rate": 4.999392477398737e-05, "logits/chosen": 0.7049329280853271, "logits/rejected": 0.7389975786209106, "logps/chosen": -125.2940673828125, "logps/rejected": -162.79666137695312, "loss": 0.2107, "rewards/accuracies": 1.0, "rewards/chosen": -3.769153118133545, "rewards/margins": 1.4516644477844238, "rewards/rejected": -5.220817565917969, "step": 153 }, { "epoch": 0.1711111111111111, "grad_norm": 4.346587181091309, "learning_rate": 4.9993231029486544e-05, "logits/chosen": 0.9578156471252441, "logits/rejected": 0.9708178639411926, "logps/chosen": -108.83192443847656, "logps/rejected": -153.83740234375, "loss": 0.3736, "rewards/accuracies": 0.5, "rewards/chosen": -3.7089853286743164, "rewards/margins": 2.223890542984009, "rewards/rejected": -5.932875633239746, "step": 154 }, { "epoch": 0.17222222222222222, "grad_norm": 2.5044848918914795, "learning_rate": 4.999249979223055e-05, "logits/chosen": 1.1679116487503052, "logits/rejected": 1.181077241897583, "logps/chosen": -161.9647674560547, "logps/rejected": -196.61988830566406, "loss": 0.1878, "rewards/accuracies": 1.0, "rewards/chosen": -7.7564191818237305, "rewards/margins": 1.7475008964538574, "rewards/rejected": -9.50391960144043, "step": 155 }, { "epoch": 0.17333333333333334, "grad_norm": 1.6763205528259277, "learning_rate": 4.999173106331633e-05, "logits/chosen": 0.9922992587089539, "logits/rejected": 0.9602010250091553, "logps/chosen": -172.23097229003906, "logps/rejected": -218.07412719726562, "loss": 0.0747, "rewards/accuracies": 1.0, "rewards/chosen": -5.730727672576904, "rewards/margins": 3.119436502456665, "rewards/rejected": -8.850164413452148, "step": 156 }, { "epoch": 0.17444444444444446, "grad_norm": 3.875519037246704, "learning_rate": 4.999092484389707e-05, "logits/chosen": 0.8287628889083862, "logits/rejected": 0.8315039873123169, "logps/chosen": -83.39523315429688, "logps/rejected": -99.36117553710938, "loss": 0.5583, "rewards/accuracies": 0.5, "rewards/chosen": -1.7585126161575317, "rewards/margins": 0.35260865092277527, "rewards/rejected": -2.11112117767334, "step": 157 }, { "epoch": 0.17555555555555555, "grad_norm": 13.091570854187012, "learning_rate": 4.999008113518218e-05, "logits/chosen": 0.7459204792976379, "logits/rejected": 0.6323636174201965, "logps/chosen": -158.21563720703125, "logps/rejected": -134.81561279296875, "loss": 2.8792, "rewards/accuracies": 0.5, "rewards/chosen": -5.661371231079102, "rewards/margins": -1.1220248937606812, "rewards/rejected": -4.539346218109131, "step": 158 }, { "epoch": 0.17666666666666667, "grad_norm": 2.8447210788726807, "learning_rate": 4.9989199938437326e-05, "logits/chosen": 1.0176703929901123, "logits/rejected": 1.0209418535232544, "logps/chosen": -152.46978759765625, "logps/rejected": -172.83274841308594, "loss": 0.2192, "rewards/accuracies": 1.0, "rewards/chosen": -4.7676239013671875, "rewards/margins": 1.6251521110534668, "rewards/rejected": -6.392776012420654, "step": 159 }, { "epoch": 0.17777777777777778, "grad_norm": 0.3525926470756531, "learning_rate": 4.9988281254984414e-05, "logits/chosen": 0.7372010946273804, "logits/rejected": 0.7442748546600342, "logps/chosen": -99.61550903320312, "logps/rejected": -204.60079956054688, "loss": 0.0228, "rewards/accuracies": 1.0, "rewards/chosen": -2.857567310333252, "rewards/margins": 7.730868816375732, "rewards/rejected": -10.588436126708984, "step": 160 }, { "epoch": 0.17888888888888888, "grad_norm": 12.022184371948242, "learning_rate": 4.9987325086201564e-05, "logits/chosen": 1.1958391666412354, "logits/rejected": 1.1773509979248047, "logps/chosen": -161.49583435058594, "logps/rejected": -125.13191223144531, "loss": 1.5714, "rewards/accuracies": 0.5, "rewards/chosen": -5.613795280456543, "rewards/margins": -1.1612695455551147, "rewards/rejected": -4.452525615692139, "step": 161 }, { "epoch": 0.18, "grad_norm": 0.20275698602199554, "learning_rate": 4.9986331433523156e-05, "logits/chosen": 0.9507079124450684, "logits/rejected": 0.9461952447891235, "logps/chosen": -264.2442932128906, "logps/rejected": -366.00396728515625, "loss": 0.0074, "rewards/accuracies": 1.0, "rewards/chosen": -11.924967765808105, "rewards/margins": 5.447269916534424, "rewards/rejected": -17.372238159179688, "step": 162 }, { "epoch": 0.1811111111111111, "grad_norm": 4.029171466827393, "learning_rate": 4.9985300298439764e-05, "logits/chosen": 0.7035899758338928, "logits/rejected": 0.6865305304527283, "logps/chosen": -60.92144775390625, "logps/rejected": -83.11170959472656, "loss": 0.3832, "rewards/accuracies": 1.0, "rewards/chosen": -0.8441837430000305, "rewards/margins": 1.1943950653076172, "rewards/rejected": -2.038578748703003, "step": 163 }, { "epoch": 0.18222222222222223, "grad_norm": 12.392237663269043, "learning_rate": 4.998423168249823e-05, "logits/chosen": 0.9455868005752563, "logits/rejected": 0.9003270864486694, "logps/chosen": -210.270751953125, "logps/rejected": -199.78587341308594, "loss": 1.869, "rewards/accuracies": 0.5, "rewards/chosen": -7.7148590087890625, "rewards/margins": 2.073317050933838, "rewards/rejected": -9.788176536560059, "step": 164 }, { "epoch": 0.18333333333333332, "grad_norm": 0.029438573867082596, "learning_rate": 4.998312558730159e-05, "logits/chosen": 0.7175464630126953, "logits/rejected": 0.7310649156570435, "logps/chosen": -156.21469116210938, "logps/rejected": -333.5090637207031, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -4.7102532386779785, "rewards/margins": 8.140151977539062, "rewards/rejected": -12.8504056930542, "step": 165 }, { "epoch": 0.18444444444444444, "grad_norm": 13.36672306060791, "learning_rate": 4.9981982014509126e-05, "logits/chosen": 0.7917848825454712, "logits/rejected": 0.79588782787323, "logps/chosen": -232.8542938232422, "logps/rejected": -212.86184692382812, "loss": 2.3542, "rewards/accuracies": 0.5, "rewards/chosen": -6.563692569732666, "rewards/margins": 1.693019151687622, "rewards/rejected": -8.256711959838867, "step": 166 }, { "epoch": 0.18555555555555556, "grad_norm": 13.304723739624023, "learning_rate": 4.998080096583632e-05, "logits/chosen": 1.3950765132904053, "logits/rejected": 1.4173359870910645, "logps/chosen": -180.1671142578125, "logps/rejected": -190.47010803222656, "loss": 0.9606, "rewards/accuracies": 0.5, "rewards/chosen": -5.196643829345703, "rewards/margins": 0.17163759469985962, "rewards/rejected": -5.368281364440918, "step": 167 }, { "epoch": 0.18666666666666668, "grad_norm": 0.002705258782953024, "learning_rate": 4.997958244305489e-05, "logits/chosen": 0.9235450029373169, "logits/rejected": 0.9213215112686157, "logps/chosen": -127.92697143554688, "logps/rejected": -327.5588073730469, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -2.689639091491699, "rewards/margins": 10.383021354675293, "rewards/rejected": -13.072660446166992, "step": 168 }, { "epoch": 0.18777777777777777, "grad_norm": 9.228717803955078, "learning_rate": 4.997832644799276e-05, "logits/chosen": 0.5383849143981934, "logits/rejected": 0.5158981084823608, "logps/chosen": -203.05264282226562, "logps/rejected": -192.52810668945312, "loss": 1.7202, "rewards/accuracies": 0.0, "rewards/chosen": -11.264119148254395, "rewards/margins": -1.4632501602172852, "rewards/rejected": -9.80086898803711, "step": 169 }, { "epoch": 0.18888888888888888, "grad_norm": 0.6578316688537598, "learning_rate": 4.997703298253406e-05, "logits/chosen": 0.7544792890548706, "logits/rejected": 0.7465224266052246, "logps/chosen": -60.84596633911133, "logps/rejected": -162.48208618164062, "loss": 0.05, "rewards/accuracies": 1.0, "rewards/chosen": -0.8069696426391602, "rewards/margins": 5.2520751953125, "rewards/rejected": -6.05904483795166, "step": 170 }, { "epoch": 0.19, "grad_norm": 1.0633653402328491, "learning_rate": 4.997570204861915e-05, "logits/chosen": 0.9544074535369873, "logits/rejected": 0.964580774307251, "logps/chosen": -92.8996810913086, "logps/rejected": -133.84820556640625, "loss": 0.2537, "rewards/accuracies": 1.0, "rewards/chosen": -2.9659035205841064, "rewards/margins": 2.2728445529937744, "rewards/rejected": -5.238748073577881, "step": 171 }, { "epoch": 0.19111111111111112, "grad_norm": 0.04443879798054695, "learning_rate": 4.9974333648244584e-05, "logits/chosen": 0.6101863980293274, "logits/rejected": 0.6141354441642761, "logps/chosen": -103.84774780273438, "logps/rejected": -197.12783813476562, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": -3.1148998737335205, "rewards/margins": 6.378216743469238, "rewards/rejected": -9.49311637878418, "step": 172 }, { "epoch": 0.1922222222222222, "grad_norm": 7.201624393463135, "learning_rate": 4.997292778346312e-05, "logits/chosen": 0.8587669134140015, "logits/rejected": 0.8760038614273071, "logps/chosen": -132.0455780029297, "logps/rejected": -160.0282440185547, "loss": 0.8076, "rewards/accuracies": 0.5, "rewards/chosen": -3.780181884765625, "rewards/margins": 1.5048305988311768, "rewards/rejected": -5.285012722015381, "step": 173 }, { "epoch": 0.19333333333333333, "grad_norm": 7.482102394104004, "learning_rate": 4.997148445638371e-05, "logits/chosen": 0.9877073764801025, "logits/rejected": 0.9868419170379639, "logps/chosen": -188.32281494140625, "logps/rejected": -187.48397827148438, "loss": 0.5687, "rewards/accuracies": 1.0, "rewards/chosen": -6.712105751037598, "rewards/margins": 0.2764599323272705, "rewards/rejected": -6.988565444946289, "step": 174 }, { "epoch": 0.19444444444444445, "grad_norm": 2.35133695602417, "learning_rate": 4.9970003669171525e-05, "logits/chosen": 0.8454946875572205, "logits/rejected": 0.8463809490203857, "logps/chosen": -95.83246612548828, "logps/rejected": -167.45065307617188, "loss": 0.1968, "rewards/accuracies": 1.0, "rewards/chosen": -2.4286632537841797, "rewards/margins": 3.3916015625, "rewards/rejected": -5.82026481628418, "step": 175 }, { "epoch": 0.19555555555555557, "grad_norm": 3.581979274749756, "learning_rate": 4.9968485424047916e-05, "logits/chosen": 0.8193004131317139, "logits/rejected": 0.7891165614128113, "logps/chosen": -136.52040100097656, "logps/rejected": -148.72315979003906, "loss": 0.5011, "rewards/accuracies": 0.5, "rewards/chosen": -4.286550998687744, "rewards/margins": 0.5561413764953613, "rewards/rejected": -4.8426923751831055, "step": 176 }, { "epoch": 0.19666666666666666, "grad_norm": 3.4804611206054688, "learning_rate": 4.996692972329043e-05, "logits/chosen": 1.0166072845458984, "logits/rejected": 1.042419672012329, "logps/chosen": -190.80902099609375, "logps/rejected": -233.5691680908203, "loss": 0.142, "rewards/accuracies": 1.0, "rewards/chosen": -5.4673237800598145, "rewards/margins": 3.4434902667999268, "rewards/rejected": -8.91081428527832, "step": 177 }, { "epoch": 0.19777777777777777, "grad_norm": 1.216090202331543, "learning_rate": 4.996533656923279e-05, "logits/chosen": 0.6919585466384888, "logits/rejected": 0.685686469078064, "logps/chosen": -70.78359985351562, "logps/rejected": -86.85313415527344, "loss": 0.4348, "rewards/accuracies": 0.5, "rewards/chosen": -1.6090431213378906, "rewards/margins": 0.9462345242500305, "rewards/rejected": -2.5552775859832764, "step": 178 }, { "epoch": 0.1988888888888889, "grad_norm": 7.7435302734375, "learning_rate": 4.996370596426492e-05, "logits/chosen": 0.8129884004592896, "logits/rejected": 0.7940980195999146, "logps/chosen": -108.20877075195312, "logps/rejected": -146.63616943359375, "loss": 0.7441, "rewards/accuracies": 0.5, "rewards/chosen": -2.417509078979492, "rewards/margins": 1.7018548250198364, "rewards/rejected": -4.119363784790039, "step": 179 }, { "epoch": 0.2, "grad_norm": 6.263914108276367, "learning_rate": 4.996203791083291e-05, "logits/chosen": 0.6220883131027222, "logits/rejected": 0.6484527587890625, "logps/chosen": -66.89225769042969, "logps/rejected": -90.65328979492188, "loss": 0.5821, "rewards/accuracies": 0.5, "rewards/chosen": -1.708361268043518, "rewards/margins": 0.6097134947776794, "rewards/rejected": -2.3180747032165527, "step": 180 }, { "epoch": 0.2011111111111111, "grad_norm": 3.588029623031616, "learning_rate": 4.9960332411439035e-05, "logits/chosen": 0.8486727476119995, "logits/rejected": 0.8487511277198792, "logps/chosen": -116.4251937866211, "logps/rejected": -161.7483367919922, "loss": 0.2688, "rewards/accuracies": 1.0, "rewards/chosen": -3.0894970893859863, "rewards/margins": 2.269402503967285, "rewards/rejected": -5.35890007019043, "step": 181 }, { "epoch": 0.20222222222222222, "grad_norm": 8.705796241760254, "learning_rate": 4.995858946864176e-05, "logits/chosen": 0.8535293340682983, "logits/rejected": 0.9470130801200867, "logps/chosen": -117.60743713378906, "logps/rejected": -82.78147888183594, "loss": 1.4221, "rewards/accuracies": 0.0, "rewards/chosen": -2.9612207412719727, "rewards/margins": -1.1383323669433594, "rewards/rejected": -1.8228881359100342, "step": 182 }, { "epoch": 0.20333333333333334, "grad_norm": 3.0968480110168457, "learning_rate": 4.995680908505568e-05, "logits/chosen": 0.7181336283683777, "logits/rejected": 0.7182783484458923, "logps/chosen": -124.79554748535156, "logps/rejected": -168.82699584960938, "loss": 0.392, "rewards/accuracies": 0.5, "rewards/chosen": -4.991233825683594, "rewards/margins": 1.8461244106292725, "rewards/rejected": -6.837357997894287, "step": 183 }, { "epoch": 0.20444444444444446, "grad_norm": 1.3149911165237427, "learning_rate": 4.9954991263351584e-05, "logits/chosen": 0.30352526903152466, "logits/rejected": 0.3040367364883423, "logps/chosen": -45.75611114501953, "logps/rejected": -82.826904296875, "loss": 0.2683, "rewards/accuracies": 1.0, "rewards/chosen": -0.35752391815185547, "rewards/margins": 1.6788386106491089, "rewards/rejected": -2.036362409591675, "step": 184 }, { "epoch": 0.20555555555555555, "grad_norm": 2.1677024364471436, "learning_rate": 4.9953136006256415e-05, "logits/chosen": 0.6579700708389282, "logits/rejected": 0.6361050605773926, "logps/chosen": -83.34025573730469, "logps/rejected": -117.58058166503906, "loss": 0.2146, "rewards/accuracies": 1.0, "rewards/chosen": -1.5314428806304932, "rewards/margins": 1.4600225687026978, "rewards/rejected": -2.9914655685424805, "step": 185 }, { "epoch": 0.20666666666666667, "grad_norm": 0.9257028698921204, "learning_rate": 4.995124331655329e-05, "logits/chosen": 0.7522636651992798, "logits/rejected": 0.8084293603897095, "logps/chosen": -104.05084228515625, "logps/rejected": -185.1298370361328, "loss": 0.0702, "rewards/accuracies": 1.0, "rewards/chosen": -2.9067068099975586, "rewards/margins": 3.541466236114502, "rewards/rejected": -6.4481730461120605, "step": 186 }, { "epoch": 0.20777777777777778, "grad_norm": 5.037042140960693, "learning_rate": 4.9949313197081435e-05, "logits/chosen": 1.0069222450256348, "logits/rejected": 1.0482374429702759, "logps/chosen": -143.95779418945312, "logps/rejected": -166.33587646484375, "loss": 0.3392, "rewards/accuracies": 1.0, "rewards/chosen": -3.8999226093292236, "rewards/margins": 0.9195562601089478, "rewards/rejected": -4.819478988647461, "step": 187 }, { "epoch": 0.2088888888888889, "grad_norm": 9.531548500061035, "learning_rate": 4.9947345650736286e-05, "logits/chosen": 0.8179107904434204, "logits/rejected": 0.8211343288421631, "logps/chosen": -196.22116088867188, "logps/rejected": -266.9176025390625, "loss": 0.331, "rewards/accuracies": 1.0, "rewards/chosen": -8.020105361938477, "rewards/margins": 3.5429439544677734, "rewards/rejected": -11.56304931640625, "step": 188 }, { "epoch": 0.21, "grad_norm": 5.689389228820801, "learning_rate": 4.994534068046937e-05, "logits/chosen": 0.5554993152618408, "logits/rejected": 0.5809474587440491, "logps/chosen": -76.22923278808594, "logps/rejected": -71.9394760131836, "loss": 1.0426, "rewards/accuracies": 0.0, "rewards/chosen": -2.070128917694092, "rewards/margins": -0.6054730415344238, "rewards/rejected": -1.464655876159668, "step": 189 }, { "epoch": 0.2111111111111111, "grad_norm": 1.6259000301361084, "learning_rate": 4.994329828928838e-05, "logits/chosen": 0.5795235633850098, "logits/rejected": 0.5716588497161865, "logps/chosen": -103.54397583007812, "logps/rejected": -141.988525390625, "loss": 0.1202, "rewards/accuracies": 1.0, "rewards/chosen": -2.193939208984375, "rewards/margins": 2.0905370712280273, "rewards/rejected": -4.284476280212402, "step": 190 }, { "epoch": 0.21222222222222223, "grad_norm": 6.204521656036377, "learning_rate": 4.994121848025714e-05, "logits/chosen": 0.40955448150634766, "logits/rejected": 0.4000794291496277, "logps/chosen": -61.5738525390625, "logps/rejected": -50.18740463256836, "loss": 0.8702, "rewards/accuracies": 0.0, "rewards/chosen": -1.5796359777450562, "rewards/margins": -0.3182252049446106, "rewards/rejected": -1.2614107131958008, "step": 191 }, { "epoch": 0.21333333333333335, "grad_norm": 10.165814399719238, "learning_rate": 4.993910125649561e-05, "logits/chosen": 0.6426538228988647, "logits/rejected": 0.7245504260063171, "logps/chosen": -99.31587982177734, "logps/rejected": -101.8595199584961, "loss": 0.9353, "rewards/accuracies": 0.5, "rewards/chosen": -3.0304198265075684, "rewards/margins": -0.3160911798477173, "rewards/rejected": -2.7143287658691406, "step": 192 }, { "epoch": 0.21444444444444444, "grad_norm": 24.12255859375, "learning_rate": 4.993694662117987e-05, "logits/chosen": 0.902106761932373, "logits/rejected": 0.9164290428161621, "logps/chosen": -65.63589477539062, "logps/rejected": -85.33277893066406, "loss": 0.2762, "rewards/accuracies": 1.0, "rewards/chosen": -1.4597523212432861, "rewards/margins": 1.2098228931427002, "rewards/rejected": -2.6695752143859863, "step": 193 }, { "epoch": 0.21555555555555556, "grad_norm": 0.011419141665101051, "learning_rate": 4.993475457754212e-05, "logits/chosen": 1.1625410318374634, "logits/rejected": 1.1716305017471313, "logps/chosen": -69.38377380371094, "logps/rejected": -202.3413848876953, "loss": 0.3466, "rewards/accuracies": 0.5, "rewards/chosen": -0.9358536005020142, "rewards/margins": 4.85792875289917, "rewards/rejected": -5.7937822341918945, "step": 194 }, { "epoch": 0.21666666666666667, "grad_norm": 39.35941696166992, "learning_rate": 4.993252512887069e-05, "logits/chosen": 1.1149723529815674, "logits/rejected": 1.099027395248413, "logps/chosen": -134.03623962402344, "logps/rejected": -159.49241638183594, "loss": 0.4573, "rewards/accuracies": 1.0, "rewards/chosen": -4.88626766204834, "rewards/margins": 0.577968955039978, "rewards/rejected": -5.464236259460449, "step": 195 }, { "epoch": 0.21777777777777776, "grad_norm": 20.0416202545166, "learning_rate": 4.993025827851e-05, "logits/chosen": 1.5879268646240234, "logits/rejected": 1.628857970237732, "logps/chosen": -151.49014282226562, "logps/rejected": -224.2655792236328, "loss": 0.6724, "rewards/accuracies": 0.5, "rewards/chosen": -5.762251377105713, "rewards/margins": 3.8934531211853027, "rewards/rejected": -9.655704498291016, "step": 196 }, { "epoch": 0.21888888888888888, "grad_norm": 3.1550920009613037, "learning_rate": 4.9927954029860596e-05, "logits/chosen": 0.9698492288589478, "logits/rejected": 0.9681626558303833, "logps/chosen": -110.77711486816406, "logps/rejected": -112.57987213134766, "loss": 0.7288, "rewards/accuracies": 0.5, "rewards/chosen": -3.861011028289795, "rewards/margins": -0.06820416450500488, "rewards/rejected": -3.792807102203369, "step": 197 }, { "epoch": 0.22, "grad_norm": 0.206202432513237, "learning_rate": 4.992561238637912e-05, "logits/chosen": 0.6641535758972168, "logits/rejected": 0.6665283441543579, "logps/chosen": -74.79289245605469, "logps/rejected": -169.37567138671875, "loss": 0.0092, "rewards/accuracies": 1.0, "rewards/chosen": -1.5472561120986938, "rewards/margins": 5.34957218170166, "rewards/rejected": -6.8968281745910645, "step": 198 }, { "epoch": 0.22111111111111112, "grad_norm": 2.864158868789673, "learning_rate": 4.992323335157831e-05, "logits/chosen": 1.0902498960494995, "logits/rejected": 1.0655139684677124, "logps/chosen": -87.12443542480469, "logps/rejected": -143.348388671875, "loss": 0.1392, "rewards/accuracies": 1.0, "rewards/chosen": -1.6467605829238892, "rewards/margins": 3.0183684825897217, "rewards/rejected": -4.6651291847229, "step": 199 }, { "epoch": 0.2222222222222222, "grad_norm": 9.38684368133545, "learning_rate": 4.992081692902699e-05, "logits/chosen": 0.3969607353210449, "logits/rejected": 0.42929527163505554, "logps/chosen": -131.10850524902344, "logps/rejected": -167.5130157470703, "loss": 1.7254, "rewards/accuracies": 0.5, "rewards/chosen": -4.084038734436035, "rewards/margins": 2.460939407348633, "rewards/rejected": -6.544978141784668, "step": 200 }, { "epoch": 0.22333333333333333, "grad_norm": 3.4999029636383057, "learning_rate": 4.9918363122350086e-05, "logits/chosen": 0.4904259443283081, "logits/rejected": 0.4851856231689453, "logps/chosen": -72.92430877685547, "logps/rejected": -71.34220123291016, "loss": 0.6846, "rewards/accuracies": 0.5, "rewards/chosen": -1.2826265096664429, "rewards/margins": 0.07785488665103912, "rewards/rejected": -1.3604813814163208, "step": 201 }, { "epoch": 0.22444444444444445, "grad_norm": 14.189846992492676, "learning_rate": 4.991587193522858e-05, "logits/chosen": 1.0344287157058716, "logits/rejected": 1.0367014408111572, "logps/chosen": -224.84323120117188, "logps/rejected": -200.6574249267578, "loss": 2.4023, "rewards/accuracies": 0.5, "rewards/chosen": -7.537484169006348, "rewards/margins": -0.8711509704589844, "rewards/rejected": -6.666333198547363, "step": 202 }, { "epoch": 0.22555555555555556, "grad_norm": 5.036700248718262, "learning_rate": 4.991334337139955e-05, "logits/chosen": 0.47978973388671875, "logits/rejected": 0.48186472058296204, "logps/chosen": -166.0140380859375, "logps/rejected": -174.00662231445312, "loss": 0.6314, "rewards/accuracies": 0.5, "rewards/chosen": -5.7350754737854, "rewards/margins": 0.14915752410888672, "rewards/rejected": -5.884232997894287, "step": 203 }, { "epoch": 0.22666666666666666, "grad_norm": 0.7116608023643494, "learning_rate": 4.9910777434656136e-05, "logits/chosen": 0.5056048631668091, "logits/rejected": 0.4823484718799591, "logps/chosen": -61.95745086669922, "logps/rejected": -136.18296813964844, "loss": 0.0379, "rewards/accuracies": 1.0, "rewards/chosen": -1.0834022760391235, "rewards/margins": 3.3995237350463867, "rewards/rejected": -4.482925891876221, "step": 204 }, { "epoch": 0.22777777777777777, "grad_norm": 5.616718769073486, "learning_rate": 4.990817412884754e-05, "logits/chosen": 0.567503809928894, "logits/rejected": 0.5554540157318115, "logps/chosen": -111.12155151367188, "logps/rejected": -125.14772033691406, "loss": 0.6614, "rewards/accuracies": 0.5, "rewards/chosen": -2.5712831020355225, "rewards/margins": 0.06605517864227295, "rewards/rejected": -2.637338161468506, "step": 205 }, { "epoch": 0.2288888888888889, "grad_norm": 3.640303373336792, "learning_rate": 4.9905533457879024e-05, "logits/chosen": 0.8430173397064209, "logits/rejected": 0.8309611082077026, "logps/chosen": -95.00192260742188, "logps/rejected": -113.32624816894531, "loss": 0.3446, "rewards/accuracies": 1.0, "rewards/chosen": -1.4531704187393188, "rewards/margins": 1.135424017906189, "rewards/rejected": -2.588594436645508, "step": 206 }, { "epoch": 0.23, "grad_norm": 1.9613778591156006, "learning_rate": 4.9902855425711905e-05, "logits/chosen": 0.8123503923416138, "logits/rejected": 0.8138333559036255, "logps/chosen": -96.87920379638672, "logps/rejected": -197.65000915527344, "loss": 0.081, "rewards/accuracies": 1.0, "rewards/chosen": -3.062093496322632, "rewards/margins": 4.35516357421875, "rewards/rejected": -7.417256832122803, "step": 207 }, { "epoch": 0.2311111111111111, "grad_norm": 2.3872005939483643, "learning_rate": 4.990014003636353e-05, "logits/chosen": 0.9048120975494385, "logits/rejected": 0.9127912521362305, "logps/chosen": -96.37702941894531, "logps/rejected": -132.52188110351562, "loss": 0.148, "rewards/accuracies": 1.0, "rewards/chosen": -0.9789970517158508, "rewards/margins": 1.8474761247634888, "rewards/rejected": -2.8264732360839844, "step": 208 }, { "epoch": 0.23222222222222222, "grad_norm": 3.2368524074554443, "learning_rate": 4.989738729390732e-05, "logits/chosen": 0.648262619972229, "logits/rejected": 0.6552884578704834, "logps/chosen": -101.09939575195312, "logps/rejected": -98.2458267211914, "loss": 0.6859, "rewards/accuracies": 0.5, "rewards/chosen": -2.032818555831909, "rewards/margins": 0.014508843421936035, "rewards/rejected": -2.0473275184631348, "step": 209 }, { "epoch": 0.23333333333333334, "grad_norm": 0.5915521383285522, "learning_rate": 4.9894597202472696e-05, "logits/chosen": 0.5674071907997131, "logits/rejected": 0.5324715971946716, "logps/chosen": -120.63805389404297, "logps/rejected": -191.78427124023438, "loss": 0.0295, "rewards/accuracies": 1.0, "rewards/chosen": -2.8730194568634033, "rewards/margins": 4.435626029968262, "rewards/rejected": -7.308645725250244, "step": 210 }, { "epoch": 0.23444444444444446, "grad_norm": 2.547374725341797, "learning_rate": 4.989176976624511e-05, "logits/chosen": 0.8792842626571655, "logits/rejected": 0.8814612627029419, "logps/chosen": -75.44282531738281, "logps/rejected": -163.81683349609375, "loss": 0.1452, "rewards/accuracies": 1.0, "rewards/chosen": -1.283816933631897, "rewards/margins": 3.8810648918151855, "rewards/rejected": -5.164881706237793, "step": 211 }, { "epoch": 0.23555555555555555, "grad_norm": 2.228644609451294, "learning_rate": 4.988890498946607e-05, "logits/chosen": 0.6692065000534058, "logits/rejected": 0.6452646255493164, "logps/chosen": -75.20098876953125, "logps/rejected": -128.3943328857422, "loss": 0.2997, "rewards/accuracies": 1.0, "rewards/chosen": -2.0172817707061768, "rewards/margins": 1.9734854698181152, "rewards/rejected": -3.990767240524292, "step": 212 }, { "epoch": 0.23666666666666666, "grad_norm": 4.751955032348633, "learning_rate": 4.9886002876433056e-05, "logits/chosen": 0.7308270931243896, "logits/rejected": 0.7340854406356812, "logps/chosen": -106.1060562133789, "logps/rejected": -122.2120361328125, "loss": 0.4973, "rewards/accuracies": 0.5, "rewards/chosen": -2.5560052394866943, "rewards/margins": 0.8283758759498596, "rewards/rejected": -3.384381055831909, "step": 213 }, { "epoch": 0.23777777777777778, "grad_norm": 2.674299478530884, "learning_rate": 4.9883063431499585e-05, "logits/chosen": 0.7196017503738403, "logits/rejected": 0.7225269079208374, "logps/chosen": -51.84063720703125, "logps/rejected": -53.51921844482422, "loss": 0.4664, "rewards/accuracies": 1.0, "rewards/chosen": -0.2583826184272766, "rewards/margins": 0.5227543115615845, "rewards/rejected": -0.7811369299888611, "step": 214 }, { "epoch": 0.2388888888888889, "grad_norm": 2.5562164783477783, "learning_rate": 4.9880086659075156e-05, "logits/chosen": 1.1124818325042725, "logits/rejected": 1.1116195917129517, "logps/chosen": -117.7965316772461, "logps/rejected": -267.0352478027344, "loss": 0.1945, "rewards/accuracies": 1.0, "rewards/chosen": -2.4497594833374023, "rewards/margins": 7.5080413818359375, "rewards/rejected": -9.95780086517334, "step": 215 }, { "epoch": 0.24, "grad_norm": 22.173320770263672, "learning_rate": 4.9877072563625285e-05, "logits/chosen": 1.007946491241455, "logits/rejected": 1.0106940269470215, "logps/chosen": -168.15884399414062, "logps/rejected": -164.18963623046875, "loss": 0.6698, "rewards/accuracies": 0.5, "rewards/chosen": -5.61307430267334, "rewards/margins": 0.7377567291259766, "rewards/rejected": -6.350831031799316, "step": 216 }, { "epoch": 0.2411111111111111, "grad_norm": 0.14546535909175873, "learning_rate": 4.987402114967146e-05, "logits/chosen": 0.6770639419555664, "logits/rejected": 0.6881896257400513, "logps/chosen": -93.15580749511719, "logps/rejected": -205.22056579589844, "loss": 0.0069, "rewards/accuracies": 1.0, "rewards/chosen": -2.4745607376098633, "rewards/margins": 5.386501312255859, "rewards/rejected": -7.861062049865723, "step": 217 }, { "epoch": 0.24222222222222223, "grad_norm": 3.5543534755706787, "learning_rate": 4.987093242179116e-05, "logits/chosen": 0.7077668905258179, "logits/rejected": 0.7094806432723999, "logps/chosen": -97.16419219970703, "logps/rejected": -155.58587646484375, "loss": 0.1558, "rewards/accuracies": 1.0, "rewards/chosen": -2.148388147354126, "rewards/margins": 2.709935188293457, "rewards/rejected": -4.858323574066162, "step": 218 }, { "epoch": 0.24333333333333335, "grad_norm": 0.2553195357322693, "learning_rate": 4.986780638461784e-05, "logits/chosen": 0.976659893989563, "logits/rejected": 0.9893417358398438, "logps/chosen": -152.57177734375, "logps/rejected": -318.68426513671875, "loss": 0.0062, "rewards/accuracies": 1.0, "rewards/chosen": -3.633622169494629, "rewards/margins": 9.246904373168945, "rewards/rejected": -12.880526542663574, "step": 219 }, { "epoch": 0.24444444444444444, "grad_norm": 3.098940372467041, "learning_rate": 4.986464304284091e-05, "logits/chosen": 1.1159636974334717, "logits/rejected": 1.1053651571273804, "logps/chosen": -107.98556518554688, "logps/rejected": -164.00747680664062, "loss": 0.0954, "rewards/accuracies": 1.0, "rewards/chosen": -3.431183338165283, "rewards/margins": 2.6876444816589355, "rewards/rejected": -6.118827819824219, "step": 220 }, { "epoch": 0.24555555555555555, "grad_norm": 3.533510446548462, "learning_rate": 4.9861442401205746e-05, "logits/chosen": 0.4635196030139923, "logits/rejected": 0.47579896450042725, "logps/chosen": -155.95448303222656, "logps/rejected": -167.6121063232422, "loss": 0.1632, "rewards/accuracies": 1.0, "rewards/chosen": -5.162532806396484, "rewards/margins": 1.7355769872665405, "rewards/rejected": -6.8981099128723145, "step": 221 }, { "epoch": 0.24666666666666667, "grad_norm": 3.242952585220337, "learning_rate": 4.98582044645137e-05, "logits/chosen": 0.5403412580490112, "logits/rejected": 0.5549043416976929, "logps/chosen": -83.97018432617188, "logps/rejected": -119.43214416503906, "loss": 0.2569, "rewards/accuracies": 1.0, "rewards/chosen": -2.3906443119049072, "rewards/margins": 1.247878074645996, "rewards/rejected": -3.6385223865509033, "step": 222 }, { "epoch": 0.2477777777777778, "grad_norm": 9.200871467590332, "learning_rate": 4.985492923762205e-05, "logits/chosen": 0.8909025192260742, "logits/rejected": 0.8939725160598755, "logps/chosen": -175.84906005859375, "logps/rejected": -178.55001831054688, "loss": 0.542, "rewards/accuracies": 1.0, "rewards/chosen": -5.595398902893066, "rewards/margins": 0.36496543884277344, "rewards/rejected": -5.96036434173584, "step": 223 }, { "epoch": 0.24888888888888888, "grad_norm": 0.4330923855304718, "learning_rate": 4.985161672544401e-05, "logits/chosen": 0.2490202784538269, "logits/rejected": 0.24008788168430328, "logps/chosen": -111.02076721191406, "logps/rejected": -160.39984130859375, "loss": 0.0299, "rewards/accuracies": 1.0, "rewards/chosen": -3.815814971923828, "rewards/margins": 3.5979907512664795, "rewards/rejected": -7.413805961608887, "step": 224 }, { "epoch": 0.25, "grad_norm": 3.746175527572632, "learning_rate": 4.984826693294874e-05, "logits/chosen": 0.11750394105911255, "logits/rejected": 0.1150815561413765, "logps/chosen": -139.56085205078125, "logps/rejected": -178.83334350585938, "loss": 0.1964, "rewards/accuracies": 1.0, "rewards/chosen": -7.217936992645264, "rewards/margins": 1.593536376953125, "rewards/rejected": -8.81147289276123, "step": 225 }, { "epoch": 0.2511111111111111, "grad_norm": 24.864656448364258, "learning_rate": 4.9844879865161306e-05, "logits/chosen": 0.3901464343070984, "logits/rejected": 0.38432174921035767, "logps/chosen": -206.71533203125, "logps/rejected": -139.5667724609375, "loss": 4.8159, "rewards/accuracies": 0.5, "rewards/chosen": -10.001323699951172, "rewards/margins": -4.419900417327881, "rewards/rejected": -5.581423282623291, "step": 226 }, { "epoch": 0.25222222222222224, "grad_norm": 8.109025001525879, "learning_rate": 4.984145552716273e-05, "logits/chosen": 0.20054353773593903, "logits/rejected": 0.2015720009803772, "logps/chosen": -99.39993286132812, "logps/rejected": -134.44656372070312, "loss": 1.1009, "rewards/accuracies": 0.5, "rewards/chosen": -3.042801856994629, "rewards/margins": 2.517031669616699, "rewards/rejected": -5.559833526611328, "step": 227 }, { "epoch": 0.25333333333333335, "grad_norm": 0.32151830196380615, "learning_rate": 4.9837993924089886e-05, "logits/chosen": 0.3085002303123474, "logits/rejected": 0.30798622965812683, "logps/chosen": -99.37248229980469, "logps/rejected": -175.26348876953125, "loss": 0.0135, "rewards/accuracies": 1.0, "rewards/chosen": -3.420287609100342, "rewards/margins": 4.689691543579102, "rewards/rejected": -8.109979629516602, "step": 228 }, { "epoch": 0.2544444444444444, "grad_norm": 2.159350633621216, "learning_rate": 4.9834495061135604e-05, "logits/chosen": 0.2833161950111389, "logits/rejected": 0.2819622755050659, "logps/chosen": -68.6833724975586, "logps/rejected": -123.18251037597656, "loss": 0.2199, "rewards/accuracies": 1.0, "rewards/chosen": -1.9685605764389038, "rewards/margins": 2.1059863567352295, "rewards/rejected": -4.074546813964844, "step": 229 }, { "epoch": 0.25555555555555554, "grad_norm": 13.745996475219727, "learning_rate": 4.983095894354858e-05, "logits/chosen": 0.5137923359870911, "logits/rejected": 0.4164670705795288, "logps/chosen": -127.56266784667969, "logps/rejected": -161.15274047851562, "loss": 0.6366, "rewards/accuracies": 0.5, "rewards/chosen": -4.351569175720215, "rewards/margins": 0.1478661298751831, "rewards/rejected": -4.4994354248046875, "step": 230 }, { "epoch": 0.25666666666666665, "grad_norm": 2.1722536087036133, "learning_rate": 4.982738557663339e-05, "logits/chosen": 0.6298158764839172, "logits/rejected": 0.6742088794708252, "logps/chosen": -94.770263671875, "logps/rejected": -168.74331665039062, "loss": 0.1073, "rewards/accuracies": 1.0, "rewards/chosen": -2.310671806335449, "rewards/margins": 2.504854917526245, "rewards/rejected": -4.815526485443115, "step": 231 }, { "epoch": 0.2577777777777778, "grad_norm": 6.700493335723877, "learning_rate": 4.982377496575052e-05, "logits/chosen": 0.1459362506866455, "logits/rejected": 0.20083485543727875, "logps/chosen": -117.76814270019531, "logps/rejected": -156.39892578125, "loss": 0.4021, "rewards/accuracies": 0.5, "rewards/chosen": -5.010497093200684, "rewards/margins": 2.2646636962890625, "rewards/rejected": -7.275160789489746, "step": 232 }, { "epoch": 0.2588888888888889, "grad_norm": 0.1440276801586151, "learning_rate": 4.9820127116316294e-05, "logits/chosen": 0.7428863048553467, "logits/rejected": 0.7525875568389893, "logps/chosen": -153.30799865722656, "logps/rejected": -317.836669921875, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": -4.607441425323486, "rewards/margins": 9.531506538391113, "rewards/rejected": -14.138947486877441, "step": 233 }, { "epoch": 0.26, "grad_norm": 6.511999130249023, "learning_rate": 4.981644203380291e-05, "logits/chosen": 0.029637932777404785, "logits/rejected": 0.0638638585805893, "logps/chosen": -68.84324645996094, "logps/rejected": -79.11209106445312, "loss": 0.4288, "rewards/accuracies": 1.0, "rewards/chosen": -1.9704220294952393, "rewards/margins": 0.777305543422699, "rewards/rejected": -2.747727394104004, "step": 234 }, { "epoch": 0.2611111111111111, "grad_norm": 4.526661396026611, "learning_rate": 4.9812719723738435e-05, "logits/chosen": 0.3714377284049988, "logits/rejected": 0.36530569195747375, "logps/chosen": -96.88034057617188, "logps/rejected": -111.05685424804688, "loss": 0.4629, "rewards/accuracies": 1.0, "rewards/chosen": -2.617449998855591, "rewards/margins": 0.539548397064209, "rewards/rejected": -3.1569983959198, "step": 235 }, { "epoch": 0.26222222222222225, "grad_norm": 4.171183109283447, "learning_rate": 4.9808960191706745e-05, "logits/chosen": 0.02566736936569214, "logits/rejected": 0.04190387576818466, "logps/chosen": -158.579833984375, "logps/rejected": -367.7066345214844, "loss": 0.1534, "rewards/accuracies": 1.0, "rewards/chosen": -6.7442193031311035, "rewards/margins": 10.986772537231445, "rewards/rejected": -17.73099136352539, "step": 236 }, { "epoch": 0.2633333333333333, "grad_norm": 2.6066935062408447, "learning_rate": 4.980516344334759e-05, "logits/chosen": 0.2812420725822449, "logits/rejected": 0.28983134031295776, "logps/chosen": -104.43193054199219, "logps/rejected": -204.13839721679688, "loss": 0.087, "rewards/accuracies": 1.0, "rewards/chosen": -3.2163960933685303, "rewards/margins": 7.629384517669678, "rewards/rejected": -10.845780372619629, "step": 237 }, { "epoch": 0.2644444444444444, "grad_norm": 2.2529749870300293, "learning_rate": 4.980132948435653e-05, "logits/chosen": 0.31353652477264404, "logits/rejected": 0.3123881220817566, "logps/chosen": -112.49354553222656, "logps/rejected": -165.1072998046875, "loss": 0.099, "rewards/accuracies": 1.0, "rewards/chosen": -3.6637487411499023, "rewards/margins": 2.585325241088867, "rewards/rejected": -6.2490739822387695, "step": 238 }, { "epoch": 0.26555555555555554, "grad_norm": 0.4345913827419281, "learning_rate": 4.9797458320484955e-05, "logits/chosen": 0.07310516387224197, "logits/rejected": 0.05031590908765793, "logps/chosen": -96.12832641601562, "logps/rejected": -150.91964721679688, "loss": 0.0154, "rewards/accuracies": 1.0, "rewards/chosen": -3.21736478805542, "rewards/margins": 4.269068717956543, "rewards/rejected": -7.486433506011963, "step": 239 }, { "epoch": 0.26666666666666666, "grad_norm": 4.620720863342285, "learning_rate": 4.979354995754006e-05, "logits/chosen": 0.44801923632621765, "logits/rejected": 0.44419652223587036, "logps/chosen": -159.78273010253906, "logps/rejected": -185.6401824951172, "loss": 0.2285, "rewards/accuracies": 1.0, "rewards/chosen": -6.601757526397705, "rewards/margins": 1.6451702117919922, "rewards/rejected": -8.246927261352539, "step": 240 }, { "epoch": 0.2677777777777778, "grad_norm": 0.25542575120925903, "learning_rate": 4.978960440138484e-05, "logits/chosen": 0.5185869932174683, "logits/rejected": 0.4948597550392151, "logps/chosen": -117.57159423828125, "logps/rejected": -221.66615295410156, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": -4.877673625946045, "rewards/margins": 7.663633346557617, "rewards/rejected": -12.541306495666504, "step": 241 }, { "epoch": 0.2688888888888889, "grad_norm": 29.050823211669922, "learning_rate": 4.9785621657938084e-05, "logits/chosen": 0.8020373582839966, "logits/rejected": 0.8319034576416016, "logps/chosen": -236.39157104492188, "logps/rejected": -244.54678344726562, "loss": 0.7128, "rewards/accuracies": 0.5, "rewards/chosen": -9.463287353515625, "rewards/margins": 0.5438370704650879, "rewards/rejected": -10.007124900817871, "step": 242 }, { "epoch": 0.27, "grad_norm": 13.48017406463623, "learning_rate": 4.978160173317438e-05, "logits/chosen": 0.30613577365875244, "logits/rejected": 0.3161311745643616, "logps/chosen": -182.42999267578125, "logps/rejected": -173.35299682617188, "loss": 0.6477, "rewards/accuracies": 0.5, "rewards/chosen": -7.262606620788574, "rewards/margins": 0.5837867259979248, "rewards/rejected": -7.846393585205078, "step": 243 }, { "epoch": 0.27111111111111114, "grad_norm": 9.406580647919327e-05, "learning_rate": 4.977754463312408e-05, "logits/chosen": -0.03548378497362137, "logits/rejected": -0.06673412770032883, "logps/chosen": -92.80603790283203, "logps/rejected": -328.4048156738281, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.854949712753296, "rewards/margins": 14.920032501220703, "rewards/rejected": -17.774982452392578, "step": 244 }, { "epoch": 0.2722222222222222, "grad_norm": 1.0741401638370007e-05, "learning_rate": 4.977345036387331e-05, "logits/chosen": 0.29820385575294495, "logits/rejected": 0.27482688426971436, "logps/chosen": -124.09878540039062, "logps/rejected": -394.9486389160156, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.510623931884766, "rewards/margins": 17.69420623779297, "rewards/rejected": -22.204830169677734, "step": 245 }, { "epoch": 0.2733333333333333, "grad_norm": 23.513545989990234, "learning_rate": 4.976931893156395e-05, "logits/chosen": 0.33492299914360046, "logits/rejected": 0.36550194025039673, "logps/chosen": -228.7473602294922, "logps/rejected": -229.84768676757812, "loss": 2.7937, "rewards/accuracies": 0.5, "rewards/chosen": -11.347902297973633, "rewards/margins": -0.14699339866638184, "rewards/rejected": -11.200908660888672, "step": 246 }, { "epoch": 0.27444444444444444, "grad_norm": 0.3831864893436432, "learning_rate": 4.9765150342393607e-05, "logits/chosen": 0.2896062135696411, "logits/rejected": 0.30395886301994324, "logps/chosen": -205.04672241210938, "logps/rejected": -282.767333984375, "loss": 0.0092, "rewards/accuracies": 1.0, "rewards/chosen": -10.242804527282715, "rewards/margins": 4.840688705444336, "rewards/rejected": -15.08349323272705, "step": 247 }, { "epoch": 0.27555555555555555, "grad_norm": 2.546215295791626, "learning_rate": 4.976094460261568e-05, "logits/chosen": 0.21945802867412567, "logits/rejected": 0.24706542491912842, "logps/chosen": -129.0390167236328, "logps/rejected": -184.51222229003906, "loss": 0.2075, "rewards/accuracies": 1.0, "rewards/chosen": -4.782685279846191, "rewards/margins": 3.494847297668457, "rewards/rejected": -8.277532577514648, "step": 248 }, { "epoch": 0.27666666666666667, "grad_norm": 4.89005708694458, "learning_rate": 4.975670171853926e-05, "logits/chosen": -0.053481727838516235, "logits/rejected": -0.03925175964832306, "logps/chosen": -91.83181762695312, "logps/rejected": -111.29765319824219, "loss": 0.3083, "rewards/accuracies": 1.0, "rewards/chosen": -3.751936197280884, "rewards/margins": 1.0376248359680176, "rewards/rejected": -4.7895612716674805, "step": 249 }, { "epoch": 0.2777777777777778, "grad_norm": 1.644893765449524, "learning_rate": 4.9752421696529164e-05, "logits/chosen": 0.2932344675064087, "logits/rejected": 0.2937529385089874, "logps/chosen": -159.35227966308594, "logps/rejected": -231.81597900390625, "loss": 0.0389, "rewards/accuracies": 1.0, "rewards/chosen": -6.2245588302612305, "rewards/margins": 4.980123996734619, "rewards/rejected": -11.204683303833008, "step": 250 }, { "epoch": 0.2788888888888889, "grad_norm": 2.0503132343292236, "learning_rate": 4.974810454300591e-05, "logits/chosen": 0.19721657037734985, "logits/rejected": 0.17003442347049713, "logps/chosen": -189.82147216796875, "logps/rejected": -251.67872619628906, "loss": 0.1287, "rewards/accuracies": 1.0, "rewards/chosen": -9.716439247131348, "rewards/margins": 4.958928108215332, "rewards/rejected": -14.67536735534668, "step": 251 }, { "epoch": 0.28, "grad_norm": 0.4329065978527069, "learning_rate": 4.974375026444575e-05, "logits/chosen": 0.28687289357185364, "logits/rejected": 0.2910487651824951, "logps/chosen": -178.41390991210938, "logps/rejected": -333.785888671875, "loss": 0.0142, "rewards/accuracies": 1.0, "rewards/chosen": -8.13511848449707, "rewards/margins": 9.350677490234375, "rewards/rejected": -17.485795974731445, "step": 252 }, { "epoch": 0.2811111111111111, "grad_norm": 0.25617754459381104, "learning_rate": 4.973935886738061e-05, "logits/chosen": -0.06954015046358109, "logits/rejected": -0.07156814634799957, "logps/chosen": -130.11923217773438, "logps/rejected": -267.8585205078125, "loss": 0.012, "rewards/accuracies": 1.0, "rewards/chosen": -6.915092468261719, "rewards/margins": 8.983718872070312, "rewards/rejected": -15.898811340332031, "step": 253 }, { "epoch": 0.2822222222222222, "grad_norm": 4.498588562011719, "learning_rate": 4.973493035839808e-05, "logits/chosen": -0.20769423246383667, "logits/rejected": -0.19955235719680786, "logps/chosen": -175.53970336914062, "logps/rejected": -220.50634765625, "loss": 0.3119, "rewards/accuracies": 1.0, "rewards/chosen": -7.975495338439941, "rewards/margins": 4.9920783042907715, "rewards/rejected": -12.967574119567871, "step": 254 }, { "epoch": 0.2833333333333333, "grad_norm": 0.0023359302431344986, "learning_rate": 4.9730464744141445e-05, "logits/chosen": -0.028131097555160522, "logits/rejected": -0.026412751525640488, "logps/chosen": -141.23089599609375, "logps/rejected": -301.4130554199219, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -4.864480018615723, "rewards/margins": 10.410040855407715, "rewards/rejected": -15.274520874023438, "step": 255 }, { "epoch": 0.28444444444444444, "grad_norm": 3.563753604888916, "learning_rate": 4.972596203130966e-05, "logits/chosen": -0.19401924312114716, "logits/rejected": -0.2137964963912964, "logps/chosen": -223.83399963378906, "logps/rejected": -317.62408447265625, "loss": 0.0772, "rewards/accuracies": 1.0, "rewards/chosen": -13.598297119140625, "rewards/margins": 7.118513107299805, "rewards/rejected": -20.71681022644043, "step": 256 }, { "epoch": 0.28555555555555556, "grad_norm": 12.637032508850098, "learning_rate": 4.97214222266573e-05, "logits/chosen": 0.17071302235126495, "logits/rejected": 0.17154350876808167, "logps/chosen": -203.27589416503906, "logps/rejected": -208.15765380859375, "loss": 1.4575, "rewards/accuracies": 0.5, "rewards/chosen": -10.365172386169434, "rewards/margins": 0.5405327081680298, "rewards/rejected": -10.905704498291016, "step": 257 }, { "epoch": 0.2866666666666667, "grad_norm": 0.5343703031539917, "learning_rate": 4.971684533699461e-05, "logits/chosen": 0.005155415274202824, "logits/rejected": 0.001355406828224659, "logps/chosen": -124.52782440185547, "logps/rejected": -240.4281005859375, "loss": 0.0226, "rewards/accuracies": 1.0, "rewards/chosen": -4.496756553649902, "rewards/margins": 8.64199161529541, "rewards/rejected": -13.138748168945312, "step": 258 }, { "epoch": 0.2877777777777778, "grad_norm": 1.6350138187408447, "learning_rate": 4.971223136918745e-05, "logits/chosen": 0.050041504204273224, "logits/rejected": 0.05459631234407425, "logps/chosen": -257.2825622558594, "logps/rejected": -359.0766906738281, "loss": 0.029, "rewards/accuracies": 1.0, "rewards/chosen": -15.204426765441895, "rewards/margins": 5.697434425354004, "rewards/rejected": -20.9018611907959, "step": 259 }, { "epoch": 0.28888888888888886, "grad_norm": 0.2949289381504059, "learning_rate": 4.970758033015731e-05, "logits/chosen": 0.0014536082744598389, "logits/rejected": -0.010807052254676819, "logps/chosen": -266.399169921875, "logps/rejected": -373.8905944824219, "loss": 0.0092, "rewards/accuracies": 1.0, "rewards/chosen": -13.201983451843262, "rewards/margins": 8.214594841003418, "rewards/rejected": -21.41657829284668, "step": 260 }, { "epoch": 0.29, "grad_norm": 1.3416260480880737, "learning_rate": 4.970289222688129e-05, "logits/chosen": -0.3643057942390442, "logits/rejected": -0.34971535205841064, "logps/chosen": -192.91915893554688, "logps/rejected": -270.6666564941406, "loss": 0.0129, "rewards/accuracies": 1.0, "rewards/chosen": -8.93553352355957, "rewards/margins": 6.293454647064209, "rewards/rejected": -15.228988647460938, "step": 261 }, { "epoch": 0.2911111111111111, "grad_norm": 0.004141124431043863, "learning_rate": 4.9698167066392104e-05, "logits/chosen": -0.05628548562526703, "logits/rejected": -0.003909517079591751, "logps/chosen": -190.0975341796875, "logps/rejected": -537.92041015625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -7.992842197418213, "rewards/margins": 22.954273223876953, "rewards/rejected": -30.947114944458008, "step": 262 }, { "epoch": 0.2922222222222222, "grad_norm": 2.8033788204193115, "learning_rate": 4.9693404855778026e-05, "logits/chosen": -0.5522273778915405, "logits/rejected": -0.5644251108169556, "logps/chosen": -95.13334655761719, "logps/rejected": -155.68939208984375, "loss": 0.1562, "rewards/accuracies": 1.0, "rewards/chosen": -4.444374084472656, "rewards/margins": 4.02031946182251, "rewards/rejected": -8.464693069458008, "step": 263 }, { "epoch": 0.29333333333333333, "grad_norm": 1.2488009929656982, "learning_rate": 4.968860560218293e-05, "logits/chosen": -0.364124596118927, "logits/rejected": -0.3485461473464966, "logps/chosen": -106.46548461914062, "logps/rejected": -171.74893188476562, "loss": 0.0594, "rewards/accuracies": 1.0, "rewards/chosen": -5.10057258605957, "rewards/margins": 5.038763046264648, "rewards/rejected": -10.139334678649902, "step": 264 }, { "epoch": 0.29444444444444445, "grad_norm": 0.0001868911786004901, "learning_rate": 4.968376931280626e-05, "logits/chosen": -0.1599990576505661, "logits/rejected": -0.171653613448143, "logps/chosen": -143.64324951171875, "logps/rejected": -340.96807861328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.357876777648926, "rewards/margins": 13.797296524047852, "rewards/rejected": -20.155174255371094, "step": 265 }, { "epoch": 0.29555555555555557, "grad_norm": 0.11289782077074051, "learning_rate": 4.9678895994903015e-05, "logits/chosen": -0.07120626419782639, "logits/rejected": -0.058324411511421204, "logps/chosen": -259.5531005859375, "logps/rejected": -419.5189208984375, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -13.68425464630127, "rewards/margins": 11.11913013458252, "rewards/rejected": -24.80338478088379, "step": 266 }, { "epoch": 0.2966666666666667, "grad_norm": 1.8008530139923096, "learning_rate": 4.967398565578373e-05, "logits/chosen": -0.4766896069049835, "logits/rejected": -0.476317822933197, "logps/chosen": -87.08863830566406, "logps/rejected": -165.18011474609375, "loss": 0.044, "rewards/accuracies": 1.0, "rewards/chosen": -4.3603386878967285, "rewards/margins": 5.429108619689941, "rewards/rejected": -9.789447784423828, "step": 267 }, { "epoch": 0.29777777777777775, "grad_norm": 0.16479596495628357, "learning_rate": 4.966903830281449e-05, "logits/chosen": -0.5531883835792542, "logits/rejected": -0.5681743621826172, "logps/chosen": -207.68934631347656, "logps/rejected": -290.55596923828125, "loss": 0.0041, "rewards/accuracies": 1.0, "rewards/chosen": -12.142021179199219, "rewards/margins": 5.742420673370361, "rewards/rejected": -17.884441375732422, "step": 268 }, { "epoch": 0.29888888888888887, "grad_norm": 0.9707714915275574, "learning_rate": 4.966405394341689e-05, "logits/chosen": -0.30490192770957947, "logits/rejected": -0.2871077060699463, "logps/chosen": -149.22335815429688, "logps/rejected": -283.9515075683594, "loss": 0.0165, "rewards/accuracies": 1.0, "rewards/chosen": -7.547700881958008, "rewards/margins": 10.621053695678711, "rewards/rejected": -18.16875457763672, "step": 269 }, { "epoch": 0.3, "grad_norm": 1.5882608890533447, "learning_rate": 4.965903258506806e-05, "logits/chosen": -0.5484447479248047, "logits/rejected": -0.550959587097168, "logps/chosen": -90.94281005859375, "logps/rejected": -166.7667999267578, "loss": 0.0198, "rewards/accuracies": 1.0, "rewards/chosen": -4.077125549316406, "rewards/margins": 5.890243053436279, "rewards/rejected": -9.967369079589844, "step": 270 }, { "epoch": 0.3, "eval_logits/chosen": -0.5755235552787781, "eval_logits/rejected": -0.5680462121963501, "eval_logps/chosen": -210.76124572753906, "eval_logps/rejected": -328.3013000488281, "eval_loss": 0.607344388961792, "eval_rewards/accuracies": 0.8700000047683716, "eval_rewards/chosen": -12.2990140914917, "eval_rewards/margins": 9.07348346710205, "eval_rewards/rejected": -21.372499465942383, "eval_runtime": 85.0748, "eval_samples_per_second": 2.351, "eval_steps_per_second": 0.294, "step": 270 }, { "epoch": 0.3011111111111111, "grad_norm": 4.51216983795166, "learning_rate": 4.965397423530063e-05, "logits/chosen": -0.7424349784851074, "logits/rejected": -0.7497687339782715, "logps/chosen": -232.29367065429688, "logps/rejected": -311.918212890625, "loss": 0.6881, "rewards/accuracies": 0.5, "rewards/chosen": -13.830830574035645, "rewards/margins": 7.626102924346924, "rewards/rejected": -21.456933975219727, "step": 271 }, { "epoch": 0.3022222222222222, "grad_norm": 11.471277236938477, "learning_rate": 4.964887890170269e-05, "logits/chosen": -0.42639055848121643, "logits/rejected": -0.423634797334671, "logps/chosen": -346.8958740234375, "logps/rejected": -372.8656921386719, "loss": 0.1421, "rewards/accuracies": 1.0, "rewards/chosen": -20.488922119140625, "rewards/margins": 2.2132840156555176, "rewards/rejected": -22.702205657958984, "step": 272 }, { "epoch": 0.30333333333333334, "grad_norm": 0.849371612071991, "learning_rate": 4.964374659191786e-05, "logits/chosen": -0.8087621927261353, "logits/rejected": -0.7943145036697388, "logps/chosen": -127.55708312988281, "logps/rejected": -224.135986328125, "loss": 0.0303, "rewards/accuracies": 1.0, "rewards/chosen": -7.151095867156982, "rewards/margins": 7.181909084320068, "rewards/rejected": -14.33300495147705, "step": 273 }, { "epoch": 0.30444444444444446, "grad_norm": 0.00014791231660638005, "learning_rate": 4.963857731364518e-05, "logits/chosen": -0.520218014717102, "logits/rejected": -0.5244771242141724, "logps/chosen": -210.79603576660156, "logps/rejected": -747.6204223632812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.129982948303223, "rewards/margins": 35.92407989501953, "rewards/rejected": -50.05406188964844, "step": 274 }, { "epoch": 0.3055555555555556, "grad_norm": 0.0002357628254685551, "learning_rate": 4.963337107463918e-05, "logits/chosen": -0.4710427224636078, "logits/rejected": -0.47855985164642334, "logps/chosen": -356.01336669921875, "logps/rejected": -519.0822143554688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -23.58477210998535, "rewards/margins": 14.462190628051758, "rewards/rejected": -38.04696273803711, "step": 275 }, { "epoch": 0.30666666666666664, "grad_norm": 2.0119266991969198e-05, "learning_rate": 4.9628127882709827e-05, "logits/chosen": -0.6789518594741821, "logits/rejected": -0.7046225666999817, "logps/chosen": -177.97991943359375, "logps/rejected": -383.53729248046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.683534622192383, "rewards/margins": 17.077754974365234, "rewards/rejected": -27.76129150390625, "step": 276 }, { "epoch": 0.30777777777777776, "grad_norm": 1.1012552976608276, "learning_rate": 4.9622847745722505e-05, "logits/chosen": -0.693565309047699, "logits/rejected": -0.6992260217666626, "logps/chosen": -571.7523193359375, "logps/rejected": -672.4853515625, "loss": 0.0059, "rewards/accuracies": 1.0, "rewards/chosen": -41.145545959472656, "rewards/margins": 10.187372207641602, "rewards/rejected": -51.332916259765625, "step": 277 }, { "epoch": 0.3088888888888889, "grad_norm": 0.6452011466026306, "learning_rate": 4.9617530671598044e-05, "logits/chosen": -0.7364058494567871, "logits/rejected": -0.7208298444747925, "logps/chosen": -283.79119873046875, "logps/rejected": -390.2685241699219, "loss": 0.0111, "rewards/accuracies": 1.0, "rewards/chosen": -18.47183609008789, "rewards/margins": 9.048556327819824, "rewards/rejected": -27.52039337158203, "step": 278 }, { "epoch": 0.31, "grad_norm": 0.029234938323497772, "learning_rate": 4.961217666831268e-05, "logits/chosen": -0.08577486872673035, "logits/rejected": -0.09564816951751709, "logps/chosen": -142.5680389404297, "logps/rejected": -283.8293151855469, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -7.192890167236328, "rewards/margins": 11.840350151062012, "rewards/rejected": -19.033241271972656, "step": 279 }, { "epoch": 0.3111111111111111, "grad_norm": 25.246206283569336, "learning_rate": 4.960678574389803e-05, "logits/chosen": 0.18935135006904602, "logits/rejected": 0.16053859889507294, "logps/chosen": -215.3026123046875, "logps/rejected": -444.8119201660156, "loss": 0.3885, "rewards/accuracies": 0.5, "rewards/chosen": -13.739055633544922, "rewards/margins": 20.219514846801758, "rewards/rejected": -33.95857238769531, "step": 280 }, { "epoch": 0.31222222222222223, "grad_norm": 1.4537978172302246, "learning_rate": 4.960135790644111e-05, "logits/chosen": 0.55597984790802, "logits/rejected": 0.5381332039833069, "logps/chosen": -321.299072265625, "logps/rejected": -396.6799011230469, "loss": 0.0141, "rewards/accuracies": 1.0, "rewards/chosen": -22.723224639892578, "rewards/margins": 7.591421604156494, "rewards/rejected": -30.314647674560547, "step": 281 }, { "epoch": 0.31333333333333335, "grad_norm": 0.3719119131565094, "learning_rate": 4.959589316408432e-05, "logits/chosen": 0.5154537558555603, "logits/rejected": 0.44133782386779785, "logps/chosen": -250.6259002685547, "logps/rejected": -340.27349853515625, "loss": 0.0075, "rewards/accuracies": 1.0, "rewards/chosen": -19.139915466308594, "rewards/margins": 7.266761302947998, "rewards/rejected": -26.40667724609375, "step": 282 }, { "epoch": 0.31444444444444447, "grad_norm": 0.011114238761365414, "learning_rate": 4.959039152502539e-05, "logits/chosen": 0.49303552508354187, "logits/rejected": 0.49824458360671997, "logps/chosen": -419.1337890625, "logps/rejected": -546.1395874023438, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -32.04513168334961, "rewards/margins": 10.869523048400879, "rewards/rejected": -42.91465759277344, "step": 283 }, { "epoch": 0.31555555555555553, "grad_norm": 1.296866774559021, "learning_rate": 4.958485299751743e-05, "logits/chosen": 0.33269864320755005, "logits/rejected": 0.32513028383255005, "logps/chosen": -182.2707977294922, "logps/rejected": -578.83837890625, "loss": 0.0309, "rewards/accuracies": 1.0, "rewards/chosen": -10.903726577758789, "rewards/margins": 30.43411636352539, "rewards/rejected": -41.33784484863281, "step": 284 }, { "epoch": 0.31666666666666665, "grad_norm": 4.881822109222412, "learning_rate": 4.957927758986888e-05, "logits/chosen": 0.33238035440444946, "logits/rejected": 0.31468528509140015, "logps/chosen": -165.8448944091797, "logps/rejected": -183.73388671875, "loss": 0.1859, "rewards/accuracies": 1.0, "rewards/chosen": -10.76291275024414, "rewards/margins": 1.9590213298797607, "rewards/rejected": -12.72193431854248, "step": 285 }, { "epoch": 0.31777777777777777, "grad_norm": 104.6271743774414, "learning_rate": 4.9573665310443484e-05, "logits/chosen": -0.10227087885141373, "logits/rejected": -0.10868105292320251, "logps/chosen": -154.33819580078125, "logps/rejected": -150.67433166503906, "loss": 1.7459, "rewards/accuracies": 0.5, "rewards/chosen": -10.636112213134766, "rewards/margins": -0.6513490676879883, "rewards/rejected": -9.984763145446777, "step": 286 }, { "epoch": 0.3188888888888889, "grad_norm": 0.22999466955661774, "learning_rate": 4.9568016167660334e-05, "logits/chosen": 0.1850919872522354, "logits/rejected": 0.2808927893638611, "logps/chosen": -205.54525756835938, "logps/rejected": -456.02496337890625, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": -14.041083335876465, "rewards/margins": 18.58612823486328, "rewards/rejected": -32.62721252441406, "step": 287 }, { "epoch": 0.32, "grad_norm": 107.84134674072266, "learning_rate": 4.956233016999379e-05, "logits/chosen": 0.15680812299251556, "logits/rejected": 0.12576663494110107, "logps/chosen": -458.26190185546875, "logps/rejected": -638.0904541015625, "loss": 4.1225, "rewards/accuracies": 0.5, "rewards/chosen": -36.76569747924805, "rewards/margins": 15.048696517944336, "rewards/rejected": -51.81439208984375, "step": 288 }, { "epoch": 0.3211111111111111, "grad_norm": 0.7204978466033936, "learning_rate": 4.955660732597351e-05, "logits/chosen": -0.019451133906841278, "logits/rejected": -0.025101397186517715, "logps/chosen": -299.4198303222656, "logps/rejected": -576.7284545898438, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": -21.342098236083984, "rewards/margins": 21.939929962158203, "rewards/rejected": -43.28202819824219, "step": 289 }, { "epoch": 0.32222222222222224, "grad_norm": 0.003123871749266982, "learning_rate": 4.955084764418443e-05, "logits/chosen": -0.11471034586429596, "logits/rejected": -0.11877584457397461, "logps/chosen": -738.6446533203125, "logps/rejected": -960.8284912109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -66.6318359375, "rewards/margins": 12.158294677734375, "rewards/rejected": -78.79013061523438, "step": 290 }, { "epoch": 0.3233333333333333, "grad_norm": 0.015370378270745277, "learning_rate": 4.954505113326674e-05, "logits/chosen": -0.022081922739744186, "logits/rejected": -0.007314602844417095, "logps/chosen": -142.4644317626953, "logps/rejected": -263.9211120605469, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -9.572524070739746, "rewards/margins": 9.484514236450195, "rewards/rejected": -19.057037353515625, "step": 291 }, { "epoch": 0.3244444444444444, "grad_norm": 0.4668368995189667, "learning_rate": 4.953921780191588e-05, "logits/chosen": 0.0201495960354805, "logits/rejected": -0.01548599824309349, "logps/chosen": -709.4041748046875, "logps/rejected": -929.292724609375, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -59.230682373046875, "rewards/margins": 17.667753219604492, "rewards/rejected": -76.8984375, "step": 292 }, { "epoch": 0.32555555555555554, "grad_norm": 7.334802150726318, "learning_rate": 4.953334765888254e-05, "logits/chosen": -0.3433140516281128, "logits/rejected": -0.3498418927192688, "logps/chosen": -684.1629638671875, "logps/rejected": -968.5575561523438, "loss": 0.0165, "rewards/accuracies": 1.0, "rewards/chosen": -54.933204650878906, "rewards/margins": 24.727670669555664, "rewards/rejected": -79.66087341308594, "step": 293 }, { "epoch": 0.32666666666666666, "grad_norm": 38.0053825378418, "learning_rate": 4.9527440712972594e-05, "logits/chosen": -0.015307029709219933, "logits/rejected": -0.04292779788374901, "logps/chosen": -537.14697265625, "logps/rejected": -614.2705078125, "loss": 0.0617, "rewards/accuracies": 1.0, "rewards/chosen": -40.78852462768555, "rewards/margins": 7.037482738494873, "rewards/rejected": -47.82600402832031, "step": 294 }, { "epoch": 0.3277777777777778, "grad_norm": 0.044104620814323425, "learning_rate": 4.952149697304716e-05, "logits/chosen": 0.04197852686047554, "logits/rejected": 0.00028145313262939453, "logps/chosen": -316.2242126464844, "logps/rejected": -475.84710693359375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -26.282773971557617, "rewards/margins": 11.122703552246094, "rewards/rejected": -37.405479431152344, "step": 295 }, { "epoch": 0.3288888888888889, "grad_norm": 834.1669311523438, "learning_rate": 4.9515516448022554e-05, "logits/chosen": -0.3256528973579407, "logits/rejected": -0.3578278422355652, "logps/chosen": -1068.56689453125, "logps/rejected": -786.950439453125, "loss": 18.1145, "rewards/accuracies": 0.5, "rewards/chosen": -84.56151580810547, "rewards/margins": -16.48114776611328, "rewards/rejected": -68.08036804199219, "step": 296 }, { "epoch": 0.33, "grad_norm": 1.430393099784851, "learning_rate": 4.9509499146870236e-05, "logits/chosen": -0.28044256567955017, "logits/rejected": -0.28648126125335693, "logps/chosen": -400.3743896484375, "logps/rejected": -503.04034423828125, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -32.01557159423828, "rewards/margins": 7.130472183227539, "rewards/rejected": -39.14604568481445, "step": 297 }, { "epoch": 0.33111111111111113, "grad_norm": 3.371788501739502, "learning_rate": 4.950344507861687e-05, "logits/chosen": -0.03414088115096092, "logits/rejected": -0.04092078283429146, "logps/chosen": -147.55136108398438, "logps/rejected": -192.65191650390625, "loss": 0.0243, "rewards/accuracies": 1.0, "rewards/chosen": -8.843978881835938, "rewards/margins": 3.7995357513427734, "rewards/rejected": -12.643514633178711, "step": 298 }, { "epoch": 0.3322222222222222, "grad_norm": 528.6290283203125, "learning_rate": 4.949735425234426e-05, "logits/chosen": -0.3585573136806488, "logits/rejected": -0.3377287983894348, "logps/chosen": -824.890625, "logps/rejected": -891.2520751953125, "loss": 1.7982, "rewards/accuracies": 0.5, "rewards/chosen": -65.05032348632812, "rewards/margins": 4.745350360870361, "rewards/rejected": -69.79566955566406, "step": 299 }, { "epoch": 0.3333333333333333, "grad_norm": 697.4292602539062, "learning_rate": 4.949122667718935e-05, "logits/chosen": -0.4188559055328369, "logits/rejected": -0.4424627423286438, "logps/chosen": -388.2839660644531, "logps/rejected": -500.8250427246094, "loss": 4.6508, "rewards/accuracies": 0.5, "rewards/chosen": -30.42583465576172, "rewards/margins": 10.742940902709961, "rewards/rejected": -41.16877365112305, "step": 300 }, { "epoch": 0.33444444444444443, "grad_norm": 37.65538787841797, "learning_rate": 4.948506236234422e-05, "logits/chosen": -0.2633706331253052, "logits/rejected": -0.20921888947486877, "logps/chosen": -335.9444580078125, "logps/rejected": -354.047607421875, "loss": 0.503, "rewards/accuracies": 0.5, "rewards/chosen": -25.981929779052734, "rewards/margins": 2.164139747619629, "rewards/rejected": -28.146068572998047, "step": 301 }, { "epoch": 0.33555555555555555, "grad_norm": 737.0805053710938, "learning_rate": 4.947886131705607e-05, "logits/chosen": -0.5584155321121216, "logits/rejected": -0.5183042883872986, "logps/chosen": -484.9677734375, "logps/rejected": -494.8004150390625, "loss": 2.7389, "rewards/accuracies": 0.5, "rewards/chosen": -40.588661193847656, "rewards/margins": 0.8513965606689453, "rewards/rejected": -41.44005584716797, "step": 302 }, { "epoch": 0.33666666666666667, "grad_norm": 100.43080139160156, "learning_rate": 4.947262355062717e-05, "logits/chosen": -0.2496279776096344, "logits/rejected": -0.23672229051589966, "logps/chosen": -347.4626159667969, "logps/rejected": -390.5146484375, "loss": 1.9583, "rewards/accuracies": 0.5, "rewards/chosen": -29.364665985107422, "rewards/margins": 3.33258056640625, "rewards/rejected": -32.69724655151367, "step": 303 }, { "epoch": 0.3377777777777778, "grad_norm": 0.8107805252075195, "learning_rate": 4.9466349072414905e-05, "logits/chosen": -0.2638104557991028, "logits/rejected": -0.2456619292497635, "logps/chosen": -657.9945678710938, "logps/rejected": -801.16552734375, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -60.06109619140625, "rewards/margins": 9.855857849121094, "rewards/rejected": -69.91695404052734, "step": 304 }, { "epoch": 0.3388888888888889, "grad_norm": 130.14862060546875, "learning_rate": 4.946003789183173e-05, "logits/chosen": -0.3958190679550171, "logits/rejected": -0.38962501287460327, "logps/chosen": -356.509521484375, "logps/rejected": -418.33721923828125, "loss": 0.7993, "rewards/accuracies": 0.5, "rewards/chosen": -29.859291076660156, "rewards/margins": 4.832291603088379, "rewards/rejected": -34.69158172607422, "step": 305 }, { "epoch": 0.34, "grad_norm": 1.7909403344021357e-12, "learning_rate": 4.9453690018345144e-05, "logits/chosen": -0.21372400224208832, "logits/rejected": -0.2239450067281723, "logps/chosen": -504.6278381347656, "logps/rejected": -992.6201171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -42.29572677612305, "rewards/margins": 39.386138916015625, "rewards/rejected": -81.68186950683594, "step": 306 }, { "epoch": 0.3411111111111111, "grad_norm": 0.25239798426628113, "learning_rate": 4.944730546147769e-05, "logits/chosen": -0.4186308979988098, "logits/rejected": -0.44803786277770996, "logps/chosen": -357.01519775390625, "logps/rejected": -646.8837280273438, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -29.06670379638672, "rewards/margins": 24.451488494873047, "rewards/rejected": -53.51819610595703, "step": 307 }, { "epoch": 0.3422222222222222, "grad_norm": 0.4576690196990967, "learning_rate": 4.944088423080695e-05, "logits/chosen": -0.3369998335838318, "logits/rejected": -0.3089853525161743, "logps/chosen": -310.8232116699219, "logps/rejected": -506.07269287109375, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": -26.051651000976562, "rewards/margins": 15.123618125915527, "rewards/rejected": -41.175270080566406, "step": 308 }, { "epoch": 0.3433333333333333, "grad_norm": 1099.982177734375, "learning_rate": 4.943442633596552e-05, "logits/chosen": -0.633461058139801, "logits/rejected": -0.7285591959953308, "logps/chosen": -663.1358642578125, "logps/rejected": -562.1896362304688, "loss": 11.7188, "rewards/accuracies": 0.5, "rewards/chosen": -56.657894134521484, "rewards/margins": -10.496763229370117, "rewards/rejected": -46.161128997802734, "step": 309 }, { "epoch": 0.34444444444444444, "grad_norm": 120.3612289428711, "learning_rate": 4.9427931786641e-05, "logits/chosen": -0.6433749198913574, "logits/rejected": -0.6652141213417053, "logps/chosen": -365.67547607421875, "logps/rejected": -376.4675598144531, "loss": 0.67, "rewards/accuracies": 0.5, "rewards/chosen": -26.57131576538086, "rewards/margins": 3.053093433380127, "rewards/rejected": -29.624408721923828, "step": 310 }, { "epoch": 0.34555555555555556, "grad_norm": 1.166981816291809, "learning_rate": 4.9421400592575957e-05, "logits/chosen": -0.5554991960525513, "logits/rejected": -0.5367240309715271, "logps/chosen": -602.1057739257812, "logps/rejected": -781.11181640625, "loss": 0.0041, "rewards/accuracies": 1.0, "rewards/chosen": -49.00614929199219, "rewards/margins": 15.2838134765625, "rewards/rejected": -64.28996276855469, "step": 311 }, { "epoch": 0.3466666666666667, "grad_norm": 0.8258697986602783, "learning_rate": 4.941483276356795e-05, "logits/chosen": -0.2965262532234192, "logits/rejected": -0.30186033248901367, "logps/chosen": -362.395751953125, "logps/rejected": -463.6152038574219, "loss": 0.0039, "rewards/accuracies": 1.0, "rewards/chosen": -28.48711395263672, "rewards/margins": 8.371428489685059, "rewards/rejected": -36.858543395996094, "step": 312 }, { "epoch": 0.3477777777777778, "grad_norm": 0.006670548114925623, "learning_rate": 4.940822830946948e-05, "logits/chosen": -0.5734964609146118, "logits/rejected": -0.5939089059829712, "logps/chosen": -306.355224609375, "logps/rejected": -539.45947265625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -25.031658172607422, "rewards/margins": 19.21436309814453, "rewards/rejected": -44.24602127075195, "step": 313 }, { "epoch": 0.3488888888888889, "grad_norm": 53.73747634887695, "learning_rate": 4.9401587240188e-05, "logits/chosen": -0.4031756520271301, "logits/rejected": -0.4042486548423767, "logps/chosen": -254.24053955078125, "logps/rejected": -284.7808532714844, "loss": 0.3144, "rewards/accuracies": 1.0, "rewards/chosen": -19.84695816040039, "rewards/margins": 2.3661999702453613, "rewards/rejected": -22.213157653808594, "step": 314 }, { "epoch": 0.35, "grad_norm": 0.005522846709936857, "learning_rate": 4.9394909565685894e-05, "logits/chosen": -0.4725743532180786, "logits/rejected": -0.5043455958366394, "logps/chosen": -601.2261962890625, "logps/rejected": -923.0452270507812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -49.17688751220703, "rewards/margins": 27.4329891204834, "rewards/rejected": -76.60987854003906, "step": 315 }, { "epoch": 0.3511111111111111, "grad_norm": 0.8047577738761902, "learning_rate": 4.9388195295980446e-05, "logits/chosen": -0.47970202565193176, "logits/rejected": -0.4910230040550232, "logps/chosen": -380.84765625, "logps/rejected": -521.6475830078125, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": -28.17324447631836, "rewards/margins": 11.876016616821289, "rewards/rejected": -40.04926300048828, "step": 316 }, { "epoch": 0.3522222222222222, "grad_norm": 0.02119472436606884, "learning_rate": 4.9381444441143834e-05, "logits/chosen": -0.3841322660446167, "logits/rejected": -0.3866615891456604, "logps/chosen": -337.6022644042969, "logps/rejected": -525.2113037109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -28.111652374267578, "rewards/margins": 15.842277526855469, "rewards/rejected": -43.95392990112305, "step": 317 }, { "epoch": 0.35333333333333333, "grad_norm": 179.27618408203125, "learning_rate": 4.9374657011303135e-05, "logits/chosen": -0.3366695046424866, "logits/rejected": -0.3157649040222168, "logps/chosen": -564.8817138671875, "logps/rejected": -466.992431640625, "loss": 8.0192, "rewards/accuracies": 0.0, "rewards/chosen": -45.840904235839844, "rewards/margins": -7.859552383422852, "rewards/rejected": -37.981353759765625, "step": 318 }, { "epoch": 0.35444444444444445, "grad_norm": 26.62212562561035, "learning_rate": 4.936783301664028e-05, "logits/chosen": -0.45790213346481323, "logits/rejected": -0.4602280259132385, "logps/chosen": -299.5520324707031, "logps/rejected": -307.8302001953125, "loss": 0.3429, "rewards/accuracies": 1.0, "rewards/chosen": -22.36705780029297, "rewards/margins": 1.3245620727539062, "rewards/rejected": -23.691619873046875, "step": 319 }, { "epoch": 0.35555555555555557, "grad_norm": 8.681018698553089e-06, "learning_rate": 4.9360972467392056e-05, "logits/chosen": -0.6234016418457031, "logits/rejected": -0.6459677815437317, "logps/chosen": -397.7835388183594, "logps/rejected": -647.90283203125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -31.114112854003906, "rewards/margins": 21.513507843017578, "rewards/rejected": -52.62761688232422, "step": 320 }, { "epoch": 0.3566666666666667, "grad_norm": 62.91702651977539, "learning_rate": 4.935407537385009e-05, "logits/chosen": -0.4898855686187744, "logits/rejected": -0.5585818290710449, "logps/chosen": -417.1715087890625, "logps/rejected": -805.86669921875, "loss": 0.6468, "rewards/accuracies": 0.5, "rewards/chosen": -33.197200775146484, "rewards/margins": 32.901023864746094, "rewards/rejected": -66.09822845458984, "step": 321 }, { "epoch": 0.35777777777777775, "grad_norm": 0.15051259100437164, "learning_rate": 4.934714174636082e-05, "logits/chosen": -0.12757551670074463, "logits/rejected": -0.13986535370349884, "logps/chosen": -216.11009216308594, "logps/rejected": -326.43121337890625, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": -15.440340042114258, "rewards/margins": 9.735942840576172, "rewards/rejected": -25.17628288269043, "step": 322 }, { "epoch": 0.35888888888888887, "grad_norm": 485.6281433105469, "learning_rate": 4.934017159532549e-05, "logits/chosen": -0.4888547658920288, "logits/rejected": -0.4897529184818268, "logps/chosen": -584.5462646484375, "logps/rejected": -626.3961181640625, "loss": 10.4383, "rewards/accuracies": 0.5, "rewards/chosen": -47.118385314941406, "rewards/margins": 3.411410331726074, "rewards/rejected": -50.52979278564453, "step": 323 }, { "epoch": 0.36, "grad_norm": 13.607995986938477, "learning_rate": 4.933316493120015e-05, "logits/chosen": -0.24895811080932617, "logits/rejected": -0.27632343769073486, "logps/chosen": -304.6307067871094, "logps/rejected": -642.9656982421875, "loss": 0.0974, "rewards/accuracies": 1.0, "rewards/chosen": -23.508487701416016, "rewards/margins": 28.728219985961914, "rewards/rejected": -52.2367057800293, "step": 324 }, { "epoch": 0.3611111111111111, "grad_norm": 138.2022705078125, "learning_rate": 4.9326121764495596e-05, "logits/chosen": -0.30515310168266296, "logits/rejected": -0.3140343427658081, "logps/chosen": -369.1429443359375, "logps/rejected": -373.081787109375, "loss": 4.2285, "rewards/accuracies": 0.5, "rewards/chosen": -27.850345611572266, "rewards/margins": 0.28951454162597656, "rewards/rejected": -28.139860153198242, "step": 325 }, { "epoch": 0.3622222222222222, "grad_norm": 0.000752690655644983, "learning_rate": 4.9319042105777415e-05, "logits/chosen": -0.2531960606575012, "logits/rejected": -0.23363250494003296, "logps/chosen": -193.6144256591797, "logps/rejected": -330.71563720703125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.625802040100098, "rewards/margins": 11.905756950378418, "rewards/rejected": -24.531558990478516, "step": 326 }, { "epoch": 0.36333333333333334, "grad_norm": 3.2646324634552, "learning_rate": 4.931192596566591e-05, "logits/chosen": -0.11494448035955429, "logits/rejected": -0.07009037584066391, "logps/chosen": -453.96832275390625, "logps/rejected": -495.58880615234375, "loss": 0.0195, "rewards/accuracies": 1.0, "rewards/chosen": -33.923179626464844, "rewards/margins": 4.714908599853516, "rewards/rejected": -38.638084411621094, "step": 327 }, { "epoch": 0.36444444444444446, "grad_norm": 87.63572692871094, "learning_rate": 4.930477335483611e-05, "logits/chosen": -0.003595806658267975, "logits/rejected": 0.02652040868997574, "logps/chosen": -450.75048828125, "logps/rejected": -333.83172607421875, "loss": 11.0321, "rewards/accuracies": 0.5, "rewards/chosen": -35.38301086425781, "rewards/margins": -10.621012687683105, "rewards/rejected": -24.761999130249023, "step": 328 }, { "epoch": 0.3655555555555556, "grad_norm": 66.11498260498047, "learning_rate": 4.9297584284017774e-05, "logits/chosen": 0.15091046690940857, "logits/rejected": 0.15120144188404083, "logps/chosen": -407.39605712890625, "logps/rejected": -471.1376953125, "loss": 2.8644, "rewards/accuracies": 0.5, "rewards/chosen": -32.328369140625, "rewards/margins": 4.046537399291992, "rewards/rejected": -36.37490463256836, "step": 329 }, { "epoch": 0.36666666666666664, "grad_norm": 0.0003161925415042788, "learning_rate": 4.929035876399535e-05, "logits/chosen": 0.13186562061309814, "logits/rejected": 0.10898102819919586, "logps/chosen": -273.8263244628906, "logps/rejected": -476.98822021484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -19.486705780029297, "rewards/margins": 16.602527618408203, "rewards/rejected": -36.0892333984375, "step": 330 }, { "epoch": 0.36777777777777776, "grad_norm": 53.771949768066406, "learning_rate": 4.9283096805607945e-05, "logits/chosen": 0.06943248212337494, "logits/rejected": 0.10434553027153015, "logps/chosen": -336.79827880859375, "logps/rejected": -323.91546630859375, "loss": 1.1029, "rewards/accuracies": 0.5, "rewards/chosen": -23.146154403686523, "rewards/margins": 0.8324871063232422, "rewards/rejected": -23.978641510009766, "step": 331 }, { "epoch": 0.3688888888888889, "grad_norm": 54.22163772583008, "learning_rate": 4.9275798419749353e-05, "logits/chosen": 0.22479934990406036, "logits/rejected": 0.21770404279232025, "logps/chosen": -290.7532653808594, "logps/rejected": -263.16717529296875, "loss": 2.5697, "rewards/accuracies": 0.5, "rewards/chosen": -19.117900848388672, "rewards/margins": -1.0918660163879395, "rewards/rejected": -18.02603530883789, "step": 332 }, { "epoch": 0.37, "grad_norm": 0.07660724967718124, "learning_rate": 4.9268463617368e-05, "logits/chosen": 0.22500908374786377, "logits/rejected": 0.23258163034915924, "logps/chosen": -226.66058349609375, "logps/rejected": -397.6791076660156, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -16.213321685791016, "rewards/margins": 11.778264045715332, "rewards/rejected": -27.99158477783203, "step": 333 }, { "epoch": 0.3711111111111111, "grad_norm": 76.51652526855469, "learning_rate": 4.926109240946695e-05, "logits/chosen": 0.41982752084732056, "logits/rejected": 0.4143088459968567, "logps/chosen": -247.0833740234375, "logps/rejected": -273.891357421875, "loss": 1.3246, "rewards/accuracies": 0.5, "rewards/chosen": -16.164993286132812, "rewards/margins": 3.211289405822754, "rewards/rejected": -19.376283645629883, "step": 334 }, { "epoch": 0.37222222222222223, "grad_norm": 3.5699779987335205, "learning_rate": 4.925368480710385e-05, "logits/chosen": 0.24599310755729675, "logits/rejected": 0.23430922627449036, "logps/chosen": -182.0816650390625, "logps/rejected": -422.48046875, "loss": 0.095, "rewards/accuracies": 1.0, "rewards/chosen": -10.654780387878418, "rewards/margins": 17.14912986755371, "rewards/rejected": -27.803909301757812, "step": 335 }, { "epoch": 0.37333333333333335, "grad_norm": 0.4823857247829437, "learning_rate": 4.924624082139099e-05, "logits/chosen": 0.5802302956581116, "logits/rejected": 0.5555367469787598, "logps/chosen": -264.5758361816406, "logps/rejected": -343.7125244140625, "loss": 0.0065, "rewards/accuracies": 1.0, "rewards/chosen": -17.544965744018555, "rewards/margins": 5.6952738761901855, "rewards/rejected": -23.240238189697266, "step": 336 }, { "epoch": 0.37444444444444447, "grad_norm": 4.558318138122559, "learning_rate": 4.923876046349519e-05, "logits/chosen": 0.42339786887168884, "logits/rejected": 0.45372337102890015, "logps/chosen": -148.55372619628906, "logps/rejected": -190.2027587890625, "loss": 0.1391, "rewards/accuracies": 1.0, "rewards/chosen": -9.137714385986328, "rewards/margins": 3.333714723587036, "rewards/rejected": -12.471429824829102, "step": 337 }, { "epoch": 0.37555555555555553, "grad_norm": 0.07500973343849182, "learning_rate": 4.923124374463789e-05, "logits/chosen": 0.3591451644897461, "logits/rejected": 0.3496806025505066, "logps/chosen": -493.4720458984375, "logps/rejected": -598.7730102539062, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -36.92036056518555, "rewards/margins": 8.86223316192627, "rewards/rejected": -45.7825927734375, "step": 338 }, { "epoch": 0.37666666666666665, "grad_norm": 5.24008646607399e-06, "learning_rate": 4.922369067609501e-05, "logits/chosen": 0.4637269377708435, "logits/rejected": 0.5189558863639832, "logps/chosen": -363.40557861328125, "logps/rejected": -1004.1218872070312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -27.057395935058594, "rewards/margins": 47.22442626953125, "rewards/rejected": -74.28182220458984, "step": 339 }, { "epoch": 0.37777777777777777, "grad_norm": 0.015302000567317009, "learning_rate": 4.921610126919706e-05, "logits/chosen": 0.48092421889305115, "logits/rejected": 0.4827513098716736, "logps/chosen": -321.67572021484375, "logps/rejected": -571.0349731445312, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -20.672042846679688, "rewards/margins": 19.891563415527344, "rewards/rejected": -40.56360626220703, "step": 340 }, { "epoch": 0.3788888888888889, "grad_norm": 7.527382877015043e-07, "learning_rate": 4.920847553532902e-05, "logits/chosen": 0.4670676589012146, "logits/rejected": 0.4727013111114502, "logps/chosen": -354.81158447265625, "logps/rejected": -677.5972900390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -24.188941955566406, "rewards/margins": 21.339466094970703, "rewards/rejected": -45.52840805053711, "step": 341 }, { "epoch": 0.38, "grad_norm": 60.35717010498047, "learning_rate": 4.9200813485930375e-05, "logits/chosen": 0.8293460011482239, "logits/rejected": 0.8142740726470947, "logps/chosen": -283.8396301269531, "logps/rejected": -390.9620056152344, "loss": 0.5518, "rewards/accuracies": 0.5, "rewards/chosen": -19.84065055847168, "rewards/margins": 7.117458820343018, "rewards/rejected": -26.95810890197754, "step": 342 }, { "epoch": 0.3811111111111111, "grad_norm": 0.022123005241155624, "learning_rate": 4.919311513249509e-05, "logits/chosen": 0.5353503227233887, "logits/rejected": 0.5935667753219604, "logps/chosen": -107.7679672241211, "logps/rejected": -405.0362548828125, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -6.137140274047852, "rewards/margins": 19.899213790893555, "rewards/rejected": -26.036354064941406, "step": 343 }, { "epoch": 0.38222222222222224, "grad_norm": 13.305831909179688, "learning_rate": 4.9185380486571595e-05, "logits/chosen": 0.417242169380188, "logits/rejected": 0.40047985315322876, "logps/chosen": -718.3516845703125, "logps/rejected": -657.9684448242188, "loss": 0.2294, "rewards/accuracies": 1.0, "rewards/chosen": -48.884212493896484, "rewards/margins": 2.9786124229431152, "rewards/rejected": -51.862823486328125, "step": 344 }, { "epoch": 0.38333333333333336, "grad_norm": 0.1985122412443161, "learning_rate": 4.917760955976277e-05, "logits/chosen": 0.4768250584602356, "logits/rejected": 0.4910718500614166, "logps/chosen": -313.79095458984375, "logps/rejected": -388.8758239746094, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": -20.15602684020996, "rewards/margins": 7.311872482299805, "rewards/rejected": -27.467899322509766, "step": 345 }, { "epoch": 0.3844444444444444, "grad_norm": 24.55601692199707, "learning_rate": 4.916980236372589e-05, "logits/chosen": 0.7100859880447388, "logits/rejected": 0.6604170799255371, "logps/chosen": -188.57688903808594, "logps/rejected": -276.72174072265625, "loss": 0.4508, "rewards/accuracies": 0.5, "rewards/chosen": -11.624773025512695, "rewards/margins": 5.66764497756958, "rewards/rejected": -17.292417526245117, "step": 346 }, { "epoch": 0.38555555555555554, "grad_norm": 2.1609408855438232, "learning_rate": 4.916195891017268e-05, "logits/chosen": 0.25286754965782166, "logits/rejected": 0.2612367868423462, "logps/chosen": -377.59564208984375, "logps/rejected": -427.7490539550781, "loss": 0.0171, "rewards/accuracies": 1.0, "rewards/chosen": -29.99429702758789, "rewards/margins": 5.161617755889893, "rewards/rejected": -35.155914306640625, "step": 347 }, { "epoch": 0.38666666666666666, "grad_norm": 0.015138383954763412, "learning_rate": 4.915407921086921e-05, "logits/chosen": 0.6210384368896484, "logits/rejected": 0.6439251899719238, "logps/chosen": -132.2225799560547, "logps/rejected": -407.763427734375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -7.642600059509277, "rewards/margins": 19.39201545715332, "rewards/rejected": -27.03461456298828, "step": 348 }, { "epoch": 0.3877777777777778, "grad_norm": 77.52584838867188, "learning_rate": 4.914616327763595e-05, "logits/chosen": 0.47187259793281555, "logits/rejected": 0.37680625915527344, "logps/chosen": -227.99000549316406, "logps/rejected": -158.4286346435547, "loss": 8.4082, "rewards/accuracies": 0.5, "rewards/chosen": -15.413931846618652, "rewards/margins": -6.207904815673828, "rewards/rejected": -9.206026077270508, "step": 349 }, { "epoch": 0.3888888888888889, "grad_norm": 6.353384971618652, "learning_rate": 4.9138211122347736e-05, "logits/chosen": 0.5024459362030029, "logits/rejected": 0.5228053331375122, "logps/chosen": -116.04466247558594, "logps/rejected": -136.12924194335938, "loss": 0.2335, "rewards/accuracies": 1.0, "rewards/chosen": -6.643399715423584, "rewards/margins": 1.3377931118011475, "rewards/rejected": -7.981192588806152, "step": 350 }, { "epoch": 0.39, "grad_norm": 0.6801918745040894, "learning_rate": 4.913022275693372e-05, "logits/chosen": 0.4642636477947235, "logits/rejected": 0.4464987516403198, "logps/chosen": -140.9134979248047, "logps/rejected": -248.51031494140625, "loss": 0.0094, "rewards/accuracies": 1.0, "rewards/chosen": -8.01346206665039, "rewards/margins": 8.736809730529785, "rewards/rejected": -16.75027084350586, "step": 351 }, { "epoch": 0.39111111111111113, "grad_norm": 1.6831358152558096e-05, "learning_rate": 4.9122198193377374e-05, "logits/chosen": 0.42786139249801636, "logits/rejected": 0.42461901903152466, "logps/chosen": -246.0947265625, "logps/rejected": -438.01519775390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -16.40835952758789, "rewards/margins": 15.8480863571167, "rewards/rejected": -32.256446838378906, "step": 352 }, { "epoch": 0.39222222222222225, "grad_norm": 0.21234169602394104, "learning_rate": 4.911413744371648e-05, "logits/chosen": 0.7513759732246399, "logits/rejected": 0.752644419670105, "logps/chosen": -167.1126708984375, "logps/rejected": -229.3471221923828, "loss": 0.0062, "rewards/accuracies": 1.0, "rewards/chosen": -9.327463150024414, "rewards/margins": 5.147858619689941, "rewards/rejected": -14.475322723388672, "step": 353 }, { "epoch": 0.3933333333333333, "grad_norm": 22.859529495239258, "learning_rate": 4.91060405200431e-05, "logits/chosen": 0.5945161581039429, "logits/rejected": 0.6194489002227783, "logps/chosen": -174.85317993164062, "logps/rejected": -225.2041473388672, "loss": 0.8092, "rewards/accuracies": 0.5, "rewards/chosen": -10.782807350158691, "rewards/margins": 3.9059300422668457, "rewards/rejected": -14.688737869262695, "step": 354 }, { "epoch": 0.39444444444444443, "grad_norm": 0.1215277835726738, "learning_rate": 4.9097907434503564e-05, "logits/chosen": 0.4054487347602844, "logits/rejected": 0.4333512783050537, "logps/chosen": -337.9779968261719, "logps/rejected": -567.408447265625, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": -23.029836654663086, "rewards/margins": 15.432072639465332, "rewards/rejected": -38.461910247802734, "step": 355 }, { "epoch": 0.39555555555555555, "grad_norm": 0.12345419824123383, "learning_rate": 4.9089738199298446e-05, "logits/chosen": 0.35904762148857117, "logits/rejected": 0.35216522216796875, "logps/chosen": -188.30490112304688, "logps/rejected": -262.8484191894531, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": -12.060041427612305, "rewards/margins": 5.893759250640869, "rewards/rejected": -17.953800201416016, "step": 356 }, { "epoch": 0.39666666666666667, "grad_norm": 15.187692642211914, "learning_rate": 4.908153282668255e-05, "logits/chosen": 0.5943483114242554, "logits/rejected": 0.6286430954933167, "logps/chosen": -175.2889862060547, "logps/rejected": -228.97183227539062, "loss": 1.3472, "rewards/accuracies": 0.5, "rewards/chosen": -11.143731117248535, "rewards/margins": 2.5937392711639404, "rewards/rejected": -13.737470626831055, "step": 357 }, { "epoch": 0.3977777777777778, "grad_norm": 6.0488778430567436e-09, "learning_rate": 4.907329132896489e-05, "logits/chosen": 0.4361143410205841, "logits/rejected": 0.45137929916381836, "logps/chosen": -278.4703674316406, "logps/rejected": -593.40087890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -18.792621612548828, "rewards/margins": 26.110389709472656, "rewards/rejected": -44.903011322021484, "step": 358 }, { "epoch": 0.3988888888888889, "grad_norm": 69.67054748535156, "learning_rate": 4.906501371850867e-05, "logits/chosen": 0.36390823125839233, "logits/rejected": 0.3627249002456665, "logps/chosen": -284.2291259765625, "logps/rejected": -276.0118408203125, "loss": 1.558, "rewards/accuracies": 0.0, "rewards/chosen": -21.700334548950195, "rewards/margins": -1.302311897277832, "rewards/rejected": -20.39802360534668, "step": 359 }, { "epoch": 0.4, "grad_norm": 0.3287867307662964, "learning_rate": 4.905670000773126e-05, "logits/chosen": 0.005119726061820984, "logits/rejected": 0.12496720999479294, "logps/chosen": -148.80955505371094, "logps/rejected": -251.1022186279297, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -8.329236030578613, "rewards/margins": 6.870147228240967, "rewards/rejected": -15.199382781982422, "step": 360 }, { "epoch": 0.4011111111111111, "grad_norm": 1.9425104856491089, "learning_rate": 4.904835020910422e-05, "logits/chosen": 0.3877089023590088, "logits/rejected": 0.40360698103904724, "logps/chosen": -203.1226806640625, "logps/rejected": -257.43560791015625, "loss": 0.0867, "rewards/accuracies": 1.0, "rewards/chosen": -12.790562629699707, "rewards/margins": 4.715620040893555, "rewards/rejected": -17.506183624267578, "step": 361 }, { "epoch": 0.4022222222222222, "grad_norm": 0.008581283502280712, "learning_rate": 4.903996433515319e-05, "logits/chosen": 0.4351121187210083, "logits/rejected": 0.4313506484031677, "logps/chosen": -172.20541381835938, "logps/rejected": -394.08538818359375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -9.537007331848145, "rewards/margins": 17.715435028076172, "rewards/rejected": -27.25244140625, "step": 362 }, { "epoch": 0.4033333333333333, "grad_norm": 12.326796531677246, "learning_rate": 4.9031542398457974e-05, "logits/chosen": 0.3967075049877167, "logits/rejected": 0.4113192558288574, "logps/chosen": -126.78914642333984, "logps/rejected": -134.5423583984375, "loss": 0.6555, "rewards/accuracies": 0.5, "rewards/chosen": -6.587523937225342, "rewards/margins": 0.3467932939529419, "rewards/rejected": -6.934317111968994, "step": 363 }, { "epoch": 0.40444444444444444, "grad_norm": 34.76066589355469, "learning_rate": 4.9023084411652454e-05, "logits/chosen": 0.43208396434783936, "logits/rejected": 0.46297866106033325, "logps/chosen": -319.0078125, "logps/rejected": -336.6872253417969, "loss": 0.7557, "rewards/accuracies": 0.5, "rewards/chosen": -23.397789001464844, "rewards/margins": -0.019787311553955078, "rewards/rejected": -23.378002166748047, "step": 364 }, { "epoch": 0.40555555555555556, "grad_norm": 0.0012862966395914555, "learning_rate": 4.90145903874246e-05, "logits/chosen": 0.3929058313369751, "logits/rejected": 0.416513055562973, "logps/chosen": -556.0380249023438, "logps/rejected": -757.1921997070312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -35.430877685546875, "rewards/margins": 16.358253479003906, "rewards/rejected": -51.78913116455078, "step": 365 }, { "epoch": 0.4066666666666667, "grad_norm": 0.07337088882923126, "learning_rate": 4.900606033851642e-05, "logits/chosen": 0.3275752365589142, "logits/rejected": 0.3509056270122528, "logps/chosen": -225.3531494140625, "logps/rejected": -499.7837829589844, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -16.47944450378418, "rewards/margins": 21.660799026489258, "rewards/rejected": -38.14024353027344, "step": 366 }, { "epoch": 0.4077777777777778, "grad_norm": 0.8422836065292358, "learning_rate": 4.8997494277723994e-05, "logits/chosen": 0.42671459913253784, "logits/rejected": 0.46671929955482483, "logps/chosen": -394.9187316894531, "logps/rejected": -514.3138427734375, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -28.072311401367188, "rewards/margins": 8.789200782775879, "rewards/rejected": -36.86151123046875, "step": 367 }, { "epoch": 0.4088888888888889, "grad_norm": 2.5007171630859375, "learning_rate": 4.898889221789741e-05, "logits/chosen": 0.5091537237167358, "logits/rejected": 0.49444344639778137, "logps/chosen": -186.89108276367188, "logps/rejected": -289.908203125, "loss": 0.0364, "rewards/accuracies": 1.0, "rewards/chosen": -11.074460983276367, "rewards/margins": 8.201192855834961, "rewards/rejected": -19.275653839111328, "step": 368 }, { "epoch": 0.41, "grad_norm": 88.10014343261719, "learning_rate": 4.8980254171940746e-05, "logits/chosen": 0.2509012520313263, "logits/rejected": 0.1179523915052414, "logps/chosen": -475.2916564941406, "logps/rejected": -462.69451904296875, "loss": 7.1804, "rewards/accuracies": 0.5, "rewards/chosen": -31.608278274536133, "rewards/margins": 0.6212368011474609, "rewards/rejected": -32.229515075683594, "step": 369 }, { "epoch": 0.4111111111111111, "grad_norm": 0.15974661707878113, "learning_rate": 4.897158015281209e-05, "logits/chosen": 0.3052937984466553, "logits/rejected": 0.33098113536834717, "logps/chosen": -216.5129852294922, "logps/rejected": -312.7490234375, "loss": 0.004, "rewards/accuracies": 1.0, "rewards/chosen": -15.563617706298828, "rewards/margins": 6.93007230758667, "rewards/rejected": -22.493690490722656, "step": 370 }, { "epoch": 0.4122222222222222, "grad_norm": 50.91242599487305, "learning_rate": 4.896287017352348e-05, "logits/chosen": 0.24524685740470886, "logits/rejected": 0.22449800372123718, "logps/chosen": -510.0158996582031, "logps/rejected": -479.3099365234375, "loss": 3.8348, "rewards/accuracies": 0.5, "rewards/chosen": -39.51234436035156, "rewards/margins": -2.8765487670898438, "rewards/rejected": -36.63579559326172, "step": 371 }, { "epoch": 0.41333333333333333, "grad_norm": 5.480117321014404, "learning_rate": 4.8954124247140895e-05, "logits/chosen": 0.08634935319423676, "logits/rejected": 0.08604402840137482, "logps/chosen": -101.27082824707031, "logps/rejected": -248.50721740722656, "loss": 0.1519, "rewards/accuracies": 1.0, "rewards/chosen": -6.128851890563965, "rewards/margins": 10.667739868164062, "rewards/rejected": -16.796592712402344, "step": 372 }, { "epoch": 0.41444444444444445, "grad_norm": 2.794440507888794, "learning_rate": 4.8945342386784235e-05, "logits/chosen": 0.23729421198368073, "logits/rejected": 0.254472017288208, "logps/chosen": -201.5169219970703, "logps/rejected": -497.4725036621094, "loss": 0.0584, "rewards/accuracies": 1.0, "rewards/chosen": -14.267953872680664, "rewards/margins": 25.191783905029297, "rewards/rejected": -39.459739685058594, "step": 373 }, { "epoch": 0.41555555555555557, "grad_norm": 49.03179931640625, "learning_rate": 4.8936524605627324e-05, "logits/chosen": 0.1169009804725647, "logits/rejected": 0.09062185138463974, "logps/chosen": -275.3717956542969, "logps/rejected": -408.7578125, "loss": 2.0282, "rewards/accuracies": 0.5, "rewards/chosen": -19.67531967163086, "rewards/margins": 10.216899871826172, "rewards/rejected": -29.89221954345703, "step": 374 }, { "epoch": 0.4166666666666667, "grad_norm": 0.07108890265226364, "learning_rate": 4.892767091689786e-05, "logits/chosen": 0.29784607887268066, "logits/rejected": 0.2959267497062683, "logps/chosen": -436.90142822265625, "logps/rejected": -595.3516845703125, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -30.08844566345215, "rewards/margins": 15.261832237243652, "rewards/rejected": -45.350276947021484, "step": 375 }, { "epoch": 0.4177777777777778, "grad_norm": 6.419147491455078, "learning_rate": 4.8918781333877394e-05, "logits/chosen": 0.27116626501083374, "logits/rejected": 0.2931271195411682, "logps/chosen": -588.3807373046875, "logps/rejected": -683.500732421875, "loss": 0.1371, "rewards/accuracies": 1.0, "rewards/chosen": -47.335994720458984, "rewards/margins": 8.826996803283691, "rewards/rejected": -56.16299057006836, "step": 376 }, { "epoch": 0.41888888888888887, "grad_norm": 2.9378144361658087e-08, "learning_rate": 4.890985586990135e-05, "logits/chosen": 0.1648416668176651, "logits/rejected": 0.12888869643211365, "logps/chosen": -292.826171875, "logps/rejected": -641.8221435546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -19.1334285736084, "rewards/margins": 26.49222183227539, "rewards/rejected": -45.625648498535156, "step": 377 }, { "epoch": 0.42, "grad_norm": 0.09813285619020462, "learning_rate": 4.8900894538358944e-05, "logits/chosen": 0.20270754396915436, "logits/rejected": 0.21315526962280273, "logps/chosen": -113.46299743652344, "logps/rejected": -271.4941711425781, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -5.981195449829102, "rewards/margins": 13.276585578918457, "rewards/rejected": -19.257781982421875, "step": 378 }, { "epoch": 0.4211111111111111, "grad_norm": 6.526578426361084, "learning_rate": 4.8891897352693234e-05, "logits/chosen": 0.12485845386981964, "logits/rejected": 0.1318386197090149, "logps/chosen": -243.2113037109375, "logps/rejected": -440.0687561035156, "loss": 0.1538, "rewards/accuracies": 1.0, "rewards/chosen": -17.40934944152832, "rewards/margins": 11.134639739990234, "rewards/rejected": -28.543987274169922, "step": 379 }, { "epoch": 0.4222222222222222, "grad_norm": 8.610490798950195, "learning_rate": 4.888286432640104e-05, "logits/chosen": 0.2687956988811493, "logits/rejected": 0.3315019905567169, "logps/chosen": -284.8797912597656, "logps/rejected": -366.6783447265625, "loss": 0.2209, "rewards/accuracies": 1.0, "rewards/chosen": -19.486583709716797, "rewards/margins": 4.191601276397705, "rewards/rejected": -23.678184509277344, "step": 380 }, { "epoch": 0.42333333333333334, "grad_norm": 3.3242790699005127, "learning_rate": 4.887379547303295e-05, "logits/chosen": -0.014402282424271107, "logits/rejected": -0.003365292213857174, "logps/chosen": -101.69248962402344, "logps/rejected": -149.67724609375, "loss": 0.168, "rewards/accuracies": 1.0, "rewards/chosen": -5.958422660827637, "rewards/margins": 2.8969788551330566, "rewards/rejected": -8.855401992797852, "step": 381 }, { "epoch": 0.42444444444444446, "grad_norm": 0.008118590340018272, "learning_rate": 4.88646908061933e-05, "logits/chosen": 0.20393434166908264, "logits/rejected": 0.15407447516918182, "logps/chosen": -294.0162048339844, "logps/rejected": -561.8963623046875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -19.316383361816406, "rewards/margins": 20.29446792602539, "rewards/rejected": -39.6108512878418, "step": 382 }, { "epoch": 0.4255555555555556, "grad_norm": 0.014230523258447647, "learning_rate": 4.885555033954016e-05, "logits/chosen": 0.07227814197540283, "logits/rejected": 0.13287897408008575, "logps/chosen": -92.12969970703125, "logps/rejected": -205.81771850585938, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -4.597346305847168, "rewards/margins": 7.847845077514648, "rewards/rejected": -12.445192337036133, "step": 383 }, { "epoch": 0.4266666666666667, "grad_norm": 0.11041514575481415, "learning_rate": 4.884637408678527e-05, "logits/chosen": 0.08154091238975525, "logits/rejected": 0.10573387145996094, "logps/chosen": -205.27264404296875, "logps/rejected": -283.8736877441406, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -12.090282440185547, "rewards/margins": 6.919081211090088, "rewards/rejected": -19.00936508178711, "step": 384 }, { "epoch": 0.42777777777777776, "grad_norm": 0.009053057990968227, "learning_rate": 4.88371620616941e-05, "logits/chosen": 0.3820195198059082, "logits/rejected": 0.4248345196247101, "logps/chosen": -648.062255859375, "logps/rejected": -922.0550537109375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -49.3682861328125, "rewards/margins": 21.989843368530273, "rewards/rejected": -71.35812377929688, "step": 385 }, { "epoch": 0.4288888888888889, "grad_norm": 1.944989344337955e-05, "learning_rate": 4.8827914278085754e-05, "logits/chosen": 0.18455299735069275, "logits/rejected": 0.1495962142944336, "logps/chosen": -249.77810668945312, "logps/rejected": -601.8843383789062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -15.449896812438965, "rewards/margins": 24.148502349853516, "rewards/rejected": -39.5984001159668, "step": 386 }, { "epoch": 0.43, "grad_norm": 18.038658142089844, "learning_rate": 4.881863074983298e-05, "logits/chosen": 0.09859008342027664, "logits/rejected": 0.10421962291002274, "logps/chosen": -158.335693359375, "logps/rejected": -157.22503662109375, "loss": 3.0511, "rewards/accuracies": 0.5, "rewards/chosen": -10.47332763671875, "rewards/margins": -0.3899245262145996, "rewards/rejected": -10.083403587341309, "step": 387 }, { "epoch": 0.4311111111111111, "grad_norm": 0.06297452747821808, "learning_rate": 4.880931149086215e-05, "logits/chosen": 0.11731361597776413, "logits/rejected": 0.13902878761291504, "logps/chosen": -336.3645324707031, "logps/rejected": -432.7171325683594, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -24.462600708007812, "rewards/margins": 9.352191925048828, "rewards/rejected": -33.81479263305664, "step": 388 }, { "epoch": 0.43222222222222223, "grad_norm": 0.7396783232688904, "learning_rate": 4.879995651515324e-05, "logits/chosen": -0.025982936844229698, "logits/rejected": -0.009327489882707596, "logps/chosen": -168.3340606689453, "logps/rejected": -246.15769958496094, "loss": 0.0104, "rewards/accuracies": 1.0, "rewards/chosen": -9.314703941345215, "rewards/margins": 6.238446235656738, "rewards/rejected": -15.553150177001953, "step": 389 }, { "epoch": 0.43333333333333335, "grad_norm": 0.4287324845790863, "learning_rate": 4.87905658367398e-05, "logits/chosen": 0.28557083010673523, "logits/rejected": 0.28355085849761963, "logps/chosen": -171.7362518310547, "logps/rejected": -313.6852111816406, "loss": 0.0076, "rewards/accuracies": 1.0, "rewards/chosen": -10.350580215454102, "rewards/margins": 10.4736909866333, "rewards/rejected": -20.824270248413086, "step": 390 }, { "epoch": 0.43444444444444447, "grad_norm": 53.60374069213867, "learning_rate": 4.878113946970894e-05, "logits/chosen": 0.039289072155952454, "logits/rejected": 0.12498031556606293, "logps/chosen": -294.6166687011719, "logps/rejected": -344.651123046875, "loss": 3.8646, "rewards/accuracies": 0.5, "rewards/chosen": -19.57427215576172, "rewards/margins": 3.9062113761901855, "rewards/rejected": -23.480484008789062, "step": 391 }, { "epoch": 0.43555555555555553, "grad_norm": 1.0537596940994263, "learning_rate": 4.8771677428201314e-05, "logits/chosen": 0.34470927715301514, "logits/rejected": 0.3388291299343109, "logps/chosen": -229.87161254882812, "logps/rejected": -296.74871826171875, "loss": 0.0093, "rewards/accuracies": 1.0, "rewards/chosen": -14.04867935180664, "rewards/margins": 5.396803379058838, "rewards/rejected": -19.44548225402832, "step": 392 }, { "epoch": 0.43666666666666665, "grad_norm": 16.635164260864258, "learning_rate": 4.876217972641107e-05, "logits/chosen": 0.21306920051574707, "logits/rejected": 0.19827616214752197, "logps/chosen": -221.14141845703125, "logps/rejected": -304.69512939453125, "loss": 0.6227, "rewards/accuracies": 0.5, "rewards/chosen": -14.28038215637207, "rewards/margins": 6.491017818450928, "rewards/rejected": -20.771400451660156, "step": 393 }, { "epoch": 0.43777777777777777, "grad_norm": 0.6936870217323303, "learning_rate": 4.875264637858589e-05, "logits/chosen": -0.009100593626499176, "logits/rejected": 0.03308924660086632, "logps/chosen": -187.20689392089844, "logps/rejected": -264.35089111328125, "loss": 0.0209, "rewards/accuracies": 1.0, "rewards/chosen": -12.838022232055664, "rewards/margins": 5.331586837768555, "rewards/rejected": -18.16960906982422, "step": 394 }, { "epoch": 0.4388888888888889, "grad_norm": 9.974623935704585e-06, "learning_rate": 4.874307739902689e-05, "logits/chosen": 0.3178292512893677, "logits/rejected": 0.3085869550704956, "logps/chosen": -157.6679229736328, "logps/rejected": -391.67999267578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.250457763671875, "rewards/margins": 18.878368377685547, "rewards/rejected": -28.128826141357422, "step": 395 }, { "epoch": 0.44, "grad_norm": 15.931026458740234, "learning_rate": 4.8733472802088654e-05, "logits/chosen": 0.13155701756477356, "logits/rejected": 0.14508317410945892, "logps/chosen": -188.92849731445312, "logps/rejected": -245.96348571777344, "loss": 1.0279, "rewards/accuracies": 0.5, "rewards/chosen": -12.262447357177734, "rewards/margins": 4.4357523918151855, "rewards/rejected": -16.698200225830078, "step": 396 }, { "epoch": 0.4411111111111111, "grad_norm": 59.553916931152344, "learning_rate": 4.8723832602179185e-05, "logits/chosen": -0.10193920880556107, "logits/rejected": -0.10493535548448563, "logps/chosen": -287.15777587890625, "logps/rejected": -227.77694702148438, "loss": 5.1635, "rewards/accuracies": 0.0, "rewards/chosen": -20.619823455810547, "rewards/margins": -4.8169074058532715, "rewards/rejected": -15.802915573120117, "step": 397 }, { "epoch": 0.44222222222222224, "grad_norm": 39.462032318115234, "learning_rate": 4.871415681375992e-05, "logits/chosen": 0.04653315618634224, "logits/rejected": 0.041662465780973434, "logps/chosen": -242.3671875, "logps/rejected": -286.7337646484375, "loss": 1.5623, "rewards/accuracies": 0.5, "rewards/chosen": -17.847091674804688, "rewards/margins": 2.467491626739502, "rewards/rejected": -20.314584732055664, "step": 398 }, { "epoch": 0.44333333333333336, "grad_norm": 2.7691292762756348, "learning_rate": 4.870444545134568e-05, "logits/chosen": -0.05164015293121338, "logits/rejected": -0.046257615089416504, "logps/chosen": -109.12935638427734, "logps/rejected": -137.64218139648438, "loss": 0.1478, "rewards/accuracies": 1.0, "rewards/chosen": -5.811816692352295, "rewards/margins": 1.8422157764434814, "rewards/rejected": -7.6540327072143555, "step": 399 }, { "epoch": 0.4444444444444444, "grad_norm": 0.00017364005907438695, "learning_rate": 4.869469852950461e-05, "logits/chosen": 0.18573957681655884, "logits/rejected": 0.20637641847133636, "logps/chosen": -477.70892333984375, "logps/rejected": -759.930419921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -33.40575408935547, "rewards/margins": 19.469608306884766, "rewards/rejected": -52.875362396240234, "step": 400 }, { "epoch": 0.44555555555555554, "grad_norm": 12.935738563537598, "learning_rate": 4.868491606285823e-05, "logits/chosen": 0.010910626500844955, "logits/rejected": 0.023723404854536057, "logps/chosen": -174.8920440673828, "logps/rejected": -280.47906494140625, "loss": 0.3775, "rewards/accuracies": 0.5, "rewards/chosen": -11.74255084991455, "rewards/margins": 6.7033796310424805, "rewards/rejected": -18.44593048095703, "step": 401 }, { "epoch": 0.44666666666666666, "grad_norm": 0.4806179702281952, "learning_rate": 4.86750980660814e-05, "logits/chosen": 0.004861321300268173, "logits/rejected": 0.02646533027291298, "logps/chosen": -466.5215759277344, "logps/rejected": -511.2183837890625, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": -34.74518966674805, "rewards/margins": 5.406767845153809, "rewards/rejected": -40.151954650878906, "step": 402 }, { "epoch": 0.4477777777777778, "grad_norm": 0.0022816541604697704, "learning_rate": 4.8665244553902244e-05, "logits/chosen": -0.08564843982458115, "logits/rejected": -0.07090689241886139, "logps/chosen": -318.19390869140625, "logps/rejected": -483.53558349609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -22.669391632080078, "rewards/margins": 10.801262855529785, "rewards/rejected": -33.47065734863281, "step": 403 }, { "epoch": 0.4488888888888889, "grad_norm": 65.70623779296875, "learning_rate": 4.8655355541102176e-05, "logits/chosen": -0.06561517715454102, "logits/rejected": -0.08167295157909393, "logps/chosen": -360.77197265625, "logps/rejected": -353.25445556640625, "loss": 2.5352, "rewards/accuracies": 0.5, "rewards/chosen": -25.405710220336914, "rewards/margins": -0.7791054248809814, "rewards/rejected": -24.626605987548828, "step": 404 }, { "epoch": 0.45, "grad_norm": 3.9807755456422456e-06, "learning_rate": 4.864543104251587e-05, "logits/chosen": 0.14298143982887268, "logits/rejected": 0.08869757503271103, "logps/chosen": -175.10934448242188, "logps/rejected": -399.767822265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.493881225585938, "rewards/margins": 17.476722717285156, "rewards/rejected": -27.970603942871094, "step": 405 }, { "epoch": 0.45111111111111113, "grad_norm": 0.06475096940994263, "learning_rate": 4.863547107303123e-05, "logits/chosen": -0.13470853865146637, "logits/rejected": -0.1288093626499176, "logps/chosen": -354.1844482421875, "logps/rejected": -465.83343505859375, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -24.765336990356445, "rewards/margins": 8.563085556030273, "rewards/rejected": -33.32842254638672, "step": 406 }, { "epoch": 0.45222222222222225, "grad_norm": 3.9115021228790283, "learning_rate": 4.862547564758936e-05, "logits/chosen": -0.21490387618541718, "logits/rejected": -0.25029903650283813, "logps/chosen": -160.5019989013672, "logps/rejected": -286.246337890625, "loss": 0.0967, "rewards/accuracies": 1.0, "rewards/chosen": -10.331470489501953, "rewards/margins": 10.911758422851562, "rewards/rejected": -21.243228912353516, "step": 407 }, { "epoch": 0.4533333333333333, "grad_norm": 0.0035522817634046078, "learning_rate": 4.8615444781184575e-05, "logits/chosen": -0.1514441967010498, "logits/rejected": -0.1488938182592392, "logps/chosen": -315.8199768066406, "logps/rejected": -437.89996337890625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -20.883272171020508, "rewards/margins": 11.73844051361084, "rewards/rejected": -32.62171173095703, "step": 408 }, { "epoch": 0.45444444444444443, "grad_norm": 0.025408785790205002, "learning_rate": 4.860537848886433e-05, "logits/chosen": -0.12331990897655487, "logits/rejected": -0.14319530129432678, "logps/chosen": -332.17962646484375, "logps/rejected": -485.61956787109375, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -25.491146087646484, "rewards/margins": 11.958159446716309, "rewards/rejected": -37.44930648803711, "step": 409 }, { "epoch": 0.45555555555555555, "grad_norm": 2.793487071990967, "learning_rate": 4.8595276785729236e-05, "logits/chosen": -0.05237812548875809, "logits/rejected": -0.059520818293094635, "logps/chosen": -292.34027099609375, "logps/rejected": -379.33795166015625, "loss": 0.0264, "rewards/accuracies": 1.0, "rewards/chosen": -20.47428321838379, "rewards/margins": 6.040631294250488, "rewards/rejected": -26.514915466308594, "step": 410 }, { "epoch": 0.45666666666666667, "grad_norm": 0.21006570756435394, "learning_rate": 4.858513968693304e-05, "logits/chosen": 0.027305684983730316, "logits/rejected": 0.019406795501708984, "logps/chosen": -222.6563720703125, "logps/rejected": -279.2453918457031, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": -13.88858413696289, "rewards/margins": 5.870565891265869, "rewards/rejected": -19.7591495513916, "step": 411 }, { "epoch": 0.4577777777777778, "grad_norm": 45.2879524230957, "learning_rate": 4.857496720768254e-05, "logits/chosen": -0.053027600049972534, "logits/rejected": -0.2772376835346222, "logps/chosen": -200.8613739013672, "logps/rejected": -114.15376281738281, "loss": 10.0757, "rewards/accuracies": 0.5, "rewards/chosen": -13.400067329406738, "rewards/margins": -7.799375057220459, "rewards/rejected": -5.600692272186279, "step": 412 }, { "epoch": 0.4588888888888889, "grad_norm": 0.1180129200220108, "learning_rate": 4.8564759363237666e-05, "logits/chosen": -0.0773501843214035, "logits/rejected": -0.079042449593544, "logps/chosen": -268.2794189453125, "logps/rejected": -363.4179992675781, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -18.749197006225586, "rewards/margins": 6.1401543617248535, "rewards/rejected": -24.88935089111328, "step": 413 }, { "epoch": 0.46, "grad_norm": 0.6151158213615417, "learning_rate": 4.855451616891136e-05, "logits/chosen": 0.23278436064720154, "logits/rejected": 0.23731489479541779, "logps/chosen": -397.818115234375, "logps/rejected": -459.39544677734375, "loss": 0.0063, "rewards/accuracies": 1.0, "rewards/chosen": -28.732864379882812, "rewards/margins": 5.182681083679199, "rewards/rejected": -33.91554641723633, "step": 414 }, { "epoch": 0.46111111111111114, "grad_norm": 0.02156289480626583, "learning_rate": 4.854423764006961e-05, "logits/chosen": 0.07557745277881622, "logits/rejected": 0.0774305984377861, "logps/chosen": -312.1730041503906, "logps/rejected": -449.376708984375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -21.98764419555664, "rewards/margins": 9.13068962097168, "rewards/rejected": -31.11833381652832, "step": 415 }, { "epoch": 0.4622222222222222, "grad_norm": 5.298972246237099e-05, "learning_rate": 4.853392379213141e-05, "logits/chosen": -0.12956129014492035, "logits/rejected": -0.12366049736738205, "logps/chosen": -279.870849609375, "logps/rejected": -485.8118896484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -20.28246307373047, "rewards/margins": 16.406723022460938, "rewards/rejected": -36.689186096191406, "step": 416 }, { "epoch": 0.4633333333333333, "grad_norm": 2.3641183376312256, "learning_rate": 4.8523574640568713e-05, "logits/chosen": -0.020781714469194412, "logits/rejected": 0.05874735116958618, "logps/chosen": -368.23968505859375, "logps/rejected": -538.8060302734375, "loss": 0.0193, "rewards/accuracies": 1.0, "rewards/chosen": -23.493356704711914, "rewards/margins": 14.29702377319336, "rewards/rejected": -37.790382385253906, "step": 417 }, { "epoch": 0.46444444444444444, "grad_norm": 9.715100168250501e-05, "learning_rate": 4.851319020090647e-05, "logits/chosen": 0.1167077049612999, "logits/rejected": 0.12401759624481201, "logps/chosen": -228.02703857421875, "logps/rejected": -503.52508544921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.91866683959961, "rewards/margins": 19.17205238342285, "rewards/rejected": -31.09071922302246, "step": 418 }, { "epoch": 0.46555555555555556, "grad_norm": 1.2199531662648866e-10, "learning_rate": 4.8502770488722544e-05, "logits/chosen": 0.13689081370830536, "logits/rejected": 0.12034010887145996, "logps/chosen": -514.0646362304688, "logps/rejected": -991.0902099609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -37.451622009277344, "rewards/margins": 29.140756607055664, "rewards/rejected": -66.59237670898438, "step": 419 }, { "epoch": 0.4666666666666667, "grad_norm": 2.980344772338867, "learning_rate": 4.849231551964771e-05, "logits/chosen": -0.004592999815940857, "logits/rejected": 0.01929013431072235, "logps/chosen": -230.91595458984375, "logps/rejected": -323.23175048828125, "loss": 0.1587, "rewards/accuracies": 1.0, "rewards/chosen": -14.332998275756836, "rewards/margins": 7.1430182456970215, "rewards/rejected": -21.476016998291016, "step": 420 }, { "epoch": 0.4677777777777778, "grad_norm": 5.044056415557861, "learning_rate": 4.848182530936565e-05, "logits/chosen": -0.03712940216064453, "logits/rejected": -0.023200765252113342, "logps/chosen": -183.35809326171875, "logps/rejected": -205.5402069091797, "loss": 0.707, "rewards/accuracies": 0.5, "rewards/chosen": -11.485251426696777, "rewards/margins": 2.3820815086364746, "rewards/rejected": -13.867332458496094, "step": 421 }, { "epoch": 0.4688888888888889, "grad_norm": 20.121213912963867, "learning_rate": 4.8471299873612884e-05, "logits/chosen": 0.3397771120071411, "logits/rejected": 0.33615821599960327, "logps/chosen": -361.53326416015625, "logps/rejected": -379.66082763671875, "loss": 0.9121, "rewards/accuracies": 0.5, "rewards/chosen": -22.46927261352539, "rewards/margins": 1.4614839553833008, "rewards/rejected": -23.930757522583008, "step": 422 }, { "epoch": 0.47, "grad_norm": 2.560032367706299, "learning_rate": 4.8460739228178806e-05, "logits/chosen": 0.30692529678344727, "logits/rejected": 0.2986672818660736, "logps/chosen": -249.67433166503906, "logps/rejected": -382.1378479003906, "loss": 0.0434, "rewards/accuracies": 1.0, "rewards/chosen": -16.105005264282227, "rewards/margins": 9.650163650512695, "rewards/rejected": -25.755168914794922, "step": 423 }, { "epoch": 0.4711111111111111, "grad_norm": 48.33751678466797, "learning_rate": 4.84501433889056e-05, "logits/chosen": 0.002255454659461975, "logits/rejected": -0.016937144100666046, "logps/chosen": -303.7843933105469, "logps/rejected": -291.1084899902344, "loss": 4.1736, "rewards/accuracies": 0.5, "rewards/chosen": -19.97822380065918, "rewards/margins": 1.398221492767334, "rewards/rejected": -21.376445770263672, "step": 424 }, { "epoch": 0.4722222222222222, "grad_norm": 1.459672212600708, "learning_rate": 4.8439512371688264e-05, "logits/chosen": 0.15104568004608154, "logits/rejected": 0.16409087181091309, "logps/chosen": -219.68191528320312, "logps/rejected": -418.9354248046875, "loss": 0.0299, "rewards/accuracies": 1.0, "rewards/chosen": -14.006685256958008, "rewards/margins": 14.319334983825684, "rewards/rejected": -28.326019287109375, "step": 425 }, { "epoch": 0.47333333333333333, "grad_norm": 8.25017261505127, "learning_rate": 4.8428846192474575e-05, "logits/chosen": -0.19650860130786896, "logits/rejected": 0.06996503472328186, "logps/chosen": -141.77862548828125, "logps/rejected": -218.00027465820312, "loss": 0.1441, "rewards/accuracies": 1.0, "rewards/chosen": -9.039179801940918, "rewards/margins": 2.6805765628814697, "rewards/rejected": -11.719757080078125, "step": 426 }, { "epoch": 0.47444444444444445, "grad_norm": 50.90774154663086, "learning_rate": 4.841814486726502e-05, "logits/chosen": 0.10883571952581406, "logits/rejected": 0.11300275474786758, "logps/chosen": -232.14697265625, "logps/rejected": -262.89288330078125, "loss": 0.8669, "rewards/accuracies": 0.5, "rewards/chosen": -16.032995223999023, "rewards/margins": 2.2797117233276367, "rewards/rejected": -18.312705993652344, "step": 427 }, { "epoch": 0.47555555555555556, "grad_norm": 2.6572067737579346, "learning_rate": 4.8407408412112844e-05, "logits/chosen": 0.16936737298965454, "logits/rejected": 0.1875167042016983, "logps/chosen": -157.3213653564453, "logps/rejected": -183.23460388183594, "loss": 0.2183, "rewards/accuracies": 1.0, "rewards/chosen": -9.00556755065918, "rewards/margins": 2.510236978530884, "rewards/rejected": -11.515804290771484, "step": 428 }, { "epoch": 0.4766666666666667, "grad_norm": 4.397669315338135, "learning_rate": 4.839663684312398e-05, "logits/chosen": -0.13877922296524048, "logits/rejected": -0.05140033736824989, "logps/chosen": -257.3121643066406, "logps/rejected": -305.19537353515625, "loss": 0.0862, "rewards/accuracies": 1.0, "rewards/chosen": -16.091596603393555, "rewards/margins": 3.7092790603637695, "rewards/rejected": -19.800874710083008, "step": 429 }, { "epoch": 0.4777777777777778, "grad_norm": 4.512719631195068, "learning_rate": 4.838583017645702e-05, "logits/chosen": 0.1380952000617981, "logits/rejected": 0.14048446714878082, "logps/chosen": -163.04171752929688, "logps/rejected": -218.95758056640625, "loss": 0.0897, "rewards/accuracies": 1.0, "rewards/chosen": -9.464393615722656, "rewards/margins": 4.505451202392578, "rewards/rejected": -13.969844818115234, "step": 430 }, { "epoch": 0.47888888888888886, "grad_norm": 0.6118378043174744, "learning_rate": 4.837498842832324e-05, "logits/chosen": 0.09739330410957336, "logits/rejected": 0.09181972593069077, "logps/chosen": -223.00099182128906, "logps/rejected": -255.48638916015625, "loss": 0.3549, "rewards/accuracies": 0.5, "rewards/chosen": -14.386837005615234, "rewards/margins": 2.0446510314941406, "rewards/rejected": -16.431488037109375, "step": 431 }, { "epoch": 0.48, "grad_norm": 3.3557770252227783, "learning_rate": 4.8364111614986527e-05, "logits/chosen": 0.1725594699382782, "logits/rejected": 0.20582205057144165, "logps/chosen": -84.38212585449219, "logps/rejected": -125.71180725097656, "loss": 0.4022, "rewards/accuracies": 0.5, "rewards/chosen": -3.269456386566162, "rewards/margins": 2.693902015686035, "rewards/rejected": -5.963358402252197, "step": 432 }, { "epoch": 0.4811111111111111, "grad_norm": 0.0068191103637218475, "learning_rate": 4.8353199752763344e-05, "logits/chosen": 0.1384795606136322, "logits/rejected": 0.12829212844371796, "logps/chosen": -221.6590576171875, "logps/rejected": -457.39776611328125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -14.579670906066895, "rewards/margins": 16.192991256713867, "rewards/rejected": -30.772663116455078, "step": 433 }, { "epoch": 0.4822222222222222, "grad_norm": 4.493951320648193, "learning_rate": 4.8342252858022775e-05, "logits/chosen": 0.08360593020915985, "logits/rejected": 0.11488797515630722, "logps/chosen": -237.3058624267578, "logps/rejected": -345.3595886230469, "loss": 0.1324, "rewards/accuracies": 1.0, "rewards/chosen": -14.124784469604492, "rewards/margins": 7.930981636047363, "rewards/rejected": -22.05576515197754, "step": 434 }, { "epoch": 0.48333333333333334, "grad_norm": 0.10311396420001984, "learning_rate": 4.833127094718643e-05, "logits/chosen": 0.29798829555511475, "logits/rejected": 0.29421278834342957, "logps/chosen": -114.53166198730469, "logps/rejected": -200.90908813476562, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -5.6359148025512695, "rewards/margins": 6.137424945831299, "rewards/rejected": -11.773340225219727, "step": 435 }, { "epoch": 0.48444444444444446, "grad_norm": 0.5338541269302368, "learning_rate": 4.832025403672845e-05, "logits/chosen": 0.1487748920917511, "logits/rejected": 0.17128746211528778, "logps/chosen": -173.12847900390625, "logps/rejected": -300.1663818359375, "loss": 0.0091, "rewards/accuracies": 1.0, "rewards/chosen": -10.681846618652344, "rewards/margins": 8.120452880859375, "rewards/rejected": -18.80229949951172, "step": 436 }, { "epoch": 0.4855555555555556, "grad_norm": 9.376507759094238, "learning_rate": 4.8309202143175484e-05, "logits/chosen": 0.05262155085802078, "logits/rejected": 0.049904193729162216, "logps/chosen": -246.3385009765625, "logps/rejected": -265.572021484375, "loss": 0.238, "rewards/accuracies": 1.0, "rewards/chosen": -17.034725189208984, "rewards/margins": 2.084589958190918, "rewards/rejected": -19.119314193725586, "step": 437 }, { "epoch": 0.4866666666666667, "grad_norm": 2.888063669204712, "learning_rate": 4.829811528310666e-05, "logits/chosen": 0.07179306447505951, "logits/rejected": 0.07473259419202805, "logps/chosen": -217.01744079589844, "logps/rejected": -312.5491943359375, "loss": 0.0372, "rewards/accuracies": 1.0, "rewards/chosen": -14.696636199951172, "rewards/margins": 7.419041633605957, "rewards/rejected": -22.115676879882812, "step": 438 }, { "epoch": 0.48777777777777775, "grad_norm": 0.0019313014345243573, "learning_rate": 4.828699347315356e-05, "logits/chosen": -0.2874913215637207, "logits/rejected": -0.29578936100006104, "logps/chosen": -313.46978759765625, "logps/rejected": -477.9320068359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -20.62856674194336, "rewards/margins": 13.564882278442383, "rewards/rejected": -34.193450927734375, "step": 439 }, { "epoch": 0.4888888888888889, "grad_norm": 0.4755980968475342, "learning_rate": 4.827583673000019e-05, "logits/chosen": -0.1000068262219429, "logits/rejected": -0.09699825942516327, "logps/chosen": -209.8822021484375, "logps/rejected": -289.81744384765625, "loss": 0.0106, "rewards/accuracies": 1.0, "rewards/chosen": -15.738228797912598, "rewards/margins": 5.875933647155762, "rewards/rejected": -21.61416244506836, "step": 440 }, { "epoch": 0.49, "grad_norm": 38.136451721191406, "learning_rate": 4.8264645070382964e-05, "logits/chosen": 0.011126243509352207, "logits/rejected": -0.14430394768714905, "logps/chosen": -185.91989135742188, "logps/rejected": -149.25064086914062, "loss": 5.0159, "rewards/accuracies": 0.5, "rewards/chosen": -11.655533790588379, "rewards/margins": -3.0271503925323486, "rewards/rejected": -8.62838363647461, "step": 441 }, { "epoch": 0.4911111111111111, "grad_norm": 0.18217438459396362, "learning_rate": 4.825341851109068e-05, "logits/chosen": -0.3232160210609436, "logits/rejected": -0.3529653549194336, "logps/chosen": -290.2325439453125, "logps/rejected": -365.92779541015625, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -21.20379638671875, "rewards/margins": 6.670583724975586, "rewards/rejected": -27.874380111694336, "step": 442 }, { "epoch": 0.4922222222222222, "grad_norm": 0.19152595102787018, "learning_rate": 4.8242157068964466e-05, "logits/chosen": -0.23653644323349, "logits/rejected": -0.28795984387397766, "logps/chosen": -192.47601318359375, "logps/rejected": -332.6755676269531, "loss": 0.0059, "rewards/accuracies": 1.0, "rewards/chosen": -12.604879379272461, "rewards/margins": 11.20513916015625, "rewards/rejected": -23.81001853942871, "step": 443 }, { "epoch": 0.49333333333333335, "grad_norm": 6.270524024963379, "learning_rate": 4.823086076089781e-05, "logits/chosen": -0.09904371201992035, "logits/rejected": -0.08655060827732086, "logps/chosen": -295.4136657714844, "logps/rejected": -341.5915832519531, "loss": 0.0838, "rewards/accuracies": 1.0, "rewards/chosen": -21.487876892089844, "rewards/margins": 4.807831764221191, "rewards/rejected": -26.29570770263672, "step": 444 }, { "epoch": 0.49444444444444446, "grad_norm": 0.622852087020874, "learning_rate": 4.821952960383649e-05, "logits/chosen": -0.021587349474430084, "logits/rejected": -0.01942632347345352, "logps/chosen": -229.5586395263672, "logps/rejected": -421.4346618652344, "loss": 0.0215, "rewards/accuracies": 1.0, "rewards/chosen": -15.254935264587402, "rewards/margins": 15.574117660522461, "rewards/rejected": -30.82905387878418, "step": 445 }, { "epoch": 0.4955555555555556, "grad_norm": 0.08531899750232697, "learning_rate": 4.8208163614778555e-05, "logits/chosen": -0.24234555661678314, "logits/rejected": -0.22787898778915405, "logps/chosen": -127.23355865478516, "logps/rejected": -217.30825805664062, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -7.864404678344727, "rewards/margins": 6.292803764343262, "rewards/rejected": -14.157209396362305, "step": 446 }, { "epoch": 0.49666666666666665, "grad_norm": 2.4710540771484375, "learning_rate": 4.819676281077431e-05, "logits/chosen": 0.017475295811891556, "logits/rejected": 0.018641797825694084, "logps/chosen": -299.2492370605469, "logps/rejected": -388.14697265625, "loss": 0.0494, "rewards/accuracies": 1.0, "rewards/chosen": -21.0810546875, "rewards/margins": 8.201211929321289, "rewards/rejected": -29.282264709472656, "step": 447 }, { "epoch": 0.49777777777777776, "grad_norm": 0.0043671028688549995, "learning_rate": 4.8185327208926295e-05, "logits/chosen": -0.20318791270256042, "logits/rejected": -0.19862860441207886, "logps/chosen": -293.1810302734375, "logps/rejected": -548.4443359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -20.674407958984375, "rewards/margins": 21.69176483154297, "rewards/rejected": -42.366172790527344, "step": 448 }, { "epoch": 0.4988888888888889, "grad_norm": 0.002952954964712262, "learning_rate": 4.817385682638924e-05, "logits/chosen": -0.2320467233657837, "logits/rejected": -0.23921024799346924, "logps/chosen": -191.1558074951172, "logps/rejected": -363.24420166015625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -13.464888572692871, "rewards/margins": 13.271187782287598, "rewards/rejected": -26.73607635498047, "step": 449 }, { "epoch": 0.5, "grad_norm": 5.8081955909729, "learning_rate": 4.8162351680370044e-05, "logits/chosen": -0.041524894535541534, "logits/rejected": -0.0327264778316021, "logps/chosen": -232.45347595214844, "logps/rejected": -276.39385986328125, "loss": 0.1823, "rewards/accuracies": 1.0, "rewards/chosen": -16.000123977661133, "rewards/margins": 4.084897994995117, "rewards/rejected": -20.08502197265625, "step": 450 }, { "epoch": 0.5011111111111111, "grad_norm": 2.0412022422533482e-05, "learning_rate": 4.815081178812778e-05, "logits/chosen": -0.06208375468850136, "logits/rejected": -0.05391335487365723, "logps/chosen": -345.88433837890625, "logps/rejected": -622.7772216796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -25.378135681152344, "rewards/margins": 23.347736358642578, "rewards/rejected": -48.72587203979492, "step": 451 }, { "epoch": 0.5022222222222222, "grad_norm": 0.11310431361198425, "learning_rate": 4.813923716697363e-05, "logits/chosen": -0.29350125789642334, "logits/rejected": -0.29849404096603394, "logps/chosen": -147.73834228515625, "logps/rejected": -469.0579833984375, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": -9.430177688598633, "rewards/margins": 24.272966384887695, "rewards/rejected": -33.70314407348633, "step": 452 }, { "epoch": 0.5033333333333333, "grad_norm": 89.5409927368164, "learning_rate": 4.812762783427085e-05, "logits/chosen": -0.4319671094417572, "logits/rejected": -0.4901720881462097, "logps/chosen": -446.32208251953125, "logps/rejected": -483.66815185546875, "loss": 5.7067, "rewards/accuracies": 0.5, "rewards/chosen": -33.586063385009766, "rewards/margins": 2.2307024002075195, "rewards/rejected": -35.81676483154297, "step": 453 }, { "epoch": 0.5044444444444445, "grad_norm": 1.3697983026504517, "learning_rate": 4.811598380743482e-05, "logits/chosen": -0.3675556480884552, "logits/rejected": -0.3732084035873413, "logps/chosen": -209.56155395507812, "logps/rejected": -314.40606689453125, "loss": 0.0259, "rewards/accuracies": 1.0, "rewards/chosen": -15.478317260742188, "rewards/margins": 8.733879089355469, "rewards/rejected": -24.212196350097656, "step": 454 }, { "epoch": 0.5055555555555555, "grad_norm": 1.3602561950683594, "learning_rate": 4.8104305103932914e-05, "logits/chosen": -0.22308766841888428, "logits/rejected": -0.23648443818092346, "logps/chosen": -361.5826110839844, "logps/rejected": -530.9982299804688, "loss": 0.0157, "rewards/accuracies": 1.0, "rewards/chosen": -27.379947662353516, "rewards/margins": 14.248883247375488, "rewards/rejected": -41.62882995605469, "step": 455 }, { "epoch": 0.5066666666666667, "grad_norm": 9.517644882202148, "learning_rate": 4.8092591741284546e-05, "logits/chosen": -0.33119887113571167, "logits/rejected": -0.311024010181427, "logps/chosen": -205.9384002685547, "logps/rejected": -266.48687744140625, "loss": 0.1042, "rewards/accuracies": 1.0, "rewards/chosen": -13.095968246459961, "rewards/margins": 4.820119380950928, "rewards/rejected": -17.916088104248047, "step": 456 }, { "epoch": 0.5077777777777778, "grad_norm": 164.91822814941406, "learning_rate": 4.808084373706114e-05, "logits/chosen": -0.34913885593414307, "logits/rejected": -0.3263116180896759, "logps/chosen": -592.8624267578125, "logps/rejected": -587.6072998046875, "loss": 3.0295, "rewards/accuracies": 0.5, "rewards/chosen": -47.37583923339844, "rewards/margins": 0.3719921112060547, "rewards/rejected": -47.747833251953125, "step": 457 }, { "epoch": 0.5088888888888888, "grad_norm": 0.014838256873190403, "learning_rate": 4.806906110888606e-05, "logits/chosen": -0.41966986656188965, "logits/rejected": -0.4476931393146515, "logps/chosen": -701.5173950195312, "logps/rejected": -834.6512451171875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -55.88774108886719, "rewards/margins": 12.001840591430664, "rewards/rejected": -67.88957977294922, "step": 458 }, { "epoch": 0.51, "grad_norm": 0.08798251301050186, "learning_rate": 4.805724387443462e-05, "logits/chosen": -0.2760388255119324, "logits/rejected": -0.2792109251022339, "logps/chosen": -413.7474365234375, "logps/rejected": -579.0394287109375, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -32.05229568481445, "rewards/margins": 13.238158226013184, "rewards/rejected": -45.29045486450195, "step": 459 }, { "epoch": 0.5111111111111111, "grad_norm": 24.459970474243164, "learning_rate": 4.804539205143405e-05, "logits/chosen": -0.4337116777896881, "logits/rejected": -0.44546574354171753, "logps/chosen": -257.7248840332031, "logps/rejected": -344.0628662109375, "loss": 0.5282, "rewards/accuracies": 0.5, "rewards/chosen": -19.174808502197266, "rewards/margins": 7.779985427856445, "rewards/rejected": -26.95479393005371, "step": 460 }, { "epoch": 0.5122222222222222, "grad_norm": 0.009213360957801342, "learning_rate": 4.8033505657663445e-05, "logits/chosen": -0.5451526641845703, "logits/rejected": -0.5481545925140381, "logps/chosen": -273.56243896484375, "logps/rejected": -408.6334228515625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -20.356666564941406, "rewards/margins": 11.093234062194824, "rewards/rejected": -31.449899673461914, "step": 461 }, { "epoch": 0.5133333333333333, "grad_norm": 0.05220331624150276, "learning_rate": 4.8021584710953815e-05, "logits/chosen": -0.4384024739265442, "logits/rejected": -0.4895780682563782, "logps/chosen": -236.36199951171875, "logps/rejected": -320.59698486328125, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -17.732742309570312, "rewards/margins": 7.04229736328125, "rewards/rejected": -24.775039672851562, "step": 462 }, { "epoch": 0.5144444444444445, "grad_norm": 0.0030830141622573137, "learning_rate": 4.800962922918793e-05, "logits/chosen": -0.6021794080734253, "logits/rejected": -0.6007270812988281, "logps/chosen": -474.9808349609375, "logps/rejected": -632.83544921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -36.05101013183594, "rewards/margins": 13.282150268554688, "rewards/rejected": -49.333160400390625, "step": 463 }, { "epoch": 0.5155555555555555, "grad_norm": 7.416355401801411e-06, "learning_rate": 4.799763923030043e-05, "logits/chosen": -0.6369425058364868, "logits/rejected": -0.6433393955230713, "logps/chosen": -532.528076171875, "logps/rejected": -1135.6632080078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -43.61982727050781, "rewards/margins": 46.32053756713867, "rewards/rejected": -89.94036865234375, "step": 464 }, { "epoch": 0.5166666666666667, "grad_norm": 4.684656143188477, "learning_rate": 4.79856147322777e-05, "logits/chosen": -0.6085008382797241, "logits/rejected": -0.6025664210319519, "logps/chosen": -508.16754150390625, "logps/rejected": -613.1409301757812, "loss": 0.0276, "rewards/accuracies": 1.0, "rewards/chosen": -41.86006164550781, "rewards/margins": 8.416984558105469, "rewards/rejected": -50.27704620361328, "step": 465 }, { "epoch": 0.5177777777777778, "grad_norm": 72.03083038330078, "learning_rate": 4.797355575315788e-05, "logits/chosen": -0.8408241271972656, "logits/rejected": -0.8139203786849976, "logps/chosen": -214.582275390625, "logps/rejected": -338.6270751953125, "loss": 2.2636, "rewards/accuracies": 0.5, "rewards/chosen": -14.642263412475586, "rewards/margins": 9.78508186340332, "rewards/rejected": -24.427345275878906, "step": 466 }, { "epoch": 0.5188888888888888, "grad_norm": 0.0033472280483692884, "learning_rate": 4.7961462311030845e-05, "logits/chosen": -0.6486800909042358, "logits/rejected": -0.6635938286781311, "logps/chosen": -684.967529296875, "logps/rejected": -818.019775390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -56.44300842285156, "rewards/margins": 10.215351104736328, "rewards/rejected": -66.65836334228516, "step": 467 }, { "epoch": 0.52, "grad_norm": 16.872087478637695, "learning_rate": 4.7949334424038176e-05, "logits/chosen": -0.6695341467857361, "logits/rejected": -0.6674842834472656, "logps/chosen": -484.53656005859375, "logps/rejected": -509.10919189453125, "loss": 0.1538, "rewards/accuracies": 1.0, "rewards/chosen": -40.615074157714844, "rewards/margins": 2.361405372619629, "rewards/rejected": -42.97648239135742, "step": 468 }, { "epoch": 0.5211111111111111, "grad_norm": 0.15718169510364532, "learning_rate": 4.7937172110373105e-05, "logits/chosen": -0.6171848177909851, "logits/rejected": -0.624437689781189, "logps/chosen": -505.4447021484375, "logps/rejected": -600.0762939453125, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -39.910640716552734, "rewards/margins": 8.109513282775879, "rewards/rejected": -48.0201530456543, "step": 469 }, { "epoch": 0.5222222222222223, "grad_norm": 8.851956367492676, "learning_rate": 4.7924975388280524e-05, "logits/chosen": -0.7710382342338562, "logits/rejected": -0.7662855982780457, "logps/chosen": -350.7492980957031, "logps/rejected": -385.4314880371094, "loss": 0.3166, "rewards/accuracies": 1.0, "rewards/chosen": -27.957748413085938, "rewards/margins": 3.5259392261505127, "rewards/rejected": -31.483686447143555, "step": 470 }, { "epoch": 0.5233333333333333, "grad_norm": 0.0010904478840529919, "learning_rate": 4.791274427605693e-05, "logits/chosen": -0.7361353635787964, "logits/rejected": -0.7277028560638428, "logps/chosen": -453.43402099609375, "logps/rejected": -631.2952880859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -35.64508819580078, "rewards/margins": 13.957695960998535, "rewards/rejected": -49.602783203125, "step": 471 }, { "epoch": 0.5244444444444445, "grad_norm": 61.94932174682617, "learning_rate": 4.790047879205041e-05, "logits/chosen": -0.7067735195159912, "logits/rejected": -0.7180030345916748, "logps/chosen": -377.69281005859375, "logps/rejected": -483.403564453125, "loss": 1.1146, "rewards/accuracies": 0.5, "rewards/chosen": -30.434160232543945, "rewards/margins": 5.635149002075195, "rewards/rejected": -36.06930923461914, "step": 472 }, { "epoch": 0.5255555555555556, "grad_norm": 80.57006072998047, "learning_rate": 4.788817895466063e-05, "logits/chosen": -0.549356997013092, "logits/rejected": -0.552036702632904, "logps/chosen": -445.32550048828125, "logps/rejected": -533.1294555664062, "loss": 1.6144, "rewards/accuracies": 0.5, "rewards/chosen": -36.4919548034668, "rewards/margins": 6.914266586303711, "rewards/rejected": -43.406219482421875, "step": 473 }, { "epoch": 0.5266666666666666, "grad_norm": 2.26885177312397e-08, "learning_rate": 4.787584478233877e-05, "logits/chosen": -0.8784161806106567, "logits/rejected": -0.8472887277603149, "logps/chosen": -722.2238159179688, "logps/rejected": -1316.8197021484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -64.10980987548828, "rewards/margins": 39.60997009277344, "rewards/rejected": -103.71977233886719, "step": 474 }, { "epoch": 0.5277777777777778, "grad_norm": 0.00310464296489954, "learning_rate": 4.786347629358753e-05, "logits/chosen": -0.7199623584747314, "logits/rejected": -0.8295122385025024, "logps/chosen": -515.1920776367188, "logps/rejected": -878.6954956054688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -42.28263854980469, "rewards/margins": 26.288450241088867, "rewards/rejected": -68.57109069824219, "step": 475 }, { "epoch": 0.5288888888888889, "grad_norm": 0.04385121166706085, "learning_rate": 4.785107350696107e-05, "logits/chosen": -0.4773239493370056, "logits/rejected": -0.5202263593673706, "logps/chosen": -607.9476928710938, "logps/rejected": -768.38720703125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -52.224700927734375, "rewards/margins": 13.294368743896484, "rewards/rejected": -65.51907348632812, "step": 476 }, { "epoch": 0.53, "grad_norm": 0.21070116758346558, "learning_rate": 4.783863644106502e-05, "logits/chosen": -0.9364955425262451, "logits/rejected": -0.9624282717704773, "logps/chosen": -331.7967529296875, "logps/rejected": -425.01837158203125, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": -24.266183853149414, "rewards/margins": 8.698238372802734, "rewards/rejected": -32.96442413330078, "step": 477 }, { "epoch": 0.5311111111111111, "grad_norm": 0.3606346547603607, "learning_rate": 4.782616511455644e-05, "logits/chosen": -0.8932220935821533, "logits/rejected": -0.9635410904884338, "logps/chosen": -252.3544921875, "logps/rejected": -377.5279541015625, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": -21.193330764770508, "rewards/margins": 6.263426780700684, "rewards/rejected": -27.456756591796875, "step": 478 }, { "epoch": 0.5322222222222223, "grad_norm": 0.02579995058476925, "learning_rate": 4.7813659546143745e-05, "logits/chosen": -0.8404313921928406, "logits/rejected": -0.8676011562347412, "logps/chosen": -350.5596618652344, "logps/rejected": -505.9067687988281, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -28.113876342773438, "rewards/margins": 12.12387466430664, "rewards/rejected": -40.23775100708008, "step": 479 }, { "epoch": 0.5333333333333333, "grad_norm": 9.345119906356558e-06, "learning_rate": 4.7801119754586766e-05, "logits/chosen": -0.9277992248535156, "logits/rejected": -0.9983352422714233, "logps/chosen": -409.9505310058594, "logps/rejected": -767.5635986328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -32.98674774169922, "rewards/margins": 26.063779830932617, "rewards/rejected": -59.05052947998047, "step": 480 }, { "epoch": 0.5344444444444445, "grad_norm": 0.005540132522583008, "learning_rate": 4.7788545758696635e-05, "logits/chosen": -0.8552422523498535, "logits/rejected": -0.8498562574386597, "logps/chosen": -432.430908203125, "logps/rejected": -581.492919921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -38.31005096435547, "rewards/margins": 12.255088806152344, "rewards/rejected": -50.56513977050781, "step": 481 }, { "epoch": 0.5355555555555556, "grad_norm": 0.00018564298807177693, "learning_rate": 4.777593757733582e-05, "logits/chosen": -0.8410835266113281, "logits/rejected": -0.8234373331069946, "logps/chosen": -372.0001220703125, "logps/rejected": -569.853271484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -31.136371612548828, "rewards/margins": 17.74886703491211, "rewards/rejected": -48.88523864746094, "step": 482 }, { "epoch": 0.5366666666666666, "grad_norm": 0.11211370676755905, "learning_rate": 4.776329522941805e-05, "logits/chosen": -0.8005378246307373, "logits/rejected": -0.790703296661377, "logps/chosen": -309.38397216796875, "logps/rejected": -385.3167724609375, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -26.13857078552246, "rewards/margins": 6.465510368347168, "rewards/rejected": -32.60408020019531, "step": 483 }, { "epoch": 0.5377777777777778, "grad_norm": 0.21446536481380463, "learning_rate": 4.775061873390832e-05, "logits/chosen": -0.9643493890762329, "logits/rejected": -0.9750192165374756, "logps/chosen": -390.84747314453125, "logps/rejected": -503.0348815917969, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": -32.224365234375, "rewards/margins": 9.532646179199219, "rewards/rejected": -41.75701141357422, "step": 484 }, { "epoch": 0.5388888888888889, "grad_norm": 14.478490829467773, "learning_rate": 4.7737908109822854e-05, "logits/chosen": -0.8245651721954346, "logits/rejected": -0.8259553909301758, "logps/chosen": -387.017822265625, "logps/rejected": -509.9576416015625, "loss": 0.6314, "rewards/accuracies": 0.5, "rewards/chosen": -33.71280288696289, "rewards/margins": 9.456096649169922, "rewards/rejected": -43.16889953613281, "step": 485 }, { "epoch": 0.54, "grad_norm": 16.347183227539062, "learning_rate": 4.7725163376229064e-05, "logits/chosen": -0.9077595472335815, "logits/rejected": -0.9373822808265686, "logps/chosen": -273.1741943359375, "logps/rejected": -375.8546447753906, "loss": 0.1092, "rewards/accuracies": 1.0, "rewards/chosen": -22.009628295898438, "rewards/margins": 7.54038143157959, "rewards/rejected": -29.550010681152344, "step": 486 }, { "epoch": 0.5411111111111111, "grad_norm": 58.27195739746094, "learning_rate": 4.7712384552245536e-05, "logits/chosen": -0.6024246215820312, "logits/rejected": -0.6541932225227356, "logps/chosen": -424.54681396484375, "logps/rejected": -566.4591064453125, "loss": 1.1181, "rewards/accuracies": 0.5, "rewards/chosen": -35.26250076293945, "rewards/margins": 11.252135276794434, "rewards/rejected": -46.5146369934082, "step": 487 }, { "epoch": 0.5422222222222223, "grad_norm": 4.247241973876953, "learning_rate": 4.7699571657041994e-05, "logits/chosen": -0.4077141284942627, "logits/rejected": -0.3988763689994812, "logps/chosen": -545.916748046875, "logps/rejected": -650.0489501953125, "loss": 0.0252, "rewards/accuracies": 1.0, "rewards/chosen": -43.249549865722656, "rewards/margins": 10.130289077758789, "rewards/rejected": -53.37983703613281, "step": 488 }, { "epoch": 0.5433333333333333, "grad_norm": 0.08501764386892319, "learning_rate": 4.768672470983926e-05, "logits/chosen": -0.8657889366149902, "logits/rejected": -0.8877992630004883, "logps/chosen": -261.8565673828125, "logps/rejected": -433.7759704589844, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -21.36813735961914, "rewards/margins": 11.635833740234375, "rewards/rejected": -33.003971099853516, "step": 489 }, { "epoch": 0.5444444444444444, "grad_norm": 0.0038666860200464725, "learning_rate": 4.767384372990927e-05, "logits/chosen": -0.7178463935852051, "logits/rejected": -0.7504041194915771, "logps/chosen": -504.4922790527344, "logps/rejected": -684.2086181640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -42.286827087402344, "rewards/margins": 15.471380233764648, "rewards/rejected": -57.75820541381836, "step": 490 }, { "epoch": 0.5455555555555556, "grad_norm": 75.63831329345703, "learning_rate": 4.766092873657497e-05, "logits/chosen": -0.813745379447937, "logits/rejected": -0.8228471279144287, "logps/chosen": -349.15191650390625, "logps/rejected": -347.80157470703125, "loss": 2.1088, "rewards/accuracies": 0.0, "rewards/chosen": -29.84192657470703, "rewards/margins": -1.7472448348999023, "rewards/rejected": -28.094682693481445, "step": 491 }, { "epoch": 0.5466666666666666, "grad_norm": 0.0463394895195961, "learning_rate": 4.7647979749210384e-05, "logits/chosen": -0.7556103467941284, "logits/rejected": -0.743425726890564, "logps/chosen": -361.2544250488281, "logps/rejected": -453.5407409667969, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -27.25750732421875, "rewards/margins": 8.121718406677246, "rewards/rejected": -35.37922668457031, "step": 492 }, { "epoch": 0.5477777777777778, "grad_norm": 1.819043517112732, "learning_rate": 4.763499678724047e-05, "logits/chosen": -0.7614933252334595, "logits/rejected": -0.7622076272964478, "logps/chosen": -418.6034851074219, "logps/rejected": -693.71435546875, "loss": 0.0135, "rewards/accuracies": 1.0, "rewards/chosen": -33.31962585449219, "rewards/margins": 21.324050903320312, "rewards/rejected": -54.6436767578125, "step": 493 }, { "epoch": 0.5488888888888889, "grad_norm": 1.8091321843383135e-11, "learning_rate": 4.762197987014119e-05, "logits/chosen": -0.6763979196548462, "logits/rejected": -0.6560391783714294, "logps/chosen": -558.0198974609375, "logps/rejected": -948.45361328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -46.51225280761719, "rewards/margins": 32.24382019042969, "rewards/rejected": -78.75607299804688, "step": 494 }, { "epoch": 0.55, "grad_norm": 5.584978225670056e-06, "learning_rate": 4.760892901743944e-05, "logits/chosen": -0.7159850597381592, "logits/rejected": -0.7726882696151733, "logps/chosen": -487.8374938964844, "logps/rejected": -844.630126953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -39.31949234008789, "rewards/margins": 27.89765739440918, "rewards/rejected": -67.21714782714844, "step": 495 }, { "epoch": 0.5511111111111111, "grad_norm": 69.53382873535156, "learning_rate": 4.759584424871302e-05, "logits/chosen": -0.6221277713775635, "logits/rejected": -0.6317606568336487, "logps/chosen": -381.858154296875, "logps/rejected": -458.45623779296875, "loss": 2.7307, "rewards/accuracies": 0.5, "rewards/chosen": -31.09476089477539, "rewards/margins": 4.367923736572266, "rewards/rejected": -35.462684631347656, "step": 496 }, { "epoch": 0.5522222222222222, "grad_norm": 35.515174865722656, "learning_rate": 4.758272558359059e-05, "logits/chosen": -0.7931150197982788, "logits/rejected": -0.8098109364509583, "logps/chosen": -279.26751708984375, "logps/rejected": -347.8569030761719, "loss": 0.8503, "rewards/accuracies": 0.5, "rewards/chosen": -22.312999725341797, "rewards/margins": 4.545464515686035, "rewards/rejected": -26.85846519470215, "step": 497 }, { "epoch": 0.5533333333333333, "grad_norm": 1.723198652267456, "learning_rate": 4.7569573041751696e-05, "logits/chosen": -0.7823481559753418, "logits/rejected": -0.7844313383102417, "logps/chosen": -264.8987731933594, "logps/rejected": -356.7437744140625, "loss": 0.0289, "rewards/accuracies": 1.0, "rewards/chosen": -20.80377197265625, "rewards/margins": 7.010197639465332, "rewards/rejected": -27.813968658447266, "step": 498 }, { "epoch": 0.5544444444444444, "grad_norm": 0.0016230945475399494, "learning_rate": 4.755638664292666e-05, "logits/chosen": -0.3382144272327423, "logits/rejected": -0.3488667905330658, "logps/chosen": -593.045166015625, "logps/rejected": -805.0645751953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -46.175010681152344, "rewards/margins": 15.360506057739258, "rewards/rejected": -61.53551483154297, "step": 499 }, { "epoch": 0.5555555555555556, "grad_norm": 92.05142211914062, "learning_rate": 4.7543166406896646e-05, "logits/chosen": -0.3090946078300476, "logits/rejected": -0.31108495593070984, "logps/chosen": -441.25360107421875, "logps/rejected": -425.1546325683594, "loss": 3.0722, "rewards/accuracies": 0.5, "rewards/chosen": -32.91456604003906, "rewards/margins": -0.8042154312133789, "rewards/rejected": -32.1103515625, "step": 500 }, { "epoch": 0.5566666666666666, "grad_norm": 70.09380340576172, "learning_rate": 4.752991235349351e-05, "logits/chosen": -0.5095341205596924, "logits/rejected": -0.508000373840332, "logps/chosen": -410.18328857421875, "logps/rejected": -398.6973571777344, "loss": 3.7984, "rewards/accuracies": 0.5, "rewards/chosen": -33.34007263183594, "rewards/margins": -0.5680532455444336, "rewards/rejected": -32.77202224731445, "step": 501 }, { "epoch": 0.5577777777777778, "grad_norm": 3.726977825164795, "learning_rate": 4.751662450259989e-05, "logits/chosen": -0.4888767898082733, "logits/rejected": -0.4937818646430969, "logps/chosen": -698.32421875, "logps/rejected": -750.1864013671875, "loss": 0.0152, "rewards/accuracies": 1.0, "rewards/chosen": -53.797874450683594, "rewards/margins": 4.585552215576172, "rewards/rejected": -58.383426666259766, "step": 502 }, { "epoch": 0.5588888888888889, "grad_norm": 0.004400759004056454, "learning_rate": 4.750330287414912e-05, "logits/chosen": -0.09704133868217468, "logits/rejected": -0.164251446723938, "logps/chosen": -290.5973815917969, "logps/rejected": -657.0825805664062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -22.16842269897461, "rewards/margins": 27.68710708618164, "rewards/rejected": -49.85552978515625, "step": 503 }, { "epoch": 0.56, "grad_norm": 6.706117687826918e-07, "learning_rate": 4.7489947488125175e-05, "logits/chosen": -0.41427865624427795, "logits/rejected": -0.4545383155345917, "logps/chosen": -322.28570556640625, "logps/rejected": -631.9754638671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -25.519062042236328, "rewards/margins": 22.42822265625, "rewards/rejected": -47.94728469848633, "step": 504 }, { "epoch": 0.5611111111111111, "grad_norm": 0.025359978899359703, "learning_rate": 4.7476558364562707e-05, "logits/chosen": -0.5862389206886292, "logits/rejected": -0.42912107706069946, "logps/chosen": -227.64552307128906, "logps/rejected": -383.2972412109375, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -15.757247924804688, "rewards/margins": 9.402442932128906, "rewards/rejected": -25.159690856933594, "step": 505 }, { "epoch": 0.5622222222222222, "grad_norm": 0.001312658772803843, "learning_rate": 4.746313552354694e-05, "logits/chosen": -0.4962073862552643, "logits/rejected": -0.5227873921394348, "logps/chosen": -254.3424835205078, "logps/rejected": -423.3291320800781, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -18.592811584472656, "rewards/margins": 13.672367095947266, "rewards/rejected": -32.26517868041992, "step": 506 }, { "epoch": 0.5633333333333334, "grad_norm": 0.05272407829761505, "learning_rate": 4.74496789852137e-05, "logits/chosen": -0.3458145558834076, "logits/rejected": -0.3289065361022949, "logps/chosen": -173.20108032226562, "logps/rejected": -229.4534912109375, "loss": 0.3467, "rewards/accuracies": 0.5, "rewards/chosen": -12.805727005004883, "rewards/margins": 4.378876686096191, "rewards/rejected": -17.184602737426758, "step": 507 }, { "epoch": 0.5644444444444444, "grad_norm": 0.0001571753527969122, "learning_rate": 4.743618876974938e-05, "logits/chosen": -0.19156277179718018, "logits/rejected": -0.2555132210254669, "logps/chosen": -469.4532470703125, "logps/rejected": -734.75341796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -37.33474349975586, "rewards/margins": 19.59859275817871, "rewards/rejected": -56.93333435058594, "step": 508 }, { "epoch": 0.5655555555555556, "grad_norm": 0.1479106843471527, "learning_rate": 4.742266489739085e-05, "logits/chosen": -0.2007373571395874, "logits/rejected": -0.22685417532920837, "logps/chosen": -260.16546630859375, "logps/rejected": -352.63232421875, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -19.350385665893555, "rewards/margins": 7.166162967681885, "rewards/rejected": -26.51654815673828, "step": 509 }, { "epoch": 0.5666666666666667, "grad_norm": 0.00026178202824667096, "learning_rate": 4.7409107388425504e-05, "logits/chosen": -0.3789467513561249, "logits/rejected": -0.36425358057022095, "logps/chosen": -187.88204956054688, "logps/rejected": -361.45928955078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.290192604064941, "rewards/margins": 15.114446640014648, "rewards/rejected": -27.404640197753906, "step": 510 }, { "epoch": 0.5677777777777778, "grad_norm": 11.169763565063477, "learning_rate": 4.739551626319119e-05, "logits/chosen": -0.3037755489349365, "logits/rejected": -0.3133015036582947, "logps/chosen": -338.2936706542969, "logps/rejected": -385.098876953125, "loss": 0.9967, "rewards/accuracies": 0.5, "rewards/chosen": -27.157730102539062, "rewards/margins": 4.050895690917969, "rewards/rejected": -31.20862579345703, "step": 511 }, { "epoch": 0.5688888888888889, "grad_norm": 0.06742426007986069, "learning_rate": 4.738189154207616e-05, "logits/chosen": -0.23277950286865234, "logits/rejected": -0.24136579036712646, "logps/chosen": -458.871337890625, "logps/rejected": -537.8095092773438, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -35.0903434753418, "rewards/margins": 9.403509140014648, "rewards/rejected": -44.49385070800781, "step": 512 }, { "epoch": 0.57, "grad_norm": 4.4628682136535645, "learning_rate": 4.736823324551909e-05, "logits/chosen": -0.7822916507720947, "logits/rejected": -0.2818727493286133, "logps/chosen": -267.3800048828125, "logps/rejected": -568.7508544921875, "loss": 0.0297, "rewards/accuracies": 1.0, "rewards/chosen": -19.546266555786133, "rewards/margins": 12.606139183044434, "rewards/rejected": -32.15240478515625, "step": 513 }, { "epoch": 0.5711111111111111, "grad_norm": 10.704031944274902, "learning_rate": 4.735454139400902e-05, "logits/chosen": -0.30580058693885803, "logits/rejected": -0.3290247321128845, "logps/chosen": -284.40673828125, "logps/rejected": -417.0465087890625, "loss": 0.192, "rewards/accuracies": 1.0, "rewards/chosen": -21.00873565673828, "rewards/margins": 9.070011138916016, "rewards/rejected": -30.078746795654297, "step": 514 }, { "epoch": 0.5722222222222222, "grad_norm": 3.1198577880859375, "learning_rate": 4.734081600808531e-05, "logits/chosen": -0.03451978415250778, "logits/rejected": -0.014554706402122974, "logps/chosen": -360.0036926269531, "logps/rejected": -437.75408935546875, "loss": 0.03, "rewards/accuracies": 1.0, "rewards/chosen": -26.600852966308594, "rewards/margins": 6.725624084472656, "rewards/rejected": -33.32647705078125, "step": 515 }, { "epoch": 0.5733333333333334, "grad_norm": 0.00015417342365253717, "learning_rate": 4.732705710833764e-05, "logits/chosen": -0.08797413855791092, "logits/rejected": -0.09933739900588989, "logps/chosen": -303.66326904296875, "logps/rejected": -600.9674682617188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -23.54410171508789, "rewards/margins": 24.16721534729004, "rewards/rejected": -47.71131896972656, "step": 516 }, { "epoch": 0.5744444444444444, "grad_norm": 1.4065730571746826, "learning_rate": 4.731326471540597e-05, "logits/chosen": 0.0476020984351635, "logits/rejected": 0.07065797597169876, "logps/chosen": -283.6218566894531, "logps/rejected": -388.3386535644531, "loss": 0.019, "rewards/accuracies": 1.0, "rewards/chosen": -20.968276977539062, "rewards/margins": 6.753623962402344, "rewards/rejected": -27.721900939941406, "step": 517 }, { "epoch": 0.5755555555555556, "grad_norm": 0.07704484462738037, "learning_rate": 4.729943884998048e-05, "logits/chosen": -0.12095600366592407, "logits/rejected": -0.12406669557094574, "logps/chosen": -461.13372802734375, "logps/rejected": -556.1517333984375, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -35.71037673950195, "rewards/margins": 9.485191345214844, "rewards/rejected": -45.1955680847168, "step": 518 }, { "epoch": 0.5766666666666667, "grad_norm": 12.556764602661133, "learning_rate": 4.728557953280159e-05, "logits/chosen": -0.16162455081939697, "logits/rejected": -0.16412736475467682, "logps/chosen": -242.108154296875, "logps/rejected": -286.26556396484375, "loss": 0.3196, "rewards/accuracies": 1.0, "rewards/chosen": -17.166730880737305, "rewards/margins": 3.8580288887023926, "rewards/rejected": -21.02475929260254, "step": 519 }, { "epoch": 0.5777777777777777, "grad_norm": 0.002045472851023078, "learning_rate": 4.727168678465988e-05, "logits/chosen": -0.220831498503685, "logits/rejected": -0.20650292932987213, "logps/chosen": -275.5704345703125, "logps/rejected": -435.09490966796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -18.947044372558594, "rewards/margins": 11.828832626342773, "rewards/rejected": -30.775875091552734, "step": 520 }, { "epoch": 0.5788888888888889, "grad_norm": 2.5849621295928955, "learning_rate": 4.725776062639609e-05, "logits/chosen": -0.19309666752815247, "logits/rejected": -0.19244593381881714, "logps/chosen": -197.22398376464844, "logps/rejected": -233.60678100585938, "loss": 0.0726, "rewards/accuracies": 1.0, "rewards/chosen": -14.720943450927734, "rewards/margins": 2.954658031463623, "rewards/rejected": -17.675601959228516, "step": 521 }, { "epoch": 0.58, "grad_norm": 16.262386322021484, "learning_rate": 4.7243801078901084e-05, "logits/chosen": 0.018162675201892853, "logits/rejected": 0.017284244298934937, "logps/chosen": -535.0092163085938, "logps/rejected": -626.5125732421875, "loss": 0.3437, "rewards/accuracies": 1.0, "rewards/chosen": -40.231849670410156, "rewards/margins": 8.743437767028809, "rewards/rejected": -48.97528839111328, "step": 522 }, { "epoch": 0.5811111111111111, "grad_norm": 9.63356283634198e-10, "learning_rate": 4.72298081631158e-05, "logits/chosen": -0.2667589783668518, "logits/rejected": -0.25576603412628174, "logps/chosen": -287.7998046875, "logps/rejected": -775.5821533203125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -20.008007049560547, "rewards/margins": 37.108184814453125, "rewards/rejected": -57.11619186401367, "step": 523 }, { "epoch": 0.5822222222222222, "grad_norm": 19.82350730895996, "learning_rate": 4.721578190003124e-05, "logits/chosen": -0.20157653093338013, "logits/rejected": -0.10507342219352722, "logps/chosen": -209.9446563720703, "logps/rejected": -388.5279235839844, "loss": 0.949, "rewards/accuracies": 0.5, "rewards/chosen": -14.627119064331055, "rewards/margins": 13.843914031982422, "rewards/rejected": -28.471033096313477, "step": 524 }, { "epoch": 0.5833333333333334, "grad_norm": 17.157495498657227, "learning_rate": 4.7201722310688445e-05, "logits/chosen": -0.19689765572547913, "logits/rejected": -0.19159351289272308, "logps/chosen": -291.86187744140625, "logps/rejected": -395.77581787109375, "loss": 0.2992, "rewards/accuracies": 1.0, "rewards/chosen": -20.5880184173584, "rewards/margins": 8.764786720275879, "rewards/rejected": -29.352806091308594, "step": 525 }, { "epoch": 0.5844444444444444, "grad_norm": 0.5147489309310913, "learning_rate": 4.718762941617841e-05, "logits/chosen": -0.12402834743261337, "logits/rejected": -0.11848525702953339, "logps/chosen": -194.66748046875, "logps/rejected": -303.9326477050781, "loss": 0.0093, "rewards/accuracies": 1.0, "rewards/chosen": -12.601727485656738, "rewards/margins": 6.461307048797607, "rewards/rejected": -19.063034057617188, "step": 526 }, { "epoch": 0.5855555555555556, "grad_norm": 7.926740363473073e-05, "learning_rate": 4.717350323764215e-05, "logits/chosen": -0.20254644751548767, "logits/rejected": -0.18613021075725555, "logps/chosen": -370.8522033691406, "logps/rejected": -537.1632080078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -29.71630096435547, "rewards/margins": 13.62989330291748, "rewards/rejected": -43.346195220947266, "step": 527 }, { "epoch": 0.5866666666666667, "grad_norm": 0.016237081959843636, "learning_rate": 4.7159343796270546e-05, "logits/chosen": -0.11792245507240295, "logits/rejected": -0.12822963297367096, "logps/chosen": -220.93487548828125, "logps/rejected": -316.622802734375, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -15.52653694152832, "rewards/margins": 8.173667907714844, "rewards/rejected": -23.700206756591797, "step": 528 }, { "epoch": 0.5877777777777777, "grad_norm": 3.813746616287972e-07, "learning_rate": 4.714515111330442e-05, "logits/chosen": -0.37973931431770325, "logits/rejected": -0.3472598195075989, "logps/chosen": -625.5733032226562, "logps/rejected": -831.0513916015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -44.686973571777344, "rewards/margins": 19.460533142089844, "rewards/rejected": -64.14750671386719, "step": 529 }, { "epoch": 0.5888888888888889, "grad_norm": 1.513299822807312, "learning_rate": 4.713092521003445e-05, "logits/chosen": -0.15720206499099731, "logits/rejected": -0.18402284383773804, "logps/chosen": -544.4859619140625, "logps/rejected": -773.16748046875, "loss": 0.0142, "rewards/accuracies": 1.0, "rewards/chosen": -40.88871765136719, "rewards/margins": 16.86516761779785, "rewards/rejected": -57.753883361816406, "step": 530 }, { "epoch": 0.59, "grad_norm": 0.37365400791168213, "learning_rate": 4.711666610780115e-05, "logits/chosen": -0.2396746277809143, "logits/rejected": -0.2131005972623825, "logps/chosen": -274.5570068359375, "logps/rejected": -340.07305908203125, "loss": 0.0098, "rewards/accuracies": 1.0, "rewards/chosen": -20.270156860351562, "rewards/margins": 4.704959869384766, "rewards/rejected": -24.975116729736328, "step": 531 }, { "epoch": 0.5911111111111111, "grad_norm": 9.54733657836914, "learning_rate": 4.710237382799483e-05, "logits/chosen": -0.4290142059326172, "logits/rejected": -0.4289799630641937, "logps/chosen": -232.01907348632812, "logps/rejected": -326.3015441894531, "loss": 0.1832, "rewards/accuracies": 1.0, "rewards/chosen": -16.251317977905273, "rewards/margins": 7.161870956420898, "rewards/rejected": -23.413188934326172, "step": 532 }, { "epoch": 0.5922222222222222, "grad_norm": 0.16434408724308014, "learning_rate": 4.708804839205556e-05, "logits/chosen": -0.4479272961616516, "logits/rejected": -0.45871245861053467, "logps/chosen": -533.3278198242188, "logps/rejected": -654.2686767578125, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -41.25101852416992, "rewards/margins": 9.241235733032227, "rewards/rejected": -50.49225616455078, "step": 533 }, { "epoch": 0.5933333333333334, "grad_norm": 46.21477127075195, "learning_rate": 4.707368982147318e-05, "logits/chosen": -0.15983185172080994, "logits/rejected": -0.14363422989845276, "logps/chosen": -452.37860107421875, "logps/rejected": -512.2060546875, "loss": 3.417, "rewards/accuracies": 0.5, "rewards/chosen": -34.322898864746094, "rewards/margins": 4.511208534240723, "rewards/rejected": -38.8341064453125, "step": 534 }, { "epoch": 0.5944444444444444, "grad_norm": 1.845737099647522, "learning_rate": 4.70592981377872e-05, "logits/chosen": -0.3422400951385498, "logits/rejected": -0.35603922605514526, "logps/chosen": -141.70013427734375, "logps/rejected": -358.9424743652344, "loss": 0.0652, "rewards/accuracies": 1.0, "rewards/chosen": -9.731019020080566, "rewards/margins": 17.410463333129883, "rewards/rejected": -27.141483306884766, "step": 535 }, { "epoch": 0.5955555555555555, "grad_norm": 17.68801498413086, "learning_rate": 4.704487336258684e-05, "logits/chosen": -0.0643836259841919, "logits/rejected": -0.06799043715000153, "logps/chosen": -455.81195068359375, "logps/rejected": -558.40576171875, "loss": 0.3343, "rewards/accuracies": 1.0, "rewards/chosen": -37.46582794189453, "rewards/margins": 7.980991363525391, "rewards/rejected": -45.446815490722656, "step": 536 }, { "epoch": 0.5966666666666667, "grad_norm": 0.9353404641151428, "learning_rate": 4.703041551751092e-05, "logits/chosen": -0.2857837975025177, "logits/rejected": -0.3037693500518799, "logps/chosen": -423.10601806640625, "logps/rejected": -560.121826171875, "loss": 0.0069, "rewards/accuracies": 1.0, "rewards/chosen": -31.516338348388672, "rewards/margins": 12.700353622436523, "rewards/rejected": -44.21669006347656, "step": 537 }, { "epoch": 0.5977777777777777, "grad_norm": 4.531661943474319e-07, "learning_rate": 4.701592462424791e-05, "logits/chosen": -0.3965891897678375, "logits/rejected": -0.4650178849697113, "logps/chosen": -393.78497314453125, "logps/rejected": -666.9634399414062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -31.092605590820312, "rewards/margins": 19.285905838012695, "rewards/rejected": -50.378509521484375, "step": 538 }, { "epoch": 0.5988888888888889, "grad_norm": 0.009397444315254688, "learning_rate": 4.700140070453582e-05, "logits/chosen": -0.27354055643081665, "logits/rejected": -0.27703532576560974, "logps/chosen": -259.4410095214844, "logps/rejected": -427.646728515625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -17.893125534057617, "rewards/margins": 13.971839904785156, "rewards/rejected": -31.864965438842773, "step": 539 }, { "epoch": 0.6, "grad_norm": 0.0004701978759840131, "learning_rate": 4.698684378016222e-05, "logits/chosen": -0.19382712244987488, "logits/rejected": -0.16517198085784912, "logps/chosen": -308.03350830078125, "logps/rejected": -512.0942993164062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -21.465797424316406, "rewards/margins": 17.566890716552734, "rewards/rejected": -39.032684326171875, "step": 540 }, { "epoch": 0.6, "eval_logits/chosen": -0.5492357611656189, "eval_logits/rejected": -0.5420712232589722, "eval_logps/chosen": -346.34442138671875, "eval_logps/rejected": -482.45391845703125, "eval_loss": 0.5548104643821716, "eval_rewards/accuracies": 0.8550000190734863, "eval_rewards/chosen": -25.857332229614258, "eval_rewards/margins": 10.930427551269531, "eval_rewards/rejected": -36.787757873535156, "eval_runtime": 84.6071, "eval_samples_per_second": 2.364, "eval_steps_per_second": 0.295, "step": 540 }, { "epoch": 0.6011111111111112, "grad_norm": 17.386497497558594, "learning_rate": 4.697225387296422e-05, "logits/chosen": -0.3418370187282562, "logits/rejected": -0.3126104176044464, "logps/chosen": -141.2191925048828, "logps/rejected": -151.21658325195312, "loss": 0.6345, "rewards/accuracies": 0.5, "rewards/chosen": -8.425697326660156, "rewards/margins": 0.3129005432128906, "rewards/rejected": -8.738597869873047, "step": 541 }, { "epoch": 0.6022222222222222, "grad_norm": 2.531743064082548e-07, "learning_rate": 4.695763100482834e-05, "logits/chosen": -0.3002645969390869, "logits/rejected": -0.30263441801071167, "logps/chosen": -432.171630859375, "logps/rejected": -710.423095703125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -33.17323303222656, "rewards/margins": 21.90402603149414, "rewards/rejected": -55.07726287841797, "step": 542 }, { "epoch": 0.6033333333333334, "grad_norm": 0.042111024260520935, "learning_rate": 4.6942975197690604e-05, "logits/chosen": -0.4909493029117584, "logits/rejected": -0.48899468779563904, "logps/chosen": -233.61334228515625, "logps/rejected": -340.1009521484375, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -16.50295639038086, "rewards/margins": 8.342466354370117, "rewards/rejected": -24.845422744750977, "step": 543 }, { "epoch": 0.6044444444444445, "grad_norm": 0.636889636516571, "learning_rate": 4.692828647353642e-05, "logits/chosen": -0.48118001222610474, "logits/rejected": -0.42380911111831665, "logps/chosen": -144.78175354003906, "logps/rejected": -245.02859497070312, "loss": 0.0075, "rewards/accuracies": 1.0, "rewards/chosen": -10.474493026733398, "rewards/margins": 5.493167400360107, "rewards/rejected": -15.967660903930664, "step": 544 }, { "epoch": 0.6055555555555555, "grad_norm": 51.69217300415039, "learning_rate": 4.6913564854400595e-05, "logits/chosen": -0.25049489736557007, "logits/rejected": -0.24619705975055695, "logps/chosen": -441.3612060546875, "logps/rejected": -483.95477294921875, "loss": 1.9761, "rewards/accuracies": 0.5, "rewards/chosen": -34.66702651977539, "rewards/margins": 3.8220090866088867, "rewards/rejected": -38.489036560058594, "step": 545 }, { "epoch": 0.6066666666666667, "grad_norm": 0.0005876933573745191, "learning_rate": 4.689881036236726e-05, "logits/chosen": -0.3353941738605499, "logits/rejected": -0.3433823883533478, "logps/chosen": -290.7570495605469, "logps/rejected": -589.6549072265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -22.030593872070312, "rewards/margins": 22.633899688720703, "rewards/rejected": -44.664493560791016, "step": 546 }, { "epoch": 0.6077777777777778, "grad_norm": 2.805171251296997, "learning_rate": 4.688402301956988e-05, "logits/chosen": -0.13314960896968842, "logits/rejected": -0.11821696162223816, "logps/chosen": -225.88272094726562, "logps/rejected": -340.85577392578125, "loss": 0.0559, "rewards/accuracies": 1.0, "rewards/chosen": -15.116338729858398, "rewards/margins": 8.09093189239502, "rewards/rejected": -23.207271575927734, "step": 547 }, { "epoch": 0.6088888888888889, "grad_norm": 0.0004591559409163892, "learning_rate": 4.6869202848191174e-05, "logits/chosen": -0.255156010389328, "logits/rejected": -0.24705356359481812, "logps/chosen": -215.2024688720703, "logps/rejected": -432.97467041015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.263492584228516, "rewards/margins": 16.56711769104004, "rewards/rejected": -30.830608367919922, "step": 548 }, { "epoch": 0.61, "grad_norm": 0.001135864295065403, "learning_rate": 4.685434987046314e-05, "logits/chosen": -0.3368328809738159, "logits/rejected": -0.3904187083244324, "logps/chosen": -204.973388671875, "logps/rejected": -373.6313171386719, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.004629135131836, "rewards/margins": 14.332122802734375, "rewards/rejected": -27.33675193786621, "step": 549 }, { "epoch": 0.6111111111111112, "grad_norm": 1.4715870975123835e-06, "learning_rate": 4.683946410866696e-05, "logits/chosen": -0.11414772272109985, "logits/rejected": -0.11652138084173203, "logps/chosen": -332.3826904296875, "logps/rejected": -560.5443725585938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -25.178226470947266, "rewards/margins": 18.321489334106445, "rewards/rejected": -43.499717712402344, "step": 550 }, { "epoch": 0.6122222222222222, "grad_norm": 0.29652732610702515, "learning_rate": 4.682454558513303e-05, "logits/chosen": -0.18449267745018005, "logits/rejected": -0.19313111901283264, "logps/chosen": -312.637451171875, "logps/rejected": -429.58709716796875, "loss": 0.0052, "rewards/accuracies": 1.0, "rewards/chosen": -24.03350257873535, "rewards/margins": 8.40461540222168, "rewards/rejected": -32.43811798095703, "step": 551 }, { "epoch": 0.6133333333333333, "grad_norm": 30.235132217407227, "learning_rate": 4.680959432224085e-05, "logits/chosen": -0.21501684188842773, "logits/rejected": -0.2082269936800003, "logps/chosen": -307.752685546875, "logps/rejected": -396.7143859863281, "loss": 1.0737, "rewards/accuracies": 0.5, "rewards/chosen": -21.421157836914062, "rewards/margins": 7.94624137878418, "rewards/rejected": -29.36739730834961, "step": 552 }, { "epoch": 0.6144444444444445, "grad_norm": 36.63205337524414, "learning_rate": 4.679461034241906e-05, "logits/chosen": -0.3350578844547272, "logits/rejected": -0.3456741273403168, "logps/chosen": -592.3206787109375, "logps/rejected": -627.080810546875, "loss": 0.3834, "rewards/accuracies": 0.5, "rewards/chosen": -45.48170852661133, "rewards/margins": 5.463123321533203, "rewards/rejected": -50.94483184814453, "step": 553 }, { "epoch": 0.6155555555555555, "grad_norm": 28.36060333251953, "learning_rate": 4.6779593668145385e-05, "logits/chosen": 0.008850380778312683, "logits/rejected": 0.03185790777206421, "logps/chosen": -375.17242431640625, "logps/rejected": -440.510498046875, "loss": 0.4905, "rewards/accuracies": 0.5, "rewards/chosen": -29.207782745361328, "rewards/margins": 4.400762557983398, "rewards/rejected": -33.608543395996094, "step": 554 }, { "epoch": 0.6166666666666667, "grad_norm": 23.412044525146484, "learning_rate": 4.676454432194656e-05, "logits/chosen": -0.36839568614959717, "logits/rejected": -0.36404934525489807, "logps/chosen": -338.24615478515625, "logps/rejected": -401.253662109375, "loss": 1.2251, "rewards/accuracies": 0.5, "rewards/chosen": -26.413196563720703, "rewards/margins": 3.8444414138793945, "rewards/rejected": -30.257638931274414, "step": 555 }, { "epoch": 0.6177777777777778, "grad_norm": 0.013944488950073719, "learning_rate": 4.674946232639838e-05, "logits/chosen": -0.3178008496761322, "logits/rejected": -0.3191778361797333, "logps/chosen": -341.4825439453125, "logps/rejected": -464.94952392578125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -27.497806549072266, "rewards/margins": 9.408833503723145, "rewards/rejected": -36.906639099121094, "step": 556 }, { "epoch": 0.6188888888888889, "grad_norm": 0.09893468767404556, "learning_rate": 4.673434770412558e-05, "logits/chosen": -0.2729421854019165, "logits/rejected": -0.2533239424228668, "logps/chosen": -295.27392578125, "logps/rejected": -404.70050048828125, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -21.035446166992188, "rewards/margins": 6.84163761138916, "rewards/rejected": -27.87708282470703, "step": 557 }, { "epoch": 0.62, "grad_norm": 3.3922793865203857, "learning_rate": 4.671920047780186e-05, "logits/chosen": -0.22916260361671448, "logits/rejected": -0.25855040550231934, "logps/chosen": -325.1375427246094, "logps/rejected": -462.2332763671875, "loss": 0.0788, "rewards/accuracies": 1.0, "rewards/chosen": -22.447940826416016, "rewards/margins": 10.985625267028809, "rewards/rejected": -33.433563232421875, "step": 558 }, { "epoch": 0.6211111111111111, "grad_norm": 0.002327159745618701, "learning_rate": 4.6704020670149815e-05, "logits/chosen": -0.3058890998363495, "logits/rejected": -0.2893940210342407, "logps/chosen": -304.17913818359375, "logps/rejected": -508.7514953613281, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -23.63442611694336, "rewards/margins": 17.061012268066406, "rewards/rejected": -40.695438385009766, "step": 559 }, { "epoch": 0.6222222222222222, "grad_norm": 0.009385719895362854, "learning_rate": 4.668880830394093e-05, "logits/chosen": -0.10829828679561615, "logits/rejected": -0.12944351136684418, "logps/chosen": -406.62591552734375, "logps/rejected": -535.9666748046875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -27.494121551513672, "rewards/margins": 12.503243446350098, "rewards/rejected": -39.99736785888672, "step": 560 }, { "epoch": 0.6233333333333333, "grad_norm": 2.292879104614258, "learning_rate": 4.667356340199551e-05, "logits/chosen": -0.20534977316856384, "logits/rejected": -0.21396484971046448, "logps/chosen": -325.428466796875, "logps/rejected": -504.60015869140625, "loss": 0.0597, "rewards/accuracies": 1.0, "rewards/chosen": -22.098831176757812, "rewards/margins": 12.77750301361084, "rewards/rejected": -34.87633514404297, "step": 561 }, { "epoch": 0.6244444444444445, "grad_norm": 0.4360617399215698, "learning_rate": 4.6658285987182706e-05, "logits/chosen": -0.255919873714447, "logits/rejected": -0.2649689316749573, "logps/chosen": -323.66302490234375, "logps/rejected": -383.34307861328125, "loss": 0.008, "rewards/accuracies": 1.0, "rewards/chosen": -22.691913604736328, "rewards/margins": 4.83267879486084, "rewards/rejected": -27.524593353271484, "step": 562 }, { "epoch": 0.6255555555555555, "grad_norm": 6.083060952732922e-07, "learning_rate": 4.66429760824204e-05, "logits/chosen": -0.14925958216190338, "logits/rejected": -0.1308813989162445, "logps/chosen": -390.97369384765625, "logps/rejected": -668.5313720703125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -31.307640075683594, "rewards/margins": 22.61144256591797, "rewards/rejected": -53.91908264160156, "step": 563 }, { "epoch": 0.6266666666666667, "grad_norm": 0.1439003348350525, "learning_rate": 4.6627633710675236e-05, "logits/chosen": -0.061461418867111206, "logits/rejected": -0.04503050073981285, "logps/chosen": -488.79986572265625, "logps/rejected": -715.93994140625, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -38.772377014160156, "rewards/margins": 14.356781005859375, "rewards/rejected": -53.12915802001953, "step": 564 }, { "epoch": 0.6277777777777778, "grad_norm": 25.06688117980957, "learning_rate": 4.6612258894962563e-05, "logits/chosen": -0.10525703430175781, "logits/rejected": -0.17769083380699158, "logps/chosen": -309.36505126953125, "logps/rejected": -355.01385498046875, "loss": 0.3174, "rewards/accuracies": 1.0, "rewards/chosen": -22.650850296020508, "rewards/margins": 4.225459098815918, "rewards/rejected": -26.87630844116211, "step": 565 }, { "epoch": 0.6288888888888889, "grad_norm": 37.79369354248047, "learning_rate": 4.6596851658346384e-05, "logits/chosen": -0.16395774483680725, "logits/rejected": -0.1621040403842926, "logps/chosen": -255.31576538085938, "logps/rejected": -474.5025939941406, "loss": 0.7145, "rewards/accuracies": 0.5, "rewards/chosen": -19.198623657226562, "rewards/margins": 16.048450469970703, "rewards/rejected": -35.24707794189453, "step": 566 }, { "epoch": 0.63, "grad_norm": 101.3075942993164, "learning_rate": 4.6581412023939354e-05, "logits/chosen": -0.09420419484376907, "logits/rejected": -0.0320439338684082, "logps/chosen": -533.2174072265625, "logps/rejected": -553.0574951171875, "loss": 2.029, "rewards/accuracies": 0.5, "rewards/chosen": -36.482208251953125, "rewards/margins": 4.767735481262207, "rewards/rejected": -41.24994659423828, "step": 567 }, { "epoch": 0.6311111111111111, "grad_norm": 4.2629218101501465, "learning_rate": 4.656594001490271e-05, "logits/chosen": -0.5332432985305786, "logits/rejected": -0.5306819677352905, "logps/chosen": -202.43106079101562, "logps/rejected": -273.94415283203125, "loss": 0.119, "rewards/accuracies": 1.0, "rewards/chosen": -13.630614280700684, "rewards/margins": 5.412919044494629, "rewards/rejected": -19.043533325195312, "step": 568 }, { "epoch": 0.6322222222222222, "grad_norm": 10.65031909942627, "learning_rate": 4.655043565444628e-05, "logits/chosen": 0.0930953323841095, "logits/rejected": 0.1243007630109787, "logps/chosen": -304.723388671875, "logps/rejected": -406.22119140625, "loss": 0.1426, "rewards/accuracies": 1.0, "rewards/chosen": -22.824581146240234, "rewards/margins": 8.099909782409668, "rewards/rejected": -30.92449188232422, "step": 569 }, { "epoch": 0.6333333333333333, "grad_norm": 0.48418760299682617, "learning_rate": 4.6534898965828405e-05, "logits/chosen": -0.27050575613975525, "logits/rejected": -0.2732468843460083, "logps/chosen": -262.30072021484375, "logps/rejected": -314.73773193359375, "loss": 0.0103, "rewards/accuracies": 1.0, "rewards/chosen": -18.796161651611328, "rewards/margins": 4.6018524169921875, "rewards/rejected": -23.398014068603516, "step": 570 }, { "epoch": 0.6344444444444445, "grad_norm": 41.678890228271484, "learning_rate": 4.651932997235592e-05, "logits/chosen": -0.3281720280647278, "logits/rejected": -0.32817238569259644, "logps/chosen": -553.257568359375, "logps/rejected": -627.424072265625, "loss": 4.5186, "rewards/accuracies": 0.5, "rewards/chosen": -47.10371017456055, "rewards/margins": 4.824405670166016, "rewards/rejected": -51.92811584472656, "step": 571 }, { "epoch": 0.6355555555555555, "grad_norm": 16.753128051757812, "learning_rate": 4.650372869738414e-05, "logits/chosen": -0.3994610011577606, "logits/rejected": -0.3997359275817871, "logps/chosen": -165.3798828125, "logps/rejected": -281.8898010253906, "loss": 0.3651, "rewards/accuracies": 0.5, "rewards/chosen": -11.977753639221191, "rewards/margins": 7.094931602478027, "rewards/rejected": -19.07268524169922, "step": 572 }, { "epoch": 0.6366666666666667, "grad_norm": 63.23877716064453, "learning_rate": 4.648809516431678e-05, "logits/chosen": -0.03802650421857834, "logits/rejected": -0.0350295826792717, "logps/chosen": -671.381591796875, "logps/rejected": -648.8165893554688, "loss": 6.7283, "rewards/accuracies": 0.5, "rewards/chosen": -54.4600715637207, "rewards/margins": -0.8225574493408203, "rewards/rejected": -53.63751220703125, "step": 573 }, { "epoch": 0.6377777777777778, "grad_norm": 1.055150914908154e-05, "learning_rate": 4.647242939660596e-05, "logits/chosen": -0.06833849102258682, "logits/rejected": -0.042555805295705795, "logps/chosen": -435.78741455078125, "logps/rejected": -663.6184692382812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -32.99317932128906, "rewards/margins": 17.258655548095703, "rewards/rejected": -50.251834869384766, "step": 574 }, { "epoch": 0.6388888888888888, "grad_norm": 0.005665441509336233, "learning_rate": 4.645673141775217e-05, "logits/chosen": 0.010716710239648819, "logits/rejected": 0.05658222734928131, "logps/chosen": -243.54454040527344, "logps/rejected": -401.67413330078125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -15.323282241821289, "rewards/margins": 10.848529815673828, "rewards/rejected": -26.171812057495117, "step": 575 }, { "epoch": 0.64, "grad_norm": 8.266736030578613, "learning_rate": 4.644100125130418e-05, "logits/chosen": -0.21663770079612732, "logits/rejected": -0.21535225212574005, "logps/chosen": -322.76239013671875, "logps/rejected": -380.2269287109375, "loss": 0.1487, "rewards/accuracies": 1.0, "rewards/chosen": -24.543964385986328, "rewards/margins": 5.265473365783691, "rewards/rejected": -29.809436798095703, "step": 576 }, { "epoch": 0.6411111111111111, "grad_norm": 0.011924519203603268, "learning_rate": 4.6425238920859087e-05, "logits/chosen": 0.014540478587150574, "logits/rejected": 0.015614144504070282, "logps/chosen": -287.3695983886719, "logps/rejected": -463.8795166015625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -20.5944881439209, "rewards/margins": 14.785709381103516, "rewards/rejected": -35.38019561767578, "step": 577 }, { "epoch": 0.6422222222222222, "grad_norm": 0.002787942998111248, "learning_rate": 4.6409444450062226e-05, "logits/chosen": -0.20297771692276, "logits/rejected": -0.1893511563539505, "logps/chosen": -252.7091827392578, "logps/rejected": -458.9480895996094, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -19.02452278137207, "rewards/margins": 16.167865753173828, "rewards/rejected": -35.19239044189453, "step": 578 }, { "epoch": 0.6433333333333333, "grad_norm": 0.010718289762735367, "learning_rate": 4.639361786260713e-05, "logits/chosen": -0.166642427444458, "logits/rejected": -0.19896408915519714, "logps/chosen": -235.65335083007812, "logps/rejected": -456.7757568359375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -15.9784574508667, "rewards/margins": 18.48407745361328, "rewards/rejected": -34.4625358581543, "step": 579 }, { "epoch": 0.6444444444444445, "grad_norm": 0.8168871402740479, "learning_rate": 4.6377759182235516e-05, "logits/chosen": -0.19358763098716736, "logits/rejected": -0.19576969742774963, "logps/chosen": -216.92239379882812, "logps/rejected": -322.3817443847656, "loss": 0.0147, "rewards/accuracies": 1.0, "rewards/chosen": -16.91401481628418, "rewards/margins": 8.394937515258789, "rewards/rejected": -25.30895233154297, "step": 580 }, { "epoch": 0.6455555555555555, "grad_norm": 1.3539398908615112, "learning_rate": 4.636186843273727e-05, "logits/chosen": 0.16789288818836212, "logits/rejected": 0.1665099710226059, "logps/chosen": -293.5237731933594, "logps/rejected": -394.75506591796875, "loss": 0.0131, "rewards/accuracies": 1.0, "rewards/chosen": -21.84799575805664, "rewards/margins": 8.547290802001953, "rewards/rejected": -30.395286560058594, "step": 581 }, { "epoch": 0.6466666666666666, "grad_norm": 1.9487749338150024, "learning_rate": 4.634594563795035e-05, "logits/chosen": 0.3089434504508972, "logits/rejected": 0.31129804253578186, "logps/chosen": -396.7366638183594, "logps/rejected": -573.07177734375, "loss": 0.0616, "rewards/accuracies": 1.0, "rewards/chosen": -27.954021453857422, "rewards/margins": 13.36994743347168, "rewards/rejected": -41.32396697998047, "step": 582 }, { "epoch": 0.6477777777777778, "grad_norm": 0.09514360129833221, "learning_rate": 4.632999082176081e-05, "logits/chosen": -0.11944103240966797, "logits/rejected": -0.13302156329154968, "logps/chosen": -298.8192138671875, "logps/rejected": -460.4969482421875, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -21.053077697753906, "rewards/margins": 12.658792495727539, "rewards/rejected": -33.71186828613281, "step": 583 }, { "epoch": 0.6488888888888888, "grad_norm": 4.528407573699951, "learning_rate": 4.6314004008102726e-05, "logits/chosen": -0.16015923023223877, "logits/rejected": -0.14851638674736023, "logps/chosen": -232.71926879882812, "logps/rejected": -369.34954833984375, "loss": 0.1001, "rewards/accuracies": 1.0, "rewards/chosen": -16.499530792236328, "rewards/margins": 10.935506820678711, "rewards/rejected": -27.43503761291504, "step": 584 }, { "epoch": 0.65, "grad_norm": 1.0728104515855819e-10, "learning_rate": 4.629798522095818e-05, "logits/chosen": 0.004231639206409454, "logits/rejected": -0.0689489021897316, "logps/chosen": -482.7322998046875, "logps/rejected": -943.2256469726562, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -33.82907485961914, "rewards/margins": 34.29050064086914, "rewards/rejected": -68.11957550048828, "step": 585 }, { "epoch": 0.6511111111111111, "grad_norm": 0.16976261138916016, "learning_rate": 4.628193448435721e-05, "logits/chosen": -0.36306819319725037, "logits/rejected": -0.3547168970108032, "logps/chosen": -219.43426513671875, "logps/rejected": -462.8876037597656, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -16.399883270263672, "rewards/margins": 20.967498779296875, "rewards/rejected": -37.36738204956055, "step": 586 }, { "epoch": 0.6522222222222223, "grad_norm": 0.005403765477240086, "learning_rate": 4.626585182237781e-05, "logits/chosen": -0.06416279077529907, "logits/rejected": -0.06326672434806824, "logps/chosen": -399.2923583984375, "logps/rejected": -522.4906005859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -31.72199249267578, "rewards/margins": 11.042609214782715, "rewards/rejected": -42.76460266113281, "step": 587 }, { "epoch": 0.6533333333333333, "grad_norm": 0.20559951663017273, "learning_rate": 4.624973725914582e-05, "logits/chosen": -0.06210920214653015, "logits/rejected": -0.06960634887218475, "logps/chosen": -241.14166259765625, "logps/rejected": -395.2648620605469, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": -18.078861236572266, "rewards/margins": 13.175414085388184, "rewards/rejected": -31.254276275634766, "step": 588 }, { "epoch": 0.6544444444444445, "grad_norm": 3.453816316323355e-05, "learning_rate": 4.623359081883498e-05, "logits/chosen": -0.14930155873298645, "logits/rejected": -0.12734808027744293, "logps/chosen": -447.2582092285156, "logps/rejected": -645.9642944335938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -32.283355712890625, "rewards/margins": 17.143543243408203, "rewards/rejected": -49.426902770996094, "step": 589 }, { "epoch": 0.6555555555555556, "grad_norm": 7.658502101898193, "learning_rate": 4.621741252566681e-05, "logits/chosen": -0.2760390043258667, "logits/rejected": -0.27568376064300537, "logps/chosen": -208.30165100097656, "logps/rejected": -327.44976806640625, "loss": 0.1176, "rewards/accuracies": 1.0, "rewards/chosen": -15.494443893432617, "rewards/margins": 9.504720687866211, "rewards/rejected": -24.999164581298828, "step": 590 }, { "epoch": 0.6566666666666666, "grad_norm": 45.01296615600586, "learning_rate": 4.620120240391065e-05, "logits/chosen": -0.1960555762052536, "logits/rejected": -0.22459545731544495, "logps/chosen": -477.4336242675781, "logps/rejected": -574.9246826171875, "loss": 0.6201, "rewards/accuracies": 0.5, "rewards/chosen": -37.31473922729492, "rewards/margins": 8.951613426208496, "rewards/rejected": -46.26634979248047, "step": 591 }, { "epoch": 0.6577777777777778, "grad_norm": 0.006973618175834417, "learning_rate": 4.6184960477883564e-05, "logits/chosen": -0.2398427575826645, "logits/rejected": -0.24383138120174408, "logps/chosen": -205.08560180664062, "logps/rejected": -400.4714660644531, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -15.939797401428223, "rewards/margins": 16.245677947998047, "rewards/rejected": -32.18547821044922, "step": 592 }, { "epoch": 0.6588888888888889, "grad_norm": 3.536868131703841e-09, "learning_rate": 4.616868677195032e-05, "logits/chosen": -0.30624037981033325, "logits/rejected": -0.31590378284454346, "logps/chosen": -279.830322265625, "logps/rejected": -650.5941162109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -22.089313507080078, "rewards/margins": 28.618667602539062, "rewards/rejected": -50.70798110961914, "step": 593 }, { "epoch": 0.66, "grad_norm": 0.831355094909668, "learning_rate": 4.6152381310523387e-05, "logits/chosen": -0.2871212363243103, "logits/rejected": -0.28497040271759033, "logps/chosen": -338.08441162109375, "logps/rejected": -422.45172119140625, "loss": 0.0181, "rewards/accuracies": 1.0, "rewards/chosen": -27.37787628173828, "rewards/margins": 6.567145824432373, "rewards/rejected": -33.94502258300781, "step": 594 }, { "epoch": 0.6611111111111111, "grad_norm": 46.180110931396484, "learning_rate": 4.613604411806285e-05, "logits/chosen": -0.4201611280441284, "logits/rejected": -0.42428362369537354, "logps/chosen": -382.445068359375, "logps/rejected": -389.75018310546875, "loss": 7.166, "rewards/accuracies": 0.5, "rewards/chosen": -29.355180740356445, "rewards/margins": 0.7815532684326172, "rewards/rejected": -30.136734008789062, "step": 595 }, { "epoch": 0.6622222222222223, "grad_norm": 12.56535530090332, "learning_rate": 4.6119675219076395e-05, "logits/chosen": -0.5199668407440186, "logits/rejected": -0.5189607739448547, "logps/chosen": -357.8254699707031, "logps/rejected": -383.20654296875, "loss": 0.4997, "rewards/accuracies": 0.5, "rewards/chosen": -29.567977905273438, "rewards/margins": 1.6890382766723633, "rewards/rejected": -31.257017135620117, "step": 596 }, { "epoch": 0.6633333333333333, "grad_norm": 89.61067962646484, "learning_rate": 4.610327463811927e-05, "logits/chosen": -0.3886187970638275, "logits/rejected": -0.37615031003952026, "logps/chosen": -324.5775146484375, "logps/rejected": -225.33084106445312, "loss": 6.4099, "rewards/accuracies": 0.0, "rewards/chosen": -23.124879837036133, "rewards/margins": -6.359364032745361, "rewards/rejected": -16.765514373779297, "step": 597 }, { "epoch": 0.6644444444444444, "grad_norm": 25.794729232788086, "learning_rate": 4.608684239979427e-05, "logits/chosen": -0.4059697985649109, "logits/rejected": -0.41584140062332153, "logps/chosen": -423.3397521972656, "logps/rejected": -513.4691772460938, "loss": 0.3348, "rewards/accuracies": 1.0, "rewards/chosen": -33.950279235839844, "rewards/margins": 6.717349052429199, "rewards/rejected": -40.66762924194336, "step": 598 }, { "epoch": 0.6655555555555556, "grad_norm": 1.5457817316055298, "learning_rate": 4.607037852875165e-05, "logits/chosen": -0.11175529658794403, "logits/rejected": -0.12603402137756348, "logps/chosen": -824.0009155273438, "logps/rejected": -1146.801025390625, "loss": 0.0085, "rewards/accuracies": 1.0, "rewards/chosen": -66.72039794921875, "rewards/margins": 26.797300338745117, "rewards/rejected": -93.5177001953125, "step": 599 }, { "epoch": 0.6666666666666666, "grad_norm": 0.03328476473689079, "learning_rate": 4.6053883049689145e-05, "logits/chosen": -0.2059912085533142, "logits/rejected": -0.19204923510551453, "logps/chosen": -221.41253662109375, "logps/rejected": -444.06494140625, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -15.262081146240234, "rewards/margins": 18.680173873901367, "rewards/rejected": -33.94225311279297, "step": 600 }, { "epoch": 0.6677777777777778, "grad_norm": 26.29569435119629, "learning_rate": 4.603735598735189e-05, "logits/chosen": -0.20711588859558105, "logits/rejected": -0.22018158435821533, "logps/chosen": -571.0641479492188, "logps/rejected": -648.575927734375, "loss": 0.2863, "rewards/accuracies": 1.0, "rewards/chosen": -43.81731414794922, "rewards/margins": 7.9801483154296875, "rewards/rejected": -51.797462463378906, "step": 601 }, { "epoch": 0.6688888888888889, "grad_norm": 0.06850291788578033, "learning_rate": 4.6020797366532397e-05, "logits/chosen": -0.1587243527173996, "logits/rejected": -0.14845257997512817, "logps/chosen": -358.0841064453125, "logps/rejected": -476.3421936035156, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -26.45770263671875, "rewards/margins": 11.555975914001465, "rewards/rejected": -38.01367950439453, "step": 602 }, { "epoch": 0.67, "grad_norm": 5.401813983917236, "learning_rate": 4.600420721207053e-05, "logits/chosen": -0.36023616790771484, "logits/rejected": -0.3306226134300232, "logps/chosen": -234.05059814453125, "logps/rejected": -299.0948486328125, "loss": 0.3339, "rewards/accuracies": 1.0, "rewards/chosen": -16.820104598999023, "rewards/margins": 5.061682224273682, "rewards/rejected": -21.881786346435547, "step": 603 }, { "epoch": 0.6711111111111111, "grad_norm": 0.06338556855916977, "learning_rate": 4.598758554885344e-05, "logits/chosen": -0.26342374086380005, "logits/rejected": -0.19157905876636505, "logps/chosen": -300.0224609375, "logps/rejected": -469.6527099609375, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -21.832918167114258, "rewards/margins": 13.517375946044922, "rewards/rejected": -35.35029602050781, "step": 604 }, { "epoch": 0.6722222222222223, "grad_norm": 5.421672344207764, "learning_rate": 4.597093240181557e-05, "logits/chosen": -0.2668984532356262, "logits/rejected": -0.2602892816066742, "logps/chosen": -381.8797302246094, "logps/rejected": -452.42547607421875, "loss": 0.1293, "rewards/accuracies": 1.0, "rewards/chosen": -31.15137481689453, "rewards/margins": 3.751138687133789, "rewards/rejected": -34.90251159667969, "step": 605 }, { "epoch": 0.6733333333333333, "grad_norm": 18.384096145629883, "learning_rate": 4.595424779593857e-05, "logits/chosen": -0.315964937210083, "logits/rejected": -0.31824928522109985, "logps/chosen": -181.3050537109375, "logps/rejected": -180.639404296875, "loss": 0.8741, "rewards/accuracies": 0.5, "rewards/chosen": -13.87405014038086, "rewards/margins": -0.29682350158691406, "rewards/rejected": -13.577226638793945, "step": 606 }, { "epoch": 0.6744444444444444, "grad_norm": 0.00011058315431000665, "learning_rate": 4.593753175625129e-05, "logits/chosen": -0.2606033980846405, "logits/rejected": -0.2512040436267853, "logps/chosen": -545.7975463867188, "logps/rejected": -812.1452026367188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -43.37498474121094, "rewards/margins": 22.58294677734375, "rewards/rejected": -65.95793151855469, "step": 607 }, { "epoch": 0.6755555555555556, "grad_norm": 0.0003045557241421193, "learning_rate": 4.592078430782975e-05, "logits/chosen": -0.09015306830406189, "logits/rejected": -0.07258119434118271, "logps/chosen": -258.5989990234375, "logps/rejected": -433.4585876464844, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -18.866117477416992, "rewards/margins": 14.939373016357422, "rewards/rejected": -33.80549240112305, "step": 608 }, { "epoch": 0.6766666666666666, "grad_norm": 16.166093826293945, "learning_rate": 4.5904005475797055e-05, "logits/chosen": -0.38979193568229675, "logits/rejected": -0.40878626704216003, "logps/chosen": -482.67767333984375, "logps/rejected": -535.4376220703125, "loss": 0.936, "rewards/accuracies": 0.5, "rewards/chosen": -38.514915466308594, "rewards/margins": 4.867245197296143, "rewards/rejected": -43.38216018676758, "step": 609 }, { "epoch": 0.6777777777777778, "grad_norm": 0.07436764985322952, "learning_rate": 4.588719528532342e-05, "logits/chosen": -0.23602789640426636, "logits/rejected": -0.21144422888755798, "logps/chosen": -344.75970458984375, "logps/rejected": -507.07623291015625, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -27.510324478149414, "rewards/margins": 12.28441047668457, "rewards/rejected": -39.794734954833984, "step": 610 }, { "epoch": 0.6788888888888889, "grad_norm": 0.8378509283065796, "learning_rate": 4.5870353761626065e-05, "logits/chosen": -0.4565754532814026, "logits/rejected": -0.4592251181602478, "logps/chosen": -426.9901123046875, "logps/rejected": -466.6265869140625, "loss": 0.0186, "rewards/accuracies": 1.0, "rewards/chosen": -32.91936111450195, "rewards/margins": 4.35479211807251, "rewards/rejected": -37.27415466308594, "step": 611 }, { "epoch": 0.68, "grad_norm": 0.004257762338966131, "learning_rate": 4.585348092996925e-05, "logits/chosen": -0.21731507778167725, "logits/rejected": -0.1769104152917862, "logps/chosen": -379.5220642089844, "logps/rejected": -484.2578125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -26.8493595123291, "rewards/margins": 9.883251190185547, "rewards/rejected": -36.73261260986328, "step": 612 }, { "epoch": 0.6811111111111111, "grad_norm": 0.0024775194469839334, "learning_rate": 4.583657681566419e-05, "logits/chosen": -0.14008629322052002, "logits/rejected": -0.1312515139579773, "logps/chosen": -394.0913391113281, "logps/rejected": -582.4666748046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -30.158607482910156, "rewards/margins": 14.257041931152344, "rewards/rejected": -44.4156494140625, "step": 613 }, { "epoch": 0.6822222222222222, "grad_norm": 1.2654374837875366, "learning_rate": 4.581964144406901e-05, "logits/chosen": -0.3330204486846924, "logits/rejected": -0.3742793798446655, "logps/chosen": -321.8624267578125, "logps/rejected": -423.0284423828125, "loss": 0.014, "rewards/accuracies": 1.0, "rewards/chosen": -22.32745361328125, "rewards/margins": 7.676220417022705, "rewards/rejected": -30.003673553466797, "step": 614 }, { "epoch": 0.6833333333333333, "grad_norm": 50.312095642089844, "learning_rate": 4.580267484058876e-05, "logits/chosen": -0.22435440123081207, "logits/rejected": -0.2168644219636917, "logps/chosen": -287.90570068359375, "logps/rejected": -311.7400817871094, "loss": 2.6458, "rewards/accuracies": 0.5, "rewards/chosen": -23.2194766998291, "rewards/margins": 0.6631088256835938, "rewards/rejected": -23.882585525512695, "step": 615 }, { "epoch": 0.6844444444444444, "grad_norm": 0.0003239224315620959, "learning_rate": 4.5785677030675286e-05, "logits/chosen": -0.1076403260231018, "logits/rejected": -0.11203013360500336, "logps/chosen": -521.8116455078125, "logps/rejected": -759.8988037109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -43.16960906982422, "rewards/margins": 14.825557708740234, "rewards/rejected": -57.99517059326172, "step": 616 }, { "epoch": 0.6855555555555556, "grad_norm": 0.5235415697097778, "learning_rate": 4.5768648039827325e-05, "logits/chosen": 0.02654496394097805, "logits/rejected": 0.02180054783821106, "logps/chosen": -498.2125244140625, "logps/rejected": -576.060546875, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": -38.866641998291016, "rewards/margins": 8.390604019165039, "rewards/rejected": -47.25724792480469, "step": 617 }, { "epoch": 0.6866666666666666, "grad_norm": 0.009154853411018848, "learning_rate": 4.575158789359031e-05, "logits/chosen": -0.09973657131195068, "logits/rejected": -0.10985437035560608, "logps/chosen": -439.733154296875, "logps/rejected": -720.8668212890625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -33.934661865234375, "rewards/margins": 25.403392791748047, "rewards/rejected": -59.338050842285156, "step": 618 }, { "epoch": 0.6877777777777778, "grad_norm": 0.000816323678009212, "learning_rate": 4.573449661755646e-05, "logits/chosen": -0.2789964973926544, "logits/rejected": -0.2690514922142029, "logps/chosen": -306.9751892089844, "logps/rejected": -453.6601257324219, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -24.095531463623047, "rewards/margins": 12.636151313781738, "rewards/rejected": -36.73168182373047, "step": 619 }, { "epoch": 0.6888888888888889, "grad_norm": 0.9672163128852844, "learning_rate": 4.5717374237364665e-05, "logits/chosen": -0.37883439660072327, "logits/rejected": -0.37198424339294434, "logps/chosen": -216.18235778808594, "logps/rejected": -261.9222412109375, "loss": 0.0384, "rewards/accuracies": 1.0, "rewards/chosen": -16.110776901245117, "rewards/margins": 4.510706424713135, "rewards/rejected": -20.621482849121094, "step": 620 }, { "epoch": 0.69, "grad_norm": 0.001956075197085738, "learning_rate": 4.5700220778700504e-05, "logits/chosen": -0.11723906546831131, "logits/rejected": -0.10757620632648468, "logps/chosen": -396.8409423828125, "logps/rejected": -515.0135498046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -29.82741928100586, "rewards/margins": 10.877249717712402, "rewards/rejected": -40.70466613769531, "step": 621 }, { "epoch": 0.6911111111111111, "grad_norm": 1.3524582386016846, "learning_rate": 4.5683036267296156e-05, "logits/chosen": -0.442306786775589, "logits/rejected": -0.4632127285003662, "logps/chosen": -230.61688232421875, "logps/rejected": -348.1609802246094, "loss": 0.0379, "rewards/accuracies": 1.0, "rewards/chosen": -17.135271072387695, "rewards/margins": 7.895287036895752, "rewards/rejected": -25.03055763244629, "step": 622 }, { "epoch": 0.6922222222222222, "grad_norm": 33.60292053222656, "learning_rate": 4.5665820728930385e-05, "logits/chosen": -0.44740843772888184, "logits/rejected": -0.45476216077804565, "logps/chosen": -571.6705322265625, "logps/rejected": -580.08447265625, "loss": 3.4753, "rewards/accuracies": 0.5, "rewards/chosen": -49.24449157714844, "rewards/margins": -1.1472492218017578, "rewards/rejected": -48.09724426269531, "step": 623 }, { "epoch": 0.6933333333333334, "grad_norm": 79.90501403808594, "learning_rate": 4.564857418942851e-05, "logits/chosen": -0.31021687388420105, "logits/rejected": -0.3188110589981079, "logps/chosen": -722.1289672851562, "logps/rejected": -798.8150634765625, "loss": 1.4359, "rewards/accuracies": 0.5, "rewards/chosen": -59.47103500366211, "rewards/margins": 7.314844131469727, "rewards/rejected": -66.78588104248047, "step": 624 }, { "epoch": 0.6944444444444444, "grad_norm": 32.96128845214844, "learning_rate": 4.563129667466234e-05, "logits/chosen": -0.2848309874534607, "logits/rejected": -0.2876594066619873, "logps/chosen": -541.14599609375, "logps/rejected": -571.9803466796875, "loss": 0.252, "rewards/accuracies": 1.0, "rewards/chosen": -44.07958984375, "rewards/margins": 3.8779735565185547, "rewards/rejected": -47.95756530761719, "step": 625 }, { "epoch": 0.6955555555555556, "grad_norm": 1.1539881938915642e-07, "learning_rate": 4.561398821055016e-05, "logits/chosen": -0.24552659690380096, "logits/rejected": -0.26227355003356934, "logps/chosen": -363.51068115234375, "logps/rejected": -652.832763671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -29.688800811767578, "rewards/margins": 22.760009765625, "rewards/rejected": -52.44880676269531, "step": 626 }, { "epoch": 0.6966666666666667, "grad_norm": 0.05072171613574028, "learning_rate": 4.559664882305668e-05, "logits/chosen": -0.4942433834075928, "logits/rejected": -0.4953221082687378, "logps/chosen": -407.85113525390625, "logps/rejected": -607.794921875, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -33.86170959472656, "rewards/margins": 15.108240127563477, "rewards/rejected": -48.96995162963867, "step": 627 }, { "epoch": 0.6977777777777778, "grad_norm": 0.11442580819129944, "learning_rate": 4.557927853819299e-05, "logits/chosen": -0.27679523825645447, "logits/rejected": -0.27445363998413086, "logps/chosen": -441.625, "logps/rejected": -538.2437133789062, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -35.90381622314453, "rewards/margins": 8.123567581176758, "rewards/rejected": -44.027381896972656, "step": 628 }, { "epoch": 0.6988888888888889, "grad_norm": 16.652788162231445, "learning_rate": 4.556187738201656e-05, "logits/chosen": 0.03335461765527725, "logits/rejected": 0.056426286697387695, "logps/chosen": -362.43878173828125, "logps/rejected": -489.91400146484375, "loss": 0.2462, "rewards/accuracies": 1.0, "rewards/chosen": -27.42664337158203, "rewards/margins": 9.547270774841309, "rewards/rejected": -36.973915100097656, "step": 629 }, { "epoch": 0.7, "grad_norm": 3.673884153366089, "learning_rate": 4.554444538063113e-05, "logits/chosen": -0.2694697976112366, "logits/rejected": -0.2727832496166229, "logps/chosen": -327.12786865234375, "logps/rejected": -386.6516418457031, "loss": 0.258, "rewards/accuracies": 1.0, "rewards/chosen": -25.94481658935547, "rewards/margins": 4.857398986816406, "rewards/rejected": -30.802215576171875, "step": 630 }, { "epoch": 0.7011111111111111, "grad_norm": 13.918408393859863, "learning_rate": 4.552698256018674e-05, "logits/chosen": -0.36660903692245483, "logits/rejected": -0.37317949533462524, "logps/chosen": -287.7021789550781, "logps/rejected": -303.10467529296875, "loss": 0.4761, "rewards/accuracies": 0.5, "rewards/chosen": -23.830738067626953, "rewards/margins": 1.6324858665466309, "rewards/rejected": -25.46322250366211, "step": 631 }, { "epoch": 0.7022222222222222, "grad_norm": 85.2928237915039, "learning_rate": 4.5509488946879655e-05, "logits/chosen": 0.26082539558410645, "logits/rejected": 0.27223870158195496, "logps/chosen": -553.3845825195312, "logps/rejected": -513.4561767578125, "loss": 3.5291, "rewards/accuracies": 0.0, "rewards/chosen": -44.80022048950195, "rewards/margins": -3.430604934692383, "rewards/rejected": -41.36961364746094, "step": 632 }, { "epoch": 0.7033333333333334, "grad_norm": 0.004172021988779306, "learning_rate": 4.549196456695232e-05, "logits/chosen": -0.12634076178073883, "logits/rejected": -0.12761694192886353, "logps/chosen": -265.566162109375, "logps/rejected": -403.1243896484375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -20.453250885009766, "rewards/margins": 11.240657806396484, "rewards/rejected": -31.69390869140625, "step": 633 }, { "epoch": 0.7044444444444444, "grad_norm": 1.0941144227981567, "learning_rate": 4.547440944669335e-05, "logits/chosen": -0.316785991191864, "logits/rejected": -0.32665058970451355, "logps/chosen": -314.42095947265625, "logps/rejected": -391.655517578125, "loss": 0.0148, "rewards/accuracies": 1.0, "rewards/chosen": -25.516441345214844, "rewards/margins": 6.5554962158203125, "rewards/rejected": -32.071937561035156, "step": 634 }, { "epoch": 0.7055555555555556, "grad_norm": 18.04153060913086, "learning_rate": 4.545682361243748e-05, "logits/chosen": -0.26329848170280457, "logits/rejected": -0.2564953863620758, "logps/chosen": -253.0839385986328, "logps/rejected": -373.84490966796875, "loss": 0.8985, "rewards/accuracies": 0.5, "rewards/chosen": -20.358863830566406, "rewards/margins": 8.314234733581543, "rewards/rejected": -28.673099517822266, "step": 635 }, { "epoch": 0.7066666666666667, "grad_norm": 94.07976531982422, "learning_rate": 4.54392070905655e-05, "logits/chosen": 0.13757818937301636, "logits/rejected": 0.12103809416294098, "logps/chosen": -586.9873657226562, "logps/rejected": -704.6460571289062, "loss": 0.7755, "rewards/accuracies": 0.5, "rewards/chosen": -41.87786102294922, "rewards/margins": 12.652952194213867, "rewards/rejected": -54.53081130981445, "step": 636 }, { "epoch": 0.7077777777777777, "grad_norm": 1.2703794240951538, "learning_rate": 4.542155990750423e-05, "logits/chosen": 0.09223221242427826, "logits/rejected": 0.0522521510720253, "logps/chosen": -688.8833618164062, "logps/rejected": -787.979736328125, "loss": 0.0093, "rewards/accuracies": 1.0, "rewards/chosen": -56.62965774536133, "rewards/margins": 5.94416618347168, "rewards/rejected": -62.573822021484375, "step": 637 }, { "epoch": 0.7088888888888889, "grad_norm": 45.05097579956055, "learning_rate": 4.540388208972651e-05, "logits/chosen": -0.17163056135177612, "logits/rejected": -0.22061288356781006, "logps/chosen": -316.97802734375, "logps/rejected": -402.59893798828125, "loss": 2.9046, "rewards/accuracies": 0.5, "rewards/chosen": -25.021007537841797, "rewards/margins": 7.350645065307617, "rewards/rejected": -32.37165451049805, "step": 638 }, { "epoch": 0.71, "grad_norm": 23.523914337158203, "learning_rate": 4.538617366375112e-05, "logits/chosen": -0.16113309562206268, "logits/rejected": -0.16862215101718903, "logps/chosen": -428.74761962890625, "logps/rejected": -480.8131103515625, "loss": 0.4315, "rewards/accuracies": 0.5, "rewards/chosen": -33.86891174316406, "rewards/margins": 2.876725196838379, "rewards/rejected": -36.745635986328125, "step": 639 }, { "epoch": 0.7111111111111111, "grad_norm": 5.730121612548828, "learning_rate": 4.536843465614277e-05, "logits/chosen": -0.12194007635116577, "logits/rejected": -0.13899469375610352, "logps/chosen": -309.204833984375, "logps/rejected": -562.15869140625, "loss": 0.1104, "rewards/accuracies": 1.0, "rewards/chosen": -24.753726959228516, "rewards/margins": 20.61840057373047, "rewards/rejected": -45.372127532958984, "step": 640 }, { "epoch": 0.7122222222222222, "grad_norm": 5.299200534820557, "learning_rate": 4.535066509351202e-05, "logits/chosen": -0.10505476593971252, "logits/rejected": -0.09652282297611237, "logps/chosen": -311.54034423828125, "logps/rejected": -402.02935791015625, "loss": 0.0529, "rewards/accuracies": 1.0, "rewards/chosen": -24.175003051757812, "rewards/margins": 5.033343315124512, "rewards/rejected": -29.20834732055664, "step": 641 }, { "epoch": 0.7133333333333334, "grad_norm": 0.056116823107004166, "learning_rate": 4.533286500251529e-05, "logits/chosen": -0.2257516086101532, "logits/rejected": -0.23207268118858337, "logps/chosen": -288.6353759765625, "logps/rejected": -406.69873046875, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -22.24431800842285, "rewards/margins": 9.986021041870117, "rewards/rejected": -32.23033905029297, "step": 642 }, { "epoch": 0.7144444444444444, "grad_norm": 1.2083648443222046, "learning_rate": 4.5315034409854796e-05, "logits/chosen": 0.010412279516458511, "logits/rejected": -0.002713572233915329, "logps/chosen": -228.22695922851562, "logps/rejected": -324.47735595703125, "loss": 0.0348, "rewards/accuracies": 1.0, "rewards/chosen": -15.368870735168457, "rewards/margins": 8.3316068649292, "rewards/rejected": -23.700477600097656, "step": 643 }, { "epoch": 0.7155555555555555, "grad_norm": 0.005345476791262627, "learning_rate": 4.5297173342278496e-05, "logits/chosen": -0.189570352435112, "logits/rejected": -0.1780727058649063, "logps/chosen": -182.201416015625, "logps/rejected": -375.41778564453125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -13.025887489318848, "rewards/margins": 14.782833099365234, "rewards/rejected": -27.808719635009766, "step": 644 }, { "epoch": 0.7166666666666667, "grad_norm": 1.7503979206085205, "learning_rate": 4.5279281826580056e-05, "logits/chosen": 0.1892162561416626, "logits/rejected": 0.19643297791481018, "logps/chosen": -418.8938293457031, "logps/rejected": -493.60443115234375, "loss": 0.0282, "rewards/accuracies": 1.0, "rewards/chosen": -29.515151977539062, "rewards/margins": 7.675441741943359, "rewards/rejected": -37.19059371948242, "step": 645 }, { "epoch": 0.7177777777777777, "grad_norm": 0.5674684047698975, "learning_rate": 4.5261359889598855e-05, "logits/chosen": -0.19160060584545135, "logits/rejected": -0.1849679797887802, "logps/chosen": -465.4162292480469, "logps/rejected": -559.5374755859375, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": -37.53208923339844, "rewards/margins": 5.763501167297363, "rewards/rejected": -43.295589447021484, "step": 646 }, { "epoch": 0.7188888888888889, "grad_norm": 43.35117721557617, "learning_rate": 4.5243407558219866e-05, "logits/chosen": 0.015500187873840332, "logits/rejected": 0.014411285519599915, "logps/chosen": -313.14404296875, "logps/rejected": -356.4093017578125, "loss": 2.3954, "rewards/accuracies": 0.5, "rewards/chosen": -23.163501739501953, "rewards/margins": 2.7205429077148438, "rewards/rejected": -25.884044647216797, "step": 647 }, { "epoch": 0.72, "grad_norm": 32.591331481933594, "learning_rate": 4.522542485937369e-05, "logits/chosen": 0.23743167519569397, "logits/rejected": 0.2470468282699585, "logps/chosen": -347.5137939453125, "logps/rejected": -486.3334655761719, "loss": 2.474, "rewards/accuracies": 0.5, "rewards/chosen": -23.762720108032227, "rewards/margins": 10.132436752319336, "rewards/rejected": -33.89515686035156, "step": 648 }, { "epoch": 0.7211111111111111, "grad_norm": 0.1525786966085434, "learning_rate": 4.520741182003645e-05, "logits/chosen": -0.1472112089395523, "logits/rejected": -0.1480754017829895, "logps/chosen": -285.0813293457031, "logps/rejected": -423.085205078125, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -19.7720947265625, "rewards/margins": 12.595715522766113, "rewards/rejected": -32.3678092956543, "step": 649 }, { "epoch": 0.7222222222222222, "grad_norm": 0.014218617230653763, "learning_rate": 4.518936846722982e-05, "logits/chosen": 0.3776334226131439, "logits/rejected": 0.3688035011291504, "logps/chosen": -232.6135711669922, "logps/rejected": -409.3578186035156, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -16.180675506591797, "rewards/margins": 14.51278305053711, "rewards/rejected": -30.693458557128906, "step": 650 }, { "epoch": 0.7233333333333334, "grad_norm": 14.696516990661621, "learning_rate": 4.517129482802092e-05, "logits/chosen": 0.354424387216568, "logits/rejected": 0.369911789894104, "logps/chosen": -342.7969055175781, "logps/rejected": -392.181396484375, "loss": 0.2427, "rewards/accuracies": 1.0, "rewards/chosen": -23.73914337158203, "rewards/margins": 3.3384170532226562, "rewards/rejected": -27.077560424804688, "step": 651 }, { "epoch": 0.7244444444444444, "grad_norm": 0.9472267627716064, "learning_rate": 4.5153190929522314e-05, "logits/chosen": 0.2927304804325104, "logits/rejected": 0.30502402782440186, "logps/chosen": -604.1934814453125, "logps/rejected": -774.1602783203125, "loss": 0.014, "rewards/accuracies": 1.0, "rewards/chosen": -47.40554428100586, "rewards/margins": 14.037675857543945, "rewards/rejected": -61.44322204589844, "step": 652 }, { "epoch": 0.7255555555555555, "grad_norm": 1.3460259437561035, "learning_rate": 4.513505679889195e-05, "logits/chosen": -0.17204774916172028, "logits/rejected": -0.16462022066116333, "logps/chosen": -152.34461975097656, "logps/rejected": -256.0693359375, "loss": 0.0285, "rewards/accuracies": 1.0, "rewards/chosen": -10.990575790405273, "rewards/margins": 8.448793411254883, "rewards/rejected": -19.439369201660156, "step": 653 }, { "epoch": 0.7266666666666667, "grad_norm": 1.7964026927947998, "learning_rate": 4.511689246333314e-05, "logits/chosen": -0.03483642637729645, "logits/rejected": -0.027443237602710724, "logps/chosen": -363.204345703125, "logps/rejected": -422.625, "loss": 0.0521, "rewards/accuracies": 1.0, "rewards/chosen": -28.254764556884766, "rewards/margins": 4.244527339935303, "rewards/rejected": -32.499290466308594, "step": 654 }, { "epoch": 0.7277777777777777, "grad_norm": 2.97391939163208, "learning_rate": 4.50986979500945e-05, "logits/chosen": 0.21128778159618378, "logits/rejected": 0.20951484143733978, "logps/chosen": -151.88182067871094, "logps/rejected": -242.29876708984375, "loss": 0.107, "rewards/accuracies": 1.0, "rewards/chosen": -9.441771507263184, "rewards/margins": 8.571592330932617, "rewards/rejected": -18.013362884521484, "step": 655 }, { "epoch": 0.7288888888888889, "grad_norm": 7.943380832672119, "learning_rate": 4.508047328646993e-05, "logits/chosen": 0.3210466504096985, "logits/rejected": 0.30739226937294006, "logps/chosen": -426.9659729003906, "logps/rejected": -516.9207763671875, "loss": 0.1309, "rewards/accuracies": 1.0, "rewards/chosen": -30.239734649658203, "rewards/margins": 6.985235214233398, "rewards/rejected": -37.22496795654297, "step": 656 }, { "epoch": 0.73, "grad_norm": 0.22540844976902008, "learning_rate": 4.5062218499798526e-05, "logits/chosen": 0.17430943250656128, "logits/rejected": 0.17308290302753448, "logps/chosen": -338.3282470703125, "logps/rejected": -395.68865966796875, "loss": 0.0051, "rewards/accuracies": 1.0, "rewards/chosen": -21.887693405151367, "rewards/margins": 7.565052509307861, "rewards/rejected": -29.45274543762207, "step": 657 }, { "epoch": 0.7311111111111112, "grad_norm": 0.16646026074886322, "learning_rate": 4.5043933617464604e-05, "logits/chosen": 0.13476715981960297, "logits/rejected": 0.139692023396492, "logps/chosen": -134.72003173828125, "logps/rejected": -217.94570922851562, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": -8.120211601257324, "rewards/margins": 6.832303047180176, "rewards/rejected": -14.9525146484375, "step": 658 }, { "epoch": 0.7322222222222222, "grad_norm": 5.783388137817383, "learning_rate": 4.502561866689761e-05, "logits/chosen": 0.11581218987703323, "logits/rejected": 0.12642577290534973, "logps/chosen": -208.80218505859375, "logps/rejected": -278.91656494140625, "loss": 0.2325, "rewards/accuracies": 1.0, "rewards/chosen": -12.596918106079102, "rewards/margins": 5.870020866394043, "rewards/rejected": -18.466938018798828, "step": 659 }, { "epoch": 0.7333333333333333, "grad_norm": 0.00010928670963039622, "learning_rate": 4.5007273675572104e-05, "logits/chosen": 0.27141785621643066, "logits/rejected": 0.24691647291183472, "logps/chosen": -259.7066955566406, "logps/rejected": -476.30816650390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -15.095449447631836, "rewards/margins": 19.336456298828125, "rewards/rejected": -34.431907653808594, "step": 660 }, { "epoch": 0.7344444444444445, "grad_norm": 52.162017822265625, "learning_rate": 4.49888986710077e-05, "logits/chosen": 0.17971405386924744, "logits/rejected": 0.17001201212406158, "logps/chosen": -381.85491943359375, "logps/rejected": -380.1461486816406, "loss": 3.7268, "rewards/accuracies": 0.5, "rewards/chosen": -23.95669174194336, "rewards/margins": 2.7216272354125977, "rewards/rejected": -26.67831802368164, "step": 661 }, { "epoch": 0.7355555555555555, "grad_norm": 2.1141915321350098, "learning_rate": 4.497049368076907e-05, "logits/chosen": 0.42818814516067505, "logits/rejected": 0.43167972564697266, "logps/chosen": -348.704833984375, "logps/rejected": -386.48760986328125, "loss": 0.0321, "rewards/accuracies": 1.0, "rewards/chosen": -23.1026554107666, "rewards/margins": 3.5552544593811035, "rewards/rejected": -26.657909393310547, "step": 662 }, { "epoch": 0.7366666666666667, "grad_norm": 3.8118557929992676, "learning_rate": 4.495205873246581e-05, "logits/chosen": 0.19201040267944336, "logits/rejected": 0.233985036611557, "logps/chosen": -292.8336181640625, "logps/rejected": -407.4678955078125, "loss": 0.0448, "rewards/accuracies": 1.0, "rewards/chosen": -18.6350040435791, "rewards/margins": 6.893161296844482, "rewards/rejected": -25.52816390991211, "step": 663 }, { "epoch": 0.7377777777777778, "grad_norm": 5.23031759262085, "learning_rate": 4.49335938537525e-05, "logits/chosen": 0.34710901975631714, "logits/rejected": 0.35444220900535583, "logps/chosen": -312.9074401855469, "logps/rejected": -413.8456115722656, "loss": 0.0767, "rewards/accuracies": 1.0, "rewards/chosen": -18.975425720214844, "rewards/margins": 6.948920249938965, "rewards/rejected": -25.924346923828125, "step": 664 }, { "epoch": 0.7388888888888889, "grad_norm": 26.584638595581055, "learning_rate": 4.4915099072328615e-05, "logits/chosen": 0.4705788493156433, "logits/rejected": 0.48859715461730957, "logps/chosen": -287.65740966796875, "logps/rejected": -357.0147705078125, "loss": 0.4681, "rewards/accuracies": 0.5, "rewards/chosen": -17.850120544433594, "rewards/margins": 4.544195175170898, "rewards/rejected": -22.39431381225586, "step": 665 }, { "epoch": 0.74, "grad_norm": 0.0023918363731354475, "learning_rate": 4.4896574415938465e-05, "logits/chosen": 0.10988462716341019, "logits/rejected": 0.10208061337471008, "logps/chosen": -239.04989624023438, "logps/rejected": -392.0845642089844, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -13.701056480407715, "rewards/margins": 11.206294059753418, "rewards/rejected": -24.907350540161133, "step": 666 }, { "epoch": 0.7411111111111112, "grad_norm": 0.8002569675445557, "learning_rate": 4.48780199123712e-05, "logits/chosen": 0.2836395800113678, "logits/rejected": 0.3395109176635742, "logps/chosen": -323.0415954589844, "logps/rejected": -504.4815368652344, "loss": 0.0133, "rewards/accuracies": 1.0, "rewards/chosen": -21.849750518798828, "rewards/margins": 12.433506965637207, "rewards/rejected": -34.28325653076172, "step": 667 }, { "epoch": 0.7422222222222222, "grad_norm": 1.0205997228622437, "learning_rate": 4.485943558946073e-05, "logits/chosen": 0.07917995750904083, "logits/rejected": 0.1021929606795311, "logps/chosen": -221.314697265625, "logps/rejected": -378.5191650390625, "loss": 0.1223, "rewards/accuracies": 1.0, "rewards/chosen": -15.262256622314453, "rewards/margins": 10.59432315826416, "rewards/rejected": -25.85658073425293, "step": 668 }, { "epoch": 0.7433333333333333, "grad_norm": 4.702728271484375, "learning_rate": 4.48408214750857e-05, "logits/chosen": -0.2030932605266571, "logits/rejected": -0.19126659631729126, "logps/chosen": -136.25543212890625, "logps/rejected": -325.9189758300781, "loss": 0.6108, "rewards/accuracies": 0.5, "rewards/chosen": -8.249531745910645, "rewards/margins": 16.49059295654297, "rewards/rejected": -24.740123748779297, "step": 669 }, { "epoch": 0.7444444444444445, "grad_norm": 0.00017478459631092846, "learning_rate": 4.482217759716946e-05, "logits/chosen": -0.13309648633003235, "logits/rejected": -0.13397005200386047, "logps/chosen": -214.07144165039062, "logps/rejected": -449.8781433105469, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.262972831726074, "rewards/margins": 19.518291473388672, "rewards/rejected": -32.78126525878906, "step": 670 }, { "epoch": 0.7455555555555555, "grad_norm": 6.603038968933106e-07, "learning_rate": 4.4803503983679975e-05, "logits/chosen": 0.11901476234197617, "logits/rejected": 0.13822653889656067, "logps/chosen": -422.55609130859375, "logps/rejected": -660.3213500976562, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -25.559188842773438, "rewards/margins": 19.864194869995117, "rewards/rejected": -45.42338562011719, "step": 671 }, { "epoch": 0.7466666666666667, "grad_norm": 25.099088668823242, "learning_rate": 4.478480066262987e-05, "logits/chosen": -0.09787356108427048, "logits/rejected": -0.09543611854314804, "logps/chosen": -176.219970703125, "logps/rejected": -190.82730102539062, "loss": 0.6781, "rewards/accuracies": 0.5, "rewards/chosen": -9.450093269348145, "rewards/margins": 0.3420405387878418, "rewards/rejected": -9.792133331298828, "step": 672 }, { "epoch": 0.7477777777777778, "grad_norm": 0.02712138742208481, "learning_rate": 4.476606766207627e-05, "logits/chosen": 0.13446281850337982, "logits/rejected": 0.14930453896522522, "logps/chosen": -438.62847900390625, "logps/rejected": -639.2050170898438, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -30.184452056884766, "rewards/margins": 13.18688678741455, "rewards/rejected": -43.371337890625, "step": 673 }, { "epoch": 0.7488888888888889, "grad_norm": 0.48312267661094666, "learning_rate": 4.4747305010120876e-05, "logits/chosen": 0.0858042761683464, "logits/rejected": 0.08677864819765091, "logps/chosen": -391.6105651855469, "logps/rejected": -486.990234375, "loss": 0.0049, "rewards/accuracies": 1.0, "rewards/chosen": -27.24571990966797, "rewards/margins": 9.216896057128906, "rewards/rejected": -36.462615966796875, "step": 674 }, { "epoch": 0.75, "grad_norm": 0.10115915536880493, "learning_rate": 4.4728512734909844e-05, "logits/chosen": -0.33372974395751953, "logits/rejected": -0.30075952410697937, "logps/chosen": -250.42625427246094, "logps/rejected": -361.36358642578125, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -15.420072555541992, "rewards/margins": 11.791290283203125, "rewards/rejected": -27.21136474609375, "step": 675 }, { "epoch": 0.7511111111111111, "grad_norm": 2.390852451324463, "learning_rate": 4.4709690864633766e-05, "logits/chosen": -0.3874291479587555, "logits/rejected": -0.3842531442642212, "logps/chosen": -140.43096923828125, "logps/rejected": -221.7076416015625, "loss": 0.0421, "rewards/accuracies": 1.0, "rewards/chosen": -8.431407928466797, "rewards/margins": 6.79141092300415, "rewards/rejected": -15.222818374633789, "step": 676 }, { "epoch": 0.7522222222222222, "grad_norm": 3.7474379539489746, "learning_rate": 4.4690839427527665e-05, "logits/chosen": -0.15196296572685242, "logits/rejected": -0.13600696623325348, "logps/chosen": -220.39056396484375, "logps/rejected": -360.0867614746094, "loss": 0.1717, "rewards/accuracies": 1.0, "rewards/chosen": -15.626178741455078, "rewards/margins": 8.594626426696777, "rewards/rejected": -24.22080421447754, "step": 677 }, { "epoch": 0.7533333333333333, "grad_norm": 0.012061014771461487, "learning_rate": 4.4671958451870854e-05, "logits/chosen": -0.5710174441337585, "logits/rejected": -0.5529033541679382, "logps/chosen": -138.20797729492188, "logps/rejected": -250.92843627929688, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -6.681585788726807, "rewards/margins": 9.160207748413086, "rewards/rejected": -15.841793060302734, "step": 678 }, { "epoch": 0.7544444444444445, "grad_norm": 5.256855010986328, "learning_rate": 4.465304796598702e-05, "logits/chosen": 0.01308523491024971, "logits/rejected": -0.028451258316636086, "logps/chosen": -216.45289611816406, "logps/rejected": -250.7764129638672, "loss": 0.0741, "rewards/accuracies": 1.0, "rewards/chosen": -12.44328498840332, "rewards/margins": 3.4268670082092285, "rewards/rejected": -15.87015151977539, "step": 679 }, { "epoch": 0.7555555555555555, "grad_norm": 0.4943789541721344, "learning_rate": 4.463410799824408e-05, "logits/chosen": -0.6814990639686584, "logits/rejected": -0.7268380522727966, "logps/chosen": -136.94602966308594, "logps/rejected": -261.5169677734375, "loss": 0.0068, "rewards/accuracies": 1.0, "rewards/chosen": -7.93494987487793, "rewards/margins": 9.106650352478027, "rewards/rejected": -17.04159927368164, "step": 680 }, { "epoch": 0.7566666666666667, "grad_norm": 0.19651548564434052, "learning_rate": 4.4615138577054195e-05, "logits/chosen": -0.2143779993057251, "logits/rejected": -0.21859502792358398, "logps/chosen": -237.63137817382812, "logps/rejected": -305.0786437988281, "loss": 0.0039, "rewards/accuracies": 1.0, "rewards/chosen": -16.536670684814453, "rewards/margins": 5.809107780456543, "rewards/rejected": -22.34577751159668, "step": 681 }, { "epoch": 0.7577777777777778, "grad_norm": 0.00035372708225622773, "learning_rate": 4.4596139730873707e-05, "logits/chosen": -0.39630839228630066, "logits/rejected": -0.36958175897598267, "logps/chosen": -290.99334716796875, "logps/rejected": -599.5736694335938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -21.41954803466797, "rewards/margins": 23.13480567932129, "rewards/rejected": -44.554351806640625, "step": 682 }, { "epoch": 0.7588888888888888, "grad_norm": 0.0007873640861362219, "learning_rate": 4.457711148820308e-05, "logits/chosen": -0.22852733731269836, "logits/rejected": -0.22462210059165955, "logps/chosen": -479.0210876464844, "logps/rejected": -614.3367919921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -34.537635803222656, "rewards/margins": 11.427717208862305, "rewards/rejected": -45.965354919433594, "step": 683 }, { "epoch": 0.76, "grad_norm": 0.6876626014709473, "learning_rate": 4.455805387758691e-05, "logits/chosen": -0.2856229245662689, "logits/rejected": -0.2943215072154999, "logps/chosen": -300.6446228027344, "logps/rejected": -625.4793701171875, "loss": 0.0209, "rewards/accuracies": 1.0, "rewards/chosen": -20.3853702545166, "rewards/margins": 20.21742057800293, "rewards/rejected": -40.60279083251953, "step": 684 }, { "epoch": 0.7611111111111111, "grad_norm": 0.04229077324271202, "learning_rate": 4.4538966927613836e-05, "logits/chosen": -0.1335938423871994, "logits/rejected": -0.13258740305900574, "logps/chosen": -202.18487548828125, "logps/rejected": -353.71136474609375, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -13.841950416564941, "rewards/margins": 10.791176795959473, "rewards/rejected": -24.633127212524414, "step": 685 }, { "epoch": 0.7622222222222222, "grad_norm": 1.1247036724171267e-07, "learning_rate": 4.4519850666916484e-05, "logits/chosen": -0.1011250764131546, "logits/rejected": -0.0732671320438385, "logps/chosen": -262.67095947265625, "logps/rejected": -549.6732177734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -16.295061111450195, "rewards/margins": 21.43636131286621, "rewards/rejected": -37.731422424316406, "step": 686 }, { "epoch": 0.7633333333333333, "grad_norm": 0.02209744043648243, "learning_rate": 4.4500705124171485e-05, "logits/chosen": -0.712788462638855, "logits/rejected": -0.7633808851242065, "logps/chosen": -149.4326629638672, "logps/rejected": -301.6557312011719, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -9.554929733276367, "rewards/margins": 11.570379257202148, "rewards/rejected": -21.125308990478516, "step": 687 }, { "epoch": 0.7644444444444445, "grad_norm": 0.04184373468160629, "learning_rate": 4.4481530328099364e-05, "logits/chosen": -0.33263298869132996, "logits/rejected": -0.32466819882392883, "logps/chosen": -271.4150085449219, "logps/rejected": -508.1540832519531, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -20.329055786132812, "rewards/margins": 15.828550338745117, "rewards/rejected": -36.15760803222656, "step": 688 }, { "epoch": 0.7655555555555555, "grad_norm": 0.0012411974603310227, "learning_rate": 4.446232630746457e-05, "logits/chosen": -0.10283008217811584, "logits/rejected": -0.13690994679927826, "logps/chosen": -581.5504760742188, "logps/rejected": -762.3313598632812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -45.22569274902344, "rewards/margins": 14.146265029907227, "rewards/rejected": -59.3719596862793, "step": 689 }, { "epoch": 0.7666666666666667, "grad_norm": 0.016777126118540764, "learning_rate": 4.444309309107535e-05, "logits/chosen": -0.4222570061683655, "logits/rejected": -0.3936172127723694, "logps/chosen": -373.6925048828125, "logps/rejected": -616.6082153320312, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -29.34855079650879, "rewards/margins": 20.83599281311035, "rewards/rejected": -50.18454360961914, "step": 690 }, { "epoch": 0.7677777777777778, "grad_norm": 0.0008763744845055044, "learning_rate": 4.4423830707783775e-05, "logits/chosen": -0.40839508175849915, "logits/rejected": -0.4100821018218994, "logps/chosen": -322.4753112792969, "logps/rejected": -444.4151611328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -24.018457412719727, "rewards/margins": 11.264230728149414, "rewards/rejected": -35.28268814086914, "step": 691 }, { "epoch": 0.7688888888888888, "grad_norm": 64.140869140625, "learning_rate": 4.4404539186485665e-05, "logits/chosen": -0.1794617474079132, "logits/rejected": -0.3436836004257202, "logps/chosen": -313.34649658203125, "logps/rejected": -310.98529052734375, "loss": 3.9745, "rewards/accuracies": 0.5, "rewards/chosen": -24.060184478759766, "rewards/margins": -1.3146677017211914, "rewards/rejected": -22.745515823364258, "step": 692 }, { "epoch": 0.77, "grad_norm": 0.00041376453009434044, "learning_rate": 4.438521855612054e-05, "logits/chosen": -0.4289166033267975, "logits/rejected": -0.4217976927757263, "logps/chosen": -269.845703125, "logps/rejected": -421.88421630859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -19.37019920349121, "rewards/margins": 13.080543518066406, "rewards/rejected": -32.45074462890625, "step": 693 }, { "epoch": 0.7711111111111111, "grad_norm": 2.8355158065096475e-05, "learning_rate": 4.4365868845671597e-05, "logits/chosen": -0.29367706179618835, "logits/rejected": -0.29411381483078003, "logps/chosen": -243.3744354248047, "logps/rejected": -444.03955078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -17.158462524414062, "rewards/margins": 17.361080169677734, "rewards/rejected": -34.5195426940918, "step": 694 }, { "epoch": 0.7722222222222223, "grad_norm": 0.05820606276392937, "learning_rate": 4.434649008416565e-05, "logits/chosen": -0.3581782281398773, "logits/rejected": -0.3668329417705536, "logps/chosen": -417.7569885253906, "logps/rejected": -513.7354736328125, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -31.009689331054688, "rewards/margins": 6.966358184814453, "rewards/rejected": -37.97604751586914, "step": 695 }, { "epoch": 0.7733333333333333, "grad_norm": 0.32133159041404724, "learning_rate": 4.43270823006731e-05, "logits/chosen": -0.13984711468219757, "logits/rejected": -0.1460215449333191, "logps/chosen": -485.99749755859375, "logps/rejected": -572.042236328125, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -39.077640533447266, "rewards/margins": 6.451805114746094, "rewards/rejected": -45.52944564819336, "step": 696 }, { "epoch": 0.7744444444444445, "grad_norm": 23.538944244384766, "learning_rate": 4.4307645524307884e-05, "logits/chosen": -0.5120337009429932, "logits/rejected": -0.536341667175293, "logps/chosen": -341.44818115234375, "logps/rejected": -468.9588928222656, "loss": 0.2507, "rewards/accuracies": 1.0, "rewards/chosen": -24.505596160888672, "rewards/margins": 9.245630264282227, "rewards/rejected": -33.751224517822266, "step": 697 }, { "epoch": 0.7755555555555556, "grad_norm": 44.0572509765625, "learning_rate": 4.428817978422742e-05, "logits/chosen": -0.36080071330070496, "logits/rejected": -0.3461170196533203, "logps/chosen": -340.1202392578125, "logps/rejected": -423.6039123535156, "loss": 4.9208, "rewards/accuracies": 0.5, "rewards/chosen": -27.414630889892578, "rewards/margins": 7.604944229125977, "rewards/rejected": -35.01957702636719, "step": 698 }, { "epoch": 0.7766666666666666, "grad_norm": 0.9909535646438599, "learning_rate": 4.4268685109632603e-05, "logits/chosen": -0.4445905089378357, "logits/rejected": -0.4557275176048279, "logps/chosen": -192.6361846923828, "logps/rejected": -347.827880859375, "loss": 0.02, "rewards/accuracies": 1.0, "rewards/chosen": -12.918421745300293, "rewards/margins": 12.689937591552734, "rewards/rejected": -25.608360290527344, "step": 699 }, { "epoch": 0.7777777777777778, "grad_norm": 1.1585808992385864, "learning_rate": 4.424916152976768e-05, "logits/chosen": -0.18115508556365967, "logits/rejected": -0.16596165299415588, "logps/chosen": -427.58294677734375, "logps/rejected": -539.2384033203125, "loss": 0.0113, "rewards/accuracies": 1.0, "rewards/chosen": -33.46098327636719, "rewards/margins": 9.101473808288574, "rewards/rejected": -42.56245803833008, "step": 700 }, { "epoch": 0.7788888888888889, "grad_norm": 0.01474736537784338, "learning_rate": 4.422960907392032e-05, "logits/chosen": -0.12718059122562408, "logits/rejected": -0.15468092262744904, "logps/chosen": -523.6901245117188, "logps/rejected": -765.9114990234375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -37.763633728027344, "rewards/margins": 20.309457778930664, "rewards/rejected": -58.07309341430664, "step": 701 }, { "epoch": 0.78, "grad_norm": 6.080226898193359, "learning_rate": 4.421002777142148e-05, "logits/chosen": -0.28518182039260864, "logits/rejected": -0.29020386934280396, "logps/chosen": -477.66741943359375, "logps/rejected": -659.3648071289062, "loss": 0.2224, "rewards/accuracies": 1.0, "rewards/chosen": -37.91859817504883, "rewards/margins": 14.914324760437012, "rewards/rejected": -52.832923889160156, "step": 702 }, { "epoch": 0.7811111111111111, "grad_norm": 0.2219855785369873, "learning_rate": 4.419041765164538e-05, "logits/chosen": -0.24169491231441498, "logits/rejected": -0.2420274168252945, "logps/chosen": -457.130126953125, "logps/rejected": -722.7053833007812, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -36.045921325683594, "rewards/margins": 22.525800704956055, "rewards/rejected": -58.57172393798828, "step": 703 }, { "epoch": 0.7822222222222223, "grad_norm": 0.014310394413769245, "learning_rate": 4.417077874400949e-05, "logits/chosen": -0.6199904680252075, "logits/rejected": -0.6683277487754822, "logps/chosen": -396.4384460449219, "logps/rejected": -583.3681640625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -29.943647384643555, "rewards/margins": 12.177701950073242, "rewards/rejected": -42.1213493347168, "step": 704 }, { "epoch": 0.7833333333333333, "grad_norm": 2.2092647552490234, "learning_rate": 4.415111107797445e-05, "logits/chosen": -0.5919160842895508, "logits/rejected": -0.5914275646209717, "logps/chosen": -221.9962921142578, "logps/rejected": -282.9263000488281, "loss": 0.0364, "rewards/accuracies": 1.0, "rewards/chosen": -17.337730407714844, "rewards/margins": 4.2716064453125, "rewards/rejected": -21.609336853027344, "step": 705 }, { "epoch": 0.7844444444444445, "grad_norm": 89.659423828125, "learning_rate": 4.4131414683044056e-05, "logits/chosen": -0.4286334812641144, "logits/rejected": -0.4337315857410431, "logps/chosen": -489.156494140625, "logps/rejected": -307.9498291015625, "loss": 12.0943, "rewards/accuracies": 0.0, "rewards/chosen": -35.94513702392578, "rewards/margins": -11.765707015991211, "rewards/rejected": -24.179428100585938, "step": 706 }, { "epoch": 0.7855555555555556, "grad_norm": 2.1630126866512e-05, "learning_rate": 4.41116895887652e-05, "logits/chosen": -0.3947850465774536, "logits/rejected": -0.423414945602417, "logps/chosen": -332.92803955078125, "logps/rejected": -530.0518798828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -23.70984649658203, "rewards/margins": 16.548320770263672, "rewards/rejected": -40.2581672668457, "step": 707 }, { "epoch": 0.7866666666666666, "grad_norm": 0.04704172909259796, "learning_rate": 4.4091935824727805e-05, "logits/chosen": -0.3439096212387085, "logits/rejected": -0.36454105377197266, "logps/chosen": -307.4599609375, "logps/rejected": -410.67205810546875, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -21.57805633544922, "rewards/margins": 8.026285171508789, "rewards/rejected": -29.604339599609375, "step": 708 }, { "epoch": 0.7877777777777778, "grad_norm": 13.221299171447754, "learning_rate": 4.407215342056481e-05, "logits/chosen": -0.5818586945533752, "logits/rejected": -0.5703901052474976, "logps/chosen": -132.47569274902344, "logps/rejected": -159.52635192871094, "loss": 0.5288, "rewards/accuracies": 0.5, "rewards/chosen": -8.646215438842773, "rewards/margins": 2.8580679893493652, "rewards/rejected": -11.504283905029297, "step": 709 }, { "epoch": 0.7888888888888889, "grad_norm": 2.7411272525787354, "learning_rate": 4.405234240595214e-05, "logits/chosen": -0.39174318313598633, "logits/rejected": -0.38828736543655396, "logps/chosen": -641.76318359375, "logps/rejected": -842.9891967773438, "loss": 0.0464, "rewards/accuracies": 1.0, "rewards/chosen": -45.191688537597656, "rewards/margins": 15.19218921661377, "rewards/rejected": -60.38387680053711, "step": 710 }, { "epoch": 0.79, "grad_norm": 7.482235431671143, "learning_rate": 4.4032502810608614e-05, "logits/chosen": -0.4277830719947815, "logits/rejected": -0.44655489921569824, "logps/chosen": -445.2044372558594, "logps/rejected": -611.6765747070312, "loss": 0.1183, "rewards/accuracies": 1.0, "rewards/chosen": -33.28594207763672, "rewards/margins": 11.07568359375, "rewards/rejected": -44.36162567138672, "step": 711 }, { "epoch": 0.7911111111111111, "grad_norm": 0.0001712091179797426, "learning_rate": 4.4012634664295935e-05, "logits/chosen": -0.46596843004226685, "logits/rejected": -0.45118382573127747, "logps/chosen": -297.8394775390625, "logps/rejected": -466.59515380859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -22.233104705810547, "rewards/margins": 15.043068885803223, "rewards/rejected": -37.27617645263672, "step": 712 }, { "epoch": 0.7922222222222223, "grad_norm": 12.316311836242676, "learning_rate": 4.3992737996818646e-05, "logits/chosen": -0.5812647342681885, "logits/rejected": -0.5666804313659668, "logps/chosen": -266.74053955078125, "logps/rejected": -320.54498291015625, "loss": 0.1986, "rewards/accuracies": 1.0, "rewards/chosen": -18.676050186157227, "rewards/margins": 4.131303787231445, "rewards/rejected": -22.807353973388672, "step": 713 }, { "epoch": 0.7933333333333333, "grad_norm": 0.009345537051558495, "learning_rate": 4.397281283802404e-05, "logits/chosen": -0.2903996407985687, "logits/rejected": -0.2515751123428345, "logps/chosen": -267.4500427246094, "logps/rejected": -435.7385559082031, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -19.800466537475586, "rewards/margins": 11.390535354614258, "rewards/rejected": -31.191001892089844, "step": 714 }, { "epoch": 0.7944444444444444, "grad_norm": 0.0009562201448716223, "learning_rate": 4.395285921780219e-05, "logits/chosen": -0.17411990463733673, "logits/rejected": -0.15105605125427246, "logps/chosen": -339.8360290527344, "logps/rejected": -489.2575378417969, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -22.66063690185547, "rewards/margins": 12.456337928771973, "rewards/rejected": -35.116973876953125, "step": 715 }, { "epoch": 0.7955555555555556, "grad_norm": 3.5760130882263184, "learning_rate": 4.393287716608585e-05, "logits/chosen": -0.1755208969116211, "logits/rejected": -0.14557358622550964, "logps/chosen": -330.48968505859375, "logps/rejected": -455.950927734375, "loss": 0.0235, "rewards/accuracies": 1.0, "rewards/chosen": -23.359310150146484, "rewards/margins": 11.607084274291992, "rewards/rejected": -34.96639633178711, "step": 716 }, { "epoch": 0.7966666666666666, "grad_norm": 0.27521756291389465, "learning_rate": 4.391286671285042e-05, "logits/chosen": -0.35858362913131714, "logits/rejected": -0.38848739862442017, "logps/chosen": -453.3267822265625, "logps/rejected": -534.9801635742188, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": -34.59674835205078, "rewards/margins": 8.275094032287598, "rewards/rejected": -42.87184143066406, "step": 717 }, { "epoch": 0.7977777777777778, "grad_norm": 11.167784690856934, "learning_rate": 4.38928278881139e-05, "logits/chosen": -0.2691381573677063, "logits/rejected": -0.23282289505004883, "logps/chosen": -142.60667419433594, "logps/rejected": -289.081298828125, "loss": 0.9183, "rewards/accuracies": 0.5, "rewards/chosen": -8.108678817749023, "rewards/margins": 9.584816932678223, "rewards/rejected": -17.693496704101562, "step": 718 }, { "epoch": 0.7988888888888889, "grad_norm": 43.15885925292969, "learning_rate": 4.387276072193687e-05, "logits/chosen": -0.4334985017776489, "logits/rejected": -0.4439623951911926, "logps/chosen": -293.1733093261719, "logps/rejected": -293.06109619140625, "loss": 0.7951, "rewards/accuracies": 0.5, "rewards/chosen": -18.557722091674805, "rewards/margins": 2.204671859741211, "rewards/rejected": -20.762393951416016, "step": 719 }, { "epoch": 0.8, "grad_norm": 0.0006985223153606057, "learning_rate": 4.385266524442241e-05, "logits/chosen": -0.29289472103118896, "logits/rejected": -0.3275684118270874, "logps/chosen": -302.36651611328125, "logps/rejected": -497.62554931640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -22.53427505493164, "rewards/margins": 15.369202613830566, "rewards/rejected": -37.903480529785156, "step": 720 }, { "epoch": 0.8011111111111111, "grad_norm": 9.466177940368652, "learning_rate": 4.383254148571607e-05, "logits/chosen": -0.4358949661254883, "logits/rejected": -0.4227355122566223, "logps/chosen": -256.3292236328125, "logps/rejected": -306.6482849121094, "loss": 0.2136, "rewards/accuracies": 1.0, "rewards/chosen": -19.53888702392578, "rewards/margins": 2.782059669494629, "rewards/rejected": -22.320945739746094, "step": 721 }, { "epoch": 0.8022222222222222, "grad_norm": 0.12051234394311905, "learning_rate": 4.381238947600584e-05, "logits/chosen": -0.27261972427368164, "logits/rejected": -0.2710520029067993, "logps/chosen": -294.5284729003906, "logps/rejected": -372.2506408691406, "loss": 0.3466, "rewards/accuracies": 0.5, "rewards/chosen": -22.461809158325195, "rewards/margins": 6.758718490600586, "rewards/rejected": -29.22052764892578, "step": 722 }, { "epoch": 0.8033333333333333, "grad_norm": 0.007198858540505171, "learning_rate": 4.379220924552209e-05, "logits/chosen": -0.4732869565486908, "logits/rejected": -0.4718460738658905, "logps/chosen": -218.75149536132812, "logps/rejected": -447.2056884765625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -15.374183654785156, "rewards/margins": 16.505611419677734, "rewards/rejected": -31.87979507446289, "step": 723 }, { "epoch": 0.8044444444444444, "grad_norm": 0.48898157477378845, "learning_rate": 4.377200082453749e-05, "logits/chosen": -0.39949458837509155, "logits/rejected": -0.3921704590320587, "logps/chosen": -325.8331604003906, "logps/rejected": -433.66796875, "loss": 0.0155, "rewards/accuracies": 1.0, "rewards/chosen": -23.525177001953125, "rewards/margins": 8.046031951904297, "rewards/rejected": -31.571208953857422, "step": 724 }, { "epoch": 0.8055555555555556, "grad_norm": 0.4758000075817108, "learning_rate": 4.3751764243367025e-05, "logits/chosen": -0.4241786003112793, "logits/rejected": -0.40869084000587463, "logps/chosen": -745.7467041015625, "logps/rejected": -827.6497802734375, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": -55.32468795776367, "rewards/margins": 7.136257171630859, "rewards/rejected": -62.46094512939453, "step": 725 }, { "epoch": 0.8066666666666666, "grad_norm": 28.920801162719727, "learning_rate": 4.3731499532367936e-05, "logits/chosen": -0.09495387971401215, "logits/rejected": -0.15187716484069824, "logps/chosen": -501.79278564453125, "logps/rejected": -661.4903564453125, "loss": 0.118, "rewards/accuracies": 1.0, "rewards/chosen": -39.90403366088867, "rewards/margins": 11.114568710327148, "rewards/rejected": -51.01860427856445, "step": 726 }, { "epoch": 0.8077777777777778, "grad_norm": 32.06861114501953, "learning_rate": 4.371120672193964e-05, "logits/chosen": -0.45786136388778687, "logits/rejected": -0.44962817430496216, "logps/chosen": -311.1431884765625, "logps/rejected": -295.54736328125, "loss": 4.8203, "rewards/accuracies": 0.5, "rewards/chosen": -21.12289810180664, "rewards/margins": -0.46268320083618164, "rewards/rejected": -20.660215377807617, "step": 727 }, { "epoch": 0.8088888888888889, "grad_norm": 0.019925935193896294, "learning_rate": 4.3690885842523714e-05, "logits/chosen": -0.43113067746162415, "logits/rejected": -0.4302302896976471, "logps/chosen": -130.5729217529297, "logps/rejected": -222.23089599609375, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -7.103150844573975, "rewards/margins": 7.59239387512207, "rewards/rejected": -14.695545196533203, "step": 728 }, { "epoch": 0.81, "grad_norm": 7.17628211077681e-07, "learning_rate": 4.367053692460385e-05, "logits/chosen": -0.5735124945640564, "logits/rejected": -0.5504454374313354, "logps/chosen": -519.3167114257812, "logps/rejected": -807.6104736328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -42.46998596191406, "rewards/margins": 22.070512771606445, "rewards/rejected": -64.54049682617188, "step": 729 }, { "epoch": 0.8111111111111111, "grad_norm": 13.286771774291992, "learning_rate": 4.365015999870579e-05, "logits/chosen": -0.44607245922088623, "logits/rejected": -0.4467373490333557, "logps/chosen": -314.69268798828125, "logps/rejected": -363.91717529296875, "loss": 0.2968, "rewards/accuracies": 1.0, "rewards/chosen": -24.277137756347656, "rewards/margins": 3.8188743591308594, "rewards/rejected": -28.096012115478516, "step": 730 }, { "epoch": 0.8122222222222222, "grad_norm": 0.11339224874973297, "learning_rate": 4.362975509539728e-05, "logits/chosen": -0.17382554709911346, "logits/rejected": -0.17412573099136353, "logps/chosen": -547.025390625, "logps/rejected": -636.6171875, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -42.28843688964844, "rewards/margins": 6.997331619262695, "rewards/rejected": -49.285770416259766, "step": 731 }, { "epoch": 0.8133333333333334, "grad_norm": 62.64221954345703, "learning_rate": 4.3609322245288066e-05, "logits/chosen": -0.37461361289024353, "logits/rejected": -0.3694998621940613, "logps/chosen": -415.73626708984375, "logps/rejected": -390.9900207519531, "loss": 2.6417, "rewards/accuracies": 0.0, "rewards/chosen": -32.306575775146484, "rewards/margins": -2.4391069412231445, "rewards/rejected": -29.867469787597656, "step": 732 }, { "epoch": 0.8144444444444444, "grad_norm": 36.9998664855957, "learning_rate": 4.35888614790298e-05, "logits/chosen": -0.39543968439102173, "logits/rejected": -0.39597511291503906, "logps/chosen": -280.3340759277344, "logps/rejected": -275.298828125, "loss": 1.7693, "rewards/accuracies": 0.5, "rewards/chosen": -20.95943260192871, "rewards/margins": -1.026214599609375, "rewards/rejected": -19.933218002319336, "step": 733 }, { "epoch": 0.8155555555555556, "grad_norm": 56.17190170288086, "learning_rate": 4.3568372827316004e-05, "logits/chosen": -0.4180459976196289, "logits/rejected": -0.4100969731807709, "logps/chosen": -407.78912353515625, "logps/rejected": -435.00213623046875, "loss": 2.3344, "rewards/accuracies": 0.5, "rewards/chosen": -32.73868942260742, "rewards/margins": 0.5260119438171387, "rewards/rejected": -33.26470184326172, "step": 734 }, { "epoch": 0.8166666666666667, "grad_norm": 0.5659170150756836, "learning_rate": 4.3547856320882044e-05, "logits/chosen": -0.37071675062179565, "logits/rejected": -0.40562763810157776, "logps/chosen": -457.9248962402344, "logps/rejected": -653.4508666992188, "loss": 0.0087, "rewards/accuracies": 1.0, "rewards/chosen": -29.97591209411621, "rewards/margins": 10.5169038772583, "rewards/rejected": -40.49281692504883, "step": 735 }, { "epoch": 0.8177777777777778, "grad_norm": 0.00039385425043292344, "learning_rate": 4.3527311990505064e-05, "logits/chosen": -0.28598901629447937, "logits/rejected": -0.27602097392082214, "logps/chosen": -336.24884033203125, "logps/rejected": -604.283935546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -25.447315216064453, "rewards/margins": 21.91487693786621, "rewards/rejected": -47.36219024658203, "step": 736 }, { "epoch": 0.8188888888888889, "grad_norm": 0.2988591492176056, "learning_rate": 4.3506739867003966e-05, "logits/chosen": -0.27260398864746094, "logits/rejected": -0.26709842681884766, "logps/chosen": -307.87213134765625, "logps/rejected": -403.45135498046875, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": -21.412498474121094, "rewards/margins": 7.820199966430664, "rewards/rejected": -29.23270034790039, "step": 737 }, { "epoch": 0.82, "grad_norm": 1.9830745458602905, "learning_rate": 4.3486139981239304e-05, "logits/chosen": -0.2856745421886444, "logits/rejected": -0.29630419611930847, "logps/chosen": -325.76177978515625, "logps/rejected": -446.5693359375, "loss": 0.0266, "rewards/accuracies": 1.0, "rewards/chosen": -23.051668167114258, "rewards/margins": 8.855539321899414, "rewards/rejected": -31.907207489013672, "step": 738 }, { "epoch": 0.8211111111111111, "grad_norm": 6.386728637153283e-05, "learning_rate": 4.3465512364113327e-05, "logits/chosen": -0.34344279766082764, "logits/rejected": -0.3442509174346924, "logps/chosen": -388.2489013671875, "logps/rejected": -541.5992431640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -29.66498374938965, "rewards/margins": 13.906067848205566, "rewards/rejected": -43.57105255126953, "step": 739 }, { "epoch": 0.8222222222222222, "grad_norm": 8.620932579040527, "learning_rate": 4.3444857046569855e-05, "logits/chosen": -0.1996985226869583, "logits/rejected": -0.18842977285385132, "logps/chosen": -344.3810729980469, "logps/rejected": -386.38177490234375, "loss": 0.1896, "rewards/accuracies": 1.0, "rewards/chosen": -24.763111114501953, "rewards/margins": 3.8639039993286133, "rewards/rejected": -28.627016067504883, "step": 740 }, { "epoch": 0.8233333333333334, "grad_norm": 0.024116836488246918, "learning_rate": 4.3424174059594256e-05, "logits/chosen": -0.1274215579032898, "logits/rejected": -0.11584465950727463, "logps/chosen": -603.4799194335938, "logps/rejected": -757.1605224609375, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -47.02104187011719, "rewards/margins": 10.40733528137207, "rewards/rejected": -57.428375244140625, "step": 741 }, { "epoch": 0.8244444444444444, "grad_norm": 1.747113823890686, "learning_rate": 4.340346343421343e-05, "logits/chosen": -0.18261240422725677, "logits/rejected": -0.17001673579216003, "logps/chosen": -420.0610046386719, "logps/rejected": -476.5283508300781, "loss": 0.0175, "rewards/accuracies": 1.0, "rewards/chosen": -31.29424285888672, "rewards/margins": 4.335538864135742, "rewards/rejected": -35.629783630371094, "step": 742 }, { "epoch": 0.8255555555555556, "grad_norm": 11.752572059631348, "learning_rate": 4.3382725201495723e-05, "logits/chosen": -0.3980557322502136, "logits/rejected": -0.3873634934425354, "logps/chosen": -357.8211669921875, "logps/rejected": -381.02886962890625, "loss": 0.1407, "rewards/accuracies": 1.0, "rewards/chosen": -28.479461669921875, "rewards/margins": 1.9122076034545898, "rewards/rejected": -30.39167022705078, "step": 743 }, { "epoch": 0.8266666666666667, "grad_norm": 0.58945232629776, "learning_rate": 4.336195939255089e-05, "logits/chosen": -0.3841128945350647, "logits/rejected": -0.36959511041641235, "logps/chosen": -355.67230224609375, "logps/rejected": -439.1192321777344, "loss": 0.0108, "rewards/accuracies": 1.0, "rewards/chosen": -27.85738754272461, "rewards/margins": 7.078980445861816, "rewards/rejected": -34.93636703491211, "step": 744 }, { "epoch": 0.8277777777777777, "grad_norm": 38.07997131347656, "learning_rate": 4.334116603853007e-05, "logits/chosen": -0.4761132597923279, "logits/rejected": -0.4615354537963867, "logps/chosen": -339.089111328125, "logps/rejected": -328.89306640625, "loss": 4.9654, "rewards/accuracies": 0.5, "rewards/chosen": -24.99433135986328, "rewards/margins": 1.0522336959838867, "rewards/rejected": -26.04656410217285, "step": 745 }, { "epoch": 0.8288888888888889, "grad_norm": 0.12747086584568024, "learning_rate": 4.3320345170625716e-05, "logits/chosen": -0.49730151891708374, "logits/rejected": -0.4785885512828827, "logps/chosen": -253.026123046875, "logps/rejected": -431.3618469238281, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -18.425310134887695, "rewards/margins": 14.619234085083008, "rewards/rejected": -33.0445442199707, "step": 746 }, { "epoch": 0.83, "grad_norm": 53.57808303833008, "learning_rate": 4.3299496820071546e-05, "logits/chosen": -0.44679415225982666, "logits/rejected": -0.44055256247520447, "logps/chosen": -363.3487548828125, "logps/rejected": -415.90093994140625, "loss": 5.2792, "rewards/accuracies": 0.5, "rewards/chosen": -28.544300079345703, "rewards/margins": 3.1249098777770996, "rewards/rejected": -31.669208526611328, "step": 747 }, { "epoch": 0.8311111111111111, "grad_norm": 0.9915576577186584, "learning_rate": 4.3278621018142504e-05, "logits/chosen": -0.4291904866695404, "logits/rejected": -0.4149267077445984, "logps/chosen": -267.1895751953125, "logps/rejected": -365.3778076171875, "loss": 0.0115, "rewards/accuracies": 1.0, "rewards/chosen": -17.268274307250977, "rewards/margins": 7.683533191680908, "rewards/rejected": -24.951807022094727, "step": 748 }, { "epoch": 0.8322222222222222, "grad_norm": 0.5647404789924622, "learning_rate": 4.325771779615475e-05, "logits/chosen": -0.49470067024230957, "logits/rejected": -0.5187956690788269, "logps/chosen": -150.12127685546875, "logps/rejected": -304.7006530761719, "loss": 0.0247, "rewards/accuracies": 1.0, "rewards/chosen": -9.566539764404297, "rewards/margins": 11.99386978149414, "rewards/rejected": -21.56041145324707, "step": 749 }, { "epoch": 0.8333333333333334, "grad_norm": 0.02967625856399536, "learning_rate": 4.3236787185465525e-05, "logits/chosen": -0.4473322629928589, "logits/rejected": -0.43939870595932007, "logps/chosen": -245.7248992919922, "logps/rejected": -350.2401123046875, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -17.073429107666016, "rewards/margins": 7.489704132080078, "rewards/rejected": -24.563133239746094, "step": 750 }, { "epoch": 0.8344444444444444, "grad_norm": 0.17850598692893982, "learning_rate": 4.321582921747318e-05, "logits/chosen": -0.5789490938186646, "logits/rejected": -0.6000778079032898, "logps/chosen": -427.402587890625, "logps/rejected": -581.1033325195312, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -31.969959259033203, "rewards/margins": 13.3441801071167, "rewards/rejected": -45.31414031982422, "step": 751 }, { "epoch": 0.8355555555555556, "grad_norm": 8.847668482303561e-07, "learning_rate": 4.3194843923617126e-05, "logits/chosen": -0.4643845558166504, "logits/rejected": -0.447843462228775, "logps/chosen": -219.68960571289062, "logps/rejected": -426.6856689453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.78742790222168, "rewards/margins": 19.52178192138672, "rewards/rejected": -32.309207916259766, "step": 752 }, { "epoch": 0.8366666666666667, "grad_norm": 0.056798867881298065, "learning_rate": 4.317383133537773e-05, "logits/chosen": -0.45169001817703247, "logits/rejected": -0.44170480966567993, "logps/chosen": -272.7117004394531, "logps/rejected": -436.24639892578125, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -17.9819278717041, "rewards/margins": 15.732460021972656, "rewards/rejected": -33.714385986328125, "step": 753 }, { "epoch": 0.8377777777777777, "grad_norm": 0.0004886622191406786, "learning_rate": 4.315279148427632e-05, "logits/chosen": -0.3124806880950928, "logits/rejected": -0.35999536514282227, "logps/chosen": -259.0509948730469, "logps/rejected": -535.64599609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -16.437400817871094, "rewards/margins": 13.732967376708984, "rewards/rejected": -30.170368194580078, "step": 754 }, { "epoch": 0.8388888888888889, "grad_norm": 0.06737364083528519, "learning_rate": 4.3131724401875125e-05, "logits/chosen": -0.2539001405239105, "logits/rejected": -0.23766466975212097, "logps/chosen": -317.8954162597656, "logps/rejected": -630.9536743164062, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -24.65331268310547, "rewards/margins": 22.497928619384766, "rewards/rejected": -47.151241302490234, "step": 755 }, { "epoch": 0.84, "grad_norm": 7.722708225250244, "learning_rate": 4.311063011977723e-05, "logits/chosen": -0.383439302444458, "logits/rejected": -0.3865945637226105, "logps/chosen": -397.3080749511719, "logps/rejected": -541.44775390625, "loss": 0.0907, "rewards/accuracies": 1.0, "rewards/chosen": -31.1417236328125, "rewards/margins": 12.75686264038086, "rewards/rejected": -43.89858627319336, "step": 756 }, { "epoch": 0.8411111111111111, "grad_norm": 0.04752185568213463, "learning_rate": 4.3089508669626494e-05, "logits/chosen": -0.8476710319519043, "logits/rejected": -0.8217136859893799, "logps/chosen": -144.96334838867188, "logps/rejected": -231.88014221191406, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -8.767049789428711, "rewards/margins": 7.100872993469238, "rewards/rejected": -15.867923736572266, "step": 757 }, { "epoch": 0.8422222222222222, "grad_norm": 0.11345358192920685, "learning_rate": 4.306836008310756e-05, "logits/chosen": -0.19720086455345154, "logits/rejected": -0.17475129663944244, "logps/chosen": -376.47418212890625, "logps/rejected": -665.4389038085938, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": -27.316116333007812, "rewards/margins": 18.07199478149414, "rewards/rejected": -45.38811111450195, "step": 758 }, { "epoch": 0.8433333333333334, "grad_norm": 1.4873885447741486e-05, "learning_rate": 4.304718439194577e-05, "logits/chosen": -0.0785754919052124, "logits/rejected": -0.1045161634683609, "logps/chosen": -311.23211669921875, "logps/rejected": -471.719970703125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -20.48125648498535, "rewards/margins": 15.683835983276367, "rewards/rejected": -36.16509246826172, "step": 759 }, { "epoch": 0.8444444444444444, "grad_norm": 50.028167724609375, "learning_rate": 4.302598162790712e-05, "logits/chosen": -0.5017668008804321, "logits/rejected": -0.5542943477630615, "logps/chosen": -479.6855773925781, "logps/rejected": -628.5458374023438, "loss": 1.96, "rewards/accuracies": 0.5, "rewards/chosen": -32.65278625488281, "rewards/margins": 11.978409767150879, "rewards/rejected": -44.631195068359375, "step": 760 }, { "epoch": 0.8455555555555555, "grad_norm": 0.00010515975009184331, "learning_rate": 4.3004751822798235e-05, "logits/chosen": -0.3755473494529724, "logits/rejected": -0.3906514644622803, "logps/chosen": -315.532470703125, "logps/rejected": -548.769287109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -22.918861389160156, "rewards/margins": 20.27176284790039, "rewards/rejected": -43.19062423706055, "step": 761 }, { "epoch": 0.8466666666666667, "grad_norm": 2.963139802969472e-11, "learning_rate": 4.2983495008466276e-05, "logits/chosen": -0.4612213373184204, "logits/rejected": -0.4522266983985901, "logps/chosen": -264.2802734375, "logps/rejected": -609.9371337890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -17.78772735595703, "rewards/margins": 31.919891357421875, "rewards/rejected": -49.707618713378906, "step": 762 }, { "epoch": 0.8477777777777777, "grad_norm": 0.017672624439001083, "learning_rate": 4.296221121679896e-05, "logits/chosen": -0.45902061462402344, "logits/rejected": -0.46480506658554077, "logps/chosen": -299.4942626953125, "logps/rejected": -513.1404418945312, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -22.05693817138672, "rewards/margins": 18.018474578857422, "rewards/rejected": -40.07541275024414, "step": 763 }, { "epoch": 0.8488888888888889, "grad_norm": 12.664743423461914, "learning_rate": 4.2940900479724436e-05, "logits/chosen": -0.7801885604858398, "logits/rejected": -0.7673557996749878, "logps/chosen": -276.9939270019531, "logps/rejected": -446.7846984863281, "loss": 0.3229, "rewards/accuracies": 1.0, "rewards/chosen": -20.062057495117188, "rewards/margins": 14.948287010192871, "rewards/rejected": -35.010345458984375, "step": 764 }, { "epoch": 0.85, "grad_norm": 32.36884689331055, "learning_rate": 4.2919562829211283e-05, "logits/chosen": -0.6341562271118164, "logits/rejected": -0.6410857439041138, "logps/chosen": -229.0276641845703, "logps/rejected": -340.72100830078125, "loss": 1.4495, "rewards/accuracies": 0.5, "rewards/chosen": -15.14930534362793, "rewards/margins": 9.751654624938965, "rewards/rejected": -24.90096092224121, "step": 765 }, { "epoch": 0.8511111111111112, "grad_norm": 52.64460754394531, "learning_rate": 4.289819829726848e-05, "logits/chosen": -0.5089181661605835, "logits/rejected": -0.5092489123344421, "logps/chosen": -314.06640625, "logps/rejected": -789.0921630859375, "loss": 1.6681, "rewards/accuracies": 0.5, "rewards/chosen": -22.43765640258789, "rewards/margins": 36.10246276855469, "rewards/rejected": -58.54011535644531, "step": 766 }, { "epoch": 0.8522222222222222, "grad_norm": 0.005918477196246386, "learning_rate": 4.28768069159453e-05, "logits/chosen": -0.39462733268737793, "logits/rejected": -0.3960924744606018, "logps/chosen": -344.92938232421875, "logps/rejected": -515.31982421875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -26.773300170898438, "rewards/margins": 13.503827095031738, "rewards/rejected": -40.277130126953125, "step": 767 }, { "epoch": 0.8533333333333334, "grad_norm": 0.0011735489824786782, "learning_rate": 4.2855388717331295e-05, "logits/chosen": -0.446662962436676, "logits/rejected": -0.44585493206977844, "logps/chosen": -218.44842529296875, "logps/rejected": -368.54168701171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -15.286519050598145, "rewards/margins": 13.15664005279541, "rewards/rejected": -28.443159103393555, "step": 768 }, { "epoch": 0.8544444444444445, "grad_norm": 0.0033488834742456675, "learning_rate": 4.283394373355628e-05, "logits/chosen": -0.5961101055145264, "logits/rejected": -0.5903946161270142, "logps/chosen": -425.595947265625, "logps/rejected": -647.0348510742188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -33.23980712890625, "rewards/margins": 20.829193115234375, "rewards/rejected": -54.069000244140625, "step": 769 }, { "epoch": 0.8555555555555555, "grad_norm": 1.3084924830764066e-05, "learning_rate": 4.2812471996790206e-05, "logits/chosen": -0.308584600687027, "logits/rejected": -0.30260512232780457, "logps/chosen": -901.067138671875, "logps/rejected": -1177.487060546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -69.38667297363281, "rewards/margins": 29.014299392700195, "rewards/rejected": -98.40097045898438, "step": 770 }, { "epoch": 0.8566666666666667, "grad_norm": 0.032043345272541046, "learning_rate": 4.279097353924318e-05, "logits/chosen": -0.5491685271263123, "logits/rejected": -0.5746341943740845, "logps/chosen": -376.325927734375, "logps/rejected": -515.740966796875, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -27.339187622070312, "rewards/margins": 11.804203987121582, "rewards/rejected": -39.14339065551758, "step": 771 }, { "epoch": 0.8577777777777778, "grad_norm": 0.2988600730895996, "learning_rate": 4.27694483931654e-05, "logits/chosen": -0.42768773436546326, "logits/rejected": -0.43229278922080994, "logps/chosen": -533.359619140625, "logps/rejected": -615.9560546875, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": -43.252342224121094, "rewards/margins": 6.427619934082031, "rewards/rejected": -49.679962158203125, "step": 772 }, { "epoch": 0.8588888888888889, "grad_norm": 0.017563093453645706, "learning_rate": 4.274789659084708e-05, "logits/chosen": -0.5626379251480103, "logits/rejected": -0.5668295621871948, "logps/chosen": -258.18157958984375, "logps/rejected": -375.1309814453125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -16.699798583984375, "rewards/margins": 10.14726448059082, "rewards/rejected": -26.847063064575195, "step": 773 }, { "epoch": 0.86, "grad_norm": 2.408397108411009e-07, "learning_rate": 4.2726318164618435e-05, "logits/chosen": -0.7051448225975037, "logits/rejected": -0.6983608603477478, "logps/chosen": -472.7507629394531, "logps/rejected": -731.2786865234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -37.641273498535156, "rewards/margins": 21.739748001098633, "rewards/rejected": -59.38102340698242, "step": 774 }, { "epoch": 0.8611111111111112, "grad_norm": 0.0030442948918789625, "learning_rate": 4.2704713146849614e-05, "logits/chosen": -0.5920155048370361, "logits/rejected": -0.6140743494033813, "logps/chosen": -280.5921630859375, "logps/rejected": -437.08648681640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -20.306270599365234, "rewards/margins": 13.208971977233887, "rewards/rejected": -33.51524353027344, "step": 775 }, { "epoch": 0.8622222222222222, "grad_norm": 0.007589503191411495, "learning_rate": 4.268308156995067e-05, "logits/chosen": -0.5062188506126404, "logits/rejected": -0.5166007280349731, "logps/chosen": -478.2378234863281, "logps/rejected": -625.8052368164062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -34.934810638427734, "rewards/margins": 15.02163028717041, "rewards/rejected": -49.95643997192383, "step": 776 }, { "epoch": 0.8633333333333333, "grad_norm": 8.567523002624512, "learning_rate": 4.266142346637146e-05, "logits/chosen": -0.5632612705230713, "logits/rejected": -0.563681960105896, "logps/chosen": -387.1310119628906, "logps/rejected": -408.63226318359375, "loss": 0.3372, "rewards/accuracies": 1.0, "rewards/chosen": -30.784154891967773, "rewards/margins": 2.6911792755126953, "rewards/rejected": -33.47533416748047, "step": 777 }, { "epoch": 0.8644444444444445, "grad_norm": 4.822784831048921e-06, "learning_rate": 4.263973886860169e-05, "logits/chosen": -0.44629132747650146, "logits/rejected": -0.45008736848831177, "logps/chosen": -343.1689453125, "logps/rejected": -643.5860595703125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -28.042705535888672, "rewards/margins": 24.604555130004883, "rewards/rejected": -52.64726257324219, "step": 778 }, { "epoch": 0.8655555555555555, "grad_norm": 4.338958751759492e-05, "learning_rate": 4.2618027809170756e-05, "logits/chosen": -0.7087322473526001, "logits/rejected": -0.7265149354934692, "logps/chosen": -227.81036376953125, "logps/rejected": -473.6676025390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -16.462160110473633, "rewards/margins": 19.051219940185547, "rewards/rejected": -35.51338195800781, "step": 779 }, { "epoch": 0.8666666666666667, "grad_norm": 2.646909713745117, "learning_rate": 4.259629032064779e-05, "logits/chosen": -0.5221399664878845, "logits/rejected": -0.5391180515289307, "logps/chosen": -550.262939453125, "logps/rejected": -764.116943359375, "loss": 0.0205, "rewards/accuracies": 1.0, "rewards/chosen": -42.97206497192383, "rewards/margins": 16.140527725219727, "rewards/rejected": -59.11259460449219, "step": 780 }, { "epoch": 0.8677777777777778, "grad_norm": 0.0005958808469586074, "learning_rate": 4.257452643564155e-05, "logits/chosen": -0.7374674081802368, "logits/rejected": -0.7398434281349182, "logps/chosen": -357.869384765625, "logps/rejected": -529.375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -28.22521209716797, "rewards/margins": 13.541065216064453, "rewards/rejected": -41.76627731323242, "step": 781 }, { "epoch": 0.8688888888888889, "grad_norm": 0.006885566283017397, "learning_rate": 4.2552736186800386e-05, "logits/chosen": -0.6453142166137695, "logits/rejected": -0.6531805396080017, "logps/chosen": -307.38494873046875, "logps/rejected": -479.5106201171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -22.241853713989258, "rewards/margins": 15.346230506896973, "rewards/rejected": -37.58808517456055, "step": 782 }, { "epoch": 0.87, "grad_norm": 9.001699447631836, "learning_rate": 4.2530919606812216e-05, "logits/chosen": -0.6226636171340942, "logits/rejected": -0.6314506530761719, "logps/chosen": -334.92449951171875, "logps/rejected": -465.62774658203125, "loss": 0.1123, "rewards/accuracies": 1.0, "rewards/chosen": -25.220674514770508, "rewards/margins": 10.982964515686035, "rewards/rejected": -36.20363998413086, "step": 783 }, { "epoch": 0.8711111111111111, "grad_norm": 8.803957825875841e-06, "learning_rate": 4.250907672840445e-05, "logits/chosen": -0.48432105779647827, "logits/rejected": -0.4738672375679016, "logps/chosen": -366.17431640625, "logps/rejected": -561.190185546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -27.420320510864258, "rewards/margins": 17.24113655090332, "rewards/rejected": -44.66145706176758, "step": 784 }, { "epoch": 0.8722222222222222, "grad_norm": 59.7996711730957, "learning_rate": 4.2487207584343955e-05, "logits/chosen": -0.46988430619239807, "logits/rejected": -0.47441810369491577, "logps/chosen": -546.6412353515625, "logps/rejected": -948.0780029296875, "loss": 4.4638, "rewards/accuracies": 0.5, "rewards/chosen": -43.380619049072266, "rewards/margins": 33.170997619628906, "rewards/rejected": -76.5516128540039, "step": 785 }, { "epoch": 0.8733333333333333, "grad_norm": 0.060419172048568726, "learning_rate": 4.2465312207436974e-05, "logits/chosen": -0.7855024337768555, "logits/rejected": -0.8048604130744934, "logps/chosen": -237.029296875, "logps/rejected": -362.4464111328125, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -17.895740509033203, "rewards/margins": 10.175025939941406, "rewards/rejected": -28.07076644897461, "step": 786 }, { "epoch": 0.8744444444444445, "grad_norm": 30.824203491210938, "learning_rate": 4.244339063052913e-05, "logits/chosen": -0.6491284370422363, "logits/rejected": -0.6338303089141846, "logps/chosen": -288.1285400390625, "logps/rejected": -313.34228515625, "loss": 1.0477, "rewards/accuracies": 0.5, "rewards/chosen": -21.568958282470703, "rewards/margins": 3.3811097145080566, "rewards/rejected": -24.9500675201416, "step": 787 }, { "epoch": 0.8755555555555555, "grad_norm": 8.773974695941433e-05, "learning_rate": 4.242144288650534e-05, "logits/chosen": -0.6178389191627502, "logits/rejected": -0.599535346031189, "logps/chosen": -571.0923461914062, "logps/rejected": -789.0712890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -40.28545379638672, "rewards/margins": 18.568254470825195, "rewards/rejected": -58.85371017456055, "step": 788 }, { "epoch": 0.8766666666666667, "grad_norm": 4.927937030792236, "learning_rate": 4.239946900828978e-05, "logits/chosen": -0.35581135749816895, "logits/rejected": -0.4241589307785034, "logps/chosen": -720.413330078125, "logps/rejected": -904.824951171875, "loss": 0.0742, "rewards/accuracies": 1.0, "rewards/chosen": -55.67143630981445, "rewards/margins": 11.81887149810791, "rewards/rejected": -67.49031066894531, "step": 789 }, { "epoch": 0.8777777777777778, "grad_norm": 2.642321135226666e-07, "learning_rate": 4.237746902884581e-05, "logits/chosen": -0.6205227375030518, "logits/rejected": -0.6219427585601807, "logps/chosen": -299.03997802734375, "logps/rejected": -833.0835571289062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -22.399559020996094, "rewards/margins": 37.9036979675293, "rewards/rejected": -60.30325698852539, "step": 790 }, { "epoch": 0.8788888888888889, "grad_norm": 7.29632520233281e-05, "learning_rate": 4.235544298117597e-05, "logits/chosen": -0.4787724018096924, "logits/rejected": -0.48484501242637634, "logps/chosen": -228.3846435546875, "logps/rejected": -515.6959228515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -15.94472885131836, "rewards/margins": 23.74503517150879, "rewards/rejected": -39.689762115478516, "step": 791 }, { "epoch": 0.88, "grad_norm": 4.066462516784668, "learning_rate": 4.233339089832189e-05, "logits/chosen": -0.7562122344970703, "logits/rejected": -0.7569894790649414, "logps/chosen": -215.65093994140625, "logps/rejected": -230.28778076171875, "loss": 0.3963, "rewards/accuracies": 1.0, "rewards/chosen": -15.718832969665527, "rewards/margins": 1.1294443607330322, "rewards/rejected": -16.848278045654297, "step": 792 }, { "epoch": 0.8811111111111111, "grad_norm": 48.34563446044922, "learning_rate": 4.2311312813364264e-05, "logits/chosen": -0.26742690801620483, "logits/rejected": -0.23142947256565094, "logps/chosen": -427.0914611816406, "logps/rejected": -520.018798828125, "loss": 0.4369, "rewards/accuracies": 0.5, "rewards/chosen": -33.294921875, "rewards/margins": 6.9620819091796875, "rewards/rejected": -40.25700378417969, "step": 793 }, { "epoch": 0.8822222222222222, "grad_norm": 0.09568222612142563, "learning_rate": 4.228920875942279e-05, "logits/chosen": -0.47204333543777466, "logits/rejected": -0.4487084448337555, "logps/chosen": -358.82086181640625, "logps/rejected": -521.09814453125, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -27.86922836303711, "rewards/margins": 12.805474281311035, "rewards/rejected": -40.67470169067383, "step": 794 }, { "epoch": 0.8833333333333333, "grad_norm": 1.8592198102851398e-05, "learning_rate": 4.226707876965611e-05, "logits/chosen": -0.2645072340965271, "logits/rejected": -0.28904038667678833, "logps/chosen": -275.25762939453125, "logps/rejected": -536.1057739257812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -19.169391632080078, "rewards/margins": 20.67530059814453, "rewards/rejected": -39.84469223022461, "step": 795 }, { "epoch": 0.8844444444444445, "grad_norm": 0.0010651408229023218, "learning_rate": 4.2244922877261804e-05, "logits/chosen": -0.09764954447746277, "logits/rejected": -0.12891125679016113, "logps/chosen": -507.50860595703125, "logps/rejected": -770.7363891601562, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -37.26622772216797, "rewards/margins": 19.116552352905273, "rewards/rejected": -56.38277816772461, "step": 796 }, { "epoch": 0.8855555555555555, "grad_norm": 4.311526708988822e-07, "learning_rate": 4.222274111547627e-05, "logits/chosen": -1.0608189105987549, "logits/rejected": -0.6965916156768799, "logps/chosen": -247.06478881835938, "logps/rejected": -492.1934814453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -16.825971603393555, "rewards/margins": 19.183582305908203, "rewards/rejected": -36.009552001953125, "step": 797 }, { "epoch": 0.8866666666666667, "grad_norm": 0.015685977414250374, "learning_rate": 4.2200533517574746e-05, "logits/chosen": -0.42096367478370667, "logits/rejected": -0.4188067317008972, "logps/chosen": -252.94442749023438, "logps/rejected": -405.3169250488281, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -18.361692428588867, "rewards/margins": 9.893375396728516, "rewards/rejected": -28.255067825317383, "step": 798 }, { "epoch": 0.8877777777777778, "grad_norm": 0.8334619998931885, "learning_rate": 4.21783001168712e-05, "logits/chosen": -0.3727554678916931, "logits/rejected": -0.39798468351364136, "logps/chosen": -175.89877319335938, "logps/rejected": -240.77659606933594, "loss": 0.0209, "rewards/accuracies": 1.0, "rewards/chosen": -10.261224746704102, "rewards/margins": 5.8075666427612305, "rewards/rejected": -16.068790435791016, "step": 799 }, { "epoch": 0.8888888888888888, "grad_norm": 11.15588092803955, "learning_rate": 4.215604094671835e-05, "logits/chosen": -0.48448628187179565, "logits/rejected": -0.46779167652130127, "logps/chosen": -307.1094970703125, "logps/rejected": -424.2611389160156, "loss": 0.2908, "rewards/accuracies": 1.0, "rewards/chosen": -22.55978775024414, "rewards/margins": 10.311325073242188, "rewards/rejected": -32.87111282348633, "step": 800 }, { "epoch": 0.89, "grad_norm": 0.5684265494346619, "learning_rate": 4.21337560405075e-05, "logits/chosen": -0.5317736864089966, "logits/rejected": -0.5237900018692017, "logps/chosen": -177.828125, "logps/rejected": -274.69439697265625, "loss": 0.0098, "rewards/accuracies": 1.0, "rewards/chosen": -12.226216316223145, "rewards/margins": 6.384650230407715, "rewards/rejected": -18.61086654663086, "step": 801 }, { "epoch": 0.8911111111111111, "grad_norm": 62.25132369995117, "learning_rate": 4.211144543166862e-05, "logits/chosen": -0.43495461344718933, "logits/rejected": -0.32696130871772766, "logps/chosen": -212.36009216308594, "logps/rejected": -160.5628204345703, "loss": 5.8243, "rewards/accuracies": 0.0, "rewards/chosen": -15.525333404541016, "rewards/margins": -5.550500392913818, "rewards/rejected": -9.974833488464355, "step": 802 }, { "epoch": 0.8922222222222222, "grad_norm": 2.403284788131714, "learning_rate": 4.208910915367022e-05, "logits/chosen": -0.7183055877685547, "logits/rejected": -0.6899228096008301, "logps/chosen": -207.14193725585938, "logps/rejected": -248.30177307128906, "loss": 0.049, "rewards/accuracies": 1.0, "rewards/chosen": -13.212297439575195, "rewards/margins": 2.9920549392700195, "rewards/rejected": -16.20435333251953, "step": 803 }, { "epoch": 0.8933333333333333, "grad_norm": 1.1482512718430371e-07, "learning_rate": 4.206674724001933e-05, "logits/chosen": -0.21995003521442413, "logits/rejected": -0.20711255073547363, "logps/chosen": -288.0633544921875, "logps/rejected": -568.01953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -19.39035415649414, "rewards/margins": 22.490215301513672, "rewards/rejected": -41.88056945800781, "step": 804 }, { "epoch": 0.8944444444444445, "grad_norm": 56.830074310302734, "learning_rate": 4.204435972426141e-05, "logits/chosen": -0.44551458954811096, "logits/rejected": -0.4560723304748535, "logps/chosen": -350.0782775878906, "logps/rejected": -256.90777587890625, "loss": 8.6429, "rewards/accuracies": 0.0, "rewards/chosen": -23.808456420898438, "rewards/margins": -8.57785701751709, "rewards/rejected": -15.230599403381348, "step": 805 }, { "epoch": 0.8955555555555555, "grad_norm": 0.04235943406820297, "learning_rate": 4.202194663998034e-05, "logits/chosen": -0.011653143912553787, "logits/rejected": -0.030135322362184525, "logps/chosen": -360.76568603515625, "logps/rejected": -465.9054870605469, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -26.424076080322266, "rewards/margins": 9.076269149780273, "rewards/rejected": -35.50034713745117, "step": 806 }, { "epoch": 0.8966666666666666, "grad_norm": 0.39300259947776794, "learning_rate": 4.199950802079835e-05, "logits/chosen": -0.45646053552627563, "logits/rejected": -0.41857707500457764, "logps/chosen": -183.93191528320312, "logps/rejected": -262.3080749511719, "loss": 0.0078, "rewards/accuracies": 1.0, "rewards/chosen": -11.48341178894043, "rewards/margins": 6.551949501037598, "rewards/rejected": -18.035362243652344, "step": 807 }, { "epoch": 0.8977777777777778, "grad_norm": 0.07815617322921753, "learning_rate": 4.197704390037599e-05, "logits/chosen": -0.15834885835647583, "logits/rejected": -0.14149132370948792, "logps/chosen": -203.91635131835938, "logps/rejected": -568.3348999023438, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -10.64340877532959, "rewards/margins": 27.43257713317871, "rewards/rejected": -38.075984954833984, "step": 808 }, { "epoch": 0.8988888888888888, "grad_norm": 0.025484723970294, "learning_rate": 4.195455431241205e-05, "logits/chosen": -0.25996479392051697, "logits/rejected": -0.29616037011146545, "logps/chosen": -406.1092834472656, "logps/rejected": -521.81103515625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -25.211299896240234, "rewards/margins": 9.348491668701172, "rewards/rejected": -34.559791564941406, "step": 809 }, { "epoch": 0.9, "grad_norm": 0.0002344427484786138, "learning_rate": 4.193203929064353e-05, "logits/chosen": -0.5274702310562134, "logits/rejected": -0.5076823830604553, "logps/chosen": -215.5650634765625, "logps/rejected": -383.02532958984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -15.238351821899414, "rewards/margins": 15.572050094604492, "rewards/rejected": -30.810401916503906, "step": 810 }, { "epoch": 0.9, "eval_logits/chosen": -0.42810994386672974, "eval_logits/rejected": -0.4275129437446594, "eval_logps/chosen": -246.36102294921875, "eval_logps/rejected": -373.7821960449219, "eval_loss": 0.38109543919563293, "eval_rewards/accuracies": 0.8899999856948853, "eval_rewards/chosen": -15.858991622924805, "eval_rewards/margins": 10.061595916748047, "eval_rewards/rejected": -25.92058563232422, "eval_runtime": 86.2123, "eval_samples_per_second": 2.32, "eval_steps_per_second": 0.29, "step": 810 }, { "epoch": 0.9011111111111111, "grad_norm": 0.00142627430614084, "learning_rate": 4.190949886884558e-05, "logits/chosen": -0.3862552046775818, "logits/rejected": -0.3829232454299927, "logps/chosen": -276.8846435546875, "logps/rejected": -530.8536987304688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -18.450233459472656, "rewards/margins": 17.864940643310547, "rewards/rejected": -36.3151741027832, "step": 811 }, { "epoch": 0.9022222222222223, "grad_norm": 2.642026662826538, "learning_rate": 4.188693308083146e-05, "logits/chosen": -0.4319331645965576, "logits/rejected": -0.4145256578922272, "logps/chosen": -143.024169921875, "logps/rejected": -271.6532287597656, "loss": 0.2, "rewards/accuracies": 1.0, "rewards/chosen": -8.903973579406738, "rewards/margins": 10.114554405212402, "rewards/rejected": -19.01852798461914, "step": 812 }, { "epoch": 0.9033333333333333, "grad_norm": 0.0018805887084454298, "learning_rate": 4.186434196045247e-05, "logits/chosen": -0.16974051296710968, "logits/rejected": -0.1897805780172348, "logps/chosen": -379.4453430175781, "logps/rejected": -547.92724609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -26.070796966552734, "rewards/margins": 11.785326957702637, "rewards/rejected": -37.85612487792969, "step": 813 }, { "epoch": 0.9044444444444445, "grad_norm": 18.11777687072754, "learning_rate": 4.184172554159793e-05, "logits/chosen": -0.1096363440155983, "logits/rejected": -0.11350283771753311, "logps/chosen": -326.181884765625, "logps/rejected": -463.63043212890625, "loss": 0.4627, "rewards/accuracies": 0.5, "rewards/chosen": -22.61785125732422, "rewards/margins": 8.481831550598145, "rewards/rejected": -31.099681854248047, "step": 814 }, { "epoch": 0.9055555555555556, "grad_norm": 7.369016657321481e-06, "learning_rate": 4.18190838581951e-05, "logits/chosen": -0.21062463521957397, "logits/rejected": -0.19078274071216583, "logps/chosen": -361.1734619140625, "logps/rejected": -690.3353271484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -23.831283569335938, "rewards/margins": 18.803050994873047, "rewards/rejected": -42.634334564208984, "step": 815 }, { "epoch": 0.9066666666666666, "grad_norm": 0.00016922474605962634, "learning_rate": 4.1796416944209136e-05, "logits/chosen": -0.2580575942993164, "logits/rejected": -0.2296864092350006, "logps/chosen": -323.5754699707031, "logps/rejected": -494.98846435546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -17.23879623413086, "rewards/margins": 17.860958099365234, "rewards/rejected": -35.099754333496094, "step": 816 }, { "epoch": 0.9077777777777778, "grad_norm": 2.1050145626068115, "learning_rate": 4.177372483364304e-05, "logits/chosen": -0.332622766494751, "logits/rejected": -0.2893717885017395, "logps/chosen": -206.905029296875, "logps/rejected": -241.18348693847656, "loss": 0.0696, "rewards/accuracies": 1.0, "rewards/chosen": -14.711755752563477, "rewards/margins": 2.6295228004455566, "rewards/rejected": -17.341278076171875, "step": 817 }, { "epoch": 0.9088888888888889, "grad_norm": 0.9017345309257507, "learning_rate": 4.175100756053763e-05, "logits/chosen": -0.48731690645217896, "logits/rejected": -0.46465611457824707, "logps/chosen": -141.43466186523438, "logps/rejected": -228.515869140625, "loss": 0.0211, "rewards/accuracies": 1.0, "rewards/chosen": -8.113912582397461, "rewards/margins": 5.461429595947266, "rewards/rejected": -13.575342178344727, "step": 818 }, { "epoch": 0.91, "grad_norm": 0.6730761528015137, "learning_rate": 4.172826515897146e-05, "logits/chosen": -0.0698300376534462, "logits/rejected": -0.07702431827783585, "logps/chosen": -326.3304443359375, "logps/rejected": -425.53619384765625, "loss": 0.0094, "rewards/accuracies": 1.0, "rewards/chosen": -19.70612144470215, "rewards/margins": 9.422538757324219, "rewards/rejected": -29.128660202026367, "step": 819 }, { "epoch": 0.9111111111111111, "grad_norm": 2.724905014038086, "learning_rate": 4.1705497663060767e-05, "logits/chosen": -0.2197083681821823, "logits/rejected": -0.22195076942443848, "logps/chosen": -187.591064453125, "logps/rejected": -244.860107421875, "loss": 0.065, "rewards/accuracies": 1.0, "rewards/chosen": -10.341282844543457, "rewards/margins": 3.580348491668701, "rewards/rejected": -13.921630859375, "step": 820 }, { "epoch": 0.9122222222222223, "grad_norm": 0.0020186440087854862, "learning_rate": 4.168270510695946e-05, "logits/chosen": -0.07634244114160538, "logits/rejected": -0.08713898062705994, "logps/chosen": -287.903076171875, "logps/rejected": -503.1753845214844, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -18.053447723388672, "rewards/margins": 17.76179313659668, "rewards/rejected": -35.81523895263672, "step": 821 }, { "epoch": 0.9133333333333333, "grad_norm": 0.0037503379862755537, "learning_rate": 4.165988752485901e-05, "logits/chosen": -0.3507292866706848, "logits/rejected": -0.34377098083496094, "logps/chosen": -143.62057495117188, "logps/rejected": -285.7519836425781, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -8.291558265686035, "rewards/margins": 9.362825393676758, "rewards/rejected": -17.65438461303711, "step": 822 }, { "epoch": 0.9144444444444444, "grad_norm": 0.28576746582984924, "learning_rate": 4.163704495098845e-05, "logits/chosen": -0.20838119089603424, "logits/rejected": -0.2103133350610733, "logps/chosen": -226.9012451171875, "logps/rejected": -297.212890625, "loss": 0.0059, "rewards/accuracies": 1.0, "rewards/chosen": -14.27435302734375, "rewards/margins": 5.1877312660217285, "rewards/rejected": -19.46208381652832, "step": 823 }, { "epoch": 0.9155555555555556, "grad_norm": 0.04294663667678833, "learning_rate": 4.161417741961431e-05, "logits/chosen": -0.4415122866630554, "logits/rejected": -0.43608635663986206, "logps/chosen": -115.21173095703125, "logps/rejected": -273.2531433105469, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -5.3216552734375, "rewards/margins": 13.121792793273926, "rewards/rejected": -18.443449020385742, "step": 824 }, { "epoch": 0.9166666666666666, "grad_norm": 5.202506599744083e-06, "learning_rate": 4.159128496504053e-05, "logits/chosen": -0.09922932088375092, "logits/rejected": -0.11073232442140579, "logps/chosen": -182.38052368164062, "logps/rejected": -508.1999816894531, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.201408386230469, "rewards/margins": 23.124868392944336, "rewards/rejected": -33.32627868652344, "step": 825 }, { "epoch": 0.9177777777777778, "grad_norm": 0.9338911771774292, "learning_rate": 4.156836762160848e-05, "logits/chosen": -0.4423384666442871, "logits/rejected": -0.421843945980072, "logps/chosen": -188.36386108398438, "logps/rejected": -348.99981689453125, "loss": 0.0412, "rewards/accuracies": 1.0, "rewards/chosen": -11.978910446166992, "rewards/margins": 12.798425674438477, "rewards/rejected": -24.77733612060547, "step": 826 }, { "epoch": 0.9188888888888889, "grad_norm": 0.11484096199274063, "learning_rate": 4.154542542369683e-05, "logits/chosen": -0.41914206743240356, "logits/rejected": -0.4048241078853607, "logps/chosen": -241.54129028320312, "logps/rejected": -405.71337890625, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": -16.76479721069336, "rewards/margins": 15.429125785827637, "rewards/rejected": -32.19392395019531, "step": 827 }, { "epoch": 0.92, "grad_norm": 0.0043014842085540295, "learning_rate": 4.152245840572153e-05, "logits/chosen": -0.07895079255104065, "logits/rejected": -0.10053456574678421, "logps/chosen": -255.73471069335938, "logps/rejected": -387.27728271484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -17.90955352783203, "rewards/margins": 10.205255508422852, "rewards/rejected": -28.11480712890625, "step": 828 }, { "epoch": 0.9211111111111111, "grad_norm": 0.0009510570671409369, "learning_rate": 4.1499466602135805e-05, "logits/chosen": -0.04117386043071747, "logits/rejected": -0.019284337759017944, "logps/chosen": -319.15472412109375, "logps/rejected": -562.0650634765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -21.12196922302246, "rewards/margins": 18.891002655029297, "rewards/rejected": -40.012969970703125, "step": 829 }, { "epoch": 0.9222222222222223, "grad_norm": 0.00021894625388085842, "learning_rate": 4.147645004743003e-05, "logits/chosen": -0.29517894983291626, "logits/rejected": -0.28960558772087097, "logps/chosen": -224.77996826171875, "logps/rejected": -393.3873596191406, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.237995147705078, "rewards/margins": 12.719832420349121, "rewards/rejected": -26.957828521728516, "step": 830 }, { "epoch": 0.9233333333333333, "grad_norm": 5.095441818237305, "learning_rate": 4.145340877613171e-05, "logits/chosen": -0.623434841632843, "logits/rejected": -0.6192575693130493, "logps/chosen": -113.55785369873047, "logps/rejected": -124.69732666015625, "loss": 0.651, "rewards/accuracies": 0.5, "rewards/chosen": -6.6310014724731445, "rewards/margins": 1.1398286819458008, "rewards/rejected": -7.770830154418945, "step": 831 }, { "epoch": 0.9244444444444444, "grad_norm": 0.4913995862007141, "learning_rate": 4.1430342822805423e-05, "logits/chosen": -0.5038714408874512, "logits/rejected": -0.5031296014785767, "logps/chosen": -181.08425903320312, "logps/rejected": -236.07379150390625, "loss": 0.0117, "rewards/accuracies": 1.0, "rewards/chosen": -11.466792106628418, "rewards/margins": 4.93398380279541, "rewards/rejected": -16.400775909423828, "step": 832 }, { "epoch": 0.9255555555555556, "grad_norm": 6.21803941669441e-09, "learning_rate": 4.1407252222052805e-05, "logits/chosen": -0.3042715787887573, "logits/rejected": -0.2936471998691559, "logps/chosen": -280.8255615234375, "logps/rejected": -588.23779296875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -16.970844268798828, "rewards/margins": 27.837614059448242, "rewards/rejected": -44.80845642089844, "step": 833 }, { "epoch": 0.9266666666666666, "grad_norm": 0.035839300602674484, "learning_rate": 4.1384137008512434e-05, "logits/chosen": -0.01023021712899208, "logits/rejected": 0.0029095634818077087, "logps/chosen": -357.02752685546875, "logps/rejected": -630.128173828125, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -23.93439292907715, "rewards/margins": 21.138038635253906, "rewards/rejected": -45.07243347167969, "step": 834 }, { "epoch": 0.9277777777777778, "grad_norm": 0.0011718458263203502, "learning_rate": 4.136099721685983e-05, "logits/chosen": -0.41980063915252686, "logits/rejected": -0.4014482796192169, "logps/chosen": -247.87139892578125, "logps/rejected": -382.99163818359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -15.524535179138184, "rewards/margins": 11.683311462402344, "rewards/rejected": -27.207847595214844, "step": 835 }, { "epoch": 0.9288888888888889, "grad_norm": 2.0855259895324707, "learning_rate": 4.133783288180735e-05, "logits/chosen": -0.1015416607260704, "logits/rejected": -0.06346246600151062, "logps/chosen": -281.68206787109375, "logps/rejected": -511.65118408203125, "loss": 0.0285, "rewards/accuracies": 1.0, "rewards/chosen": -19.148632049560547, "rewards/margins": 14.600643157958984, "rewards/rejected": -33.74927520751953, "step": 836 }, { "epoch": 0.93, "grad_norm": 0.008554819040000439, "learning_rate": 4.131464403810422e-05, "logits/chosen": -0.3628332316875458, "logits/rejected": -0.3641660809516907, "logps/chosen": -236.76324462890625, "logps/rejected": -342.93475341796875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -16.6512508392334, "rewards/margins": 8.908329010009766, "rewards/rejected": -25.55957794189453, "step": 837 }, { "epoch": 0.9311111111111111, "grad_norm": 0.001635027234442532, "learning_rate": 4.129143072053638e-05, "logits/chosen": -0.2945724129676819, "logits/rejected": -0.284768283367157, "logps/chosen": -155.4898681640625, "logps/rejected": -334.079345703125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.975641250610352, "rewards/margins": 15.3976469039917, "rewards/rejected": -23.373287200927734, "step": 838 }, { "epoch": 0.9322222222222222, "grad_norm": 0.0002922069397754967, "learning_rate": 4.126819296392653e-05, "logits/chosen": -0.4284715950489044, "logits/rejected": -0.39743340015411377, "logps/chosen": -182.65826416015625, "logps/rejected": -338.6444091796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.22705078125, "rewards/margins": 13.233304977416992, "rewards/rejected": -24.460355758666992, "step": 839 }, { "epoch": 0.9333333333333333, "grad_norm": 1.6037148237228394, "learning_rate": 4.1244930803134e-05, "logits/chosen": -0.5057257413864136, "logits/rejected": -0.5018907189369202, "logps/chosen": -111.29670715332031, "logps/rejected": -159.97369384765625, "loss": 0.0636, "rewards/accuracies": 1.0, "rewards/chosen": -6.604384422302246, "rewards/margins": 3.0376780033111572, "rewards/rejected": -9.64206314086914, "step": 840 }, { "epoch": 0.9344444444444444, "grad_norm": 0.0010989236179739237, "learning_rate": 4.122164427305473e-05, "logits/chosen": -0.35032927989959717, "logits/rejected": -0.36038699746131897, "logps/chosen": -216.85330200195312, "logps/rejected": -380.7457275390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.869061470031738, "rewards/margins": 12.385305404663086, "rewards/rejected": -26.25436782836914, "step": 841 }, { "epoch": 0.9355555555555556, "grad_norm": 16.77202033996582, "learning_rate": 4.119833340862123e-05, "logits/chosen": -0.1903669685125351, "logits/rejected": -0.18890824913978577, "logps/chosen": -296.6312561035156, "logps/rejected": -501.381591796875, "loss": 0.2131, "rewards/accuracies": 1.0, "rewards/chosen": -20.64375114440918, "rewards/margins": 17.91857147216797, "rewards/rejected": -38.562320709228516, "step": 842 }, { "epoch": 0.9366666666666666, "grad_norm": 7.21697501759877e-09, "learning_rate": 4.117499824480251e-05, "logits/chosen": -0.45211154222488403, "logits/rejected": -0.4578664302825928, "logps/chosen": -128.47378540039062, "logps/rejected": -476.3241271972656, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.191130638122559, "rewards/margins": 27.740886688232422, "rewards/rejected": -34.9320182800293, "step": 843 }, { "epoch": 0.9377777777777778, "grad_norm": 0.06776688247919083, "learning_rate": 4.115163881660403e-05, "logits/chosen": -0.7319363355636597, "logits/rejected": -0.7370768785476685, "logps/chosen": -91.97711181640625, "logps/rejected": -203.55538940429688, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -4.800385475158691, "rewards/margins": 9.125739097595215, "rewards/rejected": -13.926124572753906, "step": 844 }, { "epoch": 0.9388888888888889, "grad_norm": 4.6131754061207175e-05, "learning_rate": 4.1128255159067665e-05, "logits/chosen": -0.6410982608795166, "logits/rejected": -0.636712372303009, "logps/chosen": -233.14431762695312, "logps/rejected": -444.746337890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.832866668701172, "rewards/margins": 15.260908126831055, "rewards/rejected": -30.093774795532227, "step": 845 }, { "epoch": 0.94, "grad_norm": 0.08915267884731293, "learning_rate": 4.110484730727161e-05, "logits/chosen": -0.25793007016181946, "logits/rejected": -0.2509935796260834, "logps/chosen": -166.52951049804688, "logps/rejected": -265.2695007324219, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -10.961963653564453, "rewards/margins": 6.371007442474365, "rewards/rejected": -17.332969665527344, "step": 846 }, { "epoch": 0.9411111111111111, "grad_norm": 0.22761113941669464, "learning_rate": 4.108141529633036e-05, "logits/chosen": -0.25505903363227844, "logits/rejected": -0.24946464598178864, "logps/chosen": -568.841064453125, "logps/rejected": -631.986083984375, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": -39.57518768310547, "rewards/margins": 8.615489959716797, "rewards/rejected": -48.190673828125, "step": 847 }, { "epoch": 0.9422222222222222, "grad_norm": 0.3115740120410919, "learning_rate": 4.1057959161394674e-05, "logits/chosen": -0.6341114044189453, "logits/rejected": -0.6307196021080017, "logps/chosen": -140.09158325195312, "logps/rejected": -266.92138671875, "loss": 0.0062, "rewards/accuracies": 1.0, "rewards/chosen": -6.559474945068359, "rewards/margins": 7.3131914138793945, "rewards/rejected": -13.872666358947754, "step": 848 }, { "epoch": 0.9433333333333334, "grad_norm": 0.004205223172903061, "learning_rate": 4.1034478937651474e-05, "logits/chosen": -0.3674086034297943, "logits/rejected": -0.3729237914085388, "logps/chosen": -298.4883117675781, "logps/rejected": -589.5206298828125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -19.713104248046875, "rewards/margins": 24.16244888305664, "rewards/rejected": -43.875553131103516, "step": 849 }, { "epoch": 0.9444444444444444, "grad_norm": 0.006258365232497454, "learning_rate": 4.101097466032383e-05, "logits/chosen": -0.518823504447937, "logits/rejected": -0.517522931098938, "logps/chosen": -256.13153076171875, "logps/rejected": -372.9981384277344, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -16.950092315673828, "rewards/margins": 9.537365913391113, "rewards/rejected": -26.487457275390625, "step": 850 }, { "epoch": 0.9455555555555556, "grad_norm": 5.957705974578857, "learning_rate": 4.098744636467089e-05, "logits/chosen": -0.6528604626655579, "logits/rejected": -0.6334058046340942, "logps/chosen": -206.04229736328125, "logps/rejected": -220.28981018066406, "loss": 0.2287, "rewards/accuracies": 1.0, "rewards/chosen": -13.1096830368042, "rewards/margins": 1.3797411918640137, "rewards/rejected": -14.489423751831055, "step": 851 }, { "epoch": 0.9466666666666667, "grad_norm": 0.8735276460647583, "learning_rate": 4.0963894085987843e-05, "logits/chosen": -0.27950185537338257, "logits/rejected": -0.2840229868888855, "logps/chosen": -216.69091796875, "logps/rejected": -380.6740417480469, "loss": 0.0206, "rewards/accuracies": 1.0, "rewards/chosen": -13.140765190124512, "rewards/margins": 13.472846031188965, "rewards/rejected": -26.613611221313477, "step": 852 }, { "epoch": 0.9477777777777778, "grad_norm": 0.3122299611568451, "learning_rate": 4.094031785960585e-05, "logits/chosen": -0.48313474655151367, "logits/rejected": -0.4788661003112793, "logps/chosen": -197.32028198242188, "logps/rejected": -282.9070129394531, "loss": 0.0056, "rewards/accuracies": 1.0, "rewards/chosen": -11.85896110534668, "rewards/margins": 5.562575340270996, "rewards/rejected": -17.421537399291992, "step": 853 }, { "epoch": 0.9488888888888889, "grad_norm": 2.552751064300537, "learning_rate": 4.091671772089198e-05, "logits/chosen": -0.5320582389831543, "logits/rejected": -0.487896203994751, "logps/chosen": -140.73983764648438, "logps/rejected": -279.60546875, "loss": 0.0509, "rewards/accuracies": 1.0, "rewards/chosen": -7.801395416259766, "rewards/margins": 12.30195426940918, "rewards/rejected": -20.103349685668945, "step": 854 }, { "epoch": 0.95, "grad_norm": 5.988253906252794e-05, "learning_rate": 4.089309370524921e-05, "logits/chosen": -0.5366358160972595, "logits/rejected": -0.5814464688301086, "logps/chosen": -205.87451171875, "logps/rejected": -719.197265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.823894500732422, "rewards/margins": 38.18509292602539, "rewards/rejected": -53.00898742675781, "step": 855 }, { "epoch": 0.9511111111111111, "grad_norm": 5.722037167288363e-05, "learning_rate": 4.0869445848116284e-05, "logits/chosen": -0.437539279460907, "logits/rejected": -0.42221954464912415, "logps/chosen": -322.0758361816406, "logps/rejected": -512.692138671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -23.626304626464844, "rewards/margins": 15.553738594055176, "rewards/rejected": -39.1800422668457, "step": 856 }, { "epoch": 0.9522222222222222, "grad_norm": 4.118287086486816, "learning_rate": 4.0845774184967754e-05, "logits/chosen": -0.7076168060302734, "logits/rejected": -0.6675238609313965, "logps/chosen": -157.31219482421875, "logps/rejected": -539.5748901367188, "loss": 0.1262, "rewards/accuracies": 1.0, "rewards/chosen": -11.027571678161621, "rewards/margins": 27.481529235839844, "rewards/rejected": -38.50910186767578, "step": 857 }, { "epoch": 0.9533333333333334, "grad_norm": 7.478187084197998, "learning_rate": 4.0822078751313884e-05, "logits/chosen": -0.49433833360671997, "logits/rejected": -0.4888341426849365, "logps/chosen": -175.41079711914062, "logps/rejected": -272.5142822265625, "loss": 0.334, "rewards/accuracies": 1.0, "rewards/chosen": -10.585180282592773, "rewards/margins": 8.236431121826172, "rewards/rejected": -18.821611404418945, "step": 858 }, { "epoch": 0.9544444444444444, "grad_norm": 5.272284030914307, "learning_rate": 4.079835958270056e-05, "logits/chosen": -0.5949715971946716, "logits/rejected": -0.5766944885253906, "logps/chosen": -461.7208251953125, "logps/rejected": -553.7354125976562, "loss": 0.0871, "rewards/accuracies": 1.0, "rewards/chosen": -34.49949264526367, "rewards/margins": 7.454092979431152, "rewards/rejected": -41.95358657836914, "step": 859 }, { "epoch": 0.9555555555555556, "grad_norm": 0.01225542463362217, "learning_rate": 4.0774616714709316e-05, "logits/chosen": -0.6151982545852661, "logits/rejected": -0.612145185470581, "logps/chosen": -232.866943359375, "logps/rejected": -522.516845703125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -14.67613410949707, "rewards/margins": 24.659746170043945, "rewards/rejected": -39.335880279541016, "step": 860 }, { "epoch": 0.9566666666666667, "grad_norm": 1.2441256558304303e-06, "learning_rate": 4.075085018295719e-05, "logits/chosen": -0.5987961292266846, "logits/rejected": -0.5981524586677551, "logps/chosen": -249.6154327392578, "logps/rejected": -512.1275634765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -18.862913131713867, "rewards/margins": 22.189197540283203, "rewards/rejected": -41.05210876464844, "step": 861 }, { "epoch": 0.9577777777777777, "grad_norm": 0.0015029022470116615, "learning_rate": 4.072706002309676e-05, "logits/chosen": -0.6782752275466919, "logits/rejected": -0.6439812779426575, "logps/chosen": -243.68017578125, "logps/rejected": -366.0345764160156, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -17.412321090698242, "rewards/margins": 10.811285018920898, "rewards/rejected": -28.22360610961914, "step": 862 }, { "epoch": 0.9588888888888889, "grad_norm": 3.098057985305786, "learning_rate": 4.070324627081604e-05, "logits/chosen": -0.7048677206039429, "logits/rejected": -0.7115183472633362, "logps/chosen": -250.94830322265625, "logps/rejected": -300.19366455078125, "loss": 0.1329, "rewards/accuracies": 1.0, "rewards/chosen": -18.45759391784668, "rewards/margins": 4.767515659332275, "rewards/rejected": -23.225109100341797, "step": 863 }, { "epoch": 0.96, "grad_norm": 0.0012098302831873298, "learning_rate": 4.067940896183843e-05, "logits/chosen": -0.5380294322967529, "logits/rejected": -0.5863718390464783, "logps/chosen": -256.4425354003906, "logps/rejected": -393.57708740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -16.848554611206055, "rewards/margins": 11.54560661315918, "rewards/rejected": -28.394161224365234, "step": 864 }, { "epoch": 0.9611111111111111, "grad_norm": 1.2511377334594727, "learning_rate": 4.065554813192266e-05, "logits/chosen": -0.790912389755249, "logits/rejected": -0.795559287071228, "logps/chosen": -288.98309326171875, "logps/rejected": -460.01861572265625, "loss": 0.0305, "rewards/accuracies": 1.0, "rewards/chosen": -22.449756622314453, "rewards/margins": 13.67829418182373, "rewards/rejected": -36.1280517578125, "step": 865 }, { "epoch": 0.9622222222222222, "grad_norm": 6.195525514840483e-08, "learning_rate": 4.063166381686275e-05, "logits/chosen": -0.5780863165855408, "logits/rejected": -0.5428587198257446, "logps/chosen": -364.5535888671875, "logps/rejected": -968.0983276367188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -26.895523071289062, "rewards/margins": 36.787105560302734, "rewards/rejected": -63.68263244628906, "step": 866 }, { "epoch": 0.9633333333333334, "grad_norm": 0.8543556928634644, "learning_rate": 4.0607756052487956e-05, "logits/chosen": -0.5455676317214966, "logits/rejected": -0.5526853203773499, "logps/chosen": -240.4935302734375, "logps/rejected": -324.12591552734375, "loss": 0.0178, "rewards/accuracies": 1.0, "rewards/chosen": -17.973464965820312, "rewards/margins": 6.75145959854126, "rewards/rejected": -24.724925994873047, "step": 867 }, { "epoch": 0.9644444444444444, "grad_norm": 0.005095474887639284, "learning_rate": 4.05838248746627e-05, "logits/chosen": -0.4322633147239685, "logits/rejected": -0.45385605096817017, "logps/chosen": -531.8271484375, "logps/rejected": -657.6192626953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -39.04155349731445, "rewards/margins": 11.952057838439941, "rewards/rejected": -50.99361038208008, "step": 868 }, { "epoch": 0.9655555555555555, "grad_norm": 0.025413954630494118, "learning_rate": 4.055987031928655e-05, "logits/chosen": -0.5362026691436768, "logits/rejected": -0.5258467197418213, "logps/chosen": -414.6763916015625, "logps/rejected": -733.795654296875, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -30.694412231445312, "rewards/margins": 26.054035186767578, "rewards/rejected": -56.74844741821289, "step": 869 }, { "epoch": 0.9666666666666667, "grad_norm": 0.5680553913116455, "learning_rate": 4.053589242229412e-05, "logits/chosen": -0.25116950273513794, "logits/rejected": -0.27983570098876953, "logps/chosen": -616.2606811523438, "logps/rejected": -830.0702514648438, "loss": 0.0042, "rewards/accuracies": 1.0, "rewards/chosen": -48.61688995361328, "rewards/margins": 17.479267120361328, "rewards/rejected": -66.09616088867188, "step": 870 }, { "epoch": 0.9677777777777777, "grad_norm": 81.84339141845703, "learning_rate": 4.051189121965504e-05, "logits/chosen": -0.5646835565567017, "logits/rejected": -0.685867428779602, "logps/chosen": -480.9488525390625, "logps/rejected": -586.5339965820312, "loss": 8.5849, "rewards/accuracies": 0.5, "rewards/chosen": -39.893287658691406, "rewards/margins": 7.530569076538086, "rewards/rejected": -47.423858642578125, "step": 871 }, { "epoch": 0.9688888888888889, "grad_norm": 3.75528926531421e-13, "learning_rate": 4.0487866747373906e-05, "logits/chosen": -0.673292875289917, "logits/rejected": -0.7803962230682373, "logps/chosen": -452.05859375, "logps/rejected": -1153.4501953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -33.78504943847656, "rewards/margins": 54.324058532714844, "rewards/rejected": -88.10911560058594, "step": 872 }, { "epoch": 0.97, "grad_norm": 2.1651831616509298e-07, "learning_rate": 4.046381904149024e-05, "logits/chosen": -0.08462420850992203, "logits/rejected": -0.06040126830339432, "logps/chosen": -351.4764404296875, "logps/rejected": -640.509521484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -24.367534637451172, "rewards/margins": 22.367345809936523, "rewards/rejected": -46.73487854003906, "step": 873 }, { "epoch": 0.9711111111111111, "grad_norm": 14.4449462890625, "learning_rate": 4.0439748138078374e-05, "logits/chosen": -0.523148238658905, "logits/rejected": -0.4889718294143677, "logps/chosen": -368.2580871582031, "logps/rejected": -391.65057373046875, "loss": 0.1313, "rewards/accuracies": 1.0, "rewards/chosen": -27.378585815429688, "rewards/margins": 3.204446315765381, "rewards/rejected": -30.583032608032227, "step": 874 }, { "epoch": 0.9722222222222222, "grad_norm": 1.423081913577141e-19, "learning_rate": 4.041565407324749e-05, "logits/chosen": -0.24547480046749115, "logits/rejected": -0.2566688060760498, "logps/chosen": -556.526123046875, "logps/rejected": -1137.5111083984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -40.616546630859375, "rewards/margins": 52.37699890136719, "rewards/rejected": -92.99354553222656, "step": 875 }, { "epoch": 0.9733333333333334, "grad_norm": 23.629302978515625, "learning_rate": 4.039153688314145e-05, "logits/chosen": -0.44840896129608154, "logits/rejected": -0.46981340646743774, "logps/chosen": -377.1634216308594, "logps/rejected": -501.51678466796875, "loss": 0.3105, "rewards/accuracies": 1.0, "rewards/chosen": -28.51364517211914, "rewards/margins": 9.22217082977295, "rewards/rejected": -37.735816955566406, "step": 876 }, { "epoch": 0.9744444444444444, "grad_norm": 0.439152330160141, "learning_rate": 4.0367396603938894e-05, "logits/chosen": -0.44510674476623535, "logits/rejected": -0.4150109887123108, "logps/chosen": -374.81500244140625, "logps/rejected": -457.3393859863281, "loss": 0.0045, "rewards/accuracies": 1.0, "rewards/chosen": -28.040679931640625, "rewards/margins": 7.059308052062988, "rewards/rejected": -35.09999084472656, "step": 877 }, { "epoch": 0.9755555555555555, "grad_norm": 91.39579772949219, "learning_rate": 4.034323327185302e-05, "logits/chosen": -0.6360231637954712, "logits/rejected": -0.6108345985412598, "logps/chosen": -597.3301391601562, "logps/rejected": -612.3858032226562, "loss": 3.0924, "rewards/accuracies": 0.5, "rewards/chosen": -48.3536376953125, "rewards/margins": 1.0566930770874023, "rewards/rejected": -49.41033172607422, "step": 878 }, { "epoch": 0.9766666666666667, "grad_norm": 9.227576083503664e-05, "learning_rate": 4.031904692313165e-05, "logits/chosen": -0.7037904262542725, "logits/rejected": -0.7253705263137817, "logps/chosen": -316.6020812988281, "logps/rejected": -661.302978515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -24.78540802001953, "rewards/margins": 27.575769424438477, "rewards/rejected": -52.36117935180664, "step": 879 }, { "epoch": 0.9777777777777777, "grad_norm": 3.0408880710601807, "learning_rate": 4.0294837594057125e-05, "logits/chosen": -0.5571787357330322, "logits/rejected": -0.5324241518974304, "logps/chosen": -415.0831298828125, "logps/rejected": -484.4784240722656, "loss": 0.0769, "rewards/accuracies": 1.0, "rewards/chosen": -32.97025680541992, "rewards/margins": 5.9685773849487305, "rewards/rejected": -38.93883514404297, "step": 880 }, { "epoch": 0.9788888888888889, "grad_norm": 0.0004675827512983233, "learning_rate": 4.027060532094624e-05, "logits/chosen": -0.5412706136703491, "logits/rejected": -0.5506626963615417, "logps/chosen": -346.26690673828125, "logps/rejected": -604.60693359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -25.765474319458008, "rewards/margins": 21.91321563720703, "rewards/rejected": -47.678688049316406, "step": 881 }, { "epoch": 0.98, "grad_norm": 2.2446513980867167e-08, "learning_rate": 4.024635014015023e-05, "logits/chosen": -0.4411046504974365, "logits/rejected": -0.41623610258102417, "logps/chosen": -429.7698974609375, "logps/rejected": -648.415283203125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -29.497844696044922, "rewards/margins": 22.41277503967285, "rewards/rejected": -51.910621643066406, "step": 882 }, { "epoch": 0.9811111111111112, "grad_norm": 0.005037570837885141, "learning_rate": 4.02220720880547e-05, "logits/chosen": -0.5705093145370483, "logits/rejected": -0.5530703067779541, "logps/chosen": -306.124267578125, "logps/rejected": -435.7098388671875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -23.19715118408203, "rewards/margins": 11.362543106079102, "rewards/rejected": -34.559696197509766, "step": 883 }, { "epoch": 0.9822222222222222, "grad_norm": 0.006006752140820026, "learning_rate": 4.019777120107954e-05, "logits/chosen": -0.7868660092353821, "logits/rejected": -0.7908229827880859, "logps/chosen": -337.5703430175781, "logps/rejected": -488.30572509765625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -27.31743621826172, "rewards/margins": 9.543618202209473, "rewards/rejected": -36.861053466796875, "step": 884 }, { "epoch": 0.9833333333333333, "grad_norm": 77.48323059082031, "learning_rate": 4.0173447515678916e-05, "logits/chosen": -0.7705494165420532, "logits/rejected": -0.753944456577301, "logps/chosen": -371.38275146484375, "logps/rejected": -368.0663146972656, "loss": 2.5248, "rewards/accuracies": 0.5, "rewards/chosen": -29.631444931030273, "rewards/margins": -0.2567453384399414, "rewards/rejected": -29.374698638916016, "step": 885 }, { "epoch": 0.9844444444444445, "grad_norm": 4.626312147593126e-05, "learning_rate": 4.0149101068341194e-05, "logits/chosen": -0.4372534155845642, "logits/rejected": -0.4464263319969177, "logps/chosen": -498.6544189453125, "logps/rejected": -856.5299682617188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -37.61430740356445, "rewards/margins": 28.95107650756836, "rewards/rejected": -66.56538391113281, "step": 886 }, { "epoch": 0.9855555555555555, "grad_norm": 2.535136809456162e-05, "learning_rate": 4.0124731895588864e-05, "logits/chosen": -0.4341237545013428, "logits/rejected": -0.472345769405365, "logps/chosen": -452.9433898925781, "logps/rejected": -780.0538330078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -32.81045150756836, "rewards/margins": 25.74209213256836, "rewards/rejected": -58.55254364013672, "step": 887 }, { "epoch": 0.9866666666666667, "grad_norm": 83.91105651855469, "learning_rate": 4.0100340033978526e-05, "logits/chosen": -0.23460160195827484, "logits/rejected": -0.21008001267910004, "logps/chosen": -596.0387573242188, "logps/rejected": -532.008544921875, "loss": 2.9224, "rewards/accuracies": 0.5, "rewards/chosen": -42.93225860595703, "rewards/margins": -1.377823829650879, "rewards/rejected": -41.55443572998047, "step": 888 }, { "epoch": 0.9877777777777778, "grad_norm": 0.0019643933046609163, "learning_rate": 4.007592552010081e-05, "logits/chosen": -0.5182616710662842, "logits/rejected": -0.526436984539032, "logps/chosen": -315.798828125, "logps/rejected": -806.5570068359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -23.895038604736328, "rewards/margins": 39.27623748779297, "rewards/rejected": -63.17127990722656, "step": 889 }, { "epoch": 0.9888888888888889, "grad_norm": 7.483355298279482e-13, "learning_rate": 4.005148839058034e-05, "logits/chosen": -0.46029484272003174, "logits/rejected": -0.4832759499549866, "logps/chosen": -407.7744140625, "logps/rejected": -842.7235107421875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -32.572994232177734, "rewards/margins": 34.31005859375, "rewards/rejected": -66.88304901123047, "step": 890 }, { "epoch": 0.99, "grad_norm": 0.011902363039553165, "learning_rate": 4.002702868207563e-05, "logits/chosen": -0.4196403920650482, "logits/rejected": -0.4109646677970886, "logps/chosen": -332.9317932128906, "logps/rejected": -525.528076171875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -24.366451263427734, "rewards/margins": 16.27066421508789, "rewards/rejected": -40.637115478515625, "step": 891 }, { "epoch": 0.9911111111111112, "grad_norm": 0.03494444116950035, "learning_rate": 4.000254643127911e-05, "logits/chosen": -0.745226263999939, "logits/rejected": -0.7543327808380127, "logps/chosen": -345.64019775390625, "logps/rejected": -674.4775390625, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -26.7427978515625, "rewards/margins": 23.08111572265625, "rewards/rejected": -49.82391357421875, "step": 892 }, { "epoch": 0.9922222222222222, "grad_norm": 35.28209686279297, "learning_rate": 3.9978041674917e-05, "logits/chosen": -0.7366160154342651, "logits/rejected": -0.7383006811141968, "logps/chosen": -223.73556518554688, "logps/rejected": -259.2276611328125, "loss": 2.3004, "rewards/accuracies": 0.5, "rewards/chosen": -17.608409881591797, "rewards/margins": 2.5709805488586426, "rewards/rejected": -20.17938995361328, "step": 893 }, { "epoch": 0.9933333333333333, "grad_norm": 2.859135150856673e-10, "learning_rate": 3.995351444974929e-05, "logits/chosen": -0.41895899176597595, "logits/rejected": -0.425022691488266, "logps/chosen": -512.2947998046875, "logps/rejected": -1002.1484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -42.61608123779297, "rewards/margins": 42.14800262451172, "rewards/rejected": -84.76408386230469, "step": 894 }, { "epoch": 0.9944444444444445, "grad_norm": 0.00032989089959301054, "learning_rate": 3.9928964792569655e-05, "logits/chosen": -0.5683507919311523, "logits/rejected": -0.5588250160217285, "logps/chosen": -238.22157287597656, "logps/rejected": -391.972900390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -18.47706413269043, "rewards/margins": 12.580259323120117, "rewards/rejected": -31.057323455810547, "step": 895 }, { "epoch": 0.9955555555555555, "grad_norm": 0.08311721682548523, "learning_rate": 3.990439274020546e-05, "logits/chosen": -0.4722200632095337, "logits/rejected": -0.4909358024597168, "logps/chosen": -396.04522705078125, "logps/rejected": -503.97076416015625, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -31.74181365966797, "rewards/margins": 9.21958065032959, "rewards/rejected": -40.961395263671875, "step": 896 }, { "epoch": 0.9966666666666667, "grad_norm": 14.3080472946167, "learning_rate": 3.987979832951763e-05, "logits/chosen": -0.5250222682952881, "logits/rejected": -0.5287713408470154, "logps/chosen": -238.20111083984375, "logps/rejected": -262.9792785644531, "loss": 0.9735, "rewards/accuracies": 0.5, "rewards/chosen": -17.514789581298828, "rewards/margins": 1.8076845407485962, "rewards/rejected": -19.322473526000977, "step": 897 }, { "epoch": 0.9977777777777778, "grad_norm": 0.00022033360437490046, "learning_rate": 3.985518159740065e-05, "logits/chosen": -0.5261914134025574, "logits/rejected": -0.5172275304794312, "logps/chosen": -227.21995544433594, "logps/rejected": -395.60198974609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -15.301885604858398, "rewards/margins": 13.760112762451172, "rewards/rejected": -29.061996459960938, "step": 898 }, { "epoch": 0.9988888888888889, "grad_norm": 16.672941207885742, "learning_rate": 3.983054258078248e-05, "logits/chosen": -0.727085292339325, "logits/rejected": -0.7253548502922058, "logps/chosen": -150.23696899414062, "logps/rejected": -163.93702697753906, "loss": 0.9315, "rewards/accuracies": 0.5, "rewards/chosen": -9.849408149719238, "rewards/margins": 1.6170392036437988, "rewards/rejected": -11.466447830200195, "step": 899 }, { "epoch": 1.0, "grad_norm": 0.03417162597179413, "learning_rate": 3.9805881316624506e-05, "logits/chosen": -0.4812771677970886, "logits/rejected": -0.4824010729789734, "logps/chosen": -428.9564208984375, "logps/rejected": -576.14501953125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -33.388946533203125, "rewards/margins": 10.847587585449219, "rewards/rejected": -44.236534118652344, "step": 900 }, { "epoch": 1.001111111111111, "grad_norm": 8.596412953920662e-06, "learning_rate": 3.97811978419215e-05, "logits/chosen": -0.4302983283996582, "logits/rejected": -0.41553372144699097, "logps/chosen": -382.187255859375, "logps/rejected": -618.1383056640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -28.777854919433594, "rewards/margins": 18.233623504638672, "rewards/rejected": -47.011478424072266, "step": 901 }, { "epoch": 1.0022222222222221, "grad_norm": 0.0004428688553161919, "learning_rate": 3.975649219370155e-05, "logits/chosen": -0.6594560146331787, "logits/rejected": -0.6469042301177979, "logps/chosen": -322.94342041015625, "logps/rejected": -497.04534912109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -24.805633544921875, "rewards/margins": 16.30156707763672, "rewards/rejected": -41.107200622558594, "step": 902 }, { "epoch": 1.0033333333333334, "grad_norm": 0.0032454095780849457, "learning_rate": 3.9731764409025996e-05, "logits/chosen": -0.5758850574493408, "logits/rejected": -0.5588394403457642, "logps/chosen": -221.94134521484375, "logps/rejected": -465.10406494140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -15.87556266784668, "rewards/margins": 18.234539031982422, "rewards/rejected": -34.11009979248047, "step": 903 }, { "epoch": 1.0044444444444445, "grad_norm": 0.004079330246895552, "learning_rate": 3.9707014524989404e-05, "logits/chosen": -0.2932683527469635, "logits/rejected": -0.3122369050979614, "logps/chosen": -320.0010070800781, "logps/rejected": -520.462158203125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -22.64928436279297, "rewards/margins": 16.553970336914062, "rewards/rejected": -39.20325469970703, "step": 904 }, { "epoch": 1.0055555555555555, "grad_norm": 0.01858220435678959, "learning_rate": 3.968224257871948e-05, "logits/chosen": -0.49318501353263855, "logits/rejected": -0.49275946617126465, "logps/chosen": -368.0931396484375, "logps/rejected": -468.572265625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -27.489028930664062, "rewards/margins": 8.858509063720703, "rewards/rejected": -36.347537994384766, "step": 905 }, { "epoch": 1.0066666666666666, "grad_norm": 0.022100891917943954, "learning_rate": 3.9657448607377e-05, "logits/chosen": -0.14612430334091187, "logits/rejected": -0.16030049324035645, "logps/chosen": -582.9677734375, "logps/rejected": -698.5653076171875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -39.124244689941406, "rewards/margins": 13.14334487915039, "rewards/rejected": -52.2675895690918, "step": 906 }, { "epoch": 1.0077777777777779, "grad_norm": 0.0035443860106170177, "learning_rate": 3.963263264815584e-05, "logits/chosen": -0.3555017113685608, "logits/rejected": -0.35324737429618835, "logps/chosen": -304.5107421875, "logps/rejected": -452.8131103515625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -21.59927749633789, "rewards/margins": 12.420950889587402, "rewards/rejected": -34.02022933959961, "step": 907 }, { "epoch": 1.008888888888889, "grad_norm": 0.06847555190324783, "learning_rate": 3.960779473828281e-05, "logits/chosen": -0.43775665760040283, "logits/rejected": -0.46236512064933777, "logps/chosen": -391.775146484375, "logps/rejected": -607.8811645507812, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -28.484710693359375, "rewards/margins": 17.2543888092041, "rewards/rejected": -45.739097595214844, "step": 908 }, { "epoch": 1.01, "grad_norm": 0.009908195585012436, "learning_rate": 3.9582934915017665e-05, "logits/chosen": -0.4860020577907562, "logits/rejected": -0.46611812710762024, "logps/chosen": -310.30670166015625, "logps/rejected": -488.93902587890625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -22.526952743530273, "rewards/margins": 13.88932991027832, "rewards/rejected": -36.416282653808594, "step": 909 }, { "epoch": 1.011111111111111, "grad_norm": 0.13028304278850555, "learning_rate": 3.955805321565304e-05, "logits/chosen": -0.6374976634979248, "logits/rejected": -0.6452587842941284, "logps/chosen": -262.89154052734375, "logps/rejected": -311.3592529296875, "loss": 0.3468, "rewards/accuracies": 0.5, "rewards/chosen": -19.076618194580078, "rewards/margins": 3.7609291076660156, "rewards/rejected": -22.837547302246094, "step": 910 }, { "epoch": 1.0122222222222221, "grad_norm": 3.974650098825805e-05, "learning_rate": 3.953314967751437e-05, "logits/chosen": -0.411349356174469, "logits/rejected": -0.43020451068878174, "logps/chosen": -288.9849853515625, "logps/rejected": -453.48663330078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -18.21100425720215, "rewards/margins": 15.128791809082031, "rewards/rejected": -33.33979797363281, "step": 911 }, { "epoch": 1.0133333333333334, "grad_norm": 3.546863555908203, "learning_rate": 3.950822433795988e-05, "logits/chosen": -0.519774854183197, "logits/rejected": -0.5156953930854797, "logps/chosen": -280.4649963378906, "logps/rejected": -378.35614013671875, "loss": 0.0359, "rewards/accuracies": 1.0, "rewards/chosen": -20.01573944091797, "rewards/margins": 6.161557197570801, "rewards/rejected": -26.177295684814453, "step": 912 }, { "epoch": 1.0144444444444445, "grad_norm": 0.2822834551334381, "learning_rate": 3.948327723438045e-05, "logits/chosen": -0.6356294751167297, "logits/rejected": -0.6251711249351501, "logps/chosen": -97.73452758789062, "logps/rejected": -172.1810302734375, "loss": 0.0073, "rewards/accuracies": 1.0, "rewards/chosen": -5.2346038818359375, "rewards/margins": 5.977433204650879, "rewards/rejected": -11.212038040161133, "step": 913 }, { "epoch": 1.0155555555555555, "grad_norm": 0.0873795822262764, "learning_rate": 3.945830840419966e-05, "logits/chosen": -0.5687836408615112, "logits/rejected": -0.5737097263336182, "logps/chosen": -172.47207641601562, "logps/rejected": -398.86865234375, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": -11.1439208984375, "rewards/margins": 17.99553108215332, "rewards/rejected": -29.13945198059082, "step": 914 }, { "epoch": 1.0166666666666666, "grad_norm": 0.0018127610674127936, "learning_rate": 3.9433317884873664e-05, "logits/chosen": -0.5654761791229248, "logits/rejected": -0.5648948550224304, "logps/chosen": -173.7194366455078, "logps/rejected": -300.2122497558594, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.47634506225586, "rewards/margins": 10.577975273132324, "rewards/rejected": -20.0543212890625, "step": 915 }, { "epoch": 1.0177777777777777, "grad_norm": 3.291143002570607e-05, "learning_rate": 3.940830571389114e-05, "logits/chosen": -0.22308768332004547, "logits/rejected": -0.20064327120780945, "logps/chosen": -334.9690246582031, "logps/rejected": -515.4559326171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -23.38535499572754, "rewards/margins": 15.939664840698242, "rewards/rejected": -39.32501983642578, "step": 916 }, { "epoch": 1.018888888888889, "grad_norm": 0.014692772179841995, "learning_rate": 3.9383271928773246e-05, "logits/chosen": -0.502543568611145, "logits/rejected": -0.4876163601875305, "logps/chosen": -150.70980834960938, "logps/rejected": -265.01007080078125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -9.525039672851562, "rewards/margins": 9.312721252441406, "rewards/rejected": -18.83776092529297, "step": 917 }, { "epoch": 1.02, "grad_norm": 1.236972689628601, "learning_rate": 3.935821656707359e-05, "logits/chosen": -0.2926425337791443, "logits/rejected": -0.271138072013855, "logps/chosen": -141.18357849121094, "logps/rejected": -227.7574462890625, "loss": 0.0431, "rewards/accuracies": 1.0, "rewards/chosen": -10.34594440460205, "rewards/margins": 6.542619705200195, "rewards/rejected": -16.888565063476562, "step": 918 }, { "epoch": 1.021111111111111, "grad_norm": 3.466879367828369, "learning_rate": 3.9333139666378124e-05, "logits/chosen": -0.3055206835269928, "logits/rejected": -0.31556788086891174, "logps/chosen": -326.92242431640625, "logps/rejected": -544.6077880859375, "loss": 0.245, "rewards/accuracies": 1.0, "rewards/chosen": -22.417665481567383, "rewards/margins": 19.866064071655273, "rewards/rejected": -42.283729553222656, "step": 919 }, { "epoch": 1.0222222222222221, "grad_norm": 0.18530739843845367, "learning_rate": 3.930804126430513e-05, "logits/chosen": -0.5506863594055176, "logits/rejected": -0.5499393343925476, "logps/chosen": -171.96542358398438, "logps/rejected": -227.59698486328125, "loss": 0.0041, "rewards/accuracies": 1.0, "rewards/chosen": -10.136667251586914, "rewards/margins": 5.540598392486572, "rewards/rejected": -15.677265167236328, "step": 920 }, { "epoch": 1.0233333333333334, "grad_norm": 1.2339848279953003, "learning_rate": 3.92829213985051e-05, "logits/chosen": -0.5240741968154907, "logits/rejected": -0.530502438545227, "logps/chosen": -276.77593994140625, "logps/rejected": -366.56298828125, "loss": 0.0709, "rewards/accuracies": 1.0, "rewards/chosen": -20.599742889404297, "rewards/margins": 8.725889205932617, "rewards/rejected": -29.325634002685547, "step": 921 }, { "epoch": 1.0244444444444445, "grad_norm": 0.15173646807670593, "learning_rate": 3.925778010666079e-05, "logits/chosen": -0.5628122091293335, "logits/rejected": -0.5698325634002686, "logps/chosen": -282.827392578125, "logps/rejected": -362.6696472167969, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": -21.346435546875, "rewards/margins": 7.183531761169434, "rewards/rejected": -28.52996826171875, "step": 922 }, { "epoch": 1.0255555555555556, "grad_norm": 45.02131271362305, "learning_rate": 3.923261742648704e-05, "logits/chosen": -0.6656967401504517, "logits/rejected": -0.7603224515914917, "logps/chosen": -226.78602600097656, "logps/rejected": -107.26272583007812, "loss": 10.7058, "rewards/accuracies": 0.0, "rewards/chosen": -16.22378921508789, "rewards/margins": -10.359254837036133, "rewards/rejected": -5.864533424377441, "step": 923 }, { "epoch": 1.0266666666666666, "grad_norm": 0.0788012146949768, "learning_rate": 3.920743339573081e-05, "logits/chosen": -0.3935435116291046, "logits/rejected": -0.3944547176361084, "logps/chosen": -284.4685363769531, "logps/rejected": -446.14111328125, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -19.086687088012695, "rewards/margins": 12.346888542175293, "rewards/rejected": -31.433576583862305, "step": 924 }, { "epoch": 1.0277777777777777, "grad_norm": 0.053066011518239975, "learning_rate": 3.918222805217108e-05, "logits/chosen": -0.25461870431900024, "logits/rejected": -0.2605401277542114, "logps/chosen": -295.75909423828125, "logps/rejected": -565.1690063476562, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -19.29279327392578, "rewards/margins": 23.091487884521484, "rewards/rejected": -42.384281158447266, "step": 925 }, { "epoch": 1.028888888888889, "grad_norm": 0.008342457935214043, "learning_rate": 3.915700143361878e-05, "logits/chosen": -0.3807195723056793, "logits/rejected": -0.3787826895713806, "logps/chosen": -287.4904479980469, "logps/rejected": -410.4898681640625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -21.222179412841797, "rewards/margins": 9.423833847045898, "rewards/rejected": -30.646013259887695, "step": 926 }, { "epoch": 1.03, "grad_norm": 5.489984512329102, "learning_rate": 3.91317535779168e-05, "logits/chosen": -0.43028950691223145, "logits/rejected": -0.441083699464798, "logps/chosen": -179.49996948242188, "logps/rejected": -214.1118927001953, "loss": 0.2824, "rewards/accuracies": 1.0, "rewards/chosen": -12.71615219116211, "rewards/margins": 3.492063045501709, "rewards/rejected": -16.208215713500977, "step": 927 }, { "epoch": 1.031111111111111, "grad_norm": 0.042144663631916046, "learning_rate": 3.9106484522939844e-05, "logits/chosen": -0.24042367935180664, "logits/rejected": -0.24052855372428894, "logps/chosen": -152.32479858398438, "logps/rejected": -296.7701416015625, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -8.79398250579834, "rewards/margins": 9.00532341003418, "rewards/rejected": -17.799306869506836, "step": 928 }, { "epoch": 1.0322222222222222, "grad_norm": 0.1188407763838768, "learning_rate": 3.9081194306594436e-05, "logits/chosen": -0.09866663813591003, "logits/rejected": -0.08532414585351944, "logps/chosen": -190.06271362304688, "logps/rejected": -267.0174865722656, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -12.262956619262695, "rewards/margins": 6.967530250549316, "rewards/rejected": -19.230487823486328, "step": 929 }, { "epoch": 1.0333333333333334, "grad_norm": 10.708698272705078, "learning_rate": 3.905588296681886e-05, "logits/chosen": -0.22468788921833038, "logits/rejected": -0.22634494304656982, "logps/chosen": -239.54434204101562, "logps/rejected": -396.21136474609375, "loss": 0.118, "rewards/accuracies": 1.0, "rewards/chosen": -15.684968948364258, "rewards/margins": 10.109857559204102, "rewards/rejected": -25.79482650756836, "step": 930 }, { "epoch": 1.0344444444444445, "grad_norm": 2.844705448978857e-08, "learning_rate": 3.903055054158304e-05, "logits/chosen": -0.4240577220916748, "logits/rejected": -0.4148569107055664, "logps/chosen": -268.82037353515625, "logps/rejected": -668.672119140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -19.303058624267578, "rewards/margins": 28.022449493408203, "rewards/rejected": -47.32550811767578, "step": 931 }, { "epoch": 1.0355555555555556, "grad_norm": 0.008080322295427322, "learning_rate": 3.90051970688886e-05, "logits/chosen": -0.15684843063354492, "logits/rejected": -0.15358754992485046, "logps/chosen": -218.92813110351562, "logps/rejected": -331.85687255859375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -14.280792236328125, "rewards/margins": 9.197572708129883, "rewards/rejected": -23.478364944458008, "step": 932 }, { "epoch": 1.0366666666666666, "grad_norm": 0.0993359312415123, "learning_rate": 3.897982258676867e-05, "logits/chosen": -0.3533812165260315, "logits/rejected": -0.3444604277610779, "logps/chosen": -139.21636962890625, "logps/rejected": -268.55743408203125, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": -8.178659439086914, "rewards/margins": 10.829458236694336, "rewards/rejected": -19.00811767578125, "step": 933 }, { "epoch": 1.0377777777777777, "grad_norm": 0.0072511970065534115, "learning_rate": 3.895442713328794e-05, "logits/chosen": -0.2601079046726227, "logits/rejected": -0.27186137437820435, "logps/chosen": -259.559326171875, "logps/rejected": -368.9036560058594, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -16.957366943359375, "rewards/margins": 9.083166122436523, "rewards/rejected": -26.0405330657959, "step": 934 }, { "epoch": 1.038888888888889, "grad_norm": 6.86876106262207, "learning_rate": 3.892901074654255e-05, "logits/chosen": -0.49105104804039, "logits/rejected": -0.48179829120635986, "logps/chosen": -170.3349151611328, "logps/rejected": -257.77215576171875, "loss": 0.3641, "rewards/accuracies": 0.5, "rewards/chosen": -10.42505931854248, "rewards/margins": 6.975379467010498, "rewards/rejected": -17.40043830871582, "step": 935 }, { "epoch": 1.04, "grad_norm": 0.018836772069334984, "learning_rate": 3.890357346466001e-05, "logits/chosen": 0.06595972180366516, "logits/rejected": 0.04046795889735222, "logps/chosen": -316.6522216796875, "logps/rejected": -447.75140380859375, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -20.586246490478516, "rewards/margins": 10.545844078063965, "rewards/rejected": -31.132089614868164, "step": 936 }, { "epoch": 1.041111111111111, "grad_norm": 0.32837846875190735, "learning_rate": 3.887811532579925e-05, "logits/chosen": -0.6102510690689087, "logits/rejected": -0.6038616895675659, "logps/chosen": -144.3401336669922, "logps/rejected": -230.1337127685547, "loss": 0.013, "rewards/accuracies": 1.0, "rewards/chosen": -9.171529769897461, "rewards/margins": 8.094705581665039, "rewards/rejected": -17.2662353515625, "step": 937 }, { "epoch": 1.0422222222222222, "grad_norm": 0.32814013957977295, "learning_rate": 3.8852636368150395e-05, "logits/chosen": -0.5603848695755005, "logits/rejected": -0.5684846043586731, "logps/chosen": -106.61015319824219, "logps/rejected": -182.78660583496094, "loss": 0.0125, "rewards/accuracies": 1.0, "rewards/chosen": -6.702716827392578, "rewards/margins": 5.566933631896973, "rewards/rejected": -12.26965045928955, "step": 938 }, { "epoch": 1.0433333333333334, "grad_norm": 3.6683679809357272e-06, "learning_rate": 3.882713662993487e-05, "logits/chosen": -0.041283074766397476, "logits/rejected": -0.03436620160937309, "logps/chosen": -370.14434814453125, "logps/rejected": -626.0197143554688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -25.662147521972656, "rewards/margins": 19.131149291992188, "rewards/rejected": -44.793296813964844, "step": 939 }, { "epoch": 1.0444444444444445, "grad_norm": 0.9936245083808899, "learning_rate": 3.880161614940525e-05, "logits/chosen": -0.40580376982688904, "logits/rejected": -0.4071115255355835, "logps/chosen": -117.72978210449219, "logps/rejected": -174.73666381835938, "loss": 0.0276, "rewards/accuracies": 1.0, "rewards/chosen": -7.300490379333496, "rewards/margins": 4.806122779846191, "rewards/rejected": -12.106613159179688, "step": 940 }, { "epoch": 1.0455555555555556, "grad_norm": 0.08242514729499817, "learning_rate": 3.877607496484522e-05, "logits/chosen": -0.27350395917892456, "logits/rejected": -0.27187687158584595, "logps/chosen": -293.63775634765625, "logps/rejected": -436.8634033203125, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -20.709640502929688, "rewards/margins": 9.882118225097656, "rewards/rejected": -30.591758728027344, "step": 941 }, { "epoch": 1.0466666666666666, "grad_norm": 0.12706133723258972, "learning_rate": 3.875051311456953e-05, "logits/chosen": -0.1300460249185562, "logits/rejected": -0.12787173688411713, "logps/chosen": -397.76123046875, "logps/rejected": -473.1900939941406, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -28.923763275146484, "rewards/margins": 7.032456398010254, "rewards/rejected": -35.95621871948242, "step": 942 }, { "epoch": 1.0477777777777777, "grad_norm": 0.08942333608865738, "learning_rate": 3.872493063692393e-05, "logits/chosen": -0.6200021505355835, "logits/rejected": -0.6393814086914062, "logps/chosen": -118.64346313476562, "logps/rejected": -218.40579223632812, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": -7.0671820640563965, "rewards/margins": 7.9869489669799805, "rewards/rejected": -15.054130554199219, "step": 943 }, { "epoch": 1.048888888888889, "grad_norm": 5.856019924976863e-05, "learning_rate": 3.869932757028513e-05, "logits/chosen": -0.4583469331264496, "logits/rejected": -0.4734276235103607, "logps/chosen": -284.92242431640625, "logps/rejected": -496.6825256347656, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -19.901275634765625, "rewards/margins": 17.083541870117188, "rewards/rejected": -36.98481750488281, "step": 944 }, { "epoch": 1.05, "grad_norm": 0.035386890172958374, "learning_rate": 3.867370395306068e-05, "logits/chosen": -0.32357004284858704, "logits/rejected": -0.32226842641830444, "logps/chosen": -330.9620361328125, "logps/rejected": -433.4453125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -22.451763153076172, "rewards/margins": 11.82230281829834, "rewards/rejected": -34.27406692504883, "step": 945 }, { "epoch": 1.051111111111111, "grad_norm": 12.391487121582031, "learning_rate": 3.8648059823689e-05, "logits/chosen": -0.3658633232116699, "logits/rejected": -0.36702531576156616, "logps/chosen": -308.8013610839844, "logps/rejected": -355.25640869140625, "loss": 0.3399, "rewards/accuracies": 1.0, "rewards/chosen": -20.62628936767578, "rewards/margins": 5.269684314727783, "rewards/rejected": -25.895973205566406, "step": 946 }, { "epoch": 1.0522222222222222, "grad_norm": 4.866115093231201, "learning_rate": 3.862239522063927e-05, "logits/chosen": -0.22730350494384766, "logits/rejected": -0.21834444999694824, "logps/chosen": -411.12518310546875, "logps/rejected": -490.9923095703125, "loss": 0.0554, "rewards/accuracies": 1.0, "rewards/chosen": -29.1153507232666, "rewards/margins": 7.380949020385742, "rewards/rejected": -36.496299743652344, "step": 947 }, { "epoch": 1.0533333333333332, "grad_norm": 56.479305267333984, "learning_rate": 3.859671018241139e-05, "logits/chosen": -0.4497927725315094, "logits/rejected": -0.43037670850753784, "logps/chosen": -424.4516906738281, "logps/rejected": -428.0216064453125, "loss": 1.979, "rewards/accuracies": 0.5, "rewards/chosen": -31.333744049072266, "rewards/margins": 2.000823974609375, "rewards/rejected": -33.33456802368164, "step": 948 }, { "epoch": 1.0544444444444445, "grad_norm": 22.847755432128906, "learning_rate": 3.85710047475359e-05, "logits/chosen": -0.7444285154342651, "logits/rejected": -0.700193464756012, "logps/chosen": -318.3692626953125, "logps/rejected": -430.8302001953125, "loss": 0.7176, "rewards/accuracies": 0.5, "rewards/chosen": -21.172607421875, "rewards/margins": 10.642585754394531, "rewards/rejected": -31.815195083618164, "step": 949 }, { "epoch": 1.0555555555555556, "grad_norm": 53.986488342285156, "learning_rate": 3.8545278954573936e-05, "logits/chosen": -0.3303492069244385, "logits/rejected": -0.3226044774055481, "logps/chosen": -189.35882568359375, "logps/rejected": -346.8812561035156, "loss": 1.6082, "rewards/accuracies": 0.5, "rewards/chosen": -12.620914459228516, "rewards/margins": 11.72430419921875, "rewards/rejected": -24.345218658447266, "step": 950 }, { "epoch": 1.0566666666666666, "grad_norm": 0.0022972405422478914, "learning_rate": 3.851953284211719e-05, "logits/chosen": -0.40441831946372986, "logits/rejected": -0.41271889209747314, "logps/chosen": -255.9960479736328, "logps/rejected": -523.9296875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -16.409931182861328, "rewards/margins": 19.63673210144043, "rewards/rejected": -36.046661376953125, "step": 951 }, { "epoch": 1.0577777777777777, "grad_norm": 0.039563409984111786, "learning_rate": 3.8493766448787825e-05, "logits/chosen": -0.6152668595314026, "logits/rejected": -0.5676716566085815, "logps/chosen": -308.619384765625, "logps/rejected": -529.039306640625, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -19.849376678466797, "rewards/margins": 15.837100982666016, "rewards/rejected": -35.68647766113281, "step": 952 }, { "epoch": 1.058888888888889, "grad_norm": 0.3052694797515869, "learning_rate": 3.846797981323843e-05, "logits/chosen": -0.36802637577056885, "logits/rejected": -0.3654777407646179, "logps/chosen": -269.8501281738281, "logps/rejected": -333.3544921875, "loss": 0.0055, "rewards/accuracies": 1.0, "rewards/chosen": -18.54395294189453, "rewards/margins": 5.577624797821045, "rewards/rejected": -24.121578216552734, "step": 953 }, { "epoch": 1.06, "grad_norm": 0.00019267576863057911, "learning_rate": 3.844217297415196e-05, "logits/chosen": -0.3904782235622406, "logits/rejected": -0.36918699741363525, "logps/chosen": -222.85328674316406, "logps/rejected": -408.57745361328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -15.159686088562012, "rewards/margins": 14.141341209411621, "rewards/rejected": -29.301027297973633, "step": 954 }, { "epoch": 1.0611111111111111, "grad_norm": 0.06611934304237366, "learning_rate": 3.841634597024167e-05, "logits/chosen": -0.459109365940094, "logits/rejected": -0.43827909231185913, "logps/chosen": -345.20013427734375, "logps/rejected": -425.71014404296875, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -25.909454345703125, "rewards/margins": 7.0935821533203125, "rewards/rejected": -33.00303649902344, "step": 955 }, { "epoch": 1.0622222222222222, "grad_norm": 0.916911244392395, "learning_rate": 3.839049884025108e-05, "logits/chosen": -0.5976671576499939, "logits/rejected": -0.6012221574783325, "logps/chosen": -230.57650756835938, "logps/rejected": -303.82659912109375, "loss": 0.0173, "rewards/accuracies": 1.0, "rewards/chosen": -13.898749351501465, "rewards/margins": 5.5090742111206055, "rewards/rejected": -19.40782356262207, "step": 956 }, { "epoch": 1.0633333333333332, "grad_norm": 53.930686950683594, "learning_rate": 3.836463162295389e-05, "logits/chosen": -0.22263294458389282, "logits/rejected": -0.40659549832344055, "logps/chosen": -279.1647644042969, "logps/rejected": -290.8965759277344, "loss": 7.1249, "rewards/accuracies": 0.5, "rewards/chosen": -17.229822158813477, "rewards/margins": 3.072779655456543, "rewards/rejected": -20.302600860595703, "step": 957 }, { "epoch": 1.0644444444444445, "grad_norm": 0.10888441652059555, "learning_rate": 3.8338744357153936e-05, "logits/chosen": -0.21221420168876648, "logits/rejected": -0.2127380669116974, "logps/chosen": -240.3553466796875, "logps/rejected": -416.5422668457031, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -16.453943252563477, "rewards/margins": 11.352492332458496, "rewards/rejected": -27.806434631347656, "step": 958 }, { "epoch": 1.0655555555555556, "grad_norm": 0.0023426120169460773, "learning_rate": 3.831283708168513e-05, "logits/chosen": -0.18158510327339172, "logits/rejected": -0.14377903938293457, "logps/chosen": -281.6862487792969, "logps/rejected": -459.89105224609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -17.589031219482422, "rewards/margins": 16.246017456054688, "rewards/rejected": -33.83504867553711, "step": 959 }, { "epoch": 1.0666666666666667, "grad_norm": 6.080934047698975, "learning_rate": 3.82869098354114e-05, "logits/chosen": -0.4711917042732239, "logits/rejected": -0.48657578229904175, "logps/chosen": -352.849853515625, "logps/rejected": -452.83154296875, "loss": 0.307, "rewards/accuracies": 1.0, "rewards/chosen": -24.88558006286621, "rewards/margins": 4.975189685821533, "rewards/rejected": -29.860769271850586, "step": 960 }, { "epoch": 1.0677777777777777, "grad_norm": 0.0014411831507459283, "learning_rate": 3.826096265722663e-05, "logits/chosen": -0.08696538209915161, "logits/rejected": -0.01628706604242325, "logps/chosen": -271.25787353515625, "logps/rejected": -494.0740966796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -15.182817459106445, "rewards/margins": 16.50338363647461, "rewards/rejected": -31.686201095581055, "step": 961 }, { "epoch": 1.068888888888889, "grad_norm": 4.107956886291504, "learning_rate": 3.823499558605461e-05, "logits/chosen": -0.8501591682434082, "logits/rejected": -0.4775123596191406, "logps/chosen": -115.66015625, "logps/rejected": -418.44677734375, "loss": 0.1285, "rewards/accuracies": 1.0, "rewards/chosen": -6.27825403213501, "rewards/margins": 12.737502098083496, "rewards/rejected": -19.015756607055664, "step": 962 }, { "epoch": 1.07, "grad_norm": 0.0008113353978842497, "learning_rate": 3.8209008660848974e-05, "logits/chosen": -0.2328188121318817, "logits/rejected": -0.20502732694149017, "logps/chosen": -233.86520385742188, "logps/rejected": -410.1595458984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.691951751708984, "rewards/margins": 15.138465881347656, "rewards/rejected": -28.83041763305664, "step": 963 }, { "epoch": 1.0711111111111111, "grad_norm": 0.00069883355172351, "learning_rate": 3.818300192059313e-05, "logits/chosen": -0.19535179436206818, "logits/rejected": -0.21504366397857666, "logps/chosen": -251.96621704101562, "logps/rejected": -456.4828796386719, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -15.603919982910156, "rewards/margins": 15.263215065002441, "rewards/rejected": -30.86713409423828, "step": 964 }, { "epoch": 1.0722222222222222, "grad_norm": 0.031726568937301636, "learning_rate": 3.81569754043002e-05, "logits/chosen": -0.13351595401763916, "logits/rejected": -0.1404585838317871, "logps/chosen": -203.72711181640625, "logps/rejected": -307.556640625, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -12.337185859680176, "rewards/margins": 8.503578186035156, "rewards/rejected": -20.840763092041016, "step": 965 }, { "epoch": 1.0733333333333333, "grad_norm": 9.359615069115534e-05, "learning_rate": 3.813092915101301e-05, "logits/chosen": -0.12417663633823395, "logits/rejected": -0.13896428048610687, "logps/chosen": -129.7952423095703, "logps/rejected": -389.1637268066406, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.673955917358398, "rewards/margins": 18.43292808532715, "rewards/rejected": -26.106884002685547, "step": 966 }, { "epoch": 1.0744444444444445, "grad_norm": 1.0816428661346436, "learning_rate": 3.8104863199803966e-05, "logits/chosen": -0.30191150307655334, "logits/rejected": -0.33025750517845154, "logps/chosen": -134.5914764404297, "logps/rejected": -168.07020568847656, "loss": 0.1426, "rewards/accuracies": 1.0, "rewards/chosen": -6.266073703765869, "rewards/margins": 4.207850456237793, "rewards/rejected": -10.47392463684082, "step": 967 }, { "epoch": 1.0755555555555556, "grad_norm": 20.492979049682617, "learning_rate": 3.807877758977501e-05, "logits/chosen": -0.2182314097881317, "logits/rejected": -0.2211095094680786, "logps/chosen": -186.4130096435547, "logps/rejected": -186.747802734375, "loss": 1.284, "rewards/accuracies": 0.5, "rewards/chosen": -9.593011856079102, "rewards/margins": 2.023542642593384, "rewards/rejected": -11.616554260253906, "step": 968 }, { "epoch": 1.0766666666666667, "grad_norm": 1.3795086145401, "learning_rate": 3.805267236005762e-05, "logits/chosen": -0.23159821331501007, "logits/rejected": -0.23563066124916077, "logps/chosen": -171.73477172851562, "logps/rejected": -236.01788330078125, "loss": 0.0572, "rewards/accuracies": 1.0, "rewards/chosen": -10.125911712646484, "rewards/margins": 4.545949935913086, "rewards/rejected": -14.67186164855957, "step": 969 }, { "epoch": 1.0777777777777777, "grad_norm": 0.1012837365269661, "learning_rate": 3.8026547549812665e-05, "logits/chosen": -0.3693625330924988, "logits/rejected": -0.3658826947212219, "logps/chosen": -57.70320129394531, "logps/rejected": -150.1689453125, "loss": 0.0036, "rewards/accuracies": 1.0, "rewards/chosen": -2.4162275791168213, "rewards/margins": 6.945929527282715, "rewards/rejected": -9.362156867980957, "step": 970 }, { "epoch": 1.0788888888888888, "grad_norm": 6.403225415851921e-05, "learning_rate": 3.8000403198230387e-05, "logits/chosen": 0.1907111555337906, "logits/rejected": 0.1948828250169754, "logps/chosen": -252.479736328125, "logps/rejected": -440.0592041015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -16.96167755126953, "rewards/margins": 14.27566146850586, "rewards/rejected": -31.23733901977539, "step": 971 }, { "epoch": 1.08, "grad_norm": 0.187295064330101, "learning_rate": 3.797423934453038e-05, "logits/chosen": -0.22307497262954712, "logits/rejected": -0.17157727479934692, "logps/chosen": -215.70989990234375, "logps/rejected": -322.4551086425781, "loss": 0.0071, "rewards/accuracies": 1.0, "rewards/chosen": -11.548563003540039, "rewards/margins": 9.017562866210938, "rewards/rejected": -20.566125869750977, "step": 972 }, { "epoch": 1.0811111111111111, "grad_norm": 5.694502830505371, "learning_rate": 3.794805602796146e-05, "logits/chosen": -0.055743224918842316, "logits/rejected": -0.0491078682243824, "logps/chosen": -181.13323974609375, "logps/rejected": -228.45301818847656, "loss": 0.1403, "rewards/accuracies": 1.0, "rewards/chosen": -9.616270065307617, "rewards/margins": 4.402264595031738, "rewards/rejected": -14.018535614013672, "step": 973 }, { "epoch": 1.0822222222222222, "grad_norm": 2.7276103496551514, "learning_rate": 3.792185328780165e-05, "logits/chosen": 0.07840035110712051, "logits/rejected": 0.09685097634792328, "logps/chosen": -182.54006958007812, "logps/rejected": -246.87088012695312, "loss": 0.05, "rewards/accuracies": 1.0, "rewards/chosen": -9.908528327941895, "rewards/margins": 4.008607864379883, "rewards/rejected": -13.917136192321777, "step": 974 }, { "epoch": 1.0833333333333333, "grad_norm": 0.18076159060001373, "learning_rate": 3.7895631163358105e-05, "logits/chosen": -0.1371578574180603, "logits/rejected": -0.10833652317523956, "logps/chosen": -237.7834014892578, "logps/rejected": -361.74981689453125, "loss": 0.0042, "rewards/accuracies": 1.0, "rewards/chosen": -15.052204132080078, "rewards/margins": 9.769359588623047, "rewards/rejected": -24.821563720703125, "step": 975 }, { "epoch": 1.0844444444444445, "grad_norm": 0.0008879475062713027, "learning_rate": 3.7869389693967064e-05, "logits/chosen": 0.07097329199314117, "logits/rejected": 0.09128688275814056, "logps/chosen": -378.244140625, "logps/rejected": -523.902099609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -22.833337783813477, "rewards/margins": 12.192952156066895, "rewards/rejected": -35.02629089355469, "step": 976 }, { "epoch": 1.0855555555555556, "grad_norm": 0.06064291298389435, "learning_rate": 3.7843128918993783e-05, "logits/chosen": -0.18849217891693115, "logits/rejected": -0.19251039624214172, "logps/chosen": -235.77317810058594, "logps/rejected": -327.8177490234375, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -12.366214752197266, "rewards/margins": 6.995147705078125, "rewards/rejected": -19.36136245727539, "step": 977 }, { "epoch": 1.0866666666666667, "grad_norm": 0.08497323095798492, "learning_rate": 3.781684887783249e-05, "logits/chosen": -0.09890364110469818, "logits/rejected": -0.07855537533760071, "logps/chosen": -284.1493225097656, "logps/rejected": -428.96063232421875, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -18.810651779174805, "rewards/margins": 10.725387573242188, "rewards/rejected": -29.536039352416992, "step": 978 }, { "epoch": 1.0877777777777777, "grad_norm": 2.666781425476074, "learning_rate": 3.779054960990631e-05, "logits/chosen": -0.31866341829299927, "logits/rejected": -0.311689555644989, "logps/chosen": -316.1949768066406, "logps/rejected": -488.77685546875, "loss": 0.0319, "rewards/accuracies": 1.0, "rewards/chosen": -18.19890785217285, "rewards/margins": 15.768153190612793, "rewards/rejected": -33.96706008911133, "step": 979 }, { "epoch": 1.0888888888888888, "grad_norm": 0.01747402362525463, "learning_rate": 3.7764231154667184e-05, "logits/chosen": -0.30222707986831665, "logits/rejected": -0.31263232231140137, "logps/chosen": -255.61355590820312, "logps/rejected": -365.2471923828125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -15.740911483764648, "rewards/margins": 8.134477615356445, "rewards/rejected": -23.875389099121094, "step": 980 }, { "epoch": 1.09, "grad_norm": 0.015934515744447708, "learning_rate": 3.773789355159587e-05, "logits/chosen": -0.48191145062446594, "logits/rejected": -0.4803435206413269, "logps/chosen": -99.4846420288086, "logps/rejected": -197.90487670898438, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -4.8445940017700195, "rewards/margins": 7.770145416259766, "rewards/rejected": -12.614739418029785, "step": 981 }, { "epoch": 1.0911111111111111, "grad_norm": 2.9125627406756394e-05, "learning_rate": 3.771153684020184e-05, "logits/chosen": 0.13934867084026337, "logits/rejected": 0.1131667047739029, "logps/chosen": -243.34254455566406, "logps/rejected": -506.09613037109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.55350112915039, "rewards/margins": 17.298784255981445, "rewards/rejected": -31.85228729248047, "step": 982 }, { "epoch": 1.0922222222222222, "grad_norm": 4.227876663208008, "learning_rate": 3.7685161060023224e-05, "logits/chosen": -0.42166006565093994, "logits/rejected": -0.4096555709838867, "logps/chosen": -100.12556457519531, "logps/rejected": -199.44879150390625, "loss": 0.4764, "rewards/accuracies": 0.5, "rewards/chosen": -5.6612162590026855, "rewards/margins": 6.027068138122559, "rewards/rejected": -11.688284873962402, "step": 983 }, { "epoch": 1.0933333333333333, "grad_norm": 0.0059366533532738686, "learning_rate": 3.765876625062677e-05, "logits/chosen": -0.32798707485198975, "logits/rejected": -0.304437518119812, "logps/chosen": -219.17144775390625, "logps/rejected": -346.433349609375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -11.715677261352539, "rewards/margins": 11.103384017944336, "rewards/rejected": -22.819061279296875, "step": 984 }, { "epoch": 1.0944444444444446, "grad_norm": 0.07852864265441895, "learning_rate": 3.763235245160775e-05, "logits/chosen": -0.4637465476989746, "logits/rejected": -0.4399971067905426, "logps/chosen": -121.60179901123047, "logps/rejected": -197.5133514404297, "loss": 0.3466, "rewards/accuracies": 0.5, "rewards/chosen": -6.223718643188477, "rewards/margins": 5.511992454528809, "rewards/rejected": -11.735711097717285, "step": 985 }, { "epoch": 1.0955555555555556, "grad_norm": 1.3828759620082565e-05, "learning_rate": 3.7605919702589934e-05, "logits/chosen": -0.504707396030426, "logits/rejected": -0.493743360042572, "logps/chosen": -172.1906280517578, "logps/rejected": -385.9537353515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.609877586364746, "rewards/margins": 16.800094604492188, "rewards/rejected": -27.40997314453125, "step": 986 }, { "epoch": 1.0966666666666667, "grad_norm": 0.0008948522736318409, "learning_rate": 3.757946804322555e-05, "logits/chosen": -0.13918852806091309, "logits/rejected": -0.15672901272773743, "logps/chosen": -187.10848999023438, "logps/rejected": -322.131591796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.394972801208496, "rewards/margins": 11.023569107055664, "rewards/rejected": -22.418540954589844, "step": 987 }, { "epoch": 1.0977777777777777, "grad_norm": 29.458051681518555, "learning_rate": 3.7552997513195134e-05, "logits/chosen": -0.18212831020355225, "logits/rejected": -0.17693397402763367, "logps/chosen": -306.83209228515625, "logps/rejected": -367.9896240234375, "loss": 1.2831, "rewards/accuracies": 0.5, "rewards/chosen": -18.788414001464844, "rewards/margins": 5.2136945724487305, "rewards/rejected": -24.00210952758789, "step": 988 }, { "epoch": 1.0988888888888888, "grad_norm": 0.07726653665304184, "learning_rate": 3.752650815220758e-05, "logits/chosen": 0.11216554045677185, "logits/rejected": 0.0949043482542038, "logps/chosen": -455.96435546875, "logps/rejected": -608.9050903320312, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -28.406423568725586, "rewards/margins": 10.845159530639648, "rewards/rejected": -39.251583099365234, "step": 989 }, { "epoch": 1.1, "grad_norm": 10.290431022644043, "learning_rate": 3.7500000000000003e-05, "logits/chosen": -0.249942809343338, "logits/rejected": -0.26857468485832214, "logps/chosen": -178.38844299316406, "logps/rejected": -297.1343994140625, "loss": 0.4247, "rewards/accuracies": 0.5, "rewards/chosen": -10.59642219543457, "rewards/margins": 8.692152976989746, "rewards/rejected": -19.28857421875, "step": 990 }, { "epoch": 1.1011111111111112, "grad_norm": 4.935085235047154e-05, "learning_rate": 3.747347309633772e-05, "logits/chosen": -0.20945684611797333, "logits/rejected": -0.1867825835943222, "logps/chosen": -206.54861450195312, "logps/rejected": -376.9527587890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.583656311035156, "rewards/margins": 14.176431655883789, "rewards/rejected": -25.760087966918945, "step": 991 }, { "epoch": 1.1022222222222222, "grad_norm": 22.241594314575195, "learning_rate": 3.744692748101417e-05, "logits/chosen": -0.41915756464004517, "logits/rejected": -0.428169846534729, "logps/chosen": -211.586669921875, "logps/rejected": -262.0228271484375, "loss": 0.2679, "rewards/accuracies": 1.0, "rewards/chosen": -13.31212329864502, "rewards/margins": 3.2415354251861572, "rewards/rejected": -16.55365753173828, "step": 992 }, { "epoch": 1.1033333333333333, "grad_norm": 0.006423710845410824, "learning_rate": 3.742036319385087e-05, "logits/chosen": -0.4679776132106781, "logits/rejected": -0.49432483315467834, "logps/chosen": -353.63458251953125, "logps/rejected": -475.07916259765625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -20.750219345092773, "rewards/margins": 10.838682174682617, "rewards/rejected": -31.58890151977539, "step": 993 }, { "epoch": 1.1044444444444443, "grad_norm": 0.00014752325660083443, "learning_rate": 3.739378027469735e-05, "logits/chosen": -0.09281207621097565, "logits/rejected": -0.11839812994003296, "logps/chosen": -293.32855224609375, "logps/rejected": -447.4817810058594, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -17.550825119018555, "rewards/margins": 12.880779266357422, "rewards/rejected": -30.431604385375977, "step": 994 }, { "epoch": 1.1055555555555556, "grad_norm": 0.24995505809783936, "learning_rate": 3.736717876343106e-05, "logits/chosen": -0.19973847270011902, "logits/rejected": -0.173880472779274, "logps/chosen": -214.3303680419922, "logps/rejected": -267.6997375488281, "loss": 0.0087, "rewards/accuracies": 1.0, "rewards/chosen": -13.565336227416992, "rewards/margins": 5.1257758140563965, "rewards/rejected": -18.691112518310547, "step": 995 }, { "epoch": 1.1066666666666667, "grad_norm": 0.26105865836143494, "learning_rate": 3.734055869995738e-05, "logits/chosen": -0.4473116397857666, "logits/rejected": -0.4506340026855469, "logps/chosen": -101.15618896484375, "logps/rejected": -200.82774353027344, "loss": 0.0095, "rewards/accuracies": 1.0, "rewards/chosen": -4.3147454261779785, "rewards/margins": 8.514200210571289, "rewards/rejected": -12.82894515991211, "step": 996 }, { "epoch": 1.1077777777777778, "grad_norm": 0.0012574594002217054, "learning_rate": 3.7313920124209504e-05, "logits/chosen": -0.7060949206352234, "logits/rejected": -0.7054212093353271, "logps/chosen": -92.63850402832031, "logps/rejected": -261.44964599609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.936795711517334, "rewards/margins": 13.33562183380127, "rewards/rejected": -19.272418975830078, "step": 997 }, { "epoch": 1.1088888888888888, "grad_norm": 49.73457336425781, "learning_rate": 3.728726307614838e-05, "logits/chosen": -0.5877841711044312, "logits/rejected": -0.899716854095459, "logps/chosen": -238.6086883544922, "logps/rejected": -255.76846313476562, "loss": 4.6493, "rewards/accuracies": 0.5, "rewards/chosen": -14.949070930480957, "rewards/margins": 3.08380126953125, "rewards/rejected": -18.032873153686523, "step": 998 }, { "epoch": 1.11, "grad_norm": 2.5847907636489253e-07, "learning_rate": 3.726058759576271e-05, "logits/chosen": -0.27337175607681274, "logits/rejected": -0.2661017179489136, "logps/chosen": -163.90208435058594, "logps/rejected": -491.79510498046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.7922945022583, "rewards/margins": 26.279224395751953, "rewards/rejected": -36.07151794433594, "step": 999 }, { "epoch": 1.1111111111111112, "grad_norm": 0.38055717945098877, "learning_rate": 3.723389372306879e-05, "logits/chosen": -0.6225008368492126, "logits/rejected": -0.6205447912216187, "logps/chosen": -191.91561889648438, "logps/rejected": -271.02764892578125, "loss": 0.0104, "rewards/accuracies": 1.0, "rewards/chosen": -11.410056114196777, "rewards/margins": 6.791707515716553, "rewards/rejected": -18.201763153076172, "step": 1000 }, { "epoch": 1.1122222222222222, "grad_norm": 20.718984603881836, "learning_rate": 3.7207181498110545e-05, "logits/chosen": -0.45185190439224243, "logits/rejected": -0.568145751953125, "logps/chosen": -128.9713897705078, "logps/rejected": -158.41073608398438, "loss": 0.6446, "rewards/accuracies": 0.5, "rewards/chosen": -6.827933311462402, "rewards/margins": 3.112240791320801, "rewards/rejected": -9.940174102783203, "step": 1001 }, { "epoch": 1.1133333333333333, "grad_norm": 0.08299943059682846, "learning_rate": 3.718045096095943e-05, "logits/chosen": -0.2849467396736145, "logits/rejected": -0.28638991713523865, "logps/chosen": -240.60792541503906, "logps/rejected": -342.7281494140625, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -15.375896453857422, "rewards/margins": 8.920430183410645, "rewards/rejected": -24.29632568359375, "step": 1002 }, { "epoch": 1.1144444444444443, "grad_norm": 0.10394684970378876, "learning_rate": 3.715370215171435e-05, "logits/chosen": -0.5165773034095764, "logits/rejected": -0.5299787521362305, "logps/chosen": -299.4464416503906, "logps/rejected": -390.2891845703125, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": -21.429018020629883, "rewards/margins": 6.8100905418396, "rewards/rejected": -28.23910903930664, "step": 1003 }, { "epoch": 1.1155555555555556, "grad_norm": 0.013566657900810242, "learning_rate": 3.712693511050164e-05, "logits/chosen": -0.28024694323539734, "logits/rejected": -0.2547891139984131, "logps/chosen": -189.14645385742188, "logps/rejected": -330.23577880859375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -10.376239776611328, "rewards/margins": 10.882214546203613, "rewards/rejected": -21.258453369140625, "step": 1004 }, { "epoch": 1.1166666666666667, "grad_norm": 0.04046337679028511, "learning_rate": 3.7100149877474974e-05, "logits/chosen": -0.7194216847419739, "logits/rejected": -0.78838050365448, "logps/chosen": -111.17716217041016, "logps/rejected": -259.422607421875, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -5.495238780975342, "rewards/margins": 12.222526550292969, "rewards/rejected": -17.71776580810547, "step": 1005 }, { "epoch": 1.1177777777777778, "grad_norm": 6.531714916229248, "learning_rate": 3.707334649281532e-05, "logits/chosen": -0.4910905063152313, "logits/rejected": -0.4830675721168518, "logps/chosen": -122.32813262939453, "logps/rejected": -132.43991088867188, "loss": 0.2631, "rewards/accuracies": 1.0, "rewards/chosen": -5.377425193786621, "rewards/margins": 1.577639102935791, "rewards/rejected": -6.955064296722412, "step": 1006 }, { "epoch": 1.1188888888888888, "grad_norm": 21.137664794921875, "learning_rate": 3.7046524996730866e-05, "logits/chosen": -0.2833409607410431, "logits/rejected": -0.27428868412971497, "logps/chosen": -254.30845642089844, "logps/rejected": -242.69778442382812, "loss": 2.3488, "rewards/accuracies": 0.5, "rewards/chosen": -17.874473571777344, "rewards/margins": -0.770291805267334, "rewards/rejected": -17.104183197021484, "step": 1007 }, { "epoch": 1.12, "grad_norm": 0.16335225105285645, "learning_rate": 3.7019685429456986e-05, "logits/chosen": -0.3156099021434784, "logits/rejected": -0.31689953804016113, "logps/chosen": -104.90428161621094, "logps/rejected": -215.52023315429688, "loss": 0.0078, "rewards/accuracies": 1.0, "rewards/chosen": -3.780174732208252, "rewards/margins": 8.860414505004883, "rewards/rejected": -12.640588760375977, "step": 1008 }, { "epoch": 1.1211111111111112, "grad_norm": 28.97460174560547, "learning_rate": 3.699282783125616e-05, "logits/chosen": -0.045429326593875885, "logits/rejected": -0.034674808382987976, "logps/chosen": -206.14144897460938, "logps/rejected": -258.59197998046875, "loss": 1.2273, "rewards/accuracies": 0.5, "rewards/chosen": -11.592824935913086, "rewards/margins": 4.23032808303833, "rewards/rejected": -15.823152542114258, "step": 1009 }, { "epoch": 1.1222222222222222, "grad_norm": 0.024601083248853683, "learning_rate": 3.69659522424179e-05, "logits/chosen": -0.23452889919281006, "logits/rejected": -0.23204228281974792, "logps/chosen": -192.83702087402344, "logps/rejected": -368.5962829589844, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -11.1650972366333, "rewards/margins": 14.674081802368164, "rewards/rejected": -25.83917999267578, "step": 1010 }, { "epoch": 1.1233333333333333, "grad_norm": 14.399267196655273, "learning_rate": 3.693905870325872e-05, "logits/chosen": -0.40070271492004395, "logits/rejected": -0.3592774569988251, "logps/chosen": -216.56053161621094, "logps/rejected": -346.2997741699219, "loss": 0.3228, "rewards/accuracies": 1.0, "rewards/chosen": -11.209234237670898, "rewards/margins": 8.193181991577148, "rewards/rejected": -19.402414321899414, "step": 1011 }, { "epoch": 1.1244444444444444, "grad_norm": 0.00322019518353045, "learning_rate": 3.691214725412205e-05, "logits/chosen": -0.11547476053237915, "logits/rejected": -0.08875826746225357, "logps/chosen": -138.5582275390625, "logps/rejected": -286.564208984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.795862197875977, "rewards/margins": 12.060945510864258, "rewards/rejected": -18.856807708740234, "step": 1012 }, { "epoch": 1.1255555555555556, "grad_norm": 0.014462564140558243, "learning_rate": 3.6885217935378216e-05, "logits/chosen": -0.20721451938152313, "logits/rejected": -0.22921441495418549, "logps/chosen": -143.21205139160156, "logps/rejected": -267.58111572265625, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -9.10982894897461, "rewards/margins": 9.619359016418457, "rewards/rejected": -18.72918701171875, "step": 1013 }, { "epoch": 1.1266666666666667, "grad_norm": 0.050283074378967285, "learning_rate": 3.685827078742432e-05, "logits/chosen": -0.12340974807739258, "logits/rejected": -0.11385199427604675, "logps/chosen": -160.8265838623047, "logps/rejected": -253.9102783203125, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -8.614823341369629, "rewards/margins": 6.735048294067383, "rewards/rejected": -15.349871635437012, "step": 1014 }, { "epoch": 1.1277777777777778, "grad_norm": 0.01720566488802433, "learning_rate": 3.683130585068421e-05, "logits/chosen": -0.1717349737882614, "logits/rejected": -0.16533608734607697, "logps/chosen": -130.31851196289062, "logps/rejected": -229.64324951171875, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -7.422634124755859, "rewards/margins": 7.78599739074707, "rewards/rejected": -15.20863151550293, "step": 1015 }, { "epoch": 1.1288888888888888, "grad_norm": 0.4285356402397156, "learning_rate": 3.6804323165608445e-05, "logits/chosen": -0.026145048439502716, "logits/rejected": 0.008365780115127563, "logps/chosen": -154.17544555664062, "logps/rejected": -334.5257263183594, "loss": 0.0084, "rewards/accuracies": 1.0, "rewards/chosen": -7.169423580169678, "rewards/margins": 10.41854190826416, "rewards/rejected": -17.58796501159668, "step": 1016 }, { "epoch": 1.13, "grad_norm": 0.6745039820671082, "learning_rate": 3.6777322772674186e-05, "logits/chosen": -0.021430402994155884, "logits/rejected": 0.014841191470623016, "logps/chosen": -174.67733764648438, "logps/rejected": -247.76536560058594, "loss": 0.0164, "rewards/accuracies": 1.0, "rewards/chosen": -8.478711128234863, "rewards/margins": 4.290863037109375, "rewards/rejected": -12.769574165344238, "step": 1017 }, { "epoch": 1.1311111111111112, "grad_norm": 0.1613026261329651, "learning_rate": 3.675030471238515e-05, "logits/chosen": -0.41621169447898865, "logits/rejected": -0.37801533937454224, "logps/chosen": -90.34493255615234, "logps/rejected": -172.07272338867188, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": -2.7359871864318848, "rewards/margins": 5.46198844909668, "rewards/rejected": -8.197976112365723, "step": 1018 }, { "epoch": 1.1322222222222222, "grad_norm": 0.03122841753065586, "learning_rate": 3.6723269025271604e-05, "logits/chosen": 0.09939122200012207, "logits/rejected": 0.14797618985176086, "logps/chosen": -201.46536254882812, "logps/rejected": -380.1781921386719, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -10.512569427490234, "rewards/margins": 9.052385330200195, "rewards/rejected": -19.56495475769043, "step": 1019 }, { "epoch": 1.1333333333333333, "grad_norm": 0.12927468121051788, "learning_rate": 3.66962157518902e-05, "logits/chosen": 0.275818794965744, "logits/rejected": 0.3077850341796875, "logps/chosen": -182.75360107421875, "logps/rejected": -289.3876953125, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": -8.882903099060059, "rewards/margins": 7.4085493087768555, "rewards/rejected": -16.291452407836914, "step": 1020 }, { "epoch": 1.1344444444444444, "grad_norm": 0.004887021612375975, "learning_rate": 3.6669144932824e-05, "logits/chosen": -0.3037090599536896, "logits/rejected": -0.24330782890319824, "logps/chosen": -288.0158996582031, "logps/rejected": -364.1776123046875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -13.0640287399292, "rewards/margins": 10.04494857788086, "rewards/rejected": -23.108978271484375, "step": 1021 }, { "epoch": 1.1355555555555557, "grad_norm": 1.91560959815979, "learning_rate": 3.664205660868239e-05, "logits/chosen": -0.2355625033378601, "logits/rejected": -0.22141551971435547, "logps/chosen": -170.53909301757812, "logps/rejected": -216.66961669921875, "loss": 0.063, "rewards/accuracies": 1.0, "rewards/chosen": -9.750984191894531, "rewards/margins": 3.0702385902404785, "rewards/rejected": -12.821222305297852, "step": 1022 }, { "epoch": 1.1366666666666667, "grad_norm": 1.7388099431991577, "learning_rate": 3.6614950820101004e-05, "logits/chosen": -0.23026694357395172, "logits/rejected": -0.2096872329711914, "logps/chosen": -221.99606323242188, "logps/rejected": -296.653076171875, "loss": 0.0248, "rewards/accuracies": 1.0, "rewards/chosen": -10.874488830566406, "rewards/margins": 6.60811185836792, "rewards/rejected": -17.482601165771484, "step": 1023 }, { "epoch": 1.1377777777777778, "grad_norm": 0.08189105242490768, "learning_rate": 3.6587827607741684e-05, "logits/chosen": 0.0848471000790596, "logits/rejected": 0.1087227389216423, "logps/chosen": -207.20306396484375, "logps/rejected": -297.9901428222656, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -11.949462890625, "rewards/margins": 6.753635883331299, "rewards/rejected": -18.70309829711914, "step": 1024 }, { "epoch": 1.1388888888888888, "grad_norm": 0.44170907139778137, "learning_rate": 3.6560687012292394e-05, "logits/chosen": -0.253902405500412, "logits/rejected": -0.22262708842754364, "logps/chosen": -251.10787963867188, "logps/rejected": -572.3552856445312, "loss": 0.0073, "rewards/accuracies": 1.0, "rewards/chosen": -14.489176750183105, "rewards/margins": 19.215682983398438, "rewards/rejected": -33.70486068725586, "step": 1025 }, { "epoch": 1.1400000000000001, "grad_norm": 0.0011539782863110304, "learning_rate": 3.65335290744672e-05, "logits/chosen": -0.22364920377731323, "logits/rejected": -0.1705678105354309, "logps/chosen": -146.11309814453125, "logps/rejected": -301.0605773925781, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.06253433227539, "rewards/margins": 10.946817398071289, "rewards/rejected": -19.00935173034668, "step": 1026 }, { "epoch": 1.1411111111111112, "grad_norm": 0.8342512249946594, "learning_rate": 3.6506353835006155e-05, "logits/chosen": -0.37697771191596985, "logits/rejected": -0.3499525785446167, "logps/chosen": -176.5516357421875, "logps/rejected": -239.91683959960938, "loss": 0.0691, "rewards/accuracies": 1.0, "rewards/chosen": -11.325688362121582, "rewards/margins": 4.215996742248535, "rewards/rejected": -15.541685104370117, "step": 1027 }, { "epoch": 1.1422222222222222, "grad_norm": 0.3561316728591919, "learning_rate": 3.6479161334675296e-05, "logits/chosen": -0.1909886598587036, "logits/rejected": -0.1611156165599823, "logps/chosen": -203.31365966796875, "logps/rejected": -293.234375, "loss": 0.0073, "rewards/accuracies": 1.0, "rewards/chosen": -11.407435417175293, "rewards/margins": 5.209601402282715, "rewards/rejected": -16.617036819458008, "step": 1028 }, { "epoch": 1.1433333333333333, "grad_norm": 0.0799984484910965, "learning_rate": 3.645195161426653e-05, "logits/chosen": -0.12834769487380981, "logits/rejected": -0.11716984212398529, "logps/chosen": -234.9388427734375, "logps/rejected": -370.0706787109375, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -14.37957763671875, "rewards/margins": 9.81635856628418, "rewards/rejected": -24.195938110351562, "step": 1029 }, { "epoch": 1.1444444444444444, "grad_norm": 10.491423606872559, "learning_rate": 3.6424724714597594e-05, "logits/chosen": -0.4723678529262543, "logits/rejected": -0.46866220235824585, "logps/chosen": -174.31045532226562, "logps/rejected": -199.2326202392578, "loss": 0.3869, "rewards/accuracies": 0.5, "rewards/chosen": -9.479469299316406, "rewards/margins": 2.8102431297302246, "rewards/rejected": -12.289711952209473, "step": 1030 }, { "epoch": 1.1455555555555557, "grad_norm": 3.823810577392578, "learning_rate": 3.6397480676512025e-05, "logits/chosen": -0.38416749238967896, "logits/rejected": -0.3800395131111145, "logps/chosen": -162.84375, "logps/rejected": -261.41278076171875, "loss": 0.1799, "rewards/accuracies": 1.0, "rewards/chosen": -8.792927742004395, "rewards/margins": 6.834688663482666, "rewards/rejected": -15.627616882324219, "step": 1031 }, { "epoch": 1.1466666666666667, "grad_norm": 0.019724581390619278, "learning_rate": 3.637021954087904e-05, "logits/chosen": -0.22519995272159576, "logits/rejected": -0.2461327314376831, "logps/chosen": -153.1543731689453, "logps/rejected": -294.22515869140625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -8.444095611572266, "rewards/margins": 12.68794059753418, "rewards/rejected": -21.132034301757812, "step": 1032 }, { "epoch": 1.1477777777777778, "grad_norm": 7.769956111907959, "learning_rate": 3.6342941348593515e-05, "logits/chosen": -0.16367043554782867, "logits/rejected": -0.173386812210083, "logps/chosen": -247.4352569580078, "logps/rejected": -267.66192626953125, "loss": 0.4035, "rewards/accuracies": 0.5, "rewards/chosen": -16.170866012573242, "rewards/margins": 2.9362292289733887, "rewards/rejected": -19.10709571838379, "step": 1033 }, { "epoch": 1.1488888888888888, "grad_norm": 27.343006134033203, "learning_rate": 3.631564614057592e-05, "logits/chosen": -0.22904017567634583, "logits/rejected": -0.20925277471542358, "logps/chosen": -222.098876953125, "logps/rejected": -312.6313171386719, "loss": 1.0503, "rewards/accuracies": 0.5, "rewards/chosen": -14.915894508361816, "rewards/margins": 5.803906440734863, "rewards/rejected": -20.71980094909668, "step": 1034 }, { "epoch": 1.15, "grad_norm": 0.6408769488334656, "learning_rate": 3.628833395777224e-05, "logits/chosen": -0.5275874137878418, "logits/rejected": -0.5148206949234009, "logps/chosen": -200.5960235595703, "logps/rejected": -231.19003295898438, "loss": 0.0129, "rewards/accuracies": 1.0, "rewards/chosen": -10.822698593139648, "rewards/margins": 4.346584320068359, "rewards/rejected": -15.169282913208008, "step": 1035 }, { "epoch": 1.1511111111111112, "grad_norm": 5.086670398712158, "learning_rate": 3.6261004841153924e-05, "logits/chosen": -0.31031396985054016, "logits/rejected": -0.2975437641143799, "logps/chosen": -211.8347625732422, "logps/rejected": -355.78778076171875, "loss": 0.0924, "rewards/accuracies": 1.0, "rewards/chosen": -13.299003601074219, "rewards/margins": 11.72791576385498, "rewards/rejected": -25.026920318603516, "step": 1036 }, { "epoch": 1.1522222222222223, "grad_norm": 3.5459372997283936, "learning_rate": 3.623365883171782e-05, "logits/chosen": -0.4028824269771576, "logits/rejected": -0.3749975562095642, "logps/chosen": -267.9396667480469, "logps/rejected": -332.8221435546875, "loss": 0.0292, "rewards/accuracies": 1.0, "rewards/chosen": -17.880903244018555, "rewards/margins": 5.562031269073486, "rewards/rejected": -23.442934036254883, "step": 1037 }, { "epoch": 1.1533333333333333, "grad_norm": 0.0065590133890509605, "learning_rate": 3.620629597048614e-05, "logits/chosen": -0.10433497279882431, "logits/rejected": -0.09701776504516602, "logps/chosen": -217.32479858398438, "logps/rejected": -458.54290771484375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -13.170608520507812, "rewards/margins": 18.600276947021484, "rewards/rejected": -31.770885467529297, "step": 1038 }, { "epoch": 1.1544444444444444, "grad_norm": 0.0003616553731262684, "learning_rate": 3.617891629850637e-05, "logits/chosen": -0.9628247022628784, "logits/rejected": -0.5082279443740845, "logps/chosen": -114.93914794921875, "logps/rejected": -365.4307861328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.07163667678833, "rewards/margins": 16.483556747436523, "rewards/rejected": -23.555192947387695, "step": 1039 }, { "epoch": 1.1555555555555554, "grad_norm": 0.00012559135211631656, "learning_rate": 3.615151985685117e-05, "logits/chosen": -0.37764644622802734, "logits/rejected": -0.3645309805870056, "logps/chosen": -238.51316833496094, "logps/rejected": -415.4508056640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.402811050415039, "rewards/margins": 13.4328031539917, "rewards/rejected": -27.835613250732422, "step": 1040 }, { "epoch": 1.1566666666666667, "grad_norm": 0.05948052182793617, "learning_rate": 3.612410668661842e-05, "logits/chosen": -0.21459192037582397, "logits/rejected": -0.1850774884223938, "logps/chosen": -106.68849182128906, "logps/rejected": -217.72439575195312, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -3.8259899616241455, "rewards/margins": 9.182571411132812, "rewards/rejected": -13.008560180664062, "step": 1041 }, { "epoch": 1.1577777777777778, "grad_norm": 5.987268924713135, "learning_rate": 3.609667682893105e-05, "logits/chosen": -0.5527137517929077, "logits/rejected": -0.5417503714561462, "logps/chosen": -137.13360595703125, "logps/rejected": -221.6246337890625, "loss": 0.1446, "rewards/accuracies": 1.0, "rewards/chosen": -8.684802055358887, "rewards/margins": 5.5749406814575195, "rewards/rejected": -14.259742736816406, "step": 1042 }, { "epoch": 1.1588888888888889, "grad_norm": 19.346792221069336, "learning_rate": 3.6069230324937043e-05, "logits/chosen": -0.3764539361000061, "logits/rejected": -0.38198164105415344, "logps/chosen": -177.2361297607422, "logps/rejected": -282.6412658691406, "loss": 0.5488, "rewards/accuracies": 0.5, "rewards/chosen": -8.838172912597656, "rewards/margins": 9.155149459838867, "rewards/rejected": -17.99332046508789, "step": 1043 }, { "epoch": 1.16, "grad_norm": 3.3917042401299113e-06, "learning_rate": 3.604176721580935e-05, "logits/chosen": 0.014313310384750366, "logits/rejected": 0.01846522092819214, "logps/chosen": -308.90582275390625, "logps/rejected": -571.0337524414062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -17.559614181518555, "rewards/margins": 17.761093139648438, "rewards/rejected": -35.320709228515625, "step": 1044 }, { "epoch": 1.1611111111111112, "grad_norm": 0.0015718190697953105, "learning_rate": 3.601428754274584e-05, "logits/chosen": -0.5310087203979492, "logits/rejected": -0.5351850390434265, "logps/chosen": -207.18264770507812, "logps/rejected": -328.86383056640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.362982749938965, "rewards/margins": 11.657918930053711, "rewards/rejected": -25.02090072631836, "step": 1045 }, { "epoch": 1.1622222222222223, "grad_norm": 0.08753430843353271, "learning_rate": 3.59867913469692e-05, "logits/chosen": -0.15546052157878876, "logits/rejected": -0.1141827404499054, "logps/chosen": -232.38185119628906, "logps/rejected": -451.766845703125, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -13.900193214416504, "rewards/margins": 12.502861976623535, "rewards/rejected": -26.40305519104004, "step": 1046 }, { "epoch": 1.1633333333333333, "grad_norm": 0.00658825458958745, "learning_rate": 3.5959278669726935e-05, "logits/chosen": -0.4378686547279358, "logits/rejected": -0.4312337636947632, "logps/chosen": -244.83457946777344, "logps/rejected": -480.6805725097656, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -15.719694137573242, "rewards/margins": 17.766576766967773, "rewards/rejected": -33.486270904541016, "step": 1047 }, { "epoch": 1.1644444444444444, "grad_norm": 0.04778069630265236, "learning_rate": 3.593174955229127e-05, "logits/chosen": -0.4509483873844147, "logits/rejected": -0.5013527274131775, "logps/chosen": -212.09515380859375, "logps/rejected": -335.9323425292969, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -13.201671600341797, "rewards/margins": 8.92824649810791, "rewards/rejected": -22.12991714477539, "step": 1048 }, { "epoch": 1.1655555555555557, "grad_norm": 0.00796605460345745, "learning_rate": 3.5904204035959075e-05, "logits/chosen": -0.38335710763931274, "logits/rejected": -0.44754427671432495, "logps/chosen": -186.82394409179688, "logps/rejected": -346.7454833984375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -12.687881469726562, "rewards/margins": 10.722341537475586, "rewards/rejected": -23.41022300720215, "step": 1049 }, { "epoch": 1.1666666666666667, "grad_norm": 5.337181573850103e-05, "learning_rate": 3.587664216205183e-05, "logits/chosen": -0.3894776403903961, "logits/rejected": -0.3936188817024231, "logps/chosen": -216.92037963867188, "logps/rejected": -492.6287536621094, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.528152465820312, "rewards/margins": 23.278759002685547, "rewards/rejected": -37.80691146850586, "step": 1050 }, { "epoch": 1.1677777777777778, "grad_norm": 0.00465406896546483, "learning_rate": 3.584906397191556e-05, "logits/chosen": -0.5047542452812195, "logits/rejected": -0.48858439922332764, "logps/chosen": -247.0367431640625, "logps/rejected": -506.8766784667969, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -16.28035545349121, "rewards/margins": 20.921985626220703, "rewards/rejected": -37.20233917236328, "step": 1051 }, { "epoch": 1.1688888888888889, "grad_norm": 11.186051368713379, "learning_rate": 3.5821469506920756e-05, "logits/chosen": -0.5121027827262878, "logits/rejected": -0.5067103505134583, "logps/chosen": -159.25839233398438, "logps/rejected": -171.8542938232422, "loss": 0.6119, "rewards/accuracies": 0.5, "rewards/chosen": -10.118753433227539, "rewards/margins": 1.427377700805664, "rewards/rejected": -11.546131134033203, "step": 1052 }, { "epoch": 1.17, "grad_norm": 0.055217936635017395, "learning_rate": 3.579385880846232e-05, "logits/chosen": -0.7050354480743408, "logits/rejected": -0.6788297891616821, "logps/chosen": -179.64996337890625, "logps/rejected": -299.4534606933594, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -12.003437042236328, "rewards/margins": 10.164581298828125, "rewards/rejected": -22.168018341064453, "step": 1053 }, { "epoch": 1.1711111111111112, "grad_norm": 0.05251724645495415, "learning_rate": 3.576623191795954e-05, "logits/chosen": -0.4221099317073822, "logits/rejected": -0.4237387180328369, "logps/chosen": -207.1473388671875, "logps/rejected": -348.57806396484375, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -12.546756744384766, "rewards/margins": 11.968363761901855, "rewards/rejected": -24.515121459960938, "step": 1054 }, { "epoch": 1.1722222222222223, "grad_norm": 0.0005985983298160136, "learning_rate": 3.5738588876855924e-05, "logits/chosen": -0.40436094999313354, "logits/rejected": -0.4201485514640808, "logps/chosen": -199.8120880126953, "logps/rejected": -452.2352294921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.655598640441895, "rewards/margins": 17.470169067382812, "rewards/rejected": -31.12576675415039, "step": 1055 }, { "epoch": 1.1733333333333333, "grad_norm": 2.295459747314453, "learning_rate": 3.571092972661928e-05, "logits/chosen": -0.4884805679321289, "logits/rejected": -0.4948773682117462, "logps/chosen": -205.6551513671875, "logps/rejected": -502.5335998535156, "loss": 0.1164, "rewards/accuracies": 1.0, "rewards/chosen": -11.987140655517578, "rewards/margins": 18.00078010559082, "rewards/rejected": -29.9879207611084, "step": 1056 }, { "epoch": 1.1744444444444444, "grad_norm": 0.0003753094933927059, "learning_rate": 3.5683254508741546e-05, "logits/chosen": -0.21403777599334717, "logits/rejected": -0.20032387971878052, "logps/chosen": -350.9725036621094, "logps/rejected": -531.5283203125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -22.632461547851562, "rewards/margins": 13.250946044921875, "rewards/rejected": -35.88340759277344, "step": 1057 }, { "epoch": 1.1755555555555555, "grad_norm": 11.36070728302002, "learning_rate": 3.565556326473877e-05, "logits/chosen": -0.4408838152885437, "logits/rejected": -0.44532376527786255, "logps/chosen": -205.54727172851562, "logps/rejected": -336.84954833984375, "loss": 0.247, "rewards/accuracies": 1.0, "rewards/chosen": -13.75515079498291, "rewards/margins": 9.308652877807617, "rewards/rejected": -23.063804626464844, "step": 1058 }, { "epoch": 1.1766666666666667, "grad_norm": 0.03370336443185806, "learning_rate": 3.562785603615104e-05, "logits/chosen": -0.6639463305473328, "logits/rejected": -0.6782292127609253, "logps/chosen": -108.08656311035156, "logps/rejected": -233.52508544921875, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -6.105447769165039, "rewards/margins": 9.029929161071777, "rewards/rejected": -15.135376930236816, "step": 1059 }, { "epoch": 1.1777777777777778, "grad_norm": 0.550901472568512, "learning_rate": 3.560013286454242e-05, "logits/chosen": -0.4237847328186035, "logits/rejected": -0.4138566255569458, "logps/chosen": -181.22251892089844, "logps/rejected": -259.8328552246094, "loss": 0.0104, "rewards/accuracies": 1.0, "rewards/chosen": -10.53331184387207, "rewards/margins": 6.741623401641846, "rewards/rejected": -17.27493667602539, "step": 1060 }, { "epoch": 1.1788888888888889, "grad_norm": 6.806051731109619, "learning_rate": 3.55723937915009e-05, "logits/chosen": -0.25655290484428406, "logits/rejected": -0.2620440721511841, "logps/chosen": -180.63648986816406, "logps/rejected": -347.7450866699219, "loss": 0.4954, "rewards/accuracies": 0.5, "rewards/chosen": -10.70089340209961, "rewards/margins": 13.824972152709961, "rewards/rejected": -24.525867462158203, "step": 1061 }, { "epoch": 1.18, "grad_norm": 1.2794055938720703, "learning_rate": 3.5544638858638304e-05, "logits/chosen": -0.13984552025794983, "logits/rejected": -0.12837979197502136, "logps/chosen": -191.57205200195312, "logps/rejected": -273.8128967285156, "loss": 0.0358, "rewards/accuracies": 1.0, "rewards/chosen": -10.441560745239258, "rewards/margins": 7.124518394470215, "rewards/rejected": -17.566078186035156, "step": 1062 }, { "epoch": 1.181111111111111, "grad_norm": 4.754730070999358e-06, "learning_rate": 3.5516868107590285e-05, "logits/chosen": -0.6120290756225586, "logits/rejected": -0.6132985353469849, "logps/chosen": -185.93893432617188, "logps/rejected": -281.1500549316406, "loss": 0.3466, "rewards/accuracies": 1.0, "rewards/chosen": -12.069352149963379, "rewards/margins": 8.681377410888672, "rewards/rejected": -20.750730514526367, "step": 1063 }, { "epoch": 1.1822222222222223, "grad_norm": 3.816186563199153e-06, "learning_rate": 3.5489081580016185e-05, "logits/chosen": -0.23850053548812866, "logits/rejected": -0.24317467212677002, "logps/chosen": -229.1615447998047, "logps/rejected": -630.7595825195312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -15.000492095947266, "rewards/margins": 30.997852325439453, "rewards/rejected": -45.99834442138672, "step": 1064 }, { "epoch": 1.1833333333333333, "grad_norm": 0.00147780601400882, "learning_rate": 3.546127931759903e-05, "logits/chosen": -0.5697752237319946, "logits/rejected": -0.5219024419784546, "logps/chosen": -264.9617919921875, "logps/rejected": -383.76373291015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -16.330738067626953, "rewards/margins": 12.117511749267578, "rewards/rejected": -28.44824981689453, "step": 1065 }, { "epoch": 1.1844444444444444, "grad_norm": 1.5442897165485192e-06, "learning_rate": 3.543346136204545e-05, "logits/chosen": -0.33329352736473083, "logits/rejected": -0.3135702610015869, "logps/chosen": -199.270751953125, "logps/rejected": -576.2095947265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.366874694824219, "rewards/margins": 31.34400177001953, "rewards/rejected": -44.71087646484375, "step": 1066 }, { "epoch": 1.1855555555555555, "grad_norm": 14.30867862701416, "learning_rate": 3.5405627755085616e-05, "logits/chosen": -0.5042968988418579, "logits/rejected": -0.5098111629486084, "logps/chosen": -105.30198669433594, "logps/rejected": -101.73469543457031, "loss": 0.7341, "rewards/accuracies": 0.0, "rewards/chosen": -6.201943397521973, "rewards/margins": -0.07879257202148438, "rewards/rejected": -6.123150825500488, "step": 1067 }, { "epoch": 1.1866666666666668, "grad_norm": 7.987827302713413e-06, "learning_rate": 3.537777853847318e-05, "logits/chosen": -0.29715538024902344, "logits/rejected": -0.27865731716156006, "logps/chosen": -295.7060241699219, "logps/rejected": -564.8291015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -18.6605281829834, "rewards/margins": 17.09929656982422, "rewards/rejected": -35.75982666015625, "step": 1068 }, { "epoch": 1.1877777777777778, "grad_norm": 4.136797904968262, "learning_rate": 3.534991375398521e-05, "logits/chosen": -0.5674909949302673, "logits/rejected": -0.5425808429718018, "logps/chosen": -237.177001953125, "logps/rejected": -360.2304992675781, "loss": 0.4476, "rewards/accuracies": 0.5, "rewards/chosen": -15.953513145446777, "rewards/margins": 9.338382720947266, "rewards/rejected": -25.29189682006836, "step": 1069 }, { "epoch": 1.1888888888888889, "grad_norm": 0.020911239087581635, "learning_rate": 3.532203344342212e-05, "logits/chosen": -0.11953594535589218, "logits/rejected": -0.10948672890663147, "logps/chosen": -407.7371826171875, "logps/rejected": -592.095703125, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -31.154539108276367, "rewards/margins": 14.94063949584961, "rewards/rejected": -46.095176696777344, "step": 1070 }, { "epoch": 1.19, "grad_norm": 27.80660629272461, "learning_rate": 3.5294137648607625e-05, "logits/chosen": -0.6482747793197632, "logits/rejected": -0.6459890604019165, "logps/chosen": -348.58935546875, "logps/rejected": -352.582275390625, "loss": 0.9236, "rewards/accuracies": 0.5, "rewards/chosen": -22.067119598388672, "rewards/margins": 2.638040542602539, "rewards/rejected": -24.705162048339844, "step": 1071 }, { "epoch": 1.1911111111111112, "grad_norm": 0.8924272656440735, "learning_rate": 3.526622641138866e-05, "logits/chosen": -0.3364531993865967, "logits/rejected": -0.3225747346878052, "logps/chosen": -640.93408203125, "logps/rejected": -856.12353515625, "loss": 0.0042, "rewards/accuracies": 1.0, "rewards/chosen": -43.41294479370117, "rewards/margins": 17.68737030029297, "rewards/rejected": -61.100311279296875, "step": 1072 }, { "epoch": 1.1922222222222223, "grad_norm": 58.204891204833984, "learning_rate": 3.523829977363533e-05, "logits/chosen": -0.4501991868019104, "logits/rejected": -0.5951912999153137, "logps/chosen": -412.91192626953125, "logps/rejected": -651.7562866210938, "loss": 2.8431, "rewards/accuracies": 0.5, "rewards/chosen": -25.18390464782715, "rewards/margins": 21.33102798461914, "rewards/rejected": -46.51493453979492, "step": 1073 }, { "epoch": 1.1933333333333334, "grad_norm": 0.07798010110855103, "learning_rate": 3.521035777724084e-05, "logits/chosen": -0.44379204511642456, "logits/rejected": -0.43577441573143005, "logps/chosen": -209.07911682128906, "logps/rejected": -331.37017822265625, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -13.308411598205566, "rewards/margins": 10.991839408874512, "rewards/rejected": -24.300251007080078, "step": 1074 }, { "epoch": 1.1944444444444444, "grad_norm": 1.772539734840393, "learning_rate": 3.518240046412144e-05, "logits/chosen": -0.5398101806640625, "logits/rejected": -0.5228912234306335, "logps/chosen": -553.2422485351562, "logps/rejected": -596.9129638671875, "loss": 0.0126, "rewards/accuracies": 1.0, "rewards/chosen": -36.44016647338867, "rewards/margins": 4.900063991546631, "rewards/rejected": -41.34022903442383, "step": 1075 }, { "epoch": 1.1955555555555555, "grad_norm": 7.570631980895996, "learning_rate": 3.515442787621635e-05, "logits/chosen": -0.5290051102638245, "logits/rejected": -0.531541645526886, "logps/chosen": -187.5355987548828, "logps/rejected": -241.09103393554688, "loss": 0.8714, "rewards/accuracies": 0.5, "rewards/chosen": -11.946968078613281, "rewards/margins": 4.849268913269043, "rewards/rejected": -16.79623794555664, "step": 1076 }, { "epoch": 1.1966666666666668, "grad_norm": 0.41807833313941956, "learning_rate": 3.512644005548771e-05, "logits/chosen": -1.0022629499435425, "logits/rejected": -0.7062435150146484, "logps/chosen": -188.06103515625, "logps/rejected": -415.7874450683594, "loss": 0.0095, "rewards/accuracies": 1.0, "rewards/chosen": -12.294387817382812, "rewards/margins": 12.556785583496094, "rewards/rejected": -24.851173400878906, "step": 1077 }, { "epoch": 1.1977777777777778, "grad_norm": 1.4274532986746635e-05, "learning_rate": 3.5098437043920505e-05, "logits/chosen": -0.3154919743537903, "logits/rejected": -0.26788246631622314, "logps/chosen": -203.8761444091797, "logps/rejected": -515.7097778320312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.014299392700195, "rewards/margins": 23.85393714904785, "rewards/rejected": -36.86823654174805, "step": 1078 }, { "epoch": 1.198888888888889, "grad_norm": 0.0027419966645538807, "learning_rate": 3.5070418883522515e-05, "logits/chosen": -0.3251504600048065, "logits/rejected": -0.32577943801879883, "logps/chosen": -157.51499938964844, "logps/rejected": -323.976806640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.847200393676758, "rewards/margins": 11.020326614379883, "rewards/rejected": -19.86752700805664, "step": 1079 }, { "epoch": 1.2, "grad_norm": 1.5922410488128662, "learning_rate": 3.504238561632424e-05, "logits/chosen": -0.5371301174163818, "logits/rejected": -0.5486058592796326, "logps/chosen": -161.06439208984375, "logps/rejected": -201.85812377929688, "loss": 0.034, "rewards/accuracies": 1.0, "rewards/chosen": -9.509645462036133, "rewards/margins": 3.589555501937866, "rewards/rejected": -13.099201202392578, "step": 1080 }, { "epoch": 1.2, "eval_logits/chosen": -0.7240422964096069, "eval_logits/rejected": -0.7191410660743713, "eval_logps/chosen": -274.7575988769531, "eval_logps/rejected": -401.846435546875, "eval_loss": 0.39044448733329773, "eval_rewards/accuracies": 0.8849999904632568, "eval_rewards/chosen": -18.69864845275879, "eval_rewards/margins": 10.028363227844238, "eval_rewards/rejected": -28.727012634277344, "eval_runtime": 86.5744, "eval_samples_per_second": 2.31, "eval_steps_per_second": 0.289, "step": 1080 }, { "epoch": 1.201111111111111, "grad_norm": 4.419209957122803, "learning_rate": 3.501433728437884e-05, "logits/chosen": -0.7584704756736755, "logits/rejected": -0.7661380767822266, "logps/chosen": -133.18344116210938, "logps/rejected": -185.68313598632812, "loss": 0.3918, "rewards/accuracies": 0.5, "rewards/chosen": -7.17563533782959, "rewards/margins": 4.400823593139648, "rewards/rejected": -11.576458930969238, "step": 1081 }, { "epoch": 1.2022222222222223, "grad_norm": 1.2319148778915405, "learning_rate": 3.498627392976208e-05, "logits/chosen": -0.6633173227310181, "logits/rejected": -0.6339872479438782, "logps/chosen": -328.7464294433594, "logps/rejected": -464.1333312988281, "loss": 0.0264, "rewards/accuracies": 1.0, "rewards/chosen": -24.58924102783203, "rewards/margins": 10.639493942260742, "rewards/rejected": -35.228736877441406, "step": 1082 }, { "epoch": 1.2033333333333334, "grad_norm": 0.0057974401861429214, "learning_rate": 3.495819559457226e-05, "logits/chosen": -0.491214394569397, "logits/rejected": -0.4646666944026947, "logps/chosen": -323.5763854980469, "logps/rejected": -522.9932861328125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -20.543014526367188, "rewards/margins": 17.159969329833984, "rewards/rejected": -37.70298385620117, "step": 1083 }, { "epoch": 1.2044444444444444, "grad_norm": 45.259925842285156, "learning_rate": 3.493010232093015e-05, "logits/chosen": -0.2753481864929199, "logits/rejected": -0.2797679901123047, "logps/chosen": -300.1786804199219, "logps/rejected": -342.16217041015625, "loss": 5.9945, "rewards/accuracies": 0.5, "rewards/chosen": -21.663604736328125, "rewards/margins": 4.158371925354004, "rewards/rejected": -25.821975708007812, "step": 1084 }, { "epoch": 1.2055555555555555, "grad_norm": 1.6673084246576764e-06, "learning_rate": 3.490199415097892e-05, "logits/chosen": -0.3647487759590149, "logits/rejected": -0.3298546075820923, "logps/chosen": -371.2220458984375, "logps/rejected": -677.98388671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -24.905620574951172, "rewards/margins": 19.023761749267578, "rewards/rejected": -43.92938232421875, "step": 1085 }, { "epoch": 1.2066666666666666, "grad_norm": 1.3878739046901956e-08, "learning_rate": 3.487387112688411e-05, "logits/chosen": -0.4827202558517456, "logits/rejected": -0.47118738293647766, "logps/chosen": -291.42694091796875, "logps/rejected": -596.033203125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -19.163101196289062, "rewards/margins": 23.112079620361328, "rewards/rejected": -42.275184631347656, "step": 1086 }, { "epoch": 1.2077777777777778, "grad_norm": 1.6586503982543945, "learning_rate": 3.484573329083353e-05, "logits/chosen": -0.34336331486701965, "logits/rejected": -0.3528388738632202, "logps/chosen": -307.13250732421875, "logps/rejected": -401.7021484375, "loss": 0.021, "rewards/accuracies": 1.0, "rewards/chosen": -19.933568954467773, "rewards/margins": 5.60068416595459, "rewards/rejected": -25.534252166748047, "step": 1087 }, { "epoch": 1.208888888888889, "grad_norm": 0.035426851361989975, "learning_rate": 3.481758068503719e-05, "logits/chosen": -0.6522742509841919, "logits/rejected": -0.6579524278640747, "logps/chosen": -216.2215118408203, "logps/rejected": -306.47283935546875, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -12.820235252380371, "rewards/margins": 9.160943984985352, "rewards/rejected": -21.981178283691406, "step": 1088 }, { "epoch": 1.21, "grad_norm": 0.2990629971027374, "learning_rate": 3.478941335172729e-05, "logits/chosen": -0.6369844079017639, "logits/rejected": -0.6214416027069092, "logps/chosen": -258.2081298828125, "logps/rejected": -460.86175537109375, "loss": 0.0097, "rewards/accuracies": 1.0, "rewards/chosen": -18.939998626708984, "rewards/margins": 17.168182373046875, "rewards/rejected": -36.108184814453125, "step": 1089 }, { "epoch": 1.211111111111111, "grad_norm": 29.704547882080078, "learning_rate": 3.476123133315811e-05, "logits/chosen": -0.38978511095046997, "logits/rejected": -0.4181896150112152, "logps/chosen": -559.383056640625, "logps/rejected": -651.7105712890625, "loss": 0.3246, "rewards/accuracies": 1.0, "rewards/chosen": -40.60301208496094, "rewards/margins": 5.573761940002441, "rewards/rejected": -46.17677307128906, "step": 1090 }, { "epoch": 1.2122222222222223, "grad_norm": 14.807245254516602, "learning_rate": 3.473303467160594e-05, "logits/chosen": -0.7056930661201477, "logits/rejected": -0.7010146379470825, "logps/chosen": -281.6546936035156, "logps/rejected": -332.8174743652344, "loss": 0.6841, "rewards/accuracies": 0.5, "rewards/chosen": -20.933792114257812, "rewards/margins": 3.381528854370117, "rewards/rejected": -24.31532096862793, "step": 1091 }, { "epoch": 1.2133333333333334, "grad_norm": 0.023032698780298233, "learning_rate": 3.470482340936907e-05, "logits/chosen": -0.652521550655365, "logits/rejected": -0.6219891905784607, "logps/chosen": -292.19659423828125, "logps/rejected": -511.33734130859375, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -18.480106353759766, "rewards/margins": 17.58205795288086, "rewards/rejected": -36.062164306640625, "step": 1092 }, { "epoch": 1.2144444444444444, "grad_norm": 0.15314069390296936, "learning_rate": 3.467659758876767e-05, "logits/chosen": -0.47803324460983276, "logits/rejected": -0.48324882984161377, "logps/chosen": -349.0363464355469, "logps/rejected": -432.5327453613281, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -23.61506462097168, "rewards/margins": 7.288515090942383, "rewards/rejected": -30.903579711914062, "step": 1093 }, { "epoch": 1.2155555555555555, "grad_norm": 0.16562683880329132, "learning_rate": 3.464835725214377e-05, "logits/chosen": -0.5046131610870361, "logits/rejected": -0.4962460994720459, "logps/chosen": -176.8943634033203, "logps/rejected": -282.51483154296875, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -13.02552318572998, "rewards/margins": 6.402409553527832, "rewards/rejected": -19.427932739257812, "step": 1094 }, { "epoch": 1.2166666666666668, "grad_norm": 0.9942495226860046, "learning_rate": 3.4620102441861143e-05, "logits/chosen": -0.6238248348236084, "logits/rejected": -0.6688826084136963, "logps/chosen": -325.091796875, "logps/rejected": -540.7859497070312, "loss": 0.0136, "rewards/accuracies": 1.0, "rewards/chosen": -23.803579330444336, "rewards/margins": 16.54971694946289, "rewards/rejected": -40.35329818725586, "step": 1095 }, { "epoch": 1.2177777777777778, "grad_norm": 1.7011141777038574, "learning_rate": 3.45918332003053e-05, "logits/chosen": -0.6046280264854431, "logits/rejected": -0.614544689655304, "logps/chosen": -230.225341796875, "logps/rejected": -335.5845947265625, "loss": 0.0387, "rewards/accuracies": 1.0, "rewards/chosen": -16.14960479736328, "rewards/margins": 7.919414043426514, "rewards/rejected": -24.069019317626953, "step": 1096 }, { "epoch": 1.218888888888889, "grad_norm": 0.19574995338916779, "learning_rate": 3.456354956988339e-05, "logits/chosen": -0.7238683700561523, "logits/rejected": -0.6978284120559692, "logps/chosen": -272.1901550292969, "logps/rejected": -379.18975830078125, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -21.579235076904297, "rewards/margins": 7.008890151977539, "rewards/rejected": -28.588125228881836, "step": 1097 }, { "epoch": 1.22, "grad_norm": 0.4970322251319885, "learning_rate": 3.453525159302415e-05, "logits/chosen": -0.6059907674789429, "logits/rejected": -0.5704008340835571, "logps/chosen": -308.0906982421875, "logps/rejected": -405.9208984375, "loss": 0.0066, "rewards/accuracies": 1.0, "rewards/chosen": -22.197505950927734, "rewards/margins": 6.502004623413086, "rewards/rejected": -28.699512481689453, "step": 1098 }, { "epoch": 1.221111111111111, "grad_norm": 1.295690655708313, "learning_rate": 3.450693931217785e-05, "logits/chosen": -0.309835821390152, "logits/rejected": -0.31937628984451294, "logps/chosen": -413.75311279296875, "logps/rejected": -440.19110107421875, "loss": 0.0217, "rewards/accuracies": 1.0, "rewards/chosen": -28.58177375793457, "rewards/margins": 3.938608169555664, "rewards/rejected": -32.520381927490234, "step": 1099 }, { "epoch": 1.2222222222222223, "grad_norm": 79.88685607910156, "learning_rate": 3.4478612769816196e-05, "logits/chosen": -0.9511440992355347, "logits/rejected": -0.9693904519081116, "logps/chosen": -358.2830810546875, "logps/rejected": -265.9544677734375, "loss": 7.9967, "rewards/accuracies": 0.5, "rewards/chosen": -26.252559661865234, "rewards/margins": -7.298425674438477, "rewards/rejected": -18.954133987426758, "step": 1100 }, { "epoch": 1.2233333333333334, "grad_norm": 0.0013196105137467384, "learning_rate": 3.445027200843229e-05, "logits/chosen": -0.7324624061584473, "logits/rejected": -0.72640061378479, "logps/chosen": -357.8897399902344, "logps/rejected": -516.7114868164062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -27.83576774597168, "rewards/margins": 14.21173095703125, "rewards/rejected": -42.04750061035156, "step": 1101 }, { "epoch": 1.2244444444444444, "grad_norm": 0.07496663928031921, "learning_rate": 3.442191707054059e-05, "logits/chosen": -0.6944974660873413, "logits/rejected": -0.6952128410339355, "logps/chosen": -430.5369873046875, "logps/rejected": -523.98779296875, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -34.64255905151367, "rewards/margins": 8.9041748046875, "rewards/rejected": -43.54673385620117, "step": 1102 }, { "epoch": 1.2255555555555555, "grad_norm": 3.730547541636042e-05, "learning_rate": 3.439354799867679e-05, "logits/chosen": -0.6063581109046936, "logits/rejected": -0.599156379699707, "logps/chosen": -430.53045654296875, "logps/rejected": -705.6959228515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -30.78040885925293, "rewards/margins": 23.323333740234375, "rewards/rejected": -54.10374450683594, "step": 1103 }, { "epoch": 1.2266666666666666, "grad_norm": 11.110870361328125, "learning_rate": 3.436516483539781e-05, "logits/chosen": -0.7097593545913696, "logits/rejected": -0.697634220123291, "logps/chosen": -196.64938354492188, "logps/rejected": -225.58868408203125, "loss": 0.3694, "rewards/accuracies": 0.5, "rewards/chosen": -13.52796745300293, "rewards/margins": 3.6115846633911133, "rewards/rejected": -17.139551162719727, "step": 1104 }, { "epoch": 1.2277777777777779, "grad_norm": 0.049757953733205795, "learning_rate": 3.433676762328168e-05, "logits/chosen": -0.5791199803352356, "logits/rejected": -0.6100703477859497, "logps/chosen": -236.711669921875, "logps/rejected": -371.44244384765625, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -15.922933578491211, "rewards/margins": 10.80508804321289, "rewards/rejected": -26.7280216217041, "step": 1105 }, { "epoch": 1.228888888888889, "grad_norm": 6.7820353507995605, "learning_rate": 3.430835640492754e-05, "logits/chosen": -1.0345323085784912, "logits/rejected": -0.8908888697624207, "logps/chosen": -313.11383056640625, "logps/rejected": -507.76324462890625, "loss": 0.2877, "rewards/accuracies": 1.0, "rewards/chosen": -20.791154861450195, "rewards/margins": 14.507792472839355, "rewards/rejected": -35.298946380615234, "step": 1106 }, { "epoch": 1.23, "grad_norm": 7.950656026878278e-07, "learning_rate": 3.427993122295552e-05, "logits/chosen": -0.40320831537246704, "logits/rejected": -0.39040225744247437, "logps/chosen": -333.4339294433594, "logps/rejected": -649.7603759765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -24.072505950927734, "rewards/margins": 26.32532501220703, "rewards/rejected": -50.39783477783203, "step": 1107 }, { "epoch": 1.231111111111111, "grad_norm": 1.729838186292909e-05, "learning_rate": 3.42514921200067e-05, "logits/chosen": -0.6093225479125977, "logits/rejected": -0.6162161231040955, "logps/chosen": -694.8660278320312, "logps/rejected": -1037.535400390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -55.366432189941406, "rewards/margins": 23.850921630859375, "rewards/rejected": -79.21735382080078, "step": 1108 }, { "epoch": 1.232222222222222, "grad_norm": 0.31004413962364197, "learning_rate": 3.422303913874305e-05, "logits/chosen": -0.7341557741165161, "logits/rejected": -0.7153472900390625, "logps/chosen": -316.6192626953125, "logps/rejected": -451.40814208984375, "loss": 0.0074, "rewards/accuracies": 1.0, "rewards/chosen": -23.948322296142578, "rewards/margins": 9.954815864562988, "rewards/rejected": -33.90313720703125, "step": 1109 }, { "epoch": 1.2333333333333334, "grad_norm": 55.825313568115234, "learning_rate": 3.4194572321847336e-05, "logits/chosen": -0.7702044248580933, "logits/rejected": -0.7747737169265747, "logps/chosen": -493.16009521484375, "logps/rejected": -523.1885986328125, "loss": 2.043, "rewards/accuracies": 0.5, "rewards/chosen": -38.47895431518555, "rewards/margins": 2.566777229309082, "rewards/rejected": -41.04573059082031, "step": 1110 }, { "epoch": 1.2344444444444445, "grad_norm": 0.4041297733783722, "learning_rate": 3.4166091712023106e-05, "logits/chosen": -0.8434622287750244, "logits/rejected": -0.8324259519577026, "logps/chosen": -323.7066650390625, "logps/rejected": -387.35418701171875, "loss": 0.0066, "rewards/accuracies": 1.0, "rewards/chosen": -25.663333892822266, "rewards/margins": 5.1223554611206055, "rewards/rejected": -30.785688400268555, "step": 1111 }, { "epoch": 1.2355555555555555, "grad_norm": 0.016534410417079926, "learning_rate": 3.413759735199459e-05, "logits/chosen": -0.7223848104476929, "logits/rejected": -0.7047760486602783, "logps/chosen": -580.625732421875, "logps/rejected": -695.560546875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -46.112918853759766, "rewards/margins": 11.011363983154297, "rewards/rejected": -57.12428283691406, "step": 1112 }, { "epoch": 1.2366666666666666, "grad_norm": 2.6880834980147483e-07, "learning_rate": 3.410908928450665e-05, "logits/chosen": -0.6876155138015747, "logits/rejected": -0.6786773204803467, "logps/chosen": -358.447265625, "logps/rejected": -628.7307739257812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -28.751808166503906, "rewards/margins": 23.579448699951172, "rewards/rejected": -52.33125686645508, "step": 1113 }, { "epoch": 1.2377777777777779, "grad_norm": 1.319317789238994e-06, "learning_rate": 3.40805675523247e-05, "logits/chosen": -0.853351354598999, "logits/rejected": -0.8134815096855164, "logps/chosen": -196.50186157226562, "logps/rejected": -429.27288818359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.893254280090332, "rewards/margins": 18.1531925201416, "rewards/rejected": -32.04644775390625, "step": 1114 }, { "epoch": 1.238888888888889, "grad_norm": 0.08275783807039261, "learning_rate": 3.405203219823466e-05, "logits/chosen": -0.6258403658866882, "logits/rejected": -0.6324753761291504, "logps/chosen": -500.6219482421875, "logps/rejected": -611.31298828125, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -40.079254150390625, "rewards/margins": 7.629915237426758, "rewards/rejected": -47.709171295166016, "step": 1115 }, { "epoch": 1.24, "grad_norm": 0.03054768778383732, "learning_rate": 3.4023483265042874e-05, "logits/chosen": -0.9271330833435059, "logits/rejected": -0.9292842149734497, "logps/chosen": -238.4080352783203, "logps/rejected": -305.1952209472656, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -14.87767219543457, "rewards/margins": 8.35052490234375, "rewards/rejected": -23.22819709777832, "step": 1116 }, { "epoch": 1.241111111111111, "grad_norm": 0.05150346830487251, "learning_rate": 3.3994920795576065e-05, "logits/chosen": -0.9200377464294434, "logits/rejected": -0.9325908422470093, "logps/chosen": -419.97491455078125, "logps/rejected": -548.5128784179688, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -34.219547271728516, "rewards/margins": 9.952924728393555, "rewards/rejected": -44.17247009277344, "step": 1117 }, { "epoch": 1.2422222222222223, "grad_norm": 4.298266742530643e-10, "learning_rate": 3.396634483268126e-05, "logits/chosen": -0.5890426635742188, "logits/rejected": -0.6064373254776001, "logps/chosen": -728.938232421875, "logps/rejected": -1095.3837890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -52.547786712646484, "rewards/margins": 28.467098236083984, "rewards/rejected": -81.01488494873047, "step": 1118 }, { "epoch": 1.2433333333333334, "grad_norm": 0.00010779834701679647, "learning_rate": 3.393775541922575e-05, "logits/chosen": -0.704108476638794, "logits/rejected": -0.724443793296814, "logps/chosen": -425.9281311035156, "logps/rejected": -650.9085083007812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -36.270721435546875, "rewards/margins": 17.63309669494629, "rewards/rejected": -53.90381622314453, "step": 1119 }, { "epoch": 1.2444444444444445, "grad_norm": 1.4394936561584473, "learning_rate": 3.390915259809696e-05, "logits/chosen": -0.8380403518676758, "logits/rejected": -0.849768877029419, "logps/chosen": -402.96343994140625, "logps/rejected": -507.44512939453125, "loss": 0.0379, "rewards/accuracies": 1.0, "rewards/chosen": -32.354888916015625, "rewards/margins": 8.498493194580078, "rewards/rejected": -40.8533821105957, "step": 1120 }, { "epoch": 1.2455555555555555, "grad_norm": 6.258552343041401e-09, "learning_rate": 3.388053641220246e-05, "logits/chosen": -0.8635650873184204, "logits/rejected": -0.8644813299179077, "logps/chosen": -601.1712646484375, "logps/rejected": -931.6651000976562, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -51.51227569580078, "rewards/margins": 25.516435623168945, "rewards/rejected": -77.02871704101562, "step": 1121 }, { "epoch": 1.2466666666666666, "grad_norm": 0.43595561385154724, "learning_rate": 3.385190690446984e-05, "logits/chosen": -0.9358810186386108, "logits/rejected": -0.8962790966033936, "logps/chosen": -203.76364135742188, "logps/rejected": -267.8464050292969, "loss": 0.0147, "rewards/accuracies": 1.0, "rewards/chosen": -13.276731491088867, "rewards/margins": 4.57404899597168, "rewards/rejected": -17.850780487060547, "step": 1122 }, { "epoch": 1.2477777777777779, "grad_norm": 21.98056411743164, "learning_rate": 3.382326411784672e-05, "logits/chosen": -0.9007632732391357, "logits/rejected": -0.8931815028190613, "logps/chosen": -277.02874755859375, "logps/rejected": -289.70184326171875, "loss": 0.386, "rewards/accuracies": 0.5, "rewards/chosen": -21.363128662109375, "rewards/margins": 1.5860786437988281, "rewards/rejected": -22.949207305908203, "step": 1123 }, { "epoch": 1.248888888888889, "grad_norm": 2.5713279247283936, "learning_rate": 3.379460809530061e-05, "logits/chosen": -0.8587130308151245, "logits/rejected": -0.8488739132881165, "logps/chosen": -417.41748046875, "logps/rejected": -537.6888427734375, "loss": 0.0289, "rewards/accuracies": 1.0, "rewards/chosen": -33.74488067626953, "rewards/margins": 7.661352157592773, "rewards/rejected": -41.40623474121094, "step": 1124 }, { "epoch": 1.25, "grad_norm": 0.019138788804411888, "learning_rate": 3.376593887981887e-05, "logits/chosen": -0.536022424697876, "logits/rejected": -0.5411964654922485, "logps/chosen": -976.619873046875, "logps/rejected": -1098.943603515625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -84.53974914550781, "rewards/margins": 11.256641387939453, "rewards/rejected": -95.79638671875, "step": 1125 }, { "epoch": 1.251111111111111, "grad_norm": 3.478949785232544, "learning_rate": 3.3737256514408644e-05, "logits/chosen": -0.9007757902145386, "logits/rejected": -0.8951504230499268, "logps/chosen": -136.51658630371094, "logps/rejected": -171.02792358398438, "loss": 0.0916, "rewards/accuracies": 1.0, "rewards/chosen": -9.597746849060059, "rewards/margins": 3.996220827102661, "rewards/rejected": -13.59396743774414, "step": 1126 }, { "epoch": 1.2522222222222221, "grad_norm": 0.5720980763435364, "learning_rate": 3.370856104209685e-05, "logits/chosen": -0.8376792669296265, "logits/rejected": -0.8401541709899902, "logps/chosen": -298.08990478515625, "logps/rejected": -361.74603271484375, "loss": 0.0136, "rewards/accuracies": 1.0, "rewards/chosen": -20.250972747802734, "rewards/margins": 5.764504909515381, "rewards/rejected": -26.01547622680664, "step": 1127 }, { "epoch": 1.2533333333333334, "grad_norm": 0.0001507470296928659, "learning_rate": 3.367985250593001e-05, "logits/chosen": -0.8247244358062744, "logits/rejected": -0.8362586498260498, "logps/chosen": -446.62060546875, "logps/rejected": -738.9429931640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -35.9389533996582, "rewards/margins": 23.152740478515625, "rewards/rejected": -59.09169387817383, "step": 1128 }, { "epoch": 1.2544444444444445, "grad_norm": 8.28089177957736e-05, "learning_rate": 3.365113094897429e-05, "logits/chosen": -0.8901547789573669, "logits/rejected": -0.8906691074371338, "logps/chosen": -380.3721923828125, "logps/rejected": -581.781494140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -27.22945785522461, "rewards/margins": 15.149023056030273, "rewards/rejected": -42.37847900390625, "step": 1129 }, { "epoch": 1.2555555555555555, "grad_norm": 0.18194514513015747, "learning_rate": 3.362239641431536e-05, "logits/chosen": -0.8842751979827881, "logits/rejected": -0.8642561435699463, "logps/chosen": -504.2742614746094, "logps/rejected": -741.3284301757812, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -38.279747009277344, "rewards/margins": 15.945014953613281, "rewards/rejected": -54.224761962890625, "step": 1130 }, { "epoch": 1.2566666666666666, "grad_norm": 0.0002852333418559283, "learning_rate": 3.359364894505836e-05, "logits/chosen": -0.807183027267456, "logits/rejected": -0.8318166732788086, "logps/chosen": -528.0140991210938, "logps/rejected": -728.3021850585938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -42.41314697265625, "rewards/margins": 17.061548233032227, "rewards/rejected": -59.474693298339844, "step": 1131 }, { "epoch": 1.2577777777777777, "grad_norm": 36.939964294433594, "learning_rate": 3.3564888584327835e-05, "logits/chosen": -0.8024373054504395, "logits/rejected": -0.7951889038085938, "logps/chosen": -600.5908203125, "logps/rejected": -675.2410888671875, "loss": 2.282, "rewards/accuracies": 0.5, "rewards/chosen": -47.58341979980469, "rewards/margins": 7.236835479736328, "rewards/rejected": -54.82025909423828, "step": 1132 }, { "epoch": 1.258888888888889, "grad_norm": 91.10792541503906, "learning_rate": 3.353611537526769e-05, "logits/chosen": -1.0141475200653076, "logits/rejected": -1.0594055652618408, "logps/chosen": -420.58123779296875, "logps/rejected": -474.4754638671875, "loss": 1.1219, "rewards/accuracies": 0.5, "rewards/chosen": -34.26728820800781, "rewards/margins": 3.9202098846435547, "rewards/rejected": -38.1875, "step": 1133 }, { "epoch": 1.26, "grad_norm": 79.47479248046875, "learning_rate": 3.350732936104108e-05, "logits/chosen": -0.9485384225845337, "logits/rejected": -0.9522465467453003, "logps/chosen": -519.9869995117188, "logps/rejected": -575.281005859375, "loss": 2.1895, "rewards/accuracies": 0.5, "rewards/chosen": -41.99152374267578, "rewards/margins": 4.212804794311523, "rewards/rejected": -46.20432662963867, "step": 1134 }, { "epoch": 1.261111111111111, "grad_norm": 89.88656616210938, "learning_rate": 3.347853058483037e-05, "logits/chosen": -0.8892645835876465, "logits/rejected": -1.0331355333328247, "logps/chosen": -587.7260131835938, "logps/rejected": -893.2612915039062, "loss": 13.2794, "rewards/accuracies": 0.5, "rewards/chosen": -50.68217468261719, "rewards/margins": 18.078933715820312, "rewards/rejected": -68.7611083984375, "step": 1135 }, { "epoch": 1.2622222222222224, "grad_norm": 11.960118293762207, "learning_rate": 3.3449719089837085e-05, "logits/chosen": -0.6617128849029541, "logits/rejected": -0.6512296199798584, "logps/chosen": -358.5455322265625, "logps/rejected": -479.0328369140625, "loss": 0.1201, "rewards/accuracies": 1.0, "rewards/chosen": -26.823665618896484, "rewards/margins": 10.368813514709473, "rewards/rejected": -37.192481994628906, "step": 1136 }, { "epoch": 1.2633333333333332, "grad_norm": 0.01516872551292181, "learning_rate": 3.3420894919281816e-05, "logits/chosen": -0.705463707447052, "logits/rejected": -0.7183117866516113, "logps/chosen": -423.51763916015625, "logps/rejected": -635.9431762695312, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -35.19464874267578, "rewards/margins": 16.7429256439209, "rewards/rejected": -51.93757247924805, "step": 1137 }, { "epoch": 1.2644444444444445, "grad_norm": 3.5375983715057373, "learning_rate": 3.339205811640417e-05, "logits/chosen": -1.031968593597412, "logits/rejected": -1.0058989524841309, "logps/chosen": -226.5491180419922, "logps/rejected": -329.05072021484375, "loss": 0.071, "rewards/accuracies": 1.0, "rewards/chosen": -13.843762397766113, "rewards/margins": 9.29403305053711, "rewards/rejected": -23.137794494628906, "step": 1138 }, { "epoch": 1.2655555555555555, "grad_norm": 0.4200778007507324, "learning_rate": 3.3363208724462714e-05, "logits/chosen": -0.9081524610519409, "logits/rejected": -0.9106837511062622, "logps/chosen": -298.6524658203125, "logps/rejected": -566.8109741210938, "loss": 0.0107, "rewards/accuracies": 1.0, "rewards/chosen": -22.116418838500977, "rewards/margins": 21.678335189819336, "rewards/rejected": -43.79475402832031, "step": 1139 }, { "epoch": 1.2666666666666666, "grad_norm": 6.26518958597444e-05, "learning_rate": 3.333434678673489e-05, "logits/chosen": -0.6706831455230713, "logits/rejected": -0.6817194223403931, "logps/chosen": -287.8787841796875, "logps/rejected": -472.81268310546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -21.69680404663086, "rewards/margins": 15.285782814025879, "rewards/rejected": -36.98258590698242, "step": 1140 }, { "epoch": 1.267777777777778, "grad_norm": 0.08974367380142212, "learning_rate": 3.330547234651696e-05, "logits/chosen": -0.7628417015075684, "logits/rejected": -0.7563062310218811, "logps/chosen": -246.45582580566406, "logps/rejected": -415.51141357421875, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -17.333879470825195, "rewards/margins": 13.991189956665039, "rewards/rejected": -31.325069427490234, "step": 1141 }, { "epoch": 1.268888888888889, "grad_norm": 1.4994750022888184, "learning_rate": 3.327658544712395e-05, "logits/chosen": -0.8366434574127197, "logits/rejected": -0.8159940838813782, "logps/chosen": -283.1402282714844, "logps/rejected": -295.338134765625, "loss": 0.01, "rewards/accuracies": 1.0, "rewards/chosen": -18.621004104614258, "rewards/margins": 5.0576300621032715, "rewards/rejected": -23.678634643554688, "step": 1142 }, { "epoch": 1.27, "grad_norm": 0.011902649886906147, "learning_rate": 3.3247686131889574e-05, "logits/chosen": -0.746666669845581, "logits/rejected": -0.7609056234359741, "logps/chosen": -287.7684326171875, "logps/rejected": -395.8873291015625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -20.458106994628906, "rewards/margins": 10.425792694091797, "rewards/rejected": -30.883899688720703, "step": 1143 }, { "epoch": 1.271111111111111, "grad_norm": 1.2298708895741584e-07, "learning_rate": 3.3218774444166165e-05, "logits/chosen": -0.6406785249710083, "logits/rejected": -0.6684203147888184, "logps/chosen": -325.453857421875, "logps/rejected": -635.9059448242188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -24.45821762084961, "rewards/margins": 21.48630142211914, "rewards/rejected": -45.94451904296875, "step": 1144 }, { "epoch": 1.2722222222222221, "grad_norm": 14.36624526977539, "learning_rate": 3.318985042732461e-05, "logits/chosen": -0.6001132130622864, "logits/rejected": -0.6039655208587646, "logps/chosen": -359.09747314453125, "logps/rejected": -594.1448974609375, "loss": 0.4771, "rewards/accuracies": 0.5, "rewards/chosen": -26.300575256347656, "rewards/margins": 16.40939712524414, "rewards/rejected": -42.7099723815918, "step": 1145 }, { "epoch": 1.2733333333333334, "grad_norm": 0.00683129345998168, "learning_rate": 3.316091412475431e-05, "logits/chosen": -0.7902097702026367, "logits/rejected": -0.7686789035797119, "logps/chosen": -533.58544921875, "logps/rejected": -705.8438720703125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -43.03788757324219, "rewards/margins": 12.68057632446289, "rewards/rejected": -55.71846389770508, "step": 1146 }, { "epoch": 1.2744444444444445, "grad_norm": 0.0020666734781116247, "learning_rate": 3.313196557986308e-05, "logits/chosen": -0.4348980188369751, "logits/rejected": -0.4338167607784271, "logps/chosen": -276.23333740234375, "logps/rejected": -432.3183898925781, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -18.410634994506836, "rewards/margins": 13.101032257080078, "rewards/rejected": -31.51166534423828, "step": 1147 }, { "epoch": 1.2755555555555556, "grad_norm": 0.6405504941940308, "learning_rate": 3.3103004836077114e-05, "logits/chosen": -0.9221420288085938, "logits/rejected": -0.9253071546554565, "logps/chosen": -111.90196990966797, "logps/rejected": -166.250732421875, "loss": 0.033, "rewards/accuracies": 1.0, "rewards/chosen": -6.754261016845703, "rewards/margins": 5.025030136108398, "rewards/rejected": -11.779291152954102, "step": 1148 }, { "epoch": 1.2766666666666666, "grad_norm": 0.010565209202468395, "learning_rate": 3.3074031936840896e-05, "logits/chosen": -0.7106007933616638, "logits/rejected": -0.723418116569519, "logps/chosen": -351.8512878417969, "logps/rejected": -493.6191711425781, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -26.872020721435547, "rewards/margins": 11.64608383178711, "rewards/rejected": -38.518104553222656, "step": 1149 }, { "epoch": 1.2777777777777777, "grad_norm": 0.000831556913908571, "learning_rate": 3.3045046925617146e-05, "logits/chosen": -0.7987624406814575, "logits/rejected": -0.7861530780792236, "logps/chosen": -215.88177490234375, "logps/rejected": -430.49322509765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.786880493164062, "rewards/margins": 15.176080703735352, "rewards/rejected": -29.96295928955078, "step": 1150 }, { "epoch": 1.278888888888889, "grad_norm": 0.8909220099449158, "learning_rate": 3.3016049845886753e-05, "logits/chosen": -0.895613431930542, "logits/rejected": -0.8768013715744019, "logps/chosen": -190.413818359375, "logps/rejected": -251.11610412597656, "loss": 0.0255, "rewards/accuracies": 1.0, "rewards/chosen": -15.200081825256348, "rewards/margins": 3.6632471084594727, "rewards/rejected": -18.86332893371582, "step": 1151 }, { "epoch": 1.28, "grad_norm": 0.010402925312519073, "learning_rate": 3.29870407411487e-05, "logits/chosen": -0.8855157494544983, "logits/rejected": -0.871821403503418, "logps/chosen": -358.6158447265625, "logps/rejected": -461.73284912109375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -26.228843688964844, "rewards/margins": 10.966593742370605, "rewards/rejected": -37.195438385009766, "step": 1152 }, { "epoch": 1.281111111111111, "grad_norm": 0.00039449046016670763, "learning_rate": 3.2958019654920044e-05, "logits/chosen": -0.578254759311676, "logits/rejected": -0.5665132999420166, "logps/chosen": -339.43255615234375, "logps/rejected": -504.1581115722656, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -24.667224884033203, "rewards/margins": 13.33066463470459, "rewards/rejected": -37.997886657714844, "step": 1153 }, { "epoch": 1.2822222222222222, "grad_norm": 0.028207123279571533, "learning_rate": 3.2928986630735784e-05, "logits/chosen": -0.6106897592544556, "logits/rejected": -0.5920702815055847, "logps/chosen": -317.54193115234375, "logps/rejected": -401.0167541503906, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -22.913928985595703, "rewards/margins": 9.498872756958008, "rewards/rejected": -32.41279983520508, "step": 1154 }, { "epoch": 1.2833333333333332, "grad_norm": 1.2157349793540106e-08, "learning_rate": 3.289994171214882e-05, "logits/chosen": -0.7893879413604736, "logits/rejected": -0.7826883792877197, "logps/chosen": -450.8855285644531, "logps/rejected": -751.4498291015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -34.60601806640625, "rewards/margins": 24.740253448486328, "rewards/rejected": -59.346275329589844, "step": 1155 }, { "epoch": 1.2844444444444445, "grad_norm": 1.812753677368164, "learning_rate": 3.287088494272992e-05, "logits/chosen": -0.5327188968658447, "logits/rejected": -0.5603907108306885, "logps/chosen": -297.5242614746094, "logps/rejected": -346.69232177734375, "loss": 0.0367, "rewards/accuracies": 1.0, "rewards/chosen": -22.168371200561523, "rewards/margins": 3.3064780235290527, "rewards/rejected": -25.474849700927734, "step": 1156 }, { "epoch": 1.2855555555555556, "grad_norm": 0.7429085373878479, "learning_rate": 3.284181636606762e-05, "logits/chosen": -0.5918606519699097, "logits/rejected": -0.5810333490371704, "logps/chosen": -290.2857360839844, "logps/rejected": -357.5367126464844, "loss": 0.0169, "rewards/accuracies": 1.0, "rewards/chosen": -21.779491424560547, "rewards/margins": 5.140336990356445, "rewards/rejected": -26.919830322265625, "step": 1157 }, { "epoch": 1.2866666666666666, "grad_norm": 0.06093937158584595, "learning_rate": 3.281273602576816e-05, "logits/chosen": -0.7295757532119751, "logits/rejected": -0.7856974601745605, "logps/chosen": -284.7388916015625, "logps/rejected": -409.8436279296875, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -20.84883689880371, "rewards/margins": 8.707620620727539, "rewards/rejected": -29.55645751953125, "step": 1158 }, { "epoch": 1.287777777777778, "grad_norm": 0.011593243107199669, "learning_rate": 3.2783643965455465e-05, "logits/chosen": -0.6375495195388794, "logits/rejected": -0.6458132266998291, "logps/chosen": -370.7925109863281, "logps/rejected": -632.815185546875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -26.687501907348633, "rewards/margins": 21.6215877532959, "rewards/rejected": -48.30908966064453, "step": 1159 }, { "epoch": 1.2888888888888888, "grad_norm": 0.005463710054755211, "learning_rate": 3.275454022877097e-05, "logits/chosen": -0.5524839162826538, "logits/rejected": -0.5653072595596313, "logps/chosen": -394.6564636230469, "logps/rejected": -623.16455078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -24.705101013183594, "rewards/margins": 19.964183807373047, "rewards/rejected": -44.66928482055664, "step": 1160 }, { "epoch": 1.29, "grad_norm": 0.18523553013801575, "learning_rate": 3.272542485937369e-05, "logits/chosen": -0.7231124639511108, "logits/rejected": -0.6982595920562744, "logps/chosen": -254.26405334472656, "logps/rejected": -357.7164611816406, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": -18.68618392944336, "rewards/margins": 5.927525520324707, "rewards/rejected": -24.61370849609375, "step": 1161 }, { "epoch": 1.291111111111111, "grad_norm": 0.0013610488967970014, "learning_rate": 3.269629790094006e-05, "logits/chosen": -0.772537112236023, "logits/rejected": -0.7729246616363525, "logps/chosen": -350.53460693359375, "logps/rejected": -514.8486328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -27.737224578857422, "rewards/margins": 13.514721870422363, "rewards/rejected": -41.25194549560547, "step": 1162 }, { "epoch": 1.2922222222222222, "grad_norm": 6.356735866575036e-06, "learning_rate": 3.2667159397163916e-05, "logits/chosen": -0.9731773138046265, "logits/rejected": -0.997789204120636, "logps/chosen": -298.83013916015625, "logps/rejected": -519.6799926757812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -21.8809814453125, "rewards/margins": 16.9765625, "rewards/rejected": -38.8575439453125, "step": 1163 }, { "epoch": 1.2933333333333334, "grad_norm": 1.4179081916809082, "learning_rate": 3.26380093917564e-05, "logits/chosen": -0.8241095542907715, "logits/rejected": -0.8272333145141602, "logps/chosen": -373.5196533203125, "logps/rejected": -445.2720947265625, "loss": 0.0378, "rewards/accuracies": 1.0, "rewards/chosen": -28.776416778564453, "rewards/margins": 6.853231430053711, "rewards/rejected": -35.62964630126953, "step": 1164 }, { "epoch": 1.2944444444444445, "grad_norm": 4.382749557495117, "learning_rate": 3.26088479284459e-05, "logits/chosen": -0.7376973628997803, "logits/rejected": -0.7447601556777954, "logps/chosen": -282.1229248046875, "logps/rejected": -302.7000427246094, "loss": 0.0673, "rewards/accuracies": 1.0, "rewards/chosen": -18.998294830322266, "rewards/margins": 2.665947914123535, "rewards/rejected": -21.664243698120117, "step": 1165 }, { "epoch": 1.2955555555555556, "grad_norm": 0.9470803141593933, "learning_rate": 3.257967505097803e-05, "logits/chosen": -0.8929893970489502, "logits/rejected": -0.9083536267280579, "logps/chosen": -269.4893493652344, "logps/rejected": -361.02557373046875, "loss": 0.0186, "rewards/accuracies": 1.0, "rewards/chosen": -21.430280685424805, "rewards/margins": 6.866990089416504, "rewards/rejected": -28.297271728515625, "step": 1166 }, { "epoch": 1.2966666666666666, "grad_norm": 0.06599203497171402, "learning_rate": 3.2550490803115474e-05, "logits/chosen": -0.9167760610580444, "logits/rejected": -0.8943976163864136, "logps/chosen": -385.3570251464844, "logps/rejected": -499.8207092285156, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -29.642871856689453, "rewards/margins": 9.463091850280762, "rewards/rejected": -39.10596466064453, "step": 1167 }, { "epoch": 1.2977777777777777, "grad_norm": 4.370644091977738e-05, "learning_rate": 3.252129522863802e-05, "logits/chosen": -0.7923968434333801, "logits/rejected": -0.800431489944458, "logps/chosen": -465.63897705078125, "logps/rejected": -690.8274536132812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -36.349754333496094, "rewards/margins": 18.952224731445312, "rewards/rejected": -55.301979064941406, "step": 1168 }, { "epoch": 1.298888888888889, "grad_norm": 8.885181427001953, "learning_rate": 3.249208837134243e-05, "logits/chosen": -0.9901947975158691, "logits/rejected": -0.9846863746643066, "logps/chosen": -290.4068298339844, "logps/rejected": -348.74884033203125, "loss": 0.2457, "rewards/accuracies": 1.0, "rewards/chosen": -20.71703338623047, "rewards/margins": 6.368098258972168, "rewards/rejected": -27.08513069152832, "step": 1169 }, { "epoch": 1.3, "grad_norm": 1.4440867062148754e-06, "learning_rate": 3.246287027504237e-05, "logits/chosen": -0.8466595411300659, "logits/rejected": -0.8135979175567627, "logps/chosen": -590.1613159179688, "logps/rejected": -843.0851440429688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -41.654232025146484, "rewards/margins": 25.569713592529297, "rewards/rejected": -67.22394561767578, "step": 1170 }, { "epoch": 1.301111111111111, "grad_norm": 1.2141258594056126e-05, "learning_rate": 3.2433640983568394e-05, "logits/chosen": -0.8636716604232788, "logits/rejected": -0.8702871799468994, "logps/chosen": -371.198974609375, "logps/rejected": -626.307373046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -27.93106460571289, "rewards/margins": 20.54737091064453, "rewards/rejected": -48.47843551635742, "step": 1171 }, { "epoch": 1.3022222222222222, "grad_norm": 5.9382476806640625, "learning_rate": 3.240440054076784e-05, "logits/chosen": -0.9579585790634155, "logits/rejected": -0.9786052107810974, "logps/chosen": -480.3575439453125, "logps/rejected": -666.4959716796875, "loss": 0.0436, "rewards/accuracies": 1.0, "rewards/chosen": -35.05200958251953, "rewards/margins": 17.681413650512695, "rewards/rejected": -52.733421325683594, "step": 1172 }, { "epoch": 1.3033333333333332, "grad_norm": 1.6187493801116943, "learning_rate": 3.2375148990504776e-05, "logits/chosen": -0.9418230056762695, "logits/rejected": -0.9389443397521973, "logps/chosen": -232.81692504882812, "logps/rejected": -421.66351318359375, "loss": 0.1209, "rewards/accuracies": 1.0, "rewards/chosen": -17.931442260742188, "rewards/margins": 15.934490203857422, "rewards/rejected": -33.86593246459961, "step": 1173 }, { "epoch": 1.3044444444444445, "grad_norm": 4.686391548602842e-05, "learning_rate": 3.2345886376659946e-05, "logits/chosen": -0.9522944688796997, "logits/rejected": -0.9557245969772339, "logps/chosen": -294.54339599609375, "logps/rejected": -447.5179443359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -22.8929443359375, "rewards/margins": 14.528725624084473, "rewards/rejected": -37.421669006347656, "step": 1174 }, { "epoch": 1.3055555555555556, "grad_norm": 0.000556872459128499, "learning_rate": 3.231661274313065e-05, "logits/chosen": -0.8321289420127869, "logits/rejected": -0.8234414458274841, "logps/chosen": -207.49530029296875, "logps/rejected": -342.54034423828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.694061279296875, "rewards/margins": 12.22831916809082, "rewards/rejected": -24.922380447387695, "step": 1175 }, { "epoch": 1.3066666666666666, "grad_norm": 0.006023135501891375, "learning_rate": 3.228732813383076e-05, "logits/chosen": -0.9252109527587891, "logits/rejected": -0.9417662620544434, "logps/chosen": -364.71392822265625, "logps/rejected": -663.0241088867188, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -28.458208084106445, "rewards/margins": 20.513357162475586, "rewards/rejected": -48.97156524658203, "step": 1176 }, { "epoch": 1.3077777777777777, "grad_norm": 0.0050237602554261684, "learning_rate": 3.225803259269059e-05, "logits/chosen": -0.9057779312133789, "logits/rejected": -0.9020297527313232, "logps/chosen": -313.6606140136719, "logps/rejected": -459.46148681640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -24.072866439819336, "rewards/margins": 13.720565795898438, "rewards/rejected": -37.793434143066406, "step": 1177 }, { "epoch": 1.3088888888888888, "grad_norm": 0.13464303314685822, "learning_rate": 3.222872616365687e-05, "logits/chosen": -0.9377118349075317, "logits/rejected": -0.9456422924995422, "logps/chosen": -422.6854248046875, "logps/rejected": -512.5366821289062, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -31.305912017822266, "rewards/margins": 9.605637550354004, "rewards/rejected": -40.91154861450195, "step": 1178 }, { "epoch": 1.31, "grad_norm": 0.00012125380453653634, "learning_rate": 3.2199408890692655e-05, "logits/chosen": -0.867013692855835, "logits/rejected": -0.8607621192932129, "logps/chosen": -413.7069091796875, "logps/rejected": -677.07080078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -31.762115478515625, "rewards/margins": 20.59077262878418, "rewards/rejected": -52.35289001464844, "step": 1179 }, { "epoch": 1.3111111111111111, "grad_norm": 0.010083724744617939, "learning_rate": 3.217008081777726e-05, "logits/chosen": -0.8365203142166138, "logits/rejected": -0.8953075408935547, "logps/chosen": -470.27435302734375, "logps/rejected": -667.39404296875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -32.258384704589844, "rewards/margins": 15.45273208618164, "rewards/rejected": -47.711116790771484, "step": 1180 }, { "epoch": 1.3122222222222222, "grad_norm": 0.02203153818845749, "learning_rate": 3.214074198890621e-05, "logits/chosen": -0.662952184677124, "logits/rejected": -0.6807578802108765, "logps/chosen": -336.05718994140625, "logps/rejected": -433.7147521972656, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -23.939743041992188, "rewards/margins": 8.638450622558594, "rewards/rejected": -32.57819366455078, "step": 1181 }, { "epoch": 1.3133333333333335, "grad_norm": 0.019086534157395363, "learning_rate": 3.211139244809115e-05, "logits/chosen": -0.8337119221687317, "logits/rejected": -0.844194233417511, "logps/chosen": -315.14056396484375, "logps/rejected": -525.7843017578125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -26.402315139770508, "rewards/margins": 14.941704750061035, "rewards/rejected": -41.34402084350586, "step": 1182 }, { "epoch": 1.3144444444444445, "grad_norm": 0.15559881925582886, "learning_rate": 3.208203223935983e-05, "logits/chosen": -0.6239709258079529, "logits/rejected": -0.6046693325042725, "logps/chosen": -283.15191650390625, "logps/rejected": -351.819091796875, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": -19.65460968017578, "rewards/margins": 6.403743743896484, "rewards/rejected": -26.058353424072266, "step": 1183 }, { "epoch": 1.3155555555555556, "grad_norm": 150.47679138183594, "learning_rate": 3.2052661406755974e-05, "logits/chosen": -0.7889986038208008, "logits/rejected": -1.0860356092453003, "logps/chosen": -523.4698486328125, "logps/rejected": -309.21160888671875, "loss": 24.0391, "rewards/accuracies": 0.5, "rewards/chosen": -42.51286697387695, "rewards/margins": -19.057065963745117, "rewards/rejected": -23.455801010131836, "step": 1184 }, { "epoch": 1.3166666666666667, "grad_norm": 0.0014242251636460423, "learning_rate": 3.202327999433924e-05, "logits/chosen": -0.6616151332855225, "logits/rejected": -0.6589460372924805, "logps/chosen": -288.2294006347656, "logps/rejected": -510.46966552734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -21.846139907836914, "rewards/margins": 15.816268920898438, "rewards/rejected": -37.66240692138672, "step": 1185 }, { "epoch": 1.3177777777777777, "grad_norm": 0.007687154691666365, "learning_rate": 3.199388804618516e-05, "logits/chosen": -0.5819280743598938, "logits/rejected": -0.5830832719802856, "logps/chosen": -271.563232421875, "logps/rejected": -610.03955078125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -19.15275001525879, "rewards/margins": 23.936803817749023, "rewards/rejected": -43.08955383300781, "step": 1186 }, { "epoch": 1.318888888888889, "grad_norm": 0.5803526043891907, "learning_rate": 3.1964485606385094e-05, "logits/chosen": -0.7858847379684448, "logits/rejected": -0.7814669609069824, "logps/chosen": -405.3174743652344, "logps/rejected": -467.2099609375, "loss": 0.0083, "rewards/accuracies": 1.0, "rewards/chosen": -33.74994659423828, "rewards/margins": 5.694296836853027, "rewards/rejected": -39.444244384765625, "step": 1187 }, { "epoch": 1.32, "grad_norm": 0.14890140295028687, "learning_rate": 3.1935072719046115e-05, "logits/chosen": -0.9494962692260742, "logits/rejected": -0.9534491896629333, "logps/chosen": -303.2198486328125, "logps/rejected": -387.44842529296875, "loss": 0.0052, "rewards/accuracies": 1.0, "rewards/chosen": -22.763778686523438, "rewards/margins": 8.436649322509766, "rewards/rejected": -31.200428009033203, "step": 1188 }, { "epoch": 1.3211111111111111, "grad_norm": 0.034704651683568954, "learning_rate": 3.1905649428290984e-05, "logits/chosen": -0.5283823609352112, "logits/rejected": -0.535535991191864, "logps/chosen": -293.59912109375, "logps/rejected": -400.1159362792969, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -21.90583038330078, "rewards/margins": 8.539210319519043, "rewards/rejected": -30.445039749145508, "step": 1189 }, { "epoch": 1.3222222222222222, "grad_norm": 0.0047100502997636795, "learning_rate": 3.1876215778258045e-05, "logits/chosen": -0.6378222703933716, "logits/rejected": -0.6388559341430664, "logps/chosen": -497.8297119140625, "logps/rejected": -708.7213134765625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -36.34308624267578, "rewards/margins": 15.859331130981445, "rewards/rejected": -52.202415466308594, "step": 1190 }, { "epoch": 1.3233333333333333, "grad_norm": 0.06131556257605553, "learning_rate": 3.184677181310121e-05, "logits/chosen": -0.6307685971260071, "logits/rejected": -0.634036123752594, "logps/chosen": -396.042236328125, "logps/rejected": -552.313232421875, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -29.777374267578125, "rewards/margins": 10.202241897583008, "rewards/rejected": -39.9796142578125, "step": 1191 }, { "epoch": 1.3244444444444445, "grad_norm": 1.100051999092102, "learning_rate": 3.1817317576989856e-05, "logits/chosen": -0.836810827255249, "logits/rejected": -0.8143692016601562, "logps/chosen": -371.1875, "logps/rejected": -465.2613525390625, "loss": 0.029, "rewards/accuracies": 1.0, "rewards/chosen": -28.476520538330078, "rewards/margins": 5.831082344055176, "rewards/rejected": -34.30760192871094, "step": 1192 }, { "epoch": 1.3255555555555556, "grad_norm": 0.04764511436223984, "learning_rate": 3.1787853114108754e-05, "logits/chosen": -0.49622324109077454, "logits/rejected": -0.4987778961658478, "logps/chosen": -238.57838439941406, "logps/rejected": -328.609619140625, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -16.497766494750977, "rewards/margins": 7.062910556793213, "rewards/rejected": -23.56067657470703, "step": 1193 }, { "epoch": 1.3266666666666667, "grad_norm": 0.08690983057022095, "learning_rate": 3.175837846865805e-05, "logits/chosen": -0.7114551067352295, "logits/rejected": -0.7199759483337402, "logps/chosen": -323.3251953125, "logps/rejected": -428.8138427734375, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -21.870311737060547, "rewards/margins": 8.787042617797852, "rewards/rejected": -30.65735626220703, "step": 1194 }, { "epoch": 1.3277777777777777, "grad_norm": 0.0389179103076458, "learning_rate": 3.172889368485311e-05, "logits/chosen": -0.6253566741943359, "logits/rejected": -0.6514289379119873, "logps/chosen": -233.22390747070312, "logps/rejected": -362.3868408203125, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -15.884159088134766, "rewards/margins": 10.414838790893555, "rewards/rejected": -26.29899787902832, "step": 1195 }, { "epoch": 1.3288888888888888, "grad_norm": 0.000307722482830286, "learning_rate": 3.169939880692456e-05, "logits/chosen": -0.6148372292518616, "logits/rejected": -0.6120091080665588, "logps/chosen": -502.9095458984375, "logps/rejected": -698.3698120117188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -37.427757263183594, "rewards/margins": 17.226299285888672, "rewards/rejected": -54.654056549072266, "step": 1196 }, { "epoch": 1.33, "grad_norm": 0.004292602650821209, "learning_rate": 3.1669893879118156e-05, "logits/chosen": -0.8362588882446289, "logits/rejected": -0.8618030548095703, "logps/chosen": -464.4699401855469, "logps/rejected": -702.715576171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -35.312782287597656, "rewards/margins": 17.26502799987793, "rewards/rejected": -52.57780838012695, "step": 1197 }, { "epoch": 1.3311111111111111, "grad_norm": 4.77919566037599e-05, "learning_rate": 3.1640378945694704e-05, "logits/chosen": -0.7537152767181396, "logits/rejected": -0.80055832862854, "logps/chosen": -187.9356689453125, "logps/rejected": -374.7626953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.931252479553223, "rewards/margins": 14.74686050415039, "rewards/rejected": -26.678112030029297, "step": 1198 }, { "epoch": 1.3322222222222222, "grad_norm": 0.19577232003211975, "learning_rate": 3.161085405093006e-05, "logits/chosen": -0.8221469521522522, "logits/rejected": -0.8256490230560303, "logps/chosen": -261.5404052734375, "logps/rejected": -350.6756896972656, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -20.44083023071289, "rewards/margins": 6.617975234985352, "rewards/rejected": -27.058805465698242, "step": 1199 }, { "epoch": 1.3333333333333333, "grad_norm": 0.03477991372346878, "learning_rate": 3.158131923911498e-05, "logits/chosen": -0.8523095846176147, "logits/rejected": -0.9373700618743896, "logps/chosen": -153.3251953125, "logps/rejected": -561.0701904296875, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -10.28706169128418, "rewards/margins": 28.996479034423828, "rewards/rejected": -39.283538818359375, "step": 1200 }, { "epoch": 1.3344444444444443, "grad_norm": 0.2597058415412903, "learning_rate": 3.1551774554555134e-05, "logits/chosen": -0.5965104699134827, "logits/rejected": -0.603573203086853, "logps/chosen": -372.7412414550781, "logps/rejected": -480.0003967285156, "loss": 0.0055, "rewards/accuracies": 1.0, "rewards/chosen": -26.163734436035156, "rewards/margins": 9.775516510009766, "rewards/rejected": -35.93925094604492, "step": 1201 }, { "epoch": 1.3355555555555556, "grad_norm": 0.5005232691764832, "learning_rate": 3.152222004157099e-05, "logits/chosen": -0.6972241401672363, "logits/rejected": -0.6912410855293274, "logps/chosen": -376.68133544921875, "logps/rejected": -429.22021484375, "loss": 0.0102, "rewards/accuracies": 1.0, "rewards/chosen": -28.88412094116211, "rewards/margins": 4.7766876220703125, "rewards/rejected": -33.66080856323242, "step": 1202 }, { "epoch": 1.3366666666666667, "grad_norm": 17.09547996520996, "learning_rate": 3.149265574449775e-05, "logits/chosen": -0.7674028873443604, "logits/rejected": -0.7516874074935913, "logps/chosen": -188.30160522460938, "logps/rejected": -220.56005859375, "loss": 0.3435, "rewards/accuracies": 1.0, "rewards/chosen": -12.432596206665039, "rewards/margins": 3.606480598449707, "rewards/rejected": -16.039077758789062, "step": 1203 }, { "epoch": 1.3377777777777777, "grad_norm": 0.11002788692712784, "learning_rate": 3.1463081707685296e-05, "logits/chosen": -0.6547306776046753, "logits/rejected": -0.6622074246406555, "logps/chosen": -247.98968505859375, "logps/rejected": -317.44219970703125, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": -17.89121437072754, "rewards/margins": 6.095422267913818, "rewards/rejected": -23.986637115478516, "step": 1204 }, { "epoch": 1.338888888888889, "grad_norm": 5.019257400817878e-07, "learning_rate": 3.1433497975498126e-05, "logits/chosen": -0.7220999598503113, "logits/rejected": -0.7065885066986084, "logps/chosen": -435.67120361328125, "logps/rejected": -689.3165283203125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -34.87784957885742, "rewards/margins": 19.267601013183594, "rewards/rejected": -54.145450592041016, "step": 1205 }, { "epoch": 1.34, "grad_norm": 32.837066650390625, "learning_rate": 3.140390459231528e-05, "logits/chosen": -0.9368401765823364, "logits/rejected": -0.9074068069458008, "logps/chosen": -325.0089111328125, "logps/rejected": -351.529052734375, "loss": 1.2933, "rewards/accuracies": 0.5, "rewards/chosen": -26.35123062133789, "rewards/margins": 2.040839195251465, "rewards/rejected": -28.392070770263672, "step": 1206 }, { "epoch": 1.3411111111111111, "grad_norm": 0.039055678993463516, "learning_rate": 3.13743016025303e-05, "logits/chosen": -0.6203926205635071, "logits/rejected": -0.6176601648330688, "logps/chosen": -272.5814208984375, "logps/rejected": -352.6728515625, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -21.265445709228516, "rewards/margins": 7.336824417114258, "rewards/rejected": -28.602272033691406, "step": 1207 }, { "epoch": 1.3422222222222222, "grad_norm": 0.09020353853702545, "learning_rate": 3.134468905055108e-05, "logits/chosen": -0.6307733058929443, "logits/rejected": -0.6193915605545044, "logps/chosen": -295.4552307128906, "logps/rejected": -381.412841796875, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -23.415409088134766, "rewards/margins": 6.50084114074707, "rewards/rejected": -29.916248321533203, "step": 1208 }, { "epoch": 1.3433333333333333, "grad_norm": 15.746272087097168, "learning_rate": 3.131506698079992e-05, "logits/chosen": -0.8223830461502075, "logits/rejected": -0.8163706064224243, "logps/chosen": -271.306884765625, "logps/rejected": -348.4595947265625, "loss": 0.331, "rewards/accuracies": 1.0, "rewards/chosen": -19.25278091430664, "rewards/margins": 7.698250770568848, "rewards/rejected": -26.951032638549805, "step": 1209 }, { "epoch": 1.3444444444444446, "grad_norm": 0.02644646354019642, "learning_rate": 3.128543543771336e-05, "logits/chosen": -0.776463508605957, "logits/rejected": -0.7828724384307861, "logps/chosen": -334.90045166015625, "logps/rejected": -448.0252685546875, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -22.949451446533203, "rewards/margins": 9.824647903442383, "rewards/rejected": -32.77409744262695, "step": 1210 }, { "epoch": 1.3455555555555556, "grad_norm": 2.1025205881874953e-11, "learning_rate": 3.1255794465742174e-05, "logits/chosen": -0.6290451884269714, "logits/rejected": -0.6362565755844116, "logps/chosen": -352.24420166015625, "logps/rejected": -751.944580078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -23.258329391479492, "rewards/margins": 33.79063415527344, "rewards/rejected": -57.04896545410156, "step": 1211 }, { "epoch": 1.3466666666666667, "grad_norm": 0.0002495874650776386, "learning_rate": 3.122614410935125e-05, "logits/chosen": -0.5840096473693848, "logits/rejected": -0.5829692482948303, "logps/chosen": -371.5384521484375, "logps/rejected": -566.6407470703125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -31.304059982299805, "rewards/margins": 14.640985488891602, "rewards/rejected": -45.945045471191406, "step": 1212 }, { "epoch": 1.3477777777777777, "grad_norm": 0.05142858996987343, "learning_rate": 3.119648441301959e-05, "logits/chosen": -0.6583006978034973, "logits/rejected": -0.6389794945716858, "logps/chosen": -296.018798828125, "logps/rejected": -375.24853515625, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -23.080446243286133, "rewards/margins": 6.798450469970703, "rewards/rejected": -29.878896713256836, "step": 1213 }, { "epoch": 1.3488888888888888, "grad_norm": 0.7799180150032043, "learning_rate": 3.1166815421240193e-05, "logits/chosen": -0.5217552185058594, "logits/rejected": -0.5204120874404907, "logps/chosen": -289.94921875, "logps/rejected": -365.81072998046875, "loss": 0.0165, "rewards/accuracies": 1.0, "rewards/chosen": -21.952316284179688, "rewards/margins": 6.519092559814453, "rewards/rejected": -28.47140884399414, "step": 1214 }, { "epoch": 1.35, "grad_norm": 0.00025383176398463547, "learning_rate": 3.1137137178519985e-05, "logits/chosen": -0.6749807596206665, "logits/rejected": -0.6866486668586731, "logps/chosen": -264.93121337890625, "logps/rejected": -430.41668701171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -16.522289276123047, "rewards/margins": 12.30783462524414, "rewards/rejected": -28.830123901367188, "step": 1215 }, { "epoch": 1.3511111111111112, "grad_norm": 8.137293815612793, "learning_rate": 3.110744972937979e-05, "logits/chosen": -0.5921620726585388, "logits/rejected": -0.5294556021690369, "logps/chosen": -339.1707763671875, "logps/rejected": -426.2987365722656, "loss": 0.1111, "rewards/accuracies": 1.0, "rewards/chosen": -26.253368377685547, "rewards/margins": 7.790223121643066, "rewards/rejected": -34.0435905456543, "step": 1216 }, { "epoch": 1.3522222222222222, "grad_norm": 1.9849576915476064e-07, "learning_rate": 3.107775311835423e-05, "logits/chosen": -0.4276106059551239, "logits/rejected": -0.40853211283683777, "logps/chosen": -393.5445556640625, "logps/rejected": -982.379638671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -32.27968978881836, "rewards/margins": 42.27250671386719, "rewards/rejected": -74.55219268798828, "step": 1217 }, { "epoch": 1.3533333333333333, "grad_norm": 0.10903879255056381, "learning_rate": 3.104804738999169e-05, "logits/chosen": -0.6848978996276855, "logits/rejected": -0.7288869619369507, "logps/chosen": -164.0013427734375, "logps/rejected": -211.31381225585938, "loss": 0.347, "rewards/accuracies": 1.0, "rewards/chosen": -11.83232593536377, "rewards/margins": 3.562915802001953, "rewards/rejected": -15.395241737365723, "step": 1218 }, { "epoch": 1.3544444444444443, "grad_norm": 0.01660083793103695, "learning_rate": 3.1018332588854223e-05, "logits/chosen": -0.7348146438598633, "logits/rejected": -0.7296610474586487, "logps/chosen": -321.3115234375, "logps/rejected": -427.4028015136719, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -22.2152099609375, "rewards/margins": 8.667560577392578, "rewards/rejected": -30.882770538330078, "step": 1219 }, { "epoch": 1.3555555555555556, "grad_norm": 1.6266343593597412, "learning_rate": 3.0988608759517475e-05, "logits/chosen": -0.8125827312469482, "logits/rejected": -0.8220561146736145, "logps/chosen": -177.39007568359375, "logps/rejected": -245.02244567871094, "loss": 0.0295, "rewards/accuracies": 1.0, "rewards/chosen": -11.7461576461792, "rewards/margins": 4.153000831604004, "rewards/rejected": -15.899158477783203, "step": 1220 }, { "epoch": 1.3566666666666667, "grad_norm": 1.2407222986221313, "learning_rate": 3.0958875946570646e-05, "logits/chosen": -0.7255386114120483, "logits/rejected": -0.6931182146072388, "logps/chosen": -570.144287109375, "logps/rejected": -653.1799926757812, "loss": 0.0099, "rewards/accuracies": 1.0, "rewards/chosen": -45.45130157470703, "rewards/margins": 7.032260894775391, "rewards/rejected": -52.48356628417969, "step": 1221 }, { "epoch": 1.3577777777777778, "grad_norm": 87.11197662353516, "learning_rate": 3.092913419461643e-05, "logits/chosen": -0.5881918668746948, "logits/rejected": -0.5863900184631348, "logps/chosen": -467.62420654296875, "logps/rejected": -421.6405029296875, "loss": 3.1444, "rewards/accuracies": 0.0, "rewards/chosen": -35.5469856262207, "rewards/margins": -2.797682762145996, "rewards/rejected": -32.749305725097656, "step": 1222 }, { "epoch": 1.3588888888888888, "grad_norm": 0.0006262369570322335, "learning_rate": 3.089938354827091e-05, "logits/chosen": -0.5284088850021362, "logits/rejected": -0.5105457901954651, "logps/chosen": -340.6404113769531, "logps/rejected": -569.5709228515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -21.836820602416992, "rewards/margins": 12.448539733886719, "rewards/rejected": -34.285362243652344, "step": 1223 }, { "epoch": 1.3599999999999999, "grad_norm": 0.10297965258359909, "learning_rate": 3.086962405216353e-05, "logits/chosen": -0.5446739196777344, "logits/rejected": -0.5353917479515076, "logps/chosen": -294.9407043457031, "logps/rejected": -393.10272216796875, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -22.22189712524414, "rewards/margins": 7.843880653381348, "rewards/rejected": -30.065776824951172, "step": 1224 }, { "epoch": 1.3611111111111112, "grad_norm": 0.5041775703430176, "learning_rate": 3.083985575093697e-05, "logits/chosen": -0.7327945232391357, "logits/rejected": -0.722826361656189, "logps/chosen": -202.14346313476562, "logps/rejected": -287.766357421875, "loss": 0.0087, "rewards/accuracies": 1.0, "rewards/chosen": -14.72932243347168, "rewards/margins": 7.191934585571289, "rewards/rejected": -21.92125701904297, "step": 1225 }, { "epoch": 1.3622222222222222, "grad_norm": 0.8838841319084167, "learning_rate": 3.081007868924716e-05, "logits/chosen": -0.5964117050170898, "logits/rejected": -0.5985506772994995, "logps/chosen": -305.35345458984375, "logps/rejected": -383.884521484375, "loss": 0.0087, "rewards/accuracies": 1.0, "rewards/chosen": -21.22966194152832, "rewards/margins": 5.039816856384277, "rewards/rejected": -26.26947784423828, "step": 1226 }, { "epoch": 1.3633333333333333, "grad_norm": 0.01073842030018568, "learning_rate": 3.078029291176316e-05, "logits/chosen": -0.671387791633606, "logits/rejected": -0.6764581203460693, "logps/chosen": -295.1224670410156, "logps/rejected": -410.6396179199219, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -20.45638656616211, "rewards/margins": 10.305456161499023, "rewards/rejected": -30.761842727661133, "step": 1227 }, { "epoch": 1.3644444444444446, "grad_norm": 14.279627799987793, "learning_rate": 3.075049846316711e-05, "logits/chosen": -0.4997161328792572, "logits/rejected": -0.5180837512016296, "logps/chosen": -332.01898193359375, "logps/rejected": -358.92095947265625, "loss": 0.1543, "rewards/accuracies": 1.0, "rewards/chosen": -23.865455627441406, "rewards/margins": 3.910367012023926, "rewards/rejected": -27.775821685791016, "step": 1228 }, { "epoch": 1.3655555555555556, "grad_norm": 6.464750185841694e-05, "learning_rate": 3.072069538815415e-05, "logits/chosen": -0.39291414618492126, "logits/rejected": -0.376559853553772, "logps/chosen": -166.39605712890625, "logps/rejected": -341.187744140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.804718017578125, "rewards/margins": 14.519235610961914, "rewards/rejected": -25.32395362854004, "step": 1229 }, { "epoch": 1.3666666666666667, "grad_norm": 9.070005262401537e-07, "learning_rate": 3.069088373143234e-05, "logits/chosen": -0.5634620785713196, "logits/rejected": -0.552689790725708, "logps/chosen": -571.9707641601562, "logps/rejected": -910.3707275390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -47.087772369384766, "rewards/margins": 26.35542869567871, "rewards/rejected": -73.44319915771484, "step": 1230 }, { "epoch": 1.3677777777777778, "grad_norm": 0.01171477884054184, "learning_rate": 3.0661063537722647e-05, "logits/chosen": -0.43182018399238586, "logits/rejected": -0.427354097366333, "logps/chosen": -335.837890625, "logps/rejected": -435.7060546875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -24.885356903076172, "rewards/margins": 10.79754638671875, "rewards/rejected": -35.68290328979492, "step": 1231 }, { "epoch": 1.3688888888888888, "grad_norm": 0.001384291215799749, "learning_rate": 3.0631234851758834e-05, "logits/chosen": -0.43308311700820923, "logits/rejected": -0.43777111172676086, "logps/chosen": -449.45819091796875, "logps/rejected": -794.1171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -33.002891540527344, "rewards/margins": 26.315929412841797, "rewards/rejected": -59.318824768066406, "step": 1232 }, { "epoch": 1.37, "grad_norm": 3.3140393497888e-05, "learning_rate": 3.06013977182874e-05, "logits/chosen": -0.38281700015068054, "logits/rejected": -0.3989453613758087, "logps/chosen": -245.56907653808594, "logps/rejected": -512.978759765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -15.437177658081055, "rewards/margins": 21.197786331176758, "rewards/rejected": -36.63496398925781, "step": 1233 }, { "epoch": 1.3711111111111112, "grad_norm": 0.04449201002717018, "learning_rate": 3.057155218206752e-05, "logits/chosen": -0.4773881733417511, "logits/rejected": -0.47419655323028564, "logps/chosen": -199.28103637695312, "logps/rejected": -311.410400390625, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -14.06181812286377, "rewards/margins": 7.787569999694824, "rewards/rejected": -21.849388122558594, "step": 1234 }, { "epoch": 1.3722222222222222, "grad_norm": 43.553897857666016, "learning_rate": 3.0541698287870965e-05, "logits/chosen": -0.42344769835472107, "logits/rejected": -0.41565632820129395, "logps/chosen": -358.8847351074219, "logps/rejected": -340.48480224609375, "loss": 1.3816, "rewards/accuracies": 0.0, "rewards/chosen": -28.87781524658203, "rewards/margins": -0.9675998687744141, "rewards/rejected": -27.910215377807617, "step": 1235 }, { "epoch": 1.3733333333333333, "grad_norm": 60.03107833862305, "learning_rate": 3.051183608048206e-05, "logits/chosen": -0.4973793029785156, "logits/rejected": -0.6333597898483276, "logps/chosen": -300.2869873046875, "logps/rejected": -396.58929443359375, "loss": 10.9404, "rewards/accuracies": 0.5, "rewards/chosen": -21.976577758789062, "rewards/margins": 5.230583190917969, "rewards/rejected": -27.20716094970703, "step": 1236 }, { "epoch": 1.3744444444444444, "grad_norm": 0.04270509257912636, "learning_rate": 3.048196560469758e-05, "logits/chosen": -0.20880669355392456, "logits/rejected": -0.2084420621395111, "logps/chosen": -274.2767333984375, "logps/rejected": -344.21051025390625, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -18.518085479736328, "rewards/margins": 7.573086738586426, "rewards/rejected": -26.09117317199707, "step": 1237 }, { "epoch": 1.3755555555555556, "grad_norm": 0.06338273733854294, "learning_rate": 3.0452086905326728e-05, "logits/chosen": -0.5201363563537598, "logits/rejected": -0.521070659160614, "logps/chosen": -167.18154907226562, "logps/rejected": -254.73245239257812, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -9.893752098083496, "rewards/margins": 7.18618106842041, "rewards/rejected": -17.079933166503906, "step": 1238 }, { "epoch": 1.3766666666666667, "grad_norm": 3.986403465270996, "learning_rate": 3.0422200027191023e-05, "logits/chosen": -0.43926066160202026, "logits/rejected": -0.444225013256073, "logps/chosen": -198.40762329101562, "logps/rejected": -355.7780456542969, "loss": 0.0595, "rewards/accuracies": 1.0, "rewards/chosen": -13.377809524536133, "rewards/margins": 12.139748573303223, "rewards/rejected": -25.517559051513672, "step": 1239 }, { "epoch": 1.3777777777777778, "grad_norm": 0.005586075596511364, "learning_rate": 3.0392305015124255e-05, "logits/chosen": -0.38281485438346863, "logits/rejected": -0.381062388420105, "logps/chosen": -249.8658447265625, "logps/rejected": -370.6976318359375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -17.08326530456543, "rewards/margins": 10.394646644592285, "rewards/rejected": -27.47791290283203, "step": 1240 }, { "epoch": 1.3788888888888888, "grad_norm": 0.007010848727077246, "learning_rate": 3.0362401913972428e-05, "logits/chosen": -0.46593141555786133, "logits/rejected": -0.4651058316230774, "logps/chosen": -295.2550048828125, "logps/rejected": -469.74212646484375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -22.546443939208984, "rewards/margins": 11.77447509765625, "rewards/rejected": -34.320919036865234, "step": 1241 }, { "epoch": 1.38, "grad_norm": 0.2623584270477295, "learning_rate": 3.0332490768593675e-05, "logits/chosen": -0.43191924691200256, "logits/rejected": -0.43018868565559387, "logps/chosen": -386.3530578613281, "logps/rejected": -534.1435546875, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -30.578065872192383, "rewards/margins": 8.67660140991211, "rewards/rejected": -39.254669189453125, "step": 1242 }, { "epoch": 1.3811111111111112, "grad_norm": 0.00340826204046607, "learning_rate": 3.0302571623858188e-05, "logits/chosen": -0.3483830690383911, "logits/rejected": -0.33365437388420105, "logps/chosen": -245.84934997558594, "logps/rejected": -442.719970703125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -16.276012420654297, "rewards/margins": 17.05269432067871, "rewards/rejected": -33.328704833984375, "step": 1243 }, { "epoch": 1.3822222222222222, "grad_norm": 70.89935302734375, "learning_rate": 3.0272644524648175e-05, "logits/chosen": -0.22999081015586853, "logits/rejected": -0.22780224680900574, "logps/chosen": -643.2110595703125, "logps/rejected": -682.6861572265625, "loss": 3.2234, "rewards/accuracies": 0.5, "rewards/chosen": -49.120880126953125, "rewards/margins": 4.329990386962891, "rewards/rejected": -53.450870513916016, "step": 1244 }, { "epoch": 1.3833333333333333, "grad_norm": 2.508791923522949, "learning_rate": 3.0242709515857758e-05, "logits/chosen": -0.15062710642814636, "logits/rejected": -0.15084967017173767, "logps/chosen": -397.0738220214844, "logps/rejected": -409.02581787109375, "loss": 0.0192, "rewards/accuracies": 1.0, "rewards/chosen": -22.98763656616211, "rewards/margins": 5.72774600982666, "rewards/rejected": -28.715383529663086, "step": 1245 }, { "epoch": 1.3844444444444444, "grad_norm": 0.9159281849861145, "learning_rate": 3.021276664239294e-05, "logits/chosen": -0.2905831038951874, "logits/rejected": -0.2769080400466919, "logps/chosen": -269.28009033203125, "logps/rejected": -346.874755859375, "loss": 0.0093, "rewards/accuracies": 1.0, "rewards/chosen": -17.936979293823242, "rewards/margins": 5.7642621994018555, "rewards/rejected": -23.70124053955078, "step": 1246 }, { "epoch": 1.3855555555555554, "grad_norm": 0.13535690307617188, "learning_rate": 3.0182815949171517e-05, "logits/chosen": -0.15759910643100739, "logits/rejected": -0.16408202052116394, "logps/chosen": -214.46182250976562, "logps/rejected": -328.77203369140625, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": -14.623268127441406, "rewards/margins": 8.354690551757812, "rewards/rejected": -22.97795867919922, "step": 1247 }, { "epoch": 1.3866666666666667, "grad_norm": 0.0035481799859553576, "learning_rate": 3.0152857481123008e-05, "logits/chosen": -0.18996688723564148, "logits/rejected": -0.16536028683185577, "logps/chosen": -292.5936279296875, "logps/rejected": -384.59417724609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -18.826154708862305, "rewards/margins": 10.416956901550293, "rewards/rejected": -29.24311065673828, "step": 1248 }, { "epoch": 1.3877777777777778, "grad_norm": 0.2338637113571167, "learning_rate": 3.0122891283188615e-05, "logits/chosen": -0.425133615732193, "logits/rejected": -0.42104679346084595, "logps/chosen": -149.0655059814453, "logps/rejected": -213.08596801757812, "loss": 0.0063, "rewards/accuracies": 1.0, "rewards/chosen": -8.921409606933594, "rewards/margins": 5.499818325042725, "rewards/rejected": -14.421228408813477, "step": 1249 }, { "epoch": 1.3888888888888888, "grad_norm": 0.053613223135471344, "learning_rate": 3.0092917400321108e-05, "logits/chosen": -0.2875784635543823, "logits/rejected": -0.2882526218891144, "logps/chosen": -290.58074951171875, "logps/rejected": -426.46453857421875, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -23.07957649230957, "rewards/margins": 10.424104690551758, "rewards/rejected": -33.50368118286133, "step": 1250 }, { "epoch": 1.3900000000000001, "grad_norm": 0.001383377006277442, "learning_rate": 3.0062935877484804e-05, "logits/chosen": -0.16260045766830444, "logits/rejected": -0.1996806114912033, "logps/chosen": -298.91033935546875, "logps/rejected": -433.3731689453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -20.303489685058594, "rewards/margins": 11.823866844177246, "rewards/rejected": -32.127357482910156, "step": 1251 }, { "epoch": 1.3911111111111112, "grad_norm": 0.2155996412038803, "learning_rate": 3.003294675965548e-05, "logits/chosen": -0.07981687784194946, "logits/rejected": -0.08179891109466553, "logps/chosen": -270.18426513671875, "logps/rejected": -349.09930419921875, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": -17.607213973999023, "rewards/margins": 5.8505682945251465, "rewards/rejected": -23.457782745361328, "step": 1252 }, { "epoch": 1.3922222222222222, "grad_norm": 0.004742086865007877, "learning_rate": 3.0002950091820293e-05, "logits/chosen": 0.04143702983856201, "logits/rejected": 0.0316939651966095, "logps/chosen": -292.28265380859375, "logps/rejected": -422.0010681152344, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -18.9835262298584, "rewards/margins": 10.0477294921875, "rewards/rejected": -29.031253814697266, "step": 1253 }, { "epoch": 1.3933333333333333, "grad_norm": 1.1842506530967967e-08, "learning_rate": 2.9972945918977746e-05, "logits/chosen": -0.1884227842092514, "logits/rejected": -0.17748624086380005, "logps/chosen": -461.4615173339844, "logps/rejected": -715.6384887695312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -32.77865219116211, "rewards/margins": 22.93533706665039, "rewards/rejected": -55.7139892578125, "step": 1254 }, { "epoch": 1.3944444444444444, "grad_norm": 1.5283256288967095e-05, "learning_rate": 2.9942934286137586e-05, "logits/chosen": -0.2694793939590454, "logits/rejected": -0.21243372559547424, "logps/chosen": -220.48696899414062, "logps/rejected": -467.54779052734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.40794563293457, "rewards/margins": 18.728696823120117, "rewards/rejected": -31.136642456054688, "step": 1255 }, { "epoch": 1.3955555555555557, "grad_norm": 3.9710419178009033, "learning_rate": 2.9912915238320754e-05, "logits/chosen": -0.2283167541027069, "logits/rejected": -0.23970340192317963, "logps/chosen": -276.88470458984375, "logps/rejected": -309.32666015625, "loss": 0.1021, "rewards/accuracies": 1.0, "rewards/chosen": -21.032991409301758, "rewards/margins": 2.232661247253418, "rewards/rejected": -23.26565170288086, "step": 1256 }, { "epoch": 1.3966666666666667, "grad_norm": 11.75169563293457, "learning_rate": 2.9882888820559317e-05, "logits/chosen": -0.19513747096061707, "logits/rejected": -0.1739692986011505, "logps/chosen": -241.32725524902344, "logps/rejected": -437.37042236328125, "loss": 0.2565, "rewards/accuracies": 1.0, "rewards/chosen": -16.091609954833984, "rewards/margins": 11.612787246704102, "rewards/rejected": -27.70439910888672, "step": 1257 }, { "epoch": 1.3977777777777778, "grad_norm": 0.46116599440574646, "learning_rate": 2.9852855077896403e-05, "logits/chosen": -0.06158025562763214, "logits/rejected": -0.05102270096540451, "logps/chosen": -428.8680114746094, "logps/rejected": -585.6865844726562, "loss": 0.0081, "rewards/accuracies": 1.0, "rewards/chosen": -22.899768829345703, "rewards/margins": 11.54875373840332, "rewards/rejected": -34.44852066040039, "step": 1258 }, { "epoch": 1.3988888888888888, "grad_norm": 0.6671234965324402, "learning_rate": 2.9822814055386106e-05, "logits/chosen": -0.06558649241924286, "logits/rejected": -0.06908948719501495, "logps/chosen": -351.16943359375, "logps/rejected": -428.3381042480469, "loss": 0.0095, "rewards/accuracies": 1.0, "rewards/chosen": -23.806861877441406, "rewards/margins": 4.8385138511657715, "rewards/rejected": -28.645374298095703, "step": 1259 }, { "epoch": 1.4, "grad_norm": 0.00664136977866292, "learning_rate": 2.9792765798093465e-05, "logits/chosen": -0.11809110641479492, "logits/rejected": -0.12359201163053513, "logps/chosen": -222.0159149169922, "logps/rejected": -331.16473388671875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -13.690424919128418, "rewards/margins": 9.268362998962402, "rewards/rejected": -22.95878791809082, "step": 1260 }, { "epoch": 1.4011111111111112, "grad_norm": 0.00017073567141778767, "learning_rate": 2.976271035109436e-05, "logits/chosen": -0.09025794267654419, "logits/rejected": -0.11446227133274078, "logps/chosen": -253.79627990722656, "logps/rejected": -428.6563720703125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -15.307321548461914, "rewards/margins": 14.465821266174316, "rewards/rejected": -29.773143768310547, "step": 1261 }, { "epoch": 1.4022222222222223, "grad_norm": 29.260421752929688, "learning_rate": 2.9732647759475462e-05, "logits/chosen": -0.10586432367563248, "logits/rejected": -0.07971487194299698, "logps/chosen": -311.30596923828125, "logps/rejected": -365.6272888183594, "loss": 1.5188, "rewards/accuracies": 0.5, "rewards/chosen": -20.5279483795166, "rewards/margins": 2.9295129776000977, "rewards/rejected": -23.457462310791016, "step": 1262 }, { "epoch": 1.4033333333333333, "grad_norm": 0.2840944826602936, "learning_rate": 2.970257806833416e-05, "logits/chosen": -0.06706520915031433, "logits/rejected": -0.06484271585941315, "logps/chosen": -208.76913452148438, "logps/rejected": -289.24810791015625, "loss": 0.0061, "rewards/accuracies": 1.0, "rewards/chosen": -12.82516098022461, "rewards/margins": 6.724276065826416, "rewards/rejected": -19.549436569213867, "step": 1263 }, { "epoch": 1.4044444444444444, "grad_norm": 3.1833348274230957, "learning_rate": 2.967250132277848e-05, "logits/chosen": -0.454507976770401, "logits/rejected": -0.4645177125930786, "logps/chosen": -173.83599853515625, "logps/rejected": -225.9974822998047, "loss": 0.1556, "rewards/accuracies": 1.0, "rewards/chosen": -11.628922462463379, "rewards/margins": 4.258608818054199, "rewards/rejected": -15.887531280517578, "step": 1264 }, { "epoch": 1.4055555555555554, "grad_norm": 0.04401806369423866, "learning_rate": 2.9642417567927045e-05, "logits/chosen": -0.6393982768058777, "logits/rejected": -0.6400202512741089, "logps/chosen": -102.40669250488281, "logps/rejected": -228.80783081054688, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -5.055665016174316, "rewards/margins": 10.02108383178711, "rewards/rejected": -15.076749801635742, "step": 1265 }, { "epoch": 1.4066666666666667, "grad_norm": 0.1338489055633545, "learning_rate": 2.9612326848908995e-05, "logits/chosen": -0.4130249619483948, "logits/rejected": -0.36889559030532837, "logps/chosen": -185.9600830078125, "logps/rejected": -396.4458312988281, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -11.93954849243164, "rewards/margins": 18.469375610351562, "rewards/rejected": -30.408924102783203, "step": 1266 }, { "epoch": 1.4077777777777778, "grad_norm": 8.68630944950155e-10, "learning_rate": 2.9582229210863898e-05, "logits/chosen": -0.1896037757396698, "logits/rejected": -0.20740637183189392, "logps/chosen": -349.751708984375, "logps/rejected": -692.1644897460938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -25.587913513183594, "rewards/margins": 27.283721923828125, "rewards/rejected": -52.87163543701172, "step": 1267 }, { "epoch": 1.4088888888888889, "grad_norm": 0.0005530136986635625, "learning_rate": 2.955212469894173e-05, "logits/chosen": -0.07271046936511993, "logits/rejected": -0.06970680505037308, "logps/chosen": -311.2545166015625, "logps/rejected": -506.73095703125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -21.35324478149414, "rewards/margins": 12.09460735321045, "rewards/rejected": -33.447853088378906, "step": 1268 }, { "epoch": 1.41, "grad_norm": 0.008852140977978706, "learning_rate": 2.952201335830275e-05, "logits/chosen": -0.07413149625062943, "logits/rejected": -0.06986261904239655, "logps/chosen": -274.23944091796875, "logps/rejected": -395.20916748046875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -16.320777893066406, "rewards/margins": 9.750689506530762, "rewards/rejected": -26.071468353271484, "step": 1269 }, { "epoch": 1.411111111111111, "grad_norm": 5.143998714629561e-05, "learning_rate": 2.949189523411747e-05, "logits/chosen": -0.20834362506866455, "logits/rejected": -0.19754785299301147, "logps/chosen": -226.85414123535156, "logps/rejected": -457.6416320800781, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -15.06186294555664, "rewards/margins": 14.844804763793945, "rewards/rejected": -29.906667709350586, "step": 1270 }, { "epoch": 1.4122222222222223, "grad_norm": 0.08817862719297409, "learning_rate": 2.94617703715666e-05, "logits/chosen": -0.5823191404342651, "logits/rejected": -0.3694084584712982, "logps/chosen": -213.97584533691406, "logps/rejected": -396.02734375, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -16.08245849609375, "rewards/margins": 8.628395080566406, "rewards/rejected": -24.710853576660156, "step": 1271 }, { "epoch": 1.4133333333333333, "grad_norm": 0.04972168430685997, "learning_rate": 2.9431638815840917e-05, "logits/chosen": -0.26718854904174805, "logits/rejected": -0.2509411573410034, "logps/chosen": -288.9676513671875, "logps/rejected": -341.3250732421875, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -19.541393280029297, "rewards/margins": 6.570839881896973, "rewards/rejected": -26.112232208251953, "step": 1272 }, { "epoch": 1.4144444444444444, "grad_norm": 20.38580322265625, "learning_rate": 2.940150061214128e-05, "logits/chosen": -0.10622769594192505, "logits/rejected": -0.09349526464939117, "logps/chosen": -278.2377014160156, "logps/rejected": -439.35992431640625, "loss": 0.3229, "rewards/accuracies": 1.0, "rewards/chosen": -18.97027587890625, "rewards/margins": 12.564146995544434, "rewards/rejected": -31.534423828125, "step": 1273 }, { "epoch": 1.4155555555555557, "grad_norm": 1.6090863943099976, "learning_rate": 2.9371355805678486e-05, "logits/chosen": -0.14937780797481537, "logits/rejected": -0.17280544340610504, "logps/chosen": -305.6307373046875, "logps/rejected": -389.1463928222656, "loss": 0.0292, "rewards/accuracies": 1.0, "rewards/chosen": -20.630821228027344, "rewards/margins": 7.715029239654541, "rewards/rejected": -28.345849990844727, "step": 1274 }, { "epoch": 1.4166666666666667, "grad_norm": 0.15460608899593353, "learning_rate": 2.9341204441673266e-05, "logits/chosen": -0.4150492250919342, "logits/rejected": -0.41047561168670654, "logps/chosen": -164.23780822753906, "logps/rejected": -245.44862365722656, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": -10.880536079406738, "rewards/margins": 5.997982501983643, "rewards/rejected": -16.87851905822754, "step": 1275 }, { "epoch": 1.4177777777777778, "grad_norm": 0.01768086664378643, "learning_rate": 2.931104656535616e-05, "logits/chosen": -0.020137207582592964, "logits/rejected": -0.011720485053956509, "logps/chosen": -353.8594665527344, "logps/rejected": -481.50738525390625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -24.93708038330078, "rewards/margins": 10.804525375366211, "rewards/rejected": -35.741607666015625, "step": 1276 }, { "epoch": 1.4188888888888889, "grad_norm": 0.3429347574710846, "learning_rate": 2.9280882221967508e-05, "logits/chosen": -0.30568671226501465, "logits/rejected": -0.28291189670562744, "logps/chosen": -336.1288146972656, "logps/rejected": -409.40625, "loss": 0.009, "rewards/accuracies": 1.0, "rewards/chosen": -23.54468536376953, "rewards/margins": 7.265683174133301, "rewards/rejected": -30.810367584228516, "step": 1277 }, { "epoch": 1.42, "grad_norm": 0.1710684597492218, "learning_rate": 2.925071145675733e-05, "logits/chosen": -0.21587976813316345, "logits/rejected": -0.19978153705596924, "logps/chosen": -207.93386840820312, "logps/rejected": -292.958984375, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": -12.946446418762207, "rewards/margins": 7.916327476501465, "rewards/rejected": -20.862773895263672, "step": 1278 }, { "epoch": 1.4211111111111112, "grad_norm": 1.0250424146652222, "learning_rate": 2.922053431498528e-05, "logits/chosen": -0.2367759346961975, "logits/rejected": -0.23822636902332306, "logps/chosen": -341.43682861328125, "logps/rejected": -451.5006408691406, "loss": 0.0071, "rewards/accuracies": 1.0, "rewards/chosen": -24.070880889892578, "rewards/margins": 9.76474666595459, "rewards/rejected": -33.835628509521484, "step": 1279 }, { "epoch": 1.4222222222222223, "grad_norm": 0.760398268699646, "learning_rate": 2.919035084192059e-05, "logits/chosen": -0.26269906759262085, "logits/rejected": -0.26232266426086426, "logps/chosen": -239.67091369628906, "logps/rejected": -387.25762939453125, "loss": 0.0138, "rewards/accuracies": 1.0, "rewards/chosen": -14.84264087677002, "rewards/margins": 11.061187744140625, "rewards/rejected": -25.903827667236328, "step": 1280 }, { "epoch": 1.4233333333333333, "grad_norm": 0.25485190749168396, "learning_rate": 2.9160161082841976e-05, "logits/chosen": -0.1546352207660675, "logits/rejected": -0.14633075892925262, "logps/chosen": -247.08779907226562, "logps/rejected": -357.736328125, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": -15.795239448547363, "rewards/margins": 6.2680158615112305, "rewards/rejected": -22.063255310058594, "step": 1281 }, { "epoch": 1.4244444444444444, "grad_norm": 0.00593952089548111, "learning_rate": 2.9129965083037592e-05, "logits/chosen": -0.13417798280715942, "logits/rejected": -0.11179392039775848, "logps/chosen": -179.74392700195312, "logps/rejected": -349.13916015625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -11.295492172241211, "rewards/margins": 11.04798698425293, "rewards/rejected": -22.34347915649414, "step": 1282 }, { "epoch": 1.4255555555555555, "grad_norm": 0.003047794569283724, "learning_rate": 2.909976288780496e-05, "logits/chosen": -0.35472774505615234, "logits/rejected": -0.39967185258865356, "logps/chosen": -245.40628051757812, "logps/rejected": -549.677490234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -16.36223602294922, "rewards/margins": 18.994827270507812, "rewards/rejected": -35.35706329345703, "step": 1283 }, { "epoch": 1.4266666666666667, "grad_norm": 0.027690986171364784, "learning_rate": 2.906955454245087e-05, "logits/chosen": -0.3191990852355957, "logits/rejected": -0.3285864293575287, "logps/chosen": -306.723876953125, "logps/rejected": -449.12908935546875, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -23.79242706298828, "rewards/margins": 10.530714988708496, "rewards/rejected": -34.323143005371094, "step": 1284 }, { "epoch": 1.4277777777777778, "grad_norm": 0.00047845838707871735, "learning_rate": 2.9039340092291373e-05, "logits/chosen": -0.5365324020385742, "logits/rejected": -0.5470410585403442, "logps/chosen": -231.9368896484375, "logps/rejected": -402.05316162109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -16.514949798583984, "rewards/margins": 16.22812271118164, "rewards/rejected": -32.743072509765625, "step": 1285 }, { "epoch": 1.4288888888888889, "grad_norm": 0.10642511397600174, "learning_rate": 2.9009119582651652e-05, "logits/chosen": -0.4925473928451538, "logits/rejected": -0.4815739095211029, "logps/chosen": -119.28459930419922, "logps/rejected": -222.59228515625, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": -8.716172218322754, "rewards/margins": 8.14501953125, "rewards/rejected": -16.861190795898438, "step": 1286 }, { "epoch": 1.43, "grad_norm": 0.0029353657737374306, "learning_rate": 2.8978893058865987e-05, "logits/chosen": -0.3447117805480957, "logits/rejected": -0.35428622364997864, "logps/chosen": -338.9206237792969, "logps/rejected": -474.02294921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -27.05826187133789, "rewards/margins": 10.4076566696167, "rewards/rejected": -37.465919494628906, "step": 1287 }, { "epoch": 1.431111111111111, "grad_norm": 1.8696466213441454e-05, "learning_rate": 2.8948660566277687e-05, "logits/chosen": -0.4639735221862793, "logits/rejected": -0.43910154700279236, "logps/chosen": -359.053466796875, "logps/rejected": -714.904541015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -27.680652618408203, "rewards/margins": 18.127626419067383, "rewards/rejected": -45.80828094482422, "step": 1288 }, { "epoch": 1.4322222222222223, "grad_norm": 0.09772029519081116, "learning_rate": 2.8918422150239004e-05, "logits/chosen": -0.32260605692863464, "logits/rejected": -0.3175836205482483, "logps/chosen": -321.83721923828125, "logps/rejected": -516.5299072265625, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -24.699390411376953, "rewards/margins": 15.55848217010498, "rewards/rejected": -40.25787353515625, "step": 1289 }, { "epoch": 1.4333333333333333, "grad_norm": 0.32181864976882935, "learning_rate": 2.8888177856111083e-05, "logits/chosen": -0.6646515130996704, "logits/rejected": -0.669223427772522, "logps/chosen": -290.13555908203125, "logps/rejected": -411.3962707519531, "loss": 0.0036, "rewards/accuracies": 1.0, "rewards/chosen": -23.02318572998047, "rewards/margins": 8.598906517028809, "rewards/rejected": -31.622093200683594, "step": 1290 }, { "epoch": 1.4344444444444444, "grad_norm": 0.12217072397470474, "learning_rate": 2.8857927729263883e-05, "logits/chosen": -0.4231378734111786, "logits/rejected": -0.39572209119796753, "logps/chosen": -173.5696258544922, "logps/rejected": -289.0600280761719, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -12.32229232788086, "rewards/margins": 10.522431373596191, "rewards/rejected": -22.844722747802734, "step": 1291 }, { "epoch": 1.4355555555555555, "grad_norm": 0.06543240696191788, "learning_rate": 2.8827671815076118e-05, "logits/chosen": -0.5456811189651489, "logits/rejected": -0.5349911451339722, "logps/chosen": -438.27996826171875, "logps/rejected": -535.3893432617188, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -34.00331497192383, "rewards/margins": 8.161169052124023, "rewards/rejected": -42.16448211669922, "step": 1292 }, { "epoch": 1.4366666666666665, "grad_norm": 0.9194092750549316, "learning_rate": 2.879741015893519e-05, "logits/chosen": -0.5374413132667542, "logits/rejected": -0.5410245656967163, "logps/chosen": -216.27603149414062, "logps/rejected": -273.704345703125, "loss": 0.0103, "rewards/accuracies": 1.0, "rewards/chosen": -15.041540145874023, "rewards/margins": 5.851393222808838, "rewards/rejected": -20.892932891845703, "step": 1293 }, { "epoch": 1.4377777777777778, "grad_norm": 0.24699725210666656, "learning_rate": 2.876714280623708e-05, "logits/chosen": -0.5231863260269165, "logits/rejected": -0.5345578789710999, "logps/chosen": -450.14276123046875, "logps/rejected": -528.6309814453125, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -31.333412170410156, "rewards/margins": 7.806856155395508, "rewards/rejected": -39.1402702331543, "step": 1294 }, { "epoch": 1.4388888888888889, "grad_norm": 0.2656022906303406, "learning_rate": 2.8736869802386364e-05, "logits/chosen": -0.4806564450263977, "logits/rejected": -0.48630470037460327, "logps/chosen": -201.23269653320312, "logps/rejected": -340.38726806640625, "loss": 0.0063, "rewards/accuracies": 1.0, "rewards/chosen": -13.519603729248047, "rewards/margins": 10.640310287475586, "rewards/rejected": -24.159914016723633, "step": 1295 }, { "epoch": 1.44, "grad_norm": 3.820206075033639e-06, "learning_rate": 2.870659119279605e-05, "logits/chosen": -0.6033909320831299, "logits/rejected": -0.6602959036827087, "logps/chosen": -312.2649841308594, "logps/rejected": -592.515869140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -20.97861099243164, "rewards/margins": 21.27938461303711, "rewards/rejected": -42.25799560546875, "step": 1296 }, { "epoch": 1.4411111111111112, "grad_norm": 0.032270874828100204, "learning_rate": 2.8676307022887594e-05, "logits/chosen": -0.3734625577926636, "logits/rejected": -0.36543768644332886, "logps/chosen": -559.04443359375, "logps/rejected": -624.8543701171875, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -44.76750564575195, "rewards/margins": 8.177536010742188, "rewards/rejected": -52.94504165649414, "step": 1297 }, { "epoch": 1.4422222222222223, "grad_norm": 4.8038004933914635e-06, "learning_rate": 2.8646017338090768e-05, "logits/chosen": -0.6580870151519775, "logits/rejected": -0.6666950583457947, "logps/chosen": -302.829833984375, "logps/rejected": -571.1814575195312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -22.754600524902344, "rewards/margins": 22.195659637451172, "rewards/rejected": -44.950260162353516, "step": 1298 }, { "epoch": 1.4433333333333334, "grad_norm": 1.0173382759094238, "learning_rate": 2.8615722183843624e-05, "logits/chosen": -0.7572202682495117, "logits/rejected": -0.7620872259140015, "logps/chosen": -229.6021728515625, "logps/rejected": -416.2643127441406, "loss": 0.0202, "rewards/accuracies": 1.0, "rewards/chosen": -16.58181381225586, "rewards/margins": 14.882123947143555, "rewards/rejected": -31.463937759399414, "step": 1299 }, { "epoch": 1.4444444444444444, "grad_norm": 0.00035611187922768295, "learning_rate": 2.858542160559241e-05, "logits/chosen": -0.4509117305278778, "logits/rejected": -0.4488876163959503, "logps/chosen": -477.61358642578125, "logps/rejected": -615.2274169921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -36.60245895385742, "rewards/margins": 13.510135650634766, "rewards/rejected": -50.11259460449219, "step": 1300 }, { "epoch": 1.4455555555555555, "grad_norm": 13.092733383178711, "learning_rate": 2.8555115648791542e-05, "logits/chosen": -0.7183066606521606, "logits/rejected": -0.7185086011886597, "logps/chosen": -285.718994140625, "logps/rejected": -411.61944580078125, "loss": 0.4465, "rewards/accuracies": 0.5, "rewards/chosen": -21.273151397705078, "rewards/margins": 11.760382652282715, "rewards/rejected": -33.033531188964844, "step": 1301 }, { "epoch": 1.4466666666666668, "grad_norm": 0.09650453180074692, "learning_rate": 2.8524804358903458e-05, "logits/chosen": -0.2681974172592163, "logits/rejected": -0.2773345112800598, "logps/chosen": -339.82135009765625, "logps/rejected": -671.9456787109375, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -25.63922119140625, "rewards/margins": 26.363340377807617, "rewards/rejected": -52.0025634765625, "step": 1302 }, { "epoch": 1.4477777777777778, "grad_norm": 35.52307891845703, "learning_rate": 2.849448778139864e-05, "logits/chosen": -0.42484110593795776, "logits/rejected": -0.4115025997161865, "logps/chosen": -544.4007568359375, "logps/rejected": -623.709228515625, "loss": 1.3683, "rewards/accuracies": 0.5, "rewards/chosen": -42.21826934814453, "rewards/margins": 8.961381912231445, "rewards/rejected": -51.17965316772461, "step": 1303 }, { "epoch": 1.448888888888889, "grad_norm": 0.04231740161776543, "learning_rate": 2.8464165961755468e-05, "logits/chosen": -0.5888711810112, "logits/rejected": -0.5904315710067749, "logps/chosen": -163.4232177734375, "logps/rejected": -308.515380859375, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -10.92770767211914, "rewards/margins": 11.341001510620117, "rewards/rejected": -22.268709182739258, "step": 1304 }, { "epoch": 1.45, "grad_norm": 0.008545797318220139, "learning_rate": 2.8433838945460205e-05, "logits/chosen": -0.47662168741226196, "logits/rejected": -0.4723438024520874, "logps/chosen": -233.93800354003906, "logps/rejected": -448.7249755859375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -17.709470748901367, "rewards/margins": 19.44114112854004, "rewards/rejected": -37.150611877441406, "step": 1305 }, { "epoch": 1.451111111111111, "grad_norm": 0.0011324262013658881, "learning_rate": 2.840350677800691e-05, "logits/chosen": -0.6162707805633545, "logits/rejected": -0.6971988677978516, "logps/chosen": -408.04412841796875, "logps/rejected": -834.2371215820312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -30.866933822631836, "rewards/margins": 30.770626068115234, "rewards/rejected": -61.6375617980957, "step": 1306 }, { "epoch": 1.4522222222222223, "grad_norm": 26.880685806274414, "learning_rate": 2.8373169504897356e-05, "logits/chosen": -0.6370571851730347, "logits/rejected": -0.6279306411743164, "logps/chosen": -403.2874755859375, "logps/rejected": -467.18231201171875, "loss": 0.6559, "rewards/accuracies": 0.5, "rewards/chosen": -32.3996696472168, "rewards/margins": 5.942266941070557, "rewards/rejected": -38.34193801879883, "step": 1307 }, { "epoch": 1.4533333333333334, "grad_norm": 0.074339859187603, "learning_rate": 2.8342827171640994e-05, "logits/chosen": -0.0896272212266922, "logits/rejected": -0.08878748118877411, "logps/chosen": -687.2113037109375, "logps/rejected": -760.1494750976562, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -49.611846923828125, "rewards/margins": 12.0430908203125, "rewards/rejected": -61.654937744140625, "step": 1308 }, { "epoch": 1.4544444444444444, "grad_norm": 0.6506820917129517, "learning_rate": 2.8312479823754838e-05, "logits/chosen": -0.42595988512039185, "logits/rejected": -0.448856383562088, "logps/chosen": -511.2552490234375, "logps/rejected": -600.8929443359375, "loss": 0.0063, "rewards/accuracies": 1.0, "rewards/chosen": -39.601600646972656, "rewards/margins": 9.769538879394531, "rewards/rejected": -49.37113952636719, "step": 1309 }, { "epoch": 1.4555555555555555, "grad_norm": 1.999558962850756e-09, "learning_rate": 2.8282127506763456e-05, "logits/chosen": -0.5015864372253418, "logits/rejected": -0.48337122797966003, "logps/chosen": -259.2657165527344, "logps/rejected": -536.9978637695312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -15.704612731933594, "rewards/margins": 24.972515106201172, "rewards/rejected": -40.677127838134766, "step": 1310 }, { "epoch": 1.4566666666666666, "grad_norm": 2.2685341835021973, "learning_rate": 2.825177026619885e-05, "logits/chosen": -0.5479443073272705, "logits/rejected": -0.5525041222572327, "logps/chosen": -212.35989379882812, "logps/rejected": -270.72259521484375, "loss": 0.0688, "rewards/accuracies": 1.0, "rewards/chosen": -15.760797500610352, "rewards/margins": 4.4713521003723145, "rewards/rejected": -20.232149124145508, "step": 1311 }, { "epoch": 1.4577777777777778, "grad_norm": 0.29173722863197327, "learning_rate": 2.8221408147600413e-05, "logits/chosen": -0.396734356880188, "logits/rejected": -0.37516218423843384, "logps/chosen": -404.09918212890625, "logps/rejected": -646.22900390625, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -30.960033416748047, "rewards/margins": 20.924991607666016, "rewards/rejected": -51.88502502441406, "step": 1312 }, { "epoch": 1.458888888888889, "grad_norm": 0.1397615522146225, "learning_rate": 2.8191041196514873e-05, "logits/chosen": -0.5349940657615662, "logits/rejected": -0.5341752767562866, "logps/chosen": -113.75978088378906, "logps/rejected": -197.23526000976562, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -5.719408988952637, "rewards/margins": 6.61810302734375, "rewards/rejected": -12.337512016296387, "step": 1313 }, { "epoch": 1.46, "grad_norm": 2.070781192742288e-05, "learning_rate": 2.8160669458496158e-05, "logits/chosen": -0.5228970050811768, "logits/rejected": -0.5218958854675293, "logps/chosen": -425.440673828125, "logps/rejected": -727.464111328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -31.68042755126953, "rewards/margins": 24.182823181152344, "rewards/rejected": -55.863250732421875, "step": 1314 }, { "epoch": 1.4611111111111112, "grad_norm": 9.110036849975586, "learning_rate": 2.8130292979105437e-05, "logits/chosen": -0.46121251583099365, "logits/rejected": -0.46193981170654297, "logps/chosen": -300.5735778808594, "logps/rejected": -405.94635009765625, "loss": 0.6201, "rewards/accuracies": 0.5, "rewards/chosen": -22.703895568847656, "rewards/margins": 8.559552192687988, "rewards/rejected": -31.26344871520996, "step": 1315 }, { "epoch": 1.462222222222222, "grad_norm": 0.0001491637813160196, "learning_rate": 2.809991180391095e-05, "logits/chosen": -0.6117081642150879, "logits/rejected": -0.6059864163398743, "logps/chosen": -313.88616943359375, "logps/rejected": -513.59765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -22.808948516845703, "rewards/margins": 18.230792999267578, "rewards/rejected": -41.03974151611328, "step": 1316 }, { "epoch": 1.4633333333333334, "grad_norm": 25.27727508544922, "learning_rate": 2.8069525978488e-05, "logits/chosen": -0.6726005673408508, "logits/rejected": -0.6648356318473816, "logps/chosen": -279.5002136230469, "logps/rejected": -359.203125, "loss": 2.3489, "rewards/accuracies": 0.5, "rewards/chosen": -19.19377899169922, "rewards/margins": 7.1470046043396, "rewards/rejected": -26.340784072875977, "step": 1317 }, { "epoch": 1.4644444444444444, "grad_norm": 0.028839249163866043, "learning_rate": 2.803913554841887e-05, "logits/chosen": -0.4230731725692749, "logits/rejected": -0.4698297083377838, "logps/chosen": -340.4463195800781, "logps/rejected": -515.1111450195312, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -24.629379272460938, "rewards/margins": 14.298760414123535, "rewards/rejected": -38.928138732910156, "step": 1318 }, { "epoch": 1.4655555555555555, "grad_norm": 2.750979319898761e-07, "learning_rate": 2.8008740559292722e-05, "logits/chosen": -0.42870569229125977, "logits/rejected": -0.41696715354919434, "logps/chosen": -407.4073486328125, "logps/rejected": -640.1046752929688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -31.70569610595703, "rewards/margins": 19.946189880371094, "rewards/rejected": -51.651885986328125, "step": 1319 }, { "epoch": 1.4666666666666668, "grad_norm": 25.444103240966797, "learning_rate": 2.797834105670559e-05, "logits/chosen": -0.5412353277206421, "logits/rejected": -0.5393215417861938, "logps/chosen": -217.90512084960938, "logps/rejected": -310.6866760253906, "loss": 3.4831, "rewards/accuracies": 0.5, "rewards/chosen": -14.652669906616211, "rewards/margins": 9.235496520996094, "rewards/rejected": -23.888166427612305, "step": 1320 }, { "epoch": 1.4677777777777778, "grad_norm": 0.6047273874282837, "learning_rate": 2.7947937086260272e-05, "logits/chosen": -0.6513233184814453, "logits/rejected": -0.7094168066978455, "logps/chosen": -166.43186950683594, "logps/rejected": -286.92822265625, "loss": 0.0165, "rewards/accuracies": 1.0, "rewards/chosen": -11.576526641845703, "rewards/margins": 9.885711669921875, "rewards/rejected": -21.462238311767578, "step": 1321 }, { "epoch": 1.468888888888889, "grad_norm": 0.0006461297743953764, "learning_rate": 2.791752869356625e-05, "logits/chosen": -0.372769832611084, "logits/rejected": -0.3647434711456299, "logps/chosen": -202.98202514648438, "logps/rejected": -400.112548828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.935795783996582, "rewards/margins": 13.749855995178223, "rewards/rejected": -28.685651779174805, "step": 1322 }, { "epoch": 1.47, "grad_norm": 5.419121293925855e-07, "learning_rate": 2.788711592423966e-05, "logits/chosen": -0.30458420515060425, "logits/rejected": -0.2828400731086731, "logps/chosen": -202.8564453125, "logps/rejected": -500.1541748046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.390724182128906, "rewards/margins": 24.897056579589844, "rewards/rejected": -39.28778076171875, "step": 1323 }, { "epoch": 1.471111111111111, "grad_norm": 0.7401752471923828, "learning_rate": 2.7856698823903183e-05, "logits/chosen": -0.4869520664215088, "logits/rejected": -0.48423707485198975, "logps/chosen": -199.38259887695312, "logps/rejected": -264.393310546875, "loss": 0.0194, "rewards/accuracies": 1.0, "rewards/chosen": -13.111703872680664, "rewards/margins": 5.407463550567627, "rewards/rejected": -18.519168853759766, "step": 1324 }, { "epoch": 1.4722222222222223, "grad_norm": 1.4231061935424805, "learning_rate": 2.7826277438186022e-05, "logits/chosen": -0.6220147013664246, "logits/rejected": -0.5811623334884644, "logps/chosen": -322.689697265625, "logps/rejected": -378.0502014160156, "loss": 0.0097, "rewards/accuracies": 1.0, "rewards/chosen": -22.747663497924805, "rewards/margins": 6.369682788848877, "rewards/rejected": -29.117345809936523, "step": 1325 }, { "epoch": 1.4733333333333334, "grad_norm": 61.41763687133789, "learning_rate": 2.7795851812723788e-05, "logits/chosen": -0.5370895862579346, "logits/rejected": -0.522641122341156, "logps/chosen": -355.7975158691406, "logps/rejected": -357.387939453125, "loss": 1.2122, "rewards/accuracies": 0.5, "rewards/chosen": -27.09159278869629, "rewards/margins": 0.056255340576171875, "rewards/rejected": -27.14784812927246, "step": 1326 }, { "epoch": 1.4744444444444444, "grad_norm": 0.003051814390346408, "learning_rate": 2.7765421993158464e-05, "logits/chosen": -0.36390000581741333, "logits/rejected": -0.35858508944511414, "logps/chosen": -377.01104736328125, "logps/rejected": -669.775390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -29.262683868408203, "rewards/margins": 20.715049743652344, "rewards/rejected": -49.97772979736328, "step": 1327 }, { "epoch": 1.4755555555555555, "grad_norm": 4.6442116286016244e-07, "learning_rate": 2.7734988025138327e-05, "logits/chosen": -0.2460671067237854, "logits/rejected": -0.26389259099960327, "logps/chosen": -572.7423706054688, "logps/rejected": -874.8739013671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -41.99147033691406, "rewards/margins": 26.90564727783203, "rewards/rejected": -68.8971176147461, "step": 1328 }, { "epoch": 1.4766666666666666, "grad_norm": 0.815798282623291, "learning_rate": 2.7704549954317855e-05, "logits/chosen": -0.5164622068405151, "logits/rejected": -0.49833059310913086, "logps/chosen": -151.1715545654297, "logps/rejected": -209.13131713867188, "loss": 0.0159, "rewards/accuracies": 1.0, "rewards/chosen": -10.926763534545898, "rewards/margins": 5.022192478179932, "rewards/rejected": -15.948955535888672, "step": 1329 }, { "epoch": 1.4777777777777779, "grad_norm": 0.16373956203460693, "learning_rate": 2.767410782635771e-05, "logits/chosen": -0.42144879698753357, "logits/rejected": -0.39305490255355835, "logps/chosen": -191.13706970214844, "logps/rejected": -288.83612060546875, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -12.606376647949219, "rewards/margins": 7.31845760345459, "rewards/rejected": -19.924835205078125, "step": 1330 }, { "epoch": 1.478888888888889, "grad_norm": 0.022021092474460602, "learning_rate": 2.7643661686924616e-05, "logits/chosen": -0.18218930065631866, "logits/rejected": -0.17123010754585266, "logps/chosen": -533.1105346679688, "logps/rejected": -632.763427734375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -41.65784454345703, "rewards/margins": 8.510764122009277, "rewards/rejected": -50.168609619140625, "step": 1331 }, { "epoch": 1.48, "grad_norm": 37.936912536621094, "learning_rate": 2.761321158169134e-05, "logits/chosen": -0.34192585945129395, "logits/rejected": -0.3508835434913635, "logps/chosen": -311.1593017578125, "logps/rejected": -366.64776611328125, "loss": 1.6349, "rewards/accuracies": 0.5, "rewards/chosen": -23.998756408691406, "rewards/margins": 4.540217399597168, "rewards/rejected": -28.53897476196289, "step": 1332 }, { "epoch": 1.481111111111111, "grad_norm": 0.1597410887479782, "learning_rate": 2.7582757556336575e-05, "logits/chosen": -0.5687645673751831, "logits/rejected": -0.5595725178718567, "logps/chosen": -114.01305389404297, "logps/rejected": -304.29425048828125, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": -7.859042167663574, "rewards/margins": 15.323442459106445, "rewards/rejected": -23.182485580444336, "step": 1333 }, { "epoch": 1.482222222222222, "grad_norm": 0.055138278752565384, "learning_rate": 2.75522996565449e-05, "logits/chosen": -0.5875354409217834, "logits/rejected": -0.5624794960021973, "logps/chosen": -400.12554931640625, "logps/rejected": -537.703369140625, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -31.372238159179688, "rewards/margins": 12.20955753326416, "rewards/rejected": -43.58179473876953, "step": 1334 }, { "epoch": 1.4833333333333334, "grad_norm": 0.5403629541397095, "learning_rate": 2.752183792800671e-05, "logits/chosen": -0.4713306427001953, "logits/rejected": -0.5060089826583862, "logps/chosen": -268.2249755859375, "logps/rejected": -395.2742919921875, "loss": 0.0073, "rewards/accuracies": 1.0, "rewards/chosen": -19.73638153076172, "rewards/margins": 9.237428665161133, "rewards/rejected": -28.97381019592285, "step": 1335 }, { "epoch": 1.4844444444444445, "grad_norm": 20.624267578125, "learning_rate": 2.7491372416418144e-05, "logits/chosen": -0.5382818579673767, "logits/rejected": -0.5355229377746582, "logps/chosen": -263.97161865234375, "logps/rejected": -300.70819091796875, "loss": 0.5049, "rewards/accuracies": 0.5, "rewards/chosen": -19.94051742553711, "rewards/margins": 3.168731689453125, "rewards/rejected": -23.109249114990234, "step": 1336 }, { "epoch": 1.4855555555555555, "grad_norm": 0.021935122087597847, "learning_rate": 2.7460903167481017e-05, "logits/chosen": -0.2979995012283325, "logits/rejected": -0.29168906807899475, "logps/chosen": -369.3706970214844, "logps/rejected": -502.62823486328125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -26.565250396728516, "rewards/margins": 11.665581703186035, "rewards/rejected": -38.2308349609375, "step": 1337 }, { "epoch": 1.4866666666666668, "grad_norm": 4.5822062588740664e-07, "learning_rate": 2.743043022690276e-05, "logits/chosen": -0.18098852038383484, "logits/rejected": -0.19154174625873566, "logps/chosen": -301.297119140625, "logps/rejected": -596.2998046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -22.244234085083008, "rewards/margins": 24.88750457763672, "rewards/rejected": -47.131736755371094, "step": 1338 }, { "epoch": 1.4877777777777776, "grad_norm": 0.02872796170413494, "learning_rate": 2.7399953640396302e-05, "logits/chosen": -0.2881796061992645, "logits/rejected": -0.31603866815567017, "logps/chosen": -265.9044189453125, "logps/rejected": -434.1365661621094, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -17.462512969970703, "rewards/margins": 10.992757797241211, "rewards/rejected": -28.455272674560547, "step": 1339 }, { "epoch": 1.488888888888889, "grad_norm": 0.0069345091469585896, "learning_rate": 2.7369473453680088e-05, "logits/chosen": -0.3595816195011139, "logits/rejected": -0.3448452055454254, "logps/chosen": -365.72821044921875, "logps/rejected": -515.3797607421875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -23.946916580200195, "rewards/margins": 13.03455638885498, "rewards/rejected": -36.98147201538086, "step": 1340 }, { "epoch": 1.49, "grad_norm": 2.762208938598633, "learning_rate": 2.7338989712477945e-05, "logits/chosen": -0.4909241199493408, "logits/rejected": -0.5048282146453857, "logps/chosen": -142.09469604492188, "logps/rejected": -182.52561950683594, "loss": 0.0748, "rewards/accuracies": 1.0, "rewards/chosen": -9.772804260253906, "rewards/margins": 2.9675917625427246, "rewards/rejected": -12.740396499633789, "step": 1341 }, { "epoch": 1.491111111111111, "grad_norm": 20.714885711669922, "learning_rate": 2.730850246251903e-05, "logits/chosen": -0.5010667443275452, "logits/rejected": -0.49785909056663513, "logps/chosen": -285.275146484375, "logps/rejected": -262.3319091796875, "loss": 1.8495, "rewards/accuracies": 0.5, "rewards/chosen": -19.75070571899414, "rewards/margins": -0.8332929611206055, "rewards/rejected": -18.91741180419922, "step": 1342 }, { "epoch": 1.4922222222222223, "grad_norm": 2.9953880584798753e-05, "learning_rate": 2.727801174953777e-05, "logits/chosen": -0.3498963415622711, "logits/rejected": -0.3166213631629944, "logps/chosen": -170.79443359375, "logps/rejected": -342.35186767578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.867417335510254, "rewards/margins": 14.956629753112793, "rewards/rejected": -24.824047088623047, "step": 1343 }, { "epoch": 1.4933333333333334, "grad_norm": 0.01376952975988388, "learning_rate": 2.7247517619273776e-05, "logits/chosen": -0.32463356852531433, "logits/rejected": -0.2856758236885071, "logps/chosen": -217.85311889648438, "logps/rejected": -396.3876953125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -15.84623908996582, "rewards/margins": 11.412266731262207, "rewards/rejected": -27.258506774902344, "step": 1344 }, { "epoch": 1.4944444444444445, "grad_norm": 6.251001358032227, "learning_rate": 2.7217020117471793e-05, "logits/chosen": -0.5478242635726929, "logits/rejected": -0.5448517203330994, "logps/chosen": -134.74468994140625, "logps/rejected": -240.6583709716797, "loss": 0.2506, "rewards/accuracies": 1.0, "rewards/chosen": -7.559524059295654, "rewards/margins": 9.738241195678711, "rewards/rejected": -17.297765731811523, "step": 1345 }, { "epoch": 1.4955555555555555, "grad_norm": 0.003558277850970626, "learning_rate": 2.7186519289881624e-05, "logits/chosen": -0.37510812282562256, "logits/rejected": -0.37196680903434753, "logps/chosen": -257.6865234375, "logps/rejected": -369.654296875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -19.063072204589844, "rewards/margins": 10.960432052612305, "rewards/rejected": -30.02350616455078, "step": 1346 }, { "epoch": 1.4966666666666666, "grad_norm": 0.012553980574011803, "learning_rate": 2.715601518225806e-05, "logits/chosen": -0.44869542121887207, "logits/rejected": -0.46366360783576965, "logps/chosen": -404.929931640625, "logps/rejected": -602.47509765625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -33.018070220947266, "rewards/margins": 14.865220069885254, "rewards/rejected": -47.88329315185547, "step": 1347 }, { "epoch": 1.4977777777777779, "grad_norm": 0.030031248927116394, "learning_rate": 2.712550784036082e-05, "logits/chosen": -0.3777617812156677, "logits/rejected": -0.4235742688179016, "logps/chosen": -255.55955505371094, "logps/rejected": -416.8709411621094, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -17.802677154541016, "rewards/margins": 10.096081733703613, "rewards/rejected": -27.898757934570312, "step": 1348 }, { "epoch": 1.498888888888889, "grad_norm": 0.001964372815564275, "learning_rate": 2.709499730995445e-05, "logits/chosen": -0.3961831033229828, "logits/rejected": -0.4030787944793701, "logps/chosen": -366.42266845703125, "logps/rejected": -631.9699096679688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -26.1309814453125, "rewards/margins": 18.561626434326172, "rewards/rejected": -44.69261169433594, "step": 1349 }, { "epoch": 1.5, "grad_norm": 17.626039505004883, "learning_rate": 2.7064483636808313e-05, "logits/chosen": -0.37017136812210083, "logits/rejected": -0.3802706301212311, "logps/chosen": -344.943603515625, "logps/rejected": -504.7613525390625, "loss": 0.2492, "rewards/accuracies": 1.0, "rewards/chosen": -27.156667709350586, "rewards/margins": 11.671098709106445, "rewards/rejected": -38.82776641845703, "step": 1350 }, { "epoch": 1.5, "eval_logits/chosen": -0.5302024483680725, "eval_logits/rejected": -0.5304113030433655, "eval_logps/chosen": -339.12225341796875, "eval_logps/rejected": -472.22686767578125, "eval_loss": 0.34911444783210754, "eval_rewards/accuracies": 0.8450000286102295, "eval_rewards/chosen": -25.135120391845703, "eval_rewards/margins": 10.629932403564453, "eval_rewards/rejected": -35.76505661010742, "eval_runtime": 84.5201, "eval_samples_per_second": 2.366, "eval_steps_per_second": 0.296, "step": 1350 } ], "logging_steps": 1, "max_steps": 2700, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 270, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }