{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9990049751243781, "eval_steps": 100, "global_step": 753, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0013266998341625207, "grad_norm": 51.0, "learning_rate": 6.578947368421052e-09, "logits/chosen": 1.8544756174087524, "logits/rejected": 1.3566310405731201, "logps/chosen": -567.539794921875, "logps/rejected": -536.8782348632812, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.013266998341625208, "grad_norm": 51.25, "learning_rate": 6.578947368421052e-08, "logits/chosen": 1.5698113441467285, "logits/rejected": 1.1476398706436157, "logps/chosen": -564.3509521484375, "logps/rejected": -494.2955627441406, "loss": 0.6946, "rewards/accuracies": 0.4861111044883728, "rewards/chosen": -0.003747415728867054, "rewards/margins": 0.002133134752511978, "rewards/rejected": -0.0058805509470403194, "step": 10 }, { "epoch": 0.026533996683250415, "grad_norm": 58.75, "learning_rate": 1.3157894736842104e-07, "logits/chosen": 1.6125901937484741, "logits/rejected": 1.1807773113250732, "logps/chosen": -624.02685546875, "logps/rejected": -589.86083984375, "loss": 0.6944, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.005675166379660368, "rewards/margins": -0.012083135545253754, "rewards/rejected": 0.006407969631254673, "step": 20 }, { "epoch": 0.03980099502487562, "grad_norm": 56.0, "learning_rate": 1.9736842105263157e-07, "logits/chosen": 1.6535797119140625, "logits/rejected": 1.1694531440734863, "logps/chosen": -641.99560546875, "logps/rejected": -569.0811157226562, "loss": 0.6943, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.0023951560724526644, "rewards/margins": 0.00512791657820344, "rewards/rejected": -0.0075230724178254604, "step": 30 }, { "epoch": 0.05306799336650083, "grad_norm": 50.5, "learning_rate": 2.631578947368421e-07, "logits/chosen": 1.6425060033798218, "logits/rejected": 1.3308489322662354, "logps/chosen": -529.5000610351562, "logps/rejected": -544.4364013671875, "loss": 0.6894, "rewards/accuracies": 0.59375, "rewards/chosen": 0.008951855823397636, "rewards/margins": 0.02573532797396183, "rewards/rejected": -0.016783470287919044, "step": 40 }, { "epoch": 0.06633499170812604, "grad_norm": 53.75, "learning_rate": 3.2894736842105264e-07, "logits/chosen": 1.5614960193634033, "logits/rejected": 1.2149112224578857, "logps/chosen": -581.55859375, "logps/rejected": -581.35498046875, "loss": 0.6858, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.0014039975358173251, "rewards/margins": 0.006238006986677647, "rewards/rejected": -0.007642005570232868, "step": 50 }, { "epoch": 0.07960199004975124, "grad_norm": 47.0, "learning_rate": 3.9473684210526315e-07, "logits/chosen": 1.6245126724243164, "logits/rejected": 1.2693849802017212, "logps/chosen": -615.9725341796875, "logps/rejected": -612.1929931640625, "loss": 0.6743, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.006415500305593014, "rewards/margins": 0.03802403807640076, "rewards/rejected": -0.03160853683948517, "step": 60 }, { "epoch": 0.09286898839137644, "grad_norm": 52.25, "learning_rate": 4.6052631578947365e-07, "logits/chosen": 1.5123050212860107, "logits/rejected": 1.272062063217163, "logps/chosen": -627.8987426757812, "logps/rejected": -701.8155517578125, "loss": 0.6644, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.02068617194890976, "rewards/margins": 0.06238304451107979, "rewards/rejected": -0.04169687628746033, "step": 70 }, { "epoch": 0.10613598673300166, "grad_norm": 46.0, "learning_rate": 4.999569334646955e-07, "logits/chosen": 1.5547429323196411, "logits/rejected": 1.128697156906128, "logps/chosen": -627.9088134765625, "logps/rejected": -580.3348388671875, "loss": 0.6478, "rewards/accuracies": 0.768750011920929, "rewards/chosen": 0.031235147267580032, "rewards/margins": 0.10084448754787445, "rewards/rejected": -0.06960935145616531, "step": 80 }, { "epoch": 0.11940298507462686, "grad_norm": 39.5, "learning_rate": 4.994726053293702e-07, "logits/chosen": 1.5747498273849487, "logits/rejected": 1.2050644159317017, "logps/chosen": -617.0319213867188, "logps/rejected": -612.8756103515625, "loss": 0.6291, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 0.0502072349190712, "rewards/margins": 0.15916498005390167, "rewards/rejected": -0.10895773023366928, "step": 90 }, { "epoch": 0.13266998341625208, "grad_norm": 68.0, "learning_rate": 4.984511621268102e-07, "logits/chosen": 1.6143089532852173, "logits/rejected": 1.1834152936935425, "logps/chosen": -585.5972290039062, "logps/rejected": -558.1140747070312, "loss": 0.609, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.044579118490219116, "rewards/margins": 0.17150798439979553, "rewards/rejected": -0.12692885100841522, "step": 100 }, { "epoch": 0.13266998341625208, "eval_logits/chosen": 1.6120402812957764, "eval_logits/rejected": 1.188213586807251, "eval_logps/chosen": -601.8617553710938, "eval_logps/rejected": -551.5856323242188, "eval_loss": 0.6007018685340881, "eval_rewards/accuracies": 0.9059701561927795, "eval_rewards/chosen": 0.06111238896846771, "eval_rewards/margins": 0.20366284251213074, "eval_rewards/rejected": -0.14255043864250183, "eval_runtime": 944.2914, "eval_samples_per_second": 5.675, "eval_steps_per_second": 0.355, "step": 100 }, { "epoch": 0.14593698175787728, "grad_norm": 42.75, "learning_rate": 4.968948030264742e-07, "logits/chosen": 1.6046981811523438, "logits/rejected": 1.20311439037323, "logps/chosen": -601.0135498046875, "logps/rejected": -603.923828125, "loss": 0.594, "rewards/accuracies": 0.893750011920929, "rewards/chosen": 0.08082517236471176, "rewards/margins": 0.2271861582994461, "rewards/rejected": -0.14636099338531494, "step": 110 }, { "epoch": 0.15920398009950248, "grad_norm": 40.75, "learning_rate": 4.948068788729238e-07, "logits/chosen": 1.6343997716903687, "logits/rejected": 1.3451590538024902, "logps/chosen": -570.892822265625, "logps/rejected": -606.6846313476562, "loss": 0.5821, "rewards/accuracies": 0.856249988079071, "rewards/chosen": 0.06719505786895752, "rewards/margins": 0.24652114510536194, "rewards/rejected": -0.17932605743408203, "step": 120 }, { "epoch": 0.1724709784411277, "grad_norm": 38.5, "learning_rate": 4.921918849714475e-07, "logits/chosen": 1.5962369441986084, "logits/rejected": 1.290100336074829, "logps/chosen": -622.4376831054688, "logps/rejected": -632.4610595703125, "loss": 0.5588, "rewards/accuracies": 0.893750011920929, "rewards/chosen": 0.09148349612951279, "rewards/margins": 0.30931785702705383, "rewards/rejected": -0.21783435344696045, "step": 130 }, { "epoch": 0.1857379767827529, "grad_norm": 34.25, "learning_rate": 4.890554514096591e-07, "logits/chosen": 1.5773239135742188, "logits/rejected": 1.1489975452423096, "logps/chosen": -602.868896484375, "logps/rejected": -549.524169921875, "loss": 0.5491, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 0.0930393859744072, "rewards/margins": 0.30938929319381714, "rewards/rejected": -0.21634991466999054, "step": 140 }, { "epoch": 0.19900497512437812, "grad_norm": 35.0, "learning_rate": 4.854043309359063e-07, "logits/chosen": 1.6148580312728882, "logits/rejected": 1.1569465398788452, "logps/chosen": -558.3030395507812, "logps/rejected": -491.3648376464844, "loss": 0.5381, "rewards/accuracies": 0.90625, "rewards/chosen": 0.10552488267421722, "rewards/margins": 0.33807462453842163, "rewards/rejected": -0.2325497567653656, "step": 150 }, { "epoch": 0.21227197346600332, "grad_norm": 34.5, "learning_rate": 4.812463844205884e-07, "logits/chosen": 1.6784461736679077, "logits/rejected": 1.2240537405014038, "logps/chosen": -619.3511352539062, "logps/rejected": -594.6825561523438, "loss": 0.5233, "rewards/accuracies": 0.9375, "rewards/chosen": 0.11677803099155426, "rewards/margins": 0.4055229127407074, "rewards/rejected": -0.28874486684799194, "step": 160 }, { "epoch": 0.22553897180762852, "grad_norm": 32.5, "learning_rate": 4.7659056393168604e-07, "logits/chosen": 1.5311599969863892, "logits/rejected": 1.1275131702423096, "logps/chosen": -546.968994140625, "logps/rejected": -562.0914916992188, "loss": 0.5161, "rewards/accuracies": 0.9375, "rewards/chosen": 0.11446012556552887, "rewards/margins": 0.41840943694114685, "rewards/rejected": -0.3039492964744568, "step": 170 }, { "epoch": 0.23880597014925373, "grad_norm": 35.25, "learning_rate": 4.714468934609381e-07, "logits/chosen": 1.6642940044403076, "logits/rejected": 1.2685682773590088, "logps/chosen": -535.3641357421875, "logps/rejected": -483.2560119628906, "loss": 0.5133, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 0.14332327246665955, "rewards/margins": 0.45225757360458374, "rewards/rejected": -0.3089343309402466, "step": 180 }, { "epoch": 0.25207296849087896, "grad_norm": 29.125, "learning_rate": 4.658264473421659e-07, "logits/chosen": 1.5365699529647827, "logits/rejected": 1.1466565132141113, "logps/chosen": -604.6485595703125, "logps/rejected": -536.5999755859375, "loss": 0.5111, "rewards/accuracies": 0.893750011920929, "rewards/chosen": 0.10701610147953033, "rewards/margins": 0.44357848167419434, "rewards/rejected": -0.3365623950958252, "step": 190 }, { "epoch": 0.26533996683250416, "grad_norm": 35.0, "learning_rate": 4.597413264082086e-07, "logits/chosen": 1.5508623123168945, "logits/rejected": 1.230803370475769, "logps/chosen": -575.4376831054688, "logps/rejected": -567.358642578125, "loss": 0.4911, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.12208463251590729, "rewards/margins": 0.4712677597999573, "rewards/rejected": -0.3491831123828888, "step": 200 }, { "epoch": 0.26533996683250416, "eval_logits/chosen": 1.5940254926681519, "eval_logits/rejected": 1.1787534952163696, "eval_logps/chosen": -601.0678100585938, "eval_logps/rejected": -553.9150390625, "eval_loss": 0.4846906065940857, "eval_rewards/accuracies": 0.9328358173370361, "eval_rewards/chosen": 0.14050239324569702, "eval_rewards/margins": 0.5159949660301208, "eval_rewards/rejected": -0.3754926025867462, "eval_runtime": 949.6151, "eval_samples_per_second": 5.643, "eval_steps_per_second": 0.353, "step": 200 }, { "epoch": 0.27860696517412936, "grad_norm": 29.5, "learning_rate": 4.5320463193780256e-07, "logits/chosen": 1.622079610824585, "logits/rejected": 1.2113277912139893, "logps/chosen": -568.9658203125, "logps/rejected": -531.3096313476562, "loss": 0.4749, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 0.15365783870220184, "rewards/margins": 0.5153481960296631, "rewards/rejected": -0.36169037222862244, "step": 210 }, { "epoch": 0.29187396351575456, "grad_norm": 29.625, "learning_rate": 4.4623043744850044e-07, "logits/chosen": 1.4447951316833496, "logits/rejected": 1.0994714498519897, "logps/chosen": -583.0100708007812, "logps/rejected": -567.3427124023438, "loss": 0.4816, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 0.14035607874393463, "rewards/margins": 0.5472031831741333, "rewards/rejected": -0.4068470895290375, "step": 220 }, { "epoch": 0.30514096185737977, "grad_norm": 29.625, "learning_rate": 4.388337583963563e-07, "logits/chosen": 1.5770004987716675, "logits/rejected": 1.247883915901184, "logps/chosen": -631.2025146484375, "logps/rejected": -644.8001708984375, "loss": 0.4652, "rewards/accuracies": 0.90625, "rewards/chosen": 0.11735354363918304, "rewards/margins": 0.6114899516105652, "rewards/rejected": -0.49413642287254333, "step": 230 }, { "epoch": 0.31840796019900497, "grad_norm": 34.5, "learning_rate": 4.31030519847616e-07, "logits/chosen": 1.6700563430786133, "logits/rejected": 1.222560167312622, "logps/chosen": -560.0415649414062, "logps/rejected": -482.7701721191406, "loss": 0.4598, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.13375836610794067, "rewards/margins": 0.5355907082557678, "rewards/rejected": -0.4018322825431824, "step": 240 }, { "epoch": 0.33167495854063017, "grad_norm": 30.0, "learning_rate": 4.2283752219201464e-07, "logits/chosen": 1.5012189149856567, "logits/rejected": 1.1074771881103516, "logps/chosen": -564.4129028320312, "logps/rejected": -519.0759887695312, "loss": 0.4579, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.13092441856861115, "rewards/margins": 0.5768009424209595, "rewards/rejected": -0.44587650895118713, "step": 250 }, { "epoch": 0.3449419568822554, "grad_norm": 28.0, "learning_rate": 4.1427240497150047e-07, "logits/chosen": 1.6075923442840576, "logits/rejected": 1.154566764831543, "logps/chosen": -608.5453491210938, "logps/rejected": -556.8157348632812, "loss": 0.4454, "rewards/accuracies": 0.90625, "rewards/chosen": 0.1563771516084671, "rewards/margins": 0.6038269996643066, "rewards/rejected": -0.44744986295700073, "step": 260 }, { "epoch": 0.3582089552238806, "grad_norm": 29.25, "learning_rate": 4.053536089022623e-07, "logits/chosen": 1.4856622219085693, "logits/rejected": 1.1645286083221436, "logps/chosen": -511.95501708984375, "logps/rejected": -489.37591552734375, "loss": 0.449, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": 0.13391181826591492, "rewards/margins": 0.6065136194229126, "rewards/rejected": -0.4726017117500305, "step": 270 }, { "epoch": 0.3714759535655058, "grad_norm": 32.0, "learning_rate": 3.9610033617182715e-07, "logits/chosen": 1.59127938747406, "logits/rejected": 1.3113174438476562, "logps/chosen": -619.38330078125, "logps/rejected": -641.9339599609375, "loss": 0.4369, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 0.18023821711540222, "rewards/margins": 0.7155500650405884, "rewards/rejected": -0.5353118181228638, "step": 280 }, { "epoch": 0.38474295190713104, "grad_norm": 28.125, "learning_rate": 3.865325090967081e-07, "logits/chosen": 1.6182178258895874, "logits/rejected": 1.2581727504730225, "logps/chosen": -560.4617309570312, "logps/rejected": -567.3304443359375, "loss": 0.4416, "rewards/accuracies": 0.893750011920929, "rewards/chosen": 0.16626134514808655, "rewards/margins": 0.6851488351821899, "rewards/rejected": -0.518887460231781, "step": 290 }, { "epoch": 0.39800995024875624, "grad_norm": 30.0, "learning_rate": 3.7667072722961357e-07, "logits/chosen": 1.5728381872177124, "logits/rejected": 1.1060500144958496, "logps/chosen": -588.7072143554688, "logps/rejected": -534.3040771484375, "loss": 0.4222, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.17058302462100983, "rewards/margins": 0.6764044761657715, "rewards/rejected": -0.5058214068412781, "step": 300 }, { "epoch": 0.39800995024875624, "eval_logits/chosen": 1.5840206146240234, "eval_logits/rejected": 1.173782467842102, "eval_logps/chosen": -600.7857055664062, "eval_logps/rejected": -555.5128784179688, "eval_loss": 0.4297899901866913, "eval_rewards/accuracies": 0.937313437461853, "eval_rewards/chosen": 0.16871587932109833, "eval_rewards/margins": 0.7039843201637268, "eval_rewards/rejected": -0.5352683663368225, "eval_runtime": 1222.6403, "eval_samples_per_second": 4.383, "eval_steps_per_second": 0.274, "step": 300 }, { "epoch": 0.41127694859038144, "grad_norm": 29.0, "learning_rate": 3.6653622300856457e-07, "logits/chosen": 1.4719158411026, "logits/rejected": 1.0981864929199219, "logps/chosen": -582.35107421875, "logps/rejected": -572.4479370117188, "loss": 0.4213, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 0.16171951591968536, "rewards/margins": 0.7223715782165527, "rewards/rejected": -0.5606520175933838, "step": 310 }, { "epoch": 0.42454394693200664, "grad_norm": 27.75, "learning_rate": 3.5615081604340903e-07, "logits/chosen": 1.532057523727417, "logits/rejected": 1.1843515634536743, "logps/chosen": -613.1371459960938, "logps/rejected": -606.70849609375, "loss": 0.4262, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.17750167846679688, "rewards/margins": 0.7329319715499878, "rewards/rejected": -0.5554302334785461, "step": 320 }, { "epoch": 0.43781094527363185, "grad_norm": 29.125, "learning_rate": 3.455368661381543e-07, "logits/chosen": 1.5567116737365723, "logits/rejected": 1.24821138381958, "logps/chosen": -485.0538024902344, "logps/rejected": -468.4979553222656, "loss": 0.4265, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.13072678446769714, "rewards/margins": 0.6463128328323364, "rewards/rejected": -0.5155860185623169, "step": 330 }, { "epoch": 0.45107794361525705, "grad_norm": 28.875, "learning_rate": 3.347172251502598e-07, "logits/chosen": 1.5567398071289062, "logits/rejected": 1.1744862794876099, "logps/chosen": -599.8635864257812, "logps/rejected": -530.7503662109375, "loss": 0.4081, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.1655641794204712, "rewards/margins": 0.7356466054916382, "rewards/rejected": -0.5700823664665222, "step": 340 }, { "epoch": 0.46434494195688225, "grad_norm": 27.5, "learning_rate": 3.2371518779053744e-07, "logits/chosen": 1.5260709524154663, "logits/rejected": 1.2051351070404053, "logps/chosen": -651.7446899414062, "logps/rejected": -658.5606079101562, "loss": 0.4077, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.14886924624443054, "rewards/margins": 0.7922807931900024, "rewards/rejected": -0.6434115171432495, "step": 350 }, { "epoch": 0.47761194029850745, "grad_norm": 25.75, "learning_rate": 3.1255444146958844e-07, "logits/chosen": 1.6431595087051392, "logits/rejected": 1.3053652048110962, "logps/chosen": -573.4000854492188, "logps/rejected": -587.4290771484375, "loss": 0.4038, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.12524265050888062, "rewards/margins": 0.7791818380355835, "rewards/rejected": -0.6539390683174133, "step": 360 }, { "epoch": 0.49087893864013266, "grad_norm": 28.25, "learning_rate": 3.012590152987561e-07, "logits/chosen": 1.6208022832870483, "logits/rejected": 1.2217758893966675, "logps/chosen": -597.6519775390625, "logps/rejected": -559.587158203125, "loss": 0.4055, "rewards/accuracies": 0.9375, "rewards/chosen": 0.15780240297317505, "rewards/margins": 0.78492271900177, "rewards/rejected": -0.627120316028595, "step": 370 }, { "epoch": 0.5041459369817579, "grad_norm": 26.875, "learning_rate": 2.8985322835539626e-07, "logits/chosen": 1.5740673542022705, "logits/rejected": 1.1342817544937134, "logps/chosen": -619.0950317382812, "logps/rejected": -571.8907470703125, "loss": 0.406, "rewards/accuracies": 0.96875, "rewards/chosen": 0.22077970206737518, "rewards/margins": 0.8234270215034485, "rewards/rejected": -0.6026473641395569, "step": 380 }, { "epoch": 0.5174129353233831, "grad_norm": 28.125, "learning_rate": 2.7836163732385063e-07, "logits/chosen": 1.6202328205108643, "logits/rejected": 1.2261427640914917, "logps/chosen": -631.4647827148438, "logps/rejected": -581.2449951171875, "loss": 0.3901, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.15532895922660828, "rewards/margins": 0.7934967279434204, "rewards/rejected": -0.6381677389144897, "step": 390 }, { "epoch": 0.5306799336650083, "grad_norm": 26.25, "learning_rate": 2.6680898362485124e-07, "logits/chosen": 1.6189947128295898, "logits/rejected": 1.3066940307617188, "logps/chosen": -543.3778686523438, "logps/rejected": -551.9182739257812, "loss": 0.3917, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.18244606256484985, "rewards/margins": 0.7903228402137756, "rewards/rejected": -0.6078766584396362, "step": 400 }, { "epoch": 0.5306799336650083, "eval_logits/chosen": 1.5761325359344482, "eval_logits/rejected": 1.168166160583496, "eval_logps/chosen": -600.7433471679688, "eval_logps/rejected": -556.4622192382812, "eval_loss": 0.4034000039100647, "eval_rewards/accuracies": 0.9417910575866699, "eval_rewards/chosen": 0.17294840514659882, "eval_rewards/margins": 0.8031538128852844, "eval_rewards/rejected": -0.6302054524421692, "eval_runtime": 911.2554, "eval_samples_per_second": 5.881, "eval_steps_per_second": 0.368, "step": 400 }, { "epoch": 0.5439469320066335, "grad_norm": 25.125, "learning_rate": 2.5522014014718697e-07, "logits/chosen": 1.6159231662750244, "logits/rejected": 1.28201162815094, "logps/chosen": -550.4302978515625, "logps/rejected": -489.55419921875, "loss": 0.4041, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 0.15269801020622253, "rewards/margins": 0.7327533960342407, "rewards/rejected": -0.5800554156303406, "step": 410 }, { "epoch": 0.5572139303482587, "grad_norm": 26.0, "learning_rate": 2.436200576963198e-07, "logits/chosen": 1.5422090291976929, "logits/rejected": 1.1291064023971558, "logps/chosen": -561.81201171875, "logps/rejected": -493.92218017578125, "loss": 0.3949, "rewards/accuracies": 0.9375, "rewards/chosen": 0.19676923751831055, "rewards/margins": 0.8081914186477661, "rewards/rejected": -0.6114221811294556, "step": 420 }, { "epoch": 0.5704809286898839, "grad_norm": 27.125, "learning_rate": 2.3203371127524588e-07, "logits/chosen": 1.578450322151184, "logits/rejected": 1.1965930461883545, "logps/chosen": -540.2564086914062, "logps/rejected": -480.0533752441406, "loss": 0.4106, "rewards/accuracies": 0.90625, "rewards/chosen": 0.15361563861370087, "rewards/margins": 0.780199408531189, "rewards/rejected": -0.6265838146209717, "step": 430 }, { "epoch": 0.5837479270315091, "grad_norm": 29.625, "learning_rate": 2.2048604631325892e-07, "logits/chosen": 1.4670709371566772, "logits/rejected": 1.1125580072402954, "logps/chosen": -566.7425537109375, "logps/rejected": -548.0025634765625, "loss": 0.4048, "rewards/accuracies": 0.90625, "rewards/chosen": 0.13213221728801727, "rewards/margins": 0.7166903614997864, "rewards/rejected": -0.5845580697059631, "step": 440 }, { "epoch": 0.5970149253731343, "grad_norm": 25.0, "learning_rate": 2.0900192495838615e-07, "logits/chosen": 1.5971689224243164, "logits/rejected": 1.2005088329315186, "logps/chosen": -526.1453857421875, "logps/rejected": -480.22021484375, "loss": 0.3945, "rewards/accuracies": 0.9375, "rewards/chosen": 0.164282888174057, "rewards/margins": 0.7849767208099365, "rewards/rejected": -0.6206938028335571, "step": 450 }, { "epoch": 0.6102819237147595, "grad_norm": 25.125, "learning_rate": 1.9760607254912926e-07, "logits/chosen": 1.6195251941680908, "logits/rejected": 1.1817606687545776, "logps/chosen": -598.7520751953125, "logps/rejected": -555.9591674804688, "loss": 0.3947, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 0.17142745852470398, "rewards/margins": 0.7681721448898315, "rewards/rejected": -0.59674471616745, "step": 460 }, { "epoch": 0.6235489220563848, "grad_norm": 27.625, "learning_rate": 1.8632302438075613e-07, "logits/chosen": 1.5905851125717163, "logits/rejected": 1.2247693538665771, "logps/chosen": -583.4114990234375, "logps/rejected": -560.2368774414062, "loss": 0.3967, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.2055719643831253, "rewards/margins": 0.8811567425727844, "rewards/rejected": -0.6755847930908203, "step": 470 }, { "epoch": 0.6368159203980099, "grad_norm": 24.75, "learning_rate": 1.7517707288075614e-07, "logits/chosen": 1.56401789188385, "logits/rejected": 1.1912111043930054, "logps/chosen": -568.5778198242188, "logps/rejected": -560.7440185546875, "loss": 0.3823, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 0.2012619525194168, "rewards/margins": 0.8539912104606628, "rewards/rejected": -0.6527292132377625, "step": 480 }, { "epoch": 0.6500829187396352, "grad_norm": 31.25, "learning_rate": 1.641922153071906e-07, "logits/chosen": 1.6006752252578735, "logits/rejected": 1.2529712915420532, "logps/chosen": -569.6580200195312, "logps/rejected": -560.5640869140625, "loss": 0.4024, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.14351987838745117, "rewards/margins": 0.8288625478744507, "rewards/rejected": -0.6853427290916443, "step": 490 }, { "epoch": 0.6633499170812603, "grad_norm": 25.875, "learning_rate": 1.5339210208254344e-07, "logits/chosen": 1.5206557512283325, "logits/rejected": 1.1747671365737915, "logps/chosen": -554.2539672851562, "logps/rejected": -562.5396728515625, "loss": 0.3924, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.17344887554645538, "rewards/margins": 0.8352993726730347, "rewards/rejected": -0.6618505120277405, "step": 500 }, { "epoch": 0.6633499170812603, "eval_logits/chosen": 1.5752681493759155, "eval_logits/rejected": 1.1689281463623047, "eval_logps/chosen": -600.6738891601562, "eval_logps/rejected": -556.8052368164062, "eval_loss": 0.3936308026313782, "eval_rewards/accuracies": 0.9425373077392578, "eval_rewards/chosen": 0.17989806830883026, "eval_rewards/margins": 0.8444048166275024, "eval_rewards/rejected": -0.6645067930221558, "eval_runtime": 956.6769, "eval_samples_per_second": 5.602, "eval_steps_per_second": 0.35, "step": 500 }, { "epoch": 0.6766169154228856, "grad_norm": 26.5, "learning_rate": 1.4279998587430943e-07, "logits/chosen": 1.488255262374878, "logits/rejected": 1.0699503421783447, "logps/chosen": -569.7669677734375, "logps/rejected": -469.26739501953125, "loss": 0.381, "rewards/accuracies": 0.9375, "rewards/chosen": 0.18479984998703003, "rewards/margins": 0.8514007329940796, "rewards/rejected": -0.6666008830070496, "step": 510 }, { "epoch": 0.6898839137645107, "grad_norm": 24.625, "learning_rate": 1.324386715319503e-07, "logits/chosen": 1.5778402090072632, "logits/rejected": 1.2051408290863037, "logps/chosen": -542.29736328125, "logps/rejected": -521.4121704101562, "loss": 0.3929, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.1931554526090622, "rewards/margins": 0.8262062072753906, "rewards/rejected": -0.6330506801605225, "step": 520 }, { "epoch": 0.703150912106136, "grad_norm": 25.375, "learning_rate": 1.2233046698800343e-07, "logits/chosen": 1.6390968561172485, "logits/rejected": 1.2395927906036377, "logps/chosen": -617.5450439453125, "logps/rejected": -633.2860107421875, "loss": 0.3846, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.21280376613140106, "rewards/margins": 0.8998249769210815, "rewards/rejected": -0.6870212554931641, "step": 530 }, { "epoch": 0.7164179104477612, "grad_norm": 27.25, "learning_rate": 1.124971352290545e-07, "logits/chosen": 1.6308505535125732, "logits/rejected": 1.2848546504974365, "logps/chosen": -600.9317016601562, "logps/rejected": -597.5542602539062, "loss": 0.3796, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.1786193549633026, "rewards/margins": 0.8511413335800171, "rewards/rejected": -0.6725220084190369, "step": 540 }, { "epoch": 0.7296849087893864, "grad_norm": 27.25, "learning_rate": 1.0295984743997909e-07, "logits/chosen": 1.6056525707244873, "logits/rejected": 1.1871583461761475, "logps/chosen": -601.3228759765625, "logps/rejected": -590.7117919921875, "loss": 0.3926, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.16356605291366577, "rewards/margins": 0.8453407287597656, "rewards/rejected": -0.6817747354507446, "step": 550 }, { "epoch": 0.7429519071310116, "grad_norm": 25.0, "learning_rate": 9.37391374223355e-08, "logits/chosen": 1.5422403812408447, "logits/rejected": 1.2078697681427002, "logps/chosen": -590.7647705078125, "logps/rejected": -601.2999877929688, "loss": 0.3779, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.19642992317676544, "rewards/margins": 0.8918864130973816, "rewards/rejected": -0.6954564452171326, "step": 560 }, { "epoch": 0.7562189054726368, "grad_norm": 28.875, "learning_rate": 8.485485738504488e-08, "logits/chosen": 1.541519045829773, "logits/rejected": 1.014730453491211, "logps/chosen": -625.3824462890625, "logps/rejected": -462.270263671875, "loss": 0.3864, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.18155282735824585, "rewards/margins": 0.842772364616394, "rewards/rejected": -0.6612194776535034, "step": 570 }, { "epoch": 0.7694859038142621, "grad_norm": 32.0, "learning_rate": 7.632613520254158e-08, "logits/chosen": 1.5010730028152466, "logits/rejected": 1.1362125873565674, "logps/chosen": -605.3756103515625, "logps/rejected": -537.768798828125, "loss": 0.3933, "rewards/accuracies": 0.9375, "rewards/chosen": 0.1804153472185135, "rewards/margins": 0.9065285921096802, "rewards/rejected": -0.7261131405830383, "step": 580 }, { "epoch": 0.7827529021558872, "grad_norm": 27.375, "learning_rate": 6.817133323241755e-08, "logits/chosen": 1.6831426620483398, "logits/rejected": 1.124448537826538, "logps/chosen": -661.5943603515625, "logps/rejected": -537.2649536132812, "loss": 0.3937, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.18760402500629425, "rewards/margins": 0.840624988079071, "rewards/rejected": -0.6530209183692932, "step": 590 }, { "epoch": 0.7960199004975125, "grad_norm": 24.25, "learning_rate": 6.040800878122654e-08, "logits/chosen": 1.5694350004196167, "logits/rejected": 1.1736431121826172, "logps/chosen": -562.9141845703125, "logps/rejected": -531.40673828125, "loss": 0.3874, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.16524121165275574, "rewards/margins": 0.8450342416763306, "rewards/rejected": -0.6797930598258972, "step": 600 }, { "epoch": 0.7960199004975125, "eval_logits/chosen": 1.5742043256759644, "eval_logits/rejected": 1.1683696508407593, "eval_logps/chosen": -600.6769409179688, "eval_logps/rejected": -556.9197998046875, "eval_loss": 0.391216516494751, "eval_rewards/accuracies": 0.9432835578918457, "eval_rewards/chosen": 0.17959673702716827, "eval_rewards/margins": 0.8555588722229004, "eval_rewards/rejected": -0.6759620904922485, "eval_runtime": 926.9746, "eval_samples_per_second": 5.781, "eval_steps_per_second": 0.361, "step": 600 }, { "epoch": 0.8092868988391376, "grad_norm": 26.0, "learning_rate": 5.305287630356362e-08, "logits/chosen": 1.6091585159301758, "logits/rejected": 1.1641608476638794, "logps/chosen": -600.8865966796875, "logps/rejected": -591.0731201171875, "loss": 0.3924, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 0.18911994993686676, "rewards/margins": 0.8595080375671387, "rewards/rejected": -0.6703880429267883, "step": 610 }, { "epoch": 0.8225538971807629, "grad_norm": 26.5, "learning_rate": 4.612177141580875e-08, "logits/chosen": 1.5971522331237793, "logits/rejected": 1.208573341369629, "logps/chosen": -597.9872436523438, "logps/rejected": -583.856201171875, "loss": 0.391, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.2065185010433197, "rewards/margins": 0.8820034861564636, "rewards/rejected": -0.6754850149154663, "step": 620 }, { "epoch": 0.835820895522388, "grad_norm": 24.875, "learning_rate": 3.962961680200927e-08, "logits/chosen": 1.6051514148712158, "logits/rejected": 1.2354391813278198, "logps/chosen": -577.6351318359375, "logps/rejected": -571.5172729492188, "loss": 0.3891, "rewards/accuracies": 0.9375, "rewards/chosen": 0.17026260495185852, "rewards/margins": 0.8514760732650757, "rewards/rejected": -0.6812134981155396, "step": 630 }, { "epoch": 0.8490878938640133, "grad_norm": 25.625, "learning_rate": 3.359039008530845e-08, "logits/chosen": 1.624871015548706, "logits/rejected": 1.2038135528564453, "logps/chosen": -636.6817626953125, "logps/rejected": -584.8831787109375, "loss": 0.3874, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 0.18935716152191162, "rewards/margins": 0.9286630749702454, "rewards/rejected": -0.7393059134483337, "step": 640 }, { "epoch": 0.8623548922056384, "grad_norm": 24.375, "learning_rate": 2.8017093734092474e-08, "logits/chosen": 1.6053879261016846, "logits/rejected": 1.1266980171203613, "logps/chosen": -616.425048828125, "logps/rejected": -537.4404907226562, "loss": 0.3914, "rewards/accuracies": 0.9375, "rewards/chosen": 0.1774505376815796, "rewards/margins": 0.8556821942329407, "rewards/rejected": -0.6782316565513611, "step": 650 }, { "epoch": 0.8756218905472637, "grad_norm": 22.375, "learning_rate": 2.292172706764703e-08, "logits/chosen": 1.5424891710281372, "logits/rejected": 1.2280899286270142, "logps/chosen": -620.6058959960938, "logps/rejected": -629.50927734375, "loss": 0.3878, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 0.18996702134609222, "rewards/margins": 0.8839784860610962, "rewards/rejected": -0.6940114498138428, "step": 660 }, { "epoch": 0.8888888888888888, "grad_norm": 27.625, "learning_rate": 1.8315260421596924e-08, "logits/chosen": 1.6114118099212646, "logits/rejected": 1.2134363651275635, "logps/chosen": -561.7435302734375, "logps/rejected": -523.7152099609375, "loss": 0.3946, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 0.1770535558462143, "rewards/margins": 0.8293676376342773, "rewards/rejected": -0.6523140668869019, "step": 670 }, { "epoch": 0.9021558872305141, "grad_norm": 25.375, "learning_rate": 1.4207611528748997e-08, "logits/chosen": 1.526018738746643, "logits/rejected": 1.1832793951034546, "logps/chosen": -560.2198486328125, "logps/rejected": -561.2310180664062, "loss": 0.3951, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 0.15571674704551697, "rewards/margins": 0.8264566659927368, "rewards/rejected": -0.6707398891448975, "step": 680 }, { "epoch": 0.9154228855721394, "grad_norm": 29.25, "learning_rate": 1.0607624166191958e-08, "logits/chosen": 1.5950744152069092, "logits/rejected": 1.2837769985198975, "logps/chosen": -666.151123046875, "logps/rejected": -735.0059814453125, "loss": 0.3823, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.20073120296001434, "rewards/margins": 0.8991263508796692, "rewards/rejected": -0.6983952522277832, "step": 690 }, { "epoch": 0.9286898839137645, "grad_norm": 26.75, "learning_rate": 7.523049114624647e-09, "logits/chosen": 1.641229271888733, "logits/rejected": 1.2468892335891724, "logps/chosen": -616.0029296875, "logps/rejected": -599.7814331054688, "loss": 0.3922, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 0.21716108918190002, "rewards/margins": 0.9342123866081238, "rewards/rejected": -0.7170513868331909, "step": 700 }, { "epoch": 0.9286898839137645, "eval_logits/chosen": 1.5741949081420898, "eval_logits/rejected": 1.1684850454330444, "eval_logps/chosen": -600.683837890625, "eval_logps/rejected": -556.948486328125, "eval_loss": 0.39090853929519653, "eval_rewards/accuracies": 0.9395522475242615, "eval_rewards/chosen": 0.17890305817127228, "eval_rewards/margins": 0.85773104429245, "eval_rewards/rejected": -0.6788280606269836, "eval_runtime": 904.9289, "eval_samples_per_second": 5.922, "eval_steps_per_second": 0.37, "step": 700 }, { "epoch": 0.9419568822553898, "grad_norm": 29.125, "learning_rate": 4.960527470908277e-09, "logits/chosen": 1.5134766101837158, "logits/rejected": 1.1567257642745972, "logps/chosen": -620.8663330078125, "logps/rejected": -603.9954833984375, "loss": 0.3913, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 0.1584736704826355, "rewards/margins": 0.8911491632461548, "rewards/rejected": -0.7326754331588745, "step": 710 }, { "epoch": 0.9552238805970149, "grad_norm": 25.375, "learning_rate": 2.925576349770337e-09, "logits/chosen": 1.5958002805709839, "logits/rejected": 1.2081973552703857, "logps/chosen": -600.2033081054688, "logps/rejected": -570.5869140625, "loss": 0.396, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.21250076591968536, "rewards/margins": 0.8857507705688477, "rewards/rejected": -0.6732500195503235, "step": 720 }, { "epoch": 0.9684908789386402, "grad_norm": 26.25, "learning_rate": 1.4225770054443197e-09, "logits/chosen": 1.5151315927505493, "logits/rejected": 1.1391550302505493, "logps/chosen": -566.0535278320312, "logps/rejected": -490.119873046875, "loss": 0.3943, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.18251679837703705, "rewards/margins": 0.8498570322990417, "rewards/rejected": -0.6673402190208435, "step": 730 }, { "epoch": 0.9817578772802653, "grad_norm": 26.5, "learning_rate": 4.547653988198619e-10, "logits/chosen": 1.5504015684127808, "logits/rejected": 1.2012706995010376, "logps/chosen": -632.9603881835938, "logps/rejected": -606.04638671875, "loss": 0.385, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.17528708279132843, "rewards/margins": 0.8416942358016968, "rewards/rejected": -0.6664072275161743, "step": 740 }, { "epoch": 0.9950248756218906, "grad_norm": 25.625, "learning_rate": 2.4225230411789588e-11, "logits/chosen": 1.6227556467056274, "logits/rejected": 1.2329021692276, "logps/chosen": -631.4197998046875, "logps/rejected": -641.5535888671875, "loss": 0.3883, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.22030360996723175, "rewards/margins": 0.900096595287323, "rewards/rejected": -0.6797930002212524, "step": 750 }, { "epoch": 0.9990049751243781, "step": 753, "total_flos": 0.0, "train_loss": 0.45811287039621257, "train_runtime": 34472.405, "train_samples_per_second": 1.399, "train_steps_per_second": 0.022 } ], "logging_steps": 10, "max_steps": 753, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }