{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9997120644975526, "eval_steps": 100, "global_step": 1736, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 10.610388839867777, "learning_rate": 2.8735632183908045e-09, "logits/chosen": -2.688382625579834, "logits/rejected": -2.687504768371582, "logps/chosen": -154.15142822265625, "logps/rejected": -119.21998596191406, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.01, "grad_norm": 11.287668992561438, "learning_rate": 2.8735632183908043e-08, "logits/chosen": -2.693573236465454, "logits/rejected": -2.7061853408813477, "logps/chosen": -203.12576293945312, "logps/rejected": -203.58848571777344, "loss": 0.6933, "rewards/accuracies": 0.4305555522441864, "rewards/chosen": -0.0002493205538485199, "rewards/margins": -0.00013067919644527137, "rewards/rejected": -0.0001186413355753757, "step": 10 }, { "epoch": 0.01, "grad_norm": 10.975446002121831, "learning_rate": 5.747126436781609e-08, "logits/chosen": -2.6681714057922363, "logits/rejected": -2.6636619567871094, "logps/chosen": -208.20529174804688, "logps/rejected": -195.71517944335938, "loss": 0.6931, "rewards/accuracies": 0.53125, "rewards/chosen": 2.0605861209332943e-05, "rewards/margins": 0.0007079349015839398, "rewards/rejected": -0.0006873290403746068, "step": 20 }, { "epoch": 0.02, "grad_norm": 10.987240036415274, "learning_rate": 8.620689655172414e-08, "logits/chosen": -2.6226565837860107, "logits/rejected": -2.627593755722046, "logps/chosen": -179.27633666992188, "logps/rejected": -194.77871704101562, "loss": 0.693, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.0009387334575876594, "rewards/margins": 0.0007720856228843331, "rewards/rejected": 0.00016664779104758054, "step": 30 }, { "epoch": 0.02, "grad_norm": 11.874024139589977, "learning_rate": 1.1494252873563217e-07, "logits/chosen": -2.610243320465088, "logits/rejected": -2.571385145187378, "logps/chosen": -208.62820434570312, "logps/rejected": -187.62649536132812, "loss": 0.6927, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.001225657993927598, "rewards/margins": 0.0014799232594668865, "rewards/rejected": -0.00025426512002013624, "step": 40 }, { "epoch": 0.03, "grad_norm": 10.535632759826791, "learning_rate": 1.436781609195402e-07, "logits/chosen": -2.6413865089416504, "logits/rejected": -2.665769100189209, "logps/chosen": -236.5024871826172, "logps/rejected": -203.89524841308594, "loss": 0.6923, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.0021102039609104395, "rewards/margins": 0.0023120432160794735, "rewards/rejected": -0.00020183932792861015, "step": 50 }, { "epoch": 0.03, "grad_norm": 10.953283816672645, "learning_rate": 1.7241379310344828e-07, "logits/chosen": -2.649590015411377, "logits/rejected": -2.6609647274017334, "logps/chosen": -232.6203155517578, "logps/rejected": -211.6860809326172, "loss": 0.6914, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.0004964367835782468, "rewards/margins": 0.003091245424002409, "rewards/rejected": -0.0025948083493858576, "step": 60 }, { "epoch": 0.04, "grad_norm": 11.344557073712732, "learning_rate": 2.0114942528735633e-07, "logits/chosen": -2.6284663677215576, "logits/rejected": -2.6205639839172363, "logps/chosen": -203.4170684814453, "logps/rejected": -206.2279052734375, "loss": 0.6899, "rewards/accuracies": 0.5625, "rewards/chosen": -0.004654805175960064, "rewards/margins": 0.0051066940650343895, "rewards/rejected": -0.009761499240994453, "step": 70 }, { "epoch": 0.05, "grad_norm": 11.169957980773157, "learning_rate": 2.2988505747126435e-07, "logits/chosen": -2.617027997970581, "logits/rejected": -2.653088092803955, "logps/chosen": -176.9120330810547, "logps/rejected": -186.38589477539062, "loss": 0.6887, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.01036100834608078, "rewards/margins": 0.004531105048954487, "rewards/rejected": -0.014892111532390118, "step": 80 }, { "epoch": 0.05, "grad_norm": 11.709716340155365, "learning_rate": 2.586206896551724e-07, "logits/chosen": -2.5782480239868164, "logits/rejected": -2.599475622177124, "logps/chosen": -178.95782470703125, "logps/rejected": -210.3921661376953, "loss": 0.6841, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.019216390326619148, "rewards/margins": 0.0284399576485157, "rewards/rejected": -0.0476563461124897, "step": 90 }, { "epoch": 0.06, "grad_norm": 11.076992600659995, "learning_rate": 2.873563218390804e-07, "logits/chosen": -2.6024298667907715, "logits/rejected": -2.603557825088501, "logps/chosen": -191.04461669921875, "logps/rejected": -196.60302734375, "loss": 0.6809, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.04221532493829727, "rewards/margins": 0.025046557188034058, "rewards/rejected": -0.06726188957691193, "step": 100 }, { "epoch": 0.06, "eval_logits/chosen": -2.5392000675201416, "eval_logits/rejected": -2.5504696369171143, "eval_logps/chosen": -171.71307373046875, "eval_logps/rejected": -181.7760467529297, "eval_loss": 0.6815534234046936, "eval_rewards/accuracies": 0.6090182662010193, "eval_rewards/chosen": -0.0895635262131691, "eval_rewards/margins": 0.024930791929364204, "eval_rewards/rejected": -0.11449432373046875, "eval_runtime": 523.8706, "eval_samples_per_second": 13.362, "eval_steps_per_second": 0.418, "step": 100 }, { "epoch": 0.06, "grad_norm": 11.892784161636136, "learning_rate": 3.160919540229885e-07, "logits/chosen": -2.559643268585205, "logits/rejected": -2.5869317054748535, "logps/chosen": -202.63461303710938, "logps/rejected": -223.0349578857422, "loss": 0.672, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.08688319474458694, "rewards/margins": 0.04590854048728943, "rewards/rejected": -0.13279172778129578, "step": 110 }, { "epoch": 0.07, "grad_norm": 15.318535657417753, "learning_rate": 3.4482758620689656e-07, "logits/chosen": -2.5284199714660645, "logits/rejected": -2.5128540992736816, "logps/chosen": -199.5592803955078, "logps/rejected": -214.75119018554688, "loss": 0.6613, "rewards/accuracies": 0.625, "rewards/chosen": -0.17664876580238342, "rewards/margins": 0.06412236392498016, "rewards/rejected": -0.24077114462852478, "step": 120 }, { "epoch": 0.07, "grad_norm": 22.024086046505637, "learning_rate": 3.735632183908046e-07, "logits/chosen": -2.5801522731781006, "logits/rejected": -2.565929651260376, "logps/chosen": -245.0824432373047, "logps/rejected": -247.3890838623047, "loss": 0.641, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.3728107511997223, "rewards/margins": 0.1388251781463623, "rewards/rejected": -0.5116358995437622, "step": 130 }, { "epoch": 0.08, "grad_norm": 20.201715650528918, "learning_rate": 4.0229885057471266e-07, "logits/chosen": -2.5328726768493652, "logits/rejected": -2.5208544731140137, "logps/chosen": -302.12322998046875, "logps/rejected": -297.0425109863281, "loss": 0.6436, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.7101386785507202, "rewards/margins": 0.11017869412899017, "rewards/rejected": -0.820317268371582, "step": 140 }, { "epoch": 0.09, "grad_norm": 21.265576535090425, "learning_rate": 4.310344827586206e-07, "logits/chosen": -2.440979480743408, "logits/rejected": -2.446094512939453, "logps/chosen": -281.5878601074219, "logps/rejected": -299.9305419921875, "loss": 0.6327, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.7533038258552551, "rewards/margins": 0.1949019879102707, "rewards/rejected": -0.9482057690620422, "step": 150 }, { "epoch": 0.09, "grad_norm": 24.193407542556805, "learning_rate": 4.597701149425287e-07, "logits/chosen": -2.405226707458496, "logits/rejected": -2.385442018508911, "logps/chosen": -282.8765563964844, "logps/rejected": -290.90338134765625, "loss": 0.6035, "rewards/accuracies": 0.75, "rewards/chosen": -0.7150470018386841, "rewards/margins": 0.3021948039531708, "rewards/rejected": -1.0172417163848877, "step": 160 }, { "epoch": 0.1, "grad_norm": 25.66751117876746, "learning_rate": 4.885057471264368e-07, "logits/chosen": -2.428391456604004, "logits/rejected": -2.4205939769744873, "logps/chosen": -295.0913391113281, "logps/rejected": -324.97454833984375, "loss": 0.6138, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.996240496635437, "rewards/margins": 0.40502578020095825, "rewards/rejected": -1.40126633644104, "step": 170 }, { "epoch": 0.1, "grad_norm": 23.464843947505965, "learning_rate": 4.999817969178237e-07, "logits/chosen": -2.4013054370880127, "logits/rejected": -2.398705005645752, "logps/chosen": -315.08050537109375, "logps/rejected": -362.9265441894531, "loss": 0.6065, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.1423838138580322, "rewards/margins": 0.3166060149669647, "rewards/rejected": -1.4589898586273193, "step": 180 }, { "epoch": 0.11, "grad_norm": 25.400467946109586, "learning_rate": 4.998705654596034e-07, "logits/chosen": -2.467696189880371, "logits/rejected": -2.4567761421203613, "logps/chosen": -330.1573181152344, "logps/rejected": -355.02154541015625, "loss": 0.5809, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.1700841188430786, "rewards/margins": 0.3594915568828583, "rewards/rejected": -1.5295757055282593, "step": 190 }, { "epoch": 0.12, "grad_norm": 29.13043617111363, "learning_rate": 4.996582603056428e-07, "logits/chosen": -2.376218557357788, "logits/rejected": -2.3482134342193604, "logps/chosen": -332.60443115234375, "logps/rejected": -390.0224914550781, "loss": 0.6002, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3785903453826904, "rewards/margins": 0.41944313049316406, "rewards/rejected": -1.7980334758758545, "step": 200 }, { "epoch": 0.12, "eval_logits/chosen": -2.3656015396118164, "eval_logits/rejected": -2.356565237045288, "eval_logps/chosen": -309.8548583984375, "eval_logps/rejected": -361.9523010253906, "eval_loss": 0.5905965566635132, "eval_rewards/accuracies": 0.6843607425689697, "eval_rewards/chosen": -1.4709811210632324, "eval_rewards/margins": 0.4452756345272064, "eval_rewards/rejected": -1.9162570238113403, "eval_runtime": 536.6296, "eval_samples_per_second": 13.044, "eval_steps_per_second": 0.408, "step": 200 }, { "epoch": 0.12, "grad_norm": 21.603808432085263, "learning_rate": 4.993449673342705e-07, "logits/chosen": -2.4084885120391846, "logits/rejected": -2.4161148071289062, "logps/chosen": -323.7695007324219, "logps/rejected": -387.0673828125, "loss": 0.594, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.2521207332611084, "rewards/margins": 0.4323544502258301, "rewards/rejected": -1.684475302696228, "step": 210 }, { "epoch": 0.13, "grad_norm": 23.37624428964897, "learning_rate": 4.989308132738126e-07, "logits/chosen": -2.339341402053833, "logits/rejected": -2.3030219078063965, "logps/chosen": -309.7107849121094, "logps/rejected": -352.9278564453125, "loss": 0.5974, "rewards/accuracies": 0.6875, "rewards/chosen": -1.297975778579712, "rewards/margins": 0.35226622223854065, "rewards/rejected": -1.6502418518066406, "step": 220 }, { "epoch": 0.13, "grad_norm": 24.10831947448163, "learning_rate": 4.9841596565133e-07, "logits/chosen": -2.2944416999816895, "logits/rejected": -2.2744333744049072, "logps/chosen": -354.4916076660156, "logps/rejected": -389.98919677734375, "loss": 0.597, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.5111327171325684, "rewards/margins": 0.3378602862358093, "rewards/rejected": -1.848992943763733, "step": 230 }, { "epoch": 0.14, "grad_norm": 23.94673097578735, "learning_rate": 4.978006327248536e-07, "logits/chosen": -2.4152960777282715, "logits/rejected": -2.417513370513916, "logps/chosen": -313.9660949707031, "logps/rejected": -363.4143981933594, "loss": 0.5808, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.1127357482910156, "rewards/margins": 0.48004403710365295, "rewards/rejected": -1.5927797555923462, "step": 240 }, { "epoch": 0.14, "grad_norm": 45.997903240569016, "learning_rate": 4.970850633991431e-07, "logits/chosen": -2.3635926246643066, "logits/rejected": -2.3639185428619385, "logps/chosen": -357.05181884765625, "logps/rejected": -428.13134765625, "loss": 0.5965, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.6936771869659424, "rewards/margins": 0.5474244952201843, "rewards/rejected": -2.2411017417907715, "step": 250 }, { "epoch": 0.15, "grad_norm": 26.05750468880025, "learning_rate": 4.962695471250032e-07, "logits/chosen": -2.3708977699279785, "logits/rejected": -2.3599140644073486, "logps/chosen": -314.1701965332031, "logps/rejected": -378.3408203125, "loss": 0.577, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.363680124282837, "rewards/margins": 0.5289397239685059, "rewards/rejected": -1.8926197290420532, "step": 260 }, { "epoch": 0.16, "grad_norm": 30.134203618956438, "learning_rate": 4.953544137822006e-07, "logits/chosen": -2.272925615310669, "logits/rejected": -2.2591726779937744, "logps/chosen": -352.3068542480469, "logps/rejected": -409.1640625, "loss": 0.5787, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.743584394454956, "rewards/margins": 0.42208537459373474, "rewards/rejected": -2.1656696796417236, "step": 270 }, { "epoch": 0.16, "grad_norm": 23.199137985460396, "learning_rate": 4.94340033546025e-07, "logits/chosen": -2.300412654876709, "logits/rejected": -2.2782740592956543, "logps/chosen": -381.15594482421875, "logps/rejected": -399.106201171875, "loss": 0.5954, "rewards/accuracies": 0.65625, "rewards/chosen": -1.5968300104141235, "rewards/margins": 0.3864768445491791, "rewards/rejected": -1.983306884765625, "step": 280 }, { "epoch": 0.17, "grad_norm": 30.52404960049098, "learning_rate": 4.932268167375531e-07, "logits/chosen": -2.3673739433288574, "logits/rejected": -2.3496601581573486, "logps/chosen": -319.85589599609375, "logps/rejected": -363.55059814453125, "loss": 0.5868, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.2304320335388184, "rewards/margins": 0.38459140062332153, "rewards/rejected": -1.6150233745574951, "step": 290 }, { "epoch": 0.17, "grad_norm": 21.552944683968224, "learning_rate": 4.920152136576705e-07, "logits/chosen": -2.301480770111084, "logits/rejected": -2.286813259124756, "logps/chosen": -361.3895263671875, "logps/rejected": -411.3047790527344, "loss": 0.591, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.5672380924224854, "rewards/margins": 0.5191463232040405, "rewards/rejected": -2.0863845348358154, "step": 300 }, { "epoch": 0.17, "eval_logits/chosen": -2.2067737579345703, "eval_logits/rejected": -2.193309783935547, "eval_logps/chosen": -365.80474853515625, "eval_logps/rejected": -423.1273498535156, "eval_loss": 0.5809333324432373, "eval_rewards/accuracies": 0.6923515796661377, "eval_rewards/chosen": -2.030480146408081, "eval_rewards/margins": 0.49752748012542725, "eval_rewards/rejected": -2.528007984161377, "eval_runtime": 544.0927, "eval_samples_per_second": 12.865, "eval_steps_per_second": 0.403, "step": 300 }, { "epoch": 0.18, "grad_norm": 20.57183591795313, "learning_rate": 4.907057144049243e-07, "logits/chosen": -2.2187986373901367, "logits/rejected": -2.2342276573181152, "logps/chosen": -363.1693420410156, "logps/rejected": -433.428955078125, "loss": 0.5665, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.6798083782196045, "rewards/margins": 0.5087260007858276, "rewards/rejected": -2.1885344982147217, "step": 310 }, { "epoch": 0.18, "grad_norm": 29.514941076169325, "learning_rate": 4.892988486772756e-07, "logits/chosen": -2.145481586456299, "logits/rejected": -2.149977207183838, "logps/chosen": -315.6699523925781, "logps/rejected": -392.2762756347656, "loss": 0.5551, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.504184365272522, "rewards/margins": 0.5998227596282959, "rewards/rejected": -2.1040072441101074, "step": 320 }, { "epoch": 0.19, "grad_norm": 49.892806992923354, "learning_rate": 4.877951855578342e-07, "logits/chosen": -2.0608973503112793, "logits/rejected": -2.0279011726379395, "logps/chosen": -388.0411376953125, "logps/rejected": -433.9009704589844, "loss": 0.5996, "rewards/accuracies": 0.65625, "rewards/chosen": -1.8731982707977295, "rewards/margins": 0.5407770872116089, "rewards/rejected": -2.413975477218628, "step": 330 }, { "epoch": 0.2, "grad_norm": 28.07822983249446, "learning_rate": 4.861953332846629e-07, "logits/chosen": -2.0477962493896484, "logits/rejected": -1.9786545038223267, "logps/chosen": -350.5347900390625, "logps/rejected": -404.81390380859375, "loss": 0.5561, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.5449774265289307, "rewards/margins": 0.5318618416786194, "rewards/rejected": -2.0768394470214844, "step": 340 }, { "epoch": 0.2, "grad_norm": 31.750069839466466, "learning_rate": 4.844999390047419e-07, "logits/chosen": -1.9117634296417236, "logits/rejected": -1.8637244701385498, "logps/chosen": -369.7088928222656, "logps/rejected": -423.8294982910156, "loss": 0.5674, "rewards/accuracies": 0.71875, "rewards/chosen": -1.8487409353256226, "rewards/margins": 0.5574057698249817, "rewards/rejected": -2.406146764755249, "step": 350 }, { "epoch": 0.21, "grad_norm": 40.566376234563315, "learning_rate": 4.827096885121953e-07, "logits/chosen": -1.8720242977142334, "logits/rejected": -1.849880576133728, "logps/chosen": -453.58563232421875, "logps/rejected": -510.3387145996094, "loss": 0.5451, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.2124533653259277, "rewards/margins": 0.7541533708572388, "rewards/rejected": -2.966606616973877, "step": 360 }, { "epoch": 0.21, "grad_norm": 27.693964794914088, "learning_rate": 4.808253059708848e-07, "logits/chosen": -1.9786027669906616, "logits/rejected": -1.957528829574585, "logps/chosen": -384.38519287109375, "logps/rejected": -449.1851501464844, "loss": 0.5708, "rewards/accuracies": 0.71875, "rewards/chosen": -1.7919820547103882, "rewards/margins": 0.6518365144729614, "rewards/rejected": -2.4438185691833496, "step": 370 }, { "epoch": 0.22, "grad_norm": 26.76769623003568, "learning_rate": 4.788475536214821e-07, "logits/chosen": -2.040398120880127, "logits/rejected": -2.0081913471221924, "logps/chosen": -372.25213623046875, "logps/rejected": -443.19451904296875, "loss": 0.5233, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.7179028987884521, "rewards/margins": 0.7337791919708252, "rewards/rejected": -2.4516820907592773, "step": 380 }, { "epoch": 0.22, "grad_norm": 38.23522225315786, "learning_rate": 4.767772314731393e-07, "logits/chosen": -1.9009816646575928, "logits/rejected": -1.9371490478515625, "logps/chosen": -370.54229736328125, "logps/rejected": -435.6880798339844, "loss": 0.5569, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.9820528030395508, "rewards/margins": 0.547071635723114, "rewards/rejected": -2.5291244983673096, "step": 390 }, { "epoch": 0.23, "grad_norm": 32.640987965795105, "learning_rate": 4.746151769798818e-07, "logits/chosen": -1.969786286354065, "logits/rejected": -1.8861439228057861, "logps/chosen": -388.787353515625, "logps/rejected": -426.386962890625, "loss": 0.5437, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.8250181674957275, "rewards/margins": 0.5650046467781067, "rewards/rejected": -2.3900225162506104, "step": 400 }, { "epoch": 0.23, "eval_logits/chosen": -1.9247232675552368, "eval_logits/rejected": -1.8974039554595947, "eval_logps/chosen": -343.13470458984375, "eval_logps/rejected": -406.9888000488281, "eval_loss": 0.5683532953262329, "eval_rewards/accuracies": 0.7031963467597961, "eval_rewards/chosen": -1.80377995967865, "eval_rewards/margins": 0.5628422498703003, "eval_rewards/rejected": -2.366621971130371, "eval_runtime": 547.2464, "eval_samples_per_second": 12.791, "eval_steps_per_second": 0.4, "step": 400 }, { "epoch": 0.24, "grad_norm": 21.532686706791136, "learning_rate": 4.72362264701855e-07, "logits/chosen": -2.114487409591675, "logits/rejected": -2.0793392658233643, "logps/chosen": -370.3285217285156, "logps/rejected": -403.5226135253906, "loss": 0.5759, "rewards/accuracies": 0.65625, "rewards/chosen": -1.2717143297195435, "rewards/margins": 0.5267833471298218, "rewards/rejected": -1.7984975576400757, "step": 410 }, { "epoch": 0.24, "grad_norm": 29.239777552832912, "learning_rate": 4.7001940595156055e-07, "logits/chosen": -2.0379366874694824, "logits/rejected": -1.9628146886825562, "logps/chosen": -385.35113525390625, "logps/rejected": -440.34222412109375, "loss": 0.5678, "rewards/accuracies": 0.65625, "rewards/chosen": -1.6135915517807007, "rewards/margins": 0.6375263333320618, "rewards/rejected": -2.2511179447174072, "step": 420 }, { "epoch": 0.25, "grad_norm": 20.560330978299934, "learning_rate": 4.6758754842522697e-07, "logits/chosen": -2.0536270141601562, "logits/rejected": -1.9932899475097656, "logps/chosen": -365.8475036621094, "logps/rejected": -423.611083984375, "loss": 0.565, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.728179931640625, "rewards/margins": 0.6338831186294556, "rewards/rejected": -2.362062931060791, "step": 430 }, { "epoch": 0.25, "grad_norm": 26.229998665879116, "learning_rate": 4.650676758194623e-07, "logits/chosen": -2.07350492477417, "logits/rejected": -2.022712230682373, "logps/chosen": -401.141357421875, "logps/rejected": -436.979248046875, "loss": 0.5464, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.8332273960113525, "rewards/margins": 0.6851301789283752, "rewards/rejected": -2.518357753753662, "step": 440 }, { "epoch": 0.26, "grad_norm": 46.29630215421365, "learning_rate": 4.6246080743334474e-07, "logits/chosen": -1.8938102722167969, "logits/rejected": -1.8106597661972046, "logps/chosen": -397.90948486328125, "logps/rejected": -467.4127502441406, "loss": 0.5466, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.9406541585922241, "rewards/margins": 0.6843216419219971, "rewards/rejected": -2.6249756813049316, "step": 450 }, { "epoch": 0.26, "grad_norm": 26.630018999750448, "learning_rate": 4.5976799775611215e-07, "logits/chosen": -1.814541220664978, "logits/rejected": -1.7524267435073853, "logps/chosen": -366.3084716796875, "logps/rejected": -446.58026123046875, "loss": 0.5626, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.7752193212509155, "rewards/margins": 0.6405627727508545, "rewards/rejected": -2.4157819747924805, "step": 460 }, { "epoch": 0.27, "grad_norm": 35.44334983652439, "learning_rate": 4.569903360406162e-07, "logits/chosen": -1.9025815725326538, "logits/rejected": -1.8398154973983765, "logps/chosen": -346.3355407714844, "logps/rejected": -398.0967102050781, "loss": 0.5401, "rewards/accuracies": 0.6875, "rewards/chosen": -1.503535509109497, "rewards/margins": 0.6164692640304565, "rewards/rejected": -2.1200051307678223, "step": 470 }, { "epoch": 0.28, "grad_norm": 33.12278527176869, "learning_rate": 4.5412894586271543e-07, "logits/chosen": -1.8207648992538452, "logits/rejected": -1.7967065572738647, "logps/chosen": -392.82696533203125, "logps/rejected": -462.015869140625, "loss": 0.5451, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.8827365636825562, "rewards/margins": 0.6777531504631042, "rewards/rejected": -2.5604898929595947, "step": 480 }, { "epoch": 0.28, "grad_norm": 25.558438319253998, "learning_rate": 4.511849846667839e-07, "logits/chosen": -1.883180022239685, "logits/rejected": -1.8137277364730835, "logps/chosen": -354.247314453125, "logps/rejected": -436.14556884765625, "loss": 0.5408, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.8229477405548096, "rewards/margins": 0.7674862742424011, "rewards/rejected": -2.5904340744018555, "step": 490 }, { "epoch": 0.29, "grad_norm": 28.233129557824064, "learning_rate": 4.481596432975201e-07, "logits/chosen": -1.9428781270980835, "logits/rejected": -1.889491081237793, "logps/chosen": -410.0284729003906, "logps/rejected": -480.2649841308594, "loss": 0.5415, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.8563823699951172, "rewards/margins": 0.8543184995651245, "rewards/rejected": -2.710700750350952, "step": 500 }, { "epoch": 0.29, "eval_logits/chosen": -1.8125942945480347, "eval_logits/rejected": -1.7951966524124146, "eval_logps/chosen": -405.05938720703125, "eval_logps/rejected": -476.8222961425781, "eval_loss": 0.5648065209388733, "eval_rewards/accuracies": 0.706620991230011, "eval_rewards/chosen": -2.4230268001556396, "eval_rewards/margins": 0.6419299840927124, "eval_rewards/rejected": -3.0649566650390625, "eval_runtime": 536.9406, "eval_samples_per_second": 13.037, "eval_steps_per_second": 0.408, "step": 500 }, { "epoch": 0.29, "grad_norm": 30.516998297285266, "learning_rate": 4.450541455182453e-07, "logits/chosen": -1.8995802402496338, "logits/rejected": -1.9007337093353271, "logps/chosen": -408.70635986328125, "logps/rejected": -487.16387939453125, "loss": 0.5238, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.09024977684021, "rewards/margins": 0.7947575449943542, "rewards/rejected": -2.885007381439209, "step": 510 }, { "epoch": 0.3, "grad_norm": 33.722814638920184, "learning_rate": 4.41869747515886e-07, "logits/chosen": -1.95028817653656, "logits/rejected": -1.8546888828277588, "logps/chosen": -388.6572570800781, "logps/rejected": -446.74542236328125, "loss": 0.5667, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.9295694828033447, "rewards/margins": 0.7291784882545471, "rewards/rejected": -2.658748149871826, "step": 520 }, { "epoch": 0.31, "grad_norm": 25.932688832468305, "learning_rate": 4.3860773739284126e-07, "logits/chosen": -1.9748178720474243, "logits/rejected": -1.9027087688446045, "logps/chosen": -368.09832763671875, "logps/rejected": -403.284912109375, "loss": 0.573, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.6870168447494507, "rewards/margins": 0.5430334806442261, "rewards/rejected": -2.2300503253936768, "step": 530 }, { "epoch": 0.31, "grad_norm": 39.79448640097382, "learning_rate": 4.352694346459396e-07, "logits/chosen": -1.9401954412460327, "logits/rejected": -1.905206322669983, "logps/chosen": -386.59918212890625, "logps/rejected": -437.18536376953125, "loss": 0.571, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.91461181640625, "rewards/margins": 0.4893025755882263, "rewards/rejected": -2.403914451599121, "step": 540 }, { "epoch": 0.32, "grad_norm": 28.112999261098803, "learning_rate": 4.318561896326973e-07, "logits/chosen": -1.959571123123169, "logits/rejected": -1.9278638362884521, "logps/chosen": -388.32073974609375, "logps/rejected": -454.91436767578125, "loss": 0.5538, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.955959677696228, "rewards/margins": 0.6656385660171509, "rewards/rejected": -2.621598482131958, "step": 550 }, { "epoch": 0.32, "grad_norm": 26.262637133504416, "learning_rate": 4.2836938302509256e-07, "logits/chosen": -2.0025877952575684, "logits/rejected": -1.9562809467315674, "logps/chosen": -359.0731201171875, "logps/rejected": -429.9349060058594, "loss": 0.5291, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.6528972387313843, "rewards/margins": 0.6726639866828918, "rewards/rejected": -2.325561285018921, "step": 560 }, { "epoch": 0.33, "grad_norm": 36.25641292003506, "learning_rate": 4.248104252510785e-07, "logits/chosen": -2.134064197540283, "logits/rejected": -2.1425302028656006, "logps/chosen": -429.51153564453125, "logps/rejected": -480.48138427734375, "loss": 0.544, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.814552664756775, "rewards/margins": 0.49401578307151794, "rewards/rejected": -2.3085684776306152, "step": 570 }, { "epoch": 0.33, "grad_norm": 21.449511768929142, "learning_rate": 4.2118075592405874e-07, "logits/chosen": -1.988585114479065, "logits/rejected": -2.011026382446289, "logps/chosen": -405.82305908203125, "logps/rejected": -488.56451416015625, "loss": 0.5412, "rewards/accuracies": 0.78125, "rewards/chosen": -2.087791919708252, "rewards/margins": 0.7612438201904297, "rewards/rejected": -2.8490357398986816, "step": 580 }, { "epoch": 0.34, "grad_norm": 30.59358168073691, "learning_rate": 4.174818432605578e-07, "logits/chosen": -2.0260438919067383, "logits/rejected": -2.033987522125244, "logps/chosen": -453.0452575683594, "logps/rejected": -514.720458984375, "loss": 0.5355, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.147684335708618, "rewards/margins": 0.7406858801841736, "rewards/rejected": -2.8883700370788574, "step": 590 }, { "epoch": 0.35, "grad_norm": 28.138749590258723, "learning_rate": 4.137151834863213e-07, "logits/chosen": -1.9616165161132812, "logits/rejected": -1.972180724143982, "logps/chosen": -385.138427734375, "logps/rejected": -473.2599182128906, "loss": 0.564, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.132406234741211, "rewards/margins": 0.6182124018669128, "rewards/rejected": -2.7506186962127686, "step": 600 }, { "epoch": 0.35, "eval_logits/chosen": -1.8847192525863647, "eval_logits/rejected": -1.8836290836334229, "eval_logps/chosen": -397.7480773925781, "eval_logps/rejected": -469.41180419921875, "eval_loss": 0.5578325390815735, "eval_rewards/accuracies": 0.7191780805587769, "eval_rewards/chosen": -2.3499135971069336, "eval_rewards/margins": 0.6409377455711365, "eval_rewards/rejected": -2.990851402282715, "eval_runtime": 544.5307, "eval_samples_per_second": 12.855, "eval_steps_per_second": 0.402, "step": 600 }, { "epoch": 0.35, "grad_norm": 33.10703086608096, "learning_rate": 4.098823002310864e-07, "logits/chosen": -2.044586181640625, "logits/rejected": -1.9869381189346313, "logps/chosen": -415.4453125, "logps/rejected": -474.20526123046875, "loss": 0.5454, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.8535633087158203, "rewards/margins": 0.7316546440124512, "rewards/rejected": -2.5852179527282715, "step": 610 }, { "epoch": 0.36, "grad_norm": 39.38037052781508, "learning_rate": 4.059847439122671e-07, "logits/chosen": -1.9577858448028564, "logits/rejected": -1.904496431350708, "logps/chosen": -393.66796875, "logps/rejected": -449.994140625, "loss": 0.5357, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.8364942073822021, "rewards/margins": 0.6134520769119263, "rewards/rejected": -2.449946165084839, "step": 620 }, { "epoch": 0.36, "grad_norm": 29.15442393094139, "learning_rate": 4.020240911078041e-07, "logits/chosen": -1.8907365798950195, "logits/rejected": -1.8794755935668945, "logps/chosen": -393.5573425292969, "logps/rejected": -469.4529724121094, "loss": 0.5547, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.1041901111602783, "rewards/margins": 0.8193286657333374, "rewards/rejected": -2.923518419265747, "step": 630 }, { "epoch": 0.37, "grad_norm": 24.710710448776272, "learning_rate": 3.98001943918432e-07, "logits/chosen": -1.87062668800354, "logits/rejected": -1.8511345386505127, "logps/chosen": -391.0401306152344, "logps/rejected": -467.5562438964844, "loss": 0.5439, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.9567807912826538, "rewards/margins": 0.8031463623046875, "rewards/rejected": -2.75992751121521, "step": 640 }, { "epoch": 0.37, "grad_norm": 28.042405621162647, "learning_rate": 3.9391992931962304e-07, "logits/chosen": -1.912502646446228, "logits/rejected": -1.8945941925048828, "logps/chosen": -381.6258850097656, "logps/rejected": -439.37921142578125, "loss": 0.5279, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.6793773174285889, "rewards/margins": 0.6930197477340698, "rewards/rejected": -2.3723976612091064, "step": 650 }, { "epoch": 0.38, "grad_norm": 64.63037359225194, "learning_rate": 3.8977969850346866e-07, "logits/chosen": -1.8362230062484741, "logits/rejected": -1.827745795249939, "logps/chosen": -341.99755859375, "logps/rejected": -415.6537170410156, "loss": 0.5512, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.6921066045761108, "rewards/margins": 0.6708263158798218, "rewards/rejected": -2.3629326820373535, "step": 660 }, { "epoch": 0.39, "grad_norm": 68.40563732230615, "learning_rate": 3.8558292621076526e-07, "logits/chosen": -1.873615026473999, "logits/rejected": -1.8472900390625, "logps/chosen": -422.1318359375, "logps/rejected": -461.34619140625, "loss": 0.5427, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.0594050884246826, "rewards/margins": 0.5281103253364563, "rewards/rejected": -2.5875158309936523, "step": 670 }, { "epoch": 0.39, "grad_norm": 22.39050226911276, "learning_rate": 3.8133131005357465e-07, "logits/chosen": -1.8999011516571045, "logits/rejected": -1.836851716041565, "logps/chosen": -397.0812072753906, "logps/rejected": -480.00823974609375, "loss": 0.5167, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.0673575401306152, "rewards/margins": 0.7980934381484985, "rewards/rejected": -2.8654510974884033, "step": 680 }, { "epoch": 0.4, "grad_norm": 38.649992337166125, "learning_rate": 3.7702656982853277e-07, "logits/chosen": -1.810121774673462, "logits/rejected": -1.793265700340271, "logps/chosen": -450.671875, "logps/rejected": -518.1996459960938, "loss": 0.5696, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.448154926300049, "rewards/margins": 0.7367699146270752, "rewards/rejected": -3.184924602508545, "step": 690 }, { "epoch": 0.4, "grad_norm": 34.05006479039719, "learning_rate": 3.7267044682118435e-07, "logits/chosen": -1.860874891281128, "logits/rejected": -1.8456264734268188, "logps/chosen": -409.9309997558594, "logps/rejected": -486.42376708984375, "loss": 0.5769, "rewards/accuracies": 0.6875, "rewards/chosen": -1.9787782430648804, "rewards/margins": 0.6770876049995422, "rewards/rejected": -2.6558656692504883, "step": 700 }, { "epoch": 0.4, "eval_logits/chosen": -1.7718605995178223, "eval_logits/rejected": -1.7858551740646362, "eval_logps/chosen": -383.05316162109375, "eval_logps/rejected": -453.5823059082031, "eval_loss": 0.5597525238990784, "eval_rewards/accuracies": 0.7031963467597961, "eval_rewards/chosen": -2.2029640674591064, "eval_rewards/margins": 0.6295928955078125, "eval_rewards/rejected": -2.832556962966919, "eval_runtime": 535.7382, "eval_samples_per_second": 13.066, "eval_steps_per_second": 0.409, "step": 700 }, { "epoch": 0.41, "grad_norm": 30.516726115650822, "learning_rate": 3.682647031016264e-07, "logits/chosen": -1.9329684972763062, "logits/rejected": -1.940243124961853, "logps/chosen": -388.291259765625, "logps/rejected": -434.0372009277344, "loss": 0.5486, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.6513302326202393, "rewards/margins": 0.6132162809371948, "rewards/rejected": -2.2645463943481445, "step": 710 }, { "epoch": 0.41, "grad_norm": 38.51345602531556, "learning_rate": 3.638111208117425e-07, "logits/chosen": -1.9404680728912354, "logits/rejected": -1.9298954010009766, "logps/chosen": -385.8715515136719, "logps/rejected": -416.53155517578125, "loss": 0.5762, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.8125269412994385, "rewards/margins": 0.4822394847869873, "rewards/rejected": -2.294766426086426, "step": 720 }, { "epoch": 0.42, "grad_norm": 36.417406572486875, "learning_rate": 3.593115014443195e-07, "logits/chosen": -1.9941285848617554, "logits/rejected": -1.9894773960113525, "logps/chosen": -382.0946350097656, "logps/rejected": -437.18841552734375, "loss": 0.5469, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.637915849685669, "rewards/margins": 0.631543755531311, "rewards/rejected": -2.2694597244262695, "step": 730 }, { "epoch": 0.43, "grad_norm": 23.509926948805322, "learning_rate": 3.5476766511433605e-07, "logits/chosen": -1.9100837707519531, "logits/rejected": -1.857428789138794, "logps/chosen": -366.06109619140625, "logps/rejected": -444.9000549316406, "loss": 0.5376, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.6342054605484009, "rewards/margins": 0.7001287341117859, "rewards/rejected": -2.334334373474121, "step": 740 }, { "epoch": 0.43, "grad_norm": 28.491603155440426, "learning_rate": 3.5018144982271806e-07, "logits/chosen": -1.847013235092163, "logits/rejected": -1.844740867614746, "logps/chosen": -387.2216796875, "logps/rejected": -458.35247802734375, "loss": 0.5425, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.9939115047454834, "rewards/margins": 0.5967587232589722, "rewards/rejected": -2.590670585632324, "step": 750 }, { "epoch": 0.44, "grad_norm": 21.711577115215622, "learning_rate": 3.455547107128602e-07, "logits/chosen": -1.7501156330108643, "logits/rejected": -1.7191545963287354, "logps/chosen": -452.614013671875, "logps/rejected": -517.114501953125, "loss": 0.5117, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.3392488956451416, "rewards/margins": 0.8644348978996277, "rewards/rejected": -3.203683853149414, "step": 760 }, { "epoch": 0.44, "grad_norm": 50.17207271612329, "learning_rate": 3.4088931932021185e-07, "logits/chosen": -1.8234459161758423, "logits/rejected": -1.780574083328247, "logps/chosen": -448.5769958496094, "logps/rejected": -518.0377197265625, "loss": 0.5488, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.1782760620117188, "rewards/margins": 0.8133376240730286, "rewards/rejected": -2.9916136264801025, "step": 770 }, { "epoch": 0.45, "grad_norm": 49.301861132325, "learning_rate": 3.361871628152338e-07, "logits/chosen": -1.773737907409668, "logits/rejected": -1.7517740726470947, "logps/chosen": -440.6595153808594, "logps/rejected": -493.2332458496094, "loss": 0.5173, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.263493299484253, "rewards/margins": 0.6659582853317261, "rewards/rejected": -2.9294512271881104, "step": 780 }, { "epoch": 0.45, "grad_norm": 30.255792286324436, "learning_rate": 3.314501432400294e-07, "logits/chosen": -1.7690521478652954, "logits/rejected": -1.7298529148101807, "logps/chosen": -411.845703125, "logps/rejected": -474.04425048828125, "loss": 0.566, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.1643013954162598, "rewards/margins": 0.6198045015335083, "rewards/rejected": -2.7841057777404785, "step": 790 }, { "epoch": 0.46, "grad_norm": 22.17250118566977, "learning_rate": 3.2668017673896077e-07, "logits/chosen": -1.8177188634872437, "logits/rejected": -1.7350183725357056, "logps/chosen": -399.64495849609375, "logps/rejected": -457.10601806640625, "loss": 0.5598, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.0094985961914062, "rewards/margins": 0.7035370469093323, "rewards/rejected": -2.7130355834960938, "step": 800 }, { "epoch": 0.46, "eval_logits/chosen": -1.7061283588409424, "eval_logits/rejected": -1.7086626291275024, "eval_logps/chosen": -387.18157958984375, "eval_logps/rejected": -455.0378723144531, "eval_loss": 0.558580219745636, "eval_rewards/accuracies": 0.7163242101669312, "eval_rewards/chosen": -2.244248390197754, "eval_rewards/margins": 0.6028640270233154, "eval_rewards/rejected": -2.8471124172210693, "eval_runtime": 544.1327, "eval_samples_per_second": 12.865, "eval_steps_per_second": 0.402, "step": 800 }, { "epoch": 0.47, "grad_norm": 29.19961014949389, "learning_rate": 3.218791927835602e-07, "logits/chosen": -1.8107563257217407, "logits/rejected": -1.7641499042510986, "logps/chosen": -369.27203369140625, "logps/rejected": -456.6036682128906, "loss": 0.5304, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.8051646947860718, "rewards/margins": 0.7181805968284607, "rewards/rejected": -2.523345470428467, "step": 810 }, { "epoch": 0.47, "grad_norm": 25.26002008872549, "learning_rate": 3.1704913339205103e-07, "logits/chosen": -1.8677990436553955, "logits/rejected": -1.825749158859253, "logps/chosen": -379.98321533203125, "logps/rejected": -454.1268005371094, "loss": 0.5288, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.6397157907485962, "rewards/margins": 0.8766795992851257, "rewards/rejected": -2.516395092010498, "step": 820 }, { "epoch": 0.48, "grad_norm": 28.6375298855639, "learning_rate": 3.1219195234379265e-07, "logits/chosen": -1.6751445531845093, "logits/rejected": -1.6866257190704346, "logps/chosen": -346.9654846191406, "logps/rejected": -451.60498046875, "loss": 0.5566, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.7691535949707031, "rewards/margins": 0.7554360628128052, "rewards/rejected": -2.5245893001556396, "step": 830 }, { "epoch": 0.48, "grad_norm": 44.73580525279706, "learning_rate": 3.0730961438896885e-07, "logits/chosen": -1.7529224157333374, "logits/rejected": -1.7129818201065063, "logps/chosen": -400.9212951660156, "logps/rejected": -464.69305419921875, "loss": 0.5584, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.0139617919921875, "rewards/margins": 0.5847775936126709, "rewards/rejected": -2.5987396240234375, "step": 840 }, { "epoch": 0.49, "grad_norm": 30.284221885120694, "learning_rate": 3.024040944538383e-07, "logits/chosen": -1.7323232889175415, "logits/rejected": -1.7132787704467773, "logps/chosen": -379.4556579589844, "logps/rejected": -454.51531982421875, "loss": 0.5314, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.9580036401748657, "rewards/margins": 0.7366491556167603, "rewards/rejected": -2.694653034210205, "step": 850 }, { "epoch": 0.5, "grad_norm": 27.718050401992414, "learning_rate": 2.9747737684186795e-07, "logits/chosen": -1.7737243175506592, "logits/rejected": -1.7415263652801514, "logps/chosen": -404.40509033203125, "logps/rejected": -465.7650451660156, "loss": 0.5184, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.0648417472839355, "rewards/margins": 0.7340750694274902, "rewards/rejected": -2.798916816711426, "step": 860 }, { "epoch": 0.5, "grad_norm": 31.011489118398675, "learning_rate": 2.925314544310745e-07, "logits/chosen": -1.745216727256775, "logits/rejected": -1.727979302406311, "logps/chosen": -392.7491149902344, "logps/rejected": -456.2132263183594, "loss": 0.5497, "rewards/accuracies": 0.65625, "rewards/chosen": -2.195338487625122, "rewards/margins": 0.5425236225128174, "rewards/rejected": -2.7378618717193604, "step": 870 }, { "epoch": 0.51, "grad_norm": 35.37211460888614, "learning_rate": 2.8756832786789663e-07, "logits/chosen": -1.8434585332870483, "logits/rejected": -1.8155876398086548, "logps/chosen": -413.1863708496094, "logps/rejected": -489.76220703125, "loss": 0.5608, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -2.009500026702881, "rewards/margins": 0.6946345567703247, "rewards/rejected": -2.704134464263916, "step": 880 }, { "epoch": 0.51, "grad_norm": 33.27106994315821, "learning_rate": 2.8259000475792503e-07, "logits/chosen": -1.876704454421997, "logits/rejected": -1.7968547344207764, "logps/chosen": -395.55706787109375, "logps/rejected": -460.11669921875, "loss": 0.5543, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.7892892360687256, "rewards/margins": 0.7638824582099915, "rewards/rejected": -2.5531716346740723, "step": 890 }, { "epoch": 0.52, "grad_norm": 31.881562451650627, "learning_rate": 2.7759849885381747e-07, "logits/chosen": -1.868417739868164, "logits/rejected": -1.7971748113632202, "logps/chosen": -378.93353271484375, "logps/rejected": -464.1891174316406, "loss": 0.5374, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.808215856552124, "rewards/margins": 0.7741836309432983, "rewards/rejected": -2.582399368286133, "step": 900 }, { "epoch": 0.52, "eval_logits/chosen": -1.6767016649246216, "eval_logits/rejected": -1.6597568988800049, "eval_logps/chosen": -382.5883483886719, "eval_logps/rejected": -453.9528503417969, "eval_loss": 0.5555324554443359, "eval_rewards/accuracies": 0.7151826620101929, "eval_rewards/chosen": -2.198316812515259, "eval_rewards/margins": 0.6379454731941223, "eval_rewards/rejected": -2.8362622261047363, "eval_runtime": 537.245, "eval_samples_per_second": 13.029, "eval_steps_per_second": 0.408, "step": 900 }, { "epoch": 0.52, "grad_norm": 21.961619231813007, "learning_rate": 2.7259582924072756e-07, "logits/chosen": -1.8725192546844482, "logits/rejected": -1.8156566619873047, "logps/chosen": -350.8863220214844, "logps/rejected": -413.61993408203125, "loss": 0.5397, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.7346986532211304, "rewards/margins": 0.5974160432815552, "rewards/rejected": -2.3321146965026855, "step": 910 }, { "epoch": 0.53, "grad_norm": 35.79222168716502, "learning_rate": 2.675840195195762e-07, "logits/chosen": -1.8498157262802124, "logits/rejected": -1.8300836086273193, "logps/chosen": -376.912353515625, "logps/rejected": -438.8692932128906, "loss": 0.5246, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.9630987644195557, "rewards/margins": 0.5869341492652893, "rewards/rejected": -2.5500330924987793, "step": 920 }, { "epoch": 0.54, "grad_norm": 29.90256487232944, "learning_rate": 2.625650969884965e-07, "logits/chosen": -1.7971664667129517, "logits/rejected": -1.7699878215789795, "logps/chosen": -429.76171875, "logps/rejected": -510.20550537109375, "loss": 0.5419, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.1585049629211426, "rewards/margins": 0.7950173616409302, "rewards/rejected": -2.953521966934204, "step": 930 }, { "epoch": 0.54, "grad_norm": 29.9715777964654, "learning_rate": 2.575410918227829e-07, "logits/chosen": -1.8557363748550415, "logits/rejected": -1.7954918146133423, "logps/chosen": -443.01092529296875, "logps/rejected": -511.6332092285156, "loss": 0.5316, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.0825295448303223, "rewards/margins": 0.8091154098510742, "rewards/rejected": -2.8916451930999756, "step": 940 }, { "epoch": 0.55, "grad_norm": 29.036788981905207, "learning_rate": 2.525140362536775e-07, "logits/chosen": -1.7384717464447021, "logits/rejected": -1.6616607904434204, "logps/chosen": -384.7867126464844, "logps/rejected": -460.168212890625, "loss": 0.5632, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.1540913581848145, "rewards/margins": 0.569457471370697, "rewards/rejected": -2.723548650741577, "step": 950 }, { "epoch": 0.55, "grad_norm": 24.40866664439217, "learning_rate": 2.474859637463226e-07, "logits/chosen": -1.7090812921524048, "logits/rejected": -1.6654443740844727, "logps/chosen": -438.59613037109375, "logps/rejected": -484.14093017578125, "loss": 0.5394, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.41066837310791, "rewards/margins": 0.5415581464767456, "rewards/rejected": -2.952226400375366, "step": 960 }, { "epoch": 0.56, "grad_norm": 28.97604867448642, "learning_rate": 2.42458908177217e-07, "logits/chosen": -1.8490597009658813, "logits/rejected": -1.7891228199005127, "logps/chosen": -424.61383056640625, "logps/rejected": -479.585205078125, "loss": 0.5426, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.024867296218872, "rewards/margins": 0.7355901002883911, "rewards/rejected": -2.7604575157165527, "step": 970 }, { "epoch": 0.56, "grad_norm": 35.488277243353735, "learning_rate": 2.3743490301150355e-07, "logits/chosen": -1.8032734394073486, "logits/rejected": -1.794163465499878, "logps/chosen": -417.388671875, "logps/rejected": -491.72021484375, "loss": 0.554, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.1092441082000732, "rewards/margins": 0.6570713520050049, "rewards/rejected": -2.766315460205078, "step": 980 }, { "epoch": 0.57, "grad_norm": 22.23777106600426, "learning_rate": 2.324159804804238e-07, "logits/chosen": -1.8234403133392334, "logits/rejected": -1.786786675453186, "logps/chosen": -404.6798400878906, "logps/rejected": -463.2445373535156, "loss": 0.5494, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.0632481575012207, "rewards/margins": 0.6302553415298462, "rewards/rejected": -2.6935033798217773, "step": 990 }, { "epoch": 0.58, "grad_norm": 31.216532615702715, "learning_rate": 2.274041707592724e-07, "logits/chosen": -1.9149761199951172, "logits/rejected": -1.8780314922332764, "logps/chosen": -430.76617431640625, "logps/rejected": -501.29132080078125, "loss": 0.5036, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.9554197788238525, "rewards/margins": 0.8839017748832703, "rewards/rejected": -2.8393216133117676, "step": 1000 }, { "epoch": 0.58, "eval_logits/chosen": -1.7254499197006226, "eval_logits/rejected": -1.7160460948944092, "eval_logps/chosen": -385.9115295410156, "eval_logps/rejected": -462.5011291503906, "eval_loss": 0.5499266982078552, "eval_rewards/accuracies": 0.7208904027938843, "eval_rewards/chosen": -2.231548309326172, "eval_rewards/margins": 0.6901971697807312, "eval_rewards/rejected": -2.921745777130127, "eval_runtime": 544.8576, "eval_samples_per_second": 12.847, "eval_steps_per_second": 0.402, "step": 1000 }, { "epoch": 0.58, "grad_norm": 30.350568547131573, "learning_rate": 2.2240150114618259e-07, "logits/chosen": -1.8180408477783203, "logits/rejected": -1.7760928869247437, "logps/chosen": -416.88525390625, "logps/rejected": -509.04058837890625, "loss": 0.5276, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.980337381362915, "rewards/margins": 0.8953462839126587, "rewards/rejected": -2.875683546066284, "step": 1010 }, { "epoch": 0.59, "grad_norm": 31.603328627940357, "learning_rate": 2.17409995242075e-07, "logits/chosen": -1.8180592060089111, "logits/rejected": -1.7379405498504639, "logps/chosen": -440.83074951171875, "logps/rejected": -495.69830322265625, "loss": 0.5235, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.1939332485198975, "rewards/margins": 0.9124080538749695, "rewards/rejected": -3.1063413619995117, "step": 1020 }, { "epoch": 0.59, "grad_norm": 31.20623945497072, "learning_rate": 2.1243167213210335e-07, "logits/chosen": -1.8180633783340454, "logits/rejected": -1.7436892986297607, "logps/chosen": -410.88427734375, "logps/rejected": -483.1456604003906, "loss": 0.5401, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.02951717376709, "rewards/margins": 0.87162846326828, "rewards/rejected": -2.9011454582214355, "step": 1030 }, { "epoch": 0.6, "grad_norm": 34.72608405283437, "learning_rate": 2.0746854556892544e-07, "logits/chosen": -1.804686188697815, "logits/rejected": -1.7846415042877197, "logps/chosen": -387.50067138671875, "logps/rejected": -457.11505126953125, "loss": 0.5742, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.964666724205017, "rewards/margins": 0.690390944480896, "rewards/rejected": -2.655057907104492, "step": 1040 }, { "epoch": 0.6, "grad_norm": 26.9759576683522, "learning_rate": 2.025226231581321e-07, "logits/chosen": -1.8315603733062744, "logits/rejected": -1.7954432964324951, "logps/chosen": -408.33740234375, "logps/rejected": -479.91912841796875, "loss": 0.5286, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.079817533493042, "rewards/margins": 0.8754861950874329, "rewards/rejected": -2.95530366897583, "step": 1050 }, { "epoch": 0.61, "grad_norm": 28.45648597029955, "learning_rate": 1.9759590554616173e-07, "logits/chosen": -1.8250961303710938, "logits/rejected": -1.785871148109436, "logps/chosen": -423.91607666015625, "logps/rejected": -492.83563232421875, "loss": 0.5428, "rewards/accuracies": 0.71875, "rewards/chosen": -2.0802817344665527, "rewards/margins": 0.7300957441329956, "rewards/rejected": -2.810377597808838, "step": 1060 }, { "epoch": 0.62, "grad_norm": 36.3095911676204, "learning_rate": 1.926903856110311e-07, "logits/chosen": -1.8510675430297852, "logits/rejected": -1.7864242792129517, "logps/chosen": -412.6505432128906, "logps/rejected": -492.79095458984375, "loss": 0.53, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.102355480194092, "rewards/margins": 0.708962082862854, "rewards/rejected": -2.8113174438476562, "step": 1070 }, { "epoch": 0.62, "grad_norm": 28.29885030565513, "learning_rate": 1.8780804765620746e-07, "logits/chosen": -1.8249950408935547, "logits/rejected": -1.7665761709213257, "logps/chosen": -403.99609375, "logps/rejected": -481.81103515625, "loss": 0.5048, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.150357961654663, "rewards/margins": 0.908871054649353, "rewards/rejected": -3.0592291355133057, "step": 1080 }, { "epoch": 0.63, "grad_norm": 32.08952273669513, "learning_rate": 1.82950866607949e-07, "logits/chosen": -1.87527596950531, "logits/rejected": -1.8300920724868774, "logps/chosen": -415.8727111816406, "logps/rejected": -479.55419921875, "loss": 0.5616, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.0913748741149902, "rewards/margins": 0.8811753988265991, "rewards/rejected": -2.972550630569458, "step": 1090 }, { "epoch": 0.63, "grad_norm": 34.73789118478527, "learning_rate": 1.7812080721643973e-07, "logits/chosen": -1.8299520015716553, "logits/rejected": -1.7463247776031494, "logps/chosen": -407.6546325683594, "logps/rejected": -461.5155334472656, "loss": 0.5281, "rewards/accuracies": 0.75, "rewards/chosen": -1.9930970668792725, "rewards/margins": 0.8382734060287476, "rewards/rejected": -2.8313703536987305, "step": 1100 }, { "epoch": 0.63, "eval_logits/chosen": -1.7563356161117554, "eval_logits/rejected": -1.7503989934921265, "eval_logps/chosen": -391.30999755859375, "eval_logps/rejected": -466.37115478515625, "eval_loss": 0.5488813519477844, "eval_rewards/accuracies": 0.72374427318573, "eval_rewards/chosen": -2.2855324745178223, "eval_rewards/margins": 0.674912691116333, "eval_rewards/rejected": -2.960444927215576, "eval_runtime": 536.1349, "eval_samples_per_second": 13.056, "eval_steps_per_second": 0.408, "step": 1100 }, { "epoch": 0.64, "grad_norm": 22.04228728782992, "learning_rate": 1.7331982326103918e-07, "logits/chosen": -1.9050697088241577, "logits/rejected": -1.8864399194717407, "logps/chosen": -400.99151611328125, "logps/rejected": -456.43377685546875, "loss": 0.5236, "rewards/accuracies": 0.75, "rewards/chosen": -1.9019591808319092, "rewards/margins": 0.7700345516204834, "rewards/rejected": -2.6719937324523926, "step": 1110 }, { "epoch": 0.64, "grad_norm": 32.9478891711517, "learning_rate": 1.6854985675997063e-07, "logits/chosen": -1.857361078262329, "logits/rejected": -1.8371422290802002, "logps/chosen": -456.65582275390625, "logps/rejected": -527.1624755859375, "loss": 0.538, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.3455679416656494, "rewards/margins": 0.7390109300613403, "rewards/rejected": -3.0845787525177, "step": 1120 }, { "epoch": 0.65, "grad_norm": 35.31307694928471, "learning_rate": 1.638128371847662e-07, "logits/chosen": -1.8157202005386353, "logits/rejected": -1.7822942733764648, "logps/chosen": -413.3636779785156, "logps/rejected": -507.03338623046875, "loss": 0.5299, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.1275038719177246, "rewards/margins": 0.8648099899291992, "rewards/rejected": -2.992314338684082, "step": 1130 }, { "epoch": 0.66, "grad_norm": 39.68360857124647, "learning_rate": 1.5911068067978818e-07, "logits/chosen": -1.8612645864486694, "logits/rejected": -1.8140894174575806, "logps/chosen": -447.96697998046875, "logps/rejected": -535.6785888671875, "loss": 0.5089, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.447895050048828, "rewards/margins": 0.799897313117981, "rewards/rejected": -3.2477920055389404, "step": 1140 }, { "epoch": 0.66, "grad_norm": 45.56752540087649, "learning_rate": 1.5444528928713985e-07, "logits/chosen": -1.8386377096176147, "logits/rejected": -1.773667335510254, "logps/chosen": -397.98663330078125, "logps/rejected": -473.56329345703125, "loss": 0.5192, "rewards/accuracies": 0.6875, "rewards/chosen": -2.0095162391662598, "rewards/margins": 0.8521126508712769, "rewards/rejected": -2.861629009246826, "step": 1150 }, { "epoch": 0.67, "grad_norm": 28.03328574000068, "learning_rate": 1.4981855017728197e-07, "logits/chosen": -1.7747135162353516, "logits/rejected": -1.7615177631378174, "logps/chosen": -415.66680908203125, "logps/rejected": -485.73944091796875, "loss": 0.5243, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.2126426696777344, "rewards/margins": 0.8091427087783813, "rewards/rejected": -3.021785259246826, "step": 1160 }, { "epoch": 0.67, "grad_norm": 29.571461514972317, "learning_rate": 1.452323348856639e-07, "logits/chosen": -1.9694970846176147, "logits/rejected": -1.9434292316436768, "logps/chosen": -398.3510437011719, "logps/rejected": -494.0318908691406, "loss": 0.5583, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.8146642446517944, "rewards/margins": 0.8760486841201782, "rewards/rejected": -2.6907129287719727, "step": 1170 }, { "epoch": 0.68, "grad_norm": 24.402984146105567, "learning_rate": 1.406884985556804e-07, "logits/chosen": -1.8805034160614014, "logits/rejected": -1.8436615467071533, "logps/chosen": -395.9003601074219, "logps/rejected": -471.02685546875, "loss": 0.5487, "rewards/accuracies": 0.75, "rewards/chosen": -2.027919292449951, "rewards/margins": 0.7736718058586121, "rewards/rejected": -2.801591396331787, "step": 1180 }, { "epoch": 0.69, "grad_norm": 20.687254330852166, "learning_rate": 1.361888791882575e-07, "logits/chosen": -1.900747299194336, "logits/rejected": -1.8584699630737305, "logps/chosen": -339.09442138671875, "logps/rejected": -388.8125, "loss": 0.5293, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.7120403051376343, "rewards/margins": 0.5825742483139038, "rewards/rejected": -2.294614315032959, "step": 1190 }, { "epoch": 0.69, "grad_norm": 26.083643158531725, "learning_rate": 1.3173529689837354e-07, "logits/chosen": -2.0513994693756104, "logits/rejected": -1.986104965209961, "logps/chosen": -375.33746337890625, "logps/rejected": -463.71051025390625, "loss": 0.5067, "rewards/accuracies": 0.6875, "rewards/chosen": -1.7211412191390991, "rewards/margins": 0.6738361120223999, "rewards/rejected": -2.394977331161499, "step": 1200 }, { "epoch": 0.69, "eval_logits/chosen": -1.8046900033950806, "eval_logits/rejected": -1.7967232465744019, "eval_logps/chosen": -393.0003662109375, "eval_logps/rejected": -471.0760192871094, "eval_loss": 0.5447794198989868, "eval_rewards/accuracies": 0.7243150472640991, "eval_rewards/chosen": -2.302436351776123, "eval_rewards/margins": 0.7050578594207764, "eval_rewards/rejected": -3.0074942111968994, "eval_runtime": 543.7258, "eval_samples_per_second": 12.874, "eval_steps_per_second": 0.403, "step": 1200 }, { "epoch": 0.7, "grad_norm": 27.1913571170997, "learning_rate": 1.273295531788156e-07, "logits/chosen": -1.8818267583847046, "logits/rejected": -1.8337571620941162, "logps/chosen": -382.8564758300781, "logps/rejected": -479.34423828125, "loss": 0.4961, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.9514567852020264, "rewards/margins": 0.9589536786079407, "rewards/rejected": -2.9104104042053223, "step": 1210 }, { "epoch": 0.7, "grad_norm": 30.87994960869174, "learning_rate": 1.2297343017146726e-07, "logits/chosen": -1.9285227060317993, "logits/rejected": -1.879024863243103, "logps/chosen": -413.84722900390625, "logps/rejected": -471.5714416503906, "loss": 0.5172, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.0496084690093994, "rewards/margins": 0.7111380696296692, "rewards/rejected": -2.760746479034424, "step": 1220 }, { "epoch": 0.71, "grad_norm": 30.103114841199876, "learning_rate": 1.1866868994642534e-07, "logits/chosen": -1.910308599472046, "logits/rejected": -1.8798065185546875, "logps/chosen": -428.7994689941406, "logps/rejected": -501.61572265625, "loss": 0.532, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.0664076805114746, "rewards/margins": 0.800611138343811, "rewards/rejected": -2.867018938064575, "step": 1230 }, { "epoch": 0.71, "grad_norm": 33.952742355560765, "learning_rate": 1.1441707378923474e-07, "logits/chosen": -1.954697608947754, "logits/rejected": -1.9360759258270264, "logps/chosen": -358.89459228515625, "logps/rejected": -451.65509033203125, "loss": 0.5161, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.7160298824310303, "rewards/margins": 0.8581940531730652, "rewards/rejected": -2.5742239952087402, "step": 1240 }, { "epoch": 0.72, "grad_norm": 26.669019235150035, "learning_rate": 1.1022030149653133e-07, "logits/chosen": -1.8900222778320312, "logits/rejected": -1.8807737827301025, "logps/chosen": -370.8710021972656, "logps/rejected": -462.078857421875, "loss": 0.5378, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.005030870437622, "rewards/margins": 0.7315531969070435, "rewards/rejected": -2.736584186553955, "step": 1250 }, { "epoch": 0.73, "grad_norm": 29.39370070872787, "learning_rate": 1.06080070680377e-07, "logits/chosen": -1.9039020538330078, "logits/rejected": -1.8967231512069702, "logps/chosen": -407.52886962890625, "logps/rejected": -471.5879821777344, "loss": 0.5022, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.9449284076690674, "rewards/margins": 0.866219699382782, "rewards/rejected": -2.811148166656494, "step": 1260 }, { "epoch": 0.73, "grad_norm": 34.28058476728983, "learning_rate": 1.01998056081568e-07, "logits/chosen": -1.947778344154358, "logits/rejected": -1.933396339416504, "logps/chosen": -409.4700622558594, "logps/rejected": -500.43798828125, "loss": 0.5202, "rewards/accuracies": 0.75, "rewards/chosen": -2.1493587493896484, "rewards/margins": 0.8992208242416382, "rewards/rejected": -3.048579692840576, "step": 1270 }, { "epoch": 0.74, "grad_norm": 38.74422247304706, "learning_rate": 9.797590889219587e-08, "logits/chosen": -1.9459298849105835, "logits/rejected": -1.902991533279419, "logps/chosen": -424.58380126953125, "logps/rejected": -504.6437072753906, "loss": 0.5452, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.1141209602355957, "rewards/margins": 0.8704532384872437, "rewards/rejected": -2.984574317932129, "step": 1280 }, { "epoch": 0.74, "grad_norm": 28.244962433086215, "learning_rate": 9.401525608773292e-08, "logits/chosen": -1.8756380081176758, "logits/rejected": -1.8184016942977905, "logps/chosen": -392.9984130859375, "logps/rejected": -461.8180236816406, "loss": 0.5398, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.9684727191925049, "rewards/margins": 0.7552623748779297, "rewards/rejected": -2.7237350940704346, "step": 1290 }, { "epoch": 0.75, "grad_norm": 34.33816096896355, "learning_rate": 9.011769976891367e-08, "logits/chosen": -1.903464913368225, "logits/rejected": -1.8477399349212646, "logps/chosen": -419.75750732421875, "logps/rejected": -506.533935546875, "loss": 0.5095, "rewards/accuracies": 0.71875, "rewards/chosen": -2.119554042816162, "rewards/margins": 0.8639281392097473, "rewards/rejected": -2.9834823608398438, "step": 1300 }, { "epoch": 0.75, "eval_logits/chosen": -1.824761152267456, "eval_logits/rejected": -1.8238047361373901, "eval_logps/chosen": -383.5680236816406, "eval_logps/rejected": -460.3614196777344, "eval_loss": 0.545096218585968, "eval_rewards/accuracies": 0.7186073064804077, "eval_rewards/chosen": -2.208112955093384, "eval_rewards/margins": 0.6922349333763123, "eval_rewards/rejected": -2.90034818649292, "eval_runtime": 523.3073, "eval_samples_per_second": 13.376, "eval_steps_per_second": 0.418, "step": 1300 }, { "epoch": 0.75, "grad_norm": 31.642563855828758, "learning_rate": 8.628481651367875e-08, "logits/chosen": -1.98309326171875, "logits/rejected": -1.9879448413848877, "logps/chosen": -390.30712890625, "logps/rejected": -448.2904357910156, "loss": 0.5447, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.7398111820220947, "rewards/margins": 0.5182247757911682, "rewards/rejected": -2.258035898208618, "step": 1310 }, { "epoch": 0.76, "grad_norm": 30.145195997712573, "learning_rate": 8.251815673944218e-08, "logits/chosen": -1.9566547870635986, "logits/rejected": -1.9202098846435547, "logps/chosen": -395.9665832519531, "logps/rejected": -496.5779724121094, "loss": 0.5563, "rewards/accuracies": 0.75, "rewards/chosen": -1.9744913578033447, "rewards/margins": 0.9160418510437012, "rewards/rejected": -2.890532970428467, "step": 1320 }, { "epoch": 0.77, "grad_norm": 26.76251092001294, "learning_rate": 7.881924407594129e-08, "logits/chosen": -1.9259990453720093, "logits/rejected": -1.8814588785171509, "logps/chosen": -420.6258850097656, "logps/rejected": -471.1128845214844, "loss": 0.5346, "rewards/accuracies": 0.71875, "rewards/chosen": -2.2005252838134766, "rewards/margins": 0.6154937744140625, "rewards/rejected": -2.816019296646118, "step": 1330 }, { "epoch": 0.77, "grad_norm": 31.270267795635966, "learning_rate": 7.518957474892148e-08, "logits/chosen": -1.873970627784729, "logits/rejected": -1.8780710697174072, "logps/chosen": -387.642333984375, "logps/rejected": -460.996337890625, "loss": 0.561, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -2.0759739875793457, "rewards/margins": 0.5735403895378113, "rewards/rejected": -2.6495144367218018, "step": 1340 }, { "epoch": 0.78, "grad_norm": 25.79672067849548, "learning_rate": 7.16306169749074e-08, "logits/chosen": -1.9269657135009766, "logits/rejected": -1.8575401306152344, "logps/chosen": -386.0765380859375, "logps/rejected": -446.42095947265625, "loss": 0.5122, "rewards/accuracies": 0.8125, "rewards/chosen": -1.8944480419158936, "rewards/margins": 0.7809109687805176, "rewards/rejected": -2.675359010696411, "step": 1350 }, { "epoch": 0.78, "grad_norm": 36.12006272451077, "learning_rate": 6.814381036730274e-08, "logits/chosen": -1.9610059261322021, "logits/rejected": -1.9268226623535156, "logps/chosen": -397.60357666015625, "logps/rejected": -479.10125732421875, "loss": 0.5363, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.9262597560882568, "rewards/margins": 0.6647717952728271, "rewards/rejected": -2.591031551361084, "step": 1360 }, { "epoch": 0.79, "grad_norm": 32.103751962383164, "learning_rate": 6.473056535406035e-08, "logits/chosen": -1.970505714416504, "logits/rejected": -1.9748294353485107, "logps/chosen": -398.49639892578125, "logps/rejected": -483.1766052246094, "loss": 0.5542, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.9573405981063843, "rewards/margins": 0.7238850593566895, "rewards/rejected": -2.681225538253784, "step": 1370 }, { "epoch": 0.79, "grad_norm": 29.27367179768827, "learning_rate": 6.139226260715872e-08, "logits/chosen": -1.9642279148101807, "logits/rejected": -1.9199190139770508, "logps/chosen": -412.734619140625, "logps/rejected": -482.08740234375, "loss": 0.5026, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.0462894439697266, "rewards/margins": 0.792068600654602, "rewards/rejected": -2.838358163833618, "step": 1380 }, { "epoch": 0.8, "grad_norm": 36.00438391939365, "learning_rate": 5.8130252484113964e-08, "logits/chosen": -1.9426565170288086, "logits/rejected": -1.9226014614105225, "logps/chosen": -385.748046875, "logps/rejected": -454.22412109375, "loss": 0.5491, "rewards/accuracies": 0.75, "rewards/chosen": -1.8511062860488892, "rewards/margins": 0.8074220418930054, "rewards/rejected": -2.6585285663604736, "step": 1390 }, { "epoch": 0.81, "grad_norm": 32.96643024329086, "learning_rate": 5.4945854481754734e-08, "logits/chosen": -1.9304873943328857, "logits/rejected": -1.900002121925354, "logps/chosen": -371.5887145996094, "logps/rejected": -445.46221923828125, "loss": 0.5265, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.909517526626587, "rewards/margins": 0.7846697568893433, "rewards/rejected": -2.6941871643066406, "step": 1400 }, { "epoch": 0.81, "eval_logits/chosen": -1.8071422576904297, "eval_logits/rejected": -1.7997641563415527, "eval_logps/chosen": -391.1993408203125, "eval_logps/rejected": -469.79913330078125, "eval_loss": 0.5436315536499023, "eval_rewards/accuracies": 0.7214611768722534, "eval_rewards/chosen": -2.28442645072937, "eval_rewards/margins": 0.7102989554405212, "eval_rewards/rejected": -2.994725227355957, "eval_runtime": 524.298, "eval_samples_per_second": 13.351, "eval_steps_per_second": 0.418, "step": 1400 }, { "epoch": 0.81, "grad_norm": 37.572173988295035, "learning_rate": 5.184035670247988e-08, "logits/chosen": -1.934077262878418, "logits/rejected": -1.920440912246704, "logps/chosen": -378.1131286621094, "logps/rejected": -462.83929443359375, "loss": 0.5312, "rewards/accuracies": 0.75, "rewards/chosen": -1.9387060403823853, "rewards/margins": 0.7842427492141724, "rewards/rejected": -2.7229487895965576, "step": 1410 }, { "epoch": 0.82, "grad_norm": 35.65341932271922, "learning_rate": 4.881501533321605e-08, "logits/chosen": -1.87544846534729, "logits/rejected": -1.8604532480239868, "logps/chosen": -388.5731506347656, "logps/rejected": -475.32330322265625, "loss": 0.519, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.1021876335144043, "rewards/margins": 0.7690648436546326, "rewards/rejected": -2.8712525367736816, "step": 1420 }, { "epoch": 0.82, "grad_norm": 32.19415920453824, "learning_rate": 4.5871054137284564e-08, "logits/chosen": -1.9715772867202759, "logits/rejected": -1.93185555934906, "logps/chosen": -403.1676025390625, "logps/rejected": -498.1748962402344, "loss": 0.5203, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.9203879833221436, "rewards/margins": 0.8546259999275208, "rewards/rejected": -2.7750141620635986, "step": 1430 }, { "epoch": 0.83, "grad_norm": 57.11327150205332, "learning_rate": 4.300966395938377e-08, "logits/chosen": -1.9714921712875366, "logits/rejected": -1.9407069683074951, "logps/chosen": -409.3877258300781, "logps/rejected": -456.50244140625, "loss": 0.5818, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.9431222677230835, "rewards/margins": 0.5195000767707825, "rewards/rejected": -2.4626221656799316, "step": 1440 }, { "epoch": 0.84, "grad_norm": 21.830708345963956, "learning_rate": 4.023200224388787e-08, "logits/chosen": -1.9089914560317993, "logits/rejected": -1.855542778968811, "logps/chosen": -377.63653564453125, "logps/rejected": -457.47052001953125, "loss": 0.5071, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.7652651071548462, "rewards/margins": 0.8306269645690918, "rewards/rejected": -2.5958924293518066, "step": 1450 }, { "epoch": 0.84, "grad_norm": 38.15424519204087, "learning_rate": 3.7539192566655246e-08, "logits/chosen": -1.870527982711792, "logits/rejected": -1.821215271949768, "logps/chosen": -396.0817565917969, "logps/rejected": -436.74102783203125, "loss": 0.5454, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -2.0135273933410645, "rewards/margins": 0.600884735584259, "rewards/rejected": -2.6144118309020996, "step": 1460 }, { "epoch": 0.85, "grad_norm": 31.36535036359726, "learning_rate": 3.4932324180537736e-08, "logits/chosen": -1.926028847694397, "logits/rejected": -1.9150245189666748, "logps/chosen": -379.81573486328125, "logps/rejected": -472.4080505371094, "loss": 0.5271, "rewards/accuracies": 0.71875, "rewards/chosen": -1.8782703876495361, "rewards/margins": 0.8038197755813599, "rewards/rejected": -2.6820900440216064, "step": 1470 }, { "epoch": 0.85, "grad_norm": 29.336291125895066, "learning_rate": 3.24124515747731e-08, "logits/chosen": -1.9342174530029297, "logits/rejected": -1.8941189050674438, "logps/chosen": -409.3456726074219, "logps/rejected": -470.3741149902344, "loss": 0.5135, "rewards/accuracies": 0.75, "rewards/chosen": -1.9259653091430664, "rewards/margins": 0.7116618752479553, "rewards/rejected": -2.637627363204956, "step": 1480 }, { "epoch": 0.86, "grad_norm": 26.986665461110498, "learning_rate": 2.998059404843947e-08, "logits/chosen": -1.8567430973052979, "logits/rejected": -1.8131777048110962, "logps/chosen": -383.24371337890625, "logps/rejected": -440.67315673828125, "loss": 0.5239, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.9675956964492798, "rewards/margins": 0.6357102990150452, "rewards/rejected": -2.6033058166503906, "step": 1490 }, { "epoch": 0.86, "grad_norm": 26.37071629611169, "learning_rate": 2.763773529814506e-08, "logits/chosen": -1.8718721866607666, "logits/rejected": -1.8146419525146484, "logps/chosen": -384.16162109375, "logps/rejected": -487.1327209472656, "loss": 0.4844, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.8588807582855225, "rewards/margins": 0.849262535572052, "rewards/rejected": -2.708143711090088, "step": 1500 }, { "epoch": 0.86, "eval_logits/chosen": -1.7992874383926392, "eval_logits/rejected": -1.7887682914733887, "eval_logps/chosen": -386.97613525390625, "eval_logps/rejected": -465.661376953125, "eval_loss": 0.5432813763618469, "eval_rewards/accuracies": 0.719748854637146, "eval_rewards/chosen": -2.242194175720215, "eval_rewards/margins": 0.7111533284187317, "eval_rewards/rejected": -2.9533474445343018, "eval_runtime": 546.6018, "eval_samples_per_second": 12.806, "eval_steps_per_second": 0.401, "step": 1500 }, { "epoch": 0.87, "grad_norm": 33.8754723176866, "learning_rate": 2.5384823020118212e-08, "logits/chosen": -1.8839702606201172, "logits/rejected": -1.8382689952850342, "logps/chosen": -380.5359191894531, "logps/rejected": -447.4384765625, "loss": 0.5575, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.76059889793396, "rewards/margins": 0.7574474215507507, "rewards/rejected": -2.5180463790893555, "step": 1510 }, { "epoch": 0.88, "grad_norm": 26.785794346840675, "learning_rate": 2.3222768526860698e-08, "logits/chosen": -1.8873153924942017, "logits/rejected": -1.8357185125350952, "logps/chosen": -381.34564208984375, "logps/rejected": -430.72259521484375, "loss": 0.5231, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.8608728647232056, "rewards/margins": 0.7487791180610657, "rewards/rejected": -2.609651803970337, "step": 1520 }, { "epoch": 0.88, "grad_norm": 30.22760318351379, "learning_rate": 2.1152446378517818e-08, "logits/chosen": -1.8901411294937134, "logits/rejected": -1.839329719543457, "logps/chosen": -393.60308837890625, "logps/rejected": -461.057861328125, "loss": 0.5391, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.9723374843597412, "rewards/margins": 0.6977485418319702, "rewards/rejected": -2.670086145401001, "step": 1530 }, { "epoch": 0.89, "grad_norm": 26.484898729776308, "learning_rate": 1.9174694029115146e-08, "logits/chosen": -1.9374538660049438, "logits/rejected": -1.8765513896942139, "logps/chosen": -434.1646423339844, "logps/rejected": -468.3814392089844, "loss": 0.5262, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.045320987701416, "rewards/margins": 0.6686374545097351, "rewards/rejected": -2.713958263397217, "step": 1540 }, { "epoch": 0.89, "grad_norm": 26.251643116785377, "learning_rate": 1.7290311487804687e-08, "logits/chosen": -1.9080512523651123, "logits/rejected": -1.84622323513031, "logps/chosen": -375.2956848144531, "logps/rejected": -463.65765380859375, "loss": 0.5239, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.9260823726654053, "rewards/margins": 0.8633429408073425, "rewards/rejected": -2.7894253730773926, "step": 1550 }, { "epoch": 0.9, "grad_norm": 22.1873285162568, "learning_rate": 1.5500060995258134e-08, "logits/chosen": -1.9254217147827148, "logits/rejected": -1.8602027893066406, "logps/chosen": -406.3652648925781, "logps/rejected": -459.74273681640625, "loss": 0.5039, "rewards/accuracies": 0.75, "rewards/chosen": -1.8533226251602173, "rewards/margins": 0.7270603179931641, "rewards/rejected": -2.580382823944092, "step": 1560 }, { "epoch": 0.9, "grad_norm": 31.87656820271237, "learning_rate": 1.3804666715337116e-08, "logits/chosen": -1.911505103111267, "logits/rejected": -1.8812297582626343, "logps/chosen": -399.38519287109375, "logps/rejected": -496.04168701171875, "loss": 0.54, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.9619176387786865, "rewards/margins": 0.8611427545547485, "rewards/rejected": -2.8230605125427246, "step": 1570 }, { "epoch": 0.91, "grad_norm": 28.90758640199268, "learning_rate": 1.2204814442165812e-08, "logits/chosen": -1.8718591928482056, "logits/rejected": -1.8608993291854858, "logps/chosen": -397.22100830078125, "logps/rejected": -456.198486328125, "loss": 0.5244, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.995205283164978, "rewards/margins": 0.7341451644897461, "rewards/rejected": -2.7293505668640137, "step": 1580 }, { "epoch": 0.92, "grad_norm": 31.58825212692507, "learning_rate": 1.070115132272445e-08, "logits/chosen": -1.8871160745620728, "logits/rejected": -1.827559232711792, "logps/chosen": -390.8136901855469, "logps/rejected": -458.43798828125, "loss": 0.5167, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.8970882892608643, "rewards/margins": 0.9112474322319031, "rewards/rejected": -2.808335781097412, "step": 1590 }, { "epoch": 0.92, "grad_norm": 29.086437438100777, "learning_rate": 9.294285595075669e-09, "logits/chosen": -1.9274282455444336, "logits/rejected": -1.9093879461288452, "logps/chosen": -414.84576416015625, "logps/rejected": -500.18768310546875, "loss": 0.5612, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.002596616744995, "rewards/margins": 0.8606833219528198, "rewards/rejected": -2.8632798194885254, "step": 1600 }, { "epoch": 0.92, "eval_logits/chosen": -1.7875818014144897, "eval_logits/rejected": -1.7765936851501465, "eval_logps/chosen": -389.4626159667969, "eval_logps/rejected": -469.811279296875, "eval_loss": 0.5427327156066895, "eval_rewards/accuracies": 0.7208904027938843, "eval_rewards/chosen": -2.267058849334717, "eval_rewards/margins": 0.727787435054779, "eval_rewards/rejected": -2.9948465824127197, "eval_runtime": 523.5533, "eval_samples_per_second": 13.37, "eval_steps_per_second": 0.418, "step": 1600 }, { "epoch": 0.93, "grad_norm": 27.963113959175715, "learning_rate": 7.984786342329492e-09, "logits/chosen": -1.9024436473846436, "logits/rejected": -1.8931423425674438, "logps/chosen": -392.8687438964844, "logps/rejected": -472.52203369140625, "loss": 0.5214, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.090585470199585, "rewards/margins": 0.7528320550918579, "rewards/rejected": -2.843417167663574, "step": 1610 }, { "epoch": 0.93, "grad_norm": 38.701178208422625, "learning_rate": 6.773183262446914e-09, "logits/chosen": -1.9047428369522095, "logits/rejected": -1.8428666591644287, "logps/chosen": -408.89788818359375, "logps/rejected": -467.36309814453125, "loss": 0.5324, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.9106022119522095, "rewards/margins": 0.8250144720077515, "rewards/rejected": -2.735616683959961, "step": 1620 }, { "epoch": 0.94, "grad_norm": 30.118277072421385, "learning_rate": 5.6599664539749295e-09, "logits/chosen": -1.9470701217651367, "logits/rejected": -1.926031470298767, "logps/chosen": -415.57330322265625, "logps/rejected": -491.3902282714844, "loss": 0.5135, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.970380425453186, "rewards/margins": 0.8686148524284363, "rewards/rejected": -2.8389952182769775, "step": 1630 }, { "epoch": 0.94, "grad_norm": 37.40579887540256, "learning_rate": 4.645586217799452e-09, "logits/chosen": -1.9280283451080322, "logits/rejected": -1.9276561737060547, "logps/chosen": -423.79217529296875, "logps/rejected": -496.7911071777344, "loss": 0.5503, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.9242738485336304, "rewards/margins": 0.7551409602165222, "rewards/rejected": -2.679414749145508, "step": 1640 }, { "epoch": 0.95, "grad_norm": 32.746312132544105, "learning_rate": 3.730452874996737e-09, "logits/chosen": -1.92330801486969, "logits/rejected": -1.8721330165863037, "logps/chosen": -395.4407958984375, "logps/rejected": -454.64178466796875, "loss": 0.5192, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.9143590927124023, "rewards/margins": 0.6810831427574158, "rewards/rejected": -2.595442056655884, "step": 1650 }, { "epoch": 0.96, "grad_norm": 35.835333829114475, "learning_rate": 2.9149366008568987e-09, "logits/chosen": -1.9155769348144531, "logits/rejected": -1.8720881938934326, "logps/chosen": -402.2400207519531, "logps/rejected": -452.55755615234375, "loss": 0.5239, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.848528265953064, "rewards/margins": 0.7459059953689575, "rewards/rejected": -2.5944347381591797, "step": 1660 }, { "epoch": 0.96, "grad_norm": 27.738054909743056, "learning_rate": 2.1993672751463576e-09, "logits/chosen": -1.9466373920440674, "logits/rejected": -1.9023081064224243, "logps/chosen": -407.79254150390625, "logps/rejected": -487.1842346191406, "loss": 0.499, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.0100741386413574, "rewards/margins": 0.8334406614303589, "rewards/rejected": -2.843514919281006, "step": 1670 }, { "epoch": 0.97, "grad_norm": 28.810543533175498, "learning_rate": 1.5840343486700215e-09, "logits/chosen": -1.9565961360931396, "logits/rejected": -1.8820337057113647, "logps/chosen": -377.89697265625, "logps/rejected": -461.2203063964844, "loss": 0.4816, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.7978055477142334, "rewards/margins": 0.8113381266593933, "rewards/rejected": -2.6091437339782715, "step": 1680 }, { "epoch": 0.97, "grad_norm": 26.747095486222516, "learning_rate": 1.0691867261874154e-09, "logits/chosen": -1.9276363849639893, "logits/rejected": -1.8795725107192993, "logps/chosen": -400.2092590332031, "logps/rejected": -457.8243103027344, "loss": 0.5346, "rewards/accuracies": 0.6875, "rewards/chosen": -1.8860292434692383, "rewards/margins": 0.778502881526947, "rewards/rejected": -2.66453218460083, "step": 1690 }, { "epoch": 0.98, "grad_norm": 21.69669900920832, "learning_rate": 6.550326657293881e-10, "logits/chosen": -1.915302038192749, "logits/rejected": -1.897491216659546, "logps/chosen": -411.01904296875, "logps/rejected": -480.05078125, "loss": 0.5017, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.052908420562744, "rewards/margins": 0.8378399610519409, "rewards/rejected": -2.8907482624053955, "step": 1700 }, { "epoch": 0.98, "eval_logits/chosen": -1.7904165983200073, "eval_logits/rejected": -1.7796399593353271, "eval_logps/chosen": -389.3405456542969, "eval_logps/rejected": -469.7990417480469, "eval_loss": 0.542646050453186, "eval_rewards/accuracies": 0.7214611768722534, "eval_rewards/chosen": -2.2658379077911377, "eval_rewards/margins": 0.7288866639137268, "eval_rewards/rejected": -2.9947245121002197, "eval_runtime": 549.1787, "eval_samples_per_second": 12.746, "eval_steps_per_second": 0.399, "step": 1700 }, { "epoch": 0.98, "grad_norm": 27.017405119205325, "learning_rate": 3.4173969435710715e-10, "logits/chosen": -1.907459020614624, "logits/rejected": -1.9028345346450806, "logps/chosen": -404.1794738769531, "logps/rejected": -475.02490234375, "loss": 0.5398, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -2.1053857803344727, "rewards/margins": 0.5925677418708801, "rewards/rejected": -2.697953701019287, "step": 1710 }, { "epoch": 0.99, "grad_norm": 28.429723265538055, "learning_rate": 1.2943454039654467e-10, "logits/chosen": -1.8941481113433838, "logits/rejected": -1.8836424350738525, "logps/chosen": -381.0569152832031, "logps/rejected": -472.2247009277344, "loss": 0.5433, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.9636991024017334, "rewards/margins": 0.7842205762863159, "rewards/rejected": -2.7479193210601807, "step": 1720 }, { "epoch": 1.0, "grad_norm": 23.62637877799544, "learning_rate": 1.8203082176287964e-11, "logits/chosen": -1.8356783390045166, "logits/rejected": -1.797844648361206, "logps/chosen": -407.73516845703125, "logps/rejected": -473.55120849609375, "loss": 0.5069, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.220022201538086, "rewards/margins": 0.7199020385742188, "rewards/rejected": -2.9399242401123047, "step": 1730 }, { "epoch": 1.0, "step": 1736, "total_flos": 0.0, "train_loss": 0.55459001399405, "train_runtime": 38266.551, "train_samples_per_second": 2.904, "train_steps_per_second": 0.045 } ], "logging_steps": 10, "max_steps": 1736, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }