zephyr-7b-sft-safeDPO / trainer_state.json
AmberYifan's picture
Model save
3b29619 verified
raw
history blame
101 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9997120644975526,
"eval_steps": 100,
"global_step": 1736,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": 10.610388839867777,
"learning_rate": 2.8735632183908045e-09,
"logits/chosen": -2.688382625579834,
"logits/rejected": -2.687504768371582,
"logps/chosen": -154.15142822265625,
"logps/rejected": -119.21998596191406,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.01,
"grad_norm": 11.287668992561438,
"learning_rate": 2.8735632183908043e-08,
"logits/chosen": -2.693573236465454,
"logits/rejected": -2.7061853408813477,
"logps/chosen": -203.12576293945312,
"logps/rejected": -203.58848571777344,
"loss": 0.6933,
"rewards/accuracies": 0.4305555522441864,
"rewards/chosen": -0.0002493205538485199,
"rewards/margins": -0.00013067919644527137,
"rewards/rejected": -0.0001186413355753757,
"step": 10
},
{
"epoch": 0.01,
"grad_norm": 10.975446002121831,
"learning_rate": 5.747126436781609e-08,
"logits/chosen": -2.6681714057922363,
"logits/rejected": -2.6636619567871094,
"logps/chosen": -208.20529174804688,
"logps/rejected": -195.71517944335938,
"loss": 0.6931,
"rewards/accuracies": 0.53125,
"rewards/chosen": 2.0605861209332943e-05,
"rewards/margins": 0.0007079349015839398,
"rewards/rejected": -0.0006873290403746068,
"step": 20
},
{
"epoch": 0.02,
"grad_norm": 10.987240036415274,
"learning_rate": 8.620689655172414e-08,
"logits/chosen": -2.6226565837860107,
"logits/rejected": -2.627593755722046,
"logps/chosen": -179.27633666992188,
"logps/rejected": -194.77871704101562,
"loss": 0.693,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": 0.0009387334575876594,
"rewards/margins": 0.0007720856228843331,
"rewards/rejected": 0.00016664779104758054,
"step": 30
},
{
"epoch": 0.02,
"grad_norm": 11.874024139589977,
"learning_rate": 1.1494252873563217e-07,
"logits/chosen": -2.610243320465088,
"logits/rejected": -2.571385145187378,
"logps/chosen": -208.62820434570312,
"logps/rejected": -187.62649536132812,
"loss": 0.6927,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": 0.001225657993927598,
"rewards/margins": 0.0014799232594668865,
"rewards/rejected": -0.00025426512002013624,
"step": 40
},
{
"epoch": 0.03,
"grad_norm": 10.535632759826791,
"learning_rate": 1.436781609195402e-07,
"logits/chosen": -2.6413865089416504,
"logits/rejected": -2.665769100189209,
"logps/chosen": -236.5024871826172,
"logps/rejected": -203.89524841308594,
"loss": 0.6923,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": 0.0021102039609104395,
"rewards/margins": 0.0023120432160794735,
"rewards/rejected": -0.00020183932792861015,
"step": 50
},
{
"epoch": 0.03,
"grad_norm": 10.953283816672645,
"learning_rate": 1.7241379310344828e-07,
"logits/chosen": -2.649590015411377,
"logits/rejected": -2.6609647274017334,
"logps/chosen": -232.6203155517578,
"logps/rejected": -211.6860809326172,
"loss": 0.6914,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": 0.0004964367835782468,
"rewards/margins": 0.003091245424002409,
"rewards/rejected": -0.0025948083493858576,
"step": 60
},
{
"epoch": 0.04,
"grad_norm": 11.344557073712732,
"learning_rate": 2.0114942528735633e-07,
"logits/chosen": -2.6284663677215576,
"logits/rejected": -2.6205639839172363,
"logps/chosen": -203.4170684814453,
"logps/rejected": -206.2279052734375,
"loss": 0.6899,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.004654805175960064,
"rewards/margins": 0.0051066940650343895,
"rewards/rejected": -0.009761499240994453,
"step": 70
},
{
"epoch": 0.05,
"grad_norm": 11.169957980773157,
"learning_rate": 2.2988505747126435e-07,
"logits/chosen": -2.617027997970581,
"logits/rejected": -2.653088092803955,
"logps/chosen": -176.9120330810547,
"logps/rejected": -186.38589477539062,
"loss": 0.6887,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -0.01036100834608078,
"rewards/margins": 0.004531105048954487,
"rewards/rejected": -0.014892111532390118,
"step": 80
},
{
"epoch": 0.05,
"grad_norm": 11.709716340155365,
"learning_rate": 2.586206896551724e-07,
"logits/chosen": -2.5782480239868164,
"logits/rejected": -2.599475622177124,
"logps/chosen": -178.95782470703125,
"logps/rejected": -210.3921661376953,
"loss": 0.6841,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.019216390326619148,
"rewards/margins": 0.0284399576485157,
"rewards/rejected": -0.0476563461124897,
"step": 90
},
{
"epoch": 0.06,
"grad_norm": 11.076992600659995,
"learning_rate": 2.873563218390804e-07,
"logits/chosen": -2.6024298667907715,
"logits/rejected": -2.603557825088501,
"logps/chosen": -191.04461669921875,
"logps/rejected": -196.60302734375,
"loss": 0.6809,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -0.04221532493829727,
"rewards/margins": 0.025046557188034058,
"rewards/rejected": -0.06726188957691193,
"step": 100
},
{
"epoch": 0.06,
"eval_logits/chosen": -2.5392000675201416,
"eval_logits/rejected": -2.5504696369171143,
"eval_logps/chosen": -171.71307373046875,
"eval_logps/rejected": -181.7760467529297,
"eval_loss": 0.6815534234046936,
"eval_rewards/accuracies": 0.6090182662010193,
"eval_rewards/chosen": -0.0895635262131691,
"eval_rewards/margins": 0.024930791929364204,
"eval_rewards/rejected": -0.11449432373046875,
"eval_runtime": 523.8706,
"eval_samples_per_second": 13.362,
"eval_steps_per_second": 0.418,
"step": 100
},
{
"epoch": 0.06,
"grad_norm": 11.892784161636136,
"learning_rate": 3.160919540229885e-07,
"logits/chosen": -2.559643268585205,
"logits/rejected": -2.5869317054748535,
"logps/chosen": -202.63461303710938,
"logps/rejected": -223.0349578857422,
"loss": 0.672,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -0.08688319474458694,
"rewards/margins": 0.04590854048728943,
"rewards/rejected": -0.13279172778129578,
"step": 110
},
{
"epoch": 0.07,
"grad_norm": 15.318535657417753,
"learning_rate": 3.4482758620689656e-07,
"logits/chosen": -2.5284199714660645,
"logits/rejected": -2.5128540992736816,
"logps/chosen": -199.5592803955078,
"logps/rejected": -214.75119018554688,
"loss": 0.6613,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.17664876580238342,
"rewards/margins": 0.06412236392498016,
"rewards/rejected": -0.24077114462852478,
"step": 120
},
{
"epoch": 0.07,
"grad_norm": 22.024086046505637,
"learning_rate": 3.735632183908046e-07,
"logits/chosen": -2.5801522731781006,
"logits/rejected": -2.565929651260376,
"logps/chosen": -245.0824432373047,
"logps/rejected": -247.3890838623047,
"loss": 0.641,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.3728107511997223,
"rewards/margins": 0.1388251781463623,
"rewards/rejected": -0.5116358995437622,
"step": 130
},
{
"epoch": 0.08,
"grad_norm": 20.201715650528918,
"learning_rate": 4.0229885057471266e-07,
"logits/chosen": -2.5328726768493652,
"logits/rejected": -2.5208544731140137,
"logps/chosen": -302.12322998046875,
"logps/rejected": -297.0425109863281,
"loss": 0.6436,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.7101386785507202,
"rewards/margins": 0.11017869412899017,
"rewards/rejected": -0.820317268371582,
"step": 140
},
{
"epoch": 0.09,
"grad_norm": 21.265576535090425,
"learning_rate": 4.310344827586206e-07,
"logits/chosen": -2.440979480743408,
"logits/rejected": -2.446094512939453,
"logps/chosen": -281.5878601074219,
"logps/rejected": -299.9305419921875,
"loss": 0.6327,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.7533038258552551,
"rewards/margins": 0.1949019879102707,
"rewards/rejected": -0.9482057690620422,
"step": 150
},
{
"epoch": 0.09,
"grad_norm": 24.193407542556805,
"learning_rate": 4.597701149425287e-07,
"logits/chosen": -2.405226707458496,
"logits/rejected": -2.385442018508911,
"logps/chosen": -282.8765563964844,
"logps/rejected": -290.90338134765625,
"loss": 0.6035,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.7150470018386841,
"rewards/margins": 0.3021948039531708,
"rewards/rejected": -1.0172417163848877,
"step": 160
},
{
"epoch": 0.1,
"grad_norm": 25.66751117876746,
"learning_rate": 4.885057471264368e-07,
"logits/chosen": -2.428391456604004,
"logits/rejected": -2.4205939769744873,
"logps/chosen": -295.0913391113281,
"logps/rejected": -324.97454833984375,
"loss": 0.6138,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.996240496635437,
"rewards/margins": 0.40502578020095825,
"rewards/rejected": -1.40126633644104,
"step": 170
},
{
"epoch": 0.1,
"grad_norm": 23.464843947505965,
"learning_rate": 4.999817969178237e-07,
"logits/chosen": -2.4013054370880127,
"logits/rejected": -2.398705005645752,
"logps/chosen": -315.08050537109375,
"logps/rejected": -362.9265441894531,
"loss": 0.6065,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -1.1423838138580322,
"rewards/margins": 0.3166060149669647,
"rewards/rejected": -1.4589898586273193,
"step": 180
},
{
"epoch": 0.11,
"grad_norm": 25.400467946109586,
"learning_rate": 4.998705654596034e-07,
"logits/chosen": -2.467696189880371,
"logits/rejected": -2.4567761421203613,
"logps/chosen": -330.1573181152344,
"logps/rejected": -355.02154541015625,
"loss": 0.5809,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.1700841188430786,
"rewards/margins": 0.3594915568828583,
"rewards/rejected": -1.5295757055282593,
"step": 190
},
{
"epoch": 0.12,
"grad_norm": 29.13043617111363,
"learning_rate": 4.996582603056428e-07,
"logits/chosen": -2.376218557357788,
"logits/rejected": -2.3482134342193604,
"logps/chosen": -332.60443115234375,
"logps/rejected": -390.0224914550781,
"loss": 0.6002,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.3785903453826904,
"rewards/margins": 0.41944313049316406,
"rewards/rejected": -1.7980334758758545,
"step": 200
},
{
"epoch": 0.12,
"eval_logits/chosen": -2.3656015396118164,
"eval_logits/rejected": -2.356565237045288,
"eval_logps/chosen": -309.8548583984375,
"eval_logps/rejected": -361.9523010253906,
"eval_loss": 0.5905965566635132,
"eval_rewards/accuracies": 0.6843607425689697,
"eval_rewards/chosen": -1.4709811210632324,
"eval_rewards/margins": 0.4452756345272064,
"eval_rewards/rejected": -1.9162570238113403,
"eval_runtime": 536.6296,
"eval_samples_per_second": 13.044,
"eval_steps_per_second": 0.408,
"step": 200
},
{
"epoch": 0.12,
"grad_norm": 21.603808432085263,
"learning_rate": 4.993449673342705e-07,
"logits/chosen": -2.4084885120391846,
"logits/rejected": -2.4161148071289062,
"logps/chosen": -323.7695007324219,
"logps/rejected": -387.0673828125,
"loss": 0.594,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.2521207332611084,
"rewards/margins": 0.4323544502258301,
"rewards/rejected": -1.684475302696228,
"step": 210
},
{
"epoch": 0.13,
"grad_norm": 23.37624428964897,
"learning_rate": 4.989308132738126e-07,
"logits/chosen": -2.339341402053833,
"logits/rejected": -2.3030219078063965,
"logps/chosen": -309.7107849121094,
"logps/rejected": -352.9278564453125,
"loss": 0.5974,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.297975778579712,
"rewards/margins": 0.35226622223854065,
"rewards/rejected": -1.6502418518066406,
"step": 220
},
{
"epoch": 0.13,
"grad_norm": 24.10831947448163,
"learning_rate": 4.9841596565133e-07,
"logits/chosen": -2.2944416999816895,
"logits/rejected": -2.2744333744049072,
"logps/chosen": -354.4916076660156,
"logps/rejected": -389.98919677734375,
"loss": 0.597,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -1.5111327171325684,
"rewards/margins": 0.3378602862358093,
"rewards/rejected": -1.848992943763733,
"step": 230
},
{
"epoch": 0.14,
"grad_norm": 23.94673097578735,
"learning_rate": 4.978006327248536e-07,
"logits/chosen": -2.4152960777282715,
"logits/rejected": -2.417513370513916,
"logps/chosen": -313.9660949707031,
"logps/rejected": -363.4143981933594,
"loss": 0.5808,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -1.1127357482910156,
"rewards/margins": 0.48004403710365295,
"rewards/rejected": -1.5927797555923462,
"step": 240
},
{
"epoch": 0.14,
"grad_norm": 45.997903240569016,
"learning_rate": 4.970850633991431e-07,
"logits/chosen": -2.3635926246643066,
"logits/rejected": -2.3639185428619385,
"logps/chosen": -357.05181884765625,
"logps/rejected": -428.13134765625,
"loss": 0.5965,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -1.6936771869659424,
"rewards/margins": 0.5474244952201843,
"rewards/rejected": -2.2411017417907715,
"step": 250
},
{
"epoch": 0.15,
"grad_norm": 26.05750468880025,
"learning_rate": 4.962695471250032e-07,
"logits/chosen": -2.3708977699279785,
"logits/rejected": -2.3599140644073486,
"logps/chosen": -314.1701965332031,
"logps/rejected": -378.3408203125,
"loss": 0.577,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -1.363680124282837,
"rewards/margins": 0.5289397239685059,
"rewards/rejected": -1.8926197290420532,
"step": 260
},
{
"epoch": 0.16,
"grad_norm": 30.134203618956438,
"learning_rate": 4.953544137822006e-07,
"logits/chosen": -2.272925615310669,
"logits/rejected": -2.2591726779937744,
"logps/chosen": -352.3068542480469,
"logps/rejected": -409.1640625,
"loss": 0.5787,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -1.743584394454956,
"rewards/margins": 0.42208537459373474,
"rewards/rejected": -2.1656696796417236,
"step": 270
},
{
"epoch": 0.16,
"grad_norm": 23.199137985460396,
"learning_rate": 4.94340033546025e-07,
"logits/chosen": -2.300412654876709,
"logits/rejected": -2.2782740592956543,
"logps/chosen": -381.15594482421875,
"logps/rejected": -399.106201171875,
"loss": 0.5954,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.5968300104141235,
"rewards/margins": 0.3864768445491791,
"rewards/rejected": -1.983306884765625,
"step": 280
},
{
"epoch": 0.17,
"grad_norm": 30.52404960049098,
"learning_rate": 4.932268167375531e-07,
"logits/chosen": -2.3673739433288574,
"logits/rejected": -2.3496601581573486,
"logps/chosen": -319.85589599609375,
"logps/rejected": -363.55059814453125,
"loss": 0.5868,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -1.2304320335388184,
"rewards/margins": 0.38459140062332153,
"rewards/rejected": -1.6150233745574951,
"step": 290
},
{
"epoch": 0.17,
"grad_norm": 21.552944683968224,
"learning_rate": 4.920152136576705e-07,
"logits/chosen": -2.301480770111084,
"logits/rejected": -2.286813259124756,
"logps/chosen": -361.3895263671875,
"logps/rejected": -411.3047790527344,
"loss": 0.591,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.5672380924224854,
"rewards/margins": 0.5191463232040405,
"rewards/rejected": -2.0863845348358154,
"step": 300
},
{
"epoch": 0.17,
"eval_logits/chosen": -2.2067737579345703,
"eval_logits/rejected": -2.193309783935547,
"eval_logps/chosen": -365.80474853515625,
"eval_logps/rejected": -423.1273498535156,
"eval_loss": 0.5809333324432373,
"eval_rewards/accuracies": 0.6923515796661377,
"eval_rewards/chosen": -2.030480146408081,
"eval_rewards/margins": 0.49752748012542725,
"eval_rewards/rejected": -2.528007984161377,
"eval_runtime": 544.0927,
"eval_samples_per_second": 12.865,
"eval_steps_per_second": 0.403,
"step": 300
},
{
"epoch": 0.18,
"grad_norm": 20.57183591795313,
"learning_rate": 4.907057144049243e-07,
"logits/chosen": -2.2187986373901367,
"logits/rejected": -2.2342276573181152,
"logps/chosen": -363.1693420410156,
"logps/rejected": -433.428955078125,
"loss": 0.5665,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.6798083782196045,
"rewards/margins": 0.5087260007858276,
"rewards/rejected": -2.1885344982147217,
"step": 310
},
{
"epoch": 0.18,
"grad_norm": 29.514941076169325,
"learning_rate": 4.892988486772756e-07,
"logits/chosen": -2.145481586456299,
"logits/rejected": -2.149977207183838,
"logps/chosen": -315.6699523925781,
"logps/rejected": -392.2762756347656,
"loss": 0.5551,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.504184365272522,
"rewards/margins": 0.5998227596282959,
"rewards/rejected": -2.1040072441101074,
"step": 320
},
{
"epoch": 0.19,
"grad_norm": 49.892806992923354,
"learning_rate": 4.877951855578342e-07,
"logits/chosen": -2.0608973503112793,
"logits/rejected": -2.0279011726379395,
"logps/chosen": -388.0411376953125,
"logps/rejected": -433.9009704589844,
"loss": 0.5996,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.8731982707977295,
"rewards/margins": 0.5407770872116089,
"rewards/rejected": -2.413975477218628,
"step": 330
},
{
"epoch": 0.2,
"grad_norm": 28.07822983249446,
"learning_rate": 4.861953332846629e-07,
"logits/chosen": -2.0477962493896484,
"logits/rejected": -1.9786545038223267,
"logps/chosen": -350.5347900390625,
"logps/rejected": -404.81390380859375,
"loss": 0.5561,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -1.5449774265289307,
"rewards/margins": 0.5318618416786194,
"rewards/rejected": -2.0768394470214844,
"step": 340
},
{
"epoch": 0.2,
"grad_norm": 31.750069839466466,
"learning_rate": 4.844999390047419e-07,
"logits/chosen": -1.9117634296417236,
"logits/rejected": -1.8637244701385498,
"logps/chosen": -369.7088928222656,
"logps/rejected": -423.8294982910156,
"loss": 0.5674,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.8487409353256226,
"rewards/margins": 0.5574057698249817,
"rewards/rejected": -2.406146764755249,
"step": 350
},
{
"epoch": 0.21,
"grad_norm": 40.566376234563315,
"learning_rate": 4.827096885121953e-07,
"logits/chosen": -1.8720242977142334,
"logits/rejected": -1.849880576133728,
"logps/chosen": -453.58563232421875,
"logps/rejected": -510.3387145996094,
"loss": 0.5451,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -2.2124533653259277,
"rewards/margins": 0.7541533708572388,
"rewards/rejected": -2.966606616973877,
"step": 360
},
{
"epoch": 0.21,
"grad_norm": 27.693964794914088,
"learning_rate": 4.808253059708848e-07,
"logits/chosen": -1.9786027669906616,
"logits/rejected": -1.957528829574585,
"logps/chosen": -384.38519287109375,
"logps/rejected": -449.1851501464844,
"loss": 0.5708,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.7919820547103882,
"rewards/margins": 0.6518365144729614,
"rewards/rejected": -2.4438185691833496,
"step": 370
},
{
"epoch": 0.22,
"grad_norm": 26.76769623003568,
"learning_rate": 4.788475536214821e-07,
"logits/chosen": -2.040398120880127,
"logits/rejected": -2.0081913471221924,
"logps/chosen": -372.25213623046875,
"logps/rejected": -443.19451904296875,
"loss": 0.5233,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.7179028987884521,
"rewards/margins": 0.7337791919708252,
"rewards/rejected": -2.4516820907592773,
"step": 380
},
{
"epoch": 0.22,
"grad_norm": 38.23522225315786,
"learning_rate": 4.767772314731393e-07,
"logits/chosen": -1.9009816646575928,
"logits/rejected": -1.9371490478515625,
"logps/chosen": -370.54229736328125,
"logps/rejected": -435.6880798339844,
"loss": 0.5569,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.9820528030395508,
"rewards/margins": 0.547071635723114,
"rewards/rejected": -2.5291244983673096,
"step": 390
},
{
"epoch": 0.23,
"grad_norm": 32.640987965795105,
"learning_rate": 4.746151769798818e-07,
"logits/chosen": -1.969786286354065,
"logits/rejected": -1.8861439228057861,
"logps/chosen": -388.787353515625,
"logps/rejected": -426.386962890625,
"loss": 0.5437,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -1.8250181674957275,
"rewards/margins": 0.5650046467781067,
"rewards/rejected": -2.3900225162506104,
"step": 400
},
{
"epoch": 0.23,
"eval_logits/chosen": -1.9247232675552368,
"eval_logits/rejected": -1.8974039554595947,
"eval_logps/chosen": -343.13470458984375,
"eval_logps/rejected": -406.9888000488281,
"eval_loss": 0.5683532953262329,
"eval_rewards/accuracies": 0.7031963467597961,
"eval_rewards/chosen": -1.80377995967865,
"eval_rewards/margins": 0.5628422498703003,
"eval_rewards/rejected": -2.366621971130371,
"eval_runtime": 547.2464,
"eval_samples_per_second": 12.791,
"eval_steps_per_second": 0.4,
"step": 400
},
{
"epoch": 0.24,
"grad_norm": 21.532686706791136,
"learning_rate": 4.72362264701855e-07,
"logits/chosen": -2.114487409591675,
"logits/rejected": -2.0793392658233643,
"logps/chosen": -370.3285217285156,
"logps/rejected": -403.5226135253906,
"loss": 0.5759,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.2717143297195435,
"rewards/margins": 0.5267833471298218,
"rewards/rejected": -1.7984975576400757,
"step": 410
},
{
"epoch": 0.24,
"grad_norm": 29.239777552832912,
"learning_rate": 4.7001940595156055e-07,
"logits/chosen": -2.0379366874694824,
"logits/rejected": -1.9628146886825562,
"logps/chosen": -385.35113525390625,
"logps/rejected": -440.34222412109375,
"loss": 0.5678,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.6135915517807007,
"rewards/margins": 0.6375263333320618,
"rewards/rejected": -2.2511179447174072,
"step": 420
},
{
"epoch": 0.25,
"grad_norm": 20.560330978299934,
"learning_rate": 4.6758754842522697e-07,
"logits/chosen": -2.0536270141601562,
"logits/rejected": -1.9932899475097656,
"logps/chosen": -365.8475036621094,
"logps/rejected": -423.611083984375,
"loss": 0.565,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -1.728179931640625,
"rewards/margins": 0.6338831186294556,
"rewards/rejected": -2.362062931060791,
"step": 430
},
{
"epoch": 0.25,
"grad_norm": 26.229998665879116,
"learning_rate": 4.650676758194623e-07,
"logits/chosen": -2.07350492477417,
"logits/rejected": -2.022712230682373,
"logps/chosen": -401.141357421875,
"logps/rejected": -436.979248046875,
"loss": 0.5464,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -1.8332273960113525,
"rewards/margins": 0.6851301789283752,
"rewards/rejected": -2.518357753753662,
"step": 440
},
{
"epoch": 0.26,
"grad_norm": 46.29630215421365,
"learning_rate": 4.6246080743334474e-07,
"logits/chosen": -1.8938102722167969,
"logits/rejected": -1.8106597661972046,
"logps/chosen": -397.90948486328125,
"logps/rejected": -467.4127502441406,
"loss": 0.5466,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -1.9406541585922241,
"rewards/margins": 0.6843216419219971,
"rewards/rejected": -2.6249756813049316,
"step": 450
},
{
"epoch": 0.26,
"grad_norm": 26.630018999750448,
"learning_rate": 4.5976799775611215e-07,
"logits/chosen": -1.814541220664978,
"logits/rejected": -1.7524267435073853,
"logps/chosen": -366.3084716796875,
"logps/rejected": -446.58026123046875,
"loss": 0.5626,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.7752193212509155,
"rewards/margins": 0.6405627727508545,
"rewards/rejected": -2.4157819747924805,
"step": 460
},
{
"epoch": 0.27,
"grad_norm": 35.44334983652439,
"learning_rate": 4.569903360406162e-07,
"logits/chosen": -1.9025815725326538,
"logits/rejected": -1.8398154973983765,
"logps/chosen": -346.3355407714844,
"logps/rejected": -398.0967102050781,
"loss": 0.5401,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.503535509109497,
"rewards/margins": 0.6164692640304565,
"rewards/rejected": -2.1200051307678223,
"step": 470
},
{
"epoch": 0.28,
"grad_norm": 33.12278527176869,
"learning_rate": 4.5412894586271543e-07,
"logits/chosen": -1.8207648992538452,
"logits/rejected": -1.7967065572738647,
"logps/chosen": -392.82696533203125,
"logps/rejected": -462.015869140625,
"loss": 0.5451,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.8827365636825562,
"rewards/margins": 0.6777531504631042,
"rewards/rejected": -2.5604898929595947,
"step": 480
},
{
"epoch": 0.28,
"grad_norm": 25.558438319253998,
"learning_rate": 4.511849846667839e-07,
"logits/chosen": -1.883180022239685,
"logits/rejected": -1.8137277364730835,
"logps/chosen": -354.247314453125,
"logps/rejected": -436.14556884765625,
"loss": 0.5408,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.8229477405548096,
"rewards/margins": 0.7674862742424011,
"rewards/rejected": -2.5904340744018555,
"step": 490
},
{
"epoch": 0.29,
"grad_norm": 28.233129557824064,
"learning_rate": 4.481596432975201e-07,
"logits/chosen": -1.9428781270980835,
"logits/rejected": -1.889491081237793,
"logps/chosen": -410.0284729003906,
"logps/rejected": -480.2649841308594,
"loss": 0.5415,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -1.8563823699951172,
"rewards/margins": 0.8543184995651245,
"rewards/rejected": -2.710700750350952,
"step": 500
},
{
"epoch": 0.29,
"eval_logits/chosen": -1.8125942945480347,
"eval_logits/rejected": -1.7951966524124146,
"eval_logps/chosen": -405.05938720703125,
"eval_logps/rejected": -476.8222961425781,
"eval_loss": 0.5648065209388733,
"eval_rewards/accuracies": 0.706620991230011,
"eval_rewards/chosen": -2.4230268001556396,
"eval_rewards/margins": 0.6419299840927124,
"eval_rewards/rejected": -3.0649566650390625,
"eval_runtime": 536.9406,
"eval_samples_per_second": 13.037,
"eval_steps_per_second": 0.408,
"step": 500
},
{
"epoch": 0.29,
"grad_norm": 30.516998297285266,
"learning_rate": 4.450541455182453e-07,
"logits/chosen": -1.8995802402496338,
"logits/rejected": -1.9007337093353271,
"logps/chosen": -408.70635986328125,
"logps/rejected": -487.16387939453125,
"loss": 0.5238,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -2.09024977684021,
"rewards/margins": 0.7947575449943542,
"rewards/rejected": -2.885007381439209,
"step": 510
},
{
"epoch": 0.3,
"grad_norm": 33.722814638920184,
"learning_rate": 4.41869747515886e-07,
"logits/chosen": -1.95028817653656,
"logits/rejected": -1.8546888828277588,
"logps/chosen": -388.6572570800781,
"logps/rejected": -446.74542236328125,
"loss": 0.5667,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -1.9295694828033447,
"rewards/margins": 0.7291784882545471,
"rewards/rejected": -2.658748149871826,
"step": 520
},
{
"epoch": 0.31,
"grad_norm": 25.932688832468305,
"learning_rate": 4.3860773739284126e-07,
"logits/chosen": -1.9748178720474243,
"logits/rejected": -1.9027087688446045,
"logps/chosen": -368.09832763671875,
"logps/rejected": -403.284912109375,
"loss": 0.573,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -1.6870168447494507,
"rewards/margins": 0.5430334806442261,
"rewards/rejected": -2.2300503253936768,
"step": 530
},
{
"epoch": 0.31,
"grad_norm": 39.79448640097382,
"learning_rate": 4.352694346459396e-07,
"logits/chosen": -1.9401954412460327,
"logits/rejected": -1.905206322669983,
"logps/chosen": -386.59918212890625,
"logps/rejected": -437.18536376953125,
"loss": 0.571,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -1.91461181640625,
"rewards/margins": 0.4893025755882263,
"rewards/rejected": -2.403914451599121,
"step": 540
},
{
"epoch": 0.32,
"grad_norm": 28.112999261098803,
"learning_rate": 4.318561896326973e-07,
"logits/chosen": -1.959571123123169,
"logits/rejected": -1.9278638362884521,
"logps/chosen": -388.32073974609375,
"logps/rejected": -454.91436767578125,
"loss": 0.5538,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.955959677696228,
"rewards/margins": 0.6656385660171509,
"rewards/rejected": -2.621598482131958,
"step": 550
},
{
"epoch": 0.32,
"grad_norm": 26.262637133504416,
"learning_rate": 4.2836938302509256e-07,
"logits/chosen": -2.0025877952575684,
"logits/rejected": -1.9562809467315674,
"logps/chosen": -359.0731201171875,
"logps/rejected": -429.9349060058594,
"loss": 0.5291,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -1.6528972387313843,
"rewards/margins": 0.6726639866828918,
"rewards/rejected": -2.325561285018921,
"step": 560
},
{
"epoch": 0.33,
"grad_norm": 36.25641292003506,
"learning_rate": 4.248104252510785e-07,
"logits/chosen": -2.134064197540283,
"logits/rejected": -2.1425302028656006,
"logps/chosen": -429.51153564453125,
"logps/rejected": -480.48138427734375,
"loss": 0.544,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -1.814552664756775,
"rewards/margins": 0.49401578307151794,
"rewards/rejected": -2.3085684776306152,
"step": 570
},
{
"epoch": 0.33,
"grad_norm": 21.449511768929142,
"learning_rate": 4.2118075592405874e-07,
"logits/chosen": -1.988585114479065,
"logits/rejected": -2.011026382446289,
"logps/chosen": -405.82305908203125,
"logps/rejected": -488.56451416015625,
"loss": 0.5412,
"rewards/accuracies": 0.78125,
"rewards/chosen": -2.087791919708252,
"rewards/margins": 0.7612438201904297,
"rewards/rejected": -2.8490357398986816,
"step": 580
},
{
"epoch": 0.34,
"grad_norm": 30.59358168073691,
"learning_rate": 4.174818432605578e-07,
"logits/chosen": -2.0260438919067383,
"logits/rejected": -2.033987522125244,
"logps/chosen": -453.0452575683594,
"logps/rejected": -514.720458984375,
"loss": 0.5355,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -2.147684335708618,
"rewards/margins": 0.7406858801841736,
"rewards/rejected": -2.8883700370788574,
"step": 590
},
{
"epoch": 0.35,
"grad_norm": 28.138749590258723,
"learning_rate": 4.137151834863213e-07,
"logits/chosen": -1.9616165161132812,
"logits/rejected": -1.972180724143982,
"logps/chosen": -385.138427734375,
"logps/rejected": -473.2599182128906,
"loss": 0.564,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -2.132406234741211,
"rewards/margins": 0.6182124018669128,
"rewards/rejected": -2.7506186962127686,
"step": 600
},
{
"epoch": 0.35,
"eval_logits/chosen": -1.8847192525863647,
"eval_logits/rejected": -1.8836290836334229,
"eval_logps/chosen": -397.7480773925781,
"eval_logps/rejected": -469.41180419921875,
"eval_loss": 0.5578325390815735,
"eval_rewards/accuracies": 0.7191780805587769,
"eval_rewards/chosen": -2.3499135971069336,
"eval_rewards/margins": 0.6409377455711365,
"eval_rewards/rejected": -2.990851402282715,
"eval_runtime": 544.5307,
"eval_samples_per_second": 12.855,
"eval_steps_per_second": 0.402,
"step": 600
},
{
"epoch": 0.35,
"grad_norm": 33.10703086608096,
"learning_rate": 4.098823002310864e-07,
"logits/chosen": -2.044586181640625,
"logits/rejected": -1.9869381189346313,
"logps/chosen": -415.4453125,
"logps/rejected": -474.20526123046875,
"loss": 0.5454,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.8535633087158203,
"rewards/margins": 0.7316546440124512,
"rewards/rejected": -2.5852179527282715,
"step": 610
},
{
"epoch": 0.36,
"grad_norm": 39.38037052781508,
"learning_rate": 4.059847439122671e-07,
"logits/chosen": -1.9577858448028564,
"logits/rejected": -1.904496431350708,
"logps/chosen": -393.66796875,
"logps/rejected": -449.994140625,
"loss": 0.5357,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -1.8364942073822021,
"rewards/margins": 0.6134520769119263,
"rewards/rejected": -2.449946165084839,
"step": 620
},
{
"epoch": 0.36,
"grad_norm": 29.15442393094139,
"learning_rate": 4.020240911078041e-07,
"logits/chosen": -1.8907365798950195,
"logits/rejected": -1.8794755935668945,
"logps/chosen": -393.5573425292969,
"logps/rejected": -469.4529724121094,
"loss": 0.5547,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -2.1041901111602783,
"rewards/margins": 0.8193286657333374,
"rewards/rejected": -2.923518419265747,
"step": 630
},
{
"epoch": 0.37,
"grad_norm": 24.710710448776272,
"learning_rate": 3.98001943918432e-07,
"logits/chosen": -1.87062668800354,
"logits/rejected": -1.8511345386505127,
"logps/chosen": -391.0401306152344,
"logps/rejected": -467.5562438964844,
"loss": 0.5439,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -1.9567807912826538,
"rewards/margins": 0.8031463623046875,
"rewards/rejected": -2.75992751121521,
"step": 640
},
{
"epoch": 0.37,
"grad_norm": 28.042405621162647,
"learning_rate": 3.9391992931962304e-07,
"logits/chosen": -1.912502646446228,
"logits/rejected": -1.8945941925048828,
"logps/chosen": -381.6258850097656,
"logps/rejected": -439.37921142578125,
"loss": 0.5279,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.6793773174285889,
"rewards/margins": 0.6930197477340698,
"rewards/rejected": -2.3723976612091064,
"step": 650
},
{
"epoch": 0.38,
"grad_norm": 64.63037359225194,
"learning_rate": 3.8977969850346866e-07,
"logits/chosen": -1.8362230062484741,
"logits/rejected": -1.827745795249939,
"logps/chosen": -341.99755859375,
"logps/rejected": -415.6537170410156,
"loss": 0.5512,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -1.6921066045761108,
"rewards/margins": 0.6708263158798218,
"rewards/rejected": -2.3629326820373535,
"step": 660
},
{
"epoch": 0.39,
"grad_norm": 68.40563732230615,
"learning_rate": 3.8558292621076526e-07,
"logits/chosen": -1.873615026473999,
"logits/rejected": -1.8472900390625,
"logps/chosen": -422.1318359375,
"logps/rejected": -461.34619140625,
"loss": 0.5427,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -2.0594050884246826,
"rewards/margins": 0.5281103253364563,
"rewards/rejected": -2.5875158309936523,
"step": 670
},
{
"epoch": 0.39,
"grad_norm": 22.39050226911276,
"learning_rate": 3.8133131005357465e-07,
"logits/chosen": -1.8999011516571045,
"logits/rejected": -1.836851716041565,
"logps/chosen": -397.0812072753906,
"logps/rejected": -480.00823974609375,
"loss": 0.5167,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -2.0673575401306152,
"rewards/margins": 0.7980934381484985,
"rewards/rejected": -2.8654510974884033,
"step": 680
},
{
"epoch": 0.4,
"grad_norm": 38.649992337166125,
"learning_rate": 3.7702656982853277e-07,
"logits/chosen": -1.810121774673462,
"logits/rejected": -1.793265700340271,
"logps/chosen": -450.671875,
"logps/rejected": -518.1996459960938,
"loss": 0.5696,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -2.448154926300049,
"rewards/margins": 0.7367699146270752,
"rewards/rejected": -3.184924602508545,
"step": 690
},
{
"epoch": 0.4,
"grad_norm": 34.05006479039719,
"learning_rate": 3.7267044682118435e-07,
"logits/chosen": -1.860874891281128,
"logits/rejected": -1.8456264734268188,
"logps/chosen": -409.9309997558594,
"logps/rejected": -486.42376708984375,
"loss": 0.5769,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.9787782430648804,
"rewards/margins": 0.6770876049995422,
"rewards/rejected": -2.6558656692504883,
"step": 700
},
{
"epoch": 0.4,
"eval_logits/chosen": -1.7718605995178223,
"eval_logits/rejected": -1.7858551740646362,
"eval_logps/chosen": -383.05316162109375,
"eval_logps/rejected": -453.5823059082031,
"eval_loss": 0.5597525238990784,
"eval_rewards/accuracies": 0.7031963467597961,
"eval_rewards/chosen": -2.2029640674591064,
"eval_rewards/margins": 0.6295928955078125,
"eval_rewards/rejected": -2.832556962966919,
"eval_runtime": 535.7382,
"eval_samples_per_second": 13.066,
"eval_steps_per_second": 0.409,
"step": 700
},
{
"epoch": 0.41,
"grad_norm": 30.516726115650822,
"learning_rate": 3.682647031016264e-07,
"logits/chosen": -1.9329684972763062,
"logits/rejected": -1.940243124961853,
"logps/chosen": -388.291259765625,
"logps/rejected": -434.0372009277344,
"loss": 0.5486,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.6513302326202393,
"rewards/margins": 0.6132162809371948,
"rewards/rejected": -2.2645463943481445,
"step": 710
},
{
"epoch": 0.41,
"grad_norm": 38.51345602531556,
"learning_rate": 3.638111208117425e-07,
"logits/chosen": -1.9404680728912354,
"logits/rejected": -1.9298954010009766,
"logps/chosen": -385.8715515136719,
"logps/rejected": -416.53155517578125,
"loss": 0.5762,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.8125269412994385,
"rewards/margins": 0.4822394847869873,
"rewards/rejected": -2.294766426086426,
"step": 720
},
{
"epoch": 0.42,
"grad_norm": 36.417406572486875,
"learning_rate": 3.593115014443195e-07,
"logits/chosen": -1.9941285848617554,
"logits/rejected": -1.9894773960113525,
"logps/chosen": -382.0946350097656,
"logps/rejected": -437.18841552734375,
"loss": 0.5469,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.637915849685669,
"rewards/margins": 0.631543755531311,
"rewards/rejected": -2.2694597244262695,
"step": 730
},
{
"epoch": 0.43,
"grad_norm": 23.509926948805322,
"learning_rate": 3.5476766511433605e-07,
"logits/chosen": -1.9100837707519531,
"logits/rejected": -1.857428789138794,
"logps/chosen": -366.06109619140625,
"logps/rejected": -444.9000549316406,
"loss": 0.5376,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -1.6342054605484009,
"rewards/margins": 0.7001287341117859,
"rewards/rejected": -2.334334373474121,
"step": 740
},
{
"epoch": 0.43,
"grad_norm": 28.491603155440426,
"learning_rate": 3.5018144982271806e-07,
"logits/chosen": -1.847013235092163,
"logits/rejected": -1.844740867614746,
"logps/chosen": -387.2216796875,
"logps/rejected": -458.35247802734375,
"loss": 0.5425,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.9939115047454834,
"rewards/margins": 0.5967587232589722,
"rewards/rejected": -2.590670585632324,
"step": 750
},
{
"epoch": 0.44,
"grad_norm": 21.711577115215622,
"learning_rate": 3.455547107128602e-07,
"logits/chosen": -1.7501156330108643,
"logits/rejected": -1.7191545963287354,
"logps/chosen": -452.614013671875,
"logps/rejected": -517.114501953125,
"loss": 0.5117,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -2.3392488956451416,
"rewards/margins": 0.8644348978996277,
"rewards/rejected": -3.203683853149414,
"step": 760
},
{
"epoch": 0.44,
"grad_norm": 50.17207271612329,
"learning_rate": 3.4088931932021185e-07,
"logits/chosen": -1.8234459161758423,
"logits/rejected": -1.780574083328247,
"logps/chosen": -448.5769958496094,
"logps/rejected": -518.0377197265625,
"loss": 0.5488,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -2.1782760620117188,
"rewards/margins": 0.8133376240730286,
"rewards/rejected": -2.9916136264801025,
"step": 770
},
{
"epoch": 0.45,
"grad_norm": 49.301861132325,
"learning_rate": 3.361871628152338e-07,
"logits/chosen": -1.773737907409668,
"logits/rejected": -1.7517740726470947,
"logps/chosen": -440.6595153808594,
"logps/rejected": -493.2332458496094,
"loss": 0.5173,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -2.263493299484253,
"rewards/margins": 0.6659582853317261,
"rewards/rejected": -2.9294512271881104,
"step": 780
},
{
"epoch": 0.45,
"grad_norm": 30.255792286324436,
"learning_rate": 3.314501432400294e-07,
"logits/chosen": -1.7690521478652954,
"logits/rejected": -1.7298529148101807,
"logps/chosen": -411.845703125,
"logps/rejected": -474.04425048828125,
"loss": 0.566,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -2.1643013954162598,
"rewards/margins": 0.6198045015335083,
"rewards/rejected": -2.7841057777404785,
"step": 790
},
{
"epoch": 0.46,
"grad_norm": 22.17250118566977,
"learning_rate": 3.2668017673896077e-07,
"logits/chosen": -1.8177188634872437,
"logits/rejected": -1.7350183725357056,
"logps/chosen": -399.64495849609375,
"logps/rejected": -457.10601806640625,
"loss": 0.5598,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -2.0094985961914062,
"rewards/margins": 0.7035370469093323,
"rewards/rejected": -2.7130355834960938,
"step": 800
},
{
"epoch": 0.46,
"eval_logits/chosen": -1.7061283588409424,
"eval_logits/rejected": -1.7086626291275024,
"eval_logps/chosen": -387.18157958984375,
"eval_logps/rejected": -455.0378723144531,
"eval_loss": 0.558580219745636,
"eval_rewards/accuracies": 0.7163242101669312,
"eval_rewards/chosen": -2.244248390197754,
"eval_rewards/margins": 0.6028640270233154,
"eval_rewards/rejected": -2.8471124172210693,
"eval_runtime": 544.1327,
"eval_samples_per_second": 12.865,
"eval_steps_per_second": 0.402,
"step": 800
},
{
"epoch": 0.47,
"grad_norm": 29.19961014949389,
"learning_rate": 3.218791927835602e-07,
"logits/chosen": -1.8107563257217407,
"logits/rejected": -1.7641499042510986,
"logps/chosen": -369.27203369140625,
"logps/rejected": -456.6036682128906,
"loss": 0.5304,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -1.8051646947860718,
"rewards/margins": 0.7181805968284607,
"rewards/rejected": -2.523345470428467,
"step": 810
},
{
"epoch": 0.47,
"grad_norm": 25.26002008872549,
"learning_rate": 3.1704913339205103e-07,
"logits/chosen": -1.8677990436553955,
"logits/rejected": -1.825749158859253,
"logps/chosen": -379.98321533203125,
"logps/rejected": -454.1268005371094,
"loss": 0.5288,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.6397157907485962,
"rewards/margins": 0.8766795992851257,
"rewards/rejected": -2.516395092010498,
"step": 820
},
{
"epoch": 0.48,
"grad_norm": 28.6375298855639,
"learning_rate": 3.1219195234379265e-07,
"logits/chosen": -1.6751445531845093,
"logits/rejected": -1.6866257190704346,
"logps/chosen": -346.9654846191406,
"logps/rejected": -451.60498046875,
"loss": 0.5566,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.7691535949707031,
"rewards/margins": 0.7554360628128052,
"rewards/rejected": -2.5245893001556396,
"step": 830
},
{
"epoch": 0.48,
"grad_norm": 44.73580525279706,
"learning_rate": 3.0730961438896885e-07,
"logits/chosen": -1.7529224157333374,
"logits/rejected": -1.7129818201065063,
"logps/chosen": -400.9212951660156,
"logps/rejected": -464.69305419921875,
"loss": 0.5584,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -2.0139617919921875,
"rewards/margins": 0.5847775936126709,
"rewards/rejected": -2.5987396240234375,
"step": 840
},
{
"epoch": 0.49,
"grad_norm": 30.284221885120694,
"learning_rate": 3.024040944538383e-07,
"logits/chosen": -1.7323232889175415,
"logits/rejected": -1.7132787704467773,
"logps/chosen": -379.4556579589844,
"logps/rejected": -454.51531982421875,
"loss": 0.5314,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.9580036401748657,
"rewards/margins": 0.7366491556167603,
"rewards/rejected": -2.694653034210205,
"step": 850
},
{
"epoch": 0.5,
"grad_norm": 27.718050401992414,
"learning_rate": 2.9747737684186795e-07,
"logits/chosen": -1.7737243175506592,
"logits/rejected": -1.7415263652801514,
"logps/chosen": -404.40509033203125,
"logps/rejected": -465.7650451660156,
"loss": 0.5184,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -2.0648417472839355,
"rewards/margins": 0.7340750694274902,
"rewards/rejected": -2.798916816711426,
"step": 860
},
{
"epoch": 0.5,
"grad_norm": 31.011489118398675,
"learning_rate": 2.925314544310745e-07,
"logits/chosen": -1.745216727256775,
"logits/rejected": -1.727979302406311,
"logps/chosen": -392.7491149902344,
"logps/rejected": -456.2132263183594,
"loss": 0.5497,
"rewards/accuracies": 0.65625,
"rewards/chosen": -2.195338487625122,
"rewards/margins": 0.5425236225128174,
"rewards/rejected": -2.7378618717193604,
"step": 870
},
{
"epoch": 0.51,
"grad_norm": 35.37211460888614,
"learning_rate": 2.8756832786789663e-07,
"logits/chosen": -1.8434585332870483,
"logits/rejected": -1.8155876398086548,
"logps/chosen": -413.1863708496094,
"logps/rejected": -489.76220703125,
"loss": 0.5608,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -2.009500026702881,
"rewards/margins": 0.6946345567703247,
"rewards/rejected": -2.704134464263916,
"step": 880
},
{
"epoch": 0.51,
"grad_norm": 33.27106994315821,
"learning_rate": 2.8259000475792503e-07,
"logits/chosen": -1.876704454421997,
"logits/rejected": -1.7968547344207764,
"logps/chosen": -395.55706787109375,
"logps/rejected": -460.11669921875,
"loss": 0.5543,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.7892892360687256,
"rewards/margins": 0.7638824582099915,
"rewards/rejected": -2.5531716346740723,
"step": 890
},
{
"epoch": 0.52,
"grad_norm": 31.881562451650627,
"learning_rate": 2.7759849885381747e-07,
"logits/chosen": -1.868417739868164,
"logits/rejected": -1.7971748113632202,
"logps/chosen": -378.93353271484375,
"logps/rejected": -464.1891174316406,
"loss": 0.5374,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.808215856552124,
"rewards/margins": 0.7741836309432983,
"rewards/rejected": -2.582399368286133,
"step": 900
},
{
"epoch": 0.52,
"eval_logits/chosen": -1.6767016649246216,
"eval_logits/rejected": -1.6597568988800049,
"eval_logps/chosen": -382.5883483886719,
"eval_logps/rejected": -453.9528503417969,
"eval_loss": 0.5555324554443359,
"eval_rewards/accuracies": 0.7151826620101929,
"eval_rewards/chosen": -2.198316812515259,
"eval_rewards/margins": 0.6379454731941223,
"eval_rewards/rejected": -2.8362622261047363,
"eval_runtime": 537.245,
"eval_samples_per_second": 13.029,
"eval_steps_per_second": 0.408,
"step": 900
},
{
"epoch": 0.52,
"grad_norm": 21.961619231813007,
"learning_rate": 2.7259582924072756e-07,
"logits/chosen": -1.8725192546844482,
"logits/rejected": -1.8156566619873047,
"logps/chosen": -350.8863220214844,
"logps/rejected": -413.61993408203125,
"loss": 0.5397,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.7346986532211304,
"rewards/margins": 0.5974160432815552,
"rewards/rejected": -2.3321146965026855,
"step": 910
},
{
"epoch": 0.53,
"grad_norm": 35.79222168716502,
"learning_rate": 2.675840195195762e-07,
"logits/chosen": -1.8498157262802124,
"logits/rejected": -1.8300836086273193,
"logps/chosen": -376.912353515625,
"logps/rejected": -438.8692932128906,
"loss": 0.5246,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -1.9630987644195557,
"rewards/margins": 0.5869341492652893,
"rewards/rejected": -2.5500330924987793,
"step": 920
},
{
"epoch": 0.54,
"grad_norm": 29.90256487232944,
"learning_rate": 2.625650969884965e-07,
"logits/chosen": -1.7971664667129517,
"logits/rejected": -1.7699878215789795,
"logps/chosen": -429.76171875,
"logps/rejected": -510.20550537109375,
"loss": 0.5419,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -2.1585049629211426,
"rewards/margins": 0.7950173616409302,
"rewards/rejected": -2.953521966934204,
"step": 930
},
{
"epoch": 0.54,
"grad_norm": 29.9715777964654,
"learning_rate": 2.575410918227829e-07,
"logits/chosen": -1.8557363748550415,
"logits/rejected": -1.7954918146133423,
"logps/chosen": -443.01092529296875,
"logps/rejected": -511.6332092285156,
"loss": 0.5316,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -2.0825295448303223,
"rewards/margins": 0.8091154098510742,
"rewards/rejected": -2.8916451930999756,
"step": 940
},
{
"epoch": 0.55,
"grad_norm": 29.036788981905207,
"learning_rate": 2.525140362536775e-07,
"logits/chosen": -1.7384717464447021,
"logits/rejected": -1.6616607904434204,
"logps/chosen": -384.7867126464844,
"logps/rejected": -460.168212890625,
"loss": 0.5632,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -2.1540913581848145,
"rewards/margins": 0.569457471370697,
"rewards/rejected": -2.723548650741577,
"step": 950
},
{
"epoch": 0.55,
"grad_norm": 24.40866664439217,
"learning_rate": 2.474859637463226e-07,
"logits/chosen": -1.7090812921524048,
"logits/rejected": -1.6654443740844727,
"logps/chosen": -438.59613037109375,
"logps/rejected": -484.14093017578125,
"loss": 0.5394,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -2.41066837310791,
"rewards/margins": 0.5415581464767456,
"rewards/rejected": -2.952226400375366,
"step": 960
},
{
"epoch": 0.56,
"grad_norm": 28.97604867448642,
"learning_rate": 2.42458908177217e-07,
"logits/chosen": -1.8490597009658813,
"logits/rejected": -1.7891228199005127,
"logps/chosen": -424.61383056640625,
"logps/rejected": -479.585205078125,
"loss": 0.5426,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -2.024867296218872,
"rewards/margins": 0.7355901002883911,
"rewards/rejected": -2.7604575157165527,
"step": 970
},
{
"epoch": 0.56,
"grad_norm": 35.488277243353735,
"learning_rate": 2.3743490301150355e-07,
"logits/chosen": -1.8032734394073486,
"logits/rejected": -1.794163465499878,
"logps/chosen": -417.388671875,
"logps/rejected": -491.72021484375,
"loss": 0.554,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -2.1092441082000732,
"rewards/margins": 0.6570713520050049,
"rewards/rejected": -2.766315460205078,
"step": 980
},
{
"epoch": 0.57,
"grad_norm": 22.23777106600426,
"learning_rate": 2.324159804804238e-07,
"logits/chosen": -1.8234403133392334,
"logits/rejected": -1.786786675453186,
"logps/chosen": -404.6798400878906,
"logps/rejected": -463.2445373535156,
"loss": 0.5494,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -2.0632481575012207,
"rewards/margins": 0.6302553415298462,
"rewards/rejected": -2.6935033798217773,
"step": 990
},
{
"epoch": 0.58,
"grad_norm": 31.216532615702715,
"learning_rate": 2.274041707592724e-07,
"logits/chosen": -1.9149761199951172,
"logits/rejected": -1.8780314922332764,
"logps/chosen": -430.76617431640625,
"logps/rejected": -501.29132080078125,
"loss": 0.5036,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.9554197788238525,
"rewards/margins": 0.8839017748832703,
"rewards/rejected": -2.8393216133117676,
"step": 1000
},
{
"epoch": 0.58,
"eval_logits/chosen": -1.7254499197006226,
"eval_logits/rejected": -1.7160460948944092,
"eval_logps/chosen": -385.9115295410156,
"eval_logps/rejected": -462.5011291503906,
"eval_loss": 0.5499266982078552,
"eval_rewards/accuracies": 0.7208904027938843,
"eval_rewards/chosen": -2.231548309326172,
"eval_rewards/margins": 0.6901971697807312,
"eval_rewards/rejected": -2.921745777130127,
"eval_runtime": 544.8576,
"eval_samples_per_second": 12.847,
"eval_steps_per_second": 0.402,
"step": 1000
},
{
"epoch": 0.58,
"grad_norm": 30.350568547131573,
"learning_rate": 2.2240150114618259e-07,
"logits/chosen": -1.8180408477783203,
"logits/rejected": -1.7760928869247437,
"logps/chosen": -416.88525390625,
"logps/rejected": -509.04058837890625,
"loss": 0.5276,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -1.980337381362915,
"rewards/margins": 0.8953462839126587,
"rewards/rejected": -2.875683546066284,
"step": 1010
},
{
"epoch": 0.59,
"grad_norm": 31.603328627940357,
"learning_rate": 2.17409995242075e-07,
"logits/chosen": -1.8180592060089111,
"logits/rejected": -1.7379405498504639,
"logps/chosen": -440.83074951171875,
"logps/rejected": -495.69830322265625,
"loss": 0.5235,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -2.1939332485198975,
"rewards/margins": 0.9124080538749695,
"rewards/rejected": -3.1063413619995117,
"step": 1020
},
{
"epoch": 0.59,
"grad_norm": 31.20623945497072,
"learning_rate": 2.1243167213210335e-07,
"logits/chosen": -1.8180633783340454,
"logits/rejected": -1.7436892986297607,
"logps/chosen": -410.88427734375,
"logps/rejected": -483.1456604003906,
"loss": 0.5401,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -2.02951717376709,
"rewards/margins": 0.87162846326828,
"rewards/rejected": -2.9011454582214355,
"step": 1030
},
{
"epoch": 0.6,
"grad_norm": 34.72608405283437,
"learning_rate": 2.0746854556892544e-07,
"logits/chosen": -1.804686188697815,
"logits/rejected": -1.7846415042877197,
"logps/chosen": -387.50067138671875,
"logps/rejected": -457.11505126953125,
"loss": 0.5742,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -1.964666724205017,
"rewards/margins": 0.690390944480896,
"rewards/rejected": -2.655057907104492,
"step": 1040
},
{
"epoch": 0.6,
"grad_norm": 26.9759576683522,
"learning_rate": 2.025226231581321e-07,
"logits/chosen": -1.8315603733062744,
"logits/rejected": -1.7954432964324951,
"logps/chosen": -408.33740234375,
"logps/rejected": -479.91912841796875,
"loss": 0.5286,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -2.079817533493042,
"rewards/margins": 0.8754861950874329,
"rewards/rejected": -2.95530366897583,
"step": 1050
},
{
"epoch": 0.61,
"grad_norm": 28.45648597029955,
"learning_rate": 1.9759590554616173e-07,
"logits/chosen": -1.8250961303710938,
"logits/rejected": -1.785871148109436,
"logps/chosen": -423.91607666015625,
"logps/rejected": -492.83563232421875,
"loss": 0.5428,
"rewards/accuracies": 0.71875,
"rewards/chosen": -2.0802817344665527,
"rewards/margins": 0.7300957441329956,
"rewards/rejected": -2.810377597808838,
"step": 1060
},
{
"epoch": 0.62,
"grad_norm": 36.3095911676204,
"learning_rate": 1.926903856110311e-07,
"logits/chosen": -1.8510675430297852,
"logits/rejected": -1.7864242792129517,
"logps/chosen": -412.6505432128906,
"logps/rejected": -492.79095458984375,
"loss": 0.53,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -2.102355480194092,
"rewards/margins": 0.708962082862854,
"rewards/rejected": -2.8113174438476562,
"step": 1070
},
{
"epoch": 0.62,
"grad_norm": 28.29885030565513,
"learning_rate": 1.8780804765620746e-07,
"logits/chosen": -1.8249950408935547,
"logits/rejected": -1.7665761709213257,
"logps/chosen": -403.99609375,
"logps/rejected": -481.81103515625,
"loss": 0.5048,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -2.150357961654663,
"rewards/margins": 0.908871054649353,
"rewards/rejected": -3.0592291355133057,
"step": 1080
},
{
"epoch": 0.63,
"grad_norm": 32.08952273669513,
"learning_rate": 1.82950866607949e-07,
"logits/chosen": -1.87527596950531,
"logits/rejected": -1.8300920724868774,
"logps/chosen": -415.8727111816406,
"logps/rejected": -479.55419921875,
"loss": 0.5616,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -2.0913748741149902,
"rewards/margins": 0.8811753988265991,
"rewards/rejected": -2.972550630569458,
"step": 1090
},
{
"epoch": 0.63,
"grad_norm": 34.73789118478527,
"learning_rate": 1.7812080721643973e-07,
"logits/chosen": -1.8299520015716553,
"logits/rejected": -1.7463247776031494,
"logps/chosen": -407.6546325683594,
"logps/rejected": -461.5155334472656,
"loss": 0.5281,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.9930970668792725,
"rewards/margins": 0.8382734060287476,
"rewards/rejected": -2.8313703536987305,
"step": 1100
},
{
"epoch": 0.63,
"eval_logits/chosen": -1.7563356161117554,
"eval_logits/rejected": -1.7503989934921265,
"eval_logps/chosen": -391.30999755859375,
"eval_logps/rejected": -466.37115478515625,
"eval_loss": 0.5488813519477844,
"eval_rewards/accuracies": 0.72374427318573,
"eval_rewards/chosen": -2.2855324745178223,
"eval_rewards/margins": 0.674912691116333,
"eval_rewards/rejected": -2.960444927215576,
"eval_runtime": 536.1349,
"eval_samples_per_second": 13.056,
"eval_steps_per_second": 0.408,
"step": 1100
},
{
"epoch": 0.64,
"grad_norm": 22.04228728782992,
"learning_rate": 1.7331982326103918e-07,
"logits/chosen": -1.9050697088241577,
"logits/rejected": -1.8864399194717407,
"logps/chosen": -400.99151611328125,
"logps/rejected": -456.43377685546875,
"loss": 0.5236,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.9019591808319092,
"rewards/margins": 0.7700345516204834,
"rewards/rejected": -2.6719937324523926,
"step": 1110
},
{
"epoch": 0.64,
"grad_norm": 32.9478891711517,
"learning_rate": 1.6854985675997063e-07,
"logits/chosen": -1.857361078262329,
"logits/rejected": -1.8371422290802002,
"logps/chosen": -456.65582275390625,
"logps/rejected": -527.1624755859375,
"loss": 0.538,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -2.3455679416656494,
"rewards/margins": 0.7390109300613403,
"rewards/rejected": -3.0845787525177,
"step": 1120
},
{
"epoch": 0.65,
"grad_norm": 35.31307694928471,
"learning_rate": 1.638128371847662e-07,
"logits/chosen": -1.8157202005386353,
"logits/rejected": -1.7822942733764648,
"logps/chosen": -413.3636779785156,
"logps/rejected": -507.03338623046875,
"loss": 0.5299,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -2.1275038719177246,
"rewards/margins": 0.8648099899291992,
"rewards/rejected": -2.992314338684082,
"step": 1130
},
{
"epoch": 0.66,
"grad_norm": 39.68360857124647,
"learning_rate": 1.5911068067978818e-07,
"logits/chosen": -1.8612645864486694,
"logits/rejected": -1.8140894174575806,
"logps/chosen": -447.96697998046875,
"logps/rejected": -535.6785888671875,
"loss": 0.5089,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.447895050048828,
"rewards/margins": 0.799897313117981,
"rewards/rejected": -3.2477920055389404,
"step": 1140
},
{
"epoch": 0.66,
"grad_norm": 45.56752540087649,
"learning_rate": 1.5444528928713985e-07,
"logits/chosen": -1.8386377096176147,
"logits/rejected": -1.773667335510254,
"logps/chosen": -397.98663330078125,
"logps/rejected": -473.56329345703125,
"loss": 0.5192,
"rewards/accuracies": 0.6875,
"rewards/chosen": -2.0095162391662598,
"rewards/margins": 0.8521126508712769,
"rewards/rejected": -2.861629009246826,
"step": 1150
},
{
"epoch": 0.67,
"grad_norm": 28.03328574000068,
"learning_rate": 1.4981855017728197e-07,
"logits/chosen": -1.7747135162353516,
"logits/rejected": -1.7615177631378174,
"logps/chosen": -415.66680908203125,
"logps/rejected": -485.73944091796875,
"loss": 0.5243,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -2.2126426696777344,
"rewards/margins": 0.8091427087783813,
"rewards/rejected": -3.021785259246826,
"step": 1160
},
{
"epoch": 0.67,
"grad_norm": 29.571461514972317,
"learning_rate": 1.452323348856639e-07,
"logits/chosen": -1.9694970846176147,
"logits/rejected": -1.9434292316436768,
"logps/chosen": -398.3510437011719,
"logps/rejected": -494.0318908691406,
"loss": 0.5583,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.8146642446517944,
"rewards/margins": 0.8760486841201782,
"rewards/rejected": -2.6907129287719727,
"step": 1170
},
{
"epoch": 0.68,
"grad_norm": 24.402984146105567,
"learning_rate": 1.406884985556804e-07,
"logits/chosen": -1.8805034160614014,
"logits/rejected": -1.8436615467071533,
"logps/chosen": -395.9003601074219,
"logps/rejected": -471.02685546875,
"loss": 0.5487,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.027919292449951,
"rewards/margins": 0.7736718058586121,
"rewards/rejected": -2.801591396331787,
"step": 1180
},
{
"epoch": 0.69,
"grad_norm": 20.687254330852166,
"learning_rate": 1.361888791882575e-07,
"logits/chosen": -1.900747299194336,
"logits/rejected": -1.8584699630737305,
"logps/chosen": -339.09442138671875,
"logps/rejected": -388.8125,
"loss": 0.5293,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.7120403051376343,
"rewards/margins": 0.5825742483139038,
"rewards/rejected": -2.294614315032959,
"step": 1190
},
{
"epoch": 0.69,
"grad_norm": 26.083643158531725,
"learning_rate": 1.3173529689837354e-07,
"logits/chosen": -2.0513994693756104,
"logits/rejected": -1.986104965209961,
"logps/chosen": -375.33746337890625,
"logps/rejected": -463.71051025390625,
"loss": 0.5067,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.7211412191390991,
"rewards/margins": 0.6738361120223999,
"rewards/rejected": -2.394977331161499,
"step": 1200
},
{
"epoch": 0.69,
"eval_logits/chosen": -1.8046900033950806,
"eval_logits/rejected": -1.7967232465744019,
"eval_logps/chosen": -393.0003662109375,
"eval_logps/rejected": -471.0760192871094,
"eval_loss": 0.5447794198989868,
"eval_rewards/accuracies": 0.7243150472640991,
"eval_rewards/chosen": -2.302436351776123,
"eval_rewards/margins": 0.7050578594207764,
"eval_rewards/rejected": -3.0074942111968994,
"eval_runtime": 543.7258,
"eval_samples_per_second": 12.874,
"eval_steps_per_second": 0.403,
"step": 1200
},
{
"epoch": 0.7,
"grad_norm": 27.1913571170997,
"learning_rate": 1.273295531788156e-07,
"logits/chosen": -1.8818267583847046,
"logits/rejected": -1.8337571620941162,
"logps/chosen": -382.8564758300781,
"logps/rejected": -479.34423828125,
"loss": 0.4961,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.9514567852020264,
"rewards/margins": 0.9589536786079407,
"rewards/rejected": -2.9104104042053223,
"step": 1210
},
{
"epoch": 0.7,
"grad_norm": 30.87994960869174,
"learning_rate": 1.2297343017146726e-07,
"logits/chosen": -1.9285227060317993,
"logits/rejected": -1.879024863243103,
"logps/chosen": -413.84722900390625,
"logps/rejected": -471.5714416503906,
"loss": 0.5172,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -2.0496084690093994,
"rewards/margins": 0.7111380696296692,
"rewards/rejected": -2.760746479034424,
"step": 1220
},
{
"epoch": 0.71,
"grad_norm": 30.103114841199876,
"learning_rate": 1.1866868994642534e-07,
"logits/chosen": -1.910308599472046,
"logits/rejected": -1.8798065185546875,
"logps/chosen": -428.7994689941406,
"logps/rejected": -501.61572265625,
"loss": 0.532,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -2.0664076805114746,
"rewards/margins": 0.800611138343811,
"rewards/rejected": -2.867018938064575,
"step": 1230
},
{
"epoch": 0.71,
"grad_norm": 33.952742355560765,
"learning_rate": 1.1441707378923474e-07,
"logits/chosen": -1.954697608947754,
"logits/rejected": -1.9360759258270264,
"logps/chosen": -358.89459228515625,
"logps/rejected": -451.65509033203125,
"loss": 0.5161,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -1.7160298824310303,
"rewards/margins": 0.8581940531730652,
"rewards/rejected": -2.5742239952087402,
"step": 1240
},
{
"epoch": 0.72,
"grad_norm": 26.669019235150035,
"learning_rate": 1.1022030149653133e-07,
"logits/chosen": -1.8900222778320312,
"logits/rejected": -1.8807737827301025,
"logps/chosen": -370.8710021972656,
"logps/rejected": -462.078857421875,
"loss": 0.5378,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -2.005030870437622,
"rewards/margins": 0.7315531969070435,
"rewards/rejected": -2.736584186553955,
"step": 1250
},
{
"epoch": 0.73,
"grad_norm": 29.39370070872787,
"learning_rate": 1.06080070680377e-07,
"logits/chosen": -1.9039020538330078,
"logits/rejected": -1.8967231512069702,
"logps/chosen": -407.52886962890625,
"logps/rejected": -471.5879821777344,
"loss": 0.5022,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -1.9449284076690674,
"rewards/margins": 0.866219699382782,
"rewards/rejected": -2.811148166656494,
"step": 1260
},
{
"epoch": 0.73,
"grad_norm": 34.28058476728983,
"learning_rate": 1.01998056081568e-07,
"logits/chosen": -1.947778344154358,
"logits/rejected": -1.933396339416504,
"logps/chosen": -409.4700622558594,
"logps/rejected": -500.43798828125,
"loss": 0.5202,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.1493587493896484,
"rewards/margins": 0.8992208242416382,
"rewards/rejected": -3.048579692840576,
"step": 1270
},
{
"epoch": 0.74,
"grad_norm": 38.74422247304706,
"learning_rate": 9.797590889219587e-08,
"logits/chosen": -1.9459298849105835,
"logits/rejected": -1.902991533279419,
"logps/chosen": -424.58380126953125,
"logps/rejected": -504.6437072753906,
"loss": 0.5452,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -2.1141209602355957,
"rewards/margins": 0.8704532384872437,
"rewards/rejected": -2.984574317932129,
"step": 1280
},
{
"epoch": 0.74,
"grad_norm": 28.244962433086215,
"learning_rate": 9.401525608773292e-08,
"logits/chosen": -1.8756380081176758,
"logits/rejected": -1.8184016942977905,
"logps/chosen": -392.9984130859375,
"logps/rejected": -461.8180236816406,
"loss": 0.5398,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.9684727191925049,
"rewards/margins": 0.7552623748779297,
"rewards/rejected": -2.7237350940704346,
"step": 1290
},
{
"epoch": 0.75,
"grad_norm": 34.33816096896355,
"learning_rate": 9.011769976891367e-08,
"logits/chosen": -1.903464913368225,
"logits/rejected": -1.8477399349212646,
"logps/chosen": -419.75750732421875,
"logps/rejected": -506.533935546875,
"loss": 0.5095,
"rewards/accuracies": 0.71875,
"rewards/chosen": -2.119554042816162,
"rewards/margins": 0.8639281392097473,
"rewards/rejected": -2.9834823608398438,
"step": 1300
},
{
"epoch": 0.75,
"eval_logits/chosen": -1.824761152267456,
"eval_logits/rejected": -1.8238047361373901,
"eval_logps/chosen": -383.5680236816406,
"eval_logps/rejected": -460.3614196777344,
"eval_loss": 0.545096218585968,
"eval_rewards/accuracies": 0.7186073064804077,
"eval_rewards/chosen": -2.208112955093384,
"eval_rewards/margins": 0.6922349333763123,
"eval_rewards/rejected": -2.90034818649292,
"eval_runtime": 523.3073,
"eval_samples_per_second": 13.376,
"eval_steps_per_second": 0.418,
"step": 1300
},
{
"epoch": 0.75,
"grad_norm": 31.642563855828758,
"learning_rate": 8.628481651367875e-08,
"logits/chosen": -1.98309326171875,
"logits/rejected": -1.9879448413848877,
"logps/chosen": -390.30712890625,
"logps/rejected": -448.2904357910156,
"loss": 0.5447,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -1.7398111820220947,
"rewards/margins": 0.5182247757911682,
"rewards/rejected": -2.258035898208618,
"step": 1310
},
{
"epoch": 0.76,
"grad_norm": 30.145195997712573,
"learning_rate": 8.251815673944218e-08,
"logits/chosen": -1.9566547870635986,
"logits/rejected": -1.9202098846435547,
"logps/chosen": -395.9665832519531,
"logps/rejected": -496.5779724121094,
"loss": 0.5563,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.9744913578033447,
"rewards/margins": 0.9160418510437012,
"rewards/rejected": -2.890532970428467,
"step": 1320
},
{
"epoch": 0.77,
"grad_norm": 26.76251092001294,
"learning_rate": 7.881924407594129e-08,
"logits/chosen": -1.9259990453720093,
"logits/rejected": -1.8814588785171509,
"logps/chosen": -420.6258850097656,
"logps/rejected": -471.1128845214844,
"loss": 0.5346,
"rewards/accuracies": 0.71875,
"rewards/chosen": -2.2005252838134766,
"rewards/margins": 0.6154937744140625,
"rewards/rejected": -2.816019296646118,
"step": 1330
},
{
"epoch": 0.77,
"grad_norm": 31.270267795635966,
"learning_rate": 7.518957474892148e-08,
"logits/chosen": -1.873970627784729,
"logits/rejected": -1.8780710697174072,
"logps/chosen": -387.642333984375,
"logps/rejected": -460.996337890625,
"loss": 0.561,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -2.0759739875793457,
"rewards/margins": 0.5735403895378113,
"rewards/rejected": -2.6495144367218018,
"step": 1340
},
{
"epoch": 0.78,
"grad_norm": 25.79672067849548,
"learning_rate": 7.16306169749074e-08,
"logits/chosen": -1.9269657135009766,
"logits/rejected": -1.8575401306152344,
"logps/chosen": -386.0765380859375,
"logps/rejected": -446.42095947265625,
"loss": 0.5122,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.8944480419158936,
"rewards/margins": 0.7809109687805176,
"rewards/rejected": -2.675359010696411,
"step": 1350
},
{
"epoch": 0.78,
"grad_norm": 36.12006272451077,
"learning_rate": 6.814381036730274e-08,
"logits/chosen": -1.9610059261322021,
"logits/rejected": -1.9268226623535156,
"logps/chosen": -397.60357666015625,
"logps/rejected": -479.10125732421875,
"loss": 0.5363,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -1.9262597560882568,
"rewards/margins": 0.6647717952728271,
"rewards/rejected": -2.591031551361084,
"step": 1360
},
{
"epoch": 0.79,
"grad_norm": 32.103751962383164,
"learning_rate": 6.473056535406035e-08,
"logits/chosen": -1.970505714416504,
"logits/rejected": -1.9748294353485107,
"logps/chosen": -398.49639892578125,
"logps/rejected": -483.1766052246094,
"loss": 0.5542,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.9573405981063843,
"rewards/margins": 0.7238850593566895,
"rewards/rejected": -2.681225538253784,
"step": 1370
},
{
"epoch": 0.79,
"grad_norm": 29.27367179768827,
"learning_rate": 6.139226260715872e-08,
"logits/chosen": -1.9642279148101807,
"logits/rejected": -1.9199190139770508,
"logps/chosen": -412.734619140625,
"logps/rejected": -482.08740234375,
"loss": 0.5026,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -2.0462894439697266,
"rewards/margins": 0.792068600654602,
"rewards/rejected": -2.838358163833618,
"step": 1380
},
{
"epoch": 0.8,
"grad_norm": 36.00438391939365,
"learning_rate": 5.8130252484113964e-08,
"logits/chosen": -1.9426565170288086,
"logits/rejected": -1.9226014614105225,
"logps/chosen": -385.748046875,
"logps/rejected": -454.22412109375,
"loss": 0.5491,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.8511062860488892,
"rewards/margins": 0.8074220418930054,
"rewards/rejected": -2.6585285663604736,
"step": 1390
},
{
"epoch": 0.81,
"grad_norm": 32.96643024329086,
"learning_rate": 5.4945854481754734e-08,
"logits/chosen": -1.9304873943328857,
"logits/rejected": -1.900002121925354,
"logps/chosen": -371.5887145996094,
"logps/rejected": -445.46221923828125,
"loss": 0.5265,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -1.909517526626587,
"rewards/margins": 0.7846697568893433,
"rewards/rejected": -2.6941871643066406,
"step": 1400
},
{
"epoch": 0.81,
"eval_logits/chosen": -1.8071422576904297,
"eval_logits/rejected": -1.7997641563415527,
"eval_logps/chosen": -391.1993408203125,
"eval_logps/rejected": -469.79913330078125,
"eval_loss": 0.5436315536499023,
"eval_rewards/accuracies": 0.7214611768722534,
"eval_rewards/chosen": -2.28442645072937,
"eval_rewards/margins": 0.7102989554405212,
"eval_rewards/rejected": -2.994725227355957,
"eval_runtime": 524.298,
"eval_samples_per_second": 13.351,
"eval_steps_per_second": 0.418,
"step": 1400
},
{
"epoch": 0.81,
"grad_norm": 37.572173988295035,
"learning_rate": 5.184035670247988e-08,
"logits/chosen": -1.934077262878418,
"logits/rejected": -1.920440912246704,
"logps/chosen": -378.1131286621094,
"logps/rejected": -462.83929443359375,
"loss": 0.5312,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.9387060403823853,
"rewards/margins": 0.7842427492141724,
"rewards/rejected": -2.7229487895965576,
"step": 1410
},
{
"epoch": 0.82,
"grad_norm": 35.65341932271922,
"learning_rate": 4.881501533321605e-08,
"logits/chosen": -1.87544846534729,
"logits/rejected": -1.8604532480239868,
"logps/chosen": -388.5731506347656,
"logps/rejected": -475.32330322265625,
"loss": 0.519,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.1021876335144043,
"rewards/margins": 0.7690648436546326,
"rewards/rejected": -2.8712525367736816,
"step": 1420
},
{
"epoch": 0.82,
"grad_norm": 32.19415920453824,
"learning_rate": 4.5871054137284564e-08,
"logits/chosen": -1.9715772867202759,
"logits/rejected": -1.93185555934906,
"logps/chosen": -403.1676025390625,
"logps/rejected": -498.1748962402344,
"loss": 0.5203,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.9203879833221436,
"rewards/margins": 0.8546259999275208,
"rewards/rejected": -2.7750141620635986,
"step": 1430
},
{
"epoch": 0.83,
"grad_norm": 57.11327150205332,
"learning_rate": 4.300966395938377e-08,
"logits/chosen": -1.9714921712875366,
"logits/rejected": -1.9407069683074951,
"logps/chosen": -409.3877258300781,
"logps/rejected": -456.50244140625,
"loss": 0.5818,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -1.9431222677230835,
"rewards/margins": 0.5195000767707825,
"rewards/rejected": -2.4626221656799316,
"step": 1440
},
{
"epoch": 0.84,
"grad_norm": 21.830708345963956,
"learning_rate": 4.023200224388787e-08,
"logits/chosen": -1.9089914560317993,
"logits/rejected": -1.855542778968811,
"logps/chosen": -377.63653564453125,
"logps/rejected": -457.47052001953125,
"loss": 0.5071,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.7652651071548462,
"rewards/margins": 0.8306269645690918,
"rewards/rejected": -2.5958924293518066,
"step": 1450
},
{
"epoch": 0.84,
"grad_norm": 38.15424519204087,
"learning_rate": 3.7539192566655246e-08,
"logits/chosen": -1.870527982711792,
"logits/rejected": -1.821215271949768,
"logps/chosen": -396.0817565917969,
"logps/rejected": -436.74102783203125,
"loss": 0.5454,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -2.0135273933410645,
"rewards/margins": 0.600884735584259,
"rewards/rejected": -2.6144118309020996,
"step": 1460
},
{
"epoch": 0.85,
"grad_norm": 31.36535036359726,
"learning_rate": 3.4932324180537736e-08,
"logits/chosen": -1.926028847694397,
"logits/rejected": -1.9150245189666748,
"logps/chosen": -379.81573486328125,
"logps/rejected": -472.4080505371094,
"loss": 0.5271,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.8782703876495361,
"rewards/margins": 0.8038197755813599,
"rewards/rejected": -2.6820900440216064,
"step": 1470
},
{
"epoch": 0.85,
"grad_norm": 29.336291125895066,
"learning_rate": 3.24124515747731e-08,
"logits/chosen": -1.9342174530029297,
"logits/rejected": -1.8941189050674438,
"logps/chosen": -409.3456726074219,
"logps/rejected": -470.3741149902344,
"loss": 0.5135,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.9259653091430664,
"rewards/margins": 0.7116618752479553,
"rewards/rejected": -2.637627363204956,
"step": 1480
},
{
"epoch": 0.86,
"grad_norm": 26.986665461110498,
"learning_rate": 2.998059404843947e-08,
"logits/chosen": -1.8567430973052979,
"logits/rejected": -1.8131777048110962,
"logps/chosen": -383.24371337890625,
"logps/rejected": -440.67315673828125,
"loss": 0.5239,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.9675956964492798,
"rewards/margins": 0.6357102990150452,
"rewards/rejected": -2.6033058166503906,
"step": 1490
},
{
"epoch": 0.86,
"grad_norm": 26.37071629611169,
"learning_rate": 2.763773529814506e-08,
"logits/chosen": -1.8718721866607666,
"logits/rejected": -1.8146419525146484,
"logps/chosen": -384.16162109375,
"logps/rejected": -487.1327209472656,
"loss": 0.4844,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -1.8588807582855225,
"rewards/margins": 0.849262535572052,
"rewards/rejected": -2.708143711090088,
"step": 1500
},
{
"epoch": 0.86,
"eval_logits/chosen": -1.7992874383926392,
"eval_logits/rejected": -1.7887682914733887,
"eval_logps/chosen": -386.97613525390625,
"eval_logps/rejected": -465.661376953125,
"eval_loss": 0.5432813763618469,
"eval_rewards/accuracies": 0.719748854637146,
"eval_rewards/chosen": -2.242194175720215,
"eval_rewards/margins": 0.7111533284187317,
"eval_rewards/rejected": -2.9533474445343018,
"eval_runtime": 546.6018,
"eval_samples_per_second": 12.806,
"eval_steps_per_second": 0.401,
"step": 1500
},
{
"epoch": 0.87,
"grad_norm": 33.8754723176866,
"learning_rate": 2.5384823020118212e-08,
"logits/chosen": -1.8839702606201172,
"logits/rejected": -1.8382689952850342,
"logps/chosen": -380.5359191894531,
"logps/rejected": -447.4384765625,
"loss": 0.5575,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.76059889793396,
"rewards/margins": 0.7574474215507507,
"rewards/rejected": -2.5180463790893555,
"step": 1510
},
{
"epoch": 0.88,
"grad_norm": 26.785794346840675,
"learning_rate": 2.3222768526860698e-08,
"logits/chosen": -1.8873153924942017,
"logits/rejected": -1.8357185125350952,
"logps/chosen": -381.34564208984375,
"logps/rejected": -430.72259521484375,
"loss": 0.5231,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -1.8608728647232056,
"rewards/margins": 0.7487791180610657,
"rewards/rejected": -2.609651803970337,
"step": 1520
},
{
"epoch": 0.88,
"grad_norm": 30.22760318351379,
"learning_rate": 2.1152446378517818e-08,
"logits/chosen": -1.8901411294937134,
"logits/rejected": -1.839329719543457,
"logps/chosen": -393.60308837890625,
"logps/rejected": -461.057861328125,
"loss": 0.5391,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -1.9723374843597412,
"rewards/margins": 0.6977485418319702,
"rewards/rejected": -2.670086145401001,
"step": 1530
},
{
"epoch": 0.89,
"grad_norm": 26.484898729776308,
"learning_rate": 1.9174694029115146e-08,
"logits/chosen": -1.9374538660049438,
"logits/rejected": -1.8765513896942139,
"logps/chosen": -434.1646423339844,
"logps/rejected": -468.3814392089844,
"loss": 0.5262,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -2.045320987701416,
"rewards/margins": 0.6686374545097351,
"rewards/rejected": -2.713958263397217,
"step": 1540
},
{
"epoch": 0.89,
"grad_norm": 26.251643116785377,
"learning_rate": 1.7290311487804687e-08,
"logits/chosen": -1.9080512523651123,
"logits/rejected": -1.84622323513031,
"logps/chosen": -375.2956848144531,
"logps/rejected": -463.65765380859375,
"loss": 0.5239,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.9260823726654053,
"rewards/margins": 0.8633429408073425,
"rewards/rejected": -2.7894253730773926,
"step": 1550
},
{
"epoch": 0.9,
"grad_norm": 22.1873285162568,
"learning_rate": 1.5500060995258134e-08,
"logits/chosen": -1.9254217147827148,
"logits/rejected": -1.8602027893066406,
"logps/chosen": -406.3652648925781,
"logps/rejected": -459.74273681640625,
"loss": 0.5039,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.8533226251602173,
"rewards/margins": 0.7270603179931641,
"rewards/rejected": -2.580382823944092,
"step": 1560
},
{
"epoch": 0.9,
"grad_norm": 31.87656820271237,
"learning_rate": 1.3804666715337116e-08,
"logits/chosen": -1.911505103111267,
"logits/rejected": -1.8812297582626343,
"logps/chosen": -399.38519287109375,
"logps/rejected": -496.04168701171875,
"loss": 0.54,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.9619176387786865,
"rewards/margins": 0.8611427545547485,
"rewards/rejected": -2.8230605125427246,
"step": 1570
},
{
"epoch": 0.91,
"grad_norm": 28.90758640199268,
"learning_rate": 1.2204814442165812e-08,
"logits/chosen": -1.8718591928482056,
"logits/rejected": -1.8608993291854858,
"logps/chosen": -397.22100830078125,
"logps/rejected": -456.198486328125,
"loss": 0.5244,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.995205283164978,
"rewards/margins": 0.7341451644897461,
"rewards/rejected": -2.7293505668640137,
"step": 1580
},
{
"epoch": 0.92,
"grad_norm": 31.58825212692507,
"learning_rate": 1.070115132272445e-08,
"logits/chosen": -1.8871160745620728,
"logits/rejected": -1.827559232711792,
"logps/chosen": -390.8136901855469,
"logps/rejected": -458.43798828125,
"loss": 0.5167,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -1.8970882892608643,
"rewards/margins": 0.9112474322319031,
"rewards/rejected": -2.808335781097412,
"step": 1590
},
{
"epoch": 0.92,
"grad_norm": 29.086437438100777,
"learning_rate": 9.294285595075669e-09,
"logits/chosen": -1.9274282455444336,
"logits/rejected": -1.9093879461288452,
"logps/chosen": -414.84576416015625,
"logps/rejected": -500.18768310546875,
"loss": 0.5612,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -2.002596616744995,
"rewards/margins": 0.8606833219528198,
"rewards/rejected": -2.8632798194885254,
"step": 1600
},
{
"epoch": 0.92,
"eval_logits/chosen": -1.7875818014144897,
"eval_logits/rejected": -1.7765936851501465,
"eval_logps/chosen": -389.4626159667969,
"eval_logps/rejected": -469.811279296875,
"eval_loss": 0.5427327156066895,
"eval_rewards/accuracies": 0.7208904027938843,
"eval_rewards/chosen": -2.267058849334717,
"eval_rewards/margins": 0.727787435054779,
"eval_rewards/rejected": -2.9948465824127197,
"eval_runtime": 523.5533,
"eval_samples_per_second": 13.37,
"eval_steps_per_second": 0.418,
"step": 1600
},
{
"epoch": 0.93,
"grad_norm": 27.963113959175715,
"learning_rate": 7.984786342329492e-09,
"logits/chosen": -1.9024436473846436,
"logits/rejected": -1.8931423425674438,
"logps/chosen": -392.8687438964844,
"logps/rejected": -472.52203369140625,
"loss": 0.5214,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -2.090585470199585,
"rewards/margins": 0.7528320550918579,
"rewards/rejected": -2.843417167663574,
"step": 1610
},
{
"epoch": 0.93,
"grad_norm": 38.701178208422625,
"learning_rate": 6.773183262446914e-09,
"logits/chosen": -1.9047428369522095,
"logits/rejected": -1.8428666591644287,
"logps/chosen": -408.89788818359375,
"logps/rejected": -467.36309814453125,
"loss": 0.5324,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.9106022119522095,
"rewards/margins": 0.8250144720077515,
"rewards/rejected": -2.735616683959961,
"step": 1620
},
{
"epoch": 0.94,
"grad_norm": 30.118277072421385,
"learning_rate": 5.6599664539749295e-09,
"logits/chosen": -1.9470701217651367,
"logits/rejected": -1.926031470298767,
"logps/chosen": -415.57330322265625,
"logps/rejected": -491.3902282714844,
"loss": 0.5135,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -1.970380425453186,
"rewards/margins": 0.8686148524284363,
"rewards/rejected": -2.8389952182769775,
"step": 1630
},
{
"epoch": 0.94,
"grad_norm": 37.40579887540256,
"learning_rate": 4.645586217799452e-09,
"logits/chosen": -1.9280283451080322,
"logits/rejected": -1.9276561737060547,
"logps/chosen": -423.79217529296875,
"logps/rejected": -496.7911071777344,
"loss": 0.5503,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.9242738485336304,
"rewards/margins": 0.7551409602165222,
"rewards/rejected": -2.679414749145508,
"step": 1640
},
{
"epoch": 0.95,
"grad_norm": 32.746312132544105,
"learning_rate": 3.730452874996737e-09,
"logits/chosen": -1.92330801486969,
"logits/rejected": -1.8721330165863037,
"logps/chosen": -395.4407958984375,
"logps/rejected": -454.64178466796875,
"loss": 0.5192,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -1.9143590927124023,
"rewards/margins": 0.6810831427574158,
"rewards/rejected": -2.595442056655884,
"step": 1650
},
{
"epoch": 0.96,
"grad_norm": 35.835333829114475,
"learning_rate": 2.9149366008568987e-09,
"logits/chosen": -1.9155769348144531,
"logits/rejected": -1.8720881938934326,
"logps/chosen": -402.2400207519531,
"logps/rejected": -452.55755615234375,
"loss": 0.5239,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.848528265953064,
"rewards/margins": 0.7459059953689575,
"rewards/rejected": -2.5944347381591797,
"step": 1660
},
{
"epoch": 0.96,
"grad_norm": 27.738054909743056,
"learning_rate": 2.1993672751463576e-09,
"logits/chosen": -1.9466373920440674,
"logits/rejected": -1.9023081064224243,
"logps/chosen": -407.79254150390625,
"logps/rejected": -487.1842346191406,
"loss": 0.499,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -2.0100741386413574,
"rewards/margins": 0.8334406614303589,
"rewards/rejected": -2.843514919281006,
"step": 1670
},
{
"epoch": 0.97,
"grad_norm": 28.810543533175498,
"learning_rate": 1.5840343486700215e-09,
"logits/chosen": -1.9565961360931396,
"logits/rejected": -1.8820337057113647,
"logps/chosen": -377.89697265625,
"logps/rejected": -461.2203063964844,
"loss": 0.4816,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.7978055477142334,
"rewards/margins": 0.8113381266593933,
"rewards/rejected": -2.6091437339782715,
"step": 1680
},
{
"epoch": 0.97,
"grad_norm": 26.747095486222516,
"learning_rate": 1.0691867261874154e-09,
"logits/chosen": -1.9276363849639893,
"logits/rejected": -1.8795725107192993,
"logps/chosen": -400.2092590332031,
"logps/rejected": -457.8243103027344,
"loss": 0.5346,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.8860292434692383,
"rewards/margins": 0.778502881526947,
"rewards/rejected": -2.66453218460083,
"step": 1690
},
{
"epoch": 0.98,
"grad_norm": 21.69669900920832,
"learning_rate": 6.550326657293881e-10,
"logits/chosen": -1.915302038192749,
"logits/rejected": -1.897491216659546,
"logps/chosen": -411.01904296875,
"logps/rejected": -480.05078125,
"loss": 0.5017,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -2.052908420562744,
"rewards/margins": 0.8378399610519409,
"rewards/rejected": -2.8907482624053955,
"step": 1700
},
{
"epoch": 0.98,
"eval_logits/chosen": -1.7904165983200073,
"eval_logits/rejected": -1.7796399593353271,
"eval_logps/chosen": -389.3405456542969,
"eval_logps/rejected": -469.7990417480469,
"eval_loss": 0.542646050453186,
"eval_rewards/accuracies": 0.7214611768722534,
"eval_rewards/chosen": -2.2658379077911377,
"eval_rewards/margins": 0.7288866639137268,
"eval_rewards/rejected": -2.9947245121002197,
"eval_runtime": 549.1787,
"eval_samples_per_second": 12.746,
"eval_steps_per_second": 0.399,
"step": 1700
},
{
"epoch": 0.98,
"grad_norm": 27.017405119205325,
"learning_rate": 3.4173969435710715e-10,
"logits/chosen": -1.907459020614624,
"logits/rejected": -1.9028345346450806,
"logps/chosen": -404.1794738769531,
"logps/rejected": -475.02490234375,
"loss": 0.5398,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -2.1053857803344727,
"rewards/margins": 0.5925677418708801,
"rewards/rejected": -2.697953701019287,
"step": 1710
},
{
"epoch": 0.99,
"grad_norm": 28.429723265538055,
"learning_rate": 1.2943454039654467e-10,
"logits/chosen": -1.8941481113433838,
"logits/rejected": -1.8836424350738525,
"logps/chosen": -381.0569152832031,
"logps/rejected": -472.2247009277344,
"loss": 0.5433,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -1.9636991024017334,
"rewards/margins": 0.7842205762863159,
"rewards/rejected": -2.7479193210601807,
"step": 1720
},
{
"epoch": 1.0,
"grad_norm": 23.62637877799544,
"learning_rate": 1.8203082176287964e-11,
"logits/chosen": -1.8356783390045166,
"logits/rejected": -1.797844648361206,
"logps/chosen": -407.73516845703125,
"logps/rejected": -473.55120849609375,
"loss": 0.5069,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -2.220022201538086,
"rewards/margins": 0.7199020385742188,
"rewards/rejected": -2.9399242401123047,
"step": 1730
},
{
"epoch": 1.0,
"step": 1736,
"total_flos": 0.0,
"train_loss": 0.55459001399405,
"train_runtime": 38266.551,
"train_samples_per_second": 2.904,
"train_steps_per_second": 0.045
}
],
"logging_steps": 10,
"max_steps": 1736,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}